diff -Naurp linux-2.6.4/arch/alpha/kernel/init_task.c linux-2.6.4-ck1/arch/alpha/kernel/init_task.c --- linux-2.6.4/arch/alpha/kernel/init_task.c 2004-01-09 22:56:39.000000000 +1100 +++ linux-2.6.4-ck1/arch/alpha/kernel/init_task.c 2004-03-11 22:45:15.103539128 +1100 @@ -19,4 +19,4 @@ EXPORT_SYMBOL(init_task); union thread_union init_thread_union __attribute__((section(".data.init_thread"))) - = { INIT_THREAD_INFO(init_task) }; + = { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; diff -Naurp linux-2.6.4/arch/arm/kernel/init_task.c linux-2.6.4-ck1/arch/arm/kernel/init_task.c --- linux-2.6.4/arch/arm/kernel/init_task.c 2004-01-09 22:56:37.000000000 +1100 +++ linux-2.6.4-ck1/arch/arm/kernel/init_task.c 2004-03-11 22:45:15.103539128 +1100 @@ -31,7 +31,7 @@ EXPORT_SYMBOL(init_mm); */ union thread_union init_thread_union __attribute__((__section__(".init.task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; /* * Initial task structure. diff -Naurp linux-2.6.4/arch/arm26/kernel/init_task.c linux-2.6.4-ck1/arch/arm26/kernel/init_task.c --- linux-2.6.4/arch/arm26/kernel/init_task.c 2004-01-09 22:56:39.000000000 +1100 +++ linux-2.6.4-ck1/arch/arm26/kernel/init_task.c 2004-03-11 22:45:15.104538972 +1100 @@ -34,7 +34,7 @@ EXPORT_SYMBOL(init_mm); */ union thread_union init_thread_union __attribute__((__section__(".init.task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; /* * Initial task structure. 
diff -Naurp linux-2.6.4/arch/cris/kernel/process.c linux-2.6.4-ck1/arch/cris/kernel/process.c --- linux-2.6.4/arch/cris/kernel/process.c 2004-01-09 22:56:38.000000000 +1100 +++ linux-2.6.4-ck1/arch/cris/kernel/process.c 2004-03-11 22:45:15.104538972 +1100 @@ -131,7 +131,7 @@ EXPORT_SYMBOL(init_mm); */ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; /* * Initial task structure. diff -Naurp linux-2.6.4/arch/h8300/kernel/init_task.c linux-2.6.4-ck1/arch/h8300/kernel/init_task.c --- linux-2.6.4/arch/h8300/kernel/init_task.c 2004-01-09 22:56:39.000000000 +1100 +++ linux-2.6.4-ck1/arch/h8300/kernel/init_task.c 2004-03-11 22:45:15.105538817 +1100 @@ -38,5 +38,5 @@ EXPORT_SYMBOL(init_task); */ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; diff -Naurp linux-2.6.4/arch/i386/kernel/entry.S linux-2.6.4-ck1/arch/i386/kernel/entry.S --- linux-2.6.4/arch/i386/kernel/entry.S 2004-03-11 22:35:41.119860894 +1100 +++ linux-2.6.4-ck1/arch/i386/kernel/entry.S 2004-03-11 22:51:59.350659837 +1100 @@ -882,6 +882,11 @@ ENTRY(sys_call_table) .long sys_utimes .long sys_fadvise64_64 .long sys_ni_syscall /* sys_vserver */ +#ifdef CONFIG_REISER4_FS + .long sys_reiser4 +#else + .long sys_ni_syscall +#endif .long sys_ioprio_set .long sys_ioprio_get diff -Naurp linux-2.6.4/arch/i386/kernel/init_task.c linux-2.6.4-ck1/arch/i386/kernel/init_task.c --- linux-2.6.4/arch/i386/kernel/init_task.c 2004-01-09 22:56:38.000000000 +1100 +++ linux-2.6.4-ck1/arch/i386/kernel/init_task.c 2004-03-11 22:45:15.107538506 +1100 @@ -26,7 +26,7 @@ EXPORT_SYMBOL(init_mm); */ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, 
init_thread_union.thread_info) }; /* * Initial task structure. diff -Naurp linux-2.6.4/arch/ia64/kernel/init_task.c linux-2.6.4-ck1/arch/ia64/kernel/init_task.c --- linux-2.6.4/arch/ia64/kernel/init_task.c 2004-01-09 22:56:38.000000000 +1100 +++ linux-2.6.4-ck1/arch/ia64/kernel/init_task.c 2004-03-11 22:45:15.108538350 +1100 @@ -39,7 +39,7 @@ union { unsigned long stack[KERNEL_STACK_SIZE/sizeof (unsigned long)]; } init_task_mem asm ("init_task") __attribute__((section(".data.init_task"))) = {{ .task = INIT_TASK(init_task_mem.s.task), - .thread_info = INIT_THREAD_INFO(init_task_mem.s.task) + .thread_info = INIT_THREAD_INFO(init_task_mem.s.task, init_task_mem.s.thread_info) }}; EXPORT_SYMBOL(init_task); diff -Naurp linux-2.6.4/arch/m68k/kernel/process.c linux-2.6.4-ck1/arch/m68k/kernel/process.c --- linux-2.6.4/arch/m68k/kernel/process.c 2004-01-09 22:56:38.000000000 +1100 +++ linux-2.6.4-ck1/arch/m68k/kernel/process.c 2004-03-11 22:45:15.108538350 +1100 @@ -50,7 +50,7 @@ EXPORT_SYMBOL(init_mm); union thread_union init_thread_union __attribute__((section(".data.init_task"), aligned(THREAD_SIZE))) - = { INIT_THREAD_INFO(init_task) }; + = { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; /* initial task structure */ struct task_struct init_task = INIT_TASK(init_task); diff -Naurp linux-2.6.4/arch/m68knommu/kernel/init_task.c linux-2.6.4-ck1/arch/m68knommu/kernel/init_task.c --- linux-2.6.4/arch/m68knommu/kernel/init_task.c 2004-01-09 22:56:39.000000000 +1100 +++ linux-2.6.4-ck1/arch/m68knommu/kernel/init_task.c 2004-03-11 22:45:15.109538195 +1100 @@ -38,5 +38,5 @@ EXPORT_SYMBOL(init_task); */ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; diff -Naurp linux-2.6.4/arch/mips/kernel/init_task.c linux-2.6.4-ck1/arch/mips/kernel/init_task.c --- linux-2.6.4/arch/mips/kernel/init_task.c 2004-03-11 21:26:26.000000000 +1100 
+++ linux-2.6.4-ck1/arch/mips/kernel/init_task.c 2004-03-11 22:53:18.101406808 +1100 @@ -29,7 +29,7 @@ EXPORT_SYMBOL(init_mm); union thread_union init_thread_union __attribute__((__section__(".data.init_task"), __aligned__(THREAD_SIZE))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; /* * Initial task structure. diff -Naurp linux-2.6.4/arch/parisc/kernel/init_task.c linux-2.6.4-ck1/arch/parisc/kernel/init_task.c --- linux-2.6.4/arch/parisc/kernel/init_task.c 2004-02-18 21:09:38.000000000 +1100 +++ linux-2.6.4-ck1/arch/parisc/kernel/init_task.c 2004-03-11 22:45:15.111537884 +1100 @@ -50,7 +50,7 @@ EXPORT_SYMBOL(init_mm); unsigned char interrupt_stack[ISTACK_SIZE] __attribute__ ((section("init_istack"), aligned(4096))); union thread_union init_thread_union __attribute__((aligned(128))) __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__ ((aligned(4096))) = { {0}, }; #ifdef __LP64__ diff -Naurp linux-2.6.4/arch/ppc/kernel/process.c linux-2.6.4-ck1/arch/ppc/kernel/process.c --- linux-2.6.4/arch/ppc/kernel/process.c 2004-01-09 22:56:38.000000000 +1100 +++ linux-2.6.4-ck1/arch/ppc/kernel/process.c 2004-03-11 22:45:15.111537884 +1100 @@ -62,7 +62,7 @@ EXPORT_SYMBOL(init_mm); at the base of it from the stack pointer with 1 integer instruction. 
*/ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = -{ INIT_THREAD_INFO(init_task) }; +{ INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; /* initial task structure */ struct task_struct init_task = INIT_TASK(init_task); diff -Naurp linux-2.6.4/arch/ppc64/kernel/init_task.c linux-2.6.4-ck1/arch/ppc64/kernel/init_task.c --- linux-2.6.4/arch/ppc64/kernel/init_task.c 2004-01-09 22:56:39.000000000 +1100 +++ linux-2.6.4-ck1/arch/ppc64/kernel/init_task.c 2004-03-11 22:45:15.112537728 +1100 @@ -23,7 +23,7 @@ EXPORT_SYMBOL(init_mm); */ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; /* * Initial task structure. diff -Naurp linux-2.6.4/arch/s390/kernel/init_task.c linux-2.6.4-ck1/arch/s390/kernel/init_task.c --- linux-2.6.4/arch/s390/kernel/init_task.c 2004-01-09 22:56:39.000000000 +1100 +++ linux-2.6.4-ck1/arch/s390/kernel/init_task.c 2004-03-11 22:45:15.112537728 +1100 @@ -31,7 +31,7 @@ EXPORT_SYMBOL(init_mm); */ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; /* * Initial task structure. diff -Naurp linux-2.6.4/arch/sh/kernel/init_task.c linux-2.6.4-ck1/arch/sh/kernel/init_task.c --- linux-2.6.4/arch/sh/kernel/init_task.c 2004-01-09 22:56:37.000000000 +1100 +++ linux-2.6.4-ck1/arch/sh/kernel/init_task.c 2004-03-11 22:45:15.113537573 +1100 @@ -23,7 +23,7 @@ EXPORT_SYMBOL(init_mm); */ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; /* * Initial task structure. 
diff -Naurp linux-2.6.4/arch/sparc/kernel/init_task.c linux-2.6.4-ck1/arch/sparc/kernel/init_task.c --- linux-2.6.4/arch/sparc/kernel/init_task.c 2004-01-09 22:56:39.000000000 +1100 +++ linux-2.6.4-ck1/arch/sparc/kernel/init_task.c 2004-03-11 22:45:15.113537573 +1100 @@ -22,4 +22,4 @@ EXPORT_SYMBOL(init_task); * in etrap.S which assumes it. */ __asm__(".section \".text\",#alloc\n"); -union thread_union init_thread_union = { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union = { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; diff -Naurp linux-2.6.4/arch/sparc64/kernel/init_task.c linux-2.6.4-ck1/arch/sparc64/kernel/init_task.c --- linux-2.6.4/arch/sparc64/kernel/init_task.c 2004-01-09 22:56:39.000000000 +1100 +++ linux-2.6.4-ck1/arch/sparc64/kernel/init_task.c 2004-03-11 22:45:15.114537417 +1100 @@ -21,7 +21,7 @@ EXPORT_SYMBOL(init_mm); * I do it anyways for completeness. */ __asm__ (".text"); -union thread_union init_thread_union = { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union = { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; /* * This is to make the init_thread+stack be the right size for >8k pagesize. 
diff -Naurp linux-2.6.4/arch/um/kernel/init_task.c linux-2.6.4-ck1/arch/um/kernel/init_task.c --- linux-2.6.4/arch/um/kernel/init_task.c 2004-01-09 22:56:37.000000000 +1100 +++ linux-2.6.4-ck1/arch/um/kernel/init_task.c 2004-03-11 22:45:15.114537417 +1100 @@ -41,7 +41,7 @@ EXPORT_SYMBOL(init_task); union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = -{ INIT_THREAD_INFO(init_task) }; +{ INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; struct task_struct *alloc_task_struct(void) { diff -Naurp linux-2.6.4/arch/v850/kernel/init_task.c linux-2.6.4-ck1/arch/v850/kernel/init_task.c --- linux-2.6.4/arch/v850/kernel/init_task.c 2004-01-09 22:56:39.000000000 +1100 +++ linux-2.6.4-ck1/arch/v850/kernel/init_task.c 2004-03-11 22:45:15.114537417 +1100 @@ -45,4 +45,4 @@ EXPORT_SYMBOL(init_task); */ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; diff -Naurp linux-2.6.4/arch/x86_64/kernel/init_task.c linux-2.6.4-ck1/arch/x86_64/kernel/init_task.c --- linux-2.6.4/arch/x86_64/kernel/init_task.c 2004-01-09 22:56:39.000000000 +1100 +++ linux-2.6.4-ck1/arch/x86_64/kernel/init_task.c 2004-03-11 22:45:15.115537262 +1100 @@ -26,7 +26,7 @@ EXPORT_SYMBOL(init_mm); */ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union.thread_info) }; /* * Initial task structure. 
diff -Naurp linux-2.6.4/fs/buffer.c linux-2.6.4-ck1/fs/buffer.c --- linux-2.6.4/fs/buffer.c 2004-03-11 21:26:32.000000000 +1100 +++ linux-2.6.4-ck1/fs/buffer.c 2004-03-11 22:45:15.119536640 +1100 @@ -242,6 +242,7 @@ int fsync_super(struct super_block *sb) return sync_blockdev(sb->s_bdev); } +EXPORT_SYMBOL(fsync_super); /* * Write out and wait upon all dirty data associated with this diff -Naurp linux-2.6.4/fs/fs-writeback.c linux-2.6.4-ck1/fs/fs-writeback.c --- linux-2.6.4/fs/fs-writeback.c 2004-03-11 21:26:32.000000000 +1100 +++ linux-2.6.4-ck1/fs/fs-writeback.c 2004-03-11 22:45:15.120536484 +1100 @@ -246,8 +246,8 @@ __writeback_single_inode(struct inode *i * on the writer throttling path, and we get decent balancing between many * throttled threads: we don't want them all piling up on __wait_on_inode. */ -static void -sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) +void +generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) { const unsigned long start = jiffies; /* livelock avoidance */ @@ -322,6 +322,16 @@ sync_sb_inodes(struct super_block *sb, s return; /* Leave any unwritten inodes on s_io */ } +static void +sync_sb_inodes (struct super_block *sb, struct writeback_control *wbc) +{ + if (sb->s_op->sync_inodes) + sb->s_op->sync_inodes(sb, wbc); + else + generic_sync_sb_inodes(sb, wbc); +} + + /* * Start writeback of dirty pagecache data against all unlocked inodes. 
* diff -Naurp linux-2.6.4/fs/inode.c linux-2.6.4-ck1/fs/inode.c --- linux-2.6.4/fs/inode.c 2004-03-11 21:26:33.000000000 +1100 +++ linux-2.6.4-ck1/fs/inode.c 2004-03-11 22:45:15.148532131 +1100 @@ -1012,7 +1012,7 @@ void generic_delete_inode(struct inode * EXPORT_SYMBOL(generic_delete_inode); -static void generic_forget_inode(struct inode *inode) +void generic_forget_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -1039,6 +1039,7 @@ static void generic_forget_inode(struct clear_inode(inode); destroy_inode(inode); } +EXPORT_SYMBOL(generic_forget_inode); /* * Normal UNIX filesystem behaviour: delete the diff -Naurp linux-2.6.4/fs/jbd/transaction.c linux-2.6.4-ck1/fs/jbd/transaction.c --- linux-2.6.4/fs/jbd/transaction.c 2004-02-04 22:06:03.000000000 +1100 +++ linux-2.6.4-ck1/fs/jbd/transaction.c 2004-03-11 22:45:15.150531820 +1100 @@ -108,6 +108,7 @@ alloc_transaction: jbd_debug(3, "New handle %p going live.\n", handle); + handle->h_journal = journal; repeat: /* @@ -248,6 +249,23 @@ static handle_t *new_handle(int nblocks) return handle; } +/* + * push @handle into ->fs_context stack + */ +static void push_handle(handle_t *handle) +{ + handle->h_parent = current->fs_context; + current->fs_context = (struct fs_activation *) handle; +} + +/* + * pop top of ->fs_context stack + */ +static void pop_handle(handle_t *handle) +{ + current->fs_context = (struct fs_activation *) handle->h_parent; +} + /** * handle_t *journal_start() - Obtain a new handle. * @journal: Journal to start transaction on. 
@@ -270,7 +288,7 @@ handle_t *journal_start(journal_t *journ if (!journal) return ERR_PTR(-EROFS); - if (handle) { + if (handle && handle->h_journal == journal) { J_ASSERT(handle->h_transaction->t_journal == journal); handle->h_ref++; return handle; @@ -280,12 +298,13 @@ handle_t *journal_start(journal_t *journ if (!handle) return ERR_PTR(-ENOMEM); - current->journal_info = handle; + push_handle(handle); err = start_this_handle(journal, handle); if (err < 0) { + /* pop_handle() reads handle->h_parent: keep the handle live until jbd_free_handle() below, its only free */ + pop_handle(handle); jbd_free_handle(handle); - current->journal_info = NULL; handle = ERR_PTR(err); } return handle; @@ -1360,7 +1379,7 @@ int journal_stop(handle_t *handle) } while (old_handle_count != transaction->t_handle_count); } - current->journal_info = NULL; + pop_handle(handle); spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); transaction->t_outstanding_credits -= handle->h_buffer_credits; diff -Naurp linux-2.6.4/fs/Kconfig linux-2.6.4-ck1/fs/Kconfig --- linux-2.6.4/fs/Kconfig 2004-03-11 22:38:39.811009558 +1100 +++ linux-2.6.4-ck1/fs/Kconfig 2004-03-11 22:45:15.117536951 +1100 @@ -193,6 +193,212 @@ config FS_MBCACHE default y if EXT2_FS=y || EXT3_FS=y default m if EXT2_FS=m || EXT3_FS=m +config REISER4_FS + bool "Reiser4 (EXPERIMENTAL very fast general purpose filesystem)" + depends on EXPERIMENTAL + ---help--- + Reiser4 is more than twice as fast for both reads and writes as + ReiserFS. That means it is four times as fast as NTFS by Microsoft. + (Proper benchmarks will appear in a few months at + www.namesys.com/benchmarks.html, please be patient for now). + + It is the storage layer of what will become a general purpose naming + system --- like what Microsoft wants OFS to be except designed with a + clean new semantic layer rather than being SQL based like OFS. 
+ + It performs all filesystem operations as atomic transactions, which + means that it either performs a write, or it does not, and in the + event of a crash it does not partially perform it or corrupt it. + + It stores files in dancing trees, which are like balanced trees but + faster. It packs small files together so that they share blocks + without wasting space. This means you can use it to store really + small files. It also means that it saves you disk space. It avoids + hassling you with anachronisms like having a maximum number of + inodes, and wasting space if you use less than that number. + + It can handle really large directories, because its search + algorithms are logarithmic with size not linear. With Reiser4 you + should use subdirectories because they help YOU, not because they + help your filesystem's performance, or because your filesystem won't + be able to shrink a directory once you have let it grow. For squid + and similar applications, everything in one directory should perform + better. + + It has a plugin-based infrastructure, which means that you can easily + invent new kinds of files, and so can other people, so it will evolve + rapidly. + + We will be adding a variety of security features to it that DARPA has + funded us to write. + + "reiser4" is a distinct filesystem mount type from "reiserfs" (V3), + which means that "reiserfs" filesystems will be unaffected by any + reiser4 bugs. They have no code in common. Reiser4 is a complete + rewrite from scratch fully incorporating what we learned by experience + while doing "reiserfs" the first time. That was a lot.;-) + + Reiser4 is about as stable as the usual tornado for now --- it is + for use by developers and testers only. We don't use it for our web + server --- you should not either. This will change before 2.6.0. + ReiserFS V3 is the right choice for those who want a filesystem so + stable that we can go for months now without any bug reports while we + have millions of users. 
+ + If you'd like to upgrade from reiserfs to reiser4, use tar to a + temporary disk, maybe using NFS/ssh/SFS to get to that disk, or ask + your favorite distro to sponsor writing a conversion program. + + Sponsored by the Defense Advanced Research Projects Agency (DARPA) + of the United States Government. DARPA does not endorse this + project; it merely sponsors it. + See http://www.darpa.mil/ato/programs/chats.htm + + To learn more about reiser4, go to http://www.namesys.com + +config REISER4_FS_SYSCALL + bool +# bool "Enable reiser4 system call" + default n + depends on REISER4_FS + ---help--- + Adds sys_reiser4() syscall. + This code is not in good shape yet and may not compile and stuff like that. + +config REISER4_LARGE_KEY + bool "Use larger keys on reiser4 tree" + depends on REISER4_FS + default y + ---help--- + Make keys larger and use additional bits to order bodies of files within + a directory in the order of their names, which is what you want + normally. If you turn this off, file bodies will be ordered by creation + time, which is not optimal for most users. + + Warning: flipping this option makes your file system binary + incompatible. + +config REISER4_CHECK + bool "Enable reiser4 debug options" + depends on REISER4_FS + ---help--- + Don't use this unless you are a developer debugging reiser4. If + using a kernel made by a distro that thinks they are our competitor + (sigh) rather than made by Linus, always check each release to make + sure they have not turned this on to make us look slow as was done + once in the past. This checks everything imaginable while reiser4 + runs. + + When adding features to reiser4 you should set this, and then + extensively test the code, and then send to us and we will test it + again. Include a description of what you did to test it. All + reiser4 code must be tested, reviewed, and signed off on by two + persons before it will be accepted into a stable kernel by Hans. 
+ +config REISER4_DEBUG + bool "Assertions" + depends on REISER4_CHECK + help + Turns on assertions checks. Eats a lot of CPU. + +config REISER4_FS_SYSCALL_DEBUG + bool "Enable reiser4 system call debug" + depends on REISER4_CHECK + help + Turns on debug reiser4_system_call. + +config REISER4_DEBUG_MODIFY + bool "Dirtying" + depends on REISER4_CHECK + help + Check that node is marked dirty each time it's modified. This is done + through maintaining checksum of node content. CPU hog. + +config REISER4_DEBUG_MEMCPY + bool "Memory copying" + depends on REISER4_CHECK + help + Use special non-inlined versions on memcpy, memset, and memmove in + reiser4 to estimate amount of CPU time spent in data copying. + +config REISER4_DEBUG_NODE + bool "Node consistency" + depends on REISER4_CHECK + help + Run consistency checks on nodes in balanced tree. CPU hog. + +config REISER4_ZERO_NEW_NODE + bool "Node zeroing" + depends on REISER4_CHECK + help + Zero new node before use. + +config REISER4_TRACE + bool "Tracing" + depends on REISER4_CHECK + help + Turn on tracing facility. This enables trace_flags mount option. + +config REISER4_EVENT_LOG + bool "Log events" + depends on REISER4_CHECK + help + Log events into user supplied file. This enables trace_file mount option. + +config REISER4_STATS + bool "Statistics" + depends on REISER4_CHECK + help + Turn on statistics collection. This increases size of in-memory super + block considerably. + +config REISER4_PROF + bool "Profiling" + depends on REISER4_CHECK + help + Turn on collection of profiling information available through sysfs. + +config REISER4_LOCKPROF + bool "Lock Profiling" + depends on REISER4_CHECK && PROFILING + help + Turn on collection of spin lock contention information. + +config REISER4_DEBUG_OUTPUT + bool "Printing" + depends on REISER4_CHECK + help + Enable compilation of functions that print internal kernel data + structures in human readable form. Useful for debugging. 
+ +config REISER4_NOOPT + bool "Disable optimization" + depends on REISER4_CHECK + help + Disable compiler optimizations for reiser4 code. + +config REISER4_USE_EFLUSH +# bool "Enable emergency flush" + bool + default y + depends on REISER4_FS + help + Say Y unless you know what you are doing. Details are in reiser4/emergency_flush.c + +config REISER4_COPY_ON_CAPTURE + bool "Enable copy on capture" + depends on REISER4_FS + help + Say N unless you know what you are doing. This is under development + +config REISER4_BADBLOCKS + bool "Enable handling of badblocks in system areas" + depends on REISER4_FS + help + This allows you to use filesystems with badblocks in static reiser4 system areas + (such as superblock, bitmaps, journal header/footer). This imposes some performance + penalty, so say N unless you have such a filesystem. + config REISERFS_FS tristate "Reiserfs support" help @@ -288,7 +494,7 @@ config FS_POSIX_ACL # Never use this symbol for ifdefs. # bool - depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL + depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISER4_FS default y config XFS_FS diff -Naurp linux-2.6.4/fs/Makefile linux-2.6.4-ck1/fs/Makefile --- linux-2.6.4/fs/Makefile 2004-03-11 22:38:39.812009403 +1100 +++ linux-2.6.4-ck1/fs/Makefile 2004-03-11 22:45:15.117536951 +1100 @@ -46,6 +46,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ +obj-$(CONFIG_REISER4_FS) += reiser4/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_EXT2_FS) += ext2/ diff -Naurp linux-2.6.4/fs/reiser4/as_ops.c linux-2.6.4-ck1/fs/reiser4/as_ops.c --- linux-2.6.4/fs/reiser4/as_ops.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/as_ops.c 2004-03-11 22:45:15.171528555 +1100 @@ -0,0 +1,656 @@ +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Interface to VFS. 
Reiser4 address_space_operations are defined here. */ + +#include "forward.h" +#include "debug.h" +#include "dformat.h" +#include "coord.h" +#include "plugin/item/item.h" +#include "plugin/file/file.h" +#include "plugin/security/perm.h" +#include "plugin/disk_format/disk_format.h" +#include "plugin/plugin.h" +#include "plugin/plugin_set.h" +#include "plugin/plugin_hash.h" +#include "plugin/object.h" +#include "txnmgr.h" +#include "jnode.h" +#include "znode.h" +#include "block_alloc.h" +#include "tree.h" +#include "trace.h" +#include "vfs_ops.h" +#include "inode.h" +#include "page_cache.h" +#include "ktxnmgrd.h" +#include "super.h" +#include "reiser4.h" +#include "kattr.h" +#include "entd.h" +#include "emergency_flush.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* address space operations */ + +static int reiser4_readpage(struct file *, struct page *); + +static int reiser4_prepare_write(struct file *, + struct page *, unsigned, unsigned); + +static int reiser4_commit_write(struct file *, + struct page *, unsigned, unsigned); + +static int reiser4_set_page_dirty (struct page *); +static sector_t reiser4_bmap(struct address_space *, sector_t); +/* static int reiser4_direct_IO(int, struct inode *, + struct kiobuf *, unsigned long, int); */ + +/* address space operations */ + +/* as_ops->set_page_dirty() VFS method in reiser4_address_space_operations. + + It is used by others (except reiser4) to set reiser4 pages dirty. Reiser4 + itself uses set_page_dirty_internal(). + + The difference is that reiser4_set_page_dirty puts dirty page on + reiser4_inode->moved_pages. That list is processed by reiser4_writepages() + to do reiser4 specific work over dirty pages (allocation jnode, capturing, + atom creation) which cannot be done in the contexts where set_page_dirty is + called. 
+ + Mostly this function is __set_page_dirty_nobuffers() but target page list + differs. +*/ +static int reiser4_set_page_dirty (struct page * page /* page to mark dirty */) +{ + int ret = 0; + + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page->mapping; + + if (mapping) { + spin_lock(&mapping->page_lock); + /* check for race with truncate */ + if (page->mapping) { + if (!mapping->backing_dev_info->memory_backed) + inc_page_state(nr_dirty); + list_del(&page->list); + list_add(&page->list, get_moved_pages(mapping)); + } + spin_unlock(&mapping->page_lock); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + } + return ret; +} + +/* ->readpage() VFS method in reiser4 address_space_operations + method serving file mmapping +*/ +static int +reiser4_readpage(struct file *f /* file to read from */ , + struct page *page /* page where to read data + * into */ ) +{ + struct inode *inode; + file_plugin *fplug; + int result; + reiser4_context ctx; + + /* + * basically calls ->readpage method of object plugin and handles + * errors. 
+ */ + + assert("umka-078", f != NULL); + assert("umka-079", page != NULL); + assert("nikita-2280", PageLocked(page)); + assert("vs-976", !PageUptodate(page)); + + assert("vs-318", page->mapping && page->mapping->host); + assert("nikita-1352", (f == NULL) || (f->f_dentry->d_inode == page->mapping->host)); + + /* ->readpage can be called from page fault service routine */ + assert("nikita-3174", schedulable()); + + inode = page->mapping->host; + init_context(&ctx, inode->i_sb); + fplug = inode_file_plugin(inode); + if (fplug->readpage != NULL) + result = fplug->readpage(f, page); + else + result = RETERR(-EINVAL); + if (result != 0) { + SetPageError(page); + unlock_page(page); + } + reiser4_exit_context(&ctx); + return 0; +} + +/* ->readpages() VFS method in reiser4 address_space_operations + method serving page cache readahead + + reiser4_readpages works in the following way: on input it has coord which is set on extent that addresses first of + pages for which read requests are to be issued. So, reiser4_readpages just walks forward through extent unit, finds + which blocks are to be read and start read for them. + +reiser4_readpages can be called from two places: from +sys_read->reiser4_read->read_unix_file->read_extent->page_cache_readahead and +from +handling page fault: +handle_mm_fault->do_no_page->filemap_nopage->page_cache_readaround + +In first case coord is set by reiser4 read code. This case is detected by if +(is_in_reiser4_context()). + +In second case, coord is not set and currently, reiser4_readpages does +nothing. +*/ +static int +reiser4_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + file_plugin *fplug; + + if (is_in_reiser4_context()) { + /* we are called from reiser4 context, typically from method + which implements read into page cache. 
From read_extent, + for example */ + fplug = inode_file_plugin(mapping->host); + if (fplug->readpages) + fplug->readpages(file, mapping, pages); + } else { + /* we are called from page fault. Currently, we do not + * readahead in this case. */; + } + + /* __do_page_cache_readahead expects filesystem's readpages method to + * process every page on this list */ + while (!list_empty(pages)) { + struct page *page = list_entry(pages->prev, struct page, list); + list_del(&page->list); + page_cache_release(page); + } + return 0; +} + +/* prepares @page to be written. This means that if we want to modify only some + part of page, page should be read first and then modified. Actually this function is + almost the same as reiser4_readpage(). The difference is only that it does not + unlock the page in the case of error. This is needed because loop back device + driver expects it locked. */ +static int reiser4_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + int result; + file_plugin * fplug; + struct inode * inode; + reiser4_context ctx; + + inode = page->mapping->host; + init_context(&ctx, inode->i_sb); + fplug = inode_file_plugin(inode); + + if (fplug->prepare_write != NULL) + result = fplug->prepare_write(file, page, from, to); + else + result = RETERR(-EINVAL); + + /* don't commit transaction under inode semaphore */ + context_set_commit_async(&ctx); + reiser4_exit_context(&ctx); + + return result; +} + +/* captures jnode of @page to current atom. 
*/ +static int reiser4_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + int result; + file_plugin *fplug; + struct inode *inode; + reiser4_context ctx; + + assert("umka-3101", file != NULL); + assert("umka-3102", page != NULL); + assert("umka-3093", PageLocked(page)); + + SetPageUptodate(page); + + inode = page->mapping->host; + init_context(&ctx, inode->i_sb); + fplug = inode_file_plugin(inode); + + if (fplug->capturepage) + result = fplug->capturepage(page); + else + result = RETERR(-EINVAL); + + /* here page is return locked. */ + assert("umka-3103", PageLocked(page)); + + /* don't commit transaction under inode semaphore */ + context_set_commit_async(&ctx); + reiser4_exit_context(&ctx); + return result; +} + +/* ->writepages() + ->vm_writeback() + ->set_page_dirty() + ->prepare_write() + ->commit_write() +*/ + +/* ->bmap() VFS method in reiser4 address_space_operations */ +reiser4_internal int +reiser4_lblock_to_blocknr(struct address_space *mapping, + sector_t lblock, reiser4_block_nr *blocknr) +{ + file_plugin *fplug; + int result; + reiser4_context ctx; + + init_context(&ctx, mapping->host->i_sb); + reiser4_stat_inc(vfs_calls.bmap); + + fplug = inode_file_plugin(mapping->host); + if (fplug && fplug->get_block) { + *blocknr = generic_block_bmap(mapping, lblock, fplug->get_block); + result = 0; + } else + result = RETERR(-EINVAL); + reiser4_exit_context(&ctx); + return result; +} + +/* ->bmap() VFS method in reiser4 address_space_operations */ +static sector_t +reiser4_bmap(struct address_space *mapping, sector_t lblock) +{ + reiser4_block_nr blocknr; + int result; + + result = reiser4_lblock_to_blocknr(mapping, lblock, &blocknr); + if (result == 0) + if (sizeof blocknr == sizeof(sector_t) || + !blocknr_is_fake(&blocknr)) + return blocknr; + else + return 0; + else + return result; +} + +/* ->invalidatepage method for reiser4 */ + +/* + * this is called for each truncated page from + * 
truncate_inode_pages()->truncate_{complete,partial}_page().
+ *
+ * At the moment of call, page is under lock, and outstanding io (if any) has
+ * completed.
+ *
+ * Returns 0 without doing anything for pages of the super-block fake inode,
+ * the cc fake inode and the bitmap inode. Otherwise returns the result of
+ * try_capture_page_to_invalidate(); note that a capture failure is only
+ * logged, the jnode is still detached from the page when @offset is 0.
+ */
+
+reiser4_internal int
+reiser4_invalidatepage(struct page *page /* page to invalidate */,
+		       unsigned long offset /* starting offset for partial
+					      * invalidation */)
+{
+	int ret = 0;
+	reiser4_context ctx;
+	struct inode *inode;
+
+	/*
+	 * This is called to truncate a page of a file.
+	 *
+	 * Originally, reiser4 implemented truncate in a standard way
+	 * (vmtruncate() calls ->invalidatepage() on all truncated pages
+	 * first, then file system ->truncate() call-back is invoked).
+	 *
+	 * This led to the problem when ->invalidatepage() was called on a
+	 * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
+	 * process. That is, truncate was bypassing transactions. To avoid
+	 * this, try_capture_page_to_invalidate() call was added here.
+	 *
+	 * After many troubles with vmtruncate() based truncate (including
+	 * races with flush, tail conversion, etc.) it was re-written in the
+	 * top-to-bottom style: items are killed in cut_tree_object() and
+	 * pages belonging to extent are invalidated in kill_hook_extent(). So
+	 * probably now additional call to capture is not needed here.
+	 *
+	 */
+
+	assert("nikita-3137", PageLocked(page));
+	assert("nikita-3138", !PageWriteback(page));
+	inode = page->mapping->host;
+
+	/*
+	 * ->invalidatepage() should only be called for the unformatted
+	 * jnodes. Destruction of all other types of jnodes is performed
+	 * separately. But, during some corner cases (like handling errors
+	 * during mount) it is simpler to let ->invalidatepage to be called on
+	 * them. Check for this, and do nothing. These early returns happen
+	 * before init_context(), so there is no context to exit.
+	 */
+	if (get_super_fake(inode->i_sb) == inode)
+		return 0;
+	if (get_cc_fake(inode->i_sb) == inode)
+		return 0;
+	if (get_super_private(inode->i_sb)->bitmap == inode)
+		return 0;
+
+	assert("vs-1426", ergo(PagePrivate(page),
+			       ((inode->i_state & I_JNODES) &&
+				(reiser4_inode_data(inode)->jnodes > 0))));
+	assert("vs-1427", ergo(PagePrivate(page), page->mapping == jnode_get_mapping(jnode_by_page(page))));
+	assert("vs-1449", !test_bit(PG_arch_1, &page->flags));
+
+	init_context(&ctx, inode->i_sb);
+	/* capture page being truncated. */
+	ret = try_capture_page_to_invalidate(page);
+	if (ret != 0) {
+		warning("nikita-3141", "Cannot capture: %i", ret);
+		print_page("page", page);
+	} else
+		assert("vs-1425", ((inode->i_state & I_JNODES) &&
+				   (reiser4_inode_data(inode)->jnodes > 0)));
+
+	/* only invalidation of the whole page (offset 0) detaches the jnode */
+	if (offset == 0) {
+		jnode *node;
+
+		/* remove jnode from transaction and detach it from page. */
+		node = jnode_by_page(page);
+		if (node != NULL) {
+			assert("vs-1435", !JF_ISSET(node, JNODE_CC));
+			jref(node); /* keep the jnode referenced until jput() below */
+			JF_SET(node, JNODE_HEARD_BANSHEE);
+			/* page cannot be detached from jnode concurrently,
+			 * because it is locked */
+			uncapture_page(page);
+
+			/* this detaches page from jnode, so that jdelete will
+			 * not try to lock page which is already locked */
+			UNDER_SPIN_VOID(jnode,
+					node,
+					page_clear_jnode(page, node));
+			unhash_unformatted_jnode(node);
+
+			jput(node);
+		}
+	}
+	reiser4_exit_context(&ctx);
+	return ret;
+}
+/* Statistics helpers for releasable()/reiser4_releasepage(): bump @counter of the tree level @node lives on. The expansion already ends with ';'. */
+#define INC_STAT(page, node, counter)						\
+	reiser4_stat_inc_at(page->mapping->host->i_sb,				\
+			    level[jnode_get_level(node)].counter);
+
+#define INC_NSTAT(node, counter) INC_STAT(jnode_page(node), node, counter)
+
+int is_cced(const jnode *node);
+
+/* help function called from reiser4_releasepage(). It returns true if jnode
+ * can be detached from its page and page released.
*/
+static int
+releasable(const jnode *node /* node to check */)
+{
+	assert("nikita-2781", node != NULL);
+	assert("nikita-2783", spin_jnode_is_locked(node));
+
+	/* if some thread is currently using the jnode page, the latter cannot
+	 * be detached */
+	if (atomic_read(&node->d_count) != 0) {
+		INC_NSTAT(node, vm.release.loaded);
+		return 0;
+	}
+
+	assert("vs-1214", !jnode_is_loaded(node));
+
+	/* this jnode is just a copy. Its page cannot be released, because
+	 * otherwise next jload() would load obsolete data from disk
+	 * (up-to-date version may still be in memory). */
+	if (is_cced(node)) {
+		INC_NSTAT(node, vm.release.copy);
+		return 0;
+	}
+
+	/* emergency flushed page can be released. This is what emergency
+	 * flush is all about after all. Note that this check comes before,
+	 * and therefore overrides, all the checks below. */
+	if (JF_ISSET(node, JNODE_EFLUSH)) {
+		INC_NSTAT(node, vm.release.eflushed);
+		return 1; /* yeah! */
+	}
+
+	/* can only release page if real block number is assigned to
+	   it. Simple check for ->atom wouldn't do, because it is possible for
+	   node to be clean, not in atom yet, and still having fake block
+	   number. For example, node just created in jinit_new(). */
+	if (blocknr_is_fake(jnode_get_block(node))) {
+		INC_NSTAT(node, vm.release.fake);
+		return 0;
+	}
+	/* dirty jnode cannot be released. It can however be submitted to disk
+	 * as part of early flushing, but only after getting flush-prepped. */
+	if (jnode_is_dirty(node)) {
+		INC_NSTAT(node, vm.release.dirty);
+		return 0;
+	}
+	/* overwrite set is only written by log writer. */
+	if (JF_ISSET(node, JNODE_OVRWR)) {
+		INC_NSTAT(node, vm.release.ovrwr);
+		return 0;
+	}
+	/* jnode is already under writeback */
+	if (JF_ISSET(node, JNODE_WRITEBACK)) {
+		INC_NSTAT(node, vm.release.writeback);
+		return 0;
+	}
+	/* page was modified through mmap, but its jnode is not yet
+	 * captured. Don't discard modified data. */
+	if (jnode_is_unformatted(node) && JF_ISSET(node, JNODE_KEEPME)) {
+		INC_NSTAT(node, vm.release.keepme);
+		return 0;
+	}
+	/* don't flush bitmaps or journal records: only znodes and unformatted
+	 * jnodes pass this check */
+	if (!jnode_is_znode(node) && !jnode_is_unformatted(node)) {
+		INC_NSTAT(node, vm.release.bitmap);
+		return 0;
+	}
+	return 1;
+}
+/* NOTE(review): the wrapper below runs releasable() under the "jload" spin lock, while releasable() asserts the jnode spin lock -- confirm this is intended. */
+#if REISER4_DEBUG
+int jnode_is_releasable(jnode *node)
+{
+	return UNDER_SPIN(jload, node, releasable(node));
+}
+#endif
+
+/*
+ * ->releasepage method for reiser4
+ *
+ * This is called by VM scanner when it comes across clean page. What we have
+ * to do here is to check whether page can really be released (freed that is)
+ * and if so, detach jnode from it and remove page from the page cache.
+ *
+ * Check for releasability is done by releasable() function.
+ *
+ * Returns 1 if the jnode was detached from @page, 0 if the page has to stay
+ * (too many references, dirty page, or !releasable() jnode). @gfp is unused.
+ */
+reiser4_internal int
+reiser4_releasepage(struct page *page, int gfp UNUSED_ARG)
+{
+	jnode *node;
+
+	assert("nikita-2257", PagePrivate(page));
+	assert("nikita-2259", PageLocked(page));
+	assert("nikita-2892", !PageWriteback(page));
+	assert("nikita-3019", schedulable());
+
+	/* NOTE-NIKITA: this can be called in the context of reiser4 call. It
+	   is not clear what to do in this case. A lot of deadlocks seem to be
+	   possible. */
+
+	node = jnode_by_page(page);
+	assert("nikita-2258", node != NULL);
+
+	INC_STAT(page, node, vm.release.try);
+
+	/* is_page_cache_freeable() check
+
+	   (mapping + private + page_cache_get() by shrink_cache()) */
+	if (page_count(page) > 3)
+		return 0;
+
+	if (PageDirty(page))
+		return 0;
+
+	/* releasable() needs jnode lock, because it looks at the jnode fields
+	 * and we need jload_lock here to avoid races with jload(). */
+	LOCK_JNODE(node);
+	LOCK_JLOAD(node);
+	if (releasable(node)) {
+		struct address_space *mapping;
+
+		mapping = page->mapping;
+		INC_STAT(page, node, vm.release.ok);
+		jref(node);
+		/* there is no need to synchronize against
+		 * jnode_extent_write() here, because pages seen by
+		 * jnode_extent_write() are !releasable(). */
+		page_clear_jnode(page, node);
+		UNLOCK_JLOAD(node);
+		UNLOCK_JNODE(node);
+
+		/* we are under memory pressure so release jnode also. */
+		jput(node);
+		spin_lock(&mapping->page_lock);
+		/* shrink_list() + radix-tree: the page leaves the page cache
+		 * only if nobody else holds a reference; 1 is returned
+		 * either way */
+		if (page_count(page) == 2) {
+			__remove_from_page_cache(page);
+			__put_page(page);
+		}
+		spin_unlock(&mapping->page_lock);
+		return 1;
+	} else {
+		UNLOCK_JLOAD(node);
+		UNLOCK_JNODE(node);
+		assert("nikita-3020", schedulable());
+		return 0;
+	}
+}
+
+#undef INC_NSTAT
+#undef INC_STAT
+
+static void move_inode_out_from_sync_inodes_loop (struct address_space * mapping)
+{
+	/* work around infinite loop in pdflush->sync_sb_inodes. */
+	/* Problem: ->writepages() is supposed to submit io for the pages from
+	 * ->io_pages list and to clean this list. Here the dirtying time of
+	 * the mapping is refreshed and, under inode_lock, the inode is moved
+	 * to the s_dirty list of its super block. */
+	mapping->dirtied_when = jiffies|1; /* presumably "|1" keeps the stamp non-zero -- TODO confirm */
+	spin_lock(&inode_lock);
+	list_move(&mapping->host->i_list, &mapping->host->i_sb->s_dirty);
+	spin_unlock(&inode_lock);
+
+}
+
+/* reiser4 writepages() address space operation this captures anonymous pages
+   and anonymous jnodes. Anonymous pages are pages which are dirtied via
+   mmapping.
Anonymous jnodes are ones which were created by reiser4_writepage
+
+   Returns the result of the file plugin's ->capture() method, or 0 if the
+   plugin does not provide one. The inode is moved to the s_dirty list of its
+   super block in either case.
+ */
+reiser4_internal int
+reiser4_writepages(struct address_space *mapping,
+		   struct writeback_control *wbc)
+{
+	int ret = 0;
+	struct inode *inode;
+	file_plugin *fplug;
+
+	inode = mapping->host;
+	fplug = inode_file_plugin(inode);
+	if (fplug != NULL && fplug->capture != NULL)
+		/* call file plugin method to capture anonymous pages and
+		 * anonymous jnodes */
+		ret = fplug->capture(inode, wbc);
+
+	move_inode_out_from_sync_inodes_loop(mapping);
+	return ret;
+}
+
+/* start actual IO on @page: a thin wrapper around block_sync_page() that
+   always returns 0 */
+reiser4_internal int reiser4_start_up_io(struct page *page)
+{
+	block_sync_page(page);
+	return 0;
+}
+
+/*
+ * reiser4 methods for VM
+ */
+struct address_space_operations reiser4_as_operations = {
+	/* called during memory pressure by kswapd */
+	.writepage = reiser4_writepage,
+	/* called to read page from the storage when page is added into page
+	   cache. This is done by page-fault handler. */
+	.readpage = reiser4_readpage,
+	/* Start IO on page. This is called from wait_on_page_bit() and
+	   lock_page() and its purpose is to actually start io by jabbing
+	   device drivers. */
+	.sync_page = reiser4_start_up_io,
+	/* called from
+	 * reiser4_sync_inodes()->generic_sync_sb_inodes()->...->do_writepages()
+	 *
+	 * captures anonymous pages for given inode
+	 */
+	.writepages = reiser4_writepages,
+	/* marks page dirty. Note that this is never called by reiser4
+	 * directly. Reiser4 uses set_page_dirty_internal(). Reiser4 set page
+	 * dirty is called for pages dirtied through mmap and moves dirty page
+	 * to the special ->moved_list in its mapping. */
+	.set_page_dirty = reiser4_set_page_dirty,
+	/* called during read-ahead */
+	.readpages = reiser4_readpages,
+	.prepare_write = reiser4_prepare_write, /* loop back device driver and generic_file_write() call-back */
+	.commit_write = reiser4_commit_write, /* loop back device driver and generic_file_write() call-back */
+	/* map logical block number to disk block number. Used by FIBMAP ioctl
+	 * and ..bmap pseudo file. */
+	.bmap = reiser4_bmap,
+	/* called just before page is taken out from address space (on
+	   truncate, umount, or similar). */
+	.invalidatepage = reiser4_invalidatepage,
+	/* called when VM is about to take page from address space (due to
+	   memory pressure). */
+	.releasepage = reiser4_releasepage,
+	/* not yet implemented */
+	.direct_IO = NULL
+};
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/block_alloc.c linux-2.6.4-ck1/fs/reiser4/block_alloc.c
--- linux-2.6.4/fs/reiser4/block_alloc.c 1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/block_alloc.c 2004-03-11 22:45:15.174528088 +1100
@@ -0,0 +1,1204 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "debug.h"
+#include "dformat.h"
+#include "plugin/plugin.h"
+#include "txnmgr.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "super.h"
+#include "lib.h"
+
+#include /* for __u?? */
+#include /* for struct super_block */
+#include
+
+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
+
+/* We need to be able to reserve enough disk space to ensure that an atomic
+   operation will have enough disk space to flush (see flush.c and
+   http://namesys.com/v4/v4.html) and commit it once it is started.
+
+   In our design a call for reserving disk space may fail but not an actual
+   block allocation.
+ + All free blocks, already allocated blocks, and all kinds of reserved blocks + are counted in different per-fs block counters. + + A reiser4 super block's set of block counters currently is: + + free -- free blocks, + used -- already allocated blocks, + + grabbed -- initially reserved for performing an fs operation, those blocks + are taken from free blocks, then grabbed disk space leaks from grabbed + blocks counter to other counters like "fake allocated", "flush + reserved", "used", the rest of not used grabbed space is returned to + free space at the end of fs operation; + + fake allocated -- counts all nodes without real disk block numbers assigned, + we have separate accounting for formatted and unformatted + nodes (for easier debugging); + + flush reserved -- disk space needed for flushing and committing an atom. + Each dirty already allocated block could be written as a + part of atom's overwrite set or as a part of atom's + relocate set. In both case one additional block is needed, + it is used as a wandered block if we do overwrite or as a + new location for a relocated block. + + In addition, blocks in some states are counted on per-thread and per-atom + basis. A reiser4 context has a counter of blocks grabbed by this transaction + and the sb's grabbed blocks counter is a sum of grabbed blocks counter values + of each reiser4 context. Each reiser4 atom has a counter of "flush reserved" + blocks, which are reserved for flush processing and atom commit. */ + +/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate + number of blocks to grab for most expensive case of balancing when the leaf + node we insert new item to gets split and new leaf node is allocated. + + So, we need to grab blocks for + + 1) one block for possible dirtying the node we insert an item to. 
That block + would be used for node relocation at flush time or for allocating of a + wandered one, it depends what will be a result (what set, relocate or + overwrite the node gets assigned to) of the node processing by the flush + algorithm. + + 2) one block for either allocating a new node, or dirtying of right or left + clean neighbor, only one case may happen. + + VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current + node, and creation of new node. have I forgotten something? email me. + + These grabbed blocks are counted in both reiser4 context "grabbed blocks" + counter and in the fs-wide one (both ctx->grabbed_blocks and + sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is + decremented by 2. + + Suppose both two blocks were spent for dirtying of an already allocated clean + node (one block went from "grabbed" to "flush reserved") and for new block + allocating (one block went from "grabbed" to "fake allocated formatted"). + + Inserting of a child pointer to the parent node caused parent node to be + split, the balancing code takes care about this grabbing necessary space + immediately by calling reiser4_grab with BA_RESERVED flag set which means + "can use the 5% reserved disk space". + + At this moment insertion completes and grabbed blocks (if they were not used) + should be returned to the free space counter. + + However the atom life-cycle is not completed. The atom had one "flush + reserved" block added by our insertion and the new fake allocated node is + counted as a "fake allocated formatted" one. The atom has to be fully + processed by flush before commit. Suppose that the flush moved the first, + already allocated node to the atom's overwrite list, the new fake allocated + node, obviously, went into the atom relocate set. 
The reiser4 flush + allocates the new node using one unit from "fake allocated formatted" + counter, the log writer uses one from "flush reserved" for wandered block + allocation. + + And, it is not the end. When the wandered block is deallocated after the + atom gets fully played (see wander.c for term description), the disk space + occupied for it is returned to free blocks. */ + +/* BLOCK NUMBERS */ + +/* Any reiser4 node has a block number assigned to it. We use these numbers for + indexing in hash tables, so if a block has not yet been assigned a location + on disk we need to give it a temporary fake block number. + + Current implementation of reiser4 uses 64-bit integers for block numbers. We + use highest bit in 64-bit block number to distinguish fake and real block + numbers. So, only 63 bits may be used to addressing of real device + blocks. That "fake" block numbers space is divided into subspaces of fake + block numbers for data blocks and for shadow (working) bitmap blocks. + + Fake block numbers for data blocks are generated by a cyclic counter, which + gets incremented after each real block allocation. We assume that it is + impossible to overload this counter during one transaction life. */ + +/* Initialize a blocknr hint. */ +reiser4_internal void +blocknr_hint_init(reiser4_blocknr_hint * hint) +{ + xmemset(hint, 0, sizeof (reiser4_blocknr_hint)); +} + +/* Release any resources of a blocknr hint. */ +reiser4_internal void +blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG) +{ + /* No resources should be freed in current blocknr_hint implementation.*/ +} + +/* see above for explanation of fake block number. 
*/ +/* Audited by: green(2002.06.11) */ +reiser4_internal int +blocknr_is_fake(const reiser4_block_nr * da) +{ + /* The reason for not simply returning result of '&' operation is that + while return value is (possibly 32bit) int, the reiser4_block_nr is + at least 64 bits long, and high bit (which is the only possible + non zero bit after the masking) would be stripped off */ + return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0; +} + +/* Static functions for / block counters + arithmetic. Mostly, they are isolated to not to code same assertions in + several places. */ +static void +sub_from_ctx_grabbed(reiser4_context *ctx, __u64 count) +{ + assert("zam-527", ctx->grabbed_blocks >= count); + ctx->grabbed_blocks -= count; +} + + +static void +sub_from_sb_grabbed(reiser4_super_info_data *sbinfo, __u64 count) +{ + assert("zam-525", sbinfo->blocks_grabbed >= count); + sbinfo->blocks_grabbed -= count; +} + +/* Decrease the counter of block reserved for flush in super block. */ +static void +sub_from_sb_flush_reserved (reiser4_super_info_data *sbinfo, __u64 count) +{ + assert ("vpf-291", sbinfo->blocks_flush_reserved >= count); + sbinfo->blocks_flush_reserved -= count; +} + +static void +add_to_sb_fake_allocated(reiser4_super_info_data *sbinfo, __u64 count, reiser4_ba_flags_t flags) +{ + if (flags & BA_FORMATTED) { + sbinfo->blocks_fake_allocated += count; + } else { + sbinfo->blocks_fake_allocated_unformatted += count; + } +} + +static void +sub_from_sb_fake_allocated(reiser4_super_info_data *sbinfo, __u64 count, reiser4_ba_flags_t flags) +{ + if (flags & BA_FORMATTED) { + assert("zam-806", sbinfo->blocks_fake_allocated >= count); + sbinfo->blocks_fake_allocated -= count; + } else { + assert("zam-528", sbinfo->blocks_fake_allocated_unformatted >= count); + sbinfo->blocks_fake_allocated_unformatted -= count; + } +} + +static void +sub_from_sb_used(reiser4_super_info_data *sbinfo, __u64 count) +{ + assert("zam-530", sbinfo->blocks_used >= count + 
sbinfo->min_blocks_used); + sbinfo->blocks_used -= count; +} + +static void +sub_from_cluster_reserved(reiser4_super_info_data *sbinfo, __u64 count) +{ + assert("edward-501", sbinfo->blocks_clustered >= count); + sbinfo->blocks_clustered -= count; +} + +/* Increase the counter of block reserved for flush in atom. */ +static void +add_to_atom_flush_reserved_nolock (txn_atom * atom, __u32 count) +{ + assert ("zam-772", atom != NULL); + assert ("zam-773", spin_atom_is_locked (atom)); + atom->flush_reserved += count; +} + +/* Decrease the counter of block reserved for flush in atom. */ +static void +sub_from_atom_flush_reserved_nolock (txn_atom * atom, __u32 count) +{ + assert ("zam-774", atom != NULL); + assert ("zam-775", spin_atom_is_locked (atom)); + assert ("nikita-2790", atom->flush_reserved >= count); + atom->flush_reserved -= count; +} + +/* super block has 6 counters: free, used, grabbed, fake allocated + (formatted and unformatted) and flush reserved. Their sum must be + number of blocks on a device. 
This function checks this */ +reiser4_internal int +check_block_counters(const struct super_block *super) +{ + __u64 sum; + + sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) + + reiser4_data_blocks(super) + reiser4_fake_allocated(super) + + reiser4_fake_allocated_unformatted(super) + flush_reserved(super) + + reiser4_clustered_blocks(super); + if (reiser4_block_count(super) != sum) { + printk("super block counters: " + "used %llu, free %llu, " + "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), " + "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n", + reiser4_data_blocks(super), + reiser4_free_blocks(super), + reiser4_grabbed_blocks(super), + reiser4_fake_allocated(super), + reiser4_fake_allocated_unformatted(super), + flush_reserved(super), + reiser4_clustered_blocks(super), + sum, reiser4_block_count(super)); + return 0; + } + return 1; +} + +#if REISER4_DEBUG_OUTPUT +reiser4_internal void +print_block_counters(const char *prefix, + const struct super_block *super, txn_atom *atom) +{ + if (super == NULL) + super = reiser4_get_current_sb(); + printk("%s:\tsuper: G: %llu, F: %llu, D: %llu, U: %llu + %llu, R: %llu, C: %llu, T: %llu\n", + prefix, + reiser4_grabbed_blocks(super), + reiser4_free_blocks(super), + reiser4_data_blocks(super), + reiser4_fake_allocated(super), + reiser4_fake_allocated_unformatted(super), + flush_reserved(super), + reiser4_clustered_blocks(super), + reiser4_block_count(super)); + printk("\tcontext: G: %llu", + get_current_context()->grabbed_blocks); + if (atom == NULL) + atom = get_current_atom_locked_nocheck(); + if (atom != NULL) { + printk("\tatom: R: %llu", atom->flush_reserved); + UNLOCK_ATOM(atom); + } + printk("\n"); +} +#endif + +/* Adjust "working" free blocks counter for number of blocks we are going to + allocate. Record number of grabbed blocks in fs-wide and per-thread + counters. 
This function should be called before bitmap scanning or + allocating fake block numbers + + @super -- pointer to reiser4 super block; + @count -- number of blocks we reserve; + + @return -- 0 if success, -ENOSPC, if all + free blocks are preserved or already allocated. +*/ + +static int +reiser4_grab(reiser4_context *ctx, __u64 count, reiser4_ba_flags_t flags) +{ + __u64 free_blocks; + int ret = 0, use_reserved = flags & BA_RESERVED; + reiser4_super_info_data *sbinfo; + + assert("vs-1276", ctx == get_current_context()); + + sbinfo = get_super_private(ctx->super); + + reiser4_spin_lock_sb(sbinfo); + + free_blocks = sbinfo->blocks_free; + + ON_TRACE(TRACE_ALLOC, "reiser4_grab: free_blocks %llu\n", free_blocks); + + if ((use_reserved && free_blocks < count) || + (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) { + ret = RETERR(-ENOSPC); + + ON_TRACE(TRACE_ALLOC, "reiser4_grab: ENOSPC: count %llu\n", count); + + goto unlock_and_ret; + } + + ctx->grabbed_blocks += count; + + sbinfo->blocks_grabbed += count; + sbinfo->blocks_free -= count; + +#if REISER4_DEBUG + ctx->grabbed_initially = count; + fill_backtrace(&ctx->grabbed_at, REISER4_BACKTRACE_DEPTH, 0); +#endif + + assert("nikita-2986", check_block_counters(ctx->super)); + + ON_TRACE(TRACE_ALLOC, "%s: grabbed %llu, free blocks left %llu\n", + __FUNCTION__, count, reiser4_free_blocks (ctx->super)); + + /* disable grab space in current context */ + ctx->grab_enabled = 0; + +unlock_and_ret: + reiser4_spin_unlock_sb(sbinfo); + + return ret; +} + +reiser4_internal int +reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags) +{ + int ret; + reiser4_context *ctx; + + assert("nikita-2964", ergo(flags & BA_CAN_COMMIT, + lock_stack_isclean(get_current_lock_stack()))); + ON_TRACE(TRACE_RESERVE, "grab_space: %llu block(s).", count); + + ctx = get_current_context(); + if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) { + ON_TRACE(TRACE_RESERVE, "grab disabled and not forced!\n"); + return 0; + } + + ret = 
reiser4_grab(ctx, count, flags); + if (ret == -ENOSPC) { + + /* Trying to commit the all transactions if BA_CAN_COMMIT flag present */ + if (flags & BA_CAN_COMMIT) { + + ON_TRACE(TRACE_RESERVE, "force commit!.."); + + txnmgr_force_commit_all(ctx->super, 0); + + ctx->grab_enabled = 1; + ret = reiser4_grab(ctx, count, flags); + } + } + ON_TRACE(TRACE_RESERVE, "%s(%d)\n", (ret == 0) ? "ok" : "failed", ret); + /* + * allocation from reserved pool cannot fail. This is severe error. + */ + assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0)); + return ret; +} + +/* + * SPACE RESERVED FOR UNLINK/TRUNCATE + * + * Unlink and truncate require space in transaction (to update stat data, at + * least). But we don't want rm(1) to fail with "No space on device" error. + * + * Solution is to reserve 5% of disk space for truncates and + * unlinks. Specifically, normal space grabbing requests don't grab space from + * reserved area. Only requests with BA_RESERVED bit in flags are allowed to + * drain it. Per super block delete_sema semaphore is used to allow only one + * thread at a time to grab from reserved area. + * + * Grabbing from reserved area should always be performed with BA_CAN_COMMIT + * flag. + * + */ + +reiser4_internal int reiser4_grab_reserved(struct super_block *super, + __u64 count, reiser4_ba_flags_t flags) +{ + reiser4_super_info_data *sbinfo = get_super_private(super); + + assert("nikita-3175", flags & BA_CAN_COMMIT); + + /* Check the delete semaphore already taken by us, we assume that + * reading of machine word is atomic. 
*/ + if (sbinfo->delete_sema_owner == current) { + if (reiser4_grab_space(count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) { + warning("zam-1003", "nested call of grab_reserved fails count=(%llu)", + (unsigned long long)count); + return RETERR(-ENOSPC); + } + return 0; + } + + if (reiser4_grab_space(count, flags)) { + down(&sbinfo->delete_sema); + assert("nikita-2929", sbinfo->delete_sema_owner == NULL); + sbinfo->delete_sema_owner = current; + + if (reiser4_grab_space(count, flags | BA_RESERVED)) { + warning("zam-833", + "reserved space is not enough (%llu)", (unsigned long long)count); + return RETERR(-ENOSPC); + } + } + return 0; +} + +reiser4_internal void +reiser4_release_reserved(struct super_block *super) +{ + reiser4_super_info_data *info; + + info = get_super_private(super); + if (info->delete_sema_owner == current) { + info->delete_sema_owner = NULL; + up(&info->delete_sema); + } +} + +static reiser4_super_info_data * +grabbed2fake_allocated_head(void) +{ + reiser4_context *ctx; + reiser4_super_info_data *sbinfo; + + ctx = get_current_context(); + sub_from_ctx_grabbed(ctx, 1); + + sbinfo = get_super_private(ctx->super); + reiser4_spin_lock_sb(sbinfo); + + sub_from_sb_grabbed(sbinfo, 1); + /* return sbinfo locked */ + return sbinfo; +} + +/* is called after @count fake block numbers are allocated and pointer to + those blocks are inserted into tree. 
*/
+static void
+grabbed2fake_allocated_formatted(void)
+{
+	/* comes back with the super block spin lock held */
+	reiser4_super_info_data *sbinfo = grabbed2fake_allocated_head();
+
+	sbinfo->blocks_fake_allocated += 1;
+
+	assert("vs-922", check_block_counters(reiser4_get_current_sb()));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* Same as above, for an unformatted node: one grabbed block of the current
+   context becomes "fake allocated unformatted". */
+static void
+grabbed2fake_allocated_unformatted(void)
+{
+	/* comes back with the super block spin lock held */
+	reiser4_super_info_data *sbinfo = grabbed2fake_allocated_head();
+
+	sbinfo->blocks_fake_allocated_unformatted += 1;
+
+	assert("vs-9221", check_block_counters(reiser4_get_current_sb()));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* Move @count blocks grabbed by the current context to the fs-wide
+   "clustered" counter. */
+reiser4_internal void
+grabbed2cluster_reserved(int count)
+{
+	reiser4_context *ctx = get_current_context();
+	reiser4_super_info_data *sbinfo = get_super_private(ctx->super);
+
+	/* the per-context counter needs no super block lock */
+	sub_from_ctx_grabbed(ctx, count);
+
+	reiser4_spin_lock_sb(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_clustered += count;
+
+	assert("edward-504", check_block_counters(ctx->super));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* Reverse of grabbed2cluster_reserved(): move @count "clustered" blocks back
+   to the "grabbed" counters of the super block and of the current context. */
+reiser4_internal void
+cluster_reserved2grabbed(int count)
+{
+	reiser4_context *ctx = get_current_context();
+	reiser4_super_info_data *sbinfo = get_super_private(ctx->super);
+
+	reiser4_spin_lock_sb(sbinfo);
+
+	sub_from_cluster_reserved(sbinfo, count);
+	sbinfo->blocks_grabbed += count;
+
+	assert("edward-505", check_block_counters(ctx->super));
+
+	reiser4_spin_unlock_sb(sbinfo);
+
+	/* the per-context counter is updated outside of the sb lock */
+	ctx->grabbed_blocks += count;
+}
+
+/* Return @count "clustered" blocks to the free space counter. The current
+   context is asserted to have no grabbed blocks. */
+reiser4_internal void
+cluster_reserved2free(int count)
+{
+	reiser4_context *ctx = get_current_context();
+	reiser4_super_info_data *sbinfo;
+
+	assert("edward-503", ctx->grabbed_blocks == 0);
+
+	sbinfo = get_super_private(ctx->super);
+	reiser4_spin_lock_sb(sbinfo);
+
+	sub_from_cluster_reserved(sbinfo, count);
+	sbinfo->blocks_free += count;
+
+	assert("edward-502", check_block_counters(ctx->super));
+
+	reiser4_spin_unlock_sb(sbinfo);
+}
+
+/* protects fake_gen */
+static spinlock_t fake_lock = SPIN_LOCK_UNLOCKED;
+static reiser4_block_nr fake_gen = 0; + +/* obtain a block number for new formatted node which will be used to refer + to this newly allocated node until real allocation is done */ +static inline void assign_fake_blocknr(reiser4_block_nr *blocknr) +{ + spin_lock(&fake_lock); + *blocknr = fake_gen++; + spin_unlock(&fake_lock); + + *blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK; + *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE; + assert("zam-394", zlook(current_tree, blocknr) == NULL); +} + +reiser4_internal int +assign_fake_blocknr_formatted(reiser4_block_nr *blocknr) +{ + ON_TRACE(TRACE_RESERVE, "assign_fake_blocknr_formatted: moving 1 grabbed block to fake allocated formatted\n"); + + assign_fake_blocknr(blocknr); + grabbed2fake_allocated_formatted(); + + return 0; +} + +/* return fake blocknr which will be used for unformatted nodes */ +reiser4_internal reiser4_block_nr +fake_blocknr_unformatted(void) +{ + reiser4_block_nr blocknr; + + ON_TRACE(TRACE_RESERVE, "fake_blocknr_unformatted: moving 1 grabbed block to fake allocated unformatted\n"); + + assign_fake_blocknr(&blocknr); + grabbed2fake_allocated_unformatted(); + + return blocknr; +} + + +/* adjust sb block counters, if real (on-disk) block allocation immediately + follows grabbing of free disk space. 
*/ +static void +grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo, __u64 count) +{ + sub_from_ctx_grabbed(ctx, count); + + reiser4_spin_lock_sb(sbinfo); + + sub_from_sb_grabbed(sbinfo, count); + sbinfo->blocks_used += count; + + assert("nikita-2679", check_block_counters(ctx->super)); + + reiser4_spin_unlock_sb(sbinfo); +} + +/* adjust sb block counters when @count unallocated blocks get mapped to disk */ +static void +fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count, reiser4_ba_flags_t flags) +{ + reiser4_spin_lock_sb(sbinfo); + + sub_from_sb_fake_allocated(sbinfo, count, flags); + sbinfo->blocks_used += count; + + assert("nikita-2680", check_block_counters(reiser4_get_current_sb())); + + reiser4_spin_unlock_sb(sbinfo); +} + +static void +flush_reserved2used(txn_atom * atom, __u64 count) +{ + reiser4_super_info_data *sbinfo; + + assert("zam-787", atom != NULL); + assert("zam-788", spin_atom_is_locked(atom)); + + sub_from_atom_flush_reserved_nolock(atom, (__u32)count); + + sbinfo = get_current_super_private(); + reiser4_spin_lock_sb(sbinfo); + + sub_from_sb_flush_reserved(sbinfo, count); + sbinfo->blocks_used += count; + + assert ("zam-789", check_block_counters(reiser4_get_current_sb())); + + reiser4_spin_unlock_sb(sbinfo); +} + +/* update the per fs blocknr hint default value. */ +reiser4_internal void +update_blocknr_hint_default (const struct super_block *s, const reiser4_block_nr * block) +{ + reiser4_super_info_data *sbinfo = get_super_private(s); + + assert("nikita-3342", !blocknr_is_fake(block)); + + reiser4_spin_lock_sb(sbinfo); + if (*block < sbinfo->block_count) { + sbinfo->blocknr_hint_default = *block; + } else { + warning("zam-676", + "block number %llu is too large to be used in a blocknr hint\n", (unsigned long long) *block); + dump_stack(); + DEBUGON(1); + } + reiser4_spin_unlock_sb(sbinfo); +} + +/* get current value of the default blocknr hint. 
*/ +reiser4_internal void get_blocknr_hint_default(reiser4_block_nr * result) +{ + reiser4_super_info_data * sbinfo = get_current_super_private(); + + reiser4_spin_lock_sb(sbinfo); + *result = sbinfo->blocknr_hint_default; + assert("zam-677", *result < sbinfo->block_count); + reiser4_spin_unlock_sb(sbinfo); +} + +/* Allocate "real" disk blocks by calling a proper space allocation plugin + * method. Blocks are allocated in one contiguous disk region. The plugin + * independent part accounts blocks by subtracting allocated amount from grabbed + * or fake block counter and add the same amount to the counter of allocated + * blocks. + * + * @hint -- a reiser4 blocknr hint object which contains further block + * allocation hints and parameters (search start, a stage of block + * which will be mapped to disk, etc.), + * @blk -- an out parameter for the beginning of the allocated region, + * @len -- in/out parameter, it should contain the maximum number of allocated + * blocks, after block allocation completes, it contains the length of + * allocated disk region. + * @flags -- see reiser4_ba_flags_t description. + * + * @return -- 0 if success, error code otherwise. + */ +reiser4_internal int +reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk, + reiser4_block_nr * len, reiser4_ba_flags_t flags) +{ + __u64 needed = *len; + reiser4_context *ctx; + reiser4_super_info_data *sbinfo; + int ret; + + assert ("zam-986", hint != NULL); + + ctx = get_current_context(); + sbinfo = get_super_private(ctx->super); + + ON_TRACE(TRACE_RESERVE, "reiser4_alloc_blocks: needed %llu..", needed); + + assert("vpf-339", hint != NULL); + + ON_TRACE(TRACE_ALLOC, + "alloc_blocks: requested %llu, search from %llu\n", + (unsigned long long) *len, (unsigned long long) (hint ? hint->blk : ~0ull)); + + /* For write-optimized data we use default search start value, which is + * close to last write location. 
*/ + if (flags & BA_USE_DEFAULT_SEARCH_START) { + reiser4_stat_inc(block_alloc.nohint); + get_blocknr_hint_default(&hint->blk); + } + + /* VITALY: allocator should grab this for internal/tx-lists/similar only. */ +/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */ + if (hint->block_stage == BLOCK_NOT_COUNTED) { + ret = reiser4_grab_space_force(*len, flags); + if (ret != 0) + return ret; + } + + ret = sa_alloc_blocks(get_space_allocator(ctx->super), hint, (int) needed, blk, len); + + if (!ret) { + assert("zam-680", *blk < reiser4_block_count(ctx->super)); + assert("zam-681", *blk + *len <= reiser4_block_count(ctx->super)); + + if (flags & BA_PERMANENT) { + /* we assume that current atom exists at this moment */ + txn_atom * atom = get_current_atom_locked (); + atom -> nr_blocks_allocated += *len; + UNLOCK_ATOM (atom); + } + + switch (hint->block_stage) { + case BLOCK_NOT_COUNTED: + case BLOCK_GRABBED: + ON_TRACE(TRACE_RESERVE, "ok. %llu blocks grabbed to used.\n", *len); + grabbed2used(ctx, sbinfo, *len); + break; + case BLOCK_UNALLOCATED: + ON_TRACE(TRACE_RESERVE, "ok. %llu blocks fake allocated to used.\n", *len); + fake_allocated2used(sbinfo, *len, flags); + break; + case BLOCK_FLUSH_RESERVED: + ON_TRACE(TRACE_RESERVE, "ok. 
%llu flush reserved to used (get wandered?)\n", *len); + { + txn_atom * atom = get_current_atom_locked (); + flush_reserved2used(atom, *len); + UNLOCK_ATOM (atom); + } + break; + default: + impossible("zam-531", "wrong block stage"); + } + } else { + assert ("zam-821", ergo(hint->max_dist == 0 && !hint->backward, ret != -ENOSPC)); + if (hint->block_stage == BLOCK_NOT_COUNTED) + grabbed2free(ctx, sbinfo, needed); + } + + return ret; +} + +/* used -> fake_allocated -> grabbed -> free */ + +/* adjust sb block counters when @count unallocated blocks get unmapped from + disk */ +static void +used2fake_allocated(reiser4_super_info_data *sbinfo, __u64 count, reiser4_ba_flags_t flags) +{ + reiser4_spin_lock_sb(sbinfo); + + add_to_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED); + + sub_from_sb_used(sbinfo, count); + + assert("nikita-2681", check_block_counters(reiser4_get_current_sb())); + + reiser4_spin_unlock_sb(sbinfo); +} + +static void +used2flush_reserved(reiser4_super_info_data *sbinfo, txn_atom * atom, __u64 count, + reiser4_ba_flags_t flags UNUSED_ARG) +{ + assert("nikita-2791", atom != NULL); + assert("nikita-2792", spin_atom_is_locked(atom)); + + add_to_atom_flush_reserved_nolock(atom, (__u32)count); + + reiser4_spin_lock_sb(sbinfo); + + sbinfo->blocks_flush_reserved += count; + /*add_to_sb_flush_reserved(sbinfo, count);*/ + sub_from_sb_used(sbinfo, count); + + assert("nikita-2681", check_block_counters(reiser4_get_current_sb())); + + reiser4_spin_unlock_sb(sbinfo); +} + +/* disk space, virtually used by fake block numbers is counted as "grabbed" again. 
*/ +static void +fake_allocated2grabbed(reiser4_context *ctx, reiser4_super_info_data *sbinfo, __u64 count, reiser4_ba_flags_t flags) +{ + ctx->grabbed_blocks += count; + + reiser4_spin_lock_sb(sbinfo); + + assert("nikita-2682", check_block_counters(ctx->super)); + + sbinfo->blocks_grabbed += count; + sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED); + + assert("nikita-2683", check_block_counters(ctx->super)); + + reiser4_spin_unlock_sb(sbinfo); +} + +reiser4_internal void +fake_allocated2free(__u64 count, reiser4_ba_flags_t flags) +{ + reiser4_context *ctx; + reiser4_super_info_data *sbinfo; + + ctx = get_current_context(); + sbinfo = get_super_private(ctx->super); + + ON_TRACE(TRACE_RESERVE, "fake_allocated2free %llu blocks\n", count); + + fake_allocated2grabbed(ctx, sbinfo, count, flags); + grabbed2free(ctx, sbinfo, count); +} + +reiser4_internal void grabbed2free_mark(int mark) +{ + reiser4_context *ctx; + reiser4_super_info_data *sbinfo; + + ctx = get_current_context(); + sbinfo = get_super_private(ctx->super); + + assert("nikita-3007", mark >= 0); + assert("nikita-3006", + ctx->grabbed_blocks >= (__u64)mark); + grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark); +} + +/* Adjust free blocks count for blocks which were reserved but were not used. 
*/ +reiser4_internal void +grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo, + __u64 count) +{ + ON_TRACE(TRACE_RESERVE, "grabbed2free: %llu\n", count); + + sub_from_ctx_grabbed(ctx, count); + + + reiser4_spin_lock_sb(sbinfo); + + sub_from_sb_grabbed(sbinfo, count); + sbinfo->blocks_free += count; + assert("nikita-2684", check_block_counters(ctx->super)); + + reiser4_spin_unlock_sb(sbinfo); +} + +reiser4_internal void +grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count) +{ + reiser4_context *ctx; + reiser4_super_info_data *sbinfo; + + assert("vs-1095", atom); + + ctx = get_current_context(); + sbinfo = get_super_private(ctx->super); + + sub_from_ctx_grabbed(ctx, count); + + add_to_atom_flush_reserved_nolock(atom, count); + + reiser4_spin_lock_sb(sbinfo); + + sbinfo->blocks_flush_reserved += count; + sub_from_sb_grabbed(sbinfo, count); + + assert ("vpf-292", check_block_counters(ctx->super)); + + ON_TRACE(TRACE_RESERVE, "__grabbed2flush_reserved_nolock %llu blocks: atom %u has %llu flush reserved blocks\n", + count, atom->atom_id, atom->flush_reserved); + + reiser4_spin_unlock_sb(sbinfo); +} + +reiser4_internal void +grabbed2flush_reserved(__u64 count) +{ + txn_atom * atom = get_current_atom_locked (); + + ON_TRACE(TRACE_RESERVE, "__grabbed2flush_reserved\n"); + + grabbed2flush_reserved_nolock (atom, count); + + UNLOCK_ATOM (atom); +} + +reiser4_internal void flush_reserved2grabbed(txn_atom * atom, __u64 count) +{ + reiser4_context *ctx; + reiser4_super_info_data *sbinfo; + + assert("nikita-2788", atom != NULL); + assert("nikita-2789", spin_atom_is_locked(atom)); + + ctx = get_current_context(); + sbinfo = get_super_private(ctx->super); + + ctx->grabbed_blocks += count; + + sub_from_atom_flush_reserved_nolock(atom, (__u32)count); + + reiser4_spin_lock_sb(sbinfo); + + sbinfo->blocks_grabbed += count; + sub_from_sb_flush_reserved(sbinfo, count); + + assert ("vpf-292", check_block_counters (ctx->super)); + + reiser4_spin_unlock_sb (sbinfo); +} 
+ +/* release all blocks grabbed in context which were not used. */ +reiser4_internal void +all_grabbed2free(void) +{ + reiser4_context *ctx = get_current_context(); + + grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks); +} + +/* adjust sb block counters if real (on-disk) blocks do not become unallocated + after freeing, @count blocks become "grabbed". */ +static void +used2grabbed(reiser4_context *ctx, reiser4_super_info_data *sbinfo, __u64 count) +{ + ctx->grabbed_blocks += count; + + reiser4_spin_lock_sb(sbinfo); + + sbinfo->blocks_grabbed += count; + sub_from_sb_used(sbinfo, count); + + assert("nikita-2685", check_block_counters(ctx->super)); + + reiser4_spin_unlock_sb(sbinfo); +} + +/* this used to be done through used2grabbed and grabbed2free*/ +static void +used2free(reiser4_super_info_data *sbinfo, __u64 count) +{ + reiser4_spin_lock_sb(sbinfo); + + sbinfo->blocks_free += count; + sub_from_sb_used(sbinfo, count); + + assert("nikita-2685", check_block_counters(reiser4_get_current_sb())); + + reiser4_spin_unlock_sb(sbinfo); +} + +#if REISER4_DEBUG + +/* check "allocated" state of given block range */ +void +reiser4_check_blocks(const reiser4_block_nr * start, const reiser4_block_nr * len, int desired) +{ + sa_check_blocks(start, len, desired); +} + +/* check "allocated" state of given block */ +void +reiser4_check_block(const reiser4_block_nr * block, int desired) +{ + const reiser4_block_nr one = 1; + + reiser4_check_blocks(block, &one, desired); +} + +#endif + +/* Blocks deallocation function may do an actual deallocation through space + plugin allocation or store deleted block numbers in atom's delete_set data + structure, depending on whether the BA_DEFER flag is set. */ + +/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which + will be deleted from WORKING bitmap.
They might be just unmapped from disk, or + freed but disk space is still grabbed by current thread, or these blocks must + not be counted in any reiser4 sb block counters, see block_stage_t comment */ + +/* BA_FORMATTED bit is only used when BA_DEFER in not present: it is used to + distinguish blocks allocated for unformatted and formatted nodes */ + +reiser4_internal int +reiser4_dealloc_blocks(const reiser4_block_nr * start, + const reiser4_block_nr * len, + block_stage_t target_stage, reiser4_ba_flags_t flags) +{ + txn_atom *atom = NULL; + int ret; + reiser4_context *ctx; + reiser4_super_info_data *sbinfo; + + ON_TRACE(TRACE_RESERVE, "reiser4_dealloc_blocks: %llu blocks", *len); + + ctx = get_current_context(); + sbinfo = get_super_private(ctx->super); + + if (REISER4_DEBUG) { + assert("zam-431", *len != 0); + assert("zam-432", *start != 0); + assert("zam-558", !blocknr_is_fake(start)); + + reiser4_spin_lock_sb(sbinfo); + assert("zam-562", *start < sbinfo->block_count); + reiser4_spin_unlock_sb(sbinfo); + } + + if (flags & BA_DEFER) { + blocknr_set_entry *bsep = NULL; + + ON_TRACE(TRACE_RESERVE, "put on delete set\n"); + + /* storing deleted block numbers in a blocknr set + datastructure for further actual deletion */ + do { + atom = get_current_atom_locked(); + assert("zam-430", atom != NULL); + + ret = blocknr_set_add_extent(atom, &atom->delete_set, &bsep, start, len); + + if (ret == -ENOMEM) + return ret; + + /* This loop might spin at most two times */ + } while (ret == -E_REPEAT); + + assert("zam-477", ret == 0); + assert("zam-433", atom != NULL); + + UNLOCK_ATOM(atom); + + } else { + assert("zam-425", get_current_super_private() != NULL); + sa_dealloc_blocks(get_space_allocator(ctx->super), *start, *len); + + if (flags & BA_PERMANENT) { + /* These blocks were counted as allocated, we have to revert it + * back if allocation is discarded. 
*/ + txn_atom * atom = get_current_atom_locked (); + atom->nr_blocks_allocated -= *len; + UNLOCK_ATOM (atom); + } + + switch (target_stage) { + case BLOCK_NOT_COUNTED: + assert("vs-960", flags & BA_FORMATTED); + + ON_TRACE(TRACE_RESERVE, "moved from used to free\n"); + + /* VITALY: This is what was grabbed for internal/tx-lists/similar only */ + used2free(sbinfo, *len); + break; + + case BLOCK_GRABBED: + + ON_TRACE(TRACE_RESERVE, "moved from used to grabbed\n"); + + used2grabbed(ctx, sbinfo, *len); + break; + + case BLOCK_UNALLOCATED: + + ON_TRACE(TRACE_RESERVE, "moved from used to fake allocated\n"); + + used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED); + break; + + case BLOCK_FLUSH_RESERVED: { + txn_atom *atom; + + ON_TRACE(TRACE_RESERVE, "moved from used to flush reserved\n"); + + atom = get_current_atom_locked(); + used2flush_reserved(sbinfo, atom, *len, flags & BA_FORMATTED); + UNLOCK_ATOM(atom); + break; + } + default: + impossible("zam-532", "wrong block stage"); + } + } + + return 0; +} + +reiser4_internal int +reiser4_dealloc_block(const reiser4_block_nr * block, + block_stage_t stage, reiser4_ba_flags_t flags) +{ + const reiser4_block_nr one = 1; + return reiser4_dealloc_blocks(block, &one, stage, flags | BA_FORMATTED); +} + +/* wrappers for block allocator plugin methods */ +reiser4_internal int +pre_commit_hook(void) +{ + assert("zam-502", get_current_super_private() != NULL); + sa_pre_commit_hook(); + return 0; +} + +/* an actor which applies delete set to block allocator data */ +static int +apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a, const reiser4_block_nr * b, void *data UNUSED_ARG) +{ + reiser4_context *ctx; + reiser4_super_info_data *sbinfo; + + __u64 len = 1; + + ctx = get_current_context(); + sbinfo = get_super_private(ctx->super); + + assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT); + assert("zam-552", sbinfo != NULL); + + if (b != NULL) + len = *b; + + if (REISER4_DEBUG) { + reiser4_spin_lock_sb(sbinfo); + + 
assert("zam-554", *a < reiser4_block_count(ctx->super)); + assert("zam-555", *a + len <= reiser4_block_count(ctx->super)); + + reiser4_spin_unlock_sb(sbinfo); + } + + sa_dealloc_blocks(&sbinfo->space_allocator, *a, len); + /* adjust sb block counters */ + used2free(sbinfo, len); + return 0; +} + +reiser4_internal void +post_commit_hook(void) +{ + txn_atom *atom; + + atom = get_current_atom_locked(); + assert("zam-452", atom->stage == ASTAGE_POST_COMMIT); + UNLOCK_ATOM(atom); + + /* do the block deallocation which was deferred + until commit is done */ + blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1); + + assert("zam-504", get_current_super_private() != NULL); + sa_post_commit_hook(); +} + +reiser4_internal void +post_write_back_hook(void) +{ + assert("zam-504", get_current_super_private() != NULL); + + sa_post_commit_hook(); +} + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/block_alloc.h linux-2.6.4-ck1/fs/reiser4/block_alloc.h --- linux-2.6.4/fs/reiser4/block_alloc.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/block_alloc.h 2004-03-11 22:45:15.174528088 +1100 @@ -0,0 +1,174 @@ +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#if !defined (__FS_REISER4_BLOCK_ALLOC_H__) +#define __FS_REISER4_BLOCK_ALLOC_H__ + +#include "dformat.h" +#include "forward.h" + +#include /* for __u?? 
*/ +#include + +/* Mask when is applied to given block number shows is that block number is a fake one */ +#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL +/* Mask which isolates a type of object this fake block number was assigned to */ +#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL + +/*result after applying the REISER4_BLOCKNR_STATUS_BIT_MASK should be compared + against these two values to understand is the object unallocated or bitmap + shadow object (WORKING BITMAP block, look at the plugin/space/bitmap.c) */ +#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL +#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL + +/* specification how block allocation was counted in sb block counters */ +typedef enum { + BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */ + BLOCK_GRABBED = 1, /* free space grabbed for further allocation + of this block */ + BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */ + BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object + ( unallocated formatted or unformatted + node) */ + BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block + number assigned */ +} block_stage_t; + +/* a hint for block allocator */ +struct reiser4_blocknr_hint { + /* FIXME: I think we want to add a longterm lock on the bitmap block here. This + is to prevent jnode_flush() calls from interleaving allocations on the same + bitmap, once a hint is established. */ + + /* search start hint */ + reiser4_block_nr blk; + /* if not zero, it is a region size we search for free blocks in */ + reiser4_block_nr max_dist; + /* level for allocation, may be useful have branch-level and higher + write-optimized. 
*/ + tree_level level; + /* block allocator assumes that blocks, which will be mapped to disk, + are in this specified block_stage */ + block_stage_t block_stage; + /* If backward is set, allocate blocks in backward direction from the end + * of disk to the beginning of disk. */ + int backward:1; + +}; + +/* These flags control block allocation/deallocation behavior */ +enum reiser4_ba_flags { + /* do allocations from reserved (5%) area */ + BA_RESERVED = (1 << 0), + + /* block allocator can do commit trying to recover free space */ + BA_CAN_COMMIT = (1 << 1), + + /* if operation will be applied to formatted block */ + BA_FORMATTED = (1 << 2), + + /* defer actual block freeing until transaction commit */ + BA_DEFER = (1 << 3), + + /* allocate blocks for permanent fs objects (formatted or unformatted), not + wandered or log blocks */ + BA_PERMANENT = (1 << 4), + + /* grab space even if it was disabled */ + BA_FORCE = (1 << 5), + + /* use default start value for free blocks search. */ + BA_USE_DEFAULT_SEARCH_START = (1 << 6) +}; + +typedef enum reiser4_ba_flags reiser4_ba_flags_t; + +extern void blocknr_hint_init(reiser4_blocknr_hint * hint); +extern void blocknr_hint_done(reiser4_blocknr_hint * hint); +extern void update_blocknr_hint_default(const struct super_block *, const reiser4_block_nr *); +extern void get_blocknr_hint_default(reiser4_block_nr *); + +extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block * super); + +int assign_fake_blocknr_formatted(reiser4_block_nr *); +reiser4_block_nr fake_blocknr_unformatted(void); + + +/* free -> grabbed -> fake_allocated -> used */ + + +int reiser4_grab_space (__u64 count, reiser4_ba_flags_t flags); +void all_grabbed2free (void); +void grabbed2free (reiser4_context *, + reiser4_super_info_data *, __u64 count); +void fake_allocated2free (__u64 count, reiser4_ba_flags_t flags); +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count); +void grabbed2flush_reserved (__u64 count); +int reiser4_alloc_blocks
(reiser4_blocknr_hint * hint, + reiser4_block_nr * start, + reiser4_block_nr * len, + reiser4_ba_flags_t flags); +int reiser4_dealloc_blocks (const reiser4_block_nr *, + const reiser4_block_nr *, + block_stage_t, reiser4_ba_flags_t flags); +int reiser4_dealloc_block (const reiser4_block_nr *, + block_stage_t, reiser4_ba_flags_t flags); + +#define reiser4_grab_space_force(count, flags) \ + reiser4_grab_space(count, flags | BA_FORCE) + +extern void grabbed2free_mark(int mark); +extern int reiser4_grab_reserved(struct super_block *, + __u64, reiser4_ba_flags_t); +extern void reiser4_release_reserved(struct super_block *super); + +/* grabbed -> fake_allocated */ + +/* fake_allocated -> used */ + +/* used -> fake_allocated -> grabbed -> free */ + +extern void flush_reserved2grabbed(txn_atom * atom, __u64 count); + +extern int blocknr_is_fake(const reiser4_block_nr * da); + +extern void grabbed2cluster_reserved(int count); +extern void cluster_reserved2grabbed(int count); +extern void cluster_reserved2free(int count); + +extern int check_block_counters(const struct super_block *); + +#if REISER4_DEBUG + +extern void reiser4_check_blocks(const reiser4_block_nr *, const reiser4_block_nr *, int); +extern void reiser4_check_block(const reiser4_block_nr *, int); + +#else + +# define reiser4_check_blocks(beg, len, val) noop +# define reiser4_check_block(beg, val) noop + +#endif + +#if REISER4_DEBUG_OUTPUT +extern void print_block_counters(const char *, + const struct super_block *, + txn_atom *atom); +#else +#define print_block_counters(p, s, a) noop +#endif + +extern int pre_commit_hook(void); +extern void post_commit_hook(void); +extern void post_write_back_hook(void); + +#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */ + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/blocknrset.c linux-2.6.4-ck1/fs/reiser4/blocknrset.c --- linux-2.6.4/fs/reiser4/blocknrset.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/blocknrset.c 2004-03-11 22:45:15.175527933 +1100 @@ -0,0 +1,365 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* This file contains code for various block number sets used by the atom to + track the deleted set and wandered block mappings. */ + +#include "debug.h" +#include "dformat.h" +#include "type_safe_list.h" +#include "txnmgr.h" + +#include + +/* The proposed data structure for storing unordered block number sets is a + list of elements, each of which contains an array of block number or/and + array of block number pairs. That element called blocknr_set_entry is used + to store block numbers from the beginning and for extents from the end of + the data field (char data[...]). The ->nr_blocks and ->nr_pairs fields + count numbers of blocks and extents. + + +------------------- blocknr_set_entry->data ------------------+ + |block1|block2| ... ... |pair3|pair2|pair1| + +------------------------------------------------------------+ + + When current blocknr_set_entry is full, allocate a new one. */ + +/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete + * set (single blocks and block extents), in that case blocknr pair represent an + * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs + * there represent a (real block) -> (wandered block) mapping. */ + +typedef struct blocknr_pair blocknr_pair; + +/* The total size of a blocknr_set_entry. */ +#define BLOCKNR_SET_ENTRY_SIZE 128 + +/* The number of blocks that can fit the blocknr data area. 
*/ +#define BLOCKNR_SET_ENTRIES_NUMBER \ + ((BLOCKNR_SET_ENTRY_SIZE - \ + 2 * sizeof (unsigned) - \ + sizeof (blocknr_set_list_link)) / \ + sizeof (reiser4_block_nr)) + +/* An entry of the blocknr_set */ +struct blocknr_set_entry { + unsigned nr_singles; + unsigned nr_pairs; + blocknr_set_list_link link; + reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER]; +}; + +/* A pair of blocks as recorded in the blocknr_set_entry data. */ +struct blocknr_pair { + reiser4_block_nr a; + reiser4_block_nr b; +}; + +/* The list definition. */ +TYPE_SAFE_LIST_DEFINE(blocknr_set, blocknr_set_entry, link); + +/* Return the number of blocknr slots available in a blocknr_set_entry. */ +/* Audited by: green(2002.06.11) */ +static unsigned +bse_avail(blocknr_set_entry * bse) +{ + unsigned used = bse->nr_singles + 2 * bse->nr_pairs; + + assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used); + cassert(sizeof (blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE); + + return BLOCKNR_SET_ENTRIES_NUMBER - used; +} + +/* Initialize a blocknr_set_entry. */ +/* Audited by: green(2002.06.11) */ +static void +bse_init(blocknr_set_entry * bse) +{ + bse->nr_singles = 0; + bse->nr_pairs = 0; + blocknr_set_list_clean(bse); +} + +/* Allocate and initialize a blocknr_set_entry. */ +/* Audited by: green(2002.06.11) */ +static blocknr_set_entry * +bse_alloc(void) +{ + blocknr_set_entry *e; + + if ((e = (blocknr_set_entry *) kmalloc(sizeof (blocknr_set_entry), GFP_KERNEL)) == NULL) { + return NULL; + } + + bse_init(e); + + return e; +} + +/* Free a blocknr_set_entry. 
*/ +/* Audited by: green(2002.06.11) */ +static void +bse_free(blocknr_set_entry * bse) +{ + kfree(bse); +} + +/* Add a block number to a blocknr_set_entry */ +/* Audited by: green(2002.06.11) */ +static void +bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block) +{ + assert("jmacd-5099", bse_avail(bse) >= 1); + + bse->entries[bse->nr_singles++] = *block; +} + +/* Get a pair of block numbers */ +/* Audited by: green(2002.06.11) */ +static inline blocknr_pair * +bse_get_pair(blocknr_set_entry * bse, unsigned pno) +{ + assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1)); + + return (blocknr_pair *) (bse->entries + BLOCKNR_SET_ENTRIES_NUMBER - 2 * (pno + 1)); +} + +/* Add a pair of block numbers to a blocknr_set_entry */ +/* Audited by: green(2002.06.11) */ +static void +bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a, const reiser4_block_nr * b) +{ + blocknr_pair *pair; + + assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL); + + pair = bse_get_pair(bse, bse->nr_pairs++); + + pair->a = *a; + pair->b = *b; +} + +/* Add either a block or pair of blocks to the block number set. The first + blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if + @b is non-NULL a pair is added. The block number set belongs to atom, and + the call is made with the atom lock held. There may not be enough space in + the current blocknr_set_entry. If new_bsep points to a non-NULL + blocknr_set_entry then it will be added to the blocknr_set and new_bsep + will be set to NULL. If new_bsep contains NULL then the atom lock will be + released and a new bse will be allocated in new_bsep. E_REPEAT will be + returned with the atom unlocked for the operation to be tried again. If + the operation succeeds, 0 is returned. If new_bsep is non-NULL and not + used during the call, it will be freed automatically. 
*/ +/* Audited by: green(2002.06.11) */ +static int +blocknr_set_add(txn_atom * atom, + blocknr_set * bset, + blocknr_set_entry ** new_bsep, const reiser4_block_nr * a, const reiser4_block_nr * b) +{ + blocknr_set_entry *bse; + unsigned entries_needed; + + assert("jmacd-5101", a != NULL); + + entries_needed = (b == NULL) ? 1 : 2; + if (blocknr_set_list_empty(&bset->entries) || bse_avail(blocknr_set_list_front(&bset->entries)) + < entries_needed) { + /* See if a bse was previously allocated. */ + if (*new_bsep == NULL) { + UNLOCK_ATOM(atom); + *new_bsep = bse_alloc(); + return (*new_bsep != NULL) ? -E_REPEAT : RETERR(-ENOMEM); + } + + /* Put it on the head of the list. */ + blocknr_set_list_push_front(&bset->entries, *new_bsep); + + *new_bsep = NULL; + } + + /* Add the single or pair. */ + bse = blocknr_set_list_front(&bset->entries); + if (b == NULL) { + bse_put_single(bse, a); + } else { + bse_put_pair(bse, a, b); + } + + /* If new_bsep is non-NULL then there was an allocation race, free this copy. */ + if (*new_bsep != NULL) { + bse_free(*new_bsep); + *new_bsep = NULL; + } + + return 0; +} + +/* Add an extent to the block set. If the length is 1, it is treated as a + single block (e.g., reiser4_set_add_block). */ +/* Audited by: green(2002.06.11) */ +/* Auditor note: Entire call chain cannot hold any spinlocks, because + kmalloc might schedule. The only exception is atom spinlock, which is + properly freed. */ +reiser4_internal int +blocknr_set_add_extent(txn_atom * atom, + blocknr_set * bset, + blocknr_set_entry ** new_bsep, const reiser4_block_nr * start, const reiser4_block_nr * len) +{ + assert("jmacd-5102", start != NULL && len != NULL && *len > 0); + return blocknr_set_add(atom, bset, new_bsep, start, *len == 1 ? NULL : len); +} + +/* Add a block pair to the block set. 
It adds exactly a pair, which is checked + * by an assertion that both arguments are not null.*/ +/* Audited by: green(2002.06.11) */ +/* Auditor note: Entire call chain cannot hold any spinlocks, because + kmalloc might schedule. The only exception is atom spinlock, which is + properly freed. */ +reiser4_internal int +blocknr_set_add_pair(txn_atom * atom, + blocknr_set * bset, + blocknr_set_entry ** new_bsep, const reiser4_block_nr * a, const reiser4_block_nr * b) +{ + assert("jmacd-5103", a != NULL && b != NULL); + return blocknr_set_add(atom, bset, new_bsep, a, b); +} + +/* Initialize a blocknr_set. */ +/* Audited by: green(2002.06.11) */ +reiser4_internal void +blocknr_set_init(blocknr_set * bset) +{ + blocknr_set_list_init(&bset->entries); +} + +/* Release the entries of a blocknr_set. */ +/* Audited by: green(2002.06.11) */ +reiser4_internal void +blocknr_set_destroy(blocknr_set * bset) +{ + while (!blocknr_set_list_empty(&bset->entries)) { + bse_free(blocknr_set_list_pop_front(&bset->entries)); + } +} + +/* Merge blocknr_set entries out of @from into @into. */ +/* Audited by: green(2002.06.11) */ +/* Auditor comments: This merge does not know if merged sets contain + blocks pairs (As for wandered sets) or extents, so it cannot really merge + overlapping ranges if there is some. So I believe it may lead to + some blocks being presented several times in one blocknr_set. To help + debugging such problems it might help to check for duplicate entries on + actual processing of this set. Testing this kind of stuff right here is + also complicated by the fact that these sets are not sorted and going + through whole set on each element addition is going to be CPU-heavy task */ +reiser4_internal void +blocknr_set_merge(blocknr_set * from, blocknr_set * into) +{ + blocknr_set_entry *bse_into = NULL; + + /* If @from is empty, no work to perform. */ + if (blocknr_set_list_empty(&from->entries)) { + return; + } + + /* If @into is not empty, try merging partial-entries. 
*/ + if (!blocknr_set_list_empty(&into->entries)) { + + /* Neither set is empty, pop the front to members and try to combine them. */ + blocknr_set_entry *bse_from; + unsigned into_avail; + + bse_into = blocknr_set_list_pop_front(&into->entries); + bse_from = blocknr_set_list_pop_front(&from->entries); + + /* Combine singles. */ + for (into_avail = bse_avail(bse_into); into_avail != 0 && bse_from->nr_singles != 0; into_avail -= 1) { + bse_put_single(bse_into, &bse_from->entries[--bse_from->nr_singles]); + } + + /* Combine pairs. */ + for (; into_avail > 1 && bse_from->nr_pairs != 0; into_avail -= 2) { + blocknr_pair *pair = bse_get_pair(bse_from, --bse_from->nr_pairs); + bse_put_pair(bse_into, &pair->a, &pair->b); + } + + /* If bse_from is empty, delete it now. */ + if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) { + bse_free(bse_from); + } else { + /* Otherwise, bse_into is full or nearly full (e.g., + it could have one slot avail and bse_from has one + pair left). Push it back onto the list. bse_from + becomes bse_into, which will be the new partial. */ + blocknr_set_list_push_front(&into->entries, bse_into); + bse_into = bse_from; + } + } + + /* Splice lists together. */ + blocknr_set_list_splice(&into->entries, &from->entries); + + /* Add the partial entry back to the head of the list. */ + if (bse_into != NULL) { + blocknr_set_list_push_front(&into->entries, bse_into); + } +} + +/* Iterate over all blocknr set elements. 
*/ +reiser4_internal int +blocknr_set_iterator(txn_atom * atom, blocknr_set * bset, blocknr_set_actor_f actor, void *data, int delete) +{ + + blocknr_set_entry *entry; + + assert("zam-429", atom != NULL); + assert("zam-430", atom_is_protected(atom)); + assert("zam-431", bset != 0); + assert("zam-432", actor != NULL); + + entry = blocknr_set_list_front(&bset->entries); + while (!blocknr_set_list_end(&bset->entries, entry)) { + blocknr_set_entry *tmp = blocknr_set_list_next(entry); + unsigned int i; + int ret; + + for (i = 0; i < entry->nr_singles; i++) { + ret = actor(atom, &entry->entries[i], NULL, data); + + /* We can't break a loop if delete flag is set. */ + if (ret != 0 && !delete) + return ret; + } + + for (i = 0; i < entry->nr_pairs; i++) { + struct blocknr_pair *ab; + + ab = bse_get_pair(entry, i); + + ret = actor(atom, &ab->a, &ab->b, data); + + if (ret != 0 && !delete) + return ret; + } + + if (delete) { + blocknr_set_list_remove(entry); + bse_free(entry); + } + + entry = tmp; + } + + return 0; +} + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/bufmgr/wander.txt linux-2.6.4-ck1/fs/reiser4/bufmgr/wander.txt --- linux-2.6.4/fs/reiser4/bufmgr/wander.txt 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/bufmgr/wander.txt 2004-03-11 22:45:15.176527777 +1100 @@ -0,0 +1,184 @@ + +Before discussing the format of the commit record occupying the +journal area, we must revisit the topic of free space bitmap +management. At the time an atom is closing and formatting its commit +record, the question is how to deallocate the blocks deleted by the +atom. Those blocks become free once the atom commits, but they cannot +be re-allocated before that point in time. 
+ +Modified bitmaps are always part of the overwrite set, meaning copies +are written to wandered positions (i.e., part of the log) before later +being overwritten. + +We have defined these terms: + +WORKING BITMAPS: the "current" in-memory bitmaps + +COMMIT BITMAPS: bitmap copies written to wandered, overwrite positions + +DELETE SET: the set of deleted blocks plus the set of former positions +of relocated blocks. These block positions are deallocated when the +atom commits. + +WANDERED SET: the set of temporary locations used to store overwrite +blocks before they are actually overwritten. These block positions +are deallocated some time after the atom commits, when it is ensured +that the atom will no longer replay during crash recovery. + +Both the delete set and the wandered set are blocks to be deleted, but +the details of handling these deletions are necessarily different. + +---- Consider first the handling of the DELETE SET. + +There are two ways to handle the delete set. Before reading their +descriptions, let me offer my opinion. The first is MORE complicated +but requires LESS data to be logged in the commit record. The second +is LESS complicated but requires MORE data to be logged in the commit +record. + +Strategy #1: MORE COMPLICATED, LESS LOGGED DATA + + At the time an atom closes, it creates a snapshot of all the + modified bitmaps. In other words, it creates commit bitmaps which + are copies of the working bitmaps. The delete set is immediately + deallocated in the commit bitmaps, which are written to their + wandered positions and later overwritten in their actual positions. + + This way, the commit record does not contain any record of the + delete set. + + But there are problems with this approach, too. First, there is + extra memory pressure associated with maintaining extra copies of + modified bitmaps. Second, it is less straightforward than it may + appear at first.
Suppose there are two atoms that commit in + sequence, such that the first does not complete its commit (i.e., + finish all the required writes) before the second prepares to + commit. Which bitmaps does the second committing atom copy as its + commit bitmaps? It does not just copy the working bitmaps, since + those do not yet represent the first atom deallocations. + + Instead, it looks like we would end up maintaining multiple copies + of every bitmap. Each atom's commit bitmaps are the commit bitmaps + of the previous atom plus whatever modifications were made by the + atom itself. This means in addition to maintaining the working + bitmaps, we end up maintaining separate commit bitmaps. It is not + just as simple as copying the working bitmaps at the time of commit. + + This solution looks far too complicated to me. I admit that I have + not fully tried to understand the complexity, but I do not think the + advantages (smaller commit records) will outweigh the additional + complexity, not to mention the additional memory pressure. + +Strategy #2: LESS COMPLICATED, MORE LOGGED DATA + + In this solution, the commit bitmaps are the same as the working + bitmaps--no copies are made. We commit the working bitmaps without + deallocating the delete set and we include the delete set in the + commit record instead. + + Before I describe exactly how deallocation works in this case, let + me add that there is another reason why this method is preferred. + The wandered set has to be deleted after the atom commits, since it + does not become available until the atom will no longer be + replayed. With this approach to freeing the delete set, both kinds + of deletion can be handled in the same manner, since they both take + place after the atom commits. + + In other words, since we have to address deallocating the wandered + set after commit anyway, we might as well use the same mechanism for + deallocating the delete set. 
It means that additional data is + logged, but it reduces complexity in my opinion. + + Here's how it works. The atom stores a record of its delete set in + memory. When a block is deallocated or relocated, the bit is of + course not immediately deallocated in the working bitmaps. + + The delete set is included in the commit record, which is written to + the journal area. The delete set is just a set of block numbers, so + there are several possible representations. The implementation + could actually dynamically choose the representation to achieve the + best compression: (a) list of blocks, (b) bitmap, and (c) extent + compression. The last two options are likely to achieve + significant compression of the delete set unless fragmentation + becomes a problem. + + The atom maintains its in-memory copy of the delete set until the + commit record is flushed to the disk. At this point, those blocks + become available for new atoms to re-allocate. The atom releases + these blocks back into the working bitmaps through the process of + "repossession". The repossession process makes a younger atom + responsible for committing a deallocation from a previous atom. + + For each block in the committed atom's delete set, a younger atom is + selected (or created) to handle the deallocation of that block. The + working bitmap corresponding to the block being deleted is or was + already captured by the younger (repossessing) atom. The block is + simply marked as deallocated in the working bitmap block captured. + + The repossessing atom may immediately use this block or not, but in + either case the deallocation is committed once the repossessing atom + commits. For recovery purposes (not discussed here), each atom also + includes a list of atoms for which it repossesses. + +---- The commit record + +The commit record includes three lists: + + DELETE SET: The set of blocks deallocated by this atom, represented + as either a list, bitmap, or using extents.
+ + WANDER SET: A list of block-pairs giving the original location and + the temporary wandered location. During replay the temporary + location is copied to the original location. After replay is no + longer needed, the temporary locations are deallocated using + repossession as previously described. + + REPOSSESSES FOR SET: A list of the previous atoms for which this atom + repossesses deallocated blocks. This is used to know which atoms' + deallocations must be replayed during crash recovery. + +I propose that all of this information is included in the commit +record, which is written to the journal area. There may be multiple +journal areas (a significant complication) or there may not, but the +key point is that all of this data is written into a reserved, +cyclical journal area. Because the journal area is reserved and +written in a simple cyclical manner, there are no allocation decisions +needed to find space for these commit records. + +---- The example + +Consider a roughly 50G file being modified in a 100G file system. +Realize that due to maintaining the preserve set, it is not possible +to transactionally write a file larger than 50G on a 100G file system. +In the absolute worst case, no extent compression is possible and the +best representation of the delete set requires a bitmap covering the +entire file system. + +A 100G file system with 4K blocks has 3.27MB of bitmaps, and this is +the same as the worst-case representation of the delete set, assuming +just about every other block is deleted. In reality, we expect the +delete set to be much smaller because extent-compression would achieve +significant savings. + +The wander set could possibly be compressed, but that is a more +difficult task. Suppose we attempt to overwrite the entire 50GB file +instead of relocating it. A 50G file has 13 million blocks, therefore +the wander set requires storing 13 million block-address pairs (26 million addresses).
With +8-byte block addresses that requires writing 210MB of wander set +data. Ouch! + +We should hope that the size of the wander set does not grow so large. +After all, its parent, the extent record, must be modified in this case, +so these blocks are all candidates for relocation. It would take a +dumb allocate/flush plugin to try to overwrite a 50G file instead of +relocating it. + +---- The conclusion + +I maintain that it is much simpler to write all of this data inside +reserved log areas. It is possible that we could write this data +outside the log, but then it will complicate the allocation and +deallocation procedure, since space for the log itself must then be +allocated using ordinary methods. + +Comments? diff -Naurp linux-2.6.4/fs/reiser4/carry.c linux-2.6.4-ck1/fs/reiser4/carry.c --- linux-2.6.4/fs/reiser4/carry.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/carry.c 2004-03-11 22:45:15.183526689 +1100 @@ -0,0 +1,1444 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ +/* Functions to "carry" tree modification(s) upward. */ +/* Tree is modified one level at a time. As we modify a level we accumulate a + set of changes that need to be propagated to the next level. We manage + node locking such that any searches that collide with carrying are + restarted, from the root if necessary. + + Insertion of a new item may result in items being moved among nodes and + this requires the delimiting key to be updated at the least common parent + of the nodes modified to preserve search tree invariants. Also, insertion + may require allocation of a new node. A pointer to the new node has to be + inserted into some node on the parent level, etc. + + Tree carrying is meant to be analogous to arithmetic carrying. + + A carry operation is always associated with some node (&carry_node). + + Carry process starts with some initial set of operations to be performed + and an initial set of already locked nodes.
Operations are performed one + by one. Performing each single operation has the following possible effects: + + - content of carry node associated with operation is modified + - new carry nodes are locked and involved in the carry process on this level + - new carry operations are posted to the next level + + After all carry operations on this level are done, process is repeated for + the accumulated sequence of carry operations for the next level. This + starts by trying to lock (in left to right order) all carry nodes + associated with carry operations on the parent level. After this, we decide + whether more nodes are required on the left of the already locked set. If so, + all locks taken on the parent level are released, new carry nodes are + added, and locking process repeats. + + It may happen that balancing process fails owing to unrecoverable error on + some of the upper levels of a tree (possible causes are io error, failure to + allocate new node, etc.). In this case we should unmount the filesystem, + rebooting if it is the root, and possibly advise the use of fsck. + + USAGE: + + + int some_tree_operation( znode *node, ... ) + { + // Allocate a pool of carry objects on the stack: operations and nodes. + // Most carry processes will only take objects from here, without + // dynamic allocation. + +I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but....
-Hans + + carry_pool pool; + carry_level lowest_level; + carry_op *op; + + init_carry_pool( &pool ); + init_carry_level( &lowest_level, &pool ); + + // operation may be one of: + // COP_INSERT --- insert new item into node + // COP_CUT --- remove part of or whole node + // COP_PASTE --- increase size of item + // COP_DELETE --- delete pointer from parent node + // COP_UPDATE --- update delimiting key in least + // common ancestor of two + // COP_MODIFY --- update parent to reflect changes in + // the child + + op = post_carry( &lowest_level, operation, node, 0 ); + if( IS_ERR( op ) || ( op == NULL ) ) { + handle error + } else { + // fill in remaining fields in @op, according to carry.h:carry_op + result = carry( &lowest_level, NULL ); + } + done_carry_pool( &pool ); + } + + When you are implementing node plugin method that participates in carry + (shifting, insertion, deletion, etc.), do the following: + + int foo_node_method( znode *node, ..., carry_level *todo ) + { + carry_op *op; + + .... + + // note that the last argument to node_post_carry() is non-zero + // here, because @op is to be applied to the parent of @node, rather + // than to the @node itself as in the previous case. + + op = node_post_carry( todo, operation, node, 1 ); + // fill in remaining fields in @op, according to carry.h:carry_op + + .... + + } + + BATCHING: + + One of the main advantages of level-by-level balancing implemented here is + ability to batch updates on a parent level and to perform them more + efficiently as a result. + + Description To Be Done (TBD). + + DIFFICULTIES AND SUBTLE POINTS: + + 1. complex plumbing is required, because: + + a. effective allocation through pools is needed + + b. target of operation is not exactly known when operation is + posted. This is worked around through bitfields in &carry_node and + logic in lock_carry_node() + + c.
of interaction with locking code: node should be added into sibling + list when pointer to it is inserted into its parent, which is some time + after node was created. Between these moments, node is somewhat in + suspended state and is only registered in the carry lists + + 2. whole balancing logic is implemented here, in particular, insertion + logic is coded in make_space(). + + 3. special cases like insertion (add_tree_root()) or deletion + (kill_tree_root()) of tree root and morphing of paste into insert + (insert_paste()) have to be handled. + + 4. there is non-trivial interdependency between allocation of new nodes + and almost everything else. This is mainly due to the (1.c) above. I shall + write about this later. + +*/ + +#include "forward.h" +#include "debug.h" +#include "key.h" +#include "coord.h" +#include "plugin/item/item.h" +#include "plugin/item/extent.h" +#include "plugin/node/node.h" +#include "jnode.h" +#include "znode.h" +#include "tree_mod.h" +#include "tree_walk.h" +#include "block_alloc.h" +#include "pool.h" +#include "tree.h" +#include "carry.h" +#include "carry_ops.h" +#include "super.h" +#include "reiser4.h" +#include "prof.h" + +#include + +/* level locking/unlocking */ +static int lock_carry_level(carry_level * level); +static void unlock_carry_level(carry_level * level, int failure); +static void done_carry_level(carry_level * level); +static void unlock_carry_node(carry_level * level, carry_node * node, int fail); + +int lock_carry_node(carry_level * level, carry_node * node); +int lock_carry_node_tail(carry_node * node); + +/* carry processing proper */ +static int carry_on_level(carry_level * doing, carry_level * todo); + +/* handlers for carry operations. 
*/ + +static void fatal_carry_error(carry_level * doing, int ecode); +static int add_new_root(carry_level * level, carry_node * node, znode * fake); + +static int carry_estimate_reserve(carry_level * level); + +#if REISER4_DEBUG +typedef enum { + CARRY_TODO, + CARRY_DOING +} carry_queue_state; +static int carry_level_invariant(carry_level * level, carry_queue_state state); +#endif + +/* main entry point for tree balancing. + + Tree carry performs operations from @doing and while doing so accumulates + information about operations to be performed on the next level ("carried" + to the parent level). Carried operations are performed, causing possibly + more operations to be carried upward etc. carry() takes care about + locking and pinning znodes while operating on them. + + For usage, see comment at the top of fs/reiser4/carry.c + +*/ +reiser4_internal int +carry(carry_level * doing /* set of carry operations to be performed */ , + carry_level * done /* set of nodes, already performed at the + * previous level. NULL in most cases */ ) +{ + int result = 0; + carry_level done_area; + carry_level todo_area; + /* queue of new requests */ + carry_level *todo; + int wasreserved; + int reserve; + ON_DEBUG(STORE_COUNTERS;) + + assert("nikita-888", doing != NULL); + + trace_stamp(TRACE_CARRY); + + todo = &todo_area; + init_carry_level(todo, doing->pool); + if (done == NULL) { + /* queue of requests performed on the previous level */ + done = &done_area; + init_carry_level(done, doing->pool); + } + + wasreserved = perthread_pages_count(); + reserve = carry_estimate_reserve(doing); + result = perthread_pages_reserve(reserve, GFP_KERNEL); + if (result != 0) + return result; + + /* iterate until there is nothing more to do */ + while (result == 0 && carry_op_num(doing) > 0) { + carry_level *tmp; + + ON_STATS(todo->level_no = doing->level_no + 1); + + /* at this point @done is locked. 
*/ + /* repeat lock/do/unlock while + + (1) lock_carry_level() fails due to deadlock avoidance, or + + (2) carry_on_level() decides that more nodes have to + be involved. + + (3) some unexpected error occured while balancing on the + upper levels. In this case all changes are rolled back. + + */ + while (1) { + result = lock_carry_level(doing); + if (result == 0) { + /* perform operations from @doing and + accumulate new requests in @todo */ + result = carry_on_level(doing, todo); + if (result == 0) + break; + else if (result != -E_REPEAT || + !doing->restartable) { + warning("nikita-1043", + "Fatal error during carry: %i", + result); + print_level("done", done); + print_level("doing", doing); + print_level("todo", todo); + /* do some rough stuff like aborting + all pending transcrashes and thus + pushing tree back to the consistent + state. Alternatvely, just panic. + */ + fatal_carry_error(doing, result); + return result; + } + } else if (result != -E_REPEAT) { + fatal_carry_error(doing, result); + return result; + } + reiser4_stat_level_inc(doing, carry_restart); + unlock_carry_level(doing, 1); + } + /* at this point @done can be safely unlocked */ + done_carry_level(done); + reiser4_stat_level_inc(doing, carry_done); + /* cyclically shift queues */ + tmp = done; + done = doing; + doing = todo; + todo = tmp; + init_carry_level(todo, doing->pool); + + /* give other threads chance to run */ + preempt_point(); + } + done_carry_level(done); + + assert("nikita-3460", perthread_pages_count() - wasreserved >= 0); + perthread_pages_release(perthread_pages_count() - wasreserved); + + /* all counters, but x_refs should remain the same. x_refs can change + owing to transaction manager */ + ON_DEBUG(CHECK_COUNTERS;) + return result; +} + +/* perform carry operations on given level. + + Optimizations proposed by pooh: + + (1) don't lock all nodes from queue at the same time. 
Lock nodes lazily as + required; + + (2) unlock node if there are no more operations to be performed upon it and + node didn't add any operation to @todo. This can be implemented by + attaching to each node two counters: counter of operaions working on this + node and counter and operations carried upward from this node. + +*/ +static int +carry_on_level(carry_level * doing /* queue of carry operations to + * do on this level */ , + carry_level * todo /* queue where new carry + * operations to be performed on + * the * parent level are + * accumulated during @doing + * processing. */ ) +{ + int result; + int (*f) (carry_op *, carry_level *, carry_level *); + carry_op *op; + carry_op *tmp_op; + + assert("nikita-1034", doing != NULL); + assert("nikita-1035", todo != NULL); + + trace_stamp(TRACE_CARRY); + + /* node can be inconsistent while in-transit */ + DISABLE_NODE_CHECK; + + /* @doing->nodes are locked. */ + + /* This function can be split into two phases: analysis and modification. + + Analysis calculates precisely what items should be moved between + nodes. This information is gathered in some structures attached to + each carry_node in a @doing queue. Analysis also determines whether + new nodes are to be allocated etc. + + After analysis is completed, actual modification is performed. Here + we can take advantage of "batch modification": if there are several + operations acting on the same node, modifications can be performed + more efficiently when batched together. + + Above is an optimization left for the future. + */ + /* Important, but delayed optimization: it's possible to batch + operations together and perform them more efficiently as a + result. For example, deletion of several neighboring items from a + node can be converted to a single ->cut() operation. + + Before processing queue, it should be scanned and "mergeable" + operations merged. 
+ */ + result = 0; + for_all_ops(doing, op, tmp_op) { + carry_opcode opcode; + + assert("nikita-1041", op != NULL); + opcode = op->op; + assert("nikita-1042", op->op < COP_LAST_OP); + f = op_dispatch_table[op->op].handler; + result = f(op, doing, todo); + /* locking can fail with -E_REPEAT. Any different error is fatal + and will be handled by fatal_carry_error() sledgehammer. + */ + if (result != 0) + break; + } + if (result == 0) { + carry_plugin_info info; + carry_node *scan; + carry_node *tmp_scan; + + info.doing = doing; + info.todo = todo; + + assert("nikita-3002", carry_level_invariant(doing, CARRY_DOING)); + for_all_nodes(doing, scan, tmp_scan) { + znode *node; + + node = carry_real(scan); + assert("nikita-2547", node != NULL); + if (node_is_empty(node)) { + result = node_plugin_by_node(node)->prepare_removal(node, &info); + if (result != 0) + break; + } + } + } + ENABLE_NODE_CHECK; + return result; +} + +/* post carry operation + + This is main function used by external carry clients: node layout plugins + and tree operations to create new carry operation to be performed on some + level. + + New operation will be included in the @level queue. To actually perform it, + call carry( level, ... ). This function takes write lock on @node. Carry + manages all its locks by itself, don't worry about this. + + This function adds operation and node at the end of the queue. It is up to + caller to guarantee proper ordering of node queue. + +*/ +reiser4_internal carry_op * +post_carry(carry_level * level /* queue where new operation is to + * be posted at */ , + carry_opcode op /* opcode of operation */ , + znode * node /* node on which this operation + * will operate */ , + int apply_to_parent_p /* whether operation will operate + * directly on @node or on it + * parent. 
*/ ) +{ + carry_op *result; + carry_node *child; + + assert("nikita-1046", level != NULL); + assert("nikita-1788", znode_is_write_locked(node)); + + result = add_op(level, POOLO_LAST, NULL); + if (IS_ERR(result)) + return result; + child = add_carry(level, POOLO_LAST, NULL); + if (IS_ERR(child)) { + reiser4_pool_free(&level->pool->op_pool, &result->header); + return (carry_op *) child; + } + result->node = child; + result->op = op; + child->parent = apply_to_parent_p; + if (ZF_ISSET(node, JNODE_ORPHAN)) + child->left_before = 1; + child->node = node; + return result; +} + +/* number of carry operations in a @level */ +reiser4_internal int +carry_op_num(const carry_level * level) +{ + return level->ops_num; +} + +/* initialise carry queue */ +reiser4_internal void +init_carry_level(carry_level * level /* level to initialise */ , + carry_pool * pool /* pool @level will allocate objects + * from */ ) +{ + assert("nikita-1045", level != NULL); + assert("nikita-967", pool != NULL); + + xmemset(level, 0, sizeof *level); + level->pool = pool; + + pool_level_list_init(&level->nodes); + pool_level_list_init(&level->ops); +} + +/* initialise pools within queue */ +reiser4_internal void +init_carry_pool(carry_pool * pool /* pool to initialise */ ) +{ + assert("nikita-945", pool != NULL); + + reiser4_init_pool(&pool->op_pool, sizeof (carry_op), CARRIES_POOL_SIZE, (char *) pool->op); + reiser4_init_pool(&pool->node_pool, sizeof (carry_node), NODES_LOCKED_POOL_SIZE, (char *) pool->node); +} + +/* finish with queue pools */ +reiser4_internal void +done_carry_pool(carry_pool * pool UNUSED_ARG /* pool to destroy */ ) +{ + reiser4_done_pool(&pool->op_pool); + reiser4_done_pool(&pool->node_pool); +} + +/* add new carry node to the @level. + + Returns pointer to the new carry node allocated from pool. It's up to + callers to maintain proper order in the @level. 
Assumption is that if carry + nodes on one level are already sorted and modifications are peroformed from + left to right, carry nodes added on the parent level will be ordered + automatically. To control ordering use @order and @reference parameters. + +*/ +reiser4_internal carry_node * +add_carry_skip(carry_level * level /* &carry_level to add node + * to */ , + pool_ordering order /* where to insert: at the + * beginning of @level, + * before @reference, after + * @reference, at the end + * of @level */ , + carry_node * reference /* reference node for + * insertion */ ) +{ + ON_DEBUG(carry_node * orig_ref = reference); + + trace_stamp(TRACE_CARRY); + if (order == POOLO_BEFORE) { + reference = find_left_carry(reference, level); + if (reference == NULL) + reference = carry_node_front(level); + else + reference = carry_node_next(reference); + } else if (order == POOLO_AFTER) { + reference = find_right_carry(reference, level); + if (reference == NULL) + reference = carry_node_back(level); + else + reference = carry_node_prev(reference); + } + assert("nikita-2209", + ergo(orig_ref != NULL, + carry_real(reference) == carry_real(orig_ref))); + return add_carry(level, order, reference); +} + +reiser4_internal carry_node * +add_carry(carry_level * level /* &carry_level to add node + * to */ , + pool_ordering order /* where to insert: at the + * beginning of @level, before + * @reference, after @reference, + * at the end of @level */ , + carry_node * reference /* reference node for + * insertion */ ) +{ + carry_node *result; + + result = (carry_node *) add_obj(&level->pool->node_pool, &level->nodes, order, &reference->header); + if (!IS_ERR(result) && (result != NULL)) + ++level->nodes_num; + return result; +} + +/* add new carry operation to the @level. + + Returns pointer to the new carry operations allocated from pool. It's up to + callers to maintain proper order in the @level. To control ordering use + @order and @reference parameters. 
+ +*/ +reiser4_internal carry_op * +add_op(carry_level * level /* &carry_level to add node to */ , + pool_ordering order /* where to insert: at the beginning of + * @level, before @reference, after + * @reference, at the end of @level */ , + carry_op * reference /* reference node for insertion */ ) +{ + carry_op *result; + + trace_stamp(TRACE_CARRY); + result = (carry_op *) add_obj(&level->pool->op_pool, &level->ops, order, &reference->header); + if (!IS_ERR(result) && (result != NULL)) + ++level->ops_num; + return result; +} + +/* Return node on the right of which @node was created. + + Each node is created on the right of some existing node (or it is new root, + which is special case not handled here). + + @node is new node created on some level, but not yet inserted into its + parent, it has corresponding bit (JNODE_ORPHAN) set in zstate. + +*/ +reiser4_internal carry_node * +find_begetting_brother(carry_node * node /* node to start search + * from */ , + carry_level * kin UNUSED_ARG /* level to + * scan */ ) +{ + carry_node *scan; + + assert("nikita-1614", node != NULL); + assert("nikita-1615", kin != NULL); + assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree)); + assert("nikita-1619", ergo(carry_real(node) != NULL, + ZF_ISSET(carry_real(node), JNODE_ORPHAN))); + + for (scan = node;; scan = carry_node_prev(scan)) { + assert("nikita-1617", !carry_node_end(kin, scan)); + if ((scan->node != node->node) && !ZF_ISSET(scan->node, JNODE_ORPHAN)) { + assert("nikita-1618", carry_real(scan) != NULL); + break; + } + } + return scan; +} + +static cmp_t +carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2) +{ + assert("nikita-2199", n1 != NULL); + assert("nikita-2200", n2 != NULL); + + if (n1 == n2) + return EQUAL_TO; + while (1) { + n1 = carry_node_next(n1); + if (carry_node_end(level, n1)) + return GREATER_THAN; + if (n1 == n2) + return LESS_THAN; + } + impossible("nikita-2201", "End of level reached"); +} + +reiser4_internal carry_node * 
+find_carry_node(carry_level * level, const znode * node) +{ + carry_node *scan; + carry_node *tmp_scan; + + assert("nikita-2202", level != NULL); + assert("nikita-2203", node != NULL); + + for_all_nodes(level, scan, tmp_scan) { + if (carry_real(scan) == node) + return scan; + } + return NULL; +} + +reiser4_internal znode * +carry_real(const carry_node * node) +{ + assert("nikita-3061", node != NULL); + + return node->lock_handle.node; +} + +reiser4_internal carry_node * +insert_carry_node(carry_level * doing, carry_level * todo, const znode * node) +{ + carry_node *base; + carry_node *scan; + carry_node *tmp_scan; + carry_node *proj; + + base = find_carry_node(doing, node); + assert("nikita-2204", base != NULL); + + for_all_nodes(todo, scan, tmp_scan) { + proj = find_carry_node(doing, scan->node); + assert("nikita-2205", proj != NULL); + if (carry_node_cmp(doing, proj, base) != LESS_THAN) + break; + } + return scan; +} + +reiser4_internal carry_node * +add_carry_atplace(carry_level *doing, carry_level *todo, znode *node) +{ + carry_node *reference; + + assert("nikita-2994", doing != NULL); + assert("nikita-2995", todo != NULL); + assert("nikita-2996", node != NULL); + + reference = insert_carry_node(doing, todo, node); + assert("nikita-2997", reference != NULL); + + return add_carry(todo, POOLO_BEFORE, reference); +} + +/* like post_carry(), but designed to be called from node plugin methods. + This function is different from post_carry() in that it finds proper place + to insert node in the queue. */ +reiser4_internal carry_op * +node_post_carry(carry_plugin_info * info /* carry parameters + * passed down to node + * plugin */ , + carry_opcode op /* opcode of operation */ , + znode * node /* node on which this + * operation will operate */ , + int apply_to_parent_p /* whether operation will + * operate directly on @node + * or on it parent. 
*/ ) +{ + carry_op *result; + carry_node *child; + + assert("nikita-2207", info != NULL); + assert("nikita-2208", info->todo != NULL); + + if (info->doing == NULL) + return post_carry(info->todo, op, node, apply_to_parent_p); + + result = add_op(info->todo, POOLO_LAST, NULL); + if (IS_ERR(result)) + return result; + child = add_carry_atplace(info->doing, info->todo, node); + if (IS_ERR(child)) { + reiser4_pool_free(&info->todo->pool->op_pool, &result->header); + return (carry_op *) child; + } + result->node = child; + result->op = op; + child->parent = apply_to_parent_p; + if (ZF_ISSET(node, JNODE_ORPHAN)) + child->left_before = 1; + child->node = node; + return result; +} + +/* lock all carry nodes in @level */ +static int +lock_carry_level(carry_level * level /* level to lock */ ) +{ + int result; + carry_node *node; + carry_node *tmp_node; + + assert("nikita-881", level != NULL); + assert("nikita-2229", carry_level_invariant(level, CARRY_TODO)); + + trace_stamp(TRACE_CARRY); + + /* lock nodes from left to right */ + result = 0; + for_all_nodes(level, node, tmp_node) { + result = lock_carry_node(level, node); + if (result != 0) + break; + } + return result; +} + +/* Synchronize delimiting keys between @node and its left neighbor. + + To reduce contention on dk key and simplify carry code, we synchronize + delimiting keys only when carry ultimately leaves tree level (carrying + changes upward) and unlocks nodes at this level. + + This function first finds left neighbor of @node and then updates left + neighbor's right delimiting key to conincide with least key in @node. 
+ +*/ +static void +sync_dkeys(znode *spot /* node to update */) +{ + reiser4_key pivot; + reiser4_tree *tree; + + assert("nikita-1610", spot != NULL); + assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk)); + + tree = znode_get_tree(spot); + WLOCK_DK(tree); + + assert("nikita-2192", znode_is_loaded(spot)); + + /* sync left delimiting key of @spot with key in its leftmost item */ + if (node_is_empty(spot)) + pivot = *znode_get_rd_key(spot); + else + leftmost_key_in_node(spot, &pivot); + + znode_set_ld_key(spot, &pivot); + + RLOCK_TREE(tree); + /* there can be sequence of empty nodes pending removal on the left of + @spot. Scan them and update their left and right delimiting keys to + match left delimiting key of @spot. Also, update right delimiting + key of first non-empty left neighbor. + */ + while (1) { + if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED)) + break; + + spot = spot->left; + if (spot == NULL) + break; + +#if 0 + /* on the leaf level we can only increase right delimiting key + * of a node on which we don't hold a long term lock. 
*/ + assert("nikita-2930", + ergo(!znode_is_write_locked(spot) && + znode_get_level(spot) == LEAF_LEVEL, + keyge(&pivot, znode_get_rd_key(spot)))); +#endif + + znode_set_rd_key(spot, &pivot); + /* don't sink into the domain of another balancing */ + if (!znode_is_write_locked(spot)) + break; + if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE)) + znode_set_ld_key(spot, &pivot); + else + break; + } + + RUNLOCK_TREE(tree); + WUNLOCK_DK(tree); +} + +void +check_dkeys(const znode *node); + +/* unlock all carry nodes in @level */ +static void +unlock_carry_level(carry_level * level /* level to unlock */ , + int failure /* true if unlocking owing to + * failure */ ) +{ + carry_node *node; + carry_node *tmp_node; + + assert("nikita-889", level != NULL); + + trace_stamp(TRACE_CARRY); + + if (!failure) { + znode *spot; + + spot = NULL; + /* update delimiting keys */ + for_all_nodes(level, node, tmp_node) { + if (carry_real(node) != spot) { + spot = carry_real(node); + sync_dkeys(spot); + } + } + } + + /* nodes can be unlocked in arbitrary order. In preemptible + environment it's better to unlock in reverse order of locking, + though. + */ + for_all_nodes_back(level, node, tmp_node) { + /* all allocated nodes should be already linked to their + parents at this moment. 
*/ + assert("nikita-1631", ergo(!failure, !ZF_ISSET(carry_real(node), + JNODE_ORPHAN))); + if (!failure) + node_check(carry_real(node), REISER4_NODE_DKEYS); + ON_DEBUG(check_dkeys(carry_real(node))); + unlock_carry_node(level, node, failure); + } + level->new_root = NULL; +} + +/* finish with @level + + Unlock nodes and release all allocated resources */ +static void +done_carry_level(carry_level * level /* level to finish */ ) +{ + carry_node *node; + carry_node *tmp_node; + carry_op *op; + carry_op *tmp_op; + + assert("nikita-1076", level != NULL); + + trace_stamp(TRACE_CARRY); + + unlock_carry_level(level, 0); + for_all_nodes(level, node, tmp_node) { + assert("nikita-2113", locks_list_is_clean(&node->lock_handle)); + assert("nikita-2114", owners_list_is_clean(&node->lock_handle)); + reiser4_pool_free(&level->pool->node_pool, &node->header); + } + for_all_ops(level, op, tmp_op) + reiser4_pool_free(&level->pool->op_pool, &op->header); +} + +/* helper function to complete locking of carry node + + Finish locking of carry node. There are several ways in which new carry + node can be added into carry level and locked. Normal is through + lock_carry_node(), but also from find_{left|right}_neighbor(). This + function factors out common final part of all locking scenarios. It + supposes that @node -> lock_handle is lock handle for lock just taken and + fills ->real_node from this lock handle. + +*/ +reiser4_internal int +lock_carry_node_tail(carry_node * node /* node to complete locking of */ ) +{ + assert("nikita-1052", node != NULL); + assert("nikita-1187", carry_real(node) != NULL); + assert("nikita-1188", !node->unlock); + + node->unlock = 1; + /* Load node content into memory and install node plugin by + looking at the node header. + + Most of the time this call is cheap because the node is + already in memory. 
+ + Corresponding zrelse() is in unlock_carry_node() + */ + return zload(carry_real(node)); +} + +/* lock carry node + + "Resolve" node to real znode, lock it and mark as locked. + This requires recursive locking of znodes. + + When operation is posted to the parent level, node it will be applied to is + not yet known. For example, when shifting data between two nodes, + delimiting has to be updated in parent or parents of nodes involved. But + their parents is not yet locked and, moreover said nodes can be reparented + by concurrent balancing. + + To work around this, carry operation is applied to special "carry node" + rather than to the znode itself. Carry node consists of some "base" or + "reference" znode and flags indicating how to get to the target of carry + operation (->real_node field of carry_node) from base. + +*/ +reiser4_internal int +lock_carry_node(carry_level * level /* level @node is in */ , + carry_node * node /* node to lock */ ) +{ + int result; + znode *reference_point; + lock_handle lh; + lock_handle tmp_lh; + + assert("nikita-887", level != NULL); + assert("nikita-882", node != NULL); + + trace_stamp(TRACE_CARRY); + + result = 0; + reference_point = node->node; + init_lh(&lh); + init_lh(&tmp_lh); + if (node->left_before) { + /* handling of new nodes, allocated on the previous level: + + some carry ops were propably posted from the new node, but + this node neither has parent pointer set, nor is + connected. This will be done in ->create_hook() for + internal item. + + No then less, parent of new node has to be locked. To do + this, first go to the "left" in the carry order. This + depends on the decision to always allocate new node on the + right of existing one. + + Loop handles case when multiple nodes, all orphans, were + inserted. + + Strictly speaking, taking tree lock is not necessary here, + because all nodes scanned by loop in + find_begetting_brother() are write-locked by this thread, + and thus, their sibling linkage cannot change. 
+ + */ + reference_point = UNDER_RW + (tree, znode_get_tree(reference_point), read, + find_begetting_brother(node, level)->node); + assert("nikita-1186", reference_point != NULL); + } + if (node->parent && (result == 0)) { + result = reiser4_get_parent(&tmp_lh, reference_point, ZNODE_WRITE_LOCK, 0); + if (result != 0) { + ; /* nothing */ + } else if (znode_get_level(tmp_lh.node) == 0) { + assert("nikita-1347", znode_above_root(tmp_lh.node)); + result = add_new_root(level, node, tmp_lh.node); + if (result == 0) { + reference_point = level->new_root; + move_lh(&lh, &node->lock_handle); + } + } else if ((level->new_root != NULL) && (level->new_root != znode_parent_nolock(reference_point))) { + /* parent of node exists, but this level aready + created different new root, so */ + warning("nikita-1109", + /* it should be "radicis", but tradition is + tradition. do banshees read latin? */ + "hodie natus est radici frater"); + result = -EIO; + } else { + move_lh(&lh, &tmp_lh); + reference_point = lh.node; + } + } + if (node->left && (result == 0)) { + assert("nikita-1183", node->parent); + assert("nikita-883", reference_point != NULL); + result = reiser4_get_left_neighbor( + &tmp_lh, reference_point, ZNODE_WRITE_LOCK, GN_CAN_USE_UPPER_LEVELS); + if (result == 0) { + done_lh(&lh); + move_lh(&lh, &tmp_lh); + reference_point = lh.node; + } + } + if (!node->parent && !node->left && !node->left_before) { + result = longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI); + } + if (result == 0) { + move_lh(&node->lock_handle, &lh); + result = lock_carry_node_tail(node); + } + done_lh(&tmp_lh); + done_lh(&lh); + return result; +} + +/* release a lock on &carry_node. + + Release if necessary lock on @node. This opearion is pair of + lock_carry_node() and is idempotent: you can call it more than once on the + same node. 
+ +*/ +static void +unlock_carry_node(carry_level * level, + carry_node * node /* node to be released */ , + int failure /* non-zero if node is unlocked due + * to some error */ ) +{ + znode *real_node; + + assert("nikita-884", node != NULL); + + trace_stamp(TRACE_CARRY); + + real_node = carry_real(node); + /* pair to zload() in lock_carry_node_tail(). NOTE(review): real_node is handed to zrelse() before the NULL check below -- confirm it cannot be NULL here */ + zrelse(real_node); + if (node->unlock && (real_node != NULL)) { + assert("nikita-899", real_node == node->lock_handle.node); + longterm_unlock_znode(&node->lock_handle); + } + if (failure) { + if (node->deallocate && (real_node != NULL)) { + /* free node in bitmap + + Prepare node for removal. Last zput() will finish + with it. + */ + ZF_SET(real_node, JNODE_HEARD_BANSHEE); + } + if (node->free) { /* carry node was allocated by carry: return it to the level's node pool */ + assert("nikita-2177", locks_list_is_clean(&node->lock_handle)); + assert("nikita-2112", owners_list_is_clean(&node->lock_handle)); + reiser4_pool_free(&level->pool->node_pool, &node->header); + } + } +} + +/* fatal_carry_error() - all-catching error handling function + + It is possible that carry faces unrecoverable error, like unability to + insert pointer at the internal level. Our simple solution is just panic in + this situation. More sophisticated things like attempt to remount + file-system as read-only can be implemented without much difficlties. + + It is believed, that: + + 1. in stead of panicking, all current transactions can be aborted rolling + system back to the consistent state. + +Umm, if you simply panic without doing anything more at all, then all current +transactions are aborted and the system is rolled back to a consistent state, +by virtue of the design of the transactional mechanism. Well, wait, let's be +precise. If an internal node is corrupted on disk due to hardware failure, +then there may be no consistent state that can be rolled back to, so instead +we should say that it will rollback the transactions, which barring other +factors means rolling back to a consistent state.
+ +# Nikita: there is a subtle difference between panic and aborting +# transactions: machine doesn't reboot. Processes aren't killed. Processes +# don't using reiser4 (not that we care about such processes), or using other +# reiser4 mounts (about them we do care) will simply continue to run. With +# some luck, even application using aborted file system can survive: it will +# get some error, like EBADF, from each file descriptor on failed file system, +# but applications that do care about tolerance will cope with this (squid +# will). + +It would be a nice feature though to support rollback without rebooting +followed by remount, but this can wait for later versions. + + + 2. once isolated transactions will be implemented it will be possible to + roll back offending transaction. + +2. is additional code complexity of inconsistent value (it implies that a broken tree should be kept in operation), so we must think about +it more before deciding if it should be done. -Hans + +*/ +static void +fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level + * where + * unrecoverable + * error + * occurred */ , + int ecode /* error code */ ) +{ + assert("nikita-1230", doing != NULL); + assert("nikita-1231", ecode < 0); + + reiser4_panic("nikita-1232", "Carry failed: %i", ecode); +} + +/* add new root to the tree + + This function itself only manages changes in carry structures and delegates + all hard work (allocation of znode for new root, changes of parent and + sibling pointers to the add_tree_root(). + + Locking: old tree root is locked by carry at this point. Fake znode is also + locked. 
+ +*/ +static int +add_new_root(carry_level * level /* carry level in context of which + * operation is performed */ , + carry_node * node /* carry node for existing root */ , + znode * fake /* "fake" znode already locked by + * us */ ) +{ + int result; + + assert("nikita-1104", level != NULL); + assert("nikita-1105", node != NULL); + + assert("nikita-1403", znode_is_write_locked(node->node)); + assert("nikita-1404", znode_is_write_locked(fake)); + + /* trying to create new root. */ + /* @node is root and it's already locked by us. This + means that nobody else can be trying to add/remove + tree root right now. + */ + if (level->new_root == NULL) + level->new_root = add_tree_root(node->node, fake); + if (!IS_ERR(level->new_root)) { + assert("nikita-1210", znode_is_root(level->new_root)); + node->deallocate = 1; + result = longterm_lock_znode(&node->lock_handle, level->new_root, ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI); + if (result == 0) + zput(level->new_root); + } else { + result = PTR_ERR(level->new_root); + level->new_root = NULL; + } + return result; +} + +/* allocate new znode and add the operation that inserts the + pointer to it into the parent node into the todo level + + Allocate new znode, add it into carry queue and post into @todo queue + request to add pointer to new node into its parent. + + This is carry related routing that calls new_node() to allocate new + node. +*/ +reiser4_internal carry_node * +add_new_znode(znode * brother /* existing left neighbor of new + * node */ , + carry_node * ref /* carry node after which new + * carry node is to be inserted + * into queue. This affects + * locking. 
*/ , + carry_level * doing /* carry queue where new node is + * to be added */ , + carry_level * todo /* carry queue where COP_INSERT + * operation to add pointer to + * new node will ne added */ ) +{ + carry_node *fresh; + znode *new_znode; + carry_op *add_pointer; + carry_plugin_info info; + + assert("nikita-1048", brother != NULL); + assert("nikita-1049", todo != NULL); + + /* There is a lot of possible variations here: to what parent + new node will be attached and where. For simplicity, always + do the following: + + (1) new node and @brother will have the same parent. + + (2) new node is added on the right of @brother + + */ + + fresh = add_carry_skip(doing, ref ? POOLO_AFTER : POOLO_LAST, ref); + if (IS_ERR(fresh)) + return fresh; + + fresh->deallocate = 1; + fresh->free = 1; + + new_znode = new_node(brother, znode_get_level(brother)); + if (IS_ERR(new_znode)) + /* @fresh will be deallocated automatically by error + handling code in the caller. */ + return (carry_node *) new_znode; + + /* new_znode returned znode with x_count 1. Caller has to decrease + it. make_space() does. */ + + ZF_SET(new_znode, JNODE_ORPHAN); + fresh->node = new_znode; + + while (ZF_ISSET(carry_real(ref), JNODE_ORPHAN)) { + ref = carry_node_prev(ref); + assert("nikita-1606", !carry_node_end(doing, ref)); + } + + info.todo = todo; + info.doing = doing; + add_pointer = node_post_carry(&info, COP_INSERT, carry_real(ref), 1); + if (IS_ERR(add_pointer)) { + /* no need to deallocate @new_znode here: it will be + deallocated during carry error handling. 
*/ + return (carry_node *) add_pointer; + } + + add_pointer->u.insert.type = COPT_CHILD; + add_pointer->u.insert.child = fresh; + add_pointer->u.insert.brother = brother; + /* initially new node spawns empty key range */ + WLOCK_DK(znode_get_tree(brother)); + znode_set_ld_key(new_znode, + znode_set_rd_key(new_znode, znode_get_rd_key(brother))); + WUNLOCK_DK(znode_get_tree(brother)); + return fresh; +} + +/* + * Estimate how many pages of memory have to be reserved to complete execution + * of @level. + */ +static int carry_estimate_reserve(carry_level * level) +{ + carry_op *op; + carry_op *tmp_op; + int result; + + result = 0; + for_all_ops(level, op, tmp_op) + result += op_dispatch_table[op->op].estimate(op, level); + return result; +} + +/* DEBUGGING FUNCTIONS. + + Probably we also should leave them on even when + debugging is turned off to print dumps at errors. +*/ +#if REISER4_DEBUG +static int +carry_level_invariant(carry_level * level, carry_queue_state state) +{ + carry_node *node; + carry_node *tmp_node; + + if (level == NULL) + return 0; + + if (level->track_type != 0 && + level->track_type != CARRY_TRACK_NODE && + level->track_type != CARRY_TRACK_CHANGE) + return 0; + + /* check that nodes are in ascending order */ + for_all_nodes(level, node, tmp_node) { + znode *left; + znode *right; + + reiser4_key lkey; + reiser4_key rkey; + + if (node != carry_node_front(level)) { + if (state == CARRY_TODO) { + right = node->node; + left = carry_node_prev(node)->node; + } else { + right = carry_real(node); + left = carry_real(carry_node_prev(node)); + } + if (right == NULL || left == NULL) + continue; + if (node_is_empty(right) || node_is_empty(left)) + continue; + if (!keyle(leftmost_key_in_node(left, &lkey), + leftmost_key_in_node(right, &rkey))) { + print_znode("left", left); + print_node_content("left", left, ~0); + print_znode("right", right); + print_node_content("right", right, ~0); + return 0; + } + } + } + return 1; +} +#endif + +#if REISER4_DEBUG_OUTPUT 
+/* get symbolic name for boolean */ +static const char * +tf(int boolean /* truth value */ ) +{ + return boolean ? "t" : "f"; +} + +/* symbolic name for carry operation; an unrecognized opcode is formatted into a static buffer, so that result is only valid until the next such call */ +static const char * +carry_op_name(carry_opcode op /* carry opcode */ ) +{ + switch (op) { + case COP_INSERT: + return "COP_INSERT"; + case COP_DELETE: + return "COP_DELETE"; + case COP_CUT: + return "COP_CUT"; + case COP_PASTE: + return "COP_PASTE"; + case COP_UPDATE: + return "COP_UPDATE"; + case COP_MODIFY: + return "COP_MODIFY"; + case COP_EXTENT: + return "COP_EXTENT"; + case COP_INSERT_FLOW: + return "COP_INSERT_FLOW"; + default:{ + /* not mt safe, but who cares? Prefix (12 chars) + up to 8 hex digits + NUL is 21 bytes, so keep headroom and bound the write */ + static char buf[32]; + + snprintf(buf, sizeof(buf), "unknown op: %x", (unsigned) op); + return buf; + } + } +} + +/* dump information about carry node */ +reiser4_internal void +print_carry(const char *prefix /* prefix to print */ , + carry_node * node /* node to print */ ) +{ + if (node == NULL) { + printk("%s: null\n", prefix); + return; + } + printk("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n", + prefix, node, tf(node->parent), tf(node->left), tf(node->unlock), tf(node->free), tf(node->deallocate)); + print_znode("\tnode", node->node); + print_znode("\treal_node", carry_real(node)); +} + +/* dump information about carry operation */ +reiser4_internal void +print_op(const char *prefix /* prefix to print */ , + carry_op * op /* operation to print */ ) +{ + if (op == NULL) { + printk("%s: null\n", prefix); + return; + } + printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op)); + print_carry("\tnode", op->node); + switch (op->op) { + case COP_INSERT: + case COP_PASTE: + print_coord("\tcoord", op->u.insert.d ? op->u.insert.d->coord : NULL, 0); + print_key("\tkey", op->u.insert.d ?
op->u.insert.d->key : NULL); + print_carry("\tchild", op->u.insert.child); + break; + case COP_DELETE: + print_carry("\tchild", op->u.delete.child); + break; + case COP_CUT: /* is_cut set: payload is the ->cut member of the union, otherwise ->kill */ + if (op->u.cut_or_kill.is_cut) { + print_coord("\tfrom", op->u.cut_or_kill.u.cut->params.from, 0); + print_coord("\tto", op->u.cut_or_kill.u.cut->params.to, 0); + } else { + print_coord("\tfrom", op->u.cut_or_kill.u.kill->params.from, 0); + print_coord("\tto", op->u.cut_or_kill.u.kill->params.to, 0); + } + break; + case COP_UPDATE: + print_carry("\tleft", op->u.update.left); + break; + case COP_MODIFY: + print_carry("\tchild", op->u.modify.child); + printk("\tflag: %x\n", op->u.modify.flag); /* falls through to default, which only breaks */ + default: + /* do nothing */ + break; + } +} + +/* dump information about all nodes and operations in a @level */ +reiser4_internal void +print_level(const char *prefix /* prefix to print */ , + carry_level * level /* level to print */ ) +{ + carry_node *node; + carry_node *tmp_node; + carry_op *op; + carry_op *tmp_op; + + if (level == NULL) { + printk("%s: null\n", prefix); + return; + } + printk("%s: %p, restartable: %s\n", + prefix, level, tf(level->restartable)); + + for_all_nodes(level, node, tmp_node) + print_carry("\tcarry node", node); + for_all_ops(level, op, tmp_op) + print_op("\tcarry op", op); +} +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/carry.h linux-2.6.4-ck1/fs/reiser4/carry.h --- linux-2.6.4/fs/reiser4/carry.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/carry.h 2004-03-11 22:45:15.184526534 +1100 @@ -0,0 +1,447 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Functions and data types to "carry" tree modification(s) upward. + See fs/reiser4/carry.c for details.
*/ + +#if !defined( __FS_REISER4_CARRY_H__ ) +#define __FS_REISER4_CARRY_H__ + +#include "forward.h" +#include "debug.h" +#include "pool.h" +#include "znode.h" + +#include <linux/types.h> + +/* &carry_node - "location" of carry node. + + "location" of node that is involved or going to be involved into + carry process. Node where operation will be carried to on the + parent level cannot be recorded explicitly. Operation will be carried + usually to the parent of some node (where changes are performed at + the current level) or, to the left neighbor of its parent. But while + modifications are performed at the current level, parent may + change. So, we have to allow some indirection (or, positively, + flexibility) in locating carry nodes. + +*/ +typedef struct carry_node { + /* pool linkage */ + reiser4_pool_header header; + + /* base node from which real_node is calculated. See + fs/reiser4/carry.c:lock_carry_node(). */ + znode *node; + + /* how to get ->real_node */ + /* to get ->real_node obtain parent of ->node*/ + __u32 parent:1; + /* to get ->real_node obtain left neighbor of parent of + ->node*/ + __u32 left:1; + __u32 left_before:1; /* ->node was an orphan (JNODE_ORPHAN) when the op was posted: lock_carry_node() first substitutes the base node found by find_begetting_brother() */ + + /* locking */ + + /* this node was locked by carry process and should be + unlocked when carry leaves a level */ + __u32 unlock:1; + + /* disk block for this node was allocated by carry process and + should be deallocated when carry leaves a level */ + __u32 deallocate:1; + /* this carry node was allocated by carry process and should be + freed when carry leaves a level */ + __u32 free:1; + + /* handle of the long term lock taken on this node by lock_carry_node(); ->real_node is read from it */ + lock_handle lock_handle; +} carry_node; + +/* &carry_opcode - elementary operations that can be carried upward + + Operations that carry() can handle. This list is supposed to be + expanded. + + Each carry operation (cop) is handled by appropriate function defined + in fs/reiser4/carry.c. For example COP_INSERT is handled by + fs/reiser4/carry.c:carry_insert() etc.
These functions in turn + call plugins of nodes affected by operation to modify nodes' content + and to gather operations to be performed on the next level. + +*/ +typedef enum { + /* insert new item into node. */ + COP_INSERT, + /* delete pointer from parent node */ + COP_DELETE, + /* remove part of or whole node. */ + COP_CUT, + /* increase size of item. */ + COP_PASTE, + /* insert extent (that is sequence of unformatted nodes). */ + COP_EXTENT, + /* update delimiting key in least common ancestor of two + nodes. This is performed when items are moved between two + nodes. + */ + COP_UPDATE, + /* update parent to reflect changes in the child. 3.x format + emulation uses this to update "child size" in parent. */ + COP_MODIFY, + /* insert flow */ + COP_INSERT_FLOW, + COP_LAST_OP, +} carry_opcode; + +#define CARRY_FLOW_NEW_NODES_LIMIT 10 + +typedef enum { + COP_MODIFY_FREE_SPACE = (1 << 0), /* FIXME_JMACD currently unused + * -josh */ +} cop_modify_flag; + +/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target + item is determined. */ +typedef enum { + /* target item is one containing pointer to the ->child node */ + COPT_CHILD, + /* target item is given explicitly by @coord */ + COPT_ITEM_DATA, + /* target item is given by key */ + COPT_KEY, + /* see insert_paste_common() for more comments on this. */ + COPT_PASTE_RESTARTED, +} cop_insert_pos_type; + +/* flags to cut and delete */ +typedef enum { + /* don't kill node even if it became completely empty as results of + * cut. This is needed for eottl handling. See carry_extent() for + * details. */ + DELETE_RETAIN_EMPTY = (1 << 0), + /* kill items as opposed to just cut them. Killing implies that items + * are really removed from the tree, ->kill_hook method is called. */ + DELETE_KILL = (1 << 1) +} cop_delete_flag; + +/* + * carry() implements "lock handle tracking" feature. + * + * Callers supply carry with node where to perform initial operation and lock + * handle on this node. 
Trying to optimize node utilization carry may actually + * move insertion point to different node. Callers expect that lock handle + * will rebe transferred to the new node also. + * + */ +typedef enum { + /* transfer lock handle along with insertion point */ + CARRY_TRACK_CHANGE = 1, + /* acquire new lock handle to the node where insertion point is. This + * is used when carry() client doesn't initially possess lock handle + * on the insertion point node, for example, by extent insertion + * code. See carry_extent(). */ + CARRY_TRACK_NODE = 2 +} carry_track_type; + +/* data supplied to COP_{INSERT|PASTE} by callers */ +typedef struct carry_insert_data { + /* position where new item is to be inserted */ + coord_t *coord; + /* new item description */ + reiser4_item_data *data; + /* key of new item */ + const reiser4_key *key; +} carry_insert_data; + +/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */ +struct cut_kill_params { + /* coord where cut starts (inclusive) */ + coord_t *from; + /* coord where cut stops (inclusive, this item/unit will also be + * cut) */ + coord_t *to; + /* starting key. This is necessary when item and unit pos don't + * uniquely identify what portion or tree to remove. For example, this + * indicates what portion of extent unit will be affected. */ + const reiser4_key *from_key; + /* exclusive stop key */ + const reiser4_key *to_key; + /* if this is not NULL, smallest actually removed key is stored + * here. */ + reiser4_key *smallest_removed; +}; + +struct carry_cut_data { + struct cut_kill_params params; +}; + +struct carry_kill_data { + struct cut_kill_params params; + /* parameter to be passed to the ->kill_hook() method of item + * plugin */ + /*void *iplug_params;*/ /* FIXME: unused currently */ + /* if not NULL---inode whose items are being removed. This is needed + * for ->kill_hook() of extent item to update VM structures when + * removing pages. 
*/ + struct inode *inode; + /* sibling list maintenance is complicated by existence of eottl. When + * eottl whose left and right neighbors are formatted leaves is + * removed, one has to connect said leaves in the sibling list. This + * cannot be done when extent removal is just started as locking rules + * require sibling list update to happen atomically with removal of + * extent item. Therefore: 1. pointers to left and right neighbors + * have to be passed down to the ->kill_hook() of extent item, and + * 2. said neighbors have to be locked. */ + lock_handle *left; + lock_handle *right; + /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */ + unsigned flags; +}; + +/* &carry_op - operation to "carry" upward. + + Description of an operation we want to "carry" to the upper level of + a tree: e.g., when we insert something and there is not enough space + we allocate a new node and "carry" the operation of inserting a + pointer to the new node to the upper level, on removal of empty node, + we carry up operation of removing appropriate entry from parent. + + There are two types of carry ops: when adding or deleting a node, the + node at the parent level where appropriate modification has to be + performed is known in advance. When shifting items between nodes + (split, merge), delimiting key should be changed in the least common + parent of the nodes involved that is not known in advance. + + For the operations of the first type we store in &carry_op pointer to + the &carry_node at the parent level. For the operation of the second + type we store &carry_node of parents of the left and right nodes + modified and keep track of them upward until they coincide.
+ +*/ +typedef struct carry_op { + /* pool linkage */ + reiser4_pool_header header; + carry_opcode op; + /* node on which operation is to be performed: + + for insert, paste: node where new item is to be inserted + + for delete: node where pointer is to be deleted + + for cut: node to cut from + + for update: node where delimiting key is to be modified + + for modify: parent of modified node + + */ + carry_node *node; + union { + struct { + /* (sub-)type of insertion/paste. Taken from + cop_insert_pos_type. */ + __u8 type; + /* various operation flags. Taken from + cop_insert_flag. */ + __u8 flags; + carry_insert_data *d; + carry_node *child; + znode *brother; + } insert, paste, extent; + + struct { + int is_cut; + union { + carry_kill_data *kill; + carry_cut_data *cut; + } u; + } cut_or_kill; + + struct { + carry_node *left; + } update; + struct { + /* changed child */ + carry_node *child; + /* bitmask of changes. See &cop_modify_flag */ + __u32 flag; + } modify; + struct { + /* flags to deletion operation. Are taken from + cop_delete_flag */ + __u32 flags; + /* child to delete from parent. If this is + NULL, delete op->node. */ + carry_node *child; + } delete; + struct { + flow_t *flow; + coord_t *insert_point; + reiser4_item_data *data; + /* flow insertion is limited by number of new blocks + added in that operation which do not get any data + but part of flow. This limit is set by macro + CARRY_FLOW_NEW_NODES_LIMIT. This field stores number + of nodes added already during one carry_flow */ + int new_nodes; + } insert_flow; + } u; +} carry_op; + +/* &carry_op_pool - preallocated pool of carry operations, and nodes */ +typedef struct carry_pool { + carry_op op[CARRIES_POOL_SIZE]; + reiser4_pool op_pool; + carry_node node[NODES_LOCKED_POOL_SIZE]; + reiser4_pool node_pool; +} carry_pool; + +/* &carry_tree_level - carry process on given level + + Description of balancing process on the given level. 
+ + No need for locking here, as carry_tree_level is essentially per + thread thing (for now). + +*/ +struct carry_level { + /* this level may be restarted */ + __u32 restartable:1; + /* list of carry nodes on this level, ordered by key order */ + pool_level_list_head nodes; + pool_level_list_head ops; + /* pool where new objects are allocated from */ + carry_pool *pool; + int ops_num; + int nodes_num; + /* new root created on this level, if any */ + znode *new_root; + /* This is set by caller (insert_by_key(), resize_item(), etc.) when + they want ->tracked to automagically wander to the node where + insertion point moved after insert or paste. + */ + carry_track_type track_type; + /* lock handle supplied by user that we are tracking. See + above. */ + lock_handle *tracked; +#if REISER4_STATS + tree_level level_no; +#endif +}; + +/* information carry passes to plugin methods that may add new operations to + the @todo queue */ +struct carry_plugin_info { + carry_level *doing; + carry_level *todo; +}; + +int carry(carry_level * doing, carry_level * done); + +carry_node *add_carry(carry_level * level, pool_ordering order, carry_node * reference); +carry_node *add_carry_skip(carry_level * level, pool_ordering order, carry_node * reference); +carry_op *add_op(carry_level * level, pool_ordering order, carry_op * reference); + +extern carry_node *insert_carry_node(carry_level * doing, + carry_level * todo, const znode * node); + +extern carry_node *add_carry_atplace(carry_level *doing, + carry_level *todo, znode *node); + +extern carry_node *find_begetting_brother(carry_node * node, carry_level * kin); + +extern void init_carry_pool(carry_pool * pool); +extern void done_carry_pool(carry_pool * pool); + +extern void init_carry_level(carry_level * level, carry_pool * pool); + +extern carry_op *post_carry(carry_level * level, carry_opcode op, znode * node, int apply_to_parent); +extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op, znode * node, int 
apply_to_parent_p); + +extern int carry_op_num(const carry_level * level); + +carry_node *add_new_znode(znode * brother, carry_node * reference, carry_level * doing, carry_level * todo); + +carry_node *find_carry_node(carry_level * level, const znode * node); + +extern znode *carry_real(const carry_node * node); + +/* helper macros to iterate over carry queues */ + +#define carry_node_next( node ) \ + ( ( carry_node * ) pool_level_list_next( &( node ) -> header ) ) + +#define carry_node_prev( node ) \ + ( ( carry_node * ) pool_level_list_prev( &( node ) -> header ) ) + +#define carry_node_front( level ) \ + ( ( carry_node * ) pool_level_list_front( &( level ) -> nodes ) ) + +#define carry_node_back( level ) \ + ( ( carry_node * ) pool_level_list_back( &( level ) -> nodes ) ) + +#define carry_node_end( level, node ) \ + ( pool_level_list_end( &( level ) -> nodes, &( node ) -> header ) ) + +/* macro to iterate over all operations in a @level; @level and @op are parenthesized where they are dereferenced, like in the carry_node_*() helpers above, so that expression arguments expand correctly */ +#define for_all_ops( level /* carry level (of type carry_level *) */, \ + op /* pointer to carry operation, modified by loop (of \ + * type carry_op *) */, \ + tmp /* pointer to carry operation (of type carry_op *), \ + * used to make iterator stable in the face of \ + * deletions from the level */ ) \ +for( op = ( carry_op * ) pool_level_list_front( &( level ) -> ops ), \ + tmp = ( carry_op * ) pool_level_list_next( &( op ) -> header ) ; \ + ! pool_level_list_end( &( level ) -> ops, &( op ) -> header ) ; \ + op = tmp, tmp = ( carry_op * ) pool_level_list_next( &( op ) -> header ) ) + +/* macro to iterate over all nodes in a @level */ +#define for_all_nodes( level /* carry level (of type carry_level *) */, \ + node /* pointer to carry node, modified by loop (of \ + * type carry_node *) */, \ + tmp /* pointer to carry node (of type carry_node *), \ + * used to make iterator stable in the face of \ + * deletions from the level */ ) \ +for( node = carry_node_front( level ), \ + tmp = carry_node_next( node ) ; !
carry_node_end( level, node ) ; \ + node = tmp, tmp = carry_node_next( node ) ) + +/* macro to iterate over all nodes in a @level in reverse order + + This is used, because nodes are unlocked in reversed order of locking */ +#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \ + node /* pointer to carry node, modified by loop \ + * (of type carry_node *) */, \ + tmp /* pointer to carry node (of type carry_node \ + * *), used to make iterator stable in the \ + * face of deletions from the level */ ) \ +for( node = carry_node_back( level ), \ + tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \ + node = tmp, tmp = carry_node_prev( node ) ) + +/* debugging function */ + +#if REISER4_DEBUG_OUTPUT +extern void print_carry(const char *prefix, carry_node * node); +extern void print_op(const char *prefix, carry_op * op); +extern void print_level(const char *prefix, carry_level * level); +#else +#define print_carry( p, n ) noop +#define print_op( p, o ) noop +#define print_level( p, l ) noop +#endif + +/* __FS_REISER4_CARRY_H__ */ +#endif + +/* Make Linus happy. 
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/carry_ops.c linux-2.6.4-ck1/fs/reiser4/carry_ops.c
--- linux-2.6.4/fs/reiser4/carry_ops.c	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/carry_ops.c	2004-03-11 22:45:15.188525912 +1100
@@ -0,0 +1,2073 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* implementation of carry operations */
+
+#include "forward.h"
+#include "debug.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/node/node.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree_walk.h"
+#include "pool.h"
+#include "tree_mod.h"
+#include "carry.h"
+#include "carry_ops.h"
+#include "tree.h"
+#include "super.h"
+#include "reiser4.h"
+
+#include <linux/types.h>
+#include <linux/err.h>
+
+static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
+			    carry_level * doing, carry_level * todo, unsigned int including_insert_coord_p);
+
+extern int lock_carry_node(carry_level * level, carry_node * node);
+extern int lock_carry_node_tail(carry_node * node);
+
+/* find left neighbor of a carry node
+
+   Look for left neighbor of @node and add it to the @doing queue. See
+   comments in the body.
+
+   Returns the carry node of the left neighbor, NULL when the neighbor is
+   not available (leftmost node, not in cache, or locked on a
+   non-restartable level), or an ERR_PTR(): -E_REPEAT when the level has to
+   be restarted, otherwise the error from add_carry_skip() or
+   lock_carry_node_tail().
+
+*/
+static carry_node *
+find_left_neighbor(carry_op * op	/* node to find left
+					 * neighbor of */ ,
+		   carry_level * doing /* level to scan */ )
+{
+	int result;
+	carry_node *node;
+	carry_node *left;
+	int flags;
+
+	node = op->node;
+
+	/* first, check whether left neighbor is already in a @doing queue */
+	if (carry_real(node)->left != NULL) {
+		/* NOTE: there is locking subtlety here. Look into
+		 * find_right_neighbor() for more info */
+		if (find_carry_node(doing, carry_real(node)->left) != NULL) {
+			left = node;
+			do {
+				left = carry_node_prev(left);
+				assert("nikita-3408", !carry_node_end(doing,
+								      left));
+			} while (carry_real(left) == carry_real(node));
+			reiser4_stat_level_inc(doing, carry_left_in_carry);
+			return left;
+		}
+	}
+
+	left = add_carry_skip(doing, POOLO_BEFORE, node);
+	if (IS_ERR(left))
+		return left;
+
+	left->node = node->node;
+	left->free = 1;
+
+	flags = GN_TRY_LOCK;
+	/* "!" binds tighter than "&": the flag test has to be parenthesized,
+	   otherwise (!flags) & COPI_LOAD_LEFT is evaluated and the flag is
+	   never really consulted. */
+	if (!(op->u.insert.flags & COPI_LOAD_LEFT))
+		flags |= GN_NO_ALLOC;
+
+	/* then, feeling lucky, peek left neighbor in the cache. */
+	result = reiser4_get_left_neighbor(&left->lock_handle, carry_real(node),
+					   ZNODE_WRITE_LOCK, flags);
+	if (result == 0) {
+		/* ok, node found and locked. */
+		result = lock_carry_node_tail(left);
+		if (result != 0)
+			left = ERR_PTR(result);
+		reiser4_stat_level_inc(doing, carry_left_in_cache);
+	} else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
+		/* node is leftmost node in a tree, or neighbor wasn't in
+		   cache, or there is an extent on the left. */
+		if (REISER4_STATS && (result == -ENOENT))
+			reiser4_stat_level_inc(doing, carry_left_missed);
+		if (REISER4_STATS && (result == -E_NO_NEIGHBOR))
+			reiser4_stat_level_inc(doing, carry_left_not_avail);
+		reiser4_pool_free(&doing->pool->node_pool, &left->header);
+		left = NULL;
+	} else if (doing->restartable) {
+		/* if left neighbor is locked, and level is restartable, add
+		   new node to @doing and restart. */
+		assert("nikita-913", node->parent != 0);
+		assert("nikita-914", node->node != NULL);
+		left->left = 1;
+		left->free = 0;
+		left = ERR_PTR(-E_REPEAT);
+	} else {
+		/* left neighbor is locked, level cannot be restarted. Just
+		   ignore left neighbor. */
+		reiser4_pool_free(&doing->pool->node_pool, &left->header);
+		left = NULL;
+		reiser4_stat_level_inc(doing, carry_left_refuse);
+	}
+	return left;
+}
+
+/* find right neighbor of a carry node
+
+   Look for right neighbor of @node and add it to the @doing queue. See
+   comments in the body.
+
+   Returns the carry node of the right neighbor, NULL when there is no
+   usable neighbor (-E_NO_NEIGHBOR or -ENOENT from
+   reiser4_get_right_neighbor()), or an ERR_PTR() carrying any other error.
+
+*/
+static carry_node *
+find_right_neighbor(carry_op * op	/* node to find right
+					 * neighbor of */ ,
+		    carry_level * doing /* level to scan */ )
+{
+	int result;
+	carry_node *node;
+	carry_node *right;
+	lock_handle lh;
+	int flags;
+
+	init_lh(&lh);
+
+	node = op->node;
+
+	/* first, check whether right neighbor is already in a @doing queue */
+	if (carry_real(node)->right != NULL) {
+		/* Subtle:
+		 *
+		 * Q: why don't we need tree lock here, looking for the right
+		 * neighbor?
+		 *
+		 * A: even if value of node->real_node->right were changed
+		 * during find_carry_node() execution, outcome of execution
+		 * wouldn't change, because (in short) other thread cannot add
+		 * elements to the @doing, and if node->real_node->right
+		 * already was in @doing, value of node->real_node->right
+		 * couldn't change, because node cannot be inserted between
+		 * locked neighbors.
+		 */
+		if (find_carry_node(doing, carry_real(node)->right) != NULL) {
+			/*
+			 * What we are doing here (this is also applicable to
+			 * the find_left_neighbor()).
+			 *
+			 * tree_walk.c code requires that insertion of a
+			 * pointer to a child, modification of parent pointer
+			 * in the child, and insertion of the child into
+			 * sibling list are atomic (see
+			 * plugin/item/internal.c:create_hook_internal()).
+			 *
+			 * carry allocates new node long before pointer to it
+			 * is inserted into parent and, actually, long before
+			 * parent is even known. Such allocated-but-orphaned
+			 * nodes are only trackable through carry level lists.
+			 *
+			 * Situation that is handled here is following: @node
+			 * has valid ->right pointer, but there is
+			 * allocated-but-orphaned node in the carry queue that
+			 * is logically between @node and @node->right. Here
+			 * we are searching for it. Critical point is that
+			 * this is only possible if @node->right is also in
+			 * the carry queue (this is checked above), because
+			 * this is the only way new orphaned node could be
+			 * inserted between them (before inserting new node,
+			 * make_space() first tries to shift to the right, so,
+			 * right neighbor will be locked and queued).
+			 *
+			 */
+			right = node;
+			do {
+				right = carry_node_next(right);
+				assert("nikita-3408", !carry_node_end(doing,
+								      right));
+			} while (carry_real(right) == carry_real(node));
+			reiser4_stat_level_inc(doing, carry_right_in_carry);
+			return right;
+		}
+	}
+
+	flags = GN_CAN_USE_UPPER_LEVELS;
+	/* parenthesized for the same precedence reason as the COPI_LOAD_LEFT
+	   test in find_left_neighbor() */
+	if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
+		flags = GN_NO_ALLOC;
+
+	/* then, try to lock right neighbor */
+	init_lh(&lh);
+	result = reiser4_get_right_neighbor(&lh, carry_real(node),
+					    ZNODE_WRITE_LOCK, flags);
+	if (result == 0) {
+		/* ok, node found and locked. */
+		reiser4_stat_level_inc(doing, carry_right_in_cache);
+		right = add_carry_skip(doing, POOLO_AFTER, node);
+		if (!IS_ERR(right)) {
+			right->node = lh.node;
+			move_lh(&right->lock_handle, &lh);
+			right->free = 1;
+			result = lock_carry_node_tail(right);
+			if (result != 0)
+				right = ERR_PTR(result);
+		}
+	} else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
+		/* node is rightmost node in a tree, or neighbor wasn't in
+		   cache, or there is an extent on the right. */
+		right = NULL;
+		if (REISER4_STATS && (result == -ENOENT))
+			reiser4_stat_level_inc(doing, carry_right_missed);
+		if (REISER4_STATS && (result == -E_NO_NEIGHBOR))
+			reiser4_stat_level_inc(doing, carry_right_not_avail);
+	} else
+		right = ERR_PTR(result);
+	done_lh(&lh);
+	return right;
+}
+
+/* how much free space in a @node is needed for @op
+
+   How much space in @node is required for completion of @op, where @op is
+   insert or paste operation.
+*/
+static unsigned int
+space_needed_for_op(znode * node	/* znode data are
+					 * inserted or
+					 * pasted in */ ,
+		    carry_op * op	/* carry
+					   operation */ )
+{
+	assert("nikita-919", op != NULL);
+
+	switch (op->op) {
+	default:
+		impossible("nikita-1701", "Wrong opcode");
+		/* fall through: unknown opcodes are sized like an insert */
+	case COP_INSERT:
+		return space_needed(node, NULL, op->u.insert.d->data, 1);
+	case COP_PASTE:
+		return space_needed(node, op->u.insert.d->coord, op->u.insert.d->data, 0);
+	}
+}
+
+/* how much space in @node is required to insert or paste @data at
+   @coord.
+
+   The item plugin's ->b.estimate() is used when present, data->length
+   otherwise; for insertions the node plugin's ->item_overhead(), if any,
+   is added on top. */
+reiser4_internal unsigned int
+space_needed(const znode * node	/* node data are inserted or
+					 * pasted in */ ,
+	     const coord_t * coord	/* coord where data are
+					 * inserted or pasted
+					 * at */ ,
+	     const reiser4_item_data * data	/* data to insert or
+						 * paste */ ,
+	     int insertion /* non-0 is inserting, 0---paste */ )
+{
+	int result;
+	item_plugin *iplug;
+
+	assert("nikita-917", node != NULL);
+	assert("nikita-918", node_plugin_by_node(node) != NULL);
+	assert("vs-230", !insertion || (coord == NULL));
+
+	result = 0;
+	iplug = data->iplug;
+	if (iplug->b.estimate != NULL) {
+		/* ask item plugin how much space is needed to insert this
+		   item */
+		result += iplug->b.estimate(insertion ? NULL : coord, data);
+	} else {
+		/* reasonable default */
+		result += data->length;
+	}
+	if (insertion) {
+		node_plugin *nplug;
+
+		nplug = node->nplug;
+		/* and add node overhead */
+		if (nplug->item_overhead != NULL) {
+			result += nplug->item_overhead(node, 0);
+		}
+	}
+	return result;
+}
+
+/* find &coord in parent where pointer to new child is to be stored.
+
+   Returns the result of find_new_child_ptr(); op->u.insert.d->data is
+   filled by build_child_ptr_data() regardless of that result. */
+static int
+find_new_child_coord(carry_op * op	/* COP_INSERT carry operation to
+					 * insert pointer to new
+					 * child */ )
+{
+	int result;
+	znode *node;
+	znode *child;
+
+	assert("nikita-941", op != NULL);
+	assert("nikita-942", op->op == COP_INSERT);
+
+	trace_stamp(TRACE_CARRY);
+
+	node = carry_real(op->node);
+	assert("nikita-943", node != NULL);
+	assert("nikita-944", node_plugin_by_node(node) != NULL);
+
+	child = carry_real(op->u.insert.child);
+	result = find_new_child_ptr(node, child, op->u.insert.brother, op->u.insert.d->coord);
+
+	build_child_ptr_data(child, op->u.insert.d->data);
+	return result;
+}
+
+/* additional amount of free space in @node required to complete @op
+
+   A positive value means "not enough space yet". For COP_EXTENT only the
+   sign is meaningful: +1 while the insertion coord is still inside the
+   node, -1 otherwise. */
+static int
+free_space_shortage(znode * node /* node to check */ ,
+		    carry_op * op /* operation being performed */ )
+{
+	assert("nikita-1061", node != NULL);
+	assert("nikita-1062", op != NULL);
+
+	switch (op->op) {
+	default:
+		impossible("nikita-1702", "Wrong opcode");
+		/* fall through */
+	case COP_INSERT:
+	case COP_PASTE:
+		return space_needed_for_op(node, op) - znode_free_space(node);
+	case COP_EXTENT:
+		/* when inserting extent shift data around until insertion
+		   point is utmost in the node. */
+		if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
+			return +1;
+		else
+			return -1;
+	}
+}
+
+/* helper function: update node pointer in operation after insertion
+   point was probably shifted into @target.
*/
+static znode *
+sync_op(carry_op * op, carry_node * target)
+{
+	znode *insertion_node;
+
+	/* reget node from coord: shift might move insertion coord to
+	   the neighbor */
+	insertion_node = op->u.insert.d->coord->node;
+	/* if insertion point was actually moved into new node,
+	   update carry node pointer in operation. */
+	if (insertion_node != carry_real(op->node)) {
+		op->node = target;
+		assert("nikita-2540", carry_real(target) == insertion_node);
+	}
+	assert("nikita-2541",
+	       carry_real(op->node) == op->u.insert.d->coord->node);
+	return insertion_node;
+}
+
+/* "split" code is not yet ready
+
+   get_split_point() therefore returns 0 unconditionally: everything after
+   the first return statement is unreachable and only sketches the intended
+   logic. */
+static int
+get_split_point(carry_op * op, sideof dir)
+{
+	coord_t *coord;
+
+	return 0;
+
+	assert("nikita-2619", op->op == COP_INSERT || op->op == COP_PASTE || op->op == COP_EXTENT);
+
+	coord = op->u.insert.d->coord;
+
+	if (!coord_is_existing_item(coord))
+		return 0;
+	if ((op->u.insert.flags & COPI_GLUE_LEFT) && dir == LEFT_SIDE && coord->unit_pos > 0) {
+		coord_prev_unit(coord);
+		/* split point is different from the insertion point, confine
+		   shift to the source node. */
+		op->u.insert.flags &= ~COPI_GO_LEFT;
+		return -1;
+	}
+	/* glueing to the right is not yet implemented */
+	assert("nikita-2620", !(op->u.insert.flags & COPI_GLUE_RIGHT));
+	return 0;
+}
+/* counterpart of get_split_point(); currently a no-op, because it returns
+   before touching @op (the statements after the return are unreachable). */
+static void
+put_split_point(carry_op * op, int adj, __u32 flags)
+{
+	return;
+
+	assert("nikita-2621", op->op == COP_INSERT || op->op == COP_PASTE || op->op == COP_EXTENT);
+
+	op->u.insert.d->coord += adj;
+	op->u.insert.flags = flags;
+}
+/* common tail of make_space(): when @doing->track_type is CARRY_TRACK_NODE,
+   or is CARRY_TRACK_CHANGE and the insertion point left @orig_node, release
+   the lock handle @doing->tracked and re-take it as a write lock on the node
+   that now holds the insertion point. Returns the result of
+   longterm_lock_znode() in that case and 0 otherwise. */
+static int
+make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
+{
+	int result;
+	carry_track_type tracking;
+	znode *node;
+
+	tracking = doing->track_type;
+	node = op->u.insert.d->coord->node;
+
+	if (tracking == CARRY_TRACK_NODE ||
+	    (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
+		/* inserting or pasting into node different from
+		   original. Update lock handle supplied by caller. */
+		assert("nikita-1417", doing->tracked != NULL);
+		done_lh(doing->tracked);
+		init_lh(doing->tracked);
+		result = longterm_lock_znode(doing->tracked, node,
+					     ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
+		reiser4_stat_level_inc(doing, track_lh);
+		ON_TRACE(TRACE_CARRY, "tracking: %i: %p -> %p\n",
+			 tracking, orig_node, node);
+	} else
+		result = 0;
+	return result;
+}
+
+/* This is insertion policy function. It shifts data to the left and right
+   neighbors of insertion coord and allocates new nodes until there is enough
+   free space to complete @op.
+
+   See comments in the body.
+
+   Assumes that the node format favors insertions at the right end of the node
+   as node40 does.
+
+   See carry_flow() on detail about flow insertion
+
+   Returns 0 on success (after make_space_tail() updated the tracked lock
+   handle), -E_REPEAT when the left neighbor lookup asks for a restart,
+   -E_NODE_FULL when space is still short after shifting and allocating at
+   most two new nodes, or an error from add_new_znode(), lock_carry_node() or
+   carry_shift_data().
+*/
+static int
+make_space(carry_op * op /* carry operation, insert or paste */ ,
+	   carry_level * doing /* current carry queue */ ,
+	   carry_level * todo /* carry queue on the parent level */ )
+{
+	znode *node;
+	int result;
+	int not_enough_space;
+	int blk_alloc;
+	znode *orig_node;
+	__u32 flags;
+	int adj;
+
+	coord_t *coord;
+
+	assert("nikita-890", op != NULL);
+	assert("nikita-891", todo != NULL);
+	assert("nikita-892",
+	       op->op == COP_INSERT ||
+	       op->op == COP_PASTE || op->op == COP_EXTENT);
+	assert("nikita-1607",
+	       carry_real(op->node) == op->u.insert.d->coord->node);
+
+	trace_stamp(TRACE_CARRY);
+
+	flags = op->u.insert.flags;
+
+	/* NOTE check that new node can only be allocated after checking left
+	 * and right neighbors. This is necessary for proper work of
+	 * find_{left,right}_neighbor(). */
+	assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
+				   flags & COPI_DONT_SHIFT_LEFT));
+	assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
+				   flags & COPI_DONT_SHIFT_RIGHT));
+
+	coord = op->u.insert.d->coord;
+	orig_node = node = coord->node;
+
+	assert("nikita-908", node != NULL);
+	assert("nikita-909", node_plugin_by_node(node) != NULL);
+
+	result = 0;
+	/* If there is not enough space in a node, try to shift something to
+	   the left neighbor. This is a bit tricky, as locking to the left is
+	   low priority. This is handled by restart logic in carry().
+	*/
+	not_enough_space = free_space_shortage(node, op);
+	if (not_enough_space <= 0)
+		/* it is possible that carry was called when there actually
+		   was enough space in the node. For example, when inserting
+		   leftmost item so that delimiting keys have to be updated.
+		*/
+		return make_space_tail(op, doing, orig_node);
+	if (!(flags & COPI_DONT_SHIFT_LEFT)) {
+		carry_node *left;
+		/* make note in statistics of an attempt to move
+		   something into the left neighbor */
+		reiser4_stat_level_inc(doing, insert_looking_left);
+		left = find_left_neighbor(op, doing);
+		if (unlikely(IS_ERR(left))) {
+			if (PTR_ERR(left) == -E_REPEAT)
+				return -E_REPEAT;
+			else {
+				/* some error other than restart request
+				   occurred. This shouldn't happen. Issue a
+				   warning and continue as if left neighbor
+				   weren't existing.
+				*/
+				warning("nikita-924",
+					"Error accessing left neighbor: %li",
+					PTR_ERR(left));
+				print_znode("node", node);
+			}
+		} else if (left != NULL) {
+
+			adj = get_split_point(op, LEFT_SIDE);
+			/* shift everything possible on the left of and
+			   including insertion coord into the left neighbor */
+			result = carry_shift_data(LEFT_SIDE, coord, carry_real(left), doing, todo, flags & COPI_GO_LEFT);
+			put_split_point(op, adj, flags);
+
+			/* reget node from coord: shift_left() might move
+			   insertion coord to the left neighbor */
+			node = sync_op(op, left);
+
+			not_enough_space = free_space_shortage(node, op);
+			/* There is not enough free space in @node, but
+			   may be, there is enough free space in
+			   @left. Various balancing decisions are valid here.
+			   The same for the shifting to the right.
+			*/
+		}
+	}
+	/* If there still is not enough space, shift to the right */
+	if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
+		carry_node *right;
+
+		reiser4_stat_level_inc(doing, insert_looking_right);
+		right = find_right_neighbor(op, doing);
+		if (IS_ERR(right)) {
+			warning("nikita-1065",
+				"Error accessing right neighbor: %li",
+				PTR_ERR(right));
+			print_znode("node", node);
+		} else if (right != NULL) {
+			adj = get_split_point(op, RIGHT_SIDE);
+			/* node containing insertion point, and its right
+			   neighbor node are write locked by now.
+
+			   shift everything possible on the right of but
+			   excluding insertion coord into the right neighbor
+			*/
+			result = carry_shift_data(RIGHT_SIDE, coord,
+						  carry_real(right),
+						  doing, todo,
+						  flags & COPI_GO_RIGHT);
+			put_split_point(op, adj, flags);
+			/* reget node from coord: shift_right() might move
+			   insertion coord to the right neighbor */
+			node = sync_op(op, right);
+			not_enough_space = free_space_shortage(node, op);
+		}
+	}
+	/* If there is still not enough space, allocate new node(s).
+
+	   We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
+	   the carry operation flags (currently this is needed during flush
+	   only).
+
+	   The loop runs at most twice (blk_alloc < 2) and stops as soon as
+	   a shift fails or enough space has been found.
+	*/
+	for (blk_alloc = 0;
+	     not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
+	     !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
+		carry_node *fresh;	/* new node we are allocating */
+		coord_t coord_shadow;	/* remembered insertion point before
+					 * shifting data into new node */
+		carry_node *node_shadow;	/* remembered insertion node before
+						 * shifting */
+		unsigned int gointo;	/* whether insertion point should move
+					 * into newly allocated node */
+
+		reiser4_stat_level_inc(doing, insert_alloc_new);
+		if (blk_alloc > 0)
+			reiser4_stat_level_inc(doing, insert_alloc_many);
+
+		/* allocate new node on the right of @node. Znode and disk
+		   fake block number for new node are allocated.
+
+		   add_new_znode() posts carry operation COP_INSERT with
+		   COPT_CHILD option to the parent level to add
+		   pointer to newly created node to its parent.
+
+		   Subtle point: if several new nodes are required to complete
+		   insertion operation at this level, they will be inserted
+		   into their parents in the order of creation, which means
+		   that @node will be valid "cookie" at the time of insertion.
+
+		*/
+		fresh = add_new_znode(node, op->node, doing, todo);
+		if (IS_ERR(fresh))
+			return PTR_ERR(fresh);
+
+		/* Try to shift into new node. */
+		result = lock_carry_node(doing, fresh);
+		zput(carry_real(fresh));
+		if (result != 0) {
+			warning("nikita-947",
+				"Cannot lock new node: %i", result);
+			print_znode("new", carry_real(fresh));
+			print_znode("node", node);
+			return result;
+		}
+
+		/* both nodes are write locked by now.
+
+		   shift everything possible on the right of and
+		   including insertion coord into the right neighbor.
+		*/
+		coord_dup(&coord_shadow, op->u.insert.d->coord);
+		node_shadow = op->node;
+		adj = get_split_point(op, RIGHT_SIDE);
+		/* move insertion point into newly created node if:
+
+		   . insertion point is rightmost in the source node, or
+		   . this is not the first node we are allocating in a row.
+		*/
+		gointo =
+		    (blk_alloc > 0) ||
+		    coord_is_after_rightmost(op->u.insert.d->coord);
+
+		result = carry_shift_data(RIGHT_SIDE, coord, carry_real(fresh),
+					  doing, todo, gointo);
+		put_split_point(op, adj, flags);
+		/* if insertion point was actually moved into new node,
+		   update carry node pointer in operation. */
+		node = sync_op(op, fresh);
+		not_enough_space = free_space_shortage(node, op);
+		if ((not_enough_space > 0) && (node != coord_shadow.node)) {
+			/* there is not enough free space in new node. Shift
+			   insertion point back to the @shadow_node so that
+			   next new node would be inserted between
+			   @shadow_node and @fresh.
+			*/
+			coord_normalize(&coord_shadow);
+			coord_dup(coord, &coord_shadow);
+			node = coord->node;
+			op->node = node_shadow;
+			if (1 || (flags & COPI_STEP_BACK)) {
+				/* still not enough space?! Maybe there is
+				   enough space in the source node (i.e., node
+				   data are moved from) now.
+
+				   NOTE: the "1 ||" above makes this re-check
+				   unconditional; COPI_STEP_BACK is not
+				   actually consulted.
+				*/
+				not_enough_space = free_space_shortage(node, op);
+			}
+		}
+	}
+	if (not_enough_space > 0) {
+		if (!(flags & COPI_DONT_ALLOCATE))
+			warning("nikita-948", "Cannot insert new item");
+		result = -E_NODE_FULL;
+	}
+	assert("nikita-1622", ergo(result == 0,
+				   carry_real(op->node) == coord->node));
+	assert("nikita-2616", coord == op->u.insert.d->coord);
+	if (result == 0)
+		result = make_space_tail(op, doing, orig_node);
+	return result;
+}
+
+/* insert_paste_common() - common part of insert and paste operations
+
+   This function performs common part of COP_INSERT and COP_PASTE.
+
+   There are three ways in which insertion/paste can be requested:
+
+    . by directly supplying reiser4_item_data. In this case, op ->
+    u.insert.type is set to COPT_ITEM_DATA.
+
+    . by supplying child pointer to which is to be inserted into parent. In
+    this case op -> u.insert.type == COPT_CHILD.
+
+    . by supplying key of new item/unit.
This is currently only used during + extent insertion + + This is required, because when new node is allocated we don't know at what + position pointer to it is to be stored in the parent. Actually, we don't + even know what its parent will be, because parent can be re-balanced + concurrently and new node re-parented, and because parent can be full and + pointer to the new node will go into some other node. + + insert_paste_common() resolves pointer to child node into position in the + parent by calling find_new_child_coord(), that fills + reiser4_item_data. After this, insertion/paste proceeds uniformly. + + Another complication is with finding free space during pasting. It may + happen that while shifting items to the neighbors and newly allocated + nodes, insertion coord can no longer be in the item we wanted to paste + into. At this point, paste becomes (morphs) into insert. Moreover free + space analysis has to be repeated, because amount of space required for + insertion is different from that of paste (item header overhead, etc). + + This function "unifies" different insertion modes (by resolving child + pointer or key into insertion coord), and then calls make_space() to free + enough space in the node by shifting data to the left and right and by + allocating new nodes if necessary. Carry operation knows amount of space + required for its completion. After enough free space is obtained, caller of + this function (carry_{insert,paste,etc.}) performs actual insertion/paste + by calling item plugin method. 
+
+*/
+static int
+insert_paste_common(carry_op * op	/* carry operation being
+					 * performed */ ,
+		    carry_level * doing /* current carry level */ ,
+		    carry_level * todo /* next carry level */ ,
+		    carry_insert_data * cdata	/* pointer to
+						 * cdata; becomes
+						 * op->u.insert.d in the
+						 * COPT_CHILD case */ ,
+		    coord_t * coord /* insertion/paste coord */ ,
+		    reiser4_item_data * data	/* data to be
+						 * inserted/pasted */ )
+{
+	assert("nikita-981", op != NULL);
+	assert("nikita-980", todo != NULL);
+	assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE) || (op->op == COP_EXTENT));
+
+	trace_stamp(TRACE_CARRY);
+
+	if (op->u.insert.type == COPT_PASTE_RESTARTED) {
+		/* nothing to do. Fall through to make_space(). */
+		;
+	} else if (op->u.insert.type == COPT_KEY) {
+		node_search_result intra_node;
+		znode *node;
+		/* Problem with doing batching at the lowest level, is that
+		   operations here are given by coords where modification is
+		   to be performed, and one modification can invalidate coords
+		   of all following operations.
+
+		   So, we are implementing yet another type for operation that
+		   will use (the only) "locator" stable across shifting of
+		   data between nodes, etc.: key (COPT_KEY).
+
+		   This clause resolves key to the coord in the node.
+
+		   But node can change also. Probably some pieces have to be
+		   added to the lock_carry_node(), to lock node by its key.
+
+		*/
+		/* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
+		   if you need something else. */
+		op->u.insert.d->coord = coord;
+		node = carry_real(op->node);
+		intra_node = node_plugin_by_node(node)->lookup
+		    (node, op->u.insert.d->key, FIND_EXACT, op->u.insert.d->coord);
+		/* any lookup outcome other than NS_FOUND / NS_NOT_FOUND is
+		   reported and returned to the caller as the error code */
+		if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
+			warning("nikita-1715", "Intra node lookup failure: %i", intra_node);
+			print_znode("node", node);
+			return intra_node;
+		}
+	} else if (op->u.insert.type == COPT_CHILD) {
+		/* if we are asked to insert pointer to the child into
+		   internal node, first convert pointer to the child into
+		   coord within parent node.
+		*/
+		znode *child;
+		int result;
+
+		op->u.insert.d = cdata;
+		op->u.insert.d->coord = coord;
+		op->u.insert.d->data = data;
+		op->u.insert.d->coord->node = carry_real(op->node);
+		result = find_new_child_coord(op);
+		child = carry_real(op->u.insert.child);
+		/* the pointer must not be present yet: only NS_NOT_FOUND is
+		   accepted here */
+		if (result != NS_NOT_FOUND) {
+			warning("nikita-993", "Cannot find a place for child pointer: %i", result);
+			print_znode("child", child);
+			print_znode("parent", carry_real(op->node));
+			return result;
+		}
+		/* This only happens when we did multiple insertions at
+		   the previous level, trying to insert single item and
+		   it so happened, that insertion of pointers to all new
+		   nodes before this one already caused parent node to
+		   split (may be several times).
+
+		   I am going to come up with better solution.
+
+		   You are not expected to understand this.
+		          -- v6root/usr/sys/ken/slp.c
+
+		   What the code below does: the coord found above lies in a
+		   node other than the one @op points to, so a new carry node
+		   for that node is queued before or after op->node (ordered
+		   by leftmost keys), locked, and made the target of @op.
+		*/
+		if (carry_real(op->node) != op->u.insert.d->coord->node) {
+			pool_ordering direction;
+			znode *z1;
+			znode *z2;
+			reiser4_key k1;
+			reiser4_key k2;
+
+			z1 = op->u.insert.d->coord->node;
+			z2 = carry_real(op->node);
+			if (keyle(leftmost_key_in_node(z1, &k1),
+				  leftmost_key_in_node(z2, &k2)))
+				direction = POOLO_BEFORE;
+			else
+				direction = POOLO_AFTER;
+
+			op->node = add_carry_skip(doing, direction, op->node);
+			if (IS_ERR(op->node))
+				return PTR_ERR(op->node);
+			op->node->node = op->u.insert.d->coord->node;
+			op->node->free = 1;
+			result = lock_carry_node(doing, op->node);
+			if (result != 0)
+				return result;
+		}
+
+		op->u.insert.d->key = UNDER_RW(dk, znode_get_tree(child), read,
+					       leftmost_key_in_node(child, znode_get_ld_key(child)));
+		op->u.insert.d->data->arg = op->u.insert.brother;
+	} else {	/* remaining types: coord is already supplied in op->u.insert.d */
+		assert("vs-243", op->u.insert.d->coord != NULL);
+		op->u.insert.d->coord->node = carry_real(op->node);
+	}
+
+	/* find free space. The result of make_space() is returned as is. */
+	return make_space(op, doing, todo);
+}
+
+/* handle carry COP_INSERT operation.
+
+   Insert new item into node.
New item can be given in one of two ways:
+
+   - by passing &tree_coord and &reiser4_item_data as part of @op. This is
+   only applicable at the leaf/twig level.
+
+   - by passing a child node pointer to which is to be inserted by this
+   operation.
+
+   Returns the error from insert_paste_common() if making space failed,
+   otherwise the result of the node plugin's ->create_item().
+
+*/
+static int
+carry_insert(carry_op * op /* operation to perform */ ,
+	     carry_level * doing	/* queue of operations @op
+					 * is part of */ ,
+	     carry_level * todo	/* queue where new operations
+					 * are accumulated */ )
+{
+	znode *node;
+	carry_insert_data cdata;
+	coord_t coord;
+	reiser4_item_data data;
+	carry_plugin_info info;
+	int result;
+
+	assert("nikita-1036", op != NULL);
+	assert("nikita-1037", todo != NULL);
+	assert("nikita-1038", op->op == COP_INSERT);
+
+	trace_stamp(TRACE_CARRY);
+	reiser4_stat_level_inc(doing, insert);
+
+	/* FIXME-VS: init_coord used to be here. insert_paste_common seems to
+	   use that zeroed coord */
+	coord_init_zero(&coord);
+
+	/* perform common functionality of insert and paste. */
+	result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
+	if (result != 0)
+		return result;
+
+	node = op->u.insert.d->coord->node;
+	assert("nikita-1039", node != NULL);
+	assert("nikita-1040", node_plugin_by_node(node) != NULL);
+
+	assert("nikita-949", space_needed_for_op(node, op) <= znode_free_space(node));
+
+	/* ask node layout to create new item. */
+	info.doing = doing;
+	info.todo = todo;
+	result = node_plugin_by_node(node)->create_item
+	    (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data, &info);
+	/* the level is marked non-restartable and the node dirty whatever
+	   ->create_item() returned */
+	doing->restartable = 0;
+	znode_make_dirty(node);
+
+	return result;
+}
+
+#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
+#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
+#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
+/* difference between what the item plugin's ->b.estimate() reports for an
+   insertion of the flow data and the plain data length; 0 when the plugin
+   has no ->b.estimate() method. */
+static size_t
+item_data_overhead(carry_op * op)
+{
+	if (flow_insert_data(op)->iplug->b.estimate == NULL)
+		return 0;
+	return (flow_insert_data(op)->iplug->b.estimate(NULL /* estimate insertion */, flow_insert_data(op)) -
+		flow_insert_data(op)->length);
+}
+
+/* FIXME-VS: this is called several times during one make_flow_for_insertion
+   and it will always return the same result. Some optimization could be made
+   by calculating this value once at the beginning and passing it around. That
+   would reduce some flexibility in future changes
+
+   flow_insertion_overhead() returns 0 when the flow can be pasted at the
+   insert point (can_paste()) or the node plugin has no ->item_overhead();
+   otherwise the node's item overhead plus item_data_overhead().
+*/
+static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
+static size_t
+flow_insertion_overhead(carry_op * op)
+{
+	znode *node;
+	size_t insertion_overhead;
+
+	node = flow_insert_point(op)->node;
+	insertion_overhead = 0;
+	if (node->nplug->item_overhead &&
+	    !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key, flow_insert_data(op)))
+		insertion_overhead = node->nplug->item_overhead(node, 0) + item_data_overhead(op);
+	return insertion_overhead;
+}
+
+/* how many bytes of flow fit into the insert point node: its free space
+   minus the insertion overhead, capped by the flow length; 0 when the free
+   space does not even cover the overhead */
+static int
+what_can_fit_into_node(carry_op * op)
+{
+	size_t free, overhead;
+
+	overhead = flow_insertion_overhead(op);
+	free = znode_free_space(flow_insert_point(op)->node);
+	if (free <= overhead)
+		return 0;
+	free -= overhead;
+	/* FIXME: flow->length is loff_t only to not get overflowed in case of expanding truncate */
+	if (free < op->u.insert_flow.flow->length)
+		return free;
+	return (int)op->u.insert_flow.flow->length;
+}
+
+/* in
make_space_for_flow_insertion we need to check either whether whole flow
+   fits into a node or whether minimal fraction of flow fits into a node */
+static int
+enough_space_for_whole_flow(carry_op * op)
+{
+	return (unsigned) what_can_fit_into_node(op) == op->u.insert_flow.flow->length;
+}
+
+#define MIN_FLOW_FRACTION 1	/* compared against what_can_fit_into_node() below */
+static int
+enough_space_for_min_flow_fraction(carry_op * op)
+{
+	assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
+
+	return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
+}
+
+/* this returns 0 if left neighbor was obtained successfully and everything
+   up to insertion point including it were shifted and left neighbor still has
+   some free space to put minimal fraction of flow into it; returns 1 in
+   every other case (including errors from find_left_neighbor()) */
+static int
+make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
+{
+	carry_node *left;
+	znode *orig;
+
+	left = find_left_neighbor(op, doing);
+	if (unlikely(IS_ERR(left))) {
+		warning("vs-899", "make_space_by_shift_left: " "error accessing left neighbor: %li", PTR_ERR(left));
+		return 1;
+	}
+	if (left == NULL)
+		/* left neighbor either does not exist or is unformatted
+		   node */
+		return 1;
+
+	orig = flow_insert_point(op)->node;
+	/* try to shift content of node @orig from its head up to insert point
+	   including insertion point into the left neighbor (the result of
+	   carry_shift_data() is not checked; success is judged by where the
+	   insert point ended up) */
+	carry_shift_data(LEFT_SIDE, flow_insert_point(op),
+			 carry_real(left), doing, todo, 1	/* including insert
+								 * point */);
+	if (carry_real(left) != flow_insert_point(op)->node) {
+		/* insertion point did not move */
+		return 1;
+	}
+
+	/* insertion point is set after last item in the node */
+	assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
+
+	if (!enough_space_for_min_flow_fraction(op)) {
+		/* insertion point node does not have enough free space to put
+		   even minimal portion of flow into it, therefore, move
+		   insertion point back to orig node (before first item) */
+		coord_init_before_first_item(flow_insert_point(op), orig);
+		return 1;
+	}
+
+	/* part of flow is to be written to the end of node */
+	op->node = left;
+	return 0;
+}
+
+/* this returns 0 if right neighbor was obtained successfully and everything to
+   the right of insertion point was shifted to it and node got enough free
+   space to put minimal fraction of flow into it; returns 1 otherwise */
+static int
+make_space_by_shift_right(carry_op * op, carry_level * doing, carry_level * todo)
+{
+	carry_node *right;
+
+	right = find_right_neighbor(op, doing);
+	if (unlikely(IS_ERR(right))) {
+		warning("nikita-1065", "shift_right_excluding_insert_point: "
+			"error accessing right neighbor: %li", PTR_ERR(right));
+		return 1;
+	}
+	if (right) {
+		/* shift everything possible on the right of but excluding
+		   insertion coord into the right neighbor */
+		carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
+				 carry_real(right), doing, todo, 0	/* not
+									 * including
+									 * insert
+									 * point */);
+	} else {
+		/* right neighbor either does not exist or is unformatted
+		   node */
+		;
+	}
+	if (coord_is_after_rightmost(flow_insert_point(op))) {
+		if (enough_space_for_min_flow_fraction(op)) {
+			/* part of flow is to be written to the end of node */
+			return 0;
+		}
+	}
+
+	/* new node is to be added if insert point node did not get enough
+	   space for whole flow */
+	return 1;
+}
+
+/* this returns 0 when insert coord is set at the node end and fraction of flow
+   fits into that node
+
+   At most two nodes are added per call; -E_NODE_FULL is returned once
+   op->u.insert_flow.new_nodes has reached CARRY_FLOW_NEW_NODES_LIMIT, and
+   errors from add_new_znode() / lock_carry_node() are passed through. */
+static int
+make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
+{
+	int result;
+	znode *node;
+	carry_node *new;
+
+	node = flow_insert_point(op)->node;
+
+	if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
+		return RETERR(-E_NODE_FULL);
+	/* add new node after insert point node */
+	new = add_new_znode(node, op->node, doing, todo);
+	if (unlikely(IS_ERR(new))) {
+		return PTR_ERR(new);
+	}
+	result = lock_carry_node(doing, new);
+	zput(carry_real(new));
+	if (unlikely(result)) {
+		return result;
+	}
+	op->u.insert_flow.new_nodes++;
+	if (!coord_is_after_rightmost(flow_insert_point(op))) {
+		carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
+				 carry_real(new), doing, todo, 0	/* not
+									 * including
+									 * insert
+									 * point */);
+
+		assert("vs-901", coord_is_after_rightmost(flow_insert_point(op)));
+
+		if (enough_space_for_min_flow_fraction(op)) {
+			return 0;
+		}
+		if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
+			return RETERR(-E_NODE_FULL);
+
+		/* add one more new node */
+		new = add_new_znode(node, op->node, doing, todo);
+		if (unlikely(IS_ERR(new))) {
+			return PTR_ERR(new);
+		}
+		result = lock_carry_node(doing, new);
+		zput(carry_real(new));
+		if (unlikely(result)) {
+			return result;
+		}
+		op->u.insert_flow.new_nodes++;
+	}
+
+	/* move insertion point to new node */
+	coord_init_before_first_item(flow_insert_point(op), carry_real(new));
+	op->node = new;
+	return 0;
+}
+
+static int
+make_space_for_flow_insertion(carry_op * op, carry_level * doing, carry_level * todo)
+{
+	if (enough_space_for_whole_flow(op)) {
+		/* whole flow fits into insert point node */
+		return 0;
+	}
+
+	if (make_space_by_shift_left(op, doing, todo) == 0) {
+		/* insert point is shifted to left neighbor of original insert
+		   point node and is set after last unit in that node. It has
+		   enough space to fit at least minimal fraction of flow. */
+		return 0;
+	}
+
+	if (enough_space_for_whole_flow(op)) {
+		/* whole flow fits into insert point node */
+		return 0;
+	}
+
+	if (make_space_by_shift_right(op, doing, todo) == 0) {
+		/* insert point is still set to the same node, but there is
+		   nothing to the right of insert point.
*/ + return 0; + } + + if (enough_space_for_whole_flow(op)) { + /* whole flow fits into insert point node */ + return 0; + } + + return make_space_by_new_nodes(op, doing, todo); +} + +/* implements COP_INSERT_FLOW operation */ +static int +carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo) +{ + int result; + flow_t *f; + coord_t *insert_point; + node_plugin *nplug; + int something_written; + carry_plugin_info info; + znode *orig_node; + lock_handle *orig_lh; + + f = op->u.insert_flow.flow; + result = 0; + + /* this flag is used to distinguish a need to have carry to propagate + leaf level modifications up in the tree when make_space fails not in + first iteration of the loop below */ + something_written = 0; + + /* carry system needs this to work */ + info.doing = doing; + info.todo = todo; + + orig_node = flow_insert_point(op)->node; + orig_lh = doing->tracked; + + while (f->length) { + result = make_space_for_flow_insertion(op, doing, todo); + if (result) + break; + + insert_point = flow_insert_point(op); + nplug = node_plugin_by_node(insert_point->node); + + /* compose item data for insertion/pasting */ + flow_insert_data(op)->data = f->data; + flow_insert_data(op)->length = what_can_fit_into_node(op); + + if (can_paste(insert_point, &f->key, flow_insert_data(op))) { + /* insert point is set to item of file we are writing to and we have to append to it */ + assert("vs-903", insert_point->between == AFTER_UNIT); + nplug->change_item_size(insert_point, flow_insert_data(op)->length); + flow_insert_data(op)->iplug->b.paste(insert_point, flow_insert_data(op), &info); + } else { + /* new item must be inserted */ + pos_in_node_t new_pos; + flow_insert_data(op)->length += item_data_overhead(op); + + /* FIXME-VS: this is because node40_create_item changes + insert_point for obscure reasons */ + switch (insert_point->between) { + case AFTER_ITEM: + new_pos = insert_point->item_pos + 1; + break; + case EMPTY_NODE: + new_pos = 0; + break; + case 
BEFORE_ITEM: + assert("vs-905", insert_point->item_pos == 0); + new_pos = 0; + break; + default: + impossible("vs-906", "carry_insert_flow: invalid coord"); + new_pos = 0; + break; + } + + nplug->create_item(insert_point, &f->key, flow_insert_data(op), &info); + coord_set_item_pos(insert_point, new_pos); + } + coord_init_after_item_end(insert_point); + doing->restartable = 0; + znode_make_dirty(insert_point->node); + + move_flow_forward(f, (unsigned) flow_insert_data(op)->length); + something_written = 1; + } + + if (orig_node != flow_insert_point(op)->node) { + /* move lock to new insert point */ + done_lh(orig_lh); + init_lh(orig_lh); + result = longterm_lock_znode(orig_lh, flow_insert_point(op)->node, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI); + } + + return result; +} + +/* implements COP_DELETE operation + + Remove pointer to @op -> u.delete.child from it's parent. + + This function also handles killing of a tree root is last pointer from it + was removed. This is complicated by our handling of "twig" level: root on + twig level is never killed. + +*/ +static int +carry_delete(carry_op * op /* operation to be performed */ , + carry_level * doing UNUSED_ARG /* current carry + * level */ , + carry_level * todo /* next carry level */ ) +{ + int result; + coord_t coord; + coord_t coord2; + znode *parent; + znode *child; + carry_plugin_info info; + reiser4_tree *tree; + + assert("nikita-893", op != NULL); + assert("nikita-894", todo != NULL); + assert("nikita-895", op->op == COP_DELETE); + trace_stamp(TRACE_CARRY); + reiser4_stat_level_inc(doing, delete); + + coord_init_zero(&coord); + coord_init_zero(&coord2); + + parent = carry_real(op->node); + child = op->u.delete.child ? + carry_real(op->u.delete.child) : op->node->node; + tree = znode_get_tree(child); + RLOCK_TREE(tree); + if (znode_parent(child) != parent) { + /* NOTE-NIKITA add stat counter for this. 
*/ + parent = znode_parent(child); + assert("nikita-2581", find_carry_node(doing, parent)); + } + RUNLOCK_TREE(tree); + + assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL); + + /* Twig level horrors: tree should be of height at least 2. So, last + pointer from the root at twig level is preserved even if child is + empty. This is ugly, but so it was architectured. + */ + + if (znode_is_root(parent) && + (znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT) && (node_num_items(parent) == 1)) { + /* Delimiting key manipulations. */ + WLOCK_DK(tree); + znode_set_ld_key(child, znode_set_ld_key(parent, min_key())); + znode_set_rd_key(child, znode_set_rd_key(parent, max_key())); + WUNLOCK_DK(tree); + + /* @child escaped imminent death! */ + ZF_CLR(child, JNODE_HEARD_BANSHEE); + return 0; + } + + /* convert child pointer to the coord_t */ + result = find_child_ptr(parent, child, &coord); + if (result != NS_FOUND) { + warning("nikita-994", "Cannot find child pointer: %i", result); + print_znode("child", child); + print_znode("parent", parent); + print_coord_content("coord", &coord); + return result; + } + + coord_dup(&coord2, &coord); + info.doing = doing; + info.todo = todo; + { + struct carry_kill_data kdata; + kdata.params.from = &coord; + kdata.params.to = &coord2; + kdata.params.from_key = NULL; + kdata.params.to_key = NULL; + kdata.params.smallest_removed = NULL; + kdata.flags = op->u.delete.flags; + kdata.inode = 0; + kdata.left = 0; + kdata.right = 0; + result = node_plugin_by_node(parent)->cut_and_kill(&kdata, &info); + } + doing->restartable = 0; + + /* check whether root should be killed violently */ + if (znode_is_root(parent) && + /* don't kill roots at and lower than twig level */ + (znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT) && (node_num_items(parent) == 1)) { + result = kill_tree_root(coord.node); + } + + return result < 0 ? : 0; +} + +/* implements COP_CUT opration + + Cuts part or whole content of node. 
+ +*/ +static int +carry_cut(carry_op * op /* operation to be performed */ , + carry_level * doing /* current carry level */ , + carry_level * todo /* next carry level */ ) +{ + int result; + carry_plugin_info info; + + assert("nikita-896", op != NULL); + assert("nikita-897", todo != NULL); + assert("nikita-898", op->op == COP_CUT); + trace_stamp(TRACE_CARRY); + reiser4_stat_level_inc(doing, cut); + + info.doing = doing; + info.todo = todo; + + if (op->u.cut_or_kill.is_cut) + result = node_plugin_by_node(carry_real(op->node))->cut(op->u.cut_or_kill.u.cut, &info); + else + result = node_plugin_by_node(carry_real(op->node))->cut_and_kill(op->u.cut_or_kill.u.kill, &info); + + doing->restartable = 0; + return result < 0 ? : 0; +} + +/* helper function for carry_paste(): returns true if @op can be continued as + paste */ +static int +can_paste(coord_t * icoord, const reiser4_key * key, const reiser4_item_data * data) +{ + coord_t circa; + item_plugin *new_iplug; + item_plugin *old_iplug; + int result = 0; /* to keep gcc shut */ + + assert("", icoord->between != AT_UNIT); + + /* obviously, one cannot paste when node is empty---there is nothing + to paste into. 
*/ + if (node_is_empty(icoord->node)) + return 0; + /* if insertion point is at the middle of the item, then paste */ + if (!coord_is_between_items(icoord)) + return 1; + coord_dup(&circa, icoord); + circa.between = AT_UNIT; + + old_iplug = item_plugin_by_coord(&circa); + new_iplug = data->iplug; + + /* check whether we can paste to the item @icoord is "at" when we + ignore ->between field */ + if ((old_iplug == new_iplug) && item_can_contain_key(&circa, key, data)) { + result = 1; + } else if ((icoord->between == BEFORE_UNIT) || (icoord->between == BEFORE_ITEM)) { + /* otherwise, try to glue to the item at the left, if any */ + coord_dup(&circa, icoord); + if (coord_set_to_left(&circa)) { + result = 0; + coord_init_before_item(icoord); + } else { + old_iplug = item_plugin_by_coord(&circa); + result = (old_iplug == new_iplug) && item_can_contain_key(icoord, key, data); + if (result) { + coord_dup(icoord, &circa); + icoord->between = AFTER_UNIT; + } + } + } else if ((icoord->between == AFTER_UNIT) || (icoord->between == AFTER_ITEM)) { + coord_dup(&circa, icoord); + /* otherwise, try to glue to the item at the right, if any */ + if (coord_set_to_right(&circa)) { + result = 0; + coord_init_after_item(icoord); + } else { + int (*cck) (const coord_t *, const reiser4_key *, const reiser4_item_data *); + + old_iplug = item_plugin_by_coord(&circa); + + cck = old_iplug->b.can_contain_key; + if (cck == NULL) + /* item doesn't define ->can_contain_key + method? So it is not expandable. 
*/ + result = 0; + else { + result = (old_iplug == new_iplug) && cck(&circa /*icoord */ , key, data); + if (result) { + coord_dup(icoord, &circa); + icoord->between = BEFORE_UNIT; + } + } + } + } else + impossible("nikita-2513", "Nothing works"); + if (result) { + if (icoord->between == BEFORE_ITEM) { + assert("vs-912", icoord->unit_pos == 0); + icoord->between = BEFORE_UNIT; + } else if (icoord->between == AFTER_ITEM) { + coord_init_after_item_end(icoord); + } + } + return result; +} + +/* implements COP_PASTE operation + + Paste data into existing item. This is complicated by the fact that after + we shifted something to the left or right neighbors trying to free some + space, item we were supposed to paste into can be in different node than + insertion coord. If so, we are no longer doing paste, but insert. See + comments in insert_paste_common(). + +*/ +static int +carry_paste(carry_op * op /* operation to be performed */ , + carry_level * doing UNUSED_ARG /* current carry + * level */ , + carry_level * todo /* next carry level */ ) +{ + znode *node; + carry_insert_data cdata; + coord_t dcoord; + reiser4_item_data data; + int result; + int real_size; + item_plugin *iplug; + carry_plugin_info info; + coord_t *coord; + + assert("nikita-982", op != NULL); + assert("nikita-983", todo != NULL); + assert("nikita-984", op->op == COP_PASTE); + + trace_stamp(TRACE_CARRY); + reiser4_stat_level_inc(doing, paste); + + coord_init_zero(&dcoord); + + result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data); + if (result != 0) + return result; + + coord = op->u.insert.d->coord; + + /* handle case when op -> u.insert.coord doesn't point to the item + of required type. restart as insert. 
*/ + if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) { + op->op = COP_INSERT; + op->u.insert.type = COPT_PASTE_RESTARTED; + reiser4_stat_level_inc(doing, paste_restarted); + result = op_dispatch_table[COP_INSERT].handler(op, doing, todo); + + return result; + } + + node = coord->node; + iplug = item_plugin_by_coord(coord); + assert("nikita-992", iplug != NULL); + + assert("nikita-985", node != NULL); + assert("nikita-986", node_plugin_by_node(node) != NULL); + + assert("nikita-987", space_needed_for_op(node, op) <= znode_free_space(node)); + + assert("nikita-1286", coord_is_existing_item(coord)); + + real_size = space_needed_for_op(node, op); + if (real_size > 0) + node->nplug->change_item_size(coord, real_size); + + doing->restartable = 0; + info.doing = doing; + info.todo = todo; + + result = iplug->b.paste(coord, op->u.insert.d->data, &info); + + if (real_size < 0) + node->nplug->change_item_size(coord, real_size); + + /* if we pasted at the beginning of the item, update item's key. */ + if (coord->unit_pos == 0 && coord->between != AFTER_UNIT) + node->nplug->update_item_key(coord, op->u.insert.d->key, &info); + + znode_make_dirty(node); + return result; +} + +/* handle carry COP_EXTENT operation. */ +static int +carry_extent(carry_op * op /* operation to perform */ , + carry_level * doing /* queue of operations @op + * is part of */ , + carry_level * todo /* queue where new operations + * are accumulated */ ) +{ + znode *node; + carry_insert_data cdata; + coord_t coord; + reiser4_item_data data; + carry_op *delete_dummy; + carry_op *insert_extent; + int result; + carry_plugin_info info; + + assert("nikita-1751", op != NULL); + assert("nikita-1752", todo != NULL); + assert("nikita-1753", op->op == COP_EXTENT); + + trace_stamp(TRACE_CARRY); + reiser4_stat_level_inc(doing, extent); + + /* extent insertion overview: + + extents live on the TWIG LEVEL, which is level one above the leaf + one. 
This complicates extent insertion logic somewhat: it may + happen (and going to happen all the time) that in logical key + ordering extent has to be placed between items I1 and I2, located + at the leaf level, but I1 and I2 are in the same formatted leaf + node N1. To insert extent one has to + + (1) reach node N1 and shift data between N1, its neighbors and + possibly newly allocated nodes until I1 and I2 fall into different + nodes. Since I1 and I2 are still neighboring items in logical key + order, they will be necessary utmost items in their respective + nodes. + + (2) After this new extent item is inserted into node on the twig + level. + + Fortunately this process can reuse almost all code from standard + insertion procedure (viz. make_space() and insert_paste_common()), + due to the following observation: make_space() only shifts data up + to and excluding or including insertion point. It never + "over-moves" through insertion point. Thus, one can use + make_space() to perform step (1). All required for this is just to + instruct free_space_shortage() to keep make_space() shifting data + until insertion point is at the node border. + + */ + + /* perform common functionality of insert and paste. */ + result = insert_paste_common(op, doing, todo, &cdata, &coord, &data); + if (result != 0) + return result; + + node = op->u.extent.d->coord->node; + assert("nikita-1754", node != NULL); + assert("nikita-1755", node_plugin_by_node(node) != NULL); + assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE); + /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that + extent fits between items. */ + + info.doing = doing; + info.todo = todo; + + /* there is another complication due to placement of extents on the + twig level: extents are "rigid" in the sense that key-range + occupied by extent cannot grow indefinitely to the right as it is + for the formatted leaf nodes. 
Because of this when search finds two + adjacent extents on the twig level, it has to "drill" to the leaf + level, creating new node. Here we are removing this node. + */ + if (node_is_empty(node)) { + delete_dummy = node_post_carry(&info, COP_DELETE, node, 1); + if (IS_ERR(delete_dummy)) + return PTR_ERR(delete_dummy); + delete_dummy->u.delete.child = NULL; + delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY; + ZF_SET(node, JNODE_HEARD_BANSHEE); + } + + /* proceed with inserting extent item into parent. We are definitely + inserting rather than pasting if we get that far. */ + insert_extent = node_post_carry(&info, COP_INSERT, node, 1); + if (IS_ERR(insert_extent)) + /* @delete_dummy will be automatically destroyed on the level + exiting */ + return PTR_ERR(insert_extent); + /* NOTE-NIKITA insertion by key is simplest option here. Another + possibility is to insert on the left or right of already existing + item. + */ + insert_extent->u.insert.type = COPT_KEY; + insert_extent->u.insert.d = op->u.extent.d; + assert("nikita-1719", op->u.extent.d->key != NULL); + insert_extent->u.insert.d->data->arg = op->u.extent.d->coord; + insert_extent->u.insert.flags = znode_get_tree(node)->carry.new_extent_flags; + + /* + * if carry was asked to track lock handle we should actually track + * lock handle on the twig node rather than on the leaf where + * operation was started from. Transfer tracked lock handle. + */ + if (doing->track_type) { + assert("nikita-3242", doing->tracked != NULL); + assert("nikita-3244", todo->tracked == NULL); + todo->tracked = doing->tracked; + todo->track_type = CARRY_TRACK_NODE; + doing->tracked = NULL; + doing->track_type = 0; + } + + return 0; +} + +/* update key in @parent between pointers to @left and @right. + + Find coords of @left and @right and update delimiting key between them. 
+ +*/ +static int +update_delimiting_key(znode * parent /* node key is updated + * in */ , + znode * left /* child of @parent */ , + znode * right /* child of @parent */ , + carry_level * doing /* current carry + * level */ , + carry_level * todo /* parent carry + * level */ , + const char **error_msg /* place to + * store error + * message */ ) +{ + coord_t left_pos; + coord_t right_pos; + int result; + reiser4_key ldkey; + carry_plugin_info info; + + assert("nikita-1177", right != NULL); + /* find position of the right child in the parent */ + result = find_child_ptr(parent, right, &right_pos); + if (result != NS_FOUND) { + *error_msg = "Cannot find position of right child"; + return result; + } + + if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) { + /* find position of the left child in a parent */ + result = find_child_ptr(parent, left, &left_pos); + if (result != NS_FOUND) { + *error_msg = "Cannot find position of left child"; + return result; + } + assert("nikita-1355", left_pos.node != NULL); + } else /* no left position to look at: a NULL node makes the sanity checks below skip the left side */ + left_pos.node = NULL; + + /* check that they are separated by exactly one key and are basically + sane */ + if (REISER4_DEBUG) { + if ((left_pos.node != NULL) + && !coord_is_existing_unit(&left_pos)) { + *error_msg = "Left child is bastard"; + return RETERR(-EIO); + } + if (!coord_is_existing_unit(&right_pos)) { + *error_msg = "Right child is bastard"; + return RETERR(-EIO); + } + if ((left_pos.node != NULL) && !coord_are_neighbors(&left_pos, &right_pos)) { + *error_msg = "Children are not direct siblings"; + return RETERR(-EIO); + } + } + *error_msg = NULL; /* every path from here on returns 0, so there is no message to report */ + + info.doing = doing; + info.todo = todo; + if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE)) /* new key is the leftmost key found in @right, unless JNODE_HEARD_BANSHEE is set on it, in which case its right delimiting key is used */ + leftmost_key_in_node(right, &ldkey); + else + UNDER_RW_VOID(dk, znode_get_tree(parent), read, + ldkey = *znode_get_rd_key(right)); + node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info); + doing->restartable = 0; + znode_make_dirty(parent); + return 0; +} + +/* implements COP_UPDATE
opration + + Update delimiting keys. + +*/ +static int +carry_update(carry_op * op /* operation to be performed */ , + carry_level * doing /* current carry level */ , + carry_level * todo /* next carry level */ ) +{ + int result; + carry_node *missing UNUSED_ARG; + znode *left; + znode *right; + carry_node *lchild; + carry_node *rchild; + const char *error_msg; + reiser4_tree *tree; + + assert("nikita-902", op != NULL); + assert("nikita-903", todo != NULL); + assert("nikita-904", op->op == COP_UPDATE); + trace_stamp(TRACE_CARRY); + reiser4_stat_level_inc(doing, update); + + lchild = op->u.update.left; + rchild = op->node; + + if (lchild != NULL) { + assert("nikita-1001", lchild->parent); + assert("nikita-1003", !lchild->left); + left = carry_real(lchild); + } else + left = NULL; + + tree = znode_get_tree(rchild->node); + RLOCK_TREE(tree); + right = znode_parent(rchild->node); + if (REISER4_STATS) { + znode *old_right; + if (rchild != NULL) { + assert("nikita-1000", rchild->parent); + assert("nikita-1002", !rchild->left); + old_right = carry_real(rchild); + } else + old_right = NULL; + if (znode_parent(rchild->node) != old_right) + /* parent node was split, and pointer to @rchild was + inserted/moved into new node. Wonders of balkancing + (sic.). + */ + reiser4_stat_level_inc(doing, half_split_race); + } + RUNLOCK_TREE(tree); + + if (right != NULL) { + result = update_delimiting_key(right, + lchild ? lchild->node : NULL, rchild->node, doing, todo, &error_msg); + } else { + error_msg = "Cannot find node to update key in"; + result = RETERR(-EIO); + } + /* operation will be reposted to the next level by the + ->update_item_key() method of node plugin, if necessary. */ + + if (result != 0) { + warning("nikita-999", "Error updating delimiting key: %s (%i)", error_msg ? : "", result); + print_znode("left", left); + print_znode("right", right); + print_znode("lchild", lchild ? 
lchild->node : NULL); + print_znode("rchild", rchild->node); + } + return result; +} + +/* implements COP_MODIFY operation + + Notify parent about changes in its child + +*/ +static int +carry_modify(carry_op * op /* operation to be performed */ , + carry_level * doing UNUSED_ARG /* current carry + * level */ , + carry_level * todo UNUSED_ARG /* next carry level */ ) +{ + znode *node; + + assert("nikita-905", op != NULL); + assert("nikita-906", todo != NULL); + assert("nikita-907", op->op == COP_MODIFY); + trace_stamp(TRACE_CARRY); + reiser4_stat_level_inc(doing, modify); + + node = carry_real(op->node); + assert("nikita-995", node != NULL); /* unless MODIFY_EXISTS is defined, nothing else happens and 0 is returned */ +#ifdef MODIFY_EXISTS + if (node_plugin_by_node(node)->modify != NULL) + return node_plugin_by_node(node)->modify(node, op->u.modify.child->real_node, op->u.modify.flag, todo); + else +#endif + return 0; +} + +/* move items out of the node @insert_coord is in, into @node, during carry */ +static int +carry_shift_data(sideof side /* in what direction to move data */ , + coord_t * insert_coord /* coord where new item + * is to be inserted */ , + znode * node /* node which data are moved to */ , + carry_level * doing /* active carry queue */ , + carry_level * todo /* carry queue where new + * operations are to be put + * in */ , + unsigned int including_insert_coord_p /* true if + * @insert_coord + * can be moved */ ) +{ + int result; + znode *source; + carry_plugin_info info; + node_plugin *nplug; + + source = insert_coord->node; + + info.doing = doing; + info.todo = todo; + + nplug = node_plugin_by_node(node); + result = nplug->shift(insert_coord, node, + (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0, + (int) including_insert_coord_p, &info); + /* the only error ->shift() method of node plugin can return is + -ENOMEM due to carry node/operation allocation.
*/ + assert("nikita-915", result >= 0 || result == -ENOMEM); + if (result > 0) { + doing->restartable = 0; + znode_make_dirty(source); + znode_make_dirty(node); + } + + ON_DEBUG_MODIFY(znode_post_write(node)); + + assert("nikita-2077", coord_check(insert_coord)); + return 0; +} + +typedef carry_node *(*carry_iterator) (carry_node * node); +static carry_node *find_dir_carry(carry_node * node, carry_level * level, carry_iterator iterator); + +/* look for the left neighbor of given carry node in a carry queue. + + This is used by find_left_neighbor(), but I am not sure that this + really gives any advantage. More statistics required. + +*/ +reiser4_internal carry_node * +find_left_carry(carry_node * node /* node to fine left neighbor + * of */ , + carry_level * level /* level to scan */ ) +{ + return find_dir_carry(node, level, (carry_iterator) pool_level_list_prev); +} + +/* look for the right neighbor of given carry node in a + carry queue. + + This is used by find_right_neighbor(), but I am not sure that this + really gives any advantage. More statistics required. + +*/ +reiser4_internal carry_node * +find_right_carry(carry_node * node /* node to fine right neighbor + * of */ , + carry_level * level /* level to scan */ ) +{ + return find_dir_carry(node, level, (carry_iterator) pool_level_list_next); +} + +/* look for the left or right neighbor of given carry node in a carry + queue. + + Helper function used by find_{left|right}_carry(). +*/ +static carry_node * +find_dir_carry(carry_node * node /* node to start scanning + * from */ , + carry_level * level /* level to scan */ , + carry_iterator iterator /* operation to + * move to the next + * node */ ) +{ + carry_node *neighbor; + + assert("nikita-1059", node != NULL); + assert("nikita-1060", level != NULL); + + /* scan list of carry nodes on this list dir-ward, skipping all + carry nodes referencing the same znode. 
*/ + neighbor = node; + while (1) { + neighbor = iterator(neighbor); + if (pool_level_list_end(&level->nodes, &neighbor->header)) + return NULL; + if (carry_real(neighbor) != carry_real(node)) + return neighbor; + } +} + +#define TREE_HEIGHT_CAP (5) + +static int +cap_tree_height(reiser4_tree * tree) +{ + return tree->height >= TREE_HEIGHT_CAP ? TREE_HEIGHT_CAP : tree->height; +} + +static int capped_height(void) +{ + return cap_tree_height(current_tree); +} + +static int bytes_to_pages(int bytes) +{ + return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +static int +carry_estimate_znodes(void) +{ + return bytes_to_pages(capped_height() * sizeof(znode) * 3); +} + +static int +carry_estimate_bitmaps(void) +{ + if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) { + int bytes; + + bytes = capped_height() * + (0 + /* bnode should be added, but its is private to + * bitmap.c, skip for now. */ + 2 * sizeof(jnode)); /* working and commit jnodes */ + return bytes_to_pages(bytes) + 2; /* and their contents */ + } else + /* bitmaps were pre-loaded during mount */ + return 0; +} + +static int +carry_estimate_insert(carry_op * op, carry_level * level) +{ + return + carry_estimate_bitmaps() + + carry_estimate_znodes() + + 1 + /* new atom */ + capped_height() + /* new block on each level */ + 1 + /* and possibly extra new block at the leaf level */ + 3; /* loading of leaves into memory */ +} + +static int +carry_estimate_delete(carry_op * op, carry_level * level) +{ + return + carry_estimate_bitmaps() + + carry_estimate_znodes() + + 1 + /* new atom */ + 3; /* loading of leaves into memory */ +} + +static int +carry_estimate_cut(carry_op * op, carry_level * level) +{ + return + carry_estimate_bitmaps() + + carry_estimate_znodes() + + 1 + /* new atom */ + 3; /* loading of leaves into memory */ +} + +static int +carry_estimate_paste(carry_op * op, carry_level * level) +{ + return + carry_estimate_bitmaps() + + carry_estimate_znodes() + + 1 + /* new 
atom */ + capped_height() + /* new block on each level */ + 1 + /* and possibly extra new block at the leaf level */ + 3; /* loading of leaves into memory */ +} + +static int +carry_estimate_extent(carry_op * op, carry_level * level) +{ + return + carry_estimate_insert(op, level) + /* insert extent */ + carry_estimate_delete(op, level); /* kill leaf */ +} + +static int +carry_estimate_update(carry_op * op, carry_level * level) +{ + return 0; +} + +static int +carry_estimate_modify(carry_op * op, carry_level * level) +{ + return 0; +} + +static int +carry_estimate_insert_flow(carry_op * op, carry_level * level) +{ + int newnodes; + + newnodes = bytes_to_pages(op->u.insert_flow.flow->length); + /* + * roughly estimate insert_flow as a sequence of insertions. + */ + return newnodes * carry_estimate_insert(op, level); +} + +/* This is dispatch table for carry operations. It can be trivially + abstracted into useful plugin: tunable balancing policy is a good + thing. */ +reiser4_internal carry_op_handler op_dispatch_table[COP_LAST_OP] = { + [COP_INSERT] = { + .handler = carry_insert, + .estimate = carry_estimate_insert + }, + [COP_DELETE] = { + .handler = carry_delete, + .estimate = carry_estimate_delete + }, + [COP_CUT] = { + .handler = carry_cut, + .estimate = carry_estimate_cut + }, + [COP_PASTE] = { + .handler = carry_paste, + .estimate = carry_estimate_paste + }, + [COP_EXTENT] = { + .handler = carry_extent, + .estimate = carry_estimate_extent + }, + [COP_UPDATE] = { + .handler = carry_update, + .estimate = carry_estimate_update + }, + [COP_MODIFY] = { + .handler = carry_modify, + .estimate = carry_estimate_modify + }, + [COP_INSERT_FLOW] = { + .handler = carry_insert_flow, + .estimate = carry_estimate_insert_flow + } +}; + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/carry_ops.h linux-2.6.4-ck1/fs/reiser4/carry_ops.h --- linux-2.6.4/fs/reiser4/carry_ops.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/carry_ops.h 2004-03-11 22:45:15.188525912 +1100 @@ -0,0 +1,41 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* implementation of carry operations */ + +#if !defined( __CARRY_OPS_H__ ) +#define __CARRY_OPS_H__ + +#include "forward.h" +#include "znode.h" +#include "carry.h" + +/* carry operation handlers */ +typedef struct carry_op_handler { + /* perform operation */ + int (*handler) (carry_op * op, carry_level * doing, carry_level * todo); + /* estimate memory requirements for @op */ + int (*estimate) (carry_op * op, carry_level * level); +} carry_op_handler; + +/* This is dispatch table for carry operations. It can be trivially + abstracted into useful plugin: tunable balancing policy is a good + thing. */ +extern carry_op_handler op_dispatch_table[COP_LAST_OP]; + +unsigned int space_needed(const znode * node, const coord_t * coord, const reiser4_item_data * data, int inserting); +extern carry_node *find_left_carry(carry_node * node, carry_level * level); +extern carry_node *find_right_carry(carry_node * node, carry_level * level); + +/* __CARRY_OPS_H__ */ +#endif + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/cluster.c linux-2.6.4-ck1/fs/reiser4/cluster.c --- linux-2.6.4/fs/reiser4/cluster.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/cluster.c 2004-03-11 22:45:15.189525756 +1100 @@ -0,0 +1,71 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Contains cluster operations for cryptcompress object plugin (see + http://www.namesys.com/cryptcompress_design.txt for details). */ + +/* Concepts of clustering. Definition of cluster size. + Data clusters, page clusters, disk clusters. + + + In order to compress plain text we first should split it into chunks. + Then we process each chunk independently by the following function: + + void alg(char *input_ptr, int input_length, char *output_ptr, int *output_length); + + where: + input_ptr is a pointer to the first byte of input chunk (that contains plain text), + input_len is a length of input chunk, + output_ptr is a pointer to the first byte of output chunk (that contains processed text), + *output_len is a length of output chunk. + + the length of output chunk depends both on input_len and on the content of + input chunk. input_len (which can be assigned an arbitrary value) affects the + compression quality (the more input_len the better the compression quality). + For each cryptcompress file we assign special attribute - cluster size: + + Cluster size is a file attribute, which determines the maximal size + of input chunk that we use for compression. + + So if we wanna compress a 10K-file with a cluster size of 4K, we split this file + into three chunks (first and second - 4K, third - 2K). Those chunks are + clusters in the space of file offsets (data clusters). + + Cluster sizes are represented as (PAGE_CACHE_SIZE << shift), where + shift (= 0, 1, 2,... ). 
You'll note that this representation
+   affects the allowed values for cluster size. This is stored in
+   disk stat-data (CLUSTER_STAT, layout is in reiser4_cluster_stat; see
+   plugin/item/static_stat.h for details).
+   Note that working with
+   cluster_size > PAGE_SIZE (when cluster_shift > 0, and cluster contains more
+   than one page) is suboptimal because before compression we should assemble
+   all cluster pages into one flow (this means superfluous memcpy during
+   read/write). So the better way to increase cluster size (and therefore
+   compression quality) is making PAGE_SIZE larger (for instance by page
+   clustering stuff of William Lee). But if you need PAGE_SIZE < cluster_size,
+   then use the page clustering offered by reiser4.
+
+   The inode mapping of a cryptcompress file contains pages filled by plain text.
+   Cluster size also defines clustering in address space. For example,
+   101K-file with cluster size 16K (cluster shift = 2), which can be mapped
+   into 26 pages, has 7 "page clusters": the first six clusters contain 4 pages each
+   and one cluster contains 2 pages (for the file tail).
+
+   We split each output (compressed) chunk into special items to provide
+   tight packing of data on disk (currently only ctails hold compressed data).
+   This set of items we call a "disk cluster".
+
+   Each cluster is defined (like pages are) by its index (i.e. offset,
+   but the unit is cluster size instead of PAGE_SIZE). Key offset of
+   the first unit of the first item of each disk cluster (we call this a
+   "key of disk cluster") is a multiple of the cluster index.
+
+   All read/write/truncate operations are performed upon clusters.
+   For example, if we want to read 40K of a cryptcompress file with cluster size 16K
+   from offset = 20K, we first need to read three clusters (of indexes 1, 2, 3). This
+   means that all main methods of cryptcompress object plugin call appropriate
+   cluster operation.
+
+   For the same index we use one structure (type reiser4_cluster_t) to
+   represent all data/page/disk clusters. (EDWARD-FIXME-HANS: are you
+   sure that is good style? and where is the code that goes with this comment....;-) )
+*/
diff -Naurp linux-2.6.4/fs/reiser4/compress.c linux-2.6.4-ck1/fs/reiser4/compress.c
--- linux-2.6.4/fs/reiser4/compress.c 1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/compress.c 2004-03-11 22:45:15.189525756 +1100
@@ -0,0 +1,42 @@
+/* compression plugins */
+
+
+#include "debug.h"
+#include "plugin/plugin.h"
+#include "plugin/cryptcompress.h"
+#include <linux/types.h>	/* for __u8; NOTE(review): header name is presumed, confirm against upstream */
+
+/* "None" transform: copy @src_len bytes from @src_first to @dst_first and
+   report the same length through @dst_len. @buf is only asserted to be
+   non-NULL and is not otherwise used. Serves below as both the ->compress
+   and the ->decompress method of the "none" plugin. */
+static void compress_none(__u8 *buf, __u8 *src_first, unsigned src_len,
+			  __u8 *dst_first, unsigned *dst_len)
+{
+	assert("edward-17", buf != NULL);
+	assert("edward-18", src_first != NULL);
+	assert("edward-19", src_len != 0);
+	assert("edward-20", dst_first != NULL);
+	assert("edward-21", dst_len != NULL);
+
+	*dst_len = src_len;
+	memcpy(dst_first, src_first, src_len);
+}
+
+/* table of compression plugins, indexed by compression id; only the "none" plugin is defined here */
+compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
+	[NONE_COMPRESSION_ID] = {
+		.h = {
+			.type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
+			.id = NONE_COMPRESSION_ID,
+			.pops = NULL,
+			.label = "none",
+			.desc = "Null compression",
+			.linkage = TYPE_SAFE_LIST_LINK_ZERO
+		},
+		.mem_req = MIN_CLUSTER_SIZE,
+		.overrun = 0,
+		.compress = compress_none,
+		.decompress = compress_none
+	}
+};
diff -Naurp linux-2.6.4/fs/reiser4/context.c linux-2.6.4-ck1/fs/reiser4/context.c
--- linux-2.6.4/fs/reiser4/context.c 1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/context.c 2004-03-11 22:45:15.190525601 +1100
@@ -0,0 +1,308 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Manipulation of reiser4_context */
+
+#include "debug.h"
+#include "super.h"
+#include "context.h"
+
+#include <linux/writeback.h>	/* balance_dirty_pages() */
+
+#if REISER4_DEBUG
+/* List of all currently active contexts, used for debugging purposes.
*/ +context_list_head active_contexts; +/* lock protecting access to active_contexts. */ +spinlock_t active_contexts_lock; + +void +check_contexts(void) +{ + reiser4_context *ctx; + + spin_lock(&active_contexts_lock); + for_all_type_safe_list(context, &active_contexts, ctx) { + assert("", ctx->magic == context_magic); + } + spin_unlock(&active_contexts_lock); +} + +#endif + +struct { + void *task; + void *context; + void *path[16]; +} context_ok; + + + +reiser4_internal void get_context_ok(reiser4_context *ctx) +{ + int i; + void *addr = NULL, *frame = NULL; + +#define CTX_FRAME(nr) \ + case (nr): \ + addr = __builtin_return_address((nr)); \ + frame = __builtin_frame_address(nr); \ + break + + memset(&context_ok, 0, sizeof(context_ok)); + + context_ok.task = current; + context_ok.context = ctx; + for (i = 0; i < 16; i ++) { + switch(i) { + CTX_FRAME(0); + CTX_FRAME(1); + CTX_FRAME(2); + CTX_FRAME(3); + CTX_FRAME(4); + CTX_FRAME(5); + CTX_FRAME(6); + CTX_FRAME(7); + CTX_FRAME(8); + CTX_FRAME(9); + CTX_FRAME(10); + CTX_FRAME(11); + CTX_FRAME(12); + CTX_FRAME(13); + CTX_FRAME(14); + CTX_FRAME(15); + default: + impossible("", ""); + } + if (frame > (void *)ctx) + break; + context_ok.path[i] = addr; + } +#undef CTX_FRAME +} + + +/* initialise context and bind it to the current thread + + This function should be called at the beginning of reiser4 part of + syscall. 
+*/ +reiser4_internal int +init_context(reiser4_context * context /* pointer to the reiser4 context + * being initalised */ , + struct super_block *super /* super block we are going to + * work with */) +{ + assert("nikita-2662", !in_interrupt() && !in_irq()); + assert("nikita-3356", context != NULL); + assert("nikita-3357", super != NULL); + assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super)); + + xmemset(context, 0, sizeof *context); + + if (is_in_reiser4_context()) { + reiser4_context *parent; + + parent = (reiser4_context *) current->fs_context; + /* NOTE-NIKITA this is dubious */ + if (parent->super == super) { + context->parent = parent; +#if (REISER4_DEBUG) + ++context->parent->nr_children; +#endif + return 0; + } + } + + context->super = super; + context->magic = context_magic; + context->outer = current->fs_context; + current->fs_context = (struct fs_activation *) context; + + init_lock_stack(&context->stack); + + txn_begin(context); + + context->parent = context; + tap_list_init(&context->taps); +#if REISER4_DEBUG + context_list_clean(context); /* to satisfy assertion */ + spin_lock(&active_contexts_lock); + context_list_check(&active_contexts); + context_list_push_front(&active_contexts, context); + /*check_contexts();*/ + spin_unlock(&active_contexts_lock); + context->task = current; +#endif + context->flush_started = INITIAL_JIFFIES; + + grab_space_enable(); + log_entry(super, ":in"); + return 0; +} + +reiser4_internal reiser4_context * +get_context_by_lock_stack(lock_stack * owner) +{ + return container_of(owner, reiser4_context, stack); +} + +reiser4_internal int +is_in_reiser4_context(void) +{ + return + current->fs_context != NULL && + ((unsigned long) current->fs_context->owner) == context_magic; +} + +static void +balance_dirty_pages_at(reiser4_context * context) +{ + reiser4_super_info_data * sbinfo = get_super_private(context->super); + + if (context->nr_marked_dirty != 0 && sbinfo->fake && + !(current->flags & PF_MEMALLOC) && 
!current_is_pdflush()) { + balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping); + } +} + +reiser4_internal int reiser4_exit_context(reiser4_context * context) +{ + int result = 0; + + assert("nikita-3021", schedulable()); + + if (context == context->parent) { + if (!context->nobalance) + balance_dirty_pages_at(context); + result = txn_end(context); + } + done_context(context); + return (result > 0) ? 0 : result; +} + +/* release resources associated with context. + + This function should be called at the end of "session" with reiser4, + typically just before leaving reiser4 driver back to VFS. + + This is good place to put some degugging consistency checks, like that + thread released all locks and closed transcrash etc. + +*/ +reiser4_internal void +done_context(reiser4_context * context /* context being released */) +{ + reiser4_context *parent; + assert("nikita-860", context != NULL); + + parent = context->parent; + assert("nikita-2174", parent != NULL); + assert("nikita-2093", parent == parent->parent); + assert("nikita-859", parent->magic == context_magic); + assert("vs-646", (reiser4_context *) current->fs_context == parent); + assert("zam-686", !in_interrupt() && !in_irq()); + /* add more checks here */ + + if (parent == context) { + assert("jmacd-673", parent->trans == NULL); + assert("jmacd-1002", lock_stack_isclean(&parent->stack)); + assert("nikita-1936", no_counters_are_held()); + assert("nikita-3403", !delayed_inode_updates(context->dirty)); + assert("nikita-2626", tap_list_empty(taps_list())); + assert("zam-1004", get_super_private(context->super)->delete_sema_owner != current); + + log_entry(context->super, ":ex"); + + if (context->grabbed_blocks != 0) + all_grabbed2free(); + + /* + * synchronize against longterm_unlock_znode(): + * wake_up_requestor() wakes up requestors without holding + * zlock (otherwise they will immediately bump into that lock + * after wake up on another CPU). 
To work around (rare) + * situation where requestor has been woken up asynchronously + * and managed to run until completion (and destroy its + * context and lock stack) before wake_up_requestor() called + * wake_up() on it, wake_up_requestor() synchronize on lock + * stack spin lock. It has actually been observed that spin + * lock _was_ locked at this point, because + * wake_up_requestor() took interrupt. + */ + spin_lock_stack(&context->stack); + spin_unlock_stack(&context->stack); + +#if REISER4_DEBUG + /* remove from active contexts */ + spin_lock(&active_contexts_lock); + /*check_contexts();*/ + context_list_remove(parent); + spin_unlock(&active_contexts_lock); + + assert("zam-684", context->nr_children == 0); +#endif + current->fs_context = context->outer; + } else { +#if REISER4_DEBUG + parent->nr_children--; + assert("zam-685", parent->nr_children >= 0); +#endif + } +} + +/* Audited by: umka (2002.06.16) */ +reiser4_internal int +init_context_mgr(void) +{ +#if REISER4_DEBUG + spin_lock_init(&active_contexts_lock); + context_list_init(&active_contexts); +#endif + return 0; +} + +#if REISER4_DEBUG_OUTPUT +reiser4_internal void +print_context(const char *prefix, reiser4_context * context) +{ + if (context == NULL) { + printk("%s: null context\n", prefix); + return; + } +#if REISER4_TRACE + printk("%s: trace_flags: %x\n", prefix, context->trace_flags); +#endif +#if REISER4_DEBUG + print_lock_counters("\tlocks", &context->locks); + printk("pid: %i, comm: %s\n", context->task->pid, context->task->comm); +#endif + print_lock_stack("\tlock stack", &context->stack); + info_atom("\tatom", context->trans_in_ctx.atom); +} + +#if REISER4_DEBUG +void +print_contexts(void) +{ + reiser4_context *context; + + spin_lock(&active_contexts_lock); + + for_all_type_safe_list(context, &active_contexts, context) { + print_context("context", context); + } + + spin_unlock(&active_contexts_lock); +} +#endif +#endif + +/* Make Linus happy. 
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/context.h linux-2.6.4-ck1/fs/reiser4/context.h
--- linux-2.6.4/fs/reiser4/context.h 1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/context.h 2004-03-11 22:45:15.191525445 +1100
@@ -0,0 +1,276 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Reiser4 context */
+
+#if !defined( __REISER4_CONTEXT_H__ )
+#define __REISER4_CONTEXT_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "spin_macros.h"
+#include "dformat.h"
+#include "type_safe_list.h"
+#include "jnode.h"
+#include "tap.h"
+#include "lock.h"
+
+#include <linux/types.h>	/* for __u?? */
+#include <linux/fs.h>		/* for struct super_block */
+#include <linux/spinlock.h>	/* NOTE(review): header name is presumed, confirm against upstream */
+#include <linux/sched.h>	/* for struct task_struct */
+
+/* type-safe lists of contexts and of flushers (both declared through ON_DEBUG) */
+ON_DEBUG(TYPE_SAFE_LIST_DECLARE(context);)
+ON_DEBUG(TYPE_SAFE_LIST_DECLARE(flushers);)
+
+#if REISER4_DEBUG
+
+#define TRACKED_DELAYED_UPDATE (10)	/* number of entries in dirty_inode_info */
+
+typedef struct {
+	ino_t ino;
+	int delayed;
+	void *stack[4];
+} dirty_inode_info[TRACKED_DELAYED_UPDATE];
+
+extern void mark_inode_update(struct inode *object, int immediate);
+extern int delayed_inode_updates(dirty_inode_info info);
+
+#else
+
+typedef struct {} dirty_inode_info;	/* empty placeholder when REISER4_DEBUG is off */
+
+#define mark_inode_update(object, immediate) noop
+#define delayed_inode_updates(info) noop
+
+#endif
+
+
+/* global context used during system call. Variable of this type is
+   allocated on the stack at the beginning of the reiser4 part of the
+   system call and pointer to it is stored in the
+   current->fs_context. This allows us to avoid passing pointer to
+   current transaction and current lockstack (both in one-to-one mapping
+   with threads) all over the call chain.
+
+   It's kind of like those global variables the prof used to tell you
+   not to use in CS1, except thread specific.;-) Nikita, this was a
+   good idea.
+
+   In some situations it is desirable to have ability to enter reiser4_context
+   twice for the same thread (nested contexts). For example, there are some
+   functions that can be called either directly from VFS/VM or from already
+   active reiser4 context (->writepage, for example).
+
+   In such situations "child" context acts like dummy: all activity is
+   actually performed in the top level context, and get_current_context()
+   always returns top level context. Of course, init_context()/done_context()
+   have to be properly nested any way.
+*/
+struct reiser4_context {
+	/* magic constant. For identification of reiser4 contexts. */
+	__u32 magic;
+
+	/* current lock stack. See lock.[ch]. This is where list of all
+	   locks taken by current thread is kept. This is also used in
+	   deadlock detection. */
+	lock_stack stack;
+
+	/* current transcrash. */
+	txn_handle *trans;
+	txn_handle trans_in_ctx;
+
+	/* super block we are working with. To get the current tree
+	   use &get_super_private (reiser4_get_current_sb ())->tree. */
+	struct super_block *super;
+
+	/* parent fs activation */
+	struct fs_activation *outer;
+
+	/* per-thread grabbed (for further allocation) blocks counter */
+	reiser4_block_nr grabbed_blocks;
+
+	/* top level context: init_context() points it at the context itself unless the context is nested */
+	reiser4_context *parent;
+	tap_list_head taps;
+
+	/* grabbing space is enabled. One-bit flags are unsigned so that a set flag reads back as 1, not -1 */
+	unsigned int grab_enabled :1;
+	/* should be set when we are write dirty nodes to disk in jnode_flush or
+	 * reiser4_write_logs() */
+	unsigned int writeout_mode :1;
+	unsigned int entd :1;
+	unsigned int nobalance :1;	/* when set, reiser4_exit_context() skips balance_dirty_pages_at() */
+
+	/* count non-trivial jnode_set_dirty() calls */
+	unsigned long nr_marked_dirty;
+	unsigned long flush_started;	/* set to INITIAL_JIFFIES by init_context() */
+	unsigned long io_started;
+
+#if REISER4_DEBUG
+	/* A link of all active contexts. */
+	context_list_link contexts_link;
+	lock_counters_info locks;
+	int nr_children;	/* number of child contexts */
+	struct task_struct *task;	/* so we can easily find owner of the stack */
+
+	reiser4_block_nr grabbed_initially;
+	backtrace_path grabbed_at;
+	flushers_list_link flushers_link;
+	err_site err;
+	dirty_inode_info dirty;
+#endif
+#if REISER4_TRACE
+	/* per-thread tracing flags. Use reiser4_trace_flags enum to set
+	   bits in it. */
+	__u32 trace_flags;
+#endif
+#if REISER4_DEBUG_NODE
+	int disable_node_check;
+#endif
+};
+
+#if REISER4_DEBUG
+TYPE_SAFE_LIST_DEFINE(context, reiser4_context, contexts_link);
+TYPE_SAFE_LIST_DEFINE(flushers, reiser4_context, flushers_link);
+#endif
+
+extern reiser4_context *get_context_by_lock_stack(lock_stack *);
+
+/* Debugging helps. */
+extern int init_context_mgr(void);
+#if REISER4_DEBUG_OUTPUT
+extern void print_context(const char *prefix, reiser4_context * ctx);
+#else
+#define print_context(p,c) noop
+#endif
+
+#if REISER4_DEBUG_OUTPUT && REISER4_DEBUG
+extern void print_contexts(void);
+#else
+#define print_contexts() noop
+#endif
+
+#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
+#define current_blocksize reiser4_get_current_sb()->s_blocksize
+#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
+
+extern int init_context(reiser4_context * context, struct super_block *super);
+extern void done_context(reiser4_context * context);
+
+/* magic constant we store in reiser4_context allocated at the stack. Used to
+   catch accesses to staled or uninitialized contexts.
*/ +#define context_magic ((__u32) 0x4b1b5d0b) + +extern int is_in_reiser4_context(void); + +/* return context associated with given thread */ + +void get_context_ok(reiser4_context *); + +static inline reiser4_context * +get_context(const struct task_struct *tsk) +{ + assert("", ((reiser4_context *) tsk->fs_context)->magic == context_magic); + /* get_context_ok((reiser4_context *)(tsk->fs_context)); */ + return (reiser4_context *) tsk->fs_context; +} + + +static inline reiser4_context * +get_current_context_check(void) +{ + if (is_in_reiser4_context()) + return get_context(current); + else + return NULL; +} + +static inline reiser4_context * get_current_context(void);/* __attribute__((const));*/ + +/* return context associated with current thread */ +static inline reiser4_context * +get_current_context(void) +{ + return get_context(current); +} + +static inline int is_writeout_mode(void) +{ + return get_current_context()->writeout_mode; +} + +static inline void writeout_mode_enable(void) +{ + assert("zam-941", !get_current_context()->writeout_mode); + get_current_context()->writeout_mode = 1; +} + +static inline void writeout_mode_disable(void) +{ + assert("zam-942", get_current_context()->writeout_mode); + get_current_context()->writeout_mode = 0; +} + +static inline void grab_space_enable(void) +{ + get_current_context()->grab_enabled = 1; +} + +static inline void grab_space_disable(void) +{ + get_current_context()->grab_enabled = 0; +} + +static inline void grab_space_set_enabled (int enabled) +{ + get_current_context()->grab_enabled = enabled; +} + +static inline int is_grab_enabled(reiser4_context *ctx) +{ + return ctx->grab_enabled; +} + +#define REISER4_TRACE_CONTEXT (0) + +#if REISER4_TRACE_TREE && REISER4_TRACE_CONTEXT +extern int write_in_trace(const char *func, const char *mes); + +#define log_entry(super, str) \ +({ \ + if (super != NULL && get_super_private(super) != NULL && \ + get_super_private(super)->trace_file.buf != NULL) \ + 
write_in_trace(__FUNCTION__, str); \ +}) + +#else +#define log_entry(super, str) noop +#endif + +/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or + * flush would be performed when it is closed. This is necessary when handle + * has to be closed under some coarse semaphore, like i_sem of + * directory. Commit will be performed by ktxnmgrd. */ +static inline void context_set_commit_async(reiser4_context * context) +{ + context = context->parent; + context->nobalance = 1; + context->trans->flags |= TXNH_DONT_COMMIT; +} + +extern int reiser4_exit_context(reiser4_context * context); + +/* __REISER4_CONTEXT_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/coord.c linux-2.6.4-ck1/fs/reiser4/coord.c --- linux-2.6.4/fs/reiser4/coord.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/coord.c 2004-03-11 22:45:15.193525134 +1100 @@ -0,0 +1,983 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "forward.h" +#include "debug.h" +#include "dformat.h" +#include "tree.h" +#include "plugin/item/item.h" +#include "znode.h" +#include "coord.h" + +/* Internal constructor. */ +static inline void +coord_init_values(coord_t *coord, const znode *node, pos_in_node_t item_pos, + pos_in_node_t unit_pos, between_enum between) +{ + coord->node = (znode *) node; + coord_set_item_pos(coord, item_pos); + coord->unit_pos = unit_pos; + coord->between = between; + + /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */ +} + +/* after shifting of node content, coord previously set properly may become + invalid, try to "normalize" it. 
*/
+reiser4_internal void
+coord_normalize(coord_t *coord)
+{
+	znode *node;
+
+	node = coord->node;
+	assert("vs-683", node);
+
+	coord_clear_iplug(coord);
+
+	if (node_is_empty(node)) {
+		coord_init_first_unit(coord, node);
+	} else if ((coord->between == AFTER_ITEM) || (coord->between == AFTER_UNIT)) {
+		return;
+	} else if (coord->item_pos == coord_num_items(coord) && coord->between == BEFORE_ITEM) {
+		coord_dec_item_pos(coord);
+		coord->between = AFTER_ITEM;
+	} else if (coord->between == BEFORE_UNIT && coord->item_pos == coord_num_items(coord) && coord->unit_pos == 0) {
+		coord_dec_item_pos(coord);	/* one past the last item: tested before coord_num_units(), which asserts an existing item */
+		coord->unit_pos = 0;
+		coord->between = AFTER_ITEM;
+	} else if (coord->between == BEFORE_UNIT && coord->item_pos < coord_num_items(coord) && coord->unit_pos == coord_num_units(coord)) {
+		coord->unit_pos--;	/* one past the last unit of an existing item: step back onto it */
+		coord->between = AFTER_UNIT;
+	}
+}
+
+/* Copy a coordinate. */
+reiser4_internal void
+coord_dup(coord_t * coord, const coord_t * old_coord)
+{
+	assert("jmacd-9800", coord_check(old_coord));
+	coord_dup_nocheck(coord, old_coord);
+}
+
+/* Copy a coordinate without check. Useful when old_coord->node is not
+   loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
+reiser4_internal void
+coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
+{
+	coord->node = old_coord->node;
+	coord_set_item_pos(coord, old_coord->item_pos);
+	coord->unit_pos = old_coord->unit_pos;
+	coord->between = old_coord->between;
+	coord->iplugid = old_coord->iplugid;
+}
+
+/* Initialize an invalid coordinate. */
+reiser4_internal void
+coord_init_invalid(coord_t * coord, const znode * node)
+{
+	coord_init_values(coord, node, 0, 0, INVALID_COORD);
+}
+
+reiser4_internal void
+coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
+{
+	coord_init_values(coord, node, 0, 0, AT_UNIT);	/* AT_UNIT at item 0, unit 0; the node is not inspected */
+}
+
+/* Initialize a coordinate to point at the first unit of the first item. If the node is
+   empty, it is positioned at the EMPTY_NODE.
*/ +reiser4_internal void +coord_init_first_unit(coord_t * coord, const znode * node) +{ + int is_empty = node_is_empty(node); + + coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT)); + + assert("jmacd-9801", coord_check(coord)); +} + +/* Initialize a coordinate to point at the last unit of the last item. If the node is + empty, it is positioned at the EMPTY_NODE. */ +reiser4_internal void +coord_init_last_unit(coord_t * coord, const znode * node) +{ + int is_empty = node_is_empty(node); + + coord_init_values(coord, node, (is_empty ? 0 : node_num_items(node) - 1), 0, (is_empty ? EMPTY_NODE : AT_UNIT)); + if (!is_empty) + coord->unit_pos = coord_last_unit_pos(coord); + assert("jmacd-9802", coord_check(coord)); +} + +/* Initialize a coordinate to before the first item. If the node is empty, it is + positioned at the EMPTY_NODE. */ +reiser4_internal void +coord_init_before_first_item(coord_t * coord, const znode * node) +{ + int is_empty = node_is_empty(node); + + coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : BEFORE_UNIT)); + + assert("jmacd-9803", coord_check(coord)); +} + +/* Initialize a coordinate to after the last item. If the node is empty, it is positioned + at the EMPTY_NODE. */ +reiser4_internal void +coord_init_after_last_item(coord_t * coord, const znode * node) +{ + int is_empty = node_is_empty(node); + + coord_init_values(coord, node, + (is_empty ? 0 : node_num_items(node) - 1), 0, (is_empty ? EMPTY_NODE : AFTER_ITEM)); + + assert("jmacd-9804", coord_check(coord)); +} + +/* Initialize a coordinate to after last unit in the item. Coord must be set + already to existing item */ +reiser4_internal void +coord_init_after_item_end(coord_t * coord) +{ + coord->between = AFTER_UNIT; + coord->unit_pos = coord_last_unit_pos(coord); +} + +/* Initialize a coordinate to before the item. 
Coord must already be set to an existing item */
+reiser4_internal void
+coord_init_before_item(coord_t * coord)
+{
+	coord->unit_pos = 0;
+	coord->between = BEFORE_ITEM;
+}
+
+/* Initialize a coordinate to after the item. Coord must already be set to an existing item */
+reiser4_internal void
+coord_init_after_item(coord_t * coord)
+{
+	coord->unit_pos = 0;
+	coord->between = AFTER_ITEM;
+}
+
+/* Fill a coordinate with zero bytes. Used in places where init_coord was used and
+   it was not clear how actually */
+reiser4_internal void
+coord_init_zero(coord_t * coord)
+{
+	xmemset(coord, 0, sizeof (*coord));
+}
+
+/* Return the number of units in the present item, as reported by the ->b.nr_units method of its item plugin. Asserts coord_is_existing_item(). */
+reiser4_internal unsigned
+coord_num_units(const coord_t * coord)
+{
+	assert("jmacd-9806", coord_is_existing_item(coord));
+
+	return item_plugin_by_coord(coord)->b.nr_units(coord);
+}
+
+/* Returns true if the coord is marked INVALID_COORD, as coord_init_invalid() leaves it. */
+/* Audited by: green(2002.06.15) */
+reiser4_internal int
+coord_is_invalid(const coord_t * coord)
+{
+	return coord->between == INVALID_COORD;
+}
+
+/* Returns true if the coordinate is positioned at an existing item, not before or after
+   an item. It may be placed at, before, or after any unit within the item, whether
+   existing or not. */
+reiser4_internal int
+coord_is_existing_item(const coord_t * coord)
+{
+	switch (coord->between) {
+	case EMPTY_NODE:
+	case BEFORE_ITEM:
+	case AFTER_ITEM:
+	case INVALID_COORD:
+		return 0;
+
+	case BEFORE_UNIT:
+	case AT_UNIT:
+	case AFTER_UNIT:
+		return coord->item_pos < coord_num_items(coord);	/* only the item index is checked, not unit_pos */
+	}
+
+	IF_TRACE(TRACE_COORDS, print_coord("unreachable", coord, 0));	/* reached only if ->between is none of the values above */
+	impossible("jmacd-9900", "unreachable coord: %p", coord);
+	return 0;
+}
+
+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
+   unit.
*/ +/* Audited by: green(2002.06.15) */ +reiser4_internal int +coord_is_existing_unit(const coord_t * coord) +{ + switch (coord->between) { + case EMPTY_NODE: + case BEFORE_UNIT: + case AFTER_UNIT: + case BEFORE_ITEM: + case AFTER_ITEM: + case INVALID_COORD: + return 0; + + case AT_UNIT: + return (coord->item_pos < coord_num_items(coord) && coord->unit_pos < coord_num_units(coord)); + } + + impossible("jmacd-9902", "unreachable"); + return 0; +} + +/* Returns true if the coordinate is positioned at the first unit of the first item. Not + true for empty nodes nor coordinates positioned before the first item. */ +/* Audited by: green(2002.06.15) */ +reiser4_internal int +coord_is_leftmost_unit(const coord_t * coord) +{ + return (coord->between == AT_UNIT && coord->item_pos == 0 && coord->unit_pos == 0); +} + +#if REISER4_DEBUG +/* For assertions only, checks for a valid coordinate. */ +int +coord_check(const coord_t * coord) +{ + if (coord->node == NULL) { + return 0; + } + if (znode_above_root(coord->node)) + return 1; + + switch (coord->between) { + default: + case INVALID_COORD: + return 0; + case EMPTY_NODE: + if (!node_is_empty(coord->node)) { + return 0; + } + return coord->item_pos == 0 && coord->unit_pos == 0; + + case BEFORE_UNIT: + case AFTER_UNIT: + if (node_is_empty(coord->node) && (coord->item_pos == 0) && (coord->unit_pos == 0)) + return 1; + case AT_UNIT: + break; + case AFTER_ITEM: + case BEFORE_ITEM: + /* before/after item should not set unit_pos. */ + if (coord->unit_pos != 0) { + return 0; + } + break; + } + + if (coord->item_pos >= node_num_items(coord->node)) { + return 0; + } + + /* FIXME-VS: we are going to check unit_pos. 
This makes no sense when + between is set either AFTER_ITEM or BEFORE_ITEM */ + if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM) + return 1; + + if (coord_is_iplug_set(coord) && + coord->unit_pos > item_plugin_by_coord(coord)->b.nr_units(coord) - 1) { + return 0; + } + return 1; +} +#endif + +/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev. + Returns 1 if the new position is does not exist. */ +static int +coord_adjust_items(coord_t * coord, unsigned items, int is_next) +{ + /* If the node is invalid, leave it. */ + if (coord->between == INVALID_COORD) { + return 1; + } + + /* If the node is empty, set it appropriately. */ + if (items == 0) { + coord->between = EMPTY_NODE; + coord_set_item_pos(coord, 0); + coord->unit_pos = 0; + return 1; + } + + /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */ + if (coord->between == EMPTY_NODE) { + coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM); + coord_set_item_pos(coord, 0); + coord->unit_pos = 0; + return 0; + } + + /* If the item_pos is out-of-range, set it appropriatly. */ + if (coord->item_pos >= items) { + coord->between = AFTER_ITEM; + coord_set_item_pos(coord, items - 1); + coord->unit_pos = 0; + /* If is_next, return 1 (can't go any further). */ + return is_next; + } + + return 0; +} + +/* Advances the coordinate by one unit to the right. If empty, no change. If + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an + existing unit. */ +reiser4_internal int +coord_next_unit(coord_t * coord) +{ + unsigned items = coord_num_items(coord); + + if (coord_adjust_items(coord, items, 1) == 1) { + return 1; + } + + switch (coord->between) { + case BEFORE_UNIT: + /* Now it is positioned at the same unit. */ + coord->between = AT_UNIT; + return 0; + + case AFTER_UNIT: + case AT_UNIT: + /* If it was at or after a unit and there are more units in this item, + advance to the next one. 
*/ + if (coord->unit_pos < coord_last_unit_pos(coord)) { + coord->unit_pos += 1; + coord->between = AT_UNIT; + return 0; + } + + /* Otherwise, it is crossing an item boundary and treated as if it was + after the current item. */ + coord->between = AFTER_ITEM; + coord->unit_pos = 0; + /* FALLTHROUGH */ + + case AFTER_ITEM: + /* Check for end-of-node. */ + if (coord->item_pos == items - 1) { + return 1; + } + + coord_inc_item_pos(coord); + coord->unit_pos = 0; + coord->between = AT_UNIT; + return 0; + + case BEFORE_ITEM: + /* The adjust_items checks ensure that we are valid here. */ + coord->unit_pos = 0; + coord->between = AT_UNIT; + return 0; + + case INVALID_COORD: + case EMPTY_NODE: + /* Handled in coord_adjust_items(). */ + break; + } + + impossible("jmacd-9902", "unreachable"); + return 0; +} + +/* Advances the coordinate by one item to the right. If empty, no change. If + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is + an existing item. */ +reiser4_internal int +coord_next_item(coord_t * coord) +{ + unsigned items = coord_num_items(coord); + + if (coord_adjust_items(coord, items, 1) == 1) { + return 1; + } + + switch (coord->between) { + case AFTER_UNIT: + case AT_UNIT: + case BEFORE_UNIT: + case AFTER_ITEM: + /* Check for end-of-node. */ + if (coord->item_pos == items - 1) { + coord->between = AFTER_ITEM; + coord->unit_pos = 0; + coord_clear_iplug(coord); + return 1; + } + + /* Anywhere in an item, go to the next one. */ + coord->between = AT_UNIT; + coord_inc_item_pos(coord); + coord->unit_pos = 0; + return 0; + + case BEFORE_ITEM: + /* The out-of-range check ensures that we are valid here. */ + coord->unit_pos = 0; + coord->between = AT_UNIT; + return 0; + case INVALID_COORD: + case EMPTY_NODE: + /* Handled in coord_adjust_items(). */ + break; + } + + impossible("jmacd-9903", "unreachable"); + return 0; +} + +/* Advances the coordinate by one unit to the left. If empty, no change. 
If
+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
+   is an existing unit, 1 if coord_adjust_items() reported no position or there is
+   nothing to the left of the first item. */
+reiser4_internal int
+coord_prev_unit(coord_t * coord)
+{
+	unsigned items = coord_num_items(coord);
+
+	if (coord_adjust_items(coord, items, 0) == 1) {
+		return 1;
+	}
+
+	switch (coord->between) {
+	case AT_UNIT:
+	case BEFORE_UNIT:
+		if (coord->unit_pos > 0) {	/* previous unit is in the same item */
+			coord->unit_pos -= 1;
+			coord->between = AT_UNIT;
+			return 0;
+		}
+
+		if (coord->item_pos == 0) {	/* first unit of first item: nothing to the left */
+			coord->between = BEFORE_ITEM;
+			return 1;
+		}
+
+		/* item_pos is changed first; only then the last unit position is asked for */
+		coord_dec_item_pos(coord);
+		coord->unit_pos = coord_last_unit_pos(coord);
+		coord->between = AT_UNIT;
+		return 0;
+
+	case AFTER_UNIT:
+		/* What if unit_pos is out-of-range? */
+		assert("jmacd-5442", coord->unit_pos <= coord_last_unit_pos(coord));
+		coord->between = AT_UNIT;	/* unit_pos is left as it was */
+		return 0;
+
+	case BEFORE_ITEM:
+		if (coord->item_pos == 0) {
+			return 1;
+		}
+
+		coord_dec_item_pos(coord);
+		/* FALLTHROUGH */
+
+	case AFTER_ITEM:
+		coord->between = AT_UNIT;
+		coord->unit_pos = coord_last_unit_pos(coord);
+		return 0;
+
+	case INVALID_COORD:
+	case EMPTY_NODE:
+		break;
+	}
+
+	impossible("jmacd-9904", "unreachable");
+	return 0;
+}
+
+/* Advances the coordinate by one item to the left. If empty, no change. If
+   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
+   is an existing item.
*/ +reiser4_internal int +coord_prev_item(coord_t * coord) +{ + unsigned items = coord_num_items(coord); + + if (coord_adjust_items(coord, items, 0) == 1) { + return 1; + } + + switch (coord->between) { + case AT_UNIT: + case AFTER_UNIT: + case BEFORE_UNIT: + case BEFORE_ITEM: + + if (coord->item_pos == 0) { + coord->between = BEFORE_ITEM; + coord->unit_pos = 0; + return 1; + } + + coord_dec_item_pos(coord); + coord->unit_pos = 0; + coord->between = AT_UNIT; + return 0; + + case AFTER_ITEM: + coord->between = AT_UNIT; + coord->unit_pos = 0; + return 0; + + case INVALID_COORD: + case EMPTY_NODE: + break; + } + + impossible("jmacd-9905", "unreachable"); + return 0; +} + +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */ +reiser4_internal void +coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir) +{ + assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE); + if (dir == LEFT_SIDE) { + coord_init_first_unit(coord, node); + } else { + coord_init_last_unit(coord, node); + } +} + +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof + argument. */ +/* Audited by: green(2002.06.15) */ +reiser4_internal int +coord_is_after_sideof_unit(coord_t * coord, sideof dir) +{ + assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE); + if (dir == LEFT_SIDE) { + return coord_is_before_leftmost(coord); + } else { + return coord_is_after_rightmost(coord); + } +} + +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. 
*/ +/* Audited by: green(2002.06.15) */ +reiser4_internal int +coord_sideof_unit(coord_t * coord, sideof dir) +{ + assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE); + if (dir == LEFT_SIDE) { + return coord_prev_unit(coord); + } else { + return coord_next_unit(coord); + } +} + +reiser4_internal int +coords_equal(const coord_t * c1, const coord_t * c2) +{ + assert("nikita-2840", c1 != NULL); + assert("nikita-2841", c2 != NULL); + + /* assertion to track changes in coord_t */ + cassert(sizeof(*c1) == sizeof(c1->node) + + sizeof(c1->item_pos) + + sizeof(c1->unit_pos) + + sizeof(c1->iplugid) + sizeof(c1->between) + sizeof(c1->pad) + sizeof(c1->body)); + return + c1->node == c2->node && + c1->item_pos == c2->item_pos && + c1->unit_pos == c2->unit_pos && + c1->between == c2->between; +} + +/* Returns true if two coordinates are consider equal. Coordinates that are between units + or items are considered equal. */ +/* Audited by: green(2002.06.15) */ +reiser4_internal int +coord_eq(const coord_t * c1, const coord_t * c2) +{ + assert("nikita-1807", c1 != NULL); + assert("nikita-1808", c2 != NULL); + + if (coords_equal(c1, c2)) { + return 1; + } + if (c1->node != c2->node) { + return 0; + } + + switch (c1->between) { + case INVALID_COORD: + case EMPTY_NODE: + case AT_UNIT: + return 0; + + case BEFORE_UNIT: + /* c2 must be after the previous unit. */ + return (c1->item_pos == c2->item_pos && c2->between == AFTER_UNIT && c2->unit_pos == c1->unit_pos - 1); + + case AFTER_UNIT: + /* c2 must be before the next unit. */ + return (c1->item_pos == c2->item_pos && c2->between == BEFORE_UNIT && c2->unit_pos == c1->unit_pos + 1); + + case BEFORE_ITEM: + /* c2 must be after the previous item. */ + return (c1->item_pos == c2->item_pos - 1 && c2->between == AFTER_ITEM); + + case AFTER_ITEM: + /* c2 must be before the next item. 
*/ + return (c1->item_pos == c2->item_pos + 1 && c2->between == BEFORE_ITEM); + } + + impossible("jmacd-9906", "unreachable"); + return 0; +} + +/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost + return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */ +/* Audited by: green(2002.06.15) */ +reiser4_internal coord_wrt_node coord_wrt(const coord_t * coord) +{ + if (coord_is_before_leftmost(coord)) { + return COORD_ON_THE_LEFT; + } + + if (coord_is_after_rightmost(coord)) { + return COORD_ON_THE_RIGHT; + } + + return COORD_INSIDE; +} + +/* Returns true if the coordinate is positioned after the last item or after the last unit + of the last item or it is an empty node. */ +/* Audited by: green(2002.06.15) */ +reiser4_internal int +coord_is_after_rightmost(const coord_t * coord) +{ + assert("jmacd-7313", coord_check(coord)); + + switch (coord->between) { + case INVALID_COORD: + case AT_UNIT: + case BEFORE_UNIT: + case BEFORE_ITEM: + return 0; + + case EMPTY_NODE: + return 1; + + case AFTER_ITEM: + return (coord->item_pos == node_num_items(coord->node) - 1); + + case AFTER_UNIT: + return ((coord->item_pos == node_num_items(coord->node) - 1) && + coord->unit_pos == coord_last_unit_pos(coord)); + } + + impossible("jmacd-9908", "unreachable"); + return 0; +} + +/* Returns true if the coordinate is positioned before the first item or it is an empty + node. 
*/ +reiser4_internal int +coord_is_before_leftmost(const coord_t * coord) +{ + /* FIXME-VS: coord_check requires node to be loaded whereas it is not + necessary to check if coord is set before leftmost + assert ("jmacd-7313", coord_check (coord)); */ + switch (coord->between) { + case INVALID_COORD: + case AT_UNIT: + case AFTER_ITEM: + case AFTER_UNIT: + return 0; + + case EMPTY_NODE: + return 1; + + case BEFORE_ITEM: + case BEFORE_UNIT: + return (coord->item_pos == 0) && (coord->unit_pos == 0); + } + + impossible("jmacd-9908", "unreachable"); + return 0; +} + +/* Returns true if the coordinate is positioned after a item, before a item, after the + last unit of an item, before the first unit of an item, or at an empty node. */ +/* Audited by: green(2002.06.15) */ +reiser4_internal int +coord_is_between_items(const coord_t * coord) +{ + assert("jmacd-7313", coord_check(coord)); + + switch (coord->between) { + case INVALID_COORD: + case AT_UNIT: + return 0; + + case AFTER_ITEM: + case BEFORE_ITEM: + case EMPTY_NODE: + return 1; + + case BEFORE_UNIT: + return coord->unit_pos == 0; + + case AFTER_UNIT: + return coord->unit_pos == coord_last_unit_pos(coord); + } + + impossible("jmacd-9908", "unreachable"); + return 0; +} + +/* Returns true if the coordinates are positioned at adjacent units, regardless of + before-after or item boundaries. 
*/ +reiser4_internal int +coord_are_neighbors(coord_t * c1, coord_t * c2) +{ + coord_t *left; + coord_t *right; + + assert("nikita-1241", c1 != NULL); + assert("nikita-1242", c2 != NULL); + assert("nikita-1243", c1->node == c2->node); + assert("nikita-1244", coord_is_existing_unit(c1)); + assert("nikita-1245", coord_is_existing_unit(c2)); + + left = right = 0; + switch (coord_compare(c1, c2)) { + case COORD_CMP_ON_LEFT: + left = c1; + right = c2; + break; + case COORD_CMP_ON_RIGHT: + left = c2; + right = c1; + break; + case COORD_CMP_SAME: + return 0; + default: + wrong_return_value("nikita-1246", "compare_coords()"); + } + assert("vs-731", left && right); + if (left->item_pos == right->item_pos) { + return left->unit_pos + 1 == right->unit_pos; + } else if (left->item_pos + 1 == right->item_pos) { + return (left->unit_pos == coord_last_unit_pos(left)) && (right->unit_pos == 0); + } else { + return 0; + } +} + +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT, + COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */ +/* Audited by: green(2002.06.15) */ +reiser4_internal coord_cmp coord_compare(coord_t * c1, coord_t * c2) +{ + assert("vs-209", c1->node == c2->node); + assert("vs-194", coord_is_existing_unit(c1) + && coord_is_existing_unit(c2)); + + if (c1->item_pos > c2->item_pos) + return COORD_CMP_ON_RIGHT; + if (c1->item_pos < c2->item_pos) + return COORD_CMP_ON_LEFT; + if (c1->unit_pos > c2->unit_pos) + return COORD_CMP_ON_RIGHT; + if (c1->unit_pos < c2->unit_pos) + return COORD_CMP_ON_LEFT; + return COORD_CMP_SAME; +} + +/* If the coordinate is between items, shifts it to the right. Returns 0 on success and + non-zero if there is no position to the right. 
*/ +reiser4_internal int +coord_set_to_right(coord_t * coord) +{ + unsigned items = coord_num_items(coord); + + if (coord_adjust_items(coord, items, 1) == 1) { + return 1; + } + + switch (coord->between) { + case AT_UNIT: + return 0; + + case BEFORE_ITEM: + case BEFORE_UNIT: + coord->between = AT_UNIT; + return 0; + + case AFTER_UNIT: + if (coord->unit_pos < coord_last_unit_pos(coord)) { + coord->unit_pos += 1; + coord->between = AT_UNIT; + return 0; + } else { + + coord->unit_pos = 0; + + if (coord->item_pos == items - 1) { + coord->between = AFTER_ITEM; + return 1; + } + + coord_inc_item_pos(coord); + coord->between = AT_UNIT; + return 0; + } + + case AFTER_ITEM: + if (coord->item_pos == items - 1) { + return 1; + } + + coord_inc_item_pos(coord); + coord->unit_pos = 0; + coord->between = AT_UNIT; + return 0; + + case INVALID_COORD: + case EMPTY_NODE: + break; + } + + impossible("jmacd-9920", "unreachable"); + return 0; +} + +/* If the coordinate is between items, shifts it to the left. Returns 0 on success and + non-zero if there is no position to the left. 
*/ +reiser4_internal int +coord_set_to_left(coord_t * coord) +{ + unsigned items = coord_num_items(coord); + + if (coord_adjust_items(coord, items, 0) == 1) { + return 1; + } + + switch (coord->between) { + case AT_UNIT: + return 0; + + case AFTER_UNIT: + coord->between = AT_UNIT; + return 0; + + case AFTER_ITEM: + coord->between = AT_UNIT; + coord->unit_pos = coord_last_unit_pos(coord); + return 0; + + case BEFORE_UNIT: + if (coord->unit_pos > 0) { + coord->unit_pos -= 1; + coord->between = AT_UNIT; + return 0; + } else { + + if (coord->item_pos == 0) { + coord->between = BEFORE_ITEM; + return 1; + } + + coord->unit_pos = coord_last_unit_pos(coord); + coord_dec_item_pos(coord); + coord->between = AT_UNIT; + return 0; + } + + case BEFORE_ITEM: + if (coord->item_pos == 0) { + return 1; + } + + coord_dec_item_pos(coord); + coord->unit_pos = coord_last_unit_pos(coord); + coord->between = AT_UNIT; + return 0; + + case INVALID_COORD: + case EMPTY_NODE: + break; + } + + impossible("jmacd-9920", "unreachable"); + return 0; +} + +reiser4_internal const char * +coord_tween_tostring(between_enum n) +{ + switch (n) { + case BEFORE_UNIT: + return "before unit"; + case BEFORE_ITEM: + return "before item"; + case AT_UNIT: + return "at unit"; + case AFTER_UNIT: + return "after unit"; + case AFTER_ITEM: + return "after item"; + case EMPTY_NODE: + return "empty node"; + case INVALID_COORD: + return "invalid"; + default:{ + static char buf[30]; + + sprintf(buf, "unknown: %i", n); + return buf; + } + } +} + +reiser4_internal void +print_coord(const char *mes, const coord_t * coord, int node) +{ + if (coord == NULL) { + printk("%s: null\n", mes); + return; + } + printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n", + mes, coord->item_pos, coord->unit_pos, coord_tween_tostring(coord->between), coord->iplugid); + if (node) + print_znode("\tnode", coord->node); +} + +reiser4_internal int +item_utmost_child_real_block(const coord_t * coord, sideof side, reiser4_block_nr * blk) +{ 
+ return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord, side, blk); +} + +reiser4_internal int +item_utmost_child(const coord_t * coord, sideof side, jnode ** child) +{ + return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child); +} + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/coord.h linux-2.6.4-ck1/fs/reiser4/coord.h --- linux-2.6.4/fs/reiser4/coord.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/coord.h 2004-03-11 22:45:15.194524979 +1100 @@ -0,0 +1,328 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Coords */ + +#if !defined( __REISER4_COORD_H__ ) +#define __REISER4_COORD_H__ + +#include "forward.h" +#include "debug.h" +#include "dformat.h" + +/* insertions happen between coords in the tree, so we need some means + of specifying the sense of betweenness. */ +typedef enum { + BEFORE_UNIT, /* Note: we/init_coord depends on this value being zero. */ + AT_UNIT, + AFTER_UNIT, + BEFORE_ITEM, + AFTER_ITEM, + INVALID_COORD, + EMPTY_NODE, +} between_enum; + +/* location of coord w.r.t. its node */ +typedef enum { + COORD_ON_THE_LEFT = -1, + COORD_ON_THE_RIGHT = +1, + COORD_INSIDE = 0 +} coord_wrt_node; + +typedef enum { + COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1 +} coord_cmp; + +struct coord { + /* node in a tree */ + /* 0 */ znode *node; + + /* position of item within node */ + /* 4 */ pos_in_node_t item_pos; + /* position of unit within item */ + /* 6 */ pos_in_node_t unit_pos; + /* optimization: plugin of item is stored in coord_t. Until this was + implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid + is invalidated (set to 0xff) on each modification of ->item_pos, + and all such modifications are funneled through coord_*_item_pos() + functions below. 
+ */ + /* 8 */ char iplugid; + /* position of coord w.r.t. to neighboring items and/or units. + Values are taken from &between_enum above. + */ + /* 9 */ char between; + /* padding. It will be added by the compiler anyway to conform to the + * C language alignment requirements. We keep it here to be on the + * safe side and to have a clear picture of the memory layout of this + * structure. */ + /* 10 */ __u16 pad; + /* 12 */ void *body; +}; + +#define INVALID_PLUGID ((char)((1 << 8) - 1)) + +static inline void +coord_clear_iplug(coord_t * coord) +{ + assert("nikita-2835", coord != NULL); + coord->iplugid = INVALID_PLUGID; + coord->body = NULL; +} + +static inline int +coord_is_iplug_set(const coord_t * coord) +{ + assert("nikita-2836", coord != NULL); + return coord->iplugid != INVALID_PLUGID; +} + +static inline void +coord_set_item_pos(coord_t * coord, pos_in_node_t pos) +{ + assert("nikita-2478", coord != NULL); + coord->item_pos = pos; + coord_clear_iplug(coord); +} + +static inline void +coord_dec_item_pos(coord_t * coord) +{ + assert("nikita-2480", coord != NULL); + --coord->item_pos; + coord_clear_iplug(coord); +} + +static inline void +coord_inc_item_pos(coord_t * coord) +{ + assert("nikita-2481", coord != NULL); + ++coord->item_pos; + coord_clear_iplug(coord); +} + +static inline void +coord_add_item_pos(coord_t * coord, int delta) +{ + assert("nikita-2482", coord != NULL); + coord->item_pos += delta; + coord_clear_iplug(coord); +} + +static inline void +coord_invalid_item_pos(coord_t * coord) +{ + assert("nikita-2832", coord != NULL); + coord->item_pos = (unsigned short)~0; + coord_clear_iplug(coord); +} + +/* Reverse a direction. */ +static inline sideof +sideof_reverse(sideof side) +{ + return side == LEFT_SIDE ? 
RIGHT_SIDE : LEFT_SIDE; +} + +/* NOTE: There is a somewhat odd mixture of the following opposed terms: + + "first" and "last" + "next" and "prev" + "before" and "after" + "leftmost" and "rightmost" + + But I think the chosen names are decent the way they are. +*/ + +/* COORD INITIALIZERS */ + +/* Initialize an invalid coordinate. */ +extern void coord_init_invalid(coord_t * coord, const znode * node); + +extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node); + +/* Initialize a coordinate to point at the first unit of the first item. If the node is + empty, it is positioned at the EMPTY_NODE. */ +extern void coord_init_first_unit(coord_t * coord, const znode * node); + +/* Initialize a coordinate to point at the last unit of the last item. If the node is + empty, it is positioned at the EMPTY_NODE. */ +extern void coord_init_last_unit(coord_t * coord, const znode * node); + +/* Initialize a coordinate to before the first item. If the node is empty, it is + positioned at the EMPTY_NODE. */ +extern void coord_init_before_first_item(coord_t * coord, const znode * node); + +/* Initialize a coordinate to after the last item. If the node is empty, it is positioned + at the EMPTY_NODE. */ +extern void coord_init_after_last_item(coord_t * coord, const znode * node); + +/* Initialize a coordinate to after last unit in the item. Coord must be set + already to existing item */ +void coord_init_after_item_end(coord_t * coord); + +/* Initialize a coordinate to before the item. Coord must be set already to existing item */ +void coord_init_before_item(coord_t *); +/* Initialize a coordinate to after the item. Coord must be set already to existing item */ +void coord_init_after_item(coord_t *); + +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */ +extern void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir); + +/* Initialize a coordinate by 0s. 
Used in places where init_coord was used and + it was not clear how actually + FIXME-VS: added by vs (2002, june, 8) */ +extern void coord_init_zero(coord_t * coord); + +/* COORD METHODS */ + +/* after shifting of node content, coord previously set properly may become + invalid, try to "normalize" it. */ +void coord_normalize(coord_t * coord); + +/* Copy a coordinate. */ +extern void coord_dup(coord_t * coord, const coord_t * old_coord); + +/* Copy a coordinate without check. */ +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord); + +unsigned coord_num_units(const coord_t * coord); + +/* Return the last valid unit number at the present item (i.e., + coord_num_units() - 1). */ +static inline unsigned +coord_last_unit_pos(const coord_t * coord) +{ + return coord_num_units(coord) - 1; +} + +#if REISER4_DEBUG +/* For assertions only, checks for a valid coordinate. */ +extern int coord_check(const coord_t * coord); +#endif + +extern int coords_equal(const coord_t * c1, const coord_t * c2); + +/* Returns true if two coordinates are consider equal. Coordinates that are between units + or items are considered equal. */ +extern int coord_eq(const coord_t * c1, const coord_t * c2); + +extern void print_coord(const char *mes, const coord_t * coord, int print_node); + +/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost + return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */ +extern coord_wrt_node coord_wrt(const coord_t * coord); + +/* Returns true if the coordinates are positioned at adjacent units, regardless of + before-after or item boundaries. */ +extern int coord_are_neighbors(coord_t * c1, coord_t * c2); + +/* Assuming two coordinates are positioned in the same node, return NCOORD_CMP_ON_RIGHT, + NCOORD_CMP_ON_LEFT, or NCOORD_CMP_SAME depending on c1's position relative to c2. 
*/
+extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
+
+/* COORD PREDICATES */
+
+/* Returns true if the coord was initialized by coord_init_invalid(). */
+extern int coord_is_invalid(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned at an existing item, not before or after
+   an item. It may be placed at, before, or after any unit within the item, whether
+   existing or not. If this is true you can call methods of the item plugin. */
+extern int coord_is_existing_item(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned after an item, before an item, after the
+   last unit of an item, before the first unit of an item, or at an empty node. */
+extern int coord_is_between_items(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned at an existing unit, not before or after a
+   unit. */
+extern int coord_is_existing_unit(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned at an empty node. */
+extern int coord_is_empty(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned at the first unit of the first item. Not
+   true for empty nodes nor coordinates positioned before the first item. */
+extern int coord_is_leftmost_unit(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned after the last item or after the last unit
+   of the last item or it is an empty node. */
+extern int coord_is_after_rightmost(const coord_t * coord);
+
+/* Returns true if the coordinate is positioned before the first item or it is an empty
+   node. */
+extern int coord_is_before_leftmost(const coord_t * coord);
+
+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
+   argument. */
+extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
+
+/* COORD MODIFIERS */
+
+/* Advances the coordinate by one unit to the right. If empty, no change. If
+   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.
Returns 0 if new position is + an existing unit. */ +extern int coord_next_unit(coord_t * coord); + +/* Advances the coordinate by one item to the right. If empty, no change. If + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is + an existing item. */ +extern int coord_next_item(coord_t * coord); + +/* Advances the coordinate by one unit to the left. If empty, no change. If + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position + is an existing unit. */ +extern int coord_prev_unit(coord_t * coord); + +/* Advances the coordinate by one item to the left. If empty, no change. If + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position + is an existing item. */ +extern int coord_prev_item(coord_t * coord); + +/* If the coordinate is between items, shifts it to the right. Returns 0 on success and + non-zero if there is no position to the right. */ +extern int coord_set_to_right(coord_t * coord); + +/* If the coordinate is between items, shifts it to the left. Returns 0 on success and + non-zero if there is no position to the left. */ +extern int coord_set_to_left(coord_t * coord); + +/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success + and non-zero if the unit did not exist. */ +extern int coord_set_after_unit(coord_t * coord); + +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. 
*/ +extern int coord_sideof_unit(coord_t * coord, sideof dir); + +/* iterate over all units in @node */ +#define for_all_units( coord, node ) \ + for( coord_init_before_first_item( ( coord ), ( node ) ) ; \ + coord_next_unit( coord ) == 0 ; ) + +/* iterate over all items in @node */ +#define for_all_items( coord, node ) \ + for( coord_init_before_first_item( ( coord ), ( node ) ) ; \ + coord_next_item( coord ) == 0 ; ) + +#if REISER4_DEBUG_OUTPUT +extern const char *coord_tween_tostring(between_enum n); +#endif + +/* COORD/ITEM METHODS */ + +extern int item_utmost_child_real_block(const coord_t * coord, sideof side, reiser4_block_nr * blk); +extern int item_utmost_child(const coord_t * coord, sideof side, jnode ** child); + +/* __REISER4_COORD_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/crab_lock.c linux-2.6.4-ck1/fs/reiser4/crab_lock.c --- linux-2.6.4/fs/reiser4/crab_lock.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/crab_lock.c 2004-03-11 22:45:15.194524979 +1100 @@ -0,0 +1,98 @@ +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "debug.h" +#include "crab_lock.h" +#include "znode.h" + +#if !REISER4_ALL_IN_ONE +reiser4_internal void +crab_init(crab_lock_t *clock) +{ + xmemset(clock, 0, sizeof *clock); +} + +reiser4_internal int +crab_prepare(crab_lock_t *clock, znode *node) +{ + int result; + + spin_lock_znode(node); + RLOCK_ZLOCK(&node->lock); + if (!znode_is_wlocked(node) && ZJNODE(node)->atom == NULL) { + clock->node = zref(node); + clock->version = node->version; + clock->locked = 0; + result = 0; + } else + result = -E_REPEAT; + RUNLOCK_ZLOCK(&node->lock); + spin_unlock_znode(node); + return result; +} + +reiser4_internal int +crab_lock(crab_lock_t *plock, crab_lock_t *clock, znode *node) +{ + int result; + znode *parent; + + 
parent = plock->node;
+	spin_lock_znode(parent);
+	RLOCK_ZLOCK(&parent->lock);
+	/* the parent must still have the version recorded in @plock and belong to no atom */
+	if (parent->version == plock->version && ZJNODE(parent)->atom == NULL) {
+		spin_lock_znode(node);
+		RLOCK_ZLOCK(&node->lock);
+		if (!znode_is_wlocked(node) && ZJNODE(node)->atom == NULL) {
+			clock->node = zref(node);	/* reference is dropped by crab_done() */
+			clock->version = node->version;
+			clock->locked = 1;
+			/* RLOCK_DLOCK(node); */
+			result = 0;
+		} else
+			result = -E_REPEAT;
+		RUNLOCK_ZLOCK(&node->lock);
+		spin_unlock_znode(node);
+	} else
+		result = -E_REPEAT;
+	/* parent locks are released last, in reverse order of acquisition */
+	RUNLOCK_ZLOCK(&parent->lock);
+	spin_unlock_znode(parent);
+	return result;
+}
+/* Clears the locked flag of @clock if it is set. The matching RUNLOCK_DLOCK call is
+   commented out, so only the flag changes here. */
+reiser4_internal void
+crab_unlock(crab_lock_t *clock)
+{
+	if (clock->locked) {
+		/* RUNLOCK_DLOCK(clock->node); */
+		clock->locked = 0;
+	}
+}
+/* Drops the znode reference held in @clock with zput() and resets the pointer to NULL.
+   Does nothing if no node is recorded. */
+reiser4_internal void
+crab_done(crab_lock_t *clock)
+{
+	if (clock->node != NULL) {
+		zput(clock->node);
+		clock->node = NULL;
+	}
+}
+/* Copies @from into @to by structure assignment, then re-initializes @from with
+   crab_init(). No zref()/zput() is done here. */
+reiser4_internal void
+crab_move(crab_lock_t *to, crab_lock_t *from)
+{
+	*to = *from;
+	crab_init(from);
+}
+
+#endif
+
+/* Make Linus happy.
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/crab_lock.h linux-2.6.4-ck1/fs/reiser4/crab_lock.h --- linux-2.6.4/fs/reiser4/crab_lock.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/crab_lock.h 2004-03-11 22:45:15.195524823 +1100 @@ -0,0 +1,33 @@ +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#if !defined(__FS_REISER4_CRAB_LOCK_H__) +#define __FS_REISER4_CRAB_LOCK_H__ + +#include "lock.h" + +typedef struct { + znode *node; + int locked; + __u64 version; +} crab_lock_t; + +void crab_init (crab_lock_t *clock); +int crab_prepare (crab_lock_t *clock, znode *node); +int crab_lock (crab_lock_t *parent, crab_lock_t *child, znode *node); +void crab_unlock (crab_lock_t *clock); +void crab_done (crab_lock_t *clock); + +void crab_move (crab_lock_t *to, crab_lock_t *from); + +/* __FS_REISER4_CRAB_LOCK_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/crypt.c linux-2.6.4-ck1/fs/reiser4/crypt.c --- linux-2.6.4/fs/reiser4/crypt.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/crypt.c 2004-03-11 22:45:15.195524823 +1100 @@ -0,0 +1,116 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ +/* Crypto-plugins for reiser4 cryptcompress objects */ + +#include "debug.h" +#include "plugin/plugin.h" +#include +#include +#define MAX_CRYPTO_BLOCKSIZE 128 +#define NONE_EXPKEY_WORDS 8 +#define NONE_BLOCKSIZE 8 + +/* + +Default align() method of the crypto-plugin (look for description of this method +in plugin/plugin.h) + +1) creates the aligning armored format of the input flow before encryption. 
+ "armored" means that padding is filled by private data (for example, + pseudo-random sequence of bytes is not private data). +2) returns length of appended padding + + [ flow | aligning_padding ] + ^ + | + @pad +*/ +static int align_cluster_common(__u8 *pad /* pointer to the first byte of aligning format */, + int flow_size /* size of non-aligned flow */, + int blocksize /* crypto-block size */) +{ + int pad_size; + + assert("edward-01", pad != NULL); + assert("edward-02", flow_size != 0); + assert("edward-03", blocksize != 0 || blocksize <= MAX_CRYPTO_BLOCKSIZE); + + pad_size = blocksize - (flow_size % blocksize); + get_random_bytes (pad, pad_size); + return pad_size; +} + +/* common scale method (look for description of this method in plugin/plugin.h) + for all symmetric algorithms which doesn't scale anything +*/ +static loff_t scale_common(struct inode * inode UNUSED_ARG, + size_t blocksize UNUSED_ARG /* crypto block size, which is returned + by blocksize method of crypto plugin */, + loff_t src_off /* offset to scale */) +{ + return src_off; +} + +/* blocksize method (look for description of this method in plugin/plugin.h) + for none crypto plugin */ +static size_t blocksize_none (__u16 keysize UNUSED_ARG /* size of private key, bits */) +{ + return NONE_BLOCKSIZE; +} + +/* set_key (look for description of this method in plugin/plugin.h) + for none crypto plugin */ +static int set_key_none(__u32 *expkey /* cpu key */, + const __u8 *key UNUSED_ARG) +{ + memset(expkey, 0, NONE_EXPKEY_WORDS * sizeof(__u32)); + return 0; +} + +/* plugin->encrypt, + plugin->decrypt for none crypto plugin + (look for description of this methods in plugin/plugin.h) +*/ +static void crypt_none (__u32 *expkey UNUSED_ARG, __u8 *dst, const __u8 *src) +{ + assert("edward-04", dst != NULL); + assert("edward-05", src != NULL); + + memcpy(dst, src, NONE_BLOCKSIZE); +} + +/* EDWARD-FIXME-HANS: why is this not in the plugin directory? 
*/ + +/* crypto plugins */ +crypto_plugin crypto_plugins[LAST_CRYPTO_ID] = { + [NONE_CRYPTO_ID] = { + .h = { + .type_id = REISER4_CRYPTO_PLUGIN_TYPE, + .id = NONE_CRYPTO_ID, + .pops = NULL, + /* this is a special crypto algorithm which + doesn't change data, this is useful for + debuging purposes and various benchmarks */ + .label = "none", + .desc = "Id rearrangement", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .nr_keywords = NONE_EXPKEY_WORDS, + .blocksize = blocksize_none, + .scale = scale_common, + .align_cluster = align_cluster_common, + .set_key = set_key_none, + .encrypt = crypt_none, + .decrypt = crypt_none + } +}; + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/debug.c linux-2.6.4-ck1/fs/reiser4/debug.c --- linux-2.6.4/fs/reiser4/debug.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/debug.c 2004-03-11 22:45:15.196524668 +1100 @@ -0,0 +1,708 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Debugging facilities. */ + +#include "kattr.h" +#include "reiser4.h" +#include "context.h" +#include "super.h" +#include "txnmgr.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +__u32 reiser4_current_trace_flags = 0; + +extern void cond_resched(void); + +static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE]; +static spinlock_t panic_guard = SPIN_LOCK_UNLOCKED; + +/* Your best friend. Call it on each occasion. This is called by + fs/reiser4/debug.h:reiser4_panic(). */ +reiser4_internal void +reiser4_do_panic(const char *format /* format string */ , ... 
/* rest */) +{ + static int in_panic = 0; + va_list args; + + if (in_panic == 0) { + in_panic = 1; + + spin_lock(&panic_guard); + va_start(args, format); + vsnprintf(panic_buf, sizeof(panic_buf), format, args); + va_end(args); + printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf); + spin_unlock(&panic_guard); + + DEBUGON(1); + + /* do something more impressive here, print content of + get_current_context() */ + if (get_current_context_check() != NULL) { + struct super_block *super; + reiser4_context *ctx; + + print_lock_counters("pins held", lock_counters()); + print_contexts(); + ctx = get_current_context(); + super = ctx->super; + if (get_super_private(super) != NULL && + reiser4_is_debugged(super, REISER4_VERBOSE_PANIC)) + print_znodes("znodes", current_tree); +#if REISER4_DEBUG + { + reiser4_context *top; + extern spinlock_t active_contexts_lock; + + top = ctx->parent; + spin_lock(&active_contexts_lock); + context_list_remove(top); + spin_unlock(&active_contexts_lock); + } +#endif + } + } + BUG(); + /* to make gcc happy about noreturn attribute */ + panic("%s", panic_buf); +} + +reiser4_internal void +reiser4_print_prefix(const char *level, int reperr, const char *mid, + const char *function, const char *file, int lineno) +{ + const char *comm; + int pid; + + if (unlikely(in_interrupt() || in_irq())) { + comm = "interrupt"; + pid = 0; + } else { + comm = current->comm; + pid = current->pid; + } + printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n", + level, comm, pid, function, file, lineno, mid); + if (reperr) + report_err(); +} + +/* Preemption point: this should be called periodically during long running + operations (carry, allocate, and squeeze are best examples) */ +reiser4_internal int +preempt_point(void) +{ + assert("nikita-3008", schedulable()); + cond_resched(); + return signal_pending(current); +} + +#if REISER4_DEBUG +/* Debugging aid: return struct where information about locks taken by current + thread is accumulated. 
This can be used to formulate lock ordering + constraints and various assertions. + +*/ +lock_counters_info * +lock_counters(void) +{ + reiser4_context *ctx = get_current_context(); + assert("jmacd-1123", ctx != NULL); + return &ctx->locks; +} + +/* check that no spinlocks are held */ +int schedulable(void) +{ + if (REISER4_DEBUG && get_current_context_check() != NULL) { + lock_counters_info *counters; + + counters = lock_counters(); + if (counters->spin_locked != 0) { + print_lock_counters("in atomic", counters); + return 0; + } + return 1; + } + might_sleep(); + return 1; +} +#endif + +#if REISER4_DEBUG_OUTPUT && REISER4_DEBUG +void +print_lock_counters(const char *prefix, const lock_counters_info * info) +{ + printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n" + "jload: %i, " + "txnh: %i, atom: %i, stack: %i, txnmgr: %i, " + "ktxnmgrd: %i, fq: %i, reiser4_sb: %i\n" + "inode: %i, " + "cbk_cache: %i (r:%i,w%i), " + "epoch: %i, eflush: %i, " + "zlock: %i (r:%i, w:%i)\n" + "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n" + "d: %i, x: %i, t: %i\n", prefix, + info->spin_locked_jnode, + info->rw_locked_tree, info->read_locked_tree, + info->write_locked_tree, + + info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk, + + info->spin_locked_jload, + info->spin_locked_txnh, + info->spin_locked_atom, info->spin_locked_stack, + info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd, + info->spin_locked_fq, info->spin_locked_super, + info->spin_locked_inode_object, + + info->rw_locked_cbk_cache, + info->read_locked_cbk_cache, + info->write_locked_cbk_cache, + + info->spin_locked_epoch, + info->spin_locked_super_eflush, + + info->rw_locked_zlock, + info->read_locked_zlock, + info->write_locked_zlock, + + info->spin_locked, + info->long_term_locked_znode, + info->inode_sem_r, info->inode_sem_w, + info->d_refs, info->x_refs, info->t_refs); +} +#endif + +reiser4_internal int +reiser4_are_all_debugged(struct super_block *super, __u32 flags) +{ + return 
(get_super_private(super)->debug_flags & flags) == flags; +} + +reiser4_internal int +reiser4_is_debugged(struct super_block *super, __u32 flag) +{ + return get_super_private(super)->debug_flags & flag; +} + +#if REISER4_TRACE +/* tracing setup: global trace flags stored in global variable plus + per-thread trace flags plus per-fs trace flags. + */ +__u32 get_current_trace_flags(void) +{ + __u32 flags; + reiser4_context *ctx; + + flags = reiser4_current_trace_flags; + ctx = get_current_context_check(); + if (ctx) { + flags |= ctx->trace_flags; + flags |= get_super_private(ctx->super)->trace_flags; + } + return flags; +} +#endif + +/* allocate memory. This calls kmalloc(), performs some additional checks, and + keeps track of how many memory was allocated on behalf of current super + block. */ +reiser4_internal void * +reiser4_kmalloc(size_t size /* number of bytes to allocate */ , + int gfp_flag /* allocation flag */ ) +{ + void *result; + + assert("nikita-3009", ergo(gfp_flag & __GFP_WAIT, schedulable())); + + result = kmalloc(size, gfp_flag); + if (REISER4_DEBUG && result != NULL) { + unsigned int usedsize; + reiser4_super_info_data *sbinfo; + + usedsize = ksize(result); + + sbinfo = get_current_super_private(); + + assert("nikita-3459", usedsize >= size); + assert("nikita-1407", sbinfo != NULL); + reiser4_spin_lock_sb(sbinfo); + ON_DEBUG(sbinfo->kmalloc_allocated += usedsize); + reiser4_spin_unlock_sb(sbinfo); + } + return result; +} + +/* release memory allocated by reiser4_kmalloc() and update counter. 
*/ +reiser4_internal void +reiser4_kfree(void *area /* memory to from */) +{ + assert("nikita-1410", area != NULL); + return reiser4_kfree_in_sb(area, reiser4_get_current_sb()); +} + +reiser4_internal void +reiser4_kfree_in_sb(void *area /* memory to from */, struct super_block *sb) +{ + assert("nikita-2729", area != NULL); + if (REISER4_DEBUG) { + unsigned int size; + reiser4_super_info_data *sbinfo; + + size = ksize(area); + + sbinfo = get_super_private(sb); + + reiser4_spin_lock_sb(sbinfo); + assert("nikita-2730", sbinfo->kmalloc_allocated >= (int) size); + ON_DEBUG(sbinfo->kmalloc_allocated -= size); + reiser4_spin_unlock_sb(sbinfo); + } + kfree(area); +} + + +#if defined(CONFIG_REISER4_NOOPT) +void __you_cannot_kmalloc_that_much(void) +{ + BUG(); +} +#endif + +#if REISER4_DEBUG + +int +no_counters_are_held(void) +{ + lock_counters_info *counters; + + counters = lock_counters(); + return + (counters->rw_locked_zlock == 0) && + (counters->read_locked_zlock == 0) && + (counters->write_locked_zlock == 0) && + (counters->spin_locked_jnode == 0) && + (counters->rw_locked_tree == 0) && + (counters->read_locked_tree == 0) && + (counters->write_locked_tree == 0) && + (counters->rw_locked_dk == 0) && + (counters->read_locked_dk == 0) && + (counters->write_locked_dk == 0) && + (counters->spin_locked_txnh == 0) && + (counters->spin_locked_atom == 0) && + (counters->spin_locked_stack == 0) && + (counters->spin_locked_txnmgr == 0) && + (counters->spin_locked_inode_object == 0) && + (counters->spin_locked == 0) && + (counters->long_term_locked_znode == 0) && + (counters->inode_sem_r == 0) && + (counters->inode_sem_w == 0); +} + +int +commit_check_locks(void) +{ + lock_counters_info *counters; + int inode_sem_r; + int inode_sem_w; + int result; + + counters = lock_counters(); + inode_sem_r = counters->inode_sem_r; + inode_sem_w = counters->inode_sem_w; + + counters->inode_sem_r = counters->inode_sem_w = 0; + result = no_counters_are_held(); + counters->inode_sem_r = 
inode_sem_r; + counters->inode_sem_w = inode_sem_w; + return result; +} + +void +return_err(int code, const char *file, int line) +{ + if (code < 0 && is_in_reiser4_context()) { + reiser4_context *ctx = get_current_context(); + + if (ctx != NULL) { + fill_backtrace(&ctx->err.path, + REISER4_BACKTRACE_DEPTH, 0); + ctx->err.code = code; + ctx->err.file = file; + ctx->err.line = line; + } + } +} + +void +report_err(void) +{ + reiser4_context *ctx = get_current_context_check(); + + if (ctx != NULL) { + if (ctx->err.code != 0) { +#ifdef CONFIG_FRAME_POINTER + int i; + for (i = 0; i < REISER4_BACKTRACE_DEPTH ; ++ i) + printk("0x%p ", ctx->err.path.trace[i]); + printk("\n"); +#endif + printk("code: %i at %s:%i\n", + ctx->err.code, ctx->err.file, ctx->err.line); + } + } +} + +#endif + +#ifdef CONFIG_FRAME_POINTER +extern int kswapd(void *); + +#include +#include "ktxnmgrd.h" + +struct repacker; +extern int reiser4_repacker(struct repacker *); +extern int repacker_d(void*); + +static int is_addr_in(void *addr, void *start, void *end) +{ + return start < addr && addr < end; +} + +static int is_last_frame(void *addr) +{ + if (addr == NULL) + return 1; + if (is_addr_in(addr, kswapd, wakeup_kswapd)) + return 1; + else if (is_addr_in(addr, reiser4_repacker, repacker_d)) + return 1; + else if (is_addr_in(addr, init_ktxnmgrd_context, ktxnmgrd_attach)) + return 1; + else if (is_addr_in(addr, init_entd_context, done_entd_context)) + return 1; + else if (!kernel_text_address((unsigned long)addr)) + return 1; + else + return 0; +} + +reiser4_internal void +fill_backtrace(backtrace_path *path, int depth, int shift) +{ + int i; + void *addr; + + cassert(REISER4_BACKTRACE_DEPTH == 4); + assert("nikita-3229", shift < 6); + + /* long live Duff! 
*/
+
+#define FRAME(nr)						\
+	case (nr):						\
+		addr = __builtin_return_address((nr) + 2);	\
+		break
+
+	xmemset(path, 0, sizeof *path);
+	addr = NULL;
+	for (i = 0; i < depth; ++ i) {
+		switch(i + shift) {
+			FRAME(0);
+			FRAME(1);
+			FRAME(2);
+			FRAME(3);
+			FRAME(4);
+			FRAME(5);
+			FRAME(6);
+			FRAME(7);
+			FRAME(8);
+			FRAME(9);
+			FRAME(10);
+		default:
+			impossible("nikita-3230", "everything is wrong");
+		}
+		path->trace[i] = addr;
+		if (is_last_frame(addr))
+			break;
+	}
+}
+#endif
+
+#if KERNEL_DEBUGGER
+void debugtrap(void)
+{
+	/* do nothing. Put break point here. */
+#ifdef CONFIG_KGDB
+	extern void breakpoint(void);
+	breakpoint();
+#endif
+}
+#endif
+
+#if REISER4_DEBUG
+
+void call_on_each_assert(void)
+{
+	return;
+	/*
+	 * DON'T USE ASSERTIONS HERE :)
+	 */
+	if (is_in_reiser4_context()) {
+		reiser4_super_info_data *sinfo;
+		reiser4_context *ctx;
+
+		ctx = (reiser4_context *) current->fs_context;
+		sinfo = ctx->super->s_fs_info;
+		/* put checks here */
+	}
+}
+
+#endif
+
+#if REISER4_DEBUG_OUTPUT
+reiser4_internal void
+info_atom(const char *prefix, const txn_atom * atom)
+{
+	if (atom == NULL) {
+		printk("%s: no atom\n", prefix);
+		return;
+	}
+
+	printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
+	       " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
+	       atomic_read(&atom->refcount), atom->atom_id, atom->flags, atom->txnh_count,
+	       atom->capture_count, atom->stage, atom->start_time, atom->flushed);
+}
+
+#endif
+
+const char *coord_tween_tostring(between_enum n);
+
+/* Append a one-line textual description of @node to the NUL-terminated string
+   already in @buf: node address, state ("wandr", "creat", "reloc", "fresh",
+   "dirty" or "clean", with ",dirty" appended where applicable), atom id (if
+   the node has an atom), block/page/state word, level, index (only when the
+   node is not a znode) and " fq" when JNODE_FLUSH_QUEUED is set.
+
+   @buf carries no length, so the caller must supply enough room. Every piece
+   formatted here goes through snprintf() into a local buffer, so the length of
+   the appended text is bounded by the sizes of those buffers.
+
+   The jnode spin lock is try-locked for the duration of the call and released
+   only when the try-lock returned 1. */
+reiser4_internal void
+jnode_tostring_internal(jnode * node, char *buf)
+{
+	const char *state;
+	char atom[32];
+	/* " block=%s page=%p state=%lx" does not fit into 48 bytes once the
+	   pointer and the state word are printed at 64-bit width */
+	char block[80];
+	char items[32];
+	int fmttd;
+	int dirty;
+	int lockit;
+
+	lockit = spin_trylock_jnode(node);
+
+	fmttd = jnode_is_znode(node);
+	dirty = JF_ISSET(node, JNODE_DIRTY);
+
+	/* bounded: truncate rather than overrun the on-stack buffer */
+	snprintf(block, sizeof(block), " block=%s page=%p state=%lx",
+		 sprint_address(jnode_get_block(node)), node->pg, node->state);
+
+	if (JF_ISSET(node, JNODE_OVRWR)) {
+		state = dirty ? "wandr,dirty" : "wandr";
+	} else if (JF_ISSET(node, JNODE_RELOC) && JF_ISSET(node, JNODE_CREATED)) {
+		state = dirty ? "creat,dirty" : "creat";
+	} else if (JF_ISSET(node, JNODE_RELOC)) {
+		state = dirty ? "reloc,dirty" : "reloc";
+	} else if (JF_ISSET(node, JNODE_CREATED)) {
+		assert("jmacd-61554", dirty);
+		state = "fresh";
+		/* block description is not printed for a "fresh" node */
+		block[0] = 0;
+	} else {
+		state = dirty ? "dirty" : "clean";
+	}
+
+	if (node->atom == NULL) {
+		atom[0] = 0;
+	} else {
+		snprintf(atom, sizeof(atom), " atom=%u", node->atom->atom_id);
+	}
+
+	items[0] = 0;
+	if (!fmttd) {
+		snprintf(items, sizeof(items), " index=%lu", index_jnode(node));
+	}
+
+	sprintf(buf + strlen(buf),
+		"%s=%p [%s%s%s level=%u%s%s]",
+		fmttd ? "z" : "j",
+		node,
+		state, atom, block, jnode_get_level(node), items, JF_ISSET(node, JNODE_FLUSH_QUEUED) ? " fq" : "");
+
+	if (lockit == 1) {
+		UNLOCK_JNODE(node);
+	}
+}
+
+reiser4_internal const char *
+jnode_tostring(jnode * node)
+{
+	static char fmtbuf[256];
+	fmtbuf[0] = 0;
+	jnode_tostring_internal(node, fmtbuf);
+	return fmtbuf;
+}
+
+reiser4_internal const char *
+znode_tostring(znode * node)
+{
+	return jnode_tostring(ZJNODE(node));
+}
+
+reiser4_internal const char *
+flags_tostring(int flags)
+{
+	switch (flags) {
+	case JNODE_FLUSH_WRITE_BLOCKS:
+		return "(write blocks)";
+	case JNODE_FLUSH_COMMIT:
+		return "(commit)";
+	case JNODE_FLUSH_MEMORY_FORMATTED:
+		return "(memory-z)";
+	case JNODE_FLUSH_MEMORY_UNFORMATTED:
+		return "(memory-j)";
+	default:
+		return "(unknown)";
+	}
+}
+
+
+static int
+proc_dodebug(ctl_table *table, int write, struct file *file,
+	     void *buffer, size_t *lenp)
+{
+	char tmpbuf[20], *p, c;
+	unsigned int value;
+	size_t left, len;
+
+	if ((file->f_pos && !write) || !*lenp) {
+		*lenp = 0;
+		return 0;
+	}
+
+	left = *lenp;
+
+	if (write) {
+		if (!access_ok(VERIFY_READ, buffer, left))
+			return -EFAULT;
+		p = (char *) buffer;
+		while (left && __get_user(c, p) >= 0 && isspace(c))
+			left--, p++;
+		if (!left)
+			goto done;
+
+		if (left > sizeof(tmpbuf) - 1)
+			return -EINVAL;
+		
/* the copy can still fault after access_ok(); do not parse stale stack bytes */
+		if (copy_from_user(tmpbuf, p, left))
+			return -EFAULT;
+		tmpbuf[left] = '\0';
+
+		/* parse an unsigned decimal number; anything but trailing white
+		   space after the digits is rejected */
+		for (p = tmpbuf, value = 0; '0' <= *p && *p <= '9'; p++, left--)
+			value = 10 * value + (*p - '0');
+		if (*p && !isspace(*p))
+			return -EINVAL;
+		while (left && isspace(*p))
+			left--, p++;
+		*(unsigned int *) table->data = value;
+	} else {
+		if (!access_ok(VERIFY_WRITE, buffer, left))
+			return -EFAULT;
+		/* the value is unsigned and the write path accepts digits only, so
+		   print it as unsigned to keep what is read writable again */
+		len = sprintf(tmpbuf, "%u", *(unsigned int *) table->data);
+		if (len > left)
+			len = left;
+		if (__copy_to_user(buffer, tmpbuf, len))
+			return -EFAULT;
+		if ((left -= len) > 0) {
+			if (put_user('\n', (char *)buffer + len))
+				return -EFAULT;
+			left--;
+		}
+	}
+
+done:
+	/* report how many bytes were consumed/produced and advance the file position */
+	*lenp -= left;
+	file->f_pos += *lenp;
+	return 0;
+}
+
+unsigned int trace_flags;
+
+#define REISER4_SYSCTL_TRACE_FLAGS 1
+static ctl_table reiser4_sysctl[] = {
+	{
+		/* /proc/sys/fs/reiser4/trace_flags */
+		.ctl_name = REISER4_SYSCTL_TRACE_FLAGS,
+		.procname = "trace_flags",
+		.data = &trace_flags,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = &proc_dodebug
+	},
+	{
+		.ctl_name = 0
+	}
+};
+
+#define SYS_FS_REISER4 1
+ctl_table sys_fs_reiser4[] = {
+	{
+		/* /proc/sys/fs/reiser4 */
+		.ctl_name = SYS_FS_REISER4,
+		.procname = "reiser4",
+		.mode = 0644,
+		.child = reiser4_sysctl
+	},
+	{
+		.ctl_name = 0
+	}
+};
+
+ctl_table sys_fs[] = {
+	{
+		/* /proc/sys/fs/ */
+		.ctl_name = CTL_FS,
+		.procname = "fs",
+		.mode = 0555,
+		.child = sys_fs_reiser4
+	},
+	{
+		.ctl_name = 0
+	}
+};
+
+static struct ctl_table_header *reiser4_sysctl_header;
+
+reiser4_internal int
+reiser4_sysctl_init(void)
+{
+	if (!reiser4_sysctl_header)
+		reiser4_sysctl_header = register_sysctl_table(sys_fs, 1);
+	return 0;
+}
+
+reiser4_internal void
+reiser4_sysctl_done(void)
+{
+	if (reiser4_sysctl_header) {
+		unregister_sysctl_table(reiser4_sysctl_header);
+		reiser4_sysctl_header = NULL;
+	}
+}
+
+
+/* Make Linus happy.
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/debug.h linux-2.6.4-ck1/fs/reiser4/debug.h --- linux-2.6.4/fs/reiser4/debug.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/debug.h 2004-03-11 22:45:15.198524357 +1100 @@ -0,0 +1,524 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Declarations of debug macros. */ + +#if !defined( __FS_REISER4_DEBUG_H__ ) +#define __FS_REISER4_DEBUG_H__ + +#include "forward.h" +#include "reiser4.h" + +/* for __u?? types */ +#include +/* for struct super_block, etc */ +#include +/* for in_interrupt() */ +#include + +#include + +/* generic function to produce formatted output, decorating it with + whatever standard prefixes/postfixes we want. "Fun" is a function + that will be actually called, can be printk, panic etc. + This is for use by other debugging macros, not by users. */ +#define DCALL(lev, fun, reperr, label, format, ...) \ +({ \ + reiser4_print_prefix(lev, reperr, label, \ + __FUNCTION__, __FILE__, __LINE__); \ + fun(lev format "\n" , ## __VA_ARGS__); \ +}) + +#define reiser4_panic(mid, format, ...) \ + DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__) + +/* print message with indication of current process, file, line and + function */ +#define reiser4_log(label, format, ...) \ + DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__) + +/* Assertion checked during compilation. + If "cond" is false (0) we get duplicate case label in switch. + Use this to check something like famous + cassert (sizeof(struct reiserfs_journal_commit) == 4096) ; + in 3.x journal.c. If cassertion fails you get compiler error, + so no "maintainer-id". 
+*/ +#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } }) + +#if defined(CONFIG_REISER4_DEBUG) +/* turn on assertions */ +#define REISER4_DEBUG (1) +#else +#define REISER4_DEBUG (0) +#endif + +#if defined(CONFIG_REISER4_DEBUG_MODIFY) +/* this significantly slows down testing, but we should run our testsuite + through with this every once in a while. */ +#define REISER4_DEBUG_MODIFY (1) +#else +#define REISER4_DEBUG_MODIFY (0) +#endif + +#if defined(CONFIG_REISER4_DEBUG_MEMCPY) +/* provide our own memcpy/memmove to profile shifts */ +#define REISER4_DEBUG_MEMCPY (1) +#else +#define REISER4_DEBUG_MEMCPY (0) +#endif + +#if defined(CONFIG_REISER4_DEBUG_NODE) +/* check consistency of internal node structures */ +#define REISER4_DEBUG_NODE (1) +#else +#define REISER4_DEBUG_NODE (0) +#endif + +#if defined(CONFIG_REISER4_ZERO_NEW_NODE) +/* if this is non-zero, clear content of new node, otherwise leave whatever + may happen to be here */ +#define REISER4_ZERO_NEW_NODE (1) +#else +#define REISER4_ZERO_NEW_NODE (0) +#endif + +#if defined(CONFIG_REISER4_TRACE) +/* tracing facility. + + REISER4_DEBUG doesn't necessary implies tracing, because tracing is only + meaningful during debugging and can produce big amonts of output useless + for average user. +*/ +#define REISER4_TRACE (1) +#else +#define REISER4_TRACE (0) +#endif + +#if defined(CONFIG_REISER4_EVENT_LOG) +/* collect tree traces */ +#define REISER4_TRACE_TREE (1) +#else +#define REISER4_TRACE_TREE (0) +#endif + +#if defined(CONFIG_REISER4_STATS) +/* collect internal stats. Should be switched to use kernel logging facility + once latter merged. */ +#define REISER4_STATS (1) +#else +#define REISER4_STATS (0) +#endif + +#if defined(CONFIG_REISER4_DEBUG_OUTPUT) +/* debugging print functions. 
*/ +#define REISER4_DEBUG_OUTPUT (1) +#else +#define REISER4_DEBUG_OUTPUT (0) +#endif + +#if defined(CONFIG_REISER4_COPY_ON_CAPTURE) +/* enable copy on capture */ +#define REISER4_COPY_ON_CAPTURE (1) +#else +#define REISER4_COPY_ON_CAPTURE (0) +#endif + +#if defined(CONFIG_REISER4_LOCKPROF) +#define REISER4_LOCKPROF (1) +#else +#define REISER4_LOCKPROF (0) +#endif + +#if defined(CONFIG_REISER4_LARGE_KEY) +#define REISER4_LARGE_KEY (1) +#else +#define REISER4_LARGE_KEY (0) +#endif + +#if defined(CONFIG_REISER4_ALL_IN_ONE) +#define REISER4_ALL_IN_ONE (1) +#else +#define REISER4_ALL_IN_ONE (0) +#endif + +#define noop do {;} while(0) + +#if REISER4_DEBUG +/* version of info that only actually prints anything when _d_ebugging + is on */ +#define dinfo(format, ...) printk(format , ## __VA_ARGS__) +/* macro to catch logical errors. Put it into `default' clause of + switch() statement. */ +#define impossible(label, format, ...) \ + reiser4_panic(label, "impossible: " format , ## __VA_ARGS__) +/* assert assures that @cond is true. If it is not, reiser4_panic() is + called. Use this for checking logical consistency and _never_ call + this to check correctness of external data: disk blocks and user-input . */ +#define assert(label, cond) \ +({ \ + /* call_on_each_assert(); */ \ + if (cond) { \ + /* put negated check to avoid using !(cond) that would lose \ + * warnings for things like assert(a = b); */ \ + ; \ + } else { \ + DEBUGON(1); \ + reiser4_panic(label, "assertion failed: %s", #cond); \ + } \ +}) + +/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */ +#define check_me( label, expr ) assert( label, ( expr ) ) + +#define ON_DEBUG( exp ) exp + +extern int schedulable(void); +extern void call_on_each_assert(void); + +#else + +#define dinfo( format, args... ) noop +#define impossible( label, format, args... 
) noop +#define assert( label, cond ) noop +#define check_me( label, expr ) ( ( void ) ( expr ) ) +#define ON_DEBUG( exp ) +#define schedulable() might_sleep() + +/* REISER4_DEBUG */ +#endif + +/* per-thread information about lock acquired by this thread. Used by lock + * ordering checking in spin_macros.h */ +typedef struct lock_counters_info { + int rw_locked_tree; + int read_locked_tree; + int write_locked_tree; + + int rw_locked_dk; + int read_locked_dk; + int write_locked_dk; + + int rw_locked_cbk_cache; + int read_locked_cbk_cache; + int write_locked_cbk_cache; + + int rw_locked_zlock; + int read_locked_zlock; + int write_locked_zlock; + + int spin_locked_jnode; + int spin_locked_jload; + int spin_locked_txnh; + int spin_locked_atom; + int spin_locked_stack; + int spin_locked_txnmgr; + int spin_locked_ktxnmgrd; + int spin_locked_fq; + int spin_locked_super; + int spin_locked_inode_object; + int spin_locked_epoch; + int spin_locked_super_eflush; + int spin_locked; + int long_term_locked_znode; + + int inode_sem_r; + int inode_sem_w; + + int d_refs; + int x_refs; + int t_refs; +} lock_counters_info; + +#if REISER4_DEBUG +extern lock_counters_info *lock_counters(void); +#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? 
(a) : (b)) +#define LOCK_CNT_INC(counter) IN_CONTEXT(++(lock_counters()->counter), 0) +#define LOCK_CNT_DEC(counter) IN_CONTEXT(--(lock_counters()->counter), 0) +#define LOCK_CNT_NIL(counter) IN_CONTEXT(lock_counters()->counter == 0, 1) +#define LOCK_CNT_GTZ(counter) IN_CONTEXT(lock_counters()->counter > 0, 1) +#else +#define lock_counters() ((lock_counters_info *)NULL) +#define LOCK_CNT_INC(counter) noop +#define LOCK_CNT_DEC(counter) noop +#define LOCK_CNT_NIL(counter) (1) +#define LOCK_CNT_GTZ(counter) (1) +#endif + +#ifdef CONFIG_FRAME_POINTER +/* update debug.c:fill_backtrace() if you change this */ +#define REISER4_BACKTRACE_DEPTH (4) +typedef struct { + void *trace[REISER4_BACKTRACE_DEPTH]; +} backtrace_path; +extern void fill_backtrace(backtrace_path *path, int depth, int shift); +#else +typedef struct {} backtrace_path; +#define fill_backtrace(path, depth, shift) noop +#endif + + +/* flags controlling debugging behavior. Are set through debug_flags=N mount + option. */ +typedef enum { + /* print a lot of information during panic. When this is on all jnodes + * are listed. This can be *very* large output. Usually you don't want + * this. Especially over serial line. 
*/ + REISER4_VERBOSE_PANIC = 0x00000001, + /* print a lot of information during umount */ + REISER4_VERBOSE_UMOUNT = 0x00000002, + /* print gathered statistics on umount */ + REISER4_STATS_ON_UMOUNT = 0x00000004, + /* check node consistency */ + REISER4_CHECK_NODE = 0x00000008 +} reiser4_debug_flags; + +extern int reiser4_are_all_debugged(struct super_block *super, __u32 flags); +extern int reiser4_is_debugged(struct super_block *super, __u32 flag); + +extern int is_in_reiser4_context(void); + +/* + * evaluate expression E only if with reiser4 context + */ +#define ON_CONTEXT(e) do { \ + if(is_in_reiser4_context()) { \ + e; \ + } } while(0) + +#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) ) + +#if REISER4_DEBUG_MODIFY +#define ON_DEBUG_MODIFY( exp ) exp +#else +#define ON_DEBUG_MODIFY( exp ) +#endif + +#define wrong_return_value( label, function ) \ + impossible( label, "wrong return value from " function ) +/* Issue warning message to the console */ +#define warning( label, format, ... ) \ + DCALL( KERN_WARNING, \ + printk, 1, label, "WARNING: " format , ## __VA_ARGS__ ) +#define not_yet( label, format, ... ) \ + reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ ) + +#if REISER4_TRACE +#if 0 +/* helper macro for tracing, see trace_stamp() below. */ +#define IF_TRACE(flags, e) \ + if(get_current_trace_flags() & (flags)) e +#endif +extern unsigned int trace_flags; +#define IF_TRACE(flags, e) \ + if (trace_flags & (flags)) \ + e + +#else +#define IF_TRACE( flags, e ) noop +#endif + +/* tracing flags. 
*/ +typedef enum { + /* trace nothing */ + NO_TRACE = 0, + /* trace vfs interaction functions from vfs_ops.c */ + TRACE_VFS_OPS = (1 << 0), /* 0x00000001 */ + /* trace plugin handling functions */ + TRACE_PLUGINS = (1 << 1), /* 0x00000002 */ + /* trace tree traversals */ + TRACE_TREE = (1 << 2), /* 0x00000004 */ + /* trace znode manipulation functions */ + TRACE_ZNODES = (1 << 3), /* 0x00000008 */ + /* trace node layout functions */ + TRACE_NODES = (1 << 4), /* 0x00000010 */ + /* trace directory functions */ + TRACE_DIR = (1 << 5), /* 0x00000020 */ + /* trace flush code verbosely */ + TRACE_FLUSH_VERB = (1 << 6), /* 0x00000040 */ + /* trace flush code */ + TRACE_FLUSH = (1 << 7), /* 0x00000080 */ + /* trace carry */ + TRACE_CARRY = (1 << 8), /* 0x00000100 */ + /* trace how tree (web) of znodes if maintained through tree + balancings. */ + TRACE_ZWEB = (1 << 9), /* 0x00000200 */ + /* trace transactions. */ + TRACE_TXN = (1 << 10), /* 0x00000400 */ + /* trace object id allocation/releasing */ + TRACE_OIDS = (1 << 11), /* 0x00000800 */ + /* trace item shifts */ + TRACE_SHIFT = (1 << 12), /* 0x00001000 */ + /* trace page cache */ + TRACE_PCACHE = (1 << 13), /* 0x00002000 */ + /* trace extents */ + TRACE_EXTENTS = (1 << 14), /* 0x00004000 */ + /* trace locks */ + TRACE_LOCKS = (1 << 15), /* 0x00008000 */ + /* trace coords */ + TRACE_COORDS = (1 << 16), /* 0x00010000 */ + /* trace read-IO functions */ + TRACE_IO_R = (1 << 17), /* 0x00020000 */ + /* trace write-IO functions */ + TRACE_IO_W = (1 << 18), /* 0x00040000 */ + + /* trace log writing */ + TRACE_LOG = (1 << 19), /* 0x00080000 */ + + /* trace journal replaying */ + TRACE_REPLAY = (1 << 20), /* 0x00100000 */ + + /* trace space allocation */ + TRACE_ALLOC = (1 << 21), /* 0x00200000 */ + + /* trace space reservation */ + TRACE_RESERVE = (1 << 22), /* 0x00400000 */ + + /* trace emergency flush */ + TRACE_EFLUSH = (1 << 23), /* 0x00800000 */ + + /* trace ctails */ + TRACE_CTAIL = (1 << 24), /* 0x01000000 */ + + 
TRACE_PARSE = (1 << 25), /* 0x02000000 */ + + TRACE_CAPTURE_COPY = (1 << 26), /* 0x04000000 */ + + TRACE_EXTENT_ALLOC = (1 << 27), /* 0x08000000 */ + + /* vague section: used to trace bugs. Use it to issue optional prints + at arbitrary points of code. */ + TRACE_BUG = (1 << 31), /* 0x80000000 */ + + /* trace everything above */ + TRACE_ALL = 0xffffffffu +} reiser4_trace_flags; + +extern __u32 reiser4_current_trace_flags; + +/* just print where we are: file, function, line */ +#define trace_stamp( f ) IF_TRACE( f, reiser4_log( "trace", "" ) ) +/* print value of "var" */ +#define trace_var( f, format, var ) \ + IF_TRACE( f, reiser4_log( "trace", #var ": " format, var ) ) +/* print output only if appropriate trace flag(s) is on */ +#define ON_TRACE( f, ... ) IF_TRACE(f, printk(__VA_ARGS__)) + +extern void reiser4_do_panic(const char *format, ...) +__attribute__ ((noreturn, format(printf, 1, 2))); + +extern void reiser4_print_prefix(const char *level, int reperr, const char *mid, + const char *function, + const char *file, int lineno); + +extern int preempt_point(void); +extern void reiser4_print_stats(void); + +extern void *reiser4_kmalloc(size_t size, int gfp_flag); +extern void reiser4_kfree(void *area); +extern void reiser4_kfree_in_sb(void *area, struct super_block *sb); +extern __u32 get_current_trace_flags(void); + +#if REISER4_DEBUG +extern int no_counters_are_held(void); +extern int commit_check_locks(void); +#endif + +#if REISER4_DEBUG_OUTPUT && REISER4_DEBUG +extern void print_lock_counters(const char *prefix, const lock_counters_info * info); +#else +#define print_lock_counters( p, i ) noop +#endif + +#define REISER4_STACK_ABORT (8192 - sizeof(struct thread_info) - 30) +#define REISER4_STACK_GAP (REISER4_STACK_ABORT - 100) + +#if REISER4_DEBUG_MEMCPY +extern void *xmemcpy(void *dest, const void *src, size_t n); +extern void *xmemmove(void *dest, const void *src, size_t n); +extern void *xmemset(void *s, int c, size_t n); +#else +#define xmemcpy( d, s, n ) 
memcpy( ( d ), ( s ), ( n ) ) +#define xmemmove( d, s, n ) memmove( ( d ), ( s ), ( n ) ) +#define xmemset( s, c, n ) memset( ( s ), ( c ), ( n ) ) +#endif + +/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */ +#define IS_POW(i) \ +({ \ + typeof(i) __i; \ + \ + __i = (i); \ + !(__i & (__i - 1)); \ +}) + +#define KERNEL_DEBUGGER (1) + +#if KERNEL_DEBUGGER +#define DEBUGON(cond) \ +({ \ + extern void debugtrap(void); \ + \ + if (unlikely(cond)) \ + debugtrap(); \ +}) +#else +#define DEBUGON(cond) noop +#endif + +#if REISER4_DEBUG +typedef struct err_site { + backtrace_path path; + int code; + const char *file; + int line; +} err_site; +extern void return_err(int code, const char *file, int line); +extern void report_err(void); + +#define RETERR(code) \ +({ \ + typeof(code) __code; \ + \ + __code = (code); \ + return_err(__code, __FILE__, __LINE__); \ + __code; \ +}) + +#else +typedef struct err_site {} err_site; +#define RETERR(code) code +#define report_err() noop +#endif + +#if REISER4_LARGE_KEY +#define ON_LARGE_KEY(...) __VA_ARGS__ +#else +#define ON_LARGE_KEY(...) +#endif + +const char *jnode_tostring(jnode *); +void jnode_tostring_internal(jnode * node, char *buf); +const char *znode_tostring(znode *); +const char *flags_tostring(int flags); + +int reiser4_sysctl_init(void); +void reiser4_sysctl_done(void); + +#if REISER4_ALL_IN_ONE +#define reiser4_internal static +#else +#define reiser4_internal +#endif + +/* __FS_REISER4_DEBUG_H__ */ +#endif + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/dformat.h linux-2.6.4-ck1/fs/reiser4/dformat.h --- linux-2.6.4/fs/reiser4/dformat.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/dformat.h 2004-03-11 22:45:15.198524357 +1100 @@ -0,0 +1,164 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Formats of on-disk data and conversion functions. */ + +/* put all item formats in the files describing the particular items, + our model is, everything you need to do to add an item to reiser4, + (excepting the changes to the plugin that uses the item which go + into the file defining that plugin), you put into one file. */ +/* Data on disk are stored in little-endian format. + To declare fields of on-disk structures, use d8, d16, d32 and d64. + d??tocpu() and cputod??() to convert. */ + +#if !defined( __FS_REISER4_DFORMAT_H__ ) +#define __FS_REISER4_DFORMAT_H__ + + +#include +#include +#include + +/* our default disk byteorder is little endian */ + +#if defined( __LITTLE_ENDIAN ) +#define CPU_IN_DISK_ORDER (1) +#else +#define CPU_IN_DISK_ORDER (0) +#endif + +/* code on-disk data-types as structs with a single field + to rely on compiler type-checking. 
Like include/asm-i386/page.h */ +typedef struct d8 { + __u8 datum; +} d8 __attribute__ ((aligned(1))); +typedef struct d16 { + __u16 datum; +} d16 __attribute__ ((aligned(2))); +typedef struct d32 { + __u32 datum; +} d32 __attribute__ ((aligned(4))); +typedef struct d64 { + __u64 datum; +} d64 __attribute__ ((aligned(8))); + +#define PACKED __attribute__((packed)) + +static inline __u8 +d8tocpu(const d8 * ondisk /* on-disk value to convert */ ) +{ + return ondisk->datum; +} + +static inline __u16 +d16tocpu(const d16 * ondisk /* on-disk value to convert */ ) +{ + return __le16_to_cpu(get_unaligned(&ondisk->datum)); +} + +static inline __u32 +d32tocpu(const d32 * ondisk /* on-disk value to convert */ ) +{ + return __le32_to_cpu(get_unaligned(&ondisk->datum)); +} + +static inline __u64 +d64tocpu(const d64 * ondisk /* on-disk value to convert */ ) +{ + return __le64_to_cpu(get_unaligned(&ondisk->datum)); +} + +static inline d8 * +cputod8(unsigned int oncpu /* CPU value to convert */ , + d8 * ondisk /* result */ ) +{ + assert("nikita-1264", oncpu < 0x100); + put_unaligned(oncpu, &ondisk->datum); + return ondisk; +} + +static inline d16 * +cputod16(unsigned int oncpu /* CPU value to convert */ , + d16 * ondisk /* result */ ) +{ + assert("nikita-1265", oncpu < 0x10000); + put_unaligned(__cpu_to_le16(oncpu), &ondisk->datum); + return ondisk; +} + +static inline d32 * +cputod32(__u32 oncpu /* CPU value to convert */ , + d32 * ondisk /* result */ ) +{ + put_unaligned(__cpu_to_le32(oncpu), &ondisk->datum); + return ondisk; +} + +static inline d64 * +cputod64(__u64 oncpu /* CPU value to convert */ , + d64 * ondisk /* result */ ) +{ + put_unaligned(__cpu_to_le64(oncpu), &ondisk->datum); + return ondisk; +} + +/* data-type for block number on disk: these types enable changing the block + size to other sizes, but they are only a start. Suppose we wanted to + support 48bit block numbers. The dblock_nr blk would be changed to "short + blk[3]". 
The block_nr type should remain an integral type greater or equal + to the dblock_nr type in size so that CPU arithmetic operations work. */ +typedef __u64 reiser4_block_nr; + +/* data-type for block number on disk, disk format */ +union reiser4_dblock_nr { + d64 blk; +}; + +static inline reiser4_block_nr +dblock_to_cpu(const reiser4_dblock_nr * dblock) +{ + return d64tocpu(&dblock->blk); +} + +static inline void +cpu_to_dblock(reiser4_block_nr block, reiser4_dblock_nr * dblock) +{ + cputod64(block, &dblock->blk); +} + +/* true if disk addresses are the same */ +static inline int +disk_addr_eq(const reiser4_block_nr * b1 /* first block + * number to + * compare */ , + const reiser4_block_nr * b2 /* second block + * number to + * compare */ ) +{ + assert("nikita-1033", b1 != NULL); + assert("nikita-1266", b2 != NULL); + + return !memcmp(b1, b2, sizeof *b1); +} + +/* structure of master reiser4 super block */ +typedef struct reiser4_master_sb { + char magic[16]; /* "ReIsEr4" */ + d16 disk_plugin_id; /* id of disk layout plugin */ + d16 blocksize; + char uuid[16]; /* unique id */ + char label[16]; /* filesystem label */ + d64 diskmap; /* location of the diskmap. 0 if not present */ +} reiser4_master_sb; + +/* __FS_REISER4_DFORMAT_H__ */ +#endif + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/diskmap.c linux-2.6.4-ck1/fs/reiser4/diskmap.c --- linux-2.6.4/fs/reiser4/diskmap.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/diskmap.c 2004-03-11 22:45:15.199524201 +1100 @@ -0,0 +1,75 @@ +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ +/* Functions to deal with diskmap storage - read-only storage (currently can only be + set via fs-creation process) for use by various plugins */ + +#include + +#include "debug.h" +#include "super.h" +#include "diskmap.h" + +/* Looks through chain of diskmap blocks, looking for table entry where label and parameter + patch passed in "label" and "parameter" + Returns 0 on success, -1 if nothing was found or error have occured. */ +reiser4_internal int +reiser4_get_diskmap_value( u32 label, u32 parameter, u64 *value) +{ + struct super_block *sb = reiser4_get_current_sb(); + int retval = -1; + + assert("green-2006", label != REISER4_FIXMAP_END_LABEL && label != REISER4_FIXMAP_NEXT_LABEL); + + if ( get_super_private(sb)->diskmap_block ) { /* If there is diskmap table, we need to read and parse it */ + struct buffer_head *diskmap_bh; + struct reiser4_diskmap *diskmap; + int i = 0; + + diskmap_bh = sb_bread(sb, get_super_private(sb)->diskmap_block); +search_table: + if ( !diskmap_bh ) { + warning("green-2005", "Cannot read diskmap while doing bitmap checks"); + return -1; + } + + diskmap = (struct reiser4_diskmap *) diskmap_bh->b_data; + if ( strncmp(diskmap->magic, REISER4_FIXMAP_MAGIC, sizeof(REISER4_FIXMAP_MAGIC)-1 ) ) { + /* Wrong magic */ + brelse(diskmap_bh); + warning("green-2004", "diskmap is specified, but its magic is wrong"); + return -1; + } + + /* Since entries in tables are sorted, we iterate until we hit item that we are looking for, + or we reach end of whole fixmap or end of current block */ + while 
(((d32tocpu(&diskmap->table[i].label) <= label) && + (d32tocpu(&diskmap->table[i].parameter) < parameter)) && + /* Also check that we do not fall out of current block */ + ((sb->s_blocksize - sizeof(diskmap->magic))/sizeof(diskmap->table[0]) >= i)) + i++; + + if ( i > (sb->s_blocksize - sizeof(diskmap->magic))/sizeof(diskmap->table[0]) ) { + warning("green-2004", "diskmap block %Ld is not properly terminated", (long long)diskmap_bh->b_blocknr); + brelse(diskmap_bh); + return -1; + } + + /* Is this last entry in current table that holds disk block with more data ? */ + if ( d32tocpu(&diskmap->table[i].label) == REISER4_FIXMAP_NEXT_LABEL ) { /* Need to load next diskmap block */ + sector_t next_diskmap_block = d64tocpu(&diskmap->table[i].value); + brelse(diskmap_bh); + diskmap_bh = sb_bread(sb, next_diskmap_block); + i = 0; + goto search_table; + } + + /* See if we have found table entry we are looking for */ + if ( (d32tocpu(&diskmap->table[i].label) == label) && + (d32tocpu(&diskmap->table[i].parameter) == parameter) ) { + *value = d64tocpu(&diskmap->table[i].value); + retval = 0; + } + brelse(diskmap_bh); + } + + return retval; +} diff -Naurp linux-2.6.4/fs/reiser4/diskmap.h linux-2.6.4-ck1/fs/reiser4/diskmap.h --- linux-2.6.4/fs/reiser4/diskmap.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/diskmap.h 2004-03-11 22:45:15.199524201 +1100 @@ -0,0 +1,28 @@ +#if !defined (__REISER4_DISKMAP_H__) +#define __REISER4_DISKMAP_H__ + +#include "dformat.h" + +#define REISER4_FIXMAP_MAGIC "R4FiXMaPv1.0" + +#define REISER4_FIXMAP_END_LABEL -2 +#define REISER4_FIXMAP_NEXT_LABEL -1 + +/* This is diskmap table, it's entries must be sorted ascending first in label order, + then in parameter order. 
+ End of table is marked with label REISER4_FIXMAP_END_LABEL + label REISER4_FIXMAP_NEXT_LABEL means that value in this row contains + disk block of next diskmap in diskmaps chain */ +struct reiser4_diskmap { + char magic[16]; + struct { + d32 label; + d32 parameter; + d64 value; + } table[0]; +}; + +int reiser4_get_diskmap_value( u32, u32, u64 *); + + +#endif diff -Naurp linux-2.6.4/fs/reiser4/doc/directory-service linux-2.6.4-ck1/fs/reiser4/doc/directory-service --- linux-2.6.4/fs/reiser4/doc/directory-service 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/doc/directory-service 2004-03-11 22:45:15.200524046 +1100 @@ -0,0 +1,203 @@ + + DIRECTORY SERVICE IN REISER4 + +Directory is mapping from file name to file itself. This mapping is +implemented through reiser4 internal balanced tree. Single global tree +is used as global index of all directories as opposed to having tree per +directory. Unfortunately file names cannot be used as keys until keys of +variable length are implemented, or unreasonable limitations on maximal +file name length are imposed. To work around this file name is hashed +and hash is used as key in a tree. No hash function is perfect and there +will always be hash collisions, that is, file names having the same value of +a hash. Previous versions of reiserfs (3.5 and 3.6) used "generation +counter" to overcome this problem: keys for file names having the same +hash value were distinguished by having different generation +counters. This allowed amortizing hash collisions at the cost of +reducing number of bits used for hashing. This "generation counter" +technique is actually some ad hoc form of support for non-unique +keys. Keeping in mind that some form of this has to be implemented +anyway, it seems justifiable to implement more regular support for +non-unique keys in reiser4. + +NON-UNIQUE KEYS + +1. + +Non-unique keys require changes in both tree lookup and tree update +code. 
In addition some new API to iterate through items with identical +keys is required. + +Before going into detail let's note that non-unique keys weaken +traditional search tree invariant. In a search tree with unique keys, keys of +all items in a left sub-tree of given delimiting key are less than, and +in the right sub-tree greater than or equal to the said key. In a search +tree with non-unique keys neither inequality is strict. + +2. + +Tree lookups: we require that node layout ->lookup() methods always +return leftmost item with the key looked for. The same for item +->lookup() method for items supporting units with non-unique +keys. Standard node40 layout plugin handles this, see +fs/reiser4/plugin/node/node40.c:node40_lookup(). + +3. + +Tree balancing: it seems that only change here is the handling of +weakened search tree invariant. This can be gathered from the +observation that balancing never even compares keys, only tests them for +equality. More thought/research is required though. Looking at the +existing implementations (like Berkeley db) would be useful also. + +4. + +Iteration through items/units with identical keys. There are two +interfaces to iterating abstraction known as "external" (also known as +"enumeration") and "internal" iterators. + +External iterator: + +external_iterator { + start(); + next(); + has_more_p(); +}; + +external_iterator eit; + +for( eit.start() ; eit.has_more_p() ; ) { + object = eit.next(); + ... do stuff with object ... +} + +Internal iterator: + +internal_iterator { + iterate( int ( *function )( object *obj ) ); +}; + +internal_iterator iit; + +int do_stuff( object *obj ) +{ + ... do stuff with obj ... +} + +iit.iterate( &do_stuff ); + +External iterators seem easier to use, but they are known to be hard to +implement, especially for complex data-structures like trees (this is +because of the amount of state that should be maintained in "eit" +between its invocations). 
+ +Internal iterators are harder to use in C, because a new function has to +be declared to perform actions on objects in sequence, but are obviously +easier to implement. + +Given that in 4.0 version there will be only one client of this +iteration API (viz. directory lookup routine), it seems that internal +style is preferable for now. Later, external iterator interface can be +added if necessary. + +IMPLEMENTATION OF DIRECTORIES: + +1. + +There will be many various directory services implemented through +different plugins. Default directory plugin uses hashing techniques +described above. Let's code-name it hdir. + +2. + +Directory consists of directory entries, stored in a tree in a form of +directory items. Question about whether each directory entry should be +separate item or they can be compressed into items is left open for now. +First this decision is purely per-plugin decidable, second, compression +is good for performance, but harder to implement. + +Single directory entry is binding between file-system object and +directory. In hdir plugin it consists of full name of a file bound and +key (or part thereof) of file's stat-data: + +typedef struct hdir_entry { + /** + * key of object stat-data. It's not necessary to store + * whole key here, because it's always key of stat-data, so minor packing + * locality and offset can be omitted here. But this relies on + * particular key allocation scheme for stat-data, so, for extensibility + * sake, whole key can be stored here. + * + * We store key as array of bytes, because we don't want 8-byte alignment + * of dir entries. + */ + d8 sdkey[ sizeof( reiser4_key ) ]; + /** + * file name. Null terminated string. + */ + d8 name[ 0 ]; +} hdir_entry; + +3. + +On creation/linking/lookup of object "bar" in directory "foo" (foo/bar), +we compose key of directory entry for this object. 
Key has the form + +/* + * XXX this should be discussed + */ +dirent_k = (locality=foo_object_id, objectid=???, offset=hash("bar")); + +Major packing locality of dirent_k is set to foo_object_id so that all +objects (files) in this directory and their bodies are close to +respective directory entries. + +It seems that no single key allocation policy for directory entries fits +everyone's needs, so, this can be implemented as method of directory +plugin. Nonetheless, choice of default key allocation policy is still +important decision, although not as important as in plugin-less +file-system. + +4. + +Function + +int hdir_find_entry( inode *dir, const hdir_entry *entry, + tween_coord *coord, lock_handle *lh ); + +iterates through all directory entries in @dir that have the same key as +@entry (scans hash-bucket), looking for exact match for entry->name. + +5. + +During ->create()/->link() hdir_find_entry() is used to find place to insert new +item (and to check for -EEXIST). + +During ->lookup() hdir_find_entry() is used to find entry for the file +being looked for and to load stat-data afterwards. + +During ->unlink() hdir_find_entry() is used to find unit/item to be +removed. + +NOTE ON ->lookup(): + +VFS implements following protocol when creating new +file (fs/namei.c:open_namei()): + +dentry hash is searched. If search is unsuccessful, file system +->lookup() is called. +If lookup didn't find name, call ->create() + +While this protocol spares file system from dealing with dcache locking, +for reiserfs it means that tree traversal is performed twice during file +creation/deletion. Possible solution is to cache results of ->lookup() +(e.g., pointer to znode) in dentry and reuse them in ->create(). On the +other hand, point cache has more or less the same effect and is more +general. 
+ + +^ Local variables: +^ mode-name: "Design Document" +^ indent-tabs-mode: nil +^ tab-width: 4 +^ eval: (progn (flyspell-mode) (flyspell-buffer)) +^ End: diff -Naurp linux-2.6.4/fs/reiser4/doc/lock-ordering linux-2.6.4-ck1/fs/reiser4/doc/lock-ordering --- linux-2.6.4/fs/reiser4/doc/lock-ordering 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/doc/lock-ordering 2004-03-11 22:45:15.201523890 +1100 @@ -0,0 +1,584 @@ +---------------------------------INTRODUCTION----------------------------------- + +This document tries to provide concise description of various "locking" issues +in reiser4 code. There are two major areas here: + +1. locking as a device for the concurrency control: various synchronization +objects are used to maintain integrity of shared data structures. + +2. (induced by the former) deadlocks, livelocks, missed wake ups, and alikes. + +"Locks" above means both standard synchronization primitives like mutexes, +semaphores, condition variables and so on, and any other kind of object on +which thread execution may "block". Waiting on io completion is not considered +here, because hardware errors barred, it will ultimately finish regardless of +any other threads and locks in the system (This only holds if io completion +handlers don't acquire locks themselves.). + +-------------------------------LOCKS IN REISER4--------------------------------- + +Reiser4 introduces following locks: + +1. Per-super-block tree spin lock (tree_lock*) + +2. Per-super-block delimiting key spin lock (dk_lock*) + +3. Per-jnode spin lock (jnode_lock*) + +4. Per-znode lock with deadlock detection (longterm_lock) + +5. Per-reiser4-inode spin lock (inode_guard*) + +6. Per-atom spin lock (atom_lock*) + +7. Per-transaction-handle spin lock (txnh_lock*) + +8. Per-transaction-manager spin lock (txnmgr_lock*) + +9. Per-lock-stack spin-lock (stack_lock*) + +10. Per-inode read-write lock (inode_rw_lock) + +11. Per-super-block spin lock (super_guard*+) + +12. 
Per-flushing-thread spin lock (ktxnmgrd_lock) + +13. Global lnode hash table lock (lnode_guard+) + +14. Per-super-block cbk cache spin lock (cbk_guard) + +15. Per-jnode spin lock used by debugging code to access and + modify check sum (cksum_guard+) + +16. Per-super-block oid map spin lock (oid_guard+) + +17. Per-super-block spin lock used by "test" disk format plugin to serialize + block allocation (test_lock+) + +18. Per-condition-variable spin lock (kcond_lock+) + +19. Single spin lock used to serialize fake block allocation (fake_lock+) + +20. Single spin lock used to serialize calls to reiser4_panic (panic_guard+) + +21. Single spin lock used by debugging code to keep track of all active + reiser4_context instances (contexts_lock+) + +22. Per-lnode condition variable used by wait for completion of "incompatible + access mode" (lnode_kcond) + +23. Per-flushing-thread condition variable for startup waiting (ktxnmgrd_start) + +24. Per-flushing-thread condition variable (ktxnmgrd_wait) + +25. Per-lock-stack wakeup semaphore (stack_sema) + +26. Per-super-block flush serializing semaphore (flush_sema) + +27. Per-transaction-manager commit semaphore (commit_sema) + +28. Per-super-block semaphore used to arbitrate use of 5% (delete_sema) + reserved disk space + +30. Global spin lock used to serialize calls to panic (panic_guard+) + +31. Global spin lock used to protect plugin set hash table (pset_guard+) + +32. Global spin lock used to protect phash hash table (phash_guard+) + +33. Per-bitmap-block semaphore used to serialize bitmap loading (bnode_sema+) + +34. Per-super-block epoch lock, protecting updates to (epoch_lock*) + znode_epoch field, used to implement seals (seal.[ch]) + efficiently. + +35. Per-atom "event". This is not really lock. Rather, this is an event + signaled each time atom changes its state. (atom_event) + +36. Per-znode spin lock used to protect long term locking + structures (zlock*) + +37. Per flush queue lock (fq_lock*) + +38. 
Per-super-block zgen lock, protecting znode generation (zgen*) + counter + +39. Per-jnode spin lock used to synchronize jload() with (jload_lock*) + ->releasepage(). + +40. Per-atom imaginary read-write semaphore handle_sema (handle_sema) + + let's pretend for the sake of simplicity that there is special per-atom + read-write semaphore that threads can claim. Call it + handle_sema. This semaphore is acquired on read when thread captures first + block and is released when thread's reiser4_context is closed. Formally + thread holds this semaphore on read exactly when + get_current_context()->trans->atom != NULL, i.e., when thread is + associated with atom. Logic behind introducing this imaginary semaphore is + that while some thread is associated with an atom (that is, keeps + transaction handle opened), this atom cannot commit. In particular, other + threads waiting on fusion with atom that is in CAPTURE_WAIT stage wait + until this atom commits, that is wait (at least) until there are no opened + transaction handles for this atom. Effectively such threads wait until + handle_semaphore is free, that is, they in some sense are trying to + acquire handle_semaphore in write mode. So, this circumferential + description allows one to reduce (at least partially) problem of waiting + on atom fusion to the lock ordering. + +41. Per-super-block spin lock protecting consistency of emergency flush hash + table, ->eflushed, and ->eflushed_anon counters in inode, and ->flushed + counter in atom. (eflush_guard) + +99. Various locks used by the user level simulator + +Locks marked by (*) after label, are accessed through spin lock macros, +defined in reiser4.h. For them, locking ordering is checked at the runtime (at +least in the principle) when REISER4_DEBUG is on(e). + +Locks marked by (+) after label exist only for serializing concurrent access +to the shared data and are not supposed to be used in conjunction with any +other locks. 
They are omitted from locking ordering below to simplify the +picture. One can imagine them to be rightmost in the ordering. + +All locks, spin locks, and semaphores, except for stack_sema are subject to +normal protocol: thread that grabbed the lock will release it. stack_sema is +described in more detail below. + +Also, following kernel locks are used by our code: + +1. Per-page lock (page_lock) + +2. Per-page writeback bit (page_write) + +3. Per-inode semaphore (i_sem) + +4. Per-inode I_LOCK bit-lock (I_LOCK) + +Thread also can block on the following "objects" that are not really locks: + +1. Page fault (pfault) + +2. Memory allocation (kalloc) + +3. Dirtying a page (through balance_dirty_pages()) (page_dirty) + +----------------------------------LOCK SCOPE------------------------------------ + +Section describing what data are protected by what locks. TBD. + +----------------------------------INVARIANTS------------------------------------ + +Invariants are some (formal or informal) properties of data structures. For +example, for well-formed doubly linked list, following holds: + +item->next->prev == item && item->prev->next == item + +In most cases, invariants only hold under proper locks. + +LABEL AND DESCRIPTION LOCKS + +[inode->eflushed] inode_guard + + inode->eflushed > 0, iff there are emergency flushed jnodes belonging to + this inode. Also, each emergency flushed jnode is counted as increase in + inode->i_count. + +[cbk-cache-invariant] cbk_guard + + If cbk cache is traversed in LRU order, first go all used slots (with + slot->node != NULL), then, all unused. All used slots have unique + slot->node. (Checked by cbk_cache_invariant().) 
+ +[znode-fake] jnode_lock, tree_lock + + /* fake znode doesn't have a parent, and */ + znode_get_level(node) == 0 => znode_parent(node) == NULL, and + /* there is another way to express this very check, and */ + znode_above_root(node) => znode_parent(node) == NULL, and + /* it has special block number, and */ + znode_get_level(node) == 0 => *znode_get_block(node) == FAKE_TREE_ADDR, and + /* it is the only znode with such block number, and */ + !znode_above_root(node) && znode_is_loaded(node) => + *znode_get_block(node) != FAKE_TREE_ADDR + /* it is parent of the tree root node */ + znode_is_true_root(node) => znode_above_root(znode_parent(node)) + + (Checked by znode_invariant_f().) + +[znode-level] jnode_lock, tree_lock + + /* level of parent znode is one larger than that of child, except for the + fake znode */ + znode_parent(node) != NULL && !znode_above_root(znode_parent(node)) => + znode_get_level(znode_parent(node)) == znode_get_level(node) + 1 + /* left neighbor is at the same level, and */ + znode_is_left_connected(node) && node->left != NULL => + znode_get_level(node) == znode_get_level(node->left)) + /* right neighbor is at the same level */ + znode_is_right_connected(node) && node->right != NULL => + znode_get_level(node) == znode_get_level(node->right) + + (Checked by znode_invariant_f().) 
+ +[znode-connected] + + /* ->left, ->right pointers form a valid list and are consistent with + JNODE_{LEFT,RIGHT}_CONNECTED bits */ + + node->left != NULL => znode_is_left_connected(node) + node->right != NULL => znode_is_right_connected(node) + node->left != NULL => + znode_is_right_connected(node->left) && + node->left->right == node + node->right != NULL => + znode_is_left_connected(node->right) && + node->right->left == node + +[znode-c_count] jnode_lock, tree_lock + + /* for any znode, c_count of its parent is greater than 0, and */ + znode_parent(node) != NULL && !znode_above_root(znode_parent(node)) => + atomic_read(&znode_parent(node)->c_count) > 0), and + /* leaves don't have children */ + znode_get_level(node) == LEAF_LEVEL => atomic_read(&node->c_count) == 0 + + (Checked by znode_invariant_f().) + +[znode-refs] jnode_lock, tree_lock + + /* only referenced znode can be long-term locked */ + znode_is_locked(node) => atomic_read(&ZJNODE(node)->x_count) != 0 + + (Checked by znode_invariant_f().) + +[jnode-oid] jnode_lock, tree_lock + + /* for unformatted node ->objectid and ->mapping fields are + * consistent */ + jnode_is_unformatted(node) && node->key.j.mapping != NULL => + node->key.j.objectid == get_inode_oid(node->key.j.mapping->host) + + (Checked by znode_invariant_f().) + +[jnode-refs] jnode_lock, tree_lock + + /* only referenced jnode can be loaded */ + atomic_read(&node->x_count) >= node->d_count + + (Checked by jnode_invariant_f().) + +[jnode-dirty] jnode_lock, tree_lock + + /* dirty inode is part of atom */ + jnode_is_dirty(node) => node->atom != NULL + + (Checked by jnode_invariant_f().) + +[jnode-queued] jnode_lock, tree_lock + + /* only relocated node can be queued, except that when znode + * is being deleted, its JNODE_RELOC bit is cleared */ + JF_ISSET(node, JNODE_FLUSH_QUEUED) => + JF_ISSET(node, JNODE_RELOC) || JF_ISSET(node, JNODE_HEARD_BANSHEE) + + (Checked by jnode_invariant_f().) 
+ +[jnode-atom-valid] jnode_lock, tree_lock + + /* node atom has valid state */ + node->atom != NULL => node->atom->stage != ASTAGE_INVALID + + (Checked by jnode_invariant_f().) + +[jnode-page-binding] jnode_lock, tree_lock + + /* if node points to page, it points back to node */ + node->pg != NULL => node->pg->private == node + + (Checked by jnode_invariant_f().) + +[sb-block-counts] super_guard + + reiser4_block_count(super) = reiser4_grabbed_blocks(super) + + reiser4_free_blocks(super) + + reiser4_data_blocks(super) + + reiser4_fake_allocated(super) + + reiser4_fake_allocated_unformatted(super) + + reiser4_flush_reserved(super) + + (Checked by check_block_counters().) + +[sb-grabbed] super_guard + + reiser4_grabbed_blocks(super) equals the sum of ctx->grabbed_blocks for + all grabbed contexts + +[sb-fake-allocated] txnmgr_lock, atom_lock + + When all atoms and transaction manager are locked, + reiser4_flush_reserved(super) equals to sum of atom->flush_reserved for + all atoms. + +[tap-sane] + + tap->mode is one of {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and + tap->coord != NULL, and + tap->lh != NULL, and + tap->loaded > 0 => znode_is_loaded(tap->coord->node), and + tap->coord->node == tap->lh->node + + (Checked by tap_invariant().) + +--------------------------------LOCK ORDERING----------------------------------- + +Lock ordering for kernel locks is taken from mm/filemap.c. Locks can be taken +from the left to the right. Locks on the same indentation level are unordered +with respect to each other. Any spin lock is righter than any long term lock, +obviously. 
+ +i_sem +..inode_rw_lock <-------DEAD1-----+ +....handle_sema | +......I_LOCK | +......delete_sema | +......flush_sema | +........atom_event | +........longterm_lock <---DEAD2-+ | +......commit_sema | | +..........page_lock | | +............pfault | | +..............mm->mmap_sem------+-+ [do_page_fault] +..................ktxnmgrd_lock +................mapping->i_shared_sem +................kalloc +....................inode_guard +....................txnmgr_lock +......................atom_lock +..........................super_guard +........................jnode_lock [->vm_writeback()->jget()] +................................eflush_guard +..........................txnh_lock +............................zlock +........................fq_lock +..............................stack_lock +..................dk_lock +..............................tree_lock +................................cbk_guard +................................epoch_lock +................................zgen_lock +..........................jload_lock +....................mm->page_table_lock +......................mapping->private_lock +........................swaplock +..........................swap_device_lock +..........................&inode_lock +............................&sb_lock +............................mapping->page_lock +..............................zone->lru_lock + ^ + +-- spin locks are starting here. Don't schedule rightward. + +NOT FINISHED. + +..............&cache_chain_sem +......................cachep->spinlock +......................zone->lock + +page_dirty +....&inode_lock +....&sb_lock +....mapping->page_lock [mpage_writepages] +..page_lock +..longterm_lock [__set_page_dirty_buffers->__mark_inode_dirty] + +Nice and clear picture with all reiser4 locks totally ordered, right? + +Unfortunately, it is not always possible to adhere to this ordering. When it +is necessary to take locks "decreasing" order, standard trylock-and-repeat +loop is employed. 
See: + + atom_get_locked_with_txnh_locked(), + atom_get_locked_by_jnode(), + atom_free(), and + jnode_lock_page() + +functions for examples of this. + +The only exception from the above locking order is when thread wants to lock +object it has just created and hasn't yet announced to other threads (by means +of placing it into some shared data structure like hash table or list). There +is special spin lock macro spin_lock_foo_no_ord() defined in reiser4.h for +this purpose. + +pfault and kalloc are something special: when page fault occurs at the page +occupied by mmapped from reiser4 file, reiser4_readpage() is invoked that +starts taking locks from the very beginning. + +DEAD1 + + Scenario: + + process has mmapped reiser4 regular file and then does write(2) into + this file from buffer that is in mmaped area. copy_from_user() causes + page fault: + + sys_write() + reiser4_write() + unix_file_write() [inode_rw_lock] + . + . + . + __copy_from_user() + . + . + . + handle_page_fault() + handle_mm_fault() + handle_pte_fault() + do_no_page() + unix_file_filemap_nopage() [inode_rw_lock] + + This is safe, because inode_rw_lock is read-taken by both read/write and + unix_file_filemap_nopage(). It is only write-taken during tail<->extent + conversion and if file is mmaped it was already converted to extents. + +DEAD2 + + is safe, because copy_from_user is used only for tails and extents: + + . extent: extent_write_flow() releases longterm_lock before calling + copy_from_user. + + . tail: during copying into tail, only node containing this tail is long + term locked. It is easy to see, that ->readpage serving page fault (that + is, readpage for unformatted data) will never attempt to lock said node. + +When memory allocation tries to free some memory it + +1. asynchronously launches kswapd that will ultimately call + reiser4_writepage(). + +2. calls reiser4_writepage() synchronously. 
+ +----------------------------------LOCK PATTERNS--------------------------------- + +This section describes where in the code what locks sequences are held. This +places restrictions on modifications to the lock ordering above and enumerates +pieces of the code that should be revised if modification of the lock ordering +is necessary. + +flush_sema + + jnode_flush() + + to serialize flushing. This behavior can be disabled with mtflush + mount option. + +atom_lock->jnode_lock + + uncapture_block() + +atom_lock->tree_lock && jnode_lock && page_lock + + uncapture_block() calls jput() + +delete_sema + + common_unlink(), shorten_file()->unlink_check_and_grab() + + to serialize access to reserved 5% of disk only used by unlinks. (This + is necessary so that it is always possible to unlink something and + free more space on file-system.) + +delete_sema->flush_sema || commit_sema + + reiser4_release_reserved() calls txnmgr_force_commit_current_atom() under + delete_sema + +inode_rw_lock->delete_sema + + unix_file_truncate()->shorten_file() takes delete_sema from under write + mode of inode_rw_lock + +kalloc->jnode_lock + + emergency_flush() takes jnode spin lock + +jnode_lock->(mapping->page_lock) + + jnode_set_dirty()->__set_page_dirty_nobuffers() + +jnode_lock->(zone->lru_lock) + + jnode_set_dirty()->mark_page_accessed() + + +I_LOCK->longterm_lock + + reiser4_iget() + +tree_lock->epoch_lock + + zget() calls znode_build_version() + +jnode_lock->stack_lock + + longterm_lock_znode(), longterm_unlock_znode(), wake_up_all_lopri_owners() + +tree_lock->cbk_guard + + znode_remove() calls cbk_cache_invalidate() + +zlock->stack_lock + + wake_up_all_lopri_owners() + +atom->stack_lock + + check_not_fused_lock_owners() + +txnh->stack_lock + + check_not_fused_lock_owners() + +jnode_lock->jload_lock + + reiser4_releasepage(), emergency_flush(). But this can actually be made + other way around. 
+ +jnode_lock->eflush_guard + + eflush_add(), eflush_del() + +atom_lock->super_guard + + grabbed2flush_reserved_nolock() + +----------------------------------DEADLOCKS------------------------------------- + +Big section describing found/possible/already-worked-around deadlocks. + +1. Locking during tree traversal. + +2. Locking during balancing. + +3. Locking during squalloc. + +4. Page locking. + +5. Atom fusion. + +Please, fill gaps up. + +TBD. + +2002.09.19. Nikita. + +-------------------------------------------------------------------------------- + +^ Local variables: +^ mode-name: "Memo" +^ indent-tabs-mode: nil +^ tab-width: 4 +^ eval: (progn (flyspell-mode) (flyspell-buffer)) +^ End: diff -Naurp linux-2.6.4/fs/reiser4/doc/lock-ordering.dot linux-2.6.4-ck1/fs/reiser4/doc/lock-ordering.dot --- linux-2.6.4/fs/reiser4/doc/lock-ordering.dot 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/doc/lock-ordering.dot 2004-03-11 22:45:15.203523579 +1100 @@ -0,0 +1,271 @@ +/* this is dot(1) input file for lock-ordering diagram */ +/* it should be passed through C preprocessor first */ +/* cpp -P -DFITPAGE lock-ordering.dot | tred | dot -Tps | gv -media a4 - */ + +#define CATTR fontsize=14, fontname=Helvetica +#define NATTR CATTR +#define EATTR CATTR + +#define SYSATTR color=yellow, style=filled +#define PSEUDOATTR color=pink, style=filled, peripheries=2 + +#define LONGATTR shape=ellipse +#define SPINATTR shape=box + +#define CONDATTR color=blue, peripheries=2, LONGATTR + +#define MARKLONG(name) name -> schedulable [style=invis, weight=0] + +#define SYSLONG(name, l) name [label=l, NATTR, LONGATTR, SYSATTR]; MARKLONG(name) +#define SYSPSEUDO(name) name [NATTR, LONGATTR, PSEUDOATTR]; MARKLONG(name) +#define RLONG(name) name [NATTR, LONGATTR]; MARKLONG(name) + +#define RCOND(name, l) name [label=l, NATTR, CONDATTR]; MARKLONG(name) + +#define MARKSPIN(name) schedulable -> name [style=invis, weight=0] + +#define SYSSPIN(name, l) name [label=l, NATTR, SYSATTR, 
SPINATTR]; MARKSPIN(name) +#define RSPIN(name) name [NATTR, SPINATTR]; MARKSPIN(name) + +#define ARC(from, to, func, ...) from -> to [EATTR, label=func, ## __VA_ARGS__] + +digraph locks { + +//clusterrank=none +#if defined(FITPAGE) +size="7.5, 10.5"; +ratio=compress; +center=true; +#endif + +subgraph long { + /* reiser4 long term locks */ + RLONG(longterm_lock); + RLONG(inode_rw_lock); + RLONG(stack_sema); + RLONG(flush_sema); + RLONG(commit_sema); + RLONG(delete_sema); + /* txncommit is a synonym for flush_sema and commit_sema */ + txncommit [LONGATTR, PSEUDOATTR]; MARKLONG(txncommit); + txncommit -> flush_sema [style=dotted, dir=both]; + txncommit -> commit_sema [style=dotted, dir=both]; + + /* atom_event is not really a lock: you can wait on it, but cannot "own" + it. */ + RCOND(atom_event,atom_event); + + //RLONG(lnode_kcond); + //RLONG(ktxnmgrd_start); + //RLONG(ktxnmgrd_wait); + //RLONG(bnode_sema); + + /* pseudo locks */ + SYSPSEUDO(pfault); + SYSPSEUDO(kalloc); + SYSPSEUDO(schedulable); + + /* system long term locks */ + SYSLONG(page_write, page_write); + SYSLONG(mm_mmap_sem, "mm->mmap_sem"); + SYSLONG(mapping_i_shared_sem, "mapping->i_shared_sem"); + + SYSLONG(i_sem, i_sem); + SYSLONG(page_lock, page_lock); + SYSLONG(cache_chain_sem, "&cache_chain_sem"); + SYSLONG(I_LOCK, "I_LOCK"); + + SYSLONG(namespace_sem, "namespace->sem"); + // SYSLONG(bdev_bd_sem, "bdev->bd_sem"); + SYSLONG(sb_s_lock, "sb->s_lock"); + SYSLONG(sb_s_umount, "sb->s_umount"); +} + +subgraph spin { + + /* reiser4 spin locks */ + + RSPIN(tree_lock); + RSPIN(dk_lock); + RSPIN(jnode_lock); + RSPIN(inode_guard); + RSPIN(atom_lock); + RSPIN(txnh_lock); + RSPIN(txnmgr_lock); + RSPIN(ktxnmgrd_lock); + RSPIN(cbk_guard); + RSPIN(epoch_lock); + RSPIN(zgen_lock); + RSPIN(stack_lock); + RSPIN(zlock); + RSPIN(fq_lock); + RSPIN(jload_lock); + RSPIN(super_guard); + + //RSPIN(stack_lock); + //RSPIN(lnode_guard); + //RSPIN(cksum_guard); + //RSPIN(oid_guard); + //RSPIN(test_lock); + //RSPIN(kcond_lock); + 
//RSPIN(fake_lock); + //RSPIN(panic_guard); + //RSPIN(contexts_lock); + //RSPIN(pset_guard); + //RSPIN(phash_guard); + + /* system spin locks */ + SYSSPIN(bkl, "BKL"); + SYSSPIN(cachep_spinlock, "cachep->spinlock"); + SYSSPIN(zone_lock, "zone->lock"); + SYSSPIN(swaplock, "&swaplock"); + SYSSPIN(zone_lru_lock, "zone->lru_lock"); + SYSSPIN(mapping_private_lock, "mapping->private_lock"); + SYSSPIN(mapping_page_lock, "mapping->page_lock"); + SYSSPIN(inode_lock, "&inode_lock"); + SYSSPIN(swap_device_lock, "swap->device_lock"); + SYSSPIN(mm_page_table_lock, "mm->page_table_lock"); + SYSSPIN(sb_lock, "&sb_lock"); + SYSSPIN(page_chain_lock, "page->chain_lock"); + //removed at 2003.04.04 by akpm@digeo.com + //SYSSPIN(dparent_lock, "dparent_lock"); + SYSSPIN(dcache_lock, "dcache_lock"); + SYSSPIN(fs_struct_lock, "fs_struct->lock"); + SYSSPIN(tasklist_lock, "&tasklist_lock"); + SYSSPIN(sig_siglock, "sig->siglock"); + SYSSPIN(fown_lock, "fown->lock"); + SYSSPIN(task_switch_lock, "task->switch_lock"); + SYSSPIN(task_proc_lock, "task->proc_lock"); + SYSSPIN(task_alloc_lock, "task->alloc_lock"); + /* rq->lock is special: it can be unlocked by thread different from locker */ + SYSSPIN(rq_lock, "rq->lock"); + SYSSPIN(task_capability_lock, "&task_capability_lock"); + SYSSPIN(mmlist_lock, "&mmlist_lock"); + SYSSPIN(files_file_lock, "files->file_lock"); + SYSSPIN(dn_lock, "&dn_lock"); + //SYSSPIN(bdev_lock, "&bdev_lock"); + SYSSPIN(suspend_pagedir_lock, "&suspend_pagedir_lock") +} + +/* dependencies */ + +ARC(inode_guard, tree_lock, "update_sd_at()"); +ARC(inode_guard, jnode_lock, "update_sd_at()"); +ARC(inode_guard, atom_lock, "update_sd_at()"); +ARC(atom_lock, jnode_lock, "uncapture_block()"); //capture_fuse_jnode_lists() +ARC(jnode_lock, txnh_lock, "try_capture_block()"); +//alredy covered +ARC(atom_lock, txnh_lock, "capture_fuse_txnh_lists()"); +ARC(jnode_lock, tree_lock, "jdrop_in_tree()"); +ARC(tree_lock, cbk_guard, "cbk_cache_invalidate()"); +ARC(dk_lock, tree_lock, 
"sync_dkeys()"); +ARC(txnmgr_lock, atom_lock, "atom_dec_and_unlock()"); //txnmgr_force_commit_all(),\ncommit_some_atoms(),\nflush_one_atom()"); +ARC(txnmgr_lock, jnode_lock, "atom_begin_andlock()"); +ARC(txnmgr_lock, txnh_lock, "atom_begin_andlock()"); +ARC(i_sem, inode_rw_lock, "unix_file_setattr()");//,\nunix_file_write()"); +ARC(page_lock, i_sem, "reiserfs_unpack()"); +ARC(inode_rw_lock, delete_sema, "shorten()"); +//ARC(delete_sema, txncommit, "reiser4_release_reserved()"); +ARC(flush_sema, longterm_lock, "flush_scan_left()");//,\nflush_allocate_znode_update(),\nflush_scan_formatted(),\nflush_pos_to_child_and_alloc()"); +ARC(longterm_lock, page_lock, "cbk_level_lookup()"); +ARC(commit_sema, page_lock, "submit_write()"); +ARC(pfault, mm_mmap_sem, "handle_page_fault()"); +ARC(page_lock, pfault, "extent_write_flow()"); +ARC(mm_mmap_sem, kalloc, "unix_file_readpage()"); + +//ARC(inode_rw_lock, mm_mmap_sem, "unix_file_filemap_nopage()", style=dotted, dir=back); +//ARC(mm_mmap_sem, kalloc, "DEAD2", style="dotted"); +ARC(kalloc, jnode_lock, "emergency_flush()"); +ARC(longterm_lock, jnode_lock, "longterm_unlock_znode()");//,\nflush_allocate_znode()"); + +ARC(kalloc, inode_guard, "eflush_add()"); +ARC(ktxnmgrd_lock, txnmgr_lock, "commit_some_atoms()"); + +//already covered +ARC(mapping_i_shared_sem, mapping_private_lock, "__set_page_dirty_buffers()"); +//already covered +ARC(mapping_i_shared_sem, mapping_page_lock, ""); +ARC(mapping_i_shared_sem, mm_page_table_lock, "vma_link()"); + +ARC(inode_lock, mapping_page_lock, "__sync_single_inode()"); +ARC(inode_lock, sb_lock, "writeback_inodes()"); + +ARC(mm_page_table_lock, swap_device_lock, "try_to_unmap_one()"); +ARC(mm_page_table_lock, mapping_private_lock, "try_to_unmap_one()"); +//already covered +ARC(mm_page_table_lock, mapping_page_lock, "try_to_unmap_one()"); + +ARC(mm_mmap_sem, mapping_i_shared_sem, "do_mmap_pgoff()"); + +ARC(swaplock, swap_device_lock, "swap_info_get()"); +ARC(swap_device_lock, mapping_page_lock, 
"exclusive_swap_page()"); + +ARC(page_lock, page_chain_lock, "shrink_list()"); +ARC(mm_page_table_lock, page_chain_lock, "page_add_rmap()");//,\npage_remove_rmap()"); +ARC(mapping_page_lock, zone_lru_lock, "add_to_page_cache()");//,\nfilemap_fdatawait()"); +ARC(mm_page_table_lock, zone_lru_lock, "page_add_rmap()");//,\npage_remove_rmap()"); +ARC(zone_lru_lock, page_chain_lock, "rmap.c"); + +ARC(cache_chain_sem, kalloc, "cpuup_callback()"); +//ARC(cache_chain_sem, pfault, "kmem_cache_create()"); + +//obsolete ARC(dcache_lock, dparent_lock, "d_move()"); +ARC(fs_struct_lock, dcache_lock, "set_fs_pwd()");//,\nset_fs_root()"); + +ARC(namespace_sem, i_sem, "sys_pivot_root()"); + +ARC(sb_s_lock, txncommit, "reiser4_write_super()"); +ARC(sb_s_umount, txncommit, "reiser4_kill_super()"); + +ARC(task_switch_lock, rq_lock, "finish_arch_switch()"); +ARC(task_proc_lock, tasklist_lock, "unhash_process()"); // de_thread() +ARC(task_proc_lock, dcache_lock, "proc_pid_unhash()"); + +ARC(tasklist_lock, sig_siglock, "de_thread()");//,\ndo_notify_parent(),\nsys_tkill(),\ncopy_process()"); //collect_sigign_sigcatch(),\n__exit_sighand(),\nfreeze_processes() +ARC(dn_lock, fown_lock, "__inode_dir_notify()"); +ARC(fown_lock, tasklist_lock, "send_sigio()");//,\nsend_sigurg()"); +ARC(tasklist_lock, task_alloc_lock, "chroot_fs_refs()"); +ARC(tasklist_lock, rq_lock, "setscheduler()"); +ARC(task_capability_lock, tasklist_lock, "sys_capget()");//,\nsys_capset()"); +ARC(task_alloc_lock, files_file_lock, "match_comm()");//,\nmatch_pid()"); + +ARC(mmlist_lock, mm_page_table_lock, "unuse_process()"); + +ARC(tree_lock, zone_lock, "page_clear_jnode()");//,\njrelse_nolock()"); +ARC(tree_lock, zone_lru_lock, "page_clear_jnode()");//,\njrelse_nolock()"); +ARC(tree_lock, mapping_page_lock, "jdrop_in_tree()"); +ARC(tree_lock, epoch_lock, "zget()"); +ARC(tree_lock, zgen_lock, "zget()"); + +ARC(bkl, inode_lock, "iget()"); + +ARC(jnode_lock, mapping_page_lock, "jnode_set_dirty()"); +ARC(jnode_lock, 
zone_lru_lock, "jnode_set_dirty()"); + +ARC(I_LOCK, longterm_lock, "reiser4_iget()"); + +//one cannot wait for atom event keeping longterm lock +ARC(atom_event, longterm_lock, "flush"); +//one cannot wait for atom event keeping page lock +ARC(atom_event, page_lock, "jnode_extent_write()"); +ARC(zlock, stack_lock, "longterm_lock_znode()");//,\nlongterm_unlock_znode(), wake_up_all_lopri_owners()"); + +ARC(atom_lock, stack_lock, "check_not_fused_lock_owners()");//atom_send_event() +ARC(txnh_lock, stack_lock, "check_not_fused_lock_owners()"); +ARC(fq_lock, stack_lock, "wakeup_atom_waitfor_list()"); +ARC(atom_lock, fq_lock, "detach_fq()"); +ARC(jnode_lock, zlock, "check_not_fused_lock_owners()"); +ARC(txnh_lock, zlock, "check_not_fused_lock_owners()"); + +ARC(suspend_pagedir_lock, zone_lock, "do_magic_suspend_2()"); +ARC(cachep_spinlock, zone_lock, "cache_flusharray()"); + +ARC(mapping_page_lock, zone_lock, "add_to_page_cache()"); // find_lock_page +ARC(mapping_page_lock, zone_lru_lock, "add_to_page_cache()"); // find_lock_page +ARC(mm_page_table_lock, zone_lock, "try_to_unmap_one()"); // get_user_pages, do_wp_page, do_anonymous_page, do_no_page +ARC(mm_page_table_lock, zone_lru_lock, "try_to_unmap_one()"); // get_user_pages, do_wp_page, do_anonymous_page, do_no_page +ARC(jnode_lock, zone_lock, "page_clear_jnode()"); // uncapture_page, extent_write_flow +ARC(jnode_lock, zone_lru_lock, "page_clear_jnode()"); // uncapture_page, extent_write_flow +ARC(jnode_lock, jload_lock, "reiser4_releasepage()"); +ARC(atom_lock, super_guard, "grabbed2flush_reserved_nolock()"); +} diff -Naurp linux-2.6.4/fs/reiser4/doc/metadata-in-pagecache linux-2.6.4-ck1/fs/reiser4/doc/metadata-in-pagecache --- linux-2.6.4/fs/reiser4/doc/metadata-in-pagecache 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/doc/metadata-in-pagecache 2004-03-11 22:45:15.217521403 +1100 @@ -0,0 +1,57 @@ +Hello, + +In upcoming reiser4 we are planning to use page cache to store all file system +meta 
data. In some cases it is straightforward; for example, bitmap blocks, +placed on the disk through (almost) equal intervals ask to be bound to special +fake inode and indexed by their disk offsets. + +There is one important (most important actually) case where using fake inode +is inconvenient: blocks of internal balanced tree used by reiser4, known as +"formatted nodes". Natural solution of using block number as offset within +some fake inode doesn't pass, because when block size is smaller than page +some blocks mapped to the same page may be either occupied by something other +than formatted nodes, or just be free. + +This leads to the following complications: + + 1. we cannot simply use block_{read|write}_full_page(), because this will + waste IO bandwidth: block that doesn't contain formatted node will be read + into memory. Moreover, this block can be later read again, for example, + because this is data block of some file and hashed into different place in + the page cache, creating alias. This will definitely confuse buffer cache; + + 2. even if we keep track of what blocks have to be actually read, there still + will be "internal memory fragmentation", because some parts of page cache + pages will be unused. + +In brief, formatted nodes form a tree and because of this don't fit into + hashing scheme---there is no linear ordering among them. + +Moreover, formatted node is never looked up in the page cache by its block +number, because for each formatted node in memory there is special data +structure (znode) and znodes are hashed in the hash table anyway. + +So, all functionality that we need from the page cache is memory allocator +with attached memory pressure hooks (I guess, this is close to what Hans +called "sub-cache" in lkml discussions on this topic). + +It seems that we have two solutions: + + 1. change page cache to use different indexing for formatted nodes; + + 2. 
implement our own memory allocator sitting directly on the top of + alloc_pages() and installing proper ->mapping for pages that it grabs. + +(2) will only work if generic VM code (e.g., shrink_cache() or +page_launder_zone() in rmap VM) don't depend on particulars of page cache +hashing, that, fortunately, seems to be the case. This approach has following +advantages: + + . we can try to collocate related blocks on the same page, for example + blocks from the same transaction, of block with similar cache "hotness"; + + . we can use blocks larger than page size. + +Nikita. + + diff -Naurp linux-2.6.4/fs/reiser4/doc/oid-locid linux-2.6.4-ck1/fs/reiser4/doc/oid-locid --- linux-2.6.4/fs/reiser4/doc/oid-locid 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/doc/oid-locid 2004-03-11 22:45:15.218521247 +1100 @@ -0,0 +1,108 @@ +MIME-Version: 1.0 +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +Message-ID: <15392.39020.573047.826769@laputa.namesys.com> +Date: Wed, 19 Dec 2001 16:38:52 +0300 +To: Reiserfs developers mail-list +Subject: [RFC]: objectids and localities management +X-Mailer: VM 6.96 under 21.4 (patch 3) "Academic Rigor" XEmacs Lucid +FCC: ~/documents/mail/outgoing +--text follows this line-- +Hello, + +there is one thing that seems awkward in current reiser{fs|4} design: in +a key we have both locality id (locid) and object id (oid). This is +slightly illogical because oid alone is unique, but we cannot find an +object given oid. This was, by the way, main reason behind our NFS +troubles. So, why is this strictly necessary? I'll try to reason from +the "first principles". Following account doesn't pretend to be of any +historical accuracy of course. + +1. In a data structure we use to store objects (tree) items + with close keys are packed into the same disk block. This means that + we cannot completely separate key allocation from block + allocation. 
That is, + + - tree forces us to encode disk location preferences in a key. (A1) + +2. If we cannot completely separate key and block allocation let's try + instead to blend them together. That is, we rely on block allocator + to follow tree ordering and topology: blocks containing items with + close keys are allocated close on disk and blocks contiguous in tree + order are more or less contiguous on disk. How far bitmap.c fulfills + or can fulfill these goals is out of the scope of this discussion, + + - let's suppose that we have ideal block allocator. (A2) + +3. Given this, why cannot we encode disk location preferences in oid + alone? Because oid has to be unique and we cannot predict how many + objects we are going to group together in the future (how many objects + there will be in a directory that is). That is, suppose we create two + directories "a" and "b" in succession. If oid were the only thing to + store location preference, then we should leave after the oid of "a" + enough unused oids for all objects within "a", but we don't know how + many of them will be there. + +4. To solve this (locid, oid) scheme was born. It has following + advantages: + + - it is simple to implement + - it allows one to encode enough location preference into the key (A3) + +But the more people used reiserfs and the more files they started to +store in a single directory, the less valid (A3) became. oid became +inadequate location preference, because while it allows to separate +files from different directories it doesn't allow to order files within +single directory. For example readdir of big directory is slow, because +files are not sorted within directory. Various ad-hoc solutions have +been proposed (oid==hash, add "band" to oid, etc), but there is obvious +conflict between requirement that oid is unique and desire to encode +additional information in it. 
In effect all such solutions amount to +further splitting of (locid,oid) pair into (locid, someid, oid) for the +reasons similar to those on the steps 3,4 above. + +The scheme proposed below tries to meet following goals: + + G1. only keep unique oid in a key, thus making it possible to find file + given its inode number and additionally shrink key, increasing + fanout. + + G2. allow configurable amount of fine-grained locality preference + information to be associated with each oid, thus allowing files + to be ordered in a tree according to some hierarchical "packing + localities", for example: first order files by oid of parent + directory, then by hash of name within this directory. + + +Proposal: + +Maintain separate map (oidlocmap, implementation discussed below) from +oid to "locpref", where locpref is additional fine-grained location +preference data, associated with oid. For example locpref may be just +(locid) to emulate existing behavior, or (locid, hash) or (locid, +user-supplied-grouping-info), etc. + +Key only contains oid, that is, ceteris paribus, key has form +(item-type, oid, offset). If oid is 60 bits, this is 16 bytes. + +Ordering of items within tree (and, given (A2), their ordering on disk) +is completely determined by keycmp() function that compares two +keys. Before comparing two keys, keycmp(k1, k2) consults oidlocmap and +obtains locprefs, associated with oids of k1 and k2. locprefs then are +"pasted" into k1 and k2, producing "expanded" keys, containing full +location preferences information. Expanded keys are compared as usual. + +In simplest case oidlocmap can be implemented as normal balanced tree, +where keys are oids (60 bits) and values locprefs. If we limit ourselves +to fixed format of locpref (at least per file system) than, we get +standard text-book balanced tree storing values of fixed size which is +simple to implement. 
+ +There is of course overhead of maintaining oidlocmap and, especially, of +consulting it on each keycmp(), but it looks to me that it will be not +that significant, because oidlocmap is compact and will be outweighed +by increased fanout in the main tree. + +Comments? + +Nikita. diff -Naurp linux-2.6.4/fs/reiser4/doc/page-cache-for-formatted-nodes linux-2.6.4-ck1/fs/reiser4/doc/page-cache-for-formatted-nodes --- linux-2.6.4/fs/reiser4/doc/page-cache-for-formatted-nodes 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/doc/page-cache-for-formatted-nodes 2004-03-11 22:45:15.219521092 +1100 @@ -0,0 +1,60 @@ +PROPOSAL: + +Keep formatted nodes in a page cache, binding them to the special fake inode +and using block number divided by number of blocks in a page as page index. + +ADVANTAGES: + +Page cache is preferred over buffer cache. Much more optimization and +scalability efforts are going into it. The fewer separate caches are in the +system, the simpler and better VM can handle load. + +DISADVANTAGES: + +As formatted nodes are indexed by block number, each page will contain +blocks with consecutive block numbers. This poses several problems: + + 1. When we need to read particular block from the disk (e.g., to load child + node during tree lookup), it is not clear that blocks with neighboring block + numbers are worth reading into memory at all. + + 2. Some of the blocks that have to go in the same page as block we need can + be unformatted ones. + +SOLUTIONS: + +There are several possible workarounds: + + 1. rely on the fact that in vast majority of cases block size is equal to + the page size. So, we can index formatted nodes by block number storing + exactly one block in the page. This will eliminate both problems at the + expense of the memory wasting in the setups where block size is smaller than + page size. + + 2. only load required block in the page marking other blocks mapped to this + page as up-to-date. 
It is not obvious that this will work at all, and in any + case, this will force us to use special API to access such pages, bypassing + VM interface. + + 3. rely on good repacker and load all blocks in the page hoping that they + are close to each other in tree order and will be accessed shortly. + + 4. allocate unformatted nodes such that they will never go into the same + frame as formatted. For example: + + - always align extent to the page boundary on the disk (page is CPU + specific though); + + - use some variation of border algorithm to separate formatted and + unformatted nodes; + + - use "enriched" bitmap where formatted and unformatted nodes are + distinguishable. + + +# Local variables: +# mode-name: "proposal" +# indent-tabs-mode: nil +# tab-width: 4 +# eval: (if (fboundp 'flyspell-mode) (flyspell-mode)) +# End: diff -Naurp linux-2.6.4/fs/reiser4/doc/readdir-problems-and-implementations linux-2.6.4-ck1/fs/reiser4/doc/readdir-problems-and-implementations --- linux-2.6.4/fs/reiser4/doc/readdir-problems-and-implementations 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/doc/readdir-problems-and-implementations 2004-03-11 22:45:15.219521092 +1100 @@ -0,0 +1,12 @@ +1. + +User level API. + +Standard + +^ Local variables: +^ mode-name: "Design Document" +^ indent-tabs-mode: nil +^ tab-width: 4 +^ eval: (if (fboundp 'flyspell-mode) (flyspell-mode)) +^ End: diff -Naurp linux-2.6.4/fs/reiser4/doc/reiser4.writeback.overview linux-2.6.4-ck1/fs/reiser4/doc/reiser4.writeback.overview --- linux-2.6.4/fs/reiser4/doc/reiser4.writeback.overview 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/doc/reiser4.writeback.overview 2004-03-11 22:45:15.220520936 +1100 @@ -0,0 +1,68 @@ +Hello, + +reiser4 has some features that make it somewhat difficult to integrate with +existing VM mechanisms. + +Reiser4 maintains all meta data in the single balanced tree. 
This tree is +maintained in the memory in the form different from what will be ultimately +written to the disk. Roughly speaking, before writing tree node to the disk, +some complex process ("flush") is to be performed. This process has following +characteristics: + + 1 it is not local, that is it operates on big number of nodes, possibly far + away from the starting node, both in tree and disk order. + + 2 it can involve reading of the large number of nodes from the disk (for + example, bitmap nodes are read during extent allocation that is deferred + until flush). + + 3 it can allocate unbounded amount of memory (during insertion of allocated + extents). + + 4 it participates in the locking protocol which reiser4 uses to implement + concurrent tree modifications. + + 5 it is CPU consuming and long + +As a result, flush reorganizes some part of reiser4 tree and produces large +queue of nodes ready to be submitted for io (as a matter of fact, flush write +clustering is so good that it used to hit BIO_MAX_PAGES all the time, until +checks were added for this). + +Items (3) and (4) alone make flush unsuitable for being called directly from +reiser4 ->vm_writeback() callback, because of OOM and deadlocks against +threads waiting for memory. + +So, it was decided that flush has to be performed from the separate +thread. Reiser4 has thread used to periodically commit old transactions and +this thread can be used for the flushing. That is, flushing thread does flush +and accumulates nodes prepared for the IO on the special +queue. reiser4_vm_writeback() submits nodes from this queue, if queue is +empty, it only wakes up flushing thread and immediately returns. + +Still there are some problems with integrating this stuff into VM scanning: + + 1 As ->vm_writeback() returns immediately without actually submitting pages + for IO, throttling on PG_writeback in shrink_list() will not work. 
This + opens a possibility (on a fast CPU), of try_to_free_pages() completing + scanning and calling out_of_memory() before flushing thread managed to add + anything to the queue. + + 2 It is possible, however unlikely, that flushing thread will be unable to flush + anything, because there is not enough memory. In this case reiser4 resorts + to the "emergency flush": some dumb algorithm that writes tree nodes to the + disk without taking locks and without optimizing tree layout. + + 3 Nodes prepared for IO can be from the active list, this means that they + will not be met/freed by shrink_list() after IO completion. New + blk_congestion_wait() should help here though. + +It looks like we need following changes to make this stuff working: + + 1 Adding ->priority field into struct writeback_control, so that file system + can vary its behavior depending on how desperate memory pressure is. + + 2 Different mechanism for scan throttling. + +Actually latter can be implemented completely within reiser4 but with some +awkwardness. diff -Naurp linux-2.6.4/fs/reiser4/doc/sys-reiser4-implemenation-overview linux-2.6.4-ck1/fs/reiser4/doc/sys-reiser4-implemenation-overview --- linux-2.6.4/fs/reiser4/doc/sys-reiser4-implemenation-overview 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/doc/sys-reiser4-implemenation-overview 2004-03-11 22:45:15.221520781 +1100 @@ -0,0 +1,222 @@ +SYS_REISER4 IMPLEMENTATION OVERVIEW + + +A. Basics +***************************************************************** + +sys_reiser4() system call executing a sequence of actions upon the +file-system(s). Actions are specified by the user in the form of a command +string. For the purposes of present discussion, said command string can be +thought of as a program in a special purpose programming language, which will +be further referred to as reiser4_lang. + +Canonical example of reiser4_lang program is + +/dir1/dir2/dir3/file1 <- /dir4/dir5/dir6/file2 + +It semantics is following: + +1. 
resolve "/dir1/dir2/dir3/file1" into file-system object (lookup operation) +2. resolve "/dir4/dir5/dir6/file2" into file-system object (lookup operation) +3. assign latter to the former. + +This is "assignment" operator. Assignment involves two "file-system objects" +and semantics of both lookup stage and assignment proper depend upon the type +of the file-system object. + +Following types of file-system objects are recognized: + +1. foreign objects: objects of different file-systems. Foreign object cannot +be target or source of an assignment. Rather, foreign objects can only appear +during path name lookup, while traversing non-reiser4 part of the file-system +name-space. Probably one should distinguish between objects belonging to +different file-system types (ext2, NFS) and objects belonging to different +reiser4 mounts. After sys_reiser4() is stable, foreign objects will be more +fully supported. + +2. reiser4 objects. + +3. pseudo-objects: these are entities injected into reiser4 name-space to +provide uniform access to various file-system meta-data. Pseudo-objects are +(usually) attached to some particular "host" object. [In the initial version,] +host objects are reiser4 objects. [Later it is possible to implement some +pseudo-objects for foreign objects.] Convention (but not enforced rule) is +that pseudo-objects are accessible through names starting with some well-known +prefix (".." is current favorite). Examples: ..owner, ..acl, etc. See comment +at the top of fs/reiser4/plugin/pseudo/pseudo.c for more details. + +B. lnodes +***************************************************************** + +lnodes are handles for file-system objects described above. They serve dual +purpose: + +1. uniform interface to the various types of objects. This allows the +reiser4_lang implementation to treat various types of objects in the same +manner. 
When new type of object has to be added, all changes will be grouped +in one place, rather than scattered across various files. This uniformity also +allows code sharing between reiser4_lang and VFS access paths. For example, +the same ->write method can be used by both. That is, ->read(), and ->write() +plugin methods used in VFS access paths will take lnode(s) as arguments and +can share code with sys_reiser4() implementation. For example, assignment is +particular case of write (or vice versa, depending on point of view). + + +2. synchronization. reiser4_lang doesn't use inodes and this poses a problem of +synchronization with VFS. Each lnode serves as a lock. See lnode.c for more +details. + +C. lookup +***************************************************************** + +reiser4_lang still supports only two traditional UNIX kinds of ordered names +(pathnames): absolute and relative to the current working directory. In both +cases, lookup starts from some file-system object represented by lnode. Then +lookup proceeds component-by-component as follows: + + lnode *parent; + lnode child; + + ret_code = lnode_get_dir_plugin( parent ) -> lnode_by_name( parent, + path_component, + &child ); + +1. Abovementioned locking issues require that parent lnode has to be kept +until operation on child finishes. In effect we get lock-coupling much like in +internal tree traversal. Also, possibility to use lock on node with directory +entry instead of object lock was discussed. We have to think more on this. + + +2. Mount points crossing. It is possible, because dentries and therefore +inodes of all mount points are pinned in memory and lookup code can check at +each step whether mount point is crossed. Details are not very nice, because +for each inode in a path we have to scan list of all its dentries and check +whether correct one (corresponding to our path) is mount point. + +3. 
It is also possible to pass ->lnode_by_name the whole of the remaining +name, and let it decide how much of it it should handle. This will complicate +locking somewhat. But this is doable, though requires changes to the parser. + + +D. assignment +***************************************************************** + +Assignment A<-B basically means duplicating content of B into A. No +copy-on-write optimizations will be in version 4.0. + +Assignment implementation is based on the notion of flow (flow_t). Flow is a +source from which data can be obtained. Flow can be "backed up" by one of the +following: + +1. memory area in user space. (char *area, size_t length) +2. memory area in kernel space. (caddr_t *area, size_t length) +3. file-system object (lnode *obj, loff_t offset, size_t length) + +Main function to manipulate flows is: + +int flow_place( flow_t *flow, char *area, size_t length ); + +it copies @length bytes of @flow into @area and updated @flow correspondingly. +Behavior of flow_place() depends on the type of entity backing up @flow. If +@flow is based on the kernel-space area, memmove() is used to copy data. If +@flow is based on the user-space area, copy_from_user() is used. If @flow is +based on file-system object, flow_place() loads object's data into page cache +and copies them into @area. + +Thus, assignment code looks like following: + +typedef int ( *connect_t )( sink_t *target, flow_t *source ); + +int reiser4_assign( lnode *dst, lnode *src ) +{ + flow_t source; + sink_t target; + int ret_code; + file_plugin *src_fplug; + file_plugin *dst_fplug; + connect_t connection; + + /* get plugins */ + + src_fplug = lnode_get_file_plugin( src ); + dst_fplug = lnode_get_file_plugin( dst ); + + /* build source flow */ + ret_code = src_fplug -> build_flow( src, &source, 0 /* offset */ ); + + /* build target sink */ + ret_code = dst_fplug -> build_sink( dst, &target, 0 /* offset */ ); + + /* + * select how to transfer data from @src to @dst. 
+ * + * Default implementation of this is common_transfer() (see below). + * + * Smart file plugin can choose connection based on type of @dst. + * + */ + connection = src_fplug -> select_connection( src, dst ); + + /* do transfer */ + return connection( &target, &source ); +} + + +/* look to chain conversion of (lnode * dst) -> (sink_t target) -> (lnode * dst) + I think, functions build_sink(...) and sink_object(...) - superfluous */ + +int common_transfer( sink_t *target, flow_t *source ) +{ + lnode *dst; + + dst = sink_object( target ); + while( flow_not_empty( source ) ) { + char *area; + size_t length; + + /* + * append some space to @target. Reasonable implementation will + * allocate several pagesful here + */ + ret_code = lnode_get_body_plugin( dst ) -> prepare_append( dst, + &area, + &length ); + /* why @length not depended from source? */ + /* + * put data from flow into newly allotted space. This also updates + * @flow. + */ + flow_place( source, area, length ); + /* + * perform necessary post-write activity required by @dst plugin, like + * encryption, compression, etc. Release pages. + */ + ret_code = lnode_get_body_plugin( dst ) -> commit_append( dst, + area, length ); + } +} + + +E. parsing +***************************************************************** + +It is not clear what parts of reiser4_lang processing should go into +kernel. In any case, providing direct system call as main (or, worse, the +only) way to access reiser4_lang functionality binds us to maintain binary +compatibility in the future. To avoid this, reiser4 should be shipped with +user-level library, containing + +int reiser4( const char *cmd, size_t length ); + +function. For now, this function will directly despatch @cmd to the +sys_reiser4(); in the future, it may do parsing itself and pass parse tree to the +kernel interpreter. 
+ +***************************************************************** + +# Local variables: +# mode-name: "proposal" +# indent-tabs-mode: nil +# tab-width: 4 +# eval: (if (fboundp 'flyspell-mode) (flyspell-mode)) +# End: diff -Naurp linux-2.6.4/fs/reiser4/dscale.c linux-2.6.4-ck1/fs/reiser4/dscale.c --- linux-2.6.4/fs/reiser4/dscale.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/dscale.c 2004-03-11 22:45:15.222520625 +1100 @@ -0,0 +1,138 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Scalable on-disk integers */ + +/* + * Various on-disk structures contain integer-like structures. Stat-data + * contain [yes, "data" is plural, check the dictionary] file size, link + * count; extent unit contains extent width etc. To accommodate for general + * case enough space is reserved to keep largest possible value. 64 bits in + * all cases above. But in overwhelming majority of cases numbers actually + * stored in these fields will be comparatively small and reserving 8 bytes is + * a waste of precious disk bandwidth. + * + * Scalable integers are one way to solve this problem. dscale_write() + * function stores __u64 value in the given area consuming from 1 to 9 bytes, + * depending on the magnitude of the value supplied. dscale_read() reads value + * previously stored by dscale_write(). + * + * dscale_write() produces format not completely unlike of UTF: two highest + * bits of the first byte are used to store "tag". One of 4 possible tag + * values is chosen depending on the number being encoded: + * + * 0 ... 0x3f => 0 + * 0x40 ... 0x3fff => 1 + * 0x4000 ... 0x3fffffff => 2 + * 0x40000000 ... 0xffffffffffffffff => 3 + * + * (see dscale_range() function) + * + * As _highest_ bits are used for the test (which is natural) scaled integers + * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which + * uses LITTLE-ENDIAN. 
+ *
+ */
+
+#include "debug.h"
+#include "dscale.h"
+
+/* return tag (two highest bits of the first byte) of scaled integer stored at @address */
+static int gettag(const unsigned char *address)
+{
+	return (*address) >> 6;
+}
+
+/* clear tag from value; only called for tags 0..2 (tag 3 keeps its tag in a separate leading byte) */
+static void cleartag(__u64 *value, int tag)
+{
+	/* tag sits in the two top bits of the (1 << tag)-byte field; build the mask in 64 bits so 3 << 30 cannot overflow int */
+	*value &= ~((__u64)3 << (((1 << tag) << 3) - 2));
+}
+
+/* return tag for @value: the smallest of the four ranges listed at the top of this file that holds it */
+static int dscale_range(__u64 value)
+{
+	if (value > 0x3fffffff)
+		return 3;
+	if (value > 0x3fff)
+		return 2;
+	if (value > 0x3f)
+		return 1;
+	return 0;
+}
+
+/* restore value stored at @address by dscale_write() and return number of
+ * bytes consumed */
+reiser4_internal int dscale_read(unsigned char *address, __u64 *value)
+{
+	int tag;
+
+	tag = gettag(address);
+	switch (tag) {
+	case 3:
+		*value = __be64_to_cpu(get_unaligned((__u64 *)(address + 1)));
+		/* worst case: 8 bytes for value itself plus one byte for
+		 * tag */
+		return 9;
+	case 0:
+		*value = get_unaligned(address);
+		break;
+	case 1:
+		*value = __be16_to_cpu(get_unaligned((__u16 *)address));
+		break;
+	case 2:
+		*value = __be32_to_cpu(get_unaligned((__u32 *)address));
+		break;
+	default:
+		return RETERR(-EIO);
+	}
+	cleartag(value, tag);
+	return 1 << tag;
+}
+
+/* store @value at @address and return number of bytes consumed (1, 2, 4 or 9) */
+reiser4_internal int dscale_write(unsigned char *address, __u64 value)
+{
+	int tag;
+	int shift;
+	unsigned char *valarr;
+
+	tag = dscale_range(value);
+	value = __cpu_to_be64(value);
+	valarr = (unsigned char *)&value;
+	shift = (tag == 3) ? 1 : 0;
+	memcpy(address + shift, valarr + sizeof value - (1 << tag), 1 << tag);
+	*address = (shift ? 0 : *address) | (tag << 6); /* tag 3: first byte was not written by memcpy, don't keep stale bits */
+	return shift + (1 << tag);
+}
+
+/* number of bytes required to store @value */
+reiser4_internal int dscale_bytes(__u64 value)
+{
+	int bytes;
+
+	bytes = 1 << dscale_range(value);
+	if (bytes == 8)
+		++ bytes;
+	return bytes;
+}
+
+/* returns true if @value and @other require the same number of bytes to be
+ * stored.
Used to detect when data structure (like stat-data) has to be
+ * expanded or contracted. */
+reiser4_internal int dscale_fit(__u64 value, __u64 other)
+{
+	return dscale_range(value) == dscale_range(other);
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/dscale.h linux-2.6.4-ck1/fs/reiser4/dscale.h
--- linux-2.6.4/fs/reiser4/dscale.h	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/dscale.h	2004-03-11 22:45:15.222520625 +1100
@@ -0,0 +1,27 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Scalable on-disk integers. See dscale.c for details. */
+
+#if !defined( __FS_REISER4_DSCALE_H__ )
+#define __FS_REISER4_DSCALE_H__
+
+#include "dformat.h"
+
+extern int dscale_read (unsigned char *address, __u64 *value);
+extern int dscale_write(unsigned char *address, __u64 value);
+extern int dscale_bytes(__u64 value);
+extern int dscale_fit (__u64 value, __u64 other);
+
+/* __FS_REISER4_DSCALE_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/emergency_flush.c linux-2.6.4-ck1/fs/reiser4/emergency_flush.c
--- linux-2.6.4/fs/reiser4/emergency_flush.c	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/emergency_flush.c	2004-03-11 22:45:15.224520314 +1100
@@ -0,0 +1,904 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* this file exists only until VM gets fixed to reserve pages properly, which might or might not be very political. */
+
+/* Implementation of emergency flush. */
+
+/* OVERVIEW:
+
+     Before writing a node to the disk, some complex process (flush.[ch]) is
+     to be performed.
Flush is the main necessary preliminary step before + writing pages back to the disk, but it has some characteristics that make + it completely different from traditional ->writepage(): + + 1 It operates on a large number of nodes, possibly far away from the + starting node, both in tree and disk order. + + 2 it can involve reading of nodes from the disk (during extent + allocation, for example). + + 3 it can allocate memory (during insertion of allocated extents). + + 4 it participates in the locking protocol which reiser4 uses to + implement concurrent tree modifications. + + 5 it is CPU consuming and long + + As a result, flush reorganizes some part of reiser4 tree and produces + large queue of nodes ready to be submitted for io. + + Items (3) and (4) alone make flush unsuitable for being called directly + from reiser4 ->writepage() callback, because of OOM and deadlocks + against threads waiting for memory. + + So, flush is performed from within balance_dirty_page() path when dirty + pages are generated. If balance_dirty_page() fails to throttle writers + and page replacement finds dirty page on the inactive list, we resort to + "emergency flush" in our ->vm_writeback(). + + Emergency flush is relatively + dumb algorithm, implemented in this file, that tries to write tree nodes + to the disk without taking locks and without thoroughly optimizing tree + layout. We only want to call emergency flush in desperate situations, + because it is going to produce sub-optimal disk layouts. + + DETAILED DESCRIPTION + + Emergency flush (eflush) is designed to work as low level mechanism with + no or little impact on the rest of (already too complex) code. + + eflush is initiated from ->writepage() method called by VM on memory + pressure. It is supposed that ->writepage() is rare call path, because + balance_dirty_pages() throttles writes and tries to keep memory in + balance. 
+
+   eflush main entry point (emergency_flush()) checks whether jnode is
+   eligible for emergency flushing. Check is performed by flushable()
+   function; see it for details. After successful check, new block number
+   ("emergency block") is allocated and io is initiated to write jnode
+   content to that block.
+
+   After io is finished, jnode will be cleaned and VM will be able to free
+   page through call to ->releasepage().
+
+   emergency_flush() also contains special case invoked when it is possible
+   to avoid allocation of new node.
+
+   Node selected for eflush is marked (by JNODE_EFLUSH bit in ->flags field)
+   and added to the special hash table of all eflushed nodes. This table
+   doesn't have linkage within each jnode, as this would waste memory on the
+   assumption that eflush is rare. Instead, a new small memory object
+   (eflush_node_t) is allocated that contains pointer to jnode, emergency
+   block number, and is inserted into hash table. Per super block counter of
+   eflushed nodes is incremented. See section [INODE HANDLING] below for
+   more on this.
+
+   It should be noted that emergency flush may allocate memory and wait for
+   io completion (bitmap read).
+
+   Basically eflushed node has the following distinctive characteristics:
+
+        (1) JNODE_EFLUSH bit is set
+
+        (2) no page
+
+        (3) there is an element in hash table, for this node
+
+        (4) node content is stored on disk in block whose number is stored
+        in the hash table element
+
+   UNFLUSH
+
+   Unflush is the reverse of eflush, that is, the process of bringing page of eflushed
+   inode back into memory.
+
+   In accordance with the policy that eflush is low level and low impact
+   mechanism, transparent to the rest of the code, unflushing is performed
+   deeply within jload_gfp() which is main function used to load and pin
+   jnode page into memory.
+
+   Specifically, if jload_gfp() determines that it is called on eflushed
+   node it gets emergency block number to start io against from the hash
+   table rather than from jnode itself.
This is done in
+   jnode_get_io_block() function. After io completes, hash table element
+   for this node is removed and JNODE_EFLUSH bit is cleared.
+
+   PROBLEMS
+
+   1. INODE HANDLING
+
+   Usually (i.e., without eflush), jnode has a page attached to it. This
+   page pins corresponding struct address_space, and, hence, inode in
+   memory. Once inode has been eflushed, its page is gone and inode can be
+   wiped out of memory by the memory pressure (prune_icache()). This leads
+   to a number of complications:
+
+        (1) jload_gfp() has to attach jnode to the address space's radix
+        tree. This requires existence of inode.
+
+        (2) normal flush needs jnode's inode to start slum collection from
+        unformatted jnode.
+
+   (1) is really a problem, because it is too late to load inode (which
+   would lead to loading of stat data, etc.) within jload_gfp().
+
+   We, therefore, need some way to protect inode from being recycled while
+   having accessible eflushed nodes.
+
+   I'll describe old solution here so it can be compared with new one.
+
+   Original solution pinned inode by __iget() when its first node was
+   eflushed and released (through iput()) when last was unflushed. This
+   required maintenance of inode->eflushed counter in inode.
+
+   A problem arises if last name of inode is unlinked when it has eflushed
+   nodes. In this case, last iput() that leads to the removal of file is
+   iput() made by unflushing from within jload_gfp(). Obviously, calling
+   truncate, and tree traversals from jload_gfp() is not a good idea.
+
+   New solution is to pin inode in memory by adding I_EFLUSH bit to its
+   ->i_state field. This protects inode from being evicted by
+   prune_icache().
+
+   DISK SPACE ALLOCATION
+
+   This section will describe how emergency block is allocated and how
+   block counters (allocated, grabbed, etc.) are manipulated. To be done.
+ + *****HISTORICAL SECTION**************************************************** + + DELAYED PARENT UPDATE + + Important point of emergency flush is that update of parent is sometimes + delayed: we don't update parent immediately if: + + 1 Child was just allocated, but parent is locked. Waiting for parent + lock in emergency flush is impossible (deadlockable). + + 2 Part of extent was allocated, but parent has not enough space to + insert allocated extent unit. Balancing in emergency flush is + impossible, because it will possibly wait on locks. + + When we delay update of parent node, we mark it as such (and possibly + also mark children to simplify delayed update later). Question: when + parent should be really updated? + + WHERE TO WRITE PAGE INTO? + + + So, it was decided that flush has to be performed from a separate + thread. Reiser4 has a thread used to periodically commit old transactions, + and this thread can be used for the flushing. That is, flushing thread + does flush and accumulates nodes prepared for the IO on the special + queue. reiser4_vm_writeback() submits nodes from this queue, if queue is + empty, it only wakes up flushing thread and immediately returns. + + Still there are some problems with integrating this stuff into VM + scanning: + + 1 As ->vm_writeback() returns immediately without actually submitting + pages for IO, throttling on PG_writeback in shrink_list() will not + work. This opens a possibility (on a fast CPU), of try_to_free_pages() + completing scanning and calling out_of_memory() before flushing thread + managed to add anything to the queue. + + 2 It is possible, however unlikely, that flushing thread will be + unable to flush anything, because there is not enough memory. In this + case reiser4 resorts to the "emergency flush": some dumb algorithm, + implemented in this file, that tries to write tree nodes to the disk + without taking locks and without thoroughly optimizing tree layout. 
We + only want to call emergency flush in desperate situations, because it + is going to produce sub-optimal disk layouts. + + 3 Nodes prepared for IO can be from the active list, this means that + they will not be met/freed by shrink_list() after IO completion. New + blk_congestion_wait() should help with throttling but not + freeing. This is not fatal though, because inactive list refilling + will ultimately get to these pages and reclaim them. + + REQUIREMENTS + + To make this work we need at least some hook inside VM scanning which + gets triggered after scanning (or scanning with particular priority) + failed to free pages. This is already present in the + mm/vmscan.c:set_shrinker() interface. + + Another useful thing that we would like to have is passing scanning + priority down to the ->vm_writeback() that will allow file system to + switch to the emergency flush more gracefully. + + POSSIBLE ALGORITHMS + + 1 Start emergency flush from ->vm_writeback after reaching some priority. + This allows to implement simple page based algorithm: look at the page VM + supplied us with and decide what to do. + + 2 Start emergency flush from shrinker after reaching some priority. + This delays emergency flush as far as possible. 
+
+   *****END OF HISTORICAL SECTION**********************************************
+
+*/
+
+#include "forward.h"
+#include "debug.h"
+#include "page_cache.h"
+#include "tree.h"
+#include "jnode.h"
+#include "znode.h"
+#include "inode.h"
+#include "super.h"
+#include "block_alloc.h"
+#include "emergency_flush.h"
+
+#include
+#include
+#include
+#include
+#include
+
+static int flushable(const jnode * node, struct page *page);
+static int needs_allocation(const jnode * node);
+static eflush_node_t *ef_alloc(int flags);
+static reiser4_ba_flags_t ef_block_flags(const jnode *node);
+static int ef_free_block(jnode *node, const reiser4_block_nr *blk, block_stage_t stage, eflush_node_t *ef);
+static int ef_prepare(jnode *node, reiser4_block_nr *blk, eflush_node_t **enode, reiser4_blocknr_hint *hint);
+static int eflush_add(jnode *node, reiser4_block_nr *blocknr, eflush_node_t *ef);
+
+/* slab for eflush_node_t's */
+static kmem_cache_t *eflush_slab;
+
+#define EFLUSH_START_BLOCK ((reiser4_block_nr)0)
+
+#define INC_STAT(node, counter) \
+	reiser4_stat_inc_at_level(jnode_get_level(node), counter);
+
+/* this function exists only until VM gets fixed to reserve pages properly,
+ * which might or might not be very political. */
+/* try to flush @page to the disk
+ * @page must be locked by the caller (asserted below).
+ * Return 0 if page was successfully paged out. 1 if it is busy, error
+ * otherwise.
+ */
+reiser4_internal int
+emergency_flush(struct page *page)
+{
+	struct super_block *sb;
+	jnode *node;
+	int result;
+	assert("nikita-2721", page != NULL);
+	assert("nikita-2724", PageLocked(page));
+
+	// warning("nikita-3112", "Emergency flush. Notify Reiser@Namesys.COM");
+
+	/*
+	 * Page is locked, hence page<->jnode mapping cannot change.
+	 */
+
+	sb = page->mapping->host->i_sb; /* NOTE(review): sb is not read again in this function */
+	node = jprivate(page);
+
+	assert("vs-1452", node != NULL);
+
+	jref(node); /* pin @node; dropped by the jput() at the end of this function */
+	INC_STAT(node, vm.eflush.called);
+
+	result = 0;
+	LOCK_JNODE(node);
+	/*
+	 * page was dirty and under eflush. This is (only?) possible if page
+	 * was re-dirtied through mmap(2) after eflush IO was submitted, but
+	 * before ->releasepage() freed page.
+	 */
+	eflush_del(node, 1);
+
+	LOCK_JLOAD(node);
+	if (flushable(node, page)) {
+		if (needs_allocation(node)) {
+			reiser4_block_nr blk;
+			eflush_node_t *efnode;
+			reiser4_blocknr_hint hint;
+
+			blk = 0ull;
+			efnode = NULL;
+
+			blocknr_hint_init(&hint);
+
+			INC_STAT(node, vm.eflush.needs_block);
+			result = ef_prepare(node, &blk, &efnode, &hint); /* drops and re-takes both locks, hence the re-check below */
+			if (flushable(node, page) && result == 0) {
+				assert("nikita-2759", efnode != NULL);
+				eflush_add(node, &blk, efnode); /* releases the jload and jnode locks */
+
+				result = page_io(page,
+						 node,
+						 WRITE,
+						 GFP_NOFS | __GFP_HIGH);
+				INC_STAT(node, vm.eflush.ok);
+			} else {
+				UNLOCK_JLOAD(node);
+				UNLOCK_JNODE(node);
+				if (blk != 0ull) /* NOTE(review): efnode is NULL here if ef_alloc() failed in ef_prepare(), yet ef_free_block() reads ef->reserve */
+					ef_free_block(node, &blk,
+						      hint.block_stage, efnode);
+				if (efnode != NULL)
+					kmem_cache_free(eflush_slab, efnode);
+				ON_TRACE(TRACE_EFLUSH, "failure-2\n");
+				result = 1;
+				INC_STAT(node, vm.eflush.nolonger);
+			}
+
+			blocknr_hint_done(&hint);
+		} else {
+			txn_atom *atom;
+			flush_queue_t *fq;
+
+			/* eflush without allocation temporary location for a node */
+			ON_TRACE(TRACE_EFLUSH, "flushing to relocate place: %llu..", *jnode_get_block(node));
+
+			/* get flush queue for this node */
+			result = fq_by_jnode(node, &fq);
+
+			if (result)
+				return result; /* NOTE(review): skips jput(node) and the unlocks done on other paths - confirm fq_by_jnode() failure semantics */
+
+			atom = node->atom;
+
+			if (!flushable(node, page) || needs_allocation(node) || !jnode_is_dirty(node)) {
+				ON_TRACE(TRACE_EFLUSH, "failure-3\n");
+				UNLOCK_JLOAD(node);
+				UNLOCK_JNODE(node);
+				UNLOCK_ATOM(atom);
+				fq_put(fq);
+				return 1; /* NOTE(review): the reference taken by jref(node) above is not dropped on this path */
+			}
+
+			/* ok, now we can flush it */
+			unlock_page(page);
+
+			queue_jnode(fq, node);
+
+			UNLOCK_JLOAD(node);
+			UNLOCK_JNODE(node);
+			UNLOCK_ATOM(atom);
+
+			result = write_fq(fq, NULL);
+			if (result != 0)
+				lock_page(page);
+
+			ON_TRACE(TRACE_EFLUSH, "flushed %d blocks\n", result);
+			/* Even if we wrote nothing, We unlocked the page, so let know to the caller that page should
+			   not be unlocked again */
+			fq_put(fq);
+		}
+
+	} else {
+		UNLOCK_JLOAD(node);
+		UNLOCK_JNODE(node);
+		ON_TRACE(TRACE_EFLUSH, "failure-1\n");
+		result = 1;
+	}
+
+	jput(node);
+	return result;
+}
+/* return 1 if @node with @page may be emergency flushed now, 0 otherwise; caller holds the jnode and jload spin locks */
+static int
+flushable(const jnode * node, struct page *page)
+{
+	assert("nikita-2725", node != NULL);
+	assert("nikita-2726", spin_jnode_is_locked(node));
+	assert("nikita-3388", spin_jload_is_locked(node));
+
+	if (jnode_is_loaded(node)) { /* loaded */
+		INC_STAT(node, vm.eflush.loaded);
+		return 0;
+	}
+	if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) { /* already pending io */
+		INC_STAT(node, vm.eflush.queued);
+		return 0;
+	}
+	if (JF_ISSET(node, JNODE_EPROTECTED)) { /* protected from e-flush */
+		INC_STAT(node, vm.eflush.protected);
+		return 0;
+	}
+	if (JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
+		INC_STAT(node, vm.eflush.heard_banshee);
+		return 0;
+	}
+	if (page == NULL) { /* nothing to flush */
+		INC_STAT(node, vm.eflush.nopage);
+		return 0;
+	}
+	if (PageWriteback(page)) { /* already under io */
+		INC_STAT(node, vm.eflush.writeback);
+		return 0;
+	}
+	/* don't flush bitmaps or journal records */
+	if (!jnode_is_znode(node) && !jnode_is_unformatted(node)) {
+		INC_STAT(node, vm.eflush.bitmap);
+		return 0;
+	}
+	/* don't flush cluster pages */
+	if (jnode_is_cluster_page(node)) {
+		INC_STAT(node, vm.eflush.clustered);
+		return 0;
+	}
+	if (JF_ISSET(node, JNODE_EFLUSH)) { /* already flushed */
+		INC_STAT(node, vm.eflush.eflushed);
+		return 0;
+	}
+	return 1;
+}
+
+#undef INC_STAT
+
+/* does node need allocation for eflushing?
*/
+static int
+needs_allocation(const jnode * node)
+{
+	return !(JF_ISSET(node, JNODE_RELOC) && !blocknr_is_fake(jnode_get_block(node)));
+}
+
+/* hash table equality callback: two keys are equal when they point to the same jnode */
+static inline int
+jnode_eq(jnode * const * j1, jnode * const * j2)
+{
+	assert("nikita-2733", j1 != NULL);
+	assert("nikita-2734", j2 != NULL);
+
+	return *j1 == *j2;
+}
+/* return the eflush hash table of the super block @node's tree belongs to */
+static ef_hash_table *
+get_jnode_enhash(const jnode *node)
+{
+	struct super_block *super;
+
+	assert("nikita-2739", node != NULL);
+
+	super = jnode_get_tree(node)->super;
+	return &get_super_private(super)->efhash_table;
+}
+/* hash function: jnode address divided by the jnode size, masked by the (power of two) bucket count */
+static inline __u32
+jnode_hfn(ef_hash_table *table, jnode * const * j)
+{
+	__u32 val;
+
+	assert("nikita-2735", j != NULL);
+	assert("nikita-3346", IS_POW(table->_buckets));
+
+	val = (unsigned long)*j;
+	val /= sizeof(**j);
+	return val & (table->_buckets - 1);
+}
+
+
+/* The hash table definition */
+#define KMALLOC(size) vmalloc(size)
+#define KFREE(ptr, size) vfree(ptr)
+TYPE_SAFE_HASH_DEFINE(ef, eflush_node_t, jnode *, node, linkage, jnode_hfn, jnode_eq);
+#undef KFREE
+#undef KMALLOC
+/* create the slab cache for eflush_node_t objects; returns 0 or -ENOMEM */
+reiser4_internal int
+eflush_init(void)
+{
+	eflush_slab = kmem_cache_create("eflush", sizeof (eflush_node_t),
+					0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (eflush_slab == NULL)
+		return RETERR(-ENOMEM);
+	else
+		return 0;
+}
+/* destroy the slab cache created by eflush_init() */
+reiser4_internal int
+eflush_done(void)
+{
+	return kmem_cache_destroy(eflush_slab);
+}
+/* set up the per-super-block eflush hash table, halving the bucket count after each -ENOMEM */
+reiser4_internal int
+eflush_init_at(struct super_block *super)
+{
+	int buckets;
+	int result;
+
+	buckets = 1 << fls(nr_free_pagecache_pages() >> 2);
+	do {
+		result = ef_hash_init(&get_super_private(super)->efhash_table,
+				      buckets,
+				      reiser4_stat(super, hashes.eflush));
+		buckets >>= 1;
+	} while(result == -ENOMEM && buckets > 0); /* give up once there are no buckets left to halve instead of retrying forever */
+	return result;
+}
+/* release the per-super-block eflush hash table */
+reiser4_internal void
+eflush_done_at(struct super_block *super)
+{
+	ef_hash_done(&get_super_private(super)->efhash_table);
+}
+/* allocate an eflush_node_t from the slab with allocation @flags; may return NULL */
+static eflush_node_t *
+ef_alloc(int flags)
+{
+	return kmem_cache_alloc(eflush_slab, flags);
+}
+/* register @node as eflushed to block *@blocknr using @ef; called with jnode and jload locks held, releases both */
+static int
+eflush_add(jnode *node, reiser4_block_nr *blocknr, eflush_node_t *ef)
+{
+	reiser4_tree *tree;
+
+	assert("nikita-2737", node != NULL);
+	assert("nikita-2738", !JF_ISSET(node, JNODE_EFLUSH));
+	assert("nikita-3382", !JF_ISSET(node, JNODE_EPROTECTED));
+	assert("nikita-2765", spin_jnode_is_locked(node));
+	assert("nikita-3381", spin_jload_is_locked(node));
+
+	tree = jnode_get_tree(node);
+
+	ef->node = node;
+	ef->blocknr = *blocknr;
+	ef->hadatom = (node->atom != NULL);
+	ef->incatom = 0;
+	jref(node); /* reference held while @node is in the hash table; dropped by jput() in eflush_del() */
+	spin_lock_eflush(tree->super);
+	ef_hash_insert(get_jnode_enhash(node), ef);
+	ON_DEBUG(++ get_super_private(tree->super)->eflushed);
+	spin_unlock_eflush(tree->super);
+
+	/*
+	 * set JNODE_EFLUSH bit on the jnode. inode is not yet pinned at this
+	 * point. We are safe, because page it is still attached to both @node
+	 * and its inode. Page cannot be released at this point, because it is
+	 * locked.
+	 */
+	JF_SET(node, JNODE_EFLUSH);
+
+	if (jnode_is_unformatted(node)) {
+		struct inode *inode;
+		reiser4_inode *info;
+
+		spin_lock_eflush(tree->super);
+
+		inode = mapping_jnode(node)->host;
+		info = reiser4_inode_data(inode);
+
+		ON_DEBUG(++ info->eflushed);
+		if (!ef->hadatom)
+			++ info->eflushed_anon;
+
+		/* this is to make inode not freeable */
+		inode->i_state |= I_EFLUSH;
+		/* add eflush node to inode's list */
+		list_add(&ef->inode_link, &info->eflushed_jnodes);
+
+		spin_unlock_eflush(tree->super);
+	}
+
+	UNLOCK_JLOAD(node);
+
+	/*
+	 * jnode_get_atom() can possible release jnode spin lock. This
+	 * means it can only be called _after_ JNODE_EFLUSH is set, because
+	 * otherwise we would have to re-check flushable() once more. No
+	 * thanks.
+	 */
+
+	if (ef->hadatom) {
+		txn_atom *atom;
+
+		atom = jnode_get_atom(node);
+		if (atom != NULL) {
+			++ atom->flushed;
+			ef->incatom = 1; /* remembered so that eflush_del() undoes the increment */
+			UNLOCK_ATOM(atom);
+		}
+	}
+
+	UNLOCK_JNODE(node);
+	return 0;
+}
+
+/* Arrghh... cast to keep hash table code happy.
*/
+#define C(node) ((jnode *const *)&(node))
+/* return pointer to the emergency block number recorded for eflushed @node; jnode spin lock must be held */
+reiser4_internal reiser4_block_nr *
+eflush_get(const jnode *node)
+{
+	eflush_node_t *ef;
+	reiser4_tree *tree;
+
+	assert("nikita-2740", node != NULL);
+	assert("nikita-2741", JF_ISSET(node, JNODE_EFLUSH));
+	assert("nikita-2767", spin_jnode_is_locked(node));
+
+
+	tree = jnode_get_tree(node);
+	spin_lock_eflush(tree->super);
+	ef = ef_hash_find(get_jnode_enhash(node), C(node));
+	spin_unlock_eflush(tree->super);
+
+	assert("nikita-2742", ef != NULL);
+	return &ef->blocknr;
+}
+/* undo eflush_add() for @node if it is eflushed; entered and left with jnode lock held, which is dropped in between */
+reiser4_internal void
+eflush_del(jnode *node, int page_locked)
+{
+	assert("nikita-2743", node != NULL);
+	assert("nikita-2770", spin_jnode_is_locked(node));
+
+	if (unlikely(JF_ISSET(node, JNODE_EFLUSH))) {
+		eflush_node_t *ef;
+		ef_hash_table *table;
+		reiser4_tree *tree;
+		txn_atom *atom;
+		struct page *page;
+		struct inode *inode = NULL;
+
+		reiser4_block_nr blk;
+
+		table = get_jnode_enhash(node);
+		tree = jnode_get_tree(node);
+		page = jnode_page(node);
+		if (page != NULL)
+			page_cache_get(page);
+
+		/* there is no reason to unflush node if it can be flushed
+		 * back immediately. Unfortunately, this assertion requires
+		 * jload lock. */
+		/* assert("nikita-3083", !flushable(node, page) || page_locked); */
+
+		assert("nikita-2806", ergo(page_locked, page != NULL));
+		assert("nikita-2807", ergo(page_locked, PageLocked(page)));
+		if (page != NULL) {
+			/* emergency flush hasn't reclaimed page yet. Wait
+			 * until io is submitted. Otherwise there is a room
+			 * for a race: emergency_flush() calls page_io() and
+			 * we clear JNODE_EFLUSH bit concurrently---page_io()
+			 * gets wrong block number. */
+			UNLOCK_JNODE(node);
+			if (!page_locked)
+				lock_page(page);
+			wait_on_page_writeback(page);
+			LOCK_JNODE(node);
+
+			if (unlikely(!JF_ISSET(node, JNODE_EFLUSH))) {
+				/*
+				 * race: some other thread unflushed jnode.
+				 */
+				if (!page_locked)
+					unlock_page(page);
+				page_cache_release(page);
+				return;
+			}
+			/*
+			 * either jnode was dirty or page was dirtied through
+			 * mmap. Page's dirty bit was cleared before io was
+			 * submitted. If page is left clean, we would have
+			 * dirty jnode with clean page. Neither ->writepage()
+			 * nor ->releasepage() can free it. Re-dirty page, so
+			 * ->writepage() will be called again if necessary.
+			 */
+			set_page_dirty_internal(page);
+		}
+		assert("nikita-2766", atomic_read(&node->x_count) > 1);
+
+		spin_lock_eflush(tree->super);
+		ef = ef_hash_find(table, C(node));
+		assert("nikita-2745", ef != NULL);
+		blk = ef->blocknr;
+		ef_hash_remove(table, ef); /* @ef stays allocated until kmem_cache_free() below */
+		ON_DEBUG(-- get_super_private(tree->super)->eflushed);
+		spin_unlock_eflush(tree->super);
+
+		if (ef->incatom) {
+			atom = jnode_get_atom(node);
+			assert("nikita-3311", atom != NULL);
+			-- atom->flushed;
+			UNLOCK_ATOM(atom);
+		}
+
+		assert("vs-1215", JF_ISSET(node, JNODE_EFLUSH));
+		JF_CLR(node, JNODE_EFLUSH);
+
+		if (jnode_is_unformatted(node)) {
+			reiser4_inode *info;
+
+			spin_lock_eflush(tree->super);
+
+			inode = mapping_jnode(node)->host;
+			info = reiser4_inode_data(inode);
+			assert("vs-1194", info->eflushed > 0);
+			ON_DEBUG(-- info->eflushed);
+			if (!ef->hadatom)
+				-- info->eflushed_anon;
+			/* remove eflush node from inode's list of eflush
+			 * nodes */
+			list_del(&ef->inode_link);
+			if (list_empty(&info->eflushed_jnodes)) {
+				assert("nikita-3355", info->eflushed == 0);
+				inode->i_state &= ~I_EFLUSH;
+			}
+
+			spin_unlock_eflush(tree->super);
+		}
+		UNLOCK_JNODE(node);
+
+		if (blocknr_is_fake(jnode_get_block(node)))
+			assert ("zam-817", ef->initial_stage == BLOCK_UNALLOCATED);
+		else
+			assert ("zam-818", ef->initial_stage == BLOCK_GRABBED);
+
+		jput(node); /* drops the reference taken by jref() in eflush_add(); x_count > 1 was asserted above */
+
+		ef_free_block(node, &blk,
+			      blocknr_is_fake(jnode_get_block(node)) ?
+			      BLOCK_UNALLOCATED : BLOCK_GRABBED, ef);
+
+		kmem_cache_free(eflush_slab, ef);
+
+		if (page != NULL) {
+			if (!page_locked)
+				unlock_page(page);
+			page_cache_release(page);
+		}
+
+		LOCK_JNODE(node);
+	}
+}
+/* if @node is eflushed, bring it back with jload(), wait for page writeback, then jrelse(); returns the jload() result or 0 */
+reiser4_internal int
+emergency_unflush(jnode *node)
+{
+	int result;
+
+	assert("nikita-2778", node != NULL);
+	assert("nikita-3046", schedulable());
+
+	if (JF_ISSET(node, JNODE_EFLUSH)) {
+		result = jload(node);
+		if (result == 0) {
+			struct page *page;
+
+			assert("nikita-2777", !JF_ISSET(node, JNODE_EFLUSH));
+			page = jnode_page(node);
+			assert("nikita-2779", page != NULL);
+			wait_on_page_writeback(page);
+
+			jrelse(node);
+		}
+	} else
+		result = 0;
+	return result;
+}
+/* block allocator flags for @node: BA_FORMATTED for znodes, 0 otherwise */
+static reiser4_ba_flags_t
+ef_block_flags(const jnode *node)
+{
+	return jnode_is_znode(node) ? BA_FORMATTED : 0;
+}
+/* free emergency block *@blk of @node; for BLOCK_GRABBED stage also hand the grabbed block back (see below) */
+static int ef_free_block(jnode *node,
+			 const reiser4_block_nr *blk,
+			 block_stage_t stage, eflush_node_t *ef)
+{
+	int result = 0;
+	reiser4_block_nr one;
+
+	one = 1ull;
+	/* We cannot just ask block allocator to return block into flush
+	 * reserved space, because there is no current atom at this point. */
+	result = reiser4_dealloc_blocks(blk, &one, stage, ef_block_flags(node));
+	if (result == 0 && stage == BLOCK_GRABBED) {
+		txn_atom *atom;
+
+		if (ef->reserve) { /* NOTE(review): @ef is dereferenced unconditionally; emergency_flush() can pass NULL when ef_alloc() failed */
+			/* further, transfer block from grabbed into flush
+			 * reserved space. */
+			LOCK_JNODE(node);
+			atom = jnode_get_atom(node);
+			assert("nikita-2785", atom != NULL);
+			grabbed2flush_reserved_nolock(atom, 1);
+			UNLOCK_ATOM(atom);
+			JF_SET(node, JNODE_FLUSH_RESERVED);
+			UNLOCK_JNODE(node);
+		} else {
+			reiser4_context * ctx = get_current_context();
+			grabbed2free(ctx, get_super_private(ctx->super),
+				     (__u64)1);
+		}
+	}
+	return result;
+}
+/* reserve space, allocate emergency block *@blk and *@efnode for @node; locks are dropped around allocation and held again on return */
+static int
+ef_prepare(jnode *node, reiser4_block_nr *blk, eflush_node_t **efnode, reiser4_blocknr_hint * hint)
+{
+	int result;
+	reiser4_block_nr one;
+	int usedreserve;
+
+	assert("nikita-2760", node != NULL);
+	assert("nikita-2761", blk != NULL);
+	assert("nikita-2762", efnode != NULL);
+	assert("nikita-2763", spin_jnode_is_locked(node));
+	assert("nikita-3387", spin_jload_is_locked(node));
+
+	hint->blk = EFLUSH_START_BLOCK;
+	hint->max_dist = 0;
+	hint->level = jnode_get_level(node);
+	usedreserve = 0;
+	if (blocknr_is_fake(jnode_get_block(node)))
+		hint->block_stage = BLOCK_UNALLOCATED;
+	else {
+		txn_atom *atom;
+		switch (jnode_is_leaf(node)) {
+		default:
+			/* We cannot just ask block allocator to take block from
+			 * flush reserved space, because there is no current
+			 * atom at this point. */
+			atom = jnode_get_atom(node);
+			if (atom != NULL) {
+				if (JF_ISSET(node, JNODE_FLUSH_RESERVED)) {
+					usedreserve = 1;
+					flush_reserved2grabbed(atom, 1);
+					JF_CLR(node, JNODE_FLUSH_RESERVED);
+					UNLOCK_ATOM(atom);
+					break;
+				} else
+					UNLOCK_ATOM(atom);
+			}
+			/* fall through */
+			/* node->atom == NULL if page was dirtied through
+			 * mmap */
+		case 0:
+			result = reiser4_grab_space_force((__u64)1, BA_RESERVED);
+			grab_space_enable();
+			if (result) {
+				warning("nikita-3323",
+					"Cannot allocate eflush block");
+				return result; /* both locks are still held here, as on the normal return path */
+			}
+		}
+
+		hint->block_stage = BLOCK_GRABBED;
+	}
+
+	/* XXX protect @node from being concurrently eflushed. Otherwise,
+	 * there is a danger of underflowing block space */
+	UNLOCK_JLOAD(node);
+	UNLOCK_JNODE(node);
+
+	one = 1ull;
+	result = reiser4_alloc_blocks(hint, blk, &one, ef_block_flags(node));
+	if (result == 0) {
+		*efnode = ef_alloc(GFP_NOFS | __GFP_HIGH);
+		if (*efnode == NULL)
+			result = RETERR(-ENOMEM); /* *blk stays allocated; this file's only caller, emergency_flush(), frees it */
+		else {
+#if REISER4_DEBUG
+			(*efnode)->initial_stage = hint->block_stage;
+#endif
+			(*efnode)->reserve = usedreserve;
+		}
+	}
+	LOCK_JNODE(node);
+	LOCK_JLOAD(node);
+	return result;
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   LocalWords: " unflush eflushed LocalWords eflush writepage VM releasepage unflushing io "
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/emergency_flush.h linux-2.6.4-ck1/fs/reiser4/emergency_flush.h
--- linux-2.6.4/fs/reiser4/emergency_flush.h	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/emergency_flush.h	2004-03-11 22:45:15.225520159 +1100
@@ -0,0 +1,58 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Emergency flush */
+
+#ifndef __EMERGENCY_FLUSH_H__
+#define __EMERGENCY_FLUSH_H__
+
+
+/* this bit is set when inode gets first eflushed jnode (eflush_add()). It is cleared when last eflushed jnode is
+   eunflushed (eflush_del()).
It solely exists to prevent inodes having eflushed jnodes from being pruned
+   (fs/inode.c:can_unuse()) */
+#define I_EFLUSH (256)
+
+#include "block_alloc.h"
+
+struct eflush_node;
+typedef struct eflush_node eflush_node_t;
+
+TYPE_SAFE_HASH_DECLARE(ef, eflush_node_t);
+
+struct eflush_node {
+	jnode *node; /* eflushed jnode; hash table key */
+	reiser4_block_nr blocknr; /* emergency block @node was written to */
+	ef_hash_link linkage;
+	struct list_head inode_link; /* for per inode list of eflush nodes */
+	unsigned int hadatom :1; /* node->atom was not NULL in eflush_add(); unsigned so the one-bit flag can hold 1 */
+	unsigned int incatom :1; /* eflush_add() incremented atom->flushed; eflush_del() must decrement it */
+	unsigned int reserve :1; /* block was taken from flush reserved space in ef_prepare() */
+#if REISER4_DEBUG
+	block_stage_t initial_stage;
+#endif
+};
+
+int eflush_init(void);
+int eflush_done(void);
+
+extern int eflush_init_at(struct super_block *super);
+extern void eflush_done_at(struct super_block *super);
+
+extern reiser4_block_nr *eflush_get(const jnode *node);
+extern void eflush_del(jnode *node, int page_locked);
+
+int emergency_flush(struct page *page);
+int emergency_unflush(jnode *node);
+
+
+/* __EMERGENCY_FLUSH_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/entd.c linux-2.6.4-ck1/fs/reiser4/entd.c
--- linux-2.6.4/fs/reiser4/entd.c	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/entd.c	2004-03-11 22:45:15.226520003 +1100
@@ -0,0 +1,488 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Ent daemon. */
+
+#include "debug.h"
+#include "kcond.h"
+#include "txnmgr.h"
+#include "tree.h"
+#include "entd.h"
+#include "super.h"
+#include "context.h"
+#include "reiser4.h"
+
+#include /* struct task_struct */
+#include
+#include
+#include
+#include /* INITIAL_JIFFIES */
+#include /* bdi_write_congested */
+
+/*
+ * set this to 0 if you don't want to use wait-for-flush in ->writepage(). This
+ * is useful for debugging emergency flush, for example.
+ */ +#define USE_ENTD (0) + +#define DEF_PRIORITY 12 + +static void entd_flush(struct super_block *super); +static int entd(void *arg); + +/* rename current thread to "ent:<s_id><state>". Uses a variable named "super" from the scope of the caller. */ +#define entd_set_comm(state) \ + snprintf(current->comm, sizeof(current->comm), \ + "ent:%s%s", super->s_id, (state)) + +/* return ent context kept in the private part of @super */ +static inline entd_context * +get_entd_context(struct super_block *super) +{ + return &get_super_private(super)->entd; +} + +/* initialize ent context of @super, start the ent thread and wait until it has stored itself in ctx->tsk */ +reiser4_internal void +init_entd_context(struct super_block *super) +{ + entd_context * ctx; + + assert("nikita-3104", super != NULL); + + ctx = get_entd_context(super); + + xmemset(ctx, 0, sizeof *ctx); + kcond_init(&ctx->startup); + kcond_init(&ctx->wait); + init_completion(&ctx->finish); + spin_lock_init(&ctx->guard); +#if REISER4_DEBUG + /* context is shared with the thread started below, so finish + initializing it first */ + flushers_list_init(&ctx->flushers_list); +#endif + + /* NOTE(review): result of kernel_thread() is ignored. If the thread + cannot be created the loop below never terminates; reporting this + would require changing the void interface. */ + kernel_thread(entd, super, CLONE_VM | CLONE_FS | CLONE_FILES); + + spin_lock(&ctx->guard); + while (ctx->tsk == NULL) + kcond_wait(&ctx->startup, &ctx->guard, 0); + spin_unlock(&ctx->guard); +} + +/* body of the ent thread: publish itself in ctx->tsk, then sleep on ctx->wait + and call entd_flush() after each wake-up until ctx->done is set */ +static int +entd(void *arg) +{ + struct super_block *super; + struct task_struct *me; + entd_context *ctx; + + super = arg; + /* standard kernel thread prologue */ + me = current; + /* reparent_to_init() is done by daemonize() */ + daemonize("ent:%s", super->s_id); + + /* block all signals */ + spin_lock_irq(&me->sighand->siglock); + siginitsetinv(&me->blocked, 0); + recalc_sigpending(); + spin_unlock_irq(&me->sighand->siglock); + + /* do_fork() just copies task_struct into the new + thread. ->fs_context shouldn't be copied of course. This shouldn't + be a problem for the rest of the code though. 
+ */ + me->fs_context = NULL; + + ctx = get_entd_context(super); + + spin_lock(&ctx->guard); + ctx->tsk = me; + kcond_broadcast(&ctx->startup); + spin_unlock(&ctx->guard); + while (1) { + int result; + + if (me->flags & PF_FREEZE) + refrigerator(PF_IOTHREAD); + + entd_set_comm("."); + spin_lock(&ctx->guard); + ctx->kicks_pending = 0; + /* done_entd_context() may have signalled while we were not + waiting; a kcond_signal() sent when nobody waits is + presumably lost, so check ->done under ->guard before + going to sleep */ + result = 0; + if (!ctx->done) + result = kcond_wait(&ctx->wait, &ctx->guard, 1); + + /* we are asked to exit */ + if (ctx->done) { + spin_unlock(&ctx->guard); + break; + } + + spin_unlock(&ctx->guard); + entd_set_comm("!"); + if (result == 0) + entd_flush(super); + else if (result != -EINTR) + /* some other error (kernel error codes are negative; + assumes kcond_wait() follows that convention) */ + warning("nikita-3099", "Error: %i", result); + } + + complete_and_exit(&ctx->finish, 0); + /* not reached. */ + return 0; +} + +/* ask the ent thread of @super to exit and wait until it has completed ctx->finish */ +reiser4_internal void +done_entd_context(struct super_block *super) +{ + entd_context * ctx; + + assert("nikita-3103", super != NULL); + + ctx = get_entd_context(super); + + spin_lock(&ctx->guard); + ctx->done = 1; + kcond_signal(&ctx->wait); + spin_unlock(&ctx->guard); + + /* wait until daemon finishes */ + wait_for_completion(&ctx->finish); +} + +/* account current thread as a flusher of @super: increase ctx->flushers and + store flush start time both in the ent context and in the current reiser4 + context */ +reiser4_internal void +enter_flush(struct super_block *super) +{ + entd_context * ctx; + reiser4_context * cur; + + assert("nikita-3105", super != NULL); + assert("nikita-3118", + get_current_context()->flush_started == INITIAL_JIFFIES); + + ctx = get_entd_context(super); + cur = get_current_context(); + + spin_lock(&ctx->guard); + ctx->last_flush = jiffies; + ctx->flushers += 1; +#if REISER4_DEBUG + flushers_list_push_front(&ctx->flushers_list, get_current_context()); +#endif + spin_unlock(&ctx->guard); + cur->flush_started = ctx->last_flush; + cur->io_started = INITIAL_JIFFIES; +} + +/* shift used by flush_started_io() when it averages ctx->timeout */ +static const int decay = 3; + +reiser4_internal void flush_started_io(void) +{ + entd_context * ctx; + reiser4_context * cur; + unsigned long delta; + unsigned long now; + + cur = get_current_context(); + ctx = get_entd_context(cur->super); + + if (cur->io_started != INITIAL_JIFFIES) + return; + + now 
= jiffies; + delta = now - cur->flush_started; + assert("nikita-3114", time_after_eq(now, cur->flush_started)); + cur->io_started = now; + + spin_lock(&ctx->guard); + ctx->timeout = (delta + ((1 << decay) - 1) * ctx->timeout) >> decay; + /* confine ctx->timeout within [1 .. HZ/20] */ + if (ctx->timeout > HZ / 20) + ctx->timeout = HZ / 20; + if (ctx->timeout < 1) + ctx->timeout = 1; + spin_unlock(&ctx->guard); +} + +reiser4_internal void leave_flush(struct super_block *super) +{ + entd_context * ctx; + + assert("nikita-3105", super != NULL); + + ctx = get_entd_context(super); + + spin_lock(&ctx->guard); + + assert("nikita-3117", ctx->flushers > 0); + assert("nikita-3115", + get_current_context()->flush_started != INITIAL_JIFFIES); + + ctx->flushers -= 1; + if (ctx->flushers == 0) + ctx->last_flush = INITIAL_JIFFIES; +#if REISER4_DEBUG + flushers_list_remove_clean(get_current_context()); +#endif + spin_unlock(&ctx->guard); + get_current_context()->flush_started = INITIAL_JIFFIES; +} + +static int get_flushers(struct super_block *super, unsigned long *flush_start) +{ + entd_context * ctx; + int result; + + assert("nikita-3106", super != NULL); + assert("nikita-3108", flush_start != NULL); + + ctx = get_entd_context(super); + + /* NOTE: locking is silly */ + spin_lock(&ctx->guard); + *flush_start = ctx->last_flush; + result = ctx->flushers; + spin_unlock(&ctx->guard); + return result; +} + +static void kick_entd(struct super_block *super) +{ + entd_context * ctx; + assert("nikita-3109", super != NULL); + + ctx = get_entd_context(super); + + spin_lock(&ctx->guard); + if (ctx->kicks_pending == 0) + kcond_signal(&ctx->wait); + ++ ctx->kicks_pending; + spin_unlock(&ctx->guard); +} + +/* + * return true if we are done with @page (it is clean), or something really + * wrong happened and wait_for_flush() is looping. + * + * Used in wait_for_flush(), which see for more details. 
+ */ +static int +is_writepage_done(jnode *node) +{ + reiser4_stat_inc(wff.iteration); + /* + * if flush managed to process this node we are done. + */ + if (jnode_check_flushprepped(node)) { + reiser4_stat_inc(wff.cleaned); + return 1; + } + /* + * jnode removed from the tree (truncate or balancing) + */ + if (JF_ISSET(node, JNODE_HEARD_BANSHEE)) { + reiser4_stat_inc(wff.removed); + return 1; + } + + return 0; +} + +/* + * return true if calling thread is either ent thread or the only flusher for + * this file system. Used in wait_for_flush(), which see for more details. + */ +static int dont_wait_for_flush(struct super_block *super) +{ + reiser4_context * cur; + unsigned long flush_started; + + cur = get_current_context(); + + if (cur->entd) { + reiser4_stat_inc(wff.skipped_ent); + return 1; + } + + if (get_flushers(super, &flush_started) == 1 && + cur->flush_started != INITIAL_JIFFIES) { + reiser4_stat_inc(wff.skipped_last); + return 1; + } + return 0; +} + +#define WFF_MAX_ITERATIONS (3) + +/* + * This function uses some heuristic algorithm that results in @page being + * cleaned by normal flushing (flush.c) in most cases. Reason for this is + * whenever possible to avoid emergency flush (emergency_flush.c) that doesn't + * perform disk layout optimization. + * + * Algorithm: + * + * 1. there is dedicated per-super block "ent" (from Tolkien's LOTR) thread + * used to start flushing if no other flushers are active. It is called an + * ent because it takes care of trees, it requires awakening, and once + * awakened it might do a lot. + * + * 2. our goal is to wait for some reasonable amount of time ("timeout") in + * hope that ongoing concurrent flush would process and clean @page. + * + * 3. specifically we wait until following happens: + * + * there is flush (possibly being done by the ent) that started more + * than timeout ago, + * + * and + * + * device queue is not congested. 
+ * + * + * Intuitively this means that flush stalled, probably waiting for free + * memory. + * + * Tricky part here is selection of timeout value. Probably it should be + * dynamically adjusting based on CPU load and average time it takes flush to + * start submitting nodes. + * + * Return: + * + * > 0 we are done with page (it has been cleaned, or we decided we don't + * want to deal with it this time) + * < 0 some error occurred + * 0 no luck, proceed with emergency flush + * + * @wbc is not used by the current implementation. + * + */ +reiser4_internal int +wait_for_flush(struct page *page, jnode *node, struct writeback_control *wbc) +{ + struct backing_dev_info *bdi; + int flushers; + unsigned long flush_started; + unsigned long timeout; + int result; + int iterations; + struct super_block *super; + + bdi = page->mapping->backing_dev_info; + super = page->mapping->host->i_sb; + timeout = get_entd_context(super)->timeout; + + reiser4_stat_inc(wff.asked); + + result = 0; + iterations = 0; + + /* wait-for-flush is switched off at compile time: tell the caller to + proceed with emergency flush */ + if (!USE_ENTD) + return 0; + + while (result == 0) { + flushers = get_flushers(super, &flush_started); + /* + * if there is no flushing going on---launch ent thread. + */ + if (flushers == 0) { + reiser4_stat_inc(wff.kicked); + kick_entd(super); + } + + /* + * if memory pressure is low, do nothing + */ + if (0/*page_zone(page)->prev_priority > (DEF_PRIORITY - 3)*/) { + reiser4_stat_inc(wff.low_priority); + result = 1; + break; + } + + /* + * we don't want to apply usual wait-for-flush logic in + * ->writepage() if current thread is ent or, more generally, + * if it is the only active flusher in this file + * system. Otherwise we get some thread waiting for flush to + * clean some pages and flush is waiting for nothing. This + * brings VM scanning to almost complete halt. 
+ */ + if (dont_wait_for_flush(super)) + break; + + /* + * wait until at least one flushing thread is running for at + * least @timeout + */ + if (flushers != 0 && + time_before(flush_started + timeout, jiffies)) + break; + + /* + * schedule_timeout() only sleeps if the task state was changed + * from TASK_RUNNING beforehand; otherwise it returns at once + * and this loop degenerates into busy polling. + */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(timeout); + reiser4_stat_inc(wff.wait_flush); + + /* + * if flush managed to clean this page we are done. + */ + result = is_writepage_done(node); + + /* + * check for some weird condition to avoid stalling memory + * scan. + */ + if (++ iterations > WFF_MAX_ITERATIONS) { + reiser4_stat_inc(wff.toolong); + break; + } + } + + if (result == 0) + result = is_writepage_done(node); + + /* + * at this point we are either done (result != 0), or there is flush + * going on for at least @timeout. If device is congested, we + * conjecture that flush is actively progressing (as opposed to being + * stalled). + */ + if (result == 0 && bdi_write_congested(bdi)) { + reiser4_stat_inc(wff.skipped_congested); + result = 1; + } + + /* + * at this point either the scanning priority is low and we choose to + * not wait, or we flushed something, or there was a flushing thread + * going on for at least @timeout but nothing was sent down to the + * disk. Probably flush stalls waiting for memory. This shouldn't + * happen often for normal file system loads, because balance dirty + * pages ensures there are enough clean pages around. + */ + return result; +} + +/* one round of flushing by the ent thread: enter a reiser4 context marked with + ->entd, call flush_some_atom() and warn if it fails */ +static void entd_flush(struct super_block *super) +{ + long nr_submitted; + int result; + reiser4_context txn; + struct writeback_control wbc = { + .bdi = NULL, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = 0, + .nonblocking = 0, + }; + + init_context(&txn, super); + + txn.entd = 1; + + result = flush_some_atom(&nr_submitted, &wbc, 0); + if (result != 0) + warning("nikita-3100", "Flush failed: %i", result); + reiser4_exit_context(&txn); +} + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/entd.h linux-2.6.4-ck1/fs/reiser4/entd.h --- linux-2.6.4/fs/reiser4/entd.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/entd.h 2004-03-11 22:45:15.226520003 +1100 @@ -0,0 +1,53 @@ +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Ent daemon. */ + +#ifndef __ENTD_H__ +#define __ENTD_H__ + +#include "kcond.h" +#include "context.h" + +#include +#include +#include +#include /* for struct task_struct */ + +typedef struct entd_context { + kcond_t startup; + struct completion finish; + kcond_t wait; + spinlock_t guard; + struct task_struct *tsk; + int done; + int kicks_pending; + unsigned long last_flush; + int flushers; + unsigned long timeout; +#if REISER4_DEBUG + flushers_list_head flushers_list; +#endif +} entd_context; + +extern void init_entd_context(struct super_block *super); +extern void done_entd_context(struct super_block *super); + +extern void enter_flush(struct super_block *super); +extern void leave_flush(struct super_block *super); +extern void flush_started_io(void); + +extern int wait_for_flush(struct page *page, + jnode *node, struct writeback_control *wbc); + +/* __ENTD_H__ */ +#endif + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/eottl.c linux-2.6.4-ck1/fs/reiser4/eottl.c --- linux-2.6.4/fs/reiser4/eottl.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/eottl.c 2004-03-11 22:45:15.227519848 +1100 @@ -0,0 +1,370 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "forward.h" +#include "debug.h" +#include "key.h" +#include "coord.h" +#include "plugin/item/item.h" +#include "plugin/node/node.h" +#include "znode.h" +#include "block_alloc.h" +#include "tree_walk.h" +#include "tree_mod.h" +#include "carry.h" +#include "tree.h" +#include "super.h" + +#include /* for __u?? */ + +/* Extents on the twig level (EOTTL) handling. + + EOTTL poses some problems to the tree traversal, that are better + explained by example. + + Suppose we have block B1 on the twig level with the following items: + + 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id, offset) + 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each + 2. internal item I2 with key (10:0:0:0) + + We are trying to insert item with key (5:0:0:0). Lookup finds node + B1, and then intra-node lookup is done. This lookup finished on the + E1, because the key we are looking for is larger than the key of E1 + and is smaller than key the of I2. + + Here search is stuck. + + After some thought it is clear what is wrong here: extents on the + twig level break some basic property of the *search* tree (on the + pretext, that they restore property of balanced tree). + + Said property is the following: if in the internal node of the search + tree we have [ ... Key1 Pointer Key2 ... ] then, all data that are or + will be keyed in the tree with the Key such that Key1 <= Key < Key2 + are accessible through the Pointer. 
+ + This is not true, when Pointer is Extent-Pointer, simply because + extent cannot expand indefinitely to the right to include any item + with + + Key1 <= Key < Key2. + + For example, our E1 extent is only responsible for the data with keys + + (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and + + so, key range + + ( (1:4:100:0xffffffffffffffff), (10:0:0:0) ) + + is orphaned: there is no way to get there from the tree root. + + In other words, extent pointers are different than normal child + pointers as far as search tree is concerned, and this creates such + problems. + + Possible solution for this problem is to insert our item into node + pointed to by I2. There are some problems though: + + (1) I2 can be in a different node. + (2) E1 can be immediately followed by another extent E2. + + (1) is solved by calling reiser4_get_right_neighbor() and accounting + for locks/coords as necessary. + + (2) is more complex. Solution here is to insert new empty leaf node + and insert internal item between E1 and E2 pointing to said leaf + node. This is further complicated by possibility that E2 is in a + different node, etc. + + Problems: + + (1) if there was internal item I2 immediately on the right of an + extent E1 and we decided to insert new item S1 into node N2 + pointed to by I2, then key of S1 will be less than smallest key in + the N2. Normally, search checks that the key we are looking for is in + the range of keys covered by the node the key is being looked up in. To work + around this situation, while preserving useful consistency check, + new flag CBK_TRUST_DK was added to the cbk flags bitmask. This flag + is automatically set on entrance to the coord_by_key() and is only + cleared when we are about to enter situation described above. 
+ + (2) If extent E1 is immediately followed by another extent E2 and we + are searching for the key that is between E1 and E2 we only have to + insert new empty leaf node when coord_by_key was called for + insertion, rather than just for lookup. To distinguish these cases, + new flag CBK_FOR_INSERT was added to the cbk flags bitmask. This flag + is automatically set by coord_by_key calls performed by + insert_by_key() and friends. + + (3) Insertion of new empty leaf node (possibly) requires + balancing. In any case it requires modification of node content which + is only possible under write lock. It may well happen that we only + have read lock on the node where new internal pointer is to be + inserted (common case: lookup of non-existent stat-data that falls + between two extents). If only read lock is held, tree traversal is + restarted with lock_level modified so that next time we hit this + problem, write lock will be held. Once we have write lock, balancing + will be performed. + + + + + + +*/ + +/* look to the right of @coord. If it is an item of internal type - 1 is + returned. 
NOTE: only the node @coord is in gets examined. When @coord is at the last + item of its node, 0 is returned and the right neighbor is not looked at (there + is no lock handle argument to switch). When 1 is returned, @coord has been + moved to the internal item +*/ +/* Audited by: green(2002.06.15) */ +static int +is_next_item_internal(coord_t * coord) +{ + if (coord->item_pos != node_num_items(coord->node) - 1) { + /* next item is in the same node */ + coord_t right; + + coord_dup(&right, coord); + check_me("vs-742", coord_next_item(&right) == 0); + if (item_is_internal(&right)) { + coord_dup(coord, &right); + return 1; + } + } + return 0; +} + +/* inserting empty leaf after (or between) item of not internal type we have to + know which right delimiting key corresponding znode has to be inserted with. + The key is stored in @key, which is also returned */ +static reiser4_key * +rd_key(coord_t * coord, reiser4_key * key) +{ + coord_t dup; + + assert("nikita-2281", coord_is_between_items(coord)); + coord_dup(&dup, coord); + + RLOCK_DK(current_tree); + + if (coord_set_to_right(&dup) == 0) + /* get right delimiting key from an item to the right of @coord */ + unit_key_by_coord(&dup, key); + else + /* use right delimiting key of parent znode */ + *key = *znode_get_rd_key(coord->node); + + RUNLOCK_DK(current_tree); + return key; +} + + +ON_DEBUG(void check_dkeys(const znode *);) + +/* this is used to insert empty node into leaf level if tree lookup can not go + further down because it stopped between items of not internal type. + + New leaf gets delimiting keys [@key, @rdkey] and a pointer to it is inserted + at @insert_coord by carry; @lh is given to carry as the tracked lock handle. + Returns 0 on success, error code otherwise */ +static int +add_empty_leaf(coord_t * insert_coord, lock_handle * lh, const reiser4_key * key, const reiser4_key * rdkey) +{ + int result; + carry_pool pool; + carry_level todo; + carry_op *op; + znode *parent_node; + znode *node; + reiser4_item_data item; + carry_insert_data cdata; + reiser4_tree *tree; + + init_carry_pool(&pool); + init_carry_level(&todo, &pool); + ON_STATS(todo.level_no = TWIG_LEVEL); + assert("vs-49827", znode_contains_key_lock(insert_coord->node, key)); + + tree = znode_get_tree(insert_coord->node); + node = new_node(insert_coord->node, 
LEAF_LEVEL); + if (IS_ERR(node)) { + /* every other exit path releases the carry pool, do so here + too */ + done_carry_pool(&pool); + return PTR_ERR(node); + } + /* setup delimiting keys for node being inserted */ + WLOCK_DK(tree); + znode_set_ld_key(node, key); + znode_set_rd_key(node, rdkey); + ON_DEBUG(node->creator = current); + ON_DEBUG(node->first_key = *key); + WUNLOCK_DK(tree); + + ZF_SET(node, JNODE_ORPHAN); + parent_node = insert_coord->node; + op = post_carry(&todo, COP_INSERT, insert_coord->node, 0); + if (!IS_ERR(op)) { + cdata.coord = insert_coord; + cdata.key = key; + cdata.data = &item; + op->u.insert.d = &cdata; + op->u.insert.type = COPT_ITEM_DATA; + build_child_ptr_data(node, &item); + item.arg = NULL; + /* have @insert_coord to be set at inserted item after + insertion is done */ + todo.track_type = CARRY_TRACK_CHANGE; + todo.tracked = lh; + + result = carry(&todo, 0); + if (result == 0) { + /* lock handle for the new child; deliberately not + called "lh" so that it does not shadow the + parameter */ + lock_handle child_lh; + + result = zload(node); + if (result == 0) { + /* + * if we inserted new child into tree we have + * to mark it dirty so that flush will be able + * to process it. + */ + init_lh(&child_lh); + result = longterm_lock_znode(&child_lh, node, + ZNODE_WRITE_LOCK, + ZNODE_LOCK_LOPRI); + if (result == 0) { + znode_make_dirty(node); + done_lh(&child_lh); + } else { + warning("nikita-3136", + "Cannot dirty child"); + print_znode("child", node); + } + zrelse(node); + } + } + } else + result = PTR_ERR(op); + zput(node); + done_carry_pool(&pool); + if (result == 0) { + /* balancing probably shifted @insert_coord into different + node. Reload. 
*/ + if (parent_node != insert_coord->node) { + zrelse(parent_node); + result = zload(insert_coord->node); + coord_clear_iplug(insert_coord); + } + if (result == 0) { + WLOCK_TREE(tree); + assert("nikita-3312", znode_is_right_connected(node)); + assert("nikita-2984", node->right == NULL); + ZF_CLR(node, JNODE_RIGHT_CONNECTED); + WUNLOCK_TREE(tree); + result = connect_znode(insert_coord, node); + if (result == 0) + ON_DEBUG(check_dkeys(node)); + } + } + return result; +} + +/* handle extent-on-the-twig-level cases in tree traversal */ +reiser4_internal int +handle_eottl(cbk_handle * h /* cbk handle */ , + int *outcome /* how traversal should proceed */ ) +{ + int result; + reiser4_key key; + coord_t *coord; + + coord = h->coord; + + if (h->level != TWIG_LEVEL || (coord_is_existing_item(coord) && item_is_internal(coord))) { + /* Continue to traverse tree downward. */ + return 0; + } + /* strange item type found on non-stop level?! Twig + horrors? */ + assert("vs-356", h->level == TWIG_LEVEL); + assert("vs-357", ( { + coord_t lcoord; + coord_dup(&lcoord, coord); + check_me("vs-733", coord_set_to_left(&lcoord) == 0); + item_is_extent(&lcoord);} + )); + + if (*outcome == NS_FOUND) { + /* we have found desired key on twig level in extent item */ + h->result = CBK_COORD_FOUND; + reiser4_stat_inc(tree.cbk_found); + *outcome = LOOKUP_DONE; + return 1; + } + + if (!(h->flags & CBK_FOR_INSERT)) { + /* tree traversal is not for insertion. Just return + CBK_COORD_NOTFOUND. */ + h->result = CBK_COORD_NOTFOUND; + *outcome = LOOKUP_DONE; + return 1; + } + + /* take a look at the item to the right of h -> coord */ + result = is_next_item_internal(coord); + if (result < 0) { + /* error occurred while we were trying to look at the item to + the right. NOTE(review): is_next_item_internal() as + written only returns 0 or 1, so this branch looks + unreachable */ + h->error = "could not check next item"; + h->result = result; + *outcome = LOOKUP_DONE; + return 1; + } else if (result == 0) { + + /* item to the right is not internal one. 
Allocate a new node + and insert pointer to it after item h -> coord. + + This is a result of extents being located at the twig + level. For explanation, see comment just above + is_next_item_internal(). + */ + if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) { + /* we got node read locked, restart coord_by_key to + have write lock on twig level */ + h->lock_level = TWIG_LEVEL; + h->lock_mode = ZNODE_WRITE_LOCK; + *outcome = LOOKUP_REST; + return 1; + } + + result = add_empty_leaf(coord, h->active_lh, h->key, rd_key(coord, &key)); + if (result) { + h->error = "could not add empty leaf"; + h->result = result; + *outcome = LOOKUP_DONE; + return 1; + } + assert("vs-358", keyeq(h->key, item_key_by_coord(coord, &key))); + } else { + /* this is special case mentioned in the comment on + tree.h:cbk_flags. We have found internal item immediately + on the right of extent, and we are going to insert new item + there. Key of item we are going to insert is smaller than + leftmost key in the node pointed to by said internal item + (otherwise search wouldn't come to the extent in the first + place). + + This is a result of extents being located at the twig + level. For explanation, see comment just above + is_next_item_internal(). + */ + h->flags &= ~CBK_TRUST_DK; + } + assert("vs-362", item_is_internal(coord)); + return 0; +} + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/estimate.c linux-2.6.4-ck1/fs/reiser4/estimate.c --- linux-2.6.4/fs/reiser4/estimate.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/estimate.c 2004-03-11 22:45:15.228519692 +1100 @@ -0,0 +1,132 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "debug.h" +#include "dformat.h" +#include "tree.h" +#include "carry.h" +#include "plugin/item/ctail.h" + +/* this returns how many nodes might get dirty and added nodes if @childen (sic) nodes are dirtied + + Amount of internals which will get dirty or get allocated we estimate as 10% of the childs (the code takes + 103/1024 of them. NOTE(review): this comment used to say 5% - confirm which figure is intended) + 1 balancing. 1 balancing + is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour nodes + the current (or 1 + neighbour and 1 new and the current) on twig level, 2 neighbour nodes on upper levels and 1 for a new root. So 5 for + leaf level, 3 for twig level, 2 on upper + 1 for root. + + Do not calculate the current node of the lowest level here - this is overhead only. + + children is almost always 1 here. Exception is flow insertion +*/ +static reiser4_block_nr +max_balance_overhead(reiser4_block_nr childen, tree_level tree_height) +{ + reiser4_block_nr ten_percent; + + /* 103/1024 is roughly one tenth */ + ten_percent = ((103 * childen) >> 10); + + /* If we have too many balancings at the time, tree height can raise on more + than 1. Assume that if tree_height is 5, it can raise on 1 only. Heights below 5 are counted as 5. */ + return ((tree_height < 5 ? 
5 : tree_height) * 2 + (4 + ten_percent)); +} + +/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to + perform insertion of one item into the tree */ +/* it is only called when tree height changes, or gets initialized */ +reiser4_internal reiser4_block_nr +calc_estimate_one_insert(tree_level height) +{ + return 1 + max_balance_overhead(1, height); +} + +/* balancing overhead alone (see max_balance_overhead()) for @children dirtied nodes in a tree of @tree_height */ +reiser4_internal reiser4_block_nr +estimate_internal_amount(reiser4_block_nr children, tree_level tree_height) +{ + return max_balance_overhead(children, tree_height); +} + +/* estimate for insertion of one item: value cached in @tree->estimate_one_insert */ +reiser4_internal reiser4_block_nr +estimate_one_insert_item(reiser4_tree *tree) +{ + return tree->estimate_one_insert; +} + +/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to + perform insertion of one unit into an item in the tree */ +reiser4_internal reiser4_block_nr +estimate_one_insert_into_item(reiser4_tree *tree) +{ + /* estimate insert into item just like item insertion */ + return tree->estimate_one_insert; +} + +/* estimate for removal of one item; currently the same value as for insertion of one item */ +reiser4_internal reiser4_block_nr +estimate_one_item_removal(reiser4_tree *tree) +{ + /* on item removal reiser4 does not try to pack nodes more compactly, so, only one node may be dirtied on leaf + level */ + return tree->estimate_one_insert; +} + +/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and + both its neighbors). 
Max_balance_overhead should estimate number of blocks which may change/get added on internal + levels */ +reiser4_internal reiser4_block_nr +estimate_insert_flow(tree_level height) +{ + return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 + CARRY_FLOW_NEW_NODES_LIMIT, height); +} + +/* returnes max number of nodes can be occupied by disk cluster */ +reiser4_internal reiser4_block_nr +estimate_disk_cluster(struct inode * inode) +{ + return 2 + inode_cluster_pages(inode); +} + +/* how many nodes might get dirty and added nodes during insertion of a disk cluster */ +reiser4_internal reiser4_block_nr +estimate_insert_cluster(struct inode * inode) +{ + return 3 + inode_cluster_pages(inode) + + max_balance_overhead(3 + inode_cluster_pages(inode), REISER4_MAX_ZTREE_HEIGHT); +} + +#if YOU_CAN_COMPILE_PSEUDO_CODE + +/* maximal cost in leaf nodes of deleting an item (left and right are wandered, current disappears but not immediately)*/ +#define ESTIMATE_ITEM_DELETE 2 + +/* maximal cost in leaf nodes of inserting an item (left, right, new, and current are wandered) */ +#define ESTIMATE_ITEM_INSERT 4 + +/* maximal cost in leaf nodes of updating an item (current is wandered)*/ +#define ESTIMATE_ITEM_UPDATE 1 + +estimate_rename() +{ + +/* we ignore internal nodes because we have some percent of the device + space in reserve, and no set of changes to internal nodes can + exceed that reserve and leave us with internal nodes whose children + can fit onto this disk drive because we know what worst case fan + out is. */ + +/* if we ever get a rename that does more than insert one item and + delete one item and update a parent directory stat data, we'll need + to recode this. */ + return ESTIMATE_ITEM_DELETE + ESTIMATE_ITEM_INSERT + ESTIMATE_ITEM_UPDATE; + +} + +#endif + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/file_ops.c linux-2.6.4-ck1/fs/reiser4/file_ops.c --- linux-2.6.4/fs/reiser4/file_ops.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/file_ops.c 2004-03-11 22:45:15.229519537 +1100 @@ -0,0 +1,417 @@ +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Interface to VFS. Reiser4 file_operations are defined here. */ + +#include "forward.h" +#include "debug.h" +#include "dformat.h" +#include "coord.h" +#include "plugin/item/item.h" +#include "plugin/file/file.h" +#include "plugin/security/perm.h" +#include "plugin/disk_format/disk_format.h" +#include "plugin/plugin.h" +#include "plugin/plugin_set.h" +#include "plugin/plugin_hash.h" +#include "plugin/object.h" +#include "txnmgr.h" +#include "jnode.h" +#include "znode.h" +#include "block_alloc.h" +#include "tree.h" +#include "trace.h" +#include "vfs_ops.h" +#include "inode.h" +#include "page_cache.h" +#include "ktxnmgrd.h" +#include "super.h" +#include "reiser4.h" +#include "kattr.h" +#include "entd.h" +#include "emergency_flush.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* file operations */ + +static loff_t reiser4_llseek(struct file *, loff_t, int); +static ssize_t reiser4_read(struct file *, char *, size_t, loff_t *); +static ssize_t reiser4_write(struct file *, const char *, size_t, loff_t *); +static int reiser4_readdir(struct file *, void *, filldir_t); +static int reiser4_ioctl(struct inode *, struct file *, unsigned int cmd, unsigned long arg); +static int reiser4_mmap(struct file *, struct vm_area_struct *); +static int reiser4_release(struct inode *, struct file *); +static int reiser4_fsync(struct file *, struct dentry *, int datasync); 
+static int reiser4_open(struct inode *, struct file *); +static ssize_t reiser4_sendfile(struct file *, loff_t *, size_t, read_actor_t, void __user *); + +#if 0 +static unsigned int reiser4_poll(struct file *, struct poll_table_struct *); +static int reiser4_flush(struct file *); +static int reiser4_fasync(int, struct file *, int); +static int reiser4_lock(struct file *, int, struct file_lock *); +static ssize_t reiser4_readv(struct file *, const struct iovec *, unsigned long, loff_t *); +static ssize_t reiser4_writev(struct file *, const struct iovec *, unsigned long, loff_t *); +static ssize_t reiser4_sendpage(struct file *, struct page *, int, size_t, loff_t *, int); +static unsigned long reiser4_get_unmapped_area(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long); +#endif + +static loff_t +reiser4_llseek(struct file *file, loff_t off, int origin) +{ + loff_t result; + file_plugin *fplug; + struct inode *inode = file->f_dentry->d_inode; + loff_t(*seek_fn) (struct file *, loff_t, int); + reiser4_context ctx; + + init_context(&ctx, inode->i_sb); + reiser4_stat_inc(vfs_calls.llseek); + + ON_TRACE(TRACE_VFS_OPS, + "llseek: (i_ino %li, size %lld): off %lli, origin %d\n", inode->i_ino, inode->i_size, off, origin); + + fplug = inode_file_plugin(inode); + assert("nikita-2291", fplug != NULL); + seek_fn = fplug->seek ? : default_llseek; + result = seek_fn(file, off, origin); + reiser4_exit_context(&ctx); + return result; +} + +typedef struct readdir_actor_args { + void *dirent; + filldir_t filldir; + struct file *dir; + __u64 skip; + __u64 skipped; + reiser4_key key; +} readdir_actor_args; + +/* reiser4_readdir() - our readdir() method. + + readdir(2)/getdents(2) interface is based on implicit assumption that + readdir can be restarted from any particular point by supplying file + system with off_t-full of data. 
That is, the file system fills the ->d_off + field in struct dirent and later the user passes ->d_off to + seekdir(3), which is, actually, implemented by glibc as lseek(2) on + directory. + + Reiser4 cannot restart readdir from 64 bits of data, because the last two + components of the key of a directory entry are unknown, which gives 128 + bits: the locality and type fields in the key of a directory entry are + always known, but to start readdir() from a given point the objectid and offset + fields have to be filled. + +*/ +static int +reiser4_readdir(struct file *f /* directory file being read */ , + void *dirent /* opaque data passed to us by VFS */ , + filldir_t filldir /* filler function passed to us + * by VFS */ ) +{ + dir_plugin *dplug; + int result; + struct inode *inode; + reiser4_context ctx; + + inode = f->f_dentry->d_inode; + init_context(&ctx, inode->i_sb); + write_syscall_trace("%s", f->f_dentry->d_name.name); + reiser4_stat_inc(vfs_calls.readdir); + + dplug = inode_dir_plugin(inode); + if ((dplug != NULL) && (dplug->readdir != NULL)) + result = dplug->readdir(f, dirent, filldir); + else + result = RETERR(-ENOTDIR); + + update_atime(inode); + write_syscall_trace("ex"); + context_set_commit_async(&ctx); + reiser4_exit_context(&ctx); + return result; +} + +/* reiser4_ioctl - handler for ioctl for inode supported commands: + + REISER4_IOC_UNPACK - try to unpack tail into extent and prevent packing + file (argument arg has to be non-zero) +*/ +static int +reiser4_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) +{ + int result; + reiser4_context ctx; + + init_context(&ctx, inode->i_sb); + write_syscall_trace("%s", filp->f_dentry->d_name.name); + reiser4_stat_inc(vfs_calls.ioctl); + + if (inode_file_plugin(inode)->ioctl == NULL) + result = -ENOSYS; + else + result = inode_file_plugin(inode)->ioctl(inode, filp, cmd, arg); + + write_syscall_trace("ex"); + reiser4_exit_context(&ctx); + return result; +} + +/* ->mmap() VFS method in reiser4
file_operations */ +static int +reiser4_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode; + int result; + reiser4_context ctx; + + init_context(&ctx, file->f_dentry->d_inode->i_sb); + write_syscall_trace("%s", file->f_dentry->d_name.name); + reiser4_stat_inc(vfs_calls.mmap); + + ON_TRACE(TRACE_VFS_OPS, "MMAP: (i_ino %lli, size %lld)\n", + get_inode_oid(file->f_dentry->d_inode), + file->f_dentry->d_inode->i_size); + + inode = file->f_dentry->d_inode; + assert("nikita-2936", inode_file_plugin(inode)->mmap != NULL); + result = inode_file_plugin(inode)->mmap(file, vma); + write_syscall_trace("ex"); + reiser4_exit_context(&ctx); + return result; +} + +/* reiser4 implementation of ->read() VFS method, member of reiser4 struct file_operations + + reads some part of a file from the filesystem into the user space buffer + + gets the plugin for the file and calls its read method which does everything except some initialization + +*/ +static ssize_t +reiser4_read(struct file *file /* file to read from */ , + char *buf /* user-space buffer to put data read + * from the file */ , + size_t count /* bytes to read */ , + loff_t * off /* current position within the file, which needs to be increased by the act of reading. Reads + * start from here. 
*/ ) +{ + ssize_t result; + struct inode *inode; + reiser4_context ctx; + + assert("umka-072", file != NULL); + assert("umka-073", buf != NULL); + assert("umka-074", off != NULL); + + inode = file->f_dentry->d_inode; + init_context(&ctx, inode->i_sb); + write_syscall_trace("%s", file->f_dentry->d_name.name); + reiser4_stat_inc(vfs_calls.read); + + ON_TRACE(TRACE_VFS_OPS, + "READ: (i_ino %li, size %lld): %u bytes from pos %lli\n", + inode->i_ino, inode->i_size, count, *off); + + result = perm_chk(inode, read, file, buf, count, off); + if (likely(result == 0)) { + file_plugin *fplug; + + fplug = inode_file_plugin(inode); + assert("nikita-417", fplug != NULL); + assert("nikita-2935", fplug->read != NULL); + + /* ->read presence is asserted above; unix_file_read is one method that might be invoked below */ + result = fplug->read(file, buf, count, off); + } + write_syscall_trace("ex"); + reiser4_exit_context(&ctx); + return result; +} + +/* ->write() VFS method in reiser4 file_operations */ +static ssize_t +reiser4_write(struct file *file /* file to write on */ , + const char *buf /* user-space buffer to get data + * to write into the file */ , + size_t size /* bytes to write */ , + loff_t * off /* offset to start writing + * from.
This is updated to indicate + * actual number of bytes written */ ) +{ + struct inode *inode; + ssize_t result; + reiser4_context ctx; + + assert("nikita-1421", file != NULL); + assert("nikita-1422", buf != NULL); + assert("nikita-1424", off != NULL); + + inode = file->f_dentry->d_inode; + init_context(&ctx, inode->i_sb); + write_syscall_trace("%s", file->f_dentry->d_name.name); + reiser4_stat_inc(vfs_calls.write); + + ON_TRACE(TRACE_VFS_OPS, + "WRITE: (i_ino %li, size %lld): %u bytes to pos %lli\n", inode->i_ino, inode->i_size, size, *off); + + result = perm_chk(inode, write, file, buf, size, off); + if (likely(result == 0)) { + file_plugin *fplug; + + fplug = inode_file_plugin(inode); + assert("nikita-2934", fplug->write != NULL); /* the method invoked below */ + + result = fplug->write(file, buf, size, off); + } + write_syscall_trace("ex"); + context_set_commit_async(&ctx); + reiser4_exit_context(&ctx); + return result; +} + +/* Release reiser4 file. This is f_op->release() method. Called when last + holder closes a file */ +static int +reiser4_release(struct inode *i /* inode released */ , + struct file *f /* file released */ ) +{ + file_plugin *fplug; + int result; + reiser4_context ctx; + + assert("umka-081", i != NULL); + assert("nikita-1447", f != NULL); + + init_context(&ctx, i->i_sb); + fplug = inode_file_plugin(i); + assert("umka-082", fplug != NULL); + + ON_TRACE(TRACE_VFS_OPS, + "RELEASE: (i_ino %li, size %lld)\n", i->i_ino, i->i_size); + + if (fplug->release) + result = fplug->release(i, f); + else + result = 0; + + reiser4_free_file_fsdata(f); + + reiser4_exit_context(&ctx); + return result; +} + +static int +reiser4_open(struct inode * inode, struct file * file) +{ + int result; + + reiser4_context ctx; + file_plugin *fplug; + + init_context(&ctx, inode->i_sb); + reiser4_stat_inc(vfs_calls.open); + fplug = inode_file_plugin(inode); + + if (fplug->open != NULL) + result = fplug->open(inode, file); + else + result = 0; + + reiser4_exit_context(&ctx); + return result; +} + +/* FIXME:
This way to support fsync is too expensive. Proper solution support is + to commit only atoms which contain dirty pages from given address space. */ +static int +reiser4_fsync(struct file *file UNUSED_ARG, + struct dentry *dentry, int datasync UNUSED_ARG) +{ + int result; + reiser4_context ctx; + + init_context(&ctx, dentry->d_inode->i_sb); + result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0); + context_set_commit_async(&ctx); + reiser4_exit_context(&ctx); + return result; +} + +/* Reads @count bytes from @file and calls @actor for every read page. This is + needed for loop back devices support. */ +static ssize_t reiser4_sendfile(struct file *file, loff_t *ppos, + size_t count, read_actor_t actor, + void __user *target) +{ + int result; + file_plugin *fplug; + reiser4_context ctx; + struct inode *inode; + + inode = file->f_dentry->d_inode; + init_context(&ctx, inode->i_sb); + + fplug = inode_file_plugin(inode); + + if (fplug->sendfile != NULL) + result = fplug->sendfile(file, ppos, count, actor, target); + else + result = RETERR(-EINVAL); + + reiser4_exit_context(&ctx); + return result; +} + + +struct file_operations reiser4_file_operations = { + .llseek = reiser4_llseek, /* d */ + .read = reiser4_read, /* d */ + .write = reiser4_write, /* d */ + .readdir = reiser4_readdir, /* d */ +/* .poll = reiser4_poll, */ + .ioctl = reiser4_ioctl, + .mmap = reiser4_mmap, /* d */ + .open = reiser4_open, +/* .flush = reiser4_flush, */ + .release = reiser4_release, /* d */ + .fsync = reiser4_fsync /* d */, + .sendfile = reiser4_sendfile, +/* .fasync = reiser4_fasync, */ +/* .lock = reiser4_lock, */ +/* .readv = reiser4_readv, */ +/* .writev = reiser4_writev, */ +/* .sendpage = reiser4_sendpage, */ +/* .get_unmapped_area = reiser4_get_unmapped_area */ +}; + + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/flush.c linux-2.6.4-ck1/fs/reiser4/flush.c --- linux-2.6.4/fs/reiser4/flush.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/flush.c 2004-03-11 22:45:15.237518293 +1100 @@ -0,0 +1,3617 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* The design document for this file is at http://www.namesys.com/v4/v4.html. */ + +#include "forward.h" +#include "debug.h" +#include "dformat.h" +#include "key.h" +#include "coord.h" +#include "type_safe_list.h" +#include "plugin/item/item.h" +#include "plugin/plugin.h" +#include "plugin/object.h" +#include "txnmgr.h" +#include "jnode.h" +#include "znode.h" +#include "block_alloc.h" +#include "tree_walk.h" +#include "carry.h" +#include "tree.h" +#include "vfs_ops.h" +#include "inode.h" +#include "page_cache.h" +#include "wander.h" +#include "super.h" +#include "trace.h" +#include "entd.h" +#include "reiser4.h" +#include "prof.h" +#include "flush.h" + +#include +#include /* for struct super_block */ +#include /* for struct page */ +#include /* for struct bio */ +#include +#include + +/* IMPLEMENTATION NOTES */ + +/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total + order to the nodes of the tree in which the parent is placed before its children, which + are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it + describes the node that "came before in forward parent-first order". When we speak of a + "parent-first follower", it describes the node that "comes next in parent-first + order" (alternatively the node that "came before in reverse parent-first order"). 
+ + The following pseudo-code prints the nodes of a tree in forward parent-first order: + + void parent_first (node) + { + print_node (node); + if (node->level > leaf) { + for (i = 0; i < num_children; i += 1) { + parent_first (node->child[i]); + } + } + } +*/ + +/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so + that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order) + can be accomplished with sequential reads, which results in reading nodes in their + parent-first order. This is a read-optimization aspect of the flush algorithm, and + there is also a write-optimization aspect, which is that we wish to make large + sequential writes to the disk by allocating or reallocating blocks so that they can be + written in sequence. Sometimes the read-optimization and write-optimization goals + conflict with each other, as we discuss in more detail below. +*/ + +/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are + the relevant jnode->state bits and their relevance to flush: + + JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it + must be allocated first. In order to be considered allocated, the jnode must have + exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and + all dirtied jnodes eventually have one of these bits set during each transaction. + + JNODE_CREATED: The node was freshly created in its transaction and has no previous + block address, so it is unconditionally assigned to be relocated, although this is + mainly for code-convenience. It is not being 'relocated' from anything, but in + almost every regard it is treated as part of the relocate set.
The JNODE_CREATED bit + remains set even after JNODE_RELOC is set, so the actual relocate can be + distinguished from the created-and-allocated set easily: relocate-set members + (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which + have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set. + + JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the + decision to maintain the pre-existing location for this node and it will be written + to the wandered-log. + + JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was + not created, see note above). A block with JNODE_RELOC set is eligible for + early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC + bit is set on a znode, the parent node's internal item is modified and the znode is + rehashed. + + JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its + flush queue. This means the jnode is not on any clean or dirty list, instead it is + moved to one of the flush queue (see flush_queue.h) object private list. This + prevents multiple concurrent flushes from attempting to start flushing from the + same node. + + (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up + squeeze-and-allocate on a node while its children are actively being squeezed and + allocated. This flag was created to avoid submitting a write request for a node + while its children are still being allocated and squeezed. Then flush queue was + re-implemented to allow unlimited number of nodes be queued. This flag support was + commented out in source code because we decided that there was no reason to submit + queued nodes before jnode_flush() finishes. However, current code calls fq_write() + during a slum traversal and may submit "busy nodes" to disk. Probably we can + re-enable the JNODE_FLUSH_BUSY bit support in future. 
+ + With these state bits, we describe a test used frequently in the code below, + jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). The + test for "flushprepped" returns true if any of the following are true: + + - The node is not dirty + - The node has JNODE_RELOC set + - The node has JNODE_OVRWR set + + If either the node is not dirty or it has already been processed by flush (and assigned + JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns + false then flush has work to do on that node. +*/ + +/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never + flushprepped twice (unless an explicit call to flush_unprep is made as described in + detail below). For example a node is dirtied, allocated, and then early-flushed to + disk and set clean. Before the transaction commits, the page is dirtied again and, due + to memory pressure, the node is flushed again. The flush algorithm will not relocate + the node to a new disk location, it will simply write it to the same, previously + relocated position again. +*/ + +/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we + start at a leaf node and allocate in parent-first order by iterating to the right. At + each step of the iteration, we check for the right neighbor. Before advancing to the + right neighbor, we check if the current position and the right neighbor share the same + parent. If they do not share the same parent, the parent is allocated before the right + neighbor. + + This process goes recursively up the tree and squeezes nodes level by level as long as + the right neighbor and the current position have different parents, then it allocates + the right-neighbors-with-different-parents on the way back down. This process is + described in more detail in flush_squalloc_changed_ancestor and the recursive function + squalloc_one_changed_ancestor.
But the purpose here is not so much to discuss the + specifics of the bottom-up approach as it is to contrast the bottom-up and top-down + approaches. + + The top-down algorithm was implemented earlier (April-May 2002). In the top-down + approach, we find a starting point by scanning left along each level past dirty nodes, + then going up and repeating the process until the left node and the parent node are + clean. We then perform a parent-first traversal from the starting point, which makes + allocating in parent-first order trivial. After one subtree has been allocated in this + manner, we move to the right, try moving upward, then repeat the parent-first + traversal. + + Both approaches have problems that need to be addressed. Both are approximately the + same amount of code, but the bottom-up approach has advantages in the order it acquires + locks which, at the very least, make it the better approach. At first glance each one + makes the other one look simpler, so it is important to remember a few of the problems + with each one. + + Main problem with the top-down approach: When you encounter a clean child during the + parent-first traversal, what do you do? You would like to avoid searching through a + large tree of nodes just to find a few dirty leaves at the bottom, and there is not an + obvious solution. One of the advantages of the top-down approach is that during the + parent-first traversal you check every child of a parent to see if it is dirty. In + this way, the top-down approach easily handles the main problem of the bottom-up + approach: unallocated children. + + The unallocated children problem is that before writing a node to disk we must make + sure that all of its children are allocated. Otherwise, writing the node means + extra I/O because the node will have to be written again when the child is finally + allocated. + + WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM.
Except for bugs, this + should not cause any file system corruption, it only degrades I/O performance because a + node may be written when it is sure to be written at least one more time in the same + transaction when the remaining children are allocated. What follows is a description + of how we will solve the problem. +*/ + +/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then, + proceeding in parent first order, allocate some of its left-children, then encounter a + clean child in the middle of the parent. We do not allocate the clean child, but there + may remain unallocated (dirty) children to the right of the clean child. If we were to + stop flushing at this moment and write everything to disk, the parent might still + contain unallocated children. + + We could try to allocate all the descendents of every node that we allocate, but this + is not necessary. Doing so could result in allocating the entire tree: if the root + node is allocated then every unallocated node would have to be allocated before + flushing. Actually, we do not have to write a node just because we allocate it. It is + possible to allocate but not write a node during flush, when it still has unallocated + children. However, this approach is probably not optimal for the following reason. + + The flush algorithm is designed to allocate nodes in parent-first order in an attempt + to optimize reads that occur in the same order. Thus we are read-optimizing for a + left-to-right scan through all the leaves in the system, and we are hoping to + write-optimize at the same time because those nodes will be written together in batch. + What happens, however, if we assign a block number to a node in its read-optimized + order but then avoid writing it because it has unallocated children? 
In that + situation, we lose out on the write-optimization aspect because a node will have to be + written again to its location on the device, later, which likely means seeking back + to that location. + + So there are tradeoffs. We can choose either: + + A. Allocate all unallocated children to preserve both write-optimization and + read-optimization, but this is not always desirable because it may mean having to + allocate and flush very many nodes at once. + + B. Defer writing nodes with unallocated children, keep their read-optimized locations, + but sacrifice write-optimization because those nodes will be written again. + + C. Defer writing nodes with unallocated children, but do not keep their read-optimized + locations. Instead, choose to write-optimize them later, when they are written. To + facilitate this, we "undo" the read-optimized allocation that was given to the node so + that later it can be write-optimized, thus "unpreparing" the flush decision. This is a + case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a + call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit; + if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block + location, and set the JNODE_CREATED bit, effectively setting the node back to an + unallocated state. + + We will take the following approach in v4.0: for twig nodes we will always finish + allocating unallocated children (A). For nodes with (level > TWIG) we will defer + writing and choose write-optimization (C). + + To summarize, there are several parts to a solution that avoids the problem with + unallocated children: + + FIXME-ZAM: No approach has been implemented yet to eliminate the "UNALLOCATED CHILDREN" + problem because an experiment showed that we have 1-2 nodes + with unallocated children for thousands of written nodes. The experiment was simple, + like copying / deletion of linux kernel sources.
However the problem can arise in more + complex tests. I think we have jnode_io_hook to insert a check for unallocated + children and see what kind of problem we have. + + 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling + squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to + implement: should be simple -- amounts to adding a while loop to jnode_flush, see + comments in that function. + + 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still + have unallocated children. If the twig level has unallocated children it is an + assertion failure. If a higher-level node has unallocated children, then it should be + explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement: + should be simple. + + 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more + CPU cycles than we would like, and it is possible (but medium complexity) to optimize + this somewhat in the case where large sub-trees are flushed. The following observation + helps: if both the left- and right-neighbor of a node are processed by the flush + algorithm then the node itself is guaranteed to have all of its children allocated. + However, the cost of this check may not be so expensive after all: it is not needed for + leaves and flush can guarantee this property for twigs. That leaves only (level > + TWIG) nodes that have to be checked, so this optimization only helps if at least three + (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless + there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes + then the number of blocks being written will be very large, so the savings may be + insignificant. That said, the idea is to maintain both the left and right edges of + nodes that are processed in flush. 
When flush_empty_queue() is called, a relatively + simple test will tell whether the (level > TWIG) node is on the edge. If it is on the + edge, the slow check is necessary, but if it is in the interior then it can be assumed + to have all of its children allocated. FIXME: medium complexity to implement, but + simple to verify given that we must have a slow check anyway. + + 4. (Optional) This part is optional, not for v4.0--flush should work independently of + whether this option is used or not. Called RAPID_SCAN, the idea is to amend the + left-scan operation to take unallocated children into account. Normally, the left-scan + operation goes left as long as adjacent nodes are dirty up until some large maximum + value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left + may stop at a position where there are unallocated children to the left with the same + parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after + FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds + with a rapid scan. The rapid scan skips all the interior children of a node--if the + leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the + twig to the left). If the left neighbor of the leftmost child is also dirty, then + continue the scan at the left twig and repeat. This option will cause flush to + allocate more twigs in a single pass, but it also has the potential to write many more + nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN + was partially implemented, code removed August 12, 2002 by JMACD. +*/ + +/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the + starting point for flush is a leaf node, but actually the flush code cares very little + about whether or not this is true.
It is possible that all the leaf nodes are flushed + and dirty parent nodes still remain, in which case jnode_flush() is called on a + non-leaf argument. Flush doesn't care--it treats the argument node as if it were a + leaf, even when it is not. This is a simple approach, and there may be a more optimal + policy but until a problem with this approach is discovered, simplest is probably best. + + NOTE: In this case, the ordering produced by flush is parent-first only if you ignore + the leaves. This is done as a matter of simplicity and there is only one (shaky) + justification. When an atom commits, it flushes all leaf level nodes first, followed + by twigs, and so on. With flushing done in this order, if flush is eventually called + on a non-leaf node it means that (somehow) we reached a point where all leaves are + clean and only internal nodes need to be flushed. If that is the case, then it means + there were no leaves that were the parent-first preceder/follower of the parent. This + is expected to be a rare case, which is why we do nothing special about it. However, + memory pressure may pass an internal node to flush when there are still dirty leaf + nodes that need to be flushed, which could prove our original assumptions + "inoperative". If this needs to be fixed, then scan_left/right should have + special checks for the non-leaf levels. For example, instead of passing from a node to + the left neighbor, it should pass from the node to the left neighbor's rightmost + descendent (if dirty). + +*/ + +/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting + it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the + logical device--the left (starting) end of the device if we are walking from left to right, the right end of the + device if we are walking from right to left.
We then make passes in alternating directions, and as we do this the + device becomes sorted such that tree order and block number order fully correlate. + + Resizing is done by shifting everything either all the way to the left or all the way + to the right, and then reporting the last block. +*/ + +/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This + describes the policy from the highest level: + + The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the + leaf level during flush-scan (right, left), then we unconditionally decide to relocate + leaf nodes. + + Otherwise, there are two contexts in which we make a decision to relocate: + + 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test(). + During the initial stages of flush, after scan-right completes, we want to ask the + question: should we relocate this leaf node and thus dirty the parent node. Then if + the node is a leftmost child its parent is its own parent-first preceder, thus we repeat + the question at the next level up, and so on. In these cases we are moving in the + reverse-parent first direction. + + There is another case which is considered the reverse direction, which comes at the end + of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may + reach a point where there is a clean twig to the right with a dirty leftmost child. In + this case, we may wish to relocate the child by testing if it should be relocated + relative to its parent. + + 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in + allocate_znode. What distinguishes the forward parent-first case from the + reverse-parent first case is that the preceder has already been allocated in the + forward case, whereas in the reverse case we don't know what the preceder is until we + finish "going in reverse".
That simplifies the forward case considerably, and there we + actually use the block allocator to determine whether, e.g., a block closer to the + preceder is available. +*/ + +/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we + finish scan-left and find a starting point, if the parent's left neighbor is dirty then + squeeze the parent's left neighbor and the parent. This may change the + flush-starting-node's parent. Repeat until the child's parent is stable. If the child + is a leftmost child, repeat this left-edge squeezing operation at the next level up. + Note that we cannot allocate extents during this or they will be out of parent-first + order. There are also some difficult coordinate maintenance issues. We can't do a tree + search to find coordinates again (because we hold locks), we have to determine them + from the two nodes being squeezed. Looks difficult, but has potential to increase + space utilization. */ + +/* Flush-scan helper functions. */ +static void scan_init(flush_scan * scan); +static void scan_done(flush_scan * scan); + +/* Flush-scan algorithm. */ +static int scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit); +static int scan_right(flush_scan * scan, jnode * node, unsigned limit); +static int scan_common(flush_scan * scan, flush_scan * other); +static int scan_formatted(flush_scan * scan); +static int scan_unformatted(flush_scan * scan, flush_scan * other); +static int scan_by_coord(flush_scan * scan); + +/* Initial flush-point ancestor allocation. */ +static int alloc_pos_and_ancestors(flush_pos_t * pos); +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos); +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos); + +/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */ +static int squalloc(flush_pos_t * pos); + +/* Flush squeeze implementation.
*/ +static int squeeze_right_non_twig(znode * left, znode * right); +static int shift_one_internal_unit(znode * left, znode * right); + +/* Flush reverse parent-first relocation routines. */ +static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk, const reiser4_block_nr * nblk); +static int reverse_relocate_test(jnode * node, const coord_t * parent_coord, flush_pos_t * pos); +static int reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord, flush_pos_t * pos); + +/* Flush allocate write-queueing functions: */ +static int allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos); +static int allocate_znode_update(znode * node, const coord_t * parent_coord, flush_pos_t * pos); +static int lock_parent_and_allocate_znode (znode *, flush_pos_t *); + +/* Flush helper functions: */ +static int jnode_lock_parent_coord(jnode * node, + coord_t * coord, + lock_handle * parent_lh, + load_count * parent_zh, + znode_lock_mode mode, int try); +static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side, znode_lock_mode mode); +static int znode_same_parents(znode * a, znode * b); + +static int +znode_check_flushprepped(znode * node) +{ + return jnode_check_flushprepped(ZJNODE(node)); +} + +/* Flush position functions */ +static void pos_init(flush_pos_t * pos); +static int pos_valid(flush_pos_t * pos); +static void pos_done(flush_pos_t * pos); +static int pos_stop(flush_pos_t * pos); + +/* check that @org is first jnode extent unit, if extent is unallocated, + * because all jnodes of unallocated extent are dirty and of the same atom. 
*/ +#define checkchild(scan) \ +assert("nikita-3435", \ + ergo(scan->direction == LEFT_SIDE && \ + jnode_is_unformatted(scan->node) && \ + extent_is_unallocated(&scan->parent_coord), \ + extent_unit_index(&scan->parent_coord) == index_jnode(scan->node))) + +/* Flush debug functions */ +#if REISER4_DEBUG_OUTPUT +#else +#endif + +const char *pos_tostring(flush_pos_t * pos); + +/* This flush_cnt variable is used to track the number of concurrent flush operations, + useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has + no static initializer function...) */ +ON_DEBUG(atomic_t flush_cnt;) + + +/* FIXME: remove me */#define FLUSH_CHECKS_CONGESTION 1 + +#if defined (FLUSH_CHECKS_CONGESTION) +/* check fs backing device for write congestion */ +static int check_write_congestion (void) +{ + struct super_block *sb; + struct backing_dev_info * bdi; + + sb = reiser4_get_current_sb(); + bdi = get_super_fake(sb)->i_mapping->backing_dev_info; + return bdi_write_congested(bdi); +} +#endif /* FLUSH_CHECKS_CONGESTION */ + +/* conditionally write flush queue */ +static int write_prepped_nodes (flush_pos_t * pos, int check_congestion) +{ + int ret; + + assert("zam-831", pos); + assert("zam-832", pos->fq); + + if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS)) + return 0; + +#if defined (FLUSH_CHECKS_CONGESTION) + if (check_congestion && check_write_congestion()) + return 0; +#endif /* FLUSH_CHECKS_CONGESTION */ + trace_mark(flush); + ret = write_fq(pos->fq, pos->nr_written); + set_rapid_flush_mode(0); + flush_started_io(); + return ret; +} + +/* Proper release all flush pos. 
resources then move flush position to new
+   locked node.
+
+   @pos:       flush position to re-point;
+   @new_lock:  lock handle on the new node, moved into @pos->lock (the lock
+               previously held in @pos->lock is released first);
+   @new_load:  load count of the same node, moved into @pos->load;
+   @new_coord: coordinate inside that node, or NULL to start at the first unit
+               of the node.
+
+   The reference held in @pos->child, if any, is dropped. */
+static void move_flush_pos (flush_pos_t * pos, lock_handle * new_lock,
+			    load_count * new_load, const coord_t * new_coord)
+{
+	assert ("zam-857", new_lock->node == new_load->node);
+
+	if (new_coord) {
+		assert ("zam-858", new_coord->node == new_lock->node);
+		coord_dup(&pos->coord, new_coord);
+	} else {
+		coord_init_first_unit(&pos->coord, new_lock->node);
+	}
+
+	if (pos->child) {
+		jput(pos->child);
+		pos->child = NULL;
+	}
+
+	move_load_count(&pos->load, new_load);
+	done_lh(&pos->lock);
+	move_lh(&pos->lock, new_lock);
+}
+
+/* Prepare flush position for alloc_pos_and_ancestors() and squalloc().
+
+   If @org is a znode it is write-locked and loaded, and @pos is placed on it
+   (POS_ON_LEAF or POS_ON_INTERNAL, depending on its level). Otherwise the
+   parent of @org is write-locked, @pos is placed on the parent coordinate
+   (POS_ON_EPOINT) and a reference to @org is stored in @pos->child.
+
+   Returns 0 on success, -EAGAIN when @org is not the first child of its
+   unallocated extent unit, or the error of the lock/load call that failed.
+   Whatever was acquired in the local lock handle and load count is released
+   before returning. */
+static int prepare_flush_pos(flush_pos_t *pos, jnode * org)
+{
+	int ret;
+	load_count load;
+	lock_handle lock;
+
+	init_lh(&lock);
+	init_load_count(&load);
+
+	if (jnode_is_znode(org)) {
+		ret = longterm_lock_znode(&lock, JZNODE(org),
+					  ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
+		if (ret)
+			return ret;
+
+		ret = incr_load_count_znode(&load, JZNODE(org));
+		if (ret)
+			/* the long-term lock taken above is still held in @lock:
+			   release it through the common exit instead of leaking it */
+			goto done;
+
+		pos->state = (jnode_get_level(org) == LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
+		move_flush_pos(pos, &lock, &load, NULL);
+	} else {
+		coord_t parent_coord;
+		ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
+					      &load, ZNODE_WRITE_LOCK, 0);
+		if (ret)
+			goto done;
+
+		pos->state = POS_ON_EPOINT;
+		move_flush_pos(pos, &lock, &load, &parent_coord);
+		pos->child = jref(org);
+		if (extent_is_unallocated(&parent_coord) && extent_unit_index(&parent_coord) != index_jnode(org)) {
+			/* @org is not first child of its parent unit. This may happen
+			   because long-term lock of its parent node was released between
+			   scan_left and scan_right. For now work around this having flush to repeat */
+			ret = -EAGAIN;
+		}
+	}
+
+ done:
+	done_load_count(&load);
+	done_lh(&lock);
+	return ret;
+}
+
+/* TODO LIST (no particular order): */
+/* I have labelled most of the legitimate FIXME comments in this file with letters to
+   indicate which issue they relate to. There are a few miscellaneous FIXMEs with
+   specific names mentioned instead that need to be inspected/resolved. */
+/* B. There is an issue described in reverse_relocate_test having to do with an
+   imprecise is_preceder? check having to do with partially-dirty extents. The code that
+   sets preceder hints and computes the preceder is basically untested. Careful testing
+   needs to be done that preceder calculations are done correctly, since if it doesn't
+   affect correctness we will not catch this stuff during regular testing. */
+/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are
+   considered expected but unlikely conditions. Flush currently returns 0 (i.e., success
+   but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
+   Many of the calls that may produce one of these return values (i.e.,
+   longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
+   values themselves and, for instance, stop flushing instead of resulting in a restart.
+   If any of these results are true error conditions then flush will go into a busy-loop,
+   as we noticed during testing when a corrupt tree caused find_child_ptr to return
+   ENOENT. It needs careful thought and testing of corner conditions.
+*/
+/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created
+   block is assigned a block number then early-flushed to disk. It is dirtied again and
+   flush is called again.
Concurrently, that block is deleted, and the de-allocation of + its block number does not need to be deferred, since it is not part of the preserve set + (i.e., it didn't exist before the transaction). I think there may be a race condition + where flush writes the dirty, created block after the non-deferred deallocated block + number is re-allocated, making it possible to write deleted data on top of non-deleted + data. It's just a theory, but it needs to be thought out. */ +/* F. bio_alloc() failure is not handled gracefully. */ +/* G. Unallocated children. */ +/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */ +/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */ + +/* JNODE_FLUSH: MAIN ENTRY POINT */ +/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty + neighborhood is named "slum"). Jnode_flush() is called if reiser4 has to write dirty + blocks to disk; this happens when the Linux VM decides to reduce the number of dirty pages or as + a part of transaction commit. + + Our objective here is to prep and flush the slum the jnode belongs to. We want to + squish the slum together, and allocate the nodes in it as we squish because allocation + of children affects squishing of parents. + + The "argument" @node tells flush where to start. From there, flush finds the left edge + of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a + "better place" to start squalloc first we perform a flush_scan. + + Flush-scanning may be performed in both left and right directions, but for different + purposes. When scanning to the left, we are searching for a node that precedes a + sequence of parent-first-ordered nodes which we will then flush in parent-first order. + During flush-scanning, we also take the opportunity to count the number of consecutive + leaf nodes.
If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we + make a decision to reallocate leaf nodes (thus favoring write-optimization). + + Since the flush argument node can be anywhere in a sequence of dirty leaves, there may + also be dirty nodes to the right of the argument. If the scan-left operation does not + count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan + operation to see whether there are, in fact, enough nodes to meet the relocate + threshold. Each right- and left-scan operation uses a single flush_scan object. + + After left-scan and possibly right-scan, we prepare a flush_position object with the + starting flush point or parent coordinate, which was determined using scan-left. + + Next we call the main flush routine, squalloc, which iterates along the + leaf level, squeezing and allocating nodes (and placing them into the flush queue). + + After squalloc returns we take extra steps to ensure that all the children + of the final twig node are allocated--this involves repeating squalloc + until we finish at a twig with no unallocated children. + + Finally, we call flush_empty_queue to submit write-requests to disk. If we encounter + any above-twig nodes during flush_empty_queue that still have unallocated children, we + flush_unprep them. + + Flush treats several "failure" cases as non-failures, essentially causing them to start + over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should + probably be handled properly rather than restarting, but there are a bunch of cases to + audit.
+
+   @node:        jnode to start flushing from;
+   @nr_to_flush: if not NULL, receives the prepped-or-freed node count of this
+                 call when the result is non-negative, and 0 otherwise;
+   @nr_written:  stored in the flush position and handed to write_fq();
+   @fq:          flush queue that prepped nodes are placed into;
+   @flags:       flush flags, stored in the flush position.
+
+   Returns 0 on success and on the restartable failures (-EINVAL, -E_DEADLOCK,
+   -E_NO_NEIGHBOR, -ENOENT). When flushing itself ended with 0 or -ENOMEM the
+   result of the final write_prepped_nodes() call is returned instead; any
+   other error is reported with a warning and returned.
+*/
+
+static int jnode_flush(jnode * node, long *nr_to_flush, long * nr_written, flush_queue_t * fq, int flags)
+{
+	long ret = 0;
+	flush_scan right_scan;
+	flush_scan left_scan;
+	flush_pos_t flush_pos;
+	int todo;
+	struct super_block *sb;
+	reiser4_super_info_data *sbinfo;
+	jnode * leftmost_in_slum = NULL;
+
+	assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
+	assert("nikita-3022", schedulable());
+
+	/* lock ordering: delete_sema and flush_sema are unordered */
+	assert("nikita-3185",
+	       get_current_super_private()->delete_sema_owner != current);
+
+	sb = reiser4_get_current_sb();
+	sbinfo = get_super_private(sb);
+	if (!reiser4_is_set(sb, REISER4_MTFLUSH)) {
+#if REISER4_STATS
+		unsigned long sleep_start = jiffies;
+#endif
+		down(&sbinfo->flush_sema);
+#if REISER4_STATS
+		reiser4_stat_add(flush.slept_in_mtflush_sem , jiffies - sleep_start);
+#endif
+	}
+
+	/* Flush-concurrency debug code */
+#if REISER4_DEBUG
+	atomic_inc(&flush_cnt);
+	/* the pid used to be printed with "%ul", i.e. "%u" followed by a stray 'l' */
+	ON_TRACE(TRACE_FLUSH,
+		 "flush enter: pid %d %u concurrent procs\n",
+		 current->pid, atomic_read(&flush_cnt));
+	IF_TRACE(TRACE_FLUSH,
+		 if (atomic_read(&flush_cnt) > 1) printk("flush concurrency\n"););
+#endif
+
+	enter_flush(sb);
+
+	ON_TRACE(TRACE_FLUSH, "flush squalloc %s %s\n", jnode_tostring(node), flags_tostring(flags));
+
+	/* Initialize a flush position. */
+	pos_init(&flush_pos);
+
+	flush_pos.nr_to_flush = nr_to_flush;
+	flush_pos.nr_written = nr_written;
+	flush_pos.fq = fq;
+	flush_pos.flags = flags;
+
+	scan_init(&right_scan);
+	scan_init(&left_scan);
+
+	/* init linkage status of the node */
+	if (jnode_is_znode(node)) {
+		/* if jnode is unformatted this status will be set in scan_unformatted */
+		set_flush_scan_nstat(&left_scan, LINKED);
+		set_flush_scan_nstat(&right_scan, LINKED);
+	}
+
+	/*IF_TRACE (TRACE_FLUSH_VERB, print_tree_rec ("parent_first", current_tree, REISER4_TREE_BRIEF)); */
+	/*IF_TRACE (TRACE_FLUSH_VERB, print_tree_rec ("parent_first", current_tree, REISER4_TREE_CHECK)); */
+
+	/* First scan left and remember the leftmost scan position. If the leftmost
+	   position is unformatted we remember its parent_coord. We scan until counting
+	   FLUSH_SCAN_MAXNODES.
+
+	   If starting @node is unformatted, at the beginning of left scan its
+	   parent (twig level node, containing extent item) will be long term
+	   locked and lock handle will be stored in the
+	   @right_scan->parent_lock. This lock is used to start the rightward
+	   scan without redoing the tree traversal (necessary to find parent)
+	   and, hence, is kept during leftward scan. As a result, we have to
+	   use try-lock when taking long term locks during the leftward scan.
+	*/
+	ret = scan_left(&left_scan, &right_scan,
+			node, sbinfo->flush.scan_maxnodes);
+	if (ret != 0)
+		goto failed;
+
+	leftmost_in_slum = jref(left_scan.node);
+	scan_done(&left_scan);
+
+	/* Then possibly go right to decide if we will use a policy of relocating leaves.
+	   This is only done if we did not scan past (and count) enough nodes during the
+	   leftward scan. If we do scan right, we only care to go far enough to establish
+	   that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The
+	   scan limit is the difference between left_scan.count and the threshold. */
+	reiser4_stat_add(flush.left, left_scan.count);
+
+	todo = sbinfo->flush.relocate_threshold - left_scan.count;
+	/* scan right is inherently deadlock prone, because we are
+	 * (potentially) holding a lock on the twig node at this moment.
+	 * FIXME: this is incorrect comment: lock is not held */
+	if (todo > 0 && (get_flush_scan_nstat(&right_scan) == LINKED)) {
+		ret = scan_right(&right_scan, node, (unsigned)todo);
+		if (ret != 0)
+			goto failed;
+	}
+
+	/* Only the right-scan count is needed, release any rightward locks right away. */
+	scan_done(&right_scan);
+
+	ON_TRACE(TRACE_FLUSH, "flush: left: %i, right: %i\n",
+		 left_scan.count, right_scan.count);
+
+	reiser4_stat_add(flush.right, right_scan.count);
+
+	/* ... and the answer is: we should relocate leaf nodes if at least
+	   FLUSH_RELOCATE_THRESHOLD nodes were found. */
+	flush_pos.leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
+		(left_scan.count + right_scan.count >= sbinfo->flush.relocate_threshold);
+
+	/*assert ("jmacd-6218", jnode_check_dirty (left_scan.node)); */
+
+	/* Funny business here. We set the 'point' in the flush_position prior to
+	   starting squalloc regardless of whether the first point is
+	   formatted or unformatted. Without this there would be an invariant, in the
+	   rest of the code, that if the flush_position is unformatted then
+	   flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
+	   and if the flush_position is formatted then flush_position->point is non-NULL
+	   and no parent info is set.
+
+	   This seems lazy, but it makes the initial calls to reverse_relocate_test
+	   (which ask "is it the pos->point the leftmost child of its parent") much easier
+	   because we know the first child already. Nothing is broken by this, but the
+	   reasoning is subtle. Holding an extra reference on a jnode during flush can
+	   cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
+	   removed from sibling lists until they have zero reference count. Flush would
+	   never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only
+	   deleted to the right. So if nothing is broken, why fix it?
+
+	   NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
+	   point and in any moment, because of the concurrent file system
+	   activity (for example, truncate). */
+
+	/* Check jnode state after flush_scan completed. Having a lock on this
+	   node or its parent (in case of unformatted) helps us in case of
+	   concurrent flushing. */
+	if (jnode_check_flushprepped(leftmost_in_slum)) {
+		ON_TRACE(TRACE_FLUSH_VERB, "flush concurrency: %s already allocated\n", pos_tostring(&flush_pos));
+		ret = 0;
+		goto failed;
+	}
+
+	/* Now setup flush_pos using scan_left's endpoint. */
+	ret = prepare_flush_pos(&flush_pos, leftmost_in_slum);
+	if (ret)
+		goto failed;
+
+	if (jnode_check_flushprepped(leftmost_in_slum)) {
+		ON_TRACE(TRACE_FLUSH_VERB, "flush concurrency: %s already allocated\n", pos_tostring(&flush_pos));
+		ret = 0;
+		goto failed;
+	}
+
+	/* Set pos->preceder and (re)allocate pos and its ancestors if it is needed */
+	ret = alloc_pos_and_ancestors(&flush_pos);
+	if (ret)
+		goto failed;
+
+	/* Do the main rightward-bottom-up squeeze and allocate loop. */
+	ret = squalloc(&flush_pos);
+	pos_stop(&flush_pos);
+	if (ret)
+		goto failed;
+
+	/* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
+	   First, the pos_stop() and pos_valid() routines should be modified
+	   so that pos_stop() sets a flush_position->stop flag to 1 without
+	   releasing the current position immediately--instead release it in
+	   pos_done(). This is a better implementation than the current one anyway.
+
+	   It is not clear that all fields of the flush_position should not be released,
+	   but at the very least the parent_lock, parent_coord, and parent_load should
+	   remain held because they hold the last twig when pos_stop() is
+	   called.
+
+	   When we reach this point in the code, if the parent_coord is set to after the
+	   last item then we know that flush reached the end of a twig (and according to
+	   the new flush queueing design, we will return now). If parent_coord is not
+	   past the last item, we should check if the current twig has any unallocated
+	   children to the right (we are not concerned with unallocated children to the
+	   left--in that case the twig itself should not have been allocated). If the
+	   twig has unallocated children to the right, set the parent_coord to that
+	   position and then repeat the call to squalloc.
+
+	   Testing for unallocated children may be defined in two ways: if any internal
+	   item has a fake block number, it is unallocated; if any extent item is
+	   unallocated then all of its children are unallocated. But there is a more
+	   aggressive approach: if there are any dirty children of the twig to the right
+	   of the current position, we may wish to relocate those nodes now. Checking for
+	   potential relocation is more expensive as it requires knowing whether there are
+	   any dirty children that are not unallocated. The extent_needs_allocation
+	   should be used after setting the correct preceder.
+
+	   When we reach the end of a twig at this point in the code, if the flush can
+	   continue (when the queue is ready) it will need some information on the future
+	   starting point. That should be stored away in the flush_handle using a seal, I
+	   believe. Holding a jref() on the future starting point may break other code
+	   that deletes that node.
+	*/
+
+	/* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
+	   above the twig level. If the VM calls flush above the twig level, do nothing
+	   and return (but figure out why this happens). The txnmgr should be modified to
+	   only flush its leaf-level dirty list. This will do all the necessary squeeze
+	   and allocate steps but leave unallocated branches and possibly unallocated
+	   twigs (when the twig's leftmost child is not dirty). After flushing the leaf
+	   level, the remaining unallocated nodes should be given write-optimized
+	   locations. (Possibly, the remaining unallocated twigs should be allocated just
+	   before their leftmost child.)
+	*/
+
+	/* Common exit: reached on success as well as on any failure. */
+failed:
+
+	if (nr_to_flush != NULL) {
+		if (ret >= 0) {
+			ON_TRACE(TRACE_FLUSH, "flush_jnode wrote %u blocks\n", flush_pos.prep_or_free_cnt);
+			(*nr_to_flush) = flush_pos.prep_or_free_cnt;
+		} else {
+			(*nr_to_flush) = 0;
+		}
+	}
+
+	if (ret == -EINVAL || ret == -E_DEADLOCK || ret == -E_NO_NEIGHBOR || ret == -ENOENT) {
+		/* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
+		   in each case. They already are handled in many cases. */
+		/* Something bad happened, but difficult to avoid... Try again! */
+		ON_TRACE(TRACE_FLUSH, "flush restartable failure: %ld\n", ret);
+		ret = 0;
+	}
+
+
+	{
+		int ret1;
+		/* Write anything left in the queue, if specified by flags */
+		ret1 = write_prepped_nodes(&flush_pos, 0);
+
+		/* a flush result of 0 or -ENOMEM is replaced by the write result */
+		if (ret && ret != -ENOMEM)
+			warning("jmacd-16739", "flush failed: %ld", ret);
+		else
+			ret = ret1;
+
+	}
+
+	if (leftmost_in_slum)
+		jput(leftmost_in_slum);
+
+	pos_done(&flush_pos);
+	scan_done(&left_scan);
+	scan_done(&right_scan);
+
+	ON_DEBUG(atomic_dec(&flush_cnt));
+
+	write_syscall_trace("ex");
+
+	leave_flush(sb);
+
+	if (!reiser4_is_set(sb, REISER4_MTFLUSH))
+		up(&sbinfo->flush_sema);
+
+	return ret;
+}
+
+/* The reiser4 flush subsystem can be turned into "rapid flush mode" means that
+ * flusher should submit all prepped nodes immediately without keeping them in
+ * flush queues for long time. The reason for rapid flush mode is to free
+ * memory as fast as possible. */
+
+#if REISER4_USE_RAPID_FLUSH
+
+/* A system-wide rapid_flush_mode flag.
*/
+static atomic_t rapid_flush_mode_flg = ATOMIC_INIT(0);
+
+/**
+ * submit all prepped nodes if rapid flush mode is set,
+ * turn rapid flush mode off.
+ *
+ * Returns 0 when the mode flag is clear, otherwise the result of
+ * write_prepped_nodes() called with the congestion check enabled.
+ */
+
+static int rapid_flush (flush_pos_t * pos)
+{
+	if (!atomic_read(&rapid_flush_mode_flg))
+		return 0;
+
+	return write_prepped_nodes(pos, 1);
+}
+
+/**
+ * set rapid flush mode.
+ *
+ * @on is stored as the new value of the system-wide flag.
+ */
+void set_rapid_flush_mode (int on)
+{
+	atomic_set(&rapid_flush_mode_flg, on);
+}
+
+#else
+
+#define rapid_flush(pos) (0)
+
+#endif /* REISER4_USE_RAPID_FLUSH */
+
+/* Flush some nodes of current atom, usually slum, return -E_REPEAT if there are more nodes
+ * to flush, return 0 if atom's dirty lists empty and keep current atom locked, return
+ * other errors as they are.
+ *
+ * @flags:        flush flags, passed to find_first_dirty_jnode() and jnode_flush();
+ * @nr_submitted: passed to jnode_flush() and to write_fq() as the written-blocks counter;
+ * @atom:         in/out: the locked current atom; it is re-read with
+ *                get_current_atom_locked() each time the atom lock was dropped.
+ *
+ * NOTE(review): the value returned by jnode_flush() below is overwritten by the
+ * write_fq() call that follows it, so a jnode_flush() failure is never reported
+ * to the caller (it gets -E_REPEAT whenever write_fq() returns 0). Confirm that
+ * this is intended; it does not match "return other errors as they are". */
+reiser4_internal int
+flush_current_atom (int flags, long *nr_submitted, txn_atom ** atom)
+{
+	reiser4_super_info_data * sinfo = get_current_super_private();
+	flush_queue_t *fq = NULL;
+	jnode * node;
+	int nr_queued;
+	int ret;
+
+	assert ("zam-889", atom != NULL && *atom != NULL);
+	assert ("zam-890", spin_atom_is_locked(*atom));
+	assert ("zam-892", get_current_context()->trans->atom == *atom);
+
+	while(1) {	/* retry for as long as fq_by_atom() asks for a repeat */
+		ret = fq_by_atom(*atom, &fq);
+		if (ret != -E_REPEAT)
+			break;
+		*atom = get_current_atom_locked();
+	}
+	if (ret)
+		return ret;
+
+	assert ("zam-891", spin_atom_is_locked(*atom));
+
+	/* parallel flushers limit */
+	if (sinfo->tmgr.atom_max_flushers != 0) {
+		while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
+			/* An atom_send_event() call is inside fq_put_nolock() which is
+			   called when flush is finished and nr_flushers is
+			   decremented. */
+			atom_wait_event(*atom);
+			*atom = get_current_atom_locked();
+		}
+	}
+
+	/* count ourself as a flusher */
+	(*atom)->nr_flushers++;
+
+	if (REISER4_TRACE_TREE) {
+		UNLOCK_ATOM(*atom);
+		write_syscall_trace("in");
+		*atom = get_current_atom_locked();
+	}
+	reiser4_stat_inc(flush.flush);
+	writeout_mode_enable();
+
+	nr_queued = 0;
+
+	/* In this loop we process all already prepped (RELOC or OVRWR) and dirtied again
+	 * nodes. The atom spin lock is not released until all dirty nodes processed or
+	 * not prepped node found in the atom dirty lists. */
+	while ((node = find_first_dirty_jnode(*atom, flags))) {
+		LOCK_JNODE(node);
+
+		assert ("zam-881", jnode_is_dirty(node));
+		assert ("zam-898", !JF_ISSET(node, JNODE_OVRWR));
+
+		if (JF_ISSET(node, JNODE_WRITEBACK)) {
+			capture_list_remove_clean(node);
+			capture_list_push_back(&(*atom)->writeback_nodes, node);
+
+			ON_DEBUG(node->list = WB_LIST);
+		} else if (jnode_is_znode(node) && znode_above_root(JZNODE(node))) {
+			/* A special case for znode-above-root. The above-root (fake)
+			   znode is captured and dirtied when the tree height changes or
+			   when the root node is relocated. This causes atoms to fuse so
+			   that changes at the root are serialized. However, this node is
+			   never flushed. This special case used to be in lock.c to
+			   prevent the above-root node from ever being captured, but now
+			   that it is captured we simply prevent it from flushing. The
+			   log-writer code relies on this to properly log superblock
+			   modifications of the tree height. */
+			jnode_make_wander_nolock(node);
+		} else if (JF_ISSET(node, JNODE_RELOC)) {
+			queue_jnode(fq, node);
+			++ nr_queued;
+		} else
+			break;	/* not prepped yet: leave the loop with @node still locked */
+
+		UNLOCK_JNODE(node);
+	}
+
+	if (node == NULL) {
+		if (nr_queued == 0) {
+			writeout_mode_disable();
+			(*atom)->nr_flushers --;
+			fq_put_nolock(fq);
+			/* current atom remains locked */
+			return 0;
+		}
+		UNLOCK_ATOM(*atom);
+	} else {
+		jref(node);	/* keep @node referenced while both locks are dropped */
+		UNLOCK_ATOM(*atom);
+		UNLOCK_JNODE(node);
+		ret = jnode_flush(node, NULL, nr_submitted, fq, flags);
+		jput(node);
+	}
+
+	flush_started_io();
+	trace_mark(flush);
+	ret = write_fq(fq, nr_submitted);	/* NOTE(review): discards the jnode_flush() result */
+	set_rapid_flush_mode(0);
+
+	*atom = get_current_atom_locked();
+	(*atom)->nr_flushers --;
+	fq_put_nolock(fq);
+	UNLOCK_ATOM(*atom);
+
+	writeout_mode_disable();
+	write_syscall_trace("ex");
+
+	if (ret == 0)
+		ret = -E_REPEAT;
+
+	return ret;
+}
+
+/* REVERSE PARENT-FIRST RELOCATION POLICIES */
+
+/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
+   reverse parent-first relocate context. Here all we know is the preceder and the block
+   number. Since we are going in reverse, the preceder may still be relocated as well, so
+   we can't ask the block allocator "is there a closer block available to relocate?" here.
+   In the _forward_ parent-first relocate context (not here) we actually call the block
+   allocator to try and find a closer location. */
+static int
+reverse_relocate_if_close_enough(const reiser4_block_nr * pblk, const reiser4_block_nr * nblk)
+{
+	reiser4_block_nr dist;
+
+	assert("jmacd-7710", *pblk != 0 && *nblk != 0);
+	assert("jmacd-7711", !blocknr_is_fake(pblk));
+	assert("jmacd-7712", !blocknr_is_fake(nblk));
+
+	/* Distance is the absolute value. */
+	dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
+
+	/* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder
+	   block, do not relocate.
*/ + if (dist <= get_current_super_private()->flush.relocate_distance) { + return 0; + } + + return 1; +} + +/* This function is a predicate that tests for relocation. Always called in the + reverse-parent-first context, when we are asking whether the current node should be + relocated in order to expand the flush by dirtying the parent level (and thus + proceeding to flush that level). When traversing in the forward parent-first direction + (not here), relocation decisions are handled in two places: allocate_znode() and + extent_needs_allocation(). */ +static int +reverse_relocate_test(jnode * node, const coord_t * parent_coord, flush_pos_t * pos) +{ + reiser4_block_nr pblk = 0; + reiser4_block_nr nblk = 0; + + assert("jmacd-8989", !jnode_is_root(node)); + + /* + * This function is called only from the + * reverse_relocate_check_dirty_parent() and only if the parent + * node is clean. This implies that the parent has the real (i.e., not + * fake) block number, and, so does the child, because otherwise the + * parent would be dirty. + */ + + /* New nodes are treated as if they are being relocated. */ + if (jnode_created(node) + || (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) { + return 1; + } + + /* Find the preceder. FIXME(B): When the child is an unformatted, previously + existing node, the coord may be leftmost even though the child is not the + parent-first preceder of the parent. If the first dirty node appears somewhere + in the middle of the first extent unit, this preceder calculation is wrong. + Needs more logic in here. */ + if (coord_is_leftmost_unit(parent_coord)) { + pblk = *znode_get_block(parent_coord->node); + } else { + pblk = pos->preceder.blk; + } + check_preceder(pblk); + + /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. 
*/ + if (pblk == 0) { + return 1; + } + + nblk = *jnode_get_block(node); + + if (blocknr_is_fake(&nblk)) + /* child is unallocated, mark parent dirty */ + return 1; + + return reverse_relocate_if_close_enough(&pblk, &nblk); +} + +/* This function calls reverse_relocate_test to make a reverse-parent-first + relocation decision and then, if yes, it marks the parent dirty. */ +static int +reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord, flush_pos_t * pos) +{ + int ret; + + if (!znode_check_dirty(parent_coord->node)) { + + ret = reverse_relocate_test(node, parent_coord, pos); + if (ret < 0) { + return ret; + } + + /* FIXME-ZAM + if parent is already relocated - we do not want to grab space, right? */ + if (ret == 1) { + int grabbed; + + grabbed = get_current_context()->grabbed_blocks; + if (reiser4_grab_space_force((__u64)1, BA_RESERVED) != 0) + reiser4_panic("umka-1250", + "No space left during flush."); + + assert("jmacd-18923", znode_is_write_locked(parent_coord->node)); + znode_make_dirty(parent_coord->node); + grabbed2free_mark(grabbed); + } + } + + return 0; +} + +/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD + PARENT-FIRST LOOP BEGINS) */ + +/* Get the leftmost child for given coord. */ +static int get_leftmost_child_of_unit (const coord_t * coord, jnode ** child) +{ + int ret; + + ret = item_utmost_child(coord, LEFT_SIDE, child); + + if (ret) + return ret; + + if (IS_ERR(*child)) + return PTR_ERR(*child); + + return 0; +} + +/* This step occurs after the left- and right-scans are completed, before starting the + forward parent-first traversal. Here we attempt to allocate ancestors of the starting + flush point, which means continuing in the reverse parent-first direction to the + parent, grandparent, and so on (as long as the child is a leftmost child). 
This + routine calls a recursive process, alloc_one_ancestor, which does the real work, + except there is special-case handling here for the first ancestor, which may be a twig. + At each level (here and alloc_one_ancestor), we check for relocation and then, if + the child is a leftmost child, repeat at the next level. On the way back down (the + recursion), we allocate the ancestors in parent-first order. */ +static int alloc_pos_and_ancestors(flush_pos_t * pos) +{ + int ret = 0; + lock_handle plock; + load_count pload; + coord_t pcoord; + + ON_TRACE(TRACE_FLUSH_VERB, "flush alloc ancestors: %s\n", pos_tostring(pos)); + + coord_init_invalid(&pcoord, NULL); + init_lh(&plock); + init_load_count(&pload); + + if (pos->state == POS_ON_EPOINT) { + /* a special case for pos on twig level, where we already have + a lock on parent node. */ + /* The parent may not be dirty, in which case we should decide + whether to relocate the child now. If decision is made to + relocate the child, the parent is marked dirty. */ + ret = reverse_relocate_check_dirty_parent(pos->child, &pos->coord, pos); + if (ret) + goto exit; + + /* FIXME_NFQUCMPD: We only need to allocate the twig (if child + is leftmost) and the leaf/child, so recursion is not needed. + Levels above the twig will be allocated for + write-optimization before the transaction commits. */ + + /* Do the recursive step, allocating zero or more of our + * ancestors. 
*/ + ret = alloc_one_ancestor(&pos->coord, pos); + + } else { + if (!znode_is_root(pos->lock.node)) { + /* all formatted nodes except tree root */ + ret = reiser4_get_parent(&plock, pos->lock.node, ZNODE_WRITE_LOCK, 0); + if (ret) + goto exit; + + ret = incr_load_count_znode(&pload, plock.node); + if (ret) + goto exit; + + ret = find_child_ptr(plock.node, pos->lock.node, &pcoord); + if (ret) + goto exit; + + ret = reverse_relocate_check_dirty_parent(ZJNODE(pos->lock.node), &pcoord, pos); + if (ret) + goto exit; + + ret = alloc_one_ancestor(&pcoord, pos); + if (ret) + goto exit; + } + + ret = allocate_znode(pos->lock.node, &pcoord, pos); + } +exit: + done_load_count(&pload); + done_lh(&plock); + return ret; +} + +/* This is the recursive step described in alloc_pos_and_ancestors, above. Ignoring the + call to set_preceder, which is the next function described, this checks if the + child is a leftmost child and returns if it is not. If the child is a leftmost child + it checks for relocation, possibly dirtying the parent. Then it performs the recursive + step. */ +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos) +{ + int ret = 0; + lock_handle alock; + load_count aload; + coord_t acoord; + + /* As we ascend at the left-edge of the region to flush, take this opportunity at + the twig level to find our parent-first preceder unless we have already set + it. */ + if (pos->preceder.blk == 0) { + ret = set_preceder(coord, pos); + if (ret != 0) + return ret; + } + + /* If the ancestor is clean or already allocated, or if the child is not a + leftmost child, stop going up, even leaving coord->node not flushprepped. */ + if (znode_check_flushprepped(coord->node)|| !coord_is_leftmost_unit(coord)) + return 0; + + init_lh(&alock); + init_load_count(&aload); + coord_init_invalid(&acoord, NULL); + + /* Only ascend to the next level if it is a leftmost child, but write-lock the + parent in case we will relocate the child. 
*/ + if (!znode_is_root(coord->node)) { + + ret = jnode_lock_parent_coord( + ZJNODE(coord->node), &acoord, &alock, &aload, ZNODE_WRITE_LOCK, 0); + if (ret != 0) { + /* FIXME(C): check EINVAL, E_DEADLOCK */ + goto exit; + } + + ret = reverse_relocate_check_dirty_parent(ZJNODE(coord->node), &acoord, pos); + if (ret != 0) { + goto exit; + } + + /* Recursive call. */ + if (!znode_check_flushprepped(acoord.node)) { + ret = alloc_one_ancestor(&acoord, pos); + if (ret) + goto exit; + } + } + + /* Note: we call allocate with the parent write-locked (except at the root) in + case we relocate the child, in which case it will modify the parent during this + call. */ + ret = allocate_znode(coord->node, &acoord, pos); + +exit: + done_load_count(&aload); + done_lh(&alock); + return ret; +} + +/* During the reverse parent-first alloc_pos_and_ancestors process described above there is + a call to this function at the twig level. During alloc_pos_and_ancestors we may ask: + should this node be relocated (in reverse parent-first context)? We repeat this + process as long as the child is the leftmost child, eventually reaching an ancestor of + the flush point that is not a leftmost child. The preceder of that ancestors, which is + not a leftmost child, is actually on the leaf level. The preceder of that block is the + left-neighbor of the flush point. The preceder of that block is the rightmost child of + the twig on the left. So, when alloc_pos_and_ancestors passes upward through the twig + level, it stops momentarily to remember the block of the rightmost child of the twig on + the left and sets it to the flush_position's preceder_hint. + + There is one other place where we may set the flush_position's preceder hint, which is + during scan-left. 
+*/ +static int +set_preceder(const coord_t * coord_in, flush_pos_t * pos) +{ + int ret; + coord_t coord; + lock_handle left_lock; + load_count left_load; + +#if 0 + /* do not trust to allocation of nodes above twigs, use the block number of last + * write (write optimized approach). */ + if (znode_get_level(coord_in->node) > TWIG_LEVEL + 1) { + get_blocknr_hint_default(&pos->preceder.blk); + reiser4_stat_inc(block_alloc.nohint); + return 0; + } +#endif + + coord_dup(&coord, coord_in); + + init_lh(&left_lock); + init_load_count(&left_load); + + /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test. + coord_is_leftmost_unit is not the right test if the unformatted child is in the + middle of the first extent unit. */ + if (!coord_is_leftmost_unit(&coord)) { + coord_prev_unit(&coord); + } else { + ret = reiser4_get_left_neighbor(&left_lock, coord.node, ZNODE_READ_LOCK, GN_SAME_ATOM); + if (ret) { + /* If we fail for any reason it doesn't matter because the + preceder is only a hint. We are low-priority at this point, so + this must be the case. Only the error codes listed below are + turned into success (0); any other error is returned to the + caller unchanged. */ + if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR || + ret == -ENOENT || ret == -EINVAL || ret == -E_DEADLOCK) + { + ret = 0; + } + goto exit; + } + + ret = incr_load_count_znode(&left_load, left_lock.node); + if (ret) + goto exit; + + coord_init_last_unit(&coord, left_lock.node); + } + + ret = item_utmost_child_real_block(&coord, RIGHT_SIDE, &pos->preceder.blk); +exit: + check_preceder(pos->preceder.blk); + done_load_count(&left_load); + done_lh(&left_lock); + return ret; +} + +/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */ + +/* This procedure implements the outer loop of the flush algorithm. To put this in + context, here is the general list of steps taken by the flush routine as a whole: + + 1. Scan-left + 2. Scan-right (maybe) + 3. Allocate initial flush position and its ancestors + 4. + 5. + 6. + + This procedure implements the loop in steps 4 through 6 in the above listing. 
+ + Step 4: if the current flush position is an extent item (position on the twig level), + it allocates the extent (allocate_extent_item_in_place) then shifts to the next + coordinate. If the next coordinate's leftmost child needs flushprep, we will continue. + If the next coordinate is an internal item, we descend back to the leaf level, + otherwise we repeat a step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate" + brings us past the end of the twig level, then we call + reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to + step #5 which moves to the right. + + Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the + tree to allocate any ancestors of the next-right flush position that are not also + ancestors of the current position. Those ancestors (in top-down order) are the next in + parent-first order. We squeeze adjacent nodes on the way up until the right node and + current node share the same parent, then allocate on the way back down. Finally, this + step sets the flush position to the next-right node. Then repeat steps 4 and 5. +*/ + +/* SQUEEZE CODE */ + + +/* squeeze_right_twig helper function: cut a range of extent items from + node to->node, from its first unit up to coord @to, ending at key @to_key. + @left is not used by this function. Returns the result of cut_node_content. */ +static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key, znode * left) +{ + coord_t from; + reiser4_key from_key; + + coord_init_first_unit(&from, to->node); + item_key_by_coord(&from, &from_key); + + return cut_node_content(&from, to, &from_key, to_key, NULL); +} + +/* Copy as much of the leading extents from @right to @left, allocating + unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or + SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an + internal item it calls shift_one_internal_unit and may then return + SUBTREE_MOVED. 
*/ +squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *, reiser4_key *stop_key); +#if REISER4_DEBUG +void *shift_check_prepare(const znode *left, const znode *right); +void shift_check(void *vp, const znode *left, const znode *right); +#endif +static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos) +{ + int ret = SUBTREE_MOVED; + coord_t coord; /* used to iterate over items */ + reiser4_key stop_key; + + assert("jmacd-2008", !node_is_empty(right)); + coord_init_first_unit(&coord, right); + + DISABLE_NODE_CHECK; + + ON_TRACE(TRACE_FLUSH_VERB, "sq_twig before copy extents: left %s\n", znode_tostring(left)); + ON_TRACE(TRACE_FLUSH_VERB, "sq_twig before copy extents: right %s\n", znode_tostring(right)); + + /* FIXME: can be optimized to cut once */ + while (!node_is_empty(coord.node) && item_is_extent(&coord)) { + ON_DEBUG(void *vp); + + assert("vs-1468", coord_is_leftmost_unit(&coord)); + ON_DEBUG(vp = shift_check_prepare(left, coord.node)); + + /* stop_key is used to find what was copied and what to cut */ + stop_key = *min_key(); + ret = squalloc_extent(left, &coord, pos, &stop_key); + if (ret != SQUEEZE_CONTINUE) + break; + assert("vs-1465", !keyeq(&stop_key, min_key())); + + /* Decrement the offset of stop_key by one, then let the helper cut + coord.node from its first unit up to @coord, ending at stop_key. */ + set_key_offset(&stop_key, get_key_offset(&stop_key) - 1); + check_me("vs-1466", squalloc_right_twig_cut(&coord, &stop_key, left) == 0); + + ON_DEBUG(shift_check(vp, left, coord.node)); + } + + if (node_is_empty(coord.node)) + ret = SQUEEZE_SOURCE_EMPTY; + + ENABLE_NODE_CHECK; + node_check(left, REISER4_NODE_DKEYS); + node_check(right, REISER4_NODE_DKEYS); + + if (ret == SQUEEZE_TARGET_FULL) { + goto out; + } + + if (node_is_empty(right)) { + /* The whole right node was copied into @left. 
*/ + ON_TRACE(TRACE_FLUSH_VERB, "sq_twig right node empty: %s\n", znode_tostring(right)); + assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY); + goto out; + } + + coord_init_first_unit(&coord, right); + + if (!item_is_internal(&coord)) { + /* we do not want to squeeze anything else to left neighbor because "slum" + is over */ + ret = SQUEEZE_TARGET_FULL; + goto out; + } + assert("jmacd-433", item_is_internal(&coord)); + + /* Shift an internal unit. The child must be allocated before shifting any more + extents, so we stop here. */ + ret = shift_one_internal_unit(left, right); + +out: + assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL + || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY); + + if (ret == SQUEEZE_TARGET_FULL) { + /* We submit prepped nodes here and expect that this @left twig + * will not be modified again during this jnode_flush() call. */ + int ret1; + + /* NOTE: seems like io is done under long term locks. */ + ret1 = write_prepped_nodes(pos, 1); + if (ret1 < 0) + return ret1; + } + + return ret; +} + +/* This is a special node method which scans node items and checks, for each + one, whether we need to apply the flush squeeze item method. This item method + may resize/kill the item, and also may change its content. 
+*/ +static int squeeze_node(flush_pos_t * pos, znode * node) +{ + int ret = 0; + + item_plugin * iplug; + + assert("edward-304", pos != NULL); + assert("edward-305", pos->child == NULL); + assert("edward-475", znode_squeezable(node)); + + if (znode_get_level(node) != LEAF_LEVEL) + /* only leaf-level nodes are squeezed here: do not squeeze this node */ + goto exit; + + coord_init_first_unit(&pos->coord, node); + + while (1) { + ret = 0; + + if (node_is_empty(node)) + /* nothing to squeeze */ + goto exit; + if (pos->idata) { + iplug = pos->idata->iplug; + assert("edward-476", iplug->f.squeeze != NULL); + } + else if (!coord_is_existing_item(&pos->coord)) + /* finished */ + break; + else + iplug = item_plugin_by_coord(&pos->coord); + + if (iplug->f.squeeze == NULL) + /* unsqueezable */ + goto next; + + ret = iplug->f.squeeze(pos); + + if (ret == -E_REPEAT) + continue; + if (ret) + goto exit; + + assert("edward-307", pos->child == NULL); + + /* now we should check if (pos->idata != NULL), and if so, + call previous method again, BUT if current item is last + and mergeable with the first item of slum right neighbor, + we set idata->mergeable = 1, go to slum right neighbor + and continue squeezing using this info + */ + next: + if (coord_next_item(&pos->coord)) { + /* node is over */ + lock_handle right_lock; + load_count right_load; + coord_t coord; + + if (pos->idata == NULL) + break; + + init_lh(&right_lock); + init_load_count(&right_load); + + /* check for slum right neighbor */ + ret = neighbor_in_slum(node, &right_lock, RIGHT_SIDE, ZNODE_WRITE_LOCK); + if (ret == -E_NO_NEIGHBOR) + /* no neighbor, repeat on this node */ + continue; + else if (ret) + goto exit; + ret = incr_load_count_znode(&right_load, right_lock.node); + if (ret) { + done_lh(&right_lock); + break; + } + coord_init_after_item_end(&pos->coord); + coord_init_before_first_item(&coord, right_lock.node); + + if (iplug->b.mergeable(&pos->coord, &coord)) { + /* go to slum right neighbor */ + pos->idata->mergeable = 1; + 
done_load_count(&right_load); + done_lh(&right_lock); + break; + } + /* first item of right neighbor is not mergeable, + repeat this node */ + done_load_count(&right_load); + done_lh(&right_lock); + } + } + exit: + JF_CLR(ZJNODE(node), JNODE_SQUEEZABLE); + return ret; +} + +/* Squeeze and allocate the right neighbor. This is called after @left and + its current children have been squeezed and allocated already. This + procedure's job is to squeeze items from @right to @left. + + If at the twig level, extents are allocated as they are shifted from @right + to @left (squeeze_right_twig). + + At every other level, including the leaf level, the shift_everything_left + memcpy-optimized version of shifting is used (squeeze_right_non_twig). + + When unit of internal item is moved, squeezing stops and SUBTREE_MOVED is + returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is + returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL + is returned. 
+*/ + +static int squeeze_right_neighbor(flush_pos_t * pos, znode * left, znode * right) +{ + int ret; + + /* FIXME it is possible to see empty hasn't-heard-banshee node in a + * tree owing to error (for example, ENOSPC) in write */ + /* assert("jmacd-9321", !node_is_empty(left)); */ + assert("jmacd-9322", !node_is_empty(right)); + assert("jmacd-9323", znode_get_level(left) == znode_get_level(right)); + + ON_TRACE(TRACE_FLUSH_VERB, "sq_rn[%u] left %s\n", znode_get_level(left), znode_tostring(left)); + ON_TRACE(TRACE_FLUSH_VERB, "sq_rn[%u] right %s\n", znode_get_level(left), znode_tostring(right)); + + switch (znode_get_level(left)) { + case TWIG_LEVEL: + /* Shift with extent allocating until either an internal item + is encountered or everything is shifted or no free space + left in @left */ + ret = squeeze_right_twig(left, right, pos); + break; + + default: + /* All other levels can use shift_everything until we implement per-item + flush plugins. */ + ret = squeeze_right_non_twig(left, right); + break; + } + + assert("jmacd-2011", (ret < 0 || + ret == SQUEEZE_SOURCE_EMPTY || ret == SQUEEZE_TARGET_FULL || ret == SUBTREE_MOVED)); + + if (ret == SQUEEZE_SOURCE_EMPTY) { + reiser4_stat_inc(flush.squeezed_completely); + } + + ON_TRACE(TRACE_FLUSH_VERB, "sq_rn[%u] returns %s: left %s\n", + znode_get_level(left), + (ret == SQUEEZE_SOURCE_EMPTY) ? "src empty" : + ((ret == SQUEEZE_TARGET_FULL) ? "tgt full" : + ((ret == SUBTREE_MOVED) ? 
"tree moved" : "error")), znode_tostring(left)); + return ret; +} +/* Squeeze @right into the twig pos->lock.node with squeeze_right_twig and reposition pos->coord: a negative result is returned as is; on a positive result pos->coord is placed after the last item of pos->lock.node and the result is returned; on zero pos->coord is placed on the last unit of pos->lock.node and 0 is returned. */ +static int squeeze_right_twig_and_advance_coord (flush_pos_t * pos, znode * right) +{ + int ret; + + ret = squeeze_right_twig(pos->lock.node, right, pos); + if (ret < 0) + return ret; + if (ret > 0) { + coord_init_after_last_item(&pos->coord, pos->lock.node); + return ret; + } + + coord_init_last_unit(&pos->coord, pos->lock.node); + return 0; +} + +#if 0 +/* "prepped" check for parent node without long-term locking it */ +static inline int fast_check_parent_flushprepped (znode * node) +{ + reiser4_tree * tree = current_tree; + int prepped = 1; + + RLOCK_TREE(tree); + + if (node->in_parent.node || !jnode_is_flushprepped(ZJNODE(node))) + prepped = 0; + + RUNLOCK_TREE(tree); + + return prepped; +} +#endif + +/* forward declaration */ +static int squalloc_upper_levels (flush_pos_t *, znode *, znode *); + +/* do a fast check for "same parents" condition before calling + * squalloc_upper_levels(); returns 0 without doing anything when @left and + * @right have the same parent */ +static inline int check_parents_and_squalloc_upper_levels (flush_pos_t * pos, znode *left, znode * right) +{ + if (znode_same_parents(left, right)) + return 0; + + return squalloc_upper_levels(pos, left, right); +} + +/* Check whether the parent of given @right node needs to be processed + ((re)allocated) prior to processing of the child. If @left and @right do not + share a parent, the parent of @right is after @left but before + @right in parent-first order, so we have to (re)allocate it before @right + gets (re)allocated. 
*/ +static int squalloc_upper_levels (flush_pos_t * pos, znode *left, znode * right) +{ + int ret; + + lock_handle left_parent_lock; + lock_handle right_parent_lock; + + load_count left_parent_load; + load_count right_parent_load; + + + init_lh(&left_parent_lock); + init_lh(&right_parent_lock); + + init_load_count(&left_parent_load); + init_load_count(&right_parent_load); + + ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK, 0); + if (ret) + goto out; + + ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK, 0); + if (ret) + goto out; + + /* Check for same parents */ + if (left_parent_lock.node == right_parent_lock.node) + goto out; + + if (znode_check_flushprepped(right_parent_lock.node)) { + /* Keep parent-first order. In the order, the right parent node stands + before the @right node. If it is already allocated, we set the + preceder (next block search start point) to its block number, @right + node should be allocated after it. + + However, preceder is set only if the right parent is on twig level. + The explanation is the following: new branch nodes are allocated over + already allocated children while the tree grows, it is difficult to + keep tree ordered, we assume that only leaves and twigs are correctly + allocated. So, only twigs are used as a preceder for allocating of the + rest of the slum. */ + if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) { + pos->preceder.blk = *znode_get_block(right_parent_lock.node); + check_preceder(pos->preceder.blk); + } + goto out; + } + + ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node); + if (ret) + goto out; + + ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node); + if (ret) + goto out; + + ret = squeeze_right_neighbor(pos, left_parent_lock.node, right_parent_lock.node); + /* We stop if error. We stop if some items/units were shifted (ret == 0) + * and thus @right changed its parent. 
It means we have not process + * right_parent node prior to processing of @right. Positive return + * values say that shifting items did not happen because of "empty + * source" or "target full" conditions. */ + if (ret <= 0) + goto out; + + /* parent(@left) and parent(@right) may have different parents also. We + * do a recursive call for checking that. */ + ret = check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node, right_parent_lock.node); + if (ret) + goto out; + + /* allocate znode when going down */ + ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos); + + out: + done_load_count(&left_parent_load); + done_load_count(&right_parent_load); + + done_lh(&left_parent_lock); + done_lh(&right_parent_lock); + + return ret; +} + +/* Check the leftmost child "flushprepped" status, also returns true if child + * node was not found in cache. A nonzero result of get_leftmost_child_of_unit + * is returned as is. */ +static int leftmost_child_of_unit_check_flushprepped (const coord_t *coord) +{ + int ret; + int prepped; + + jnode * child; + + ret = get_leftmost_child_of_unit(coord, &child); + + if (ret) + return ret; + + if (child) { + prepped = jnode_check_flushprepped(child); + jput(child); + } else { + /* We consider not existing child as a node which slum + processing should not continue to. Not cached node is clean, + so it is flushprepped. 
*/ + prepped = 1; + } + + return prepped; +} + +/* (re)allocate znode with automated getting parent node */ +static int lock_parent_and_allocate_znode (znode * node, flush_pos_t * pos) +{ + int ret; + lock_handle parent_lock; + load_count parent_load; + coord_t pcoord; + + assert ("zam-851", znode_is_write_locked(node)); + + init_lh(&parent_lock); + init_load_count(&parent_load); + + ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK, 0); + if (ret) + goto out; + + ret = incr_load_count_znode(&parent_load, parent_lock.node); + if (ret) + goto out; + + ret = find_child_ptr(parent_lock.node, node, &pcoord); + if (ret) + goto out; + + ret = allocate_znode(node, &pcoord, pos); + + out: + done_load_count(&parent_load); + done_lh(&parent_lock); + return ret; +} + +/* Process formatted nodes on one level (this is the common body of + * handle_pos_on_leaf() and handle_pos_on_internal()) until unformatted node or + * rightmost node in the slum reached. */ +static int handle_pos_on_formatted (flush_pos_t * pos) +{ + int ret; + lock_handle right_lock; + load_count right_load; + + init_lh(&right_lock); + init_load_count(&right_load); + + if (znode_squeezable(pos->lock.node)) { + ret = squeeze_node(pos, pos->lock.node); + if (ret) + return ret; + } + + while (1) { + ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE, ZNODE_WRITE_LOCK); + if (ret) + break; + + /* we don't prep nodes for flushing twice. This can be suboptimal, or it + * can be optimal. For now we choose to live with the risk that it will + * be suboptimal because it would be quite complex to code it to be + * smarter. 
*/ + if (znode_check_flushprepped(right_lock.node)) { + pos_stop(pos); + break; + } + + ret = incr_load_count_znode(&right_load, right_lock.node); + if (ret) + break; + + if (znode_squeezable(right_lock.node)) { + ret = squeeze_node(pos, right_lock.node); + if (ret) + break; + } + + if (node_is_empty(right_lock.node)) { + /* node was squeezed completely, repeat */ + done_load_count(&right_load); + done_lh(&right_lock); + continue; + } + + /* squeeze _before_ going upward. */ + ret = squeeze_right_neighbor(pos, pos->lock.node, right_lock.node); + if (ret < 0) + break; + + if (node_is_empty(right_lock.node)) { + /* repeat if right node was squeezed completely */ + done_load_count(&right_load); + done_lh(&right_lock); + continue; + } + + /* parent(right_lock.node) has to be processed before + * (right_lock.node) due to "parent-first" allocation order. */ + ret = check_parents_and_squalloc_upper_levels(pos, pos->lock.node, right_lock.node); + if (ret) + break; + /* (re)allocate _after_ going upward */ + ret = lock_parent_and_allocate_znode(right_lock.node, pos); + if (ret) + break; + + /* advance the flush position to the right neighbor */ + move_flush_pos(pos, &right_lock, &right_load, NULL); + + ret = rapid_flush(pos); + if (ret) + break; + } + + done_load_count(&right_load); + done_lh(&right_lock); + + /* This function indicates via pos whether to stop or go to twig or continue on current + * level. */ + return ret; + +} + +/* Process nodes on leaf level until unformatted node or rightmost node in the + * slum reached. */ +static int handle_pos_on_leaf (flush_pos_t * pos) +{ + int ret; + + assert ("zam-845", pos->state == POS_ON_LEAF); + + ret = handle_pos_on_formatted(pos); + + if (ret == -E_NO_NEIGHBOR) { + /* cannot get right neighbor, go process extents. 
*/ + pos->state = POS_TO_TWIG; + return 0; + } + + return ret; +} + +/* Process slum on level > 1 */ +static int handle_pos_on_internal (flush_pos_t * pos) +{ + assert ("zam-850", pos->state == POS_ON_INTERNAL); + return handle_pos_on_formatted(pos); +} + +/* check whether squalloc should stop before processing given extent */ +static int squalloc_extent_should_stop (flush_pos_t * pos) +{ + assert("zam-869", item_is_extent(&pos->coord)); + + /* pos->child is a jnode handle_pos_on_extent() should start with + * instead of the first child of the first extent unit. */ + if (pos->child) { + int prepped; + + assert("vs-1383", jnode_is_unformatted(pos->child)); + prepped = jnode_check_flushprepped(pos->child); + pos->pos_in_unit = jnode_get_index(pos->child) - extent_unit_index(&pos->coord); + assert("vs-1470", pos->pos_in_unit < extent_unit_width(&pos->coord)); + assert("nikita-3434", ergo(extent_is_unallocated(&pos->coord), + pos->pos_in_unit == 0)); + jput(pos->child); + pos->child = NULL; + + return prepped; + } + + pos->pos_in_unit = 0; + if (extent_is_unallocated(&pos->coord)) + return 0; + + return leftmost_child_of_unit_check_flushprepped(&pos->coord); +} + +int alloc_extent(flush_pos_t *flush_pos); + +/* Handle the case when regular reiser4 tree (znodes connected one to its + * neighbors by sibling pointers) is interrupted on leaf level by one or more + * unformatted nodes. By having a lock on twig level and using extent code + * routines to process unformatted nodes we swim around an irregular part of + * reiser4 tree. */ +static int handle_pos_on_twig (flush_pos_t * pos) +{ + int ret; + + assert ("zam-844", pos->state == POS_ON_EPOINT); + assert ("zam-843", item_is_extent(&pos->coord)); + + /* We decide should we continue slum processing with current extent + unit: if leftmost child of current extent unit is flushprepped + (i.e. clean or already processed by flush) we stop squalloc(). 
There + is a fast check for unallocated extents which we assume contain all + not flushprepped nodes. */ + /* FIXME: Here we implement simple check, we are only looking on the + leftmost child. */ + ret = squalloc_extent_should_stop(pos); + if (ret != 0) { + pos_stop(pos); + return ret; + } + + while (pos_valid(pos) && coord_is_existing_unit(&pos->coord) && item_is_extent(&pos->coord)) { + ret = alloc_extent(pos); + if (ret) { + break; + } + coord_next_unit(&pos->coord); + } + + if (coord_is_after_rightmost(&pos->coord)) { + pos->state = POS_END_OF_TWIG; + return 0; + } + if (item_is_internal(&pos->coord)) { + pos->state = POS_TO_LEAF; + return 0; + } + + assert ("zam-860", item_is_extent(&pos->coord)); + + /* "slum" is over */ + pos->state = POS_INVALID; + return 0; +} + +/* When we about to return flush position from twig to leaf level we can process + * the right twig node or move position to the leaf. This processes right twig + * if it is possible and jump to leaf level if not. */ +static int handle_pos_end_of_twig (flush_pos_t * pos) +{ + int ret; + lock_handle right_lock; + load_count right_load; + coord_t at_right; + jnode * child = NULL; + + + assert ("zam-848", pos->state == POS_END_OF_TWIG); + assert ("zam-849", coord_is_after_rightmost(&pos->coord)); + + init_lh(&right_lock); + init_load_count(&right_load); + + /* We get a lock on the right twig node even if it is not dirty because + * slum continues or discontinues on leaf level not on next twig. This + * lock on the right twig is needed for getting its leftmost child. */ + ret = reiser4_get_right_neighbor(&right_lock, pos->lock.node, ZNODE_WRITE_LOCK, GN_SAME_ATOM); + if (ret) + goto out; + + ret = incr_load_count_znode(&right_load, right_lock.node); + if (ret) + goto out; + + /* right twig could be not dirty */ + if (znode_check_dirty(right_lock.node)) { + /* If right twig node is dirty we always attempt to squeeze it + * content to the left... 
*/ +became_dirty: + ret = squeeze_right_twig_and_advance_coord(pos, right_lock.node); + if (ret <=0) { + /* pos->coord is on internal item, go to leaf level, or + * we have an error which will be caught in squalloc() */ + pos->state = POS_TO_LEAF; + goto out; + } + + /* If right twig was squeezed completely we have to re-lock + * right twig. now it is done through the top-level squalloc + * routine. */ + if (node_is_empty(right_lock.node)) + goto out; + + /* ... and prep it if it is not yet prepped */ + if (!znode_check_flushprepped(right_lock.node)) { + /* As usual, process parent before ...*/ + ret = check_parents_and_squalloc_upper_levels(pos, pos->lock.node, right_lock.node); + if (ret) + goto out; + + /* ... processing the child */ + ret = lock_parent_and_allocate_znode(right_lock.node, pos); + if (ret) + goto out; + } + } else { + coord_init_first_unit(&at_right, right_lock.node); + + /* check first child of next twig, should we continue there ? */ + ret = get_leftmost_child_of_unit(&at_right, &child); + if (ret || child == NULL || jnode_check_flushprepped(child)) { + pos_stop(pos); + goto out; + } + + /* check clean twig for possible relocation */ + if (!znode_check_flushprepped(right_lock.node)) { + ret = reverse_relocate_check_dirty_parent(child, &at_right, pos); + if (ret) + goto out; + if (znode_check_dirty(right_lock.node)) + goto became_dirty; + } + } + + assert ("zam-875", znode_check_flushprepped(right_lock.node)); + + /* Update the preceder by a block number of just processed right twig + * node. The code above could miss the preceder updating because + * allocate_znode() could not be called for this node. */ + pos->preceder.blk = *znode_get_block(right_lock.node); + check_preceder(pos->preceder.blk); + + coord_init_first_unit(&at_right, right_lock.node); + assert("zam-868", coord_is_existing_unit(&at_right)); + + pos->state = item_is_extent(&at_right) ? 
POS_ON_EPOINT : POS_TO_LEAF; + move_flush_pos(pos, &right_lock, &right_load, &at_right); + + out: + done_load_count(&right_load); + done_lh(&right_lock); + + if (child) + jput(child); + + return ret; +} + +/* Move the pos->lock to leaf node pointed by pos->coord, check should we + * continue there. */ +static int handle_pos_to_leaf (flush_pos_t * pos) +{ + int ret; + lock_handle child_lock; + load_count child_load; + jnode * child; + + assert ("zam-846", pos->state == POS_TO_LEAF); + assert ("zam-847", item_is_internal(&pos->coord)); + + init_lh(&child_lock); + init_load_count(&child_load); + + ret = get_leftmost_child_of_unit(&pos->coord, &child); + if (ret) + return ret; + if (child == NULL) { + pos_stop(pos); + return 0; + } + + if (jnode_check_flushprepped(child)) { + pos->state = POS_INVALID; + goto out; + } + + ret = longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI); + if (ret) + goto out; + + ret = incr_load_count_znode(&child_load, JZNODE(child)); + if (ret) + goto out; + + ret = allocate_znode(JZNODE(child), &pos->coord, pos); + if (ret) + goto out; + + /* move flush position to leaf level */ + pos->state = POS_ON_LEAF; + move_flush_pos(pos, &child_lock, &child_load, NULL); + + out: + done_load_count(&child_load); + done_lh(&child_lock); + jput(child); + + return ret; +} +/* move pos from leaf to twig, and move lock from leaf to twig. 
*/ +/* Move pos->lock to upper (twig) level */ +static int handle_pos_to_twig (flush_pos_t * pos) +{ + int ret; + + lock_handle parent_lock; + load_count parent_load; + coord_t pcoord; + + assert ("zam-852", pos->state == POS_TO_TWIG); + + init_lh(&parent_lock); + init_load_count(&parent_load); + + ret = reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK, 0); + if (ret) + goto out; + + ret = incr_load_count_znode(&parent_load, parent_lock.node); + if (ret) + goto out; + + ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord); + if (ret) + goto out; + + assert ("zam-870", item_is_internal(&pcoord)); + coord_next_item(&pcoord); + + if (coord_is_after_rightmost(&pcoord)) + pos->state = POS_END_OF_TWIG; + else if (item_is_extent(&pcoord)) + pos->state = POS_ON_EPOINT; + else { + /* Here we understand that getting -E_NO_NEIGHBOR in + * handle_pos_on_leaf() was just because the edge of the + * slum was reached */ + pos_stop(pos); + goto out; + } + + move_flush_pos(pos, &parent_lock, &parent_load, &pcoord); + + out: + done_load_count(&parent_load); + done_lh(&parent_lock); + + return ret; +} + +typedef int (*pos_state_handle_t)(flush_pos_t*); +static pos_state_handle_t flush_pos_handlers[] = { + /* process formatted nodes on leaf level, keep lock on a leaf node */ + [POS_ON_LEAF] = handle_pos_on_leaf, + /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently + * being processed */ + [POS_ON_EPOINT] = handle_pos_on_twig, + /* move a lock from leaf node to its parent for further processing of unformatted nodes */ + [POS_TO_TWIG] = handle_pos_to_twig, + /* move a lock from twig to leaf level when a processing of unformatted nodes finishes, + * pos->coord points to the leaf node we jump to */ + [POS_TO_LEAF] = handle_pos_to_leaf, + /* after processing last extent in the twig node, attempting to shift items from the twigs + * right neighbor and process them while shifting */ + [POS_END_OF_TWIG] = handle_pos_end_of_twig, + 
/* process formatted nodes on internal level, keep lock on an internal node */ + [POS_ON_INTERNAL] = handle_pos_on_internal +}; + +/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze, + * encrypt) nodes and their ancestors in "parent-first" order. Dispatches on pos->state + * through flush_pos_handlers while pos_valid(pos) holds. */ +static int squalloc (flush_pos_t * pos) +{ + int ret = 0; + + /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for + * greater CPU efficiency? Measure and see.... -Hans */ + while (pos_valid(pos)) { + ret = flush_pos_handlers[pos->state](pos); + if (ret < 0) + break; + + ret = rapid_flush(pos); + if (ret) + break; + } + + /* any positive value or -E_NO_NEIGHBOR are legal return codes for handle_pos* + routines, -E_NO_NEIGHBOR means that slum edge was reached */ + if (ret > 0 || ret == -E_NO_NEIGHBOR) + ret = 0; + + return ret; +} +/* Set the left delimiting key of @left to the leftmost key found in the node, inside UNDER_RW_VOID(dk, ..., write, ...); does nothing when @left is empty. */ +static void update_ldkey(znode * left) +{ + reiser4_key ldkey; + + if (node_is_empty(left)) + return; + + UNDER_RW_VOID(dk, znode_get_tree(left), write, + znode_set_ld_key(left, + leftmost_key_in_node(left, &ldkey))); +} + +/* Shift as much as possible from @right to @left using the memcpy-optimized + shift_everything_left. @left and @right are formatted neighboring nodes on + leaf level. 
*/
+/* Result of squeeze_right_non_twig(), as visible from the code below:
+   - SQUEEZE_TARGET_FULL when either node is not dirty (nothing is attempted);
+   - whatever carry() returns when at least something was shifted;
+   - SQUEEZE_SOURCE_EMPTY or SQUEEZE_TARGET_FULL when nothing was shifted,
+     depending on whether @right is empty. */
+static int
+squeeze_right_non_twig(znode * left, znode * right)
+{
+	int ret;
+	carry_pool pool;
+	carry_level todo;
+	int old_items;
+#if REISER4_STATS
+	int old_free_space;
+#endif
+	reiser4_tree * tree;
+
+	assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
+
+	if (!znode_is_dirty(left) || !znode_is_dirty(right))
+		return SQUEEZE_TARGET_FULL;
+
+	init_carry_pool(&pool);
+	init_carry_level(&todo, &pool);
+
+	old_items = node_num_items(left);
+#if REISER4_STATS
+	/* Sample the free space BEFORE shifting: sampling it afterwards (as was
+	   done previously) makes the "squeezed bytes" delta below always zero. */
+	old_free_space = znode_free_space(left);
+#endif
+	ret = shift_everything_left(right, left, &todo);
+
+#if REISER4_STATS
+	/* FIXME-VS: urgently added squeeze statistics */
+	if (znode_get_level(left) == LEAF_LEVEL) {
+		reiser4_stat_inc(flush.squeezed_leaves);
+		reiser4_stat_add(flush.squeezed_leaf_items, node_num_items(left) - old_items);
+		reiser4_stat_add(flush.squeezed_leaf_bytes, old_free_space - znode_free_space(left));
+	}
+#endif
+
+	tree = znode_get_tree(left);
+	update_ldkey(left);
+	UNDER_RW_VOID(dk, tree, write, update_znode_dkeys(left, right));
+
+	if (ret > 0) {
+		reiser4_block_nr amount;
+		int grabbed;
+
+		/* Carry is called to update delimiting key or to remove empty
+		   node. */
+		ON_STATS(todo.level_no = znode_get_level(left) + 1);
+
+#if 0
+		/* FIXME-VS: this looks superfluous: nodes participating in shift (@left and @right) are dirty
+		   already. If @right does not fit entirely into @left - the only one block is to be reserved - their
+		   common parent which contains delimiting key. If @right fits into @left entirely - the number of nodes
+		   changed on higher levels is also one: that will be the node which holds the pointer to @right. Note
+		   that if direct parent of @right contains only one pointer and gets deleted as well - number of nodes
+		   to be changed on higher levels is still 1 */
+		amount = estimate_internal_amount(2,
+						  get_current_super_private()->tree.height);
+#endif
+		/* Reserve one block per tree level; the grabbed-blocks mark is
+		   remembered so the unused part can be returned after carry(). */
+		amount = left->zjnode.tree->height;
+		grabbed = get_current_context()->grabbed_blocks;
+		ret = reiser4_grab_space_force(amount, BA_RESERVED);
+		if (ret != 0) {
+			reiser4_panic("nikita-3003",
+				      "Reserved space is exhausted. Ask Hans.");
+			done_carry_pool(&pool);
+			return ret;
+		}
+
+		ret = carry(&todo, NULL /* previous level */ );
+		grabbed2free_mark(grabbed);
+	} else {
+		/* Shifting impossible, we return appropriate result code */
+		ret = node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY : SQUEEZE_TARGET_FULL;
+	}
+
+	done_carry_pool(&pool);
+	return ret;
+}
+
+/* Shift first unit of first item if it is an internal one.  Return
+   SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
+   SUBTREE_MOVED.  A negative error code is returned when space reservation
+   or carry() fails.  The carry pool is released on every exit path. */
+static int
+shift_one_internal_unit(znode * left, znode * right)
+{
+	int ret;
+	carry_pool pool;
+	carry_level todo;
+	coord_t coord;
+	int size, moved;
+	carry_plugin_info info;
+
+	assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
+	assert("nikita-2435", znode_is_write_locked(left));
+	assert("nikita-2436", znode_is_write_locked(right));
+
+	if (REISER4_DEBUG) {
+		RLOCK_TREE(znode_get_tree(left));
+		assert("nikita-2434", left->right == right);
+		RUNLOCK_TREE(znode_get_tree(left));
+	}
+
+	coord_init_first_unit(&coord, right);
+
+	if (REISER4_DEBUG && !node_is_empty(left)) {
+		coord_t last;
+		ON_DEBUG(reiser4_key right_key);
+		ON_DEBUG(reiser4_key left_key);
+
+		coord_init_last_unit(&last, left);
+
+		assert("nikita-2463",
+		       keyle(item_key_by_coord(&last, &left_key), item_key_by_coord(&coord, &right_key)));
+	}
+
+	assert("jmacd-2007", item_is_internal(&coord));
+
+	init_carry_pool(&pool);
+	init_carry_level(&todo, &pool);
+
+	size = item_length_by_coord(&coord);
+	info.todo = &todo;
+	info.doing = NULL;
+
+	ret = node_plugin_by_node(left)->shift(&coord, left, SHIFT_LEFT, 1
+					       /* delete @right if it becomes empty */
+					       , 0 /* move coord */ ,
+					       &info);
+
+	/* If shift returns positive, then we shifted the item. */
+	assert("vs-423", ret <= 0 || size == ret);
+	moved = (ret > 0);
+
+	if (moved) {
+		int grabbed;
+
+		/* Grabbing two blocks for left and right neighbours */
+		/*
+		 * FIXME-VS: left and right are involved into flush that means that they were modified already and
+		 * therefore space for their change was reserved already. What we have to reserve here is space for
+		 * updating delimiting keys after shifting
+		 */
+		grabbed = get_current_context()->grabbed_blocks;
+		ret = reiser4_grab_space_force((__u64)(left->zjnode.tree->height), BA_RESERVED);
+		if (ret != 0) {
+			/* Release the carry pool here as well; this early return
+			   used to skip the done_carry_pool() call below. */
+			done_carry_pool(&pool);
+			return ret;
+		}
+
+		znode_make_dirty(left);
+		znode_make_dirty(right);
+		update_ldkey(left);
+		UNDER_RW_VOID(dk, znode_get_tree(left), write, update_znode_dkeys(left, right));
+
+		ON_STATS(todo.level_no = znode_get_level(left) + 1);
+		ret = carry(&todo, NULL /* previous level */ );
+		grabbed2free_mark(grabbed);
+	}
+
+	ON_TRACE(TRACE_FLUSH_VERB,
+		 "shift_one %s an item: left has %u items, right has %u items\n",
+		 moved > 0 ? "moved" : "did not move", node_num_items(left), node_num_items(right));
+
+	done_carry_pool(&pool);
+
+	if (ret != 0) {
+		/* Shift or carry operation failed. */
+		assert("jmacd-7325", ret < 0);
+		return ret;
+	}
+
+	return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
+}
+
+/* ALLOCATE INTERFACE */
+/* Audited by: umka (2002.06.11) */
+/* Store the block number *@blocknr in @node.  Both pointers must be non-NULL. */
+reiser4_internal void
+jnode_set_block(jnode * node /* jnode to update */ ,
+		const reiser4_block_nr * blocknr /* new block nr */ )
+{
+	assert("nikita-2020", node != NULL);
+	assert("umka-055", blocknr != NULL);
+	assert("zam-819", ergo(JF_ISSET(node, JNODE_EFLUSH), node->blocknr == 0));
+	assert("vs-1453", ergo(JF_ISSET(node, JNODE_EFLUSH), jnode_is_unformatted(node)));
+	node->blocknr = *blocknr;
+}
+
+/* Make the final relocate/wander decision during forward parent-first squalloc for a
+   znode.  For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation().
+
+   Returns 0 once the decision is made and pos->preceder.blk has been updated to
+   the node's (possibly new) block, or the error returned by
+   allocate_znode_update().  -ENOSPC from the "closer block" attempt is not an
+   error: it only means no closer position was found. */
+static int
+allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
+{
+	int ret;
+	reiser4_super_info_data * sbinfo = get_current_super_private();
+	/* FIXME(D): We have the node write-locked and should have checked for !
+	   allocated() somewhere before reaching this point, but there can be a race, so
+	   this assertion is bogus. */
+	assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
+	assert("jmacd-7988", znode_is_write_locked(node));
+	assert("jmacd-7989", coord_is_invalid(parent_coord)
+	       || znode_is_write_locked(parent_coord->node));
+
+	if (ZF_ISSET(node, JNODE_REPACK) || znode_created(node) || znode_is_root(node) ||
+	    /* We have enough nodes to relocate no matter what. */
+	    (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL))
+	{
+		/* No need to decide with new nodes, they are treated the same as
+		   relocate. If the root node is dirty, relocate. */
+		if (pos->preceder.blk == 0) {
+			/* preceder is unknown and we have decided to relocate node --
+			   using of default value for search start is better than search
+			   from block #0. */
+			get_blocknr_hint_default(&pos->preceder.blk);
+			reiser4_stat_inc(block_alloc.nohint);
+			check_preceder(pos->preceder.blk);
+		}
+
+		/* Jumps into the innermost else-branch below; none of the locals
+		   declared on the way there are used after the label. */
+		goto best_reloc;
+
+	} else if (pos->preceder.blk == 0) {
+		/* If we don't know the preceder, leave it where it is. */
+		jnode_make_wander(ZJNODE(node));
+	} else {
+		/* Make a decision based on block distance. */
+		reiser4_block_nr dist;
+		reiser4_block_nr nblk = *znode_get_block(node);
+
+		assert("jmacd-6172", !blocknr_is_fake(&nblk));
+		assert("jmacd-6173", !blocknr_is_fake(&pos->preceder.blk));
+		assert("jmacd-6174", pos->preceder.blk != 0);
+
+		if (pos->preceder.blk == nblk - 1) {
+			/* Ideal. */
+			jnode_make_wander(ZJNODE(node));
+		} else {
+
+			dist = (nblk < pos->preceder.blk) ? (pos->preceder.blk - nblk) : (nblk - pos->preceder.blk);
+
+			/* See if we can find a closer block (forward direction only). */
+			pos->preceder.max_dist = min((reiser4_block_nr)sbinfo->flush.relocate_distance, dist);
+			pos->preceder.level = znode_get_level(node);
+
+			ret = allocate_znode_update(node, parent_coord, pos);
+
+			pos->preceder.max_dist = 0;
+
+			if (ret && (ret != -ENOSPC))
+				return ret;
+
+			if (ret == 0) {
+				/* Got a better allocation. */
+				znode_make_reloc(node, pos->fq);
+			} else if (dist < sbinfo->flush.relocate_distance) {
+				/* The present allocation is good enough. */
+				jnode_make_wander(ZJNODE(node));
+			} else {
+				/* Otherwise, try to relocate to the best position. */
+			best_reloc:
+				ret = allocate_znode_update(node, parent_coord, pos);
+				if (ret != 0)
+					return ret;
+
+				/* set JNODE_RELOC bit _after_ node gets allocated */
+				znode_make_reloc(node, pos->fq);
+			}
+		}
+	}
+
+	/* This is the new preceder. */
+	pos->preceder.blk = *znode_get_block(node);
+	check_preceder(pos->preceder.blk);
+	pos->alloc_cnt += 1;
+
+	assert ("jmacd-4277", !blocknr_is_fake(&pos->preceder.blk));
+
+	return 0;
+}
+
+/* A subroutine of allocate_znode, this is called first to see if there is a close
+   position to relocate to.
It may return -ENOSPC if there is no close position.  If there
+   is no close position it may not relocate.  This takes care of updating the parent node
+   with the relocated block address.
+
+   @node:         znode being given a new block;
+   @parent_coord: position of @node's pointer in its parent, updated through the
+                  parent item plugin's ->f.update (not used when @node is the root);
+   @pos:          flush position; pos->preceder is the allocation hint.
+
+   Returns 0 on success or the first non-zero result of zload(), space
+   reservation, block allocation, deferred deallocation, get_uber_znode() or
+   znode_rehash(). */
+static int
+allocate_znode_update(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
+{
+	int ret;
+	reiser4_block_nr blk;
+	reiser4_block_nr len = 1;
+	lock_handle uber_lock;
+	int flush_reserved_used = 0;
+	int grabbed;
+
+	init_lh(&uber_lock);
+
+	/* Remember the grabbed-blocks mark; it is handed to grabbed2free_mark()
+	   at exit. */
+	grabbed = get_current_context()->grabbed_blocks;
+
+	/* discard e-flush allocation */
+	ret = zload(node);
+	if (ret)
+		return ret;
+
+	if (ZF_ISSET(node, JNODE_CREATED)) {
+		assert ("zam-816", blocknr_is_fake(znode_get_block(node)));
+		pos->preceder.block_stage = BLOCK_UNALLOCATED;
+	} else {
+		pos->preceder.block_stage = BLOCK_GRABBED;
+
+		/* The disk space for relocating the @node is already reserved in "flush reserved"
+		 * counter if @node is leaf, otherwise we grab space using BA_RESERVED (means grab
+		 * space from whole disk not from only 95%). */
+		if (znode_get_level(node) == LEAF_LEVEL) {
+			/*
+			 * earlier (during do_jnode_make_dirty()) we decided
+			 * that @node can possibly go into overwrite set and
+			 * reserved block for its wandering location.
+			 */
+			txn_atom * atom = get_current_atom_locked();
+			assert("nikita-3449",
+			       ZF_ISSET(node, JNODE_FLUSH_RESERVED));
+			flush_reserved2grabbed(atom, (__u64)1);
+			spin_unlock_atom(atom);
+			/*
+			 * we are trying to move node into relocate
+			 * set. Allocation of relocated position "uses"
+			 * reserved block.
+			 */
+			ZF_CLR(node, JNODE_FLUSH_RESERVED);
+			flush_reserved_used = 1;
+		} else {
+			ret = reiser4_grab_space_force((__u64)1, BA_RESERVED);
+			if (ret != 0)
+				goto exit;
+		}
+	}
+
+	/* We may not use the 5% of reserved disk space here, so flush will not pack tightly. */
+	ret = reiser4_alloc_blocks(&pos->preceder, &blk, &len,
+				   BA_FORMATTED | BA_PERMANENT);
+	if(ret) {
+		/* Get flush reserved block back if allocation fails. */
+		if (flush_reserved_used) {
+			/*
+			 * ok, we failed to move node into relocate
+			 * set. Restore status quo.
+			 */
+			grabbed2flush_reserved((__u64)1);
+			ZF_SET(node, JNODE_FLUSH_RESERVED);
+		}
+		goto exit;
+	}
+
+
+	/* A node that already had a real block gives the old one back (deferred).
+	   NOTE(review): if this fails, the freshly allocated @blk is not released
+	   and the flush-reserved state is not restored -- confirm this is intended. */
+	if (!ZF_ISSET(node, JNODE_CREATED) &&
+	    (ret = reiser4_dealloc_block(znode_get_block(node), 0, BA_DEFER)))
+		goto exit;
+
+	if (likely(!znode_is_root(node))) {
+		item_plugin *iplug;
+
+		/* Store the new block number in the parent's pointer to @node. */
+		iplug = item_plugin_by_coord(parent_coord);
+		assert("nikita-2954", iplug->f.update != NULL);
+		iplug->f.update(parent_coord, &blk);
+
+		znode_make_dirty(parent_coord->node);
+
+	} else {
+		reiser4_tree *tree = znode_get_tree(node);
+		znode *uber;
+
+		/* We take a longterm lock on the fake node in order to change
+		   the root block number.  This may cause atom fusion. */
+		ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
+				     &uber_lock);
+		/* The fake node cannot be deleted, and we must have priority
+		   here, and may not be confused with ENOSPC. */
+		assert("jmacd-74412",
+		       ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
+
+		if (ret)
+			goto exit;
+
+		uber = uber_lock.node;
+
+		UNDER_RW_VOID(tree, tree, write, tree->root_block = blk);
+
+		znode_make_dirty(uber);
+	}
+
+	ret = znode_rehash(node, &blk);
+exit:
+	/* Common exit for everything after the successful zload(). */
+	zrelse(node);
+	done_lh(&uber_lock);
+	grabbed2free_mark(grabbed);
+	return ret;
+}
+
+/* JNODE INTERFACE */
+
+/* Lock a node (if formatted) and then get its parent locked, set the child's
+   coordinate in the parent.  If the child is the root node, the above_root
+   znode is returned but the coord is not set.  This function may cause atom
+   fusion, but it is only used for read locks (at this point) and therefore
+   fusion only occurs when the parent is already dirty. */
+/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
+   pointer in jnodes.
*/
+/* Parameters, as used by the code below:
+   @node:        child jnode (unformatted, or a znode that is already locked);
+   @coord:       receives the child's position in the parent; must be non-NULL
+                 for a non-znode child, may be NULL for a znode child;
+   @parent_lh:   lock handle that receives the lock on the parent;
+   @parent_zh:   load count taken on the parent node;
+   @parent_mode: lock mode requested for the parent;
+   @try:         when non-zero, GN_TRY_LOCK is added for the znode case (it is
+                 not consulted for non-znode children).
+
+   Returns 0 on success.  For a non-znode child, CBK_COORD_NOTFOUND and other
+   coord_by_key() results are passed through, and -ENOENT is returned when the
+   jnode has JNODE_HEARD_BANSHEE set or the found coord is not AT_UNIT. */
+static int
+jnode_lock_parent_coord(jnode * node,
+			coord_t * coord,
+			lock_handle * parent_lh,
+			load_count * parent_zh,
+			znode_lock_mode parent_mode,
+			int try)
+{
+	int ret;
+
+	assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
+	assert("edward-54", jnode_is_unformatted(node) || znode_is_any_locked(JZNODE(node)));
+
+	if (!jnode_is_znode(node)) {
+		reiser4_key key;
+		tree_level stop_level = TWIG_LEVEL ;
+		lookup_bias bias = FIND_EXACT;
+
+		assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
+
+		/* The case when node is not znode, but can have parent coord
+		   (unformatted node, node which represents cluster page,
+		   etc..).  Generate a key for the appropriate entry, search
+		   in the tree using coord_by_key, which handles locking for
+		   us. */
+
+		/*
+		 * nothing is locked at this moment, so, nothing prevents
+		 * concurrent truncate from removing jnode from inode. To
+		 * prevent this spin-lock jnode. jnode can be truncated just
+		 * after call to the jnode_build_key(), but this is ok,
+		 * because coord_by_key() will just fail to find appropriate
+		 * extent.
+		 */
+		LOCK_JNODE(node);
+		if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
+			jnode_build_key(node, &key);
+			ret = 0;
+		} else
+			ret = RETERR(-ENOENT);
+		UNLOCK_JNODE(node);
+
+		if (ret != 0)
+			return ret;
+
+		/* Cluster pages are looked up on the leaf level, everything else
+		   on the twig level. */
+		if (jnode_is_cluster_page(node))
+			stop_level = LEAF_LEVEL;
+
+		assert("jmacd-1812", coord != NULL);
+
+		ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
+				   parent_mode, bias, stop_level, stop_level, CBK_UNIQUE, 0/*ra_info*/);
+		switch (ret) {
+		case CBK_COORD_NOTFOUND:
+			if (jnode_is_cluster_page(node)) {
+				int result;
+				assert("edward-164", jnode_page(node) != NULL);
+				assert("edward-165", jnode_page(node)->mapping != NULL);
+				assert("edward-166", jnode_page(node)->mapping->host != NULL);
+				assert("edward-167", inode_get_flag(jnode_page(node)->mapping->host, REISER4_CLUSTER_KNOWN));
+				/* jnode of a new cluster which is not represented by any items in the tree. */
+				result = incr_load_count_znode(parent_zh, parent_lh->node);
+				if (result != 0)
+					return result;
+				coord->between = AFTER_ITEM;
+			} else if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
+				warning("nikita-3177", "Parent not found");
+				print_jnode("node", node);
+			}
+			/* CBK_COORD_NOTFOUND is still reported to the caller, even
+			   when the parent was loaded for a cluster page above. */
+			return ret;
+		case CBK_COORD_FOUND:
+			if (coord->between != AT_UNIT) {
+				/* FIXME: comment needed */
+				done_lh(parent_lh);
+				if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
+					warning("nikita-3178",
+						"Found but not happy: %i",
+						coord->between);
+					print_jnode("node", node);
+				}
+				return RETERR(-ENOENT);
+			}
+			ret = incr_load_count_znode(parent_zh, parent_lh->node);
+			if (ret != 0)
+				return ret;
+			break;
+		default:
+			return ret;
+		}
+
+	} else {
+		int flags;
+		znode *z;
+
+		z = JZNODE(node);
+		/* Formatted node case: */
+		assert("jmacd-2061", !znode_is_root(z));
+
+		flags = GN_ALLOW_NOT_CONNECTED;
+		if (try)
+			flags |= GN_TRY_LOCK;
+
+		ret = reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
+		if (ret != 0)
+			/* -E_REPEAT is ok here, it is handled by the caller. */
+			return ret;
+
+		/* Make the child's position "hint" up-to-date.  (Unless above
+		   root, which caller must check.) */
+		if (coord != NULL) {
+
+			/* NOTE(review): on the two failure returns below @parent_lh
+			   is still held; presumably the caller releases it. */
+			ret = incr_load_count_znode(parent_zh, parent_lh->node);
+			if (ret != 0) {
+				warning("jmacd-976812386", "incr_load_count_znode failed: %d", ret);
+				return ret;
+			}
+
+			ret = find_child_ptr(parent_lh->node, z, coord);
+			if (ret != 0) {
+				warning("jmacd-976812", "find_child_ptr failed: %d", ret);
+				return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
+   If there is no next neighbor or the neighbor is not in memory or if there is a
+   neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned.
*/
+static int
+neighbor_in_slum(znode * node,		/* starting point */
+		 lock_handle * lock,	/* handle that receives the lock on the neighbor */
+		 sideof side,		/* left or right direction we seek the next node in */
+		 znode_lock_mode mode	/* kind of lock we want */ )
+{
+	int gn_flags = GN_SAME_ATOM;
+	int result;
+
+	assert("jmacd-6334", znode_is_connected(node));
+
+	if (side == LEFT_SIDE)
+		gn_flags |= GN_GO_LEFT;
+
+	result = reiser4_get_neighbor(lock, node, mode, gn_flags);
+
+	/* May return -ENOENT or -E_NO_NEIGHBOR; -ENOENT is reported to the
+	   caller as -E_NO_NEIGHBOR. */
+	/* FIXME(C): check EINVAL, E_DEADLOCK */
+	if (result == -ENOENT)
+		return RETERR(-E_NO_NEIGHBOR);
+	if (result != 0)
+		return result;
+
+	/* The neighbor is locked now, so its dirty bit can be tested without
+	   races.  A clean neighbor is unlocked again and not reported. */
+	if (!znode_check_dirty(lock->node)) {
+		done_lh(lock);
+		return RETERR(-E_NO_NEIGHBOR);
+	}
+
+	return 0;
+}
+
+/* Tell whether znodes @a and @b hang off the same parent.  Both must be
+   write-locked by the caller (they are about to be squeezed). */
+static int
+znode_same_parents(znode * a, znode * b)
+{
+	int same;
+
+	assert("jmacd-7011", znode_is_write_locked(a));
+	assert("jmacd-7012", znode_is_write_locked(b));
+
+	/* The parent pointers are compared under the tree read lock.  Hans'
+	 * original remark: "I really don't like whole tree locks". */
+	same = UNDER_RW(tree, znode_get_tree(a), read,
+			(znode_parent(a) == znode_parent(b)));
+	return same;
+}
+
+/* FLUSH SCAN */
+
+/* Bring a flush_scan object into its initial state: all fields zeroed, lock
+   handles and load counts initialized, parent coord invalid. */
+static void
+scan_init(flush_scan * scan)
+{
+	memset(scan, 0, sizeof *scan);
+
+	/* state tracking the current node */
+	init_lh(&scan->node_lock);
+	init_load_count(&scan->node_load);
+
+	/* state tracking the current node's parent */
+	init_lh(&scan->parent_lock);
+	init_load_count(&scan->parent_load);
+	coord_init_invalid(&scan->parent_coord, NULL);
+}
+
+/* Release any resources held by the flush scan, e.g., release locks, free memory, etc.
*/
+static void
+scan_done(flush_scan * scan)
+{
+	/* current node: drop the load count first, then the reference */
+	done_load_count(&scan->node_load);
+	if (scan->node) {
+		jput(scan->node);
+		scan->node = NULL;
+	}
+
+	/* parent load count, then both long term locks */
+	done_load_count(&scan->parent_load);
+	done_lh(&scan->parent_lock);
+	done_lh(&scan->node_lock);
+}
+
+/* A scan is finished when it was stopped explicitly, or when a rightward scan
+   has counted max_count nodes. */
+reiser4_internal int
+scan_finished(flush_scan * scan)
+{
+	if (scan->stop)
+		return 1;
+
+	return scan->direction == RIGHT_SIDE && scan->count >= scan->max_count;
+}
+
+/* Decide whether the scan may advance to @tonode, i.e. whether @tonode passes
+   same_slum_check() against the current node.  When it does not, the scan is
+   marked stopped and the caller's reference on @tonode is dropped. */
+reiser4_internal int
+scan_goto(flush_scan * scan, jnode * tonode)
+{
+	int same_slum = same_slum_check(scan->node, tonode, 1, 0);
+
+	if (same_slum)
+		return same_slum;
+
+	scan->stop = 1;
+	ON_TRACE(TRACE_FLUSH_VERB,
+		 "flush %s scan stop: stop at node %s\n",
+		 scanning_left(scan) ? "left" : "right", jnode_tostring(scan->node));
+	ON_TRACE(TRACE_FLUSH_VERB,
+		 "flush %s scan stop: do not cont at %s\n",
+		 scanning_left(scan) ? "left" : "right", jnode_tostring(tonode));
+	jput(tonode);
+
+	return 0;
+}
+
+/* Make @node the scan's current node.  The previous current node (if any) is
+   unloaded and dereferenced, @add_count is added to the scan count (it may
+   include skipped unallocated nodes), and @parent, when given, is copied into
+   the scan's parent coordinate.  The reference on @node is the caller's and is
+   kept by the scan even if loading the node fails. */
+reiser4_internal int
+scan_set_current(flush_scan * scan, jnode * node, unsigned add_count, const coord_t * parent)
+{
+	/* Let go of the old current node. */
+	done_load_count(&scan->node_load);
+	if (scan->node)
+		jput(scan->node);
+
+	/* Install the new one. */
+	scan->node = node;
+	scan->count += add_count;
+
+	/* Copying the coord on every step is somewhat wasteful: scan_extent_coord
+	   could update parent_coord once at the end.  It used to, but that was
+	   buggy and this was the simplest correct fix. */
+	if (parent)
+		coord_dup(&scan->parent_coord, parent);
+
+	/* This may fail, but the reference is already safely recorded above. */
+	return incr_load_count_jnode(&scan->node_load, node);
+}
+
+/* Non-zero when the scan proceeds leftward. */
+reiser4_internal int
+scanning_left(flush_scan * scan)
+{
+	return (scan->direction == LEFT_SIDE) ? 1 : 0;
+}
+
+/* Performs leftward scanning starting from either kind of node.  Counts the starting
+   node.  The right-scan object is passed in for the left-scan in order to copy the parent
+   of an unformatted starting position.  This way we avoid searching for the unformatted
+   node's parent when scanning in each direction.  If we search for the parent once it is
+   set in both scan objects.  The limit parameter tells flush-scan when to stop.
+
+   Rapid scanning is used only during scan_left, where we are interested in finding the
+   'leftpoint' where we begin flushing.  We are interested in stopping at the left child
+   of a twig that does not have a dirty left neighbor.  THIS IS A SPECIAL CASE.  The
+   problem is finding a way to flush only those nodes without unallocated children, and it
+   is difficult to solve in the bottom-up flushing algorithm we are currently using.  The
+   problem can be solved by scanning left at every level as we go upward, but this would
+   basically bring us back to using a top-down allocation strategy, which we already tried
+   (see BK history from May 2002), and has a different set of problems.  The top-down
+   strategy makes avoiding unallocated children easier, but makes it difficult to
+   properly flush dirty children with clean parents that would otherwise stop the
+   top-down flush, only later to dirty the parent once the children are flushed.  So we
+   solve the problem in the bottom-up algorithm with a special case for twigs and leaves
+   only.
+
+   The first step in solving the problem is this rapid leftward scan.
After we determine
+   that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we
+   are no longer interested in the exact count, we are only interested in finding the
+   best place to start the flush.  We could choose one of two possibilities:
+
+   1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
+   This requires checking one leaf per rapid-scan twig
+
+   2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
+   to the left.  This requires checking possibly all of the in-memory children of each
+   twig during the rapid scan.
+
+   For now we implement the first policy.
+*/
+static int
+scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
+{
+	int result;
+
+	scan->direction = LEFT_SIDE;
+	scan->max_count = limit;
+
+	/* The starting node is counted (add_count == 1) and gets its own
+	   reference via jref(). */
+	result = scan_set_current(scan, jref(node), 1, NULL);
+	if (result == 0)
+		result = scan_common(scan, right);
+	if (result != 0)
+		return result;
+
+	/* Rapid scanning needs the parent of scan->node, which in turn requires
+	   scan->node to be locked; only formatted nodes can be locked.  Rapid_scan
+	   itself would go after the lock (with limit set to
+	   FLUSH_RELOCATE_THRESHOLD). */
+	if (!jnode_is_znode(scan->node))
+		return 0;
+
+	return longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
+				   ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
+}
+
+/* Performs rightward scanning...  Does not count the starting node.  The limit parameter
+   is described in scan_left.  If the starting node is unformatted then the
+   parent_coord was already set during scan_left.  The rapid_after parameter is not used
+   during right-scanning.
+
+   scan_right is only called if the scan_left operation does not count at least
+   FLUSH_RELOCATE_THRESHOLD nodes for flushing.  Otherwise, the limit parameter is set to
+   the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
+   scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops.
*/
+static int
+scan_right(flush_scan * scan, jnode * node, unsigned limit)
+{
+	int result;
+
+	scan->direction = RIGHT_SIDE;
+	scan->max_count = limit;
+
+	/* The starting node is referenced but not counted (add_count == 0). */
+	result = scan_set_current(scan, jref(node), 0, NULL);
+	if (result != 0)
+		return result;
+
+	/* No "other" scan object is passed for the rightward scan. */
+	return scan_common(scan, NULL);
+}
+
+/* Scanning logic shared by scan_left() and scan_right().  @other is handed
+   through to scan_unformatted(), which may copy the parent
+   coord/lock/load it finds into it.  Returns 0 or the first error. */
+static int
+scan_common(flush_scan * scan, flush_scan * other)
+{
+	int result = 0;
+
+	assert("nikita-2376", scan->node != NULL);
+	assert("edward-54", jnode_is_unformatted(scan->node) || jnode_is_znode(scan->node));
+
+	/* Starting at an unformatted node is special: finding its parent needs a
+	   tree traversal, which we want to do only once for both directions.  So
+	   when the parent is looked up during scan-left, the coord/lock/load are
+	   duplicated into the scan-right object. */
+	if (jnode_is_unformatted(scan->node))
+		result = scan_unformatted(scan, other);
+
+	/* From here on the scan is expected to sit at a formatted position; keep
+	   chaining formatted regions until the scan finishes or fails. */
+	while (result == 0 && !scan_finished(scan))
+		result = scan_formatted(scan);
+
+	return result;
+}
+
+/* called by scan_unformatted() when jnode_lock_parent_coord
+   returns COORD_NOT_FOUND.
+*/
+/* Returns 1 when the current scan node is a cluster page (the scan may go on
+   although no parent item was found), 0 otherwise. */
+static int
+scan_should_link_node(flush_scan * scan)
+{
+	assert("edward-311", scan->node != NULL);
+	if (jnode_is_cluster_page(scan->node)) {
+
+		assert("edward-303", scan->parent_coord.between != EMPTY_NODE);
+		return 1;
+	}
+	return 0;
+}
+
+/* Make sure scan->parent_coord (with parent lock and load) is set for the
+   current scan node, then continue with scan_by_coord().  When the parent
+   coord is already valid nothing is looked up.  For an unformatted current
+   node the parent state found here is duplicated into @other.
+
+   Returns 0 (possibly after setting scan->stop), CBK_COORD_NOTFOUND when no
+   parent was found and the node should not be linked, or an error. */
+static int
+scan_unformatted(flush_scan * scan, flush_scan * other)
+{
+	int ret = 0;
+	int try = 0;
+
+	if (!coord_is_invalid(&scan->parent_coord))
+		goto scan;
+
+	/* set parent coord from */
+	if (!jnode_is_unformatted(scan->node)) {
+		/* formatted position*/
+
+		lock_handle lock;
+		assert("edward-301", jnode_is_znode(scan->node));
+		init_lh(&lock);
+
+		/*
+		 * when flush starts from unformatted node, first thing it
+		 * does is tree traversal to find formatted parent of starting
+		 * node. This parent is then kept lock across scans to the
+		 * left and to the right. This means that during scan to the
+		 * left we cannot take left-ward lock, because this is
+		 * dead-lock prone. So, if we are scanning to the left and
+		 * there is already lock held by this thread,
+		 * jnode_lock_parent_coord() should use try-lock.
+		 */
+		try = scanning_left(scan) && !lock_stack_isclean(get_current_lock_stack());
+		/* Need the node locked to get the parent lock, We have to
+		   take write lock since there is at least one call path
+		   where this znode is already write-locked by us. */
+		ret = longterm_lock_znode(&lock, JZNODE(scan->node), ZNODE_WRITE_LOCK,
+					  scanning_left(scan) ? ZNODE_LOCK_LOPRI : ZNODE_LOCK_HIPRI);
+		if (ret != 0)
+			/* EINVAL or E_DEADLOCK here mean... try again!  At this point we've
+			   scanned too far and can't back out, just start over. */
+			return ret;
+
+		ret = jnode_lock_parent_coord(scan->node,
+					      &scan->parent_coord,
+					      &scan->parent_lock,
+					      &scan->parent_load,
+					      ZNODE_WRITE_LOCK, try);
+
+		/* FIXME(C): check EINVAL, E_DEADLOCK */
+		done_lh(&lock);
+		/* A failed try-lock is not an error: the scan simply stops here. */
+		if (ret == -E_REPEAT) {
+			scan->stop = 1;
+			return 0;
+		}
+		if (ret)
+			return ret;
+
+	} else {
+		/* unformatted position */
+
+		ret = jnode_lock_parent_coord(scan->node, &scan->parent_coord, &scan->parent_lock,
+					      &scan->parent_load, ZNODE_WRITE_LOCK, try);
+
+		if (IS_CBKERR(ret))
+			return ret;
+
+		if (ret == CBK_COORD_NOTFOUND) {
+			/* FIXME(C): check EINVAL, E_DEADLOCK */
+			ON_TRACE(TRACE_FLUSH,
+				 "flush_scan_common: jnode_lock_parent_coord returned %d\n", ret);
+			if (!scan_should_link_node(scan))
+				return ret;
+		}
+		else {
+			/* parent was found */
+			set_flush_scan_nstat(scan, LINKED);
+			ON_TRACE(TRACE_FLUSH,
+				 "flush_scan_common: jnode_lock_parent_coord returned 0\n");
+			assert("jmacd-8661", other != NULL);
+		}
+
+		/* Duplicate the reference into the other flush_scan. */
+		/* NOTE(review): @other is dereferenced here on both paths, but
+		   the non-NULL assertion above covers only the "parent was found"
+		   path -- confirm no caller can reach the NOTFOUND/cluster-page
+		   path with other == NULL. */
+		coord_dup(&other->parent_coord, &scan->parent_coord);
+		copy_lh(&other->parent_lock, &scan->parent_lock);
+		copy_load_count(&other->parent_load, &scan->parent_load);
+		set_flush_scan_nstat(other, scan->nstat);
+	}
+ scan:
+	return scan_by_coord(scan);
+}
+
+/* Performs left- or rightward scanning starting from a formatted node.  Follow left
+   pointers under tree lock as long as:
+
+   - node->left/right is non-NULL
+   - node->left/right is connected, dirty
+   - node->left/right belongs to the same atom
+   - scan has not reached maximum count
+
+   Returns 0 or the error from scan_set_current() / scan_unformatted().
+*/
+static int
+scan_formatted(flush_scan * scan)
+{
+	int ret;
+	znode *neighbor = NULL;
+
+	assert("jmacd-1401", !scan_finished(scan));
+
+	do {
+		znode *node = JZNODE(scan->node);
+
+		/* Node should be connected, but if not stop the scan. */
+		if (!znode_is_connected(node)) {
+			scan->stop = 1;
+			break;
+		}
+
+		/* Lock the tree, check-for and reference the next sibling. */
+		RLOCK_TREE(znode_get_tree(node));
+
+		/* It may be that a node is inserted or removed between a node and its
+		   left sibling while the tree lock is released, but the flush-scan count
+		   does not need to be precise.  Thus, we release the tree lock as soon as
+		   we get the neighboring node. */
+		neighbor = scanning_left(scan) ? node->left : node->right;
+		if (neighbor != NULL) {
+			zref(neighbor);
+		}
+
+		RUNLOCK_TREE(znode_get_tree(node));
+
+		/* If neighbor is NULL at the leaf level, need to check for an unformatted
+		   sibling using the parent--break in any case. */
+		if (neighbor == NULL) {
+			break;
+		}
+
+		ON_TRACE(TRACE_FLUSH_VERB, "format scan %s %s\n",
+			 scanning_left(scan) ? "left" : "right", znode_tostring(neighbor));
+
+		/* Check the condition for going left, break if it is not met.  This also
+		   releases (jputs) the neighbor if false. */
+		if (!scan_goto(scan, ZJNODE(neighbor))) {
+			break;
+		}
+
+		/* Advance the flush_scan state to the neighbor, repeat.  The
+		   reference taken by zref() above is handed over to the scan. */
+		ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
+		if (ret != 0) {
+			return ret;
+		}
+
+	} while (!scan_finished(scan));
+
+	/* If neighbor is NULL then we reached the end of a formatted region, or else the
+	   sibling is not in memory, now check for an extent to the left (as long as
+	   LEAF_LEVEL). */
+	if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL || scan_finished(scan)) {
+		scan->stop = 1;
+		return 0;
+	}
+	/* Otherwise, calls scan_by_coord for the right(left)most item of the
+	   left(right) neighbor on the parent level, then possibly continue. */
+
+	/* Invalidate the parent coord so that scan_unformatted() looks the
+	   parent up again for the current (formatted) node. */
+	coord_init_invalid(&scan->parent_coord, NULL);
+	return scan_unformatted(scan, NULL);
+}
+
+/* NOTE-EDWARD:
+   This scans adjacent items of the same type and calls scan flush plugin for each one.
+   Performs left(right)ward scanning starting from a (possibly) unformatted node.  If we start
+   from unformatted node, then we continue only if the next neighbor is also unformatted.
+   When called from scan_formatted, we skip first iteration (to make sure that
+   right(left)most item of the left(right) neighbor on the parent level is of the same
+   type and set appropriate coord).
+
+   Returns 0 or the first error from the item plugin's scan/utmost_child
+   methods, neighbor_in_slum(), incr_load_count_znode() or scan_set_current().
+   On return the parent lock and load are released if the scan ended on a
+   formatted node. */
+static int
+scan_by_coord(flush_scan * scan)
+{
+	int ret = 0;
+	int scan_this_coord;
+	lock_handle next_lock;
+	load_count next_load;
+	coord_t next_coord;
+	jnode *child;
+	item_plugin *iplug;
+
+	init_lh(&next_lock);
+	init_load_count(&next_load);
+	/* The first coord is scanned only when we start at an unformatted node. */
+	scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
+
+	/* set initial item id */
+	if (get_flush_scan_nstat(scan) == UNLINKED)
+		iplug = item_plugin_by_jnode(scan->node);
+	else
+		iplug = item_plugin_by_coord(&scan->parent_coord);
+
+	for (; !scan_finished(scan); scan_this_coord = 1) {
+		if (scan_this_coord) {
+			/* Here we expect that unit is scannable. it would not be so due
+			 * to race with extent->tail conversion.  */
+			if (iplug->f.scan == NULL) {
+				scan->stop = 1;
+				ret = 0;
+				goto exit;
+			}
+
+			ret = iplug->f.scan(scan);
+			if (ret != 0)
+				goto exit;
+
+			if (scan_finished(scan)) {
+				checkchild(scan);
+				break;
+			}
+		} else {
+			/* the same race against truncate as above is possible
+			 * here, it seems */
+
+			/* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
+			   the first coordinate. */
+			assert("jmacd-1231", item_is_internal(&scan->parent_coord));
+		}
+
+		if(iplug->f.utmost_child == NULL || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
+			/* stop this coord and continue on parent level */
+			ret = scan_set_current(scan, ZJNODE(zref(scan->parent_coord.node)), 1, NULL);
+			if (ret != 0)
+				goto exit;
+			break;
+		}
+
+		/* Either way, the invariant is that scan->parent_coord is set to the
+		   parent of scan->node.  Now get the next unit. */
+		coord_dup(&next_coord, &scan->parent_coord);
+		coord_sideof_unit(&next_coord, scan->direction);
+
+		/* If off-the-end of the twig, try the next twig. */
+		if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
+			/* We take the write lock because we may start flushing from this
+			 * coordinate. */
+			ret = neighbor_in_slum(next_coord.node, &next_lock, scan->direction, ZNODE_WRITE_LOCK);
+
+			/* No suitable neighboring twig: the scan ends here without error. */
+			if (ret == -E_NO_NEIGHBOR) {
+				scan->stop = 1;
+				ret = 0;
+				break;
+			}
+
+			if (ret != 0) {
+				goto exit;
+			}
+
+			ret = incr_load_count_znode(&next_load, next_lock.node);
+			if (ret != 0) {
+				goto exit;
+			}
+
+			coord_init_sideof_unit(&next_coord, next_lock.node, sideof_reverse(scan->direction));
+		}
+
+		iplug = item_plugin_by_coord(&next_coord);
+
+		/* Get the next child. */
+		/* NOTE(review): ->f.utmost_child was checked for NULL above for the
+		   previous item plugin, not for the one just fetched for
+		   @next_coord -- confirm every item on a twig provides it. */
+		ret = iplug->f.utmost_child(&next_coord, sideof_reverse(scan->direction), &child);
+		if (ret != 0)
+			goto exit;
+		/* If the next child is not in memory, or, item_utmost_child
+		   failed (due to race with unlink, most probably), stop
+		   here. */
+		if (child == NULL || IS_ERR(child)) {
+			scan->stop = 1;
+			checkchild(scan);
+			break;
+		}
+
+		assert("nikita-2374", jnode_is_unformatted(child) || jnode_is_znode(child));
+
+		/* See if it is dirty, part of the same atom. */
+		if (!scan_goto(scan, child)) {
+			checkchild(scan);
+			break;
+		}
+
+		/* If so, make this child current. */
+		ret = scan_set_current(scan, child, 1, &next_coord);
+		if (ret != 0)
+			goto exit;
+
+		/* Now continue.  If formatted we release the parent lock and return, then
+		   proceed. */
+		if (jnode_is_znode(child))
+			break;
+
+		/* Otherwise, repeat the above loop with next_coord. */
+		/* If we moved to a neighboring twig, it becomes the scan's parent. */
+		if (next_load.node != NULL) {
+			done_lh(&scan->parent_lock);
+			move_lh(&scan->parent_lock, &next_lock);
+			move_load_count(&scan->parent_load, &next_load);
+		}
+	}
+
+	assert("jmacd-6233", scan_finished(scan) || jnode_is_znode(scan->node));
+ exit:
+	checkchild(scan);
+
+	if (jnode_is_znode(scan->node)) {
+		done_lh(&scan->parent_lock);
+		done_load_count(&scan->parent_load);
+	}
+
+	done_load_count(&next_load);
+	done_lh(&next_lock);
+	return ret;
+}
+
+/* FLUSH POS HELPERS */
+
+/* Initialize the fields of a flush_position: everything zeroed, state
+   POS_INVALID, coord invalid, lock handle / load count / block allocator hint
+   initialized. */
+static void
+pos_init(flush_pos_t * pos)
+{
+	xmemset(pos, 0, sizeof *pos);
+
+	pos->state = POS_INVALID;
+	coord_init_invalid(&pos->coord, NULL);
+	init_lh(&pos->lock);
+	init_load_count(&pos->load);
+
+	blocknr_hint_init(&pos->preceder);
+}
+
+/* The flush loop inside squalloc periodically checks pos_valid to
+   determine when "enough flushing" has been performed.  This will return true until one
+   of the following conditions is met:
+
+   1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
+   parameter, meaning we have flushed as many blocks as the kernel requested.  When
+   flushing to commit, this parameter is NULL.
+
+   2. pos_stop() is called because squalloc discovers that the "next" node in the
+   flush order is either non-existent, not dirty, or not in the same atom.
+*/
+
+
+/* The check itself only tests that the position state is not POS_INVALID. */
+static int pos_valid (flush_pos_t * pos)
+{
+	return pos->state != POS_INVALID;
+}
+
+/* Release any resources of a flush_position.  Called when jnode_flush finishes. */
+static void
+pos_done(flush_pos_t * pos)
+{
+	pos_stop(pos);
+	blocknr_hint_done(&pos->preceder);
+}
+
+/* Reset the point and parent.  Called during flush subroutines to terminate the
+   squalloc loop.
*/ +static int +pos_stop(flush_pos_t * pos) +{ + pos->state = POS_INVALID; + done_lh(&pos->lock); + done_load_count(&pos->load); + coord_init_invalid(&pos->coord, NULL); + + if (pos->child) { + jput(pos->child); + pos->child = NULL; + } + + return 0; +} + +/* Return the flush_position's block allocator hint. */ +reiser4_internal reiser4_blocknr_hint * +pos_hint(flush_pos_t * pos) +{ + return &pos->preceder; +} + +/* Return true if we have decided to unconditionally relocate leaf nodes, thus write + optimizing. */ +reiser4_internal int +pos_leaf_relocate(flush_pos_t * pos) +{ + return pos->leaf_relocate; +} + +reiser4_internal flush_queue_t * pos_fq(flush_pos_t * pos) +{ + return pos->fq; +} + +const char *coord_tween_tostring(between_enum n); + +reiser4_internal const char * +pos_tostring(flush_pos_t * pos) +{ + static char fmtbuf[256]; + load_count load; + fmtbuf[0] = 0; + + init_load_count(&load); + + if (pos->state == POS_ON_EPOINT) { + assert("jmacd-79123", pos->lock.node == pos->load.node); + + strcat(fmtbuf, "par:"); + jnode_tostring_internal(ZJNODE(pos->lock.node), fmtbuf); + + if (incr_load_count_znode(&load, pos->lock.node)) { + return "*error*"; + } + + if (coord_is_before_leftmost(&pos->coord)) { + sprintf(fmtbuf + strlen(fmtbuf), "[left]"); + } else if (coord_is_after_rightmost(&pos->coord)) { + sprintf(fmtbuf + strlen(fmtbuf), "[right]"); + } else { + sprintf(fmtbuf + strlen(fmtbuf), "[%s i=%u/%u", + coord_tween_tostring(pos->coord.between), + pos->coord.item_pos, node_num_items(pos->coord.node)); + + if (!coord_is_existing_item(&pos->coord)) { + sprintf(fmtbuf + strlen(fmtbuf), "]"); + } else { + + sprintf(fmtbuf + strlen(fmtbuf), ",u=%u/%u %s]", + pos->coord.unit_pos, + coord_num_units(&pos->coord), coord_is_existing_unit(&pos->coord) + ? (item_is_extent(&pos->coord) ? + "ext" : (item_is_internal(&pos->coord) ? 
"int" : "other")) + : "tween"); + } + } + } else if (pos->lock.node != NULL) { + strcat(fmtbuf, "pt:"); + jnode_tostring_internal(ZJNODE(pos->lock.node), fmtbuf); + } + + done_load_count(&load); + return fmtbuf; +} + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 90 + LocalWords: preceder + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/flush.h linux-2.6.4-ck1/fs/reiser4/flush.h --- linux-2.6.4/fs/reiser4/flush.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/flush.h 2004-03-11 22:45:15.238518138 +1100 @@ -0,0 +1,167 @@ +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* DECLARATIONS: */ + +#if !defined(__REISER4_FLUSH_H__) +#define __REISER4_FLUSH_H__ + +#include "plugin/item/ctail.h" /* for ctail scan/squeeze info */ + +typedef enum { + UNLINKED = 0, + LINKED = 1 +} flush_scan_node_stat_t; + +/* The flush_scan data structure maintains the state of an in-progress flush-scan on a + single level of the tree. A flush-scan is used for counting the number of adjacent + nodes to flush, which is used to determine whether we should relocate, and it is also + used to find a starting point for flush. A flush-scan object can scan in both right + and left directions via the scan_left() and scan_right() interfaces. The + right- and left-variations are similar but perform different functions. When scanning + left we (optionally perform rapid scanning and then) longterm-lock the endpoint node. + When scanning right we are simply counting the number of adjacent, dirty nodes. */ +struct flush_scan { + + /* The current number of nodes scanned on this level. */ + unsigned count; + + /* There may be a maximum number of nodes for a scan on any single level. 
When + going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */ + unsigned max_count; + + /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */ + sideof direction; + + /* Initially @stop is set to false then set true once some condition stops the + search (e.g., we found a clean node before reaching max_count or we found a + node belonging to another atom). */ + int stop; + + /* The current scan position. If @node is non-NULL then its reference count has + been incremented to reflect this reference. */ + jnode *node; + + /* node specific linkage status. This indicates if the node that flush + * started from is linked to the tree (like formatted nodes, extent's jnodes), + * or not (like jnodes of newly created cluster of cryptcompressed file. + * If (nstat == UNLINKED) we don't do right scan. Also we use this status in + * scan_by_coord() to assign item plugin */ + flush_scan_node_stat_t nstat; + + /* A handle for zload/zrelse of current scan position node. */ + load_count node_load; + + /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the + node is locked using this lock handle. The endpoint needs to be locked for + transfer to the flush_position object after scanning finishes. */ + lock_handle node_lock; + + /* When the position is unformatted, its parent, coordinate, and parent + zload/zrelse handle. */ + lock_handle parent_lock; + coord_t parent_coord; + load_count parent_load; + + /* The block allocator preceder hint. Sometimes flush_scan determines what the + preceder is and if so it sets it here, after which it is copied into the + flush_position. Otherwise, the preceder is computed later. 
*/ + reiser4_block_nr preceder_blk; +}; + +static inline flush_scan_node_stat_t +get_flush_scan_nstat(flush_scan * scan) + +{ + return scan->nstat; +} + +static inline void +set_flush_scan_nstat(flush_scan * scan, flush_scan_node_stat_t nstat) +{ + scan->nstat = nstat; +} + +typedef struct flush_squeeze_item_data { + item_plugin * iplug; + int mergeable; + union { + ctail_squeeze_info_t ctail_info; + } u; +} flush_squeeze_item_data_t; + +typedef enum flush_position_state { + POS_INVALID, /* Invalid or stopped pos, do not continue slum + * processing */ + POS_ON_LEAF, /* pos points to already prepped, locked formatted node at + * leaf level */ + POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used + * to traverse unformatted nodes */ + POS_TO_LEAF, /* pos is being moved to leaf level */ + POS_TO_TWIG, /* pos is being moved to twig level */ + POS_END_OF_TWIG, /* special case of POS_ON_TWIG (no such state is defined in this enum; presumably POS_ON_EPOINT is meant), when coord is after + * rightmost unit of the current twig */ + POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */ + +} flushpos_state_t; + + + +/* An encapsulation of the current flush point and all the parameters that are passed + through the entire squeeze-and-allocate stage of the flush routine. A single + flush_position object is constructed after left- and right-scanning finishes. */ +struct flush_position { + flushpos_state_t state; + + coord_t coord; /* coord to traverse unformatted nodes */ + lock_handle lock; /* current lock we hold */ + load_count load; /* load status for current locked formatted node */ + + jnode * child; /* for passing a reference to unformatted child + * across pos state changes */ + + reiser4_blocknr_hint preceder; /* The flush 'hint' state. */ + int leaf_relocate; /* True if enough leaf-level nodes were + * found to suggest a relocate policy. */ + long *nr_to_flush; /* If called under memory pressure, + * indicates how many nodes the VM asked to flush. 
*/ + int alloc_cnt; /* The number of nodes allocated during squeeze and allocate. */ + int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */ + flush_queue_t *fq; + long *nr_written; /* number of nodes submitted to disk */ + int flags; /* a copy of jnode_flush flags argument */ + + znode * prev_twig; /* previous parent pointer value, used to catch + * processing of new twig node */ + flush_squeeze_item_data_t * idata; /* squeeze item data handle */ + + unsigned long pos_in_unit; /* for extents only. Position + within an extent unit of first + jnode of slum */ +}; + +/* used in extent.c */ +int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size, const coord_t * parent); +int scan_finished(flush_scan * scan); +int scanning_left(flush_scan * scan); +int scan_goto(flush_scan * scan, jnode * tonode); +txn_atom *atom_locked_by_fq(flush_queue_t * fq); + +int init_fqs(void); +void done_fqs(void); + +#if REISER4_TRACE +const char *jnode_tostring(jnode * node); +#else +#define jnode_tostring(n) "" +#endif + +#if REISER4_DEBUG /* check_preceder(blk): assert that blk is below reiser4_block_count() of the current super block; the argument is parenthesized and the expansion carries no trailing ';', so the macro is a single statement at the call site */ +#define check_preceder(blk) \ +assert("nikita-2588", (blk) < reiser4_block_count(reiser4_get_current_sb())) + +#else +#define check_preceder( b ) noop +#endif + +/* __REISER4_FLUSH_H__ */ +#endif diff -Naurp linux-2.6.4/fs/reiser4/flush_queue.c linux-2.6.4-ck1/fs/reiser4/flush_queue.c --- linux-2.6.4/fs/reiser4/flush_queue.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/flush_queue.c 2004-03-11 22:45:15.239517982 +1100 @@ -0,0 +1,716 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "debug.h" +#include "type_safe_list.h" +#include "super.h" +#include "txnmgr.h" +#include "jnode.h" +#include "znode.h" +#include "page_cache.h" +#include "wander.h" +#include "vfs_ops.h" + +#include +#include +#include +#include +#include + +/* A flush queue object is an accumulator for keeping jnodes prepared + by the jnode_flush() 
function for writing to disk. Those "queued" jnodes are + kept on the flush queue until memory pressure or atom commit asks + flush queues to write some or all from their jnodes. */ + +TYPE_SAFE_LIST_DEFINE(fq, flush_queue_t, alink); + +#if REISER4_DEBUG +# define spin_ordering_pred_fq(fq) (1) +#endif + +SPIN_LOCK_FUNCTIONS(fq, flush_queue_t, guard); + +/* + LOCKING: + + fq->guard spin lock protects fq->atom pointer and nothing else. fq->prepped + list protected by atom spin lock. fq->prepped list uses the following + locking: + + two ways to protect fq->prepped list for read-only list traversal: + + 1. atom spin-lock atom. + 2. fq is IN_USE, atom->nr_running_queues increased. + + and one for list modification: + + 1. atom is spin-locked and one condition is true: fq is IN_USE or + atom->nr_running_queues == 0. + + The deadlock-safe order for flush queues and atoms is: first lock atom, then + lock flush queue, then lock jnode. +*/ + +#define fq_in_use(fq) ((fq)->state & FQ_IN_USE) +#define fq_ready(fq) (!fq_in_use(fq)) + +#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0) +#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0) + +/* get lock on atom from locked flush queue object */ +reiser4_internal txn_atom * +atom_get_locked_by_fq(flush_queue_t * fq) +{ + /* This code is similar to jnode_get_atom(), look at it for the + * explanation. 
*/ + txn_atom *atom; + + assert("zam-729", spin_fq_is_locked(fq)); + + while(1) { + atom = fq->atom; + if (atom == NULL) + break; + + if (spin_trylock_atom(atom)) + break; + + atomic_inc(&atom->refcount); + spin_unlock_fq(fq); + LOCK_ATOM(atom); + spin_lock_fq(fq); + + if (fq->atom == atom) { + atomic_dec(&atom->refcount); + break; + } + + spin_unlock_fq(fq); + atom_dec_and_unlock(atom); + spin_lock_fq(fq); + } + + return atom; +} + +reiser4_internal txn_atom * +atom_locked_by_fq(flush_queue_t * fq) +{ + return UNDER_SPIN(fq, fq, atom_get_locked_by_fq(fq)); +} + +static void +init_fq(flush_queue_t * fq) +{ + xmemset(fq, 0, sizeof *fq); + + atomic_set(&fq->nr_submitted, 0); + + capture_list_init(&fq->prepped); + + sema_init(&fq->io_sem, 0); + spin_fq_init(fq); +} + +/* slab for flush queues */ +static kmem_cache_t *fq_slab; + +reiser4_internal int init_fqs(void) +{ + fq_slab = kmem_cache_create("fq", + sizeof (flush_queue_t), + 0, + SLAB_HWCACHE_ALIGN, + NULL, + NULL); + return (fq_slab == NULL) ? 
RETERR(-ENOMEM) : 0; +} + +reiser4_internal void done_fqs(void) +{ + kmem_cache_destroy(fq_slab); +} + +/* create new flush queue object */ +static flush_queue_t * +create_fq(void) +{ + flush_queue_t *fq; + + fq = kmem_cache_alloc(fq_slab, GFP_KERNEL); + if (fq) + init_fq(fq); + + return fq; +} + +/* adjust atom's and flush queue's counters of queued nodes */ +static void +count_enqueued_node(flush_queue_t * fq) +{ + ON_DEBUG(fq->atom->num_queued++); +} + +static void +count_dequeued_node(flush_queue_t * fq) +{ + assert("zam-993", fq->atom->num_queued > 0); + ON_DEBUG(fq->atom->num_queued--); +} + +/* attach flush queue object to the atom */ +static void +attach_fq(txn_atom * atom, flush_queue_t * fq) +{ + assert("zam-718", spin_atom_is_locked(atom)); + fq_list_push_front(&atom->flush_queues, fq); + fq->atom = atom; + ON_DEBUG(atom->nr_flush_queues++); +} + +static void +detach_fq(flush_queue_t * fq) +{ + assert("zam-731", spin_atom_is_locked(fq->atom)); + + spin_lock_fq(fq); + fq_list_remove_clean(fq); + assert("vs-1456", fq->atom->nr_flush_queues > 0); + ON_DEBUG(fq->atom->nr_flush_queues--); + fq->atom = NULL; + spin_unlock_fq(fq); +} + +/* destroy flush queue object */ +reiser4_internal void +done_fq(flush_queue_t * fq) +{ + assert("zam-763", capture_list_empty(&fq->prepped)); + assert("zam-766", atomic_read(&fq->nr_submitted) == 0); + + kmem_cache_free(fq_slab, fq); +} + +reiser4_internal void +mark_jnode_queued(flush_queue_t *fq, jnode *node) +{ + JF_SET(node, JNODE_FLUSH_QUEUED); + count_enqueued_node(fq); +} + +/* Putting jnode into the flush queue. Both atom and jnode should be + spin-locked. 
*/ +reiser4_internal void +queue_jnode(flush_queue_t * fq, jnode * node) +{ + assert("zam-711", spin_jnode_is_locked(node)); + assert("zam-713", node->atom != NULL); + assert("zam-712", spin_atom_is_locked(node->atom)); + assert("zam-714", jnode_is_dirty(node)); + assert("zam-716", fq->atom != NULL); + assert("zam-717", fq->atom == node->atom); + assert("zam-826", JF_ISSET(node, JNODE_RELOC)); + assert("zam-907", fq_in_use(fq)); + + if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) { + assert("vs-1481", node->list == FQ_LIST); + return; /* queued already */ + } + + mark_jnode_queued(fq, node); + capture_list_remove_clean(node); + capture_list_push_back(&fq->prepped, node); + ON_DEBUG(node->list = FQ_LIST); +} + +/* repeatable process for waiting io completion on a flush queue object: returns 0 with the atom still locked once no submitted writes remain (adding fq->nr_errors to *nr_io_errors); otherwise unlocks the atom, runs the block queues, sleeps on fq->io_sem unless the super block is read-only, and returns -E_REPEAT */ +static int +wait_io(flush_queue_t * fq, int *nr_io_errors) +{ + assert("zam-738", fq->atom != NULL); + assert("zam-739", spin_atom_is_locked(fq->atom)); + assert("zam-736", fq_in_use(fq)); + assert("zam-911", capture_list_empty(&fq->prepped)); + + if (atomic_read(&fq->nr_submitted) != 0) { + UNLOCK_ATOM(fq->atom); + + assert("nikita-3013", schedulable()); + + blk_run_queues(); + if ( !(reiser4_get_current_sb()->s_flags & MS_RDONLY) ) + down(&fq->io_sem); + + /* Ask the caller to re-acquire the locks and call this + function again. Note: this technique is commonly used in + the txnmgr code. 
*/ + return -E_REPEAT; + } + + *nr_io_errors += atomic_read(&fq->nr_errors); + return 0; +} + +/* wait on I/O completion; once wait_io() reports it done, detach @fq from its atom, free it and send an atom event. A non-zero wait_io() result is returned unchanged and @fq is left alone. */ +static int +finish_fq(flush_queue_t * fq, int *nr_io_errors) +{ + int ret; + txn_atom * atom = fq->atom; + + assert("zam-801", atom != NULL); + assert("zam-744", spin_atom_is_locked(atom)); + assert("zam-762", fq_in_use(fq)); + + ret = wait_io(fq, nr_io_errors); + if (ret) + return ret; + + detach_fq(fq); + done_fq(fq); + + atom_send_event(atom); + + return 0; +} + +/* wait for all i/o for given atom to be completed, actually do one iteration + on that and return -E_REPEAT if more iterations are needed */ +static int +finish_all_fq(txn_atom * atom, int *nr_io_errors) +{ + flush_queue_t *fq; + + assert("zam-730", spin_atom_is_locked(atom)); + + if (fq_list_empty(&atom->flush_queues)) + return 0; + + for_all_type_safe_list(fq, &atom->flush_queues, fq) { + if (fq_ready(fq)) { + int ret; + + mark_fq_in_use(fq); + assert("vs-1247", fq->owner == NULL); + ON_DEBUG(fq->owner = current); + ret = finish_fq(fq, nr_io_errors); + + if ( *nr_io_errors ) + reiser4_handle_error(); + + if (ret) { + fq_put(fq); + return ret; + } + + UNLOCK_ATOM(atom); + + return -E_REPEAT; + } + } + + /* All flush queues are in use; atom remains locked */ + return -EBUSY; +} + +/* wait all i/o for current atom */ +reiser4_internal int +current_atom_finish_all_fq(void) +{ + txn_atom *atom; + int nr_io_errors = 0; + int ret = 0; + + do { + while (1) { + atom = get_current_atom_locked(); + ret = finish_all_fq(atom, &nr_io_errors); + if (ret != -EBUSY) + break; + atom_wait_event(atom); + } + } while (ret == -E_REPEAT); + + /* we do not need locked atom after this function finishes, SUCCESS or + -EBUSY are two return codes when atom remains locked after + finish_all_fq */ + if (!ret) + UNLOCK_ATOM(atom); + + assert("nikita-2696", spin_atom_is_not_locked(atom)); + + if (ret) + return ret; + + if (nr_io_errors) + return RETERR(-EIO); + + return 0; +} 
+ +/* change node->atom field for all jnode from given list */ +static void +scan_fq_and_update_atom_ref(capture_list_head * list, txn_atom * atom) +{ + jnode *cur; + + for_all_type_safe_list(capture, list, cur) { + LOCK_JNODE(cur); + cur->atom = atom; + UNLOCK_JNODE(cur); + } +} + +/* support for atom fusion operation */ +reiser4_internal void +fuse_fq(txn_atom * to, txn_atom * from) +{ + flush_queue_t *fq; + + assert("zam-720", spin_atom_is_locked(to)); + assert("zam-721", spin_atom_is_locked(from)); + + + for_all_type_safe_list(fq, &from->flush_queues, fq) { + scan_fq_and_update_atom_ref(&fq->prepped, to); + spin_lock_fq(fq); + fq->atom = to; + spin_unlock_fq(fq); + } + + fq_list_splice(&to->flush_queues, &from->flush_queues); + +#if REISER4_DEBUG + to->num_queued += from->num_queued; + to->nr_flush_queues += from->nr_flush_queues; + from->nr_flush_queues = 0; +#endif +} + +#if REISER4_DEBUG +int atom_fq_parts_are_clean (txn_atom * atom) +{ + assert("zam-915", atom != NULL); + return fq_list_empty(&atom->flush_queues); +} +#endif +/* Bio i/o completion routine for reiser4 write operations. */ +static int +end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG, int err UNUSED_ARG) +{ + int i; + int nr_errors = 0; + flush_queue_t *fq; + + assert ("zam-958", bio->bi_rw & WRITE); + + /* i/o op. is not fully completed */ + if (bio->bi_size != 0) + return 1; + + /* we expect that bio->private is set to NULL or fq object which is used + * for synchronization and error counting. */ + fq = bio->bi_private; + /* Check all elements of io_vec for correct write completion. */ + for (i = 0; i < bio->bi_vcnt; i += 1) { + struct page *pg = bio->bi_io_vec[i].bv_page; + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + SetPageError(pg); + nr_errors++; + } + + { + /* jnode WRITEBACK ("write is in progress bit") is + * atomically cleared here. 
*/ + jnode *node; + + assert("zam-736", pg != NULL); + assert("zam-736", PagePrivate(pg)); + node = (jnode *) (pg->private); + + JF_CLR(node, JNODE_WRITEBACK); + } + + end_page_writeback(pg); + page_cache_release(pg); + } + + if (fq) { + /* count i/o error in fq object */ + atomic_add(nr_errors, &fq->nr_errors); + + /* If all write requests registered in this "fq" are done we up + * the semaphore. */ + if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted)) + up(&fq->io_sem); + } + + bio_put(bio); + return 0; +} + +/* Count I/O requests which will be submitted by @bio in given flush queues + @fq */ +reiser4_internal void +add_fq_to_bio(flush_queue_t * fq, struct bio *bio) +{ + bio->bi_private = fq; + bio->bi_end_io = end_io_handler; + + if (fq) + atomic_add(bio->bi_vcnt, &fq->nr_submitted); +} + +/* Move all queued nodes out from @fq->prepped list. */ +static void release_prepped_list(flush_queue_t * fq) +{ + txn_atom * atom; + + assert ("zam-904", fq_in_use(fq)); + atom = UNDER_SPIN(fq, fq, atom_get_locked_by_fq(fq)); + + while(!capture_list_empty(&fq->prepped)) { + jnode * cur; + + cur = capture_list_front(&fq->prepped); + capture_list_remove_clean(cur); + + count_dequeued_node(fq); + LOCK_JNODE(cur); + assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR)); + assert("vs-1615", cur->list == FQ_LIST); + JF_CLR(cur, JNODE_FLUSH_QUEUED); + + if (JF_ISSET(cur, JNODE_DIRTY)) { + capture_list_push_back(&atom->dirty_nodes[jnode_get_level(cur)], cur); + ON_DEBUG(cur->list = DIRTY_LIST); + } else { + capture_list_push_back(&atom->clean_nodes, cur); + ON_DEBUG(cur->list = CLEAN_LIST); + } + + UNLOCK_JNODE(cur); + } + + if (-- atom->nr_running_queues == 0) + atom_send_event(atom); + + UNLOCK_ATOM(atom); +} + +/* Submit write requests for nodes on the already filled flush queue @fq. + + @fq: flush queue object which contains jnodes we can (and will) write. + @return: number of submitted blocks (>=0) if success, otherwise -- an error + code (<0). 
*/ +reiser4_internal int +write_fq(flush_queue_t * fq, long * nr_submitted) +{ + int ret; + txn_atom * atom; + + while (1) { + atom = UNDER_SPIN(fq, fq, atom_get_locked_by_fq(fq)); + assert ("zam-924", atom); + /* do not write fq in parallel. */ + if (atom->nr_running_queues == 0) + break; + atom_wait_event(atom); + } + + atom->nr_running_queues ++; + UNLOCK_ATOM(atom); + + ret = write_jnode_list(&fq->prepped, fq, nr_submitted); + release_prepped_list(fq); + + return ret; +} + +/* Getting flush queue object for exclusive use by one thread. May require + several iterations which is indicated by -E_REPEAT return code. + + This function does not contain code for obtaining an atom lock because an + atom lock is obtained by different ways in different parts of reiser4, + usually it is current atom, but we need a possibility for getting fq for the + atom of given jnode. */ +reiser4_internal int +fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq) +{ + flush_queue_t *fq; + + assert("zam-745", spin_atom_is_locked(atom)); + + fq = fq_list_front(&atom->flush_queues); + while (!fq_list_end(&atom->flush_queues, fq)) { + spin_lock_fq(fq); + + if (fq_ready(fq)) { + mark_fq_in_use(fq); + assert("vs-1246", fq->owner == NULL); + ON_DEBUG(fq->owner = current); + spin_unlock_fq(fq); + + if (*new_fq) + done_fq(*new_fq); + + *new_fq = fq; + + return 0; + } + + spin_unlock_fq(fq); + + fq = fq_list_next(fq); + } + + /* Use previously allocated fq object */ + if (*new_fq) { + mark_fq_in_use(*new_fq); + assert("vs-1248", (*new_fq)->owner == 0); + ON_DEBUG((*new_fq)->owner = current); + attach_fq(atom, *new_fq); + + return 0; + } + + UNLOCK_ATOM(atom); + + *new_fq = create_fq(); + + if (*new_fq == NULL) + return RETERR(-ENOMEM); + + return RETERR(-E_REPEAT); +} + +/* A wrapper around fq_by_atom for getting a flush queue object for current + * atom, if success fq->atom remains locked. 
*/ +reiser4_internal flush_queue_t * +get_fq_for_current_atom(void) +{ + flush_queue_t *fq = NULL; + txn_atom *atom; + int ret; + + do { + atom = get_current_atom_locked(); + ret = fq_by_atom(atom, &fq); + } while (ret == -E_REPEAT); + + if (ret) + return ERR_PTR(ret); + return fq; +} + +/* Releasing flush queue object after exclusive use */ +reiser4_internal void +fq_put_nolock(flush_queue_t * fq) +{ + assert("zam-747", fq->atom != NULL); + assert("zam-902", capture_list_empty(&fq->prepped)); + mark_fq_ready(fq); + assert("vs-1245", fq->owner == current); + ON_DEBUG(fq->owner = NULL); +} + +reiser4_internal void +fq_put(flush_queue_t * fq) +{ + txn_atom *atom; + + spin_lock_fq(fq); + atom = atom_get_locked_by_fq(fq); + + assert("zam-746", atom != NULL); + + fq_put_nolock(fq); + atom_send_event(atom); + + spin_unlock_fq(fq); + UNLOCK_ATOM(atom); +} + +/* A part of atom object initialization related to the embedded flush queue + list head */ + +reiser4_internal void +init_atom_fq_parts(txn_atom * atom) +{ + fq_list_init(&atom->flush_queues); +} + +/* get a flush queue for an atom pointed by given jnode (spin-locked) ; returns + * both atom and jnode locked and found and took exclusive access for flush + * queue object. 
*/ +reiser4_internal int fq_by_jnode (jnode * node, flush_queue_t ** fq) +{ + txn_atom * atom; + int ret; + + assert("zam-835", spin_jnode_is_locked(node)); + + *fq = NULL; /* stays NULL if the jnode turns out to have no atom */ + + while (1) { + /* begin with taking lock on atom */ + atom = jnode_get_atom(node); + UNLOCK_JNODE(node); + + if (atom == NULL) { + /* jnode does not point to the atom anymore, it is + * possible because jnode lock could be removed for a + * time in jnode_get_atom() */ + if (*fq) { + done_fq(*fq); + *fq = NULL; + } + return 0; /* nothing is locked and *fq == NULL: callers must check *fq */ + } + + /* atom lock is required for taking flush queue */ + ret = fq_by_atom(atom, fq); + + if (ret) { + if (ret == -E_REPEAT) + /* atom lock was released for doing memory + * allocation, start with locked jnode one more + * time */ + goto lock_again; + return ret; /* fq_by_atom() dropped the atom lock; the jnode is unlocked too */ + } + + /* It is correct to lock atom first, then lock a jnode */ + LOCK_JNODE(node); + + if (node->atom == atom) + break; /* Yes! it is our jnode. We got all of them: + * flush queue, and both locked atom and + * jnode */ + + /* release all locks and allocated objects and restart from + * locked jnode. The atom is still locked here, so the queue is handed back with fq_put_nolock(): fq_put() would take the atom lock a second time. */ + UNLOCK_JNODE(node); + + fq_put_nolock(*fq); + atom_send_event(atom); /* as fq_put() does: wake waiters for a ready queue */ + *fq = NULL; /* clear the caller's pointer (not the local one) so fq_by_atom() starts afresh */ + UNLOCK_ATOM(atom); + + lock_again: + LOCK_JNODE(node); + } + + return 0; +} + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 80 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/forward.h linux-2.6.4-ck1/fs/reiser4/forward.h --- linux-2.6.4/fs/reiser4/forward.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/forward.h 2004-03-11 22:45:15.240517827 +1100 @@ -0,0 +1,260 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Forward declarations. Thank you Kernighan. 
*/ + +#if !defined( __REISER4_FORWARD_H__ ) +#define __REISER4_FORWARD_H__ + +#include + +typedef struct zlock zlock; +typedef struct lock_stack lock_stack; +typedef struct lock_handle lock_handle; +typedef struct znode znode; +typedef struct flow flow_t; +typedef struct coord coord_t; +typedef struct tree_access_pointer tap_t; +typedef struct item_coord item_coord; +typedef struct shift_params shift_params; +typedef struct reiser4_object_create_data reiser4_object_create_data; +typedef union reiser4_plugin reiser4_plugin; +typedef int reiser4_plugin_id; +typedef struct item_plugin item_plugin; +typedef struct jnode_plugin jnode_plugin; +typedef struct reiser4_item_data reiser4_item_data; +typedef union reiser4_key reiser4_key; +typedef union reiser4_dblock_nr reiser4_dblock_nr; +typedef struct reiser4_tree reiser4_tree; +typedef struct carry_cut_data carry_cut_data; +typedef struct carry_kill_data carry_kill_data; +typedef struct carry_tree_op carry_tree_op; +typedef struct carry_tree_node carry_tree_node; +typedef struct carry_plugin_info carry_plugin_info; +typedef struct reiser4_journal reiser4_journal; +typedef struct txn_atom txn_atom; +typedef struct txn_handle txn_handle; +typedef struct txn_mgr txn_mgr; +typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc; +typedef struct reiser4_context reiser4_context; +typedef struct carry_level carry_level; +typedef struct blocknr_set blocknr_set; +typedef struct blocknr_set_entry blocknr_set_entry; +/* super_block->s_fs_info points to this */ +typedef struct reiser4_super_info_data reiser4_super_info_data; +/*next two objects are fields of reiser4_super_info_data */ +typedef struct reiser4_oid_allocator reiser4_oid_allocator; +typedef struct reiser4_space_allocator reiser4_space_allocator; + +typedef struct flush_scan flush_scan; +typedef struct flush_position flush_pos_t; + +typedef unsigned short pos_in_node_t; +#define MAX_POS_IN_NODE 65535 + +typedef struct jnode jnode; +typedef struct 
reiser4_blocknr_hint reiser4_blocknr_hint; + +typedef struct uf_coord uf_coord_t; +typedef struct hint hint_t; + +typedef struct ktxnmgrd_context ktxnmgrd_context; + +typedef struct reiser4_xattr_plugin reiser4_xattr_plugin; + +struct inode; +struct page; +struct file; +struct dentry; +struct super_block; + +/* return values of coord_by_key(). cbk == coord_by_key */ +typedef enum { + CBK_COORD_FOUND = 0, + CBK_COORD_NOTFOUND = -ENOENT, +} lookup_result; + +/* results of lookup with directory file */ +typedef enum { + FILE_NAME_FOUND = 0, + FILE_NAME_NOTFOUND = -ENOENT, + FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */ + FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */ +} file_lookup_result; + +/* behaviors of lookup. If coord we are looking for is actually in a tree, + both coincide. */ +typedef enum { + /* search exactly for the coord with key given */ + FIND_EXACT, + /* search for coord with the maximal key not greater than one + given */ + FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */ +} lookup_bias; + +typedef enum { + /* number of leaf level of the tree + The fake root has (tree_level=0). */ + LEAF_LEVEL = 1, + + /* number of level one above leaf level of the tree. + + It is supposed that internal tree used by reiser4 to store file + system data and meta data will have height 2 initially (when + created by mkfs). + */ + TWIG_LEVEL = 2, +} tree_level; + +/* The "real" maximum ztree height is the 0-origin size of any per-level + array, since the zero'th level is not used. */ +#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL) + +/* enumeration of possible mutual position of item and coord. This enum is + return type of ->is_in_item() item plugin method which see. 
*/ +typedef enum { + /* coord is on the left of an item*/ + IP_ON_THE_LEFT, + /* coord is inside item */ + IP_INSIDE, + /* coord is inside item, but to the right of the rightmost unit of + this item */ + IP_RIGHT_EDGE, + /* coord is on the right of an item */ + IP_ON_THE_RIGHT +} interposition; + +/* type of lock to acquire on znode before returning it to caller */ +typedef enum { + ZNODE_NO_LOCK = 0, + ZNODE_READ_LOCK = 1, + ZNODE_WRITE_LOCK = 2, +} znode_lock_mode; + +/* type of lock request */ +typedef enum { + ZNODE_LOCK_LOPRI = 0, + ZNODE_LOCK_HIPRI = (1 << 0), + + /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep + waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately + return the value -E_REPEAT. */ + ZNODE_LOCK_NONBLOCK = (1 << 1), + /* An option for longterm_lock_znode which prevents atom fusion */ + ZNODE_LOCK_DONT_FUSE = (1 << 2) +} znode_lock_request; + +typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op; + +/* used to specify direction of shift. 
These must be -1 and 1 */ +typedef enum { + SHIFT_LEFT = 1, + SHIFT_RIGHT = -1 +} shift_direction; + +typedef enum { + LEFT_SIDE, + RIGHT_SIDE +} sideof; + +#define round_up( value, order ) \ + ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \ + ~( ( order ) - 1 ) ) ) + +/* values returned by squalloc_right_neighbor and its auxiliary functions */ +typedef enum { + /* unit of internal item is moved */ + SUBTREE_MOVED = 0, + /* nothing else can be squeezed into left neighbor */ + SQUEEZE_TARGET_FULL = 1, + /* all content of node is squeezed into its left neighbor */ + SQUEEZE_SOURCE_EMPTY = 2, + /* one more item is copied (this is only returned by + allocate_and_copy_extent to squalloc_twig)) */ + SQUEEZE_CONTINUE = 3 +} squeeze_result; + +typedef enum { + STATIC_STAT_DATA_ID, + SIMPLE_DIR_ENTRY_ID, + COMPOUND_DIR_ID, + NODE_POINTER_ID, + ACL_ID, + EXTENT_POINTER_ID, + FORMATTING_ID, + CTAIL_ID, + BLACK_BOX_ID, + LAST_ITEM_ID +} item_id; + +/* Flags passed to jnode_flush() to allow it to distinguish default settings based on + whether commit() was called or VM memory pressure was applied. */ +typedef enum { + /* submit flush queue to disk at jnode_flush completion */ + JNODE_FLUSH_WRITE_BLOCKS = 1, + + /* flush is called for commit */ + JNODE_FLUSH_COMMIT = 2, + /* not implemented */ + JNODE_FLUSH_MEMORY_FORMATTED = 4, + + /* not implemented */ + JNODE_FLUSH_MEMORY_UNFORMATTED = 8, +} jnode_flush_flags; + +/* Flags to insert/paste carry operations. Currently they only used in + flushing code, but in future, they can be used to optimize for repetitive + accesses. 
*/ +typedef enum { + /* carry is not allowed to shift data to the left when trying to find + free space */ + COPI_DONT_SHIFT_LEFT = (1 << 0), + /* carry is not allowed to shift data to the right when trying to find + free space */ + COPI_DONT_SHIFT_RIGHT = (1 << 1), + /* carry is not allowed to allocate new node(s) when trying to find + free space */ + COPI_DONT_ALLOCATE = (1 << 2), + /* try to load left neighbor if its not in a cache */ + COPI_LOAD_LEFT = (1 << 3), + /* try to load right neighbor if its not in a cache */ + COPI_LOAD_RIGHT = (1 << 4), + /* shift insertion point to the left neighbor */ + COPI_GO_LEFT = (1 << 5), + /* shift insertion point to the right neighbor */ + COPI_GO_RIGHT = (1 << 6), + /* try to step back into original node if insertion into new node + fails after shifting data there. */ + COPI_STEP_BACK = (1 << 7), + COPI_GLUE_LEFT = (1 << 8), + COPI_GLUE_RIGHT = (1 << 9) +} cop_insert_flag; + +typedef enum { + SAFE_UNLINK, /* safe-link for unlink */ + SAFE_TRUNCATE, /* safe-link for truncate */ + SAFE_E2T, /* safe-link for extent->tail conversion */ + SAFE_T2E /* safe-link for tail->extent conversion */ +} reiser4_safe_link_t; + +/* this is to show on which list of atom jnode is */ +typedef enum { + NOT_CAPTURED, + DIRTY_LIST, + CLEAN_LIST, + FQ_LIST, + WB_LIST, + OVRWR_LIST +} atom_list; + +/* __REISER4_FORWARD_H__ */ +#endif + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/init_super.c linux-2.6.4-ck1/fs/reiser4/init_super.c --- linux-2.6.4/fs/reiser4/init_super.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/init_super.c 2004-03-11 22:45:15.241517671 +1100 @@ -0,0 +1,514 @@ +/* Copyright by Hans Reiser, 2003 */ + +#include "forward.h" +#include "debug.h" +#include "dformat.h" +#include "txnmgr.h" +#include "jnode.h" +#include "znode.h" +#include "tree.h" +#include "vfs_ops.h" +#include "inode.h" +#include "page_cache.h" +#include "ktxnmgrd.h" +#include "super.h" +#include "reiser4.h" +#include "kattr.h" +#include "entd.h" +#include "emergency_flush.h" +#include "prof.h" +#include "repacker.h" +#include "safe_link.h" + +#include +#include +#include +#include +#include +#include +#include + +#define _INIT_PARAM_LIST (struct super_block * s, reiser4_context * ctx, void * data, int silent) +#define _DONE_PARAM_LIST (struct super_block * s) + +#define _INIT_(subsys) static int _init_##subsys _INIT_PARAM_LIST +#define _DONE_(subsys) static void _done_##subsys _DONE_PARAM_LIST + +#define _DONE_EMPTY(subsys) _DONE_(subsys) {} + +_INIT_(mount_flags_check) +{ +/* if (bdev_read_only(s->s_bdev) || (s->s_flags & MS_RDONLY)) { + warning("nikita-3322", "Readonly reiser4 is not yet supported"); + return RETERR(-EROFS); + }*/ + return 0; +} + +_DONE_EMPTY(mount_flags_check) + +_INIT_(sinfo) +{ + reiser4_super_info_data * sbinfo; + + sbinfo = kmalloc(sizeof (reiser4_super_info_data), GFP_KERNEL); + if (!sbinfo) + return RETERR(-ENOMEM); + + s->s_fs_info = sbinfo; + s->s_op = NULL; + xmemset(sbinfo, 0, sizeof (*sbinfo)); + + ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes)); + ON_DEBUG(spin_lock_init(&sbinfo->all_guard)); + + sema_init(&sbinfo->delete_sema, 1); + sema_init(&sbinfo->flush_sema, 1); + spin_super_init(sbinfo); + spin_super_eflush_init(sbinfo); + + return 0; 
+} + +_DONE_(sinfo) +{ + assert("zam-990", s->s_fs_info != NULL); + rcu_barrier(); + kfree(s->s_fs_info); + s->s_fs_info = NULL; +} + +_INIT_(stat) +{ + return reiser4_stat_init(&get_super_private(s)->stats); +} + +_DONE_(stat) +{ + reiser4_stat_done(&get_super_private(s)->stats); +} + +_INIT_(context) +{ + return init_context(ctx, s); +} + +_DONE_(context) +{ + reiser4_super_info_data * sbinfo; + + sbinfo = get_super_private(s); + + close_trace_file(&sbinfo->trace_file); + + if (reiser4_is_debugged(s, REISER4_STATS_ON_UMOUNT)) + reiser4_print_stats(); + + /* we don't want ->write_super to be called any more. */ + if (s->s_op) + s->s_op->write_super = NULL; + kill_block_super(s); + /* err, don't show this to Viro */ + down_write(&s->s_umount); + +#if REISER4_DEBUG + { + struct list_head *scan; + + /* print jnodes that survived umount. */ + list_for_each(scan, &sbinfo->all_jnodes) { + jnode *busy; + + busy = list_entry(scan, jnode, jnodes); + info_jnode("\nafter umount", busy); + } + } + if (sbinfo->kmalloc_allocated > 0) + warning("nikita-2622", + "%i bytes still allocated", sbinfo->kmalloc_allocated); +#endif + + get_current_context()->trans = NULL; + done_context(get_current_context()); +} + +_INIT_(parse_options) +{ + return reiser4_parse_options(s, data); +} + +_DONE_(parse_options) +{ + close_trace_file(&get_super_private(s)->trace_file); +} + +_INIT_(object_ops) +{ + build_object_ops(s, &get_super_private(s)->ops); + return 0; +} + +_DONE_EMPTY(object_ops) + +_INIT_(read_super) +{ + struct buffer_head *super_bh; + struct reiser4_master_sb *master_sb; + int plugin_id; + reiser4_super_info_data * sbinfo = get_super_private(s); + unsigned long blocksize; + + read_super_block: +#ifdef CONFIG_REISER4_BADBLOCKS + if ( sbinfo->altsuper ) + super_bh = sb_bread(s, (sector_t) (sbinfo->altsuper >> s->s_blocksize_bits)); + else +#endif + /* look for reiser4 magic at hardcoded place */ + super_bh = sb_bread(s, (sector_t) (REISER4_MAGIC_OFFSET / s->s_blocksize)); + + if 
(!super_bh) + return RETERR(-EIO); + + master_sb = (struct reiser4_master_sb *) super_bh->b_data; + /* check reiser4 magic string */ + if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING, sizeof(REISER4_SUPER_MAGIC_STRING))) { + /* reset block size if it is not a right one FIXME-VS: better comment is needed */ + blocksize = d16tocpu(&master_sb->blocksize); + + if (blocksize != PAGE_CACHE_SIZE) { + if (!silent) + warning("nikita-2609", "%s: wrong block size %ld\n", s->s_id, blocksize); + brelse(super_bh); + return RETERR(-EINVAL); + } + if (blocksize != s->s_blocksize) { + brelse(super_bh); + if (!sb_set_blocksize(s, (int) blocksize)) { + return RETERR(-EINVAL); + } + goto read_super_block; + } + + plugin_id = d16tocpu(&master_sb->disk_plugin_id); + /* only two plugins are available for now */ + assert("vs-476", plugin_id == FORMAT40_ID); + sbinfo->df_plug = disk_format_plugin_by_id(plugin_id); + sbinfo->diskmap_block = d64tocpu(&master_sb->diskmap); + brelse(super_bh); + } else { + if (!silent) { + warning("nikita-2608", "Wrong master super block magic."); + } + + /* no standard reiser4 super block found */ + brelse(super_bh); + /* FIXME-VS: call guess method for all available layout + plugins */ + /* umka (2002.06.12) Is it possible when format-specific super + block exists but there no master super block? 
*/ + return RETERR(-EINVAL); + } + return 0; +} + +_DONE_EMPTY(read_super) + +_INIT_(tree0) +{ + reiser4_super_info_data * sbinfo = get_super_private(s); + + init_tree_0(&sbinfo->tree); + sbinfo->tree.super = s; + return 0; +} + +_DONE_EMPTY(tree0) + +_INIT_(txnmgr) +{ + txnmgr_init(&get_super_private(s)->tmgr); + return 0; +} + +_DONE_(txnmgr) +{ + txnmgr_done(&get_super_private(s)->tmgr); +} + +extern ktxnmgrd_context kdaemon; + +_INIT_(ktxnmgrd) +{ + return ktxnmgrd_attach(&kdaemon, &get_super_private(s)->tmgr); +} + +_DONE_(ktxnmgrd) +{ + ktxnmgrd_detach(&get_super_private(s)->tmgr); +} + +_INIT_(formatted_fake) +{ + return init_formatted_fake(s); +} + +_DONE_(formatted_fake) +{ + reiser4_super_info_data * sbinfo; + + sbinfo = get_super_private(s); + + rcu_barrier(); + + /* done_formatted_fake just has finished with last jnodes (bitmap + * ones) */ + done_tree(&sbinfo->tree); + /* call finish_rcu(), because some znode were "released" in + * done_tree(). */ + rcu_barrier(); + done_formatted_fake(s); +} + +_INIT_(entd) +{ + init_entd_context(s); + return 0; +} + +_DONE_(entd) +{ + done_entd_context(s); +} + +_DONE_(disk_format); + +_INIT_(disk_format) +{ + int res; + + res = get_super_private(s)->df_plug->get_ready(s, data); + if (res != 0) + return res; + + return get_super_private(s)->df_plug->check_mount(s); +} + +_DONE_(disk_format) +{ + reiser4_super_info_data *sbinfo = get_super_private(s); + + sbinfo->df_plug->release(s); +} + +_INIT_(sb_counters) +{ + /* There are some 'committed' versions of reiser4 super block + counters, which correspond to reiser4 on-disk state. 
These counters + are initialized here */ + reiser4_super_info_data *sbinfo = get_super_private(s); + + sbinfo->blocks_free_committed = sbinfo->blocks_free; + sbinfo->nr_files_committed = oids_used(s); + + return 0; +} + +_DONE_EMPTY(sb_counters) + +_INIT_(fs_root) +{ + reiser4_super_info_data *sbinfo = get_super_private(s); + struct inode * inode; + int result; + + inode = reiser4_iget(s, sbinfo->df_plug->root_dir_key(s), 0); + if (IS_ERR(inode)) + return RETERR(PTR_ERR(inode)); + + s->s_root = d_alloc_root(inode); + if (!s->s_root) { + iput(inode); + return RETERR(-ENOMEM); + } + + s->s_root->d_op = &sbinfo->ops.dentry; + + if (!is_inode_loaded(inode)) { + reiser4_inode *info; + + info = reiser4_inode_data(inode); + + result = grab_plugin_from(info, file, default_file_plugin(s)); + if (result == 0) + result = grab_plugin_from(info, + dir, default_dir_plugin(s)); + if (result == 0) + result = grab_plugin_from(info, + sd, default_sd_plugin(s)); + if (result == 0) + result = grab_plugin_from(info, hash, + default_hash_plugin(s)); + if (result == 0) + result = grab_plugin_from(info, formatting, + default_formatting_plugin(s)); + if (result == 0) + result = grab_plugin_from(info, + perm, default_perm_plugin(s)); + if (result == 0) + result = grab_plugin_from(info, dir_item, + default_dir_item_plugin(s)); + if (result == 0) { + assert("nikita-1951", info->pset->file != NULL); + assert("nikita-1814", info->pset->dir != NULL); + assert("nikita-1815", info->pset->sd != NULL); + assert("nikita-1816", info->pset->hash != NULL); + assert("nikita-1817", info->pset->formatting != NULL); + assert("nikita-1818", info->pset->perm != NULL); + assert("vs-545", info->pset->dir_item != NULL); + } else + warning("nikita-3448", "Cannot set plugins of root: %i", + result); + reiser4_iget_complete(inode); + } else + result = 0; + s->s_maxbytes = MAX_LFS_FILESIZE; + return result; +} + +_DONE_(fs_root) +{ + shrink_dcache_parent(s->s_root); +} + +_INIT_(sysfs) +{ + return 
reiser4_sysfs_init(s); +} + +_DONE_(sysfs) +{ + reiser4_sysfs_done(s); +} + +_INIT_(sysctl) +{ + return reiser4_sysctl_init(); +} + +_DONE_(sysctl) +{ + reiser4_sysctl_done(); +} + +_INIT_(repacker) +{ + return init_reiser4_repacker(s); +} + +_DONE_(repacker) +{ + done_reiser4_repacker(s); +} + +_INIT_(safelink) +{ + process_safelinks(s); + /* failure to process safe-links is not critical. Continue with + * mount. */ + return 0; +} + +_DONE_(safelink) +{ +} + +_INIT_(exit_context) +{ + return reiser4_exit_context(ctx); +} + +_DONE_EMPTY(exit_context) + +struct reiser4_subsys { + int (*init) _INIT_PARAM_LIST; + void (*done) _DONE_PARAM_LIST; +}; + +#define _SUBSYS(subsys) {.init = &_init_##subsys, .done = &_done_##subsys} +static struct reiser4_subsys subsys_array[] = { + _SUBSYS(mount_flags_check), + _SUBSYS(sinfo), + _SUBSYS(stat), + _SUBSYS(context), + _SUBSYS(parse_options), + _SUBSYS(object_ops), + _SUBSYS(read_super), + _SUBSYS(tree0), + _SUBSYS(txnmgr), + _SUBSYS(ktxnmgrd), + _SUBSYS(entd), + _SUBSYS(formatted_fake), + _SUBSYS(disk_format), + _SUBSYS(sb_counters), + _SUBSYS(fs_root), + _SUBSYS(sysfs), + _SUBSYS(sysctl), + _SUBSYS(repacker), + _SUBSYS(safelink), + _SUBSYS(exit_context) +}; + +#define REISER4_NR_SUBSYS (sizeof(subsys_array) / sizeof(struct reiser4_subsys)) + +static void done_super (struct super_block * s, int last_done) +{ + int i; + for (i = last_done; i >= 0; i--) + subsys_array[i].done(s); +} + +/* read super block from device and fill remaining fields in @s. + + This is read_super() of the past. 
*/ +reiser4_internal int +reiser4_fill_super (struct super_block * s, void * data, int silent) +{ + reiser4_context ctx; + int i; + int ret; + + assert ("zam-989", s != NULL); + + for (i = 0; i < REISER4_NR_SUBSYS; i++) { + ret = subsys_array[i].init(s, &ctx, data, silent); + if (ret) { + done_super(s, i - 1); + return ret; + } + } + return 0; +} + +#if 0 + +int reiser4_done_super (struct super_block * s) +{ + reiser4_context ctx; + + init_context(&ctx, s); + done_super(s, REISER4_NR_SUBSYS - 1); + return 0; +} + +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 80 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/init_super.h linux-2.6.4-ck1/fs/reiser4/init_super.h --- linux-2.6.4/fs/reiser4/init_super.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/init_super.h 2004-03-11 22:45:15.242517516 +1100 @@ -0,0 +1,4 @@ +/* Copyright by Hans Reiser, 2003 */ + +extern int reiser4_fill_super (struct super_block * s, void * data, int silent); +extern int reiser4_done_super (struct super_block * s); diff -Naurp linux-2.6.4/fs/reiser4/inode.c linux-2.6.4-ck1/fs/reiser4/inode.c --- linux-2.6.4/fs/reiser4/inode.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/inode.c 2004-03-11 22:45:15.243517360 +1100 @@ -0,0 +1,795 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Inode specific operations. 
*/ + +#include "forward.h" +#include "debug.h" +#include "key.h" +#include "kassign.h" +#include "coord.h" +#include "seal.h" +#include "dscale.h" +#include "plugin/item/item.h" +#include "plugin/security/perm.h" +#include "plugin/plugin.h" +#include "plugin/object.h" +#include "znode.h" +#include "vfs_ops.h" +#include "inode.h" +#include "super.h" +#include "reiser4.h" + +#include /* for struct super_block, address_space */ + +/* return reiser4 internal tree which inode belongs to */ +/* Audited by: green(2002.06.17) */ +reiser4_internal reiser4_tree * +tree_by_inode(const struct inode * inode /* inode queried */ ) +{ + assert("nikita-256", inode != NULL); + assert("nikita-257", inode->i_sb != NULL); + return get_tree(inode->i_sb); +} + +/* return reiser4-specific inode flags */ +static inline unsigned long * +inode_flags(const struct inode * const inode) +{ + assert("nikita-2842", inode != NULL); + return &reiser4_inode_data(inode)->flags; +} + +/* set reiser4-specific flag @f in @inode */ +reiser4_internal void +inode_set_flag(struct inode * inode, reiser4_file_plugin_flags f) +{ + assert("nikita-2248", inode != NULL); + set_bit((int) f, inode_flags(inode)); +} + +/* clear reiser4-specific flag @f in @inode */ +reiser4_internal void +inode_clr_flag(struct inode * inode, reiser4_file_plugin_flags f) +{ + assert("nikita-2250", inode != NULL); + clear_bit((int) f, inode_flags(inode)); +} + +/* true if reiser4-specific flag @f is set in @inode */ +reiser4_internal int +inode_get_flag(const struct inode * inode, reiser4_file_plugin_flags f) +{ + assert("nikita-2251", inode != NULL); + return test_bit((int) f, inode_flags(inode)); +} + +/* convert oid to inode number */ +reiser4_internal ino_t oid_to_ino(oid_t oid) +{ + return (ino_t) oid; +} + +/* convert oid to user visible inode number */ +reiser4_internal ino_t oid_to_uino(oid_t oid) +{ + /* reiser4 object is uniquely identified by oid which is 64 bit + quantity. 
Kernel in-memory inode is indexed (in the hash table) by + 32 bit i_ino field, but this is not a problem, because there is a + way to further distinguish inodes with identical inode numbers + (find_actor supplied to iget()). + + But user space expects unique 32 bit inode number. Obviously this + is impossible. Work-around is to somehow hash oid into user visible + inode number. + */ + oid_t max_ino = (ino_t) ~ 0; + + if (REISER4_INO_IS_OID || (oid <= max_ino)) + return oid; + else + /* this is remotely similar to algorithm used to find next pid + to use for process: after wrap-around start from some + offset rather than from 0. Idea is that there are some long + living objects with which we don't want to collide. + */ + return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1)); +} + +/* check that "inode" is on reiser4 file-system */ +reiser4_internal int +is_reiser4_inode(const struct inode *inode /* inode queried */ ) +{ + return + inode != NULL && + (is_reiser4_super(inode->i_sb) || + inode->i_op == &reiser4_inode_operations); + +} + +/* Maximal length of a name that can be stored in directory @inode. + + This is used in check during file creation and lookup. */ +reiser4_internal int +reiser4_max_filename_len(const struct inode *inode /* inode queried */ ) +{ + assert("nikita-287", is_reiser4_inode(inode)); + assert("nikita-1710", inode_dir_item_plugin(inode)); + if (inode_dir_item_plugin(inode)->s.dir.max_name_len) + return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode); + else + return 255; +} + +/* Maximal number of hash collisions for this directory. */ +reiser4_internal int +max_hash_collisions(const struct inode *dir /* inode queried */ ) +{ + assert("nikita-1711", dir != NULL); +#if REISER4_USE_COLLISION_LIMIT + return reiser4_inode_data(dir)->plugin.max_collisions; +#else + (void) dir; + return ~0; +#endif +} + +/* Install file, inode, and address_space operation on @inode, depending on + its mode. 
*/
+/* Returns 0 on success. For an unknown file type the inode is made bad and
+   -EINVAL is returned. @data may be NULL; it is only consulted for the device
+   number of special files. */
+reiser4_internal int
+setup_inode_ops(struct inode *inode /* inode to initialise */ ,
+		reiser4_object_create_data * data	/* parameters to create
+							 * object */ )
+{
+	reiser4_super_info_data *sbinfo;
+	int fmt;
+
+	sbinfo = get_super_private(inode->i_sb);
+	fmt = inode->i_mode & S_IFMT;
+
+	if (fmt == S_IFSOCK || fmt == S_IFBLK ||
+	    fmt == S_IFCHR || fmt == S_IFIFO) {
+		dev_t dev;
+
+		/* ugly hack with rdev: without @data the device number is
+		   taken from the inode, which is then reset */
+		if (data != NULL)
+			dev = data->rdev;
+		else {
+			dev = inode->i_rdev;
+			inode->i_rdev = 0;
+		}
+		inode->i_blocks = 0;
+		inode->i_op = &sbinfo->ops.special;
+		/* the remaining fields are set up already */
+		init_special_inode(inode, inode->i_mode, dev);
+		return 0;
+	}
+
+	if (fmt == S_IFLNK) {
+		inode->i_op = &sbinfo->ops.symlink;
+		inode->i_fop = NULL;
+	} else if (fmt == S_IFDIR) {
+		inode->i_op = &sbinfo->ops.dir;
+		inode->i_fop = &sbinfo->ops.file;
+	} else if (fmt == S_IFREG) {
+		inode->i_op = &sbinfo->ops.regular;
+		inode->i_fop = &sbinfo->ops.file;
+	} else {
+		warning("nikita-291", "wrong file mode: %o for %llu", inode->i_mode, get_inode_oid(inode));
+		reiser4_make_bad_inode(inode);
+		return RETERR(-EINVAL);
+	}
+	/* symlinks, directories and regular files share one set of
+	   address_space operations */
+	inode->i_mapping->a_ops = &sbinfo->ops.as;
+	return 0;
+}
+
+/* initialise inode from disk data. Called with inode locked.
+   Return inode locked.
*/
+/* Loads the node of @coord, lets the stat-data item plugin fill @inode,
+   installs the inode operations and, once the root inode exists, takes the
+   plugins still missing from it. The node is released before returning.
+   Returns 0 or the first error encountered. */
+static int
+init_inode(struct inode *inode /* inode to initialise */ ,
+	   coord_t * coord /* coord of stat data */ )
+{
+	int ret;
+	item_plugin *sdplug;
+	void *sd_body;
+	int sd_len;
+	reiser4_inode *state;
+	reiser4_inode *self;
+	reiser4_inode *root;
+
+	assert("nikita-292", coord != NULL);
+	assert("nikita-293", inode != NULL);
+
+	coord_clear_iplug(coord);
+	ret = zload(coord->node);
+	if (ret)
+		return ret;
+	sdplug = item_plugin_by_coord(coord);
+	sd_body = item_body_by_coord(coord);
+	sd_len = item_length_by_coord(coord);
+
+	assert("nikita-295", sdplug != NULL);
+	assert("nikita-296", sd_body != NULL);
+	assert("nikita-297", sd_len > 0);
+
+	/* inode is under I_LOCK now */
+
+	state = reiser4_inode_data(inode);
+	/* the stat-data plugin copies the on-disk stat data into the inode */
+	ret = sdplug->s.sd.init_inode(inode, sd_body, sd_len);
+	plugin_set_sd(&state->pset, sdplug);
+	if (ret != 0)
+		goto out;
+
+	ret = setup_inode_ops(inode, NULL);
+	if (ret != 0)
+		goto out;
+
+	/* nothing to inherit from while the root inode is not there yet */
+	if (!inode->i_sb->s_root || !inode->i_sb->s_root->d_inode)
+		goto out;
+
+	/* take missing plugins from file-system defaults */
+	self = reiser4_inode_data(inode);
+	root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
+	/* file and directory plugins are already initialised. */
+	ret = grab_plugin(self, root, sd);
+	if (ret == 0)
+		ret = grab_plugin(self, root, hash);
+	if (ret == 0)
+		ret = grab_plugin(self, root, formatting);
+	if (ret == 0)
+		ret = grab_plugin(self, root, perm);
+	if (ret == 0)
+		ret = grab_plugin(self, root, dir_item);
+	if (ret != 0) {
+		warning("nikita-3447",
+			"Cannot set up plugins for %lli",
+			get_inode_oid(inode));
+	}
+ out:
+	zrelse(coord->node);
+	return ret;
+}
+
+/* read `inode' from the disk. This is what was previously in
+   reiserfs_read_inode2().
+
+   Must be called with inode locked. Return inode still locked.
+*/
+/* Looks the stat data up by @key, loads it into @inode through init_inode(),
+   records seal and coord of the stat data in the reiser4 inode, lets the file
+   plugin initialise its part and asks the disk format plugin to check the
+   opened inode. On any error the inode is made bad. Returns 0 or the error. */
+static int
+read_inode(struct inode *inode /* inode to read from disk */ ,
+	   const reiser4_key * key /* key of stat data */,
+	   int silent)
+{
+	int ret;
+	lock_handle lh;
+	reiser4_inode *info;
+	coord_t coord;
+
+	assert("nikita-298", inode != NULL);
+	assert("nikita-1945", !is_inode_loaded(inode));
+
+	info = reiser4_inode_data(inode);
+	assert("nikita-300", info->locality_id != 0);
+
+	coord_init_zero(&coord);
+	init_lh(&lh);
+	/* find the stat data in the tree; its znode comes back locked */
+	ret = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
+	assert("nikita-301", !is_inode_loaded(inode));
+	if (ret == 0)
+		/* the stat-data plugin loads the sd into the inode */
+		ret = init_inode(inode, &coord);
+	if (ret == 0) {
+		file_plugin *fplug;
+
+		/* set up the stat-data seal */
+		spin_lock_inode(inode);
+		seal_init(&info->sd_seal, &coord, key);
+		info->sd_coord = coord;
+		spin_unlock_inode(inode);
+
+		/* the file plugin initialises its own part of the inode,
+		 * if it has a method for that */
+		fplug = inode_file_plugin(inode);
+		if (fplug->init_inode_data)
+			fplug->init_inode_data(inode, NULL, 0);
+		/* consistency check of the opened inode */
+		ret = get_super_private(inode->i_sb)->df_plug->check_open(inode);
+	}
+	/* lookup_sd() leaves the coord alone so that the znode stays
+	   read-locked while init_inode() reads the stat-data fields; the
+	   lock is dropped only here */
+	done_lh(&lh);
+
+	if (ret != 0)
+		reiser4_make_bad_inode(inode);
+	return ret;
+}
+
+/* initialise new reiser4 inode being inserted into hash table.
*/
+/* Stores objectid and locality of the stat-data key in @inode. Always
+   returns 0. */
+static int
+init_locked_inode(struct inode *inode /* new inode */ ,
+		  void *opaque	/* key of stat data passed to the
+				 * iget5_locked as cookie */ )
+{
+	reiser4_key *sd_key;
+	reiser4_inode *info;
+
+	assert("nikita-1995", inode != NULL);
+	assert("nikita-1996", opaque != NULL);
+	sd_key = opaque;
+	set_inode_oid(inode, get_key_objectid(sd_key));
+	info = reiser4_inode_data(inode);
+	info->locality_id = get_key_locality(sd_key);
+	return 0;
+}
+
+/* reiser4_inode_find_actor() - "find actor" handed to iget5_locked().
+
+   iget5_locked() calls it to tell apart reiser4 inodes with equal inode
+   numbers. Such inodes can only appear after some error and one of them
+   should be bad. Inodes with equal inode numbers (objectids) are told apart
+   by their packing locality.
+
+   Returns 1 when @inode matches the stat-data key in @opaque, 0 otherwise.
+*/
+reiser4_internal int
+reiser4_inode_find_actor(struct inode *inode	/* inode from hash table to
+						 * check */ ,
+			 void *opaque	/* "cookie" passed to
+					 * iget5_locked(). This is stat data
+					 * key */ )
+{
+	reiser4_key *sd_key;
+
+	sd_key = opaque;
+
+	/* oid is unique, so this test alone would be enough, actually. */
+	if (get_inode_oid(inode) != get_key_objectid(sd_key))
+		return 0;
+	/*
+	 * The locality has to be compared as well, but it is kept in the
+	 * reiser4-specific part of the inode, while the actor may be run
+	 * against any inode that happens to be in this hash chain. So make
+	 * sure first that this is a reiser4 inode at all. is_reiser4_inode()
+	 * is probably too early to call, because ->i_op may not be
+	 * initialised yet.
+	 */
+	if (!is_reiser4_super(inode->i_sb))
+		return 0;
+	/* an inode that is not loaded yet matches on the oid alone */
+	if (!is_inode_loaded(inode))
+		return 1;
+	return reiser4_inode_data(inode)->locality_id == get_key_locality(sd_key);
+}
+
+/*
+ * this is our helper function a la iget(). It is called by
+ * reiser4_lookup() and reiser4_read_super(). Return inode locked or error
+ * encountered.
+ */
+reiser4_internal struct inode *
+reiser4_iget(struct super_block *super /* super block */ ,
+	     const reiser4_key * key /* key of inode's stat-data */,
+	     int silent)
+{
+	struct inode *inode;
+	int result;
+
+	assert("nikita-302", super != NULL);
+	assert("nikita-303", key != NULL);
+
+	result = 0;
+
+	/* call iget(). Our ->read_inode() is dummy, so this will either
+	   find inode in cache or return uninitialised inode */
+	inode = iget5_locked(super,
+			     (unsigned long) get_key_objectid(key),
+			     reiser4_inode_find_actor,
+			     init_locked_inode,
+			     (reiser4_key *) key);
+	if (inode == NULL)
+		return ERR_PTR(RETERR(-ENOMEM));
+	/* a bad inode is only rejected here when @silent is 0; with @silent
+	   set, execution continues below */
+	if (is_bad_inode(inode) && !silent) {
+		warning("nikita-304", "Stat data not found");
+		print_key("key", key);
+		iput(inode);
+		return ERR_PTR(RETERR(-EIO));
+	}
+
+	/* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully
+	   loaded and initialized inode from just allocated inode. If
+	   REISER4_LOADED bit is not set, reiser4_iget() completes loading under
+	   inode->i_sem.  The place in reiser4 which uses not initialized inode
+	   is the reiser4 repacker, see repacker-related functions in
+	   plugin/item/extent.c.
+
+	   Note that on the success path ->i_sem taken here is not released by
+	   this function: reiser4_iget_complete() does the up(). */
+	if (!is_inode_loaded(inode)) {
+		reiser4_inode * info;
+
+		info = reiser4_inode_data(inode);
+		down(&inode->i_sem);
+		if (!is_inode_loaded(inode)) {
+			/* locking: iget5_locked returns locked inode */
+			assert("nikita-1941", !is_inode_loaded(inode));
+			assert("nikita-1949",
+			       reiser4_inode_find_actor(inode,
+							(reiser4_key *)key));
+			/* now, inode has objectid as ->i_ino and locality in
+			   reiser4-specific part. This is enough for
+			   read_inode() to read stat data from the disk */
+			result = read_inode(inode, key, silent);
+		}
+	}
+
+	if (inode->i_state & I_NEW)
+		unlock_new_inode(inode);
+
+	if (is_bad_inode(inode)) {
+		up(&inode->i_sem); /* NOTE(review): only balanced when the
+				      down() above was executed - confirm a bad
+				      inode never has REISER4_LOADED set */
+		iput(inode);
+		inode = ERR_PTR(result); /* NOTE(review): if result were 0 here
+					    this would be NULL, which IS_ERR()
+					    does not report - confirm it
+					    cannot happen */
+	} else if (REISER4_DEBUG) {
+		reiser4_key found_key;
+
+		/* debugging only: the key rebuilt from the inode must be the
+		   key that was asked for */
+		build_sd_key(inode, &found_key);
+		if (!keyeq(&found_key, key)) {
+			warning("nikita-305", "Wrong key in sd");
+			print_key("sought for", key);
+			print_key("found", &found_key);
+		}
+	}
+	return inode;
+}
+
+/* reiser4_iget() may return not fully initialized inode, this function should
+ * be called after one completes reiser4 inode initializing. It sets
+ * REISER4_LOADED and releases ->i_sem; it does nothing when the inode is
+ * already loaded. */
+reiser4_internal void reiser4_iget_complete (struct inode * inode)
+{
+	assert("zam-988", is_reiser4_inode(inode));
+
+	if (!is_inode_loaded(inode)) {
+		inode_set_flag(inode, REISER4_LOADED);
+		up(&inode->i_sem);
+	}
+}
+/* clear REISER4_LOADED and hand @inode to make_bad_inode() */
+reiser4_internal void
+reiser4_make_bad_inode(struct inode *inode)
+{
+	assert("nikita-1934", inode != NULL);
+
+	/* clear LOADED bit */
+	inode_clr_flag(inode, REISER4_LOADED);
+	make_bad_inode(inode);
+	return;
+}
+/* The accessors below return one member of the plugin set (->pset) of the
+   reiser4 inode of @inode. This one returns the file plugin. */
+reiser4_internal file_plugin *
+inode_file_plugin(const struct inode * inode)
+{
+	assert("nikita-1997", inode != NULL);
+	return reiser4_inode_data(inode)->pset->file;
+}
+/* directory plugin of @inode */
+reiser4_internal dir_plugin *
+inode_dir_plugin(const struct inode * inode)
+{
+	assert("nikita-1998", inode != NULL);
+	return reiser4_inode_data(inode)->pset->dir;
+}
+/* permission plugin of @inode */
+reiser4_internal perm_plugin *
+inode_perm_plugin(const struct inode * inode)
+{
+	assert("nikita-1999", inode != NULL);
+	return reiser4_inode_data(inode)->pset->perm;
+}
+/* formatting plugin of @inode */
+reiser4_internal formatting_plugin *
+inode_formatting_plugin(const struct inode * inode)
+{
+	assert("nikita-2000", inode != NULL);
+	return reiser4_inode_data(inode)->pset->formatting;
+}
+/* hash plugin of @inode */
+reiser4_internal hash_plugin *
+inode_hash_plugin(const struct inode * inode)
+{
+	assert("nikita-2001", inode != NULL);
+	return reiser4_inode_data(inode)->pset->hash;
+}
+/* crypto plugin of @inode */
+reiser4_internal crypto_plugin *
+inode_crypto_plugin(const struct inode * inode)
+{
+	assert("edward-36", inode != NULL);
+	return reiser4_inode_data(inode)->pset->crypto;
+}
+/* compression plugin of @inode */
+reiser4_internal compression_plugin *
+inode_compression_plugin(const struct inode * inode)
+{
+	assert("edward-37", inode != NULL);
+	return reiser4_inode_data(inode)->pset->compression;
+}
+/* digest plugin of @inode */
+reiser4_internal digest_plugin *
+inode_digest_plugin(const struct inode * inode)
+{
+	assert("edward-86", inode != NULL);
+	return reiser4_inode_data(inode)->pset->digest;
+}
+
+/* stat-data item plugin of @inode */
+/* Audited by: green(2002.06.17) */
+reiser4_internal item_plugin *
+inode_sd_plugin(const struct inode * inode)
+{
+	assert("vs-534", inode != NULL);
+	return reiser4_inode_data(inode)->pset->sd;
+}
+
+/* directory item plugin of @inode. NOTE(review): the assertion label
+   "vs-534" is also used by inode_sd_plugin(). */
+/* Audited by: green(2002.06.17) */
+reiser4_internal item_plugin *
+inode_dir_item_plugin(const struct inode * inode)
+{
+	assert("vs-534", inode != NULL);
+	return reiser4_inode_data(inode)->pset->dir_item;
+}
+/* set bit @ext in the extension mask of @inode and clear REISER4_SDLEN_KNOWN;
+   both are done under the inode spin lock */
+reiser4_internal void
+inode_set_extension(struct inode *inode, sd_ext_bits ext)
+{
+	reiser4_inode *state;
+
+	assert("nikita-2716", inode != NULL);
+	assert("nikita-2717", ext < LAST_SD_EXTENSION);
+
+	state = reiser4_inode_data(inode);
+	spin_lock_inode(inode);
+	/* FIXME: return value of scint_pack is not checked. */
+	scint_pack(&state->extmask,
+		   scint_unpack(&state->extmask) | (1 << ext), GFP_ATOMIC);
+	/* force re-calculation of stat-data length on next call to
+	   update_sd(). */
+	inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
+	spin_unlock_inode(inode);
+}
+/* set the bit of the type of @plug in ->plugin_mask of @inode and turn on the
+   PLUGIN_STAT extension. The update of ->plugin_mask itself is not done under
+   the inode spin lock. */
+reiser4_internal void
+inode_set_plugin(struct inode *inode, reiser4_plugin * plug)
+{
+	assert("nikita-2718", inode != NULL);
+	assert("nikita-2719", plug != NULL);
+
+	reiser4_inode_data(inode)->plugin_mask |= (1 << plug->h.type_id);
+	inode_set_extension(inode, PLUGIN_STAT);
+}
+/* clear REISER4_SDLEN_KNOWN when dscale_fit(@old, @new) is false */
+reiser4_internal void
+inode_check_scale(struct inode *inode, __u64 old, __u64 new)
+{
+	assert("nikita-2875", inode != NULL);
+	spin_lock_inode(inode);
+	if (!dscale_fit(old, new))
+		inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
+	spin_unlock_inode(inode);
+}
+
+/*
+ * initialize ->ordering field of inode. This field defines how file stat-data
+ * and body is ordered within a tree with respect to other objects within the
+ * same parent directory.
+ *
+ * When @create is non-zero the ordering is taken from the directory entry key
+ * built by the directory plugin of @crd->parent; otherwise it is taken from
+ * the key of the stat-data item at ->sd_coord, and @crd is not used.
+ */
+reiser4_internal void
+init_inode_ordering(struct inode *inode,
+		    reiser4_object_create_data *crd, int create)
+{
+	reiser4_key key;
+
+	if (create) {
+		struct inode *parent;
+
+		parent = crd->parent;
+		assert("nikita-3224", inode_dir_plugin(parent) != NULL);
+		inode_dir_plugin(parent)->build_entry_key(parent,
+							  &crd->dentry->d_name,
+							  &key);
+	} else {
+		coord_t *coord;
+
+		coord = &reiser4_inode_data(inode)->sd_coord;
+		coord_clear_iplug(coord);
+		/* safe to use ->sd_coord, because node is under long term
+		 * lock */
+		WITH_DATA(coord->node, item_key_by_coord(coord, &key));
+	}
+
+	set_inode_ordering(inode, get_key_ordering(&key));
+}
+/* read ->vroot of @inode under LOCK_INODE and look the znode up with zlook();
+   returns NULL when ->vroot equals UBER_TREE_ADDR */
+reiser4_internal znode *
+inode_get_vroot(struct inode *inode)
+{
+	reiser4_block_nr blk;
+	znode *result;
+	reiser4_inode *info;
+
+	info = reiser4_inode_data(inode);
+	LOCK_INODE(info);
+	blk = info->vroot;
+	UNLOCK_INODE(info);
+	if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
+		result = zlook(tree_by_inode(inode), &blk);
+	else
+		result = NULL;
+	return result;
+}
+/* store the block number of @vroot in ->vroot of @inode, under LOCK_INODE */
+reiser4_internal void
+inode_set_vroot(struct inode *inode, znode *vroot)
+{
+	reiser4_inode *info;
+
+	info = reiser4_inode_data(inode);
+	LOCK_INODE(info);
+	info->vroot = *znode_get_block(vroot);
+	UNLOCK_INODE(info);
+}
+/* reset ->vroot of @inode to UBER_TREE_ADDR, under LOCK_INODE */
+reiser4_internal void
+inode_clean_vroot(struct inode *inode)
+{
+	reiser4_inode *info;
+
+	info = reiser4_inode_data(inode);
+	LOCK_INODE(info);
+	info->vroot = UBER_TREE_ADDR;
+	UNLOCK_INODE(info);
+}
+/* get the inode with the objectid of @key on the current super block without
+   reading its stat data. Returns 0 and stores the inode in @result, or
+   returns -ENOMEM / -EIO. NOTE(review): the errors are returned without
+   RETERR(), unlike reiser4_iget() - confirm whether that is intended. */
+reiser4_internal int
+get_reiser4_inode_by_key (struct inode ** result, const reiser4_key * key)
+{
+	struct super_block * super = reiser4_get_current_sb();
+	struct inode * inode;
+
+	/* We do not need to read reiser4 inode from disk and initialize all
+	 * reiser4 inode fields. */
+	inode = iget_locked(super, (unsigned long)get_key_objectid(key));
+	if (inode == NULL)
+		return -ENOMEM;
+	if (is_bad_inode(inode)) {
+		iput(inode);
+		return -EIO;
+	}
+
+	if (inode->i_state & I_NEW) {
+		reiser4_inode * inode_data = reiser4_inode_data(inode);
+
+		/* These inode fields are required for tree traversal. */
+		set_inode_oid(inode, get_key_objectid(key));
+		inode_data->locality_id = get_key_locality(key);
+#if REISER4_LARGE_KEY
+		inode_data->ordering = get_key_ordering(key);
+#endif
+
+		inode->i_mapping->a_ops = &reiser4_as_operations;
+		unlock_new_inode(inode);
+	}
+
+	*result = inode;
+	return 0;
+}
+
+
+#if REISER4_DEBUG_OUTPUT
+/* Debugging aid: print information about inode.
*/
+reiser4_internal void
+print_inode(const char *prefix /* prefix to print */ ,
+	    const struct inode *i /* inode to print */ )
+{
+	reiser4_key inode_key;
+	reiser4_inode *ref;
+
+	/* a NULL inode is reported rather than dereferenced */
+	if (i == NULL) {
+		printk("%s: inode: null\n", prefix);
+		return;
+	}
+	/* generic VFS fields of the inode */
+	printk("%s: ino: %lu, count: %i, link: %i, mode: %o, size: %llu\n",
+	       prefix, i->i_ino, atomic_read(&i->i_count), i->i_nlink, i->i_mode, (unsigned long long) i->i_size);
+	printk("\tuid: %i, gid: %i, dev: %i, rdev: %i\n", i->i_uid, i->i_gid, i->i_sb->s_dev, i->i_rdev);
+	printk("\tatime: [%li,%li], mtime: [%li,%li], ctime: [%li,%li]\n",
+	       i->i_atime.tv_sec, i->i_atime.tv_nsec,
+	       i->i_mtime.tv_sec, i->i_mtime.tv_nsec,
+	       i->i_ctime.tv_sec, i->i_ctime.tv_nsec);
+	printk("\tblkbits: %i, blksize: %lu, blocks: %lu, bytes: %u\n",
+	       i->i_blkbits, i->i_blksize, i->i_blocks, i->i_bytes);
+	printk("\tversion: %lu, generation: %i, state: %lu, flags: %u\n",
+	       i->i_version, i->i_generation, i->i_state, i->i_flags);
+	printk("\tis_reiser4_inode: %i\n", is_reiser4_inode(i));
+	/* reiser4 specific part: stat-data key, then the plugin set */
+	print_key("\tkey", build_sd_key(i, &inode_key));
+	ref = reiser4_inode_data(i);
+	print_plugin("\tfile", file_plugin_to_plugin(ref->pset->file));
+	print_plugin("\tdir", dir_plugin_to_plugin(ref->pset->dir));
+	print_plugin("\tperm", perm_plugin_to_plugin(ref->pset->perm));
+	print_plugin("\tformatting", formatting_plugin_to_plugin(ref->pset->formatting));
+	print_plugin("\thash", hash_plugin_to_plugin(ref->pset->hash));
+	print_plugin("\tsd", item_plugin_to_plugin(ref->pset->sd));
+
+	/* FIXME-VS: this segfaults trying to print seal's coord */
+	print_seal("\tsd_seal", &ref->sd_seal);
+	print_coord("\tsd_coord", &ref->sd_coord, 0);
+	printk("\tflags: %lx, extmask: %llu, pmask: %i, locality: %llu\n",
+	       *inode_flags(i), scint_unpack(&ref->extmask),
+	       ref->plugin_mask, ref->locality_id);
+}
+#endif
+
+#if REISER4_DEBUG
+/* debugging check of per-inode invariants. Asserts that the spin lock of the
+ * reiser4 part of @inode is held and, under the eflush lock of the super
+ * block, that the eflush counters are consistent:
+ *
+ *  . ->eflushed is not negative;
+ *  . a positive ->eflushed implies a non-empty ->eflushed_jnodes list;
+ *  . ->eflushed is not smaller than ->eflushed_anon.
+ */
+void
+inode_invariant(const struct inode *inode)
+{
+	reiser4_inode * object;
+
+	object = reiser4_inode_data(inode);
+	assert("nikita-3077", spin_inode_object_is_locked(object));
+
+	spin_lock_eflush(inode->i_sb);
+
+	assert("nikita-3146", object->eflushed >= 0);
+	assert("nikita-3441", ergo(object->eflushed > 0,
+				   !list_empty(&object->eflushed_jnodes)));
+	assert("nikita-3442", object->eflushed >= object->eflushed_anon);
+
+	spin_unlock_eflush(inode->i_sb);
+}
+
+/* track, in the ->dirty[] table of the current context, whether update of
+ * @object is delayed. If @immediate is non-zero the slot found for @object is
+ * released (its ->ino is reset to 0); otherwise the slot is claimed for
+ * @object and flagged as delayed. A warning is issued when the table has
+ * neither a slot for @object nor a free one. */
+void
+mark_inode_update(struct inode *object, int immediate)
+{
+	int i;
+	int pos;
+	reiser4_context *ctx;
+
+	ctx = get_current_context();
+	/* prefer the slot that already tracks this inode (stop at once);
+	 * otherwise remember a free slot (->ino == 0). The scan does not stop
+	 * at a free slot, so the last free one seen is used. */
+	for (i = 0, pos = -1; i < TRACKED_DELAYED_UPDATE; ++i) {
+		if (ctx->dirty[i].ino == object->i_ino) {
+			pos = i;
+			break;
+		} else if (ctx->dirty[i].ino == 0)
+			pos = i;
+	}
+	if (pos == -1)
+		warning("nikita-3402", "Too many delayed inode updates");
+	else if (immediate) {
+		ctx->dirty[pos].ino = 0;
+	} else {
+		ctx->dirty[pos].ino = object->i_ino;
+		ctx->dirty[pos].delayed = 1;
+#ifdef CONFIG_FRAME_POINTER
+		/* remember who asked for the delayed update */
+		ctx->dirty[pos].stack[0] = __builtin_return_address(0);
+		ctx->dirty[pos].stack[1] = __builtin_return_address(1);
+		ctx->dirty[pos].stack[2] = __builtin_return_address(2);
+		ctx->dirty[pos].stack[3] = __builtin_return_address(3);
+#endif
+	}
+}
+
+
+/* return 1 if at least one used slot (->ino != 0) of @info is flagged as
+ * delayed, 0 otherwise */
+int
+delayed_inode_updates(dirty_inode_info info)
+{
+	int i;
+
+	for (i = 0; i < TRACKED_DELAYED_UPDATE; ++i) {
+		if (info[i].ino != 0 && info[i].delayed)
+			return 1;
+	}
+	return 0;
+}
+
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/inode.h linux-2.6.4-ck1/fs/reiser4/inode.h
--- linux-2.6.4/fs/reiser4/inode.h	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/inode.h	2004-03-11 22:45:15.244517205 +1100
@@ -0,0 +1,432 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Inode functions.
*/
+
+#if !defined( __REISER4_INODE_H__ )
+#define __REISER4_INODE_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "spin_macros.h"
+#include "key.h"
+#include "kcond.h"
+#include "seal.h"
+#include "scint.h"
+#include "plugin/plugin.h"
+#include "plugin/cryptcompress.h"
+#include "plugin/plugin_set.h"
+#include "plugin/security/perm.h"
+#include "plugin/security/acl.h"
+#include "plugin/pseudo/pseudo.h"
+#if defined(XATTR)
+#include "plugin/xattr.h"
+#endif
+#include "vfs_ops.h"
+#include "jnode.h"
+
+#include /* for __u?? , ino_t */
+#include /* for struct super_block, struct
+	  * rw_semaphore, etc */
+#include
+#include
+
+/* reiser4-specific inode flags. They are "transient" and are not
+   supposed to be stored on disk. Used to trace "state" of
+   inode. Bitmasks for this field are defined in
+   reiser4_file_plugin_flags enum.
+
+   Flags are stored in inode->i_mapping.assoc_mapping field
+
+   NOTE(review): struct reiser4_inode below also has a ->flags field that is
+   documented as being filled by values of this enum -- confirm which of the
+   two locations is current.
+
+   Value 5 is not assigned in this enum. */
+typedef enum {
+	/* this is light-weight inode, inheriting some state from its
+	   parent */
+	REISER4_LIGHT_WEIGHT = 0,
+	/* stat data wasn't yet created */
+	REISER4_NO_SD = 1,
+	/* internal immutable flag. Currently is only used
+	   to avoid race condition during file creation.
+	   See comment in create_object(). */
+	REISER4_IMMUTABLE = 2,
+	/* inode was read from storage */
+	REISER4_LOADED = 3,
+	/* this bit is set for symlinks. inode->u.generic_ip points to target
+	   name of symlink. */
+	REISER4_GENERIC_PTR_USED = 4,
+	/* set if size of stat-data item for this inode is known. If this is
+	 * set we can avoid recalculating size of stat-data on each update. */
+	REISER4_SDLEN_KNOWN = 6,
+	/* reiser4_inode->crypt points to the crypto stat */
+	REISER4_CRYPTO_STAT_LOADED = 7,
+	/* reiser4_inode->cluster_shift makes sense */
+	REISER4_CLUSTER_KNOWN = 8,
+	/* cryptcompress_inode_data points to the secret key */
+	REISER4_SECRET_KEY_INSTALLED = 9,
+	/* file is mapped for read only and it consists of tails. */
+	REISER4_TAILS_FILE_MMAPED = 10,
+	/* NOTE(review): undocumented in the original; the name suggests a
+	 * partially completed conversion -- confirm against the users of
+	 * this flag */
+	REISER4_PART_CONV = 11
+} reiser4_file_plugin_flags;
+
+/* state associated with each inode.
+   reiser4 inode.
+
+   NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
+   be of the same size. File-system allocates inodes by itself through
+   s_op->allocate_inode() method. So, it is possible to adjust size of inode
+   at the time of its creation.
+
+
+   Invariants involving parts of this data-type:
+
+      [inode->eflushed]
+
+*/
+
+typedef struct reiser4_inode reiser4_inode;
+/* return pointer to reiser4-specific part of inode. Only declared here,
+   before plugin/file/file.h is included; the definition follows struct
+   reiser4_inode_object below. */
+static inline reiser4_inode *
+reiser4_inode_data(const struct inode * inode /* inode queried */);
+
+#include "plugin/file/file.h"
+
+#if BITS_PER_LONG == 64
+
+/* ->i_ino is wide enough to hold the whole object id, so there is no
+   separate high part to store: oid_hi_t is an empty placeholder type */
+#define REISER4_INO_IS_OID (1)
+typedef struct {;
+} oid_hi_t;
+
+/* BITS_PER_LONG == 64 */
+#else
+
+/* high 32 bits of the object id are kept in the reiser4 part of inode */
+#define REISER4_INO_IS_OID (0)
+typedef __u32 oid_hi_t;
+
+/* BITS_PER_LONG == 64 */
+#endif
+
+struct reiser4_inode {
+	/* spin lock protecting fields of this structure. */
+	reiser4_spin_data guard;
+	/* object plugins */
+	plugin_set *pset;
+	/* high 32 bits of object id */
+	oid_hi_t oid_hi;
+	/* seal for stat-data */
+	seal_t sd_seal;
+	/* locality id for this file */
+	oid_t locality_id;
+#if REISER4_LARGE_KEY
+	/* value read and written by [sg]et_inode_ordering(); only present
+	 * when large keys are configured */
+	__u64 ordering;
+#endif
+	/* coord of stat-data in sealed node */
+	coord_t sd_coord;
+	/* bit-mask of stat-data extensions used by this file */
+	scint_t extmask;
+	/* number of unformatted nodes, dirtied from mmap and eflushed from
+	 * this object. */
+	int eflushed_anon;
+	/* bitmask of non-default plugins for this inode */
+	__u16 plugin_mask;
+	/* cluster parameter for crypto and compression */
+	__u8 cluster_shift;
+	/* secret key parameter for crypto */
+	crypto_stat_t *crypt;
+	/* list of pages dirtied through mmap */
+	struct list_head moved_pages;
+	union {
+		/* list head returned by get_readdir_list() */
+		readdir_list_head readdir_list;
+		struct list_head mmaped;
+	} lists;
+	/* per-inode flags. Filled by values of reiser4_file_plugin_flags */
+	unsigned long flags;
+	union {
+		/* fields specific to unix_file plugin */
+		unix_file_info_t unix_file_info;
+		/* fields specific to cryptcompress plugin */
+		cryptcompress_info_t cryptcompress_info;
+		/* fields specific to pseudo file plugin */
+		pseudo_info_t pseudo_info;
+	} file_plugin_data;
+	union {
+		acl_perm_info_t acl_perm_info;
+	} perm_plugin_data;
+
+#if defined(XATTR)
+	/* list of object specific xattr namespaces */
+	xattr_list_head xattr_namespaces;
+#endif
+
+	/* list of unformatted jnodes eflushed from this object */
+	struct list_head eflushed_jnodes;
+
+	jnode inode_jnode; /* this is to capture inode */
+
+	/* currently operations on this tree are protected by tree's spin lock */
+	struct radix_tree_root jnode_tree;
+	/* block number of virtual root for this object. See comment above
+	 * fs/reiser4/search.c:handle_vroot() */
+	reiser4_block_nr vroot;
+#if REISER4_DEBUG
+	/* number of jnodes in jnode tree */
+	int jnodes;
+	/* number of unformatted nodes eflushed from this object */
+	int eflushed;
+#endif
+};
+
+/* NOTE(review): not used within this header; presumably additional bits for
+ * inode->i_state -- confirm against the users of these constants */
+#define I_EFLUSH (256)
+#define I_JNODES (512)
+
+
+/* container holding both the reiser4 private part and the VFS inode.
+   reiser4_inode_data() and inode_by_reiser4_inode() convert between the two
+   members with container_of(). */
+typedef struct reiser4_inode_object {
+	/* private part */
+	reiser4_inode p;
+	/* generic fields not specific to reiser4, but used by VFS */
+	struct inode vfs_inode;
+} reiser4_inode_object;
+
+/* return pointer to the reiser4 specific portion of @inode */
+static inline reiser4_inode *
+reiser4_inode_data(const struct inode * inode /* inode queried */)
+{
+	assert("nikita-254", inode != NULL);
+	return &container_of(inode, reiser4_inode_object, vfs_inode)->p;
+}
+
+/* inverse of reiser4_inode_data(): return the VFS inode embedded in the same
+   reiser4_inode_object as @r4_inode */
+static inline struct inode *
+inode_by_reiser4_inode(const reiser4_inode *r4_inode /* inode queried */)
+{
+	return &container_of(r4_inode, reiser4_inode_object, p)->vfs_inode;
+}
+
+/*
+ * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
+ * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
+ * bits.
+ *
+ * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
+ * of inode, otherwise whole oid is stored in i_ino.
+ *
+ * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
+ */
+
+/* width of ino_t in bits: position of ->oid_hi within the full oid */
+#define OID_HI_SHIFT (sizeof(ino_t) * 8)
+
+#if REISER4_INO_IS_OID
+
+/* ->i_ino is the object id */
+static inline oid_t
+get_inode_oid(const struct inode *inode)
+{
+	return inode->i_ino;
+}
+
+static inline void
+set_inode_oid(struct inode *inode, oid_t oid)
+{
+	inode->i_ino = oid;
+}
+
+/* REISER4_INO_IS_OID */
+#else
+
+/* combine high part kept in the reiser4 inode with the low part in ->i_ino */
+static inline oid_t
+get_inode_oid(const struct inode *inode)
+{
+	return
+		((__u64)reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
+		inode->i_ino;
+}
+
+/* split @oid between ->i_ino (low part) and ->oid_hi (high part); the final
+ * assertion checks that get_inode_oid() reads the same value back */
+static inline void
+set_inode_oid(struct inode *inode, oid_t oid)
+{
+	assert("nikita-2519", inode != NULL);
+	inode->i_ino = (ino_t)(oid);
+	reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
+	assert("nikita-2521", get_inode_oid(inode) == (oid));
+}
+
+/* REISER4_INO_IS_OID */
+#endif
+
+/* return locality id stored in the reiser4 part of @inode */
+static inline oid_t
+get_inode_locality(const struct inode *inode)
+{
+	return reiser4_inode_data(inode)->locality_id;
+}
+
+#if REISER4_LARGE_KEY
+/* return ->ordering stored in the reiser4 part of @inode */
+static inline __u64 get_inode_ordering(const struct inode *inode)
+{
+	return reiser4_inode_data(inode)->ordering;
+}
+
+/* store @ordering in the reiser4 part of @inode. @inode is const-qualified,
+ * but the reiser4 part returned by reiser4_inode_data() is not, so the store
+ * is possible. */
+static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
+{
+	reiser4_inode_data(inode)->ordering = ordering;
+}
+
+#else
+
+/* without large keys there is no ->ordering field: reads yield 0, writes are
+ * no-ops */
+#define get_inode_ordering(inode) (0)
+#define set_inode_ordering(inode, val) noop
+
+#endif
+
+/*
+ * each reiser4 inode maintains a list of pages dirtied through mmap. This is
+ * needed, because we need an effective way to find all such pages and capture
+ * them. This function returns a head of this list.
+ */
+static inline struct list_head *
+get_moved_pages(struct address_space *mapping)
+{
+	return &reiser4_inode_data(mapping->host)->moved_pages;
+}
+
+/* return inode in which @uf_info is embedded */
+static inline struct inode *
+unix_file_info_to_inode(const unix_file_info_t *uf_info)
+{
+	return &container_of(uf_info, reiser4_inode_object,
+			     p.file_plugin_data.unix_file_info)->vfs_inode;
+}
+
+/* ordering predicate for inode spin lock: only jnode lock can be held
+ *
+ * NOTE(review): the predicate below requires spin_locked_jnode == 0, which
+ * contradicts "only jnode lock can be held" -- confirm which one is intended.
+ *
+ * The whole expansion is parenthesized so that the macro can be combined with
+ * other operators (e.g. negated) at the point of use. */
+#define spin_ordering_pred_inode_object(inode)				\
+	(( lock_counters() -> rw_locked_dk == 0 ) &&			\
+	 ( lock_counters() -> rw_locked_tree == 0 ) &&			\
+	 ( lock_counters() -> spin_locked_txnh == 0 ) &&		\
+	 ( lock_counters() -> rw_locked_zlock == 0 ) &&			\
+	 ( lock_counters() -> spin_locked_jnode == 0 ) &&		\
+	 ( lock_counters() -> spin_locked_atom == 0 ) &&		\
+	 ( lock_counters() -> spin_locked_ktxnmgrd == 0 ) &&		\
+	 ( lock_counters() -> spin_locked_txnmgr == 0 ))
+
+SPIN_LOCK_FUNCTIONS(inode_object, reiser4_inode, guard);
+
+extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
+extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
+
+extern reiser4_tree *tree_by_inode(const struct inode *inode);
+
+/* inode_invariant() is a real function only in debugging builds */
+#if REISER4_DEBUG
+extern void inode_invariant(const struct inode *inode);
+#else
+#define inode_invariant(inode) noop
+#endif
+
+/* lock reiser4 part of @inode through LOCK_INODE(), then run
+ * inode_invariant() with the lock held */
+#define spin_lock_inode(inode)			\
+({						\
+	LOCK_INODE(reiser4_inode_data(inode));	\
+	inode_invariant(inode);			\
+})
+
+/* run inode_invariant() while the lock is still held, then release it
+ * through UNLOCK_INODE() */
+#define spin_unlock_inode(inode)			\
+({							\
+	inode_invariant(inode);				\
+	UNLOCK_INODE(reiser4_inode_data(inode));	\
+})
+
+extern znode *inode_get_vroot(struct inode *inode);
+extern void inode_set_vroot(struct inode *inode, znode *vroot);
+extern void inode_clean_vroot(struct inode *inode);
+
+extern int reiser4_max_filename_len(const struct inode *inode);
+extern int max_hash_collisions(const struct inode *dir);
+extern void reiser4_unlock_inode(struct inode *inode);
+extern int is_reiser4_inode(const struct inode *inode);
+extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
+extern struct inode *reiser4_iget(struct super_block *super, const reiser4_key * key, int silent);
+extern void reiser4_iget_complete (struct inode * inode);
+extern int reiser4_inode_find_actor(struct inode *inode, void *opaque);
+extern int get_reiser4_inode_by_key (struct inode **, const reiser4_key *);
+
+
+extern void inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
+extern void inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
+extern int inode_get_flag(const struct inode *inode, reiser4_file_plugin_flags f);
+
+/* has inode been initialized? Returns the state of REISER4_LOADED flag. */
+static inline int
+is_inode_loaded(const struct inode *inode /* inode queried */ )
+{
+	assert("nikita-1120", inode != NULL);
+	return inode_get_flag(inode, REISER4_LOADED);
+}
+
+extern file_plugin *inode_file_plugin(const struct inode *inode);
+extern dir_plugin *inode_dir_plugin(const struct inode *inode);
+extern perm_plugin *inode_perm_plugin(const struct inode *inode);
+extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
+extern hash_plugin *inode_hash_plugin(const struct inode *inode);
+extern crypto_plugin *inode_crypto_plugin(const struct inode *inode);
+extern digest_plugin *inode_digest_plugin(const struct inode *inode);
+extern compression_plugin *inode_compression_plugin(const struct inode *inode);
+extern item_plugin *inode_sd_plugin(const struct inode *inode);
+extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
+
+extern void inode_set_plugin(struct inode *inode, reiser4_plugin * plug);
+extern void reiser4_make_bad_inode(struct inode *inode);
+
+extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
+extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
+
+/*
+ * update field @field in inode @i to contain value @value.
+ */
+#define INODE_SET_FIELD(i, field, value)			\
+({								\
+	struct inode *__i;					\
+	typeof(value) __v;					\
+								\
+	__i = (i);						\
+	__v = (value);						\
+	inode_check_scale(__i, __i->field, __v);		\
+	__i->field = __v;					\
+})
+
+/*
+ * increment field @field in inode @i. As in INODE_SET_FIELD(), old and new
+ * values are passed to inode_check_scale() before the field is modified, and
+ * @i is evaluated only once.
+ */
+#define INODE_INC_FIELD(i, field)				\
+({								\
+	struct inode *__i;					\
+								\
+	__i = (i);						\
+	inode_check_scale(__i, __i->field, __i->field + 1);	\
+	++ __i->field;						\
+})
+
+/*
+ * decrement field @field in inode @i; counterpart of INODE_INC_FIELD().
+ */
+#define INODE_DEC_FIELD(i, field)				\
+({								\
+	struct inode *__i;					\
+								\
+	__i = (i);						\
+	inode_check_scale(__i, __i->field, __i->field - 1);	\
+	-- __i->field;						\
+})
+
+/* See comment before readdir_common() for description. Returns the
+ * ->lists.readdir_list member of the reiser4 part of @inode. */
+static inline readdir_list_head *
+get_readdir_list(const struct inode *inode)
+{
+	return &reiser4_inode_data(inode)->lists.readdir_list;
+}
+
+extern void init_inode_ordering(struct inode *inode,
+				reiser4_object_create_data *crd, int create);
+
+/* print_inode() exists only when debugging output is compiled in */
+#if REISER4_DEBUG_OUTPUT
+extern void print_inode(const char *prefix, const struct inode *i);
+#else
+#define print_inode(p, i) noop
+#endif
+
+/* __REISER4_INODE_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/inode_ops.c linux-2.6.4-ck1/fs/reiser4/inode_ops.c
--- linux-2.6.4/fs/reiser4/inode_ops.c	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/inode_ops.c	2004-03-11 22:45:15.246516894 +1100
@@ -0,0 +1,697 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Interface to VFS. Reiser4 inode_operations are defined here.
*/ + +#include "forward.h" +#include "debug.h" +#include "dformat.h" +#include "coord.h" +#include "plugin/item/item.h" +#include "plugin/file/file.h" +#include "plugin/security/perm.h" +#include "plugin/disk_format/disk_format.h" +#include "plugin/plugin.h" +#include "plugin/plugin_set.h" +#include "plugin/plugin_hash.h" +#include "plugin/object.h" +#include "txnmgr.h" +#include "jnode.h" +#include "znode.h" +#include "block_alloc.h" +#include "tree.h" +#include "trace.h" +#include "vfs_ops.h" +#include "inode.h" +#include "page_cache.h" +#include "ktxnmgrd.h" +#include "super.h" +#include "reiser4.h" +#include "kattr.h" +#include "entd.h" +#include "emergency_flush.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* inode operations */ + +static int reiser4_create(struct inode *, struct dentry *, int, + struct nameidata *); +static struct dentry *reiser4_lookup(struct inode *, struct dentry *, + struct nameidata *); +static int reiser4_link(struct dentry *, struct inode *, struct dentry *); +static int reiser4_unlink(struct inode *, struct dentry *); +static int reiser4_rmdir(struct inode *, struct dentry *); +static int reiser4_symlink(struct inode *, struct dentry *, const char *); +static int reiser4_mkdir(struct inode *, struct dentry *, int); +static int reiser4_mknod(struct inode *, struct dentry *, int, dev_t); +static int reiser4_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); +static int reiser4_readlink(struct dentry *, char *, int); +static int reiser4_follow_link(struct dentry *, struct nameidata *); +static void reiser4_truncate(struct inode *); +static int reiser4_permission(struct inode *, int, struct nameidata *); +static int reiser4_setattr(struct dentry *, struct iattr *); +static int reiser4_getattr(struct vfsmount *mnt, struct dentry *, struct kstat *); + +#if 0 +static int 
reiser4_setxattr(struct dentry *, const char *, void *, size_t, int);
+static ssize_t reiser4_getxattr(struct dentry *, const char *, void *, size_t);
+static ssize_t reiser4_listxattr(struct dentry *, char *, size_t);
+static int reiser4_removexattr(struct dentry *, const char *);
+#endif
+
+/* common back end of ->create(), ->mkdir(), ->symlink() and ->mknod(): all
+ * of them fill a reiser4_object_create_data and pass it here. Defined later
+ * in this file. */
+static int invoke_create_method(struct inode *parent,
+				struct dentry *dentry,
+				reiser4_object_create_data * data);
+
+/* ->create() VFS method in reiser4 inode_operations
+
+   Creates a regular file (S_IFREG) handled by UNIX_FILE_PLUGIN_ID.
+   @nameidata is not used. */
+static int
+reiser4_create(struct inode *parent	/* inode of parent
+					 * directory */,
+	       struct dentry *dentry	/* dentry of new object to
+					 * create */,
+	       int mode /* new object mode */,
+	       struct nameidata *nameidata)
+{
+	reiser4_object_create_data data;
+	xmemset(&data, 0, sizeof data);
+
+	reiser4_stat_inc_at(parent->i_sb, vfs_calls.create);
+
+	data.mode = S_IFREG | mode;
+	data.id = UNIX_FILE_PLUGIN_ID;
+	return invoke_create_method(parent, dentry, &data);
+}
+
+/* ->mkdir() VFS method in reiser4 inode_operations
+
+   Creates a directory (S_IFDIR) handled by DIRECTORY_FILE_PLUGIN_ID. */
+static int
+reiser4_mkdir(struct inode *parent	/* inode of parent
+					 * directory */ ,
+	      struct dentry *dentry	/* dentry of new object to
+					 * create */ ,
+	      int mode /* new object's mode */ )
+{
+	reiser4_object_create_data data;
+
+	/* as in reiser4_create(): fields that are not set explicitly below
+	 * must not reach the plugin as stack garbage */
+	xmemset(&data, 0, sizeof data);
+
+	reiser4_stat_inc_at(parent->i_sb, vfs_calls.mkdir);
+
+	data.mode = S_IFDIR | mode;
+	data.id = DIRECTORY_FILE_PLUGIN_ID;
+	return invoke_create_method(parent, dentry, &data);
+}
+
+/* ->symlink() VFS method in reiser4 inode_operations
+
+   Creates a symlink (S_IFLNK | S_IRWXUGO) handled by SYMLINK_FILE_PLUGIN_ID;
+   @linkname is passed to the plugin in data.name. */
+static int
+reiser4_symlink(struct inode *parent	/* inode of parent
+					 * directory */ ,
+		struct dentry *dentry	/* dentry of new object to
+					 * create */ ,
+		const char *linkname	/* pathname to put into
+					 * symlink */ )
+{
+	reiser4_object_create_data data;
+
+	/* as in reiser4_create(): fields that are not set explicitly below
+	 * must not reach the plugin as stack garbage */
+	xmemset(&data, 0, sizeof data);
+
+	reiser4_stat_inc_at(parent->i_sb, vfs_calls.symlink);
+
+	data.name = linkname;
+	data.id = SYMLINK_FILE_PLUGIN_ID;
+	data.mode = S_IFLNK | S_IRWXUGO;
+	return invoke_create_method(parent, dentry, &data);
+}
+
+/* ->mknod() VFS method in reiser4 inode_operations */
+static int
+reiser4_mknod(struct inode *parent /* inode of parent directory */ ,
+	      struct dentry *dentry	/* dentry of new object to
+					 * create */ ,
+	      int mode /* new object's mode */ ,
+	      dev_t rdev /* minor and major of new device node */ )
+{
+	reiser4_object_create_data data;
+
+	/* as in reiser4_create(): fields that are not set explicitly below
+	 * must not reach the plugin as stack garbage */
+	xmemset(&data, 0, sizeof data);
+
+	reiser4_stat_inc_at(parent->i_sb, vfs_calls.mknod);
+
+	/* @mode is passed through unchanged: no file type bits are added
+	 * here */
+	data.mode = mode;
+	data.rdev = rdev;
+	data.id = SPECIAL_FILE_PLUGIN_ID;
+	return invoke_create_method(parent, dentry, &data);
+}
+
+/* ->rename() inode operation
+
+   Checks permissions through perm_chk() and delegates to ->rename() method
+   of the directory plugin of @old_dir; returns -EPERM if the plugin has no
+   such method. */
+static int
+reiser4_rename(struct inode *old_dir, struct dentry *old, struct inode *new_dir, struct dentry *new)
+{
+	int result;
+	reiser4_context ctx;
+
+	assert("nikita-2314", old_dir != NULL);
+	assert("nikita-2315", old != NULL);
+	assert("nikita-2316", new_dir != NULL);
+	assert("nikita-2317", new != NULL);
+
+	init_context(&ctx, old_dir->i_sb);
+	reiser4_stat_inc(vfs_calls.rename);
+
+	result = perm_chk(old_dir, rename, old_dir, old, new_dir, new);
+	if (result == 0) {
+		dir_plugin *dplug;
+
+		dplug = inode_dir_plugin(old_dir);
+		assert("nikita-2271", dplug != NULL);
+		if (dplug->rename != NULL)
+			result = dplug->rename(old_dir, old, new_dir, new);
+		else
+			result = RETERR(-EPERM);
+	}
+	context_set_commit_async(&ctx);
+	reiser4_exit_context(&ctx);
+	return result;
+}
+
+/* reiser4_lookup() - entry point for ->lookup() method.
+
+   This is a wrapper for lookup_object which is a wrapper for the directory
+   plugin that does the lookup.
+
+   This is installed in ->lookup() in reiser4_inode_operations.
+*/
+static struct dentry *
+reiser4_lookup(struct inode *parent,	/* directory within which we are to
+					 * look for the name specified in
+					 * dentry */
+	       struct dentry *dentry,	/* this contains the name that is to
+					   be looked for on entry, and on exit
+					   contains a filled in dentry with a
+					   pointer to the inode (unless name
+					   not found) */
+	       struct nameidata *nameidata)
+{
+	dir_plugin *dplug;
+	int retval;
+	struct dentry *result;
+	reiser4_context ctx;
+	int (*lookup) (struct inode * parent_inode, struct dentry **dentry);
+
+	assert("nikita-403", parent != NULL);
+	assert("nikita-404", dentry != NULL);
+
+	init_context(&ctx, parent->i_sb);
+	reiser4_stat_inc(vfs_calls.lookup);
+
+	/* find @parent directory plugin and make sure that it has lookup
+	   method */
+	dplug = inode_dir_plugin(parent);
+	if (dplug != NULL && dplug->lookup != NULL)
+		/* if parent directory has directory plugin with ->lookup
+		 * method, use the latter to do lookup */
+		lookup = dplug->lookup;
+	else if (!reiser4_is_set(parent->i_sb, REISER4_NO_PSEUDO))
+		/* even if there is no ->lookup method, pseudo file lookup
+		 * should still be performed, but only unless we are in
+		 * "no-pseudo" mode */
+		lookup = lookup_pseudo_file;
+	else
+		lookup = NULL;
+	if (lookup != NULL) {
+		struct dentry *name;
+
+		/* @name is both input and output of the lookup method: it is
+		 * handed in pointing to @dentry and is inspected again on
+		 * return */
+		name = dentry;
+		/* call its lookup method */
+		retval = lookup(parent, &name);
+		if (retval == 0) {
+			if (name == NULL) {
+				/*
+				 * new object was looked up. Initialize it.
+				 */
+				struct inode *obj;
+				file_plugin *fplug;
+
+				obj = dentry->d_inode;
+				assert("nikita-2645", obj != NULL);
+				fplug = inode_file_plugin(obj);
+				retval = fplug->bind(obj, parent);
+			}
+		} else if (retval == -ENOENT) {
+			/* object not found: attach @dentry without an inode
+			 * and report success with NULL result */
+			d_add(dentry, NULL);
+			retval = 0;
+			name = NULL;
+		}
+
+		if (retval == 0)
+			/* success */
+			result = name;
+		else
+			result = ERR_PTR(retval);
+	} else
+		/* neither plugin lookup nor pseudo file lookup is
+		 * available */
+		result = ERR_PTR(-ENOTDIR);
+
+	/* prevent balance_dirty_pages() from being called: we don't want to
+	 * do this under directory i_sem. */
+	context_set_commit_async(&ctx);
+	reiser4_exit_context(&ctx);
+	return result;
+}
+
+/* ->readlink() method, installed in reiser4_symlink_inode_operations.
+
+   Target name is taken from inode->u.generic_ip. -EINVAL is returned if that
+   pointer is not set or REISER4_GENERIC_PTR_USED flag is not set on the
+   inode. No reiser4 context is created here. */
+static int
+reiser4_readlink(struct dentry *dentry, char *buf, int buflen)
+{
+	assert("vs-852", S_ISLNK(dentry->d_inode->i_mode));
+	reiser4_stat_inc_at(dentry->d_inode->i_sb, vfs_calls.readlink);
+	if (!dentry->d_inode->u.generic_ip || !inode_get_flag(dentry->d_inode, REISER4_GENERIC_PTR_USED))
+		return RETERR(-EINVAL);
+	return vfs_readlink(dentry, buf, buflen, dentry->d_inode->u.generic_ip);
+}
+
+/* ->follow_link() method, installed in reiser4_symlink_inode_operations.
+
+   Same checks as in reiser4_readlink(); the target name stored in
+   inode->u.generic_ip is passed to vfs_follow_link(). */
+static int
+reiser4_follow_link(struct dentry *dentry, struct nameidata *data)
+{
+	assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
+
+	reiser4_stat_inc_at(dentry->d_inode->i_sb, vfs_calls.follow_link);
+	if (!dentry->d_inode->u.generic_ip || !inode_get_flag(dentry->d_inode, REISER4_GENERIC_PTR_USED))
+		return RETERR(-EINVAL);
+	return vfs_follow_link(data, dentry->d_inode->u.generic_ip);
+}
+
+/* ->setattr() inode operation
+
+   Called from notify_change.
+
+   After permission check the work is delegated to ->setattr() method of the
+   file plugin, unless inode has REISER4_IMMUTABLE flag set, in which case
+   -E_REPEAT is returned. */
+static int
+reiser4_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode;
+	int result;
+	reiser4_context ctx;
+
+	assert("nikita-2269", attr != NULL);
+
+	inode = dentry->d_inode;
+	assert("vs-1108", inode != NULL);
+	init_context(&ctx, inode->i_sb);
+	reiser4_stat_inc(vfs_calls.setattr);
+	result = perm_chk(inode, setattr, dentry, attr);
+	if (result == 0) {
+		if (!inode_get_flag(inode, REISER4_IMMUTABLE)) {
+			file_plugin *fplug;
+
+			fplug = inode_file_plugin(inode);
+			/* NOTE(review): label "nikita-2271" is also used by
+			 * an assertion in reiser4_rename() */
+			assert("nikita-2271", fplug != NULL);
+			assert("nikita-2296", fplug->setattr != NULL);
+			result = fplug->setattr(inode, attr);
+		} else
+			result = RETERR(-E_REPEAT);
+	}
+	context_set_commit_async(&ctx);
+	reiser4_exit_context(&ctx);
+	return result;
+}
+
+/* ->getattr() inode operation called (indirectly) by sys_stat().
*/
+static int
+reiser4_getattr(struct vfsmount *mnt UNUSED_ARG, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode;
+	int result;
+	reiser4_context ctx;
+
+	inode = dentry->d_inode;
+	init_context(&ctx, inode->i_sb);
+	reiser4_stat_inc(vfs_calls.getattr);
+	/* NOTE(review): @mnt is marked UNUSED_ARG, but it is passed on both
+	 * to the permission check and to the plugin method below */
+	result = perm_chk(inode, getattr, mnt, dentry, stat);
+	if (result == 0) {
+		file_plugin *fplug;
+
+		fplug = inode_file_plugin(inode);
+		assert("nikita-2295", fplug != NULL);
+		assert("nikita-2297", fplug->getattr != NULL);
+		result = fplug->getattr(mnt, dentry, stat);
+	}
+	reiser4_exit_context(&ctx);
+	return result;
+}
+
+/* helper function: call object plugin to truncate file to @size
+
+   Returns result of ->truncate() method of the file plugin; a warning is
+   issued if it is not 0. */
+static int
+truncate_object(struct inode *inode /* object to truncate */ ,
+		loff_t size /* size to truncate object to */ )
+{
+	file_plugin *fplug;
+	int result;
+
+	assert("nikita-1026", inode != NULL);
+	assert("nikita-1027", is_reiser4_inode(inode));
+	assert("nikita-1028", inode->i_sb != NULL);
+
+	write_syscall_trace("%llu %lli", get_inode_oid(inode), size);
+
+	fplug = inode_file_plugin(inode);
+	assert("vs-142", fplug != NULL);
+
+	assert("nikita-2933", fplug->truncate != NULL);
+	result = fplug->truncate(inode, size);
+	if (result != 0)
+		/* object id is unsigned: print it with %llu, as the trace
+		 * call above does */
+		warning("nikita-1602", "Truncate error: %i for %llu",
+			result, get_inode_oid(inode));
+
+	write_syscall_trace("ex");
+	return result;
+}
+
+/* ->truncate() VFS method in reiser4 inode_operations
+
+   Truncates @inode to its current ->i_size. The method has no return value,
+   so the result of truncate_object() is dropped. */
+static void
+reiser4_truncate(struct inode *inode /* inode to truncate */ )
+{
+	reiser4_context ctx;
+
+	assert("umka-075", inode != NULL);
+
+	init_context(&ctx, inode->i_sb);
+	reiser4_stat_inc(vfs_calls.truncate);
+	/* ->i_ino is unsigned long (print_inode() prints it with %lu) */
+	ON_TRACE(TRACE_VFS_OPS, "TRUNCATE: i_ino %lu to size %lli\n", inode->i_ino, inode->i_size);
+
+	truncate_object(inode, inode->i_size);
+
+	/* for mysterious reasons ->truncate() VFS call doesn't return
+	   value */
+	(void)reiser4_exit_context(&ctx);
+}
+
+/* ->permission() method in reiser4_inode_operations.
*/
+static int
+reiser4_permission(struct inode *inode /* object */ ,
+		   int mask,	/* mode bits to check permissions
+				 * for */
+		   struct nameidata *nameidata)
+{
+	/* reiser4_context creation/destruction removed from here,
+	   because permission checks currently don't require this.
+
+	   Permission plugin have to create context itself if necessary. */
+	assert("nikita-1687", inode != NULL);
+
+	/* second argument of perm_chk() names the permission method to
+	 * call, which here happens to be spelled like the @mask parameter */
+	return perm_chk(inode, mask, inode, mask);
+}
+
+/* common part of both unlink and rmdir.
+
+   Delegates to ->unlink() method of the directory plugin of @parent;
+   returns -EPERM if the plugin has no such method. */
+static int
+unlink_file(struct inode *parent /* parent directory */ ,
+	    struct dentry *victim	/* name of object being
+					 * unlinked */ )
+{
+	int result;
+	dir_plugin *dplug;
+	reiser4_context ctx;
+
+	/* check arguments before they are dereferenced below, as other
+	 * entry points in this file do */
+	assert("nikita-1435", parent != NULL);
+	assert("nikita-1436", victim != NULL);
+
+	init_context(&ctx, parent->i_sb);
+	write_syscall_trace("%s", victim->d_name.name);
+
+	ON_TRACE(TRACE_DIR | TRACE_VFS_OPS, "unlink: %lli/%s\n",
+		 get_inode_oid(parent), victim->d_name.name);
+
+	dplug = inode_dir_plugin(parent);
+	assert("nikita-1429", dplug != NULL);
+	if (dplug->unlink != NULL)
+		result = dplug->unlink(parent, victim);
+	else
+		result = RETERR(-EPERM);
+	write_syscall_trace("ex");
+	/* @victim can be already removed from the disk by this time. Inode is
+	   then marked so that iput() wouldn't try to remove stat data. But
+	   inode itself is still there.
+	 */
+	/* we cannot release directory semaphore here, because name has
+	 * already been deleted, but dentry (@victim) still exists. */
+	/* prevent balance_dirty_pages() from being called: we don't want to
+	 * do this under directory i_sem. */
+	context_set_commit_async(&ctx);
+	reiser4_exit_context(&ctx);
+	return result;
+}
+
+/* ->unlink() VFS method in reiser4 inode_operations
+
+   remove link from @parent directory to @victim object: delegate work
+   to object plugin
+
+   Objects that have a directory plugin are refused with -EISDIR.
+*/
+/* Audited by: umka (2002.06.12) */
+static int
+reiser4_unlink(struct inode *parent /* parent directory */ ,
+	       struct dentry *victim	/* name of object being
+					 * unlinked */ )
+{
+	assert("nikita-2011", parent != NULL);
+	assert("nikita-2012", victim != NULL);
+	assert("nikita-2013", victim->d_inode != NULL);
+	reiser4_stat_inc_at(parent->i_sb,vfs_calls.unlink);
+	if (inode_dir_plugin(victim->d_inode) == NULL)
+		return unlink_file(parent, victim);
+	else
+		return RETERR(-EISDIR);
+}
+
+/* ->rmdir() VFS method in reiser4 inode_operations
+
+   The same as unlink, but only for directories.
+
+   Objects that have no directory plugin are refused with -ENOTDIR.
+
+*/
+/* Audited by: umka (2002.06.12) */
+static int
+reiser4_rmdir(struct inode *parent /* parent directory */ ,
+	      struct dentry *victim	/* name of directory being
+					 * unlinked */ )
+{
+	assert("nikita-2014", parent != NULL);
+	assert("nikita-2015", victim != NULL);
+	assert("nikita-2016", victim->d_inode != NULL);
+
+	reiser4_stat_inc_at(parent->i_sb, vfs_calls.rmdir);
+	if (inode_dir_plugin(victim->d_inode) != NULL)
+		/* there is no difference between unlink and rmdir for
+		   reiser4 */
+		return unlink_file(parent, victim);
+	else
+		return RETERR(-ENOTDIR);
+}
+
+/* ->link() VFS method in reiser4 inode_operations
+
+   entry point for ->link() method.
+
+   This is installed as ->link inode operation for reiser4
+   inodes.
Delegates all work to object plugin
+*/
+/* Audited by: umka (2002.06.12) */
+static int
+reiser4_link(struct dentry *existing	/* dentry of existing
+					 * object */ ,
+	     struct inode *parent /* parent directory */ ,
+	     struct dentry *where /* new name for @existing */ )
+{
+	int result;
+	dir_plugin *dplug;
+	reiser4_context ctx;
+
+	assert("umka-080", existing != NULL);
+	assert("nikita-1031", parent != NULL);
+
+	init_context(&ctx, parent->i_sb);
+	context_set_commit_async(&ctx);
+	reiser4_stat_inc(vfs_calls.link);
+
+	dplug = inode_dir_plugin(parent);
+	assert("nikita-1430", dplug != NULL);
+	if (dplug->link != NULL) {
+		result = dplug->link(parent, existing, where);
+		if (result == 0)
+			d_instantiate(where, existing->d_inode);
+	} else {
+		result = RETERR(-EPERM);
+	}
+	/* both ->i_sem semaphores are dropped around reiser4_exit_context()
+	 * and re-taken in the opposite order before returning.
+	 * NOTE(review): presumably so that work done on context exit does
+	 * not run under the semaphores -- confirm */
+	up(&existing->d_inode->i_sem);
+	up(&parent->i_sem);
+	reiser4_exit_context(&ctx);
+	down(&parent->i_sem);
+	down(&existing->d_inode->i_sem);
+	return result;
+}
+
+/* call ->create() directory plugin method.
+
+   Fills ->parent and ->dentry of @data and calls ->create_child() of the
+   directory plugin of @parent. On success @dentry is instantiated with the
+   new inode; on failure a partially created inode, if any, is marked bad and
+   released. Returns -ENOTDIR if @parent has no directory plugin and -EPERM
+   if the plugin has no ->create_child() method. */
+static int
+invoke_create_method(struct inode *parent /* parent directory */ ,
+		     struct dentry *dentry	/* dentry of new
+						 * object */ ,
+		     reiser4_object_create_data * data	/* information
+							 * necessary
+							 * to create
+							 * new
+							 * object */ )
+{
+	int result;
+	dir_plugin *dplug;
+	reiser4_context ctx;
+
+	/* check arguments before they are dereferenced below, as other
+	 * entry points in this file do */
+	assert("nikita-426", parent != NULL);
+	assert("nikita-427", dentry != NULL);
+	assert("nikita-428", data != NULL);
+
+	init_context(&ctx, parent->i_sb);
+	context_set_commit_async(&ctx);
+	write_syscall_trace("%s %o", dentry->d_name.name, data->mode);
+
+	dplug = inode_dir_plugin(parent);
+	if (dplug == NULL)
+		result = RETERR(-ENOTDIR);
+	else if (dplug->create_child != NULL) {
+		struct inode *child;
+
+		child = NULL;
+
+		data->parent = parent;
+		data->dentry = dentry;
+
+		result = dplug->create_child(data, &child);
+		if (unlikely(result != 0)) {
+			if (child != NULL) {
+				/*
+				 * what we actually want to check in the
+				 * assertion below is that @child only
+				 * contains items that iput()->... is going to
+				 * remove (usually stat-data). Obvious check
+				 * for child->i_size == 0 doesn't work for
+				 * symlinks.
+				 */
+				assert("nikita-3140", S_ISLNK(child->i_mode) ||
+				       child->i_size == 0);
+				reiser4_make_bad_inode(child);
+				iput(child);
+			}
+		} else {
+			d_instantiate(dentry, child);
+			ON_TRACE(TRACE_VFS_OPS, "create: %s (%o) %llu\n",
+				 dentry->d_name.name,
+				 data->mode, get_inode_oid(child));
+		}
+	} else
+		result = RETERR(-EPERM);
+
+	write_syscall_trace("ex");
+
+	reiser4_exit_context(&ctx);
+	return result;
+}
+
+#if defined(XATTR)
+/* helper for xattr methods below: inside a reiser4 context created for the
+ * super block of @_inode, call @_method of the file plugin of @_inode with
+ * the remaining arguments, or yield @_errcode if the plugin has no such
+ * method. Evaluates to the result. */
+#define FP_OP(_inode, _method, _errcode, ...)			\
+({								\
+	reiser4_context ctx;					\
+	file_plugin *fplug;					\
+	int result;						\
+								\
+	init_context(&ctx, (_inode)->i_sb);			\
+	context_set_commit_async(&ctx);				\
+								\
+	fplug = inode_file_plugin(_inode);			\
+	if (fplug->_method != NULL)				\
+		result = fplug->_method(__VA_ARGS__);		\
+	else							\
+		result = RETERR(_errcode);			\
+	reiser4_exit_context(&ctx);				\
+	result;							\
+})
+
+/* ->setxattr() method: delegates to ->xattr.set() of the file plugin */
+static int
+reiser4_setxattr(struct dentry *dentry, const char *name,
+		 const void *value, size_t size, int flags)
+{
+	return FP_OP(dentry->d_inode, xattr.set, -EOPNOTSUPP,
+		     dentry, name, value, size, flags);
+}
+
+/* ->getxattr() method: delegates to ->xattr.get() of the file plugin */
+static ssize_t
+reiser4_getxattr(struct dentry *dentry, const char *name,
+		 void *buffer, size_t size)
+{
+	return FP_OP(dentry->d_inode, xattr.get, -EOPNOTSUPP,
+		     dentry, name, buffer, size);
+}
+
+/* ->listxattr() method: delegates to ->xattr.list() of the file plugin */
+static ssize_t
+reiser4_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	return FP_OP(dentry->d_inode, xattr.list, -EOPNOTSUPP,
+		     dentry, buffer, size);
+}
+
+/* ->removexattr() method: delegates to ->xattr.remove() of the file plugin */
+static int
+reiser4_removexattr(struct dentry *dentry, const char *name)
+{
+	return FP_OP(dentry->d_inode, xattr.remove, -EOPNOTSUPP,
+		     dentry, name);
+}
+
+#undef FP_OP
+#endif
+
+/* inode operations table with the full set of methods defined in this file;
+ * ->readlink and ->follow_link are left unset here and are provided by
+ * reiser4_symlink_inode_operations instead */
+struct inode_operations reiser4_inode_operations = {
+	.create = reiser4_create,	/* d */
+	.lookup = reiser4_lookup,	/* d */
+	.link = reiser4_link,	/* d */
+	.unlink = reiser4_unlink,	/* d */
+	.symlink = reiser4_symlink,	/* d */
+	.mkdir = reiser4_mkdir,	/* d */
+	.rmdir = reiser4_rmdir,	/* d */
+	.mknod = reiser4_mknod,	/* d */
+	.rename = reiser4_rename,	/* d */
+	.readlink = NULL,
+	.follow_link = NULL,
+	.truncate = reiser4_truncate,	/* d */
+	.permission = reiser4_permission,	/* d */
+	.setattr = reiser4_setattr,	/* d */
+	.getattr = reiser4_getattr,	/* d */
+#if defined(XATTR)
+	.setxattr = reiser4_setxattr,	/* d */
+	.getxattr = reiser4_getxattr,	/* d */
+	.listxattr = reiser4_listxattr,	/* d */
+	.removexattr = reiser4_removexattr	/* d */
+#endif
+};
+
+/* inode operations for symlinks: attributes plus link resolution */
+struct inode_operations reiser4_symlink_inode_operations = {
+	.setattr = reiser4_setattr,	/* d */
+	.getattr = reiser4_getattr,	/* d */
+	.readlink = reiser4_readlink,
+	.follow_link = reiser4_follow_link
+};
+
+/* inode operations with attribute methods only */
+struct inode_operations reiser4_special_inode_operations = {
+	.setattr = reiser4_setattr,	/* d */
+	.getattr = reiser4_getattr	/* d */
+};
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/interpolate.c linux-2.6.4-ck1/fs/reiser4/interpolate.c
--- linux-2.6.4/fs/reiser4/interpolate.c	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/interpolate.c	2004-03-11 22:45:15.246516894 +1100
@@ -0,0 +1,20 @@
+/* We will use @ as the symbol for dereferencing, we won't use * because
+we want to reserve it for use as a wildcard someday.
+
+Inheriting stat data from source_filename can be done as:
+
+target_filename/mode<=@source_filename/mode
+
+File body inheritance is accomplished by extending symlink functionality:
+
+file_body_inheritance example:
+
+target_filename/symlink<=`@freshly_interpolate_this_filename_whenever_resolving_target_filename+`here is some text stored directly in the symlink''+@interpolate_this_filename_at_symlink_creation_time+`@freshly_interpolate_this_filename2_whenever_resolving_target_filename+"this is some more text that is directly embedded in the symlink"'
+
+Mr.
Demidov, flesh this out in detail, being careful to worry about +how to write to interpolated files. I think you need to interpret +strings that are between interpolations as the delimiters of those +interpolations, and changing those strings can then only be done by +writing to filename/sym. + +*/ diff -Naurp linux-2.6.4/fs/reiser4/ioctl.h linux-2.6.4-ck1/fs/reiser4/ioctl.h --- linux-2.6.4/fs/reiser4/ioctl.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/ioctl.h 2004-03-11 22:45:15.247516738 +1100 @@ -0,0 +1,41 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +#if !defined( __REISER4_IOCTL_H__ ) +#define __REISER4_IOCTL_H__ + +#include + +/* + * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into + * extents and fix in this state. This is used by applications that rely on + * + * . files being block aligned, and + * + * . files never migrating on disk + * + * for example, boot loaders (LILO) need this. + * + * This ioctl should be used as + * + * result = ioctl(fd, REISER4_IOC_UNPACK); + * + * File behind fd descriptor will be converted to the extents (if necessary), + * and its stat-data will be updated so that it will never be converted back + * into tails again. + */ +#define REISER4_IOC_UNPACK _IOW(0xCD,1,long) + +/* __REISER4_IOCTL_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/jnode.c linux-2.6.4-ck1/fs/reiser4/jnode.c --- linux-2.6.4/fs/reiser4/jnode.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/jnode.c 2004-03-11 22:45:15.250516272 +1100 @@ -0,0 +1,1965 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ +/* Jnode manipulation functions. */ +/* Jnode is entity used to track blocks with data and meta-data in reiser4. 
+
+   In particular, jnodes are used to track transactional information
+   associated with each block. Each znode contains jnode as ->zjnode field.
+
+   Jnode stands for either Josh or Journal node.
+*/
+
+/*
+ * Taxonomy.
+ *
+ *     Jnode represents block containing data or meta-data. There are jnodes
+ *     for:
+ *
+ *         unformatted blocks (jnodes proper). There are plans, however, to
+ *         have a handle per extent unit rather than per each unformatted
+ *         block, because there are so many of them.
+ *
+ *         For bitmaps. Each bitmap is actually represented by two jnodes--one
+ *         for working and another for "commit" data, together forming bnode.
+ *
+ *         For io-heads. These are used by log writer.
+ *
+ *         For formatted nodes (znode). See comment at the top of znode.c for
+ *         details specific to the formatted nodes (znodes).
+ *
+ * Node data.
+ *
+ *     Jnode provides access to the data of node it represents. Data are
+ *     stored in a page. Page is kept in a page cache. This means that jnodes
+ *     are highly interconnected with page cache and VM internals.
+ *
+ *     jnode has a pointer to page (->pg) containing its data. Pointer to data
+ *     themselves is cached in ->data field to avoid frequent calls to
+ *     page_address().
+ *
+ *     jnode and page are attached to each other by jnode_attach_page(). This
+ *     function places pointer to jnode in page->private, sets PG_private flag
+ *     and increments page counter.
+ *
+ *     Opposite operation is performed by page_clear_jnode().
+ *
+ *     jnode->pg is protected by jnode spin lock, and page->private is
+ *     protected by page lock. See comment at the top of page_cache.c for
+ *     more.
+ *
+ *     page can be detached from jnode for two reasons:
+ *
+ *         . jnode is removed from a tree (file is truncated, or formatted
+ *         node is removed by balancing).
+ *
+ *         . during memory pressure, VM calls ->releasepage() method
+ *         (reiser4_releasepage()) to evict page from memory.
+ *
+ *    (there, of course, is also umount, but this is a special case we are not
+ *    concerned with here).
+ *
+ *     To protect jnode page from eviction, one calls jload() function that
+ *     "pins" page in memory (loading it if necessary), increments
+ *     jnode->d_count, and kmap()s page. Page is unpinned through call to
+ *     jrelse().
+ *
+ * Jnode life cycle.
+ *
+ *     jnode is created, placed in hash table, and, optionally, in per-inode
+ *     radix tree. Page can be attached to jnode, pinned, released, etc.
+ *
+ *     When jnode is captured into atom its reference counter is
+ *     increased. While being part of an atom, jnode can be "early
+ *     flushed". This means that as part of flush procedure, jnode is placed
+ *     into "relocate set", and its page is submitted to the disk. After io
+ *     completes, page can be detached, then loaded again, re-dirtied, etc.
+ *
+ *     A thread acquires a reference to a jnode by calling jref() and releases
+ *     it by jput(). When last reference is removed, jnode is still retained in
+ *     memory (cached) if it has page attached, _unless_ it is scheduled for
+ *     destruction (has JNODE_HEARD_BANSHEE bit set).
+ *
+ *     Tree read-write lock was used as "existential" lock for jnodes. That is,
+ *     jnode->x_count could be changed from 0 to 1 only under tree write lock,
+ *     that is, tree lock protected unreferenced jnodes stored in the hash
+ *     table, from recycling.
+ *
+ *     This resulted in high contention on tree lock, because jref()/jput() is
+ *     frequent operation. To ameliorate this problem, RCU is used: when jput()
+ *     is just about to release last reference on jnode it sets JNODE_RIP bit
+ *     on it, and then proceeds with jnode destruction (removing jnode from hash
+ *     table, cbk_cache, detaching page, etc.). All places that change jnode
+ *     reference counter from 0 to 1 (jlookup(), zlook(), and
+ *     cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
+ *     jnode_rip_check() function), and pretend that nothing was found in hash
+ *     table if bit is set.
+ * + * jput defers actual return of jnode into slab cache to some later time + * (by call_rcu()), this guarantees that other threads can safely continue + * working with JNODE_RIP-ped jnode. + * + */ + +#include "reiser4.h" +#include "debug.h" +#include "dformat.h" +#include "plugin/plugin_header.h" +#include "plugin/plugin.h" +#include "plugin/plugin_hash.h" +#include "txnmgr.h" +#include "jnode.h" +#include "znode.h" +#include "tree.h" +#include "tree_walk.h" +#include "super.h" +#include "inode.h" +#include "page_cache.h" +#include "prof.h" + +#include /* UML needs this for PAGE_OFFSET */ +#include +#include +#include +#include /* for vmalloc(), vfree() */ +#include +#include /* for struct address_space */ +#include /* for inode_lock */ + +static kmem_cache_t *_jnode_slab = NULL; + +static void jnode_set_type(jnode * node, jnode_type type); + + +/* true if valid page is attached to jnode */ +static inline int jnode_is_parsed (jnode * node) +{ + return JF_ISSET(node, JNODE_PARSED); +} + +/* hash table support */ + +/* compare two jnode keys for equality. Used by hash-table macros */ +static inline int +jnode_key_eq(const jnode_key_t * k1, const jnode_key_t * k2) +{ + assert("nikita-2350", k1 != NULL); + assert("nikita-2351", k2 != NULL); + + return (k1->index == k2->index && k1->objectid == k2->objectid); +} + +/* Hash jnode by its key (inode plus offset). Used by hash-table macros */ +static inline __u32 +jnode_key_hashfn(j_hash_table *table, const jnode_key_t * key) +{ + assert("nikita-2352", key != NULL); + assert("nikita-3346", IS_POW(table->_buckets)); + + /* yes, this is remarkable simply (where not stupid) hash function. 
*/ + return (key->objectid + key->index) & (table->_buckets - 1); +} + +/* The hash table definition */ +#define KMALLOC(size) vmalloc(size) +#define KFREE(ptr, size) vfree(ptr) +TYPE_SAFE_HASH_DEFINE(j, jnode, jnode_key_t, key.j, link.j, jnode_key_hashfn, jnode_key_eq); +#undef KFREE +#undef KMALLOC + +/* call this to initialise jnode hash table */ +reiser4_internal int +jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ ) +{ + int buckets; + int result; + + assert("nikita-2359", tree != NULL); + + /* + * number of hash buckets in hash table depends on amount of memory + * available. If we cannot allocate that much, number of buckets is + * halved until allocation succeeds. + */ + buckets = 1 << fls(nr_free_pagecache_pages()); + do { + result = j_hash_init(&tree->jhash_table, buckets, + reiser4_stat(tree->super, hashes.jnode)); + buckets >>= 1; + } while (result == -ENOMEM); + return result; +} + +/* call this to destroy jnode hash table. This is called during umount. */ +reiser4_internal int +jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ ) +{ + j_hash_table *jtable; + jnode *node; + jnode *next; + + assert("nikita-2360", tree != NULL); + + /* + * Scan hash table and free all jnodes. + */ + + IF_TRACE(TRACE_ZWEB, UNDER_RW_VOID(tree, tree, read, + print_jnodes("umount", tree))); + + jtable = &tree->jhash_table; + for_all_in_htable(jtable, j, node, next) { + assert("nikita-2361", !atomic_read(&node->x_count)); + jdrop(node); + } + + j_hash_done(&tree->jhash_table); + return 0; +} + +/* Initialize static variables in this file. 
*/ +reiser4_internal int +jnode_init_static(void) +{ + assert("umka-168", _jnode_slab == NULL); + + _jnode_slab = kmem_cache_create("jnode", sizeof (jnode), 0, + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, + NULL, NULL); + + if (_jnode_slab == NULL) { + goto error; + } + + return 0; + +error: + + if (_jnode_slab != NULL) { + kmem_cache_destroy(_jnode_slab); + } + return RETERR(-ENOMEM); +} + +reiser4_internal int +jnode_done_static(void) +{ + int ret = 0; + + if (_jnode_slab != NULL) { + ret = kmem_cache_destroy(_jnode_slab); + _jnode_slab = NULL; + } + + return ret; +} + +/* Initialize a jnode. */ +reiser4_internal void +jnode_init(jnode * node, reiser4_tree * tree, jnode_type type) +{ + assert("umka-175", node != NULL); + + xmemset(node, 0, sizeof (jnode)); + ON_DEBUG(node->magic = JMAGIC); + jnode_set_type(node, type); + atomic_set(&node->d_count, 0); + atomic_set(&node->x_count, 0); + spin_jnode_init(node); + spin_jload_init(node); + node->atom = NULL; + node->tree = tree; + capture_list_clean(node); + ON_DEBUG(node->list = NOT_CAPTURED); + INIT_RCU_HEAD(&node->rcu); + +#if REISER4_DEBUG + { + reiser4_super_info_data *sbinfo; + + sbinfo = get_super_private(tree->super); + spin_lock_irq(&sbinfo->all_guard); + list_add(&node->jnodes, &sbinfo->all_jnodes); + spin_unlock_irq(&sbinfo->all_guard); + } +#endif +} + +#if REISER4_DEBUG +/* + * Remove jnode from ->all_jnodes list. 
+ */
+void
+jnode_done(jnode * node, reiser4_tree * tree)
+{
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = get_super_private(tree->super);
+
+	spin_lock_irq(&sbinfo->all_guard);
+	assert("nikita-2422", !list_empty(&node->jnodes));
+	list_del_init(&node->jnodes);
+	spin_unlock_irq(&sbinfo->all_guard);
+}
+#endif
+
+/* return already existing jnode of page. The page has to be locked and to
+ * have a jnode attached. */
+reiser4_internal jnode *
+jnode_by_page(struct page *pg)
+{
+	assert("nikita-2066", pg != NULL);
+	assert("nikita-2400", PageLocked(pg));
+	assert("nikita-2068", PagePrivate(pg));
+	assert("nikita-2067", jprivate(pg) != NULL);
+	return jprivate(pg);
+}
+
+/* exported functions to allocate/free jnode objects outside this file.
+ * jalloc() returns an uninitialised jnode from the slab, or NULL. */
+reiser4_internal jnode *
+jalloc(void)
+{
+	jnode *jal = kmem_cache_alloc(_jnode_slab, GFP_KERNEL);
+	return jal;
+}
+
+/* return jnode back to the slab allocator. The jnode must not be captured,
+ * eflushed, on the ->all_jnodes list, or have a page attached. */
+reiser4_internal inline void
+jfree(jnode * node)
+{
+	assert("zam-449", node != NULL);
+
+	assert("nikita-2663", capture_list_is_clean(node) && node->list == NOT_CAPTURED);
+	assert("nikita-2774", !JF_ISSET(node, JNODE_EFLUSH));
+	assert("nikita-3222", list_empty(&node->jnodes));
+	assert("nikita-3221", jnode_page(node) == NULL);
+
+	/* not yet phash_jnode_destroy(node); */
+
+	/* poison memory. */
+	ON_DEBUG(xmemset(node, 0xad, sizeof *node));
+	kmem_cache_free(_jnode_slab, node);
+}
+/*
+ * RCU callback scheduled by jnode_free(): gives @arg back to the allocator
+ * matching its jnode type, jfree() or zfree().
+ */
+static void
+jnode_free_actor(void *arg)
+{
+	jnode * node;
+	jnode_type jtype;
+
+	node = arg;
+	jtype = jnode_get_type(node);
+
+	ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
+
+	switch (jtype) {
+	case JNODE_IO_HEAD:
+	case JNODE_BITMAP:
+	case JNODE_UNFORMATTED_BLOCK:
+		jfree(node);
+		break;
+	case JNODE_FORMATTED_BLOCK:
+		zfree(JZNODE(node));
+		break;
+	case JNODE_INODE:
+	default:
+		wrong_return_value("nikita-3197", "Wrong jnode type");
+	}
+}
+/*
+ * Free @node through call_rcu(), so that the memory is only released by
+ * jnode_free_actor() later. JNODE_INODE jnodes are not freed here, they are
+ * only passed to jnode_list_remove().
+ */
+static inline void
+jnode_free(jnode * node, jnode_type jtype)
+{
+	if (jtype != JNODE_INODE) {
+		assert("nikita-3219", list_empty(&node->rcu.list));
+		call_rcu(&node->rcu, jnode_free_actor, node);
+	} else
+		jnode_list_remove(node);
+}
+
+/* allocate new unformatted jnode. The key is set to "not hashed yet" values
+ * (no mapping, index -1, objectid 0). Returns NULL if allocation fails. */
+reiser4_internal jnode *
+jnew_unformatted(void)
+{
+	jnode *jal;
+
+	jal = jalloc();
+	if (jal == NULL)
+		return NULL;
+
+	jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
+	jal->key.j.mapping = 0;
+	jal->key.j.index = (unsigned long)-1;
+	jal->key.j.objectid = 0;
+	return jal;
+}
+
+/* look for jnode with given object id and index within hash table. The
+ * lookup runs under rcu_read_lock(); a jnode that is found gets a reference
+ * (jref()) and is then passed through jnode_rip_check(). Returns the jnode or
+ * NULL. */
+reiser4_internal jnode *
+jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
+{
+	jnode_key_t jkey;
+	jnode *node;
+
+	assert("nikita-2353", tree != NULL);
+
+	jkey.objectid = objectid;
+	jkey.index = index;
+
+	rcu_read_lock();
+	node = j_hash_find(&tree->jhash_table, &jkey);
+	if (node != NULL) {
+		/* protect @node from recycling */
+		jref(node);
+		assert("nikita-2955", jnode_invariant(node, 0, 0));
+		node = jnode_rip_check(tree, node);
+	}
+	rcu_read_unlock();
+	return node;
+}
+
+/* as jlookup(), but is called with tree locked already. A jnode with
+ * JNODE_RIP set is treated as not found: its reference is dropped again and
+ * NULL is returned. */
+static jnode *
+jlookup_locked(reiser4_tree * tree, oid_t objectid, unsigned long index)
+{
+	jnode_key_t jkey;
+	jnode *node;
+
+	assert("nikita-2353", tree != NULL);
+
+	jkey.objectid = objectid;
+	jkey.index = index;
+
+	node = j_hash_find(&tree->jhash_table, &jkey);
+	if (node != NULL) {
+		/* protect @node from recycling */
+		jref(node);
+		if (unlikely(JF_ISSET(node, JNODE_RIP))) {
+			dec_x_ref(node);
+			node = NULL;
+		}
+	}
+	return node;
+}
+/* return 1 if the radix tree of jnodes of @r4_inode is empty, 0 otherwise */
+static int
+inode_has_no_jnodes(reiser4_inode *r4_inode)
+{
+	if (r4_inode->jnode_tree.rnode == 0) {
+		assert("vs-1434", r4_inode->jnodes == 0);
+		assert("vs-1435", (inode_by_reiser4_inode(r4_inode)->i_state & I_JNODES) == 0);
+		return 1;
+	}
+	assert("vs-1436", r4_inode->jnodes > 0);
+	assert("vs-1437", (inode_by_reiser4_inode(r4_inode)->i_state & I_JNODES) != 0);
+	return 0;
+}
+
+
+/* insert jnode into reiser4 inode's radix tree of jnodes. This is performed under tree spin lock. It also sets a bit
+   (I_JNODES) in inode's i_state so that fs/inode.c:can_unuse never returns 1, so, jnodes in inode's jnode tree prevent
+   inode from being pruned. This is important because jnodes store pointer to inode's mapping */
+static void
+inode_attach_jnode(jnode *node)
+{
+	struct inode *inode;
+	reiser4_inode *r4_inode;
+
+	assert("vs-1439", node->key.j.mapping);
+
+	inode = node->key.j.mapping->host;
+	r4_inode = reiser4_inode_data(inode);
+	spin_lock(&inode_lock);
+	if (inode_has_no_jnodes(r4_inode)) {
+		assert("vs-1433", (inode->i_state & I_JNODES) == 0);
+		inode->i_state |= I_JNODES;
+	}
+	check_me("vs-1431", radix_tree_insert(&r4_inode->jnode_tree, node->key.j.index, node) == 0);
+	ON_DEBUG(r4_inode->jnodes ++);
+	spin_unlock(&inode_lock);
+}
+
+/* remove jnode from reiser4 inode's radix tree. This is performed under tree spin lock.
If last jnode is removed from + inode's jnode tree inode gets "released" - bit I_JNODES is cleared */ +static void +inode_detach_jnode(jnode *node) +{ + struct inode *inode; + reiser4_inode *r4_inode; + + assert("vs-1440", node->key.j.mapping); + inode = node->key.j.mapping->host; + assert("vs-1441", node->key.j.objectid == get_inode_oid(inode)); + r4_inode = reiser4_inode_data(inode); + assert("vs-1431", r4_inode->jnodes > 0 && (inode->i_state & I_JNODES)); + + spin_lock(&inode_lock); + check_me("vs-1431", radix_tree_delete(&r4_inode->jnode_tree, jnode_get_index(node))); + ON_DEBUG(r4_inode->jnodes --); + if (r4_inode->jnode_tree.rnode == 0) { + assert("vs-1432", inode->i_state & I_JNODES); + assert("vs-1432", r4_inode->jnodes == 0); + inode->i_state &= ~I_JNODES; + } + spin_unlock(&inode_lock); +} + +/* put jnode into hash table (where they can be found by flush who does not know mapping) and to inode's tree of jnodes + (where they can be found (hopefully faster) in places where mapping is known). Currently it is used by + fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is created */ +static void +hash_unformatted_jnode(jnode *node, struct address_space *mapping, unsigned long index) +{ + j_hash_table *jtable; + + assert("vs-1446", jnode_is_unformatted(node)); + assert("vs-1442", node->key.j.mapping == 0); + assert("vs-1443", node->key.j.objectid == 0); + assert("vs-1444", node->key.j.index == (unsigned long)-1); + assert("nikita-3439", rw_tree_is_write_locked(jnode_get_tree(node))); + + node->key.j.mapping = mapping; + node->key.j.objectid = get_inode_oid(mapping->host); + node->key.j.index = index; + + jtable = &jnode_get_tree(node)->jhash_table; + + /* race with some other thread inserting jnode into the hash table is + * impossible, because we keep the page lock. */ + /* + * following assertion no longer holds because of RCU: it is possible + * jnode is in the hash table, but with JNODE_RIP bit set. 
+ */ + /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */ + j_hash_insert_rcu(jtable, node); + + inode_attach_jnode(node); +} + +static void +unhash_unformatted_node_nolock(jnode *node) +{ + /* remove jnode from hash-table */ + j_hash_remove_rcu(&node->tree->jhash_table, node); + + /* remove jnode from inode's tree of jnodes */ + inode_detach_jnode(node); + + node->key.j.mapping = 0; + node->key.j.index = (unsigned long)-1; + node->key.j.objectid = 0; +} + +/* remove jnode from hash table and from inode's tree of jnodes. This is used in reiser4_invalidatepage and in + kill_hook_extent->truncate_inode_jnodes->uncapture_jnode */ +reiser4_internal void +unhash_unformatted_jnode(jnode *node) +{ + assert("vs-1445", jnode_is_unformatted(node)); + WLOCK_TREE(node->tree); + + unhash_unformatted_node_nolock(node); + + WUNLOCK_TREE(node->tree); +} + +jnode * +find_get_jnode(reiser4_tree * tree, struct address_space *mapping, oid_t oid, + unsigned long index) +{ + jnode *result; + jnode *shadow; + int preload; + + result = jnew_unformatted(); + + if (unlikely(result == NULL)) + return ERR_PTR(RETERR(-ENOMEM)); + + preload = radix_tree_preload(GFP_KERNEL); + if (preload != 0) + return ERR_PTR(preload); + + WLOCK_TREE(tree); + shadow = jlookup_locked(tree, oid, index); + if (likely(shadow == NULL)) { + jref(result); + hash_unformatted_jnode(result, mapping, index); + + } else { + jnode_free(result, JNODE_UNFORMATTED_BLOCK); + assert("vs-1498", shadow->key.j.mapping == mapping); + /*shadow->key.j.mapping = mapping;*/ + result = shadow; + } + + + WUNLOCK_TREE(tree); + assert("nikita-2955", ergo(result != NULL, jnode_invariant(result, 0, 0))); + radix_tree_preload_end(); + return result; +} + + +/* jget() (a la zget() but for unformatted nodes). Returns (and possibly + creates) jnode corresponding to page @pg. jnode is attached to page and + inserted into jnode hash-table. 
*/ +static jnode * +do_jget(reiser4_tree * tree, struct page * pg) +{ + /* + * There are two ways create jnode: starting with pre-existing page + * and without page. + * + * When page already exists, jnode is created + * (jnode_of_page()->do_jget()) under page lock. This is done in + * ->writepage(), or when capturing anonymous page dirtied through + * mmap. + * + * Jnode without page is created by index_extent_jnode(). + * + */ + + jnode *result; + oid_t oid = get_inode_oid(pg->mapping->host); + + assert("umka-176", pg != NULL); + assert("nikita-2394", PageLocked(pg)); + + result = jprivate(pg); + if (likely(result != NULL)) + return jref(result); + + tree = tree_by_page(pg); + + /* check hash-table first */ + result = jlookup(tree, oid, pg->index); + if (unlikely(result != NULL)) { + UNDER_SPIN_VOID(jnode, result, jnode_attach_page(result, pg)); + result->key.j.mapping = pg->mapping; + return result; + } + + result = find_get_jnode(tree, pg->mapping, oid, pg->index); + /* attach jnode to page */ + UNDER_SPIN_VOID(jnode, result, jnode_attach_page(result, pg)); + return result; +} + +reiser4_internal jnode * +jnode_of_page(struct page * pg) +{ + jnode * result; + + assert("umka-176", pg != NULL); + assert("nikita-2394", PageLocked(pg)); + + result = do_jget(tree_by_page(pg), pg); + + if (REISER4_DEBUG && !IS_ERR(result)) { + assert("nikita-3210", result == jprivate(pg)); + assert("nikita-2046", jnode_page(jprivate(pg)) == pg); + if (jnode_is_unformatted(jprivate(pg))) { + assert("nikita-2364", jprivate(pg)->key.j.index == pg->index); + assert("nikita-2367", + jprivate(pg)->key.j.mapping == pg->mapping); + assert("nikita-2365", + jprivate(pg)->key.j.objectid == get_inode_oid(pg->mapping->host)); + assert("vs-1200", + jprivate(pg)->key.j.objectid == pg->mapping->host->i_ino); + assert("nikita-2356", jnode_is_unformatted(jnode_by_page(pg))); + } + assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0)); + } + return result; +} + +reiser4_internal void 
+jnode_attach_page(jnode * node, struct page *pg) +{ + assert("nikita-2060", node != NULL); + assert("nikita-2061", pg != NULL); + + assert("nikita-2050", pg->private == 0ul); + assert("nikita-2393", !PagePrivate(pg)); + + assert("nikita-2396", PageLocked(pg)); + assert("nikita-2397", spin_jnode_is_locked(node)); + + page_cache_get(pg); + pg->private = (unsigned long) node; + node->pg = pg; + SetPagePrivate(pg); +} + +reiser4_internal void +page_clear_jnode(struct page *page, jnode * node) +{ + assert("nikita-2424", page != NULL); + assert("nikita-2425", PageLocked(page)); + assert("nikita-2426", node != NULL); + assert("nikita-2427", spin_jnode_is_locked(node)); + assert("nikita-2428", PagePrivate(page)); + + JF_CLR(node, JNODE_PARSED); + page->private = 0ul; + ClearPagePrivate(page); + node->pg = NULL; + page_cache_release(page); +} + +/* it is only used in one place to handle error */ +reiser4_internal void +page_detach_jnode(struct page *page, struct address_space *mapping, unsigned long index) +{ + assert("nikita-2395", page != NULL); + + lock_page(page); + if ((page->mapping == mapping) && (page->index == index) && PagePrivate(page)) { + jnode *node; + + node = jprivate(page); + assert("nikita-2399", spin_jnode_is_not_locked(node)); + UNDER_SPIN_VOID(jnode, node, page_clear_jnode(page, node)); + } + unlock_page(page); +} + +/* return @node page locked. + + Locking ordering requires that one first takes page lock and afterwards + spin lock on node attached to this page. Sometimes it is necessary to go in + the opposite direction. This is done through standard trylock-and-release + loop. 
+
+   Both ways out of the loop leave the spin lock of @node held. The returned
+   page is locked; NULL is returned when no page is attached to @node.
+*/
+reiser4_internal struct page *
+jnode_lock_page(jnode * node)
+{
+	struct page *page;
+
+	assert("nikita-2052", node != NULL);
+	assert("nikita-2401", spin_jnode_is_not_locked(node));
+
+	while (1) {
+
+		LOCK_JNODE(node);
+		page = jnode_page(node);
+		if (page == NULL) {
+			break;
+		}
+
+		/* no need to page_cache_get( page ) here, because page cannot
+		   be evicted from memory without detaching it from jnode and
+		   this requires spin lock on jnode that we already hold.
+		*/
+		if (!TestSetPageLocked(page)) {
+			/* We won a lock on jnode page, proceed. */
+			break;
+		}
+
+		/* Page is locked by someone else. */
+		page_cache_get(page);
+		UNLOCK_JNODE(node);
+		wait_on_page_locked(page);
+		/* it is possible that page was detached from jnode and
+		   returned to the free pool, or re-assigned while we were
+		   waiting on locked bit. This will be rechecked on the next
+		   loop iteration.
+		*/
+		page_cache_release(page);
+
+		/* try again */
+	}
+	return page;
+}
+static inline int
+jparse(jnode * node)
+{
+	int result;
+	/*
+	 * call ->parse() of the jnode plugin under the jnode spin lock unless
+	 * the jnode is already parsed; JNODE_PARSED is only set when
+	 * ->parse() returned 0. Returns 0 or the ->parse() error.
+	 */
+	assert("nikita-2466", node != NULL);
+
+	LOCK_JNODE(node);
+	if (likely(!jnode_is_parsed(node))) {
+		result = jnode_ops(node)->parse(node);
+		if (likely(result == 0))
+			JF_SET(node, JNODE_PARSED);
+	} else
+		result = 0;
+	UNLOCK_JNODE(node);
+	return result;
+}
+
+/* Lock a page attached to jnode, create and attach page to jnode if it had no one.
Returns the page locked, or ERR_PTR(-ENOMEM) when no page could be found or created. */
+reiser4_internal struct page *
+jnode_get_page_locked(jnode * node, int gfp_flags)
+{
+	struct page * page;
+
+	LOCK_JNODE(node);
+	page = jnode_page(node);
+
+	if (page == NULL) {
+		UNLOCK_JNODE(node);
+		page = find_or_create_page(jnode_get_mapping(node),
+					   jnode_get_index(node), gfp_flags);
+		if (page == NULL)
+			return ERR_PTR(RETERR(-ENOMEM));
+	} else {
+		if (!TestSetPageLocked(page)) {
+			UNLOCK_JNODE(node);
+			return page;
+		}
+		page_cache_get(page);
+		UNLOCK_JNODE(node);
+		lock_page(page);
+		assert("nikita-3134", page->mapping == jnode_get_mapping(node));
+	}
+
+	LOCK_JNODE(node);
+	if (!jnode_page(node))
+		jnode_attach_page(node, page);
+	UNLOCK_JNODE(node);
+
+	page_cache_release(page);
+	assert ("zam-894", jnode_page(node) == page);
+	return page;
+}
+
+/* Start read operation for jnode's page if page is not up-to-date. @page has
+ * to be locked; it is unlocked here when it is already up-to-date, otherwise
+ * it is handed to page_io(). */
+static int jnode_start_read (jnode * node, struct page * page)
+{
+	assert ("zam-893", PageLocked(page));
+
+	if (PageUptodate(page)) {
+		unlock_page(page);
+		return 0;
+	}
+	return page_io(page, node, READ, GFP_KERNEL);
+}
+/*
+ * Debugging check used by jload_gfp(): for a znode that is locked, the item
+ * count kept in the znode has to match the one in the node40 header stored
+ * in @page. Without REISER4_DEBUG this expands to noop.
+ */
+#if REISER4_DEBUG
+static void check_jload(jnode * node, struct page * page)
+{
+	if (jnode_is_znode(node)) {
+		node40_header *nh;
+		znode *z;
+
+		z = JZNODE(node);
+		if (znode_is_any_locked(z)) {
+			nh = (node40_header *)kmap(page);
+			/* this only works for node40-only file systems. For
+			 * debugging. */
+			assert("nikita-3253",
+			       z->nr_items == d16tocpu(&nh->nr_items));
+			kunmap(page);
+		}
+	}
+}
+#else
+#define check_jload(node, page) noop
+#endif
+/* issue prefetchw() on the ->x_count field of @node */
+reiser4_internal void jload_prefetch(const jnode * node)
+{
+	prefetchw(&node->x_count);
+}
+
+/* load jnode's data into memory.
+ *
+ * Takes an x-reference and a d-reference on @node. If the jnode is not
+ * parsed yet, its page is obtained with jnode_get_page_locked(), read when it
+ * is not up-to-date, kmap()ed when @do_kmap is set, and parsed. Returns 0 on
+ * success; on failure the references are dropped again through jrelse_tail()
+ * and the error is returned. */
+reiser4_internal int
+jload_gfp (jnode * node /* node to load */,
+	   int gfp_flags /* allocation flags*/,
+	   int do_kmap)
+{
+	struct page * page;
+	int result = 0;
+	int parsed;
+
+	assert("nikita-3010", schedulable());
+	write_node_trace(node);
+
+	prefetchw(&node->pg);
+
+	/* taking d-reference implies taking x-reference. */
+	jref(node);
+
+	/*
+	 * acquiring d-reference to @jnode and check for JNODE_PARSED bit
+	 * should be atomic, otherwise there is a race against
+	 * reiser4_releasepage().
+	 */
+	LOCK_JLOAD(node);
+	add_d_ref(node);
+	parsed = jnode_is_parsed(node);
+	UNLOCK_JLOAD(node);
+
+	if (unlikely(!parsed)) {
+		ON_TRACE(TRACE_PCACHE, "read node: %p\n", node);
+
+		page = jnode_get_page_locked(node, gfp_flags);
+		if (unlikely(IS_ERR(page))) {
+			result = PTR_ERR(page);
+			goto failed;
+		}
+
+		result = jnode_start_read(node, page);
+		if (unlikely(result != 0))
+			goto failed;
+
+		wait_on_page_locked(page);
+		if (unlikely(!PageUptodate(page))) {
+			result = RETERR(-EIO);
+			goto failed;
+		}
+
+		if (do_kmap)
+			node->data = kmap(page);
+
+		result = jparse(node);
+		if (unlikely(result != 0)) {
+			if (do_kmap)
+				kunmap(page);
+			goto failed;
+		}
+		check_jload(node, page);
+	} else {
+		page = jnode_page(node);
+		check_jload(node, page);
+		if (do_kmap)
+			node->data = kmap(page);
+		reiser4_stat_inc_at_level(jnode_get_level(node),
+					  jnode.jload_already);
+	}
+
+	if (unlikely(JF_ISSET(node, JNODE_EFLUSH)))
+		UNDER_SPIN_VOID(jnode, node, eflush_del(node, 0));
+
+	if (!is_writeout_mode())
+		/* We do not mark pages active if jload is called as a part of
+		 * jnode_flush() or reiser4_write_logs().  Both jnode_flush()
+		 * and write_logs() add no value to cached data, there is no
+		 * sense to mark pages as active when they go to disk, it just
+		 * confuses vm scanning routines because clean page could be
+		 * moved out from inactive list as a result of this
+		 * mark_page_accessed() call. */
+		mark_page_accessed(page);
+
+	return 0;
+
+ failed:
+	jrelse_tail(node);
+	return result;
+
+}
+
+/* start asynchronous reading for given jnode's page.
*/
+reiser4_internal int jstartio (jnode * node)
+{
+	struct page * page;
+
+	page = jnode_get_page_locked(node, GFP_KERNEL);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+
+	return jnode_start_read(node, page);
+}
+
+
+/* Initialize a node by calling appropriate plugin instead of reading
+ * node from disk as in jload().
+ *
+ * Takes an x-reference and a d-reference on @node, marks its page up-to-date
+ * and kmap()s it. Returns 0 on success. On failure both references are
+ * dropped and the error is returned. */
+reiser4_internal int jinit_new (jnode * node, int gfp_flags)
+{
+	struct page * page;
+	int result;
+
+	jref(node);
+	add_d_ref(node);
+
+	page = jnode_get_page_locked(node, gfp_flags);
+	if (IS_ERR(page)) {
+		result = PTR_ERR(page);
+		goto failed;
+	}
+
+	SetPageUptodate(page);
+	unlock_page(page);
+
+	node->data = kmap(page);
+
+	if (!jnode_is_parsed(node)) {
+		jnode_plugin * jplug = jnode_ops(node);
+		result = UNDER_SPIN(jnode, node, jplug->init(node));
+		if (result) {
+			kunmap(page);
+			goto failed;
+		}
+		JF_SET(node, JNODE_PARSED);
+	}
+
+	return 0;
+
+ failed:
+	/*
+	 * only the references are dropped here, as on the error path of
+	 * jload_gfp(): the page is either not kmap()ed yet or was kunmap()ed
+	 * just above, so jrelse() would kunmap() it once too often.
+	 */
+	jrelse_tail(node);
+	return result;
+}
+
+/* drop the d-reference and the x-reference taken by jload_gfp() or
+ * jinit_new(), without touching the kmap of the page */
+reiser4_internal void
+jrelse_tail(jnode * node /* jnode to release references to */)
+{
+	assert("nikita-489", atomic_read(&node->d_count) > 0);
+	atomic_dec(&node->d_count);
+	/* release reference acquired in jload_gfp() or jinit_new() */
+	jput(node);
+	LOCK_CNT_DEC(d_refs);
+}
+
+/* drop reference to node data. When last reference is dropped, data are
+   unloaded. */
+reiser4_internal void
+jrelse(jnode * node /* jnode to release references to */)
+{
+	struct page *page;
+
+	assert("nikita-487", node != NULL);
+	assert("nikita-1906", spin_jnode_is_not_locked(node));
+
+	ON_TRACE(TRACE_PCACHE, "release node: %p\n", node);
+
+	page = jnode_page(node);
+	if (likely(page != NULL)) {
+		/*
+		 * it is safe not to lock jnode here, because at this point
+		 * @node->d_count is greater than zero (if jrelse() is used
+		 * correctly, that is). JNODE_PARSED may be not set yet, if,
+		 * for example, we got here as a result of error handling path
+		 * in jload(). Anyway, page cannot be detached by
+		 * reiser4_releasepage(). truncate will invalidate page
+		 * regardless, but this should not be a problem.
+		 */
+		kunmap(page);
+	}
+	jrelse_tail(node);
+}
+
+/* called from jput() to wait for io completion */
+static void jnode_finish_io(jnode * node)
+{
+	struct page *page;
+
+	assert("nikita-2922", node != NULL);
+
+	LOCK_JNODE(node);
+	page = jnode_page(node);
+	if (page != NULL) {
+		/* pin the page so that it can be waited on without the
+		 * jnode spin lock */
+		page_cache_get(page);
+		UNLOCK_JNODE(node);
+		wait_on_page_writeback(page);
+		page_cache_release(page);
+	} else
+		UNLOCK_JNODE(node);
+}
+
+/*
+ * This is called by jput() when last reference to jnode is released. This is
+ * separate function, because we want fast path of jput() to be inline and,
+ * therefore, small.
+ *
+ * rcu_read_unlock() is called on every path through this function.
+ */
+reiser4_internal void
+jput_final(jnode * node)
+{
+	int r_i_p;
+
+	/* A fast check for keeping node in cache. We always keep node in cache
+	 * if its page is present and node was not marked for deletion */
+	if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
+		rcu_read_unlock();
+		return;
+	}
+
+	r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
+	/*
+	 * if r_i_p is true, we were first to set JNODE_RIP on this node. In
+	 * this case it is safe to access node after unlock.
+	 */
+	rcu_read_unlock();
+	if (r_i_p) {
+		jnode_finish_io(node);
+		if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
+			/* node is removed from the tree. */
+			jdelete(node);
+		else
+			jnode_try_drop(node);
+	}
+	/* if !r_i_p some other thread is already killing it */
+}
+
+/* wait until io on the page of @node completes: for READ until the page is
+ * unlocked, for WRITE until writeback finishes. Returns -EIO if the page has
+ * the error flag set afterwards, 0 otherwise. */
+reiser4_internal int
+jwait_io(jnode * node, int rw)
+{
+	struct page *page;
+	int result;
+
+	assert("zam-447", node != NULL);
+	assert("zam-448", jnode_page(node) != NULL);
+
+	page = jnode_page(node);
+
+	result = 0;
+	if (rw == READ) {
+		wait_on_page_locked(page);
+	} else {
+		assert("nikita-2227", rw == WRITE);
+		wait_on_page_writeback(page);
+	}
+	if (PageError(page))
+		result = RETERR(-EIO);
+
+	return result;
+}
+
+/* encode @type into the state word of a freshly zeroed jnode: the mask for
+ * the type is shifted to bit position JNODE_TYPE_1 */
+void
+jnode_set_type(jnode * node, jnode_type type)
+{
+	static unsigned long type_to_mask[] = {
+		[JNODE_UNFORMATTED_BLOCK] = 1,
+		[JNODE_FORMATTED_BLOCK] = 0,
+		[JNODE_BITMAP] = 2,
+		[JNODE_IO_HEAD] = 6,
+		[JNODE_INODE] = 4
+	};
+
+	assert("zam-647", type < LAST_JNODE_TYPE);
+	assert("nikita-2815", !jnode_is_loaded(node));
+	assert("nikita-3386", node->state == 0);
+
+	node->state |= (type_to_mask[type] << JNODE_TYPE_1);
+}
+
+/* init method that does nothing and reports success */
+static int
+init_noinit(jnode * node UNUSED_ARG)
+{
+	return 0;
+}
+
+/* parse method that does nothing and reports success */
+static int
+parse_noparse(jnode * node UNUSED_ARG)
+{
+	return 0;
+}
+
+/* return the mapping stored in the key of @node; it must be set and belong
+ * to a reiser4 inode whose object id matches the key */
+reiser4_internal struct address_space *
+mapping_jnode(const jnode * node)
+{
+	struct address_space *map;
+
+	assert("nikita-2713", node != NULL);
+	map = node->key.j.mapping;
+	assert("nikita-2714", map != NULL);
+	assert("nikita-2897", is_reiser4_inode(map->host));
+	assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
+	assert("vs-1447", !JF_ISSET(node, JNODE_CC));
+	return map;
+}
+
+/* return the page index stored in the key of @node */
+reiser4_internal unsigned long
+index_jnode(const jnode * node)
+{
+	assert("vs-1447", !JF_ISSET(node, JNODE_CC));
+	return node->key.j.index;
+}
+
+/* unhash @node if it is hashed, that is, if its key has a mapping. @tree is
+ * not used. */
+static inline void
+remove_jnode(jnode * node, reiser4_tree * tree)
+{
+	if (node->key.j.mapping)
+		unhash_unformatted_node_nolock(node);
+}
+
+static void
+remove_inode_jnode(jnode * node, reiser4_tree * tree UNUSED_ARG)
+{
+	assert("nikita-2663", capture_list_is_clean(node));
+
+	phash_jnode_destroy(node);
+}
+
+static struct address_space *
+mapping_znode(const jnode * node)
+{	/* jplug->mapping for formatted nodes: the i_mapping of the inode
+	 * returned by get_super_fake() for the node's super block. Must not
+	 * be used on capture-copy (JNODE_CC) jnodes. */
+	assert("vs-1447", !JF_ISSET(node, JNODE_CC));
+	return get_super_fake(jnode_get_tree(node)->super)->i_mapping;
+}
+/* jplug->index for formatted nodes: derived from the address of the node
+ * itself, (address - PAGE_OFFSET) >> znode_shift_order. */
+extern int znode_shift_order;
+static unsigned long
+index_znode(const jnode * node)
+{
+	unsigned long addr;
+	assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
+
+	addr = (unsigned long)node;
+	return (addr - PAGE_OFFSET) >> znode_shift_order;
+}
+/* jplug->mapping for bitmap and io-head nodes (see jnode_plugins[]): the
+ * i_mapping of the ->bitmap inode kept in the super block private data. */
+static struct address_space *
+mapping_bitmap(const jnode * node)
+{
+	return get_super_private(jnode_get_tree(node)->super)->bitmap->i_mapping;
+}
+/* jplug->index for bitmap and io-head nodes: the node's own address minus
+ * PAGE_OFFSET. */
+static unsigned long
+index_is_address(const jnode * node)
+{
+	unsigned long ind;
+
+	ind = (unsigned long)node;
+	return ind - PAGE_OFFSET;
+}
+
+/* resolve race with jput: if @node is marked JNODE_RIP, re-check the flag
+ * under the tree read lock; if it is still set, drop the x_count reference
+ * and return NULL. Otherwise return @node unchanged. */
+reiser4_internal jnode *
+jnode_rip_sync(reiser4_tree *t, jnode * node)
+{
+	if (unlikely(JF_ISSET(node, JNODE_RIP))) {
+		RLOCK_TREE(t);
+		if (JF_ISSET(node, JNODE_RIP)) {
+			dec_x_ref(node);
+			node = NULL;
+		}
+		RUNLOCK_TREE(t);
+	}
+	return node;
+}
+
+/* Build into @key the key of unformatted @node and return @key. The byte
+ * offset is the node's page index shifted by PAGE_CACHE_SHIFT. */
+reiser4_internal reiser4_key *
+jnode_build_key(const jnode * node, reiser4_key * key)
+{
+	struct inode *inode;
+	item_plugin *iplug;
+	loff_t off;
+
+	assert("nikita-3092", node != NULL);
+	assert("nikita-3093", key != NULL);
+	assert("nikita-3094", jnode_is_unformatted(node));
+
+
+	off = ((loff_t)index_jnode(node)) << PAGE_CACHE_SHIFT;
+	inode = mapping_jnode(node)->host;
+
+	if (node->parent_item_id != 0)
+		iplug = item_plugin_by_id(node->parent_item_id);
+	else
+		iplug = NULL;
+	/* prefer the parent item plugin's offset->key conversion; fall back
+	 * to the file plugin of the owning inode */
+	if (iplug != NULL && iplug->f.key_by_offset)
+		iplug->f.key_by_offset(inode, off, key);
+	else {
+		file_plugin *fplug;
+
+		fplug = inode_file_plugin(inode);
+		assert ("zam-1007", fplug != NULL);
+		assert ("zam-1008", fplug->key_by_inode != NULL);
+
+		fplug->key_by_inode(inode, off, key);
+	}
+
+	return key;
+}
+
+extern int zparse(znode * node);
+/* jplug->parse for formatted nodes: delegate to zparse() */
+static int
+parse_znode(jnode * node)
+{
+	return zparse(JZNODE(node));
+}
+/* delete handler for formatted nodes, used by jnode_delete(). Expects the
+ * tree write lock, JNODE_HEARD_BANSHEE set and c_count == 0 (all asserted). */
+static void
+delete_znode(jnode * node, reiser4_tree * tree)
+{
+	znode *z;
+
+	assert("nikita-2128", rw_tree_is_write_locked(tree));
+	assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
+
+	z = JZNODE(node);
+	assert("vs-899", z->c_count == 0);
+
+	/* delete znode from sibling list. */
+	sibling_list_remove(z);
+
+	znode_remove(z, tree);
+}
+/* remove handler for formatted nodes, used by jnode_remove(). Returns 0 when
+ * the znode was removed, -EBUSY when its c_count is non-zero. */
+static int
+remove_znode(jnode * node, reiser4_tree * tree)
+{
+	znode *z;
+
+	assert("nikita-2128", rw_tree_is_locked(tree));
+	z = JZNODE(node);
+
+	if (z->c_count == 0) {
+		/* detach znode from sibling list. */
+		sibling_list_drop(z);
+		/* this is called with tree spin-lock held, so call
+		   znode_remove() directly (rather than znode_lock_remove()). */
+		znode_remove(z, tree);
+		return 0;
+	}
+	return RETERR(-EBUSY);
+}
+/* jplug->init for formatted nodes: delegate to the node plugin's ->init() */
+static int
+init_znode(jnode * node)
+{
+	znode *z;
+
+	z = JZNODE(node);
+	return z->nplug->init(z);
+}
+
+/* jplug->clone for formatted nodes (znodes): allocate a new znode, give it
+ * the block number and level of @node and return it as a jnode. Returns
+ * ERR_PTR(-ENOMEM) if allocation fails. */
+znode *zalloc(int gfp_flag);
+void zinit(znode *, const znode * parent, reiser4_tree *);
+reiser4_internal jnode *
+clone_formatted(jnode *node)
+{
+	znode *clone;
+
+	assert("vs-1430", jnode_is_znode(node));
+	clone = zalloc(GFP_KERNEL);
+	if (clone == NULL)
+		return ERR_PTR(RETERR(-ENOMEM));
+	zinit(clone, 0, current_tree);
+	jnode_set_block(ZJNODE(clone), jnode_get_block(node));
+	/* ZJNODE(clone)->key.z is not initialized */
+	clone->level = JZNODE(node)->level;
+
+	return ZJNODE(clone);
+}
+
+/* jplug->clone for unformatted nodes: allocate a new unformatted jnode with
+ * the block number of @node. Returns ERR_PTR(-ENOMEM) if allocation fails. */
+reiser4_internal jnode *
+clone_unformatted(jnode *node)
+{
+	jnode *clone;
+
+	assert("vs-1431", jnode_is_unformatted(node));
+	clone = jalloc();
+	if (clone == NULL)
+		return ERR_PTR(RETERR(-ENOMEM));
+
+	jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
+	jnode_set_block(clone, jnode_get_block(node));
+
+	return clone;
+
+}
+/* Per-type jnode operations, indexed by jnode_type. Bitmap and io-head nodes
+ * share the same handlers and cannot be cloned; the inode entry has no
+ * handlers at all. */
+jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
+	[JNODE_UNFORMATTED_BLOCK] = {
+		.h = {
+			.type_id = REISER4_JNODE_PLUGIN_TYPE,
+			.id = JNODE_UNFORMATTED_BLOCK,
+			.pops = NULL,
+			.label = "unformatted",
+			.desc = "unformatted node",
+			.linkage = TYPE_SAFE_LIST_LINK_ZERO
+		},
+		.init = init_noinit,
+		.parse = parse_noparse,
+		.mapping = mapping_jnode,
+		.index = index_jnode,
+		.clone = clone_unformatted
+	},
+	[JNODE_FORMATTED_BLOCK] = {
+		.h = {
+			.type_id = REISER4_JNODE_PLUGIN_TYPE,
+			.id = JNODE_FORMATTED_BLOCK,
+			.pops = NULL,
+			.label = "formatted",
+			.desc = "formatted tree node",
+			.linkage = TYPE_SAFE_LIST_LINK_ZERO
+		},
+		.init = init_znode,
+		.parse = parse_znode,
+		.mapping = mapping_znode,
+		.index = index_znode,
+		.clone = clone_formatted
+	},
+	[JNODE_BITMAP] = {
+		.h = {
+			.type_id = REISER4_JNODE_PLUGIN_TYPE,
+			.id = JNODE_BITMAP,
+			.pops = NULL,
+			.label = "bitmap",
+			.desc = "bitmap node",
+			.linkage = TYPE_SAFE_LIST_LINK_ZERO
+		},
+		.init = init_noinit,
+		.parse = parse_noparse,
+		.mapping = mapping_bitmap,
+		.index = index_is_address,
+		.clone = NULL
+	},
+	[JNODE_IO_HEAD] = {
+		.h = {
+			.type_id = REISER4_JNODE_PLUGIN_TYPE,
+			.id = JNODE_IO_HEAD,
+			.pops = NULL,
+			.label = "io head",
+			.desc = "io head",
+			.linkage = TYPE_SAFE_LIST_LINK_ZERO
+		},
+		.init = init_noinit,
+		.parse = parse_noparse,
+		.mapping = mapping_bitmap,
+		.index = index_is_address,
+		.clone = NULL
+	},
+	[JNODE_INODE] = {
+		.h = {
+			.type_id = REISER4_JNODE_PLUGIN_TYPE,
+			.id = JNODE_INODE,
+			.pops = NULL,
+			.label = "inode",
+			.desc = "inode's builtin jnode",
+			.linkage = TYPE_SAFE_LIST_LINK_ZERO
+		},
+		.init = NULL,
+		.parse = NULL,
+		.mapping = NULL,
+		.index = NULL,
+		.clone = NULL
+	}
+};
+/* Return 1 if @node is still referenced (x_count > 0, or for formatted nodes
+ * c_count > 0), 0 otherwise. Note the busy value is 1, not -EBUSY. */
+static inline int
+jnode_is_busy(const jnode * node, jnode_type jtype)
+{
+	if (atomic_read(&node->x_count) > 0)
+		return 1;
+	if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
+		return 1;
+	return 0;
+}
+/* Type dispatch for removing a node that is being dropped (not deleted from
+ * the tree). Bitmap and io-head nodes need no work. */
+static inline void
+jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
+{
+	switch (jtype) {
+	case JNODE_UNFORMATTED_BLOCK:
+		remove_jnode(node, tree);
+		break;
+	case JNODE_IO_HEAD:
+	case JNODE_BITMAP:
+		break;
+	case JNODE_INODE:
+		remove_inode_jnode(node, tree);
+		break;
+	case JNODE_FORMATTED_BLOCK:
+		remove_znode(node, tree);	/* return value is not checked here */
+		break;
+	default:
+		wrong_return_value("nikita-3196", "Wrong jnode type");
+	}
+}
+/* Type dispatch for deleting a node from the tree (see jdelete()). Unlike
+ * jnode_remove(), JNODE_INODE is treated as a wrong type here. */
+static inline void
+jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
+{
+	switch (jtype) {
+	case JNODE_UNFORMATTED_BLOCK:
+		remove_jnode(node, tree);
+		break;
+	case JNODE_IO_HEAD:
+	case JNODE_BITMAP:
+		break;
+	case JNODE_FORMATTED_BLOCK:
+		delete_znode(node, tree);
+		break;
+	case JNODE_INODE:
+	default:
+		wrong_return_value("nikita-3195", "Wrong jnode type");
+	}
+}
+
+#if REISER4_DEBUG
+void jnode_list_remove(jnode * node)
+{	/* debug only: unlink @node from the ->jnodes list, under the super
+	 * block's all_guard spin lock with interrupts disabled */
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = get_super_private(jnode_get_tree(node)->super);
+
+	spin_lock_irq(&sbinfo->all_guard);
+	assert("nikita-2422", !list_empty(&node->jnodes));
+	list_del_init(&node->jnodes);
+	spin_unlock_irq(&sbinfo->all_guard);
+}
+#endif
+/* Try to free @node, which must already be marked JNODE_RIP (asserted).
+ * Returns -EBUSY if a page is still attached, the non-zero jnode_is_busy()
+ * value if the node is still referenced, and 0 after the node has been
+ * removed and freed. On failure JNODE_RIP is cleared again. */
+reiser4_internal int
+jnode_try_drop(jnode * node)
+{
+	int result;
+	reiser4_tree *tree;
+	jnode_type jtype;
+
+	trace_stamp(TRACE_ZNODES);
+	assert("nikita-2491", node != NULL);
+	assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
+
+	ON_TRACE(TRACE_PCACHE, "trying to drop node: %p\n", node);
+
+	tree = jnode_get_tree(node);
+	jtype = jnode_get_type(node);
+
+	LOCK_JNODE(node);
+	WLOCK_TREE(tree);
+	if (jnode_page(node) != NULL) {
+		UNLOCK_JNODE(node);
+		WUNLOCK_TREE(tree);
+		JF_CLR(node, JNODE_RIP);
+		return RETERR(-EBUSY);
+	}
+
+	result = jnode_is_busy(node, jtype);
+	if (result == 0) {
+		assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
+		assert("nikita-3223", !JF_ISSET(node, JNODE_EFLUSH));
+		assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
+
+		UNLOCK_JNODE(node);
+		/* no page and no references---despatch him. */
+		jnode_remove(node, jtype, tree);
+		WUNLOCK_TREE(tree);
+		jnode_free(node, jtype);
+	} else {
+		WUNLOCK_TREE(tree);
+		UNLOCK_JNODE(node);
+		JF_CLR(node, JNODE_RIP);
+	}
+	return result;
+}
+
+/* jdelete() -- Remove jnode from the tree. The node must be marked JNODE_RIP
+ * and (on the success path) JNODE_HEARD_BANSHEE. Returns 0 when the node was
+ * deleted and freed, or the non-zero jnode_is_busy() value when it is still
+ * referenced; in that case JNODE_RIP is cleared and the page, if any, is
+ * unlocked. */
+reiser4_internal int
+jdelete(jnode * node /* jnode to finish with */)
+{
+	struct page *page;
+	int result;
+	reiser4_tree *tree;
+	jnode_type jtype;
+
+	trace_stamp(TRACE_ZNODES);
+	assert("nikita-467", node != NULL);
+	assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
+	/* jnode cannot be eflushed at this point, because emergency flush
+	 * acquired additional reference counter. */
+	assert("nikita-2917", !JF_ISSET(node, JNODE_EFLUSH));
+
+	ON_TRACE(TRACE_PCACHE, "delete node: %p\n", node);
+
+	jtype = jnode_get_type(node);
+
+	page = jnode_lock_page(node);
+	assert("nikita-2402", spin_jnode_is_locked(node));
+
+	tree = jnode_get_tree(node);
+
+	WLOCK_TREE(tree);
+	result = jnode_is_busy(node, jtype);
+	if (likely(!result)) {
+		assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
+		assert("jmacd-511", atomic_read(&node->d_count) == 0);
+
+		/* detach page */
+		if (page != NULL) {
+			/*
+			 * FIXME this is racy against jnode_extent_write().
+			 */
+			page_clear_jnode(page, node);
+		}
+		UNLOCK_JNODE(node);
+		/* goodbye */
+		jnode_delete(node, jtype, tree);
+		WUNLOCK_TREE(tree);
+		jnode_free(node, jtype);
+		/* @node is no longer valid pointer */
+		if (page != NULL)
+			drop_page(page);
+	} else {
+		JF_CLR(node, JNODE_RIP);
+		WUNLOCK_TREE(tree);
+		UNLOCK_JNODE(node);
+		if (page != NULL)
+			unlock_page(page);
+	}
+	return result;
+}
+
+/* drop jnode on the floor.
+ + Return value: + + -EBUSY: failed to drop jnode, because there are still references to it + + 0: successfully dropped jnode + +*/ +static int +jdrop_in_tree(jnode * node, reiser4_tree * tree) +{ + struct page *page; + jnode_type jtype; + int result; + + assert("zam-602", node != NULL); + assert("nikita-2362", rw_tree_is_not_locked(tree)); + assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE)); + // assert( "nikita-2532", JF_ISSET( node, JNODE_RIP ) ); + + ON_TRACE(TRACE_PCACHE, "drop node: %p\n", node); + + jtype = jnode_get_type(node); + + page = jnode_lock_page(node); + assert("nikita-2405", spin_jnode_is_locked(node)); + + WLOCK_TREE(tree); + + result = jnode_is_busy(node, jtype); + if (!result) { + assert("nikita-2488", page == jnode_page(node)); + assert("nikita-2533", atomic_read(&node->d_count) == 0); + if (page != NULL) { + assert("nikita-2126", !PageDirty(page)); + assert("nikita-2127", PageUptodate(page)); + assert("nikita-2181", PageLocked(page)); + page_clear_jnode(page, node); + } + UNLOCK_JNODE(node); + jnode_remove(node, jtype, tree); + WUNLOCK_TREE(tree); + jnode_free(node, jtype); + if (page != NULL) { + drop_page(page); + } + } else { + JF_CLR(node, JNODE_RIP); + WUNLOCK_TREE(tree); + UNLOCK_JNODE(node); + if (page != NULL) + unlock_page(page); + } + return result; +} + +/* This function frees jnode "if possible". In particular, [dcx]_count has to + be 0 (where applicable). */ +reiser4_internal void +jdrop(jnode * node) +{ + jdrop_in_tree(node, jnode_get_tree(node)); +} + + +/* IO head jnode implementation; The io heads are simple j-nodes with limited + functionality (these j-nodes are not in any hash table) just for reading + from and writing to disk. 
*/ + +reiser4_internal jnode * +alloc_io_head(const reiser4_block_nr * block) +{ + jnode *jal = jalloc(); + + if (jal != NULL) { + jnode_init(jal, current_tree, JNODE_IO_HEAD); + jnode_set_block(jal, block); + } + + jref(jal); + + return jal; +} + +reiser4_internal void +drop_io_head(jnode * node) +{ + assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD); + + jput(node); + jdrop(node); +} + +/* protect keep jnode data from reiser4_releasepage() */ +reiser4_internal void +pin_jnode_data(jnode * node) +{ + assert("zam-671", jnode_page(node) != NULL); + page_cache_get(jnode_page(node)); +} + +/* make jnode data free-able again */ +reiser4_internal void +unpin_jnode_data(jnode * node) +{ + assert("zam-672", jnode_page(node) != NULL); + page_cache_release(jnode_page(node)); +} + +reiser4_internal struct address_space * +jnode_get_mapping(const jnode * node) +{ + assert("nikita-3162", node != NULL); + return jnode_ops(node)->mapping(node); +} + +#if 1 || REISER4_DEBUG +/* debugging aid: jnode invariant */ +reiser4_internal int +jnode_invariant_f(const jnode * node, + char const **msg) +{ +#define _ergo(ant, con) \ + ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con))) +#define _check(exp) ((*msg) = #exp, (exp)) + + return + _check(node != NULL) && + + /* [jnode-queued] */ + + /* only relocated node can be queued, except that when znode + * is being deleted, its JNODE_RELOC bit is cleared */ + _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED), + JF_ISSET(node, JNODE_RELOC) || + JF_ISSET(node, JNODE_HEARD_BANSHEE)) && + +#if REISER4_DEBUG + _check(node->jnodes.prev != NULL) && + _check(node->jnodes.next != NULL) && +#endif + + /* [jnode-dirty] invariant */ + + /* dirty inode is part of atom */ + _ergo(jnode_is_dirty(node), node->atom != NULL) && + + /* [jnode-oid] invariant */ + + /* for unformatted node ->objectid and ->mapping fields are + * consistent */ + _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL, + node->key.j.objectid == 
get_inode_oid(node->key.j.mapping->host)) && + /* [jnode-atom-valid] invariant */ + + /* node atom has valid state */ + _ergo(node->atom != NULL, + node->atom->stage != ASTAGE_INVALID) && + + /* [jnode-page-binding] invariant */ + + /* if node points to page, it points back to node */ + _ergo(node->pg != NULL, jprivate(node->pg) == node) && + + /* [jnode-refs] invariant */ + + /* only referenced jnode can be loaded */ + _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count)); + +} + +/* debugging aid: check znode invariant and panic if it doesn't hold */ +int +jnode_invariant(const jnode * node, int tlocked, int jlocked) +{ + char const *failed_msg; + int result; + reiser4_tree *tree; + + tree = jnode_get_tree(node); + + assert("umka-063312", node != NULL); + assert("umka-064321", tree != NULL); + + if (!jlocked && !tlocked) + LOCK_JNODE((jnode *) node); + if (!tlocked) + RLOCK_TREE(jnode_get_tree(node)); + result = jnode_invariant_f(node, &failed_msg); + if (!result) { + info_jnode("corrupted node", node); + warning("jmacd-555", "Condition %s failed", failed_msg); + } + if (!tlocked) + RUNLOCK_TREE(jnode_get_tree(node)); + if (!jlocked && !tlocked) + UNLOCK_JNODE((jnode *) node); + return result; +} + +/* REISER4_DEBUG */ +#endif + +#if REISER4_STATS +void reiser4_stat_inc_at_level_jput(const jnode * node) +{ + reiser4_stat_inc_at_level(jnode_get_level(node), jnode.jput); +} + +void reiser4_stat_inc_at_level_jputlast(const jnode * node) +{ + reiser4_stat_inc_at_level(jnode_get_level(node), jnode.jputlast); +} +/* REISER4_STATS */ +#endif + +#if REISER4_DEBUG_OUTPUT + +reiser4_internal const char * +jnode_type_name(jnode_type type) +{ + switch (type) { + case JNODE_UNFORMATTED_BLOCK: + return "unformatted"; + case JNODE_FORMATTED_BLOCK: + return "formatted"; + case JNODE_BITMAP: + return "bitmap"; + case JNODE_IO_HEAD: + return "io head"; + case JNODE_INODE: + return "inode"; + case LAST_JNODE_TYPE: + return "last"; + default:{ + static char unknown[30]; 
+ + sprintf(unknown, "unknown %i", type); + return unknown; + } + } +} + +#define jnode_state_name( node, flag ) \ + ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" ) + +/* debugging aid: output human readable information about @node */ +reiser4_internal void +info_jnode(const char *prefix /* prefix to print */ , + const jnode * node /* node to print */ ) +{ + assert("umka-068", prefix != NULL); + + if (node == NULL) { + printk("%s: null\n", prefix); + return; + } + + printk("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i," + " block: %s, d_count: %d, x_count: %d, " + "pg: %p, atom: %p, lock: %i:%i, type: %s, ", + prefix, node, node->state, + jnode_state_name(node, JNODE_PARSED), + jnode_state_name(node, JNODE_HEARD_BANSHEE), + jnode_state_name(node, JNODE_LEFT_CONNECTED), + jnode_state_name(node, JNODE_RIGHT_CONNECTED), + jnode_state_name(node, JNODE_ORPHAN), + jnode_state_name(node, JNODE_CREATED), + jnode_state_name(node, JNODE_RELOC), + jnode_state_name(node, JNODE_OVRWR), + jnode_state_name(node, JNODE_DIRTY), + jnode_state_name(node, JNODE_IS_DYING), + jnode_state_name(node, JNODE_EFLUSH), + jnode_state_name(node, JNODE_FLUSH_QUEUED), + jnode_state_name(node, JNODE_RIP), + jnode_state_name(node, JNODE_MISSED_IN_CAPTURE), + jnode_state_name(node, JNODE_WRITEBACK), + jnode_state_name(node, JNODE_NEW), + jnode_state_name(node, JNODE_DKSET), + jnode_state_name(node, JNODE_EPROTECTED), + jnode_state_name(node, JNODE_REPACK), + jnode_state_name(node, JNODE_CLUSTER_PAGE), + jnode_get_level(node), sprint_address(jnode_get_block(node)), + atomic_read(&node->d_count), atomic_read(&node->x_count), + jnode_page(node), node->atom, +#if REISER4_LOCKPROF && REISER4_LOCKPROF_OBJECTS + node->guard.held, node->guard.trying, +#else + 0, 0, +#endif + jnode_type_name(jnode_get_type(node))); + if (jnode_is_unformatted(node)) { + printk("inode: %llu, index: %lu, ", + node->key.j.objectid, node->key.j.index); + } +} + +/* debugging aid: output 
human readable information about @node */
+reiser4_internal void
+print_jnode(const char *prefix /* prefix to print */ ,
+	    const jnode * node /* node to print */)
+{	/* znodes are printed by print_znode(), everything else by
+	 * info_jnode() */
+	if (jnode_is_znode(node))
+		print_znode(prefix, JZNODE(node));
+	else
+		info_jnode(prefix, node);
+}
+
+/* this is cut-n-paste replica of print_znodes(): print every jnode in the
+ * jnode hash table of @tree (or of the current tree when @tree is NULL), one
+ * per line, each prefixed with @prefix */
+reiser4_internal void
+print_jnodes(const char *prefix, reiser4_tree * tree)
+{
+	jnode *node;
+	jnode *next;
+	j_hash_table *htable;
+	int tree_lock_taken;
+
+	if (tree == NULL)
+		tree = current_tree;
+
+	/* this is a debugging function. It can be called by reiser4_panic()
+	   with tree spin-lock already held. Trylock is not exactly what we
+	   want here, but it is passable. The table is walked even when the
+	   trylock fails; only a lock that was actually taken is released.
+	*/
+	tree_lock_taken = write_trylock_tree(tree);
+	htable = &tree->jhash_table;
+
+	for_all_in_htable(htable, j, node, next) {
+		info_jnode(prefix, node);
+		printk("\n");
+	}
+	if (tree_lock_taken)
+		WUNLOCK_TREE(tree);
+}
+
+/* REISER4_DEBUG_OUTPUT */
+#endif
+
+/* this is only used to create a jnode during capture copy. Calls the
+ * ->clone() method of @node's jnode plugin (which must be non-NULL) and
+ * passes an ERR_PTR result straight back to the caller. On success the clone
+ * gets one reference and is marked JNODE_HEARD_BANSHEE and JNODE_CC. */
+reiser4_internal jnode *jclone(jnode *node)
+{
+	jnode *clone;
+
+	assert("vs-1429", jnode_ops(node)->clone);
+	clone = jnode_ops(node)->clone(node);
+	if (IS_ERR(clone))
+		return clone;
+
+	jref(clone);
+	JF_SET(clone, JNODE_HEARD_BANSHEE);
+	JF_SET(clone, JNODE_CC);
+	return clone;
+}
+
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/jnode.h linux-2.6.4-ck1/fs/reiser4/jnode.h
--- linux-2.6.4/fs/reiser4/jnode.h 1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/jnode.h 2004-03-11 22:45:15.252515961 +1100
@@ -0,0 +1,770 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Declaration of jnode.
*/ + +#ifndef __JNODE_H__ +#define __JNODE_H__ + +#include "forward.h" +#include "type_safe_hash.h" +#include "type_safe_list.h" +#include "txnmgr.h" +#include "key.h" +#include "debug.h" +#include "dformat.h" +#include "spin_macros.h" +#include "emergency_flush.h" + +#include "plugin/plugin.h" + +#include +#include +#include +#include +#include +#include +#include + +/* declare hash table of jnodes (jnodes proper, that is, unformatted + nodes) */ +TYPE_SAFE_HASH_DECLARE(j, jnode); + +/* declare hash table of znodes */ +TYPE_SAFE_HASH_DECLARE(z, znode); + +typedef struct { + __u64 objectid; + unsigned long index; + struct address_space *mapping; +} jnode_key_t; + +/* + Jnode is the "base class" of other nodes in reiser4. It is also happens to + be exactly the node we use for unformatted tree nodes. + + Jnode provides following basic functionality: + + . reference counting and indexing. + + . integration with page cache. Jnode has ->pg reference to which page can + be attached. + + . interface to transaction manager. It is jnode that is kept in transaction + manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this + means, there should be special type of jnode for inode.) + + Locking: + + Spin lock: the following fields are protected by the per-jnode spin lock: + + ->state + ->atom + ->capture_link + + Following fields are protected by the global tree lock: + + ->link + ->key.z (content of ->key.z is only changed in znode_rehash()) + ->key.j + + Atomic counters + + ->x_count + ->d_count + + ->pg, and ->data are protected by spin lock for unused jnode and are + immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable() + is false). + + ->tree is immutable after creation + + Unclear + + ->blocknr: should be under jnode spin-lock, but current interface is based + on passing of block address. + + If you ever need to spin lock two nodes at once, do this in "natural" + memory order: lock znode with lower address first. 
(See + spin_lock_znode_pair() and spin_lock_znode_triple() functions, NOTE-NIKITA + TDB) + + Invariants involving this data-type: + + [jnode-dirty] + [jnode-refs] + [jnode-oid] + [jnode-queued] + [jnode-atom-valid] + [jnode-page-binding] +*/ + +struct jnode { +#if REISER4_DEBUG +#define JMAGIC 0x52654973 /* "ReIs" */ + int magic; +#endif + /* FIRST CACHE LINE (16 bytes): data used by jload */ + + /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */ + /* 0 */ unsigned long state; + + /* lock, protecting jnode's fields. */ + /* 4 */ reiser4_spin_data load; + + /* counter of references to jnode itself. Increased on jref(). + Decreased on jput(). + */ + /* 8 */ atomic_t x_count; + + /* counter of references to jnode's data. Pin data page(s) in + memory while this is greater than 0. Increased on jload(). + Decreased on jrelse(). + */ + /* 12 */ atomic_t d_count; + + /* SECOND CACHE LINE: data used by hash table lookups */ + + /* 16 */ union { + /* znodes are hashed by block number */ + reiser4_block_nr z; + /* unformatted nodes are hashed by mapping plus offset */ + jnode_key_t j; + } key; + + /* THIRD CACHE LINE */ + + /* 32 */ union { + /* pointers to maintain hash-table */ + z_hash_link z; + j_hash_link j; + } link; + + /* pointer to jnode page. */ + /* 36 */ struct page *pg; + /* pointer to node itself. This is page_address(node->pg) when page is + attached to the jnode + */ + /* 40 */ void *data; + + /* 44 */ reiser4_tree *tree; + + /* FORTH CACHE LINE: atom related fields */ + + /* 48 */ reiser4_spin_data guard; + + /* atom the block is in, if any */ + /* 52 */ txn_atom *atom; + + /* capture list */ + /* 56 */ capture_list_link capture_link; + + /* FIFTH CACHE LINE */ + + /* 64 */ struct rcu_head rcu; /* crosses cache line */ + + /* SIXTH CACHE LINE */ + + /* the real blocknr (where io is going to/from) */ + /* 80 */ reiser4_block_nr blocknr; + /* Parent item type, unformatted and CRC need it for offset => key conversion. 
*/ + /* NOTE: this parent_item_id looks like jnode type. */ + /* 88 */ reiser4_plugin_id parent_item_id; + /* 92 */ +#if REISER4_DEBUG + /* list of all jnodes for debugging purposes. */ + struct list_head jnodes; + /* how many times this jnode was written in one transaction */ + int written; + /* this indicates which atom's list the jnode is on */ + atom_list list; +#endif +} __attribute__((aligned(16))); + +typedef enum { + JNODE_UNFORMATTED_BLOCK, + JNODE_FORMATTED_BLOCK, + JNODE_BITMAP, + JNODE_IO_HEAD, + JNODE_INODE, + LAST_JNODE_TYPE +} jnode_type; + +TYPE_SAFE_LIST_DEFINE(capture, jnode, capture_link); + +typedef enum { + /* jnode's page is loaded and data checked */ + JNODE_PARSED = 0, + /* node was deleted, not all locks on it were released. This + node is empty and is going to be removed from the tree + shortly. */ + JNODE_HEARD_BANSHEE = 1, + /* left sibling pointer is valid */ + JNODE_LEFT_CONNECTED = 2, + /* right sibling pointer is valid */ + JNODE_RIGHT_CONNECTED = 3, + + /* znode was just created and doesn't yet have a pointer from + its parent */ + JNODE_ORPHAN = 4, + + /* this node was created by its transaction and has not been assigned + a block address. */ + JNODE_CREATED = 5, + + /* this node is currently relocated */ + JNODE_RELOC = 6, + /* this node is currently wandered */ + JNODE_OVRWR = 7, + + /* this znode has been modified */ + JNODE_DIRTY = 8, + + /* znode lock is being invalidated */ + JNODE_IS_DYING = 9, + + JNODE_EFLUSH = 11, + + /* jnode is queued for flushing. */ + JNODE_FLUSH_QUEUED = 12, + + /* In the following bits jnode type is encoded. 
*/ + JNODE_TYPE_1 = 13, + JNODE_TYPE_2 = 14, + JNODE_TYPE_3 = 15, + + /* jnode is being destroyed */ + JNODE_RIP = 16, + + /* znode was not captured during locking (it might so be because + ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */ + JNODE_MISSED_IN_CAPTURE = 17, + + /* write is in progress */ + JNODE_WRITEBACK = 18, + + /* FIXME: now it is used by crypto-compress plugin only */ + JNODE_NEW = 19, + + JNODE_DKSET = 20, + /* if page was dirtied through mmap, we don't want to lose data, even + * though page and jnode may be clean. Mark jnode with JNODE_KEEPME so + * that ->releasepage() can tell. As this is used only for + * unformatted, we can share bit with DKSET which is only meaningful + * for formatted. */ + JNODE_KEEPME = 20, + + /* cheap and effective protection of jnode from emergency flush. This + * bit can only be set by thread that holds long term lock on jnode + * parent node (twig node, where extent unit lives). */ + JNODE_EPROTECTED = 21, + JNODE_CLUSTER_PAGE = 22, + /* Jnode is marked for repacking, that means the reiser4 flush and the + * block allocator should process this node special way */ + JNODE_REPACK = 23, + /* enable node squeezing */ + JNODE_SQUEEZABLE = 24, + + JNODE_SCANNED = 25, + JNODE_JLOADED_BY_GET_OVERWRITE_SET = 26, + /* capture copy jnode */ + JNODE_CC = 27, + /* this jnode is copy of coced original */ + JNODE_CCED = 28, + /* + * When jnode is dirtied for the first time in given transaction, + * do_jnode_make_dirty() checks whether this jnode can possible became + * member of overwrite set. If so, this bit is set, and one block is + * reserved in the ->flush_reserved space of atom. + * + * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when + * + * (1) flush decides that we want this block to go into relocate + * set after all. 
+ * + * (2) wandering log is allocated (by log writer) + * + * (3) extent is allocated + * + */ + JNODE_FLUSH_RESERVED = 29 +} reiser4_jnode_state; + +/* Macros for accessing the jnode state. */ +static inline void +JF_CLR(jnode * j, int f) +{ + assert("unknown-1", j->magic == JMAGIC); + clear_bit(f, &j->state); +} +static inline int +JF_ISSET(const jnode * j, int f) +{ + assert("unknown-2", j->magic == JMAGIC); + return test_bit(f, &((jnode *) j)->state); +} +static inline void +JF_SET(jnode * j, int f) +{ + assert("unknown-3", j->magic == JMAGIC); + set_bit(f, &j->state); +} + +static inline int +JF_TEST_AND_SET(jnode * j, int f) +{ + assert("unknown-4", j->magic == JMAGIC); + return test_and_set_bit(f, &j->state); +} + +/* ordering constraint for znode spin lock: znode lock is weaker than + tree lock and dk lock */ +#define spin_ordering_pred_jnode( node ) \ + ( ( lock_counters() -> rw_locked_tree == 0 ) && \ + ( lock_counters() -> spin_locked_txnh == 0 ) && \ + ( lock_counters() -> rw_locked_zlock == 0 ) && \ + ( lock_counters() -> rw_locked_dk == 0 ) && \ + /* \ + in addition you cannot hold more than one jnode spin lock at a \ + time. \ + */ \ + ( lock_counters() -> spin_locked_jnode < 2 ) ) + +/* Define spin_lock_jnode, spin_unlock_jnode, and spin_jnode_is_locked. + Take and release short-term spinlocks. Don't hold these across + io. 
+*/ +SPIN_LOCK_FUNCTIONS(jnode, jnode, guard); + +#define spin_ordering_pred_jload(node) (1) + +SPIN_LOCK_FUNCTIONS(jload, jnode, load); + +static inline int +jnode_is_in_deleteset(const jnode * node) +{ + return JF_ISSET(node, JNODE_RELOC); +} + + +extern int jnode_init_static(void); +extern int jnode_done_static(void); + +/* Jnode routines */ +extern jnode *jalloc(void); +extern void jfree(jnode * node) NONNULL; +extern jnode *jnew_unformatted(void); +extern jnode *jclone(jnode *); +extern jnode *jlookup(reiser4_tree * tree, + oid_t objectid, unsigned long ind) NONNULL; +extern jnode *jnode_by_page(struct page *pg) NONNULL; +extern jnode *jnode_of_page(struct page *pg) NONNULL; +void jnode_attach_page(jnode * node, struct page *pg); +jnode *find_get_jnode(reiser4_tree * tree, + struct address_space *mapping, oid_t oid, + unsigned long index); + +void unhash_unformatted_jnode(jnode *); +struct page *jnode_get_page_locked(jnode *, int gfp_flags); +extern jnode *page_next_jnode(jnode * node) NONNULL; +extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL; +extern void jnode_make_dirty(jnode * node) NONNULL; +extern void jnode_make_clean(jnode * node) NONNULL; +extern void jnode_make_wander_nolock(jnode * node) NONNULL; +extern void jnode_make_wander(jnode*) NONNULL; +extern void znode_make_reloc(znode*, flush_queue_t*) NONNULL; +extern void unformatted_make_reloc(jnode*, flush_queue_t*) NONNULL; + +extern void jnode_set_block(jnode * node, + const reiser4_block_nr * blocknr) NONNULL; +extern struct page *jnode_lock_page(jnode *) NONNULL; +extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL; + +/* block number of node */ +static inline const reiser4_block_nr * +jnode_get_block(const jnode * node /* jnode to query */) +{ + assert("nikita-528", node != NULL); + + return &node->blocknr; +} + +/* block number for IO. 
Usually this is the same as jnode_get_block(), unless + * jnode was emergency flushed---then block number chosen by eflush is + * used. */ +static inline const reiser4_block_nr * +jnode_get_io_block(const jnode * node) +{ + assert("nikita-2768", node != NULL); + assert("nikita-2769", spin_jnode_is_locked(node)); + + if (unlikely(JF_ISSET(node, JNODE_EFLUSH))) + return eflush_get(node); + else + return jnode_get_block(node); +} + +/* Jnode flush interface. */ +extern reiser4_blocknr_hint *pos_hint(flush_pos_t * pos); +extern int pos_leaf_relocate(flush_pos_t * pos); +extern flush_queue_t * pos_fq(flush_pos_t * pos); + +/* FIXME-VS: these are used in plugin/item/extent.c */ + +/* does extent_get_block have to be called */ +#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED) +#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED) +/* pointer to this block was just created (either by appending or by plugging a + hole), or zinit_new was called */ +#define jnode_created(node) JF_ISSET (node, JNODE_CREATED) +#define jnode_set_created(node) JF_SET (node, JNODE_CREATED) + +/* the node should be squeezed during flush squalloc phase */ +#define jnode_squeezable(node) JF_ISSET (node, JNODE_SQUEEZABLE) +#define jnode_set_squeezable(node) JF_SET (node, JNODE_SQUEEZABLE) + +/* Macros to convert from jnode to znode, znode to jnode. These are macros + because C doesn't allow overloading of const prototypes. 
*/ +#define ZJNODE(x) (& (x) -> zjnode) +#define JZNODE(x) \ +({ \ + typeof (x) __tmp_x; \ + \ + __tmp_x = (x); \ + assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \ + (znode*) __tmp_x; \ +}) + +extern int jnodes_tree_init(reiser4_tree * tree); +extern int jnodes_tree_done(reiser4_tree * tree); + +#if REISER4_DEBUG +extern int znode_is_any_locked(const znode * node); +extern int jnode_invariant(const jnode * node, int tlocked, int jlocked); +extern void jnode_list_remove(jnode * node); +#else +#define jnode_list_remove(node) noop +#endif + +#if REISER4_DEBUG_OUTPUT +extern void info_jnode(const char *prefix, const jnode * node); +extern void print_jnode(const char *prefix, const jnode * node); +extern void print_jnodes(const char *prefix, reiser4_tree * tree); +#else +#define info_jnode(p, n) noop +#define print_jnodes(p, t) noop +#define print_jnode(p, n) noop +#endif + +int znode_is_root(const znode * node) NONNULL; + +/* bump reference counter on @node */ +static inline void +add_x_ref(jnode * node /* node to increase x_count of */ ) +{ + assert("nikita-1911", node != NULL); + + atomic_inc(&node->x_count); + LOCK_CNT_INC(x_refs); +} + +static inline void +dec_x_ref(jnode * node) +{ + assert("nikita-3215", node != NULL); + assert("nikita-3216", atomic_read(&node->x_count) > 0); + + atomic_dec(&node->x_count); + assert("nikita-3217", LOCK_CNT_GTZ(x_refs)); + LOCK_CNT_DEC(x_refs); +} + +/* jref() - increase counter of references to jnode/znode (x_count) */ +static inline jnode * +jref(jnode * node) +{ + assert("jmacd-508", (node != NULL) && !IS_ERR(node)); + add_x_ref(node); + return node; +} + +extern int jdelete(jnode * node) NONNULL; + +/* get the page of jnode */ +static inline struct page * +jnode_page(const jnode * node) +{ + return node->pg; +} + +/* return pointer to jnode data */ +static inline char * +jdata(const jnode * node) +{ + assert("nikita-1415", node != NULL); + assert("nikita-3198", jnode_page(node) != NULL); + return node->data; +} + +static 
inline int +jnode_is_loaded(const jnode * node) +{ + assert("zam-506", node != NULL); + return atomic_read(&node->d_count) > 0; +} + +extern void page_detach_jnode(struct page *page, + struct address_space *mapping, + unsigned long index) NONNULL; +extern void page_clear_jnode(struct page *page, jnode * node) NONNULL; + +static inline void +jnode_set_reloc(jnode * node) +{ + assert("nikita-2431", node != NULL); + assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR)); + JF_SET(node, JNODE_RELOC); +} + +/* bump data counter on @node */ +static inline void add_d_ref(jnode * node /* node to increase d_count of */ ) +{ + assert("nikita-1962", node != NULL); + + atomic_inc(&node->d_count); + LOCK_CNT_INC(d_refs); +} + + +/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */ + +extern int jload_gfp(jnode * node, int gfp, int do_kmap) NONNULL; + +static inline int jload(jnode * node) +{ + return jload_gfp(node, GFP_KERNEL, 1); +} + +extern int jinit_new(jnode * node, int gfp_flags) NONNULL; +extern int jstartio(jnode * node) NONNULL; + +extern void jdrop(jnode * node) NONNULL; +extern int jwait_io(jnode * node, int rw) NONNULL; + +extern void jload_prefetch(const jnode * node); + +extern jnode *alloc_io_head(const reiser4_block_nr * block) NONNULL; +extern void drop_io_head(jnode * node) NONNULL; + +static inline reiser4_tree * +jnode_get_tree(const jnode * node) +{ + assert("nikita-2691", node != NULL); + return node->tree; +} + +extern void pin_jnode_data(jnode *); +extern void unpin_jnode_data(jnode *); + +static inline jnode_type +jnode_get_type(const jnode * node) +{ + static const unsigned long state_mask = + (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3); + + static jnode_type mask_to_type[] = { + /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */ + + /* 000 */ + [0] = JNODE_FORMATTED_BLOCK, + /* 001 */ + [1] = JNODE_UNFORMATTED_BLOCK, + /* 010 */ + [2] = JNODE_BITMAP, + /* 011 */ + [3] = LAST_JNODE_TYPE, /*invalid */ + /* 100 
*/ + [4] = JNODE_INODE, + /* 101 */ + [5] = LAST_JNODE_TYPE, + /* 110 */ + [6] = JNODE_IO_HEAD, + /* 111 */ + [7] = LAST_JNODE_TYPE, /* invalid */ + }; + + return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1]; +} + +/* returns true if node is a znode */ +static inline int +jnode_is_znode(const jnode * node) +{ + return jnode_get_type(node) == JNODE_FORMATTED_BLOCK; +} + +/* return true if "node" is dirty */ +static inline int +jnode_is_dirty(const jnode * node) +{ + assert("nikita-782", node != NULL); + assert("jmacd-1800", spin_jnode_is_locked(node) || (jnode_is_znode(node) && znode_is_any_locked(JZNODE(node)))); + return JF_ISSET(node, JNODE_DIRTY); +} + +/* return true if "node" is dirty, node is unlocked */ +static inline int +jnode_check_dirty(jnode * node) +{ + assert("jmacd-7798", node != NULL); + assert("jmacd-7799", spin_jnode_is_not_locked(node)); + return UNDER_SPIN(jnode, node, jnode_is_dirty(node)); +} + +static inline int +jnode_is_flushprepped(const jnode * node) +{ + assert("jmacd-78212", node != NULL); + assert("jmacd-71276", spin_jnode_is_locked(node)); + return !jnode_is_dirty(node) || JF_ISSET(node, JNODE_RELOC) + || JF_ISSET(node, JNODE_OVRWR); +} + +/* Return true if @node has already been processed by the squeeze and allocate + process. This implies the block address has been finalized for the + duration of this atom (or it is clean and will remain in place). If this + returns true you may use the block number as a hint. */ +static inline int +jnode_check_flushprepped(jnode * node) +{ + /* It must be clean or relocated or wandered. New allocations are set to relocate. 
*/ + assert("jmacd-71275", spin_jnode_is_not_locked(node)); + return UNDER_SPIN(jnode, node, jnode_is_flushprepped(node)); +} + +/* returns true if node is unformatted */ +static inline int +jnode_is_unformatted(const jnode * node) +{ + assert("jmacd-0123", node != NULL); + return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK; +} + +/* returns true if node represents a cluster cache page */ +static inline int +jnode_is_cluster_page(const jnode * node) +{ + assert("edward-50", node != NULL); + return (JF_ISSET(node, JNODE_CLUSTER_PAGE)); +} + +/* returns true is node is builtin inode's jnode */ +static inline int +jnode_is_inode(const jnode * node) +{ + assert("vs-1240", node != NULL); + return jnode_get_type(node) == JNODE_INODE; +} + +static inline jnode_plugin * +jnode_ops_of(const jnode_type type) +{ + assert("nikita-2367", type < LAST_JNODE_TYPE); + return jnode_plugin_by_id((reiser4_plugin_id) type); +} + +static inline jnode_plugin * +jnode_ops(const jnode * node) +{ + assert("nikita-2366", node != NULL); + + return jnode_ops_of(jnode_get_type(node)); +} + +/* Get the index of a block. */ +static inline unsigned long +jnode_get_index(jnode * node) +{ + return jnode_ops(node)->index(node); +} + +/* return true if "node" is the root */ +static inline int +jnode_is_root(const jnode * node) +{ + return jnode_is_znode(node) && znode_is_root(JZNODE(node)); +} + +extern struct address_space * mapping_jnode(const jnode * node); +extern unsigned long index_jnode(const jnode * node); + +extern int jnode_try_drop(jnode * node); + +static inline void jput(jnode * node); +extern void jput_final(jnode * node); + +#if REISER4_STATS +extern void reiser4_stat_inc_at_level_jput(const jnode * node); +extern void reiser4_stat_inc_at_level_jputlast(const jnode * node); +#else +#define reiser4_stat_inc_at_level_jput(node) noop +#define reiser4_stat_inc_at_level_jputlast(node) noop +#endif + +/* jput() - decrement x_count reference counter on znode. 
+ + Count may drop to 0, jnode stays in cache until memory pressure causes the + eviction of its page. The c_count variable also ensures that children are + pressured out of memory before the parent. The jnode remains hashed as + long as the VM allows its page to stay in memory. +*/ +static inline void +jput(jnode * node) +{ + trace_stamp(TRACE_ZNODES); + + assert("jmacd-509", node != NULL); + assert("jmacd-510", atomic_read(&node->x_count) > 0); + assert("nikita-3065", spin_jnode_is_not_locked(node)); + assert("zam-926", schedulable()); + LOCK_CNT_DEC(x_refs); + + reiser4_stat_inc_at_level_jput(node); + rcu_read_lock(); + /* + * we don't need any kind of lock here--jput_final() uses RCU. + */ + if (unlikely(atomic_dec_and_test(&node->x_count))) { + reiser4_stat_inc_at_level_jputlast(node); + jput_final(node); + } else + rcu_read_unlock(); +} + +extern void jrelse(jnode * node); +extern void jrelse_tail(jnode * node); + +extern jnode *jnode_rip_sync(reiser4_tree *t, jnode * node); + +/* resolve race with jput */ +static inline jnode * +jnode_rip_check(reiser4_tree *tree, jnode * node) +{ + if (unlikely(JF_ISSET(node, JNODE_RIP))) + node = jnode_rip_sync(tree, node); + return node; +} + +extern reiser4_key * jnode_build_key(const jnode * node, reiser4_key * key); + +/* __JNODE_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/kassign.c linux-2.6.4-ck1/fs/reiser4/kassign.c --- linux-2.6.4/fs/reiser4/kassign.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/kassign.c 2004-03-11 22:45:15.254515650 +1100 @@ -0,0 +1,522 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Key assignment policy implementation */ + +#include "debug.h" +#include "key.h" +#include "kassign.h" +#include "vfs_ops.h" +#include "inode.h" +#include "super.h" + +#include <linux/types.h> /* for __u?? 
*/ +#include <linux/fs.h> /* for struct super_block, etc */ + +#if REISER4_LARGE_KEY +#define ORDERING_CHARS (sizeof(__u64) - 1) +#define OID_CHARS (sizeof(__u64)) +#else +#define ORDERING_CHARS (0) +#define OID_CHARS (sizeof(__u64) - 1) +#endif + +#define OFFSET_CHARS (sizeof(__u64)) + +#define INLINE_CHARS (ORDERING_CHARS + OID_CHARS) + +static const __u64 longname_mark = 0x0100000000000000ull; + +reiser4_internal int +is_longname_key(const reiser4_key *key) +{ + __u64 highpart; + + assert("nikita-2863", key != NULL); + assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR); + + if (REISER4_LARGE_KEY) + highpart = get_key_ordering(key); + else + highpart = get_key_objectid(key); + + return (highpart & longname_mark) ? 1 : 0; +} + +reiser4_internal int +is_longname(const char *name UNUSED_ARG, int len) +{ + return len > ORDERING_CHARS + OID_CHARS + OFFSET_CHARS; +} + +/* code ascii string into __u64. + + Put characters of @name into result (@str) one after another starting + from @start_idx-th highest (arithmetically) byte. This produces + endian-safe encoding. memcpy(2) will not do. + +*/ +static __u64 +pack_string(const char *name /* string to encode */ , + int start_idx /* highest byte in result from + * which to start encoding */ ) +{ + unsigned i; + __u64 str; + + str = 0; + for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) { + str <<= 8; + str |= (unsigned char) name[i]; + } + str <<= (sizeof str - i - start_idx) << 3; + return str; +} + +#if !REISER4_DEBUG_OUTPUT +static +#endif +/* opposite to pack_string(). 
Takes value produced by pack_string(), restores + * string encoded in it and stores result in @buf */ +reiser4_internal char * +unpack_string(__u64 value, char *buf) +{ + do { + *buf = value >> (64 - 8); + if (*buf) + ++ buf; + value <<= 8; + } while(value != 0); + *buf = 0; + return buf; +} + +/* obtain name encoded in @key and store it in @buf */ +reiser4_internal char * +extract_name_from_key(const reiser4_key *key, char *buf) +{ + char *c; + + assert("nikita-2868", !is_longname_key(key)); + + c = buf; + if (REISER4_LARGE_KEY) { + c = unpack_string(get_key_ordering(key) & ~longname_mark, c); + c = unpack_string(get_key_fulloid(key), c); + } else + c = unpack_string(get_key_fulloid(key) & ~longname_mark, c); + unpack_string(get_key_offset(key), c); + return buf; +} + +/* build key for directory entry. + ->build_entry_key() for directory plugin */ +reiser4_internal void +build_entry_key_common(const struct inode *dir /* directory where entry is + * (or will be) in.*/ , + const struct qstr *qname /* name of file referenced + * by this entry */ , + reiser4_key * result /* resulting key of directory + * entry */ ) +{ + __u64 ordering; + __u64 objectid; + __u64 offset; + const char *name; + int len; + + assert("nikita-1139", dir != NULL); + assert("nikita-1140", qname != NULL); + assert("nikita-1141", qname->name != NULL); + assert("nikita-1142", result != NULL); + + name = qname->name; + len = qname->len; + + assert("nikita-2867", strlen(name) == len); + + key_init(result); + /* locality of directory entry's key is objectid of parent + directory */ + set_key_locality(result, get_inode_oid(dir)); + /* minor packing locality is constant */ + set_key_type(result, KEY_FILE_NAME_MINOR); + /* dot is special case---we always want it to be first entry in + a directory. Actually, we just want to have smallest + directory entry. 
+ */ + if (len == 1 && name[0] == '.') + return; + + /* This is our brand new proposed key allocation algorithm for + directory entries: + + If name is shorter than 7 + 8 = 15 characters, put first 7 + characters into objectid field and remaining characters (if + any) into offset field. Dream long dreamt came true: file + name as a key! + + If file name is longer than 15 characters, put first 7 + characters into objectid and hash of remaining characters + into offset field. + + To distinguish above cases, in latter set up unused high bit + in objectid field. + + + With large keys (REISER4_LARGE_KEY) algorithm is updated + appropriately. + */ + + /* objectid of key is composed of seven first characters of + file's name. This imposes global ordering on directory + entries. + */ + if (REISER4_LARGE_KEY) { + ordering = pack_string(name, 1); + if (len > ORDERING_CHARS) + objectid = pack_string(name + ORDERING_CHARS, 0); + else + objectid = 0ull; + } else + objectid = pack_string(name, 1); + if (!is_longname(name, len)) { + if (len > INLINE_CHARS) + offset = pack_string(name + INLINE_CHARS, 0); + else + offset = 0ull; + } else { + /* note in a key the fact that offset contains hash. */ + if (REISER4_LARGE_KEY) + ordering |= longname_mark; + else + objectid |= longname_mark; + + /* offset is the hash of the file name. */ + offset = inode_hash_plugin(dir)->hash(name + INLINE_CHARS, + len - INLINE_CHARS); + } + + if (REISER4_LARGE_KEY) { + set_key_ordering(result, ordering); + set_key_fulloid(result, objectid); + } else { + /* objectid is 60 bits */ + assert("nikita-1405", !(objectid & ~KEY_OBJECTID_MASK)); + set_key_objectid(result, objectid); + } + set_key_offset(result, offset); + return; +} + +/* build key for directory entry. + ->build_entry_key() for directory plugin + + This is for directories where we want repeatable and restartable readdir() + even in case 32bit user level struct dirent (readdir(3)). 
+*/ +reiser4_internal void +build_entry_key_stable_entry(const struct inode *dir /* directory where + * entry is (or + * will be) in. */ , + const struct qstr *name /* name of file + * referenced by + * this entry */ , + reiser4_key * result /* resulting key of + * directory entry */ ) +{ + oid_t objectid; + + assert("nikita-2283", dir != NULL); + assert("nikita-2284", name != NULL); + assert("nikita-2285", name->name != NULL); + assert("nikita-2286", result != NULL); + + key_init(result); + /* locality of directory entry's key is objectid of parent + directory */ + set_key_locality(result, get_inode_oid(dir)); + /* minor packing locality is constant */ + set_key_type(result, KEY_FILE_NAME_MINOR); + /* dot is special case---we always want it to be first entry in + a directory. Actually, we just want to have smallest + directory entry. + */ + if ((name->len == 1) && (name->name[0] == '.')) + return; + + /* objectid of key is 31 lowest bits of hash. */ + objectid = inode_hash_plugin(dir)->hash(name->name, (int) name->len) & 0x7fffffff; + + assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK)); + set_key_objectid(result, objectid); + + /* offset is always 0. */ + set_key_offset(result, (__u64) 0); + return; +} + +/* build key to be used by ->readdir() method. + + See reiser4_readdir() for more detailed comment. 
+ Common implementation of dir plugin's method build_readdir_key +*/ +reiser4_internal int +build_readdir_key_common(struct file *dir /* directory being read */ , + reiser4_key * result /* where to store key */ ) +{ + reiser4_file_fsdata *fdata; + struct inode *inode; + + assert("nikita-1361", dir != NULL); + assert("nikita-1362", result != NULL); + assert("nikita-1363", dir->f_dentry != NULL); + inode = dir->f_dentry->d_inode; + assert("nikita-1373", inode != NULL); + + fdata = reiser4_get_file_fsdata(dir); + if (IS_ERR(fdata)) + return PTR_ERR(fdata); + assert("nikita-1364", fdata != NULL); + return extract_key_from_de_id(get_inode_oid(inode), &fdata->dir.readdir.position.dir_entry_key, result); + +} + +/* true, if @key is the key of "." */ +reiser4_internal int +is_dot_key(const reiser4_key * key /* key to check */ ) +{ + assert("nikita-1717", key != NULL); + assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR); + return + (get_key_ordering(key) == 0ull) && + (get_key_objectid(key) == 0ull) && + (get_key_offset(key) == 0ull); +} + +/* build key for stat-data. + + return key of stat-data of this object. This should become sd plugin + method in the future. For now, let it be here. + +*/ +reiser4_internal reiser4_key * +build_sd_key(const struct inode * target /* inode of an object */ , + reiser4_key * result /* resulting key of @target + stat-data */ ) +{ + assert("nikita-261", result != NULL); + + key_init(result); + set_key_locality(result, reiser4_inode_data(target)->locality_id); + set_key_ordering(result, get_inode_ordering(target)); + set_key_objectid(result, get_inode_oid(target)); + set_key_type(result, KEY_SD_MINOR); + set_key_offset(result, (__u64) 0); + return result; +} + +/* encode part of key into &obj_key_id + + This encodes into @id part of @key sufficient to restore @key later, + given that latter is key of object (key of stat-data). 
+ + See &obj_key_id +*/ +reiser4_internal int +build_obj_key_id(const reiser4_key * key /* key to encode */ , + obj_key_id * id /* id where key is encoded in */ ) +{ + assert("nikita-1151", key != NULL); + assert("nikita-1152", id != NULL); + + xmemcpy(id, key, sizeof *id); + return 0; +} + +/* encode reference to @obj in @id. + + This is like build_obj_key_id() above, but takes inode as parameter. */ +reiser4_internal int +build_inode_key_id(const struct inode *obj /* object to build key of */ , + obj_key_id * id /* result */ ) +{ + reiser4_key sdkey; + + assert("nikita-1166", obj != NULL); + assert("nikita-1167", id != NULL); + + build_sd_key(obj, &sdkey); + build_obj_key_id(&sdkey, id); + return 0; +} + +/* decode @id back into @key + + Restore key of object stat-data from @id. This is dual to + build_obj_key_id() above. +*/ +reiser4_internal int +extract_key_from_id(const obj_key_id * id /* object key id to extract key + * from */ , + reiser4_key * key /* result */ ) +{ + assert("nikita-1153", id != NULL); + assert("nikita-1154", key != NULL); + + key_init(key); + xmemcpy(key, id, sizeof *id); + return 0; +} + +/* extract objectid of directory from key of directory entry within said + directory. + */ +reiser4_internal oid_t +extract_dir_id_from_key(const reiser4_key * de_key /* key of + * directory + * entry */ ) +{ + assert("nikita-1314", de_key != NULL); + return get_key_locality(de_key); +} + +/* encode into @id key of directory entry. + + Encode into @id information sufficient to later distinguish directory + entries within the same directory. This is not whole key, because all + directory entries within directory item share locality which is equal + to objectid of their directory. 
+ +*/ +reiser4_internal int +build_de_id(const struct inode *dir /* inode of directory */ , + const struct qstr *name /* name to be given to @obj by + * directory entry being + * constructed */ , + de_id * id /* short key of directory entry */ ) +{ + reiser4_key key; + + assert("nikita-1290", dir != NULL); + assert("nikita-1292", id != NULL); + + /* NOTE-NIKITA this is suboptimal. */ + inode_dir_plugin(dir)->build_entry_key(dir, name, &key); + return build_de_id_by_key(&key, id); +} + +/* encode into @id key of directory entry. + + Encode into @id information sufficient to later distinguish directory + entries within the same directory. This is not whole key, because all + directory entries within directory item share locality which is equal + to objectid of their directory. + +*/ +reiser4_internal int +build_de_id_by_key(const reiser4_key * entry_key /* full key of directory + * entry */ , + de_id * id /* short key of directory entry */ ) +{ + xmemcpy(id, ((__u64 *) entry_key) + 1, sizeof *id); + return 0; +} + +/* restore from @id key of directory entry. + + Function dual to build_de_id(): given @id and locality, build full + key of directory entry within directory item. 
+ +*/ +reiser4_internal int +extract_key_from_de_id(const oid_t locality /* locality of directory + * entry */ , + const de_id * id /* directory entry id */ , + reiser4_key * key /* result */ ) +{ + /* no need to initialise key here: all fields are overwritten */ + xmemcpy(((__u64 *) key) + 1, id, sizeof *id); + set_key_locality(key, locality); + set_key_type(key, KEY_FILE_NAME_MINOR); + return 0; +} + +/* compare two &obj_key_id */ +reiser4_internal cmp_t +key_id_cmp(const obj_key_id * i1 /* first object key id to compare */ , + const obj_key_id * i2 /* second object key id to compare */ ) +{ + reiser4_key k1; + reiser4_key k2; + + extract_key_from_id(i1, &k1); + extract_key_from_id(i2, &k2); + return keycmp(&k1, &k2); +} + +/* compare &obj_key_id with full key */ +reiser4_internal cmp_t +key_id_key_cmp(const obj_key_id * id /* object key id to compare */ , + const reiser4_key * key /* key to compare */ ) +{ + reiser4_key k1; + + extract_key_from_id(id, &k1); + return keycmp(&k1, key); +} + +/* compare two &de_id's */ +reiser4_internal cmp_t +de_id_cmp(const de_id * id1 /* first &de_id to compare */ , + const de_id * id2 /* second &de_id to compare */ ) +{ + /* NOTE-NIKITA ugly implementation */ + reiser4_key k1; + reiser4_key k2; + + extract_key_from_de_id((oid_t) 0, id1, &k1); + extract_key_from_de_id((oid_t) 0, id2, &k2); + return keycmp(&k1, &k2); +} + +/* compare &de_id with key */ +reiser4_internal cmp_t +de_id_key_cmp(const de_id * id /* directory entry id to compare */ , + const reiser4_key * key /* key to compare */ ) +{ + cmp_t result; + reiser4_key *k1; + + k1 = (reiser4_key *)(((unsigned long)id) - sizeof key->el[0]); + result = KEY_DIFF_EL(k1, key, 1); + if (result == EQUAL_TO) { + result = KEY_DIFF_EL(k1, key, 2); + if (REISER4_LARGE_KEY && result == EQUAL_TO) { + result = KEY_DIFF_EL(k1, key, 3); + } + } + return result; +} + +/* true if key of root directory sd */ +reiser4_internal int +is_root_dir_key(const struct super_block *super /* super block 
to check */ , + const reiser4_key * key /* key to check */ ) +{ + assert("nikita-1819", super != NULL); + assert("nikita-1820", key != NULL); + /* call disk plugin's root_dir_key method if it exists */ + if (get_super_private(super)->df_plug && get_super_private(super)->df_plug->root_dir_key) + return keyeq(key, get_super_private(super)->df_plug->root_dir_key(super)); + return 0; +} + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/kassign.h linux-2.6.4-ck1/fs/reiser4/kassign.h --- linux-2.6.4/fs/reiser4/kassign.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/kassign.h 2004-03-11 22:45:15.254515650 +1100 @@ -0,0 +1,94 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Key assignment policy interface. */ + +#if !defined( __KASSIGN_H__ ) +#define __KASSIGN_H__ + +#include "forward.h" +#include "key.h" +#include "dformat.h" + +#include <linux/types.h> /* for __u?? */ +#include <linux/fs.h> /* for struct super_block, etc */ +#include <linux/dcache.h> /* for struct qstr */ +/* key assignment functions */ + +/* Information from which key of file stat-data can be uniquely + restored. This depends on key assignment policy for + stat-data. Currently it's enough to store object id and locality id + (60+60==120) bits, because minor packing locality and offset of + stat-data key are always known constants: KEY_SD_MINOR and 0 + respectively. For simplicity 4 bits are wasted in each id, and just + two 64 bit integers are stored. + + This field has to be byte-aligned, because we don't want to waste + space in directory entries. There is another side of a coin of + course: we waste CPU and bus bandwidth instead, by copying data back + and forth. + + Next optimization: &obj_key_id is mainly used to address stat data from + directory entries. 
Under the assumption that the majority of files have + only one name (one hard link) from *the* parent directory it seems reasonable + to only store objectid of stat data and take its locality from key of + directory item. + + This requires some flag to be added to the &obj_key_id to distinguish + between these two cases. Remaining bits in flag byte are then asking to be + used to store file type. + + This optimization requires changes in directory item handling code. + +*/ +typedef struct obj_key_id { + d8 locality[sizeof (__u64)]; + ON_LARGE_KEY(d8 ordering[sizeof (__u64)];) + d8 objectid[sizeof (__u64)]; +} obj_key_id; + +/* Information sufficient to uniquely identify directory entry within + compressed directory item. + + For alignment issues see &obj_key_id above. +*/ +typedef struct de_id { + ON_LARGE_KEY(d8 ordering[sizeof (__u64)];) + d8 objectid[sizeof (__u64)]; + d8 offset[sizeof (__u64)]; +} de_id; + +extern int build_inode_key_id(const struct inode *obj, obj_key_id * id); +extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key); +extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id); +extern oid_t extract_dir_id_from_key(const reiser4_key * de_key); +extern int build_de_id(const struct inode *dir, const struct qstr *name, de_id * id); +extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id); +extern int extract_key_from_de_id(const oid_t locality, const de_id * id, reiser4_key * key); +extern cmp_t key_id_cmp(const obj_key_id * i1, const obj_key_id * i2); +extern cmp_t key_id_key_cmp(const obj_key_id * id, const reiser4_key * key); +extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2); +extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key); + +extern int build_readdir_key_common(struct file *dir, reiser4_key * result); +extern void build_entry_key_common(const struct inode *dir, const struct qstr *name, reiser4_key * result); +extern void build_entry_key_stable_entry(const struct 
inode *dir, const struct qstr *name, reiser4_key * result); +extern int is_dot_key(const reiser4_key * key); +extern reiser4_key *build_sd_key(const struct inode *target, reiser4_key * result); +extern int is_root_dir_key(const struct super_block *super, const reiser4_key * key); + +extern int is_longname_key(const reiser4_key *key); +extern int is_longname(const char *name, int len); +extern char *extract_name_from_key(const reiser4_key *key, char *buf); + +/* __KASSIGN_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/kattr.c linux-2.6.4-ck1/fs/reiser4/kattr.c --- linux-2.6.4/fs/reiser4/kattr.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/kattr.c 2004-03-11 22:45:15.255515494 +1100 @@ -0,0 +1,549 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Interface to sysfs' attributes */ + +/* + * Reiser4 exports some of its internal data through sysfs. + * + * For details on sysfs see fs/sysfs, include/linux/sysfs.h, + * include/linux/kobject.h. Roughly speaking, one embeds struct kobject into + * some kernel data type. Objects of this type will be represented as + * _directories_ somewhere below /sys. Attributes can be registered for + * kobject and they will be visible as files within corresponding + * directory. Each attribute is represented by struct kattr. How given + * attribute reacts to read and write is determined by ->show and ->store + * operations that are properties of its parent kobject. 
+ * + * Reiser4 exports following stuff through sysfs: + * + * path kobject or attribute + * + * /sys/fs/reiser4/ + * <dev>/ sbinfo->kobj + * sb-fields def_attrs[] + * stats/ sbinfo->stats_kobj + * stat-cnts reiser4_stat_defs[] + * level-NN/ sbinfo->level[].kobj + * stat-cnts reiser4_stat_level_defs[] + * + * (For some reasons we also add /sys/fs and /sys/fs/reiser4 manually, but + * this is supposed to be done by core.) + * + * Shouldn't struct kobject be renamed to struct knobject? + * + */ + +#include "debug.h" +#include "super.h" +#include "kattr.h" +#include "prof.h" + +#include <linux/kobject.h> /* struct kobject */ +#include <linux/fs.h> /* struct super_block */ + +#if REISER4_USE_SYSFS + +/* convert @attr to reiser4_kattr object it is embedded in */ +static inline reiser4_kattr * +to_kattr(struct attribute *attr) +{ + return container_of(attr, reiser4_kattr, attr); +} + +/* convert @kobj to the super block of the reiser4_super_info_data it is embedded in */ +static inline struct super_block * +to_super(struct kobject *kobj) +{ + reiser4_super_info_data *sbinfo; + + sbinfo = container_of(kobj, reiser4_super_info_data, kobj); + return sbinfo->tree.super; +} + +static ssize_t +kattr_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct super_block *super; + reiser4_kattr *kattr; + + super = to_super(kobj); + kattr = to_kattr(attr); + + if (kattr->show != NULL) + return kattr->show(super, kattr, 0, buf); + else + return 0; +} + +static ssize_t +kattr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t size) +{ + struct super_block *super; + reiser4_kattr *kattr; + + super = to_super(kobj); + kattr = to_kattr(attr); + + if (kattr->store != NULL) + return kattr->store(super, kattr, 0, buf, size); + else + return 0; +} + +typedef struct { + ptrdiff_t offset; + const char *format; +} super_field_cookie; + +#define DEFINE_SUPER_RO(aname, afield, aformat, asize) \ +static super_field_cookie __cookie_ ## aname = { \ + .offset = offsetof(reiser4_super_info_data, afield), \ + .format = aformat 
"\n" \ +}; \ + \ +static reiser4_kattr kattr_super_ro_ ## aname = { \ + .attr = { \ + .name = (char *) #afield, \ + .mode = 0444 /* r--r--r-- */ \ + }, \ + .cookie = &__cookie_ ## aname, \ + .show = show_ro_ ## asize \ +} + +#define getat(ptr, offset, type) *(type *)(((char *)(ptr)) + (offset)) + +static ssize_t +show_ro_32(struct super_block * s, reiser4_kattr * kattr, void * o, char * buf) +{ + char *p; + super_field_cookie *cookie; + __u32 val; + + (void)o; + + cookie = kattr->cookie; + val = getat(get_super_private(s), cookie->offset, __u32); + p = buf; + KATTR_PRINT(p, buf, cookie->format, (unsigned long long)val); + return (p - buf); +} + +static ssize_t show_ro_64(struct super_block * s, + reiser4_kattr * kattr, void * opaque, char * buf) +{ + char *p; + super_field_cookie *cookie; + __u64 val; + + (void)opaque; + + cookie = kattr->cookie; + val = getat(get_super_private(s), cookie->offset, __u64); + p = buf; + KATTR_PRINT(p, buf, cookie->format, (unsigned long long)val); + return (p - buf); +} + +#undef getat + +#define SHOW_OPTION(p, buf, option) \ + if (option) \ + KATTR_PRINT((p), (buf), #option "\n") + +static ssize_t +show_options(struct super_block * s, reiser4_kattr * kattr, void * o, char * buf) +{ + char *p; + + (void)o; + p = buf; + + SHOW_OPTION(p, buf, REISER4_DEBUG); + SHOW_OPTION(p, buf, REISER4_DEBUG_MODIFY); + SHOW_OPTION(p, buf, REISER4_DEBUG_MEMCPY); + SHOW_OPTION(p, buf, REISER4_DEBUG_NODE); + SHOW_OPTION(p, buf, REISER4_ZERO_NEW_NODE); + SHOW_OPTION(p, buf, REISER4_TRACE); + SHOW_OPTION(p, buf, REISER4_TRACE_TREE); + SHOW_OPTION(p, buf, REISER4_STATS); + SHOW_OPTION(p, buf, REISER4_DEBUG_OUTPUT); + SHOW_OPTION(p, buf, REISER4_LOCKPROF); + SHOW_OPTION(p, buf, REISER4_LARGE_KEY); + SHOW_OPTION(p, buf, REISER4_PROF); + return (p - buf); +} + +static reiser4_kattr compile_options = { + .attr = { + .name = (char *) "options", + .mode = 0444 /* r--r--r-- */ + }, + .cookie = NULL, + .show = show_options +}; + +static ssize_t +show_device(struct 
super_block * s, reiser4_kattr * kattr, void * o, char * buf) +{ + char *p; + + (void)o; + p = buf; + KATTR_PRINT(p, buf, "%lu\n", (unsigned long)s->s_dev); + return (p - buf); +} + +static reiser4_kattr device = { + .attr = { + .name = (char *) "device", + .mode = 0444 /* r--r--r-- */ + }, + .cookie = NULL, + .show = show_device +}; + +#if REISER4_DEBUG +ssize_t store_bugme(struct super_block * s, + reiser4_kattr *ka, void *opaque, const char *buf, + size_t size) +{ + DEBUGON(1); + return size; +} + +static reiser4_kattr bugme = { + .attr = { + .name = (char *) "bugme", + .mode = 0222 /* -w--w--w- */ + }, + .cookie = NULL, + .store = store_bugme +}; + +/* REISER4_DEBUG */ +#endif + +DEFINE_SUPER_RO(01, mkfs_id, "%llx", 32); +DEFINE_SUPER_RO(02, block_count, "%llu", 64); +DEFINE_SUPER_RO(03, blocks_used, "%llu", 64); +DEFINE_SUPER_RO(04, blocks_free_committed, "%llu", 64); +DEFINE_SUPER_RO(05, blocks_grabbed, "%llu", 64); +DEFINE_SUPER_RO(06, blocks_fake_allocated_unformatted, "%llu", 64); +DEFINE_SUPER_RO(07, blocks_fake_allocated, "%llu", 64); +DEFINE_SUPER_RO(08, blocks_flush_reserved, "%llu", 64); +DEFINE_SUPER_RO(09, fsuid, "%llx", 32); +#if REISER4_DEBUG +DEFINE_SUPER_RO(10, eflushed, "%llu", 32); +#endif +DEFINE_SUPER_RO(11, blocknr_hint_default, "%lli", 64); +DEFINE_SUPER_RO(12, nr_files_committed, "%llu", 64); +DEFINE_SUPER_RO(13, tmgr.atom_count, "%llu", 32); +DEFINE_SUPER_RO(14, tmgr.id_count, "%llu", 32); +DEFINE_SUPER_RO(15, tmgr.atom_max_size, "%llu", 32); +DEFINE_SUPER_RO(16, tmgr.atom_max_age, "%llu", 32); + +/* tree fields */ +DEFINE_SUPER_RO(17, tree.root_block, "%llu", 64); +DEFINE_SUPER_RO(18, tree.height, "%llu", 32); +DEFINE_SUPER_RO(19, tree.znode_epoch, "%llu", 64); +DEFINE_SUPER_RO(20, tree.carry.new_node_flags, "%llx", 32); +DEFINE_SUPER_RO(21, tree.carry.new_extent_flags, "%llx", 32); +DEFINE_SUPER_RO(22, tree.carry.paste_flags, "%llx", 32); +DEFINE_SUPER_RO(23, tree.carry.insert_flags, "%llx", 32); + +/* not very good. 
Should be done by the plugin in stead */ +DEFINE_SUPER_RO(24, next_to_use, "%llu", 64); +DEFINE_SUPER_RO(25, oids_in_use, "%llu", 64); + +DEFINE_SUPER_RO(26, entd.flushers, "%llu", 32); +DEFINE_SUPER_RO(27, entd.timeout, "%llu", 32); + +static struct attribute * kattr_def_attrs[] = { + &kattr_super_ro_01.attr, + &kattr_super_ro_02.attr, + &kattr_super_ro_03.attr, + &kattr_super_ro_04.attr, + &kattr_super_ro_05.attr, + &kattr_super_ro_06.attr, + &kattr_super_ro_07.attr, + &kattr_super_ro_08.attr, + &kattr_super_ro_09.attr, +#if REISER4_DEBUG + &kattr_super_ro_10.attr, +#endif + &kattr_super_ro_11.attr, + &kattr_super_ro_12.attr, + &kattr_super_ro_13.attr, + &kattr_super_ro_14.attr, + &kattr_super_ro_15.attr, + &kattr_super_ro_16.attr, + &kattr_super_ro_17.attr, + &kattr_super_ro_18.attr, + &kattr_super_ro_19.attr, + &kattr_super_ro_20.attr, + &kattr_super_ro_21.attr, + &kattr_super_ro_22.attr, + &kattr_super_ro_23.attr, + &kattr_super_ro_24.attr, + &kattr_super_ro_25.attr, + &kattr_super_ro_26.attr, + &kattr_super_ro_27.attr, + &compile_options.attr, + &device.attr, +#if REISER4_DEBUG + &bugme.attr, +#endif + NULL +}; + +static struct sysfs_ops attr_ops = { + .show = kattr_show, + .store = kattr_store +}; + +struct kobj_type ktype_reiser4 = { + .sysfs_ops = &attr_ops, + .default_attrs = kattr_def_attrs, + .release = NULL +}; + +#if REISER4_STATS + +static ssize_t +kattr_stats_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + reiser4_super_info_data *sbinfo; + reiser4_kattr *kattr; + + sbinfo = container_of(kobj, reiser4_super_info_data, stats_kobj); + kattr = to_kattr(attr); + + if (kattr->show != NULL) + return kattr->show(sbinfo->tree.super, kattr, 0, buf); + else + return 0; +} + +static ssize_t +kattr_stats_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t size) +{ + reiser4_super_info_data *sbinfo; + reiser4_kattr *kattr; + + sbinfo = container_of(kobj, reiser4_super_info_data, stats_kobj); + kattr = to_kattr(attr); 
+ + if (kattr->store != NULL) + return kattr->store(sbinfo->tree.super, kattr, 0, buf, size); + else + return 0; +} + + +static struct sysfs_ops stats_attr_ops = { + .show = kattr_stats_show, + .store = kattr_stats_store +}; + +static struct kobj_type ktype_noattr = { + .sysfs_ops = &stats_attr_ops, + .default_attrs = NULL, + .release = NULL +}; + +static ssize_t +kattr_level_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + reiser4_super_info_data *sbinfo; + reiser4_level_stats_kobj *level_kobj; + int level; + reiser4_kattr *kattr; + + level_kobj = container_of(kobj, reiser4_level_stats_kobj, kobj); + level = level_kobj->level; + level_kobj -= level; + sbinfo = container_of(level_kobj, reiser4_super_info_data, level[0]); + kattr = to_kattr(attr); + + if (kattr->show != NULL) + return kattr->show(sbinfo->tree.super, kattr, &level, buf); + else + return 0; +} + +static ssize_t +kattr_level_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t size) +{ + reiser4_super_info_data *sbinfo; + reiser4_level_stats_kobj *level_kobj; + int level; + reiser4_kattr *kattr; + + level_kobj = container_of(kobj, reiser4_level_stats_kobj, kobj); + level = level_kobj->level; + level_kobj -= level; + sbinfo = container_of(level_kobj, reiser4_super_info_data, level[0]); + kattr = to_kattr(attr); + + if (kattr->store != NULL) + return kattr->store(sbinfo->tree.super, kattr, &level, buf, size); + else + return 0; +} + +static struct sysfs_ops attr_level_ops = { + .show = kattr_level_show, + .store = kattr_level_store +}; + +static struct kobj_type ktype_level_reiser4 = { + .sysfs_ops = &attr_level_ops, + .default_attrs = NULL, + .release = NULL +}; + +static int register_level_attrs(reiser4_super_info_data *sbinfo, int i) +{ + struct kobject *parent; + struct kobject *level; + int result; + + parent = &sbinfo->stats_kobj; + sbinfo->level[i].level = i; + level = &sbinfo->level[i].kobj; + level->parent = kobject_get(parent); + if (level->parent != 
NULL) { + snprintf(level->name, + KOBJ_NAME_LEN, "level-%2.2i", i); + level->ktype = &ktype_level_reiser4; + result = kobject_register(level); + if (result == 0) + result = reiser4_populate_kattr_level_dir(level); + } else + result = RETERR(-EBUSY); + return result; +} +#endif + +static decl_subsys(fs, NULL, NULL); +decl_subsys(reiser4, &ktype_reiser4, NULL); + +/* One-time sysfs set-up: register fs_subsys, attach reiser4_subsys to it with + kset_set_kset_s() and register it, then call init_prof_kobject(). Returns 0 + on success. When a step fails its error code is returned and the subsystems + registered by the earlier steps are unregistered again, so that a failed + call leaves nothing behind. */ +reiser4_internal int +reiser4_sysfs_init_once(void) +{ + int result; + + result = subsystem_register(&fs_subsys); + if (result != 0) + return result; + + kset_set_kset_s(&reiser4_subsys, fs_subsys); + result = subsystem_register(&reiser4_subsys); + if (result != 0) + goto undo_fs; + + result = init_prof_kobject(); + if (result != 0) + goto undo_reiser4; + return 0; + + undo_reiser4: + subsystem_unregister(&reiser4_subsys); + undo_fs: + subsystem_unregister(&fs_subsys); + return result; +} + +/* Undo reiser4_sysfs_init_once(): release things in the reverse order of + their set-up. */ +reiser4_internal void +reiser4_sysfs_done_once(void) +{ + done_prof_kobject(); + subsystem_unregister(&reiser4_subsys); + subsystem_unregister(&fs_subsys); +} + +/* Per-super-block sysfs set-up: name sbinfo->kobj after super->s_id and + register it in reiser4_subsys. Without REISER4_STATS + reiser4_populate_kattr_dir() is then run on that kobject. With + REISER4_STATS a child kobject "stats" is registered and populated instead, + followed by one kobject per element of sbinfo->level (see + register_level_attrs()). Returns 0 or the first error encountered. + + NOTE(review): kobjects registered before a failing step stay registered on + the error paths -- presumably the caller is expected to clean up; verify + against the caller. */ +reiser4_internal int +reiser4_sysfs_init(struct super_block *super) +{ + reiser4_super_info_data *sbinfo; + struct kobject *kobj; + int result; + ON_STATS(struct kobject *stats_kobj); + + sbinfo = get_super_private(super); + + kobj = &sbinfo->kobj; + + snprintf(kobj->name, KOBJ_NAME_LEN, "%s", super->s_id); + kobj_set_kset_s(sbinfo, reiser4_subsys); + result = kobject_register(kobj); + if (result != 0) + return result; +#if REISER4_STATS + /* add attributes representing statistical counters */ + stats_kobj = &sbinfo->stats_kobj; + stats_kobj->parent = kobject_get(kobj); + snprintf(stats_kobj->name, KOBJ_NAME_LEN, "stats"); + stats_kobj->ktype = &ktype_noattr; + result = kobject_register(stats_kobj); + if (result != 0) + return result; + result = reiser4_populate_kattr_dir(stats_kobj); + if (result == 0) { + int i; + + for (i = 0; i < sizeof_array(sbinfo->level); ++i) { + result = register_level_attrs(sbinfo, i); + if (result != 0) + break; + } + } +#else + result = reiser4_populate_kattr_dir(kobj); +#endif + + return result; +} + +reiser4_internal void +reiser4_sysfs_done(struct super_block *super) +{ + reiser4_super_info_data *sbinfo; + 
ON_STATS(int i); + + sbinfo = get_super_private(super); +#if REISER4_STATS + for (i = 0; i < sizeof_array(sbinfo->level); ++i) + kobject_unregister(&sbinfo->level[i].kobj); + kobject_unregister(&sbinfo->stats_kobj); +#endif + kobject_unregister(&sbinfo->kobj); +} + +/* REISER4_USE_SYSFS */ +#else + +reiser4_internal int +reiser4_sysfs_init(struct super_block *super) +{ + return 0; +} + +reiser4_internal void +reiser4_sysfs_done(struct super_block *super) +{} + +reiser4_internal int +reiser4_sysfs_init_once(void) +{ + return 0; +} + +reiser4_internal void +reiser4_sysfs_done_once(void) +{} + +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/kattr.h linux-2.6.4-ck1/fs/reiser4/kattr.h --- linux-2.6.4/fs/reiser4/kattr.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/kattr.h 2004-03-11 22:45:15.256515339 +1100 @@ -0,0 +1,64 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Interface to sysfs' attributes. See kattr.c for comments */ + +#if !defined( __REISER4_KATTR_H__ ) +#define __REISER4_KATTR_H__ + +#include +#include + +#define REISER4_USE_SYSFS (1) + +/* helper macros used by kattr code to output information into buffer. Output + * is truncated so that KATTR_PRINT never moves @p past the end of the page at @buf. */ +#define KATTR_LEFT(p, buf) (PAGE_SIZE - ((p) - (buf)) - 1) +#define KATTR_PRINT(p, buf, ...) 
\ +({ \ + long __kattr_left = KATTR_LEFT(p, buf); \ + if (__kattr_left > 0) { \ + long __kattr_done; \ + __kattr_done = snprintf((p), __kattr_left , ## __VA_ARGS__); \ + if (__kattr_done >= __kattr_left) \ + __kattr_done = __kattr_left - 1; \ + (p) += __kattr_done; \ + } \ + (p); \ +}) + +struct super_block; +struct reiser4_kattr; +typedef struct reiser4_kattr reiser4_kattr; + +struct reiser4_kattr { + struct attribute attr; + void *cookie; + ssize_t (*show) (struct super_block * s, + reiser4_kattr *, void *opaque, char *buf); + ssize_t (*store) (struct super_block * s, + reiser4_kattr *, void *opaque, const char *buf, + size_t size); +}; + +extern int reiser4_sysfs_init_once(void); +extern void reiser4_sysfs_done_once(void); + +extern int reiser4_sysfs_init(struct super_block *super); +extern void reiser4_sysfs_done(struct super_block *super); + +extern struct kobj_type ktype_reiser4; + +/* __REISER4_KATTR_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/kcond.c linux-2.6.4-ck1/fs/reiser4/kcond.c --- linux-2.6.4/fs/reiser4/kcond.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/kcond.c 2004-03-11 22:45:15.256515339 +1100 @@ -0,0 +1,298 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Kernel condition variables implementation. + + This is simplistic (90 LOC mod comments) condition variable + implementation. Condition variable is the most natural "synchronization + object" in some circumstances. + + Each CS text-book on multi-threading should discuss condition + variables. Also see man/info for: + + pthread_cond_init(3), + pthread_cond_destroy(3), + pthread_cond_signal(3), + pthread_cond_broadcast(3), + pthread_cond_wait(3), + pthread_cond_timedwait(3). + + See comments in kcond_wait(). + + TODO + + 1. Add an option (to kcond_init?) to make conditional variable async-safe + so that signals and broadcasts can be done from interrupt + handlers. Requires using spin_lock_irq in kcond_*(). + + 2. 
"Predicated" sleeps: add predicate function to the qlink and only wake + sleeper if predicate is true. Probably requires additional parameters to + the kcond_{signal,broadcast}() to supply cookie to the predicate. Standard + wait_queues already have this functionality. Idea is that if one has + object behaving like finite state automaton it is possible to use single + per-object condition variable to signal all state transitions. Predicates + allow waiters to select only transitions they are interested in without + going through context switch. + + 3. It is relatively easy to add support for sleeping on the several + condition variables at once. Does anybody need this? + +*/ + +#include "debug.h" +#include "kcond.h" +#include "spin_macros.h" + +#include +#include + +static void kcond_timeout(unsigned long datum); +static void kcond_remove(kcond_t * cvar, kcond_queue_link_t * link); + +/* initialize condition variable. Initializer for global condition variables + is macro in kcond.h */ +reiser4_internal kcond_t * +kcond_init(kcond_t * cvar /* cvar to init */ ) +{ + assert("nikita-1868", cvar != NULL); + + xmemset(cvar, 0, sizeof *cvar); + spin_lock_init(&cvar->lock); + cvar->queue = NULL; + return cvar; +} + +/* destroy condition variable. */ +reiser4_internal int +kcond_destroy(kcond_t * cvar /* cvar to destroy */ ) +{ + return kcond_are_waiters(cvar) ? -EBUSY : 0; +} + +/* Wait until condition variable is signalled. Call this with @lock locked. + If @signl is true, then sleep on condition variable will be interruptible + by signals. -EINTR is returned if sleep were interrupted by signal and 0 + otherwise. + + kcond_t is just a queue protected by spinlock. Whenever thread is going to + sleep on the kcond_t it does the following: + + (1) prepares "queue link" @qlink which is semaphore constructed locally on + the stack of the thread going to sleep. 
+ + (2) takes @cvar spinlock + + (3) adds @qlink to the @cvar queue of waiters + + (4) releases @cvar spinlock + + (5) sleeps on semaphore constructed at step (1) + + When @cvar will be signalled or broadcasted all semaphors enqueued to the + @cvar queue will be upped and kcond_wait() will return. + + By use of local semaphore for each waiter we avoid races between going to + sleep and waking up---endemic plague of condition variables. + + For example, should kcond_broadcast() come in between steps (4) and (5) it + would call up() on semaphores already in a queue and hence, down() in the + step (5) would return immediately. + +*/ +reiser4_internal int +kcond_wait(kcond_t * cvar /* cvar to wait for */ , + spinlock_t * lock /* lock to use */ , + int signl /* if 0, ignore signals during sleep */ ) +{ + kcond_queue_link_t qlink; + int result; + + assert("nikita-1869", cvar != NULL); + assert("nikita-1870", lock != NULL); + assert("nikita-1871", check_spin_is_locked(lock)); + + spin_lock(&cvar->lock); + qlink.next = cvar->queue; + cvar->queue = &qlink; + init_MUTEX_LOCKED(&qlink.wait); + spin_unlock(&cvar->lock); + spin_unlock(lock); + + result = 0; + if (signl) + result = down_interruptible(&qlink.wait); + else + down(&qlink.wait); + spin_lock(&cvar->lock); + if (result != 0) { + /* if thread was woken up by signal, @qlink is probably still + in the queue, remove it. */ + kcond_remove(cvar, &qlink); + } + /* if it wasn't woken up by signal, spinlock here is still useful, + because we want to wait until kcond_{broadcast|signal} + finishes. Otherwise down() could interleave with up() in such a way + that, that kcond_wait() would exit and up() would see garbage in a + semaphore. 
+ */ + spin_unlock(&cvar->lock); + spin_lock(lock); + return result; +} + +typedef struct { + kcond_queue_link_t *link; + int *woken_up; +} kcond_timer_arg; + +/* like kcond_wait(), but with timeout: -ETIMEDOUT is returned when the timer set for @timeout jiffies has fired */ +reiser4_internal int +kcond_timedwait(kcond_t * cvar /* cvar to wait for */ , + spinlock_t * lock /* lock to use */ , + signed long timeout /* timeout in jiffies */ , + int signl /* if 0, ignore signals during sleep */ ) +{ + struct timer_list timer; + kcond_queue_link_t qlink; + int result; + int woken_up; + kcond_timer_arg targ; + + assert("nikita-2437", cvar != NULL); + assert("nikita-2438", lock != NULL); + assert("nikita-2439", check_spin_is_locked(lock)); + + spin_lock(&cvar->lock); + qlink.next = cvar->queue; + cvar->queue = &qlink; + init_MUTEX_LOCKED(&qlink.wait); + spin_unlock(&cvar->lock); + spin_unlock(lock); + + assert("nikita-3011", schedulable()); + + /* prepare timer */ + init_timer(&timer); + timer.expires = jiffies + timeout; + timer.data = (unsigned long) &targ; + timer.function = kcond_timeout; + + woken_up = 0; + + targ.link = &qlink; + targ.woken_up = &woken_up; + + /* ... and set it up */ + add_timer(&timer); + + result = 0; + if (signl) + result = down_interruptible(&qlink.wait); + else + down(&qlink.wait); + + /* cancel timer */ + del_timer_sync(&timer); + + if (woken_up) + result = -ETIMEDOUT; + + spin_lock(&cvar->lock); + if (result != 0) { + /* woken up by signal or by time-out: @qlink is probably still in the queue, remove it. + NOTE(review): a time-out racing with kcond_signal() still yields -ETIMEDOUT -- confirm callers cope. */ + kcond_remove(cvar, &qlink); + } + spin_unlock(&cvar->lock); + + spin_lock(lock); + return result; +} + +/* Signal condition variable: wake up one waiter, if any. 
*/ +reiser4_internal int +kcond_signal(kcond_t * cvar /* cvar to signal */ ) +{ + kcond_queue_link_t *queue_head; + + assert("nikita-1872", cvar != NULL); + + spin_lock(&cvar->lock); + + queue_head = cvar->queue; + if (queue_head != NULL) { + cvar->queue = queue_head->next; + up(&queue_head->wait); + } + spin_unlock(&cvar->lock); + return 1; +} + +/* Broadcast condition variable: wake up all waiters. */ +reiser4_internal int +kcond_broadcast(kcond_t * cvar /* cvar to broadcast */ ) +{ + kcond_queue_link_t *queue_head; + + assert("nikita-1875", cvar != NULL); + + spin_lock(&cvar->lock); + + for (queue_head = cvar->queue; queue_head != NULL; queue_head = queue_head->next) + up(&queue_head->wait); + + cvar->queue = NULL; + spin_unlock(&cvar->lock); + return 1; +} + +/* true if there are threads sleeping on @cvar */ +reiser4_internal int +kcond_are_waiters(kcond_t * cvar /* cvar to query */ ) +{ + assert("nikita-1877", cvar != NULL); + return cvar->queue != NULL; +} + +/* timer expiration function used by kcond_timedwait */ +static void +kcond_timeout(unsigned long datum) +{ + kcond_timer_arg *arg; + + arg = (kcond_timer_arg *) datum; + *arg->woken_up = 1; + up(&arg->link->wait); +} + +/* helper function to remove @link from @cvar queue */ +static void +kcond_remove(kcond_t * cvar /* cvar to operate on */ , + kcond_queue_link_t * link /* link to remove */ ) +{ + kcond_queue_link_t *scan; + kcond_queue_link_t *prev; + + assert("nikita-2440", cvar != NULL); + assert("nikita-2441", check_spin_is_locked(&cvar->lock)); + + for (scan = cvar->queue, prev = NULL; scan != NULL; prev = scan, scan = scan->next) { + if (scan == link) { + if (prev == NULL) + cvar->queue = scan->next; + else + prev->next = scan->next; + break; + } + } +} + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/kcond.h linux-2.6.4-ck1/fs/reiser4/kcond.h --- linux-2.6.4/fs/reiser4/kcond.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/kcond.h 2004-03-11 22:45:15.257515183 +1100 @@ -0,0 +1,59 @@ +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Declaration of kernel condition variables and API. See kcond.c for more + info. */ + +#ifndef __KCOND_H__ +#define __KCOND_H__ + +#include +#include + +typedef struct kcond_queue_link_s kcond_queue_link_t; + +/* condition variable */ +typedef struct kcond_s { + /* lock protecting integrity of @queue */ + spinlock_t lock; + /* queue of waiters */ + kcond_queue_link_t *queue; +} kcond_t; + +/* queue link added to the kcond->queue by each waiter */ +struct kcond_queue_link_s { + /* next link in the queue */ + kcond_queue_link_t *next; + /* semaphore to signal on wake up */ + struct semaphore wait; +}; + +extern kcond_t *kcond_init(kcond_t * cvar); +extern int kcond_destroy(kcond_t * cvar); + +extern int kcond_wait(kcond_t * cvar, spinlock_t * lock, int signl); +extern int kcond_timedwait(kcond_t * cvar, spinlock_t * lock, signed long timeout, int signl); +extern int kcond_signal(kcond_t * cvar); +extern int kcond_broadcast(kcond_t * cvar); + +extern int kcond_are_waiters(kcond_t * cvar); + +extern void kcond_print(kcond_t * cvar); + +#define KCOND_STATIC_INIT \ + { \ + .lock = SPIN_LOCK_UNLOCKED, \ + .queue = NULL \ + } + +/* __KCOND_H__ */ +#endif + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/key.c linux-2.6.4-ck1/fs/reiser4/key.c --- linux-2.6.4/fs/reiser4/key.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/key.c 2004-03-11 22:45:15.258515028 +1100 @@ -0,0 +1,157 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Key manipulations. */ + +#include "debug.h" +#include "key.h" +#include "super.h" +#include "reiser4.h" + +#include /* for __u?? */ + +/* Minimal possible key: all components are zero. It is presumed that this is + independent of key scheme. */ +static const reiser4_key MINIMAL_KEY = { + .el = { + {0ull}, + ON_LARGE_KEY({0ull},) + {0ull}, + {0ull} + } +}; + +/* Maximal possible key: all components are ~0. It is presumed that this is + independent of key scheme. */ +static const reiser4_key MAXIMAL_KEY = { + .el = { + {~0ull}, + ON_LARGE_KEY({~0ull},) + {~0ull}, + {~0ull} + } +}; + +/* Initialise key. */ +reiser4_internal void +key_init(reiser4_key * key /* key to init */ ) +{ + assert("nikita-1169", key != NULL); + xmemset(key, 0, sizeof *key); +} + +/* minimal possible key in the tree. Return pointer to the static storage. */ +reiser4_internal const reiser4_key * +min_key(void) +{ + return &MINIMAL_KEY; +} + +/* maximum possible key in the tree. Return pointer to the static storage. 
*/ +reiser4_internal const reiser4_key * +max_key(void) +{ + return &MAXIMAL_KEY; +} + +#if REISER4_DEBUG_OUTPUT +/* debugging aid: print symbolic name of key type */ +static const char * +type_name(unsigned int key_type /* key type */ ) +{ + switch (key_type) { + case KEY_FILE_NAME_MINOR: + return "file name"; + case KEY_SD_MINOR: + return "stat data"; + case KEY_ATTR_NAME_MINOR: + return "attr name"; + case KEY_ATTR_BODY_MINOR: + return "attr body"; + case KEY_BODY_MINOR: + return "file body"; + default: + return "unknown"; + } +} + +extern char *unpack_string(__u64 value, char *buf); + +/* debugging aid: print human readable information about key */ +reiser4_internal void +print_key(const char *prefix /* prefix to print */ , + const reiser4_key * key /* key to print */ ) +{ + /* turn bold on */ + /* printf ("\033[1m"); */ + if (key == NULL) + printk("%s: null key\n", prefix); + else { + if (REISER4_LARGE_KEY) + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix, + get_key_locality(key), + get_key_type(key), + get_key_ordering(key), + get_key_band(key), + get_key_objectid(key), + get_key_offset(key)); + else + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix, + get_key_locality(key), + get_key_type(key), + get_key_band(key), + get_key_objectid(key), + get_key_offset(key)); + if (get_key_type(key) == KEY_FILE_NAME_MINOR) { + char buf[DE_NAME_BUF_LEN]; + char *c; + + c = buf; + c = unpack_string(get_key_ordering(key), c); + unpack_string(get_key_fulloid(key), c); + printk("[%s", buf); + if (is_longname_key(key)) + printk("...]\n"); + else { + unpack_string(get_key_offset(key), buf); + printk("%s]\n", buf); + } + } else { + printk("[%s]\n", type_name(get_key_type(key))); + } + } + /* turn bold off */ + /* printf ("\033[m\017"); */ +} + +#endif + +reiser4_internal int +sprintf_key(char *buffer /* buffer to print key into */ , + const reiser4_key * key /* key to print */ ) +{ + if (REISER4_LARGE_KEY) + return sprintf(buffer, "(%Lx:%x:%Lx:%Lx:%Lx:%Lx)", + get_key_locality(key), + 
get_key_type(key), + get_key_ordering(key), + get_key_band(key), + get_key_objectid(key), + get_key_offset(key)); + else + return sprintf(buffer, "(%Lx:%x:%Lx:%Lx:%Lx)", + get_key_locality(key), + get_key_type(key), + get_key_band(key), + get_key_objectid(key), + get_key_offset(key)); +} + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/key.h linux-2.6.4-ck1/fs/reiser4/key.h --- linux-2.6.4/fs/reiser4/key.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/key.h 2004-03-11 22:45:15.259514872 +1100 @@ -0,0 +1,380 @@ +/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Declarations of key-related data-structures and operations on keys. */ + +#if !defined( __REISER4_KEY_H__ ) +#define __REISER4_KEY_H__ + +#include "dformat.h" +#include "forward.h" +#include "debug.h" + +#include /* for __u?? */ + +/* Operations on keys in reiserfs tree */ + +/* No access to any of these fields shall be done except via a + wrapping macro/function, and that wrapping macro/function shall + convert to little endian order. Compare keys will consider cpu byte order. */ + +/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below + which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files + within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong + approach, and whether there should be no difference at all. For current usage patterns this choice is probably the + right one. 
*/ + +/* possible values for minor packing locality (4 bits required) */ +typedef enum { + /* file name */ + KEY_FILE_NAME_MINOR = 0, + /* stat-data */ + KEY_SD_MINOR = 1, + /* file attribute name */ + KEY_ATTR_NAME_MINOR = 2, + /* file attribute value */ + KEY_ATTR_BODY_MINOR = 3, + /* file body (tail or extent) */ + KEY_BODY_MINOR = 4, +} key_minor_locality; + +/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key. + Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space, + and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to + segregate extents from tails, it is better to give them distinct minor packing localities rather than changing + block_alloc.c to check the node type when deciding where to allocate the node. + + The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it + should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our + current implementation tails have a different minor packing locality from extents, and no files have both extents and + tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now.... +*/ + +/* Arbitrary major packing localities can be assigned to objects using + the reiser4(filenameA/..packing<=some_number) system call. + + In reiser4, the creat() syscall creates a directory + + whose default flow (that which is referred to if the directory is + read as a file) is the traditional unix file body. + + whose directory plugin is the 'filedir' + + whose major packing locality is that of the parent of the object created. + + The static_stat item is a particular commonly used directory + compression (the one for normal unix files). + + The filedir plugin checks to see if the static_stat item exists. 
+ There is a unique key for static_stat. If yes, then it uses the + static_stat item for all of the values that it contains. The + static_stat item contains a flag for each stat it contains which + indicates whether one should look outside the static_stat item for its + contents. +*/ + +/* offset of fields in reiser4_key. Value of each element of this enum + is index within key (thought as array of __u64's) where this field + is. */ +typedef enum { + /* major "locale", aka dirid. Sits in 1st element */ + KEY_LOCALITY_INDEX = 0, + /* minor "locale", aka item type. Sits in 1st element */ + KEY_TYPE_INDEX = 0, + ON_LARGE_KEY(KEY_ORDERING_INDEX,) + /* "object band". Sits in 2nd element */ + KEY_BAND_INDEX, + /* objectid. Sits in 2nd element */ + KEY_OBJECTID_INDEX = KEY_BAND_INDEX, + /* full objectid. Sits in 2nd element */ + KEY_FULLOID_INDEX = KEY_BAND_INDEX, + /* Offset. Sits in 3rd element */ + KEY_OFFSET_INDEX, + /* Name hash. Sits in 3rd element */ + KEY_HASH_INDEX = KEY_OFFSET_INDEX, + KEY_CACHELINE_END = KEY_OFFSET_INDEX, + KEY_LAST_INDEX +} reiser4_key_field_index; + +/* key in reiser4 internal "balanced" tree. It is just array of three + 64bit integers in disk byte order (little-endian by default). This + array is actually indexed by reiser4_key_field. Each __u64 within + this array is called "element". Logical key component encoded within + elements are called "fields". + + We declare this as union with second component dummy to suppress + inconvenient array<->pointer casts implied in C. */ +union reiser4_key { + d64 el[KEY_LAST_INDEX]; + int pad; +}; + +/* bitmasks showing where within reiser4_key particular key is + stored. 
*/ +typedef enum { + /* major locality occupies higher 60 bits of the first element */ + KEY_LOCALITY_MASK = 0xfffffffffffffff0ull, + /* minor locality occupies lower 4 bits of the first element */ + KEY_TYPE_MASK = 0xfull, + /* controversial band occupies higher 4 bits of the 2nd element */ + KEY_BAND_MASK = 0xf000000000000000ull, + /* objectid occupies lower 60 bits of the 2nd element */ + KEY_OBJECTID_MASK = 0x0fffffffffffffffull, + /* full 64bit objectid */ + KEY_FULLOID_MASK = 0xffffffffffffffffull, + /* offset is just 3rd element itself */ + KEY_OFFSET_MASK = 0xffffffffffffffffull, + /* ordering is whole second element */ + KEY_ORDERING_MASK = 0xffffffffffffffffull, +} reiser4_key_field_mask; + +/* how many bits a masked key element is shifted right to extract a field (a field value is shifted left to store it) */ +typedef enum { + KEY_LOCALITY_SHIFT = 4, + KEY_TYPE_SHIFT = 0, + KEY_BAND_SHIFT = 60, + KEY_OBJECTID_SHIFT = 0, + KEY_FULLOID_SHIFT = 0, + KEY_OFFSET_SHIFT = 0, + KEY_ORDERING_SHIFT = 0, +} reiser4_key_field_shift; + +static inline __u64 +get_key_el(const reiser4_key * key, reiser4_key_field_index off) +{ + assert("nikita-753", key != NULL); + assert("nikita-754", off < KEY_LAST_INDEX); + return d64tocpu(&key->el[off]); +} + +static inline void +set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value) +{ + assert("nikita-755", key != NULL); + assert("nikita-756", off < KEY_LAST_INDEX); + cputod64(value, &key->el[off]); +} + +/* macro to define get_key_L() and set_key_L() for the field described by KEY_U_{INDEX,MASK,SHIFT}; T is the field type */ +#define DEFINE_KEY_FIELD( L, U, T ) \ +static inline T get_key_ ## L ( const reiser4_key *key ) \ +{ \ + assert( "nikita-750", key != NULL ); \ + return ( T ) ( ( get_key_el( key, KEY_ ## U ## _INDEX ) & \ + KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT ); \ +} \ + \ +static inline void set_key_ ## L ( reiser4_key *key, T loc ) \ +{ \ + __u64 el; \ + \ + assert( "nikita-752", key != NULL ); \ + \ + el = get_key_el( key, KEY_ ## U ## _INDEX ); \ + /* clear field bits in the key */ \ + 
el &= ~KEY_ ## U ## _MASK; \ + /* actually it should be \ + \ + el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \ + \ + but we trust user to never pass values that wouldn't fit \ + into field. Clearing extra bits is one operation, but this \ + function is time-critical. \ + But check this in assertion. */ \ + assert( "nikita-759", ( ( ( __u64 ) loc << KEY_ ## U ## _SHIFT ) & \ + ~KEY_ ## U ## _MASK ) == 0 ); \ + el |= ( ( __u64 ) loc << KEY_ ## U ## _SHIFT ); \ + set_key_el( key, KEY_ ## U ## _INDEX, el ); \ +} + +typedef __u64 oid_t; + +/* define get_key_locality(), set_key_locality() */ +DEFINE_KEY_FIELD(locality, LOCALITY, oid_t); +/* define get_key_type(), set_key_type() */ +DEFINE_KEY_FIELD(type, TYPE, key_minor_locality); +/* define get_key_band(), set_key_band() */ +DEFINE_KEY_FIELD(band, BAND, __u64); +/* define get_key_objectid(), set_key_objectid() */ +DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t); +/* define get_key_fulloid(), set_key_fulloid() */ +DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t); +/* define get_key_offset(), set_key_offset() */ +DEFINE_KEY_FIELD(offset, OFFSET, __u64); +#if (REISER4_LARGE_KEY) +/* define get_key_ordering(), set_key_ordering() */ +DEFINE_KEY_FIELD(ordering, ORDERING, __u64); +#else +static inline __u64 get_key_ordering(const reiser4_key *key) +{ + return 0; +} + +static inline void set_key_ordering(reiser4_key *key, __u64 val) +{ +} +#endif + +/* key comparison result */ +typedef enum { LESS_THAN = -1, /* if first key is less than second */ + EQUAL_TO = 0, /* if keys are equal */ + GREATER_THAN = +1 /* if first key is greater than second */ +} cmp_t; + +void key_init(reiser4_key * key); + +/* minimal possible key in the tree. Return pointer to the static storage. 
*/ +extern const reiser4_key *min_key(void); +extern const reiser4_key *max_key(void); + +/* helper macro for keycmp() */ +#define KEY_DIFF(k1, k2, field) \ +({ \ + typeof (get_key_ ## field (k1)) f1; \ + typeof (get_key_ ## field (k2)) f2; \ + \ + f1 = get_key_ ## field (k1); \ + f2 = get_key_ ## field (k2); \ + \ + (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \ +}) + +/* helper macro for keycmp() */ +#define KEY_DIFF_EL(k1, k2, off) \ +({ \ + __u64 e1; \ + __u64 e2; \ + \ + e1 = get_key_el(k1, off); \ + e2 = get_key_el(k2, off); \ + \ + (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \ +}) + +/* compare `k1' and `k2'. This function is a heart of "key allocation + policy". All you need to implement new policy is to add yet another + clause here. */ +static inline cmp_t +keycmp(const reiser4_key * k1 /* first key to compare */ , + const reiser4_key * k2 /* second key to compare */ ) +{ + cmp_t result; + + assert("nikita-439", k1 != NULL); + assert("nikita-440", k2 != NULL); + + if (REISER4_PLANA_KEY_ALLOCATION) { + /* if physical order of fields in a key is identical + with logical order, we can implement key comparison + as three 64bit comparisons. */ + /* logical order of fields in plan-a: + locality->type->objectid->offset. 
*/ + /* compare locality and type at once */ + result = KEY_DIFF_EL(k1, k2, 0); + if (result == EQUAL_TO) { + /* compare objectid (and band if it's there) */ + result = KEY_DIFF_EL(k1, k2, 1); + /* compare offset */ + if (result == EQUAL_TO) { + result = KEY_DIFF_EL(k1, k2, 2); + if (REISER4_LARGE_KEY && result == EQUAL_TO) { + result = KEY_DIFF_EL(k1, k2, 3); + } + } + } + } else if (REISER4_3_5_KEY_ALLOCATION) { + result = KEY_DIFF(k1, k2, locality); + if (result == EQUAL_TO) { + result = KEY_DIFF(k1, k2, objectid); + if (result == EQUAL_TO) { + result = KEY_DIFF(k1, k2, type); + if (result == EQUAL_TO) + result = KEY_DIFF(k1, k2, offset); + } + } + } else + impossible("nikita-441", "Unknown key allocation scheme!"); + return result; +} + +/* true if @k1 equals @k2 */ +static inline int +keyeq(const reiser4_key * k1 /* first key to compare */ , + const reiser4_key * k2 /* second key to compare */ ) +{ + assert("nikita-1879", k1 != NULL); + assert("nikita-1880", k2 != NULL); + return !memcmp(k1, k2, sizeof *k1); +} + +/* true if @k1 is less than @k2 */ +static inline int +keylt(const reiser4_key * k1 /* first key to compare */ , + const reiser4_key * k2 /* second key to compare */ ) +{ + assert("nikita-1952", k1 != NULL); + assert("nikita-1953", k2 != NULL); + return keycmp(k1, k2) == LESS_THAN; +} + +/* true if @k1 is less than or equal to @k2 */ +static inline int +keyle(const reiser4_key * k1 /* first key to compare */ , + const reiser4_key * k2 /* second key to compare */ ) +{ + assert("nikita-1954", k1 != NULL); + assert("nikita-1955", k2 != NULL); + return keycmp(k1, k2) != GREATER_THAN; +} + +/* true if @k1 is greater than @k2 */ +static inline int +keygt(const reiser4_key * k1 /* first key to compare */ , + const reiser4_key * k2 /* second key to compare */ ) +{ + assert("nikita-1959", k1 != NULL); + assert("nikita-1960", k2 != NULL); + return keycmp(k1, k2) == GREATER_THAN; +} + +/* true if @k1 is greater than or equal to @k2 */ +static inline int 
+keyge(const reiser4_key * k1 /* first key to compare */ , + const reiser4_key * k2 /* second key to compare */ ) +{ + assert("nikita-1956", k1 != NULL); + assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched + * November 3: Laika */ + return keycmp(k1, k2) != LESS_THAN; +} + +static inline void +prefetchkey(reiser4_key *key) +{ + prefetch(key); + prefetch(&key->el[KEY_CACHELINE_END]); +} + +/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) = + 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */ +/* size of a buffer suitable to hold human readable key representation */ +#define KEY_BUF_LEN (80) + +extern int sprintf_key(char *buffer, const reiser4_key * key); +#if REISER4_DEBUG_OUTPUT +extern void print_key(const char *prefix, const reiser4_key * key); +#else +#define print_key(p,k) noop +#endif + +/* __FS_REISERFS_KEY_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/ktxnmgrd.c linux-2.6.4-ck1/fs/reiser4/ktxnmgrd.c --- linux-2.6.4/fs/reiser4/ktxnmgrd.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/ktxnmgrd.c 2004-03-11 22:45:15.259514872 +1100 @@ -0,0 +1,285 @@ +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ +/* Transaction manager daemon. */ + +/* + * ktxnmgrd is a kernel daemon responsible for committing transactions. It is + * needed/important for the following reasons: + * + * 1. in reiser4 atom is not committed immediately when last transaction + * handle closes, unless atom is either too old or too large (see + * atom_should_commit()). This is done to avoid committing too frequently. + * because: + * + * 2. sometimes we don't want to commit atom when closing last transaction + * handle even if it is old and fat enough. For example, because we are at + * this point under directory semaphore, and committing would stall all + * accesses to this directory. 
+ * + * ktxnmgrd bides its time sleeping on condition variable. When it awakes + * either due to (tunable) timeout or because it was explicitly woken up by + * call to ktxnmgrd_kick(), it scans list of all atoms and commits ones + * eligible. + * + */ + +#include "debug.h" +#include "kcond.h" +#include "txnmgr.h" +#include "tree.h" +#include "ktxnmgrd.h" +#include "super.h" +#include "reiser4.h" + +#include /* for struct task_struct */ +#include +#include +#include + +static int scan_mgr(txn_mgr * mgr); + +/* change current->comm so that ps, top, and friends will see changed + state. This serves no useful purpose whatsoever, but also costs + nothing. Maybe it will make a lonely system administrator feel less alone + at 3 A.M. +*/ +#define set_comm( state ) \ + snprintf( current -> comm, sizeof( current -> comm ), \ + "%s:%s", __FUNCTION__, ( state ) ) + /* Initialise @ctx: zero the whole structure, then set up its completion, its three condition variables and its spin lock, install the default REISER4_TXNMGR_TIMEOUT and start with an empty list of transaction managers. */ +reiser4_internal void +init_ktxnmgrd_context(ktxnmgrd_context * ctx) +{ + assert("nikita-2442", ctx != NULL); + + xmemset(ctx, 0, sizeof *ctx); + init_completion(&ctx->finish); + kcond_init(&ctx->startup); + kcond_init(&ctx->wait); + kcond_init(&ctx->loop); + spin_lock_init(&ctx->guard); + ctx->timeout = REISER4_TXNMGR_TIMEOUT; + txn_mgrs_list_init(&ctx->queue); +} + +/* The background transaction manager daemon, started as a kernel thread + during reiser4 initialization. @arg is the ktxnmgrd_context served by this thread. */ +static int +ktxnmgrd(void *arg) +{ + struct task_struct *me; + ktxnmgrd_context *ctx; + + /* standard kernel thread prologue */ + me = current; + /* reparent_to_init() is done by daemonize() */ + daemonize(__FUNCTION__); + + /* block all signals */ + spin_lock_irq(&me->sighand->siglock); + siginitsetinv(&me->blocked, 0); + recalc_sigpending(); + spin_unlock_irq(&me->sighand->siglock); + + /* do_fork() just copies task_struct into the new + thread. ->fs_context shouldn't be copied of course. This shouldn't + be a problem for the rest of the code though. 
+ */ + me->fs_context = NULL; + + ctx = arg; + spin_lock(&ctx->guard); + ctx->tsk = me; + kcond_broadcast(&ctx->startup); + while (1) { + int result; + txn_mgr *mgr; + + /* software suspend support. */ + if (me->flags & PF_FREEZE) { + spin_unlock(&ctx->guard); + refrigerator(PF_IOTHREAD); + spin_lock(&ctx->guard); + } + + set_comm("wait"); + /* wait for @ctx -> timeout or explicit wake up. + + kcond_wait() is called with last argument 1 enabling wakeup + by signals so that this thread is not counted in + load-average. This doesn't require any special handling, + because all signals were blocked. + */ + result = kcond_timedwait(&ctx->wait, + &ctx->guard, ctx->timeout, 1); + + /* wake up all threads doing umount. See ktxnmgrd_detach(). */ + kcond_broadcast(&ctx->loop); + + if (result != -ETIMEDOUT && result != -EINTR && result != 0) { + /* some other error */ + warning("nikita-2443", "Error: %i", result); + continue; + } + + /* we are asked to exit */ + if (ctx->done) + break; + + set_comm(result ? "timed" : "run"); + + /* wait timed out or ktxnmgrd was woken up by explicit request + to commit something. Scan list of atoms in txnmgr and look + for too old atoms. + */ + do { + ctx->rescan = 0; + for_all_type_safe_list(txn_mgrs, &ctx->queue, mgr) { + scan_mgr(mgr); + spin_lock(&ctx->guard); + if (ctx->rescan) { + /* the list could be modified while ctx + spinlock was released, we have to + repeat scanning from the + beginning */ + break; + } + } + } while (ctx->rescan); + } + + spin_unlock(&ctx->guard); + + complete_and_exit(&ctx->finish, 0); + /* not reached. */ + return 0; +} + +#undef set_comm + +reiser4_internal int +ktxnmgrd_attach(ktxnmgrd_context * ctx, txn_mgr * mgr) +{ + int first_mgr; + + assert("nikita-2448", mgr != NULL); + + spin_lock(&ctx->guard); + + first_mgr = !ctx->started; + ctx->started = 1; + ctx->rescan = 1; + + /* attach @mgr to daemon. Not taking spin-locks, because this is early + during @mgr initialization. 
*/ + mgr->daemon = ctx; + txn_mgrs_list_push_back(&ctx->queue, mgr); + + spin_unlock(&ctx->guard); + + if (first_mgr) { + /* attaching first mgr, start daemon */ + ctx->done = 0; + /* kernel_thread never fails. */ + kernel_thread(ktxnmgrd, ctx, CLONE_KERNEL); + } + + spin_lock(&ctx->guard); + + /* daemon thread is not yet initialized */ + if (ctx->tsk == NULL) + /* wait until initialization completes */ + kcond_wait(&ctx->startup, &ctx->guard, 0); + + assert("nikita-2452", ctx->tsk != NULL); + + spin_unlock(&ctx->guard); + return 0; +} + +reiser4_internal void +ktxnmgrd_detach(txn_mgr * mgr) +{ + ktxnmgrd_context *ctx; + + assert("nikita-2450", mgr != NULL); + + /* this is supposed to happen when @mgr is quiesced and no locking is + necessary. */ + ctx = mgr->daemon; + if (ctx == NULL) + return; + + spin_lock(&ctx->guard); + txn_mgrs_list_remove(mgr); + mgr->daemon = NULL; + ctx->rescan = 1; + + /* removing last mgr, stop daemon */ + if (txn_mgrs_list_empty(&ctx->queue)) { + ctx->tsk = NULL; + ctx->done = 1; + ctx->started = 0; + spin_unlock(&ctx->guard); + kcond_signal(&ctx->wait); + + /* wait until daemon finishes */ + wait_for_completion(&ctx->finish); + } else { + kcond_signal(&ctx->wait); + /* ctx->loop is signaled by ktxnmgrd() after it woke up, but + * before it enters scan_mgr() loop. Note that both signaling + * of ctx->wait and wait on ctx->loop are done under + * ctx->guard spin lock. This guarantees that current thread + * cannot lose wakeup. 
*/ + kcond_wait(&ctx->loop, &ctx->guard, 0); + spin_unlock(&ctx->guard); + } +} + +reiser4_internal void +ktxnmgrd_kick(txn_mgr * mgr) +{ + assert("nikita-3234", mgr != NULL); + assert("nikita-3235", mgr->daemon != NULL); + kcond_signal(&mgr->daemon->wait); +} + +reiser4_internal int +is_current_ktxnmgrd(void) +{ + return (get_current_super_private()->tmgr.daemon->tsk == current); +} + +/* scan one transaction manager for old atoms; should be called with ktxnmgrd + * spinlock, releases this spin lock at exit */ +static int +scan_mgr(txn_mgr * mgr) +{ + int ret; + reiser4_context ctx; + reiser4_tree *tree; + + assert("nikita-2454", mgr != NULL); + + /* NOTE-NIKITA this only works for atoms embedded into super blocks. */ + tree = &container_of(mgr, reiser4_super_info_data, tmgr)->tree; + assert("nikita-2455", tree != NULL); + assert("nikita-2456", tree->super != NULL); + + init_context(&ctx, tree->super); + + ret = commit_some_atoms(mgr); + + reiser4_exit_context(&ctx); + return ret; +} + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/ktxnmgrd.h linux-2.6.4-ck1/fs/reiser4/ktxnmgrd.h --- linux-2.6.4/fs/reiser4/ktxnmgrd.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/ktxnmgrd.h 2004-03-11 22:45:15.260514717 +1100 @@ -0,0 +1,67 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Transaction manager daemon. See ktxnmgrd.c for comments. */ + +#ifndef __KTXNMGRD_H__ +#define __KTXNMGRD_H__ + +#include "kcond.h" +#include "txnmgr.h" +#include "spin_macros.h" + +#include +#include +#include +#include +#include /* for struct task_struct */ + +/* in this structure all data necessary to start up, shut down and communicate + * with ktxnmgrd are kept. 
*/ +struct ktxnmgrd_context { + /* condition variable used to synchronize start up of ktxnmgrd */ + kcond_t startup; + /* completion used to synchronize shut down of ktxnmgrd */ + struct completion finish; + /* condition variable on which ktxnmgrd sleeps */ + kcond_t wait; + /* condition variable that ktxnmgrd broadcasts on each iteration. It + * is used to synchronize with umount. */ + kcond_t loop; + /* spin lock protecting all fields of this structure */ + spinlock_t guard; + /* timeout of sleeping on ->wait */ + signed long timeout; + /* kernel thread running ktxnmgrd */ + struct task_struct *tsk; + /* list of all file systems served by this ktxnmgrd */ + txn_mgrs_list_head queue; + /* is ktxnmgrd already started? The one-bit flags below are unsigned: a signed one-bit field cannot hold the value 1. */ + unsigned int started:1; + /* is ktxnmgrd being shut down? */ + unsigned int done:1; + /* should ktxnmgrd repeat scanning of atoms? */ + unsigned int rescan:1; +}; + +extern void init_ktxnmgrd_context(ktxnmgrd_context * context); + +extern int ktxnmgrd_attach(ktxnmgrd_context * ctx, txn_mgr * mgr); +extern void ktxnmgrd_detach(txn_mgr * mgr); + +extern void ktxnmgrd_kick(txn_mgr * mgr); + +extern int is_current_ktxnmgrd(void); + +/* __KTXNMGRD_H__ */ +#endif + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/latch.c linux-2.6.4-ck1/fs/reiser4/latch.c --- linux-2.6.4/fs/reiser4/latch.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/latch.c 2004-03-11 22:45:15.260514717 +1100 @@ -0,0 +1,113 @@ +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "debug.h" +#include "latch.h" + +reiser4_internal void +rw_latch_init(rw_latch_t * latch) +{ + spin_lock_init(&latch->guard); + latch->access = 0; + kcond_init(&latch->cond); +} + +reiser4_internal void +rw_latch_done(rw_latch_t * latch) +{ + assert("nikita-3062", latch->access == 0); + kcond_destroy(&latch->cond); +} + +reiser4_internal void +rw_latch_down_read(rw_latch_t * latch) +{ + spin_lock(&latch->guard); + while (latch->access < 0) + kcond_wait(&latch->cond, &latch->guard, 0); + latch->access ++; + spin_unlock(&latch->guard); +} + +reiser4_internal void +rw_latch_down_write(rw_latch_t * latch) +{ + spin_lock(&latch->guard); + while (latch->access != 0) + kcond_wait(&latch->cond, &latch->guard, 0); + latch->access = -1; + spin_unlock(&latch->guard); +} + +reiser4_internal void +rw_latch_up_read(rw_latch_t * latch) +{ + spin_lock(&latch->guard); + assert("nikita-3063", latch->access > 0); + latch->access --; + if (latch->access == 0) + kcond_broadcast(&latch->cond); + spin_unlock(&latch->guard); +} + +reiser4_internal void +rw_latch_up_write(rw_latch_t * latch) +{ + spin_lock(&latch->guard); + assert("nikita-3063", latch->access == -1); + latch->access = 0; + kcond_broadcast(&latch->cond); + spin_unlock(&latch->guard); +} + +reiser4_internal void +rw_latch_downgrade(rw_latch_t * latch) +{ + spin_lock(&latch->guard); + assert("nikita-3063", latch->access == -1); + latch->access = +1; + kcond_broadcast(&latch->cond); + spin_unlock(&latch->guard); +} + +reiser4_internal int +rw_latch_try_read(rw_latch_t * 
latch) +{ + int result; + + spin_lock(&latch->guard); + if (latch->access < 0) + result = -EBUSY; + else { + result = 0; + latch->access ++; + } + spin_unlock(&latch->guard); + return result; +} + +reiser4_internal int +rw_latch_try_write(rw_latch_t * latch) +{ + int result; + + spin_lock(&latch->guard); + if (latch->access != 0) + result = -EBUSY; + else { + result = 0; + latch->access = -1; + } + spin_unlock(&latch->guard); + return result; +} + + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/latch.h linux-2.6.4-ck1/fs/reiser4/latch.h --- linux-2.6.4/fs/reiser4/latch.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/latch.h 2004-03-11 22:45:15.261514561 +1100 @@ -0,0 +1,36 @@ +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#ifndef __LATCH_H__ +#define __LATCH_H__ + +#include "kcond.h" + +typedef struct rw_latch { + spinlock_t guard; + int access; + kcond_t cond; +} rw_latch_t; + +extern void rw_latch_init(rw_latch_t * latch); +extern void rw_latch_done(rw_latch_t * latch); +extern void rw_latch_down_read(rw_latch_t * latch); +extern void rw_latch_down_write(rw_latch_t * latch); +extern void rw_latch_up_read(rw_latch_t * latch); +extern void rw_latch_up_write(rw_latch_t * latch); +extern void rw_latch_downgrade(rw_latch_t * latch); + +extern int rw_latch_try_read(rw_latch_t * latch); +extern int rw_latch_try_write(rw_latch_t * latch); + +/* __LATCH_H__ */ +#endif + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/lib.h linux-2.6.4-ck1/fs/reiser4/lib.h --- linux-2.6.4/fs/reiser4/lib.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/lib.h 2004-03-11 22:45:15.261514561 +1100 @@ -0,0 +1,75 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by 
reiser4/README */ + +#if !defined (__FS_REISER4_LIB_H__) +#define __FS_REISER4_LIB_H__ + +/* These 2 functions of 64 bit numbers division were taken from + include/sound/pcm.h */ + +/* Helper function for 64 bits numbers division. */ +static inline void +divl(__u32 high, __u32 low, __u32 div, __u32 * q, __u32 * r) +{ + __u64 n = (__u64) high << 32 | low; + __u64 d = (__u64) div << 31; + __u32 q1 = 0; + int c = 32; + + while (n > 0xffffffffU) { + q1 <<= 1; + if (n >= d) { + n -= d; + q1 |= 1; + } + d >>= 1; + c--; + } + q1 <<= c; + if (n) { + low = n; + *q = q1 | (low / div); + if (r) + *r = low % div; + } else { + if (r) + *r = 0; + *q = q1; + } + return; +} + +/* Function for 64 bits numbers division. */ +static inline __u64 +div64_32(__u64 n, __u32 div, __u32 * rem) +{ + __u32 low, high; + + low = n & 0xffffffff; + high = n >> 32; + if (high) { + __u32 high1 = high % div; + __u32 low1 = low; + high /= div; + divl(high1, low1, div, &low, rem); + return (__u64) high << 32 | low; + } else { + if (rem) + *rem = low % div; + return low / div; + } + + return 0; +} + +#endif /* __FS_REISER4_LIB_H__ */ + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/linux-5_reiser4_syscall.patch linux-2.6.4-ck1/fs/reiser4/linux-5_reiser4_syscall.patch --- linux-2.6.4/fs/reiser4/linux-5_reiser4_syscall.patch 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/linux-5_reiser4_syscall.patch 2004-03-11 22:45:15.261514561 +1100 @@ -0,0 +1,38 @@ +===== arch/um/kernel/sys_call_table.c 1.5 vs edited ===== +--- 1.5/arch/um/kernel/sys_call_table.c Wed Nov 6 17:36:22 2002 ++++ edited/arch/um/kernel/sys_call_table.c Fri Dec 6 22:15:35 2002 +@@ -232,6 +232,7 @@ + extern syscall_handler_t sys_io_cancel; + extern syscall_handler_t sys_exit_group; + extern syscall_handler_t sys_lookup_dcookie; ++extern syscall_handler_t sys_reiser4; + + #if CONFIG_NFSD + #define NFSSERVCTL sys_nfsserctl +@@ -483,6 +484,7 @@ + [ __NR_free_hugepages ] = sys_ni_syscall, + [ __NR_exit_group ] = sys_exit_group, + [ __NR_lookup_dcookie ] = sys_lookup_dcookie, ++ [ __NR_reiser4_sys_call ] = sys_reiser4, + + ARCH_SYSCALLS + [ LAST_SYSCALL + 1 ... 
NR_syscalls ] = +===== include/asm-i386/unistd.h 1.19 vs edited ===== +--- 1.19/include/asm-i386/unistd.h Thu Oct 31 18:28:28 2002 ++++ edited/include/asm-i386/unistd.h Fri Dec 6 22:45:24 2002 +@@ -262,6 +262,7 @@ + #define __NR_sys_epoll_ctl 255 + #define __NR_sys_epoll_wait 256 + #define __NR_remap_file_pages 257 ++#define __NR_reiser4_sys_call 258 + + + /* user-visible error numbers are in the range -1 - -124: see */ +@@ -378,6 +379,7 @@ + static inline _syscall1(int,close,int,fd) + static inline _syscall1(int,_exit,int,exitcode) + static inline _syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options) ++static inline _syscall1(long,_reiser4_sys_call,char*,p_strIng) + + #endif + diff -Naurp linux-2.6.4/fs/reiser4/lnode.c linux-2.6.4-ck1/fs/reiser4/lnode.c --- linux-2.6.4/fs/reiser4/lnode.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/lnode.c 2004-03-11 22:45:15.262514406 +1100 @@ -0,0 +1,431 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Lnode manipulation functions. */ +/* Lnode is light-weight node used as common data-structure by both VFS access + paths and reiser4() system call processing. + + One of the main targets of reiser4() system call is to allow manipulation + on potentially huge number of objects. This makes use of inode in reiser4() + impossible. On the other hand there is a need to synchronize reiser4() and + VFS access. + + To do this small object (lnode) is allocated (on the stack if possible) for + each object involved into reiser4() system call. Such lnode only contains + lock, information necessary to link it into global hash table, and + condition variable to wake up waiters (see below). + + In other words, lnode is handle that reiser4 keeps for a file system object + while object is being actively used. For example, when read is performed by + reiser4_read(), lnode exists for inode being read. 
When reiser4_read() + exits lnode is deleted, but inode is still there in the inode cache. + + As lnode only exists while object is being actively manipulated by some + threads, it follows that lnodes can always live on the stack of such + threads. + + Case-by-case: + + A. access through VFS (reiser4_{read|write|truncate|*}()): + + 1. operation starts with inode supplied by VFS. + + 2. lget( &local_lnode, LNODE_INODE, inode -> i_ino ) is called. This, + if necessary, will wait until sys_reiser4() access to this file is + finished, and + + 3. add lnode to the per super block hash table. + + B. creation of new inode in reiser4_iget(): + + 1. create new empty inode (iget(), or icreate()) + + 2. step A.3. A.2 is not necessary, because we are creating new object + and parent is in VFS access (hence sys_reiser4() cannot add/delete + objects in parent). + + 3. read stat data from disk and initialise inode + + C. sys_reiser4() access: + + 1. check for existing inode in a hash-table. + + Rationale: if inode is already here it is advantageous to use it, + because it already has information from stat data. + + If inode is found proceed as in case A. + + 2. otherwise, lget( &local_lnode, LNODE_LW, oid ) is called. + + + NOT FINISHED. + + + + + + + + INTERNAL NOTES: + + 1. fs/inode.c:inode_lock is not static: we can use it. Good. + + 2. but fs/inode.c:find_inode() is. Either write own version, or remove + static and EXPORT_SYMBOL-ize it. 
+ + + +*/ + +#include "debug.h" +#include "kcond.h" +#include "key.h" +#include "kassign.h" +#include "plugin/plugin_header.h" +#include "plugin/plugin_set.h" +#include "lnode.h" +#include "super.h" +#include "reiser4.h" + +#include /* for struct super_block */ +#include + +static reiser4_key *lnode_dentry_key(const lnode * node, reiser4_key * result); +static reiser4_key *lnode_inode_key(const lnode * node, reiser4_key * result); +static reiser4_key *lnode_lw_key(const lnode * node, reiser4_key * result); +static int lnode_inode_eq(const lnode * node1, const lnode * node2); +static int lnode_lw_eq(const lnode * node1, const lnode * node2); + +#if REISER4_DEBUG +static int lnode_valid_type(lnode_type type); +#endif + +/* Common operations for various types of lnodes. + + NOTE-NIKITA consider making this plugin. */ +static struct { + /* get a key of the corresponding file system object */ + reiser4_key *(*key) (const lnode * node, reiser4_key * result); + /* get a plugin suitable for the corresponding file system object */ + int (*get_plugins) (const lnode * node, plugin_set * area); + /* set a plugin suitable for the corresponding file system object */ + int (*set_plugins) (lnode * node, const plugin_set * area); + /* true if @node1 and @node2 refer to the same object */ + int (*eq) (const lnode * node1, const lnode * node2); +} lnode_ops[LNODE_NR_TYPES] = { + [LNODE_DENTRY] = { + .key = lnode_dentry_key, + .get_plugins = NULL, + .set_plugins = NULL, + .eq = NULL + }, + [LNODE_INODE] = { + .key = lnode_inode_key, + .get_plugins = NULL, + .set_plugins = NULL, + .eq = lnode_inode_eq + }, + /* + [LNODE_PSEUDO] = { + .key = NULL, + .get_plugins = NULL, + .set_plugins = NULL, + .eq = NULL + }, + */ + [LNODE_LW] = { + .key = lnode_lw_key, + .get_plugins = NULL, + .set_plugins = NULL, + .eq = lnode_lw_eq + } +}; + +/* hash table support */ + +/* compare two block numbers for equality. 
Used by hash-table macros */ +/* Audited by: green(2002.06.15) */ +static inline int +oid_eq(const oid_t * o1 /* first oid to compare */ , + const oid_t * o2 /* second oid to compare */ ) +{ + return *o1 == *o2; +} + +/* Hash lnode by its oid: the bucket is the oid masked with LNODE_HTABLE_BUCKETS - 1, which presumes that the bucket count is a power of two (TODO confirm). @table is not used. Used by hash-table macros */ +/* Audited by: green(2002.06.15) */ +static inline __u32 +oid_hash(ln_hash_table *table, const oid_t * o /* oid to hash */ ) +{ + return *o & (LNODE_HTABLE_BUCKETS - 1); +} + +/* The hash table definition: lnodes keyed by h.oid and linked through h.link */ +#define KMALLOC(size) kmalloc((size), GFP_KERNEL) +#define KFREE(ptr, size) kfree(ptr) +TYPE_SAFE_HASH_DEFINE(ln, lnode, oid_t, h.oid, h.link, oid_hash, oid_eq); +#undef KFREE +#undef KMALLOC + +ln_hash_table lnode_htable; +spinlock_t lnode_guard = SPIN_LOCK_UNLOCKED; + + +/* true if @required lnode type is compatible with @set lnode type. If lnode + types are incompatible, then thread trying to obtain @required type of + access will wait until all references (lnodes) of the @set type to the file + system object are released. + + For example, thread trying to manipulate object through VFS (@required type + is LNODE_INODE) will wait if object is currently manipulated through + reiser4() call (that is, there are lnodes with type LNODE_LW). + NOTE(review): as written, the function reports incompatibility only when @set is LNODE_LW and @required is not LNODE_INODE, so an LNODE_INODE request is treated as compatible with an existing LNODE_LW lnode --- the opposite of the example above. Confirm which of the two is intended. +*/ +/* Audited by: green(2002.06.15) */ +reiser4_internal int +lnode_compatible_type(lnode_type required /* required lnode type */ , + lnode_type set /* lnode type already set */ ) +{ + return !((set == LNODE_LW) && (required != LNODE_INODE)); +} + +/* initialise lnode module: set up the global lnode hash table with LNODE_HTABLE_BUCKETS buckets. The result of ln_hash_init() is not checked; always returns 0. */ +/* Audited by: green(2002.06.15) */ +reiser4_internal int +lnodes_init(void) +{ + ln_hash_init(&lnode_htable, LNODE_HTABLE_BUCKETS, NULL); + return 0; +} + +/* free lnode resources: tear down the global lnode hash table. Always returns 0. */ +/* Audited by: green(2002.06.15) */ +reiser4_internal int +lnodes_done(void) +{ + ln_hash_done(&lnode_htable); + return 0; +} + +/* Acquire handle to file system object. + + First check whether there is already lnode for this oid in a hash table. 
+ If no---initialise @node and add it into the hash table. If hash table + already contains lnode with such oid, and incompatible type, wait until + said lnode is deleted. If compatible lnode is found in the hash table, + increase its reference counter and return. + + + + +*/ +/* Audited by: green(2002.06.15) */ +reiser4_internal lnode * +lget( /*lnode * node , lnode to add to the hash table */ + lnode_type type /* lnode type */ , oid_t oid /* objectid */ ) +{ + lnode *result; + + // assert("nikita-1862", node != NULL); + assert("nikita-1866", lnode_valid_type(type)); + + spin_lock(&lnode_guard); + /* check hash table */ + while ((result = ln_hash_find(&lnode_htable, &oid)) != 0) { + if (!lnode_compatible_type(type, result->h.type)) { + int ret; + + /* if lnode is of incompatible type, wait until all + incompatible users go away. For example, if we are + requesting lnode for VFS access (and our @type is + LNODE_INODE), wait until all reiser4() system call + manipulations with this object finish. + */ + ret = kcond_wait(&result->h.cvar, &lnode_guard, 1); + if (ret != 0) { + result = ERR_PTR(ret); + break; + } + } else { + /* compatible lnode found in the hash table. Just + return it. */ + ++result->h.ref; + break; + } + } + if (result == NULL) { + /* lnode wasn't found in the hash table, initialise @node and + add it into hash table. 
*/ + /* lnode_guard (a spin lock) is held here, so the allocation must not sleep; on failure report -ENOMEM instead of writing through a NULL pointer */ + result = kmalloc(sizeof *result, GFP_ATOMIC); + if (result == NULL) { + spin_unlock(&lnode_guard); + return ERR_PTR(-ENOMEM); + } + xmemset(result, 0, sizeof *result); + result->h.type = type; + result->h.oid = oid; + kcond_init(&result->h.cvar); + result->h.ref = 1; + ln_hash_insert(&lnode_htable, result); + } + spin_unlock(&lnode_guard); + return result; +} + +/* release reference to file system object: drop one reference under lnode_guard and, when it was the last one, remove @node from the hash table, wake up everybody waiting on its condition variable and free it */ +/* Audited by: green(2002.06.15) */ +reiser4_internal void +lput(lnode * node /* lnode to release */ ) +{ + assert("nikita-1864", node != NULL); + assert("nikita-1961", lnode_valid_type(node->h.type)); /* man in + * a + * space */ + spin_lock(&lnode_guard); + assert("nikita-1878", ln_hash_find(&lnode_htable, &node->h.oid) == node); + if (--node->h.ref == 0) { + ln_hash_remove(&lnode_htable, node); + kcond_broadcast(&node->h.cvar); + /* NOTE(review): the waiters woken above sleep on node->h.cvar, which lives in the memory freed here --- confirm that kcond_wait() does not touch the condition variable after wake-up */ kfree(node); + } + spin_unlock(&lnode_guard); +} + /* take an additional reference to @node under lnode_guard; returns @node */ +reiser4_internal lnode * +lref(lnode * node) +{ + assert("nikita-3241", node != NULL); + assert("nikita-3242", lnode_valid_type(node->h.type)); + + spin_lock(&lnode_guard); + ++ node->h.ref; + spin_unlock(&lnode_guard); + return node; +} + +/* true if @node1 and @node2 refer to the same object: oids and types must match, then the per-type ->eq method decides. NOTE(review): ->eq is NULL for LNODE_DENTRY in lnode_ops, so comparing two dentry lnodes with equal oids would call through a NULL pointer --- confirm callers never do that. */ +/* Audited by: green(2002.06.15) */ +reiser4_internal int +lnode_eq(const lnode * node1 /* first node to compare */ , + const lnode * node2 /* second node to compare */ ) +{ + assert("nikita-1921", node1 != NULL); + assert("nikita-1922", node2 != NULL); /* Finnegans Wake started */ + + if (node1->h.oid != node2->h.oid) + return 0; + else if (node1->h.type != node2->h.type) + return 0; + else + return lnode_ops[node1->h.type].eq(node1, node2); +} + +/* return key of object behind @node: dispatches to the ->key method of the lnode type and returns what it returns */ +/* Audited by: green(2002.06.15) */ +reiser4_internal reiser4_key * +lnode_key(const lnode * node /* lnode to query */ , + reiser4_key * result /* result */ ) +{ + assert("nikita-1849", node != NULL); + assert("nikita-1855", lnode_valid_type(node->h.type)); + return lnode_ops[node->h.type].key(node, result); +} + +/* return plugins of object behind @node */ +/* 
Audited by: green(2002.06.15) */ +reiser4_internal int +get_lnode_plugins(const lnode * node /* lnode to query */ , + plugin_set * area /* result */ ) +{ + assert("nikita-1853", node != NULL); + assert("nikita-1858", lnode_valid_type(node->h.type)); + /* every ->get_plugins in lnode_ops is currently NULL: report the operation as unsupported instead of calling through a NULL pointer */ + if (lnode_ops[node->h.type].get_plugins == NULL) + return -EOPNOTSUPP; + return lnode_ops[node->h.type].get_plugins(node, area); +} + +/* set plugins of object behind @node; returns -EOPNOTSUPP when the lnode type has no ->set_plugins method */ +/* Audited by: green(2002.06.15) */ +reiser4_internal int +set_lnode_plugins(lnode * node /* lnode to modify */ , + const plugin_set * area /* plugins to install */ ) +{ + assert("nikita-1859", node != NULL); + assert("nikita-1860", lnode_valid_type(node->h.type)); + /* every ->set_plugins in lnode_ops is currently NULL: do not call through a NULL pointer */ + if (lnode_ops[node->h.type].set_plugins == NULL) + return -EOPNOTSUPP; + return lnode_ops[node->h.type].set_plugins(node, area); +} + +#if REISER4_DEBUG +/* true if @type is valid lnode type, that is, below LNODE_NR_TYPES */ +/* Audited by: green(2002.06.15) */ +static int +lnode_valid_type(lnode_type type /* would-be lnode type */ ) +{ + return type < LNODE_NR_TYPES; +} +#endif + +/* return key of object behind dentry-based @node: the stat-data key built by build_sd_key() for the dentry's inode. Defined static to match its forward declaration and its sibling ->key methods; it is only reached through lnode_ops. */ +static reiser4_key * +lnode_dentry_key(const lnode * node /* lnode to query */ , + reiser4_key * result /* result */ ) +{ + return build_sd_key(node->dentry.dentry->d_inode, result); +} + + + +/* return key of object behind inode-based @node */ +/* Audited by: green(2002.06.15) */ +static reiser4_key * +lnode_inode_key(const lnode * node /* lnode to query */ , + reiser4_key * result /* result */ ) +{ + return build_sd_key(node->inode.inode, result); +} + +/* return key of object behind lightweight @node: a copy of the key stored in the lnode itself */ +/* Audited by: green(2002.06.15) */ +static reiser4_key * +lnode_lw_key(const lnode * node /* lnode to query */ , + reiser4_key * result /* result */ ) +{ + *result = node->lw.key; + return result; +} + +/* compare two inodes */ +/* Audited by: green(2002.06.15) */ +static int +lnode_inode_eq(const lnode * node1 /* first node to compare */ , + const lnode * node2 /* second node to compare */ ) +{ + assert("nikita-1923", node1 != NULL); + assert("nikita-1924", node2 != NULL); + + assert("nikita-1927", 
node1->inode.inode != NULL); + assert("nikita-1928", node2->inode.inode != NULL); + + return (node1->inode.inode == node2->inode.inode); + +} + +/* compare two lw objects */ +/* Audited by: green(2002.06.15) */ +static int +lnode_lw_eq(const lnode * node1 UNUSED_ARG /* first node to + * compare */ , + const lnode * node2 UNUSED_ARG /* second node to + * compare */ ) +{ + assert("nikita-1925", node1 != NULL); + assert("nikita-1926", node2 != NULL); + + /* we only get there if oids are equal */ + assert("nikita-1929", node1->h.oid == node2->h.oid); + assert("nikita-1930", keyeq(&node1->lw.key, &node2->lw.key)); + return 1; +} + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/lnode.h linux-2.6.4-ck1/fs/reiser4/lnode.h --- linux-2.6.4/fs/reiser4/lnode.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/lnode.h 2004-03-11 22:45:15.263514250 +1100 @@ -0,0 +1,116 @@ +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Declaration of lnode (light-weight node). */ + +#ifndef __LNODE_H__ +#define __LNODE_H__ + +#include "forward.h" +#include "dformat.h" +#include "kcond.h" +#include "type_safe_hash.h" +#include "plugin/plugin_header.h" +#include "plugin/plugin_set.h" +#include "key.h" + +#include /* for __u?? */ +#include /* for struct super_block, etc. */ +#include /* for struct super_block, etc. */ + +typedef enum { + LNODE_DENTRY, + LNODE_INODE, +// LNODE_PSEUDO, + LNODE_REISER4_INODE, + LNODE_LW, + LNODE_NR_TYPES +} lnode_type; + +typedef union lnode lnode; + +/* declare hash table of lnode_lw's */ +TYPE_SAFE_HASH_DECLARE(ln, lnode); + +/* common part of various lnode types */ +typedef struct lnode_header { + /* lnode type. Taken from lnode_type enum. Never changed after + initialisation, so needs no locking. */ + __u8 type; + /* unused. Alignment requires this anyway. 
*/ + __u8 flags; + /* condition variable to wake up waiters */ + kcond_t cvar; + /* hash table linkage. Updated under hash-table spinlock. */ + ln_hash_link link; + /* objectid of underlying file system object. Never changed after + initialisation, so needs no locking. */ + oid_t oid; + /* reference counter. Updated under hash-table spinlock. */ + int ref; +} lnode_header; + +typedef struct lnode_dentry { + lnode_header h; + struct dentry *dentry; + struct vfsmount *mnt; +} lnode_dentry; + +typedef struct lnode_inode { + lnode_header h; + struct inode *inode; +} lnode_inode; + +typedef struct lnode_reiser4_inode { + lnode_header h; + struct reiser4_inode *inode; +} lnode_reiser4_inode; + +typedef struct lnode_lw { + lnode_header h; + struct super_block * lw_sb; + reiser4_key key; +} lnode_lw; + +#if 0 +typedef struct lnode_pseudo { + lnode_header h; + lnode *host; + /* something to identify pseudo file type, like name or plugin */ +} lnode_pseudo; +#endif + +union lnode { + lnode_header h; + lnode_dentry dentry; + lnode_inode inode; + lnode_reiser4_inode reiser4_inode; + lnode_lw lw; +// lnode_pseudo pseudo; +}; + +extern int lnodes_init(void); +extern int lnodes_done(void); + +extern lnode *lget( lnode_type type, oid_t oid); +extern void lput(lnode * node); +extern int lnode_eq(const lnode * node1, const lnode * node2); +extern lnode *lref(lnode * node); + +extern struct inode *inode_by_lnode(const lnode * node); +extern reiser4_key *lnode_key(const lnode * node, reiser4_key * result); + +extern int get_lnode_plugins(const lnode * node, plugin_set * area); +extern int set_lnode_plugins(lnode * node, const plugin_set * area); + +/* __LNODE_H__ */ +#endif + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/lock.c linux-2.6.4-ck1/fs/reiser4/lock.c --- linux-2.6.4/fs/reiser4/lock.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/lock.c 2004-03-11 22:45:15.266513784 +1100 @@ -0,0 +1,1443 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Traditional deadlock avoidance is achieved by acquiring all locks in a single order. V4 balances the tree from the bottom up, and searches the tree from the top down, and that is really the way we want it, so tradition won't work for us. + +Instead we have two lock orderings, a high priority lock ordering, and a low priority lock ordering. Each node in the tree has a lock in its znode. + + Suppose we have a set of processes which lock (R/W) tree nodes. Each + process has a set (maybe empty) of already locked nodes ("process locked + set"). Each process may have a pending lock request to a node locked by + another process. Note: we lock and unlock, but do not transfer locks: it is possible transferring locks instead would save some bus locking.... + + Deadlock occurs when we have a loop constructed from + process locked sets and lock request vectors. + + + NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in memory is extended with "znodes" with which we connect nodes with their left and right neighbors using sibling pointers stored in the znodes. When we perform balancing operations we often go from left to right and from right to left. 
+ + + +-P1-+ +-P3-+ + |+--+| V1 |+--+| + ||N1|| -------> ||N3|| + |+--+| |+--+| + +----+ +----+ + ^ | + |V2 |V3 + | v + +---------P2---------+ + |+--+ +--+| + ||N2| -------- |N4|| + |+--+ +--+| + +--------------------+ + + We solve this by ensuring that only low priority processes lock in top to + bottom order and from right to left, and high priority processes lock from + bottom to top and left to right. + + ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and + kill those damn busy loops. + ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom + stage) cannot be ordered that way. There are no rules what nodes can belong + to the atom and what nodes cannot. We cannot define what is right or left + direction, what is top or bottom. We can take immediate parent or side + neighbor of one node, but nobody guarantees that, say, left neighbor node is + not a far right neighbor for other nodes from the same atom. It breaks + deadlock avoidance rules and hi-low priority locking cannot be applied for + atom locks. + + How does it help to avoid deadlocks ? + + Suppose we have a deadlock with n processes. Processes from one priority + class never deadlock because they take locks in one consistent + order. + + So, any possible deadlock loop must have low priority as well as high + priority processes. There are no other lock priority levels except low and + high. We know that any deadlock loop contains at least one node locked by a + low priority process and requested by a high priority process. If this + situation is caught and resolved it is sufficient to avoid deadlocks. + + V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION. + + The deadlock prevention algorithm is based on comparing + priorities of node owners (processes which keep znode locked) and + requesters (processes which want to acquire a lock on znode). We + implement a scheme where low-priority owners yield locks to + high-priority requesters. 
We created a signal passing system that + is used to ask low-priority processes to yield one or more locked + znodes. + + The condition when a znode needs to change its owners is described by the + following formula: + + ############################################# + # # + # (number of high-priority requesters) > 0 # + # AND # + # (number of high-priority owners) == 0 # + # # + ############################################# + + Note that a low-priority process + delays node releasing if another high-priority process owns this node. So, slightly more strictly speaking, to have a deadlock-capable cycle you must have a loop in which a high priority process is waiting on a low priority process to yield a node, which is slightly different from saying a high priority process is waiting on a node owned by a low priority process. + + It is enough to avoid deadlocks if we prevent any low-priority process from + falling asleep if its locked set contains a node which satisfies the + deadlock condition. + + That condition is implicitly or explicitly checked in all places where new + high-priority requests may be added or removed from node request queue or + high-priority process takes or releases a lock on node. The main + goal of these checks is to never miss the moment when a node starts to "have + wrong owners" and to send "must-yield-this-lock" signals to its low-pri owners + at that time. + + The information about received signals is stored in the per-process + structure (lock stack) and analyzed before a low-priority process goes to + sleep but after a "fast" attempt to lock a node fails. Any signal wakes + the sleeping process up and forces it to re-check lock status and received + signal info. If "must-yield-this-lock" signals were received the locking + primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code. 
+ + V4 LOCKING DRAWBACKS + + If we have already balanced on one level, and we are propagating our changes upward to a higher level, it could be + very messy to surrender all locks on the lower level because we put so much computational work into it, and reverting + them to their state before they were locked might be very complex. We also don't want to acquire all locks before + performing balancing because that would either be almost as much work as the balancing, or it would be too + conservative and lock too much. We want balancing to be done only at high priority. Yet, we might want to go to the + left one node and use some of its empty space... So we make one attempt at getting the node to the left using + try_lock, and if it fails we do without it, because we didn't really need it, it was only a nice to have. + + LOCK STRUCTURES DESCRIPTION + + The following data structures are used in the reiser4 locking + implementation: + + All fields related to long-term locking are stored in znode->lock. + + The lock stack is a per thread object. It owns all znodes locked by the + thread. One znode may be locked by several threads in case of read lock or + one znode may be write locked by one thread several times. The special link + objects (lock handles) support n<->m relation between znodes and lock + owners. + + + + +---------+ +---------+ + | LS1 | | LS2 | + +---------+ +---------+ + ^ ^ + |---------------+ +----------+ + v v v v + +---------+ +---------+ +---------+ +---------+ + | LH1 | | LH2 | | LH3 | | LH4 | + +---------+ +---------+ +---------+ +---------+ + ^ ^ ^ ^ + | +------------+ | + v v v + +---------+ +---------+ +---------+ + | Z1 | | Z2 | | Z3 | + +---------+ +---------+ +---------+ + + Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The picture above shows that lock stack LS1 has a + list of 2 lock handles LH1 and LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. 
Znode Z1 is + locked by only one thread, znode has only one lock handle LH1 on its list, similar situation is for Z3 which is + locked by the thread 2 only. Z2 is locked (for read) twice by different threads and two lock handles are on its + list. Each lock handle represents a single relation of a locking of a znode by a thread. Locking of a znode is an + establishing of a locking relation between the lock stack and the znode by adding of a new lock handle to a list of + lock handles, the lock stack. The lock stack links all lock handles for all znodes locked by the lock stack. The znode + list groups all lock handles for all lock stacks which locked the znode. + + Yet another relation may exist between znode and lock owners. If lock + procedure cannot immediately take lock on an object it adds the lock owner + to the special `requestors' list belonging to the znode. That list represents a + queue of pending lock requests. Because one lock owner may request + only one lock object at a time, it is a 1->n relation between lock objects + and a lock owner implemented as it is described above. Full information + (priority, pointers to lock and link objects) about each lock request is + stored in lock owner structure in `request' field. + + SHORT_TERM LOCKING + + This is a list of primitive operations over lock stacks / lock handles / + znodes and locking descriptions for them. + + 1. locking / unlocking which is done by two list insertions/deletions, one + to/from znode's list of lock handles, another one is to/from lock stack's + list of lock handles. The first insertion is protected by + znode->lock.guard spinlock. The list owned by the lock stack can be + modified only by the thread that owns the lock stack and nobody else can + modify/read it. There is nothing to be protected by a spinlock or + something else. + + 2. adding/removing a lock request to/from znode requesters list. The rule is + that znode->lock.guard spinlock should be taken for this. + + 3. 
we can traverse list of lock handles and use references to lock stacks that + locked the given znode if znode->lock.guard spinlock is taken. + + 4. If a lock stack is associated with a znode as a lock requestor or lock + owner its existence is guaranteed by znode->lock.guard spinlock. Some of its + (lock stack's) fields should be protected from being accessed in parallel + by two or more threads. Please look at lock_stack structure definition + for the info how those fields are protected. */ + +/* Znode lock and capturing intertwining. */ +/* In current implementation we capture formatted nodes before locking + them. Take a look at longterm_lock_znode(): the try_capture() request precedes + locking requests. The longterm_lock_znode function unconditionally captures + znode before even checking the locking conditions. + + Another variant is to capture znode after locking it. It was not tested, but + at least one deadlock condition is supposed to be there. One thread has + locked a znode (Node-1) and calls try_capture() for it. Try_capture() sleeps + because znode's atom has CAPTURE_WAIT state. Second thread is a flushing + thread, its current atom is the atom Node-1 belongs to. Second thread wants + to lock Node-1 and sleeps because Node-1 is locked by the first thread. The + described situation is a deadlock. 
*/ + +#include "debug.h" +#include "txnmgr.h" +#include "znode.h" +#include "jnode.h" +#include "tree.h" +#include "plugin/node/node.h" +#include "super.h" + +#include <linux/spinlock.h> /* spin_lock(), spin_unlock(). NOTE(review): the header name was missing from this directive and was restored from usage; confirm against upstream lock.c */ + +#if REISER4_DEBUG +static int request_is_deadlock_safe(znode *, znode_lock_mode, + znode_lock_request); +#endif + +#define ADDSTAT(node, counter) \ + reiser4_stat_inc_at_level(znode_get_level(node), znode.counter) + +/* Returns the lock owner (lock stack) associated with the current thread: the ->stack member of the current reiser4 context */ +reiser4_internal lock_stack * +get_current_lock_stack(void) +{ + return &get_current_context()->stack; +} + +/* Wakes up all low priority owners informing them about possible deadlock. Caller must hold @node's zlock (asserted). Every handle on the owners list is visited under its owner's stack lock: the handle is marked signaled at most once, which is counted in the owner's nr_signaled, and the owner is woken. */ +static void +wake_up_all_lopri_owners(znode * node) +{ + lock_handle *handle; + + assert("nikita-1824", rw_zlock_is_locked(&node->lock)); + for_all_type_safe_list(owners, &node->lock.owners, handle) { + spin_lock_stack(handle->owner); + + assert("nikita-1832", handle->node == node); + /* count this signal in owner->nr_signaled, but only once per handle */ + if (!handle->signaled) { + handle->signaled = 1; + atomic_inc(&handle->owner->nr_signaled); + } + /* wake up the process owning this handle */ + __reiser4_wake_up(handle->owner); + + spin_unlock_stack(handle->owner); + } +} + +/* Adds a lock to a lock owner, which means creating a link to the lock and + putting the link into the two lists all links are on (the doubly linked list + that forms the lock_stack, and the doubly linked list of links attached + to a lock. 
+*/ +static inline void +link_object(lock_handle * handle, lock_stack * owner, znode * node) +{ + assert("jmacd-810", handle->owner == NULL); + assert("nikita-1828", owner == get_current_lock_stack()); + assert("nikita-1830", rw_zlock_is_locked(&node->lock)); + + handle->owner = owner; + handle->node = node; + locks_list_push_back(&owner->locks, handle); /* link on the lock stack's list */ + owners_list_push_front(&node->lock.owners, handle); /* link on the znode's owners list */ + handle->signaled = 0; /* a fresh link has not been asked to yield yet */ +} + +/* Breaks a relation between a lock and its owner: removes @handle from both lists it was put on by link_object() and marks it free. Must be called by the owning thread with the znode's zlock held (asserted). */ +static inline void +unlink_object(lock_handle * handle) +{ + assert("zam-354", handle->owner != NULL); + assert("nikita-1608", handle->node != NULL); + assert("nikita-1633", rw_zlock_is_locked(&handle->node->lock)); + assert("nikita-1829", handle->owner == get_current_lock_stack()); + + locks_list_remove_clean(handle); + owners_list_remove_clean(handle); + + /* indicates that lock handle is free now */ + handle->owner = NULL; +} + +/* Actually locks an object knowing that we are able to do this. Node and mode are taken from owner->request. nr_readers is incremented for a read lock and decremented for a write lock, so a negative value counts (possibly recursive) write locks. The request's handle is then linked in and, if @owner currently has high priority, it is counted in nr_hipri_owners. */ +static void +lock_object(lock_stack * owner) +{ + lock_request *request; + znode *node; + assert("nikita-1839", owner == get_current_lock_stack()); + + request = &owner->request; + node = request->node; + assert("nikita-1834", rw_zlock_is_locked(&node->lock)); + if (request->mode == ZNODE_READ_LOCK) { + node->lock.nr_readers++; + } else { + /* check that we are not switching from read to write lock */ + assert("nikita-1840", node->lock.nr_readers <= 0); + /* We allow recursive locking; a node can be locked several + times for write by same process */ + node->lock.nr_readers--; + } + + link_object(request->handle, owner, node); + + if (owner->curpri) { + node->lock.nr_hipri_owners++; + } + ON_TRACE(TRACE_LOCKS, + "%spri lock: %p node: %p: hipri_owners: %u: nr_readers: %d\n", + owner->curpri ? 
"hi" : "lo", owner, node, node->lock.nr_hipri_owners, node->lock.nr_readers); +} + +/* Check for recursive write locking: returns non-zero when the handle at the front of the requested node's owners list belongs to @owner. Caller holds the zlock and the node must already have at least one owner (both asserted). */ +static int +recursive(lock_stack * owner) +{ + int ret; + znode *node; + + node = owner->request.node; + + /* Owners list is not empty for a locked node */ + assert("zam-314", !owners_list_empty(&node->lock.owners)); + assert("nikita-1841", owner == get_current_lock_stack()); + assert("nikita-1848", rw_zlock_is_locked(&node->lock)); + + ret = (owners_list_front(&node->lock.owners)->owner == owner); + + /* Recursive read locking should be done usual way */ + assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK); + /* mixing of read/write locks is not allowed */ + assert("zam-341", !ret || znode_is_wlocked(node)); + + return ret; +} + +#if REISER4_DEBUG +/* Returns true if the lock is held by the calling thread. Debug-only: scans the handles on the current lock stack, under the stack lock, looking for one that points at @node. */ +int +znode_is_any_locked(const znode * node) +{ + lock_handle *handle; + lock_stack *stack; + int ret; + + if (!znode_is_locked(node)) { + return 0; + } + + stack = get_current_lock_stack(); + + spin_lock_stack(stack); + + ret = 0; + + for_all_type_safe_list(locks, &stack->locks, handle) { + if (handle->node == node) { + ret = 1; + break; + } + } + + spin_unlock_stack(stack); + + return ret; +} + +#endif + +/* Returns true if a write lock is held by the calling thread. Only the handle at the front of the owners list is examined. */ +reiser4_internal int +znode_is_write_locked(const znode * node) +{ + lock_stack *stack; + lock_handle *handle; + + assert("jmacd-8765", node != NULL); + + if (!znode_is_wlocked(node)) { + return 0; + } + + stack = get_current_lock_stack(); + + /* If it is write locked, then all owner handles must equal the current stack. */ + handle = owners_list_front(&node->lock.owners); + + return (handle->owner == stack); +} + +/* This "deadlock" condition is the essential part of reiser4 locking + implementation. 
This condition is checked explicitly by calling + check_deadlock_condition() or implicitly in all places where znode lock + state (set of owners and request queue) is changed. Locking code is + designed to use this condition to trigger procedure of passing object from + low priority owner(s) to high priority one(s). + + The procedure results in passing an event (setting lock_handle->signaled + flag) and counting this event in nr_signaled field of owner's lock stack + object and wakeup owner's process. +*/ +static inline int +check_deadlock_condition(znode * node) +{ + assert("nikita-1833", rw_zlock_is_locked(&node->lock)); + return node->lock.nr_hipri_requests > 0 && node->lock.nr_hipri_owners == 0; /* some high priority request is pending while no high priority owner holds the node */ +} + +/* checks lock/request compatibility for the request stored in owner->request. Caller must hold the znode's zlock (asserted). Returns 0 when the lock can be taken; -EINVAL when the node is marked JNODE_IS_DYING; -E_REPEAT when a low priority requester meets the deadlock condition or the requested mode is not compatible with the current lock state. */ +static int +check_lock_object(lock_stack * owner) +{ + znode *node = owner->request.node; + + assert("nikita-1842", owner == get_current_lock_stack()); + assert("nikita-1843", rw_zlock_is_locked(&node->lock)); + + /* See if the node is disconnected. */ + if (unlikely(ZF_ISSET(node, JNODE_IS_DYING))) { + ON_TRACE(TRACE_LOCKS, "attempt to lock dying znode: %p", node); + return RETERR(-EINVAL); + } + + /* Do not ever try to take a lock if we are going in low priority + direction and a node have a high priority request without high + priority owners. */ + if (unlikely(!owner->curpri && check_deadlock_condition(node))) { + return RETERR(-E_REPEAT); + } + + if (unlikely(!is_lock_compatible(node, owner->request.mode))) { + return RETERR(-E_REPEAT); + } + + return 0; +} + +static int +can_lock_object(lock_stack * owner) +{ + int result; + znode *node = owner->request.node; + + result = check_lock_object(owner); /* the verdict is returned unchanged; the rest is accounting only */ + if (REISER4_STATS && znode_get_level(node) > 0) { + if (result != 0) + ADDSTAT(node, lock_contented); /* lock could not be taken right now */ + else + ADDSTAT(node, lock_uncontented); + } + return result; +} + +/* Setting of a high priority to the process. 
It clears "signaled" flags + because znode locked by high-priority process can't satisfy our "deadlock + condition". */ +static void +set_high_priority(lock_stack * owner) +{ + assert("nikita-1846", owner == get_current_lock_stack()); + /* Do nothing if current priority is already high */ + if (!owner->curpri) { + /* We don't need locking for owner->locks list, because, this + * function is only called with the lock stack of the current + * thread, and no other thread can play with owner->locks list + * and/or change ->node pointers of lock handles in this list. + * + * (Interrupts also are not involved.) + */ + lock_handle *item = locks_list_front(&owner->locks); + while (!locks_list_end(&owner->locks, item)) { + znode *node = item->node; + + WLOCK_ZLOCK(&node->lock); + + node->lock.nr_hipri_owners++; + + ON_TRACE(TRACE_LOCKS, + "set_hipri lock: %p node: %p: hipri_owners after: %u nr_readers: %d\n", + item, node, node->lock.nr_hipri_owners, node->lock.nr_readers); + + /* we can safely set signaled to zero, because + previous statement (nr_hipri_owners ++) guarantees + that signaled will be never set again. */ + item->signaled = 0; + WUNLOCK_ZLOCK(&node->lock); + + item = locks_list_next(item); + } + owner->curpri = 1; + atomic_set(&owner->nr_signaled, 0); /* every handle's signaled flag was cleared in the loop above, so the counter restarts from zero */ + } +} + +/* Sets a low priority to the process. For every lock held by @owner the node's nr_hipri_owners is decremented under the node's zlock; if that leaves the node in the deadlock condition, the owner's own handle is marked signaled and counted in owner->nr_signaled. */ +static void +set_low_priority(lock_stack * owner) +{ + assert("nikita-3075", owner == get_current_lock_stack()); + /* Do nothing if current priority is already low */ + if (owner->curpri) { + /* scan all locks (lock handles) held by @owner, which is + actually current thread, and check whether we are reaching + deadlock possibility anywhere. + */ + lock_handle *handle = locks_list_front(&owner->locks); + while (!locks_list_end(&owner->locks, handle)) { + znode *node = handle->node; + WLOCK_ZLOCK(&node->lock); + /* this thread just was hipri owner of @node, so + nr_hipri_owners has to be greater than zero. 
*/ + ON_TRACE(TRACE_LOCKS, + "set_lopri lock: %p node: %p: hipri_owners before: %u nr_readers: %d\n", + handle, node, node->lock.nr_hipri_owners, node->lock.nr_readers); + assert("nikita-1835", node->lock.nr_hipri_owners > 0); + node->lock.nr_hipri_owners--; + /* If we have deadlock condition, adjust a nr_signaled + field. It is enough to set "signaled" flag only for + current process, other low-pri owners will be + signaled and woken up after current process unlocks + this object and any high-priority requestor takes + control. */ + if (check_deadlock_condition(node) + && !handle->signaled) { + handle->signaled = 1; + atomic_inc(&owner->nr_signaled); + } + WUNLOCK_ZLOCK(&node->lock); + handle = locks_list_next(handle); + } + owner->curpri = 0; + } +} + +#define MAX_CONVOY_SIZE ((unsigned)(NR_CPUS - 1)) /* capacity of the convoy[] array in wake_up_requestor(); #undef'ed right after that function */ + +/* helper function used by longterm_unlock_znode() to wake up requestor(s). */ +/* + * In certain multi threaded work loads jnode spin lock is the most + * contented one. Wake up of threads waiting for znode is, thus, + * important to do right. There are three well known strategies: + * + * (1) direct hand-off. Hasn't been tried. + * + * (2) wake all (thundering herd). This degrades performance in our + * case. + * + * (3) wake one. Simplest solution where requestor in the front of + * requestors list is awaken under znode spin lock is not very + * good on the SMP, because first thing requestor will try to do + * after waking up on another CPU is to acquire znode spin lock + * that is still held by this thread. As an optimization we grab + * lock stack spin lock, release znode spin lock and wake + * requestor. done_context() synchronize against stack spin lock + * to avoid (impossible) case where requestor has been waked by + * some other thread (wake_up_all_lopri_owners(), or something + * similar) and managed to exit before we waked it up. + * + * Effect of this optimization wasn't big, after all. 
+ * + */ +static void +wake_up_requestor(znode *node) +{ +#if NR_CPUS > 2 + requestors_list_head *creditors; + lock_stack *convoy[MAX_CONVOY_SIZE]; + int convoyused; + int convoylimit; + + assert("nikita-3180", node != NULL); + assert("nikita-3181", rw_zlock_is_locked(&node->lock)); + + ADDSTAT(node, wakeup); + + convoyused = 0; + convoylimit = min(num_online_cpus() - 1, MAX_CONVOY_SIZE); + creditors = &node->lock.requestors; + if (!requestors_list_empty(creditors)) { + convoy[0] = requestors_list_front(creditors); + convoyused = 1; + ADDSTAT(node, wakeup_found); + /* + * it has been verified experimentally, that there are no + * convoys on the leaf level. + */ + if (znode_get_level(node) != LEAF_LEVEL && + convoy[0]->request.mode == ZNODE_READ_LOCK && + convoylimit > 1) { + lock_stack *item; + + ADDSTAT(node, wakeup_found_read); + for (item = requestors_list_next(convoy[0]); + ! requestors_list_end(creditors, item); + item = requestors_list_next(item)) { + ADDSTAT(node, wakeup_scan); + if (item->request.mode == ZNODE_READ_LOCK) { + ADDSTAT(node, wakeup_convoy); + convoy[convoyused] = item; + ++ convoyused; + /* + * it is safe to spin lock multiple + * lock stacks here, because lock + * stack cannot sleep on more than one + * requestors queue. + */ + /* + * use raw spin_lock in stead of macro + * wrappers, because spin lock + * profiling code cannot cope with so + * many locks held at the same time. 
+ */ + spin_lock(&item->sguard.lock); + if (convoyused == convoylimit) + break; + } + } + } + spin_lock(&convoy[0]->sguard.lock); + } + + WUNLOCK_ZLOCK(&node->lock); + + while (convoyused > 0) { + -- convoyused; + __reiser4_wake_up(convoy[convoyused]); + spin_unlock(&convoy[convoyused]->sguard.lock); + } +#else + /* uniprocessor case: keep it simple */ + if (!requestors_list_empty(&node->lock.requestors)) { + lock_stack *requestor; + + requestor = requestors_list_front(&node->lock.requestors); + reiser4_wake_up(requestor); + } + + WUNLOCK_ZLOCK(&node->lock); +#endif +} + +#undef MAX_CONVOY_SIZE + +reiser4_internal void +longterm_unlock_znode(lock_handle * handle) +{ + znode *node = handle->node; + lock_stack *oldowner = handle->owner; + int hipri; + int readers; + int rdelta; + int youdie; + + assert("jmacd-1021", handle != NULL); + assert("jmacd-1022", handle->owner != NULL); + assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode)); + + assert("zam-130", oldowner == get_current_lock_stack()); + + LOCK_CNT_DEC(long_term_locked_znode); + + ADDSTAT(node, unlock); + + hipri = oldowner->curpri ? -1 : 0; + readers = node->lock.nr_readers; + rdelta = (readers > 0) ? -1 : +1; + youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0); + + WLOCK_ZLOCK(&node->lock); + + assert("zam-101", znode_is_locked(node)); + + /* Adjust a number of high priority owners of this lock */ + node->lock.nr_hipri_owners += hipri; + assert("nikita-1836", node->lock.nr_hipri_owners >= 0); + + ON_TRACE(TRACE_LOCKS, + "%spri unlock: %p node: %p: hipri_owners: %u nr_readers %d\n", + oldowner->curpri ? "hi" : "lo", + handle, + node, + node->lock.nr_hipri_owners, + node->lock.nr_readers); + + /* Handle znode deallocation on last write-lock release. 
*/ + if (znode_is_wlocked_once(node)) { + if (youdie) { + forget_znode(handle); + assert("nikita-2191", znode_invariant(node)); + zput(node); + return; + } + ON_DEBUG_MODIFY(znode_post_write(node)); + } + + if (handle->signaled) + atomic_dec(&oldowner->nr_signaled); + + /* Unlocking means owner<->object link deletion */ + unlink_object(handle); + + /* This is enough to be sure whether an object is completely + unlocked. */ + node->lock.nr_readers += rdelta; + + /* If the node is locked it must have an owners list. Likewise, if + the node is unlocked it must have an empty owners list. */ + assert("zam-319", equi(znode_is_locked(node), + !owners_list_empty(&node->lock.owners))); + + /* If there are pending lock requests we wake up a requestor */ + if (!znode_is_wlocked(node)) + wake_up_requestor(node); + else + WUNLOCK_ZLOCK(&node->lock); + + assert("nikita-3182", rw_zlock_is_not_locked(&node->lock)); + /* minus one reference from handle->node */ + handle->node = NULL; + assert("nikita-2190", znode_invariant(node)); + ON_DEBUG(check_lock_data()); + ON_DEBUG(check_lock_node_data(node)); + zput(node); +} + +static int +lock_tail(lock_stack *owner, int wake_up_next, int ok, znode_lock_mode mode) +{ + znode *node = owner->request.node; + + assert("jmacd-807", rw_zlock_is_locked(&node->lock)); + + /* If we broke with (ok == 0) it means we can_lock, now do it. */ + if (ok == 0) { + lock_object(owner); + owner->request.mode = 0; + if (mode == ZNODE_READ_LOCK) + wake_up_next = 1; + } + + if (wake_up_next) + wake_up_requestor(node); + else + WUNLOCK_ZLOCK(&node->lock); + + if (ok == 0) { + /* count a reference from lockhandle->node + + znode was already referenced at the entry to this function, + hence taking spin-lock here is not necessary (see comment + in the zref()). 
+ */ + zref(node); + + LOCK_CNT_INC(long_term_locked_znode); + if (REISER4_DEBUG_NODE && mode == ZNODE_WRITE_LOCK) { + node_check(node, 0); + ON_DEBUG_MODIFY(znode_pre_write(node)); + } + } + + ON_DEBUG(check_lock_data()); + ON_DEBUG(check_lock_node_data(node)); + return ok; +} + +static int +longterm_lock_tryfast(lock_stack * owner) +{ + int result; + int wake_up_next = 0; + znode *node; + zlock *lock; + + node = owner->request.node; + lock = &node->lock; + + assert("nikita-3340", schedulable()); + assert("nikita-3341", request_is_deadlock_safe(node, + ZNODE_READ_LOCK, + ZNODE_LOCK_LOPRI)); + + result = UNDER_RW(zlock, lock, read, can_lock_object(owner)); + + if (likely(result != -EINVAL)) { + spin_lock_znode(node); + result = try_capture( + ZJNODE(node), ZNODE_READ_LOCK, 0, 1/* can copy on capture */); + spin_unlock_znode(node); + WLOCK_ZLOCK(lock); + if (unlikely(result != 0)) { + owner->request.mode = 0; + wake_up_next = 1; + } else { + result = can_lock_object(owner); + if (unlikely(result == -E_REPEAT)) { + /* fall back to longterm_lock_znode() */ + WUNLOCK_ZLOCK(lock); + return 1; + } + } + return lock_tail(owner, wake_up_next, result, ZNODE_READ_LOCK); + } else + return 1; +} + +/* locks given lock object */ +reiser4_internal int +longterm_lock_znode( + /* local link object (allocated by lock owner thread, usually on its own + * stack) */ + lock_handle * handle, + /* znode we want to lock. */ + znode * node, + /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */ + znode_lock_mode mode, + /* {0, -EINVAL, -E_DEADLOCK}, see return codes description. */ + znode_lock_request request) +{ + int ret; + int hipri = (request & ZNODE_LOCK_HIPRI) != 0; + int wake_up_next = 0; + int non_blocking = 0; + int has_atom; + txn_capture cap_flags; + zlock *lock; + txn_handle *txnh; + tree_level level; + + /* Get current process context */ + lock_stack *owner = get_current_lock_stack(); + + /* Check that the lock handle is initialized and isn't already being used. 
*/ + assert("jmacd-808", handle->owner == NULL); + assert("nikita-3026", schedulable()); + assert("nikita-3219", request_is_deadlock_safe(node, mode, request)); + + cap_flags = 0; + if (request & ZNODE_LOCK_NONBLOCK) { + cap_flags |= TXN_CAPTURE_NONBLOCKING; + non_blocking = 1; + } + + if (request & ZNODE_LOCK_DONT_FUSE) + cap_flags |= TXN_CAPTURE_DONT_FUSE; + + /* If we are changing our process priority we must adjust a number + of high priority owners for each znode that we already lock */ + if (hipri) { + set_high_priority(owner); + } else { + set_low_priority(owner); + } + + level = znode_get_level(node); + ADDSTAT(node, lock); + + /* Fill request structure with our values. */ + owner->request.mode = mode; + owner->request.handle = handle; + owner->request.node = node; + + txnh = get_current_context()->trans; + lock = &node->lock; + + if (mode == ZNODE_READ_LOCK && request == 0) { + ret = longterm_lock_tryfast(owner); + if (ret <= 0) + return ret; + } + + has_atom = (txnh->atom != NULL); + + if (REISER4_STATS) { + if (mode == ZNODE_READ_LOCK) + ADDSTAT(node, lock_read); + else + ADDSTAT(node, lock_write); + + if (hipri) + ADDSTAT(node, lock_hipri); + else + ADDSTAT(node, lock_lopri); + } + + /* Synchronize on node's zlock guard lock. */ + WLOCK_ZLOCK(lock); + + if (znode_is_locked(node) && + mode == ZNODE_WRITE_LOCK && recursive(owner)) + return lock_tail(owner, 0, 0, mode); + + for (;;) { + ADDSTAT(node, lock_iteration); + + /* Check the lock's availability: if it is unavaiable we get E_REPEAT, 0 + indicates "can_lock", otherwise the node is invalid. */ + ret = can_lock_object(owner); + + if (unlikely(ret == -EINVAL)) { + /* @node is dying. Leave it alone. */ + /* wakeup next requestor to support lock invalidating */ + wake_up_next = 1; + ADDSTAT(node, lock_dying); + break; + } + + if (unlikely(ret == -E_REPEAT && non_blocking)) { + /* either locking of @node by the current thread will + * lead to the deadlock, or lock modes are + * incompatible. 
*/ + ADDSTAT(node, lock_cannot_lock); + break; + } + + assert("nikita-1844", (ret == 0) || ((ret == -E_REPEAT) && !non_blocking)); + /* If we can get the lock... Try to capture first before + taking the lock.*/ + + /* first handle commonest case where node and txnh are already + * in the same atom. */ + /* safe to do without taking locks, because: + * + * 1. read of aligned word is atomic with respect to writes to + * this word + * + * 2. false negatives are handled in try_capture(). + * + * 3. false positives are impossible. + * + * PROOF: left as an exercise to the curious reader. + * + * Just kidding. Here is one: + * + * At the time T0 txnh->atom is stored in txnh_atom. + * + * At the time T1 node->atom is stored in node_atom. + * + * At the time T2 we observe that + * + * txnh_atom != NULL && node_atom == txnh_atom. + * + * Imagine that at this moment we acquire node and txnh spin + * lock in this order. Suppose that under spin lock we have + * + * node->atom != txnh->atom, (S1) + * + * at the time T3. + * + * txnh->atom != NULL still, because txnh is open by the + * current thread. + * + * Suppose node->atom == NULL, that is, node was un-captured + * between T1, and T3. But un-capturing of formatted node is + * always preceded by the call to invalidate_lock(), which + * marks znode as JNODE_IS_DYING under zlock spin + * lock. Contradiction, because can_lock_object() above checks + * for JNODE_IS_DYING. Hence, node->atom != NULL at T3. + * + * Suppose that node->atom != node_atom, that is, atom, node + * belongs to was fused into another atom: node_atom was fused + * into node->atom. Atom of txnh was equal to node_atom at T2, + * which means that under spin lock, txnh->atom == node->atom, + * because txnh->atom can only follow fusion + * chain. Contradicts S1. + * + * The same for hypothesis txnh->atom != txnh_atom. Hence, + * node->atom == node_atom == txnh_atom == txnh->atom. Again + * contradicts S1. Hence S1 is false. QED. 
+ * + */ + + if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) { + ADDSTAT(node, lock_no_capture); + } else { + /* + * unlock zlock spin lock here. It is possible for + * longterm_unlock_znode() to sneak in here, but there + * is no harm: invalidate_lock() will mark znode as + * JNODE_IS_DYING and this will be noted by + * can_lock_object() below. + */ + WUNLOCK_ZLOCK(lock); + spin_lock_znode(node); + ret = try_capture( + ZJNODE(node), mode, cap_flags, 1/* can copy on capture*/); + spin_unlock_znode(node); + WLOCK_ZLOCK(lock); + if (unlikely(ret != 0)) { + /* In the failure case, the txnmgr releases + the znode's lock (or in some cases, it was + released a while ago). There's no need to + reacquire it so we should return here, + avoid releasing the lock. */ + owner->request.mode = 0; + /* next requestor may not fail */ + wake_up_next = 1; + break; + } + + /* Check the lock's availability again -- this is + because under some circumstances the capture code + has to release and reacquire the znode spinlock. */ + ret = can_lock_object(owner); + } + + /* This time, a return of (ret == 0) means we can lock, so we + should break out of the loop. */ + if (likely(ret != -E_REPEAT || non_blocking)) { + ADDSTAT(node, lock_can_lock); + break; + } + + /* Lock is unavailable, we have to wait. */ + + /* By having semaphore initialization here we cannot lose + wakeup signal even if it comes after `nr_signaled' field + check. 
*/ + ret = prepare_to_sleep(owner); + if (unlikely(ret != 0)) { + break; + } + + assert("nikita-1837", rw_zlock_is_locked(&node->lock)); + if (hipri) { + /* If we are going in high priority direction then + increase high priority requests counter for the + node */ + lock->nr_hipri_requests++; + /* If there are no high priority owners for a node, + then immediately wake up low priority owners, so + they can detect possible deadlock */ + if (lock->nr_hipri_owners == 0) + wake_up_all_lopri_owners(node); + /* And prepare a lock request */ + requestors_list_push_front(&lock->requestors, owner); + } else { + /* If we are going in low priority direction then we + set low priority to our process. This is the only + case when a process may become low priority */ + /* And finally prepare a lock request */ + requestors_list_push_back(&lock->requestors, owner); + } + + /* Ok, here we have prepared a lock request, so unlock + a znode ...*/ + WUNLOCK_ZLOCK(lock); + /* ... and sleep */ + go_to_sleep(owner, level); + + WLOCK_ZLOCK(lock); + + if (hipri) { + assert("nikita-1838", lock->nr_hipri_requests > 0); + lock->nr_hipri_requests--; + } + + requestors_list_remove(owner); + } + + assert("jmacd-807/a", rw_zlock_is_locked(&node->lock)); + return lock_tail(owner, wake_up_next, ret, mode); +} + +/* lock object invalidation means changing of lock object state to `INVALID' + and waiting for all other processes to cancel theirs lock requests. */ +reiser4_internal void +invalidate_lock(lock_handle * handle /* path to lock + * owner and lock + * object is being + * invalidated. 
*/ ) +{ + znode *node = handle->node; + lock_stack *owner = handle->owner; + lock_stack *rq; + + assert("zam-325", owner == get_current_lock_stack()); + assert("zam-103", znode_is_write_locked(node)); + assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED)); + assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED)); + assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE)); + assert("nikita-3097", znode_is_wlocked_once(node)); + assert("nikita-3338", rw_zlock_is_locked(&node->lock)); + + if (handle->signaled) + atomic_dec(&owner->nr_signaled); + + ZF_SET(node, JNODE_IS_DYING); + unlink_object(handle); + node->lock.nr_readers = 0; + + /* all requestors will be informed that lock is invalidated. */ + for_all_type_safe_list(requestors, &node->lock.requestors, rq) { + reiser4_wake_up(rq); + } + + /* We use that each unlock() will wakeup first item from requestors + list; our lock stack is the last one. */ + while (!requestors_list_empty(&node->lock.requestors)) { + requestors_list_push_back(&node->lock.requestors, owner); + + prepare_to_sleep(owner); + + WUNLOCK_ZLOCK(&node->lock); + go_to_sleep(owner, znode_get_level(node)); + WLOCK_ZLOCK(&node->lock); + + requestors_list_remove(owner); + } + + WUNLOCK_ZLOCK(&node->lock); +} + +/* Initializes lock_stack. */ +reiser4_internal void +init_lock_stack(lock_stack * owner /* pointer to + * allocated + * structure. */ ) +{ + /* xmemset(,0,) is done already as a part of reiser4 context + * initialization */ + /* xmemset(owner, 0, sizeof (lock_stack)); */ + locks_list_init(&owner->locks); + requestors_list_clean(owner); + spin_stack_init(owner); + owner->curpri = 1; + sema_init(&owner->sema, 0); +} + +/* Initializes lock object. */ +reiser4_internal void +reiser4_init_lock(zlock * lock /* pointer on allocated + * uninitialized lock object + * structure. 
*/ ) +{ + xmemset(lock, 0, sizeof (zlock)); + rw_zlock_init(lock); + requestors_list_init(&lock->requestors); + owners_list_init(&lock->owners); +} + +/* lock handle initialization */ +reiser4_internal void +init_lh(lock_handle * handle) +{ + xmemset(handle, 0, sizeof *handle); + locks_list_clean(handle); + owners_list_clean(handle); +} + +/* freeing of lock handle resources */ +reiser4_internal void +done_lh(lock_handle * handle) +{ + assert("zam-342", handle != NULL); + if (handle->owner != NULL) + longterm_unlock_znode(handle); +} + +/* What kind of lock? */ +reiser4_internal znode_lock_mode lock_mode(lock_handle * handle) +{ + if (handle->owner == NULL) { + return ZNODE_NO_LOCK; + } else if (znode_is_rlocked(handle->node)) { + return ZNODE_READ_LOCK; + } else { + return ZNODE_WRITE_LOCK; + } +} + +/* Transfer a lock handle (presumably so that variables can be moved between stack and + heap locations). */ +static void +move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old) +{ + znode *node = old->node; + lock_stack *owner = old->owner; + int signaled; + + /* locks_list, modified by link_object() is not protected by + anything. This is valid because only current thread ever modifies + locks_list of its lock_stack. 
+ */ + assert("nikita-1827", owner == get_current_lock_stack()); + assert("nikita-1831", new->owner == NULL); + + WLOCK_ZLOCK(&node->lock); + + new->node = node; + + signaled = old->signaled; + if (unlink_old) { + unlink_object(old); + } else { + if (node->lock.nr_readers > 0) { + node->lock.nr_readers += 1; + } else { + node->lock.nr_readers -= 1; + } + if (signaled) { + atomic_inc(&owner->nr_signaled); + } + if (owner->curpri) { + node->lock.nr_hipri_owners += 1; + } + LOCK_CNT_INC(long_term_locked_znode); + + zref(node); + } + link_object(new, owner, node); + new->signaled = signaled; + + WUNLOCK_ZLOCK(&node->lock); +} + +reiser4_internal void +move_lh(lock_handle * new, lock_handle * old) +{ + move_lh_internal(new, old, /*unlink_old */ 1); +} + +reiser4_internal void +copy_lh(lock_handle * new, lock_handle * old) +{ + move_lh_internal(new, old, /*unlink_old */ 0); +} + +/* after getting -E_DEADLOCK we unlock znodes until this function returns false */ +reiser4_internal int +check_deadlock(void) +{ + lock_stack *owner = get_current_lock_stack(); + return atomic_read(&owner->nr_signaled) != 0; +} + +/* Before going to sleep we re-check "release lock" requests which might come from threads with hi-pri lock + priorities. */ +reiser4_internal int +prepare_to_sleep(lock_stack * owner) +{ + assert("nikita-1847", owner == get_current_lock_stack()); + /* NOTE(Zam): We cannot reset the lock semaphore here because it may + clear wake-up signal. The initial design was to re-check all + conditions under which we continue locking, release locks or sleep + until conditions are changed. However, even lock.c does not follow + that design. So, wake-up signal which is stored in semaphore state + could we loosen by semaphore reset. The less complex scheme without + resetting the semaphore is enough to not to loose wake-ups. 
+
+	   if (0) {
+
+	   NOTE-NIKITA: I commented call to sema_init() out hoping
+	   that it is the reason or thread sleeping in
+	   down(&owner->sema) without any other thread running.
+
+	   Anyway, it is just an optimization: is semaphore is not
+	   reinitialised at this point, in the worst case
+	   longterm_lock_znode() would have to iterate its loop once
+	   more.
+	   spin_lock_stack(owner);
+	   sema_init(&owner->sema, 0);
+	   spin_unlock_stack(owner);
+	   }
+	*/
+
+	/* We return -E_DEADLOCK if one or more "give me the lock" messages are
+	 * counted in nr_signaled */
+	if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
+		assert("zam-959", !owner->curpri);
+		return RETERR(-E_DEADLOCK);
+	}
+	return 0;
+}
+
+/* Wakes up a single thread: posts owner->sema, releasing one down() on it. */
+reiser4_internal void
+__reiser4_wake_up(lock_stack * owner)
+{
+	up(&owner->sema);
+}
+
+/* Puts the calling thread to sleep on owner->sema until an up() on that
+   semaphore (see __reiser4_wake_up()) lets it through.
+
+   The @node_level parameter and the sleep-time accounting exist only when
+   REISER4_STATS is non-zero. @node_level is either one of the
+   ADD_TO_SLEPT_IN_WAIT_* selectors or a tree level to charge the time to.
+
+   The parameter and every use of it are guarded by the same REISER4_STATS
+   test, so the body can never reference a parameter that was compiled
+   out. */
+reiser4_internal void
+__go_to_sleep(lock_stack * owner
+#if REISER4_STATS
+	      , int node_level
+#endif
+)
+{
+#if REISER4_STATS
+	unsigned long sleep_start = jiffies;
+#endif
+	/* Well, we might sleep here, so holding of any spinlocks is no-no */
+	assert("nikita-3027", schedulable());
+	/* return down_interruptible(&owner->sema); */
+	down(&owner->sema);
+#if REISER4_STATS
+	switch (node_level) {
+	case ADD_TO_SLEPT_IN_WAIT_EVENT:
+		reiser4_stat_add(txnmgr.slept_in_wait_event, jiffies - sleep_start);
+		break;
+	case ADD_TO_SLEPT_IN_WAIT_ATOM:
+		reiser4_stat_add(txnmgr.slept_in_wait_atom, jiffies - sleep_start);
+		break;
+	default:
+		reiser4_stat_add_at_level(node_level, time_slept,
+					  jiffies - sleep_start);
+	}
+#endif
+}
+
+/* Returns 1 when @owner holds no long term locks (its locks list is empty),
+   0 otherwise. */
+reiser4_internal int
+lock_stack_isclean(lock_stack * owner)
+{
+	if (locks_list_empty(&owner->locks)) {
+		assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
+		return 1;
+	}
+
+	return 0;
+}
+
+#if REISER4_DEBUG_OUTPUT
+/* Debugging help */
+reiser4_internal void
+print_lock_stack(const char *prefix, lock_stack * owner)
+{
+	lock_handle *handle;
+
+	spin_lock_stack(owner);
+
printk("%s:\n", prefix); + printk(".... nr_signaled %d\n", atomic_read(&owner->nr_signaled)); + printk(".... curpri %s\n", owner->curpri ? "high" : "low"); + + if (owner->request.mode != 0) { + printk(".... current request: %s", owner->request.mode == ZNODE_WRITE_LOCK ? "write" : "read"); + print_address("", znode_get_block(owner->request.node)); + } + + printk(".... current locks:\n"); + + for_all_type_safe_list(locks, &owner->locks, handle) { + if (handle->node != NULL) + print_address(znode_is_rlocked(handle->node) ? + "...... read" : "...... write", znode_get_block(handle->node)); + } + + spin_unlock_stack(owner); +} +#endif + +#if REISER4_DEBUG + +void +check_lock_stack(lock_stack * stack) +{ + spin_lock_stack(stack); + locks_list_check(&stack->locks); + spin_unlock_stack(stack); +} + +extern spinlock_t active_contexts_lock; +extern context_list_head active_contexts; + +void +check_lock_data(void) +{ + if (0) { + reiser4_context *context; + + spin_lock(&active_contexts_lock); + for_all_type_safe_list(context, &active_contexts, context) { + check_lock_stack(&context->stack); + } + spin_unlock(&active_contexts_lock); + } else + check_lock_stack(&get_current_context()->stack); +} + +void +check_lock_node_data(znode * node) +{ + RLOCK_ZLOCK(&node->lock); + owners_list_check(&node->lock.owners); + requestors_list_check(&node->lock.requestors); + RUNLOCK_ZLOCK(&node->lock); +} + +static int +request_is_deadlock_safe(znode * node, znode_lock_mode mode, + znode_lock_request request) +{ + lock_stack *owner; + + owner = get_current_lock_stack(); + if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) && + znode_get_level(node) != 0) { + lock_handle *item; + + for_all_type_safe_list(locks, &owner->locks, item) { + znode *other = item->node; + + if (znode_get_level(other) == 0) + continue; + if (znode_get_level(other) > znode_get_level(node)) + return 0; + } + } + return 1; +} + +#endif + +/* return pointer to static storage with name of lock_mode. 
For + debugging */ +reiser4_internal const char * +lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ ) +{ + if (lock == ZNODE_READ_LOCK) + return "read"; + else if (lock == ZNODE_WRITE_LOCK) + return "write"; + else { + static char buf[30]; + + sprintf(buf, "unknown: %i", lock); + return buf; + } +} + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/lock.h linux-2.6.4-ck1/fs/reiser4/lock.h --- linux-2.6.4/fs/reiser4/lock.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/lock.h 2004-03-11 22:45:15.267513629 +1100 @@ -0,0 +1,269 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Long term locking data structures. See lock.c for details. */ + +#ifndef __LOCK_H__ +#define __LOCK_H__ + +#include "forward.h" +#include "debug.h" +#include "dformat.h" +#include "spin_macros.h" +#include "key.h" +#include "coord.h" +#include "type_safe_list.h" +#include "plugin/node/node.h" +#include "jnode.h" +#include "readahead.h" + +#include +#include +#include /* for PAGE_CACHE_SIZE */ +#include +#include + +/* per-znode lock requests queue; list items are lock owner objects + which want to lock given znode. + + Locking: protected by znode spin lock. */ +TYPE_SAFE_LIST_DECLARE(requestors); +/* per-znode list of lock handles for this znode + + Locking: protected by znode spin lock. */ +TYPE_SAFE_LIST_DECLARE(owners); +/* per-owner list of lock handles that point to locked znodes which + belong to one lock owner + + Locking: this list is only accessed by the thread owning the lock stack this + list is attached to. Hence, no locking is necessary. +*/ +TYPE_SAFE_LIST_DECLARE(locks); + +/* Per-znode lock object */ +struct zlock { + reiser4_rw_data guard; + /* The number of readers if positive; the number of recursively taken + write locks if negative. 
Protected by zlock spin lock. */ + int nr_readers; + /* A number of processes (lock_stacks) that have this object + locked with high priority */ + unsigned nr_hipri_owners; + /* A number of attempts to lock znode in high priority direction */ + unsigned nr_hipri_requests; + /* A linked list of lock_handle objects that contains pointers + for all lock_stacks which have this lock object locked */ + owners_list_head owners; + /* A linked list of lock_stacks that wait for this lock */ + requestors_list_head requestors; +}; + +#define rw_ordering_pred_zlock(lock) \ + (lock_counters()->spin_locked_stack == 0) + +/* Define spin_lock_zlock, spin_unlock_zlock, etc. */ +RW_LOCK_FUNCTIONS(zlock, zlock, guard); + +#define lock_is_locked(lock) ((lock)->nr_readers != 0) +#define lock_is_rlocked(lock) ((lock)->nr_readers > 0) +#define lock_is_wlocked(lock) ((lock)->nr_readers < 0) +#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1) +#define lock_can_be_rlocked(lock) ((lock)->nr_readers >=0) +#define lock_mode_compatible(lock, mode) \ + (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) \ + || ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock))) + + +/* Since we have R/W znode locks we need additional bidirectional `link' + objects to implement n<->m relationship between lock owners and lock + objects. We call them `lock handles'. + + Locking: see lock.c/"SHORT-TERM LOCKING" +*/ +struct lock_handle { + /* This flag indicates that a signal to yield a lock was passed to + lock owner and counted in owner->nr_signalled + + Locking: this is accessed under spin lock on ->node. 
+ */ + int signaled; + /* A link to owner of a lock */ + lock_stack *owner; + /* A link to znode locked */ + znode *node; + /* A list of all locks for a process */ + locks_list_link locks_link; + /* A list of all owners for a znode */ + owners_list_link owners_link; +}; + +typedef struct lock_request { + /* A pointer to uninitialized link object */ + lock_handle *handle; + /* A pointer to the object we want to lock */ + znode *node; + /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */ + znode_lock_mode mode; +} lock_request; + +/* A lock stack structure for accumulating locks owned by a process */ +struct lock_stack { + /* A guard lock protecting a lock stack */ + reiser4_spin_data sguard; + /* number of znodes which were requested by high priority processes */ + atomic_t nr_signaled; + /* Current priority of a process + + This is only accessed by the current thread and thus requires no + locking. + */ + int curpri; + /* A list of all locks owned by this process. Elements can be added to + * this list only by the current thread. ->node pointers in this list + * can be only changed by the current thread. */ + locks_list_head locks; + /* When lock_stack waits for the lock, it puts itself on double-linked + requestors list of that lock */ + requestors_list_link requestors_link; + /* Current lock request info. + + This is only accessed by the current thread and thus requires no + locking. + */ + lock_request request; + /* It is a lock_stack's synchronization object for when process sleeps + when requested lock not on this lock_stack but which it wishes to + add to this lock_stack is not immediately available. It is used + instead of wait_queue_t object due to locking problems (lost wake + up). "lost wakeup" occurs when process is waken up before he actually + becomes 'sleepy' (through sleep_on()). Using of semaphore object is + simplest way to avoid that problem. 
+ + A semaphore is used in the following way: only the process that is + the owner of the lock_stack initializes it (to zero) and calls + down(sema) on it. Usually this causes the process to sleep on the + semaphore. Other processes may wake it up by calling up(sema). The + advantage of a semaphore is that up() and down() calls are not + required to preserve order. Unlike wait_queue it works when the process + is woken up before getting to sleep. + + NOTE-NIKITA: Transaction manager is going to have condition variables + (&kcondvar_t) anyway, so this probably will be replaced with + one in the future. + + After further discussion, Nikita has shown me that Zam's implementation is + exactly a condition variable. The znode's {zguard,requestors_list} represent + the condition variable and the lock_stack's {sguard,semaphore} guard entry and + exit from the condition variable's wait queue. But the existing code can't + just be replaced with a more general abstraction, and I think it's fine the way + it is.
*/
+	struct semaphore sema;
+};
+
+/* defining of list manipulation functions for lists above */
+TYPE_SAFE_LIST_DEFINE(requestors, lock_stack, requestors_link);
+TYPE_SAFE_LIST_DEFINE(owners, lock_handle, owners_link);
+TYPE_SAFE_LIST_DEFINE(locks, lock_handle, locks_link);
+
+/*
+  User-visible znode locking functions
+*/
+
+extern int longterm_lock_znode   (lock_handle * handle,
+				  znode * node,
+				  znode_lock_mode mode,
+				  znode_lock_request request);
+
+extern void longterm_unlock_znode(lock_handle * handle);
+
+extern int check_deadlock(void);
+
+extern lock_stack *get_current_lock_stack(void);
+
+extern void init_lock_stack(lock_stack * owner);
+extern void reiser4_init_lock(zlock * lock);
+
+extern void init_lh(lock_handle *);
+extern void move_lh(lock_handle * new, lock_handle * old);
+extern void copy_lh(lock_handle * new, lock_handle * old);
+extern void done_lh(lock_handle *);
+extern znode_lock_mode lock_mode(lock_handle *);
+
+extern int prepare_to_sleep(lock_stack * owner);
+
+#if REISER4_STATS
+
+#define ADD_TO_SLEPT_IN_WAIT_EVENT (-1)
+#define ADD_TO_SLEPT_IN_WAIT_ATOM  (-2)
+
+/* if REISER4_STATS __go_to_sleep() accepts additional parameter @level for
+ * gathering per-level sleep statistics. The go_to_sleep wrapper hides the
+ * __go_to_sleep() function prototypes difference.
+ *
+ * Both variants of go_to_sleep() expand to a bare call expression with no
+ * trailing semicolon: the caller supplies it. This keeps the wrapper usable
+ * as the body of an if/else in either configuration. */
+void __go_to_sleep(lock_stack*, int);
+#define go_to_sleep(owner, level) __go_to_sleep(owner, level)
+
+#else
+
+void __go_to_sleep(lock_stack*);
+#define go_to_sleep(owner, level) __go_to_sleep(owner)
+
+#endif
+
+extern void __reiser4_wake_up(lock_stack * owner);
+
+extern int lock_stack_isclean(lock_stack * owner);
+
+/* zlock object state check macros: only used in assertions.  Both forms imply that the
+   lock is held by the current thread.
*/
+extern int znode_is_write_locked(const znode * node);
+
+#if REISER4_DEBUG
+#define spin_ordering_pred_stack_addendum (1)
+#else
+#define spin_ordering_pred_stack_addendum		\
+	 ((lock_counters()->rw_locked_dk == 0) &&	\
+	  (lock_counters()->rw_locked_tree == 0))
+#endif
+/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
+#define spin_ordering_pred_stack(stack)				\
+	((lock_counters()->spin_locked_stack == 0) &&		\
+	 (lock_counters()->spin_locked_txnmgr == 0) &&		\
+	 (lock_counters()->spin_locked_super == 0) &&		\
+	 (lock_counters()->spin_locked_inode_object == 0) &&	\
+	 (lock_counters()->rw_locked_cbk_cache == 0) &&	\
+	 (lock_counters()->spin_locked_epoch == 0) &&		\
+	 (lock_counters()->spin_locked_super_eflush == 0) &&	\
+	 spin_ordering_pred_stack_addendum)
+
+/* Same for lock_stack */
+SPIN_LOCK_FUNCTIONS(stack, lock_stack, sguard);
+
+/* Wake up @owner while holding its lock stack spin lock. */
+static inline void
+reiser4_wake_up(lock_stack * owner)
+{
+	spin_lock_stack(owner);
+	__reiser4_wake_up(owner);
+	spin_unlock_stack(owner);
+}
+
+const char *lock_mode_name(znode_lock_mode lock);
+
+/* Consistency checkers. With REISER4_DEBUG they are real functions; otherwise
+   they collapse to noop. The stubs take the same number of arguments as the
+   functions they replace, so a call site compiles in both configurations. */
+#if REISER4_DEBUG
+extern void check_lock_data(void);
+extern void check_lock_node_data(znode * node);
+#else
+#define check_lock_data() noop
+#define check_lock_node_data(node) noop
+#endif
+
+/* __LOCK_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/Makefile linux-2.6.4-ck1/fs/reiser4/Makefile --- linux-2.6.4/fs/reiser4/Makefile 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/Makefile 2004-03-11 22:45:15.169528866 +1100 @@ -0,0 +1,169 @@ +# +# reiser4/Makefile +# + +obj-$(CONFIG_REISER4_FS) += reiser4.o + +EXTRA_CFLAGS += \ + -Wformat \ + -Wundef \ + -Wunused \ + -Wcomment \ + \ + -Wno-nested-externs \ + -Wno-write-strings \ + -Wno-sign-compare + +# -Wpointer-arith \ +# -Wlarger-than-16384 \ +# -Winline \ + +ifeq ($(CONFIG_REISER4_NOOPT),y) + EXTRA_CFLAGS += -O0 -fno-inline +else +# this warning is only supported when optimization is on. + EXTRA_CFLAGS += \ + -Wuninitialized +endif + +ifeq ($(CONFIG_REISER4_ALL_IN_ONE),y) + +reiser4-objs := all-reiser4.o + +else + +reiser4-objs := \ + debug.o \ + stats.o \ + jnode.o \ + znode.o \ + key.o \ + pool.o \ + tree_mod.o \ + estimate.o \ + carry.o \ + carry_ops.o \ + lock.o \ + tree.o \ + context.o \ + tap.o \ + coord.o \ + block_alloc.o \ + txnmgr.o \ + kassign.o \ + flush.o \ + wander.o \ + eottl.o \ + search.o \ + page_cache.o \ + lnode.o \ + kcond.o \ + latch.o \ + seal.o \ + scint.o \ + dscale.o \ + trace.o \ + flush_queue.o \ + ktxnmgrd.o \ + kattr.o \ + blocknrset.o \ + super.o \ + oid.o \ + tree_walk.o \ + inode.o \ + vfs_ops.o \ + inode_ops.o \ + file_ops.o \ + as_ops.o \ + emergency_flush.o \ + spinprof.o\ + entd.o\ + readahead.o \ + crypt.o \ + compress.o\ + diskmap.o \ + prof.o \ + repacker.o \ + status_flags.o \ + init_super.o \ + crab_lock.o \ + safe_link.o \ + \ + plugin/plugin.o \ + plugin/plugin_set.o \ + plugin/plugin_hash.o \ + plugin/node/node.o \ + plugin/object.o \ + plugin/symlink.o \ + plugin/cryptcompress.o \ + plugin/digest.o \ + plugin/node/node40.o \ + \ + plugin/item/static_stat.o \ + plugin/item/sde.o \ + plugin/item/cde.o \ + plugin/item/blackbox.o \ + 
plugin/item/internal.o \ + plugin/item/tail.o \ + plugin/item/ctail.o \ + plugin/item/extent.o \ + plugin/item/extent_item_ops.o \ + plugin/item/extent_file_ops.o \ + plugin/item/extent_flush_ops.o \ + plugin/item/extent_repack_ops.o \ + \ + plugin/hash.o \ + plugin/tail_policy.o \ + plugin/item/item.o \ + \ + plugin/dir/hashed_dir.o \ + plugin/dir/pseudo_dir.o \ + plugin/dir/dir.o \ + \ + plugin/security/perm.o \ + \ + plugin/pseudo/pseudo.o \ + \ + plugin/space/bitmap.o \ + \ + plugin/disk_format/disk_format40.o \ + plugin/disk_format/disk_format.o \ + \ + plugin/file/pseudo.o \ + plugin/file/file.o \ + plugin/file/tail_conversion.o +# plugin/xattr.o \ +# plugin/security/acl.o \ + +reiser4-objs += sys_reiser4.o + +ifeq ($(CONFIG_REISER4_FS_SYSCALL),y) + + ifeq ($(CONFIG_REISER4_FS_SYSCALL_YACC),y) + + YFLAGS= -d -v -r -b $(obj)/parser/parser + + $(obj)/parser/parser.code.c: $(obj)/parser/parser.y + + $(YACC) $(YFLAGS) $(obj)/parser/parser.y + + endif + + sys_reiser4.o: $/sys_reiser4.c \ + $/parser/parser.code.c \ + $/parser/parser.tab.c \ + $/parser/parser.tab.h \ + $/parser/lib.c \ + $/parser/pars.cls.h \ + $/parser/pars.yacc.h \ + $/parser/parser.h + + +# $(MAKE) $(obj)/parser/parser +#clean-files := parser/parser.code.c +##clean-rule =@$(MAKE) -C $/parser clean +#clean-rule =@$(MAKE) $(obj)/parser/parser.code.c +endif + +endif + diff -Naurp linux-2.6.4/fs/reiser4/oid.c linux-2.6.4-ck1/fs/reiser4/oid.c --- linux-2.6.4/fs/reiser4/oid.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/oid.c 2004-03-11 22:45:15.268513473 +1100 @@ -0,0 +1,131 @@ +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "debug.h" +#include "super.h" +#include "txnmgr.h" + +/* we used to have oid allocation plugin. It was removed because it + was recognized as providing unneeded level of abstraction. 
If one
+   ever will find it useful - look at yet_unneeded_abstractions/oid
+*/
+
+/* Set up the in-memory oid allocator state of @super: @next becomes the next
+   oid to hand out and @nr_files the number of oids in use. Always returns
+   0. */
+reiser4_internal int
+oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
+{
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = get_super_private(super);
+
+	sbinfo->next_to_use = next;
+	sbinfo->oids_in_use = nr_files;
+	return 0;
+}
+
+/* Hand out the next oid under the super block spin lock, bumping both
+   ->next_to_use and ->oids_in_use. Returns ABSOLUTE_MAX_OID, leaving the
+   counters untouched, once ->next_to_use has reached ABSOLUTE_MAX_OID. */
+reiser4_internal oid_t
+oid_allocate(struct super_block *super)
+{
+	reiser4_super_info_data *sbinfo;
+	oid_t oid;
+
+	sbinfo = get_super_private(super);
+
+	reiser4_spin_lock_sb(sbinfo);
+	if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
+		oid = sbinfo->next_to_use ++;
+		sbinfo->oids_in_use ++;
+	} else
+		oid = ABSOLUTE_MAX_OID;
+	reiser4_spin_unlock_sb(sbinfo);
+	return oid;
+}
+
+/* Account for one released oid: decrements ->oids_in_use under the super
+   block spin lock. @oid itself is not used. Always returns 0. */
+reiser4_internal int
+oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
+{
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = get_super_private(super);
+
+	reiser4_spin_lock_sb(sbinfo);
+	sbinfo->oids_in_use --;
+	reiser4_spin_unlock_sb(sbinfo);
+	return 0;
+}
+
+/* Return the oid that the next oid_allocate() would hand out, without
+   allocating it. */
+reiser4_internal oid_t oid_next(const struct super_block *super)
+{
+	reiser4_super_info_data *sbinfo;
+	oid_t oid;
+
+	sbinfo = get_super_private(super);
+
+	reiser4_spin_lock_sb(sbinfo);
+	oid = sbinfo->next_to_use;
+	reiser4_spin_unlock_sb(sbinfo);
+	return oid;
+}
+
+/* Return the number of oids in use as a long, or -1 when the count does not
+   fit.
+
+   The bound is built as (~0UL >> 1), i.e. the largest positive long for
+   whatever width long has on this architecture. Building it from a
+   sign-extended (long) ~0 yields 2^63 - 1 even where long is 32 bits, which
+   would let the cast below truncate. */
+reiser4_internal long oids_used(const struct super_block *super)
+{
+	reiser4_super_info_data *sbinfo;
+	oid_t used;
+
+	sbinfo = get_super_private(super);
+
+	reiser4_spin_lock_sb(sbinfo);
+	used = sbinfo->oids_in_use;
+	reiser4_spin_unlock_sb(sbinfo);
+	if (used < (__u64) (~0UL >> 1))
+		return (long) used;
+	else
+		return (long) -1;
+}
+
+
+/* Return the number of oids still available
+   (ABSOLUTE_MAX_OID - OIDS_RESERVED - ->next_to_use) as a long, or -1 when
+   that number does not fit. The bound is the largest positive long for this
+   architecture's width of long, for the same reason as in oids_used(). */
+reiser4_internal long oids_free(const struct super_block *super)
+{
+	reiser4_super_info_data *sbinfo;
+	oid_t oids;
+
+	sbinfo = get_super_private(super);
+
+	reiser4_spin_lock_sb(sbinfo);
+	oids = ABSOLUTE_MAX_OID - OIDS_RESERVED - sbinfo->next_to_use;
+	reiser4_spin_unlock_sb(sbinfo);
+	if (oids < (__u64) (~0UL >> 1))
+		return (long) oids;
+	else
+		return (long) -1;
+}
+
+reiser4_internal void +oid_count_allocated(void) +{ + txn_atom *atom; + + atom = get_current_atom_locked(); + atom->nr_objects_created++; + UNLOCK_ATOM(atom); +} + +/* count an object deletion in atom's nr_objects_deleted */ +reiser4_internal void +oid_count_released(void) +{ + txn_atom *atom; + + atom = get_current_atom_locked(); + atom->nr_objects_deleted++; + UNLOCK_ATOM(atom); +} + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/page_cache.c linux-2.6.4-ck1/fs/reiser4/page_cache.c --- linux-2.6.4/fs/reiser4/page_cache.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/page_cache.c 2004-03-11 22:45:15.269513318 +1100 @@ -0,0 +1,873 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Memory pressure hooks. Fake inodes handling. */ +/* We store all file system meta data (and data, of course) in the page cache. + + What does this mean? In stead of using bread/brelse we create special + "fake" inode (one per super block) and store content of formatted nodes + into pages bound to this inode in the page cache. In newer kernels bread() + already uses inode attached to block device (bd_inode). Advantage of having + our own fake inode is that we can install appropriate methods in its + address_space operations. Such methods are called by VM on memory pressure + (or during background page flushing) and we can use them to react + appropriately. + + In initial version we only support one block per page. Support for multiple + blocks per page is complicated by relocation. + + To each page, used by reiser4, jnode is attached. jnode is analogous to + buffer head. Difference is that jnode is bound to the page permanently: + jnode cannot be removed from memory until its backing page is. 
+ + jnode contains pointer to page (->pg field) and page contains pointer to + jnode in ->private field. Pointer from jnode to page is protected by + jnode's spinlock and pointer from page to jnode is protected by page lock + (PG_locked bit). Lock ordering is: first take page lock, then jnode spin + lock. To go into reverse direction use jnode_lock_page() function that uses + standard try-lock-and-release device. + + Properties: + + 1. when jnode-to-page mapping is established (by jnode_attach_page()), page + reference counter is increased. + + 2. when jnode-to-page mapping is destroyed (by jnode_detach_page() and + page_detach_jnode()), page reference counter is decreased. + + 3. on jload() reference counter on jnode page is increased, page is + kmapped and `referenced'. + + 4. on jrelse() inverse operations are performed. + + 5. kmapping/kunmapping of unformatted pages is done by read/write methods. + + + DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting + historically.] + + [In the following discussion, `lock' invariably means long term lock on + znode.] (What about page locks?) + + There is some special class of deadlock possibilities related to memory + pressure. Locks acquired by other reiser4 threads are accounted for in + deadlock prevention mechanism (lock.c), but when ->vm_writeback() is + invoked additional hidden arc is added to the locking graph: thread that + tries to allocate memory waits for ->vm_writeback() to finish. If this + thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock + prevention is useless. + + Another related problem is possibility for ->vm_writeback() to run out of + memory itself. This is not a problem for ext2 and friends, because their + ->vm_writeback() methods don't allocate much memory, but reiser4 flush is + definitely able to allocate huge amounts of memory. + + It seems that there is no reliable way to cope with the problems above.
Instead + it was decided that ->vm_writeback() (as invoked in the kswapd + context) wouldn't perform any flushing itself, but rather should just wake + up some auxiliary thread dedicated for this purpose (or, the same thread + that does periodic commit of old atoms (ktxnmgrd.c)). + + Details: + + 1. Page is called `reclaimable' against particular reiser4 mount F if this + page can be ultimately released by try_to_free_pages() under the assumptions + that: + + a. ->vm_writeback() for F is no-op, and + + b. none of the threads accessing F are making any progress, and + + c. other reiser4 mounts obey the same memory reservation protocol as F + (described below). + + For example, clean un-pinned page, or page occupied by ext2 data are + reclaimable against any reiser4 mount. + + When there is more than one reiser4 mount in a system, condition (c) makes + reclaim-ability not easily verifiable beyond trivial cases mentioned above. + + + + + + + + + THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE + + Fake inode is used to bind formatted nodes and each node is indexed within + fake inode by its block number. If block size is smaller than page size, it + may so happen that block mapped to the page with formatted node is occupied + by unformatted node or is unallocated. This leads to some complications, + because flushing whole page can lead to an incorrect overwrite of + unformatted node that, moreover, can be cached in some other place as + part of the file body. To avoid this, buffers for unformatted nodes are + never marked dirty. Also pages in the fake inode are never marked dirty. This + rules out usage of ->writepage() as memory pressure hook. Instead, + ->releasepage() is used. + + Josh is concerned that page->buffer is going to die. This should not pose + significant problem though, because we need to add some data structures to + the page anyway (jnode) and all necessary bookkeeping can be put there. + +*/ + +/* Life cycle of pages/nodes.
+ + jnode contains reference to page and page contains reference back to + jnode. This reference is counted in page ->count. Thus, page bound to jnode + cannot be released back into free pool. + + 1. Formatted nodes. + + 1. formatted node is represented by znode. When new znode is created its + ->pg pointer is NULL initially. + + 2. when node content is loaded into znode (by call to zload()) for the + first time, the following happens (in call to ->read_node() or + ->allocate_node()): + + 1. new page is added to the page cache. + + 2. this page is attached to znode and its ->count is increased. + + 3. page is kmapped. + + 3. if more calls to zload() follow (without corresponding zrelses), page + counter is left intact and in its stead ->d_count is increased in znode. + + 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero + ->release_node() is called and page is kunmapped as result. + + 5. at some moment node can be captured by a transaction. Its ->x_count + is then increased by transaction manager. + + 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE + bit set) the following will happen (also see comment at the top of znode.c): + + 1. when last lock is released, node will be uncaptured from + transaction. This releases the reference that transaction manager acquired + at step 5. + + 2. when last reference is released, zput() detects that node is + actually deleted and calls ->delete_node() + operation. page_cache_delete_node() implementation detaches jnode from + page and releases page. + + 7. otherwise (node wasn't removed from the tree), last reference to + znode will be released after transaction manager committed transaction + node was in. This implies squallocing of this node (see + flush.c). Nothing special happens at this point. Znode is still in the + hash table and page is still attached to it. + + 8. znode is actually removed from the memory because of the memory + pressure, or during umount (znodes_tree_done()).
Anyway, znode is + removed by the call to zdrop(). At this moment, page is detached from + znode and removed from the inode address space. + +*/ + +#include "debug.h" +#include "dformat.h" +#include "key.h" +#include "txnmgr.h" +#include "jnode.h" +#include "znode.h" +#include "block_alloc.h" +#include "tree.h" +#include "vfs_ops.h" +#include "inode.h" +#include "super.h" +#include "entd.h" +#include "page_cache.h" +#include "ktxnmgrd.h" + +#include +#include +#include /* for struct page */ +#include /* for struct page */ +#include +#include +#include +#include + +static struct bio *page_bio(struct page *page, jnode * node, int rw, int gfp); + +static struct address_space_operations formatted_fake_as_ops; + +static const oid_t fake_ino = 0x1; +static const oid_t bitmap_ino = 0x2; +static const oid_t cc_ino = 0x3; + +/* one-time initialization of fake inodes handling functions. */ +reiser4_internal int +init_fakes() +{ + return 0; +} + +static void +init_fake_inode(struct super_block *super, struct inode *fake, struct inode **pfake) +{ + assert("nikita-2168", fake->i_state & I_NEW); + fake->i_mapping->a_ops = &formatted_fake_as_ops; + fake->i_blkbits = super->s_blocksize_bits; + fake->i_size = ~0ull; + fake->i_rdev = super->s_bdev->bd_dev; + fake->i_bdev = super->s_bdev; + *pfake = fake; + /* NOTE-NIKITA something else? */ + unlock_new_inode(fake); +} + +/* initialize fake inode to which formatted nodes are bound in the page cache. 
*/ +reiser4_internal int +init_formatted_fake(struct super_block *super) +{ + struct inode *fake; + struct inode *bitmap; + struct inode *cc; + reiser4_super_info_data *sinfo; + + assert("nikita-1703", super != NULL); + + sinfo = get_super_private_nocheck(super); + fake = iget_locked(super, oid_to_ino(fake_ino)); + + if (fake != NULL) { + init_fake_inode(super, fake, &sinfo->fake); + + bitmap = iget_locked(super, oid_to_ino(bitmap_ino)); + if (bitmap != NULL) { + init_fake_inode(super, bitmap, &sinfo->bitmap); + + cc = iget_locked(super, oid_to_ino(cc_ino)); + if (cc != NULL) { + init_fake_inode(super, cc, &sinfo->cc); + return 0; + } else { + iput(sinfo->fake); + iput(sinfo->bitmap); + sinfo->fake = NULL; + sinfo->bitmap = NULL; + } + } else { + iput(sinfo->fake); + sinfo->fake = NULL; + } + } + return RETERR(-ENOMEM); +} + +/* release fake inode for @super */ +reiser4_internal int +done_formatted_fake(struct super_block *super) +{ + reiser4_super_info_data *sinfo; + + sinfo = get_super_private_nocheck(super); + + if (sinfo->fake != NULL) { + assert("vs-1426", sinfo->fake->i_data.nrpages == 0); + iput(sinfo->fake); + sinfo->fake = NULL; + } + + if (sinfo->bitmap != NULL) { + iput(sinfo->bitmap); + sinfo->bitmap = NULL; + } + + if (sinfo->cc != NULL) { + iput(sinfo->cc); + sinfo->cc = NULL; + } + return 0; +} + +#if REISER4_TRACE_TREE +int reiser4_submit_bio_helper(const char *moniker, int rw, struct bio *bio) +{ + int result; + + write_io_trace(moniker, rw, bio); + result = submit_bio(rw, bio); + return result; +} +#endif + +reiser4_internal void reiser4_wait_page_writeback (struct page * page) +{ + assert ("zam-783", PageLocked(page)); + + do { + unlock_page(page); + wait_on_page_writeback(page); + lock_page(page); + } while (PageWriteback(page)); +} + +/* return tree @page is in */ +reiser4_internal reiser4_tree * +tree_by_page(const struct page *page /* page to query */ ) +{ + assert("nikita-2461", page != NULL); + return 
&get_super_private(page->mapping->host->i_sb)->tree; +} + +#if REISER4_DEBUG_MEMCPY + +/* Our own versions of memcpy, memmove, and memset used to profile shifts of + tree node content. Coded to avoid inlining. */ + +struct mem_ops_table { + void *(*cpy) (void *dest, const void *src, size_t n); + void *(*move) (void *dest, const void *src, size_t n); + void *(*set) (void *s, int c, size_t n); +}; + +void * +xxmemcpy(void *dest, const void *src, size_t n) +{ + return memcpy(dest, src, n); +} + +void * +xxmemmove(void *dest, const void *src, size_t n) +{ + return memmove(dest, src, n); +} + +void * +xxmemset(void *s, int c, size_t n) +{ + return memset(s, c, n); +} + +struct mem_ops_table std_mem_ops = { + .cpy = xxmemcpy, + .move = xxmemmove, + .set = xxmemset +}; + +struct mem_ops_table *mem_ops = &std_mem_ops; + +void * +xmemcpy(void *dest, const void *src, size_t n) +{ + return mem_ops->cpy(dest, src, n); +} + +void * +xmemmove(void *dest, const void *src, size_t n) +{ + return mem_ops->move(dest, src, n); +} + +void * +xmemset(void *s, int c, size_t n) +{ + return mem_ops->set(s, c, n); +} + +#endif + +/* completion handler for single page bio-based read. + + mpage_end_io_read() would also do. But it's static. + +*/ +static int +end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG, int err UNUSED_ARG) +{ + struct page *page; + + if (bio->bi_size != 0) { + warning("nikita-3332", "Truncated single page read: %i", + bio->bi_size); + return 1; + } + + page = bio->bi_io_vec[0].bv_page; + + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + SetPageUptodate(page); + else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + bio_put(bio); + return 0; +} + +/* completion handler for single page bio-based write. + + mpage_end_io_write() would also do. But it's static. 
+ +*/ +static int +end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG, int err UNUSED_ARG) +{ + struct page *page; + + if (bio->bi_size != 0) { + warning("nikita-3333", "Truncated single page write: %i", + bio->bi_size); + return 1; + } + + page = bio->bi_io_vec[0].bv_page; + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + SetPageError(page); + end_page_writeback(page); + bio_put(bio); + return 0; +} + +/* ->readpage() method for formatted nodes */ +static int +formatted_readpage(struct file *f UNUSED_ARG, struct page *page /* page to read */ ) +{ + assert("nikita-2412", PagePrivate(page) && jprivate(page)); + return page_io(page, jprivate(page), READ, GFP_KERNEL); +} + +/* submit single-page bio request */ +reiser4_internal int +page_io(struct page *page /* page to perform io for */ , + jnode * node /* jnode of page */ , + int rw /* read or write */ , int gfp /* GFP mask */ ) +{ + struct bio *bio; + int result; + + assert("nikita-2094", page != NULL); + assert("nikita-2226", PageLocked(page)); + assert("nikita-2634", node != NULL); + assert("nikita-2893", rw == READ || rw == WRITE); + + if (rw) { + if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) { + unlock_page(page); + return 0; + } + } + + bio = page_bio(page, node, rw, gfp); + if (!IS_ERR(bio)) { + if (rw == WRITE) { + SetPageWriteback(page); + unlock_page(page); + } + reiser4_submit_bio(rw, bio); + result = 0; + } else { + unlock_page(page); + result = PTR_ERR(bio); + } + return result; +} + +/* helper function to construct bio for page */ +static struct bio * +page_bio(struct page *page, jnode * node, int rw, int gfp) +{ + struct bio *bio; + assert("nikita-2092", page != NULL); + assert("nikita-2633", node != NULL); + + /* Simple implemenation in the assumption that blocksize == pagesize. + + We only have to submit one block, but submit_bh() will allocate bio + anyway, so lets use all the bells-and-whistles of bio code. 
+ */ + + bio = bio_alloc(gfp, 1); + if (bio != NULL) { + int blksz; + struct super_block *super; + reiser4_block_nr blocknr; + + super = page->mapping->host->i_sb; + assert("nikita-2029", super != NULL); + blksz = super->s_blocksize; + assert("nikita-2028", blksz == (int) PAGE_CACHE_SIZE); + + blocknr = *UNDER_SPIN(jnode, node, jnode_get_io_block(node)); + + assert("nikita-2275", blocknr != (reiser4_block_nr) 0); + assert("nikita-2276", !blocknr_is_fake(&blocknr)); + + bio->bi_bdev = super->s_bdev; + /* fill bio->bi_sector before calling bio_add_page(), because + * q->merge_bvec_fn may want to inspect it (see + * drivers/md/linear.c:linear_mergeable_bvec() for example. */ + bio->bi_sector = blocknr * (blksz >> 9); + + if (!bio_add_page(bio, page, blksz, 0)) { + warning("nikita-3452", + "Single page bio cannot be constructed"); + return ERR_PTR(RETERR(-EINVAL)); + } + + /* bio -> bi_idx is filled by bio_init() */ + bio->bi_end_io = (rw == READ) ? + end_bio_single_page_read : end_bio_single_page_write; + + return bio; + } else + return ERR_PTR(RETERR(-ENOMEM)); +} + +/* Common memory pressure notification. */ +reiser4_internal int +reiser4_writepage(struct page *page /* page to start writeback from */, + struct writeback_control *wbc) +{ + struct super_block *s; + reiser4_context ctx; + reiser4_tree *tree; + txn_atom * atom; + jnode *node; + int result; + + s = page->mapping->host->i_sb; + init_context(&ctx, s); + + reiser4_stat_inc(pcwb.calls); + + assert("vs-828", PageLocked(page)); + + set_rapid_flush_mode(1); + + tree = &get_super_private(s)->tree; + node = jnode_of_page(page); + if (!IS_ERR(node)) { + int phantom; + + assert("nikita-2419", node != NULL); + + LOCK_JNODE(node); + /* + * page was dirty, but jnode is not. This is (only?) + * possible if page was modified through mmap(). We + * want to handle such jnodes specially. 
+ */ + phantom = !jnode_is_dirty(node); + atom = jnode_get_atom(node); + if (atom != NULL) { + if (!(atom->flags & ATOM_FORCE_COMMIT)) { + atom->flags |= ATOM_FORCE_COMMIT; + ktxnmgrd_kick(&get_super_private(s)->tmgr); + reiser4_stat_inc(txnmgr.commit_from_writepage); + } + UNLOCK_ATOM(atom); + } + UNLOCK_JNODE(node); + + result = emergency_flush(page); + if (result != 0) { + /* + * cannot flush page right now, or some error + */ + reiser4_stat_inc(pcwb.not_written); + } else { + /* + * page was successfully flushed + */ + reiser4_stat_inc(pcwb.written); + if (phantom && jnode_is_unformatted(node)) + JF_SET(node, JNODE_KEEPME); + } + jput(node); + } else { + reiser4_stat_inc(pcwb.no_jnode); + result = PTR_ERR(node); + } + if (result != 0) { + /* + * shrink list doesn't move page to another mapping + * list when clearing dirty flag. So it is enough to + * just set dirty bit. + */ + SetPageDirty(page); + inc_page_state(nr_dirty); + unlock_page(page); + } + reiser4_exit_context(&ctx); + return result; +} + +/* ->set_page_dirty() method of formatted address_space */ +static int +formatted_set_page_dirty(struct page *page /* page to mark + * dirty */ ) +{ + assert("nikita-2173", page != NULL); + return __set_page_dirty_nobuffers(page); +} + +/* address space operations for the fake inode */ +static struct address_space_operations formatted_fake_as_ops = { + /* Perform a writeback of a single page as a memory-freeing + * operation. */ + .writepage = reiser4_writepage, + /* this is called to read formatted node */ + .readpage = formatted_readpage, + /* ->sync_page() method of fake inode address space operations. Called + from wait_on_page() and lock_page(). + + This is most annoyingly misnomered method. Actually it is called + from wait_on_page_bit() and lock_page() and its purpose is to + actually start io by jabbing device drivers. + */ + .sync_page = reiser4_start_up_io, + /* Write back some dirty pages from this mapping. Called from sync. 
+ called during sync (pdflush) */ + .writepages = reiser4_writepages, + /* Set a page dirty */ + .set_page_dirty = formatted_set_page_dirty, + /* used for read-ahead. Not applicable */ + .readpages = NULL, + .prepare_write = NULL, + .commit_write = NULL, + .bmap = NULL, + /* called just before page is being detached from inode mapping and + removed from memory. Called on truncate, cut/squeeze, and + umount. */ + .invalidatepage = reiser4_invalidatepage, + /* this is called by shrink_cache() so that file system can try to + release objects (jnodes, buffers, journal heads) attached to page + and, may be made page itself free-able. + */ + .releasepage = reiser4_releasepage, + .direct_IO = NULL +}; + +/* called just before page is released (no longer used by reiser4). Callers: + jdelete() and extent2tail(). */ +reiser4_internal void +drop_page(struct page *page) +{ + assert("nikita-2181", PageLocked(page)); + clear_page_dirty(page); + ClearPageUptodate(page); +#if defined(PG_skipped) + ClearPageSkipped(page); +#endif + if (page->mapping != NULL) { + remove_from_page_cache(page); + unlock_page(page); + /* page removed from the mapping---decrement page counter */ + page_cache_release(page); + } else + unlock_page(page); +} + + +/* distinguish jnodes with and without pages, captured and not */ +static void +invalidate_unformatted(jnode *node) +{ + struct page *page; + + LOCK_JNODE(node); + page = node->pg; + if (page) { + page_cache_get(page); + UNLOCK_JNODE(node); + truncate_mapping_pages_range(page->mapping, page->index, 1); + page_cache_release(page); + } else { + JF_SET(node, JNODE_HEARD_BANSHEE); + uncapture_jnode(node); + unhash_unformatted_jnode(node); + } +} + +#define JNODE_GANG_SIZE (16) + +static int +truncate_inode_jnodes_range(struct inode *inode, unsigned long from, unsigned long count) +{ + reiser4_inode *info; + int truncated_jnodes; + reiser4_tree *tree; + unsigned long index; + unsigned long end; + + truncated_jnodes = 0; + + info = 
reiser4_inode_data(inode); + tree = tree_by_inode(inode); + + index = from; + end = from + count; + + while (1) { + jnode *gang[JNODE_GANG_SIZE]; + int taken; + int i; + jnode *node; + + assert("nikita-3466", index <= end); + + RLOCK_TREE(tree); + taken = radix_tree_gang_lookup(&info->jnode_tree, (void **)gang, + index, JNODE_GANG_SIZE); + for (i = 0; i < taken; ++i) { + node = gang[i]; + if (index_jnode(node) < end) + jref(node); + else + gang[i] = NULL; + } + RUNLOCK_TREE(tree); + + for (i = 0; i < taken; ++i) { + node = gang[i]; + if (node != NULL) { + index = max(index, index_jnode(node)); + invalidate_unformatted(node); + truncated_jnodes ++; + jput(node); + } else + break; + } + if (i != taken || taken == 0) + break; + } + return truncated_jnodes; +} + +/* invalidate @count pages of @mapping starting from page index @from (@count + must not be 0): call invalidate_mmap_range() for the corresponding byte + range, then truncate_mapping_pages_range() for the pages and + truncate_inode_jnodes_range() for the jnodes of the host inode in the same + page range. */ +reiser4_internal void +reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from, unsigned long count) +{ + loff_t from_bytes, count_bytes; + + from_bytes = ((loff_t)from) << PAGE_CACHE_SHIFT; + assert("vs-1621", count != 0); + count_bytes = ((loff_t)count) << PAGE_CACHE_SHIFT; + + invalidate_mmap_range(mapping, from_bytes, count_bytes); + truncate_mapping_pages_range(mapping, from, count); + truncate_inode_jnodes_range(mapping->host, from, count); +} + + +#if REISER4_DEBUG_OUTPUT + +#define page_flag_name( page, flag ) \ + ( test_bit( ( flag ), &( page ) -> flags ) ?
((#flag "|")+3) : "" ) + +reiser4_internal void +print_page(const char *prefix, struct page *page) +{ + if (page == NULL) { + printk("null page\n"); + return; + } + printk("%s: page index: %lu mapping: %p count: %i private: %lx\n", + prefix, page->index, page->mapping, atomic_read(&page->count), page->private); + printk("\tflags: %s%s%s%s %s%s%s%s %s%s%s%s %s%s%s\n", + page_flag_name(page, PG_locked), + page_flag_name(page, PG_error), + page_flag_name(page, PG_referenced), + page_flag_name(page, PG_uptodate), + page_flag_name(page, PG_dirty), + page_flag_name(page, PG_lru), + page_flag_name(page, PG_active), + page_flag_name(page, PG_slab), + page_flag_name(page, PG_highmem), + page_flag_name(page, PG_checked), + page_flag_name(page, PG_arch_1), + page_flag_name(page, PG_reserved), + page_flag_name(page, PG_private), page_flag_name(page, PG_writeback), page_flag_name(page, PG_nosave)); + if (jprivate(page) != NULL) { + print_jnode("\tpage jnode", jprivate(page)); + printk("\n"); + } +} + +reiser4_internal void +print_page_state(const char *prefix, struct page_state *ps) +{ + printk("%i: %s: " + "free: %u, " + "dirty: %lu, " + "writeback: %lu, " +// "pagecache: %lu, " +// "page_table_pages: %lu, " +// "reverse_maps: %lu, " + "mapped: %lu, " + "slab: %lu, " +// "pgpgin: %lu, " +// "pgpgout: %lu, " +// "pswpin: %lu, " +// "pswpout: %lu, " +// "pgalloc: %lu, " +// "pgfree: %lu, " +// "pgactivate: %lu, " +// "pgdeactivate: %lu, " +// "pgfault: %lu, " +// "pgmajfault: %lu, " +// "pgscan: %lu, " +// "pgrefill: %lu, " +// "pgsteal: %lu, " + "kswapd_steal: %lu, " +// "pageoutrun: %lu, " +// "allocstall: %lu + "\n", current->pid, prefix, + + nr_free_pages(), + ps->nr_dirty, + ps->nr_writeback, +// ps->nr_pagecache, +// ps->nr_page_table_pages, +// ps->nr_reverse_maps, + ps->nr_mapped, + ps->nr_slab, +// ps->pgpgin, +// ps->pgpgout, +// ps->pswpin, +// ps->pswpout, +// ps->pgalloc, +// ps->pgfree, +// ps->pgactivate, +// ps->pgdeactivate, +// ps->pgfault, +// ps->pgmajfault, 
+// ps->pgscan, +// ps->pgrefill, +// ps->pgsteal, + ps->kswapd_steal //, +// ps->pageoutrun, +// ps->allocstall + ); +} + +reiser4_internal void +print_page_stats(const char *prefix) +{ + struct page_state ps; + get_full_page_state(&ps); + print_page_state(prefix, &ps); +} + + +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/page_cache.h linux-2.6.4-ck1/fs/reiser4/page_cache.h --- linux-2.6.4/fs/reiser4/page_cache.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/page_cache.h 2004-03-11 22:45:15.270513162 +1100 @@ -0,0 +1,68 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ +/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */ + +#if !defined( __REISER4_PAGE_CACHE_H__ ) +#define __REISER4_PAGE_CACHE_H__ + +#include "forward.h" +#include "debug.h" + +#include /* for struct super_block, address_space */ +#include /* for struct page */ +#include /* for lock_page() */ + +extern int init_fakes(void); +extern int init_formatted_fake(struct super_block *super); +extern int done_formatted_fake(struct super_block *super); + +extern reiser4_tree *tree_by_page(const struct page *page); + +#if REISER4_TRACE_TREE +extern char *jnode_short_info(const jnode *j, char *buf); +extern int reiser4_submit_bio_helper(const char *moniker, + int rw, struct bio *bio); +#define reiser4_submit_bio(rw, bio) \ + reiser4_submit_bio_helper(__FUNCTION__, (rw), (bio)) +#else +#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio)) +#endif + +extern void reiser4_wait_page_writeback (struct page * page); +static inline void lock_and_wait_page_writeback (struct page * page) +{ + lock_page(page); + if (unlikely(PageWriteback(page))) + reiser4_wait_page_writeback(page); +} + +#define jprivate(page) ((jnode *) (page)->private) + +extern int page_io(struct 
page *page, jnode * node, int rw, int gfp); +extern int reiser4_writepage(struct page *page, struct writeback_control *wbc); +extern void drop_page(struct page *page); +extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from, unsigned long count); + +#if REISER4_DEBUG_OUTPUT +extern void print_page(const char *prefix, struct page *page); +extern void print_page_state(const char *prefix, struct page_state *ps); +extern void print_page_stats(const char *prefix); +#else +#define print_page(prf, p) noop +#define print_page_state(prefix, ps) noop +#define print_page_stats(prefix) noop +#endif + +/* __REISER4_PAGE_CACHE_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/parser/lex.l linux-2.6.4-ck1/fs/reiser4/parser/lex.l --- linux-2.6.4/fs/reiser4/parser/lex.l 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/parser/lex.l 2004-03-11 22:45:15.271513007 +1100 @@ -0,0 +1,65 @@ +%Start Normal + +LETTER [A-Za-z_] +DIG [0-9] +WRD ({LETTER}|{DIG})+ +SP [ \t\n]+ +SPECIAL [\[\]\{\}\/\\\,\:\;\*\$\@\!\`\'] + + + +%% + +%{ +BEGIN Normal; +%} + +{SP}"/"{SP} {return BLANK_SLASH_BLANK;} + +{SP}?";"{SP}? {return SEMICOLON;} +{SP}?","{SP}? {return COMMA;} +{SP}?"+"{SP}? {return PLUS;} +{SP}?"("{SP}? {return L_PARENT;} +{SP}?")"{SP}? {return R_PARENT;} +{SP}?"{"{SP}? {return L_FLX_PARENT;} +{SP}?"}"{SP}? {return R_FLX_PARENT;} +{SP}?"["{SP}? {return L_SKW_PARENT;} +{SP}?"]"{SP}? {return R_SKW_PARENT;} + +{SP}?eq{SP}? {return EQ;} +{SP}?ne{SP}? {return NE;} +{SP}?le{SP}? {return LE;} +{SP}?ge{SP}? {return GE;} +{SP}?lt{SP}? {return LT;} +{SP}?gt{SP}? {return GT;} +{SP}?is{SP}? {return IS;} +{SP}?and{SP}? {return AND;} +{SP}?or{SP}? {return OR;} +{SP}?not{SP}? {return NOT;} +{SP}?if{SP}? {return IF;} +{SP}?then{SP}? {return THEN;} +{SP}?else{SP}? {return ELSE;} +{SP}?exist{SP}? 
{return EXIST;} + +{SP}?"<""-"{SP}? {return L_ASSIGN;} +{SP}?"<""-""="{SP}? {return L_SYMLINK;} + +{SP}?tw"/""("{SP}? {return TRANSCRASH;} + +"/"process {return SLASH_PROCESS;} +"/"stat {return SLASH_STAT;} +"/"range {return SLASH_RANGE;} +"/""(" {return SLASH_L_PARENT;} +"/" {return SLASH;} + + +{SP}?"]"{SP}? {return BLANK_SLASH_BLANK;} + + +{WRD} { return WORD ;} + +. { return 0 ;} +%% + + + diff -Naurp linux-2.6.4/fs/reiser4/parser/lib.c linux-2.6.4-ck1/fs/reiser4/parser/lib.c --- linux-2.6.4/fs/reiser4/parser/lib.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/parser/lib.c 2004-03-11 22:45:15.273512696 +1100 @@ -0,0 +1,1442 @@ +/* + * Copyright 2001, 2002 by Hans Reiser, licensing governed by reiser4/README + */ + +/* + * functions for parser.y + */ + + +#include "lib.h" + +#include + +/* FIXME:NIKITA->VOVA this file uses indentation completely different than the + * rest of reiser4 and kernel. This complicates reading of the code by other + * people. I think this should be changed. + * OK. But after it's works*/ + + + +#define LEX_XFORM 1001 +#define LEXERR2 1002 +#define LEX_Ste 1003 + +/* printing errors for parsing: builds a message in a local buffer from + * @msgnum (for message 11111 the parser state is taken from the variable + * arguments) and prints it, followed by the current symbol of @ws */ +static void yyerror( struct reiser4_syscall_w_space *ws /* work space ptr */, + int msgnum /* message number */, ...)
+{ + char errstr[120]={"\nreiser4 parser:"}; + va_list args; + va_start(args, msgnum); + switch (msgnum) { + case 101: + strcat(errstr,"yacc stack overflow"); + break; + case LEX_XFORM: + strcat(errstr,"x format has odd number of symbols"); + break; + case LEXERR2: +/* int state = va_arg(args, int);*/ + strcat(errstr,"internal lex table error"); + break; + case LEX_Ste: + strcat(errstr,"wrong lexem"); + break; + case 11111: { + int state = va_arg(args, int); + /* int s = va_arg(args, int);*/ + strcat(errstr," syntax error:"); + switch(state) { + case 4: + strcat(errstr,"wrong operation"); + break; + case 6: + strcat(errstr,"wrong assign operation"); + break; + case 7: + case 14: + strcat(errstr,"wrong name"); + break; + case 26: + strcat(errstr,"wrong logical operation"); + break; + case 9: + strcat(errstr,"wrong THEN keyword"); + break; + case 36: + case 49: + strcat(errstr,"wrong separatop"); + break; + default: + strcat(errstr,"syntax error"); + break; + } + } + break; + } + va_end(args); + /* never pass a built-up buffer as the format string itself */ + printk("%s", errstr); + printk("\n%s",curr_symbol(ws)); +} + +//static int yywrap() +//{ +// return 1; +//} + +/* free lists of work space*/ +static void freeList(freeSpace_t * list /* head of list to be freed */) +{ + freeSpace_t * curr,* next; + next = list; + while (next) { + curr = next; + next = curr->freeSpace_next; + kfree(curr); + } +} + +/* free work space*/ +static int reiser4_pars_free(struct reiser4_syscall_w_space * ws /* work space ptr */) +{ + + dput( ws->root_e->lnode.lnode->dentry.dentry ); + mntput( ws->root_e->lnode.lnode->dentry.mnt ); + + dput( ws->cur_level->cur_exp->lnode.lnode->dentry.dentry ); + mntput( ws->cur_level->cur_exp->lnode.lnode->dentry.mnt ); + + if (ws->freeSpHead) { + freeList(ws->freeSpHead); + } + kfree(ws); + return 0; +} + +/* FIXME:NIKITA->VOVA code below looks like custom made memory allocator. Why + * not to use slab?
*/ +#define initNextFreeSpace(fs) (fs)->freeSpace_next = NULL; \ + (fs)->freeSpaceMax = (fs)->freeSpaceBase+FREESPACESIZE; \ + (fs)->freeSpace = (fs)->freeSpaceBase + + +/* allocate work space */ +static freeSpace_t * freeSpaceAlloc() +{ + freeSpace_t * fs; + fs = ( freeSpace_t * ) kmalloc( sizeof( freeSpace_t ),GFP_KERNEL ) ; + assert("VD kmalloc work space",fs!=NULL); + memset( fs , 0, sizeof( freeSpace_t )); + initNextFreeSpace(fs); + + + + return fs; +} + +#define get_first_freeSpHead(ws) (ws)->freeSpHead +#define get_next_freeSpHead(curr) (curr)->freeSpace_next + + +/* allocate next work space */ +static freeSpace_t * freeSpaceNextAlloc(struct reiser4_syscall_w_space * ws /* work space ptr */ ) +{ + freeSpace_t * curr,* next; + curr=NULL; + next = get_first_freeSpHead(ws); + while (next) { + curr = next; + next = get_next_freeSpHead(curr); + } + next = freeSpaceAlloc(); + if(curr==NULL) { + ws->freeSpHead=next; + } + else { + curr->freeSpace_next=next; + } + next->freeSpace_next=NULL; + return next; +} + +/* allocate field lenth=len in work space */ +static char* list_alloc(struct reiser4_syscall_w_space * ws/* work space ptr */, + int len/* lenth of structures to be allocated in bytes */) +{ + char * rez; + if( (ws->freeSpCur->freeSpace+len) > (ws->freeSpCur->freeSpaceMax) ) { + ws->freeSpCur = freeSpaceNextAlloc(ws); + } + rez = ws->freeSpCur->freeSpace; + ws->freeSpCur->freeSpace += ROUND_UP(len); + return rez; +} + +/* allocate new level of parsing in work space */ +static streg_t *alloc_new_level(struct reiser4_syscall_w_space * ws /* work space ptr */ ) +{ + return ( streg_t *) list_alloc(ws,sizeof(streg_t)); +} + +/* allocate structure of new variable of input expression */ +static pars_var_t * alloc_pars_var(struct reiser4_syscall_w_space * ws /* work space ptr */, + pars_var_t * last_pars_var /* last of allocated pars_var or NULL if list is empty */) +{ + pars_var_t * pars_var; + PTRACE(ws, "begin ws->Head_pars_var =%p 
last_pars_var=%p",ws->Head_pars_var, last_pars_var); + pars_var = (pars_var_t *)list_alloc(ws,sizeof(pars_var_t)); + if ( last_pars_var == NULL ) { + ws->Head_pars_var = pars_var; + } + else { + last_pars_var->next = pars_var; + } + pars_var->next = NULL; + PTRACE(ws, "return pars_var =%p ",pars_var); + return pars_var; +} + +/* free lnodes used in expression */ +static int free_expr( /*struct reiser4_syscall_w_space * ws, */ expr_v4_t * expr) +{ + expr_list_t * tmp; + int ret = 0; + assert("VD-free_expr", expr!=NULL); + switch (expr->h.type) { + case EXPR_WRD: + break; + case EXPR_PARS_VAR: + assert("VD-free_expr.EXPR_PARS_VAR", expr->pars_var.v!=Null); + assert("VD-free_expr.EXPR_PARS_VAR.ln", expr->pars_var.v->ln!=Null); + if (!--expr->pars_var.v->count) { + lput(expr->pars_var.v->ln); + } + break; + case EXPR_LIST: + tmp=&expr->list; + while (tmp) { + assert("VD-free_expr.EXPR_LIST", tmp->h.type==EXPR_LIST); + ret |= free_expr(tmp->source); + tmp = tmp->next; + } + break; + case EXPR_ASSIGN: + assert("VD-free_expr.EXPR_ASSIGN", expr->assgn.target!=Null); + assert("VD-free_expr.EXPR_ASSIGN.ln", expr->assgn.target->ln!=Null); + assert("VD-free_expr.EXPR_ASSIGN.count", expr->assgn.target->count>0); + if (!--expr->assgn.target->count) { + lput(expr->assgn.target->ln); + } + ret |= free_expr(expr->assgn.source); + break; + case EXPR_LNODE: + assert("VD-free_expr.lnode.lnode", expr->lnode.lnode!=Null); + dput( expr->lnode.lnode->dentry.dentry ); + mntput( expr->lnode.lnode->dentry.mnt ); + lput(expr->lnode.lnode); + break; + case EXPR_FLOW: + break; +/* + case EXPR_OP3: + free_expr(expr->op3.op_r); + free_expr(expr->op3.op_l); + free_expr(expr->op3.op); + break; +*/ + case EXPR_OP2: + ret = free_expr(expr->op2.op_r); + ret |= free_expr(expr->op2.op_l); + break; + case EXPR_OP: + ret = free_expr(expr->op.op); + break; + } + return ret; +} + + +//ln->inode.inode->i_op->lookup(struct inode *,struct dentry *); +//current->fs->pwd->d_inode->i_op->lookup(struct inode 
*,struct dentry *); + +#if 0 +/* alloca te space for lnode */ +static lnode * alloc_lnode(struct reiser4_syscall_w_space * ws /* work space ptr */ ) +{ + lnode * ln; + ln = ( lnode * ) kmalloc( sizeof( lnode ), GFP_KERNEL); + assert("VD-alloc_pars_var", ln != NULL ); + memset( ln , 0, sizeof( lnode )); + return ln; +} +#endif + +/* make lnode_dentry from inode, except reiser4 inode */ +static lnode * get_lnode(struct reiser4_syscall_w_space * ws /* work space ptr */ ) +{ + lnode * ln; + reiser4_key key, * k_rez,* l_rez; + +#if 0 /*def NOT_YET*/ + if ( is_reiser4_inode( ws->nd.dentry->inode ) ) { + + k_rez = build_sd_key( ws->nd.dentry->inode, &key); + ln = lget( LNODE_REISER4_INODE, get_inode_oid( ws->nd.dentry->inode) ); + // ln->lw.lw_sb = ws->nd.dentry->inode->isb; + ln->reiser4_inode.inode = /*????*/ ws->nd.dentry->inode->isb; + ln->reiser4_inode.inode = /*????*/ ws->nd.dentry->inode->isb; + PTRACE( ws, "r4: lnode=%p", ln ); + } + else +#endif + { + ln = lget( LNODE_DENTRY, get_inode_oid( ws->nd.dentry->d_inode) ); + ln->dentry.dentry = ws->nd.dentry; + ln->dentry.mnt = ws->nd.mnt; + PTRACE( ws, "no r4 lnode=%p,dentry=%p", ln, ln->dentry.dentry); + } + PTRACE( ws, " lnode=%p", ln ); + return ln; +} + +/* allocate work space, initialize work space, tables, take root inode and PWD inode */ +static struct reiser4_syscall_w_space * reiser4_pars_init() +{ + struct reiser4_syscall_w_space * ws; + /* allocate work space for parser + working variables, attached to this call */ + ws = kmalloc( sizeof( struct reiser4_syscall_w_space ), GFP_KERNEL ); + assert("VD_allock work space", ws != NULL); + memset( ws, 0, sizeof( struct reiser4_syscall_w_space )); + ws->ws_yystacksize = MAXLEVELCO; /* must be 500 by default */ + ws->ws_yymaxdepth = MAXLEVELCO; /* must be 500 by default */ + /* allocate first part of working tables + and initialise headers */ + ws->freeSpHead = freeSpaceAlloc(); + ws->freeSpCur = ws->freeSpHead; + ws->wrdHead = NULL; + ws->root_e = init_root(ws); + 
ws->cur_level = alloc_new_level(ws); + ws->cur_level->cur_exp = init_pwd(ws); + ws->cur_level->wrk_exp = ws->cur_level->cur_exp; /* current wrk for new level */ + ws->cur_level->prev = NULL; + ws->cur_level->next = NULL; + ws->cur_level->level = 0; + ws->cur_level->stype = 0; + return ws; +} + + +/* level up of parsing level: step into the next nesting level (allocated, linked and numbered on first use), store @type in it and start it from the parent's work expression */ +static void level_up(struct reiser4_syscall_w_space *ws /* work space ptr */, + long type /* type of level we going to */) +{ + PTRACE(ws, "%s", "begin"); + if (ws->cur_level->next==NULL) { + ws->cur_level->next = alloc_new_level(ws); + ws->cur_level->next->prev = ws->cur_level; + ws->cur_level->next->next = NULL; + ws->cur_level->next->level = ws->cur_level->level+1; /* number the NEW level from the current one; the root level has prev==NULL, so prev must not be dereferenced here */ + } + ws->cur_level = ws->cur_level->next; + ws->cur_level->stype = type; + ws->cur_level->cur_exp = ws->cur_level->prev->wrk_exp; /* current pwd for new level */ + ws->cur_level->wrk_exp = ws->cur_level->cur_exp; /* current wrk for new level */ +} + +/* level down of parsing level */ +static void level_down(struct reiser4_syscall_w_space * ws /* work space ptr */, + long type1 /* type of level that was up( for checking) */, + long type2 /* type of level that is down(for checking)*/) +{ + assert("VD-level_down: type mithmatch", type1==type2); + assert("VD-level_down: type mithmatch with level", type1==ws->cur_level->stype); +// path_release(ws->cur_level->path_walk->nd); ?????? +// this is wrong ???? ws->cur_level->prev->wrk_exp = ws->cur_level->wrk_exp ; /* current wrk for new level */ + ws->cur_level = ws->cur_level->prev; +} + +/* move_selected_word - copy term from input bufer to free space. + * if it need more, move freeSpace to the end. + * otherwise next term will owerwrite it + * freeSpace is a kernel space no need make getnam(). 
+ * exclude is for special for string: store without '' + */ +static void move_selected_word(struct reiser4_syscall_w_space * ws /* work space ptr */, + int exclude /* TRUE - for storing string without first and last symbols + FALS - for storing names */ ) +{ + int i; + /* char * s= ws->ws_pline;*/ + if (exclude) { + ws->yytext++; + } + for( ws->tmpWrdEnd = ws->freeSpCur->freeSpace; ws->yytext < curr_symbol(ws); ) { + i=0; + // while( *ws->yytext == '\'' ) + // { + // ws->yytext++; + // i++; + // } + // while ( ws->yytext > curr_symbol(ws) ) + // { + // i--; + // ws->yytext--; + // } + // if ( i ) for ( i/=2; i; i-- ) *ws->tmpWrdEnd++='\''; /* in source text for each '' - result will ' */ + /* \???????? */ + if ( *ws->yytext == '\\' ) { + int tmpI; + ws->yytext++; + switch ( tolower( (int)*(ws->yytext) ) ) { + case 'x': /* \x01..9a..e */ + i = 0; + tmpI = 1; + while( tmpI) { + if (isdigit( (int)*(ws->yytext) ) ) { + i = (i << 4) + ( *ws->yytext++ - '0' ); + } + else if( tolower( (int) *(ws->yytext) ) >= 'a' && tolower( (int)*(ws->yytext) ) <= 'e' ) { + i = (i << 4) + ( *ws->yytext++ - 'a' + 10 ); + } + else { + if ( tmpI & 1 ) { + yyerror( ws, LEX_XFORM ); /* x format has odd number of symbols */ + } + tmpI = 0; + } + if ( tmpI && !( tmpI++ & 1 ) ) { + *ws->tmpWrdEnd++ = (unsigned char) i; + i = 0; + } + } + break; + } + } + else *ws->tmpWrdEnd++ = *ws->yytext++; + if( ws->tmpWrdEnd > (ws->freeSpCur->freeSpaceMax - sizeof(wrd_t)) ) { + + assert ("VD sys_reiser4. 
selectet_word:Internal space buffer overflow: input token exceed size of bufer", + ws->freeSpCur->freeSpace > ws->freeSpCur->freeSpaceBase); + /* we can reallocate new space and copy all + symbols of current token inside it */ + { + freeSpace_t * tmp; + tmp=ws->freeSpCur; + ws->freeSpCur = freeSpaceNextAlloc(ws); + assert ("VD sys_reiser4:Internal text buffer overflow: no enouse mem", ws->freeSpCur !=NULL); + { + int i; + i = ws->tmpWrdEnd - tmp->freeSpace; + memmove( ws->freeSpCur->freeSpace, tmp->freeSpace, i ); + ws->tmpWrdEnd = ws->freeSpCur->freeSpace + i; + } + } + } + } +#if 0 + if (exclude) { + ws->tmpWrdEnd--; + } +#endif + *ws->tmpWrdEnd++ = '\0'; +} + + +/* compare parsed word with keywords: binary search of pars_key[] for the word stored at ws->freeSpCur->freeSpace; returns the class of the matching keyword, or 0 when there is no match (presumably pars_key[] is sorted by wrd - confirm at its definition) */ +static int b_check_word(struct reiser4_syscall_w_space * ws /* work space ptr */) +{ + int i, j, l, cmp; + j=sizeof(pars_key)/(sizeof(char*)+sizeof(int))-1; + l=0; + while( ( j - l ) >= 0 ) { + i = ( j + l /*+ 1*/ ) >> 1; + cmp = strcmp( pars_key[i].wrd, ws->freeSpCur->freeSpace ); switch( ( cmp > 0 ) - ( cmp < 0 ) ) { /* strcmp() only guarantees the sign of its result, so reduce it to -1/0/1 before matching */ + case 0: + PTRACE(ws,"founded: i=%d, %s, %d", i, pars_key[i].wrd, pars_key[i].class); + return( pars_key[i].class ); + break; + case 1: j = i - 1; break; + default: l = i + 1; break; + } + } + return(0); +} + + +/* comparing parsed word with already stored words, if not compared, storing it */ +static __inline__ wrd_t * _wrd_inittab(struct reiser4_syscall_w_space * ws /* work space ptr */ ) +{ + wrd_t * cur_wrd; + wrd_t * new_wrd; + int len; + new_wrd = ws->wrdHead; +#if 0 + len = strlen( ws->freeSpCur->freeSpace) ; +#else + len = ws->tmpWrdEnd - ws->freeSpCur->freeSpace - 1 ; +#endif + PTRACE( ws, "wrd %s len=%d wrdHead=%p", ws->freeSpCur->freeSpace, len ,ws->wrdHead ); + cur_wrd = NULL; + while ( !( new_wrd == NULL ) ) { + cur_wrd = new_wrd; + if ( cur_wrd->u.len == len ) { + if( !memcmp( cur_wrd->u.name, ws->freeSpCur->freeSpace, cur_wrd->u.len ) ) { + PTRACE( ws, "wrd %s len=%d founded=%p", ws->freeSpCur->freeSpace, len ,cur_wrd ); + return cur_wrd; + } + } + new_wrd = 
cur_wrd->next; + } + new_wrd = ( wrd_t *)(ws->freeSpCur->freeSpace + ROUND_UP( len+1 )); + new_wrd->u.name = ws->freeSpCur->freeSpace; + new_wrd->u.len = len; + ws->freeSpCur->freeSpace= (char*)new_wrd + ROUND_UP(sizeof(wrd_t)); + new_wrd->next = NULL; + if (cur_wrd==NULL) { + ws->wrdHead = new_wrd; + } + else { + cur_wrd->next = new_wrd; + } + PTRACE( ws, "wrd len=%d new=%p, name=%p name=%s len=%d", len , new_wrd, new_wrd->u.name, new_wrd->u.name, new_wrd->u.len ); + return new_wrd; +} + +/* lexical analisator for yacc automat */ +static int reiser4_lex( struct reiser4_syscall_w_space * ws /* work space ptr */) +{ + char term, n, i = 0; + int ret = 0; + char lcls; +// char * s ; + +// s = curr_symbol(ws); /* first symbol or Last readed symbol of the previous token parsing */ + if ( *curr_symbol(ws) == 0 ) return 0; /* end of string is EOF */ + + while(ncl[(int)*curr_symbol(ws)]==Blk) { + next_symbol(ws); + if ( *curr_symbol(ws) == 0 ) return 0; /* end of string is EOF */ + } + + + lcls = ncl[(int)*curr_symbol(ws)]; + ws->yytext = curr_symbol(ws); + term = 1; + while( term ) { + n=lcls; + while ( n > 0 ) { + next_symbol(ws); + lcls=n; + n = lexcls[ (int)lcls ].c[ (int)i=ncl[ (int)*curr_symbol(ws) ] ]; + } + if ( n == OK ) { + term=0; + } + else { + yyerror ( ws, LEXERR2, (lcls-1)* 20+i ); + return(0); + } + } + switch (lcls) { + case Blk: + case Ste: + yyerror(ws,LEX_Ste); + break; + case Wrd: + move_selected_word( ws, lexcls[(int) lcls ].c[0] ); + /* if ret>0 this is keyword */ + if ( !(ret = b_check_word(ws)) ) { /* this is not keyword. tray check in worgs. 
ret = Wrd */ + ret=lexcls[(int) lcls ].term; + ws->ws_yylval.wrd = _wrd_inittab(ws); + } + break; + case Int: + case Ptr: + case Pru: + case Str: /*`......"*/ + move_selected_word( ws, lexcls[(int) lcls ].c[0] ); + ret=lexcls[(int) lcls ].term; + ws->ws_yylval.wrd = _wrd_inittab(ws); + break; + /* + move_selected_word( ws, lexcls[ lcls ].c[0] ); + ret=lexcls[ lcls ].term; + ws->ws_yyval.w = _wrd_inittab(ws); + break; + */ + case Stb: + case Com: + case Mns: + case Les: + case Slh: + case Bsl: /*\ */ + case Sp1: /*;*/ + case Sp2: /*:*/ + case Dot: /*.*/ + case Sp4: /*=*/ + case Sp5: /*>*/ + case Sp6: /*?*/ + case ASG:/*<-*/ + case App:/*<<-*/ + case Lnk:/*->*/ + ret=lexcls[(int) lcls ].term; + break; + case Lpr: + case Rpr: + ws->ws_yylval.charType = *ws->yytext ; + ret=lexcls[(int) lcls ].term; + break; + default : /* others */ + ret=*ws->yytext; + break; + } + return ret; +} + + + +/*==========================================================*/ + +/* allocate new expression @type */ +static expr_v4_t * alloc_new_expr(struct reiser4_syscall_w_space * ws /* work space ptr */, + int type /* type of new expression */) +{ + expr_v4_t * e; + e = ( expr_v4_t *) list_alloc( ws, sizeof(expr_v4_t)); + e->h.type = type; + return e; +} + +/* store NULL name in word table */ +wrd_t * nullname(struct reiser4_syscall_w_space * ws /* work space ptr */) +{ + PTRACE(ws, "%s", "begin"); + ws->tmpWrdEnd = ws->freeSpCur->freeSpace; + *ws->tmpWrdEnd++ = 0; + return _wrd_inittab(ws); +} + +/* initialize node for root lnode */ +static expr_v4_t * init_root(struct reiser4_syscall_w_space * ws /* work space ptr */) +{ + expr_v4_t * e; + e = alloc_new_expr( ws, EXPR_PARS_VAR ); + e->pars_var.v = alloc_pars_var( ws, NULL ); + e->pars_var.v->w = nullname(ws) ; /* or '/' ????? 
*/ + ws->nd.flags = LOOKUP_NOALT; + +// walk_init_root( "/", (&ws->nd)); /* from namei.c walk_init_root */ + read_lock(&current->fs->lock); + ws->nd.mnt = mntget(current->fs->rootmnt); /*????*/ + ws->nd.dentry = dget(current->fs->root); + read_unlock(&current->fs->lock); + + e->pars_var.v->ln = get_lnode( ws ) ; + + e->pars_var.v->parent = NULL; + return e; +} + + +/* initialize node for PWD lnode: builds an EXPR_PARS_VAR expression whose pars_var is chained after the root pars_var, named by nullname(), bound to an lnode made from current->fs->pwd / pwdmnt (references taken under fs->lock) and parented to the root pars_var */ +static expr_v4_t * init_pwd(struct reiser4_syscall_w_space * ws /* work space ptr */) +{ + expr_v4_t * e; + e = alloc_new_expr(ws,EXPR_PARS_VAR); + e->pars_var.v = alloc_pars_var(ws,ws->root_e->pars_var.v); + + e->pars_var.v->w = nullname(ws) ; /* better if it will point to full pathname for pwd */ + +// path_lookup(".",,&(ws->nd)); /* from namei.c path_lookup */ + read_lock(&current->fs->lock); + ws->nd.mnt = mntget(current->fs->pwdmnt); + ws->nd.dentry = dget(current->fs->pwd); + read_unlock(&current->fs->lock); + current->total_link_count = 0; + + e->pars_var.v->ln = get_lnode( ws ) ; + e->pars_var.v->parent = ws->root_e->pars_var.v; + + return e; +} + + +#if 0 +static expr_v4_t * pars_lookup(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2) +{ + not ready; + pars_var_t * rez_pars_var; + pars_var_t * this_l; + this_l = getFirstPars_Var(e1); + while(this_l != NULL ) { + } + assert("pars_lookup:lnode is null",rez_pars_var->ln!=NULL); + memcpy( &curent_dentry.d_name , w, sizeof(struct qstr));<--------------- + if( ( rez_pars_var->ln = pars_var->ln->d_inode->i_op->lookup( pars_var->ln->d_inode, &curent_dentry) ) == NULL ) { + /* lnode not exist: we will not need create it. 
this is error*/ + } +} +#endif + +/* Object_Name : begin_from name %prec ROOT { $$ = pars_expr( ws, $1, $2 ) ; } + | Object_Name SLASH name { $$ = pars_expr( ws, $1, $3 ) ; } */ +static expr_v4_t * pars_expr(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expression ( not yet used)*/, + expr_v4_t * e2 /* second expression*/) +{ + ws->cur_level->wrk_exp=e2; + return e2; +} + +/* not yet: returns the pars_var of the current level's work expression when that expression is an EXPR_PARS_VAR, otherwise a null pointer */ +static pars_var_t * getFirstPars_VarFromExpr(struct reiser4_syscall_w_space * ws ) +{ + pars_var_t * ret = 0; + expr_v4_t * e = ws->cur_level->wrk_exp; + switch (e->h.type) { + case EXPR_PARS_VAR: + ret = e->pars_var.v; + break; + default: + break; /* a label must be followed by a statement; other expression types are not handled yet */ + } + return ret; +} + +/* search pars_var for @w */ +static expr_v4_t * lookup_word(struct reiser4_syscall_w_space * ws /* work space ptr */, + wrd_t * w /* word to search for */) +{ + expr_v4_t * e; + pars_var_t * cur_pars_var; + PTRACE(ws, "&w=%p,w->u.name=%p, %s",w,w->u.name,w->u.name); +#if 1 /* tmp. this is fist version. for II we need do "while" throus expression for all pars_var */ + cur_pars_var = ws->cur_level->wrk_exp->pars_var.v; + + + +#else + cur_pars_var = getFirstPars_VarFromExpr(ws); + while(pars_var!=NULL) + { +#endif + + + + e = alloc_new_expr( ws, EXPR_PARS_VAR ); + + e->pars_var.v = lookup_pars_var_word( ws, cur_pars_var, w ); + + +#if 0 + pars_var=getNextPars_VarFromExpr(ws); + } + all rezult mast be connected to expression. 
+#endif + + + PTRACE(ws, "end e=%p",e); + return e; +} + +/* set work path in level to current in level */ +static inline expr_v4_t * pars_lookup_curr(struct reiser4_syscall_w_space * ws /* work space ptr */) +{ + ws->cur_level->wrk_exp = ws->cur_level->cur_exp; /* current wrk for pwd of level */ + return ws->cur_level->wrk_exp; +} + +/* set work path in level to root */ +static inline expr_v4_t * pars_lookup_root(struct reiser4_syscall_w_space * ws) +{ + ws->cur_level->wrk_exp = ws->root_e; /* set current to root */ + return ws->cur_level->wrk_exp; +} + + + +#if 0 +/*?????*/ + +/* implementation of lookup_name() method for hashed directories + + it looks for name specified in @w in reiser4_inode @parent and if name is found - key of object found entry points + to is stored in @key */ +reiser4_internal int +lookup_name_hashed_reiser4(reiser4_inode *parent /* reiser4 inode of directory to lookup for name in */, + wrd_t *w /* name to look for */, + reiser4_key *key /* place to store key */) +{ + int result; + coord_t *coord; + lock_handle lh; + const char *name; + int len; + reiser4_dir_entry_desc entry; + + assert("nikita-1247", parent != NULL); + assert("nikita-1248", w != NULL); + +?? assert("vs-1486", dentry->d_op == &reiser4_dentry_operations); + + result = reiser4_perm_chk(parent, lookup, parent, &w->u); + + + if (result != 0) + return 0; + + name = w->u.name; + len = w->u.len; + + if ( len > parent->pset->dir_item) + /* some arbitrary error code to return */ + return RETERR(-ENAMETOOLONG); + + coord = &reiser4_get_dentry_fsdata(dentry)->dec.entry_coord; ??????? + coord_clear_iplug(coord); + + + + + init_lh(&lh); + + ON_TRACE(TRACE_DIR | TRACE_VFS_OPS, "lookup inode: %lli \"%s\"\n", get_inode_oid(parent), dentry->d_name.name); + + /* find entry in a directory. This is plugin method. */ + + + // result = find_entry(parent, dentry, &lh, ZNODE_READ_LOCK, &entry); + + + if (result == 0) { + /* entry was found, extract object key from it. 
*/ + result = WITH_COORD(coord, item_plugin_by_coord(coord)->s.dir.extract_key(coord, key)); + } + done_lh(&lh); + return result; + +} + +node_plugin_by_node(coord->node)->lookup(coord->node, key, FIND_MAX_NOT_MORE_THAN, &twin); +item_type_by_coord(coord) + +/* + * try to look up built-in pseudo file by its name. + */ +reiser4_internal int +lookup_pseudo_file(reiser4_inode *parent /* reiser4 inode of directory to lookup for name in */, + wrd_t *w /* name to look for */, + reiser4_key *key /* place to store key */) + // struct dentry * dentry) +{ + reiser4_plugin *plugin; + const char *name; + struct inode *pseudo; + int result; + + + + + + + assert("nikita-2999", parent != NULL); + assert("nikita-3000", dentry != NULL); + + /* if pseudo files are disabled for this file system bail out */ + if (reiser4_is_set(parent->i_sb, REISER4_NO_PSEUDO)) + return RETERR(-ENOENT); + + name = dentry->d_name.name; + pseudo = ERR_PTR(-ENOENT); + /* scan all pseudo file plugins and check each */ + for_all_plugins(REISER4_PSEUDO_PLUGIN_TYPE, plugin) { + pseudo_plugin *pplug; + + pplug = &plugin->pseudo; + if (pplug->try != NULL && pplug->try(pplug, parent, name)) { + pseudo = add_pseudo(parent, pplug, dentry); + break; + } + } + if (!IS_ERR(pseudo)) + result = 0; + else + result = PTR_ERR(pseudo); + return result; +} + +#endif + + + +/* seach @parent/w in internal table. 
if found return it, else @parent->lookup(@w) */ +static pars_var_t * lookup_pars_var_word(struct reiser4_syscall_w_space * ws /* work space ptr */, + pars_var_t * parent /* parent for w */, + wrd_t * w /* to lookup for word */) +{ + struct dentry * de, * de_rez; + reiser4_key key,* k_rez; + coord_t coord; + lock_handle lh; + item_plugin *iplug; + pars_var_t * rez_pars_var; + pars_var_t * last_pars_var; + PTRACE(ws, "begin ws->Head_pars_var=%p, parent=%p w=%p",ws->Head_pars_var,parent,w); + + last_pars_var = NULL; + rez_pars_var = ws->Head_pars_var; + while (rez_pars_var!=NULL) { + if( rez_pars_var->parent == parent && rez_pars_var->w == w) { + rez_pars_var->count++; + return rez_pars_var; + } + last_pars_var = rez_pars_var; + rez_pars_var = rez_pars_var->next; + } +// reiser4_fs = 0; + rez_pars_var = alloc_pars_var(ws, last_pars_var); + rez_pars_var->w = w; + rez_pars_var->parent = parent; + +// case EXPR_PARS_VAR: +// /* not yet */ +// ws->nd.dentry=parent->ln->dentry.dentry; +// de_rez = link_path_walk( w->u.name, &(ws->nd) ); /* namei.c */ +// break; + switch (parent->ln->h.type) { + + case LNODE_INODE: /* not use it ! 
*/ + de = d_alloc_anon(parent->ln->inode.inode); + break; + case LNODE_DENTRY: + // de = parent->ln->dentry.dentry; + // de_rez = lookup_one_len( w->u.name, de, w->u.len); /* namei.c */ + + ws->nd.dentry = parent->ln->dentry.dentry; + ws->nd.mnt = parent->ln->dentry.mnt; + ws->nd.flags = LOOKUP_NOALT ; + if ( link_path_walk( w->u.name, &(ws->nd) ) ) /* namei.c */ { + /*????????????*/ + } + else { + rez_pars_var->ln = lget( LNODE_DENTRY, get_inode_oid( ws->nd.dentry->d_inode) ); + rez_pars_var->ln->dentry.dentry = ws->nd.dentry; + rez_pars_var->ln->dentry.mnt = ws->nd.mnt; + } + PTRACE(ws, "rez de=%p",rez_pars_var->ln->dentry.dentry); + break; + /* + case LNODE_PSEUDO: + PTRACE(ws, "parent pseudo=%p",parent->ln->pseudo.host); + break; + */ + case LNODE_LW: + break; + case LNODE_REISER4_INODE: + rez_pars_var->ln->h.type = LNODE_REISER4_INODE /* LNODE_LW */; + +#if 0 /* NOT_YET ???? */ + +// ln = lget( LNODE_DENTRY, get_key_objectid(&key ) ); + + result = coord_by_key(get_super_private(parent->ln->lw.lw_sb)->tree, + parent->ln->lw.key, + &coord, + &lh, + ZNODE_READ_LOCK, + FIND_EXACT, + LEAF_LEVEL, + LEAF_LEVEL, + CBK_UNIQUE, + 0); + // if (REISER4_DEBUG && result == 0) + // check_sd_coord(coord, key); + + if (result != 0) { + lw_key_warning(parent->ln->lw.key, result); + } + else { + switch(item_type_by_coord(coord)) { + case STAT_DATA_ITEM_TYPE: + printk("VD-item type is STAT_DATA\n"); + case DIR_ENTRY_ITEM_TYPE: + printk("VD-item type is DIR_ENTRY\n"); + iplug = item_plugin_by_coord(coord); + if (iplug->b.lookup != NULL) { + iplug->b.lookup(); /*????*/ + } + + + + + + + + + + case INTERNAL_ITEM_TYPE: + printk("VD-item type is INTERNAL\n"); + case ORDINARY_FILE_METADATA_TYPE: + + + case OTHER_ITEM_TYPE: + printk("VD-item type is OTHER\n"); + } + + } + /*?? 
lookup_sd find_item_obsolete */ +#endif + + case LNODE_NR_TYPES: + break; + } + PTRACE(ws, "de=%p w->u.name= %p, u.name->%s, u.len=%d",de,w->u.name,w->u.name,w->u.len); + + return rez_pars_var; + +} + + +/* execute code: walk tree, call plugins and return value */ +static expr_v4_t * make_do_it(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* expression for execution (not yet used)*/ ) +{ + PTRACE(ws, "%s", "begin"); + return e1; +} + +/* if_then_else procedure */ +static expr_v4_t * if_then_else(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* expression of condition */, + expr_v4_t * e2 /* expression of then */, + expr_v4_t * e3 /* expression of else */ ) +{ + PTRACE(ws, "%s", "begin"); + return e1; +} + +/* not yet */ +static expr_v4_t * if_then(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /**/, + expr_v4_t * e2 /**/ ) +{ + PTRACE(ws, "%s", "begin"); + return e1; +} + +/* not yet */ +static void goto_end(struct reiser4_syscall_w_space * ws /* work space ptr */) +{ +} + + +/* STRING_CONSTANT to expression: allocates an EXPR_WRD expression in the work space, points its wd.s at @e1 and returns that expression */ +static expr_v4_t * constToExpr(struct reiser4_syscall_w_space * ws /* work space ptr */, + wrd_t * e1 /* constant for convert to expression */) +{ + expr_v4_t * new_expr = alloc_new_expr(ws, EXPR_WRD ); + new_expr->wd.s = e1; + return new_expr; /* hand back the node just built instead of dropping it */ +} + +/* allocate EXPR_OP2 */ +static expr_v4_t * allocate_expr_op2(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr */, + expr_v4_t * e2 /* second expr */, + int op /* expression code */) +{ + expr_v4_t * ret; + ret = alloc_new_expr( ws, EXPR_OP2 ); + assert("VD alloc op2", ret!=NULL); + ret->h.exp_code = op; + ret->op2.op_l = e1; + ret->op2.op_r = e2; + return ret; +} + +/* allocate EXPR_OP */ +static expr_v4_t * allocate_expr_op(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr */, + int op /* expression code */) +{ + expr_v4_t * ret; + ret = 
alloc_new_expr(ws, EXPR_OP ); /* single-operand node: tag it EXPR_OP so that free_expr() follows op.op and not the op2 members */ + assert("VD alloc op2", ret!=NULL); + ret->h.exp_code = op; + ret->op.op = e1; + return ret; +} + + +/* concatenate expressions */ +static expr_v4_t * connect_expression(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr of connecting */, + expr_v4_t * e2 /* second expr of connecting */) +{ + return allocate_expr_op2( ws, e1, e2, CONNECT ); +} + + +/* compare expressions */ +static expr_v4_t * compare_EQ_expression(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr of comparing */, + expr_v4_t * e2 /* second expr of comparing */) +{ + return allocate_expr_op2( ws, e1, e2, COMPARE_EQ ); +} + + +static expr_v4_t * compare_NE_expression(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr of comparing */, + expr_v4_t * e2 /* second expr of comparing */) +{ + return allocate_expr_op2( ws, e1, e2, COMPARE_NE ); +} + + +static expr_v4_t * compare_LE_expression(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr of comparing */, + expr_v4_t * e2 /* second expr of comparing */) +{ + return allocate_expr_op2( ws, e1, e2, COMPARE_LE ); +} + + +static expr_v4_t * compare_GE_expression(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr of comparing */, + expr_v4_t * e2 /* second expr of comparing */) +{ + return allocate_expr_op2( ws, e1, e2, COMPARE_GE ); +} + + +static expr_v4_t * compare_LT_expression(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr of comparing */, + expr_v4_t * e2 /* second expr of comparing */) +{ + return allocate_expr_op2( ws, e1, e2, COMPARE_LT ); +} + + +static expr_v4_t * compare_GT_expression(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr of comparing */, + expr_v4_t * e2 /* second expr of comparing */) +{ + return allocate_expr_op2( ws, e1, e2, 
COMPARE_GT ); +} + + +static expr_v4_t * compare_OR_expression(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr of comparing */, + expr_v4_t * e2 /* second expr of comparing */) +{ + return allocate_expr_op2( ws, e1, e2, COMPARE_OR ); +} + + +static expr_v4_t * compare_AND_expression(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr of comparing */, + expr_v4_t * e2 /* second expr of comparing */) +{ + return allocate_expr_op2( ws, e1, e2, COMPARE_AND ); +} + + +static expr_v4_t * not_expression(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr of comparing */) +{ + return allocate_expr_op( ws, e1, COMPARE_NOT ); +} + + +/**/ +static expr_v4_t * check_exist(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr of comparing */) +{ + return e1; +} + +/* union lists */ +static expr_v4_t * union_lists(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr of connecting */, + expr_v4_t * e2 /* second expr of connecting */) +{ + expr_list_t *next, *last; + assert("VD-connect_list", e1->h.type == EXPR_LIST); + + last = (expr_list_t *)e1; + next = e1->list.next; + /* find last in list */ + while ( next ) { + last = next; + next = next->next; + } + if ( e2->h.type == EXPR_LIST ) { /* connect 2 lists */ + last->next = (expr_list_t *) e2; + } + else { /* add 2 EXPR to 1 list */ + next = (expr_list_t *) alloc_new_expr(ws, EXPR_LIST ); + assert("VD alloct list", next!=NULL); + next->next = NULL; + next->source = e2; + last->next = next; + } + return e1; +} + + +/* make list from expressions */ +static expr_v4_t * list_expression(struct reiser4_syscall_w_space * ws /* work space ptr */, + expr_v4_t * e1 /* first expr of list */, + expr_v4_t * e2 /* second expr of list */) +{ + expr_v4_t * ret; + + if ( e1->h.type == EXPR_LIST ) { + ret = union_lists( ws, e1, e2); + } + else { + + if ( e2->h.type 
== EXPR_LIST ) { + ret = union_lists( ws, e2, e1); + } + else { + ret = alloc_new_expr(ws, EXPR_LIST ); + assert("VD alloct list 1", ret!=NULL); + ret->list.source = e1; + ret->list.next = (expr_list_t *)alloc_new_expr(ws, EXPR_LIST ); + assert("VD alloct list 2",ret->list.next!=NULL); + ret->list.next->next = NULL; + ret->list.next->source = e2; + } + } + return ret; +} + + + +static inline expr_v4_t * list_async_expression(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2 ) +{ + return list_expression( ws, e1 , e2 ); +} + + + +static expr_v4_t * assign(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2) +{ + /* while for each pars_var in e1 */ + pump( e1->pars_var.v, e2 ); + return e2; /* tmp. */ +} + + + +static expr_v4_t * assign_invert(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2) +{ + return e2; +} + +/* not yet */ +static expr_v4_t * symlink(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2) +{ + return e2; +} + + + +/* + A flow is a source from which data can be obtained. A Flow can be one of these types: + + 1. memory area in user space. (char *area, size_t length) + 2. memory area in kernel space. (caddr_t *area, size_t length) + 3. file-system object (lnode *obj, loff_t offset, size_t length) +*/ +#if 0 +typedef struct connect connect_t; + +struct connect +{ + expr_v4_t * (*u)(pars_var_t *dst, expr_v4_t *src); +}; + +static expr_v4_t * reiser4_assign( pars_var_t *dst, expr_v4_t *src ) +{ + int ret_code; + file_plugin *src_fplug; + file_plugin *dst_fplug; + connect_t connection; + + /* + * select how to transfer data from @src to @dst. + * + * Default implementation of this is common_transfer() (see below). + * + * Smart file plugin can choose connection based on type of @dst. 
+ * + */ +#if 0 + connection = dst->v->fplug -> select_connection( src, dst ); +#else + /* connection.u=common_transfer;*/ +#endif + + /* do transfer */ + return common_transfer( &dst, &src ); +} + +#endif + + +static int source_not_empty(expr_v4_t *source) +{ + return 0; +} + +static mm_segment_t __ski_old_fs; + + +#define START_KERNEL_IO_GLOB \ + __ski_old_fs = get_fs(); \ + set_fs( KERNEL_DS ) + +#define END_KERNEL_IO_GLOB \ + set_fs( __ski_old_fs ); + +#define PUMP_BUF_SIZE (PAGE_CACHE_SIZE) + + +static tube_t * get_tube_general(pars_var_t *sink, expr_v4_t *source) +{ + tube_t * tube=NULL; + tube = kmalloc(sizeof(struct tube), GFP_KERNEL); + memset( tube , 0, sizeof( struct tube )); + + PTRACE1( "%s", "begin"); + assert("get_tube_general: no tube",!IS_ERR(tube)); + assert("get_tube_general: src expression wrong",source->h.type == EXPR_PARS_VAR); + assert("get_tube_general: src no dentry",source->pars_var.v->ln->h.type== LNODE_DENTRY); + assert("get_tube_general: dst no dentry",sink->ln->h.type== LNODE_DENTRY); + + tube->buf = kmalloc(PUMP_BUF_SIZE, GFP_KERNEL); + memset( tube->buf , 0, PUMP_BUF_SIZE); + + tube->readoff = 0; + tube->writeoff = 0; + + tube->type_offset = 0; + tube->offset = 0; + tube->len = 0; + tube->used = 0; + tube->src = dentry_open(source->pars_var.v->ln->dentry.dentry, NULL, O_RDONLY);; + tube->dst = dentry_open(sink->ln->dentry.dentry, NULL, O_WRONLY);; +// tube->source = source; +// tube->sink = sink; + START_KERNEL_IO_GLOB; + return tube; +} + +static size_t reserv_space_in_sink(tube_t * tube, size_t len ) +{ + return vfs_read(tube->src, tube->buf, len, &tube->readoff); +} + +static size_t get_available_len(struct file * fl) +{ + PTRACE1( "%s", "begin"); + return PUMP_BUF_SIZE; +} + +static int prep_tube_general(tube_t * tube) +{ + PTRACE1( "%s", "begin"); + tube->len = reserv_space_in_sink( tube, get_available_len(tube->src) ); + return tube->len; +} + +static int source_to_tube_general(tube_t * tube) +{ +// 
tube->source->fplug->read(tube->offset,tube->len); + PTRACE1( "%s", "begin"); + return tube->len; +} + +static int tube_to_sink_general(tube_t * tube) +{ +// tube->sink->fplug->write(tube->offset,tube->len); +// tube->offset+=tube->len; + return vfs_write(tube->dst, tube->buf, tube->len, &tube->writeoff); +} + +static void put_tube(tube_t * tube) +{ + PTRACE1( "%s", "begin"); + END_KERNEL_IO_GLOB; + kfree(tube->buf); + kfree(tube); +} + + +/* + Often connection() will be a method that employs memcpy(). Sometimes + copying data from one file plugin to another will mean transforming + the data. What reiser4_assign does depends on the type of the flow + and sink. If @flow is based on the kernel-space area, memmove() is + used to copy data. If @flow is based on the user-space area, + copy_from_user() is used. If @flow is based on a file-system object, + flow_place() uses the page cache as a universal translator, loads + the object's data into the page cache, and then copies them into + @area. Someday methods will be written to copy objects more + efficiently than using the page cache (e.g. consider copying holes + [add link to definition of a hole]), but this will not be + implemented in V4.0. +*/ +static int pump( pars_var_t *sink, expr_v4_t *source ) +{ + tube_t * tube; + int ret_code = 0; /* returned unchanged when the copy loop never runs, so it must start defined */ + int (*prep_tube)(tube_t *); +// int (*prep_tube)(expr_v4_t *); + int (*source_to_tube)(tube_t *); + int (*tube_to_sink)(tube_t *); + +// pos_t source_pos; +// pos_t sink_pos; + + PTRACE1( "%s", "begin"); + + /* remember to write code for freeing tube, error handling, etc. 
*/ +#if 0 + tube = sink->fplug -> get_tube( sink, source); + prep_tube = sink->fplug->prep_tube (tube); + source_to_tube = source->fplug->source_to_tube; + tube_to_sink = sink->fplug->tube_to_sink; +#else + tube = get_tube_general( sink, source); + prep_tube = prep_tube_general; + source_to_tube = source_to_tube_general; + tube_to_sink = tube_to_sink_general; +#endif + + while( prep_tube( tube ) ) { + ret_code = source_to_tube( tube ); + ret_code = tube_to_sink( tube ); + } + put_tube(tube); + return ret_code; + PTRACE1( "%s", "end"); +} + + + + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/parser/lib.h linux-2.6.4-ck1/fs/reiser4/parser/lib.h --- linux-2.6.4/fs/reiser4/parser/lib.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/parser/lib.h 2004-03-11 22:45:15.274512540 +1100 @@ -0,0 +1,76 @@ +/* + * Copyright 2001, 2002 by Hans Reiser, licensing governed by reiser4/README + */ + +/* + * functions for parser.y + */ + + + + + + +static void yyerror( struct reiser4_syscall_w_space *ws, int msgnum , ...); +static int yywrap(void); +static void freeList(freeSpace_t * list); +static int reiser4_pars_free(struct reiser4_syscall_w_space * ws); +static freeSpace_t * freeSpaceAlloc(void); +static freeSpace_t * freeSpaceNextAlloc(struct reiser4_syscall_w_space * ws); +static char* list_alloc(struct reiser4_syscall_w_space * ws, int size); +static streg_t *alloc_new_level(struct reiser4_syscall_w_space * ws); +static pars_var_t * alloc_pars_var(struct reiser4_syscall_w_space * ws, pars_var_t * last_pars_var); +static lnode * get_lnode(struct reiser4_syscall_w_space * ws); +static struct reiser4_syscall_w_space * reiser4_pars_init(void); +static void level_up(struct reiser4_syscall_w_space *ws, long type); +static void level_down(struct reiser4_syscall_w_space * ws, long type1, long type2); +static void 
move_selected_word(struct reiser4_syscall_w_space * ws, int exclude ); +static int b_check_word(struct reiser4_syscall_w_space * ws ); +static __inline__ wrd_t * _wrd_inittab(struct reiser4_syscall_w_space * ws ); +static int reiser4_lex( struct reiser4_syscall_w_space * ws ); +static expr_v4_t * alloc_new_expr(struct reiser4_syscall_w_space * ws, int type); +wrd_t * nullname(struct reiser4_syscall_w_space * ws); +static expr_v4_t * init_root(struct reiser4_syscall_w_space * ws); +static expr_v4_t * init_pwd(struct reiser4_syscall_w_space * ws); +static expr_v4_t * pars_expr(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2); +static expr_v4_t * lookup_word(struct reiser4_syscall_w_space * ws, wrd_t * w); +static inline expr_v4_t * pars_lookup_curr(struct reiser4_syscall_w_space * ws); +static inline expr_v4_t * pars_lookup_root(struct reiser4_syscall_w_space * ws); +static pars_var_t * lookup_pars_var_word(struct reiser4_syscall_w_space * ws, pars_var_t * pars_var, wrd_t * w); +static expr_v4_t * make_do_it(struct reiser4_syscall_w_space * ws, expr_v4_t * e1 ); +static expr_v4_t * if_then_else(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2 , expr_v4_t * e3 ); +static expr_v4_t * if_then(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2 ); +static void goto_end(struct reiser4_syscall_w_space * ws); +static expr_v4_t * constToExpr(struct reiser4_syscall_w_space * ws, wrd_t * e1 ); +static expr_v4_t * connect_expression(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2); +static expr_v4_t * compare_EQ_expression(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2); +static expr_v4_t * compare_NE_expression(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2); +static expr_v4_t * compare_LE_expression(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2); +static expr_v4_t * compare_GE_expression(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, 
expr_v4_t * e2);
+static expr_v4_t * compare_LT_expression(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2);
+static expr_v4_t * compare_GT_expression(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2);
+static expr_v4_t * compare_OR_expression(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2);
+static expr_v4_t * compare_AND_expression(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2);
+static expr_v4_t * not_expression(struct reiser4_syscall_w_space * ws, expr_v4_t * e1);
+static expr_v4_t * check_exist(struct reiser4_syscall_w_space * ws, expr_v4_t * e1);
+static expr_v4_t * list_expression(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2 );
+static inline expr_v4_t * list_async_expression(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2 );
+static expr_v4_t * assign(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2);
+static expr_v4_t * assign_invert(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2);
+static expr_v4_t * symlink(struct reiser4_syscall_w_space * ws, expr_v4_t * e1, expr_v4_t * e2);
+static int source_not_empty(expr_v4_t *source);
+static tube_t * get_tube_general(pars_var_t *sink, expr_v4_t *source);
+static size_t reserv_space_in_sink(tube_t * tube, size_t len );
+static size_t get_available_len(struct file * fl);
+static int prep_tube_general(tube_t * tube);
+static int source_to_tube_general(tube_t * tube);
+static int tube_to_sink_general(tube_t * tube);
+static void put_tube(tube_t * tube);
+static int pump( pars_var_t *sink, expr_v4_t *source );
+
+
+#define curr_symbol(ws) ((ws)->ws_pline)	/* lvalue: the ws_pline member of @ws */
+#define next_symbol(ws) (++curr_symbol(ws))	/* pre-increments ws_pline and yields the new value */
+#define tolower(a) (a)	/* identity: no case folding is performed */
+#define isdigit(a) ((a) >= '0' && (a) <= '9')	/* ASCII decimal digit test; evaluates @a twice */
+
diff -Naurp linux-2.6.4/fs/reiser4/parser/Makefile linux-2.6.4-ck1/fs/reiser4/parser/Makefile
--- linux-2.6.4/fs/reiser4/parser/Makefile	1970-01-01 10:00:00.000000000 +1000
+++
linux-2.6.4-ck1/fs/reiser4/parser/Makefile 2004-03-11 22:45:15.270513162 +1100 @@ -0,0 +1,35 @@ + +.SUFFIXES= .l .y .c + +GENSRCS= parser.code.c +GENHDRS= y.t.h + +SRCS= ${GENSRCS} ${CSRCS} +CLEANFILES= ${GENSRCS} ${GENHDRS} y.output + +# Override default kernel CFLAGS. This is a userland app. +# PARS_CFLAGS:= -I/usr/include -I. -ldb +YFLAGS= -d -t -v -r -b parser + + +YACC=./yacc + +#ifdef DEBUG +CFLAGS+= -DDEBUG -g +YFLAGS+= -t -v -r -b parser +LFLAGS= -d +#endif + +#$(PROG): $(SRCS) $(GENHDRS) +# $(CC) $(PARS_CFLAGS) $(CFLAGS) $(CSRCS) -o $(PROG) +# $(CC) $(PARS_CFLAGS) $(CFLAGS) $(SRCS) -o $(PROG) + +clean: + rm -f $(CLEANFILES) $(PROG) + +parser.code.c: parser.y + $(YACC) $(YFLAGS) parser.y + + + + diff -Naurp linux-2.6.4/fs/reiser4/parser/pars.cls.h linux-2.6.4-ck1/fs/reiser4/parser/pars.cls.h --- linux-2.6.4/fs/reiser4/parser/pars.cls.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/parser/pars.cls.h 2004-03-11 22:45:15.275512385 +1100 @@ -0,0 +1,285 @@ +/* + * Copyright 2001, 2002 by Hans Reiser, licensing governed by reiser4/README + */ + +/* + * definitions of common constants for lex component of parser.y + */ + + + +#define ERR -128 + + +#if 1 +typedef enum { + OK , + Blk , /* blank */ + Wrd , /* any symbol exept spec symbl */ + Int , /* numeric */ + + Ptr , /* pointer */ + + Pru , /* _pruner */ + + Stb , /* ` string begin */ + Ste , /* ' string end */ + Lpr , /* ( [ { */ + Rpr , /* ) ] } */ + Com , /* , */ + Mns , /* - */ + + + Les , /* < */ + Slh , /* / */ + + Lsq , /* [ ----------*/ + Rsq , /* ] ----------*/ + + Bsl , /* \ */ + + Lfl , /* { ----------*/ + Rfl , /* } ----------*/ + + Pip , /* | */ + Sp1 , /* : */ + Sp2 , /* ; */ + + Dot , /* . */ + + Sp4 , /* = */ + Sp5 , /* > */ + Sp6 , /* ? 
*/ + Pls , /* + ???*/ + + /*LastTerm Sp6*/ + + Res , /* */ + + Str , + ASG , + App , + Lnk , + + Ap2 + +} state; +#else +#define OK 0 +#define Blk 1 /* blank */ +#define Wrd 2 /* any symbol exept spec symbl */ +#define Int 3 /* numeric */ + +#define Ptr 4 /* pointer */ + +#define Pru 5 /* _pruner */ + +#define Stb 6 /* ` string begin */ +#define Ste 7 /* ' string end */ +#define Lpr 8 /* ( [ { */ +#define Rpr 9 /* ) ] } */ +#define Com 10 /* , */ +#define Mns 11 /* - */ + +#define Pls 11 /* + ???*/ + +#define Les 12 /* < */ +#define Slh 13 /* / */ + +#define Lsq 14 /* [ ----------*/ +#define Rsq 15 /* ] ----------*/ + +#define Bsl 16 /* \ */ + +#define Lfl 18 /* { ----------*/ +#define Rfl 19 /* } ----------*/ + +#define Pip 20 /* | */ +#define Sp1 22 /* : */ +#define Sp2 23 /* ; */ + +#define Dot 24 /* . */ + +#define Sp4 25 /* = */ +#define Sp5 26 /* > */ +#define Sp6 27 /* ? */ + +#define LastTerm Sp6 + +#define Res 28 /* */ + + +#define Str 32 +#define ASG 33 +#define App 34 +#define Lnk 35 + +#define Ap2 36 +#endif + +#define STRING_CONSTANT_EMPTY STRING_CONSTANT /* tmp */ + + +static char ncl [256] = { + Blk, ERR, ERR, ERR, ERR, ERR, ERR, ERR, + ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, + ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, + ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, + /* 32*/ + /* ! " # $ % & ' */ + Blk, Res, Res, Res, Res, Res, Res, Ste, + /* ( ) * + , - . / */ + Lpr, Rpr, Res, Pls, Com, Mns, Dot, Slh, + /* 0 1 2 3 4 5 6 7 */ + Int, Int, Int, Int, Int, Int, Int, Int, + /* 8 9 : ; < = > ? 
*/ + Int, Int, Sp2, Sp1, Les, Sp4, Sp5, Sp6, + + /* 64*/ + /* @ A B C D E F G */ + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + /* H I J K L M N O */ + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + /* P Q R S T U V W */ + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + /* X Y Z [ \ ] ^ _ */ + Wrd, Wrd, Wrd, Lpr, Bsl, Rpr, Res, Pru, + /* 96*/ + /* ` a b c d e f g */ + Stb, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + /* h i j k l m n o */ + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + /* p q r s t u v w */ + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + /* x y z { | } ~ */ + Wrd, Wrd, Wrd, Lpr, Pip, Rpr, Wrd, ERR, + + /*128*/ + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + /*160*/ + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + /*192*/ + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + /*224*/ + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, + Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, Wrd, ERR +}; + + +struct lexcls { + int term; + char c[32]; +} ; + +static struct { + char * wrd; + int class; +} +pars_key [] = { + { "and" , AND }, + { "else" , ELSE }, + { "eq" , EQ }, + { "ge" , GE }, + { "gt" , GT }, + { "if" , IF }, + { "le" , LE }, + { "lt" , LT }, + { "ne" , NE }, + { "not" , NOT }, + { "or" , OR }, + { "then" , THEN }, + { "tw/" , TRANSCRASH } +}; + + +struct lexcls lexcls[64] = { +/* +.. a 1 _ ` ' ( ) , - < / [ ] \ { } | ; : . = > ? + +Blk Wrd Int Ptr Pru Stb Ste Lpr Rpr Com Mns Les Slh Lsq Rsq Bsl Lfl Rfl Pip Sp1 Sp2 Dot Sp4 Sp5 Sp6 Pls ... 
*/
+[Blk]={ 0, {0,
+Blk,Wrd,Int,Ptr,Pru,Str,ERR, Lpr,Rpr,Com,Mns,Les,Slh,Lsq,Rsq, Bsl,Lfl,Rfl,Pip,Sp1,Sp2,Dot,Sp4, Sp5,Sp6,ERR,ERR,ERR,ERR,ERR,ERR}},
+[Wrd]={ WORD, {0,
+OK ,Wrd,Wrd,Wrd,Wrd,Wrd,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , Bsl,OK ,OK ,OK ,OK ,OK ,Wrd,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }},
+
+[Int]={ WORD, {0,
+OK ,Wrd,Int,Wrd,Wrd,OK ,OK , OK ,OK ,OK ,Wrd,OK ,OK ,OK ,OK , Wrd,OK ,OK ,OK ,OK ,OK ,Wrd,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }},
+
+[Ptr]={ WORD,{0,
+OK ,Wrd,Wrd,Wrd,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , Wrd,OK ,OK ,OK ,OK ,OK ,Wrd,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }},
+[Pru]={ P_RUNNER,{0,
+OK ,Pru,Pru,Pru,Pru,OK ,OK , OK ,OK ,OK ,Pru,OK ,OK ,OK ,OK , Pru,OK ,OK ,OK ,OK ,OK ,Pru,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }},
+
+[Stb]={ STRING_CONSTANT_EMPTY, {1,
+Str,Str,Str,Str,Str,Str,OK , Str,Str,Str,Str,Str,Str,Str,Str, Str,Str,Str,Str,Str,Str,Str,Str, Str,Str,Str,Str,Str,Str,Str,Str}},
+[Ste]={ 0, {0,
+ERR,ERR,ERR,ERR,ERR,ERR,ERR, ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR, ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR, ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR}},
+[Lpr]={ L_BRACKET /*L_PARENT*/,{0,
+OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }},
+[Rpr]={ R_BRACKET,{0,
+OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }},
+[Com]={ COMMA,{0,
+OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }},
+[Mns]={ 0,{0,
+ERR,ERR,ERR,ERR,ERR,ERR,ERR, ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR, ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR, Lnk,ERR,ERR,ERR,ERR,ERR,ERR,ERR}},
+[Les]={ LT,{0,	/* NOTE(review): ASG/App sit in the Mns and Les columns - presumably the "<-" and "<<" transitions; confirm in reiser4_lex() */
+OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,ASG,App,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }},
+
+[Slh]={ SLASH,{0,
+OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,Slh,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }},
+
+[Lsq]={
0/*L_SKW_PARENT*/,{0, /*mast removed*/ +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, +[Rsq]={ 0/*R_SKW_PARENT*/,{0, /*mast removed*/ +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, +[Bsl]={ 0,{0, +Wrd,Wrd,Wrd,Wrd,Wrd,Wrd,Wrd, Wrd,Wrd,Wrd,Wrd,Wrd,Wrd,Wrd,Wrd, Wrd,Wrd,Wrd,Wrd,Wrd,Wrd,Wrd,Wrd, Wrd,Wrd,Wrd,Wrd,Wrd,Wrd,Wrd,Wrd}}, +[Lfl]={ 0 /*L_FLX_PARENT*/,{0, /*mast removed*/ +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, +[Rfl]={ 0 /*R_FLX_PARENT*/,{0, /*mast removed*/ +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, +[Pip]={ 0,{0, +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, +[Sp1]={ 0,{0, +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, +[Sp2]={ SEMICOLON,{0, +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, +[Dot]={ 0,{0, +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, +[Sp4]={ 0,{0, +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, +[Sp5]={ 0,{0, +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, + +[Sp6]={ 0,{0, +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, +[Pls]={ PLUS,{0, +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK 
, OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, +[Res]={ 0,{0, +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, + +[Str]={ STRING_CONSTANT,{1, +OK ,Str,Str,Str,Str,Str,OK , Str,Str,Str,Str,Str,Str,Str,Str, Str,Str,Str,Str,Str,Str,Str,Str, Str,Str,Str,Str,Str,Str,Str,Str}}, +[ASG]={ L_ASSIGN,{0, +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, +[App]={ L_ASSIGN,{0, +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,Ap2,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }}, +[Lnk]={ L_SYMLINK,{0, +ERR,ERR,ERR,ERR,ERR,ERR,ERR, ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR, ERR,ERR,ERR,ERR,ERR,ERR,ERR,ERR, OK ,ERR,ERR,ERR,ERR,ERR,ERR,ERR}}, + +[Ap2]={ L_APPEND,{0, +OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK , OK ,OK ,OK ,OK ,OK ,OK ,OK ,OK }} + +}; + + diff -Naurp linux-2.6.4/fs/reiser4/parser/parser.code.c linux-2.6.4-ck1/fs/reiser4/parser/parser.code.c --- linux-2.6.4/fs/reiser4/parser/parser.code.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/parser/parser.code.c 2004-03-11 22:45:15.277512074 +1100 @@ -0,0 +1,476 @@ +#ifndef lint +/*static char yysccsid[] = "from: @(#)yaccpar 1.9 (Berkeley) 02/21/93";*/ +static char yyrcsid[] = "$Id: skeleton.c,v 1.4 1993/12/21 18:45:32 jtc Exp $\n 2002/10/22 VD reiser4"; +#endif +#define YYBYACC 1 +#define YYMAJOR 1 +#define YYMINOR 9 +#define yyclearin (yychar=(-1)) +#define yyerrok (yyerrflag=0) +#define YYRECOVERING (yyerrflag!=0) +#define YYPREFIX "yy" +#line 9 "fs/reiser4/parser/parser.y" +typedef union +{ + long charType; + expr_v4_t * expr; + wrd_t * wrd; +} YYSTYPE; +#line 20 "fs/reiser4/parser/parser.code.c" +#define L_BRACKET 257 +#define R_BRACKET 258 +#define WORD 259 +#define P_RUNNER 260 +#define STRING_CONSTANT 261 +#define TRANSCRASH 262 +#define L_ASSIGN 263 +#define L_APPEND 264 
+#define L_SYMLINK 265 +#define SEMICOLON 266 +#define COMMA 267 +#define PLUS 268 +#define SLASH 269 +#define INV_L 270 +#define INV_R 271 +#define EQ 272 +#define NE 273 +#define LE 274 +#define GE 275 +#define LT 276 +#define GT 277 +#define IS 278 +#define AND 279 +#define OR 280 +#define NOT 281 +#define IF 282 +#define THEN 283 +#define ELSE 284 +#define EXIST 285 +#define NAME 286 +#define UNNAME 287 +#define ROOT 288 +#define USLASH 289 +#define YYERRCODE 256 +#define YYTABLESIZE 444 +#define YYFINAL 5 +#ifndef YYDEBUG +#define YYDEBUG 0 +#endif +#define YYMAXTOKEN 289 +#if defined(YYREISER4_DEF) +#define extern static +#endif +extern short yylhs[]; +extern short yylen[]; +extern short yydefred[]; +extern short yydgoto[]; +extern short yysindex[]; +extern short yyrindex[]; +extern short yygindex[]; +extern short yytable[]; +extern short yycheck[]; +#if YYDEBUG +extern char *yyname[]; +extern char *yyrule[]; +#endif +#if defined(YYREISER4_DEF) +#define YYSTACKSIZE 500 +#define YYMAXDEPTH 500 +#define yydebug ws->ws_yydebug +#define yynerrs ws->ws_yynerrs +#define yyerrflag ws->ws_yyerrflag +#define yychar ws->ws_yychar +#define yyssp ws->ws_yyssp +#define yyvsp ws->ws_yyvsp +#define yyval ws->ws_yyval +#define yylval ws->ws_yylval +#define yyss ws->ws_yyss +#define yyvs ws->ws_yyvs +#define yystacksize ws->ws_yystacksize +#else +#ifdef YYSTACKSIZE +#undef YYMAXDEPTH +#define YYMAXDEPTH YYSTACKSIZE +#else +#ifdef YYMAXDEPTH +#define YYSTACKSIZE YYMAXDEPTH +#else +#define YYSTACKSIZE 500 +#define YYMAXDEPTH 500 +#endif +#endif +int yydebug; +int yynerrs; +int yyerrflag; +int yychar; +short *yyssp; +YYSTYPE *yyvsp; +YYSTYPE yyval; +YYSTYPE yylval; +short yyss[YYSTACKSIZE]; +YYSTYPE yyvs[YYSTACKSIZE]; +#define yystacksize YYSTACKSIZE +#endif +#line 160 "fs/reiser4/parser/parser.y" + + +#define yyversion "4.0.0" +#include "pars.cls.h" +#include "parser.tab.c" +#include "pars.yacc.h" +#include "lib.c" + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + End: +*/ +#line 132 "fs/reiser4/parser/parser.code.c" +#define YYABORT goto yyabort +#define YYREJECT goto yyabort +#define YYACCEPT goto yyaccept +#define YYERROR goto yyerrlab +int +#if defined(YYREISER4_DEF) +yyparse(struct reiser4_syscall_w_space * ws) +#else +#if defined(__STDC__) +yyparse(void) +#else +yyparse() +#endif +#endif +{ + register int yym, yyn, yystate; +#if YYDEBUG + register char *yys; + static char *getenv(); + + if (yys = getenv("YYDEBUG")) + { + yyn = *yys; + if (yyn >= '0' && yyn <= '9') + yydebug = yyn - '0'; + } +#endif + + yynerrs = 0; + yyerrflag = 0; + yychar = (-1); + + yyssp = yyss; + yyvsp = yyvs; + *yyssp = yystate = 0; + +yyloop: + if ((yyn = yydefred[yystate]) != 0) goto yyreduce; + if (yychar < 0) + { + if ((yychar = yylex()) < 0) yychar = 0; +#if YYDEBUG + if (yydebug) + { + yys = 0; + if (yychar <= YYMAXTOKEN) yys = yyname[yychar]; + if (!yys) yys = "illegal-symbol"; + printf("%sdebug: state %d, reading %d (%s)\n", + YYPREFIX, yystate, yychar, yys); + } +#endif + } + if ((yyn = yysindex[yystate]) && (yyn += yychar) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == yychar) + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: state %d, shifting to state %d\n", + YYPREFIX, yystate, yytable[yyn]); +#endif + if (yyssp >= yyss + yystacksize - 1) + { + goto yyoverflow; + } + *++yyssp = yystate = yytable[yyn]; + *++yyvsp = yylval; + yychar = (-1); + if (yyerrflag > 0) --yyerrflag; + goto yyloop; + } + if ((yyn = yyrindex[yystate]) && (yyn += yychar) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == yychar) + { + yyn = yytable[yyn]; + goto yyreduce; + } + if (yyerrflag) goto yyinrecovery; +#if defined(YYREISER4_DEF) + yyerror(ws,11111,yystate,yychar); +#else + yyerror("syntax error"); +#endif +#ifdef lint + goto yyerrlab; +#endif +yyerrlab: + ++yynerrs; +yyinrecovery: + if (yyerrflag < 3) + { + yyerrflag = 3; + for (;;) + { + if ((yyn = 
yysindex[*yyssp]) && (yyn += YYERRCODE) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == YYERRCODE) + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: state %d, error recovery shifting\ + to state %d\n", YYPREFIX, *yyssp, yytable[yyn]); +#endif + if (yyssp >= yyss + yystacksize - 1) + { + goto yyoverflow; + } + *++yyssp = yystate = yytable[yyn]; + *++yyvsp = yylval; + goto yyloop; + } + else + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: error recovery discarding state %d\n", + YYPREFIX, *yyssp); +#endif + if (yyssp <= yyss) goto yyabort; + --yyssp; + --yyvsp; + } + } + } + else + { + if (yychar == 0) goto yyabort; +#if YYDEBUG + if (yydebug) + { + yys = 0; + if (yychar <= YYMAXTOKEN) yys = yyname[yychar]; + if (!yys) yys = "illegal-symbol"; + printf("%sdebug: state %d, error recovery discards token %d (%s)\n", + YYPREFIX, yystate, yychar, yys); + } +#endif + yychar = (-1); + goto yyloop; + } +yyreduce: +#if YYDEBUG + if (yydebug) + printf("%sdebug: state %d, reducing by rule %d (%s)\n", + YYPREFIX, yystate, yyn, yyrule[yyn]); +#endif + yym = yylen[yyn]; + yyval = yyvsp[1-yym]; + switch (yyn) + { +case 1: +#line 85 "fs/reiser4/parser/parser.y" +{ yyval.charType = free_expr( yyvsp[0].expr ); } +break; +case 2: +#line 89 "fs/reiser4/parser/parser.y" +{ yyval.expr = yyvsp[0].expr;} +break; +case 3: +#line 90 "fs/reiser4/parser/parser.y" +{ yyval.expr = constToExpr( ws, yyvsp[0].wrd ); } +break; +case 4: +#line 91 "fs/reiser4/parser/parser.y" +{ yyval.expr = connect_expression( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 5: +#line 92 "fs/reiser4/parser/parser.y" +{ yyval.expr = list_expression( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 6: +#line 93 "fs/reiser4/parser/parser.y" +{ yyval.expr = list_async_expression( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 7: +#line 94 "fs/reiser4/parser/parser.y" +{ yyval.expr = yyvsp[0].expr; level_down( ws, IF_STATEMENT, IF_STATEMENT ); } +break; +case 8: +#line 96 "fs/reiser4/parser/parser.y" +{ 
yyval.expr = assign( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 9: +#line 97 "fs/reiser4/parser/parser.y" +{ yyval.expr = assign( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 10: +#line 98 "fs/reiser4/parser/parser.y" +{ yyval.expr = assign_invert( ws, yyvsp[-4].expr, yyvsp[-1].expr ); } +break; +case 11: +#line 99 "fs/reiser4/parser/parser.y" +{ yyval.expr = symlink( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 13: +#line 108 "fs/reiser4/parser/parser.y" +{ yyval.expr = if_then_else( ws, yyvsp[-3].expr, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 14: +#line 109 "fs/reiser4/parser/parser.y" +{ yyval.expr = if_then( ws, yyvsp[-1].expr, yyvsp[0].expr) ; } +break; +case 15: +#line 113 "fs/reiser4/parser/parser.y" +{ yyval.expr = yyvsp[0].expr; } +break; +case 16: +#line 116 "fs/reiser4/parser/parser.y" +{ level_up( ws, IF_STATEMENT ); } +break; +case 17: +#line 120 "fs/reiser4/parser/parser.y" +{ yyval.expr = not_expression( ws, yyvsp[0].expr ); } +break; +case 18: +#line 121 "fs/reiser4/parser/parser.y" +{ yyval.expr = check_exist( ws, yyvsp[0].expr ); } +break; +case 19: +#line 122 "fs/reiser4/parser/parser.y" +{ yyval.expr = compare_EQ_expression( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 20: +#line 123 "fs/reiser4/parser/parser.y" +{ yyval.expr = compare_NE_expression( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 21: +#line 124 "fs/reiser4/parser/parser.y" +{ yyval.expr = compare_LE_expression( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 22: +#line 125 "fs/reiser4/parser/parser.y" +{ yyval.expr = compare_GE_expression( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 23: +#line 126 "fs/reiser4/parser/parser.y" +{ yyval.expr = compare_LT_expression( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 24: +#line 127 "fs/reiser4/parser/parser.y" +{ yyval.expr = compare_GT_expression( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 25: +#line 128 "fs/reiser4/parser/parser.y" +{ yyval.expr = 
compare_OR_expression( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 26: +#line 129 "fs/reiser4/parser/parser.y" +{ yyval.expr = compare_AND_expression( ws, yyvsp[-2].expr, yyvsp[0].expr ); } +break; +case 27: +#line 133 "fs/reiser4/parser/parser.y" +{ goto_end( ws );} +break; +case 28: +#line 137 "fs/reiser4/parser/parser.y" +{ yyval.expr = yyvsp[0].expr;} +break; +case 29: +#line 141 "fs/reiser4/parser/parser.y" +{ yyval.expr = pars_expr( ws, yyvsp[-1].expr, yyvsp[0].expr ) ; } +break; +case 30: +#line 142 "fs/reiser4/parser/parser.y" +{ yyval.expr = pars_expr( ws, yyvsp[-2].expr, yyvsp[0].expr ) ; } +break; +case 31: +#line 146 "fs/reiser4/parser/parser.y" +{ yyval.expr = pars_lookup_root( ws ) ; } +break; +case 32: +#line 147 "fs/reiser4/parser/parser.y" +{ yyval.expr = pars_lookup_curr( ws ) ; } +break; +case 33: +#line 151 "fs/reiser4/parser/parser.y" +{ yyval.expr = lookup_word( ws, yyvsp[0].wrd ); } +break; +case 34: +#line 152 "fs/reiser4/parser/parser.y" +{ yyval.expr = yyvsp[-1].expr; level_down( ws, yyvsp[-2].charType, yyvsp[0].charType );} +break; +case 35: +#line 156 "fs/reiser4/parser/parser.y" +{ yyval.charType = yyvsp[0].charType; level_up( ws, yyvsp[0].charType ); /*set_curr_path( ws ); */} +break; +#line 417 "fs/reiser4/parser/parser.code.c" + } + yyssp -= yym; + yystate = *yyssp; + yyvsp -= yym; + yym = yylhs[yyn]; + if (yystate == 0 && yym == 0) + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: after reduction, shifting from state 0 to\ + state %d\n", YYPREFIX, YYFINAL); +#endif + yystate = YYFINAL; + *++yyssp = YYFINAL; + *++yyvsp = yyval; + if (yychar < 0) + { + if ((yychar = yylex()) < 0) yychar = 0; +#if YYDEBUG + if (yydebug) + { + yys = 0; + if (yychar <= YYMAXTOKEN) yys = yyname[yychar]; + if (!yys) yys = "illegal-symbol"; + printf("%sdebug: state %d, reading %d (%s)\n", + YYPREFIX, YYFINAL, yychar, yys); + } +#endif + } + if (yychar == 0) goto yyaccept; + goto yyloop; + } + if ((yyn = yygindex[yym]) && (yyn += yystate) >= 0 && + 
yyn <= YYTABLESIZE && yycheck[yyn] == yystate) + yystate = yytable[yyn]; + else + yystate = yydgoto[yym]; +#if YYDEBUG + if (yydebug) + printf("%sdebug: after reduction, shifting from state %d \ +to state %d\n", YYPREFIX, *yyssp, yystate); +#endif + if (yyssp >= yyss + yystacksize - 1) + { + goto yyoverflow; + } + *++yyssp = yystate; + *++yyvsp = yyval; + goto yyloop; +yyoverflow: +#if defined(YYREISER4_DEF) + yyerror(ws,101); /*yacc stack overflow*/ +#else + yyerror("yacc stack overflow"); +#endif +yyabort: + return (1); +yyaccept: + return (0); +} diff -Naurp linux-2.6.4/fs/reiser4/parser/parser.doc linux-2.6.4-ck1/fs/reiser4/parser/parser.doc --- linux-2.6.4/fs/reiser4/parser/parser.doc 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/parser/parser.doc 2004-03-11 22:45:15.278511918 +1100 @@ -0,0 +1,388 @@ +/* Parser for the reiser4() system call */ + +/* Takes a string and parses it into a set of commands which are + executed. */ + +/* + +If you want to read about where future versions of this syntax will +go, and what the grand scheme is, please go to +www.namesys.com/future_vision.html. + +Names resolve into sets of keys. In Reiser4, the sets of keys all +consist of exactly one key, but this will change in future versions. + +Keys are not immutable objectids ala inode numbers. The use of +immutable objectids that all objects could be found by was the +original architecture, and before coding was started this was realized +to require lower performance due to creating immutable, and therefor +poor, locality of reference when resolving them. Keys currently do +contain unique objectids, but this objectid does not suffice for +finding the object. + +Name compounders construct names from subnames. / and [] ([] is not +implemented in reiser4, see www.namesys.com/future_vision.html for +what it will do in a later version) are name compounders. Name +compounders can use any name which can be resolved into a key as a +subname. 
This provides "closure", in which the type of the result of +every name operation is the same as the type of what the name +operators operate on. (Persons who create abstract models tend to +place a high value on achieving closure in their design. For more +about closure, read www.namesys.com/future_vision.html.) + +A/B indicates that B is to be passed to A, and A will resolve its +meaning. This is consistent with Unix usage of /. B is considered +a subname of A in this example. + +Reiser4 supports a plugin that implements Unix regular files (regfile), +and a plugin that implements Unix regular directories (regdir). (NIKITA-FIXME-HANS: use those names in the code) + +Plugins may resolve a subname by invoking a plugin of another object, +or by invoking methods built into themselves. + +Special characters are whitespace plus []{}()/\,:;*$@!`' and keywords +are <- and -> + +<-, ->, are assignment operators. + + +'A<-B' uses B's read method to read B, and uses A's write method to +write to A what was read from B. It is a copy command similar to +sendfile(). + +The righthand side of an assignment defines a flow. We calculate the flow, +and then invoke the write method defined by the lefthand side of +the assignment. + +Example: + +A<-B + +assigns the contents of the object named B to A, overwriting the contents of A if A exists. + +` and ' indicate that all special characters between them should be +ignored and parsed as a single string. That is, A<-`some text' causes A to have +contents equal to a file named `some text'. Quotes are allowed to nest. + +" indicates that the next word is inlined text. Sorry, " is the symbol least useful for something else, so it got used. + +A<-"(this is a string not a name of a file) // German style quoting: ,,ksdajfhkasdh`` + +assigns (sans the single quotes) the string `this is a string not a name of a file' to A. 
+ +A<-"`I think that using " in a language for delimiting quoting is bad style because delimiters should be matching pairs not matching identical singletons if nesting is to work at all.' + +assigns the string `I think that using " in a language for delimiting quoting is bad style because delimiters should be matching pairs not matching identical singletons if nesting is to work at all.' to A. + +A<<-B + +appends file B to file A + +We need to define multiple aspects of the object when creating it. +Additionally, we need to assign default values to those aspects of the +definition not defined. The problem arises when we have a multi-part +definition. We should avoid assigning one part, then assigning +default values for all other parts, then overwriting those default +values, some of which actually cannot be overwritten (e.g. pluginid). + +This means we need to name the object, and then perform multiple +assignments in its creation. + +(x_ and )x_ where x is any sequence (including the null sequence) of +non-special characters, are `named parentheses'. They have the usual meaning of +parentheses in most languages, and delimit what should be treated as a +single unit by operators. If you use named parentheses you can avoid +the "LISP bird nest" effect. The disadvantage is that if you leave +off the whitespace following the open parenthesis you will get an +unintended effect. Note that there must be no space between ( and x. + +Referencing the contents of parentheses by the name of the parentheses +is planned in later versions. + +It is an unenforced but encouraged style convention that subnames which contain +meta-information about the object, and pseudo-files that implement +methods of the object, begin with `..'. IT IS NOT A REQUIREMENT +THAT THEY START WITH `..', READ THAT AGAIN! Sorry, got tired of the complaints about +the non-existent requirement. 
It all depends on how you write your plugins that +use the meta-information whether the meta-data starts with `..'. + +Since what is meta-information, what is a method of the object, and +what is contained information, or methods of sub-objects, are not +necessarily always inherently susceptible to precise natural +distinction, and since we desire to allow users maximal stylistic +freedom to extend reiser4 into emulating other syntaxes, this is only +an optional plugin design convention for use when appropriate. + +One can specify whether a file is listed by readdir in reiser4. Using +that feature, subnames of files containing meta-information about +other files are by convention not listed by readdir, but can be listed +by using the command reiser4("A_listing<-A/..list"), and then reading +the file A_listing. + +For instance, if A is a regfile or regdir, then A/..owner resolves to +a file containing the owner of A, and reading the A directory shows no +file named ..owner. More generally, all of the fields returned by +stat() have a corresponding name of the form A/..field_name for all +regfiles and regdirs. The use of '..' avoids clashes between method +names and filenames. More extreme measures could be taken using +something more obscure than '..' as a prefix, but I remember that +Clearcase and WAFL never really had much in the way of problems with +namespace collisions affecting users seriously, so I don't think one +should excessively inconvenience a syntax for such reasons. + +DEMIDOV-FIXME-HANS: the paragraph below conflicts with the above +*A (similar to C language dereference) means take the contents of +that object which A is the name for, and treat those contents as a name. + + +*`A B' is a reference to a file whose name consists of the characters +A and a space and a B. + + +A;B indicates that B is not to be executed until A completes. So, `/' +orders subnames within a compound name, and `;' orders operations. 
+ + +A,B indicates that A and B are independent of each other and +unordered. + +A/B indicates that the plugin for A is to be passed B, +and asked to handle it in its way, whatever that way is. + +/ and \ are considered equivalent, as a kindness to former windows users + +C/..invert<-A +"(some text)+ B + +indicates that C when read shall return the contents of A followed by +'some text' as a delimiter followed by the contents of B. + +if A and B are object expressions then + A+B is object expression + A\B is object expression + A<-B is possible operation + + +So, let us discuss the following example: + +Assume 357 is the user id of Michael Jackson. + +The following 3 expressions are equivalent: + +ascii_command = "/home/teletubbies/(..new(..name<-glove_location, ..object_t<-audit/regular, ..perm_t<-acl); glove_location/..acl<-( uid<-357, access<-denied ); glove_location/..audit<-mailto<-teletubbies@pbs.org; glove_location<-'we stole it quite some number of years ago, and put it in the very first loot pile (in the hole near the purple flower.')"; + +ascii_command = "/home/teletubbies/(glove_location<-( ..object_t<-audit/regular, ..perm_t<-acl); glove_location/..acl<- ( uid<-357, access<-denied ); glove_location/..audit<-mailto<-teletubbies@pbs.org; glove_location<-'we stole it quite some number of years ago, and put it in the very first loot pile (in the hole near the purple flower)')"; + + +ascii_command = "/home/teletubbies/(glove_location<-( ..object_t<-audit/regular, ..perm_t<-acl); glove_location / ( ..acl<-(uid<-357, access<-denied) ; ..audit<-mailto<-teletubbies@pbs.org); glove_location<-'we stole it quite some number of years ago, and put it in the very first loot pile (in the hole near the purple flower).')"; + +DEMIDOV-FIXME-HANS: what is the meaning of the line below, and should it be ..new rather than new +a/b/(new/(name<-"new_file_name"; type<-"regular file"; perm_t<-"acl"); new_file_name/acl<- ( uid<-"357", access<-"denied" )) + +DEMIDOV-FIXME-HANS: 
what is the meaning of ..anon and backing? +ascii_command = + "/home/teletubbies/glove_location<- + ( (..object_t<-audit, ..perm_t<-acl) ; + ..acl<- ( uid<-'357', access<-denied ); + ..audit/(backing<-..anon<=(..object_t<-regular); // lookup<-/home/teletubbies/some-existing-file), + log<-(mailto<-teletubbies@pbs.org)); + ..body<-'we stole it quite some number of years ago, and put it in the very first loot pile (in the hole near the purple flower)';)"; + +(a b) +result<-/subject/[elves strike] +(result /subject/[elves strike]) +/subject/[elves strike]->result + + + + +DEMIDOV-FIXME-HANS: cleanup the explanation below +The components of this example have the following meanings: +/home/teletubbies/ - directory name +/(..name - specifies that its argument is the name of the new file - parameter for ..new plugin +/glove_location, - name of new file - parameter for name submethod of ..new method +..object_t - name of submethod that assigns object type to new files - parameter for ..new plugin +/audit - plugin for file +/regular, - plugin for backing store for audit plugin +..perm_t - security plugin to be assigned +) - end of parameters for ..new plugin +; - next system call +glove_location - file name +/..acl - plugin ..acl +( - begin parameters for ..new plugin of ..acl plugin +uid - plugin of ..acl plugin +/357 - its value(parameter) +, - +access - .. +/denied - value to assign +) - end of parameter list +) - ? unbalanced brakes +; +..audit - plugin - file is unknown! +/..new +/mailto +/"teletubbies@pbs.org" +; +glove_location_file - file name +/ + +"we stole it quite some number of years ago, and put it in the very first loot pile (in the hole near the purple flower)." 
- body of file + +reiser4(&ascii_command, ascii_command_length, stack_binary_parameters_referenced_in_ascii_command, stack_length); + + +*/ + +/* + + w=\$v v=\$u u=5 z=\$w+$w + echo $z + eval echo $z + eval eval echo $z + +eval eval eval echo $z + +result is: + +$w+$v +$v+$u +$u+5 +5+5 + +tw/transcrash_33[ /home/reiser/(a <- b, c <- d) ] + + chgrp -- changes group ownership + chown -- changes ownership + chmod -- changes permissions + cp -- copies + dd -- copies and converts + df -- shows filesystem disk usage. + dir -- gives brief directory listing + du -- shows disk usage + ln -- creates links + ls -- lists directory contents + mkdir -- creates directories + mkfifo -- creates FIFOs (named pipes) + mknod -- creates special files + mv -- renames + rm -- removes (deletes) + rmdir -- removes empty directories + shred -- deletes a file securely + sync -- synchronizes memory and disk +*/ + + + +/* + + +Assignment, and transaction, will be the commands supported in Reiser4(); more commands will appear in Reiser5. -> and <- will be the assignment operators. + +The amount transferred by an assignment is the minimum of the size of the left hand side and the size of the right hand side. This amount is usually made one of the return values. + + * lhs (assignment target) values: + + /..process/..range/(first_byte<-(loff_t),last_byte<-(loff_t),bytes_written<-(ssize_t*) ) + assigns (writes) to the buffer starting at address first_byte in the process address space, ending at last_byte, with the number of bytes actually written + (The assignment source may be smaller or larger than the assignment target.) being written to address bytes_written. + Representation of first_byte,last_byte, and bytes_written is left to the coder to determine. + It is an issue that will be of much dispute and little importance. + Notice / is used to indicate that the order of the operands matters; see www.namesys.com/future_vision.html for details of why this is appropriate syntax design. 
+ Note the lack of a file descriptor. + + /filename + assigns to the file named filename, wholly obliterating its body with what is assigned. + + /filename/..range/(first_byte<-(loff_t),last_byte<-(loff_t),bytes_written<-(ssize_t*) ) + writes to the body, starting at first_byte, ending not past last_byte, + recording number of bytes written in bytes_written + + /filename/..range/(first_byte<-(loff_t),bytes_written<-(ssize_t*) ) + writes to the body starting at offset, recording number of bytes written in bytes_written + + * rhs (assignment source) values: + + /..process/..range/(first_byte<-(loff_t),last_byte<-(loff_t),bytes_read<-(ssize_t*) ) + reads from the buffer starting at address first_byte in the process address space, ending at last_byte. + The number of bytes actually read (assignment source may be smaller or larger than assignment target) is written to address bytes_read. + Representation of first_byte, last_byte, and bytes_read is left to the coder to determine, as it is an issue that will be of much dispute and little importance. + + /filename + reads the entirety of the file named filename. + + /filename/..range/(first_byte<-(loff_t),last_byte<-(loff_t),bytes_read<-(ssize_t*) ) + reads from the body, starting at first_byte, ending not past last_byte, + recording number of bytes read in bytes_read + + /filename/..range/(first_byte<-(loff_t),bytes_read<-(ssize_t*) ) + reads from the body starting at offset until the end, recording number of bytes read in bytes_read + + /filename/..stat/owner + reads from the ownership field of the stat data (stat data is that which is returned by the + stat() system call (owner, permissions, etc.) and stored on a per file basis by the FS.) 
+ + + + + +*/ + + + + +/* + +example: + + + + /path0/path1/filename/..range/(offset<-100,bytes_written<-0xff001258,last_byte<-256)<-/path0/path2/filename/..range/(first_byte<-0,bytes_readed<-0xff001250) + + /path0/(path1/filename/..range/(offset<-100,bytes_written<-0xff001258,last_byte<-256)<-path2/filename/..range(first_byte<-0,bytes_readed<-0xff001250) ) + + /path0/(path1/filename/..range/(100,256)<-path2/filename/..range(0,256) ) + + + ? + /path0/path1/filename/..range/(offset<-100,bytes_written<-0xff001258),last_byte<-256,/path0/path2/filename/..range(first_byte<-0,p_bytes_readed<-0xff001250) + +ssize_t bytes_readed; + +sprintf( string_pointer_bytes_read, "%8.8p", &bytes_readed ); + + */ + + +examples: + + +.....b/(new/(name<-"new_file_name"; type<-regular_file; permition<-acl); new_file_name/acl/( uid<-"357", access<-denied );new_file_name<-/etc/passwd) + +where: + b is a directory +we have an lnode for it, and call: +b_lnode->lookup(new) +this means we find the lnode for the directory plugin "new" + and then we find new_lnode->lookup(name). this is the lnode for the name of the new file, and we assign to it the string constant "new_file_name". + + then we find new_lnode->lookup(type). this is the lnode of the type of the new file + then we find new_lnode->lookup(regular_file). this is the lnode of the type constant of plugin "new" for a regular file + then we copy the contents of regular_file_lnode through a tube to type_lnode. + + then we find new_lnode->lookup(permition). this is the lnode of the permission type of the new file + then we find new_lnode->lookup(acl). this is the lnode of the permission-type constant of the "new" plugin corresponding to acl + then we copy the contents of acl_lnode through a tube to permition_lnode. + +then we find b_lnode->lookup(new_file_name). this is the lnode for the new file we just created. +then we find new_file_name_lnode->lookup(acl). this is the lnode of the acl plugin. + then we find acl_lnode->lookup(uid). this is the lnode for the uid field of the acl, and we assign to it the string constant "357".
+ then we find acl_lnode->lookup(access). + then we find acl_lnode->lookup(denied). + then we copy the contents of denied_lnode through a tube to access_lnode. + +then we find root_lnode->lookup(etc), +then we find etc_lnode->lookup(passwd) +then we read the contents of passwd_lnode through a tube and write them to new_file_name_lnode. + +ok. command string is executed. + + diff -Naurp linux-2.6.4/fs/reiser4/parser/parser.h linux-2.6.4-ck1/fs/reiser4/parser/parser.h --- linux-2.6.4/fs/reiser4/parser/parser.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/parser/parser.h 2004-03-11 22:45:15.279511763 +1100 @@ -0,0 +1,297 @@ +/* + * Copyright 2001, 2002 by Hans Reiser, licensing governed by reiser4/README + */ + +/* + * definitions of common constants and data-types used by + * parser.y + */ + + /* level type defines */ + +#include "../forward.h" +#include "../debug.h" +#include "../dformat.h" +#include "../key.h" +#include "../type_safe_list.h" +#include "../plugin/plugin_header.h" +#include "../plugin/item/static_stat.h" +#include "../plugin/item/internal.h" +#include "../plugin/item/sde.h" +#include "../plugin/item/cde.h" +#include "../plugin/item/extent.h" +#include "../plugin/item/tail.h" +#include "../plugin/file/file.h" +#include "../plugin/symlink.h" +#include "../plugin/dir/hashed_dir.h" +#include "../plugin/dir/dir.h" +#include "../plugin/item/item.h" +#include "../plugin/node/node.h" +#include "../plugin/node/node40.h" +#include "../plugin/security/perm.h" +#include "../plugin/space/bitmap.h" +#include "../plugin/space/space_allocator.h" +#include "../plugin/disk_format/disk_format40.h" +#include "../plugin/disk_format/disk_format.h" + +#include <linux/fs.h> /* for struct super_block, address_space */ +#include <linux/mm.h> /* for struct page */ +#include <linux/buffer_head.h> /* for struct buffer_head */ +#include <linux/dcache.h> /* for struct dentry */ +#include <linux/types.h> + +typedef enum { + TW_BEGIN, + ASYN_BEGIN, + CD_BEGIN, + OP_LEVEL, + NOT_HEAD, + IF_STATEMENT, + UNORDERED +} def; + +//#define printf(p1,...)
PTRACE(ws,p1,...) +#define yylex() reiser4_lex(ws) +#define register +#define yyacc +//#define bizon + +#define PARSER_DEBUG +/* Trace helpers. GNU ", ##" drops the comma so a bare format string is accepted. */ +#define PTRACE(ws, format, ... ) \ +({ \ + ON_TRACE(TRACE_PARSE, "parser:%s %p %s: " format "\n", \ + __FUNCTION__, ws, (ws)->ws_pline, ## __VA_ARGS__); \ +}) + +#define PTRACE1( format, ... ) \ +({ \ + ON_TRACE(TRACE_PARSE, "parser:%s " format "\n", \ + __FUNCTION__, ## __VA_ARGS__); \ +}) + + +typedef struct pars_var pars_var_t; + +typedef struct wrd wrd_t; + + + /* sizes defines */ +#define FREESPACESIZE_DEF (PAGE_SIZE*4) +#define FREESPACESIZE (FREESPACESIZE_DEF - sizeof(char*)*2 - sizeof(int) ) + +#define _ROUND_UP_MASK(n) ((1UL<<(n))-1UL) + +#define _ROUND_UP(x,n) (((long)(x)+_ROUND_UP_MASK(n)) & ~_ROUND_UP_MASK(n)) + +// to be ok for alpha and others we have to align structures to 8 byte boundary. + + +#define ROUND_UP(x) _ROUND_UP((x),3) + + +typedef struct tube tube_t; + +struct tube { + int type_offset; + char * offset; /* pointer to reading position */ + long len; /* length of current operation + (min of (max_of_read_length and max_of_write_length) )*/ + long used; + char * buf; /* pointer to buffer */ + loff_t readoff; /* reading offset */ + loff_t writeoff; /* writing offset */ + +// expr_v4_t * source; + struct file *src; + +/* offset might actually point to sink */ +// pars_var_t * sink; + struct file *dst; + +/* pos_t pos; */ +}; + +struct wrd { + wrd_t * next ; /* next word */ + struct qstr u ; /* u.name is ptr to space */ +}; + +typedef enum { + noV4Space, + V4Space, + V4Plugin +} SpaceType; + +struct path_walk { + struct dentry *dentry; + struct vfsmount *mnt; +}; + +struct pars_var { + pars_var_t * next ; /* next */ + pars_var_t * parent; /* parent */ + wrd_t * w ; /* pair (parent,w) is unique */ + lnode * ln; /* file/dir name lnode */ + struct path_walk path; /* for mount point */ + int count; /* ref counter */ + int vtype; /* Type of name */ + size_t off; /* current offset read/write of object */ + size_t len; /* length of
sequence of bytes for read/write (-1 no limit) */ + int vSpace ; /* v4 space name or not ??? */ + int vlevel ; /* level ??? */ +// int (*fplug)(lnode * node, const reiser4_plugin_ref * area); +} ; + +typedef union expr_v4 expr_v4_t; + +typedef enum { + CONNECT, + COMPARE_EQ, + COMPARE_NE, + COMPARE_LE, + COMPARE_GE, + COMPARE_LT, + COMPARE_GT, + COMPARE_OR, + COMPARE_AND, + COMPARE_NOT +} expr_code_type; + +//#typedef __u8 op2_t; + +typedef struct expr_common { + __u8 type; + __u8 exp_code; +} expr_common_t; + +typedef struct expr_lnode { + expr_common_t h; + lnode *lnode; +} expr_lnode_t; + +typedef struct expr_flow { + expr_common_t h; + flow_t * flw; +} expr_flow_t; + +typedef struct expr_pars_var { + expr_common_t h; + pars_var_t * v; +} expr_pars_var_t; + + +typedef struct expr_wrd { + expr_common_t h; + wrd_t * s; +} expr_wrd_t; + +typedef struct expr_op3 { + expr_common_t h; + expr_v4_t * op; + expr_v4_t * op_l; + expr_v4_t * op_r; +} expr_op3_t; + +typedef struct expr_op2 { + expr_common_t h; + expr_v4_t * op_l; + expr_v4_t * op_r; +} expr_op2_t; + +typedef struct expr_op { + expr_common_t h; + expr_v4_t * op; +} expr_op_t; + +typedef struct expr_assign { + expr_common_t h; + pars_var_t * target; + expr_v4_t * source; +// expr_v4_t * (* construct)( lnode *, expr_v4_t * ); +} expr_assign_t; + +typedef struct expr_list expr_list_t; +struct expr_list { + expr_common_t h; + expr_list_t * next; + expr_v4_t * source; +} ; + +typedef enum { + EXPR_WRD, + EXPR_PARS_VAR, + EXPR_LIST, + EXPR_ASSIGN, + EXPR_LNODE, + EXPR_FLOW, + EXPR_OP3, + EXPR_OP2, + EXPR_OP +} expr_v4_type; + +union expr_v4 { + expr_common_t h; + expr_wrd_t wd; + expr_pars_var_t pars_var; + expr_list_t list; + + expr_assign_t assgn; + + expr_lnode_t lnode; + expr_flow_t flow; +// expr_op3_t op3; + expr_op2_t op2; + expr_op_t op; +}; + +/* ok this is space for names, constants and tmp*/ +typedef struct freeSpace freeSpace_t; + +struct freeSpace { + freeSpace_t * freeSpace_next; /* next buffer */ + 
char * freeSpace; /* pointer to free space */ + char * freeSpaceMax; /* for overflow control */ + char freeSpaceBase[FREESPACESIZE]; /* current buffer */ +}; + + +typedef struct streg streg_t; + +struct streg { + int stype; /* cur type of level */ + int level; /* cur level */ + streg_t * next; + streg_t * prev; + expr_v4_t * cur_exp; /* current (pwd) expression for this level */ + expr_v4_t * wrk_exp; /* current (work) expression for this level */ + +// struct path_walk path_walk; +// struct nameidata_reiser4 nd; /* current for this level */ + +}; + + +static struct { + unsigned char numOfParam; + unsigned char typesOfParam[4] ; +} typesOfCommand[] = { + {0,{0,0,0,0}} +}; + +static struct { + void (* call_function)(void) ; + unsigned char type; /* describe parameters, and its types */ +} Code[] = { +}; + + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/parser/parser.tab.c linux-2.6.4-ck1/fs/reiser4/parser/parser.tab.c --- linux-2.6.4/fs/reiser4/parser/parser.tab.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/parser/parser.tab.c 2004-03-11 22:45:15.280511607 +1100 @@ -0,0 +1,197 @@ +short yylhs[] = { -1, + 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 7, 7, 9, 11, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 10, 4, 2, 2, + 5, 5, 3, 3, 1, +}; +short yylen[] = { 2, + 1, 1, 1, 3, 3, 3, 1, 3, 3, 5, + 3, 1, 4, 2, 2, 1, 2, 2, 3, 3, + 3, 3, 3, 3, 3, 3, 2, 1, 2, 3, + 1, 0, 1, 3, 1, +}; +short yydefred[] = { 0, + 12, 3, 31, 16, 0, 0, 0, 0, 0, 7, + 0, 0, 0, 0, 0, 0, 35, 33, 0, 29, + 0, 0, 0, 0, 0, 0, 0, 0, 15, 30, + 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 34, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, +}; +short yydgoto[] = { 5, + 19, 6, 20, 7, 8, 9, 10, 29, 11, 25, + 12, +}; +short yysindex[] = { -216, + 0, 0, 0, 0, 0, -264, -240, -251, -199, 0, + -267, -239, -251, -243, -216, -216, 0, 0, -216, 0, + -216, -216, -216, 
-216, -255, -216, -216, -217, 0, 0, + -216, -199, -199, -199, -194, 0, -246, -219, -219, -216, + -199, -199, -216, -216, -216, -216, -216, -216, -216, -216, + -191, 0, -219, -199, -199, -199, -199, -199, -199, -199, + -199, 0, +}; +short yyrindex[] = { -205, + 0, 0, 0, 0, 0, 1, 0, 0, 44, 0, + 0, -205, 0, -205, -205, -205, 0, 0, -205, 0, + -205, -205, -205, -205, 161, -205, -205, 0, 0, 0, + -205, 101, 121, 141, 0, 0, 21, 41, 61, -205, + -218, -213, -205, -205, -205, -205, -205, -205, -205, -205, + 0, 0, 81, -212, -204, -201, -200, -198, -267, -197, + -196, 0, +}; +short yygindex[] = { 0, + 0, 0, 65, 0, 0, -12, 0, 0, 0, 0, + 0, +}; +short yytable[] = { 28, + 2, 32, 33, 34, 13, 17, 35, 18, 36, 37, + 38, 39, 1, 41, 42, 24, 1, 2, 51, 21, + 6, 2, 14, 15, 16, 3, 31, 53, 40, 3, + 54, 55, 56, 57, 58, 59, 60, 61, 4, 1, + 4, 26, 4, 1, 2, 27, 21, 22, 21, 22, + 23, 32, 3, 32, 43, 44, 45, 46, 47, 48, + 27, 49, 50, 52, 17, 4, 21, 22, 23, 18, + 19, 21, 22, 23, 21, 22, 23, 30, 20, 62, + 13, 21, 22, 0, 23, 26, 25, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, + 0, 0, 0, 28, 28, 28, 2, 2, 2, 0, + 0, 2, 2, 2, 2, 2, 2, 2, 6, 2, + 2, 0, 0, 2, 2, 0, 0, 6, 6, 0, + 0, 6, 6, 6, 6, 6, 6, 6, 4, 6, + 6, 0, 0, 6, 6, 0, 0, 0, 4, 0, + 0, 4, 4, 4, 4, 4, 4, 4, 27, 4, + 4, 0, 0, 4, 4, 0, 0, 0, 27, 0, + 0, 27, 27, 27, 27, 27, 27, 27, 13, 27, + 27, 0, 0, 27, 27, 0, 0, 0, 13, 0, + 0, 13, 13, 13, 13, 13, 13, 13, 8, 13, + 13, 0, 0, 13, 13, 0, 0, 0, 0, 0, + 0, 8, 8, 8, 8, 8, 8, 8, 9, 8, + 8, 0, 0, 8, 8, 0, 
0, 0, 0, 0, + 0, 9, 9, 9, 9, 9, 9, 9, 11, 9, + 9, 0, 0, 9, 9, 0, 0, 0, 0, 0, + 0, 11, 11, 11, 11, 11, 11, 11, 14, 11, + 11, 0, 0, 11, 11, 0, 14, 14, 14, 0, + 0, 14, 14, 14, 14, 14, 14, 14, 0, 14, + 14, 0, 0, 14, +}; +short yycheck[] = { 12, + 0, 14, 15, 16, 269, 257, 19, 259, 21, 22, + 23, 24, 256, 26, 27, 283, 256, 261, 31, 266, + 0, 261, 263, 264, 265, 269, 270, 40, 284, 269, + 43, 44, 45, 46, 47, 48, 49, 50, 282, 256, + 0, 281, 282, 0, 261, 285, 266, 267, 266, 267, + 268, 257, 269, 259, 272, 273, 274, 275, 276, 277, + 0, 279, 280, 258, 283, 282, 266, 267, 268, 283, + 283, 266, 267, 268, 266, 267, 268, 13, 283, 271, + 0, 283, 283, -1, 283, 283, 283, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, 258, -1, + -1, -1, -1, 263, 264, 265, 266, 267, 268, -1, + -1, 271, 272, 273, 274, 275, 276, 277, 258, 279, + 280, -1, -1, 283, 284, -1, -1, 267, 268, -1, + -1, 271, 272, 273, 274, 275, 276, 277, 258, 279, + 280, -1, -1, 283, 284, -1, -1, -1, 268, -1, + -1, 271, 272, 273, 274, 275, 276, 277, 258, 279, + 280, -1, -1, 283, 284, -1, -1, -1, 268, -1, + -1, 271, 272, 273, 274, 275, 276, 277, 258, 279, + 280, -1, -1, 283, 284, -1, -1, -1, 268, -1, + -1, 271, 272, 273, 274, 275, 276, 277, 258, 279, + 280, -1, -1, 283, 284, -1, -1, -1, -1, -1, + -1, 271, 272, 273, 274, 275, 276, 277, 258, 279, + 280, -1, -1, 283, 284, -1, -1, 
-1, -1, -1, + -1, 271, 272, 273, 274, 275, 276, 277, 258, 279, + 280, -1, -1, 283, 284, -1, -1, -1, -1, -1, + -1, 271, 272, 273, 274, 275, 276, 277, 258, 279, + 280, -1, -1, 283, 284, -1, 266, 267, 268, -1, + -1, 271, 272, 273, 274, 275, 276, 277, -1, 279, + 280, -1, -1, 283, +}; +#ifndef YYDEBUG +#define YYDEBUG 0 +#endif +#if YYDEBUG +char *yyname[] = { +"end-of-file",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"L_BRACKET","R_BRACKET","WORD", +"P_RUNNER","STRING_CONSTANT","TRANSCRASH","L_ASSIGN","L_APPEND","L_SYMLINK", +"SEMICOLON","COMMA","PLUS","SLASH","INV_L","INV_R","EQ","NE","LE","GE","LT", +"GT","IS","AND","OR","NOT","IF","THEN","ELSE","EXIST","NAME","UNNAME","ROOT", +"USLASH", +}; +char *yyrule[] = { +"$accept : reiser4", +"reiser4 : Expression", +"Expression : Object_Name", +"Expression : STRING_CONSTANT", +"Expression : Expression PLUS Expression", +"Expression : Expression SEMICOLON Expression", +"Expression : Expression COMMA Expression", +"Expression : if_statement", +"Expression : target L_ASSIGN Expression", +"Expression : target L_APPEND Expression", +"Expression : target L_ASSIGN INV_L Expression INV_R", +"Expression : target L_SYMLINK Expression", +"Expression : error", +"if_statement : if_Begin then_operation ELSE Expression", +"if_statement : if_Begin then_operation", +"if_Begin : if if_Expression", +"if : IF", +"if_Expression : NOT Expression", +"if_Expression : EXIST Expression", +"if_Expression : Expression EQ Expression", +"if_Expression : Expression NE Expression", 
+"if_Expression : Expression LE Expression", +"if_Expression : Expression GE Expression", +"if_Expression : Expression LT Expression", +"if_Expression : Expression GT Expression", +"if_Expression : Expression OR Expression", +"if_Expression : Expression AND Expression", +"then_operation : THEN Expression", +"target : Object_Name", +"Object_Name : begin_from name", +"Object_Name : Object_Name SLASH name", +"begin_from : SLASH", +"begin_from :", +"name : WORD", +"name : level_up Expression R_BRACKET", +"level_up : L_BRACKET", +}; +#endif diff -Naurp linux-2.6.4/fs/reiser4/parser/parser.tab.h linux-2.6.4-ck1/fs/reiser4/parser/parser.tab.h --- linux-2.6.4/fs/reiser4/parser/parser.tab.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/parser/parser.tab.h 2004-03-11 22:45:15.281511452 +1100 @@ -0,0 +1,40 @@ +#define L_BRACKET 257 +#define R_BRACKET 258 +#define WORD 259 +#define P_RUNNER 260 +#define STRING_CONSTANT 261 +#define TRANSCRASH 262 +#define L_ASSIGN 263 +#define L_APPEND 264 +#define L_SYMLINK 265 +#define SEMICOLON 266 +#define COMMA 267 +#define PLUS 268 +#define SLASH 269 +#define INV_L 270 +#define INV_R 271 +#define EQ 272 +#define NE 273 +#define LE 274 +#define GE 275 +#define LT 276 +#define GT 277 +#define IS 278 +#define AND 279 +#define OR 280 +#define NOT 281 +#define IF 282 +#define THEN 283 +#define ELSE 284 +#define EXIST 285 +#define NAME 286 +#define UNNAME 287 +#define ROOT 288 +#define USLASH 289 +typedef union +{ + long charType; + expr_v4_t * expr; + wrd_t * wrd; +} YYSTYPE; +extern YYSTYPE yylval; diff -Naurp linux-2.6.4/fs/reiser4/parser/parser.y linux-2.6.4-ck1/fs/reiser4/parser/parser.y --- linux-2.6.4/fs/reiser4/parser/parser.y 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/parser/parser.y 2004-03-11 22:45:15.281511452 +1100 @@ -0,0 +1,175 @@ +/* + * Copyright 2001, 2002 by Hans Reiser, licensing governed by reiser4/README + */ + +/* Parser for the reiser4() system call */ + + +/* type 
definitions */ +%union +{ + long charType; + expr_v4_t * expr; + wrd_t * wrd; +} + +%type L_BRACKET R_BRACKET level_up reiser4 + +%type WORD +%type P_RUNNER +%type STRING_CONSTANT + +%type Object_Name name target +%type begin_from +%type Expression + +%type if_statement +%type if_statement if_Expression if_Begin +%type then_operation + +%token TRANSCRASH +%token L_ASSIGN L_APPEND L_SYMLINK +%token SEMICOLON /* ; */ +%token COMMA /* , */ +%token PLUS /* + */ +%token L_BRACKET R_BRACKET +%token SLASH +%token INV_L INV_R +%token EQ NE LE GE LT GT +%token IS +%token AND +%token OR +%token P_RUNNER +%token NOT +%token IF +%token THEN ELSE +%token EXIST +%token NAME UNNAME +%token WORD STRING_CONSTANT +%token ROOT + + +%left UNNAME +%left NAME +%left NOT +%left AND +%left OR +%left EQ NE LE GE LT GT + +%right L_SYMLINK /* -> */ +%right L_APPEND /* <<- */ +%right L_ASSIGN /* <- */ + +%left PLUS /* + */ + +%right ELSE +%left COMMA /* , */ + +%left SEMICOLON /* ; */ +%left SLASH /* / */ +%left USLASH /* / */ + +/* +For bison: +%pure_parser +*/ + +/* + Starting production of our grammar. + */ +%start reiser4 + +%% + +reiser4 + : Expression { $$ = free_expr( $1 ); } +; + +Expression + : Object_Name { $$ = $1;} + | STRING_CONSTANT { $$ = constToExpr( ws, $1 ); } + | Expression PLUS Expression { $$ = connect_expression( ws, $1, $3 ); } + | Expression SEMICOLON Expression { $$ = list_expression( ws, $1, $3 ); } + | Expression COMMA Expression { $$ = list_async_expression( ws, $1, $3 ); } + | if_statement { $$ = $1; level_down( ws, IF_STATEMENT, IF_STATEMENT ); } + /* the ASSIGNMENT operator return a value: bytes written */ + | target L_ASSIGN Expression { $$ = assign( ws, $1, $3 ); } /* <- direct assign */ + | target L_APPEND Expression { $$ = assign( ws, $1, $3 ); } /* <- direct assign */ + | target L_ASSIGN INV_L Expression INV_R { $$ = assign_invert( ws, $1, $4 ); } /* <- invert assign. 
destination must have ..invert method */ + | target L_SYMLINK Expression { $$ = symlink( ws, $1, $3 ); } /* -> symlink the SYMLINK operator return a value: bytes ???? */ + | error /*SEMICOLON*/ + +//| level_up Expression R_BRACKET { $$ = $2 level_down( ws, $1, $3 );} +//| Expression Expression { $$ = list_unordered_expression( ws, $1, $2 ); } + +; + +if_statement + : if_Begin then_operation ELSE Expression %prec PLUS { $$ = if_then_else( ws, $1, $2, $4 ); } + | if_Begin then_operation %prec PLUS { $$ = if_then( ws, $1, $2) ; } +; + +if_Begin + : if if_Expression { $$ = $2; } +; + +if: IF { level_up( ws, IF_STATEMENT ); } +; + +if_Expression + : NOT Expression { $$ = not_expression( ws, $2 ); } + | EXIST Expression { $$ = check_exist( ws, $2 ); } + | Expression EQ Expression { $$ = compare_EQ_expression( ws, $1, $3 ); } + | Expression NE Expression { $$ = compare_NE_expression( ws, $1, $3 ); } + | Expression LE Expression { $$ = compare_LE_expression( ws, $1, $3 ); } + | Expression GE Expression { $$ = compare_GE_expression( ws, $1, $3 ); } + | Expression LT Expression { $$ = compare_LT_expression( ws, $1, $3 ); } + | Expression GT Expression { $$ = compare_GT_expression( ws, $1, $3 ); } + | Expression OR Expression { $$ = compare_OR_expression( ws, $1, $3 ); } + | Expression AND Expression { $$ = compare_AND_expression( ws, $1, $3 ); } +; + +then_operation + : THEN Expression %prec PLUS { goto_end( ws );} +; + +target + : Object_Name { $$ = $1;} +; + +Object_Name + : begin_from name %prec ROOT { $$ = pars_expr( ws, $1, $2 ) ; } + | Object_Name SLASH name { $$ = pars_expr( ws, $1, $3 ) ; } +; + +begin_from + : SLASH { $$ = pars_lookup_root( ws ) ; } + | { $$ = pars_lookup_curr( ws ) ; } +; + +name + : WORD { $$ = lookup_word( ws, $1 ); } + | level_up Expression R_BRACKET { $$ = $2; level_down( ws, $1, $3 );} /*not yet */ +; + +level_up + : L_BRACKET { $$ = $1; level_up( ws, $1 ); /*set_curr_path( ws ); */} +; + +%% + + +#define yyversion "4.0.0" +#include 
"pars.cls.h" +#include "parser.tab.c" +#include "pars.yacc.h" +#include "lib.c" + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/parser/pars.yacc.h linux-2.6.4-ck1/fs/reiser4/parser/pars.yacc.h --- linux-2.6.4/fs/reiser4/parser/pars.yacc.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/parser/pars.yacc.h 2004-03-11 22:45:15.276512229 +1100 @@ -0,0 +1,68 @@ +/* + * Copyright 2001, 2002 by Hans Reiser, licensing governed by reiser4/README + */ + +/* + * definitions of work space for yacc generated from + * parser.y + */ + +#define MAXLEVELCO 500 +#define BEGIN_FROM_ROOT 222 +#define BEGIN_FROM_CURRENT 333 + +struct reiser4_syscall_w_space { + /* char * ws_inline; this two field used for parsing string, one (inline) stay on begin */ + char * ws_pline; /* of token, second (pline) walk to end to token */ +#ifdef yyacc + /* next field need for yacc */ + /* accesing to this fields from rules: ws->... 
*/ + int ws_yystacksize; /*500*/ + int ws_yymaxdepth ; /*500*/ + int ws_yydebug; + int ws_yynerrs; + int ws_yyerrflag; + int ws_yychar; + int * ws_yyssp; + YYSTYPE * ws_yyvsp; + YYSTYPE ws_yyval; + YYSTYPE ws_yylval; + int ws_yyss[YYSTACKSIZE]; + YYSTYPE ws_yyvs[YYSTACKSIZE]; +#else + /* declare for bison */ +#endif + int ws_yyerrco; + int ws_level; /* current level */ + int ws_errco; /* number of errors */ + /* working fields */ + char * tmpWrdEnd; /* pointer for parsing input string */ + char * yytext; /* pointer for parsing input string */ + /* space for */ + freeSpace_t * freeSpHead; /* work spaces list Header */ + freeSpace_t * freeSpCur; /* current work space */ + wrd_t * wrdHead; /* names list Header */ + pars_var_t * Head_pars_var; /* parsed variables Header */ + streg_t * Head_level; /* parsers level list Header */ + streg_t * cur_level; /* current level */ + + expr_v4_t * root_e; /* root expression for this task */ + struct nameidata nd; /* work field for pass to VFS mount points */ + +// pars_var_t * wvn; /* work for this task */ +// struct dentry * de; /* work dentry for this task */ +}; + + +/* map printf() onto the kernel's printk() */ +#define printf printk + +/* + * Make Linus happy. + * Local variables: + * c-indentation-style: "K&R" + * mode-name: "LC" + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -Naurp linux-2.6.4/fs/reiser4/parser/tmp.c linux-2.6.4-ck1/fs/reiser4/parser/tmp.c --- linux-2.6.4/fs/reiser4/parser/tmp.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/parser/tmp.c 2004-03-11 22:45:15.282511296 +1100 @@ -0,0 +1,32 @@ + +/* +#define SP 1 +#define SP_SIM 2 +#define WRD 3 +#define SP 22 +#define SP_SIM 23 +#define WRD 24 +*/ + +#include "../sys_reiser4.c" + +int +main() +{ + int i; + i = 0; + while (i != 307) { + printf("-------->%d\n", i = sys_reiser4("a<-b;")); + } + return 0; + } + +/* + * Make Linus happy.
+ * Local variables: + * c-indentation-style: "K&R" + * mode-name: "LC" + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -Naurp linux-2.6.4/fs/reiser4/parser/yacc_reiser4.patch linux-2.6.4-ck1/fs/reiser4/parser/yacc_reiser4.patch --- linux-2.6.4/fs/reiser4/parser/yacc_reiser4.patch 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/parser/yacc_reiser4.patch 2004-03-11 22:45:15.282511296 +1100 @@ -0,0 +1,127 @@ +--- ../yacc/skeleton.c 1993-12-22 14:28:01.000000000 +0300 ++++ skeleton.c 2004-01-29 23:05:26.000000000 +0300 +@@ -16,7 +16,7 @@ + { + "#ifndef lint", + "/*static char yysccsid[] = \"from: @(#)yaccpar 1.9 (Berkeley) 02/21/93\";*/", +- "static char yyrcsid[] = \"$Id: skeleton.c,v 1.4 1993/12/21 18:45:32 jtc Exp $\";", ++ "static char yyrcsid[] = \"$Id: skeleton.c,v 1.4 1993/12/21 18:45:32 jtc Exp $\\n 2002/10/22 VD reiser4\";", + "#endif", + "#define YYBYACC 1", + "#define YYMAJOR 1", +@@ -30,6 +30,9 @@ + + char *tables[] = + { ++ "#if defined(YYREISER4_DEF)", ++ "#define extern static", ++ "#endif", + "extern short yylhs[];", + "extern short yylen[];", + "extern short yydefred[];", +@@ -49,28 +52,45 @@ + + char *header[] = + { +- "#ifdef YYSTACKSIZE", +- "#undef YYMAXDEPTH", +- "#define YYMAXDEPTH YYSTACKSIZE", +- "#else", +- "#ifdef YYMAXDEPTH", +- "#define YYSTACKSIZE YYMAXDEPTH", ++ "#if defined(YYREISER4_DEF)", ++ "#define YYSTACKSIZE 500", ++ "#define YYMAXDEPTH 500", ++ "#define yydebug ws->ws_yydebug ", ++ "#define yynerrs ws->ws_yynerrs", ++ "#define yyerrflag ws->ws_yyerrflag", ++ "#define yychar ws->ws_yychar", ++ "#define yyssp ws->ws_yyssp", ++ "#define yyvsp ws->ws_yyvsp", ++ "#define yyval ws->ws_yyval", ++ "#define yylval ws->ws_yylval", ++ "#define yyss ws->ws_yyss", ++ "#define yyvs ws->ws_yyvs", ++ "#define yystacksize ws->ws_yystacksize", + "#else", +- "#define YYSTACKSIZE 500", +- "#define YYMAXDEPTH 500", ++ "#ifdef YYSTACKSIZE", ++ "#undef YYMAXDEPTH", ++ "#define YYMAXDEPTH YYSTACKSIZE", ++ "#else", ++ "#ifdef 
YYMAXDEPTH", ++ "#define YYSTACKSIZE YYMAXDEPTH", ++ "#else", ++ "#define YYSTACKSIZE 500", ++ "#define YYMAXDEPTH 500", ++ "#endif", ++ "#endif", ++ "int yydebug;", ++ "int yynerrs;", ++ "int yyerrflag;", ++ "int yychar;", ++ "short *yyssp;", ++ "YYSTYPE *yyvsp;", ++ "YYSTYPE yyval;", ++ "YYSTYPE yylval;", ++ "short yyss[YYSTACKSIZE];", ++ "YYSTYPE yyvs[YYSTACKSIZE];", ++ "#define yystacksize YYSTACKSIZE", + "#endif", +- "#endif", +- "int yydebug;", +- "int yynerrs;", +- "int yyerrflag;", +- "int yychar;", +- "short *yyssp;", +- "YYSTYPE *yyvsp;", +- "YYSTYPE yyval;", +- "YYSTYPE yylval;", +- "short yyss[YYSTACKSIZE];", +- "YYSTYPE yyvs[YYSTACKSIZE];", +- "#define yystacksize YYSTACKSIZE", ++ + 0 + }; + +@@ -82,11 +102,15 @@ + "#define YYACCEPT goto yyaccept", + "#define YYERROR goto yyerrlab", + "int", ++ "#if defined(YYREISER4_DEF)", ++ "yyparse(struct reiser4_syscall_w_space * ws)", ++ "#else", + "#if defined(__STDC__)", + "yyparse(void)", + "#else", + "yyparse()", + "#endif", ++ "#endif", + "{", + " register int yym, yyn, yystate;", + "#if YYDEBUG", +@@ -150,7 +174,11 @@ + " goto yyreduce;", + " }", + " if (yyerrflag) goto yyinrecovery;", ++ "#if defined(YYREISER4_DEF)", ++ " yyerror(ws,11111,yystate,yychar);", ++ "#else ", + " yyerror(\"syntax error\");", ++ "#endif", + "#ifdef lint", + " goto yyerrlab;", + "#endif", +@@ -275,7 +303,11 @@ + " *++yyvsp = yyval;", + " goto yyloop;", + "yyoverflow:", ++ "#if defined(YYREISER4_DEF)", ++ " yyerror(ws,101); /*yacc stack overflow*/", ++ "#else ", + " yyerror(\"yacc stack overflow\");", ++ "#endif", + "yyabort:", + " return (1);", + "yyaccept:", diff -Naurp linux-2.6.4/fs/reiser4/plugin/cryptcompress.c linux-2.6.4-ck1/fs/reiser4/plugin/cryptcompress.c --- linux-2.6.4/fs/reiser4/plugin/cryptcompress.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/cryptcompress.c 2004-03-11 22:45:15.288510363 +1100 @@ -0,0 +1,2748 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by 
reiser4/README + +This file contains all cluster operations and methods of the reiser4 +cryptcompress object plugin (see http://www.namesys.com/cryptcompress_design.html +for details). +The list of cryptcompress specific EA: + + Incore inode Disk stat-data +******************************************************************************************** +* data structure * field * data structure * field * +******************************************************************************************** +* plugin_set *file plugin id * reiser4_plugin_stat *file plugin id * +* *crypto plugin id * *crypto plugin id * +* *digest plugin id * *digest plugin id * +* *compression plugin id * *compression plugin id* +******************************************************************************************** +* crypto_stat_t * keysize * reiser4_crypto_stat * keysize * +* * keyid * * keyid * +******************************************************************************************** +* cluster_stat_t * cluster_shift * reiser4_cluster_stat * cluster_shift * +******************************************************************************************** +* cryptcompress_info_t * expkey * * * +******************************************************************************************** +*/ +#include "../debug.h" +#include "../inode.h" +#include "../jnode.h" +#include "../tree.h" +#include "../page_cache.h" +#include "../readahead.h" +#include "../forward.h" +#include "../super.h" +#include "../context.h" +#include "plugin.h" +#include "object.h" + +#include +#include +#include + +int do_readpage_ctail(reiser4_cluster_t *, struct page * page); +int ctail_read_cluster (reiser4_cluster_t *, struct inode *, int); +reiser4_key * append_cluster_key_ctail(const coord_t *, reiser4_key *); +int setattr_reserve(reiser4_tree *); +int reserve_cut_iteration(reiser4_tree *); +int writepage_ctail(struct page *); + +/* get cryptcompress specific portion of inode */ +reiser4_internal cryptcompress_info_t * 
+cryptcompress_inode_data(const struct inode * inode)
+{
+	return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
+}
+
+/* wipe the expanded key @expkey (@cplug->nr_keywords 32-bit words) and free it.
+   The caller is responsible for forgetting its own pointer to the key. */
+static void
+destroy_key(__u32 * expkey, crypto_plugin * cplug)
+{
+	assert("edward-410", cplug != NULL);
+	assert("edward-411", expkey != NULL);
+
+	/* zero the key material before the memory is released */
+	xmemset(expkey, 0, (cplug->nr_keywords)*sizeof(__u32));
+	reiser4_kfree(expkey);
+}
+
+/* free the crypto stat @stat together with the key id attached to it.
+   @dplug is only checked by an assertion. */
+static void
+detach_crypto_stat(crypto_stat_t * stat, digest_plugin * dplug)
+{
+	assert("edward-412", stat != NULL);
+	assert("edward-413", dplug != NULL);
+
+	reiser4_kfree(stat->keyid);
+	reiser4_kfree(stat);
+}
+
+/* 1) fill cryptcompress specific part of inode
+   2) set inode crypto stat which is supposed to be saved in stat-data
+
+   Returns 0 on success. On failure returns a non-zero error code; in that
+   case the expanded key and the crypto stat allocated here are released,
+   the expanded key pointer of the inode is reset and
+   REISER4_SECRET_KEY_INSTALLED is cleared. */
+static int
+inode_set_crypto(struct inode * object, crypto_data_t * data)
+{
+	int result;
+	crypto_stat_t * stat;
+	cryptcompress_info_t * info = cryptcompress_inode_data(object);
+	crypto_plugin * cplug = crypto_plugin_by_id(data->cra);
+	digest_plugin * dplug = digest_plugin_by_id(data->dia);
+	void * digest_ctx = NULL;
+
+	assert("edward-414", dplug != NULL);
+	assert("edward-415", cplug != NULL);
+	assert("edward-416", data != NULL);
+	assert("edward-417", data->key!= NULL);
+	assert("edward-88", data->keyid != NULL);
+	assert("edward-83", data->keyid_size != 0);
+	assert("edward-89", data->keysize != 0);
+
+	/* set secret key */
+	info->expkey = reiser4_kmalloc((cplug->nr_keywords)*sizeof(__u32), GFP_KERNEL);
+	if (!info->expkey)
+		return RETERR(-ENOMEM);
+	result = cplug->set_key(info->expkey, data->key);
+	if (result)
+		goto destroy_key;
+	assert ("edward-34", !inode_get_flag(object, REISER4_SECRET_KEY_INSTALLED));
+	inode_set_flag(object, REISER4_SECRET_KEY_INSTALLED);
+
+	/* attach crypto stat */
+	stat = reiser4_kmalloc(sizeof(*stat), GFP_KERNEL);
+	if (!stat) {
+		result = RETERR(-ENOMEM);
+		goto destroy_key;
+	}
+	stat->keyid = reiser4_kmalloc((size_t)(dplug->digestsize), GFP_KERNEL);
+	if (!stat->keyid) {
+		reiser4_kfree(stat);
+		result = RETERR(-ENOMEM);
+		goto destroy_key;
+	}
+	/* fingerprint creation of the pair (@key, @keyid) includes two steps: */
+	/* 1. encrypt keyid by key: */
+	/* FIXME-EDWARD: add encryption of keyid */
+
+	/* 2. make digest of encrypted keyid */
+	/* NOTE(review): digest_ctx is passed by value, so it is still NULL
+	   when alloc() returns and NULL is what init/update/final/free get
+	   below - confirm the contract of the digest plugin's alloc() */
+	result = dplug->alloc(digest_ctx);
+	if (result)
+		goto exit;
+	dplug->init(digest_ctx);
+	dplug->update(digest_ctx, data->keyid, data->keyid_size);
+	dplug->final(digest_ctx, stat->keyid);
+	dplug->free(digest_ctx);
+
+	stat->keysize = data->keysize;
+	reiser4_inode_data(object)->crypt = stat;
+	return 0;
+ exit:
+	detach_crypto_stat(stat, dplug);
+ destroy_key:
+	destroy_key(info->expkey, cplug);
+	/* the key is freed, don't leave a dangling pointer in the inode */
+	info->expkey = NULL;
+	inode_clr_flag(object, REISER4_SECRET_KEY_INSTALLED);
+	return result;
+}
+
+
+/* plugin->create() method for crypto-compressed files
+
+. install plugins
+. attach crypto info if specified
+. attach compression info if specified
+. attach cluster info
+
+Returns 0 on success. On failure returns a non-zero error code; crypto info
+which has been attached to the inode by this call is released and the
+pointers to it are reset.
+*/
+reiser4_internal int
+create_cryptcompress(struct inode *object, struct inode *parent, reiser4_object_create_data * data)
+{
+	int result;
+	scint_t *extmask;
+	reiser4_inode * info;
+	digest_plugin * dplug = NULL;
+	crypto_plugin * cplug = NULL;
+	compression_plugin * coplug = NULL;
+
+	assert("edward-23", object != NULL);
+	assert("edward-24", parent != NULL);
+	assert("edward-26", inode_get_flag(object, REISER4_NO_SD));
+	assert("edward-27", data->id == CRC_FILE_PLUGIN_ID);
+
+	info = reiser4_inode_data(object);
+
+	assert("edward-29", info != NULL);
+	assert("edward-30", info->pset->crypto == NULL);
+	assert("edward-85", info->pset->digest == NULL);
+	assert("edward-31", info->pset->compression == NULL);
+
+	extmask = &info->extmask;
+
+	if (data->crypto) {
+		/* set plugins and crypto stat */
+		cplug = crypto_plugin_by_id(data->crypto->cra);
+		dplug = digest_plugin_by_id(data->crypto->dia);
+		result = inode_set_crypto(object, data->crypto);
+		if (result)
+			/* inode_set_crypto() has released everything it allocated */
+			return result;
+		result = scint_pack(extmask, scint_unpack(extmask) |
+				    (1 << CRYPTO_STAT), GFP_ATOMIC);
+		if (result)
+			goto exit;
+	}
+	/* without crypto info both plugins stay NULL */
+	plugin_set_crypto(&info->pset, cplug);
+	plugin_set_digest(&info->pset, dplug);
+
+	if (data->compression)
+		/* set plugin */
+		coplug = compression_plugin_by_id(*data->compression);
+	plugin_set_compression(&info->pset, coplug);
+
+	/* cluster params are always necessary */
+	if(!data->cluster) {
+		printk("edward-418, create_cryptcompress: default cluster size (4K) was assigned\n");
+		info->cluster_shift = 0;
+	}
+	else
+		info->cluster_shift = *data->cluster;
+	result = scint_pack(extmask, scint_unpack(extmask) |
+			    (1 << PLUGIN_STAT) |
+			    (1 << CLUSTER_STAT), GFP_ATOMIC);
+	if (result)
+		goto exit;
+	/* set bits */
+	info->plugin_mask |= (1 << REISER4_FILE_PLUGIN_TYPE) |
+		(1 << REISER4_CRYPTO_PLUGIN_TYPE) |
+		(1 << REISER4_DIGEST_PLUGIN_TYPE) |
+		(1 << REISER4_COMPRESSION_PLUGIN_TYPE);
+
+	/* save everything in disk stat-data */
+	result = write_sd_by_inode_common(object);
+	if (!result)
+		return 0;
+	/* save() method failed, release attached crypto info */
+	inode_clr_flag(object, REISER4_CRYPTO_STAT_LOADED);
+	inode_clr_flag(object, REISER4_CLUSTER_KNOWN);
+
+ exit:
+	if (info->crypt) {
+		destroy_key(cryptcompress_inode_data(object)->expkey, cplug);
+		cryptcompress_inode_data(object)->expkey = NULL;
+		inode_clr_flag(object, REISER4_SECRET_KEY_INSTALLED);
+		detach_crypto_stat(info->crypt, dplug);
+		/* both objects are freed, don't leave dangling pointers in the inode */
+		info->crypt = NULL;
+	}
+	return result;
+}
+
+/* plugin->save_len() of the stat-data plugin extension: only checks its
+   arguments and returns 0 */
+static int
+save_len_cryptcompress_plugin(struct inode * inode, reiser4_plugin * plugin)
+{
+	assert("edward-457", inode != NULL);
+	assert("edward-458", plugin != NULL);
+	assert("edward-459", plugin->h.id == CRC_FILE_PLUGIN_ID);
+	return 0;
+}
+
+/* plugin->load(): install the cryptcompress file plugin into the plugin set
+   of @inode. @plugin, @area and @len are not used. Always returns 0. */
+reiser4_internal int
+load_cryptcompress_plugin(struct inode * inode, reiser4_plugin * plugin, char **area, int *len)
+{
+	assert("edward-455", inode != NULL);
+	assert("edward-456", (reiser4_inode_data(inode)->pset != NULL));
+
+	plugin_set_file(&reiser4_inode_data(inode)->pset, file_plugin_by_id(CRC_FILE_PLUGIN_ID));
+	return 0;
+}
+
+struct reiser4_plugin_ops cryptcompress_plugin_ops = {
+	.load = load_cryptcompress_plugin,
+	.save_len = 
save_len_cryptcompress_plugin, + .save = NULL, + .alignment = 8, + .change = NULL +}; + +reiser4_internal crypto_stat_t * inode_crypto_stat (struct inode * inode) +{ + assert("edward-90", inode != NULL); + assert("edward-91", reiser4_inode_data(inode) != NULL); + return (reiser4_inode_data(inode)->crypt); +} + +reiser4_internal __u8 inode_cluster_shift (struct inode * inode) +{ + reiser4_inode * info; + + assert("edward-92", inode != NULL); + + info = reiser4_inode_data(inode); + + assert("edward-93", info != NULL); + assert("edward-94", inode_get_flag(inode, REISER4_CLUSTER_KNOWN)); + assert("edward-95", info->cluster_shift <= MAX_CLUSTER_SHIFT); + + return info->cluster_shift; +} + +/* returns number of pages in the cluster */ +reiser4_internal int inode_cluster_pages (struct inode * inode) +{ + return (1 << inode_cluster_shift(inode)); +} + +reiser4_internal size_t inode_cluster_size (struct inode * inode) +{ + assert("edward-96", inode != NULL); + + return (PAGE_CACHE_SIZE << inode_cluster_shift(inode)); +} + +/* returns translated offset */ +reiser4_internal loff_t inode_scaled_offset (struct inode * inode, + const loff_t src_off /* input offset */) +{ + crypto_plugin * cplug; + crypto_stat_t * stat; + size_t size; + + assert("edward-97", inode != NULL); + + cplug = inode_crypto_plugin(inode); + + if (!cplug || src_off == get_key_offset(max_key())) + return src_off; + + stat = inode_crypto_stat(inode); + + assert("edward-98", stat != NULL); + assert("edward-99", stat->keysize != 0); + + size = cplug->blocksize(stat->keysize); + return cplug->scale(inode, size, src_off); +} + +static inline loff_t min_count(loff_t a, loff_t b) +{ + return (a < b ? 
a : b); +} + +/* returns disk cluster size */ +reiser4_internal size_t +inode_scaled_cluster_size (struct inode * inode) +{ + assert("edward-110", inode != NULL); + assert("edward-111", inode_get_flag(inode, REISER4_CLUSTER_KNOWN)); + + return inode_scaled_offset(inode, inode_cluster_size(inode)); +} + +reiser4_internal void reiser4_cluster_init (reiser4_cluster_t * clust){ + assert("edward-84", clust != NULL); + xmemset(clust, 0, sizeof *clust); + clust->stat = DATA_CLUSTER; +} + +/* release cluster's data */ +reiser4_internal void +release_cluster_buf(reiser4_cluster_t * clust, struct inode * inode) +{ + assert("edward-121", clust != NULL); + assert("edward-124", inode != NULL); + assert("edward-125", inode_get_flag(inode, REISER4_CLUSTER_KNOWN)); + + if (clust->buf) { + assert("edward-615", clust->bsize != 0); + reiser4_kfree(clust->buf); + } +} + +reiser4_internal void +put_cluster_data(reiser4_cluster_t * clust, struct inode * inode) +{ + assert("edward-435", clust != NULL); + + release_cluster_buf(clust, inode); + /* invalidate cluster data */ + xmemset(clust, 0, sizeof *clust); +} + +/* returns true if we don't need to read new cluster from disk */ +reiser4_internal int cluster_is_uptodate (reiser4_cluster_t * clust) +{ + assert("edward-126", clust != NULL); + return (clust->buf != NULL); +} + +reiser4_internal unsigned long +pg_to_clust(unsigned long idx, struct inode * inode) +{ + return idx >> inode_cluster_shift(inode); +} + +reiser4_internal unsigned long +clust_to_pg(unsigned long idx, struct inode * inode) +{ + return idx << inode_cluster_shift(inode); +} + +reiser4_internal inline unsigned long +pg_to_clust_to_pg(unsigned long idx, struct inode * inode) +{ + return clust_to_pg(pg_to_clust(idx, inode), inode); +} + +reiser4_internal unsigned long +off_to_pg(loff_t off) +{ + return (off >> PAGE_CACHE_SHIFT); +} + +static inline loff_t +pg_to_off(unsigned long idx) +{ + return ((loff_t)(idx) << PAGE_CACHE_SHIFT); +} + +static inline unsigned long 
+off_to_clust(loff_t off, struct inode * inode) +{ + return pg_to_clust(off_to_pg(off), inode); +} + +reiser4_internal loff_t +clust_to_off(unsigned long idx, struct inode * inode) +{ + return pg_to_off(clust_to_pg(idx, inode)); +} + +static loff_t +off_to_clust_to_off(loff_t off, struct inode * inode) +{ + return clust_to_off(off_to_clust(off, inode), inode); +} + +static inline unsigned long +off_to_clust_to_pg(loff_t off, struct inode * inode) +{ + return clust_to_pg(off_to_clust(off, inode), inode); +} + +reiser4_internal unsigned +off_to_pgoff(loff_t off) +{ + return off & (PAGE_CACHE_SIZE - 1); +} + +static inline unsigned +off_to_cloff(loff_t off, struct inode * inode) +{ + return off & ((loff_t)(inode_cluster_size(inode)) - 1); +} + +reiser4_internal unsigned +pg_to_off_to_cloff(unsigned long idx, struct inode * inode) +{ + return off_to_cloff(pg_to_off(idx), inode); +} + +/* return true if the cluster contains specified page */ +reiser4_internal int +page_of_cluster(struct page * page, reiser4_cluster_t * clust, struct inode * inode) +{ + assert("edward-162", page != NULL); + assert("edward-163", clust != NULL); + assert("edward-164", inode != NULL); + assert("edward-165", inode_get_flag(inode, REISER4_CLUSTER_KNOWN)); + + return (pg_to_clust(page->index, inode) == clust->index); +} + +reiser4_internal int count_to_nrpages(unsigned count) +{ + return (!count ? 
0 : off_to_pg(count - 1) + 1); +} + +static int +new_cluster(reiser4_cluster_t * clust, struct inode * inode) +{ + return (clust_to_off(clust->index, inode) >= inode->i_size); +} + +/* set minimal number of cluster pages (start from first one) + which cover hole and users data */ +static void +set_nrpages_by_frame(reiser4_cluster_t * clust) +{ + assert("edward-180", clust != NULL); + + if (clust->count + clust->delta == 0) { + /* nothing to write - nothing to read */ + clust->nr_pages = 0; + return; + } + clust->nr_pages = count_to_nrpages(clust->off + clust->count + clust->delta); +} + +static unsigned +off_to_count(loff_t off, unsigned long idx, struct inode * inode) +{ + if(idx > off_to_clust(off, inode)) + return 0; + return min_count(inode_cluster_size(inode), off - clust_to_off(idx, inode)); +} + +reiser4_internal unsigned +off_to_pgcount(loff_t off, unsigned long idx) +{ + if (idx > off_to_pg(off)) + return 0; + return min_count(PAGE_CACHE_SIZE, off - pg_to_off(idx)); +} + +reiser4_internal unsigned +fsize_to_count(reiser4_cluster_t * clust, struct inode * inode) +{ + assert("edward-288", clust != NULL); + assert("edward-289", inode != NULL); + + return off_to_count(inode->i_size, clust->index, inode); +} + +/* plugin->key_by_inode() */ +/* see plugin/plugin.h for details */ +reiser4_internal int +key_by_inode_cryptcompress(struct inode *inode, loff_t off, reiser4_key * key) +{ + assert("edward-64", inode != 0); + assert("edward-112", ergo(off != get_key_offset(max_key()), !off_to_cloff(off, inode))); + /* don't come here with other offsets */ + + build_sd_key(inode, key); + set_key_type(key, KEY_BODY_MINOR); + set_key_offset(key, (__u64) (!inode_crypto_stat(inode) ? 
off : inode_scaled_offset(inode, off)));
+	return 0;
+}
+
+/* plugin->flow_by_inode
+
+   Fill flow @f with the buffer, its size, the user/kernel flag and the
+   operation. The key of the flow is built by key_by_inode_cryptcompress(),
+   except for writes from user space (@op == WRITE_OP and @user == 1), where
+   the key is left untouched and 0 is returned. */
+reiser4_internal int
+flow_by_inode_cryptcompress(struct inode *inode /* file to build flow for */ ,
+			    char *buf /* user level buffer */ ,
+			    int user	/* 1 if @buf is of user space, 0 - if it is
+					   kernel space */ ,
+			    loff_t size /* buffer size */ ,
+			    loff_t off /* offset to start io from */ ,
+			    rw_op op /* READ or WRITE */ ,
+			    flow_t * f /* resulting flow */)
+{
+	assert("edward-436", f != NULL);
+	assert("edward-149", inode != NULL);
+	assert("edward-150", inode_file_plugin(inode) != NULL);
+	assert("edward-151", inode_file_plugin(inode)->key_by_inode == key_by_inode_cryptcompress);
+
+
+	f->length = size;
+	f->data = buf;
+	f->user = user;
+	f->op = op;
+
+	if (op == WRITE_OP && user == 1)
+		return 0;
+	return key_by_inode_cryptcompress(inode, off, &f->key);
+}
+
+/* Find the cluster item addressed by @key.
+
+   The coord, lock handle and seal kept in @hint are tried first: if
+   hint_validate() accepts them, the tree lookup is skipped and
+   CBK_COORD_FOUND is returned (after moving to the right neighbor when the
+   coord is after the last unit and @key is the right delimiting key of the
+   node; -EIO is returned if that neighbor does not exist). Otherwise the
+   coord is reset and the result of coord_by_key() is returned.
+
+   @hint must not be NULL: its coord and lock handle are used on every path. */
+reiser4_internal int
+find_cluster_item(hint_t * hint, /* coord, lh, seal */
+		  const reiser4_key *key, /* key of next cluster item to read */
+		  znode_lock_mode lock_mode /* which lock */,
+		  ra_info_t *ra_info,
+		  lookup_bias bias)
+{
+	int result;
+	coord_t *coord;
+
+	assert("edward-152", schedulable());
+
+	init_lh(hint->coord.lh);
+	coord = &hint->coord.base_coord;
+
+	result = hint_validate(hint, key, 1 /* check key */, lock_mode);
+	if (!result) {
+		if (coord->between == AFTER_UNIT && equal_to_rdk(coord->node, key)) {
+			result = goto_right_neighbor(coord, hint->coord.lh);
+			if (result == -E_NO_NEIGHBOR)
+				return RETERR(-EIO);
+			if (result)
+				return result;
+			assert("vs-1152", equal_to_ldk(coord->node, key));
+			/* we moved to a different node. Invalidate coord extension, zload is
+			   necessary to init it again */
+			hint->coord.valid = 0;
+		}
+		return CBK_COORD_FOUND;
+	}
+	/* the hint is stale: fall back to the tree lookup */
+	coord_init_zero(coord);
+	hint->coord.valid = 0;
+	return coord_by_key(current_tree, key, coord, hint->coord.lh, lock_mode,
+			    bias, LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, ra_info);
+}
+
+/* This represents the reiser4 crypto alignment policy.
+   Returns the size > 0 of the aligning overhead, if we should align/cut,
+   returns 0, if we shouldn't (alignment assumes appending an overhead of the size > 0).
+
+   @len is an advised length of the data; if it is zero, the current length
+   of the cluster (@clust->len) is used instead.
+   WRITE_OP: the overhead is the number of bytes up to the next multiple of the crypto blocksize.
+   READ_OP: the overhead is read from the last byte of the cluster buffer. */
+static int
+crypto_overhead(size_t len /* advised length */,
+		reiser4_cluster_t * clust,
+		struct inode * inode, rw_op rw)
+{
+	crypto_plugin * cplug = inode_crypto_plugin(inode);
+	crypto_stat_t * stat = inode_crypto_stat (inode);
+	size_t size;
+	int result = 0;
+	int oh;
+
+	assert("edward-486", clust != 0);
+
+	if (!cplug || !cplug->align_cluster)
+		return 0;
+	/* honour the advised length, fall back to the length of the cluster */
+	size = (len ? len : clust->len);
+
+	assert("edward-615", size != 0);
+	assert("edward-487", stat != NULL);
+
+	switch (rw) {
+	case WRITE_OP: /* align */
+		assert("edward-488", size <= inode_cluster_size(inode));
+		/* check the method before it is called for the first time */
+		assert("edward-489", cplug->blocksize != NULL);
+
+		oh = size % cplug->blocksize(stat->keysize);
+
+		if (!oh && size == fsize_to_count(clust, inode))
+			/* cluster doesn't need alignment and didn't get compressed */
+			return 0;
+		result = (cplug->blocksize(stat->keysize) - oh);
+		break;
+	case READ_OP: /* cut */
+		assert("edward-490", size <= inode_scaled_cluster_size(inode));
+		if (size >= inode_scaled_offset(inode, fsize_to_count(clust, inode)))
+			/* cluster didn't get aligned */
+			return 0;
+		assert("edward-491", clust->buf != NULL);
+
+		/* the last byte of the buffer holds the overhead */
+		result = *(clust->buf + size - 1);
+		break;
+	default:
+		impossible("edward-493", "bad option for getting alignment");
+	}
+	return result;
+}
+
+/* swap the pairs (@clust->buf, @clust->bsize) and (@buf, @bufsize) */
+static void
+alternate_buffers(reiser4_cluster_t * clust, __u8 ** buf, size_t * bufsize)
+{
+	__u8 * tmp_buf;
+	size_t tmp_size;
+
+	assert("edward-405", bufsize != NULL);
+	assert("edward-406", *bufsize != 0);
+
+	tmp_buf = *buf;
+	tmp_size = *bufsize;
+
+	*buf = clust->buf;
+	*bufsize = clust->bsize;
+
+	clust->buf = tmp_buf;
+	clust->bsize = tmp_size;
+}
+
+/* maximal aligning overhead which can be appended
+   to the flow before encryption if any: one crypto block, or 0 if there is
+   no crypto plugin or it doesn't align clusters */
+reiser4_internal unsigned
+max_crypto_overhead(crypto_plugin * cplug, crypto_stat_t * stat)
+{
+	if (!cplug || !cplug->align_cluster)
+		return 0;
+	else {
+		assert("edward-494", stat != NULL);
+		return cplug->blocksize(stat->keysize);
+	}
+}
+
+/* returns the overrun of compression plugin @cplug, or 0 if there is no plugin */
+reiser4_internal unsigned
+compress_overhead(compression_plugin * cplug)
+{
+	return (cplug ? cplug->overrun : 0);
+}
+
+/* The following two functions represent reiser4 compression policy */
+
+/* returns true if the inode has a compression plugin and the cluster holds
+   at least MIN_SIZE_FOR_COMPRESSION bytes of data */
+static int
+try_compress(reiser4_cluster_t * clust, struct inode * inode)
+{
+	return (inode_compression_plugin(inode) && (clust->count >= MIN_SIZE_FOR_COMPRESSION));
+}
+
+/* Decide by the lengths of compressed and decompressed cluster, should we save or should
+   we discard the result of compression. The policy is that the length of compressed then
+   encrypted cluster including _all_ appended infrastructure should be _less_ than its length
+   before compression. */
+static int
+save_compressed(reiser4_cluster_t * clust, struct inode * inode)
+{
+/* NOTE: Actually we use max_crypto_overhead instead of precise overhead
+   (a bit stronger condition) to avoid divisions */
+	return (clust->len + CLUSTER_MAGIC_SIZE +
+		max_crypto_overhead(inode_crypto_plugin(inode), inode_crypto_stat(inode)) <
+		clust->count);
+}
+
+/* guess if the cluster was compressed: true if the inode has a compression
+   plugin and the cluster is shorter than the (scaled, if @encrypted) amount
+   of file data which belongs to it */
+static int
+need_decompression(reiser4_cluster_t * clust, struct inode * inode,
+		   int encrypted /* is cluster encrypted */)
+{
+	assert("edward-142", clust != 0);
+	assert("edward-143", inode != NULL);
+
+	return (inode_compression_plugin(inode) && clust->len <
+		(encrypted ?
+		 inode_scaled_offset(inode, fsize_to_count(clust, inode)) :
+		 fsize_to_count(clust, inode)));
+}
+
+/* fill CLUSTER_MAGIC_SIZE bytes at @magic with zeros */
+reiser4_internal void set_compression_magic(__u8 * magic)
+{
+	/* FIXME-EDWARD: If crypto_plugin != NULL, this should be private!
+	   Use 4 bytes of decrypted keyid. PARANOID? */
+	assert("edward-279", magic != NULL);
+	xmemset(magic, 0, CLUSTER_MAGIC_SIZE);
+}
+
+/*
+  Common cluster deflate manager.
+
+  . 
accept a flow as a single page or cluster of pages assembled into a buffer + of cluster handle @clust + . maybe allocate buffer @bf to store temporary results + . maybe compress accepted flow and attach compression magic if result of + compression is acceptable + . maybe align and encrypt the flow. + . stores the result in the buffer of cluster handle + _ _ _ _ _ _ _ _ + | | + | disk cluster | + |_ _ _ _ _ _ _ _| + ^ + | + _______________ _______|_______ _ _ _ _ _ _ _ _ + | | <---1--- | | | | + | @bf | ----2--> | @clust |<----| page cluster | + |_______________| ----3--> |_______________| |_ _ _ _ _ _ _ _| + ^ ^ + 4 | ______________ | 5 + | | | | + +---- | page | ----+ + |______________| + + + " --n-> " means one of the following operations on a pair of pointers (src, dst) + + 1 - compression or encryption + 2 - encryption + 3 - alternation + 4 - compression + 5 - compression or encryption or copy + + where + . compression is plugin->compress(), + . encryption is plugin->encrypt(), + . alternation is alternate_buffers() (if the final result is contained in temporary buffer @bf, + we should move it to the cluster handle @clust) + . 
copy is memcpy() + + + FIXME-EDWARD: Currently the only symmetric crypto algorithms with ecb are + supported +*/ + +reiser4_internal int +deflate_cluster(reiser4_cluster_t *clust, /* contains data to process */ + struct inode *inode) +{ + int result = 0; + __u8 * bf = NULL; + __u8 * src = NULL; + __u8 * dst = NULL; + size_t bfsize = clust->count; + struct page * pg = NULL; + + assert("edward-401", inode != NULL); + assert("edward-495", clust != NULL); + assert("edward-496", clust->count != 0); + assert("edward-497", clust->len == 0); + assert("edward-498", clust->buf && clust->bsize); + + if (try_compress(clust, inode)) { + /* try to compress, discard bad results */ + __u8 * wbuf; + __s32 dst_len; + compression_plugin * cplug = inode_compression_plugin(inode); + + assert("edward-602", cplug != NULL); + + if (inode_crypto_plugin(inode) || clust->nr_pages != 1) { + /* [12], [42], [13], tmp buffer is required */ + bfsize += cplug->overrun; + bf = reiser4_kmalloc(bfsize, GFP_KERNEL); + if (!bf) + return -ENOMEM; + dst = bf; + } + else + /* [5] */ + dst = clust->buf; + if (clust->nr_pages == 1) { + /* [42], [5] */ + assert("edward-619", clust->pages != NULL); + assert("edward-620", PageDirty(*clust->pages)); + + pg = *clust->pages; + lock_page(pg); + assert("edward-621", PageDirty(pg)); + src = kmap(pg); + } + else + /* [12], [13] */ + src = clust->buf; + + wbuf = reiser4_kmalloc(cplug->mem_req, GFP_KERNEL); + if (wbuf == NULL) { + result = -ENOMEM; + goto exit; + } + cplug->compress(wbuf, src, clust->count, dst/* res */, &dst_len); + + if (dst_len < 0) + goto discard; + + clust->len = dst_len; + + assert("edward-603", clust->len <= (bf ? 
bfsize : clust->bsize)); + + /* estimate compression quality to accept or discard + the results of our efforts */ + if (save_compressed(clust, inode)) { + /* Accepted */ + set_compression_magic(dst + clust->len); + clust->len += CLUSTER_MAGIC_SIZE; + } + else { + + discard: + clust->len = clust->count; + } + reiser4_kfree(wbuf); + } + + if (inode_crypto_plugin(inode)) { + /* align and encrypt */ + int oh; /* ohhh, the crypto alignment overhead */ + int i, icb, ocb; + __u32 * expkey; + crypto_plugin * cplug = inode_crypto_plugin(inode); + crypto_stat_t * stat = inode_crypto_stat(inode); + + assert("edward-604", stat != NULL); + + icb = cplug->blocksize(stat->keysize); + ocb = inode_scaled_offset(inode, icb); + + assert("edward-605", icb != 0); + + /* precise crypto-overhead */ + oh = crypto_overhead(0, clust, inode, WRITE_OP); + + if (dst) { + /* compression is specified */ + assert("edward-622", src != NULL); + assert("edward-623", bf != NULL && clust->len != 0); + assert("edward-624", clust->len <= clust->count); + + if (clust->len != clust->count) + /* saved */ + src = dst; + else + /* refused */ + ; + if (pg) { + /* release flushed page */ + assert("edward-625", PageLocked(pg)); + + kunmap(pg); + uncapture_page(pg); + unlock_page(pg); + page_cache_release(pg); + reiser4_kfree(clust->pages); + pg = NULL; + } + } + else { + /* [13], [5], compression wasn't specified */ + + assert("edward-626", !clust->len); + + if (clust->nr_pages != 1) { + /* [13], tmp buffer required */ + assert("edward-627", !bf); + + bfsize += oh; + bf = reiser4_kmalloc(bfsize, GFP_KERNEL); + if (!bf) { + result = -ENOMEM; + goto exit; + } + alternate_buffers(clust, &bf, &bfsize); + src = bf; + } + else { + /* [5] */ + pg = *clust->pages; + lock_page(pg); + assert("edward-628", PageDirty(pg)); + src = kmap(pg); + } + clust->len = clust->count; + } + + dst = clust->buf; + + if (oh) { + /* align the source */ + clust->len += cplug->align_cluster(src + clust->len, clust->len, icb); + + 
assert("edward-402", clust->len <= (pg ? PAGE_CACHE_SIZE : bfsize)); + + *(src + clust->len - 1) = oh; + } +#if REISER4_DEBUG + if (clust->len % icb) + impossible("edward-403", "bad alignment"); +#endif + + expkey = cryptcompress_inode_data(inode)->expkey; + + assert("edward-404", expkey != NULL); + + for (i=0; i < clust->len/icb; i++) + cplug->encrypt(expkey, clust->buf + i*ocb /* dst */, src + i*icb); + } + + else if (dst && clust->len != clust->count) { + /* [13], [5], saved compression, no encryption */ + if (bf) { + /* [13] */ + assert("edward-635", bf == dst); + assert("edward-636", !clust->pages); + alternate_buffers(clust, &bf, &bfsize); + } + } + else { + /* not specified or discarded compression, no encryption, + [13], [5], [] */ + + if (clust->nr_pages == 1) { + if (!pg) { + assert("edward-629", !src); + assert("edward-631", !clust->len); + /* -not specified, [5] */ + pg = *clust->pages; + lock_page(pg); + src = kmap(pg); + clust->len = clust->count; + } + else { + /* -discarded, [13] */ + assert("edward-630", src != NULL); + assert("edward-632", clust->len == clust->count); + } + xmemcpy(clust->buf, src, clust->count); + } + if (!clust->len) + /* not specified, [] */ + clust->len = clust->count; + } + exit: + if (bf) + reiser4_kfree(bf); + if (pg) { + assert("edward-621", PageLocked(pg)); + + kunmap(pg); + uncapture_page(pg); + unlock_page(pg); + page_cache_release(pg); + reiser4_kfree(clust->pages); + } + return result; +} + +/* Common inflate cluster manager. Is used in readpage() or readpages() methods of + cryptcompress object plugins. + . maybe allocate temporary buffer (@bf) + . maybe decrypt disk cluster (assembled in united flow of cluster handle) and + cut crypto-alignment overhead (if any) + . 
maybe check for compression magic and decompress
+
+   The final result is stored in the buffer of the cluster handle (@clust)
+   (which contained assembled disk cluster at the beginning of this procedure)
+   and is supposed to be sliced into page cluster by appropriate fillers, but if
+   cluster size is equal PAGE_SIZE we fill the single page (@pg) right here:
+
+                             _ _ _ _ _ _ _ _
+                            |               |
+                            | disk cluster  |
+                            |_ _ _ _ _ _ _ _|
+                                    |
+                                    |
+    ________________         _______V_______      _ _ _ _ _ _ _ _
+   |                | <---1--- |               |     |               |
+   |      @bf       | ----2--> |    @clust     |---->| page cluster  |
+   |________________| ----3--> |_______________|     |_ _ _ _ _ _ _ _|
+            |                          |
+          4 |      _______________     | 5
+            |     |               |    |
+            +---> |      @pg      | <---+
+                  |_______________|
+
+
+   " --n-> " means one of the following functions on a pair of pointers (src, dst):
+
+   1, 5 - decryption or decompression
+   2, 4 - decompression
+   3    - alternation
+
+   Where:
+
+   decryption is plugin->decrypt(),
+   decompression is plugin->decompress,
+   alternation is alternate_buffers()
+
+   Returns 0 on success (also for a FAKE_CLUSTER, which is left untouched, and
+   when the single page is found already uptodate), -ENOMEM if a temporary
+   buffer cannot be allocated, -EIO if the compression magic does not match.
+
+   On error, or when the single page turns out to be uptodate already, only
+   the page lock and kmap which were really taken here are dropped; the page
+   is neither written to nor marked uptodate.
+*/
+reiser4_internal int
+inflate_cluster(reiser4_cluster_t *clust, /* cluster handle, contains assembled
+					     disk cluster to process */
+		struct inode *inode)
+{
+	int result = 0;
+	int raced = 0;     /* set if the single page was found uptodate */
+	int pg_locked = 0; /* set once @pg has been locked by this function */
+	int pg_mapped = 0; /* set once @pg has been kmapped by this function */
+	__u8 * dst = NULL;
+	__u8 * bf = NULL; /* buffer to handle temporary results */
+	size_t bfsize = 0; /* size of the buffer above */
+	struct page * pg = NULL; /* pointer to a single page if cluster size
+				    is equal page size */
+	if (clust->stat == FAKE_CLUSTER)
+		return 0;
+
+	assert("edward-407", clust->buf != NULL);
+	assert("edward-408", clust->len != 0);
+
+	if (inode_crypto_plugin(inode) != NULL) {
+		/* decrypt */
+		int i;
+		int oh = 0;
+		int icb, ocb;
+		__u32 * expkey;
+		crypto_plugin * cplug = inode_crypto_plugin(inode);
+		crypto_stat_t * stat = inode_crypto_stat(inode);
+
+		assert("edward-616", stat != 0);
+		assert("edward-617", cplug != 0);
+
+		if (clust->nr_pages == 1)
+			pg = *clust->pages;
+		oh = crypto_overhead(0, clust, inode, READ_OP);
+
+		/* input/output crypto blocksizes */
+		icb = cplug->blocksize(stat->keysize);
+		ocb = inode_scaled_offset(inode, icb);
+
+		/* the loop below decrypts clust->len/ocb whole blocks, so the
+		   encrypted cluster must consist of whole blocks only */
+		assert("edward-608", clust->len % ocb == 0);
+
+		if (pg && !need_decompression(clust, inode,
+					      1 /* estimate for encrypted cluster */)) {
+			/* [5] */
+			assert("edward-609", clust->nr_pages == 1);
+			assert("edward-610", inode_cluster_size(inode) == PAGE_CACHE_SIZE);
+
+			lock_page(pg);
+			pg_locked = 1;
+			if (PageUptodate(pg)) {
+				/* races with other read/write */
+				raced = 1;
+				goto exit;
+			}
+			dst = kmap(pg);
+			pg_mapped = 1;
+		}
+		else { /* [12] or [13], tmp buffer is needed, estimate its size */
+			bfsize = fsize_to_count(clust, inode);
+			bfsize += crypto_overhead(bfsize, clust, inode, WRITE_OP);
+			bf = reiser4_kmalloc(bfsize, GFP_KERNEL);
+			if (!bf) {
+				result = -ENOMEM;
+				goto exit;
+			}
+			dst = bf;
+		}
+
+		/* decrypt cluster with the simplest mode
+		 * FIXME-EDWARD: call here stream mode plugin */
+
+		expkey = cryptcompress_inode_data(inode)->expkey;
+
+		assert("edward-141", expkey != NULL);
+
+		for (i=0; i < clust->len/ocb; i++)
+			cplug->decrypt(expkey, dst + i*icb /* dst */, clust->buf + i*ocb /* src */);
+
+		/* cut the alignment overhead */
+		clust->len -= crypto_overhead(0, clust, inode, READ_OP);
+	}
+	if (need_decompression(clust, inode, 0 /* estimate for decrypted cluster */)) {
+		unsigned dst_len = inode_cluster_size(inode);
+		__u8 * src = bf;
+		__u8 * wbuf;
+		__u8 magic[CLUSTER_MAGIC_SIZE];
+		compression_plugin * cplug = inode_compression_plugin(inode);
+
+		src = bf;
+
+		if (clust->nr_pages == 1)
+			pg = *clust->pages;
+
+		if (pg) {
+			/* [5] or [14] */
+			lock_page(pg);
+			pg_locked = 1;
+			if (PageUptodate(pg)) {
+				/* races with other read/write */
+				raced = 1;
+				goto exit;
+			}
+			dst = kmap(pg);
+			pg_mapped = 1;
+			if (!bf)
+				src = clust->buf;
+		}
+		else {
+			/* [12] or [13] */
+			if (!bf) {
+				/* [13], tmp buffer is needed, estimate its size */
+				bfsize = fsize_to_count(clust, inode);
+				bf = reiser4_kmalloc(bfsize, GFP_KERNEL);
+				if (!bf) {
+					result = -ENOMEM;
+					goto exit;
+				}
+				alternate_buffers(clust, &bf, &bfsize);
+			}
+			dst = clust->buf;
+			src = bf;
+		}
+
+		/* Check compression magic for possible IO errors.
+
+		   End-of-cluster format created before encryption:
+
+		   data
+		   compression_magic  (4)   Indicates presence of compression
+		                            infrastructure, should be private.
+		                            Can be absent.
+		   crypto_overhead          Created by ->align() method of crypto-plugin,
+		                            Can be absent.
+
+		   Crypto overhead format:
+
+		   data
+		   tail_size           (1)   size of aligning tail,
+		                             1 <= tail_size <= blksize
+		*/
+		set_compression_magic(magic);
+
+		/* a cluster shorter than the magic cannot carry it; checking
+		   the length first keeps the pointer below inside the buffer */
+		if (clust->len < (size_t)CLUSTER_MAGIC_SIZE ||
+		    memcmp(src + (clust->len - (size_t)CLUSTER_MAGIC_SIZE),
+			   magic, (size_t)CLUSTER_MAGIC_SIZE)) {
+			printk("edward-156: wrong compression magic\n");
+			result = -EIO;
+			goto exit;
+		}
+		clust->len -= (size_t)CLUSTER_MAGIC_SIZE;
+		/* decompress cluster */
+		wbuf = reiser4_kmalloc(cplug->mem_req, GFP_KERNEL);
+		if (wbuf == NULL) {
+			result = -ENOMEM;
+			goto exit;
+		}
+		cplug->decompress(wbuf, src, clust->len, dst, &dst_len);
+
+		/* check the length of decompressed data */
+		assert("edward-157", dst_len == fsize_to_count(clust, inode));
+
+		clust->len = dst_len;
+
+		reiser4_kfree(wbuf);
+	}
+ exit:
+	if (bf)
+		reiser4_kfree(bf);
+	if (clust->nr_pages != 1)
+		return result;
+
+	if (result || raced) {
+		/* failure, or somebody else has filled the page: undo exactly
+		   what was done to the page above and leave its data alone */
+		if (pg_mapped)
+			kunmap(pg);
+		if (pg_locked)
+			unlock_page(pg);
+		return result;
+	}
+
+	assert("edward-618", clust->len <= PAGE_CACHE_SIZE);
+
+	if (!pg) {
+		/* no encryption, no compression */
+		pg = *clust->pages;
+		lock_page(pg);
+		if (PageUptodate(pg)) {
+			/* races with other read/write */
+			unlock_page(pg);
+			return result;
+		}
+		dst = kmap(pg);
+		xmemcpy(dst, clust->buf, clust->len);
+	}
+
+	assert("edward-611", PageLocked(pg));
+	assert("edward-637", !PageUptodate(pg));
+	assert("edward-638", dst != NULL);
+
+	/* zero the rest of the page behind the cluster data */
+	xmemset(dst + clust->len, 0, (size_t)PAGE_CACHE_SIZE - clust->len);
+	kunmap(pg);
+	SetPageUptodate(pg);
+	unlock_page(pg);
+	return result;
+}
+
+/* plugin->read() :
+ * generic_file_read()
+ * All key offsets don't make sense in traditional unix semantics unless they
+ * represent the beginning of clusters, so the only thing we can do is start
+ * right from mapping to the address space (this is precisely what filemap
+ * generic method does) */
+
+/* 
plugin->readpage() */
+/* @vp is the struct file (may be NULL), @page is passed in locked.
+   Delegates to the ->readpage() method of the CTAIL item plugin.
+   Every early return made here unlocks @page first. */
+reiser4_internal int
+readpage_cryptcompress(void *vp, struct page *page)
+{
+	reiser4_cluster_t clust;
+	struct file * file;
+	item_plugin * iplug;
+	int result;
+
+	assert("edward-88", PageLocked(page));
+	assert("edward-89", page->mapping && page->mapping->host);
+
+	file = vp;
+	if (file)
+		assert("edward-113", page->mapping == file->f_dentry->d_inode->i_mapping);
+
+	if (PageUptodate(page)) {
+		printk("readpage_cryptcompress: page became already uptodate\n");
+		unlock_page(page);
+		return 0;
+	}
+	reiser4_cluster_init(&clust);
+
+	iplug = item_plugin_by_id(CTAIL_ID);
+	if (!iplug->s.file.readpage) {
+		/* do not leave the page locked behind us, same as in the
+		   uptodate case above */
+		unlock_page(page);
+		return -EINVAL;
+	}
+
+	result = iplug->s.file.readpage(&clust, page);
+
+	assert("edward-64", ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
+	/* if page has jnode - that jnode is mapped
+	   assert("edward-65", ergo(result == 0 && PagePrivate(page),
+	   jnode_mapped(jprivate(page))));
+	*/
+	return result;
+}
+
+/* plugin->readpages()
+   Passes @mapping and @pages to ->readpages() of the CTAIL item plugin;
+   @file is not used. */
+reiser4_internal void
+readpages_cryptcompress(struct file *file UNUSED_ARG, struct address_space *mapping,
+			struct list_head *pages)
+{
+	item_plugin *iplug;
+
+	iplug = item_plugin_by_id(CTAIL_ID);
+	iplug->s.file.readpages(NULL, mapping, pages);
+	return;
+}
+
+/* For the first *@num pages of the cluster (all clust->nr_pages of them if
+   @num is NULL): under the page lock mark the page dirty, uptodate and
+   accessed, then drop one page reference. */
+static void
+set_cluster_pages_dirty(reiser4_cluster_t * clust, int * num)
+{
+	int i, nr;
+	struct page * pg;
+
+	nr = (num ? *num : clust->nr_pages);
+
+	for (i=0; i < nr; i++) {
+
+		pg = clust->pages[i];
+
+		lock_page(pg);
+
+		set_page_dirty_internal(pg);
+		SetPageUptodate(pg);
+		mark_page_accessed(pg);
+
+		unlock_page(pg);
+
+		page_cache_release(pg);
+	}
+}
+
+/* This is the interface to capture cluster nodes via their struct page reference.
+   Any two blocks of the same cluster contain dependent modification and should
+   commit at the same time.
+   Handles the first *@num pages (all of them if @num is NULL). If capturing
+   of some jnode fails, that jnode is put, the jnodes captured before it are
+   uncaptured and the error is returned. */
+static int
+try_capture_cluster(reiser4_cluster_t * clust, int * num)
+{
+	int i, nr;
+	int result = 0;
+
+	nr = (num ? *num : clust->nr_pages);
+
+	for (i=0; i < nr; i++) {
+		jnode * node;
+		struct page *pg;
+
+		pg = clust->pages[i];
+		node = jprivate(pg);
+
+		assert("edward-220", node != NULL);
+
+		LOCK_JNODE(node);
+
+		result = try_capture(node, ZNODE_WRITE_LOCK, 0/* not non-blocking */, 0 /* no can_coc */);
+		if (result) {
+			UNLOCK_JNODE(node);
+			jput(node);
+			break;
+		}
+		UNLOCK_JNODE(node);
+	}
+	if(result)
+		/* drop nodes */
+		while(i) {
+			i--;
+			uncapture_jnode(jprivate(clust->pages[i]));
+		}
+	return result;
+}
+
+/* Make dirty and then put the jnodes of the first *@num cluster pages
+   (of all the pages if @num is NULL). */
+static void
+make_cluster_jnodes_dirty(reiser4_cluster_t * clust, int *num)
+{
+	int i, nr;
+	jnode * node;
+
+	nr = (num? *num : clust->nr_pages);
+
+	for (i=0; i < nr; i++) {
+		node = jprivate(clust->pages[i]);
+
+		assert("edward-221", node != NULL);
+
+		LOCK_JNODE(node);
+		jnode_make_dirty_locked(node);
+		UNLOCK_JNODE(node);
+
+		jput(node);
+	}
+}
+
+/* collect unlocked cluster pages and jnodes:
+   fills clust->pages[0 .. nr_pages-1] by grab_cache_page(), gets a jnode for
+   every page and sets JNODE_CLUSTER_PAGE on it. On failure the pages and
+   jnodes collected so far are released and the error is returned. */
+static int
+grab_cache_cluster(struct inode * inode, reiser4_cluster_t * clust)
+{
+	int i;
+	int result = 0;
+	jnode * node;
+
+	assert("edward-182", clust != NULL);
+	assert("edward-183", clust->pages != NULL);
+	assert("edward-437", clust->nr_pages != 0);
+	/* "0 < a <= b" would compare the 0/1 result of "0 < a" with b */
+	assert("edward-184", 0 < clust->nr_pages &&
+	       clust->nr_pages <= inode_cluster_pages(inode));
+
+	for (i = 0; i < clust->nr_pages; i++) {
+		clust->pages[i] = grab_cache_page(inode->i_mapping, clust_to_pg(clust->index, inode) + i);
+		if (!(clust->pages[i])) {
+			result = RETERR(-ENOMEM);
+			break;
+		}
+		node = jnode_of_page(clust->pages[i]);
+		unlock_page(clust->pages[i]);
+		if (IS_ERR(node)) {
+			page_cache_release(clust->pages[i]);
+			result = PTR_ERR(node);
+			break;
+		}
+		LOCK_JNODE(node);
+		JF_SET(node, JNODE_CLUSTER_PAGE);
+		UNLOCK_JNODE(node);
+	}
+	if (result) {
+		while(i) {
+			i--;
+			page_cache_release(clust->pages[i]);
+			assert("edward-222", jprivate(clust->pages[i]) != NULL);
+			jput(jprivate(clust->pages[i]));
+		}
+	}
+	return result;
+}
+
+/* Set JNODE_NEW on the jnode of the first cluster page; @inode is not used. */
+static void
+set_cluster_unlinked(reiser4_cluster_t * clust, struct inode * inode)
+{
+	jnode * node;
+
+	node = jprivate(clust->pages[0]);
+
+	assert("edward-640", node);
+
+	LOCK_JNODE(node);
+	JF_SET(node, JNODE_NEW);
+	UNLOCK_JNODE(node);
+}
+
+/* put the jnodes of all cluster pages (the pages themselves are kept) */
+static void
+put_cluster_jnodes(reiser4_cluster_t * clust)
+{
+	int i;
+
+	assert("edward-223", clust != NULL);
+
+	for (i=0; i < clust->nr_pages; i++) {
+
+		assert("edward-208", clust->pages[i] != NULL);
+		assert("edward-224", jprivate(clust->pages[i]) != NULL);
+
+		jput(jprivate(clust->pages[i]));
+	}
+}
+
+/* release cluster pages starting from index @from (jnodes are not put here) */
+static void
+release_cluster_pages(reiser4_cluster_t * clust, int from)
+{
+	int i;
+
+	assert("edward-447", clust != NULL);
+	assert("edward-448", from < clust->nr_pages);
+
+	for (i = from; i < clust->nr_pages; i++) {
+
+		assert("edward-449", clust->pages[i] != NULL);
+
+		page_cache_release(clust->pages[i]);
+	}
+}
+
+/* release all cluster pages and put their jnodes */
+static void
+release_cluster(reiser4_cluster_t * clust)
+{
+	int i;
+
+	assert("edward-445", clust != NULL);
+
+	for (i=0; i < clust->nr_pages; i++) {
+
+		assert("edward-446", clust->pages[i] != NULL);
+		assert("edward-447", jprivate(clust->pages[i]) != NULL);
+
+		page_cache_release(clust->pages[i]);
+		jput(jprivate(clust->pages[i]));
+	}
+}
+
+/* debugging purposes: returns non-zero if the (off, count, delta) frame of
+   @clust fits into one cluster of @inode and delta is set for holes only */
+#if REISER4_DEBUG
+reiser4_internal int
+cluster_invariant(reiser4_cluster_t * clust, struct inode * inode)
+{
+	assert("edward-279", clust != NULL);
+
+	return (clust->pages != NULL &&
+		clust->off < inode_cluster_size(inode) &&
+		ergo(clust->delta != 0, clust->stat == HOLE_CLUSTER) &&
+		clust->off + clust->count + clust->delta <= inode_cluster_size(inode));
+}
+#endif
+
+/* guess next cluster status: a hole without pending data (delta == 0) is
+   followed by a hole, everything else by data */
+static inline reiser4_cluster_status
+next_cluster_stat(reiser4_cluster_t * clust)
+{
+	return (clust->stat == HOLE_CLUSTER && clust->delta == 0 /* no non-zero data */ ?
+		HOLE_CLUSTER : DATA_CLUSTER);
+}
+
+/* guess next cluster params:
+   . data cluster: step to the next cluster index and take up to @to_file bytes
+   . hole followed by hole: position the handle by @file_off
+   . hole followed by data: stay in the same cluster, the data frame starts
+     right after the hole */
+static void
+update_cluster(struct inode * inode, reiser4_cluster_t * clust, loff_t file_off, loff_t to_file)
+{
+	assert ("edward-185", clust != NULL);
+	assert ("edward-438", clust->pages != NULL);
+	assert ("edward-281", cluster_invariant(clust, inode));
+
+	switch (clust->stat) {
+	case DATA_CLUSTER:
+		/* increment */
+		clust->stat = DATA_CLUSTER;
+		clust->off = 0;
+		clust->index++;
+		clust->count = min_count(inode_cluster_size(inode), to_file);
+		/* size of one array element, not of the array pointer */
+		xmemset(clust->pages, 0, sizeof(*clust->pages) << inode_cluster_shift(inode));
+		break;
+	case HOLE_CLUSTER:
+		switch(next_cluster_stat(clust)) {
+		case HOLE_CLUSTER:
+			/* skip */
+			clust->stat = HOLE_CLUSTER;
+			clust->off = 0;
+			clust->index = off_to_clust(file_off, inode);
+			clust->count = off_to_cloff(file_off, inode);
+			clust->delta = min_count(inode_cluster_size(inode) - clust->count, to_file);
+			xmemset(clust->pages, 0, sizeof(*clust->pages) << inode_cluster_shift(inode));
+			break;
+		case DATA_CLUSTER:
+			/* keep immovability */
+			clust->stat = DATA_CLUSTER;
+			clust->off = clust->off + clust->count;
+			clust->count = clust->delta;
+			clust->delta = 0;
+			break;
+		default:
+			impossible ("edward-282", "wrong next cluster status");
+		}
+		/* without this break every hole update used to fall through
+		   into the "impossible" branch below */
+		break;
+	default:
+		impossible ("edward-283", "wrong current cluster status");
+	}
+}
+
+/* Reserve disk space for the cluster unless it is reserved already.
+   JNODE_CREATED on the jnode of the first cluster page is the "reserved"
+   mark. Returns 0 on success or the error of reiser4_grab_space_force(). */
+static int
+__reserve4cluster(struct inode * inode, reiser4_cluster_t * clust)
+{
+	int result = 0;
+	jnode * j;
+
+	assert("edward-439", inode != NULL);
+	assert("edward-440", clust != NULL);
+	assert("edward-441", clust->pages != NULL);
+	assert("edward-442", jprivate(clust->pages[0]) != NULL);
+
+	j = jprivate(clust->pages[0]);
+
+	LOCK_JNODE(j);
+	if (JF_ISSET(j, JNODE_CREATED)) {
+		/* jnode mapped <=> space reserved */
+		UNLOCK_JNODE(j);
+		return 0;
+	}
+	result = reiser4_grab_space_force(
+		/* estimate_insert_flow(current_tree->height) + estimate_one_insert_into_item(current_tree) */
+		estimate_insert_cluster(inode), 0);
+	if (result) {
+		/* do not return with the jnode still locked */
+		UNLOCK_JNODE(j);
+		return result;
+	}
+	JF_SET(j, JNODE_CREATED);
+
+	grabbed2cluster_reserved(estimate_insert_cluster(inode));
+
+	UNLOCK_JNODE(j);
+	return 0;
+}
+
+/* @msg is dropped in both configurations */
+#if REISER4_TRACE
+#define reserve4cluster(inode, clust, msg) __reserve4cluster(inode, clust)
+#else
+#define reserve4cluster(inode, clust, msg) __reserve4cluster(inode, clust)
+#endif
+
+/* Undo __reserve4cluster(): move the cluster reservation back to free space
+   and clear JNODE_CREATED on the jnode of the first cluster page. */
+static void
+free_reserved4cluster(struct inode * inode, reiser4_cluster_t * clust)
+{
+	jnode * j;
+
+	j = jprivate(clust->pages[0]);
+
+	LOCK_JNODE(j);
+
+	assert("edward-443", jnode_is_cluster_page(j));
+	assert("edward-444", JF_ISSET(j, JNODE_CREATED));
+
+	cluster_reserved2free(estimate_insert_cluster(inode));
+	JF_CLR(j, JNODE_CREATED);
+	UNLOCK_JNODE(j);
+}
+
+/* Grab space for a stat data update and, if @do_update is set, store
+   @new_size and the current time in @inode and write its stat data.
+   Whatever was grabbed here is freed before return.
+   @update_i_size and @update_times are not looked at. */
+static int
+update_inode_cryptcompress(struct inode *inode,
+			   loff_t new_size,
+			   int update_i_size, int update_times,
+			   int do_update)
+{
+	int result = 0;
+	int old_grabbed;
+	reiser4_context *ctx = get_current_context();
+	reiser4_super_info_data * sbinfo = get_super_private(ctx->super);
+
+	old_grabbed = ctx->grabbed_blocks;
+
+	grab_space_enable();
+
+	result = reiser4_grab_space(/* one for stat data update */
+				    estimate_update_common(inode),
+				    0/* flags */);
+	if (result)
+		return result;
+	if (do_update) {
+		INODE_SET_FIELD(inode, i_size, new_size);
+		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+		result = reiser4_update_sd(inode);
+	}
+	grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - old_grabbed);
+	return result;
+}
+
+/* stick pages into united flow, then release the ones.
+   Allocates clust->buf. A cluster which fits into one page is not copied:
+   the page is only looked up and stored in a freshly allocated clust->pages.
+   Returns 0 or -ENOMEM. */
+reiser4_internal int
+flush_cluster_pages(reiser4_cluster_t * clust, struct inode * inode)
+{
+	int i;
+
+	assert("edward-236", inode != NULL);
+	assert("edward-237", clust != NULL);
+	assert("edward-238", clust->off == 0);
+	assert("edward-239", clust->count == 0);
+	assert("edward-240", clust->delta == 0);
+	assert("edward-241", schedulable());
+
+	clust->count = fsize_to_count(clust, inode);
+	set_nrpages_by_frame(clust);
+
+	cluster_reserved2grabbed(estimate_insert_cluster(inode));
+
+	/* estimate max size of the cluster after compression and encryption
+	   including all appended infrastructure, and allocate a buffer */
+	clust->bsize = clust->count +
+		max_crypto_overhead(inode_crypto_plugin(inode), inode_crypto_stat(inode));
+	clust->bsize = inode_scaled_offset(inode, clust->bsize);
+
+	if (clust->bsize > inode_scaled_cluster_size(inode))
+		clust->bsize = inode_scaled_cluster_size(inode);
+	if (try_compress(clust, inode))
+		clust->bsize += compress_overhead(inode_compression_plugin(inode));
+
+	clust->buf = reiser4_kmalloc(clust->bsize, GFP_KERNEL);
+	if (!clust->buf)
+		return -ENOMEM;
+
+	if (clust->count <= PAGE_CACHE_SIZE) {
+		/* delay flushing of a single page */
+		assert("edward-612", clust->nr_pages == 1);
+
+		clust->pages = reiser4_kmalloc(sizeof(*clust->pages), GFP_KERNEL);
+		if (!clust->pages) {
+			reiser4_kfree(clust->buf);
+			return -ENOMEM;
+		}
+		*clust->pages = find_get_page(inode->i_mapping, clust_to_pg(clust->index, inode));
+
+		assert("edward-613", *clust->pages != NULL);
+		assert("edward-614", PageDirty(*clust->pages));
+
+		return 0;
+	}
+
+	/* flush more then one page after its assembling into united flow */
+	for (i=0; i < clust->nr_pages; i++){
+		struct page * page;
+		char * data;
+
+		page = find_get_page(inode->i_mapping, clust_to_pg(clust->index, inode) + i);
+
+		assert("edward-242", page != NULL);
+		assert("edward-243", PageDirty(page));
+		assert("edward-634", clust->count <= clust->bsize);
+		/* FIXME_EDWARD: Make sure that jnodes are from the same dirty list */
+
+		lock_page(page);
+		data = kmap(page);
+		xmemcpy(clust->buf + pg_to_off(i), data, off_to_pgcount(clust->count, i));
+		kunmap(page);
+		uncapture_page(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	return 0;
+}
+
+/* set zeroes to the cluster, update it, and maybe, try to capture its pages.
+   Zeroes clust->count bytes starting at clust->off in the cluster pages; if
+   no user data follows in this cluster (delta == 0) the pages are dirtied and
+   captured and the inode is updated. On success the handle is advanced by
+   update_cluster(). */
+static int
+write_hole(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off, loff_t to_file)
+{
+	char * data;
+	int result = 0;
+	unsigned cl_off, cl_count = 0;
+	unsigned to_pg, pg_off;
+
+	assert ("edward-190", clust != NULL);
+	assert ("edward-191", inode != NULL);
+	assert ("edward-192", cluster_invariant(clust, inode));
+	assert ("edward-201", clust->stat == HOLE_CLUSTER);
+
+	if (clust->off == 0 && clust->count == inode_cluster_size(inode)) {
+		/* fake cluster, just update it */
+		goto update;
+	}
+
+	if (clust->count == 0) {
+		/* nothing to write */
+		goto update;
+	}
+	cl_count = clust->count; /* number of zeroes to write */
+	cl_off = clust->off;
+	pg_off = off_to_pgoff(clust->off);
+
+	while (cl_count) {
+		struct page * page;
+		page = clust->pages[off_to_pg(cl_off)];
+
+		assert ("edward-284", page != NULL);
+
+		to_pg = min_count(PAGE_CACHE_SIZE - pg_off, cl_count);
+		lock_page(page);
+		data = kmap_atomic(page, KM_USER0);
+		xmemset(data + pg_off, 0, to_pg);
+		kunmap_atomic(data, KM_USER0);
+		unlock_page(page);
+
+		cl_off += to_pg;
+		cl_count -= to_pg;
+		pg_off = 0;
+	}
+	if (!clust->delta) {
+		/* only zeroes, try to flush */
+
+		set_cluster_pages_dirty(clust, NULL);
+		result = try_capture_cluster(clust, NULL);
+		if (result)
+			return result;
+		make_cluster_jnodes_dirty(clust, NULL);
+		result = update_inode_cryptcompress(inode, clust_to_off(clust->index, inode) + clust->off + clust->count, 1, 1, 1);
+		if (result)
+			return result;
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+	}
+ update:
+	update_cluster(inode, clust, file_off, to_file);
+	return 0;
+}
+
+/*
+  This is the main disk search procedure for cryptcompress plugins, which
+  . finds all items of a disk cluster,
+  . maybe reads each of them to the flow (if @read != 0)
+  . 
maybe makes each znode dirty (if @write != 0)
+
+  Returns 0 if at least one item was found (clust->len is then set to the
+  number of bytes of the disk cluster which were passed) and also if the
+  very first item of the cluster is absent (with @read the cluster is then
+  marked FAKE_CLUSTER). Other results of find_cluster_item(), zload_ra() and
+  of the item ->read() method are returned as is.
+  The file hint is saved and all grabbed space is freed on every exit made
+  after the hint was loaded.
+*/
+reiser4_internal int
+find_cluster(reiser4_cluster_t * clust,
+	     struct inode * inode,
+	     int read,
+	     int write)
+{
+	flow_t f;
+	lock_handle lh;
+	hint_t hint;
+	int result;
+	unsigned long cl_idx;
+	ra_info_t ra_info;
+	file_plugin * fplug;
+	item_plugin * iplug;
+
+	assert("edward-225", read || write);
+	assert("edward-226", schedulable());
+	assert("edward-137", inode != NULL);
+	assert("edward-138", clust != NULL);
+	assert("edward-461", ergo(read, clust->buf != NULL));
+	assert("edward-462", ergo(!read, !cluster_is_uptodate(clust)));
+	assert("edward-474", get_current_context()->grabbed_blocks == 0);
+
+	cl_idx = clust->index;
+	fplug = inode_file_plugin(inode);
+	iplug = item_plugin_by_id(CTAIL_ID);
+	/* build flow for the cluster */
+	fplug->flow_by_inode(inode, clust->buf, 0 /* kernel space */,
+			     inode_scaled_cluster_size(inode), clust_to_off(cl_idx, inode), READ_OP, &f);
+	result = load_file_hint(clust->file, &hint, &lh);
+	if (result)
+		return result;
+	if (write) {
+		result = reiser4_grab_space_force(estimate_disk_cluster(inode), 0);
+		if (result)
+			goto out2;
+	}
+	/* readahead is allowed up to the maximal key offset */
+	ra_info.key_to_stop = f.key;
+	set_key_offset(&ra_info.key_to_stop, get_key_offset(max_key()));
+
+	while (f.length) {
+		result = find_cluster_item(&hint, &f.key, (write ? ZNODE_WRITE_LOCK : ZNODE_READ_LOCK), &ra_info, FIND_EXACT);
+		switch (result) {
+		case CBK_COORD_NOTFOUND:
+			if (inode_scaled_offset(inode, clust_to_off(cl_idx, inode)) == get_key_offset(&f.key)) {
+				/* first item not found */
+				if (read)
+					/* hole cluster */
+					clust->stat = FAKE_CLUSTER;
+				result = 0;
+				goto out2;
+			}
+			/* we are outside the cluster, stop search here */
+			assert("edward-146", f.length != inode_scaled_cluster_size(inode));
+			done_lh(&lh);
+			goto ok;
+		case CBK_COORD_FOUND:
+			assert("edward-148", hint.coord.base_coord.between == AT_UNIT);
+			assert("edward-460", hint.coord.base_coord.unit_pos == 0);
+
+			coord_clear_iplug(&hint.coord.base_coord);
+			result = zload_ra(hint.coord.base_coord.node, &ra_info);
+			if (unlikely(result))
+				goto out2;
+			assert("edward-147", item_plugin_by_coord(&hint.coord.base_coord) == iplug);
+			if (read) {
+				result = iplug->s.file.read(NULL, &f, &hint);
+				if(result)
+					goto out;
+			}
+			if (write) {
+				znode_make_dirty(hint.coord.base_coord.node);
+				znode_set_squeezable(hint.coord.base_coord.node);
+				if (!read)
+					/* nobody has consumed the flow, skip
+					   the units of this item by hand */
+					move_flow_forward(&f, iplug->b.nr_units(&hint.coord.base_coord));
+			}
+			zrelse(hint.coord.base_coord.node);
+			done_lh(&lh);
+			break;
+		default:
+			goto out2;
+		}
+	}
+ ok:
+	/* at least one item was found  */
+	/* NOTE-EDWARD:
+	   Callers should handle the case when disk cluster is incomplete (-EIO) */
+	clust->len = inode_scaled_cluster_size(inode) - f.length;
+	save_file_hint(clust->file, &hint);
+	all_grabbed2free();
+	return 0;
+ out:
+	zrelse(hint.coord.base_coord.node);
+ out2:
+	done_lh(&lh);
+	save_file_hint(clust->file, &hint);
+	all_grabbed2free();
+	return result;
+}
+
+/* Read before write.
+ We don't take an interest in how much bytes was written when error occures */ +static int +read_some_cluster_pages(struct inode * inode, reiser4_cluster_t * clust) +{ + int i; + int result = 0; + unsigned to_read; + item_plugin * iplug; + + iplug = item_plugin_by_id(CTAIL_ID); + + if (new_cluster(clust, inode)) + /* new cluster, nothing to read */ + return 0; + /* bytes we wanna read starting from the beginning of cluster + to keep first @off ones */ + to_read = clust->off + clust->count + clust->delta; + + assert("edward-298", to_read <= inode_cluster_size(inode)); + + for (i = 0; i < clust->nr_pages; i++) { + struct page * pg = clust->pages[i]; + + if (clust->off <= pg_to_off(i) && pg_to_off(i) <= to_read - 1) + /* page will be completely overwritten */ + continue; + lock_page(pg); + if (PageUptodate(pg)) { + unlock_page(pg); + continue; + } + unlock_page(pg); + + if (!cluster_is_uptodate(clust)) { + /* read cluster and mark its znodes dirty */ + result = ctail_read_cluster(clust, inode, 1 /* write */); + if (result) + goto out; + } + lock_page(pg); + result = do_readpage_ctail(clust, pg); + unlock_page(pg); + if (result) { + impossible("edward-219", "do_readpage_ctail returned crap"); + goto out; + } + } + if (!cluster_is_uptodate(clust)) + /* disk cluster unclaimed, make its znodes dirty */ + find_cluster(clust, inode, 0 /* do not read */, 1 /*write */); + out: + release_cluster_buf(clust, inode); + return result; +} + +/* Prepare before write. Called by write, writepage, truncate, etc.. + . grab cluster pages, + . maybe read pages from disk, + . 
maybe write hole
+
+   Returns 0 on success, including the case when the handle covers no pages
+   at all (nothing is done then). If grabbing of pages fails, the error is
+   returned as is. On a later failure the reservation (if it was made) is
+   freed and the jnodes of the cluster are put before the error is returned.
+   @msg is only handed to reserve4cluster().
+*/
+static int
+prepare_cluster(struct inode *inode,
+		loff_t file_off /* write position in the file */,
+		loff_t to_file, /* bytes of users data to write to the file */
+		int * nr_pages, /* advised number of pages */
+		reiser4_cluster_t *clust,
+		const char * msg)
+
+{
+	char *data;
+	int result = 0;
+	unsigned o_c_d; /* clust->count + clust->delta, sampled before anything
+			   below can change the handle */
+
+	assert("edward-177", inode != NULL);
+	assert("edward-280", cluster_invariant(clust, inode));
+
+	o_c_d = clust->count + clust->delta;
+
+	if (nr_pages != NULL) {
+		assert("edward-422", *nr_pages <= inode_cluster_pages(inode));
+		clust->nr_pages = *nr_pages;
+	}
+	else
+		/* wasn't advised, guess by frame */
+		set_nrpages_by_frame(clust);
+	if(!clust->nr_pages)
+		/* do nothing */
+		return 0;
+	/* collect unlocked pages and jnodes */
+	result = grab_cache_cluster(inode, clust);
+	if (result)
+		return result;
+	if (clust->off == 0 && inode->i_size <= clust_to_off(clust->index, inode) + o_c_d) {
+		/* we don't need to read cluster from disk, just
+		   align the current chunk of data up to nr_pages */
+		unsigned off = off_to_pgcount(o_c_d, clust->nr_pages - 1);
+		struct page * pg = clust->pages[clust->nr_pages - 1];
+		crypto_plugin * cplug = inode_crypto_plugin(inode);
+
+		assert("edward-285", pg != NULL);
+
+		/* fill the tail of the last page behind the data: by the
+		   crypto plugin if there is one, by zeroes otherwise */
+		lock_page(pg);
+		data = kmap_atomic(pg, KM_USER0);
+		if (cplug)
+			cplug->align_cluster(data + off, off, PAGE_CACHE_SIZE);
+		else
+			xmemset(data + off, 0, PAGE_CACHE_SIZE - off);
+		kunmap_atomic(data, KM_USER0);
+		unlock_page(pg);
+	}
+	result = reserve4cluster(inode, clust, msg);
+	if (result)
+		goto exit1;
+	result = read_some_cluster_pages(inode, clust);
+	if (result)
+		goto exit2;
+	if (new_cluster(clust, inode) || clust->stat == FAKE_CLUSTER)
+		set_cluster_unlinked(clust, inode);
+	if (clust->stat == HOLE_CLUSTER)
+		result = write_hole(inode, clust, file_off, to_file);
+	if (!result)
+		return 0;
+ exit2:
+	/* undo the reservation made by reserve4cluster() */
+	free_reserved4cluster(inode, clust);
+ exit1:
+	put_cluster_jnodes(clust);
+	return result;
+}
+
+/* get cluster handle 
params by two offsets:
+   the frame starts at @o1 and takes o2 - o1 bytes, but not more than is left
+   in the cluster of @o1; delta is reset */
+static void
+clust_by_offs(reiser4_cluster_t * clust, struct inode * inode, loff_t o1, loff_t o2)
+{
+	assert("edward-295", clust != NULL);
+	assert("edward-296", inode != NULL);
+	assert("edward-297", o1 <= o2);
+
+	clust->index = off_to_clust(o1, inode);
+	clust->off = off_to_cloff(o1, inode);
+	clust->count = min_count(inode_cluster_size(inode) - clust->off, o2 - o1);
+	clust->delta = 0;
+}
+
+/* Set up the handle for the first cluster touched by a write of flow @f at
+   @file_off and clear the page array.
+   If @file_off is behind the end of file, the handle describes the hole
+   between i_size and @file_off (HOLE_CLUSTER), with the user data which
+   follows in the same cluster counted in delta. Otherwise it describes the
+   user data (DATA_CLUSTER). */
+static void
+set_cluster_params(struct inode * inode, reiser4_cluster_t * clust, flow_t * f, loff_t file_off)
+{
+	assert("edward-197", clust != NULL);
+	assert("edward-286", clust->pages != NULL);
+	assert("edward-198", inode != NULL);
+
+	/* size of one array element, not of the array pointer */
+	xmemset(clust->pages, 0, sizeof(*clust->pages) << inode_cluster_shift(inode));
+
+	if (file_off > inode->i_size) {
+		/* Uhmm, hole in crypto-file... */
+		loff_t hole_size;
+		hole_size = file_off - inode->i_size;
+
+		/* %llu wants unsigned long long, loff_t is signed */
+		printk("edward-176, Warning: Hole of size %llu in "
+		       "cryptocompressed file (inode %llu, offset %llu) \n",
+		       (unsigned long long)hole_size,
+		       (unsigned long long)get_inode_oid(inode),
+		       (unsigned long long)file_off);
+
+		clust_by_offs(clust, inode, inode->i_size, file_off);
+		clust->stat = HOLE_CLUSTER;
+		if (clust->off + hole_size < inode_cluster_size(inode))
+			/* besides there is also user's data to write to this cluster */
+			clust->delta = min_count(inode_cluster_size(inode) - (clust->off + clust->count), f->length);
+		return;
+	}
+	clust_by_offs(clust, inode, file_off, file_off + f->length);
+	clust->stat = DATA_CLUSTER;
+}
+
+/* Main write procedure for cryptcompress objects,
+   this slices user's data into clusters and copies to page cache.
+ If @buf != NULL, returns number of bytes in successfully written clusters, + otherwise returns error */ +/* FIXME_EDWARD replace flow by something lightweigth */ + +static loff_t +write_cryptcompress_flow(struct file * file , struct inode * inode, const char *buf, size_t count, loff_t pos) +{ + int i; + flow_t f; + int result = 0; + size_t to_write = 0; + loff_t file_off; + reiser4_cluster_t clust; + struct page ** pages; + static int cnt = 0; + + cnt++; /* FIXME-EDWARD: Remove me */ + + assert("edward-159", current_blocksize == PAGE_CACHE_SIZE); + + pages = reiser4_kmalloc(sizeof(*pages) << inode_cluster_shift(inode), GFP_KERNEL); + if (!pages) + return -ENOMEM; + result = flow_by_inode_cryptcompress(inode, (char *)buf, 1 /* user space */, count, pos, WRITE_OP, &f); + if (result) + goto exit; + to_write = f.length; + + /* current write position in file */ + file_off = pos; + reiser4_cluster_init(&clust); + clust.file = file; + clust.pages = pages; + + set_cluster_params(inode, &clust, &f, file_off); + + if (next_cluster_stat(&clust) == HOLE_CLUSTER) { + result = prepare_cluster(inode, file_off, f.length, NULL, &clust, "write cryptcompress hole"); + if (result) + goto exit; + } + do { + char *src; + unsigned page_off, page_count; + + result = prepare_cluster(inode, file_off, f.length, NULL, &clust, "write cryptcompress flow"); /* jp+ */ + if (result) + goto exit; + assert("edward-204", clust.stat == DATA_CLUSTER); + assert("edward-161", schedulable()); + + /* set write position in page */ + page_off = off_to_pgoff(clust.off); + + /* copy user's data to cluster pages */ + for (i = off_to_pg(clust.off), src = f.data; i < count_to_nrpages(clust.off + clust.count); i++, src += (int)PAGE_CACHE_SIZE) { + page_count = min_count(PAGE_CACHE_SIZE - page_off, clust.count); + + assert("edward-287", pages[i] != NULL); + + lock_page(pages[i]); + result = __copy_from_user((char *)kmap(pages[i]) + page_off, src, page_count); + kunmap(pages[i]); + if (unlikely(result)) { + 
unlock_page(pages[i]); + result = -EFAULT; + release_cluster(&clust); /* jp- */ + goto exit1; + } + unlock_page(pages[i]); + page_off = 0; + } + + set_cluster_pages_dirty(&clust, NULL); /* p- */ + + result = try_capture_cluster(&clust, NULL); + if (result) + goto exit2; + + make_cluster_jnodes_dirty(&clust, NULL); /* j- */ + + result = update_inode_cryptcompress(inode, + clust_to_off(clust.index, inode) + clust.off + clust.count /* new_size */, + (clust_to_off(clust.index, inode) + clust.off + clust.count > inode->i_size) ? 1 : 0, + 1, + 1/* update stat data */); + if (result) + goto exit1; + balance_dirty_pages_ratelimited(inode->i_mapping); + + move_flow_forward(&f, clust.count); + update_cluster(inode, &clust, 0, f.length); + continue; + exit2: + put_cluster_jnodes(&clust); /* j- */ + exit1: + free_reserved4cluster(inode, &clust); + break; + } while (f.length); + + exit: + if (result == -EEXIST) + printk("write returns EEXIST!\n"); + + reiser4_kfree(pages); + + if (buf) { + /* if nothing were written - there must be an error */ + assert("edward-195", ergo((to_write == f.length), result < 0)); + return (to_write - f.length) ? 
(to_write - f.length) : result; + } + return result; +} + +static ssize_t +write_crc_file(struct file * file, /* file to write to */ + struct inode *inode, /* inode */ + const char *buf, /* address of user-space buffer */ + size_t count, /* number of bytes to write */ + loff_t * off /* position to write which */) +{ + + int result; + loff_t pos; + ssize_t written; + + assert("edward-196", inode_get_flag(inode, REISER4_CLUSTER_KNOWN)); + + result = generic_write_checks(file, off, &count, 0); + if (unlikely(result != 0)) + return result; + + if (unlikely(count == 0)) + return 0; + + /* FIXME-EDWARD: other UNIX features */ + + pos = *off; + written = write_cryptcompress_flow(file, inode, (char *)buf, count, pos); + if (written < 0) { + if (written == -EEXIST) + printk("write_crc_file returns EEXIST!\n"); + return written; + } + + /* update position in a file */ + *off = pos + written; + /* return number of written bytes */ + return written; +} + +/* plugin->u.file.write */ +reiser4_internal ssize_t +write_cryptcompress(struct file * file, /* file to write to */ + const char *buf, /* address of user-space buffer */ + size_t count, /* number of bytes to write */ + loff_t * off /* position to write which */) +{ + ssize_t result; + struct inode *inode; + + inode = file->f_dentry->d_inode; + + down(&inode->i_sem); + + result = write_crc_file(file, inode, buf, count, off); + + up(&inode->i_sem); + return result; +} + +/* Helper function for cryptcompress_truncate. 
+ If this returns 0, then @idx is minimal cluster + index that isn't contained in this file */ +static int +find_file_idx(struct inode *inode, unsigned long * idx) +{ + int result; + reiser4_key key; + hint_t hint; + coord_t *coord; + lock_handle lh; + item_plugin *iplug; + + assert("edward-276", inode_file_plugin(inode)->key_by_inode == key_by_inode_cryptcompress); + key_by_inode_cryptcompress(inode, get_key_offset(max_key()), &key); + + hint_init_zero(&hint, &lh); + /* find the last item of this file */ + result = find_cluster_item(&hint, &key, ZNODE_READ_LOCK, 0/* ra_info */, FIND_MAX_NOT_MORE_THAN); + if (result == CBK_COORD_NOTFOUND) { + /* there are no items of this file */ + done_lh(&lh); + *idx = 0; + return 0; + } + if (result != CBK_COORD_FOUND) { + /* error occured */ + done_lh(&lh); + return result; + } + coord = &hint.coord.base_coord; + + /* there are items of this file (at least one) */ + coord_clear_iplug(coord); + result = zload(coord->node); + if (unlikely(result)) { + done_lh(&lh); + return result; + } + iplug = item_plugin_by_coord(coord); + assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID)); + + append_cluster_key_ctail(coord, &key); + + *idx = off_to_clust(get_key_offset(&key), inode); + + zrelse(coord->node); + done_lh(&lh); + + return 0; +} + +static int +cut_items_cryptcompress(struct inode *inode, loff_t new_size, int update_sd) +{ + reiser4_key from_key, to_key; + reiser4_key smallest_removed; + int result = 0; + + assert("edward-293", inode_file_plugin(inode)->key_by_inode == key_by_inode_cryptcompress); + key_by_inode_cryptcompress(inode, off_to_clust_to_off(new_size, inode), &from_key); + to_key = from_key; + set_key_offset(&to_key, get_key_offset(max_key())); + + while (1) { + result = reserve_cut_iteration(tree_by_inode(inode)); + if (result) + break; + + result = cut_tree_object(current_tree, &from_key, &to_key, + &smallest_removed, inode); + if (result == -E_REPEAT) { + /* -E_REPEAT is a signal to interrupt a long file 
truncation process */
+			/* FIXME(Zam) cut_tree does not support that signaling.*/
+			result = update_inode_cryptcompress
+				(inode, get_key_offset(&smallest_removed), 1, 1, update_sd);
+			if (result)
+				break;
+
+			all_grabbed2free();
+			reiser4_release_reserved(inode->i_sb);
+
+			{
+				reiser4_context * ctx;
+				long long_ret;
+
+				ctx = get_current_context();
+				long_ret = txn_end(ctx);
+				txn_begin(ctx);
+				if (long_ret < 0) {
+					result = (int)long_ret;
+					break;
+				}
+			}
+			continue;
+		}
+		if (result)
+			break;
+		result = update_inode_cryptcompress
+			(inode, get_key_offset(&smallest_removed), 1, 1, update_sd);
+		break;
+	}
+
+	all_grabbed2free();
+	reiser4_release_reserved(inode->i_sb);
+	return result;
+}
+
+/* The following two procedures are called when truncate decided
+   to deal with real items */
+static int
+cryptcompress_append_hole(struct inode * inode, loff_t new_size)
+{
+	return write_cryptcompress_flow(0, inode, 0, 0, new_size);
+}
+
+/* safe taking down pages */
+reiser4_internal void
+truncate_pages_cryptcompress(struct address_space * mapping, unsigned long index)
+{
+	truncate_inode_pages(mapping, pg_to_off(index));
+}
+/* shrink @inode to @new_size (< i_size): cut items, then pad the rest of the last kept page of a partial cluster */
+static int
+shorten_cryptcompress(struct inode * inode, loff_t new_size, int update_sd)
+{
+	int result;
+	int nrpages;
+	struct page ** pages;
+	loff_t old_size;
+	char * kaddr;
+	pgoff_t pg_padd;
+	reiser4_cluster_t clust;
+	crypto_plugin * cplug;
+
+	assert("edward-290", inode->i_size > new_size);
+
+	old_size = inode->i_size;
+	cplug = inode_crypto_plugin(inode);
+	result = cut_items_cryptcompress(inode, new_size, update_sd);
+	if(result)
+		return result;
+	if (!off_to_cloff(new_size, inode))
+		/* truncated to cluster boundary */
+		return 0;
+	/* FIXME-EDWARD: reserve partial page */
+	pages = reiser4_kmalloc(sizeof(*pages) << inode_cluster_shift(inode), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	reiser4_cluster_init(&clust);
+	clust.pages = pages;
+	clust_by_offs(&clust, inode, new_size, old_size);
+
+	/* read the whole cluster */
+	result = prepare_cluster(inode, 0, 0, NULL, &clust, "shorten cryptcompress");
+	if (result)
+		goto exit2;
+	/* truncate cluster, so flush will deal with new number of pages */
+
+	assert("edward-294", clust.stat == DATA_CLUSTER);
+
+	pg_padd = PAGE_CACHE_SIZE - off_to_pgoff(clust.off);
+
+	/* release last truncated pages */
+	release_cluster_pages(&clust, off_to_pg(clust.off) + 1 /* ?? */);
+
+	truncate_pages_cryptcompress(inode->i_mapping, off_to_pg(clust_to_off(clust.index, inode) + clust.off + pg_padd));
+
+	/* align last non-truncated page */
+	lock_page(pages[off_to_pg(clust.off)]);
+	kaddr = kmap_atomic(pages[off_to_pg(clust.off)], KM_USER0);
+	if (cplug)
+		cplug->align_cluster(kaddr + off_to_pgoff(clust.off), off_to_pgoff(clust.off), PAGE_CACHE_SIZE);
+	else
+		xmemset(kaddr + off_to_pgoff(clust.off), 0, PAGE_CACHE_SIZE - off_to_pgoff(clust.off));
+	kunmap_atomic(kaddr, KM_USER0); /* release the KM_USER0 mapping taken above; it used to be leaked */
+	unlock_page(pages[off_to_pg(clust.off)]);
+
+	nrpages = off_to_pg(clust.off) + 1;
+	set_cluster_pages_dirty(&clust, &nrpages);
+	result = try_capture_cluster(&clust, &nrpages);
+	if(result)
+		goto exit;
+	make_cluster_jnodes_dirty(&clust, &nrpages);
+
+	result = update_inode_cryptcompress(inode, new_size, 1, 1, update_sd);
+	if(!result)
+		goto exit2; /* success: skip free_reserved4cluster() */
+ exit:
+	free_reserved4cluster(inode, &clust);
+ exit2:
+	put_cluster_jnodes(&clust);
+	reiser4_kfree(pages);
+	return result;
+}
+
+/* This is called in setattr_cryptcompress when it is used to truncate,
+   and in delete_cryptcompress */
+
+static int
+cryptcompress_truncate(struct inode *inode, /* inode to truncate; its i_size is the old size */
+		       loff_t new_size, /* new size */
+		       int update_sd)
+{
+	int result;
+	loff_t old_size = inode->i_size;
+	unsigned long idx;
+//	unsigned long old_idx = off_to_clust(old_size, inode);
+	unsigned long new_idx = off_to_clust(new_size, inode);
+
+	/* inode->i_size != new size */
+
+	/* without decompression we can specify
+	   real file offsets only up to cluster size */
+	result = find_file_idx(inode, &idx);
+
+//	assert("edward-278", idx <= old_idx);
+
+	if
(result)
+		return result;
+	if (idx <= new_idx) {
+		/* do not deal with items: find_file_idx() reported no cluster of this file at or above @new_idx */
+		if (update_sd) {
+			result = setattr_reserve(tree_by_inode(inode));
+			if (!result)
+				result = update_inode_cryptcompress(inode, new_size, 1, 1, 1);
+			all_grabbed2free();
+		}
+		return result;
+	}
+	result = (old_size < new_size ? cryptcompress_append_hole(inode, new_size) :
+		  shorten_cryptcompress(inode, new_size, update_sd));
+	return result;
+}
+
+/* plugin->u.file.truncate: deliberately does nothing; setattr_cryptcompress() calls cryptcompress_truncate() itself */
+reiser4_internal int
+truncate_cryptcompress(struct inode *inode, loff_t new_size)
+{
+	return 0;
+}
+
+#if 0 /* NOTE(review): disabled draft; it does not compile as written (`pages' is undeclared, @clust is used both as pointer and as object) */
+static int
+cryptcompress_writepage(struct page * page, reiser4_cluster_t * clust)
+{
+	int result = 0;
+	int nrpages;
+	struct inode * inode;
+
+	assert("edward-423", page->mapping && page->mapping->host);
+
+	inode = page->mapping->host;
+	reiser4_cluster_init(&clust);
+
+	/* read all cluster pages if necessary */
+	clust.pages = reiser4_kmalloc(sizeof(*clust.pages) << inode_cluster_shift(inode), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+	clust.index = pg_to_clust(page->index, inode);
+	clust.off = pg_to_off_to_cloff(page->index, inode);
+	clust.count = PAGE_CACHE_SIZE;
+	nrpages = count_to_nrpages(fsize_to_count(&clust, inode));
+
+	result = prepare_cluster(page->mapping->host, 0, 0, &nrpages, &clust, "cryptcompress_writepage"); /* jp+ */
+	if(result)
+		goto exit;
+
+	set_cluster_pages_dirty(&clust, NULL); /* p- */
+	result = try_capture_cluster(&clust, NULL);
+	if (result) {
+		free_reserved4cluster(inode, &clust);
+		put_cluster_jnodes(&clust); /* j- */
+		goto exit;
+	}
+	lock_page(page);
+	make_cluster_jnodes_dirty(&clust, NULL);
+	put_cluster_jnodes(&clust); /* j- */
+ exit:
+	reiser4_kfree(clust.pages);
+	return result;
+}
+
+/* make sure for each page the whole cluster was captured (still inside the disabled region) */
+static int
+writepages_cryptcompress(struct address_space * mapping)
+{
+	struct list_head *mpages;
+	int result;
+	int nr;
+	int nrpages;
+	int captured = 0, clean = 0, writeback = 0;
+	reiser4_cluster_t * clust;
+
+	reiser4_cluster_init(clust);
+	result = 0;
+	nr = 0;
+
+	spin_lock (&mapping->page_lock);
+
+	mpages = get_moved_pages(mapping);
+	while ((result == 0 || result == 1) && !list_empty (mpages) && nr < CAPTURE_APAGE_BURST) {
+		struct page *pg = list_to_page(mpages);
+
+		assert("edward-481", PageDirty(pg));
+
+		if (!clust->nr_pages || !page_of_cluster(pg, &clust, inode)) {
+			/* update cluster handle */
+			clust.index = pg_to_clust(pg->index, inode);
+			clust.off = pg_to_off_to_cloff(pg->index, inode);
+			clust.count = PAGE_CACHE_SIZE;
+			/* advice number of pages */
+			nrpages = count_to_nrpages(fsize_to_count(&clust, inode));
+			/* NOTE(review): the prepare_cluster() call below was left unterminated in this draft */
+			result = prepare_cluster(mapping->host, 0, 0, &nrpages, &clust,
+		}
+		result = capture_anonymous_page(pg, 0);
+		if (result == 1) {
+			++ nr;
+			result = 0;
+		}
+	}
+	spin_unlock(&mapping->page_lock);
+
+	if (result) {
+		warning("vs-1454", "Cannot capture anon pages: %i (%d %d %d)\n", result, captured, clean, writeback);
+		return result;
+	}
+
+
+	if (nr >= CAPTURE_APAGE_BURST)
+		redirty_inode(mapping->host);
+
+	if (result == 0)
+		result = capture_anonymous_jnodes(mapping->host);
+
+	if (result != 0)
+		warning("nikita-3328", "Cannot capture anon pages: %i\n", result);
+	return result;
+}
+
+#endif
+
+/* plugin->u.file.capture
+   FIXME: capture method of file plugin is called by reiser4_writepages. It has to capture all
+   anonymous pages and jnodes of the mapping. See capture_unix_file, for example
+ */
+reiser4_internal int
+capture_cryptcompress(struct inode *inode, struct writeback_control *wbc)
+{
+
+#if 0 /* two abandoned drafts follow; the live body is the single `return 0' after the #endif */
+	int result;
+	struct inode *inode;
+
+	assert("edward-424", PageLocked(page));
+	assert("edward-425", PageUptodate(page));
+	assert("edward-426", page->mapping && page->mapping->host);
+
+	inode = page->mapping->host;
+	assert("edward-427", pg_to_off(page->index) < inode->i_size);
+
+	unlock_page(page);
+	if (pg_to_off(page->index) >= inode->i_size) {
+		/* race with truncate?
*/ + lock_page(page); + page_cache_release(page); + return RETERR(-EIO); + } + /* FIXME-EDWARD: Estimate insertion */ + result = cryptcompress_writepage(page); + assert("edward-428", PageLocked(page)); + return result; + + int result; + reiser4_context ctx; + + if (!inode_has_anonymous_pages(inode)) + return 0; + + init_context(&ctx, inode->i_sb); + + ctx.nobalance = 1; + assert("edward-482", lock_stack_isclean(get_current_lock_stack())); + + result = 0; + + do { + result = writepages_cryptcompress(inode->i_mapping); + if (result != 0 || wbc->sync_mode != WB_SYNC_ALL) + break; + result = txnmgr_force_commit_all(inode->i_sb, 0); + } while (result == 0 && inode_has_anonymous_pages(inode)); + + reiser4_exit_context(&ctx); + return result; +#endif + return 0; +} + +static inline void +validate_crc_extended_coord(uf_coord_t *uf_coord, loff_t offset) +{ + assert("edward-418", uf_coord->valid == 0); + assert("edward-419", item_plugin_by_coord(&uf_coord->base_coord)->s.file.init_coord_extension); + + /* FIXME: */ + item_body_by_coord(&uf_coord->base_coord); + item_plugin_by_coord(&uf_coord->base_coord)->s.file.init_coord_extension(uf_coord, offset); +} + +/* plugin->u.file.mmap: + generic_file_mmap */ + +/* plugin->u.file.release */ +/* plugin->u.file.get_block */ +/* This function is used for ->bmap() VFS method in reiser4 address_space_operations */ +reiser4_internal int +get_block_cryptcompress(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create UNUSED_ARG) +{ + if (current_blocksize != inode_cluster_size(inode)) + return RETERR(-EINVAL); + else { + int result; + reiser4_key key; + hint_t hint; + lock_handle lh; + item_plugin *iplug; + + assert("edward-420", create == 0); + key_by_inode_cryptcompress(inode, (loff_t)block * current_blocksize, &key); + hint_init_zero(&hint, &lh); + result = find_cluster_item(&hint, &key, ZNODE_READ_LOCK, 0, FIND_EXACT); + if (result != CBK_COORD_FOUND) { + done_lh(&lh); + return result; + } + result = 
zload(hint.coord.base_coord.node); + if (unlikely(result)) { + done_lh(&lh); + return result; + } + iplug = item_plugin_by_coord(&hint.coord.base_coord); + + assert("edward-421", iplug == item_plugin_by_id(CTAIL_ID)); + + if (!hint.coord.valid) + validate_crc_extended_coord(&hint.coord, + (loff_t) block << PAGE_CACHE_SHIFT); + if (iplug->s.file.get_block) + result = iplug->s.file.get_block(&hint.coord, block, bh_result); + else + result = RETERR(-EINVAL); + + zrelse(hint.coord.base_coord.node); + done_lh(&lh); + return result; + } +} + +/* plugin->u.file.delete */ +/* EDWARD-FIXME-HANS: comment is where? */ +reiser4_internal int +delete_cryptcompress(struct inode *inode) +{ + int result; + + assert("edward-429", inode->i_nlink == 0); + + if (inode->i_size) { + result = cryptcompress_truncate(inode, 0, 0); + if (result) { + warning("edward-430", "cannot truncate cryptcompress file %lli: %i", + get_inode_oid(inode), result); + return result; + } + } + return delete_file_common(inode); +} + +/* plugin->u.file.init_inode_data */ +/* plugin->u.file.owns_item: + owns_item_common */ +/* plugin->u.file.pre_delete */ +/* EDWARD-FIXME-HANS: comment is where? */ +reiser4_internal int +pre_delete_cryptcompress(struct inode *inode) +{ + return cryptcompress_truncate(inode, 0, 0); +} + +/* plugin->u.file.setattr method */ +reiser4_internal int +setattr_cryptcompress(struct inode *inode, /* Object to change attributes */ + struct iattr *attr /* change description */ ) +{ + int result; + + if (attr->ia_valid & ATTR_SIZE) { + /* EDWARD-FIXME-HANS: VS-FIXME-HANS: + Q: this case occurs when? truncate? + A: yes + + Q: If so, why isn't this code in truncate itself instead of here? + + A: because vfs calls fs's truncate after it has called truncate_inode_pages to get rid of pages + corresponding to part of file being truncated. In reiser4 it may cause existence of unallocated + extents which do not have jnodes. Flush code does not expect that. 
Solution of this problem is + straightforward. As vfs's truncate is implemented using setattr operation (common implementaion of + which calls truncate_inode_pages and fs's truncate in case when size of file changes) - it seems + reasonable to have reiser4_setattr which will take care of removing pages, jnodes and extents + simultaneously in case of truncate. + */ + + /* truncate does reservation itself and requires exclusive access obtained */ + if (inode->i_size != attr->ia_size) { + loff_t old_size; + + inode_check_scale(inode, inode->i_size, attr->ia_size); + + old_size = inode->i_size; + + result = cryptcompress_truncate(inode, attr->ia_size, 1/* update stat data */); + + if (!result) { + /* items are removed already. inode_setattr will call vmtruncate to invalidate truncated + pages and truncate_cryptcompress which will do nothing. FIXME: is this necessary? */ + INODE_SET_FIELD(inode, i_size, old_size); + result = inode_setattr(inode, attr); + } + } else + result = 0; + } else { + result = setattr_reserve(tree_by_inode(inode)); + if (!result) { + result = inode_setattr(inode, attr); + if (!result) + /* "capture" inode */ + result = reiser4_mark_inode_dirty(inode); + all_grabbed2free(); + } + } + return result; +} + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/cryptcompress.h linux-2.6.4-ck1/fs/reiser4/plugin/cryptcompress.h --- linux-2.6.4/fs/reiser4/plugin/cryptcompress.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/cryptcompress.h 2004-03-11 22:45:15.289510208 +1100 @@ -0,0 +1,105 @@ +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ +/* See http://www.namesys.com/cryptcompress_design.html */ + +#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ ) +#define __FS_REISER4_CRYPTCOMPRESS_H__ + + +#include + +#define MIN_CLUSTER_SIZE PAGE_CACHE_SIZE +#define 
MAX_CLUSTER_SHIFT 4 +#define MIN_SIZE_FOR_COMPRESSION 64 +#define MIN_CRYPTO_BLOCKSIZE 8 +#define CLUSTER_MAGIC_SIZE (MIN_CRYPTO_BLOCKSIZE >> 1) + +/* cluster status */ +typedef enum { + DATA_CLUSTER = 0, + HOLE_CLUSTER = 1, /* indicates hole for write ops */ + FAKE_CLUSTER = 2 /* indicates absence of disk cluster for read ops */ +} reiser4_cluster_status; + +/* Write modes for item conversion in flush squeeze phase */ +typedef enum { + CRC_FIRST_ITEM = 1, + CRC_APPEND_ITEM = 2, + CRC_OVERWRITE_ITEM = 3, + CRC_CUT_ITEM = 4 +} crc_write_mode_t; + +/* reiser4 cluster manager transforms page cluster into disk cluster (and back) via + input/output stream of crypto/compression algorithms using copy on clustering. + COC means that page cluster will be assembled into united stream before compression, + and output stream of decompression algorithm will be split into pages. + This manager consists mostly of operations on the following object which represents + one cluster: +*/ +typedef struct reiser4_cluster{ + __u8 * buf; /* pointer to input/output stream of crypto/compression algorithm */ + size_t bsize; /* size of the buffer allocated for the stream */ + size_t len; /* actual length of the stream above */ + int nr_pages; /* number of attached pages */ + struct page ** pages; /* attached pages */ + struct file * file; + reiser4_cluster_status stat; + /* sliding frame of cluster size in loff_t-space to translate main file 'offsets' + like read/write position, size, new size (for truncate), etc.. 
into number + of pages, cluster status, etc..*/ + unsigned long index; /* cluster index, coord of the frame */ + unsigned off; /* offset we want to read/write/truncate from */ + unsigned count; /* bytes to read/write/truncate */ + unsigned delta; /* bytes of user's data to append to the hole */ +} reiser4_cluster_t; + +/* security attributes supposed to be stored on disk + are loaded by stat-data methods (see plugin/item/static_stat.c */ +typedef struct crypto_stat { + __u8 * keyid; /* pointer to a fingerprint */ + __u16 keysize; /* key size, bits */ +} crypto_stat_t; + +/* cryptcompress specific part of reiser4_inode */ +typedef struct cryptcompress_info { + /* cpu-key words */ + __u32 * expkey; +} cryptcompress_info_t; + +cryptcompress_info_t *cryptcompress_inode_data(const struct inode * inode); +int equal_to_rdk(znode *, const reiser4_key *); +int equal_to_ldk(znode *, const reiser4_key *); +int goto_right_neighbor(coord_t *, lock_handle *); +int load_file_hint(struct file *, hint_t *, lock_handle *); +void save_file_hint(struct file *, const hint_t *); + +/* declarations of functions implementing methods of cryptcompress object plugin */ +int create_cryptcompress(struct inode *, struct inode *, reiser4_object_create_data *); +int truncate_cryptcompress(struct inode *, loff_t size); +int readpage_cryptcompress(void *, struct page *); +int capture_cryptcompress(struct inode *inode, struct writeback_control *wbc); +ssize_t write_cryptcompress(struct file *, const char *buf, size_t size, loff_t *off); +int release_cryptcompress(struct inode *inode, struct file *); +int mmap_cryptcompress(struct file *, struct vm_area_struct *vma); +int get_block_cryptcompress(struct inode *, sector_t block, struct buffer_head *bh_result, int create); +int flow_by_inode_cryptcompress(struct inode *, char *buf, int user, loff_t, loff_t, rw_op, flow_t *); +int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *); +int delete_cryptcompress(struct inode *); +int 
owns_item_cryptcompress(const struct inode *, const coord_t *); +int setattr_cryptcompress(struct inode *, struct iattr *); +void readpages_cryptcompress(struct file *, struct address_space *, struct list_head *pages); +void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *, int create); +int pre_delete_cryptcompress(struct inode *); +void hint_init_zero(hint_t *, lock_handle *); + +#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */ + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/digest.c linux-2.6.4-ck1/fs/reiser4/plugin/digest.c --- linux-2.6.4/fs/reiser4/plugin/digest.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/digest.c 2004-03-11 22:45:15.289510208 +1100 @@ -0,0 +1,58 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* NIKITA-FIXME-HANS: digest plugins do what? 
why no comments */ + +/* crypto-hash functions */ + +#include "../debug.h" +#include "plugin_header.h" +#include "plugin.h" + +#include + +#define NONE_BLOCK_SIZE 64 +#define NONE_DIGEST_SIZE 16 + +static int alloc_none(void * ctx) +{ + return 0; +} + +static void free_none(void *ctx) +{ +} + +static void init_none(void *ctx) +{ +} + +static void update_none(void *ctx, const __u8 *data, unsigned int len) +{ +} + +static void final_none(void *ctx, __u8 *out) +{ + memset(out, 0, NONE_DIGEST_SIZE); +} + + +/* digest plugins */ +digest_plugin digest_plugins[LAST_DIGEST_ID] = { + [NONE_DIGEST_ID] = { + .h = { + .type_id = REISER4_DIGEST_PLUGIN_TYPE, + .id = NONE_DIGEST_ID, + .pops = NULL, + .label = "none", + .desc = "trivial digest", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .blksize = NONE_BLOCK_SIZE, + .digestsize = NONE_DIGEST_SIZE, + .alloc = alloc_none, + .free = free_none, + .init = init_none, + .update = update_none, + .final = final_none + } +}; diff -Naurp linux-2.6.4/fs/reiser4/plugin/dir/dir.c linux-2.6.4-ck1/fs/reiser4/plugin/dir/dir.c --- linux-2.6.4/fs/reiser4/plugin/dir/dir.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/dir/dir.c 2004-03-11 22:45:15.291509897 +1100 @@ -0,0 +1,1302 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Methods of directory plugin. */ + +#include "../../forward.h" +#include "../../debug.h" +#include "../../spin_macros.h" +#include "../plugin_header.h" +#include "../../key.h" +#include "../../kassign.h" +#include "../../coord.h" +#include "../../type_safe_list.h" +#include "../plugin.h" +#include "dir.h" +#include "../item/item.h" +#include "../security/perm.h" +#include "../../jnode.h" +#include "../../znode.h" +#include "../../tap.h" +#include "../../vfs_ops.h" +#include "../../inode.h" +#include "../../super.h" +#include "../../safe_link.h" +#include "../object.h" + +#include "hashed_dir.h" +#include "pseudo_dir.h" + +#include /* for __u?? 
*/ +#include /* for struct file */ +#include +#include /* for struct dentry */ + +/* helper function. Standards require than for many file-system operations + on success ctime and mtime of parent directory is to be updated. */ +reiser4_internal int +reiser4_update_dir(struct inode *dir) +{ + assert("nikita-2525", dir != NULL); + + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + return reiser4_mark_inode_dirty(dir); +} + +/* estimate disk space necessary to add a link from @parent to @object. */ +static reiser4_block_nr common_estimate_link( + struct inode *parent /* parent directory */, + struct inode *object /* object to which new link is being cerated */) +{ + reiser4_block_nr res = 0; + file_plugin *fplug; + dir_plugin *dplug; + + assert("vpf-317", object != NULL); + assert("vpf-318", parent != NULL ); + + fplug = inode_file_plugin(object); + dplug = inode_dir_plugin(parent); + + /* reiser4_add_nlink(object) */ + res += fplug->estimate.update(object); + /* add_entry(parent) */ + res += dplug->estimate.add_entry(parent); + /* reiser4_del_nlink(object) */ + res += fplug->estimate.update(object); + /* update_dir(parent) */ + res += inode_file_plugin(parent)->estimate.update(parent); + + return res; +} + +/* add link from @parent directory to @existing object. + + . get plugins + . check permissions + . check that "existing" can hold yet another link + . start transaction + . add link to "existing" + . add entry to "parent" + . 
if last step fails, remove link from "existing"
+
+*/
+static int
+link_common(struct inode *parent /* parent directory */ ,
+	    struct dentry *existing	/* dentry of object to which
+					 * new link is being
+					 * created */ ,
+	    struct dentry *newname /* new name */ )
+{
+	int result;
+	struct inode *object;
+	dir_plugin *parent_dplug;
+	reiser4_dir_entry_desc entry;
+	reiser4_object_create_data data;
+	reiser4_block_nr reserve;
+
+	assert("nikita-1431", existing != NULL);
+	assert("nikita-1432", parent != NULL);
+	assert("nikita-1433", newname != NULL);
+
+	object = existing->d_inode;
+	assert("nikita-1434", object != NULL);
+
+	/* check for race with create_object() */
+	if (inode_get_flag(object, REISER4_IMMUTABLE))
+		return RETERR(-E_REPEAT);
+
+	/* links to directories are not allowed if file-system
+	   logical name-space should be ADG */
+	if (S_ISDIR(object->i_mode) && reiser4_is_set(parent->i_sb, REISER4_ADG))
+		return RETERR(-EISDIR);
+
+	/* check permissions */
+	result = perm_chk(parent, link, existing, parent, newname);
+	if (result != 0)
+		return result;
+
+	parent_dplug = inode_dir_plugin(parent);
+
+	xmemset(&entry, 0, sizeof entry);
+	entry.obj = object;
+
+	data.mode = object->i_mode;
+	data.id = inode_file_plugin(object)->h.id;
+
+	reserve = common_estimate_link(parent, existing->d_inode);
+	if ((__s64)reserve < 0) /* a negative value is treated as an error code from the estimate */
+		return reserve;
+
+	if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
+		return RETERR(-ENOSPC);
+
+	result = reiser4_add_nlink(object, parent, 1);
+	if (result == 0) {
+		/* add entry to the parent */
+		result = parent_dplug->add_entry(parent, newname, &data, &entry);
+		if (result != 0) {
+			/* failure to add entry to the parent, remove
+			   link from "existing" */
+			reiser4_del_nlink(object, parent, 1);
+			/* now, if this fails, we have a file with too
+			   big nlink---space leak, much better than
+			   directory entry pointing to nowhere */
+			/* may be it should be recorded somewhere, but
+			   if addition of link to parent and update of
+			   object's stat data both failed, chances are
+			   that something is going really wrong */
+		}
+	}
+	if (result == 0) {
+		atomic_inc(&object->i_count);
+		/* Upon successful completion, link() shall mark for update
+		   the st_ctime field of the file. Also, the st_ctime and
+		   st_mtime fields of the directory that contains the new
+		   entry shall be marked for update. --SUS
+		*/
+		result = reiser4_update_dir(parent);
+	}
+	return result;
+}
+
+/* estimate disk space necessary to remove a link between @parent and
+ * @object. */
+static reiser4_block_nr common_estimate_unlink (
+	struct inode *parent /* parent directory */,
+	struct inode *object /* object whose link is being removed */)
+{
+	reiser4_block_nr res = 0;
+	file_plugin *fplug;
+	dir_plugin *dplug;
+
+	assert("vpf-317", object != NULL); /* NOTE(review): label also used in common_estimate_link() */
+	assert("vpf-318", parent != NULL ); /* NOTE(review): label also used in common_estimate_link() */
+
+	fplug = inode_file_plugin(object);
+	dplug = inode_dir_plugin(parent);
+
+	/* rem_entry(parent) */
+	res += dplug->estimate.rem_entry(parent);
+	/* reiser4_del_nlink(object) */
+	res += fplug->estimate.update(object);
+	/* update_dir(parent) */
+	res += inode_file_plugin(parent)->estimate.update(parent);
+	/* fplug->unlink */
+	res += fplug->estimate.unlink(object, parent);
+	/* safe-link */
+	res += estimate_one_insert_item(tree_by_inode(object));
+
+	return res;
+}
+
+/* grab space for unlink.
*/
+static int
+unlink_check_and_grab(struct inode *parent, struct dentry *victim)
+{
+	file_plugin *fplug;
+	struct inode *child;
+	int result;
+
+	result = 0;
+	child = victim->d_inode;
+	fplug = inode_file_plugin(child);
+
+	/* check for race with create_object() */
+	if (inode_get_flag(child, REISER4_IMMUTABLE))
+		return RETERR(-E_REPEAT);
+	/* object being deleted should have stat data */
+	assert("vs-949", !inode_get_flag(child, REISER4_NO_SD));
+
+	/* check permissions */
+	result = perm_chk(parent, unlink, parent, victim);
+	if (result != 0)
+		return result;
+
+	/* ask object plugin */
+	if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
+		return RETERR(-ENOTEMPTY);
+
+	result = (int)common_estimate_unlink(parent, child); /* reiser4_block_nr estimate narrowed to int */
+	if (result < 0)
+		return result;
+
+	return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
+}
+
+/* remove link from @parent directory to @victim object.
+
+       . get plugins
+       . find entry in @parent
+       . check permissions
+       . decrement nlink on @victim
+       . if nlink drops to 0, delete object
+*/
+static int
+unlink_common(struct inode *parent /* parent object */ ,
+	      struct dentry *victim /* name being removed from @parent */)
+{
+	int result;
+	struct inode *object;
+	file_plugin *fplug;
+
+	object = victim->d_inode;
+	fplug = inode_file_plugin(object);
+	assert("nikita-2882", fplug->detach != NULL);
+
+	result = unlink_check_and_grab(parent, victim);
+	if (result != 0)
+		return result;
+
+	if ((result = fplug->detach(object, parent)) == 0) {
+		dir_plugin *parent_dplug;
+		reiser4_dir_entry_desc entry;
+
+		parent_dplug = inode_dir_plugin(parent);
+		xmemset(&entry, 0, sizeof entry);
+
+		/* first, delete directory entry */
+		result = parent_dplug->rem_entry(parent, victim, &entry);
+		if (result == 0) {
+			/* now that directory entry is removed, update
+			 * stat-data */
+			result = reiser4_del_nlink(object, parent, 1);
+			if (result == 0)
+				/* Upon successful completion, unlink() shall
+				   mark for update the st_ctime and st_mtime
+				   fields of the parent directory. Also, if
+				   the file's link count is not 0, the
+				   st_ctime field of the file shall be marked
+				   for update. --SUS */
+				result = reiser4_update_dir(parent);
+			/* add safe-link for this file */
+			if (result == 0 && fplug->not_linked(object))
+				result = safe_link_add(object, SAFE_UNLINK);
+		}
+		if (unlikely(result != 0 && result != -ENOMEM))
+			warning("nikita-3398", "Cannot unlink %llu (%i)",
+				get_inode_oid(object), result);
+	}
+
+	if (unlikely(result != 0)) {
+		/* if operation failed commit pending inode modifications to
+		 * the stat-data */
+		reiser4_update_sd(object);
+		reiser4_update_sd(parent);
+	}
+
+	reiser4_release_reserved(object->i_sb); /* pairs with reiser4_grab_reserved() in unlink_check_and_grab() */
+
+	/* @object's i_ctime was updated by the ->rem_link() method.
*/
+
+	return result;
+}
+
+/* Estimate the maximum number of nodes that will be allocated or changed for:
+   - inserting an entry in the parent
+   - update the SD of parent
+   - estimate child creation
+*/
+static reiser4_block_nr common_estimate_create_child(
+	struct inode *parent, /* parent object */
+	struct inode *object /* object */)
+{
+	assert("vpf-309", parent != NULL);
+	assert("vpf-307", object != NULL);
+
+	return
+		/* object creation estimation */
+		inode_file_plugin(object)->estimate.create(object) +
+		/* stat data of parent directory estimation */
+		inode_file_plugin(parent)->estimate.update(parent) +
+		/* adding entry estimation */
+		inode_dir_plugin(parent)->estimate.add_entry(parent) +
+		/* to undo in the case of failure */
+		inode_dir_plugin(parent)->estimate.rem_entry(parent);
+}
+
+/* Create child in directory.
+
+   . get object's plugin
+   . get fresh inode
+   . initialize inode
+   . add object's stat-data
+   . initialize object's directory
+   . add entry to the parent
+   . instantiate dentry
+
+*/
+/* ->create_child method of directory plugin */
+static int
+create_child_common(reiser4_object_create_data * data	/* parameters
+							 * of new
+							 * object */,
+		    struct inode ** retobj)
+{
+	int result;
+
+	struct dentry *dentry;	/* new name */
+	struct inode *parent;	/* parent object */
+
+	dir_plugin *par_dir;	/* directory plugin on the parent */
+	dir_plugin *obj_dir;	/* directory plugin on the new object */
+	file_plugin *obj_plug;	/* object plugin on the new object */
+	struct inode *object;	/* new object */
+	reiser4_block_nr reserve;
+
+	reiser4_dir_entry_desc entry;	/* new directory entry */
+
+	assert("nikita-1420", data != NULL);
+	parent = data->parent;
+	dentry = data->dentry;
+
+	assert("nikita-1418", parent != NULL);
+	assert("nikita-1419", dentry != NULL);
+	par_dir = inode_dir_plugin(parent);
+	/* check permissions */
+	result = perm_chk(parent, create, parent, dentry, data);
+	if (result != 0)
+		return result;
+
+	/* check, that name is acceptable for parent */
+	if (par_dir->is_name_acceptable &&
+	    !par_dir->is_name_acceptable(parent,
+					 dentry->d_name.name,
+					 (int) dentry->d_name.len))
+		return RETERR(-ENAMETOOLONG);
+
+	result = 0;
+	obj_plug = file_plugin_by_id((int) data->id);
+	if (obj_plug == NULL) {
+		warning("nikita-430", "Cannot find plugin %i", data->id);
+		return RETERR(-ENOENT);
+	}
+	object = new_inode(parent->i_sb);
+	if (object == NULL)
+		return RETERR(-ENOMEM);
+	/* we'll update i_nlink below */
+	object->i_nlink = 0;
+	/* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
+	 * to simplify error handling: if some error occurs before i_ino is
+	 * initialized with oid, i_ino should already be set to some
+	 * distinguished value. */
+	object->i_ino = 0;
+
+	/* So that on error iput will be called. */
+	*retobj = object;
+
+	if (DQUOT_ALLOC_INODE(object)) {
+		DQUOT_DROP(object);
+		object->i_flags |= S_NOQUOTA;
+		return RETERR(-EDQUOT);
+	}
+
+	xmemset(&entry, 0, sizeof entry);
+	entry.obj = object;
+
+	plugin_set_file(&reiser4_inode_data(object)->pset, obj_plug);
+	result = obj_plug->set_plug_in_inode(object, parent, data);
+	if (result) {
+		warning("nikita-431", "Cannot install plugin %i on %llx",
+			data->id, get_inode_oid(object));
+		return result;
+	}
+
+	/* reget plugin after installation */
+	obj_plug = inode_file_plugin(object);
+
+	if (obj_plug->create == NULL)
+		return RETERR(-EPERM);
+
+	/* if any of hash, tail, sd or permission plugins for newly created
+	   object are not set yet set them here inheriting them from parent
+	   directory
+	*/
+	assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
+	result = obj_plug->adjust_to_parent(object,
+					    parent,
+					    object->i_sb->s_root->d_inode);
+	if (result != 0) {
+		warning("nikita-432", "Cannot inherit from %llx to %llx",
+			get_inode_oid(parent), get_inode_oid(object));
+		return result;
+	}
+
+	/* call file plugin's method to initialize plugin specific part of
+	 * inode */
+	if (obj_plug->init_inode_data)
+		obj_plug->init_inode_data(object, data, 1/*create*/);
+
+	/* obtain directory plugin (if any) for new object. */
+	obj_dir = inode_dir_plugin(object);
+	if (obj_dir != NULL && obj_dir->init == NULL)
+		return RETERR(-EPERM);
+
+	reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
+
+	reserve = common_estimate_create_child(parent, object);
+	if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
+		return RETERR(-ENOSPC);
+
+	/* mark inode `immutable'. We disable changes to the file being
+	   created until valid directory entry for it is inserted. Otherwise,
+	   if file were expanded and insertion of directory entry fails, we
+	   have to remove file, but we only allotted enough space in
+	   transaction to remove _empty_ file. 3.x code used to remove stat
+	   data in different transaction thus possibly leaking disk space on
+	   crash. This all only matters if it's possible to access file
+	   without name, for example, by inode number
+	*/
+	inode_set_flag(object, REISER4_IMMUTABLE);
+
+	/* create empty object, this includes allocation of new objectid. For
+	   directories this implies creation of dot and dotdot  */
+	assert("nikita-2265", inode_get_flag(object, REISER4_NO_SD));
+
+	/* mark inode as `loaded'. From this point onward
+	   reiser4_delete_inode() will try to remove its stat-data. */
+	inode_set_flag(object, REISER4_LOADED);
+
+	result = obj_plug->create(object, parent, data);
+	if (result != 0) {
+		inode_clr_flag(object, REISER4_IMMUTABLE);
+		if (result != -ENAMETOOLONG && result != -ENOMEM)
+			warning("nikita-2219",
+				"Failed to create sd for %llu",
+				get_inode_oid(object));
+		return result;
+	}
+
+	if (obj_dir != NULL)
+		result = obj_dir->init(object, parent, data);
+	if (result == 0) {
+		assert("nikita-434", !inode_get_flag(object, REISER4_NO_SD));
+		/* insert inode into VFS hash table */
+		insert_inode_hash(object);
+		/* create entry */
+		result = par_dir->add_entry(parent, dentry, data, &entry);
+		if (result == 0) {
+			result = reiser4_add_nlink(object, parent, 0);
+			/* If O_CREAT is set and the file did not previously
+			   exist, upon successful completion, open() shall
+			   mark for update the st_atime, st_ctime, and
+			   st_mtime fields of the file and the st_ctime and
+			   st_mtime fields of the parent directory. --SUS
+			*/
+			/* @object times are already updated by
+			   reiser4_add_nlink() */
+			if (result == 0)
+				reiser4_update_dir(parent); /* return value is not checked here */
+			if (result != 0)
+				/* cleanup failure to add nlink */
+				par_dir->rem_entry(parent, dentry, &entry);
+		}
+		if (result != 0)
+			/* cleanup failure to add entry */
+			obj_plug->detach(object, parent);
+	} else if (result != -ENOMEM) /* NOTE(review): the warning below reuses label "nikita-2219" from the create failure above */
+		warning("nikita-2219", "Failed to initialize dir for %llu: %i",
+			get_inode_oid(object), result);
+
+	/*
+	 * update stat-data, committing all pending modifications to the inode
+	 * fields.
+	 */
+	reiser4_update_sd(object);
+	if (result != 0) {
+		/* if everything was ok (result == 0), parent stat-data is
+		 * already updated above (reiser4_update_dir()) */
+		reiser4_update_sd(parent);
+		/* failure to create entry, remove object */
+		obj_plug->delete(object);
+	}
+
+	/* file has name now, clear immutable flag */
+	inode_clr_flag(object, REISER4_IMMUTABLE);
+
+	/* on error, iput() will call ->delete_inode(). We should keep track
+	   of the existence of stat-data for this inode and avoid attempt to
+	   remove it in reiser4_delete_inode(). This is accomplished through
+	   REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
+	*/
+	return result;
+}
+
+/* ->is_name_acceptable() method of directory plugin: non-zero iff @len fits reiser4_max_filename_len(@inode) */
+/* Audited by: green(2002.06.15) */
+reiser4_internal int
+is_name_acceptable(const struct inode *inode /* directory to check */ ,
+		   const char *name UNUSED_ARG /* name to check */ ,
+		   int len /* @name's length */ )
+{
+	assert("nikita-733", inode != NULL);
+	assert("nikita-734", name != NULL);
+	assert("nikita-735", len > 0);
+
+	return len <= reiser4_max_filename_len(inode);
+}
+/* non-zero iff @coord is at a directory entry item that @inode's file plugin reports as owned by @inode */
+static int
+is_valid_dir_coord(struct inode * inode, coord_t * coord)
+{
+	return
+		item_type_by_coord(coord) == DIR_ENTRY_ITEM_TYPE &&
+		inode_file_plugin(inode)->owns_item(inode, coord);
+}
+
+/* returns 0 if directory is empty (only contains dot and dotdot), -ENOTEMPTY otherwise */
+reiser4_internal int
+is_dir_empty(const struct inode *dir)
+{
+	assert("nikita-1976", dir != NULL);
+
+	/* rely on our method to maintain directory i_size being equal to the
+	   number of entries. */
+	return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
+}
+
+/* compare two logical positions within the same directory */
+reiser4_internal cmp_t dir_pos_cmp(const dir_pos * p1, const dir_pos * p2)
+{
+	cmp_t result;
+
+	assert("nikita-2534", p1 != NULL);
+	assert("nikita-2535", p2 != NULL);
+
+	result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
+	if (result == EQUAL_TO) {
+		int diff;
+
+		diff = p1->pos - p2->pos;
+		result = (diff < 0) ? LESS_THAN : (diff ?
GREATER_THAN : EQUAL_TO); + } + return result; +} + + +#if REISER4_DEBUG_OUTPUT && REISER4_TRACE +static char filter(const d8 *dch) +{ + char ch; + + ch = d8tocpu(dch); + if (' ' <= ch && ch <= '~') + return ch; + else + return '?'; +} + +static void +print_de_id(const char *prefix, const de_id *did) +{ + reiser4_key key; + + extract_key_from_de_id(0, did, &key); + print_key(prefix, &key); + return; + printk("%s: %c%c%c%c%c%c%c%c:%c%c%c%c%c%c%c%c", + prefix, + filter(&did->objectid[0]), + filter(&did->objectid[1]), + filter(&did->objectid[2]), + filter(&did->objectid[3]), + filter(&did->objectid[4]), + filter(&did->objectid[5]), + filter(&did->objectid[6]), + filter(&did->objectid[7]), + + filter(&did->offset[0]), + filter(&did->offset[1]), + filter(&did->offset[2]), + filter(&did->offset[3]), + filter(&did->offset[4]), + filter(&did->offset[5]), + filter(&did->offset[6]), + filter(&did->offset[7])); +} + +static void +print_dir_pos(const char *prefix, const dir_pos *pos) +{ + print_de_id(prefix, &pos->dir_entry_key); + printk(" pos: %u", pos->pos); +} + +#else +#define print_de_id(p, did) noop +#define print_dir_pos(prefix, pos) noop +#endif + +/* see comment before readdir_common() for overview of why "adjustment" is + * necessary. */ +static void +adjust_dir_pos(struct file * dir, + readdir_pos * readdir_spot, + const dir_pos * mod_point, + int adj) +{ + dir_pos *pos; + + /* + * new directory entry was added (adj == +1) or removed (adj == -1) at + * the @mod_point. Directory file descriptor @dir is doing readdir and + * is currently positioned at @readdir_spot. Latter has to be updated + * to maintain stable readdir. 
+ */ + + ON_TRACE(TRACE_DIR, "adjust: %s/%i", dir->f_dentry->d_name.name, adj); + IF_TRACE(TRACE_DIR, print_dir_pos("\n mod", mod_point)); + IF_TRACE(TRACE_DIR, print_dir_pos("\nspot", &readdir_spot->position)); + ON_TRACE(TRACE_DIR, "\nf_pos: %llu, spot.entry_no: %llu\n", + dir->f_pos, readdir_spot->entry_no); + + reiser4_stat_inc(dir.readdir.adjust_pos); + + /* directory is positioned to the beginning. */ + if (dir->f_pos == 0) + return; + + pos = &readdir_spot->position; + switch (dir_pos_cmp(mod_point, pos)) { + case LESS_THAN: + /* @mod_pos is _before_ @readdir_spot, that is, entry was + * added/removed on the left (in key order) of current + * position. */ + readdir_spot->entry_no += adj; + assert("nikita-2577", dir->f_pos + adj >= 0); + /* logical number of directory entry readdir is "looking" at + * changes */ + dir->f_pos += adj; + if (de_id_cmp(&pos->dir_entry_key, &mod_point->dir_entry_key) == EQUAL_TO) { + assert("nikita-2575", mod_point->pos < pos->pos); + /* + * if entry added/removed has the same key as current + * for readdir, update counter of duplicate keys in + * @readdir_spot. + */ + pos->pos += adj; + } + reiser4_stat_inc(dir.readdir.adjust_lt); + break; + case GREATER_THAN: + /* directory is modified after @pos: nothing to do. */ + reiser4_stat_inc(dir.readdir.adjust_gt); + break; + case EQUAL_TO: + /* cannot insert an entry readdir is looking at, because it + already exists. */ + assert("nikita-2576", adj < 0); + /* directory entry to which @pos points to is being + removed. + + NOTE-NIKITA: Right thing to do is to update @pos to point + to the next entry. This is complex (we are under spin-lock + for one thing). Just rewind it to the beginning. Next + readdir will have to scan the beginning of + directory. Proper solution is to use semaphore in + spin lock's stead and use rewind_right() here. + + NOTE-NIKITA: now, semaphore is used, so... 
+ */ + xmemset(readdir_spot, 0, sizeof *readdir_spot); + reiser4_stat_inc(dir.readdir.adjust_eq); + } +} + +/* scan all file-descriptors for this directory and adjust their positions + respectively. */ +reiser4_internal void +adjust_dir_file(struct inode *dir, const struct dentry * de, int offset, int adj) +{ + reiser4_file_fsdata *scan; + dir_pos mod_point; + + assert("nikita-2536", dir != NULL); + assert("nikita-2538", de != NULL); + assert("nikita-2539", adj != 0); + + build_de_id(dir, &de->d_name, &mod_point.dir_entry_key); + mod_point.pos = offset; + + spin_lock_inode(dir); + + /* + * new entry was added/removed in directory @dir. Scan all file + * descriptors for @dir that are currently involved into @readdir and + * update them. + */ + + for_all_type_safe_list(readdir, get_readdir_list(dir), scan) + adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj); + + spin_unlock_inode(dir); +} + +static int +dir_go_to(struct file *dir, readdir_pos * pos, tap_t * tap) +{ + reiser4_key key; + int result; + struct inode *inode; + + assert("nikita-2554", pos != NULL); + + inode = dir->f_dentry->d_inode; + result = inode_dir_plugin(inode)->build_readdir_key(dir, &key); + if (result != 0) + return result; + result = object_lookup(inode, + &key, + tap->coord, + tap->lh, + tap->mode, + FIND_EXACT, + LEAF_LEVEL, + LEAF_LEVEL, + 0, + &tap->ra_info); + if (result == CBK_COORD_FOUND) + result = rewind_right(tap, (int) pos->position.pos); + else { + tap->coord->node = NULL; + done_lh(tap->lh); + result = RETERR(-EIO); + } + return result; +} + +static int +set_pos(struct inode * inode, readdir_pos * pos, tap_t * tap) +{ + int result; + coord_t coord; + lock_handle lh; + tap_t scan; + de_id *did; + reiser4_key de_key; + + coord_init_zero(&coord); + init_lh(&lh); + tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK); + tap_copy(&scan, tap); + tap_load(&scan); + pos->position.pos = 0; + + did = &pos->position.dir_entry_key; + + if (is_valid_dir_coord(inode, scan.coord)) { + + 
build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did); + + while (1) { + + result = go_prev_unit(&scan); + if (result != 0) + break; + + if (!is_valid_dir_coord(inode, scan.coord)) { + result = -EINVAL; + break; + } + + /* get key of directory entry */ + unit_key_by_coord(scan.coord, &de_key); + if (de_id_key_cmp(did, &de_key) != EQUAL_TO) { + /* duplicate-sequence is over */ + break; + } + pos->position.pos ++; + } + } else + result = RETERR(-ENOENT); + tap_relse(&scan); + tap_done(&scan); + return result; +} + + +/* + * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly. + */ +static int +dir_rewind(struct file *dir, readdir_pos * pos, loff_t offset, tap_t * tap) +{ + __u64 destination; + int shift; + int result; + + assert("nikita-2553", dir != NULL); + assert("nikita-2548", pos != NULL); + assert("nikita-2551", tap->coord != NULL); + assert("nikita-2552", tap->lh != NULL); + + if (offset < 0) + return RETERR(-EINVAL); + else if (offset >= dir->f_dentry->d_inode->i_size) + return RETERR(-ENOENT); + else if (offset == 0ll) { + /* rewind to the beginning of directory */ + xmemset(pos, 0, sizeof *pos); + reiser4_stat_inc(dir.readdir.reset); + return dir_go_to(dir, pos, tap); + } + + destination = (__u64) offset; + + shift = pos->entry_no - destination; + if (shift >= 0) { + /* rewinding to the left */ + reiser4_stat_inc(dir.readdir.rewind_left); + if (shift <= (int) pos->position.pos) { + /* destination is within sequence of entries with + duplicate keys. */ + reiser4_stat_inc(dir.readdir.left_non_uniq); + result = dir_go_to(dir, pos, tap); + } else { + shift -= pos->position.pos; + while (1) { + /* repetitions: deadlock is possible when + going to the left. 
*/ + result = dir_go_to(dir, pos, tap); + if (result == 0) { + result = rewind_left(tap, shift); + if (result == -E_DEADLOCK) { + tap_done(tap); + reiser4_stat_inc(dir.readdir.left_restart); + continue; + } + } + break; + } + } + } else { + /* rewinding to the right */ + reiser4_stat_inc(dir.readdir.rewind_right); + result = dir_go_to(dir, pos, tap); + if (result == 0) + result = rewind_right(tap, -shift); + } + if (result == 0) { + result = set_pos(dir->f_dentry->d_inode, pos, tap); + if (result == 0) + /* update pos->position.pos */ + pos->entry_no = destination; + } + return result; +} + +/* + * Function that is called by readdir_common() on each directory entry while + * doing readdir. ->filldir callback may block, so we had to release long term + * lock while calling it. To avoid repeating tree traversal, seal is used. If + * seal is broken, we return -E_REPEAT. Node is unlocked in this case. + * + * Whether node is unlocked in case of any other error is undefined. It is + * guaranteed to be still locked if success (0) is returned. + * + * When ->filldir() wants no more, feed_entry() returns 1, and node is + * unlocked. + */ +static int +feed_entry(readdir_pos * pos, tap_t *tap, filldir_t filldir, void *dirent) +{ + item_plugin *iplug; + char *name; + reiser4_key sd_key; + int result; + char buf[DE_NAME_BUF_LEN]; + char name_buf[32]; + char *local_name; + unsigned file_type; + seal_t seal; + coord_t *coord; + reiser4_key entry_key; + + coord = tap->coord; + iplug = item_plugin_by_coord(coord); + + /* pointer to name within the node */ + name = iplug->s.dir.extract_name(coord, buf); + assert("nikita-1371", name != NULL); + + /* key of object the entry points to */ + if (iplug->s.dir.extract_key(coord, &sd_key) != 0) + return RETERR(-EIO); + + /* we must release longterm znode lock before calling filldir to avoid + deadlock which may happen if filldir causes page fault.
So, copy + name to intermediate buffer: @name may point into the node, so once the lock is dropped only the copy in @local_name is used */ + if (strlen(name) + 1 > sizeof(name_buf)) { + local_name = kmalloc(strlen(name) + 1, GFP_KERNEL); + if (local_name == NULL) + return RETERR(-ENOMEM); + } else + local_name = name_buf; + + strcpy(local_name, name); + file_type = iplug->s.dir.extract_file_type(coord); + + unit_key_by_coord(coord, &entry_key); + seal_init(&seal, coord, &entry_key); + + longterm_unlock_znode(tap->lh); + + ON_TRACE(TRACE_DIR | TRACE_VFS_OPS, "readdir: %s, %llu, %llu\n", + local_name, pos->entry_no, get_key_objectid(&sd_key)); + + /* + * send information about directory entry to the ->filldir() filler + * supplied to us by caller (VFS). + * + * ->filldir is entitled to do weird things. For example, ->filldir + * supplied by knfsd re-enters file system. Make sure no locks are + * held. + */ + assert("nikita-3436", lock_stack_isclean(get_current_lock_stack())); + + result = filldir(dirent, local_name, (int) strlen(local_name), + /* offset of the next entry */ + (loff_t) pos->entry_no, + /* inode number of object bounden by this entry */ + oid_to_uino(get_key_objectid(&sd_key)), + file_type); + if (local_name != name_buf) + kfree(local_name); + if (result < 0) + /* ->filldir() is satisfied. (no space in buffer, IOW) */ + result = 1; + else + result = seal_validate(&seal, coord, &entry_key, LEAF_LEVEL, + tap->lh, FIND_EXACT, + tap->mode, ZNODE_LOCK_HIPRI); + return result; +} + +static void +move_entry(readdir_pos * pos, coord_t * coord) +{ + reiser4_key de_key; + de_id *did; + + /* update @pos */ + ++pos->entry_no; + did = &pos->position.dir_entry_key; + + /* get key of directory entry */ + unit_key_by_coord(coord, &de_key); + + if (de_id_key_cmp(did, &de_key) == EQUAL_TO) + /* we are within sequence of directory entries + with duplicate keys.
*/ + ++pos->position.pos; + else { + pos->position.pos = 0; + build_de_id_by_key(&de_key, did); + } +} + +reiser4_internal int +dir_readdir_init(struct file *f, tap_t * tap, readdir_pos ** pos) +{ + struct inode *inode; + reiser4_file_fsdata *fsdata; + + assert("nikita-1359", f != NULL); + inode = f->f_dentry->d_inode; + assert("nikita-1360", inode != NULL); + + if (!S_ISDIR(inode->i_mode)) + return RETERR(-ENOTDIR); + + fsdata = reiser4_get_file_fsdata(f); + assert("nikita-2571", fsdata != NULL); + if (IS_ERR(fsdata)) + return PTR_ERR(fsdata); + + spin_lock_inode(inode); + if (readdir_list_is_clean(fsdata)) + readdir_list_push_front(get_readdir_list(inode), fsdata); + *pos = &fsdata->dir.readdir; + spin_unlock_inode(inode); + + IF_TRACE(TRACE_DIR, print_dir_pos("readdir", &(*pos)->position)); + ON_TRACE(TRACE_DIR, " entry_no: %llu\n", (*pos)->entry_no); + + /* move @tap to the current position */ + return dir_rewind(f, *pos, f->f_pos, tap); +} + +/* + * ->readdir method of directory plugin + * + * readdir problems: + * + * Traditional UNIX API for scanning through directory + * (readdir/seekdir/telldir/opendir/closedir/rewindir/getdents) is based + * on the assumption that directory is structured very much like regular + * file, in particular, it is implied that each name within given + * directory (directory entry) can be uniquely identified by scalar offset + * and that such offset is stable across the life-time of the name is + * identifies. + * + * This is manifestly not so for reiser4. In reiser4 the only stable + * unique identifies for the directory entry is its key that doesn't fit + * into seekdir/telldir API. + * + * solution: + * + * Within each file descriptor participating in readdir-ing of directory + * plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track + * of the "current" directory entry that file descriptor looks at. 
It + * contains a key of directory entry (plus some additional info to deal + * with non-unique keys that we wouldn't dwell onto here) and a logical + * position of this directory entry starting from the beginning of the + * directory, that is ordinal number of this entry in the readdir order. + * + * Obviously this logical position is not stable in the face of directory + * modifications. To work around this, on each addition or removal of + * directory entry all file descriptors for directory inode are scanned + * and their readdir_pos are updated accordingly (adjust_dir_pos()). + * + */ +static int +readdir_common(struct file *f /* directory file being read */ , + void *dirent /* opaque data passed to us by VFS */ , + filldir_t filld /* filler function passed to us + * by VFS */ ) +{ + int result; + struct inode *inode; + coord_t coord; + lock_handle lh; + tap_t tap; + readdir_pos *pos; + + assert("nikita-1359", f != NULL); + inode = f->f_dentry->d_inode; + assert("nikita-1360", inode != NULL); + + reiser4_stat_inc(dir.readdir.calls); + + if (!S_ISDIR(inode->i_mode)) + return RETERR(-ENOTDIR); + + coord_init_zero(&coord); + init_lh(&lh); + tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK); + + /* initialize readdir readahead information: include into readahead stat data of all files of the directory */ + set_key_locality(&tap.ra_info.key_to_stop, get_inode_oid(inode)); + set_key_type(&tap.ra_info.key_to_stop, KEY_SD_MINOR); + set_key_ordering(&tap.ra_info.key_to_stop, get_key_ordering(max_key())); + set_key_objectid(&tap.ra_info.key_to_stop, get_key_objectid(max_key())); + set_key_offset(&tap.ra_info.key_to_stop, get_key_offset(max_key())); + + ON_TRACE(TRACE_DIR | TRACE_VFS_OPS, + "readdir: inode: %llu offset: %lli\n", + get_inode_oid(inode), f->f_pos); + + repeat: + result = dir_readdir_init(f, &tap, &pos); + if (result == 0) { + result = tap_load(&tap); + /* scan entries one by one feeding them to @filld */ + while (result == 0) { + coord_t *coord; + + coord = 
tap.coord; + assert("nikita-2572", coord_is_existing_unit(coord)); + assert("nikita-3227", is_valid_dir_coord(inode, coord)); + + result = feed_entry(pos, &tap, filld, dirent); + ON_TRACE(TRACE_DIR | TRACE_VFS_OPS, + "readdir: entry: offset: %lli\n", f->f_pos); + if (result > 0) { + break; + } else if (result == 0) { + ++ f->f_pos; + result = go_next_unit(&tap); + if (result == -E_NO_NEIGHBOR || + result == -ENOENT) { + result = 0; + break; + } else if (result == 0) { + if (is_valid_dir_coord(inode, coord)) + move_entry(pos, coord); + else + break; + } + } else if (result == -E_REPEAT) { + /* feed_entry() had to restart. */ + ++ f->f_pos; + tap_relse(&tap); + goto repeat; + } else + warning("vs-1617", "readdir_common: unexpected error %d", result); + } + tap_relse(&tap); + + if (result >= 0) + f->f_version = inode->i_version; + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) + result = 0; + tap_done(&tap); + ON_TRACE(TRACE_DIR | TRACE_VFS_OPS, + "readdir_exit: offset: %lli\n", f->f_pos); + return (result <= 0) ? 
result : 0; +} + +/* ->attach method of directory plugin */ +static int +attach_common(struct inode *child UNUSED_ARG, struct inode *parent UNUSED_ARG) +{ + assert("nikita-2647", child != NULL); + assert("nikita-2648", parent != NULL); + + return 0; +} + +/* ->estimate.add_entry method of directory plugin + estimation of adding entry which supposes that entry is inserting a unit into item +*/ +static reiser4_block_nr +estimate_add_entry_common(struct inode *inode) +{ + return estimate_one_insert_into_item(tree_by_inode(inode)); +} + +/* ->estimate.rem_entry method of directory plugin */ +static reiser4_block_nr +estimate_rem_entry_common(struct inode *inode) +{ + return estimate_one_item_removal(tree_by_inode(inode)); +} + +static ssize_t +noperm(void) +{ + return RETERR(-EPERM); +} + +#define dir_eperm ((void *)noperm) + +static int +_noop(void) +{ + return 0; +} + +#define enoop ((void *)_noop) + +/* + * definition of directory plugins + */ + +dir_plugin dir_plugins[LAST_DIR_ID] = { + /* standard hashed directory plugin */ + [HASHED_DIR_PLUGIN_ID] = { + .h = { + .type_id = REISER4_DIR_PLUGIN_TYPE, + .id = HASHED_DIR_PLUGIN_ID, + .pops = NULL, + .label = "dir", + .desc = "hashed directory", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .lookup_name = lookup_name_hashed, + .lookup = lookup_hashed, + .unlink = unlink_common, + .link = link_common, + .is_name_acceptable = is_name_acceptable, + .build_entry_key = build_entry_key_common, + .build_readdir_key = build_readdir_key_common, + .add_entry = add_entry_hashed, + .rem_entry = rem_entry_hashed, + .create_child = create_child_common, + .rename = rename_hashed, + .readdir = readdir_common, + .init = init_hashed, + .done = done_hashed, + .attach = attach_common, + .detach = detach_hashed, + .estimate = { + .add_entry = estimate_add_entry_common, + .rem_entry = estimate_rem_entry_common, + .unlink = estimate_unlink_hashed + } + }, + /* hashed directory for which seekdir/telldir are guaranteed to + * work. 
Brain-damage. */ + [SEEKABLE_HASHED_DIR_PLUGIN_ID] = { + .h = { + .type_id = REISER4_DIR_PLUGIN_TYPE, + .id = SEEKABLE_HASHED_DIR_PLUGIN_ID, + .pops = NULL, + .label = "dir", + .desc = "directory hashed with 31 bit hash", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .lookup_name = lookup_name_hashed, + .lookup = lookup_hashed, + .unlink = unlink_common, + .link = link_common, + .is_name_acceptable = is_name_acceptable, + .build_entry_key = build_entry_key_stable_entry, + .build_readdir_key = build_readdir_key_common, + .add_entry = add_entry_hashed, + .rem_entry = rem_entry_hashed, + .create_child = create_child_common, + .rename = rename_hashed, + .readdir = readdir_common, + .init = init_hashed, + .done = done_hashed, + .attach = attach_common, + .detach = detach_hashed, + .estimate = { + .add_entry = estimate_add_entry_common, + .rem_entry = estimate_rem_entry_common, + .unlink = estimate_unlink_hashed + } + }, + /* pseudo directory. */ + [PSEUDO_DIR_PLUGIN_ID] = { + .h = { + .type_id = REISER4_DIR_PLUGIN_TYPE, + .id = PSEUDO_DIR_PLUGIN_ID, + .pops = NULL, + .label = "pseudo", + .desc = "pseudo directory", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .lookup = lookup_pseudo, + .unlink = dir_eperm, + .link = dir_eperm, + .is_name_acceptable = NULL, + .build_entry_key = NULL, + .build_readdir_key = NULL, + .add_entry = dir_eperm, + .rem_entry = dir_eperm, + .create_child = NULL, + .rename = dir_eperm, + .readdir = readdir_pseudo, + .init = enoop, + .done = enoop, + .attach = enoop, + .detach = enoop, + .estimate = { + .add_entry = NULL, + .rem_entry = NULL, + .unlink = NULL + } + } +}; + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/dir/dir.h linux-2.6.4-ck1/fs/reiser4/plugin/dir/dir.h --- linux-2.6.4/fs/reiser4/plugin/dir/dir.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/dir/dir.h 2004-03-11 22:45:15.292509742 +1100 @@ -0,0 +1,80 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Directory plugin's methods. */ + +#if !defined( __REISER4_DIR_H__ ) +#define __REISER4_DIR_H__ + +#include "../../forward.h" +#include "../../kassign.h" + +#include <linux/types.h> /* for __u?? */ +#include <linux/fs.h> /* for struct file */ + +/* locking: fields of per file descriptor readdir_pos and ->f_pos are + * protected by ->i_sem on inode. Under this lock following invariant + * holds: + * + * file descriptor is "looking" at the entry_no-th directory entry from + * the beginning of directory. This entry has key dir_entry_key and is + * pos-th entry within duplicate-key sequence. + * + */ + +/* logical position within directory */ +typedef struct { + /* key of directory entry (actually, part of a key sufficient to + identify directory entry) */ + de_id dir_entry_key; + /* ordinal number of directory entry among all entries with the same + key. (Starting from 0.) */ + unsigned pos; +} dir_pos; + +typedef struct { + /* logical position within directory */ + dir_pos position; + /* logical number of directory entry within + directory */ + __u64 entry_no; +} readdir_pos; + +extern void adjust_dir_file(struct inode *dir, const struct dentry *de, + int offset, int adj); +extern int dir_readdir_init(struct file *f, tap_t * tap, readdir_pos ** pos); + +/* description of directory entry being created/destroyed/sought for + + It is passed down to the directory plugin and farther to the + directory item plugin methods.
Creation of new directory is done in + several stages: first we search for an entry with the same name, then + create new one. reiser4_dir_entry_desc is used to store some information + collected at some stage of this process and required later: key of + item that we want to insert/delete and pointer to an object that will + be bound by the new directory entry. Probably some more fields will + be added there. + +*/ +struct reiser4_dir_entry_desc { + /* key of directory entry */ + reiser4_key key; + /* object bound by this entry. */ + struct inode *obj; +}; + +int is_name_acceptable(const struct inode *inode, const char *name UNUSED_ARG, int len); +int is_dir_empty(const struct inode *dir); +int reiser4_update_dir(struct inode *dir); + +/* __REISER4_DIR_H__ */ +#endif + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.4-ck1/fs/reiser4/plugin/dir/hashed_dir.c --- linux-2.6.4/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/dir/hashed_dir.c 2004-03-11 22:45:15.295509275 +1100 @@ -0,0 +1,1341 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map + file names to the files. 
*/ + +#include "../../forward.h" +#include "../../debug.h" +#include "../../spin_macros.h" +#include "../../key.h" +#include "../../kassign.h" +#include "../../coord.h" +#include "../../seal.h" +#include "dir.h" +#include "../item/item.h" +#include "../security/perm.h" +#include "../pseudo/pseudo.h" +#include "../plugin.h" +#include "../object.h" +#include "../../jnode.h" +#include "../../znode.h" +#include "../../tree.h" +#include "../../vfs_ops.h" +#include "../../inode.h" +#include "../../reiser4.h" +#include "../../safe_link.h" + +#include <linux/fs.h> /* for struct inode */ +#include <linux/dcache.h> /* for struct dentry */ + +static int create_dot_dotdot(struct inode *object, struct inode *parent); +static int find_entry(struct inode *dir, struct dentry *name, + lock_handle * lh, znode_lock_mode mode, + reiser4_dir_entry_desc * entry); +static int check_item(const struct inode *dir, + const coord_t * coord, const char *name); +/* amount to reserve for create_dot_dotdot(): sum of ->estimate.add_entry() for "." and "..", plus ->estimate.update() of @object and of @parent for the two nlink changes. Used by init_hashed() as the argument of reiser4_grab_space(). */ +reiser4_internal reiser4_block_nr +hashed_estimate_init(struct inode *parent, struct inode *object) +{ + reiser4_block_nr res = 0; + + assert("vpf-321", parent != NULL); + assert("vpf-322", object != NULL); + + /* add_entry_hashed(object, ".") */ + res += inode_dir_plugin(object)->estimate.add_entry(object); + /* reiser4_add_nlink(object) */ + res += inode_file_plugin(object)->estimate.update(object); + /* add_entry_hashed(object, "..") */ + res += inode_dir_plugin(object)->estimate.add_entry(object); + /* reiser4_add_nlink(parent) */ + res += inode_file_plugin(parent)->estimate.update(parent); + + return res; +} + +/* plugin->u.dir.init + create sd for directory file. Create stat-data, dot, and dotdot.
*/ +reiser4_internal int +init_hashed(struct inode *object /* new directory */ , + struct inode *parent /* parent directory */ , + reiser4_object_create_data * data UNUSED_ARG /* info passed + * to us, this + * is filled by + * reiser4() + * syscall in + * particular */ ) +{ + reiser4_block_nr reserve; + + assert("nikita-680", object != NULL); + assert("nikita-681", S_ISDIR(object->i_mode)); + assert("nikita-682", parent != NULL); + assert("nikita-684", data != NULL); + assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID); + assert("nikita-687", object->i_mode & S_IFDIR); + trace_stamp(TRACE_DIR); + + reserve = hashed_estimate_init(parent, object); + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) + return RETERR(-ENOSPC); + + return create_dot_dotdot(object, parent); +} + +static reiser4_block_nr +hashed_estimate_done(struct inode *object) +{ + reiser4_block_nr res = 0; + + /* hashed_rem_entry(object) */ + res += inode_dir_plugin(object)->estimate.rem_entry(object); + return res; +} + +/* plugin->u.dir.estimate.unlink */ +reiser4_internal reiser4_block_nr +estimate_unlink_hashed(struct inode *parent, struct inode *object) +{ + reiser4_block_nr res = 0; + + /* hashed_rem_entry(object) */ + res += inode_dir_plugin(object)->estimate.rem_entry(object); + /* del_nlink(parent) */ + res += 2 * inode_file_plugin(parent)->estimate.update(parent); + + return res; +} + +/* ->delete() method of directory plugin + plugin->u.dir.done + Delete dot, and call common_file_delete() to delete stat data. +*/ +reiser4_internal int +done_hashed(struct inode *object /* object being deleted */) +{ + int result; + reiser4_block_nr reserve; + struct dentry goodby_dots; + reiser4_dir_entry_desc entry; + + assert("nikita-1449", object != NULL); + + if (inode_get_flag(object, REISER4_NO_SD)) + return 0; + + /* of course, this can be rewritten to sweep everything in one + cut_tree(). 
*/ + xmemset(&entry, 0, sizeof entry); + + reserve = hashed_estimate_done(object); + if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED)) + return RETERR(-ENOSPC); + + xmemset(&goodby_dots, 0, sizeof goodby_dots); + entry.obj = goodby_dots.d_inode = object; + goodby_dots.d_name.name = "."; + goodby_dots.d_name.len = 1; + result = rem_entry_hashed(object, &goodby_dots, &entry); + reiser4_free_dentry_fsdata(&goodby_dots); + if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT)) + /* only worth a warning + + "values of B will give rise to dom!\n" + -- v6src/s2/mv.c:89 + */ + warning("nikita-2252", "Cannot remove dot of %lli: %i", + get_inode_oid(object), result); + return 0; +} + +/* ->detach() method of directory plugin + plugin->u.dir.done + Delete dotdot, decrease nlink on parent +*/ +reiser4_internal int +detach_hashed(struct inode *object, struct inode *parent) +{ + int result; + struct dentry goodby_dots; + reiser4_dir_entry_desc entry; + + assert("nikita-2885", object != NULL); + assert("nikita-2886", !inode_get_flag(object, REISER4_NO_SD)); + + xmemset(&entry, 0, sizeof entry); + + /* NOTE-NIKITA this only works if @parent is -the- parent of + @object, viz. object whose key is stored in dotdot + entry. Wouldn't work with hard-links on directories. */ + xmemset(&goodby_dots, 0, sizeof goodby_dots); + entry.obj = goodby_dots.d_inode = parent; + goodby_dots.d_name.name = ".."; + goodby_dots.d_name.len = 2; + result = rem_entry_hashed(object, &goodby_dots, &entry); + reiser4_free_dentry_fsdata(&goodby_dots); + if (result == 0) { + /* the dot should be the only entry remaining at this time... */ + assert("nikita-3400", object->i_size == 1); + /* and, together with the only name directory can have, they + * provides for the last 2 remaining references. If we get + * here as part of error handling during mkdir, @object + * possibly has no name yet, so its nlink == 1. 
*/ + assert("nikita-3401", + object->i_nlink == 2 || object->i_nlink == 1); + + reiser4_del_nlink(parent, object, 0); + } + return result; +} + + +/* ->owns_item() for hashed directory object plugin. */ +reiser4_internal int +owns_item_hashed(const struct inode *inode /* object to check against */ , + const coord_t * coord /* coord of item to check */ ) +{ + reiser4_key item_key; + + assert("nikita-1335", inode != NULL); + assert("nikita-1334", coord != NULL); + + if (item_type_by_coord(coord) == DIR_ENTRY_ITEM_TYPE) + return get_key_locality(item_key_by_coord(coord, &item_key)) == get_inode_oid(inode); + else + return owns_item_common(inode, coord); +} + +/* helper function for directory_file_create(). Create "." and ".." */ +static int +create_dot_dotdot(struct inode *object /* object to create dot and + * dotdot for */ , + struct inode *parent /* parent of @object */ ) +{ + int result; + struct dentry dots_entry; + reiser4_dir_entry_desc entry; + + assert("nikita-688", object != NULL); + assert("nikita-689", S_ISDIR(object->i_mode)); + assert("nikita-691", parent != NULL); + trace_stamp(TRACE_DIR); + + /* We store dot and dotdot as normal directory entries. This is + not necessary, because almost all information stored in them + is already in the stat-data of directory, the only thing + being missed is objectid of grand-parent directory that can + easily be added there as extension. + + But it is done the way it is done, because not storing dot + and dotdot will lead to the following complications: + + . special case handling in ->lookup(). + . addition of another extension to the sd. + . dependency on key allocation policy for stat data. 
+ + */ + + xmemset(&entry, 0, sizeof entry); + xmemset(&dots_entry, 0, sizeof dots_entry); + entry.obj = dots_entry.d_inode = object; /* "." refers to @object itself */ + dots_entry.d_name.name = "."; + dots_entry.d_name.len = 1; + result = add_entry_hashed(object, &dots_entry, NULL, &entry); + reiser4_free_dentry_fsdata(&dots_entry); /* released whether or not the insertion worked */ + + if (result == 0) { + result = reiser4_add_nlink(object, object, 0); /* account for "." */ + if (result == 0) { + entry.obj = dots_entry.d_inode = parent; /* ".." refers to @parent */ + dots_entry.d_name.name = ".."; + dots_entry.d_name.len = 2; + result = add_entry_hashed(object, + &dots_entry, NULL, &entry); + reiser4_free_dentry_fsdata(&dots_entry); + /* if creation of ".." failed, iput() will delete + object with ".". */ + if (result == 0) { + result = reiser4_add_nlink(parent, object, 0); /* account for ".." */ + if (result != 0) + /* + * if we failed to bump i_nlink, try + * to remove ".."; the outcome of that + * attempt is not checked + */ + detach_hashed(object, parent); + } + } + } + + if (result != 0) { + /* + * in the case of error, at least update stat-data so that, + * ->i_nlink updates are not lingering.
+ */ + reiser4_update_sd(object); + reiser4_update_sd(parent); + } + + return result; +} + +/* implementation of lookup_name() method for hashed directories + + it looks for name specified in @dentry in directory @parent and if name is found - key of object found entry points + to is stored in @key. Returns 0 on success, -EPERM if the permission plugin refuses the lookup, -ENAMETOOLONG if the name is not acceptable, otherwise the error from find_entry() or ->extract_key() */ +reiser4_internal int +lookup_name_hashed(struct inode *parent /* inode of directory to lookup for name in */, + struct dentry *dentry /* name to look for */, + reiser4_key *key /* place to store key */) +{ + int result; + coord_t *coord; + lock_handle lh; + const char *name; + int len; + reiser4_dir_entry_desc entry; + reiser4_dentry_fsdata *fsdata; + + assert("nikita-1247", parent != NULL); + assert("nikita-1248", dentry != NULL); + assert("nikita-1123", dentry->d_name.name != NULL); + assert("vs-1486", + dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry); + + result = perm_chk(parent, lookup, parent, dentry); + if (result != 0) + return RETERR(-EPERM); /* returning 0 here would tell the caller that @key had been filled in */ + + name = dentry->d_name.name; + len = dentry->d_name.len; + + if (!is_name_acceptable(parent, name, len)) + /* some arbitrary error code to return */ + return RETERR(-ENAMETOOLONG); + + fsdata = reiser4_get_dentry_fsdata(dentry); + if (IS_ERR(fsdata)) + return PTR_ERR(fsdata); + + coord = &fsdata->dec.entry_coord; + coord_clear_iplug(coord); + init_lh(&lh); + + ON_TRACE(TRACE_DIR | TRACE_VFS_OPS, "lookup inode: %lli \"%s\"\n", get_inode_oid(parent), dentry->d_name.name); + + /* find entry in a directory. This is plugin method. */ + result = find_entry(parent, dentry, &lh, ZNODE_READ_LOCK, &entry); + if (result == 0) { + /* entry was found, extract object key from it. */ + result = WITH_COORD(coord, item_plugin_by_coord(coord)->s.dir.extract_key(coord, key)); + } + done_lh(&lh); /* paired with init_lh() above on every path that reaches find_entry() */ + return result; + +} + +/* implementation of ->lookup() method for hashed directories.
*/ +reiser4_internal int +lookup_hashed(struct inode * parent /* directory being searched */ , + struct dentry **dentryloc /* in: name to resolve; out: dentry returned by d_splice_alias() */ ) +{ + struct dentry *name = *dentryloc; + reiser4_dir_entry_desc found; + struct inode *target; + int rc; + + /* install our dentry operations first: lookup_name_hashed() asserts them */ + name->d_op = &get_super_private(parent->i_sb)->ops.dentry; + + rc = lookup_name_hashed(parent, name, &found.key); + if (rc == -ENOENT) + /* no such directory entry: try pseudo files */ + return lookup_pseudo_file(parent, dentryloc); + if (rc != 0) + return rc; + + target = reiser4_iget(parent->i_sb, &found.key, 0); + if (IS_ERR(target)) + return PTR_ERR(target); + + if (inode_get_flag(target, REISER4_LIGHT_WEIGHT)) { + /* a light-weight inode takes its [ug]id from @parent; the flag + is then cleared so that reading the inode by any other name + does not change them again. */ + target->i_uid = parent->i_uid; + target->i_gid = parent->i_gid; + inode_clr_flag(target, REISER4_LIGHT_WEIGHT); + } + + *dentryloc = d_splice_alias(target, name); + reiser4_iget_complete(target); + return 0; +} + +static const char *possible_leak = "Possible disk space leak."; + +/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
+ + Helper function called from rename_hashed(). Returns 0 once the entry has been re-targeted (even if dropping the link on @from_inode then fails), -EIO if the item at @from_coord is not a directory entry, otherwise the error from zload(), reiser4_add_nlink() or ->update_key() */ +static int +replace_name(struct inode *to_inode /* inode where @from_coord is + * to be re-targeted at */ , + struct inode *from_dir /* directory where @from_coord + * lives */ , + struct inode *from_inode /* inode @from_coord + * originally pointed to */ , + coord_t * from_coord /* where directory entry is in + * the tree */ , + lock_handle * from_lh /* lock handle on @from_coord */ ) +{ + item_plugin *from_item; + int result; + znode *node; + + coord_clear_iplug(from_coord); + node = from_coord->node; + result = zload(node); /* every exit path below pairs this with zrelse(node) */ + if (result != 0) + return result; + from_item = item_plugin_by_coord(from_coord); + if (item_type_by_coord(from_coord) == DIR_ENTRY_ITEM_TYPE) { + reiser4_key to_key; + + build_sd_key(to_inode, &to_key); + + /* everything is found and prepared to change directory entry + at @from_coord to point to @to_inode. + + @to_inode is just about to get new name, so bump its link + counter. + + */ + result = reiser4_add_nlink(to_inode, from_dir, 0); + if (result != 0) { + /* Don't issue warning: this may be plain -EMLINK */ + zrelse(node); + return result; + } + + result = from_item->s.dir.update_key(from_coord, &to_key, from_lh); + if (result != 0) { + reiser4_del_nlink(to_inode, from_dir, 0); /* undo the bump made just above */ + zrelse(node); + return result; + } + + /* @from_inode just lost its name, he-he. + + If @from_inode was directory, it contained dotdot pointing + to @from_dir. @from_dir i_nlink will be decreased when + iput() will be called on @from_inode. + + If file-system is not ADG (hard-links are + supported on directories), iput(from_inode) will not remove + @from_inode, and thus above is incorrect, but hard-links on + directories are problematic in many other respects. + */ + result = reiser4_del_nlink(from_inode, from_dir, 0); + if (result != 0) { + warning("nikita-2330", + "Cannot remove link from source: %i. %s", + result, possible_leak); + } + /* Has to return success, because entry is already + * modified.
*/ + result = 0; + + /* NOTE-NIKITA consider calling plugin method in stead of + accessing inode fields directly. */ + from_dir->i_mtime = CURRENT_TIME; + } else { + warning("nikita-2326", "Unexpected item type"); + print_plugin("item", item_plugin_to_plugin(from_item)); + result = RETERR(-EIO); + } + zrelse(node); + return result; +} + +/* add new entry pointing to @inode into @dir at @coord, locked by @lh + + Helper function used by rename_hashed(). Returns 0 on success (and then increments @dir's i_size), the error from reiser4_add_nlink() if the link count cannot be bumped (nothing is inserted in that case), or the error from ->add_entry() after the link bump has been undone. */ +static int +add_name(struct inode *inode /* inode where @coord is to be + * re-targeted at */ , + struct inode *dir /* directory where @coord lives */ , + struct dentry *name /* new name */ , + coord_t * coord /* where directory entry is in the tree */ , + lock_handle * lh /* lock handle on @coord */ , + int is_dir /* true, if @inode is directory */ ) +{ + int result; + reiser4_dir_entry_desc entry; + + assert("nikita-2333", lh->node == coord->node); + assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode)); + + xmemset(&entry, 0, sizeof entry); + entry.obj = inode; + /* build key of directory entry description */ + inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key); + + /* ext2 does this in different order: first inserts new entry, + then increases directory nlink. We don't want do this, + because reiser4_add_nlink() calls ->add_link() plugin + method that can fail for whatever reason, leaving us with + cleanup problems. + */ + /* @inode is getting new name; give up before touching the directory + if the link cannot be added, as replace_name() does */ + result = reiser4_add_nlink(inode, dir, 0); + if (result != 0) + return result; + /* create @new_name in @new_dir pointing to + @old_inode */ + result = WITH_COORD(coord, + inode_dir_item_plugin(dir)->s.dir.add_entry(dir, + coord, + lh, + name, + &entry)); + if (result != 0) { + int result2; + result2 = reiser4_del_nlink(inode, dir, 0); + if (result2 != 0) { + warning("nikita-2327", "Cannot drop link on %lli %i.
%s", + get_inode_oid(inode), + result2, possible_leak); + } + } else + INODE_INC_FIELD(dir, i_size); + return result; +} + +/* Upper estimate of the number of blocks rename_hashed() may need. res1 and res2 accumulate the two alternatives for installing the new name (replace_name() when the target exists, add_name() when it does not); the larger one is then extended with the cost of the remaining steps. */ +static reiser4_block_nr +hashed_estimate_rename( + struct inode *old_dir /* directory where @old is located */, + struct dentry *old_name /* old name */, + struct inode *new_dir /* directory where @new is located */, + struct dentry *new_name /* new name */) +{ + reiser4_block_nr res1, res2; + dir_plugin *p_parent_old, *p_parent_new; + file_plugin *p_child_old, *p_child_new; + + assert("vpf-311", old_dir != NULL); + assert("vpf-312", new_dir != NULL); + assert("vpf-313", old_name != NULL); + assert("vpf-314", new_name != NULL); + + p_parent_old = inode_dir_plugin(old_dir); + p_parent_new = inode_dir_plugin(new_dir); + p_child_old = inode_file_plugin(old_name->d_inode); + if (new_name->d_inode) + p_child_new = inode_file_plugin(new_name->d_inode); + else + p_child_new = 0; /* no target inode: its terms are skipped below */ + + /* find_entry - can insert one leaf. */ + res1 = res2 = 1; + + /* replace_name */ + { + /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */ + res1 += 2 * p_child_old->estimate.update(old_name->d_inode); + /* update key */ + res1 += 1; + /* reiser4_del_nlink(p_child_new) */ + if (p_child_new) + res1 += p_child_new->estimate.update(new_name->d_inode); + } + + /* else add_name */ + { + /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */ + res2 += 2 * inode_file_plugin(new_dir)->estimate.update(new_dir); + /* reiser4_add_nlink(p_parent_old) */ + res2 += p_child_old->estimate.update(old_name->d_inode); + /* add_entry(p_parent_new) */ + res2 += p_parent_new->estimate.add_entry(new_dir); + /* reiser4_del_nlink(p_parent_old) */ + res2 += p_child_old->estimate.update(old_name->d_inode); + } + + res1 = res1 < res2 ?
res2 : res1; /* keep the larger of the two alternatives */ + + + /* reiser4_write_sd(p_parent_new) */ + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); + + /* reiser4_write_sd(p_child_new) */ + if (p_child_new) + res1 += p_child_new->estimate.update(new_name->d_inode); + + /* hashed_rem_entry(p_parent_old) */ + res1 += p_parent_old->estimate.rem_entry(old_dir); + + /* reiser4_del_nlink(p_child_old) */ + res1 += p_child_old->estimate.update(old_name->d_inode); + + /* replace_name (presumably the dotdot update done when a directory is moved -- confirm) */ + { + /* reiser4_add_nlink(p_parent_dir_new) */ + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); + /* update_key */ + res1 += 1; + /* reiser4_del_nlink(p_parent_new) */ + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir); + /* reiser4_del_nlink(p_parent_old) */ + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir); + } + + /* reiser4_write_sd(p_parent_old) */ + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir); + + /* reiser4_write_sd(p_child_old) */ + res1 += p_child_old->estimate.update(old_name->d_inode); + + return res1; +} + +/* Reserve the number of blocks computed by hashed_estimate_rename(). Returns 0, or -ENOSPC when reiser4_grab_space() fails. */ +static int +hashed_rename_estimate_and_grab( + struct inode *old_dir /* directory where @old is located */ , + struct dentry *old_name /* old name */ , + struct inode *new_dir /* directory where @new is located */ , + struct dentry *new_name /* new name */ ) +{ + reiser4_block_nr reserve; + + reserve = hashed_estimate_rename(old_dir, old_name, new_dir, new_name); + + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) + return RETERR(-ENOSPC); + + return 0; +} + +/* check whether @old_inode and @new_inode can be moved within file system + * tree. This singles out attempts to rename pseudo-files, for example.
*/ +static int +can_rename(struct inode *old_inode, struct inode *new_inode) +{ + file_plugin *old_plug; + + assert("nikita-3370", old_inode != NULL); + + /* the source must be able to take one more link... */ + old_plug = inode_file_plugin(old_inode); + if (!old_plug->can_add_link(old_inode)) + return RETERR(-EMLINK); + + /* ...and an existing target must be willing to give one up; a + plugin without ->can_rem_link() never objects */ + if (new_inode != NULL) { + file_plugin *new_plug; + + new_plug = inode_file_plugin(new_inode); + if (new_plug->can_rem_link != NULL && + !new_plug->can_rem_link(new_inode)) + return RETERR(-EBUSY); + } + return 0; +} + +/* ->rename directory plugin method implementation for hashed directories. + plugin->u.dir.rename + See comments in the body. + + It is arguable that this function can be made generic so, that it will be + applicable to any kind of directory plugin that deals with directories + composed out of directory entries. The only obstacle here is that we don't + have any data-type to represent directory entry. This should be + re-considered when more than one different directory plugin will be + implemented. +*/ +reiser4_internal int +rename_hashed(struct inode *old_dir /* directory where @old is located */ , + struct dentry *old_name /* old name */ , + struct inode *new_dir /* directory where @new is located */ , + struct dentry *new_name /* new name */ ) +{ + /* From `The Open Group Base Specifications Issue 6' + + + If either the old or new argument names a symbolic link, rename() + shall operate on the symbolic link itself, and shall not resolve + the last component of the argument. If the old argument and the new + argument resolve to the same existing file, rename() shall return + successfully and perform no other action. + + [this is done by VFS: vfs_rename()] + + + If the old argument points to the pathname of a file that is not a + directory, the new argument shall not point to the pathname of a + directory. + + [checked by VFS: vfs_rename->may_delete()] + + If the link named by the new argument exists, it shall + be removed and old renamed to new.
In this case, a link named new + shall remain visible to other processes throughout the renaming + operation and refer either to the file referred to by new or old + before the operation began. + + [we should assure this] + + Write access permission is required for + both the directory containing old and the directory containing new. + + [checked by VFS: vfs_rename->may_delete(), may_create()] + + If the old argument points to the pathname of a directory, the new + argument shall not point to the pathname of a file that is not a + directory. + + [checked by VFS: vfs_rename->may_delete()] + + If the directory named by the new argument exists, it + shall be removed and old renamed to new. In this case, a link named + new shall exist throughout the renaming operation and shall refer + either to the directory referred to by new or old before the + operation began. + + [we should assure this] + + If new names an existing directory, it shall be + required to be an empty directory. + + [we should check this] + + If the old argument points to a pathname of a symbolic link, the + symbolic link shall be renamed. If the new argument points to a + pathname of a symbolic link, the symbolic link shall be removed. + + The new pathname shall not contain a path prefix that names + old. Write access permission is required for the directory + containing old and the directory containing new. If the old + argument points to the pathname of a directory, write access + permission may be required for the directory named by old, and, if + it exists, the directory named by new. + + [checked by VFS: vfs_rename(), vfs_rename_dir()] + + If the link named by the new argument exists and the file's link + count becomes 0 when it is removed and no process has the file + open, the space occupied by the file shall be freed and the file + shall no longer be accessible. 
If one or more processes have the + file open when the last link is removed, the link shall be removed + before rename() returns, but the removal of the file contents shall + be postponed until all references to the file are closed. + + [iput() handles this, but we can do this manually, a la + reiser4_unlink()] + + Upon successful completion, rename() shall mark for update the + st_ctime and st_mtime fields of the parent directory of each file. + + [N/A] + + */ + + int result; + int is_dir; /* is @old_name directory */ + + struct inode *old_inode; /* inode @old_name is bound to */ + struct inode *new_inode; /* inode @new_name is bound to, NULL if the target does not exist */ + + reiser4_dir_entry_desc old_entry; + reiser4_dir_entry_desc new_entry; + + coord_t *new_coord; /* entry coord kept in @new_name's fsdata */ + + reiser4_dentry_fsdata *new_fsdata; + + lock_handle new_lh; /* lock taken by find_entry() on the new name */ + + dir_plugin *dplug; /* directory plugin of @old_dir */ + + assert("nikita-2318", old_dir != NULL); + assert("nikita-2319", new_dir != NULL); + assert("nikita-2320", old_name != NULL); + assert("nikita-2321", new_name != NULL); + + old_inode = old_name->d_inode; + new_inode = new_name->d_inode; + + dplug = inode_dir_plugin(old_dir); + + new_fsdata = reiser4_get_dentry_fsdata(new_name); + if (IS_ERR(new_fsdata)) + return PTR_ERR(new_fsdata); + + new_coord = &new_fsdata->dec.entry_coord; + coord_clear_iplug(new_coord); + + is_dir = S_ISDIR(old_inode->i_mode); + + /* one link for the name being renamed, plus one more for a directory */ + assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir); + + /* if target is existing directory and it's not empty---return error. + + This check is done specifically, because is_dir_empty() requires + tree traversal and has to be done before locks are taken.
+ */ + if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) + return RETERR(-ENOTEMPTY); + + result = can_rename(old_inode, new_inode); + if (result != 0) + return result; + + result = hashed_rename_estimate_and_grab(old_dir, old_name, + new_dir, new_name); + if (result != 0) + return result; + + init_lh(&new_lh); + + /* find entry for @new_name */ + result = find_entry(new_dir, + new_name, &new_lh, ZNODE_WRITE_LOCK, &new_entry); + + /* found / not-found are both handled below; bail out only for what IS_CBKERR() classifies as an error */ + if (IS_CBKERR(result)) { + done_lh(&new_lh); + return result; + } + + seal_done(&new_fsdata->dec.entry_seal); + + /* add or replace name for @old_inode as @new_name */ + if (new_inode != NULL) { + /* target (@new_name) exists. */ + /* Not clear what to do with objects that are + both directories and files at the same time. */ + if (result == CBK_COORD_FOUND) { + result = replace_name(old_inode, + new_dir, + new_inode, + new_coord, + &new_lh); + if (result == 0) { + file_plugin *fplug; + + fplug = inode_file_plugin(new_inode); + /* detach @new_inode from name-space */ + result = fplug->detach(new_inode, new_dir); + if (result != 0) { + warning("nikita-2330", + "Cannot detach %lli: %i. %s", + get_inode_oid(new_inode), + result, possible_leak); + } + } + } else if (result == CBK_COORD_NOTFOUND) { + /* VFS told us that @new_name is bound to existing + inode, but we failed to find directory entry. */ + warning("nikita-2324", "Target not found"); + result = RETERR(-ENOENT); + } + } else { + /* target (@new_name) doesn't exist. */ + if (result == CBK_COORD_NOTFOUND) + result = add_name(old_inode, + new_dir, + new_name, + new_coord, + &new_lh, is_dir); + else if (result == CBK_COORD_FOUND) { + /* VFS told us that @new_name is "negative" dentry, + but we found directory entry. */ + warning("nikita-2331", "Target found unexpectedly"); + result = RETERR(-EIO); + } + } + + /* on success @old_inode now has both its old and its new name */ + assert("nikita-3462", ergo(result == 0, + old_inode->i_nlink >= 2 + !!is_dir)); + + /* We are done with all modifications to the @new_dir, release lock on + node.
*/ + done_lh(&new_lh); + + if (new_inode != NULL) + reiser4_mark_inode_dirty(new_inode); + + if (result == 0) { + xmemset(&old_entry, 0, sizeof old_entry); + old_entry.obj = old_inode; + + dplug->build_entry_key(old_dir, + &old_name->d_name, &old_entry.key); + + /* At this stage new name was introduced for + @old_inode. @old_inode, @new_dir, and @new_inode i_nlink + counters were updated. + + We want to remove @old_name now. If @old_inode wasn't + directory this is simple. + */ + result = rem_entry_hashed(old_dir, old_name, &old_entry); + if (result != 0 && result != -ENOMEM) { + warning("nikita-2335", + "Cannot remove old name: %i", result); + } else if (result == 0) { + /* drop the link only when the old name is really gone: on + -ENOMEM the entry is still there, and the error must reach + the caller instead of being overwritten here */ + result = reiser4_del_nlink(old_inode, old_dir, 0); + if (result != 0 && result != -ENOMEM) { + warning("nikita-2337", + "Cannot drop link on old: %i", result); + } + } + + if (result == 0 && is_dir) { + /* @old_inode is directory. We also have to update + dotdot entry. */ + coord_t *dotdot_coord; + lock_handle dotdot_lh; + struct dentry dotdot_name; + reiser4_dir_entry_desc dotdot_entry; + reiser4_dentry_fsdata dataonstack; + reiser4_dentry_fsdata *fsdata; + + xmemset(&dataonstack, 0, sizeof dataonstack); + xmemset(&dotdot_entry, 0, sizeof dotdot_entry); + dotdot_entry.obj = old_dir; + xmemset(&dotdot_name, 0, sizeof dotdot_name); + dotdot_name.d_name.name = ".."; + dotdot_name.d_name.len = 2; + /* + * allocate ->d_fsdata on the stack to avoid using + * reiser4_get_dentry_fsdata(). Locking is not needed, + * because dentry is private to the current thread.
+ */ + dotdot_name.d_fsdata = &dataonstack; + init_lh(&dotdot_lh); + + fsdata = &dataonstack; + dotdot_coord = &fsdata->dec.entry_coord; + coord_clear_iplug(dotdot_coord); + + result = find_entry(old_inode, &dotdot_name, &dotdot_lh, + ZNODE_WRITE_LOCK, &dotdot_entry); + if (result == 0) { + /* replace_name() decreases i_nlink on + * @old_dir */ + result = replace_name(new_dir, + old_inode, + old_dir, + dotdot_coord, + &dotdot_lh); + } else + result = RETERR(-EIO); /* any failure to find ".." is reported as -EIO */ + done_lh(&dotdot_lh); + } + } + /* the calls below run whether or not the rename succeeded */ + reiser4_update_dir(new_dir); + reiser4_update_dir(old_dir); + reiser4_mark_inode_dirty(old_inode); + if (result == 0) { + file_plugin *fplug; + + if (new_inode != NULL) { + /* add safe-link for target file (in case we removed + * last reference to the poor fellow) */ + fplug = inode_file_plugin(new_inode); + if (fplug->not_linked(new_inode)) + result = safe_link_add(new_inode, SAFE_UNLINK); + } + } + return result; +} + +/* ->add_entry() method for hashed directory object plugin. + plugin->u.dir.add_entry. Returns 0 on success, -EEXIST if find_entry() already finds the name, -ENOSPC if space cannot be reserved, otherwise the error from find_entry() or the item plugin. @object's i_size is incremented on success. +*/ +reiser4_internal int +add_entry_hashed(struct inode *object /* directory to add new name + * in */ , + struct dentry *where /* new name */ , + reiser4_object_create_data * data UNUSED_ARG /* parameters + * of new + * object */ , + reiser4_dir_entry_desc * entry /* parameters of new + * directory entry */ ) +{ + int result; + coord_t *coord; + lock_handle lh; + reiser4_dentry_fsdata *fsdata; + reiser4_block_nr reserve; + + assert("nikita-1114", object != NULL); + assert("nikita-1250", where != NULL); + + fsdata = reiser4_get_dentry_fsdata(where); + if (unlikely(IS_ERR(fsdata))) + return PTR_ERR(fsdata); + + reserve = inode_dir_plugin(object)->estimate.add_entry(object); + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) + return RETERR(-ENOSPC); + + init_lh(&lh); + ON_TRACE(TRACE_DIR, "[%i]: creating \"%s\" in %llu\n", current->pid, where->d_name.name, get_inode_oid(object)); + coord = &fsdata->dec.entry_coord; + coord_clear_iplug(coord); + + /* check for this entry in
a directory. This is plugin method. */ + result = find_entry(object, where, &lh, ZNODE_WRITE_LOCK, entry); + if (likely(result == -ENOENT)) { + /* add new entry. Just pass control to the directory + item plugin. */ + assert("nikita-1709", inode_dir_item_plugin(object)); + assert("nikita-2230", coord->node == lh.node); + seal_done(&fsdata->dec.entry_seal); + result = inode_dir_item_plugin(object)->s.dir.add_entry(object, coord, &lh, where, entry); + if (result == 0) { + adjust_dir_file(object, where, fsdata->dec.pos + 1, +1); + INODE_INC_FIELD(object, i_size); + } + } else if (result == 0) { + /* the name is already present in the directory */ + assert("nikita-2232", coord->node == lh.node); + result = RETERR(-EEXIST); + } + done_lh(&lh); + + return result; +} + +/* ->rem_entry() method for hashed directory object plugin. + plugin->u.dir.rem_entry. Returns 0 on success, -ENOSPC if space cannot be reserved, -EIO if @object's i_size is already below 1, otherwise the error from find_entry() or the item plugin. + */ +reiser4_internal int +rem_entry_hashed(struct inode *object /* directory from which entry + * is being removed */ , + struct dentry *where /* name that is being + * removed */ , + reiser4_dir_entry_desc * entry /* description of entry being + * removed */ ) +{ + int result; + coord_t *coord; + lock_handle lh; + reiser4_dentry_fsdata *fsdata; + __u64 tograb; + + /* yes, nested function, so what? Sue me.
*/ + /* rem_entry() is a GCC nested function: it reads @object, @where, @coord, @lh and @entry of the enclosing call */ + int rem_entry(void) { + item_plugin *iplug; + struct inode *child; + + iplug = inode_dir_item_plugin(object); + child = where->d_inode; + assert("nikita-3399", child != NULL); + + /* check that we are really destroying an entry for @child */ + if (REISER4_DEBUG) { + int result; + reiser4_key key; + + result = iplug->s.dir.extract_key(coord, &key); + if (result != 0) + return result; + if (get_key_objectid(&key) != get_inode_oid(child)) { + warning("nikita-3397", + "rem_entry: %#llx != %#llx\n", + get_key_objectid(&key), + get_inode_oid(child)); + return RETERR(-EIO); + } + } + return iplug->s.dir.rem_entry(object, + &where->d_name, coord, &lh, entry); + } + + assert("nikita-1124", object != NULL); + assert("nikita-1125", where != NULL); + + tograb = inode_dir_plugin(object)->estimate.rem_entry(object); + result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED); + if (result != 0) + return RETERR(-ENOSPC); + + init_lh(&lh); + + /* check for this entry in a directory. This is plugin method. */ + result = find_entry(object, where, &lh, ZNODE_WRITE_LOCK, entry); + fsdata = reiser4_get_dentry_fsdata(where); + if (IS_ERR(fsdata)) { + /* find_entry() may have left @lh holding a lock: release it, + as every other exit from this point on does */ + done_lh(&lh); + return PTR_ERR(fsdata); + } + + coord = &fsdata->dec.entry_coord; + + assert("nikita-3404", + get_inode_oid(where->d_inode) != get_inode_oid(object) || + object->i_size <= 1); + + coord_clear_iplug(coord); + if (result == 0) { + /* remove entry. Just pass control to the directory item + plugin.
*/ + assert("vs-542", inode_dir_item_plugin(object)); + seal_done(&fsdata->dec.entry_seal); + adjust_dir_file(object, where, fsdata->dec.pos, -1); + result = WITH_COORD(coord, rem_entry()); + if (result == 0) { + if (object->i_size >= 1) + INODE_DEC_FIELD(object, i_size); + else { + warning("nikita-2509", "Dir %llu is runt", + get_inode_oid(object)); + result = RETERR(-EIO); + } + write_current_tracef("..de k %#llx %#llx %i %lli", + get_inode_oid(where->d_inode), + get_inode_oid(object), + where->d_inode->i_nlink, + where->d_inode->i_size); + assert("nikita-3405", where->d_inode->i_nlink != 1 || + where->d_inode->i_size != 2 || + inode_dir_plugin(where->d_inode) == NULL); + } + } + done_lh(&lh); + + return result; +} + +static int entry_actor(reiser4_tree * tree /* tree being scanned */ , + coord_t * coord /* current coord */ , + lock_handle * lh /* current lock handle */ , + void *args /* argument to scan */ ); + +/* state shared between find_entry() and entry_actor() while scanning units that have the same key */ +typedef struct entry_actor_args { + const char *name; /* name being looked for */ + reiser4_key *key; /* key built for that name */ + int non_uniq; /* incremented by entry_actor() on every call */ +#if REISER4_USE_COLLISION_LIMIT || REISER4_STATS + int max_non_uniq; /* limit on non_uniq, set from max_hash_collisions() */ +#endif + int not_found; /* set by entry_actor() when the limit is exceeded or a unit with a different key is reached */ + znode_lock_mode mode; /* lock mode used when taking last_lh */ + + coord_t last_coord; /* copy of the most recent coord whose key matched */ + lock_handle last_lh; /* lock kept on the node of that coord */ + const struct inode *inode; /* directory being scanned */ +} entry_actor_args; + +/* run check_item() on @coord under WITH_COORD(); same return value as check_item() */ +static int +check_entry(const struct inode *dir, coord_t *coord, const struct qstr *name) +{ + return WITH_COORD(coord, check_item(dir, coord, name->name)); +} + +/* Look for given @name within directory @dir. + + This is called during lookup, creation and removal of directory + entries. + + First calculate key that directory entry for @name would have. Search + for this key in the tree. If such key is found, scan all items with + the same key, checking name in each directory entry along the way.
+*/ +static int +find_entry(struct inode *dir /* directory to scan */, + struct dentry *de /* name to search for */, + lock_handle * lh /* resulting lock handle */, + znode_lock_mode mode /* required lock mode */, + reiser4_dir_entry_desc * entry /* parameters of found directory + * entry */) +{ + const struct qstr *name; + seal_t *seal; + coord_t *coord; + int result; + __u32 flags; + de_location *dec; /* location cache kept in the dentry's fsdata: coord, seal and pos */ + reiser4_dentry_fsdata *fsdata; + + assert("nikita-1130", lh != NULL); + assert("nikita-1128", dir != NULL); + + name = &de->d_name; + assert("nikita-1129", name != NULL); + + /* dentry private data don't require lock, because dentry + manipulations are protected by i_sem on parent. + + This is not so for inodes, because there is no -the- parent in + inode case. + */ + fsdata = reiser4_get_dentry_fsdata(de); + if (IS_ERR(fsdata)) + return PTR_ERR(fsdata); + dec = &fsdata->dec; + + coord = &dec->entry_coord; + coord_clear_iplug(coord); + seal = &dec->entry_seal; + /* compose key of directory entry for @name */ + inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key); + + if (seal_is_set(seal)) { + /* check seal */ + result = seal_validate(seal, coord, &entry->key, LEAF_LEVEL, + lh, FIND_EXACT, mode, ZNODE_LOCK_LOPRI); + if (result == 0) { + /* key was found. Check that it is really item we are + looking for. */ + result = check_entry(dir, coord, name); + if (result == 0) + return 0; + } + /* otherwise fall through to the full lookup below */ + } + flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0; + result = object_lookup(dir, + &entry->key, + coord, + lh, + mode, + FIND_EXACT, + LEAF_LEVEL, + LEAF_LEVEL, + flags, + 0/*ra_info*/); + + if (result == CBK_COORD_FOUND) { + entry_actor_args arg; + + /* fast path: no hash collisions */ + result = check_entry(dir, coord, name); + if (result == 0) { + seal_init(seal, coord, &entry->key); + dec->pos = 0; + } else if (result > 0) { + /* check_item() returns 1 when the stored name differs: the key collides. Iterate through all units with the same keys.
*/ + arg.name = name->name; + arg.key = &entry->key; + arg.not_found = 0; + arg.non_uniq = 0; +#if REISER4_USE_COLLISION_LIMIT + arg.max_non_uniq = max_hash_collisions(dir); + assert("nikita-2851", arg.max_non_uniq > 1); +#endif + arg.mode = mode; + arg.inode = dir; + coord_init_zero(&arg.last_coord); + init_lh(&arg.last_lh); + + result = iterate_tree(tree_by_inode(dir), coord, lh, + entry_actor, &arg, mode, 1); + /* if end of the tree or extent was reached during + scanning. */ + if (arg.not_found || (result == -E_NO_NEIGHBOR)) { + /* step back to the last unit whose key matched */ + done_lh(lh); + + result = zload(arg.last_coord.node); + if (result == 0) { + coord_clear_iplug(&arg.last_coord); + coord_dup(coord, &arg.last_coord); + move_lh(lh, &arg.last_lh); + result = RETERR(-ENOENT); + zrelse(arg.last_coord.node); + --arg.non_uniq; /* the unit that ended the scan is not counted */ + } + } + + done_lh(&arg.last_lh); + if (result == 0) + seal_init(seal, coord, &entry->key); + + if (result == 0 || result == -ENOENT) { + assert("nikita-2580", arg.non_uniq > 0); + dec->pos = arg.non_uniq - 1; + } + } + } else + dec->pos = -1; /* key was not found at all */ + return result; +} + +/* Function called by find_entry() to look for given name in the directory. Returns the result of check_item() for the unit at @coord, 0 (with ->not_found set) when that unit has a different key, -EBUSY (with ->not_found set) when the collision limit is exceeded, or the error from longterm_lock_znode(). */ +static int +entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ , + coord_t * coord /* current coord */ , + lock_handle * lh /* current lock handle */ , + void *entry_actor_arg /* argument to scan */ ) +{ + reiser4_key unit_key; + entry_actor_args *args; + + assert("nikita-1131", tree != NULL); + assert("nikita-1132", coord != NULL); + assert("nikita-1133", entry_actor_arg != NULL); + + args = entry_actor_arg; + ++args->non_uniq; +#if REISER4_USE_COLLISION_LIMIT + if (args->non_uniq > args->max_non_uniq) { + args->not_found = 1; + /* hash collision overflow.
*/ + return RETERR(-EBUSY); + } +#endif + + if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) { + /* units with the key we want are exhausted */ + assert("nikita-1791", keylt(args->key, unit_key_by_coord(coord, &unit_key))); + args->not_found = 1; + args->last_coord.between = AFTER_UNIT; + return 0; + } + + coord_dup(&args->last_coord, coord); + if (args->last_lh.node != lh->node) { + /* the scan moved to another node: move the saved lock along with it */ + int lock_result; + + done_lh(&args->last_lh); + assert("nikita-1896", znode_is_any_locked(lh->node)); + lock_result = longterm_lock_znode(&args->last_lh, lh->node, args->mode, ZNODE_LOCK_HIPRI); + if (lock_result != 0) + return lock_result; + } + return check_item(args->inode, coord, args->name); +} + +/* Compare @name with the name stored in the directory entry at @coord. Returns 0 if they are equal, 1 if they differ, -EIO if no item plugin can be obtained for @coord or the item is not of the item type @dir is built of. */ +static int +check_item(const struct inode *dir, const coord_t * coord, const char *name) +{ + item_plugin *iplug; + char buf[DE_NAME_BUF_LEN]; /* scratch buffer handed to ->extract_name() */ + + iplug = item_plugin_by_coord(coord); + if (iplug == NULL) { + warning("nikita-1135", "Cannot get item plugin"); + print_coord("coord", coord, 1); + return RETERR(-EIO); + } else if (item_id_by_coord(coord) != item_id_by_plugin(inode_dir_item_plugin(dir))) { + /* item id of current item does not match to id of items a + directory is built of */ + warning("nikita-1136", "Wrong item plugin"); + print_coord("coord", coord, 1); + print_plugin("plugin", item_plugin_to_plugin(iplug)); + return RETERR(-EIO); + } + assert("nikita-1137", iplug->s.dir.extract_name); + + ON_TRACE(TRACE_DIR, "[%i]: check_item: \"%s\", \"%s\" in %lli (%lli)\n", + current->pid, name, iplug->s.dir.extract_name(coord, buf), + get_inode_oid(dir), *znode_get_block(coord->node)); + /* Compare name stored in this entry with name we are looking for. + + NOTE-NIKITA Here should go code for support of something like + unicode, code tables, etc. + */ + return !!strcmp(name, iplug->s.dir.extract_name(coord, buf)); +} + +/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/plugin/dir/hashed_dir.h linux-2.6.4-ck1/fs/reiser4/plugin/dir/hashed_dir.h
--- linux-2.6.4/fs/reiser4/plugin/dir/hashed_dir.h	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/dir/hashed_dir.h	2004-03-11 22:45:15.295509275 +1100
@@ -0,0 +1,44 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map
+   file names to files. Declarations only; see hashed_dir.c for the code. */
+
+#if !defined( __HASHED_DIR_H__ )
+#define __HASHED_DIR_H__
+
+#include "../../forward.h"
+
+#include <linux/fs.h>		/* for struct inode */
+#include <linux/dcache.h>	/* for struct dentry */
+
+/* create sd for directory file. Create stat-data, dot, and dotdot. */
+extern int init_hashed(struct inode *object, struct inode *parent, reiser4_object_create_data *);
+extern int done_hashed(struct inode *object);
+extern int detach_hashed(struct inode *object, struct inode *parent);
+extern int owns_item_hashed(const struct inode *inode, const coord_t * coord);
+extern int lookup_name_hashed(struct inode *inode, struct dentry *dentry, reiser4_key *);
+extern int lookup_hashed(struct inode *inode, struct dentry **dentry);
+extern int rename_hashed(struct inode *old_dir,
+			 struct dentry *old_name, struct inode *new_dir, struct dentry *new_name);
+extern int add_entry_hashed(struct inode *object,
+			    struct dentry *where, reiser4_object_create_data *, reiser4_dir_entry_desc * entry);
+extern int rem_entry_hashed(struct inode *object, struct dentry *where, reiser4_dir_entry_desc * entry);
+extern reiser4_block_nr estimate_rename_hashed(struct inode *old_dir,
+					       struct dentry *old_name,
+					       struct inode *new_dir,
+					       struct dentry *new_name);
+extern reiser4_block_nr estimate_unlink_hashed(struct inode *parent,
+					       struct inode *object);
+
+/* __HASHED_DIR_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/plugin/dir/pseudo_dir.c linux-2.6.4-ck1/fs/reiser4/plugin/dir/pseudo_dir.c
--- linux-2.6.4/fs/reiser4/plugin/dir/pseudo_dir.c	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/dir/pseudo_dir.c	2004-03-11 22:45:15.296509120 +1100
@@ -0,0 +1,74 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Directory plugin for pseudo files */
+
+#include "../../debug.h"
+#include "../../inode.h"
+#include "../pseudo/pseudo.h"
+#include "dir.h"
+
+#include <linux/fs.h>		/* for struct inode */
+#include <linux/dcache.h>	/* for struct dentry */
+
+/* implementation of ->lookup() method for pseudo files: plugin's ->lookup() first, lookup_pseudo_file() on -ENOENT. */
+reiser4_internal int lookup_pseudo(struct inode * parent, struct dentry **dentry)
+{
+	pseudo_plugin *pplug;
+	int result;
+
+	pplug = reiser4_inode_data(parent)->file_plugin_data.pseudo_info.plugin;
+	assert("nikita-3222", pplug->lookup != NULL);
+	result = pplug->lookup(parent, dentry);
+	if (result == -ENOENT)	/* any other result, success included, is returned as is */
+		result = lookup_pseudo_file(parent, dentry);
+	return result;
+}
+
+/* implementation of ->readdir() method for pseudo files: plugin's ->readdir() if set, else only "." and "..". */
+reiser4_internal int
+readdir_pseudo(struct file *f, void *dirent, filldir_t filld)
+{
+	pseudo_plugin *pplug;
+	struct inode *inode;
+	struct dentry *dentry;
+	int result = 0;
+
+	dentry = f->f_dentry;
+	inode = dentry->d_inode;
+	pplug = reiser4_inode_data(inode)->file_plugin_data.pseudo_info.plugin;
+	if (pplug->readdir != NULL)
+		result = pplug->readdir(f, dirent, filld);
+	else {
+		ino_t ino;
+		int i;
+
+		i = f->f_pos;	/* f_pos counts entries already emitted: 0 -> ".", 1 -> ".." */
+		switch (i) {
+		case 0:
+			ino = get_inode_oid(dentry->d_inode);
+			if (filld(dirent, ".", 1, i, ino, DT_DIR) < 0)
+				break;	/* entry refused: stop without advancing f_pos */
+			f->f_pos++;
+			i++;
+			/* fallthrough */
+		case 1:
+			ino = parent_ino(dentry);
+			if (filld(dirent, "..", 2, i, ino, DT_DIR) < 0)
+				break;	/* entry refused: stop without advancing f_pos */
+			f->f_pos++;
+			i++;
+			/* fallthrough */
+		}
+	}
+	return result;	/* stays 0 on the built-in path, even when filld() refused an entry */
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/plugin/dir/pseudo_dir.h linux-2.6.4-ck1/fs/reiser4/plugin/dir/pseudo_dir.h
--- linux-2.6.4/fs/reiser4/plugin/dir/pseudo_dir.h	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/dir/pseudo_dir.h	2004-03-11 22:45:15.296509120 +1100
@@ -0,0 +1,27 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Directory plugin for pseudo files: declarations of the methods defined in pseudo_dir.c */
+
+#if !defined( __PSEUDO_DIR_H__ )
+#define __PSEUDO_DIR_H__
+
+#include "../../forward.h"
+
+#include <linux/fs.h>		/* for struct inode */
+#include <linux/dcache.h>	/* for struct dentry */
+
+extern int lookup_pseudo(struct inode * parent, struct dentry **dentry);
+extern int readdir_pseudo(struct file *f, void *dirent, filldir_t filld);
+
+/* __PSEUDO_DIR_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.4-ck1/fs/reiser4/plugin/disk_format/disk_format40.c
--- linux-2.6.4/fs/reiser4/plugin/disk_format/disk_format40.c	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/disk_format/disk_format40.c	2004-03-11 22:45:15.298508809 +1100
@@ -0,0 +1,575 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "../../debug.h"
+#include "../../dformat.h"
+#include "../../key.h"
+#include "../node/node.h"
+#include "../space/space_allocator.h"
+#include "disk_format40.h"
+#include "../plugin.h"
+#include "../../txnmgr.h"
+#include "../../jnode.h"
+#include "../../tree.h"
+#include "../../super.h"
+#include "../../wander.h"
+#include "../../diskmap.h"
+#include "../../inode.h"
+#include "../../ktxnmgrd.h"
+#include "../../status_flags.h"
+
+#include <linux/types.h>	/* for __u??
*/
+#include <linux/fs.h>		/* for struct super_block */
+#include <linux/buffer_head.h>
+
+/* reiser 4.0 default disk layout */
+
+/* functions to access fields of format40_disk_super_block; each reads one field through d16tocpu()/d32tocpu()/d64tocpu() */
+static __u64
+get_format40_block_count(const format40_disk_super_block * sb)
+{
+	return d64tocpu(&sb->block_count);
+}
+
+static __u64
+get_format40_free_blocks(const format40_disk_super_block * sb)
+{
+	return d64tocpu(&sb->free_blocks);
+}
+
+static __u64
+get_format40_root_block(const format40_disk_super_block * sb)
+{
+	return d64tocpu(&sb->root_block);
+}
+
+static __u16
+get_format40_tree_height(const format40_disk_super_block * sb)
+{
+	return d16tocpu(&sb->tree_height);
+}
+
+static __u64
+get_format40_file_count(const format40_disk_super_block * sb)
+{
+	return d64tocpu(&sb->file_count);
+}
+
+static __u64
+get_format40_oid(const format40_disk_super_block * sb)
+{
+	return d64tocpu(&sb->oid);
+}
+
+static __u16
+get_format40_formatting_policy(const format40_disk_super_block * sb)
+{
+	return d16tocpu(&sb->formatting_policy);
+}
+
+static __u32
+get_format40_mkfs_id(const format40_disk_super_block * sb)
+{
+	return d32tocpu(&sb->mkfs_id);
+}
+
+static __u64
+get_format40_flags(const format40_disk_super_block * sb)
+{
+	return d64tocpu(&sb->flags);
+}
+
+static format40_super_info *
+get_sb_info(struct super_block *super)
+{
+	return &get_super_private(super)->u.format40;	/* format40 member of the per-super union */
+}
+
+static int
+consult_diskmap(struct super_block *s)
+{
+	format40_super_info *info;
+	journal_location *jloc;
+
+	info = get_sb_info(s);
+	jloc = &get_super_private(s)->jloc;
+	/* Default format-specific locations, if there is nothing in
+	 * diskmap */
+	jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
+	jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
+	info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
+#ifdef CONFIG_REISER4_BADBLOCKS
+	reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
+				  &jloc->footer);
+	reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
+				  &jloc->header);
+	
reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER, + &info->loc.super); +#endif + return 0; +} + +/* find any valid super block of disk_format40 (even if the first + super block is destroyed), will change block numbers of actual journal header/footer (jf/jh) + if needed */ +static struct buffer_head * +find_a_disk_format40_super_block(struct super_block *s) +{ + struct buffer_head *super_bh; + format40_disk_super_block *disk_sb; + format40_super_info *info; + + assert("umka-487", s != NULL); + + info = get_sb_info(s); + + super_bh = sb_bread(s, info->loc.super); + if (super_bh == NULL) + return ERR_PTR(RETERR(-EIO)); + + disk_sb = (format40_disk_super_block *) super_bh->b_data; + if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) { + brelse(super_bh); + return ERR_PTR(RETERR(-EINVAL)); + } + + reiser4_set_block_count(s, d64tocpu(&disk_sb->block_count)); + reiser4_set_data_blocks(s, d64tocpu(&disk_sb->block_count) - + d64tocpu(&disk_sb->free_blocks)); + reiser4_set_free_blocks(s, (d64tocpu(&disk_sb->free_blocks))); + + return super_bh; +} + +/* find the most recent version of super block. This is called after journal is + replayed */ +static struct buffer_head * +read_super_block(struct super_block *s UNUSED_ARG) +{ + /* Here the most recent superblock copy has to be read. However, as + journal replay isn't complete, we are using + find_a_disk_format40_super_block() function. 
*/ + return find_a_disk_format40_super_block(s); +} + +static int +get_super_jnode(struct super_block *s) +{ + reiser4_super_info_data *sbinfo = get_super_private(s); + jnode *sb_jnode; + int ret; + + sb_jnode = alloc_io_head(&get_sb_info(s)->loc.super); + + ret = jload(sb_jnode); + + if (ret) { + drop_io_head(sb_jnode); + return ret; + } + + pin_jnode_data(sb_jnode); + jrelse(sb_jnode); + + sbinfo->u.format40.sb_jnode = sb_jnode; + + return 0; +} + +static void +done_super_jnode(struct super_block *s) +{ + jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode; + + if (sb_jnode) { + unpin_jnode_data(sb_jnode); + drop_io_head(sb_jnode); + } +} + +typedef enum format40_init_stage { + NONE_DONE = 0, + CONSULT_DISKMAP, + FIND_A_SUPER, + INIT_JOURNAL_INFO, + INIT_EFLUSH, + INIT_STATUS, + JOURNAL_REPLAY, + READ_SUPER, + KEY_CHECK, + INIT_OID, + INIT_TREE, + JOURNAL_RECOVER, + INIT_SA, + INIT_JNODE, + ALL_DONE +} format40_init_stage; + +static int +try_init_format40(struct super_block *s, format40_init_stage *stage) +{ + int result; + struct buffer_head *super_bh; + reiser4_super_info_data *sbinfo; + format40_disk_super_block sb; + /* FIXME-NIKITA ugly work-around: keep copy of on-disk super-block */ + format40_disk_super_block *sb_copy = &sb; + tree_level height; + reiser4_block_nr root_block; + node_plugin *nplug; + + cassert(sizeof sb == 512); + + assert("vs-475", s != NULL); + assert("vs-474", get_super_private(s)); + + /* initialize reiser4_super_info_data */ + sbinfo = get_super_private(s); + + *stage = NONE_DONE; + + result = consult_diskmap(s); + if (result) + return result; + *stage = CONSULT_DISKMAP; + + super_bh = find_a_disk_format40_super_block(s); + if (IS_ERR(super_bh)) + return PTR_ERR(super_bh); + brelse(super_bh); + *stage = FIND_A_SUPER; + + /* map jnodes for journal control blocks (header, footer) to disk */ + result = init_journal_info(s); + if (result) + return result; + *stage = INIT_JOURNAL_INFO; + + result = eflush_init_at(s); + if (result) 
+ return result; + *stage = INIT_EFLUSH; + + /* ok, we are sure that filesystem format is a format40 format */ + /* Now check it's state */ + result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR); + if (result != 0 && result != -EINVAL) + /* -EINVAL means there is no magic, so probably just old + * fs. */ + return result; + *stage = INIT_STATUS; + + result = reiser4_status_query(NULL, NULL); + if (result == REISER4_STATUS_MOUNT_WARN) + printk("Warning, mounting filesystem with errors\n"); + if (result == REISER4_STATUS_MOUNT_RO) { + printk("Warning, mounting filesystem with fatal errors, forcing read-only mount\n"); + /* FIXME: here we should actually enforce read-only mount, + * only it is unsupported yet. */ + } + + result = reiser4_journal_replay(s); + if (result) + return result; + *stage = JOURNAL_REPLAY; + + super_bh = read_super_block(s); + if (IS_ERR(super_bh)) + return PTR_ERR(super_bh); + *stage = READ_SUPER; + + xmemcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data), sizeof (*sb_copy)); + brelse(super_bh); + + if (!equi(REISER4_LARGE_KEY, + get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) { + warning("nikita-3228", "Key format mismatch. " + "Only %s keys are supported.", + REISER4_LARGE_KEY ? 
"large" : "small"); + return RETERR(-EINVAL); + } + *stage = KEY_CHECK; + + result = oid_init_allocator(s, get_format40_file_count(sb_copy), get_format40_oid(sb_copy)); + if (result) + return result; + *stage = INIT_OID; + + /* initializing tail policy */ + sbinfo->plug.t = formatting_plugin_by_id(get_format40_formatting_policy(sb_copy)); + assert("umka-751", sbinfo->plug.t != NULL); + + /* get things necessary to init reiser4_tree */ + root_block = get_format40_root_block(sb_copy); + height = get_format40_tree_height(sb_copy); + nplug = node_plugin_by_id(NODE40_ID); + + sbinfo->tree.super = s; + /* init reiser4_tree for the filesystem */ + result = init_tree(&sbinfo->tree, &root_block, height, nplug); + if (result) + return result; + *stage = INIT_TREE; + + /* initialize reiser4_super_info_data */ + sbinfo->default_uid = 0; + sbinfo->default_gid = 0; + + reiser4_set_mkfs_id(s, get_format40_mkfs_id(sb_copy)); + reiser4_set_block_count(s, get_format40_block_count(sb_copy)); + reiser4_set_free_blocks(s, get_format40_free_blocks(sb_copy)); + + sbinfo->fsuid = 0; + sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories + * are not supported */ + sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in + * layout 40 are + * of one + * plugin */ + /* sbinfo->tmgr is initialized already */ + + /* recover sb data which were logged separately from sb block */ + + /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls + * oid_init_allocator() and reiser4_set_free_blocks() with new + * data. What's the reason to call them above? */ + result = reiser4_journal_recover_sb_data(s); + if (result != 0) + return result; + *stage = JOURNAL_RECOVER; + + /* Set number of used blocks. The number of used blocks is not stored + neither in on-disk super block nor in the journal footer blocks. 
At + this moment actual values of total blocks and free block counters are + set in the reiser4 super block (in-memory structure) and we can + calculate number of used blocks from them. */ + reiser4_set_data_blocks(s, + reiser4_block_count(s) - reiser4_free_blocks(s)); + +#if REISER4_DEBUG + sbinfo->min_blocks_used = + 16 /* reserved area */ + + 2 /* super blocks */ + + 2 /* journal footer and header */; +#endif + + /* init disk space allocator */ + result = sa_init_allocator(get_space_allocator(s), s, 0); + if (result) + return result; + *stage = INIT_SA; + + result = get_super_jnode(s); + if (result == 0) + *stage = ALL_DONE; + return result; +} + +/* plugin->u.format.get_ready */ +reiser4_internal int +get_ready_format40(struct super_block *s, void *data UNUSED_ARG) +{ + int result; + format40_init_stage stage; + + result = try_init_format40(s, &stage); + switch (stage) { + case ALL_DONE: + assert("nikita-3458", result == 0); + break; + case INIT_JNODE: + done_super_jnode(s); + case INIT_SA: + sa_destroy_allocator(get_space_allocator(s), s); + case JOURNAL_RECOVER: + case INIT_TREE: + done_tree(&get_super_private(s)->tree); + case INIT_OID: + case KEY_CHECK: + case READ_SUPER: + case JOURNAL_REPLAY: + case INIT_STATUS: + reiser4_status_finish(); + case INIT_EFLUSH: + eflush_done_at(s); + case INIT_JOURNAL_INFO: + done_journal_info(s); + case FIND_A_SUPER: + case CONSULT_DISKMAP: + case NONE_DONE: + break; + default: + impossible("nikita-3457", "init stage: %i", stage); + } + return result; +} + +static void +pack_format40_super(const struct super_block *s, char *data) +{ + format40_disk_super_block *super_data = (format40_disk_super_block *) data; + reiser4_super_info_data *sbinfo = get_super_private(s); + + assert("zam-591", data != NULL); + + cputod64(reiser4_free_committed_blocks(s), &super_data->free_blocks); + cputod64(sbinfo->tree.root_block, &super_data->root_block); + + cputod64(oid_next(s), &super_data->oid); + cputod64(oids_used(s), 
&super_data->file_count); + + cputod16(sbinfo->tree.height, &super_data->tree_height); +} + +/* plugin->u.format.log_super + return a jnode which should be added to transaction when the super block + gets logged */ +reiser4_internal jnode * +log_super_format40(struct super_block *s) +{ + jnode *sb_jnode; + + sb_jnode = get_super_private(s)->u.format40.sb_jnode; + + jload(sb_jnode); + + pack_format40_super(s, jdata(sb_jnode)); + + jrelse(sb_jnode); + + return sb_jnode; +} + +/* plugin->u.format.release */ +reiser4_internal int +release_format40(struct super_block *s) +{ + int ret; + reiser4_super_info_data *sbinfo; + + sbinfo = get_super_private(s); + assert("zam-579", sbinfo != NULL); + + /* FIXME-UMKA: Should we tell block transaction manager to commit all if + * we will have no space left? */ + if (reiser4_grab_space(1, BA_RESERVED)) + return RETERR(-ENOSPC); + + if (!rofs_super(s)) { + ret = capture_super_block(s); + if (ret != 0) + warning("vs-898", "capture_super_block failed: %d", ret); + + ret = txnmgr_force_commit_all(s, 1); + if (ret != 0) + warning("jmacd-74438", "txn_force failed: %d", ret); + } + if (reiser4_is_debugged(s, REISER4_STATS_ON_UMOUNT)) + print_fs_info("umount ok", s); + + /* shutdown daemon if last mount is removed. This should be done + * before disk format is shut down. 
*/ + ktxnmgrd_detach(&sbinfo->tmgr); + + /*done_tree(&sbinfo->tree);*/ + + sa_destroy_allocator(&sbinfo->space_allocator, s); + done_journal_info(s); + eflush_done_at(s); + done_super_jnode(s); + + return 0; +} + +#define FORMAT40_ROOT_LOCALITY 41 +#define FORMAT40_ROOT_OBJECTID 42 + +/* plugin->u.format.root_dir_key */ +reiser4_internal const reiser4_key * +root_dir_key_format40(const struct super_block *super UNUSED_ARG) +{ + static const reiser4_key FORMAT40_ROOT_DIR_KEY = { + .el = {{(FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR}, +#if REISER4_LARGE_KEY + {0ull}, +#endif + {FORMAT40_ROOT_OBJECTID}, {0ull}} + }; + + return &FORMAT40_ROOT_DIR_KEY; +} + +/* plugin->u.format.print_info */ +reiser4_internal void +print_info_format40(const struct super_block *s) +{ +#if 0 + format40_disk_super_block *sb_copy; + + sb_copy = &get_super_private(s)->u.format40.actual_sb; + + printk("\tblock count %llu\n" + "\tfree blocks %llu\n" + "\troot_block %llu\n" + "\ttail policy %s\n" + "\tmin free oid %llu\n" + "\tfile count %llu\n" + "\ttree height %d\n", + get_format40_block_count(sb_copy), + get_format40_free_blocks(sb_copy), + get_format40_root_block(sb_copy), + formatting_plugin_by_id(get_format40_formatting_policy(sb_copy))->h.label, + get_format40_oid(sb_copy), get_format40_file_count(sb_copy), get_format40_tree_height(sb_copy)); +#endif +} + +reiser4_internal int +check_mount_format40(const struct super_block *s) { + int res; + + /* Check the structure of allocator. Namely, the CRC. */ + if ((res = sa_check_struct(get_space_allocator(s), s))) + return res; + + /* Some other checks? Like the root block must be used, etc */ + return 0; +} + +/* plugin->u.format.check_open. + Check the opened object for validness. For now it checks for the valid oid & + locality only, can be improved later and it its work may depend on the mount + options. 
*/
+reiser4_internal int
+check_open_format40(const struct inode *object) {
+	oid_t max, oid;
+
+	max = oid_next(object->i_sb) - 1;	/* presumably the largest oid handed out so far - confirm against oid_next() */
+
+	/* Check the oid. */
+	oid = get_inode_oid(object);
+	if (oid > max) {
+		warning("vpf-1360", "The object with the oid %llu greater then the "
+			"max used oid %llu found.", (unsigned long long)oid, (unsigned long long)max);
+		return RETERR(-EIO);
+	}
+
+	/* Check the locality. Casts: %llu needs unsigned long long whatever oid_t is typedef'd to. */
+	oid = reiser4_inode_data(object)->locality_id;
+	if (oid > max) {
+		warning("vpf-1360", "The object with the locality %llu greater then the "
+			"max used oid %llu found.", (unsigned long long)oid, (unsigned long long)max);
+		return RETERR(-EIO);
+	}
+
+	return 0;
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.4-ck1/fs/reiser4/plugin/disk_format/disk_format40.h
--- linux-2.6.4/fs/reiser4/plugin/disk_format/disk_format40.h	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/disk_format/disk_format40.h	2004-03-11 22:45:15.299508653 +1100
@@ -0,0 +1,101 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* this file contains:
+   - definition of ondisk super block of standard disk layout for
+     reiser 4.0 (layout 40)
+   - definition of layout 40 specific portion of in-core super block
+   - declarations of functions implementing methods of layout plugin
+     for layout 40
+   - declarations of functions used to get/set fields in layout 40 super block
+*/
+
+#ifndef __DISK_FORMAT40_H__
+#define __DISK_FORMAT40_H__
+
+/* magic for default reiser4 layout */
+#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
+#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
+
+#include "../../dformat.h"
+
+#include <linux/fs.h>		/* for struct super_block */
+
+typedef enum {
+	FORMAT40_LARGE_KEYS
+} format40_flags;
+
+/* ondisk super block for format 40.
It is 512 bytes long */
+typedef struct format40_disk_super_block {
+	/*   0 */ d64 block_count;
+	/* number of blocks in a filesystem */
+	/*   8 */ d64 free_blocks;
+	/* number of free blocks */
+	/*  16 */ d64 root_block;
+	/* filesystem tree root block */
+	/*  24 */ d64 oid;
+	/* smallest free objectid */
+	/*  32 */ d64 file_count;
+	/* number of files in a filesystem */
+	/*  40 */ d64 flushes;
+	/* number of times super block was
+	   flushed. Needed if format 40
+	   will have few super blocks */
+	/*  48 */ d32 mkfs_id;
+	/* unique identifier of fs */
+	/*  52 */ char magic[16];
+	/* magic string ReIsEr40FoRmAt */
+	/*  68 */ d16 tree_height;
+	/* height of filesystem tree */
+	/*  70 */ d16 formatting_policy;
+	/*  72 */ d64 flags;	/* bit mask; bit numbers come from format40_flags */
+	/*  80 */ char not_used[432];	/* padding: 80 + 432 = 512 bytes */
+} format40_disk_super_block;
+
+/* format 40 specific part of reiser4_super_info_data */
+typedef struct format40_super_info {
+/*	format40_disk_super_block actual_sb; */
+	jnode *sb_jnode;
+	struct {
+		reiser4_block_nr super;
+	} loc;
+} format40_super_info;
+
+/* Defines for journal header and footer respectively. */
+#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
+	((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
+
+#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
+	((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
+
+#define FORMAT40_STATUS_BLOCKNR \
+	((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
+
+/* Diskmap declarations */
+#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
+#define FORMAT40_SUPER	1
+#define FORMAT40_JH	2
+#define FORMAT40_JF	3
+
+/* declarations of functions implementing methods of layout plugin for
+   format 40.
The functions themselves are in disk_format40.c */
+int get_ready_format40(struct super_block *, void *data);
+const reiser4_key *root_dir_key_format40(const struct super_block *);
+int release_format40(struct super_block *s);
+jnode *log_super_format40(struct super_block *s);
+void print_info_format40(const struct super_block *s);
+int check_mount_format40(const struct super_block *s);
+int check_open_format40(const struct inode *object);
+
+/* __DISK_FORMAT40_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.4-ck1/fs/reiser4/plugin/disk_format/disk_format.c
--- linux-2.6.4/fs/reiser4/plugin/disk_format/disk_format.c	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/disk_format/disk_format.c	2004-03-11 22:45:15.296509120 +1100
@@ -0,0 +1,39 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "../../debug.h"
+#include "../plugin_header.h"
+#include "disk_format40.h"
+#include "disk_format.h"
+#include "../plugin.h"
+
+/* initialization of disk layout plugins: table indexed by disk format id, one entry per format */
+disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
+	[FORMAT40_ID] = {
+		.h = {
+			.type_id = REISER4_FORMAT_PLUGIN_TYPE,
+			.id = FORMAT40_ID,
+			.pops = NULL,
+			.label = "reiser40",
+			.desc = "standard disk layout for reiser40",
+			.linkage = TYPE_SAFE_LIST_LINK_ZERO,
+		},
+		.get_ready = get_ready_format40,
+		.root_dir_key = root_dir_key_format40,
+		.release = release_format40,
+		.log_super = log_super_format40,
+		.print_info = print_info_format40,
+		.check_mount = check_mount_format40,
+		.check_open = check_open_format40
+	}
+};
+
+/* Make Linus happy.
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.4-ck1/fs/reiser4/plugin/disk_format/disk_format.h --- linux-2.6.4/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/disk_format/disk_format.h 2004-03-11 22:45:15.297508964 +1100 @@ -0,0 +1,41 @@ +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* identifiers for disk layouts, they are also used as indexes in array of disk + plugins */ + +#if !defined( __REISER4_DISK_FORMAT_H__ ) +#define __REISER4_DISK_FORMAT_H__ + +typedef enum { + /* standard reiser4 disk layout plugin id */ + FORMAT40_ID, + LAST_FORMAT_ID +} disk_format_id; + +/* __REISER4_DISK_FORMAT_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ + + + + + + + + + + + + + + diff -Naurp linux-2.6.4/fs/reiser4/plugin/file/file.c linux-2.6.4-ck1/fs/reiser4/plugin/file/file.c --- linux-2.6.4/fs/reiser4/plugin/file/file.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/file/file.c 2004-03-11 22:45:15.303508031 +1100 @@ -0,0 +1,2338 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "../../inode.h" +#include "../../super.h" +#include "../../tree_walk.h" +#include "../../carry.h" +#include "../../page_cache.h" +#include "../../ioctl.h" +#include "../object.h" +#include "../../prof.h" +#include "../../safe_link.h" +#include "funcs.h" + +#include + +/* this file contains file plugin methods of regular reiser4 files. 
Those files are either built of tail items only (FORMATTING_ID) or + of extent items only (EXTENT_POINTER_ID) or empty (have no items but stat data) */ + +static int unpack(struct inode *inode, int forever, int locked); + +/* get unix file plugin specific portion of inode */ +reiser4_internal inline unix_file_info_t * +unix_file_inode_data(const struct inode * inode) +{ + return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info; +} + +static int file_is_built_of_tails(const struct inode *inode) +{ + return unix_file_inode_data(inode)->container == UF_CONTAINER_TAILS; +} + +reiser4_internal int +file_is_built_of_extents(const struct inode *inode) +{ + return unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS; +} + +reiser4_internal int file_is_empty(const struct inode *inode) +{ + return unix_file_inode_data(inode)->container == UF_CONTAINER_EMPTY; +} + +reiser4_internal void set_file_state_extents(struct inode *inode) +{ + unix_file_inode_data(inode)->container = UF_CONTAINER_EXTENTS; +} + +reiser4_internal void set_file_state_tails(struct inode *inode) +{ + unix_file_inode_data(inode)->container = UF_CONTAINER_TAILS; +} + +static void set_file_state_empty(struct inode *inode) +{ + unix_file_inode_data(inode)->container = UF_CONTAINER_EMPTY; +} + +static int +less_than_ldk(znode *node, const reiser4_key *key) +{ + return UNDER_RW(dk, current_tree, read, keylt(key, znode_get_ld_key(node))); +} + +reiser4_internal int +equal_to_rdk(znode *node, const reiser4_key *key) +{ + return UNDER_RW(dk, current_tree, read, keyeq(key, znode_get_rd_key(node))); +} + +#if REISER4_DEBUG + +static int +less_than_rdk(znode *node, const reiser4_key *key) +{ + return UNDER_RW(dk, current_tree, read, keylt(key, znode_get_rd_key(node))); +} + +int +equal_to_ldk(znode *node, const reiser4_key *key) +{ + return UNDER_RW(dk, current_tree, read, keyeq(key, znode_get_ld_key(node))); +} + +/* get key of item next to one @coord is set to */ +static reiser4_key * 
+get_next_item_key(const coord_t *coord, reiser4_key *next_key) +{ + if (coord->item_pos == node_num_items(coord->node) - 1) { + /* get key of next item if it is in right neighbor */ + UNDER_RW_VOID(dk, znode_get_tree(coord->node), read, + *next_key = *znode_get_rd_key(coord->node)); + } else { + /* get key of next item if it is in the same node */ + coord_t next; + + coord_dup_nocheck(&next, coord); + next.unit_pos = 0; + check_me("vs-730", coord_next_item(&next) == 0); + item_key_by_coord(&next, next_key); + } + return next_key; +} + +static int +item_of_that_file(const coord_t *coord, const reiser4_key *key) +{ + reiser4_key max_possible; + item_plugin *iplug; + + iplug = item_plugin_by_coord(coord); + assert("vs-1011", iplug->b.max_key_inside); + return keylt(key, iplug->b.max_key_inside(coord, &max_possible)); +} + +static int +check_coord(const coord_t *coord, const reiser4_key *key) +{ + coord_t twin; + + if (!REISER4_DEBUG) + return 1; + node_plugin_by_node(coord->node)->lookup(coord->node, key, FIND_MAX_NOT_MORE_THAN, &twin); + return coords_equal(coord, &twin); +} + +#endif /* REISER4_DEBUG */ + +static inline void +invalidate_extended_coord(uf_coord_t *uf_coord) +{ + coord_clear_iplug(&uf_coord->base_coord); + uf_coord->valid = 0; +} + +static inline void +validate_extended_coord(uf_coord_t *uf_coord, loff_t offset) +{ + assert("vs-1333", uf_coord->valid == 0); + assert("vs-1348", item_plugin_by_coord(&uf_coord->base_coord)->s.file.init_coord_extension); + + /* FIXME: */ + item_body_by_coord(&uf_coord->base_coord); + item_plugin_by_coord(&uf_coord->base_coord)->s.file.init_coord_extension(uf_coord, offset); +} + +reiser4_internal write_mode_t +how_to_write(uf_coord_t *uf_coord, const reiser4_key *key) +{ + write_mode_t result; + coord_t *coord; + ON_DEBUG(reiser4_key check); + + coord = &uf_coord->base_coord; + + assert("vs-1252", znode_is_wlocked(coord->node)); + assert("vs-1253", znode_is_loaded(coord->node)); + + if (uf_coord->valid == 1) { + 
assert("vs-1332", check_coord(coord, key)); + return (coord->between == AFTER_UNIT) ? APPEND_ITEM : OVERWRITE_ITEM; + } + + if (less_than_ldk(coord->node, key)) { + assert("vs-1014", get_key_offset(key) == 0); + + coord_init_before_first_item(coord, coord->node); + uf_coord->valid = 1; + result = FIRST_ITEM; + goto ok; + } + + assert("vs-1335", less_than_rdk(coord->node, key)); + + if (node_is_empty(coord->node)) { + assert("vs-879", znode_get_level(coord->node) == LEAF_LEVEL); + assert("vs-880", get_key_offset(key) == 0); + /* + * Situation that check below tried to handle is follows: some + * other thread writes to (other) file and has to insert empty + * leaf between two adjacent extents. Generally, we are not + * supposed to muck with this node. But it is possible that + * said other thread fails due to some error (out of disk + * space, for example) and leaves empty leaf + * lingering. Nothing prevents us from reusing it. + */ + assert("vs-1000", UNDER_RW(dk, current_tree, read, + keylt(key, znode_get_rd_key(coord->node)))); + assert("vs-1002", coord->between == EMPTY_NODE); + result = FIRST_ITEM; + uf_coord->valid = 1; + goto ok; + } + + assert("vs-1336", coord->item_pos < node_num_items(coord->node)); + assert("vs-1007", ergo(coord->between == AFTER_UNIT || coord->between == AT_UNIT, keyle(item_key_by_coord(coord, &check), key))); + assert("vs-1008", ergo(coord->between == AFTER_UNIT || coord->between == AT_UNIT, keylt(key, get_next_item_key(coord, &check)))); + + switch(coord->between) { + case AFTER_ITEM: + uf_coord->valid = 1; + result = FIRST_ITEM; + break; + case AFTER_UNIT: + assert("vs-1323", (item_is_tail(coord) || item_is_extent(coord)) && item_of_that_file(coord, key)); + assert("vs-1208", keyeq(item_plugin_by_coord(coord)->s.file.append_key(coord, &check), key)); + result = APPEND_ITEM; + validate_extended_coord(uf_coord, get_key_offset(key)); + break; + case AT_UNIT: + /* FIXME: it would be nice to check that coord matches to key */ + 
assert("vs-1324", (item_is_tail(coord) || item_is_extent(coord)) && item_of_that_file(coord, key)); + validate_extended_coord(uf_coord, get_key_offset(key)); + result = OVERWRITE_ITEM; + break; + default: + assert("vs-1337", 0); + result = OVERWRITE_ITEM; + break; + } + +ok: + assert("vs-1349", uf_coord->valid == 1); + assert("vs-1332", check_coord(coord, key)); + return result; +} + +#if 0 +/* update inode's timestamps and size. If any of these change - update sd as well */ +reiser4_internal int +update_inode_and_sd_if_necessary(struct inode *inode, + loff_t new_size, + int update_i_size, int update_times) +{ + int result; + int inode_changed; + + result = 0; + + /* FIXME: no need to avoid mark_inode_dirty call. It does not do anything but "capturing" inode */ + inode_changed = 0; + + if (update_i_size && (inode->i_size != new_size)) { + INODE_SET_FIELD(inode, i_size, new_size); + inode_changed = 1; + } + + if (update_times && (inode->i_ctime.tv_sec != get_seconds() || + inode->i_mtime.tv_sec != get_seconds())) { + /* time stamps are to be updated */ + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode_changed = 1; + } + + if (inode_changed) { + assert("vs-946", !inode_get_flag(inode, REISER4_NO_SD)); + /* update sd inode */ + result = reiser4_update_sd(inode); + if (result) + warning("vs-636", "updating stat data failed: %i", result); + } + + return result; +} + +#endif + +/* look for item of file @inode corresponding to @key */ + +#ifdef PSEUDO_CODE_CAN_COMPILE +find_item_obsolete() +{ + if (!coord && seal) + set coord based on seal; + if (coord) { + if (key_in_coord(coord)) + return coord; + coord = get_next_item(coord); + if (key_in_coord(coord)) + return coord; + } + coord_by_key(); +} +find_item() +{ + if (seal_is_set) { + set coord by seal; + if (key is in coord) + return; + if (key is right delim key) { + get right neighbor; + return first unit in it; + } + } +} +#endif + +/* obtain lock on right neighbor and drop lock on current node */ 
+reiser4_internal int +goto_right_neighbor(coord_t * coord, lock_handle * lh) +{ + int result; + lock_handle lh_right; + + assert("vs-1100", znode_is_locked(coord->node)); + + init_lh(&lh_right); + result = reiser4_get_right_neighbor( + &lh_right, coord->node, + znode_is_wlocked(coord->node) ? ZNODE_WRITE_LOCK : ZNODE_READ_LOCK, + GN_CAN_USE_UPPER_LEVELS); + if (result) { + done_lh(&lh_right); + return result; + } + + done_lh(lh); + + coord_init_first_unit_nocheck(coord, lh_right.node); + move_lh(lh, &lh_right); + + return 0; + +} + +/* this is to be used after find_file_item to determine real state of file */ +static void +set_file_state(unix_file_info_t *uf_info, int cbk_result, tree_level level) +{ + if (!uf_info) + return; + + assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL); + + if (uf_info->container == UF_CONTAINER_UNKNOWN) { + if (cbk_result == CBK_COORD_NOTFOUND) + uf_info->container = UF_CONTAINER_EMPTY; + else if (level == LEAF_LEVEL) + uf_info->container = UF_CONTAINER_TAILS; + else + uf_info->container = UF_CONTAINER_EXTENTS; + } else { + /* file state is know, check that it is set correctly */ + assert("vs-1161", ergo(cbk_result == CBK_COORD_NOTFOUND, + uf_info->container == UF_CONTAINER_EMPTY)); + assert("vs-1162", ergo(level == LEAF_LEVEL && cbk_result == CBK_COORD_FOUND, + uf_info->container == UF_CONTAINER_TAILS)); + assert("vs-1165", ergo(level == TWIG_LEVEL && cbk_result == CBK_COORD_FOUND, + uf_info->container == UF_CONTAINER_EXTENTS)); + } +} + +reiser4_internal int +find_file_item(hint_t *hint, /* coord, lock handle and seal are here */ + const reiser4_key *key, /* key of position in a file of next read/write */ + znode_lock_mode lock_mode, /* which lock (read/write) to put on returned node */ + __u32 cbk_flags, /* coord_by_key flags: CBK_UNIQUE [| CBK_FOR_INSERT] */ + ra_info_t *ra_info, + unix_file_info_t *uf_info) +{ + int result; + coord_t *coord; + lock_handle *lh; + + assert("nikita-3030", schedulable()); + + /* collect 
statistics on the number of calls to this function */ + reiser4_stat_inc(file.find_file_item); + + /* @hint is dereferenced right here (and again before every return below), so it must be non-NULL: a NULL test after this point would be dead code */ + coord = &hint->coord.base_coord; + lh = hint->coord.lh; + init_lh(lh); + + /* fast path: try to reuse the position sealed by a previous access; hint_validate returns 0 when the seal is still valid for @key and @lock_mode */ + result = hint_validate(hint, key, 1/*check key*/, lock_mode); + if (!result) { + if (coord->between == AFTER_UNIT && equal_to_rdk(coord->node, key)) { + result = goto_right_neighbor(coord, lh); + if (result == -E_NO_NEIGHBOR) + return RETERR(-EIO); + if (result) + return result; + assert("vs-1152", equal_to_ldk(coord->node, key)); + /* we moved to different node. Invalidate coord extension, zload is necessary to init it + again */ + hint->coord.valid = 0; + reiser4_stat_inc(file.find_file_item_via_right_neighbor); + } else { + reiser4_stat_inc(file.find_file_item_via_seal); + } + + set_file_state(uf_info, CBK_COORD_FOUND, znode_get_level(coord->node)); + return CBK_COORD_FOUND; + } + + /* collect statistics on the number of calls to this function which did not get optimized */ + reiser4_stat_inc(file.find_file_item_via_cbk); + + coord_init_zero(coord); + if (uf_info != NULL) { + result = object_lookup(unix_file_info_to_inode(uf_info), + key, + coord, + lh, + lock_mode, + FIND_MAX_NOT_MORE_THAN, + TWIG_LEVEL, + LEAF_LEVEL, + cbk_flags, + ra_info); + } else { + result = coord_by_key(current_tree, + key, + coord, + lh, + lock_mode, + FIND_MAX_NOT_MORE_THAN, + TWIG_LEVEL, + LEAF_LEVEL, + cbk_flags, + ra_info); + } + if (!IS_CBKERR(result)) + set_file_state(uf_info, result, znode_get_level(coord->node)); + + /* FIXME: we might already have coord extension initialized */ + hint->coord.valid = 0; + return result; +} + +/* plugin->u.file.write_flowom = NULL + plugin->u.file.read_flow = NULL */ + +reiser4_internal void +hint_init_zero(hint_t *hint, lock_handle *lh) +{ + xmemset(hint, 0, sizeof (*hint)); + hint->coord.lh = lh; +} + +/* find position of last byte of last item of the file plus 1.
This is used by truncate and mmap to find real file + size */ +static int +find_file_size(struct inode *inode, loff_t *file_size) +{ + int result; + reiser4_key key; + hint_t hint; + coord_t *coord; + lock_handle lh; + item_plugin *iplug; + + assert("vs-1247", inode_file_plugin(inode)->key_by_inode == key_by_inode_unix_file); + key_by_inode_unix_file(inode, get_key_offset(max_key()), &key); + + hint_init_zero(&hint, &lh); + result = find_file_item(&hint, &key, ZNODE_READ_LOCK, CBK_UNIQUE, 0/* ra_info */, unix_file_inode_data(inode)); + if (result == CBK_COORD_NOTFOUND) { + /* there are no items of this file */ + done_lh(&lh); + *file_size = 0; + return 0; + } + + if (result != CBK_COORD_FOUND) { + /* error occured */ + done_lh(&lh); + return result; + } + + coord = &hint.coord.base_coord; + + /* there are items of this file (at least one) */ + coord_clear_iplug(coord); + result = zload(coord->node); + if (unlikely(result)) { + done_lh(&lh); + return result; + } + iplug = item_plugin_by_coord(coord); + + assert("vs-853", iplug->s.file.append_key); + iplug->s.file.append_key(coord, &key); + + *file_size = get_key_offset(&key); + + zrelse(coord->node); + done_lh(&lh); + + return 0; +} + +/* estimate and reserve space needed to truncate page which gets partially truncated: one block for page itself, stat + data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item) which may happen + if page corresponds to hole extent and unallocated one will have to be created */ +static int reserve_partial_page(reiser4_tree *tree) +{ + grab_space_enable(); + return reiser4_grab_reserved(reiser4_get_current_sb(), + 1 + + 2 * estimate_one_insert_into_item(tree), + BA_CAN_COMMIT); +} + +/* estimate and reserve space needed to cut one item and update one stat data */ +reiser4_internal int reserve_cut_iteration(reiser4_tree *tree) +{ + __u64 estimate = estimate_one_item_removal(tree) + + estimate_one_insert_into_item(tree); + + assert("nikita-3172", 
lock_stack_isclean(get_current_lock_stack())); + + grab_space_enable(); + /* We need to double our estimate now that we can delete more than one + node. */ + return reiser4_grab_reserved(reiser4_get_current_sb(), estimate*2, + BA_CAN_COMMIT); +} + +/* cut file items one by one starting from the last one until new file size (inode->i_size) is reached. Reserve space + and update file stat data on every single cut from the tree */ +static int +cut_file_items(struct inode *inode, loff_t new_size, int update_sd, loff_t cur_size) +{ + reiser4_key from_key, to_key; + reiser4_key smallest_removed; + int result; + + assert("vs-1248", inode_file_plugin(inode)->key_by_inode == key_by_inode_unix_file); + key_by_inode_unix_file(inode, new_size, &from_key); + to_key = from_key; + set_key_offset(&to_key, cur_size - 1/*get_key_offset(max_key())*/); + /* this loop normally runs just once */ + while (1) { + result = reserve_cut_iteration(tree_by_inode(inode)); + if (result) + break; + + result = cut_tree_object(current_tree, &from_key, &to_key, + &smallest_removed, inode); + if (result == -E_REPEAT) { + /* -E_REPEAT is a signal to interrupt a long file truncation process */ + if (update_sd) { + INODE_SET_FIELD(inode, i_size, get_key_offset(&smallest_removed)); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + result = reiser4_update_sd(inode); + if (result) + break; + } + + all_grabbed2free(); + reiser4_release_reserved(inode->i_sb); + + /* cut_tree_object() was interrupted probably because + * current atom requires commit, we have to release + * transaction handle to allow atom commit. 
*/ + { + reiser4_context * ctx; + long long_ret; + + ctx = get_current_context(); + long_ret = txn_end(ctx); + txn_begin(ctx); + if (long_ret < 0) { + result = (int)long_ret; + break; + } + } + continue; + } + if (result) + break; + + /* Final sd update after the file gets its correct size */ + INODE_SET_FIELD(inode, i_size, new_size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + result = reiser4_update_sd(inode); + break; + } + + all_grabbed2free(); + reiser4_release_reserved(inode->i_sb); + + return result; +} + +int find_or_create_extent(struct page *page); + +/* part of unix_file_truncate: it is called when truncate is used to make file shorter */ +static int +shorten_file(struct inode *inode, loff_t new_size, int update_sd, loff_t cur_size) +{ + int result; + struct page *page; + int padd_from; + unsigned long index; + char *kaddr; + + assert("vs-1106", inode->i_size > new_size); + + /* all items of ordinary reiser4 file are grouped together. That is why we can use cut_tree. 
Plan B files (for + instance) can not be truncated that simply */ + result = cut_file_items(inode, new_size, update_sd, cur_size); + if (result) + return result; + + assert("vs-1105", new_size == inode->i_size); + if (inode->i_size == 0) { + set_file_state_empty(inode); + return 0; + } + + if (file_is_built_of_tails(inode)) + /* No need to worry about zeroing last page after new file end */ + return 0; + + padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1); + if (!padd_from) + /* file is truncated to page boundary */ + return 0; + + result = reserve_partial_page(tree_by_inode(inode)); + if (result) { + reiser4_release_reserved(inode->i_sb); + return result; + } + + /* last page is partially truncated - zero its content */ + index = (inode->i_size >> PAGE_CACHE_SHIFT); + page = read_cache_page(inode->i_mapping, index, readpage_unix_file/*filler*/, 0); + if (IS_ERR(page)) { + all_grabbed2free(); + reiser4_release_reserved(inode->i_sb); + if (likely(PTR_ERR(page) == -EINVAL)) { + /* looks like file is built of tail items */ + return 0; + } + return PTR_ERR(page); + } + wait_on_page_locked(page); + if (!PageUptodate(page)) { + all_grabbed2free(); + page_cache_release(page); + reiser4_release_reserved(inode->i_sb); + return RETERR(-EIO); + } + + /* if page correspons to hole extent unit - unallocated one will be created here. This is not necessary */ + result = find_or_create_extent(page); + + /* FIXME: cut_file_items has already updated inode. 
Probably it would be better to update it here when file is + really truncated */ + all_grabbed2free(); + if (result) { + page_cache_release(page); + reiser4_release_reserved(inode->i_sb); + return result; + } + + lock_page(page); + assert("vs-1066", PageLocked(page)); + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + padd_from, 0, PAGE_CACHE_SIZE - padd_from); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + unlock_page(page); + page_cache_release(page); + reiser4_release_reserved(inode->i_sb); + return 0; +} + +static loff_t +write_flow(struct file *file, unix_file_info_t *uf_info, const char *buf, loff_t count, loff_t pos); + +/* it is called when truncate is used to make file longer and when write position is set past real end of file. It + appends file which has size @cur_size with hole of certain size (@hole_size). It returns 0 on success, error code + otherwise */ +static int +append_hole(unix_file_info_t *uf_info, loff_t new_size) +{ + int result; + loff_t written; + loff_t hole_size; + + assert("vs-1107", unix_file_info_to_inode(uf_info)->i_size < new_size); + + result = 0; + hole_size = new_size - unix_file_info_to_inode(uf_info)->i_size; + written = write_flow(0/*file*/, uf_info, 0/*buf*/, hole_size, + unix_file_info_to_inode(uf_info)->i_size); + if (written != hole_size) { + /* return error because file is not expanded as required */ + if (written > 0) + result = RETERR(-ENOSPC); + else + result = written; + } else { + assert("vs-1081", + unix_file_info_to_inode(uf_info)->i_size == new_size); + } + return result; +} + +reiser4_internal int +setattr_reserve(reiser4_tree *tree) +{ + assert("vs-1096", is_grab_enabled(get_current_context())); + return reiser4_grab_space(estimate_one_insert_into_item(tree), + BA_CAN_COMMIT); +} + +/* this either cuts or add items of/to the file so that items match new_size. 
It is used in unix_file_setattr when it is + used to truncate +VS-FIXME-HANS: explain that +and in unix_file_delete */ +static int +truncate_file(struct inode *inode, loff_t new_size, int update_sd) +{ + int result; + loff_t cur_size; + + result = find_file_size(inode, &cur_size); + if (result != 0) + return result; + + if (new_size != cur_size) { + INODE_SET_FIELD(inode, i_size, cur_size); + if (cur_size < new_size) + result = append_hole(unix_file_inode_data(inode), + new_size); + else + result = shorten_file(inode, + new_size, update_sd, cur_size); + } else { + /* when file is built of extens - find_file_size can only + * calculate old file size up to page size. Case of not + * changing file size is detected in unix_file_setattr, + * therefore here we have expanding file within its last page + * up to the end of that page */ + assert("vs-1115", + file_is_built_of_extents(inode) || + (file_is_empty(inode) && cur_size == 0)); + assert("vs-1116", (new_size & ~PAGE_CACHE_MASK) == 0); + + /* update stat data */ + if (update_sd) { + result = setattr_reserve(tree_by_inode(inode)); + if (!result && update_sd) { + INODE_SET_FIELD(inode, i_size, cur_size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + result = reiser4_update_sd(inode); + } + all_grabbed2free(); + } + } + return result; +} + +/* plugin->u.file.truncate + all the work is done on reiser4_setattr->unix_file_setattr->truncate_file +*/ +reiser4_internal int +truncate_unix_file(struct inode *inode, loff_t new_size) +{ + return 0; +} + +/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */ + +/* get access hint (seal, coord, key, level) stored in reiser4 private part of + struct file if it was stored in a previous access to the file */ +reiser4_internal int +load_file_hint(struct file *file, hint_t *hint, lock_handle *lh) +{ + reiser4_file_fsdata *fsdata; + + if (file) { + fsdata = reiser4_get_file_fsdata(file); + if (IS_ERR(fsdata)) + return PTR_ERR(fsdata); + + if 
(seal_is_set(&fsdata->reg.hint.seal)) { + *hint = fsdata->reg.hint; + hint->coord.lh = lh; + /* force re-validation of the coord on the first + * iteration of the read/write loop. */ + hint->coord.valid = 0; + return 0; + } + xmemset(&fsdata->reg.hint, 0, sizeof(hint_t)); + } + hint_init_zero(hint, lh); + return 0; +} + + +/* this copies hint for future tree accesses back to reiser4 private part of + struct file */ +reiser4_internal void +save_file_hint(struct file *file, const hint_t *hint) +{ + reiser4_file_fsdata *fsdata; + + if (!file || !seal_is_set(&hint->seal)) + return; + + fsdata = reiser4_get_file_fsdata(file); + assert("vs-965", !IS_ERR(fsdata)); + fsdata->reg.hint = *hint; + return; +} + +reiser4_internal void +unset_hint(hint_t *hint) +{ + assert("vs-1315", hint); + seal_done(&hint->seal); +} + +/* coord must be set properly. So, that set_hint has nothing to do */ +reiser4_internal void +set_hint(hint_t *hint, const reiser4_key *key, znode_lock_mode mode) +{ + ON_DEBUG(coord_t *coord = &hint->coord.base_coord); + assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key))); + + seal_init(&hint->seal, &hint->coord.base_coord, key); + hint->offset = get_key_offset(key); + hint->level = znode_get_level(hint->coord.base_coord.node); + hint->mode = mode; +} + +reiser4_internal int +hint_is_set(const hint_t *hint) +{ + return seal_is_set(&hint->seal); +} + +#if REISER4_DEBUG +static int all_but_offset_key_eq(const reiser4_key *k1, const reiser4_key *k2) +{ + return (get_key_locality(k1) == get_key_locality(k2) && + get_key_type(k1) == get_key_type(k2) && + get_key_band(k1) == get_key_band(k2) && + get_key_ordering(k1) == get_key_ordering(k2) && + get_key_objectid(k1) == get_key_objectid(k2)); +} +#endif + +reiser4_internal int +hint_validate(hint_t *hint, const reiser4_key *key, int check_key, znode_lock_mode lock_mode) +{ + if (!hint || !hint_is_set(hint) || hint->mode != lock_mode) + /* hint either not set or set by different operation */ + return 
RETERR(-E_REPEAT); + + assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key)); + + if (check_key && get_key_offset(key) != hint->offset) + /* hint is set for different key */ + return RETERR(-E_REPEAT); + + return seal_validate(&hint->seal, &hint->coord.base_coord, key, + hint->level, hint->coord.lh, FIND_MAX_NOT_MORE_THAN, lock_mode, ZNODE_LOCK_LOPRI); +} + +/* look for place at twig level for extent corresponding to page, call extent's writepage method to create + unallocated extent if it does not exist yet, initialize jnode, capture page */ +reiser4_internal int +find_or_create_extent(struct page *page) +{ + int result; + lock_handle lh; + hint_t hint; + reiser4_key key; + item_plugin *iplug; + znode *loaded; + + reiser4_stat_inc(file.page_ops.writepage_calls); + + assert("vs-1065", page->mapping && page->mapping->host); + + /* get key of first byte of the page */ + key_by_inode_unix_file(page->mapping->host, (loff_t) page->index << PAGE_CACHE_SHIFT, &key); + + hint_init_zero(&hint, &lh); + result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, CBK_UNIQUE | CBK_FOR_INSERT, 0/*ra_info*/, 0/* inode */); + + if (IS_CBKERR(result)) { + done_lh(&lh); + return result; + } + + result = zload(lh.node); + if (result) { + done_lh(&lh); + return result; + } + + loaded = lh.node; + /* get plugin of extent item */ + iplug = item_plugin_by_id(EXTENT_POINTER_ID); + result = iplug->s.file.capture(&key, + &hint.coord, + page, how_to_write(&hint.coord, &key)); + assert("vs-429378", result != -E_REPEAT); + zrelse(loaded); + done_lh(&lh); + return result; +} + +/* Check mapping for existence of not captured dirty pages */ +static int inode_has_anonymous_pages(struct inode *inode) +{ + int ret; + struct address_space *mapping; + + mapping = inode->i_mapping; + spin_lock (&mapping->page_lock); + ret = !list_empty (get_moved_pages(mapping)); + spin_unlock (&mapping->page_lock); + ret |= (reiser4_inode_data(inode)->eflushed_anon > 0); + + return ret; +} + +static int 
capture_page_and_create_extent(struct page *page) +{ + int result; + struct inode *inode; + + assert("vs-1084", page->mapping && page->mapping->host); + inode = page->mapping->host; + assert("vs-1139", file_is_built_of_extents(inode)); + /* page belongs to file */ + assert("vs-1393", inode->i_size > ((loff_t) page->index << PAGE_CACHE_SHIFT)); + + /* page capture may require extent creation (if it does not exist yet) and stat data's update (number of blocks + changes on extent creation) */ + grab_space_enable (); + result = reiser4_grab_space(2 * estimate_one_insert_into_item(tree_by_inode(inode)), BA_CAN_COMMIT); + if (likely(!result)) + result = find_or_create_extent(page); + + all_grabbed2free(); + if (result != 0) + SetPageError(page); + return result; +} + +/* plugin->u.file.capturepage handler */ +reiser4_internal int +capturepage_unix_file(struct page * page) { + int result; + + page_cache_get(page); + unlock_page(page); + result = capture_page_and_create_extent(page); + lock_page(page); + page_cache_release(page); + return result; +} + +static void redirty_inode(struct inode *inode) +{ + spin_lock(&inode_lock); + inode->i_state |= I_DIRTY; + spin_unlock(&inode_lock); +} + +/* this returns 1 if it captured page */ +static int capture_anonymous_page(struct page *pg, int keepme) +{ + struct address_space *mapping; + int result; + + mapping = pg->mapping; + result = 0; + if (PageWriteback(pg)) { + if (PageDirty(pg)) + list_move(&pg->list, &mapping->dirty_pages); + else + list_move(&pg->list, &mapping->locked_pages); + } else if (!PageDirty(pg) && !keepme) { + list_move(&pg->list, &mapping->clean_pages); + } else { + jnode *node; + + list_move(&pg->list, &mapping->io_pages); + page_cache_get (pg); + + spin_unlock (&mapping->page_lock); + + lock_page(pg); + /* page is guaranteed to be in the mapping, because we are + * operating under rw-semaphore. 
*/ + assert("nikita-3336", pg->mapping == mapping); + node = jnode_of_page(pg); + unlock_page(pg); + if (!IS_ERR(node)) { + result = jload(node); + assert("nikita-3334", result == 0); + assert("nikita-3335", jnode_page(node) == pg); + result = capture_page_and_create_extent(pg); + if (result == 0) { + /* + * node will be captured into atom by + * capture_page_and_create_extent(). Atom + * cannot commit (because we have open + * transaction handle), and node cannot be + * truncated, because we have non-exclusive + * access to the file. + */ + assert("nikita-3327", node->atom != NULL); + JF_CLR(node, JNODE_KEEPME); + result = 1; + } else + warning("nikita-3329", + "Cannot capture anon page: %i", result); + jrelse(node); + jput(node); + } else + result = PTR_ERR(node); + page_cache_release(pg); + spin_lock(&mapping->page_lock); + } + return result; +} + +#define CAPTURE_AJNODE_BURST (128) +#define CAPTURE_APAGE_BURST (1024) + +static int capture_anonymous_jnodes(struct inode *inode) +{ + struct list_head *tmp, *next; + reiser4_inode *info; + reiser4_tree *tree; + int nr; + int result; + int too_many; + int scan_over; + int keepme; + + tree = tree_by_inode(inode); + + info = reiser4_inode_data(inode); + result = 0; + nr = 0; + too_many = 0; + do { + spin_lock_eflush(tree->super); + + scan_over = 1; + + list_for_each_safe(tmp, next, &info->eflushed_jnodes) { + eflush_node_t *ef; + jnode *node; + + ef = list_entry(tmp, eflush_node_t, inode_link); + if (!ef->hadatom) + -- info->eflushed_anon; + node = ef->node; + /* + * anonymous jnode doesn't have an atom. + * + * jnode spin-lock is not needed, because we don't have + * requirement to capture _all_ anonymous jnodes anyway. 
+ */ + if (node->atom != NULL) + continue; + + jref(node); + keepme = JF_ISSET(node, JNODE_KEEPME); + + spin_unlock_eflush(tree->super); + result = jload(node); + jput(node); + if (result != 0) + return result; + + spin_lock(&inode->i_mapping->page_lock); + result = capture_anonymous_page(jnode_page(node), keepme); + spin_unlock(&inode->i_mapping->page_lock); + jrelse(node); + spin_lock_eflush(tree->super); + + if (result == 1) { + /* jnode is captured */ + nr ++; + result = 0; + if (nr >= CAPTURE_AJNODE_BURST) { + too_many = 1; + redirty_inode(inode); + } + + scan_over = 0; + break; + } + } + spin_unlock_eflush(tree->super); + if (too_many) + break; + if (scan_over) + break; + } while (result == 0); + + return result; +} + +static int capture_anonymous_pages(struct address_space * mapping) +{ + struct list_head *mpages; + int result; + int nr; + int captured = 0, clean = 0, writeback = 0; + + result = 0; + nr = 0; + + spin_lock (&mapping->page_lock); + + mpages = get_moved_pages(mapping); + while ((result == 0 || result == 1) && !list_empty (mpages) && nr < CAPTURE_APAGE_BURST) { + struct page *pg = list_entry(mpages->prev, struct page, list); + + assert("vs-1455", PageDirty(pg)); + result = capture_anonymous_page(pg, 0); + if (result == 1) { + ++ nr; + result = 0; + } + } + spin_unlock(&mapping->page_lock); + + if (result) { + warning("vs-1454", "Cannot capture anon pages: %i (%d %d %d)\n", result, captured, clean, writeback); + return result; + } + + if (nr >= CAPTURE_APAGE_BURST) + redirty_inode(mapping->host); + + if (result == 0) + result = capture_anonymous_jnodes(mapping->host); + + if (result != 0) + warning("nikita-3328", "Cannot capture anon pages: %i\n", result); + return result; +} + +/* + * this file plugin method is called to capture into current atom all + * "anonymous pages", that is, pages modified through mmap(2). For each such + * page this function creates jnode, captures this jnode, and creates (or + * modifies) extent. 
Anonymous pages are kept on the special inode list. Some + * of them can be emergency flushed. To cope with this list of eflushed jnodes + * from this inode is scanned. + */ +reiser4_internal int +capture_unix_file(struct inode *inode, struct writeback_control *wbc) +{ + int result; + unix_file_info_t *uf_info; + reiser4_context ctx; + + if (!inode_has_anonymous_pages(inode)) + return 0; + + init_context(&ctx, inode->i_sb); + /* avoid recursive calls to ->sync_inodes */ + ctx.nobalance = 1; + assert("zam-760", lock_stack_isclean(get_current_lock_stack())); + + result = 0; + do { + uf_info = unix_file_inode_data(inode); + /* + * locking: creation of extent requires read-semaphore on + * file. _But_, this function can also be called in the + * context of write system call from + * balance_dirty_pages(). So, write keeps semaphore (possible + * in write mode) on file A, and this function tries to + * acquire semaphore on (possibly) different file B. A/B + * deadlock is on a way. To avoid this try-lock is used + * here. This however leads to the complications in the + * fsync() case, which are not yet handled. + */ + if (rw_latch_try_read(&uf_info->latch) == 0) { + LOCK_CNT_INC(inode_sem_r); + + result = capture_anonymous_pages(inode->i_mapping); + rw_latch_up_read(&uf_info->latch); + LOCK_CNT_DEC(inode_sem_r); + if (result != 0 || wbc->sync_mode != WB_SYNC_ALL) + break; + result = txnmgr_force_commit_all(inode->i_sb, 0); + } else + result = RETERR(-EBUSY); + } while (result == 0 && inode_has_anonymous_pages(inode)); + + reiser4_exit_context(&ctx); + return result; +} + +/* plugin->u.file.readpage + page must be not out of file. 
This is called either via page fault and in that case vp is struct file *file, or on + truncate when last page of a file is to be read to perform its partial truncate and in that case vp is 0 +*/ +reiser4_internal int +readpage_unix_file(void *vp, struct page *page) +{ + int result; + struct inode *inode; + lock_handle lh; + reiser4_key key; + item_plugin *iplug; + hint_t hint; + coord_t *coord; + struct file *file; + + reiser4_stat_inc(file.page_ops.readpage_calls); + + assert("vs-1062", PageLocked(page)); + assert("vs-1061", page->mapping && page->mapping->host); + assert("vs-1078", (page->mapping->host->i_size > ((loff_t) page->index << PAGE_CACHE_SHIFT))); + + inode = page->mapping->host; + + file = vp; + result = load_file_hint(file, &hint, &lh); + if (result) + return result; + + /* get key of first byte of the page */ + key_by_inode_unix_file(inode, (loff_t) page->index << PAGE_CACHE_SHIFT, &key); + + /* look for file metadata corresponding to first byte of page */ + unlock_page(page); + result = find_file_item(&hint, &key, ZNODE_READ_LOCK, CBK_UNIQUE, 0/* ra_info */, unix_file_inode_data(inode)); + lock_page(page); + if (result != CBK_COORD_FOUND) { + /* this indicates file corruption */ + done_lh(&lh); + return result; + } + + if (PageUptodate(page)) { + done_lh(&lh); + unlock_page(page); + return 0; + } + + coord = &hint.coord.base_coord; + coord_clear_iplug(coord); + result = zload(coord->node); + if (result) { + done_lh(&lh); + return result; + } + if (!hint.coord.valid) + validate_extended_coord(&hint.coord, (loff_t) page->index << PAGE_CACHE_SHIFT); + + if (!coord_is_existing_unit(coord)) { + /* this indicates corruption */ + warning("vs-280", + "Looking for page %lu of file %llu (size %lli). " + "No file items found (%d). 
" + "File is corrupted?\n", + page->index, get_inode_oid(inode), inode->i_size, result); + zrelse(coord->node); + done_lh(&lh); + return RETERR(-EIO); + } + + /* get plugin of found item or use plugin if extent if there are no + one */ + iplug = item_plugin_by_coord(coord); + if (iplug->s.file.readpage) + result = iplug->s.file.readpage(coord, page); + else + result = RETERR(-EINVAL); + + if (!result) { + set_key_offset(&key, (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT); + /* FIXME should call set_hint() */ + unset_hint(&hint); + } else + unset_hint(&hint); + zrelse(coord->node); + done_lh(&lh); + + save_file_hint(file, &hint); + + assert("vs-979", ergo(result == 0, (PageLocked(page) || PageUptodate(page)))); + return result; +} + +/* returns 1 if file of that size (@new_size) has to be stored in unformatted + nodes */ +/* Audited by: green(2002.06.15) */ +static int +should_have_notail(const unix_file_info_t *uf_info, loff_t new_size) +{ + if (!uf_info->tplug) + return 1; + return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info), + new_size); + +} + +static reiser4_block_nr unix_file_estimate_read(struct inode *inode, + loff_t count UNUSED_ARG) +{ + /* We should reserve one block, because of updating of the stat data + item */ + assert("vs-1249", inode_file_plugin(inode)->estimate.update == estimate_update_common); + return estimate_update_common(inode); +} + +/* plugin->u.file.read + + the read method for the unix_file plugin + +*/ +reiser4_internal ssize_t +read_unix_file(struct file *file, char *buf, size_t read_amount, loff_t *off) +{ + int result; + struct inode *inode; + flow_t f; + lock_handle lh; + hint_t hint; + coord_t *coord; + + size_t read; + reiser4_block_nr needed; + int (*read_f) (struct file *, flow_t *, hint_t *); + unix_file_info_t *uf_info; + + if (unlikely(!read_amount)) + return 0; + + inode = file->f_dentry->d_inode; + assert("vs-972", !inode_get_flag(inode, REISER4_NO_SD)); + uf_info = unix_file_inode_data(inode); + + if 
(inode_get_flag(inode, REISER4_PART_CONV)) { + get_exclusive_access(uf_info); + result = finish_conversion(inode); + if (result != 0) { + drop_access(uf_info); + return result; + } + } else + get_nonexclusive_access(uf_info); + + if (*off >= inode->i_size) { + /* position to read from is past the end of file */ + drop_access(uf_info); + return 0; + } + if (*off + read_amount > inode->i_size) + read_amount = inode->i_size - *off; + + needed = unix_file_estimate_read(inode, read_amount); /* FIXME: tree_by_inode(inode)->estimate_one_insert */ + result = reiser4_grab_space(needed, BA_CAN_COMMIT); + if (result != 0) { + drop_access(uf_info); + return result; + } + + /* build flow */ + assert("vs-1250", inode_file_plugin(inode)->flow_by_inode == flow_by_inode_unix_file); + result = flow_by_inode_unix_file(inode, buf, 1 /* user space */ , read_amount, *off, READ_OP, &f); + if (unlikely(result)) { + drop_access(uf_info); + return result; + } + + /* get seal and coord sealed with it from reiser4 private data of struct file. The coord will tell us where our + last read of this file finished, and the seal will help to determine if that location is still valid. 
+ */ + result = load_file_hint(file, &hint, &lh); + if (unlikely(result)) { + drop_access(uf_info); + return result; + } + + /* choose item read method by the kind of items the file is known to be built of */ + switch(uf_info->container) { + case UF_CONTAINER_EXTENTS: + read_f = item_plugin_by_id(EXTENT_POINTER_ID)->s.file.read; + break; + case UF_CONTAINER_TAILS: + /* this is read-ahead for tails-only files */ + result = reiser4_file_readahead(file, *off, read_amount); + if (result) { + /* access to the file was taken above and must be dropped on every exit */ + drop_access(uf_info); + return result; + } + + read_f = item_plugin_by_id(FORMATTING_ID)->s.file.read; + break; + case UF_CONTAINER_UNKNOWN: + /* read method is taken from the found item inside the loop below */ + read_f = 0; + break; + default: + read_f = 0; + warning("vs-1297", "File (ino %llu) has unknown state: %d\n", get_inode_oid(inode), uf_info->container); + drop_access(uf_info); + return -EIO; + } + + while (f.length) { + assert("vs-1354", inode->i_size > get_key_offset(&f.key)); + + result = find_file_item(&hint, &f.key, ZNODE_READ_LOCK, CBK_UNIQUE, NULL, uf_info); + if (result != CBK_COORD_FOUND) { + /* item had to be found, as it was not - we have + -EIO */ + done_lh(&lh); + break; + } + + coord = &hint.coord.base_coord; + if (coord->between != AT_UNIT) { + printk("zam-829: unix_file_read: key not in item, " + "reading offset (%llu) from the file (oid %llu) with size (%llu)\n", + (unsigned long long)get_key_offset(&f.key), + get_inode_oid(inode), + (unsigned long long)inode->i_size); + longterm_unlock_znode(&lh); + break; + } + + coord_clear_iplug(coord); + hint.coord.valid = 0; + result = zload(coord->node); + if (unlikely(result)) { + longterm_unlock_znode(&lh); + /* do not leave the function still holding access to the file */ + drop_access(uf_info); + return result; + } + validate_extended_coord(&hint.coord, get_key_offset(&f.key)); + + /* call item's read method */ + if (!read_f) + read_f = item_plugin_by_coord(coord)->s.file.read; + result = read_f(file, &f, &hint); + zrelse(coord->node); + done_lh(&lh); + if (result == -E_REPEAT) { + printk("zam-830: unix_file_read: key was not found in item, repeat search\n"); + unset_hint(&hint); + continue; + } + if (result) + break; + } + + save_file_hint(file, &hint); + + read = read_amount - f.length; + if (read) { + /*
something was read. Update stat data */ + update_atime(inode); + } + + drop_access(uf_info); + + /* update position in a file */ + *off += read; + + /* return number of read bytes or error code if nothing is read */ + return read ?: result; +} + +typedef int (*write_f_t)(struct inode *, flow_t *, hint_t *, int grabbed, write_mode_t); + +/* This searches for write position in the tree and calls write method of + appropriate item to actually copy user data into filesystem. This loops + until all the data from flow @f are written to a file. */ +static loff_t +append_and_or_overwrite(struct file *file, unix_file_info_t *uf_info, flow_t *flow) +{ + int result; + lock_handle lh; + hint_t hint; + loff_t to_write; + write_f_t write_f; + file_container_t cur_container, new_container; + znode *loaded; + loff_t off; + + assert("nikita-3031", schedulable()); + assert("vs-1109", get_current_context()->grabbed_blocks == 0); + + /* get seal and coord sealed with it from reiser4 private data of + struct file */ + result = load_file_hint(file, &hint, &lh); + if (result) + return result; + + to_write = flow->length; + + while (flow->length) { + off = get_key_offset(&flow->key); + /*XXX*//*printk("write (oid %llu, size %llu, offset %llu)\n", + get_inode_oid(uf_info->inode), uf_info->inode->i_size, off);*/ + assert("vs-1123", get_current_context()->grabbed_blocks == 0); + if (to_write == flow->length) { + /* it may happend that find_next_item will have to insert empty node to the tree (empty leaf + node between two extent items) */ + result = reiser4_grab_space_force(1 + estimate_one_insert_item(tree_by_inode(unix_file_info_to_inode(uf_info))), 0); + if (result) + return result; + } + /* look for file's metadata (extent or tail item) corresponding to position we write to */ + result = find_file_item(&hint, &flow->key, ZNODE_WRITE_LOCK, CBK_UNIQUE | CBK_FOR_INSERT, 0/* ra_info */, uf_info); + all_grabbed2free(); + if (IS_CBKERR(result)) { + /* error occurred */ + done_lh(&lh); + return 
result; + } + + cur_container = uf_info->container; + switch (cur_container) { + case UF_CONTAINER_EMPTY: + assert("vs-1196", get_key_offset(&flow->key) == 0); + if (should_have_notail(uf_info, get_key_offset(&flow->key) + flow->length)) { + new_container = UF_CONTAINER_EXTENTS; + write_f = item_plugin_by_id(EXTENT_POINTER_ID)->s.file.write; + } else { + new_container = UF_CONTAINER_TAILS; + write_f = item_plugin_by_id(FORMATTING_ID)->s.file.write; + } + break; + + case UF_CONTAINER_EXTENTS: + write_f = item_plugin_by_id(EXTENT_POINTER_ID)->s.file.write; + new_container = cur_container; + break; + + case UF_CONTAINER_TAILS: + if (should_have_notail(uf_info, get_key_offset(&flow->key) + flow->length)) { + longterm_unlock_znode(&lh); + if (!ea_obtained(uf_info)) + return RETERR(-E_REPEAT); + result = tail2extent(uf_info); + if (result) + return result; + unset_hint(&hint); + continue; + } + write_f = item_plugin_by_id(FORMATTING_ID)->s.file.write; + new_container = cur_container; + break; + + default: + longterm_unlock_znode(&lh); + return RETERR(-EIO); + } + + result = zload(lh.node); + if (result) { + longterm_unlock_znode(&lh); + return result; + } + loaded = lh.node; + coord_clear_iplug(&hint.coord.base_coord); + + result = write_f(unix_file_info_to_inode(uf_info), + flow, + &hint, + 0/* not grabbed */, + how_to_write(&hint.coord, &flow->key)); + + assert("nikita-3142", get_current_context()->grabbed_blocks == 0); + if (cur_container == UF_CONTAINER_EMPTY && to_write != flow->length) { + /* file was empty and we have written something and we are having exclusive access to the file - + change file state */ + assert("vs-1195", (new_container == UF_CONTAINER_TAILS || + new_container == UF_CONTAINER_EXTENTS)); + uf_info->container = new_container; + } + zrelse(loaded); + done_lh(&lh); + if (result && result != -E_REPEAT) + break; + /*XXX*//*printk("write (oid %llu, size %llu, offset %llu) - done\n", + get_inode_oid(uf_info->inode), uf_info->inode->i_size, off);*/ + 
preempt_point();
+	}
+	if (result == -EEXIST)
+		printk("write returns EEXIST!\n");
+	save_file_hint(file, &hint);
+
+	/* if nothing were written - there must be an error */
+	assert("vs-951", ergo((to_write == flow->length), result < 0));
+	assert("vs-1110", get_current_context()->grabbed_blocks == 0);
+
+	return (to_write - flow->length) ? (to_write - flow->length) : result;
+}
+
+/* make flow and write data (@buf) to the file. If @buf == 0 - hole of size @count will be created. This is called with
+   uf_info->latch either read- or write-locked.
+   The flow is built for @count bytes of user-space data at file offset @pos and handed to append_and_or_overwrite();
+   the return value is whatever append_and_or_overwrite() returns, or the error from building the flow. */
+static loff_t
+write_flow(struct file *file, unix_file_info_t *uf_info, const char *buf, loff_t count, loff_t pos)
+{
+	int result;
+	flow_t flow;
+
+	assert("vs-1251", inode_file_plugin(unix_file_info_to_inode(uf_info))->flow_by_inode == flow_by_inode_unix_file);
+
+	result = flow_by_inode_unix_file(unix_file_info_to_inode(uf_info),
+					 (char *)buf, 1 /* user space */, count, pos, WRITE_OP, &flow);
+	if (result)
+		return result;
+
+	return append_and_or_overwrite(file, uf_info, &flow);
+}
+
+/* release access to the file in whichever mode uf_info->exclusive_use says it is currently held */
+reiser4_internal void
+drop_access(unix_file_info_t *uf_info)
+{
+	if (uf_info->exclusive_use)
+		drop_exclusive_access(uf_info);
+	else
+		drop_nonexclusive_access(uf_info);
+}
+
+reiser4_internal void balance_dirty_page_unix_file(struct inode *object)
+{
+	/* balance dirty pages periodically */
+	balance_dirty_pages_ratelimited(object->i_mapping);
+}
+
+/* ->nopage of unix_file_vm_ops: run filemap_nopage() while holding nonexclusive access to the file the vma maps.
+   The third argument is passed through to filemap_nopage() instead of a literal 0, so that whatever
+   filemap_nopage() reports through it reaches our caller. Returns the page filemap_nopage() returned. */
+reiser4_internal struct page *
+unix_file_filemap_nopage(struct vm_area_struct *area, unsigned long address, int * unused)
+{
+	struct page *page;
+	struct inode *inode;
+
+	inode = area->vm_file->f_dentry->d_inode;
+	get_nonexclusive_access(unix_file_inode_data(inode));
+	page = filemap_nopage(area, address, unused);
+	drop_nonexclusive_access(unix_file_inode_data(inode));
+	return page;
+}
+
+static struct vm_operations_struct unix_file_vm_ops = {
+	.nopage = unix_file_filemap_nopage,
+};
+
+/* This function takes care about @file's pages. 
First of all it checks whether
+   the object is read-only (IS_RDONLY) and, if so, returns immediately.
+   Otherwise, if the file consists of tails, was mapped for read and is now
+   going to be written or mapped for write, its pages are thrown out. This
+   is done so that two copies of the data (one in the page cache, another in
+   the tails themselves) do not have to be managed for mapped files built of
+   tails.
+
+   The tail2extent conversion is also performed here when @perform_convert
+   is set and the file is going to be written or mapped for write.
+
+   @vm_flags: flags of the vma being set up (callers other than mmap pass 0);
+   @caller_is_write: nonzero when called on behalf of a write;
+   *@gotaccess is set to 1 if exclusive access was taken here - it is still
+   held on return and the caller has to drop it - and to 0 otherwise.
+   Returns 0, or the error returned by unpack().
+
+   This function is called from write_unix_file(), mmap_unix_file(),
+   sendfile_unix_file() and prepare_write_unix_file(). */
+static int
+check_pages_unix_file(struct file *file, int vm_flags, int caller_is_write,
+		      int perform_convert, int *gotaccess)
+{
+	int result;
+	struct inode *inode;
+	unix_file_info_t *uf_info;
+
+	inode = file->f_dentry->d_inode;
+	uf_info = unix_file_inode_data(inode);
+
+	*gotaccess = 0;
+
+	/* Check if object is RDONLY. If so, we return with no error. This may
+	   happen when a file is opened RDONLY and then mmaped for read with
+	   flags MAP_PRIVATE. */
+	if (IS_RDONLY(inode))
+		return 0;
+
+	result = 0;
+	/* throw out mapped pages if they were mapped for read and the file
+	 * consists of tails. */
+	if (inode_get_flag(inode, REISER4_TAILS_FILE_MMAPED)) {
+		if ((vm_flags & VM_MAYWRITE) || caller_is_write) {
+			get_exclusive_access(uf_info);
+			*gotaccess = 1;
+			reiser4_invalidate_pages(inode->i_mapping, 0,
+						 (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT);
+			inode_clr_flag(inode, REISER4_TAILS_FILE_MMAPED);
+		}
+	}
+
+	/* convert the file to extents if it is going to be written or mapped
+	   for write. */
+	if (perform_convert) {
+		if ((vm_flags & VM_MAYWRITE) || caller_is_write) {
+			/* take exclusive access unless the branch above
+			   already did */
+			if (!*gotaccess) {
+				get_exclusive_access(uf_info);
+				*gotaccess = 1;
+			}
+			/* last argument tells unpack() that exclusive access
+			   is already held */
+			result = unpack(inode, 0, 1);
+		}
+	}
+	return result;
+}
+
+/* plugin->u.file.mmap
+   make sure that file is built of extent blocks. 
An estimation is in tail2extent */ +reiser4_internal int +mmap_unix_file(struct file *file, struct vm_area_struct *vma) +{ + int result; + struct inode *inode; + unix_file_info_t *uf_info; + int gotaccess; + + inode = file->f_dentry->d_inode; + uf_info = unix_file_inode_data(inode); + + /* Checking for mapped pages, converting to something (tails, extents) */ + result = check_pages_unix_file(file, vma->vm_flags, 0, 1, &gotaccess); + if (gotaccess) + drop_access(uf_info); + + if (result != 0) + return result; + + result = generic_file_mmap(file, vma); + if (result) + return result; + + vma->vm_ops = &unix_file_vm_ops; + + /* marking file as mapped for read only and it contains of tails. */ + if (!(vma->vm_flags & VM_MAYWRITE) && (uf_info->container == UF_CONTAINER_TAILS)) + inode_set_flag(inode, REISER4_TAILS_FILE_MMAPED); + + return 0; +} + +static ssize_t +write_file(struct file *file, /* file to write to */ + const char *buf, /* address of user-space buffer */ + size_t count, /* number of bytes to write */ + loff_t *off /* position in file to write to */, + unix_file_info_t *uf_info) +{ + struct inode *inode; + ssize_t written; /* amount actually written so far */ + loff_t pos; /* current location in the file */ + + inode = file->f_dentry->d_inode; + + /* estimation for write is entrusted to write item plugins */ + pos = *off; + + if (inode->i_size < pos) { + /* pos is set past real end of file */ + written = append_hole(uf_info, pos); + if (written) + return written; + assert("vs-1081", pos == inode->i_size); + } + + /* write user data to the file */ + written = write_flow(file, uf_info, buf, count, pos); + if (written > 0) + /* update position in a file */ + *off = pos + written; + + /* return number of written bytes, or error code */ + return written; +} + +/* plugin->u.file.write */ +reiser4_internal ssize_t +write_unix_file(struct file *file, /* file to write to */ + const char *buf, /* address of user-space buffer */ + size_t count, /* number of bytes to 
write */ + loff_t *off /* position in file to write to */) +{ + struct inode *inode; + ssize_t written; /* amount actually written so far */ + + if (unlikely(count == 0)) + return 0; + + inode = file->f_dentry->d_inode; + assert("vs-947", !inode_get_flag(inode, REISER4_NO_SD)); + + /* linux's VM requires this. See mm/vmscan.c:shrink_list() */ + current->backing_dev_info = inode->i_mapping->backing_dev_info; + + down(&inode->i_sem); + written = generic_write_checks(file, off, &count, 0); + if (written == 0) { + int gotaccess; + + + /* Checking for mapped pages, converting to something (tails, extents) */ + written = check_pages_unix_file(file, 0, 1, 0, &gotaccess); + if (written == 0) { + unix_file_info_t *uf_info; + int rep; + + uf_info = unix_file_inode_data(inode); + + for (rep = 0;; ++ rep) { + if (!gotaccess) { + if (inode_get_flag(inode, + REISER4_PART_CONV)) { + get_exclusive_access(uf_info); + written = finish_conversion(inode); + if (written != 0) { + drop_access(uf_info); + break; + } + /* check_pages_unix_file returned + without taking any access. We need + to take access. We take excluse if + inode size is 0 */ + } else if (inode->i_size == 0 || rep) + get_exclusive_access(uf_info); + else + get_nonexclusive_access(uf_info); + } + + if (rep == 0) { + /* UNIX behavior: clear suid bit on file modification */ + remove_suid(file->f_dentry); + grab_space_enable(); + } + + all_grabbed2free(); + written = write_file(file, buf, count, off, uf_info); + drop_access(uf_info); + gotaccess = 0; + + if (written == -E_REPEAT) { + /* write_file required exclusive access (for tail2extent). 
It returned E_REPEAT + * so that we restart it with exclusive access */ + reiser4_context * ctx; + + ctx = get_current_context(); + written = txn_end(ctx); + if (written < 0) + break; + txn_begin(ctx); + } else + break; + } + } + } + + up(&inode->i_sem); + current->backing_dev_info = 0; + return written; +} + +/* plugin->u.file.release() convert all extent items into tail items if + necessary */ +reiser4_internal int +release_unix_file(struct inode *object, struct file *file) +{ + unix_file_info_t *uf_info; + int result; + + uf_info = unix_file_inode_data(object); + result = 0; + + get_exclusive_access(uf_info); + if (atomic_read(&file->f_dentry->d_count) == 1 && + uf_info->container == UF_CONTAINER_EXTENTS && + !should_have_notail(uf_info, object->i_size) && + !rofs_inode(object)) { + result = extent2tail(uf_info); + if (result != 0) { + warning("nikita-3233", "Failed to convert in %s (%llu)", + __FUNCTION__, get_inode_oid(object)); + print_inode("inode", object); + } + } + drop_exclusive_access(uf_info); + return 0; +} + +static void +set_file_notail(struct inode *inode) +{ + reiser4_inode *state; + formatting_plugin *tplug; + + state = reiser4_inode_data(inode); + tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID); + plugin_set_formatting(&state->pset, tplug); + inode_set_plugin(inode, formatting_plugin_to_plugin(tplug)); +} + +/* if file is built of tails - convert it to extents */ +static int +unpack(struct inode *inode, int forever, int locked) +{ + int result = 0; + unix_file_info_t *uf_info; + + uf_info = unix_file_inode_data(inode); + + if (!locked) + get_exclusive_access(uf_info); + + if (uf_info->container == UF_CONTAINER_UNKNOWN) { + loff_t file_size; + + result = find_file_size(inode, &file_size); + } + assert("vs-1074", ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN)); + if (result == 0) { + if (uf_info->container == UF_CONTAINER_TAILS) + result = tail2extent(uf_info); + if (result == 0 && forever) + set_file_notail(inode); + } + 
+ if (!locked) + drop_exclusive_access(uf_info); + + if (result == 0) { + __u64 tograb; + + grab_space_enable(); + tograb = inode_file_plugin(inode)->estimate.update(inode); + result = reiser4_grab_space(tograb, BA_CAN_COMMIT); + if (result == 0) + update_atime(inode); + } + + return result; +} + +/* plugin->u.file.ioctl */ +reiser4_internal int +ioctl_unix_file(struct inode *inode, struct file *filp UNUSED_ARG, unsigned int cmd, unsigned long arg UNUSED_ARG) +{ + int result; + + switch (cmd) { + case REISER4_IOC_UNPACK: + result = unpack(inode, 1, 0); + break; + + default: + result = RETERR(-ENOSYS); + break; + } + return result; +} + +/* plugin->u.file.get_block */ +reiser4_internal int +get_block_unix_file(struct inode *inode, + sector_t block, struct buffer_head *bh_result, int create UNUSED_ARG) +{ + int result; + reiser4_key key; + hint_t hint; + lock_handle lh; + item_plugin *iplug; + + assert("vs-1091", create == 0); + key_by_inode_unix_file(inode, (loff_t) block * current_blocksize, &key); + + hint_init_zero(&hint, &lh); + result = find_file_item(&hint, &key, ZNODE_READ_LOCK, CBK_UNIQUE, 0/* ra_info */, unix_file_inode_data(inode)); + if (result != CBK_COORD_FOUND || hint.coord.base_coord.between != AT_UNIT) { + done_lh(&lh); + return result; + } + coord_clear_iplug(&hint.coord.base_coord); + result = zload(hint.coord.base_coord.node); + if (result) { + done_lh(&lh); + return result; + } + iplug = item_plugin_by_coord(&hint.coord.base_coord); + if (!hint.coord.valid) + validate_extended_coord(&hint.coord, + (loff_t) block << PAGE_CACHE_SHIFT); + if (iplug->s.file.get_block) + result = iplug->s.file.get_block(&hint.coord, block, bh_result); + else + result = RETERR(-EINVAL); + + zrelse(hint.coord.base_coord.node); + done_lh(&lh); + return result; +} + +/* plugin->u.file.flow_by_inode + initialize flow (key, length, buf, etc) */ +reiser4_internal int +flow_by_inode_unix_file(struct inode *inode /* file to build flow for */ , + char *buf /* user level buffer 
*/ , + int user /* 1 if @buf is of user space, 0 - if it is kernel space */ , + loff_t size /* buffer size */ , + loff_t off /* offset to start operation(read/write) from */ , + rw_op op /* READ or WRITE */ , + flow_t *flow /* resulting flow */ ) +{ + assert("nikita-1100", inode != NULL); + + flow->length = size; + flow->data = buf; + flow->user = user; + flow->op = op; + assert("nikita-1931", inode_file_plugin(inode) != NULL); + assert("nikita-1932", inode_file_plugin(inode)->key_by_inode == key_by_inode_unix_file); + /* calculate key of write position and insert it into flow->key */ + return key_by_inode_unix_file(inode, off, &flow->key); +} + +/* plugin->u.file.key_by_inode */ +reiser4_internal int +key_by_inode_unix_file(struct inode *inode, loff_t off, reiser4_key *key) +{ + return key_by_inode_and_offset_common(inode, off, key); +} + +/* plugin->u.file.set_plug_in_sd = NULL + plugin->u.file.set_plug_in_inode = NULL + plugin->u.file.create_blank_sd = NULL */ +/* plugin->u.file.delete */ +/* + plugin->u.file.add_link = add_link_common + plugin->u.file.rem_link = NULL */ + +/* plugin->u.file.owns_item + this is common_file_owns_item with assertion */ +/* Audited by: green(2002.06.15) */ +reiser4_internal int +owns_item_unix_file(const struct inode *inode /* object to check against */ , + const coord_t *coord /* coord to check */ ) +{ + int result; + + result = owns_item_common(inode, coord); + if (!result) + return 0; + if (item_type_by_coord(coord) != UNIX_FILE_METADATA_ITEM_TYPE) + return 0; + assert("vs-547", + item_id_by_coord(coord) == EXTENT_POINTER_ID || + item_id_by_coord(coord) == FORMATTING_ID); + return 1; +} + +static int +setattr_truncate(struct inode *inode, struct iattr *attr) +{ + int result; + int s_result; + loff_t old_size; + + inode_check_scale(inode, inode->i_size, attr->ia_size); + + old_size = inode->i_size; + + result = safe_link_grab(tree_by_inode(inode), BA_CAN_COMMIT); + if (result == 0) + result = safe_link_add(inode, SAFE_TRUNCATE); 
+ all_grabbed2free(); + if (result == 0) + result = truncate_file(inode, attr->ia_size, 1); + if (result == 0) { + /* items are removed already. inode_setattr will call + vmtruncate to invalidate truncated pages and + unix_file_truncate which will do nothing. FIXME: is this + necessary? */ + INODE_SET_FIELD(inode, i_size, old_size); + result = inode_setattr(inode, attr); + } else + warning("vs-1588", "truncate_file failed: oid %lli, old size %lld, new size %lld, retval %d", + get_inode_oid(inode), old_size, attr->ia_size, result); + + s_result = safe_link_grab(tree_by_inode(inode), BA_CAN_COMMIT); + if (s_result == 0) + s_result = safe_link_del(inode, SAFE_TRUNCATE); + if (s_result != 0) { + warning("nikita-3417", "Cannot kill safelink %lli: %i", + get_inode_oid(inode), s_result); + } + safe_link_release(tree_by_inode(inode)); + all_grabbed2free(); + return result; +} + +/* plugin->u.file.setattr method */ +/* This calls inode_setattr and if truncate is in effect it also takes + exclusive inode access to avoid races */ +reiser4_internal int +setattr_unix_file(struct inode *inode, /* Object to change attributes */ + struct iattr *attr /* change description */ ) +{ + int result; + + if (attr->ia_valid & ATTR_SIZE) { + /* truncate does reservation itself and requires exclusive + * access obtained */ + if (inode->i_size != attr->ia_size) { + unix_file_info_t *ufo; + + ufo = unix_file_inode_data(inode); + get_exclusive_access(ufo); + result = setattr_truncate(inode, attr); + drop_exclusive_access(ufo); + } else + result = 0; + } else { + result = setattr_reserve(tree_by_inode(inode)); + if (!result) { + result = inode_setattr(inode, attr); + if (!result) + /* "capture" inode */ + result = reiser4_mark_inode_dirty(inode); + all_grabbed2free(); + } + } + return result; +} + +/* plugin->u.file.can_add_link = common_file_can_add_link */ +/* VS-FIXME-HANS: why does this always resolve to extent pointer? this wrapper serves what purpose? get rid of it. 
*/ +/* plugin->u.file.readpages method */ +reiser4_internal void +readpages_unix_file(struct file *file, struct address_space *mapping, + struct list_head *pages) +{ + reiser4_file_fsdata *fsdata; + item_plugin *iplug; + + assert("vs-1282", unix_file_inode_data(mapping->host)->container == UF_CONTAINER_EXTENTS); + + fsdata = reiser4_get_file_fsdata(file); + iplug = item_plugin_by_id(EXTENT_POINTER_ID); + iplug->s.file.readpages(fsdata->reg.coord, mapping, pages); + return; +} + +/* plugin->u.file.init_inode_data */ +reiser4_internal void +init_inode_data_unix_file(struct inode *inode, + reiser4_object_create_data *crd, int create) +{ + unix_file_info_t *data; + + data = unix_file_inode_data(inode); + data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN; + rw_latch_init(&data->latch); + data->tplug = inode_formatting_plugin(inode); + data->exclusive_use = 0; + +#if REISER4_DEBUG + data->ea_owner = 0; +#endif + init_inode_ordering(inode, crd, create); +} +/* VS-FIXME-HANS: what is pre deleting all about? */ +/* plugin->u.file.pre_delete */ +reiser4_internal int +pre_delete_unix_file(struct inode *inode) +{ + return truncate_file(inode, 0/* size */, 0/* no stat data update */); +} + +reiser4_internal int +safelink_unix_file(struct inode *object, reiser4_safe_link_t link, + __u64 value) +{ + int result; + + if (link == SAFE_E2T || link == SAFE_T2E) { + unix_file_info_t *ufo; + + ufo = unix_file_inode_data(object); + inode_set_flag(object, REISER4_PART_CONV); + get_exclusive_access(ufo); + if (link == SAFE_E2T) + result = extent2tail(ufo); + else + result = tail2extent(ufo); + drop_access(ufo); + } else + result = safelink_common(object, link, value); + return result; +} + + +/* Reads @count bytes from @file and calls @actor for every page read. This is + needed for loop back devices support. 
*/
+/* @file: file to read from;
+   @ppos: in/out file position, advanced by what @actor consumed;
+   @count: upper bound on the number of bytes to process;
+   @actor, @target: per-page callback and the destination stored in the
+   read descriptor passed to it.
+   Returns the total number of bytes @actor consumed, -EINVAL if the file
+   plugin has no readpage method, or the error code if a page could not be
+   obtained or read or if @actor failed. */
+reiser4_internal ssize_t sendfile_common (
+	struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void __user *target)
+{
+	ssize_t amount_read;
+	file_plugin *fplug;
+	struct inode *inode;
+	read_descriptor_t desc;
+	struct page *page = NULL;
+	int ret = 0;
+
+	assert("umka-3108", file != NULL);
+
+	inode = file->f_dentry->d_inode;
+
+	desc.error = 0;
+	desc.written = 0;
+	desc.buf = target;
+	desc.count = count;
+
+	fplug = inode_file_plugin(inode);
+	if (fplug->readpage == NULL)
+		return RETERR(-EINVAL);
+
+	amount_read = 0;
+
+	while (desc.count != 0) {
+		unsigned long read_request_size;
+		unsigned long index;	/* index of the page containing *ppos */
+		unsigned long offset;	/* byte offset of *ppos within that page */
+		loff_t file_size = i_size_read(inode);
+
+		if (*ppos >= file_size)
+			break;
+
+		index = *ppos >> PAGE_CACHE_SHIFT;
+		offset = *ppos & ~PAGE_CACHE_MASK;
+
+		/* readahead is addressed in pages: pass the page index, not
+		   the byte offset within the page */
+		page_cache_readahead(inode->i_mapping, &file->f_ra, file, index);
+
+		/* determine valid read request size: not beyond the end of
+		   the page, of the request, or of the file. */
+		read_request_size = PAGE_CACHE_SIZE - offset;
+		if (read_request_size > desc.count)
+			read_request_size = desc.count;
+		if (*ppos + read_request_size >= file_size) {
+			read_request_size = file_size - *ppos;
+			if (read_request_size == 0)
+				break;
+		}
+		page = grab_cache_page(inode->i_mapping, index);
+		if (unlikely(page == NULL)) {
+			ret = RETERR(-ENOMEM);
+			goto fail_no_page;
+		}
+
+		if (PageUptodate(page))
+			/* process locked, up-to-date page by read actor */
+			goto actor;
+
+		ret = fplug->readpage(file, page);
+		if (ret != 0) {
+			SetPageError(page);
+			ClearPageUptodate(page);
+			goto fail_locked_page;
+		}
+
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			ret = RETERR(-EIO);
+			goto fail_locked_page;
+		}
+
+ actor:
+		ret = actor(&desc, page, offset, read_request_size);
+		unlock_page(page);
+		page_cache_release(page);
+		if (ret < 0)
+			goto fail_no_page;
+
+		/* actor returns how many bytes it has consumed */
+		(*ppos) += ret;
+		amount_read += ret;
+	}
+
+	update_atime(inode);
+	return amount_read;
+
+
+ fail_locked_page:
+	unlock_page(page);
+	page_cache_release(page);
+ fail_no_page:
+
+
update_atime(inode);
+	return ret;
+}
+
+/* sendfile for unix files. With inode->i_sem held, check_pages_unix_file()
+   is called as for a write with conversion enabled; exclusive access is
+   dropped if that call took it. The actual work is then done by
+   sendfile_common(). Returns the error from check_pages_unix_file(), or
+   whatever sendfile_common() returns. */
+reiser4_internal ssize_t sendfile_unix_file (
+	struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void __user *target)
+{
+	struct inode * inode;
+	ssize_t ret;
+
+	int gotaccess = 0;
+
+	inode = file->f_dentry->d_inode;
+
+	down(&inode->i_sem);
+
+	ret = check_pages_unix_file(file, 0, 1, 1, &gotaccess);
+	if (gotaccess)
+		drop_exclusive_access(unix_file_inode_data(inode));
+	up(&inode->i_sem);
+	if (ret)
+		return ret;
+
+	return sendfile_common(file, ppos, count, actor, target);
+}
+
+/* prepare_write for unix files: check_pages_unix_file() is called with the
+   same arguments as in sendfile_unix_file(), then prepare_write_common()
+   does the rest. Returns the error from check_pages_unix_file(), or
+   whatever prepare_write_common() returns. */
+reiser4_internal int prepare_write_unix_file (
+	struct file * file, struct page * page, unsigned from, unsigned to)
+{
+	ssize_t ret;
+	int gotaccess;
+
+	ret = check_pages_unix_file(file, 0, 1, 1, &gotaccess);
+	/* check_pages_unix_file() can fail after it has taken exclusive
+	   access (the conversion itself may fail), so the access has to be
+	   dropped before the error check, not after it */
+	if (gotaccess)
+		drop_exclusive_access(unix_file_inode_data(file->f_dentry->d_inode));
+	if (ret)
+		return ret;
+	return prepare_write_common(file, page, from, to);
+}
+
+/*
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/plugin/file/file.h linux-2.6.4-ck1/fs/reiser4/plugin/file/file.h
--- linux-2.6.4/fs/reiser4/plugin/file/file.h 1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/file/file.h 2004-03-11 22:45:15.304507876 +1100
@@ -0,0 +1,141 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#if !defined( __REISER4_FILE_H__ )
+#define __REISER4_FILE_H__
+
+/* declarations of functions implementing file plugin for unix file plugin */
+int truncate_unix_file(struct inode *, loff_t size);
+int readpage_unix_file(void *, struct page *);
+int capturepage_unix_file(struct page *);
+int capture_unix_file(struct inode *inode, struct writeback_control *wbc);
+ssize_t read_unix_file(struct file *, char *buf, size_t size, loff_t *off);
+ssize_t write_unix_file(struct file *, const char *buf, size_t size, loff_t *off);
+int 
release_unix_file(struct inode *inode, struct file *); +int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd, unsigned long arg); +int mmap_unix_file(struct file *, struct vm_area_struct *vma); +int get_block_unix_file(struct inode *, sector_t block, struct buffer_head *bh_result, int create); +int flow_by_inode_unix_file(struct inode *, char *buf, int user, loff_t, loff_t, rw_op, flow_t *); +int key_by_inode_unix_file(struct inode *, loff_t off, reiser4_key *); +int owns_item_unix_file(const struct inode *, const coord_t *); +int setattr_unix_file(struct inode *, struct iattr *); +void readpages_unix_file(struct file *, struct address_space *, struct list_head *pages); +void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *, int create); +int pre_delete_unix_file(struct inode *); +int safelink_unix_file(struct inode *object, reiser4_safe_link_t link, + __u64 value); + +extern ssize_t sendfile_common ( + struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void __user *target); +extern ssize_t sendfile_unix_file ( + struct file *file, loff_t *ppos, size_t count, read_actor_t actor, void __user *target); +extern int prepare_write_unix_file (struct file *, struct page *, unsigned, unsigned); + +void balance_dirty_page_unix_file(struct inode *object); + +/* all the write into unix file is performed by item write method. 
Write method of unix file plugin only decides which + item plugin (extent or tail) and in which mode (one from the enum below) to call */ +typedef enum { + FIRST_ITEM = 1, + APPEND_ITEM = 2, + OVERWRITE_ITEM = 3 +} write_mode_t; + + +/* unix file may be in one the following states */ +typedef enum { + UF_CONTAINER_UNKNOWN = 0, + UF_CONTAINER_TAILS = 1, + UF_CONTAINER_EXTENTS = 2, + UF_CONTAINER_EMPTY = 3 +} file_container_t; + +#include "../../latch.h" + +struct formatting_plugin; +struct inode; + +/* unix file plugin specific part of reiser4 inode */ +typedef struct unix_file_info { + rw_latch_t latch; /* this read-write lock protects file containerization change. Accesses which do not change + file containerization (see file_container_t) (read, readpage, writepage, write (until tail + conversion is involved)) take read-lock. Accesses which modify file containerization + (truncate, conversion from tail to extent and back) take write-lock. */ + file_container_t container; /* this enum specifies which items are used to build the file */ + struct formatting_plugin *tplug; /* plugin which controls when file is to be converted to extents and back to + tail */ + /* if this is set, file is in exclusive use */ + int exclusive_use; +#if REISER4_DEBUG + void *ea_owner; /* pointer to task struct of thread owning exclusive + * access to file */ +#endif +} unix_file_info_t; + +struct unix_file_info *unix_file_inode_data(const struct inode * inode); + +#include "../../coord.h" +#include "../item/extent.h" +#include "../item/tail.h" + +struct uf_coord { + coord_t base_coord; + lock_handle *lh; + int valid; + union { + extent_coord_extension_t extent; + tail_coord_extension_t tail; + } extension; +}; + +#include "../../seal.h" + +/* structure used to speed up file operations (reads and writes). It contains + * a seal over last file item accessed. 
*/ +struct hint { + seal_t seal; + uf_coord_t coord; + loff_t offset; + tree_level level; + znode_lock_mode mode; +}; + +void set_hint(hint_t *, const reiser4_key *, znode_lock_mode); +void unset_hint(hint_t *); +int hint_validate(hint_t *, const reiser4_key *, int check_key, znode_lock_mode); + + +#if REISER4_DEBUG +static inline struct task_struct * +inode_ea_owner(const unix_file_info_t *uf_info) +{ + return uf_info->ea_owner; +} + +static inline void ea_set(unix_file_info_t *uf_info, void *value) +{ + uf_info->ea_owner = value; +} +#else +#define ea_set(inode, value) noop +#endif + +static inline int ea_obtained(const unix_file_info_t *uf_info) +{ + assert("vs-1167", ergo (inode_ea_owner(uf_info) != NULL, + inode_ea_owner(uf_info) == current)); + return uf_info->exclusive_use; +} + +/* __REISER4_FILE_H__ */ +#endif + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/file/funcs.h linux-2.6.4-ck1/fs/reiser4/plugin/file/funcs.h --- linux-2.6.4/fs/reiser4/plugin/file/funcs.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/file/funcs.h 2004-03-11 22:45:15.304507876 +1100 @@ -0,0 +1,17 @@ +void get_exclusive_access(unix_file_info_t *); +void drop_exclusive_access(unix_file_info_t *); +void get_nonexclusive_access(unix_file_info_t *); +void drop_nonexclusive_access(unix_file_info_t *); +void drop_access(unix_file_info_t *uf_info); + +int tail2extent(unix_file_info_t *); +int extent2tail(unix_file_info_t *); +int finish_conversion(struct inode *inode); + +void hint_init_zero(hint_t *hint, lock_handle *lh); +int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode, __u32 cbk_flags, + ra_info_t *, unix_file_info_t *); + +int goto_right_neighbor(coord_t *, lock_handle *); +int find_or_create_extent(struct page *page); +write_mode_t how_to_write(uf_coord_t *, const reiser4_key *); diff -Naurp 
linux-2.6.4/fs/reiser4/plugin/file/invert.c linux-2.6.4-ck1/fs/reiser4/plugin/file/invert.c --- linux-2.6.4/fs/reiser4/plugin/file/invert.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/file/invert.c 2004-03-11 22:45:15.305507720 +1100 @@ -0,0 +1,511 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Suppose you want to conveniently read and write a large variety of small files conveniently within a single emacs + buffer, without having a separate buffer for each 8 byte or so file. Inverts are the way to do that. An invert + provides you with the contents of a set of subfiles plus its own contents. It is a file which inherits other files + when you read it, and allows you to write to it and through it to the files that it inherits from. In order for it + to know which subfiles each part of your write should go into, there must be delimiters indicating that. It tries to + make that easy for you by providing those delimiters in what you read from it. + + When you read it, an invert performs an inverted assignment. Instead of taking an assignment command and writing a + bunch of files, it takes a bunch of files and composes an assignment command for you to read from it that if executed + would create those files. But which files? Well, that must be specified in the body of the invert using a special + syntax, and that specification is called the invert of the assignment. + + When written to, an invert performs the assignment command that is written + to it, and modifies its own body to contain the invert of that + assignment. + + In other words, writing to an invert file what you have read from it + is the identity operation. + + Malformed assignments cause write errors. Partial writes are not + supported in v4.0, but will be. 
+ + Example: + + If an invert contains: + + /filenameA/<>+"(some text stored in the invert)+/filenameB/<> + +====================== +Each element in this definition should be an invert, and all files +should be called recursively - too. This is bad. If one of the +included files is not a regular or invert file, then we can't read +main file. + +I think it is possible to make this easier: + +internal structure of invert file should be like symlink file. But +read and write method should be explicitly indicated in i/o operation.. + +By default we read and write (if possible) as symlink and if we +specify ..invert at reading time then we can also specify it at write time. + +example: +/my_invert_file/..invert<- ( (/filenameA<-"(The contents of filenameA))+"(some text stored in the invert)+(/filenameB<-"(The contents of filenameB) ) ) +will create /my_invert_file as invert, and will create /filenameA and /filenameB with specified body. + +read of /my_invert_file/..invert will be +/filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB) + +but read of /my_invert_file/ will be +The contents of filenameAsome text stored in the invertThe contents of filenameB + +we also can create this file as +/my_invert_file/<-/filenameA+"(some text stored in the invert)+/filenameB +will create /my_invert_file , and use existing files /filenameA and /filenameB. + +and when we read it, it will be the same as the previous invert file. + +Is this correct? + + vv +DEMIDOV-FIXME-HANS: + +Maybe you are right, but then you must disable writes to /my_invert_file/ and only allow writes to /my_invert_file/..invert + +Do you agree? Discuss it on reiserfs-list....
+ +-Hans +======================= + + Then a read will return: + + /filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB) + + and a write of the line above to the invert will set the contents of + the invert and filenameA and filenameB to their original values. + + Note that the contents of an invert have no influence on the effect + of a write unless the write is a partial write (and a write of a + shorter file without using truncate first is a partial write). + + truncate() has no effect on filenameA and filenameB, it merely + resets the value of the invert. + + Writes to subfiles via the invert are implemented by preceding them + with truncates. + + Parse failures cause write failures. + + Questions to ponder: should the invert be acted on prior to file + close when writing to an open file descriptor? + + Example: + + If an invert contains: + + "(This text and a pair of quotes are all that is here.) + +Then a read will return: + + "(This text and a pair of quotes are all that is here.) + +*/ + +/* OPEN method places a struct file in memory associated with invert body + and returns something like file descriptor to the user for the future access + to the invert file. + During opening we parse the body of invert and get a list of the 'entries' + (that describe all its subfiles) and place a pointer to the first struct in + reiserfs-specific part of invert inode (arbitrary decision).
+ + Each subfile is described by the struct inv_entry that has a pointer @sd on + in-core based stat-data and a pointer on struct file @f (if we find that the + subfile uses more than one unformatted node (arbitrary decision), we load + struct file in memory, otherwise we load base stat-data (and maybe 1-2 bytes + of some other information we need)). + + Since READ and WRITE methods for inverts were formulated in assignment + language, they don't contain arguments 'size' and 'offset' that make sense + only in ordinary read/write methods. + + READ method is a combination of two methods: + 1) ordinary read method (with offset=0, length = @f->...->i_size) for entries + with @f != 0, this method uses pointer on struct file as an argument + 2) read method for inode-less files with @sd != 0, this method uses + in-core based stat-data instead of struct file as an argument. + in the first case we don't use pagecache, just copy data that we got after + cbk() into userspace. + + WRITE method for invert files is more complex. + Besides declared WRITE-interface in assignment language above we need + to have an opportunity to edit unwrapped body of invert file with some + text editor, it means we need GENERIC WRITE METHOD for invert file: + + my_invert_file/..invert <- "string" + + this method parses "string" and looks for correct subfile signatures, also + the parsing process splits this "string" on the set of flows in accordance + with the set of subfiles specified by this signature. + The found list of signatures #S is compared with the opened one #I of invert + file. If it doesn't have this one (#I==0, it will be so for instance if we + have just created this invert file) the write method assigns found signature + (#I=#S;) to the invert file. Then if #I==#S, generic write method splits + itself into several write methods for ordinary or light-weight files, or calls itself + recursively for invert files with corresponding flows.
+ I am not sure, but the list of signatures looks like what Mr. Demidov means + by 'delimiters'. + + The cases when #S<#I (#I<#S) (in the sense of set-theory) are also available + and cause delete (create new) subfiles (arbitrary decision - it may look + too complex, but this interface will be the most complete). The order of entries + of list #S (#I) and inherited order on #I (#S) must coincide. + The other parsing results give malformed signature that aborts READ method + and releases all resources. + + + Format of subfile (entry) signature: + + "START_MAGIC"<>(TYPE="...",LOOKUP_ARG="...")SUBFILE_BODY"END_MAGIC" + + Legend: + + START_MAGIC - keyword indicates the start of subfile signature; + + <> indicates the start of 'subfile metadata', that is the pair + (TYPE="...",LOOKUP_ARG="...") in parentheses separated by comma. + + TYPE - the string "type" indicates the start of one of the three words: + - ORDINARY_FILE, + - LIGHT_WEIGHT_FILE, + - INVERT_FILE; + + LOOKUP_ARG - lookup argument depends on previous type: + */ + + /************************************************************/ + /* TYPE * LOOKUP ARGUMENT */ + /************************************************************/ + /* LIGHT_WEIGHT_FILE * stat-data key */ + /************************************************************/ + /* ORDINARY_FILE * filename */ + /************************************************************/ + /* INVERT_FILE * filename */ + /************************************************************/ + + /* where: + *stat-data key - the string contains stat data key of this subfile, it will be + passed to fast-access lookup method for light-weight files; + *filename - pathname of this subfile, it will be passed to VFS lookup methods + for ordinary and invert files; + + SUBFILE_BODY - data of this subfile (it will go to the flow) + END_MAGIC - the keyword indicates the end of subfile signature.
+
+   The other symbols inside the signature are interpreted as 'unformatted content',
+   which is available with VFS's read_link() (arbitrary decision).
+
+   NOTE: Parse method for a body of invert file uses mentioned signatures _without_
+   subfile bodies.
+
+   Now the only unclear thing is WRITE in regular light-weight subfile A that we
+   can describe only in assignment language:
+
+   A <- "some_string"
+
+   I guess we don't want to change stat-data and body items of file A
+   if this file exists, and size(A) != size("some_string") because this operation is
+   expensive, so we only do the partial write if size(A) > size("some_string")
+   and do truncate of the "some_string", and then do A <- "truncated string", if
+   size(A) < size("some_string"). This decision is also arbitrary..
+   */
+
+/* here is infrastructure for formatted flows */
+
+#define SUBFILE_HEADER_MAGIC 0x19196605
+#define FLOW_HEADER_MAGIC 0x01194304
+
+#include "../plugin.h"
+#include "../../debug.h"
+#include "../../forward.h"
+#include "../object.h"
+#include "../item/item.h"
+#include "../item/static_stat.h"
+#include "../../dformat.h"
+#include "../znode.h"
+#include "../inode.h"
+
+#include <linux/types.h>
+#include <linux/fs.h>		/* for struct file */
+#include <linux/list.h>		/* for struct list_head */
+
+typedef enum {
+	LIGHT_WEIGHT_FILE,
+	ORDINARY_FILE,
+	INVERT_FILE
+} inv_entry_type;
+
+typedef struct flow_header {
+	d32 fl_magic;
+	d16 fl_nr;		/* number of subfiles in the flow */
+} flow_header;
+
+typedef struct subfile_header {
+	d32 sh_magic;		/* subfile magic */
+	d16 sh_type;		/* type of subfile: light-weight, ordinary, invert */
+	d16 sh_arg_len;		/* length of lookup argument (filename, key) */
+	d32 sh_body_len;	/* length of subfile body */
+} subfile_header;
+
+/* functions to get/set fields of flow header (stored through the d16/d32 conversion helpers) */
+
+static void
+fl_set_magic(flow_header * fh, __u32 value)
+{
+	cputod32(value, &fh->fl_magic);
+}
+
+static __u32
+fl_get_magic(flow_header * fh)
+{
+	return d32tocpu(&fh->fl_magic);
+}
+static void
+fl_set_number(flow_header * fh, __u16 value)
+{
+	cputod16(value, &fh->fl_nr);
+}
+static unsigned
+fl_get_number(flow_header * fh)
+{
+	return d16tocpu(&fh->fl_nr);
+}
+
+/* functions to get/set fields of subfile header; each accessor touches only its own field */
+
+static void
+sh_set_magic(subfile_header * sh, __u32 value)
+{
+	cputod32(value, &sh->sh_magic);
+}
+
+static __u32
+sh_get_magic(subfile_header * sh)
+{
+	return d32tocpu(&sh->sh_magic);
+}
+static void
+sh_set_type(subfile_header * sh, __u16 value)
+{
+	cputod16(value, &sh->sh_type);
+}
+static unsigned
+sh_get_type(subfile_header * sh)
+{
+	return d16tocpu(&sh->sh_type);
+}
+static void
+sh_set_arg_len(subfile_header * sh, __u16 value)
+{
+	cputod16(value, &sh->sh_arg_len);
+}
+static unsigned
+sh_get_arg_len(subfile_header * sh)
+{
+	return d16tocpu(&sh->sh_arg_len);
+}
+static void
+sh_set_body_len(subfile_header * sh, __u32 value)
+{
+	cputod32(value, &sh->sh_body_len);
+}
+
+static __u32
+sh_get_body_len(subfile_header * sh)
+{
+	return d32tocpu(&sh->sh_body_len);
+}
+
+/* in-core minimal stat-data, light-weight analog of inode */
+
+struct incore_sd_base {
+	umode_t isd_mode;
+	nlink_t isd_nlink;
+	loff_t isd_size;
+	char *isd_data;		/* 'subflow' to write */
+};
+
+/* open invert create a list of invert entries,
+   every entry is represented by structure inv_entry */
+
+struct inv_entry {
+	struct list_head ie_list;	/* embedded link, initialised by allocate_inv_entry() */
+	struct file *ie_file;	/* this is NULL if the file doesn't
+				   have unformatted nodes */
+	struct incore_sd_base *ie_sd;	/* inode-less analog of struct file */
+};
+
+/* allocate and init invert entry; returns ERR_PTR(-ENOMEM) when allocation fails */
+
+static struct inv_entry *
+allocate_inv_entry(void)
+{
+	struct inv_entry *inv_entry;
+
+	inv_entry = reiser4_kmalloc(sizeof (struct inv_entry), GFP_KERNEL);
+	if (!inv_entry)
+		return ERR_PTR(RETERR(-ENOMEM));
+	inv_entry->ie_file = NULL;
+	inv_entry->ie_sd = NULL;
+	INIT_LIST_HEAD(&inv_entry->ie_list);
+	return inv_entry;
+}
+/* unlink and free @ientry together with what it owns; returns the result of filp_close() or 0 */
+static int
+put_inv_entry(struct inv_entry *ientry)
+{
+	int result = 0;
+
+	assert("edward-96", ientry != NULL);
+
+	/* ie_list is self-linked by allocate_inv_entry(), so list_del() is safe even if never queued */
+	list_del(&ientry->ie_list);
+	/* close the file before the entry holding the pointer is freed */
+	if (ientry->ie_file != NULL)
+		result = filp_close(ientry->ie_file, NULL);
+	/* kfree(NULL) is a no-op, so ie_sd needs no guard */
+	kfree(ientry->ie_sd);
+	kfree(ientry);
+	return result;
+}
+/* allocate the in-core stat-data of @inv_entry; returns 0 or -ENOMEM */
+static int
+allocate_incore_sd_base(struct inv_entry *inv_entry)
+{
+	assert("edward-98", inv_entry != NULL);
+	assert("edward-99", inv_entry->ie_file == NULL);
+	assert("edward-100", inv_entry->ie_sd == NULL);
+
+	inv_entry->ie_sd = reiser4_kmalloc(sizeof (struct incore_sd_base), GFP_KERNEL);
+	if (inv_entry->ie_sd == NULL)
+		return RETERR(-ENOMEM);
+	return 0;
+}
+
+/* this can be installed as ->init_inv_entry () method of
+   item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
+   Copies data from on-disk stat-data format into light-weight analog of inode .
+   Doesn't handle stat-data extensions. */
+
+static void
+sd_base_load(struct inv_entry *inv_entry, char *sd)
+{
+	reiser4_stat_data_base *sd_base;
+
+	assert("edward-101", inv_entry != NULL);
+	assert("edward-106", inv_entry->ie_sd != NULL);
+	assert("edward-102", sd != NULL);
+
+	sd_base = (reiser4_stat_data_base *) sd;
+	inv_entry->ie_sd->isd_mode = d16tocpu(&sd_base->mode);
+	inv_entry->ie_sd->isd_nlink = d32tocpu(&sd_base->nlink);
+	inv_entry->ie_sd->isd_size = d64tocpu(&sd_base->size);
+	inv_entry->ie_sd->isd_data = NULL;
+}
+
+/* initialise incore stat-data from the item body found at @coord */
+
+static void
+init_incore_sd_base(struct inv_entry *inv_entry, coord_t * coord)
+{
+	item_plugin *plugin = item_plugin_by_coord(coord);
+	void *body = item_body_by_coord(coord);
+
+	assert("edward-103", inv_entry != NULL);
+	assert("edward-104", plugin != NULL);
+	assert("edward-105", body != NULL);
+
+	sd_base_load(inv_entry, body);
+}
+
+/* takes a key or filename and allocates new invert_entry: light-weight files are looked up with
+   lookup_sd_by_key(), everything else is opened through VFS by filename. Returns 0 or an error; on error
+   the entry is freed. NOTE(review): on success the entry is not yet linked into any list nor returned. */
+int
+get_inv_entry(struct inode *invert_inode,	/* inode of invert's body */
+	      inv_entry_type type,	/* LIGHT-WEIGHT or ORDINARY */
+	      const reiser4_key * key,	/* key of invert entry stat-data */
+	      char *filename,	/* filename of the file to be opened */
+	      int flags, int mode)
+{
+	int result;
+	struct inv_entry *ientry;
+
+	assert("edward-107", invert_inode != NULL);
+
+	ientry = allocate_inv_entry();
+	if (IS_ERR(ientry))
+		return (PTR_ERR(ientry));
+
+	if (type == LIGHT_WEIGHT_FILE) {
+		coord_t coord;
+		lock_handle lh;
+
+		assert("edward-108", key != NULL);
+		init_coord(&coord);
+		init_lh(&lh);
+		/* sd_base_load() stores into ientry->ie_sd, so it has to exist before the lookup result is used */
+		result = allocate_incore_sd_base(ientry);
+		if (result == 0)
+			result = lookup_sd_by_key(tree_by_inode(invert_inode), ZNODE_READ_LOCK, &coord, &lh, key);
+		if (result == 0)
+			init_incore_sd_base(ientry, &coord);
+		done_lh(&lh);
+		done_coord(&coord);
+	} else {
+		struct file *file;
+		/* FIXME_EDWARD here we need to check if we
+		   did't follow to any mount point */
+		assert("edward-109", filename != NULL);
+		file = filp_open(filename, flags, mode);
+		result = IS_ERR(file) ? PTR_ERR(file) : 0;
+		if (result == 0)
+			ientry->ie_file = file;
+	}
+	if (result != 0)
+		put_inv_entry(ientry);	/* do not leak the entry (and its ie_sd) on failure */
+	return result;
+}
+
+/* takes inode of invert, reads the body of this invert, parses it,
+   opens all invert entries and return pointer on the first inv_entry */
+
+struct inv_entry *
+open_invert(struct file *invert_file)
+{
+	return ERR_PTR(RETERR(-ENOSYS));	/* not implemented yet */
+}
+
+ssize_t subfile_read(struct inv_entry *invert_entry, flow * f)
+{
+	return RETERR(-ENOSYS);	/* not implemented yet */
+}
+
+ssize_t subfile_write(struct inv_entry *invert_entry, flow * f)
+{
+	return RETERR(-ENOSYS);	/* not implemented yet */
+}
+
+ssize_t invert_read(struct file *file, flow * f)
+{
+	return RETERR(-ENOSYS);	/* not implemented yet */
+}
+
+ssize_t invert_write(struct file *file, flow * f)
+{
+	return RETERR(-ENOSYS);	/* not implemented yet */
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/plugin/file/pseudo.c linux-2.6.4-ck1/fs/reiser4/plugin/file/pseudo.c
--- linux-2.6.4/fs/reiser4/plugin/file/pseudo.c 1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/file/pseudo.c 2004-03-11 22:45:15.306507565 +1100
@@ -0,0 +1,146 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/*
+ * Pseudo file plugin. This contains helper functions used by pseudo files.
+ */
+
+#include "pseudo.h"
+#include "../plugin.h"
+
+#include "../../inode.h"
+
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+
+struct seq_operations pseudo_seq_op;
+
+/* extract pseudo file plugin, stored in the reiser4 inode data of @file's inode */
+static pseudo_plugin *
+get_pplug(struct file * file)
+{
+	struct inode *inode;
+
+	inode = file->f_dentry->d_inode;
+	return reiser4_inode_data(inode)->file_plugin_data.pseudo_info.plugin;
+}
+
+/* common routine to open pseudo file: seq_open() or single_open() depending on the plugin's read_type */
+reiser4_internal int open_pseudo(struct inode * inode, struct file * file)
+{
+	int result;
+	pseudo_plugin *pplug;
+
+	pplug = get_pplug(file);
+
+	/* for pseudo files based on seq_file interface */
+	if (pplug->read_type == PSEUDO_READ_SEQ) {
+		result = seq_open(file, &pplug->read.ops);
+		if (result == 0) {
+			struct seq_file *m;
+
+			m = file->private_data;
+			m->private = file;
+		}
+	} else if (pplug->read_type == PSEUDO_READ_SINGLE)
+		/* for pseudo files containing one record */
+		result = single_open(file, pplug->read.single_show, file);
+	else
+		result = 0;
+
+	return result;
+}
+
+/* common read method for pseudo files; returns 0 for read types it does not know */
+reiser4_internal ssize_t read_pseudo(struct file *file,
+				     char __user *buf, size_t size, loff_t *ppos)
+{
+	switch (get_pplug(file)->read_type) {
+	case PSEUDO_READ_SEQ:
+	case PSEUDO_READ_SINGLE:
+		/* seq_file behaves like pipe, requiring @ppos to always be
+		 * address of file->f_pos */
+		return seq_read(file, buf, size, &file->f_pos);
+	case PSEUDO_READ_FORWARD:
+		return get_pplug(file)->read.read(file, buf, size, ppos);
+	default:
+		return 0;
+	}
+}
+
+/* common seek method for pseudo files; only seq_file based files are seekable, others return 0 */
+reiser4_internal loff_t seek_pseudo(struct file *file, loff_t offset, int origin)
+{
+	switch (get_pplug(file)->read_type) {
+	case PSEUDO_READ_SEQ:
+	case PSEUDO_READ_SINGLE:
+		return seq_lseek(file, offset, origin);
+	default:
+		return 0;
+	}
+}
+
+/* common release method for pseudo files: undoes whichever open open_pseudo() performed */
+reiser4_internal int release_pseudo(struct inode *inode, struct file *file)
+{
+	pseudo_plugin *pplug = get_pplug(file);
+	int result;
+
+	if (pplug->read_type == PSEUDO_READ_SEQ)
+		result = seq_release(inode, file);
+	else if (pplug->read_type == PSEUDO_READ_SINGLE)
+		/* pairs with single_open(): also frees the seq_operations it allocated */
+		result = single_release(inode, file);
+	else
+		return 0;
+	file->private_data = NULL;
+	return result;
+}
+
+/* pseudo files need special ->drop() method, because they don't have nlink
+ * and only exist while host object does. */
+reiser4_internal void drop_pseudo(struct inode * object)
+{
+	/* pseudo files are not protected from deletion by their ->i_nlink */
+	generic_delete_inode(object);
+}
+
+/* common write method for pseudo files; write types it does not know report @size as written */
+reiser4_internal ssize_t
+write_pseudo(struct file *file,
+	     const char __user *buf, size_t size, loff_t *ppos)
+{
+	ssize_t result;
+
+	switch (get_pplug(file)->write_type) {
+	case PSEUDO_WRITE_STRING: {
+		char * inkernel;
+
+		inkernel = getname(buf);	/* NOTE(review): @size is not passed to getname() - confirm the copy is bounded as intended */
+		if (!IS_ERR(inkernel)) {
+			result = get_pplug(file)->write.gets(file, inkernel);
+			putname(inkernel);
+			if (result == 0)
+				result = size;
+		} else
+			result = PTR_ERR(inkernel);
+		break;
+	}
+	case PSEUDO_WRITE_FORWARD:
+		result = get_pplug(file)->write.write(file, buf, size, ppos);
+		break;
+	default:
+		result = size;
+	}
+	return result;
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/plugin/file/pseudo.h linux-2.6.4-ck1/fs/reiser4/plugin/file/pseudo.h
--- linux-2.6.4/fs/reiser4/plugin/file/pseudo.h 1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/file/pseudo.h 2004-03-11 22:45:15.306507565 +1100
@@ -0,0 +1,30 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#if !defined(__REISER4_PSEUDO_FILE_H__)
+#define __REISER4_PSEUDO_FILE_H__
+
+#include <linux/fs.h>
+
+extern int open_pseudo(struct inode * inode, struct file * file);
+extern ssize_t read_pseudo(struct file *file,
+			   char __user *buf, size_t size, loff_t *ppos);
+extern ssize_t write_pseudo(struct file *file,
+			    const char __user *buf, size_t size, loff_t *ppos);
+extern loff_t seek_pseudo(struct file *file, loff_t offset, int origin);
+extern int release_pseudo(struct inode *inode, struct file *file);
+extern void drop_pseudo(struct inode * object);
+
+/* __REISER4_PSEUDO_FILE_H__ */
+#endif
+
+/*
+   Local variables:
+   c-indentation-style:
"K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ + diff -Naurp linux-2.6.4/fs/reiser4/plugin/file/symfile.c linux-2.6.4-ck1/fs/reiser4/plugin/file/symfile.c --- linux-2.6.4/fs/reiser4/plugin/file/symfile.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/file/symfile.c 2004-03-11 22:45:15.307507409 +1100 @@ -0,0 +1,98 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Symfiles are a generalization of Unix symlinks. + + A symfile when read behaves as though you took its contents and + substituted them into the reiser4 naming system as the right hand side + of an assignment, and then read that which you had assigned to it. + + A key issue for symfiles is how to implement writes through to + subfiles. In general, one must have some method of determining what + of that which is written to the symfile is written to what subfile. + This can be done by use of custom plugin methods written by users, or + by using a few general methods we provide for those willing to endure + the insertion of delimiters into what is read. + + Writing to symfiles without delimiters to denote what is written to + what subfile is not supported by any plugins we provide in this + release. Our most sophisticated support for writes is that embodied + by the invert plugin (see invert.c). 
+ + A read only version of the /etc/passwd file might be + constructed as a symfile whose contents are as follows: + + /etc/passwd/userlines/* + + or + + /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root + + or + + /etc/passwd/userlines/(demidov+edward+reiser+root) + + A symfile with contents + + /filenameA+"(some text stored in the uninvertable symfile)+/filenameB + + will return when read + + The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB + + and write of what has been read will not be possible to implement as + an identity operation because there are no delimiters denoting the + boundaries of what is to be written to what subfile. + + Note that one could make this a read/write symfile if one specified + delimiters, and the write method understood those delimiters delimited + what was written to subfiles. + + So, specifying the symfile in a manner that allows writes: + + /etc/passwd/userlines/demidov+"( + )+/etc/passwd/userlines/edward+"( + )+/etc/passwd/userlines/reiser+"( + )+/etc/passwd/userlines/root+"( + ) + + or + + /etc/passwd/userlines/(demidov+"( + )+edward+"( + )+reiser+"( + )+root+"( + )) + + and the file demidov might be specified as: + + /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell + + or + + /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell) + + Notice that if the file demidov has a carriage return in it, the + parsing fails, but then if you put carriage returns in the wrong place + in a normal /etc/passwd file it breaks things also. 
+
+   Note that it is forbidden to have no text between two interpolations
+   if one wants to be able to define what parts of a write go to what
+   subfiles referenced in an interpolation.
+
+   If one wants to be able to add new lines by writing to the file, one
+   must either write a custom plugin for /etc/passwd that knows how to
+   name an added line, or one must use an invert, or one must use a more
+   sophisticated symfile syntax that we are not planning to write for
+   version 4.0.
+*/
+
+
+
+
+
+
+
+
+
+
+
diff -Naurp linux-2.6.4/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.4-ck1/fs/reiser4/plugin/file/tail_conversion.c
--- linux-2.6.4/fs/reiser4/plugin/file/tail_conversion.c 1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/file/tail_conversion.c 2004-03-11 22:45:15.308507254 +1100
@@ -0,0 +1,696 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "../../inode.h"
+#include "../../super.h"
+#include "../../page_cache.h"
+#include "../../carry.h"
+#include "../../lib.h"
+#include "../../safe_link.h"
+#include "funcs.h"
+
+/* this file contains:
+   tail2extent and extent2tail */
+
+
+/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc. The caller must hold neither inode_sem lock count nor an open atom; this write-locks uf_info->latch, records current through ea_set() and sets exclusive_use */
+reiser4_internal void
+get_exclusive_access(unix_file_info_t *uf_info)
+{
+	assert("nikita-3028", schedulable());
+	assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
+	assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
+	assert("nikita-3361", get_current_context()->trans->atom == NULL);
+	BUG_ON(get_current_context()->trans->atom != NULL);
+	LOCK_CNT_INC(inode_sem_w);
+	rw_latch_down_write(&uf_info->latch);
+	assert("nikita-3060", inode_ea_owner(uf_info) == NULL);	/* NOTE(review): label "nikita-3060" is reused by several asserts in this file */
+	assert("vs-1157", !ea_obtained(uf_info));
+	ea_set(uf_info, current);
+	uf_info->exclusive_use = 1;
+}
+/* counterpart of get_exclusive_access(): clears the owner and exclusive_use, then releases the write latch */
+reiser4_internal void
+drop_exclusive_access(unix_file_info_t *uf_info)
+{
+	assert("nikita-3060", inode_ea_owner(uf_info) == current);
+	assert("vs-1158", ea_obtained(uf_info));
+	ea_set(uf_info, 0);
+	uf_info->exclusive_use = 0;
+	rw_latch_up_write(&uf_info->latch);
+	assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
+	assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));	/* NOTE(review): same label as the assert above */
+	LOCK_CNT_DEC(inode_sem_w);
+}
+
+/* nonexclusive access to a file is acquired for read, write, readpage; read-locks uf_info->latch and asserts that nobody holds exclusive access */
+reiser4_internal void
+get_nonexclusive_access(unix_file_info_t *uf_info)
+{
+	assert("nikita-3029", schedulable());
+	rw_latch_down_read(&uf_info->latch);
+	LOCK_CNT_INC(inode_sem_r);
+	assert("nikita-3060", inode_ea_owner(uf_info) == NULL);
+	assert("vs-1159", !ea_obtained(uf_info));
+}
+/* counterpart of get_nonexclusive_access(): releases the read latch and decrements the inode_sem_r count */
+reiser4_internal void
+drop_nonexclusive_access(unix_file_info_t *uf_info)
+{
+	assert("nikita-3060", inode_ea_owner(uf_info) == NULL);
+	assert("vs-1160", !ea_obtained(uf_info));
+	rw_latch_up_read(&uf_info->latch);
+	LOCK_CNT_DEC(inode_sem_r);
+}
+
+/* part of tail2extent. Cut all items covering @count bytes starting from
+   @offset; returns whatever cut_tree() returns */
+/* Audited by: green(2002.06.15) */
+static int
+cut_formatting_items(struct inode *inode, loff_t offset, int count)
+{
+	reiser4_key from, to;
+
+	/* AUDIT: How about putting an assertion here, what would check
+	   all provided range is covered by tail items only? */
+	/* key of first byte in the range to be cut */
+	key_by_inode_unix_file(inode, offset, &from);
+
+	/* key of last byte in that range */
+	to = from;
+	set_key_offset(&to, (__u64) (offset + count - 1));
+
+	/* cut everything between those keys */
+	return cut_tree(tree_by_inode(inode), &from, &to, inode);
+}
+/* release every page of @pages up to the first NULL slot and clear its slot; asserts that no page follows a NULL slot */
+static void
+release_all_pages(struct page **pages, unsigned nr_pages)
+{
+	unsigned i;
+
+	for (i = 0; i < nr_pages; i++) {
+		if (pages[i] == NULL) {
+			unsigned j;
+			for (j = i + 1; j < nr_pages; j ++)
+				assert("vs-1620", pages[j] == NULL);
+			break;
+		}
+		page_cache_release(pages[i]);
+		pages[i] = NULL;
+	}
+}
+
+/* part of tail2extent. replace tail items with extent one.
Content of tail
+   items (@count bytes) being cut are copied already into
+   pages. find_or_create_extent() is called to create extents corresponding to
+   those pages. Returns 0 or the first error; pages after a failing one are left untouched */
+static int
+replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
+{
+	int result;
+	unsigned i;
+	STORE_COUNTERS;
+
+	assert("vs-596", nr_pages > 0 && pages[0]);
+
+	/* cut copied items */
+	result = cut_formatting_items(inode, (loff_t) pages[0]->index << PAGE_CACHE_SHIFT, count);
+	if (result)
+		return result;
+
+	CHECK_COUNTERS;
+
+	/* put into tree replacement for just removed items: extent item, namely */
+	for (i = 0; i < nr_pages; i++) {
+		result = add_to_page_cache_lru(pages[i], inode->i_mapping,
+					       pages[i]->index, mapping_gfp_mask(inode->i_mapping));
+		if (result)
+			break;
+		unlock_page(pages[i]);
+		result = find_or_create_extent(pages[i]);
+		if (result)
+			break;
+		SetPageUptodate(pages[i]);
+	}
+	return result;
+}
+
+#define TAIL2EXTENT_PAGE_NUM 3	/* number of pages to fill before cutting tail
+				 * items */
+/* grab the disk space needed by one iteration of the tail2extent() loop; returns what reiser4_grab_space() returns */
+static int
+reserve_tail2extent_iteration(struct inode *inode)
+{
+	reiser4_block_nr unformatted_nodes;
+	reiser4_tree *tree;
+
+	tree = tree_by_inode(inode);
+
+	/* number of unformatted nodes which will be created */
+	unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
+
+	/*
+	 * space required for one iteration of tail->extent conversion:
+	 *
+	 *     1. kill N tail items
+	 *
+	 *     2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
+	 *
+	 *     3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
+	 *     extents) extent units.
+	 *
+	 *     4. drilling to the leaf level by coord_by_key()
+	 *
+	 *     5. possible update of stat-data
+	 *
+	 *     6. removal of safe-link
+	 *
+	 */
+	grab_space_enable();
+	return reiser4_grab_space
+		(2 * tree->height +
+		 unformatted_nodes +
+		 TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
+		 1 + estimate_one_insert_item(tree) +
+		 inode_file_plugin(inode)->estimate.update(inode) +
+		 safe_link_tograb(tree),
+		 BA_CAN_COMMIT);
+}
+/* starting at *@offset, find the first item of @object whose item id is @id; *@offset is set to the offset of the key reached. Returns 0 when found, -ENOENT when the search position is not at a unit, or another error */
+static int
+find_start(struct inode *object, reiser4_plugin_id id, __u64 *offset)
+{
+	int               result;
+	lock_handle       lh;
+	unix_file_info_t *ufo;
+	int               found;
+	reiser4_key       key;
+
+	ufo = unix_file_inode_data(object);
+	init_lh(&lh);
+	result = 0;
+	found = 0;
+	key_by_inode_unix_file(object, *offset, &key);
+	do {
+		hint_t hint;
+
+		hint_init_zero(&hint, &lh);
+		result = find_file_item(&hint, &key,
+					ZNODE_READ_LOCK, CBK_UNIQUE, 0, ufo);
+
+		if (result == CBK_COORD_FOUND) {
+			coord_t *coord;
+
+			coord = &hint.coord.base_coord;
+			if (coord->between == AT_UNIT) {
+				coord_clear_iplug(coord);
+				result = zload(coord->node);
+				if (result == 0) {
+					if (item_id_by_coord(coord) == id)
+						found = 1;
+					else
+						item_plugin_by_coord(coord)->s.file.append_key(coord, &key);
+					zrelse(coord->node);
+				}
+			} else
+				result = RETERR(-ENOENT);
+		}
+	} while (result == 0 && !found);
+	done_lh(&lh);
+	*offset = get_key_offset(&key);
+	return result;
+}
+/* convert a file built of tail (formatting) items into extents, TAIL2EXTENT_PAGE_NUM pages per iteration; the caller must hold exclusive access. Returns 0 on success or an error, after which the inode may stay marked REISER4_PART_CONV */
+reiser4_internal int
+tail2extent(unix_file_info_t *uf_info)
+{
+	int result;
+	int s_result;
+	reiser4_key key;	/* key of next byte to be moved to page */
+	ON_DEBUG(reiser4_key tmp;)
+	char *p_data;		/* data of page */
+	unsigned page_off = 0,	/* offset within the page where to copy data */
+		 count;		/* number of bytes of item which can be
+				 * copied to page */
+	struct page *pages[TAIL2EXTENT_PAGE_NUM];
+	int done;		/* set to 1 when all file is read */
+	char *item;
+	int i;
+	struct inode *inode;
+	__u64 offset;
+	int first_iteration;
+
+	assert("nikita-3362", ea_obtained(uf_info));
+
+	inode = unix_file_info_to_inode(uf_info);
+
+	assert("nikita-3412", !IS_RDONLY(inode));
+
+	if (uf_info->container == UF_CONTAINER_EXTENTS) {
+		warning("vs-1171",
+			"file %llu is built of extents already. Should not happen",
+			get_inode_oid(inode));
+		return 0;
+	}
+
+	/* collect statistics on the number of tail2extent conversions */
+	reiser4_stat_inc(file.tail2extent);
+
+	offset = 0;
+	if (inode_get_flag(inode, REISER4_PART_CONV)) {
+		/* find_start() doesn't need block reservation */
+		result = find_start(inode, FORMATTING_ID, &offset);
+		if (result == -ENOENT)
+			/* no tail (formatting) item found, everything is converted */
+			return 0;
+		else if (result != 0)
+			/* some other error */
+			return result;
+	}
+
+	result = safe_link_grab(tree_by_inode(inode), BA_CAN_COMMIT);
+	if (result == 0) {
+		result = safe_link_add(inode, SAFE_T2E);
+		if (result != 0)
+			goto out;
+	} else if (result != -EEXIST)
+		goto out;
+
+	/* get key of the byte the conversion starts from: offset 0, or where a partial conversion stopped */
+	key_by_inode_unix_file(inode, offset, &key);
+
+	done = 0;
+	result = 0;
+	first_iteration = 1;
+	while (!done) {
+		xmemset(pages, 0, sizeof (pages));
+		all_grabbed2free();
+		result = reserve_tail2extent_iteration(inode);
+		if (result != 0)
+			goto out;
+		if (first_iteration) {
+			inode_set_flag(inode, REISER4_PART_CONV);
+			reiser4_update_sd(inode);
+			first_iteration = 0;
+		}
+		for (i = 0; i < sizeof_array(pages) && !done; i++) {
+			assert("vs-598", (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
+			pages[i] = alloc_page(mapping_gfp_mask(inode->i_mapping));
+			if (!pages[i]) {
+				result = RETERR(-ENOMEM);
+				goto error;
+			}
+
+			pages[i]->index = (unsigned long) (get_key_offset(&key) >> PAGE_CACHE_SHIFT);
+			/* usually when one is going to longterm lock znode (as
+			   find_file_item does, for instance) he must not hold
+			   locked pages. However, there is an exception for
+			   case tail2extent. Pages appearing here are not
+			   reachable to everyone else, they are clean, they do
+			   not have jnodes attached so keeping them locked do
+			   not risk deadlock appearance
+			*/
+			assert("vs-983", !PagePrivate(pages[i]));
+
+			for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
+				hint_t hint;
+				coord_t *coord;
+				lock_handle lh;
+
+				/* get next item */
+				hint_init_zero(&hint, &lh);
+				result = find_file_item(&hint, &key, ZNODE_READ_LOCK, CBK_UNIQUE, 0/* ra_info */, uf_info);
+				if (result != CBK_COORD_FOUND) {
+					/* tail conversion can not be called for empty file */
+					assert("vs-1169", result != CBK_COORD_NOTFOUND);
+					done_lh(&lh);
+					goto error;
+				}
+				coord = &hint.coord.base_coord;
+				if (coord->between == AFTER_UNIT) {
+					/* this is used to detect end of file when inode->i_size can not be used */
+					done_lh(&lh);
+					done = 1;
+					p_data = kmap_atomic(pages[i], KM_USER0);
+					xmemset(p_data + page_off, 0, PAGE_CACHE_SIZE - page_off);
+					kunmap_atomic(p_data, KM_USER0);
+					break;
+				}
+				coord_clear_iplug(coord);
+				result = zload(coord->node);
+				if (result) {
+					done_lh(&lh);
+					goto error;
+				}
+				assert("vs-562", owns_item_unix_file(inode, coord));
+				assert("vs-856", coord->between == AT_UNIT);
+				assert("green-11", keyeq(&key, unit_key_by_coord(coord, &tmp)));
+				item = ((char *)item_body_by_coord(coord)) + coord->unit_pos;
+
+				/* how many bytes to copy */
+				count = item_length_by_coord(coord) - coord->unit_pos;
+				/* limit length of copy to end of page */
+				if (count > PAGE_CACHE_SIZE - page_off)
+					count = PAGE_CACHE_SIZE - page_off;
+
+				/* kmap/kunmap are necessary for pages which
+				   are not addressable by direct kernel virtual
+				   addresses */
+				p_data = kmap_atomic(pages[i], KM_USER0);
+				/* copy item (as much as will fit starting from
+				   the beginning of the item) into the page */
+				memcpy(p_data + page_off, item, (unsigned) count);
+				kunmap_atomic(p_data, KM_USER0);
+
+				page_off += count;
+				set_key_offset(&key, get_key_offset(&key) + count);
+
+				zrelse(coord->node);
+				done_lh(&lh);
+
+				if (get_key_offset(&key) == (__u64)unix_file_info_to_inode(uf_info)->i_size) {
+					/* end of file is detected here */
+					p_data = kmap_atomic(pages[i], KM_USER0);
+					memset(p_data + page_off, 0, PAGE_CACHE_SIZE - page_off);
+					kunmap_atomic(p_data, KM_USER0);
+					done = 1;
+					break;
+				}
+			} /* for */
+		} /* for */
+
+		assert("vs-1619", i > 0);
+		result = replace(inode, pages, i, (int) ((i - 1) * PAGE_CACHE_SIZE + page_off));
+		release_all_pages(pages, sizeof_array(pages));
+		if (result)
+			goto error;
+		/* throttle the conversion */
+		balance_dirty_page_unix_file(inode);
+	}
+
+	if (result == 0) {
+		/* tail converted */
+		uf_info->container = UF_CONTAINER_EXTENTS;
+
+		if (inode_get_flag(inode, REISER4_PART_CONV)) {
+			inode_clr_flag(inode, REISER4_PART_CONV);
+			reiser4_update_sd(inode);
+		}
+		/* It is advisable to check here that all grabbed pages were
+		 * freed */
+	} else {
+		/* conversion is not complete. Inode was already marked as
+		 * REISER4_PART_CONV and stat-data were updated at the first
+		 * iteration of the loop above. */
+	error:
+		release_all_pages(pages, sizeof_array(pages));
+		warning("nikita-2282", "Partial conversion of %llu: %i",
+			get_inode_oid(inode), result);
+		print_inode("inode", inode);
+	}
+
+	s_result = safe_link_del(inode, SAFE_T2E);
+	if (s_result != 0)
+		warning("nikita-3425", "Cannot kill safe-link %lli: %i",
+			get_inode_oid(inode), s_result);
+
+ out:
+	safe_link_release(tree_by_inode(inode));
+	all_grabbed2free();
+	return result;
+}
+
+
+/* part of extent2tail. Page contains data which are to be put into tree by
+   tail items. Use tail_write for this. flow is composed like in
+   unix_file_write.
The only difference is that data for writing are in + kernel space */ +/* Audited by: green(2002.06.15) */ +static int +write_page_by_tail(struct inode *inode, struct page *page, unsigned count) +{ + flow_t f; + hint_t hint; + coord_t *coord; + lock_handle lh; + znode *loaded; + item_plugin *iplug; + int result; + + result = 0; + + assert("vs-1089", count); + + /* build a kernel-space WRITE_OP flow over the first @count bytes of the kmap()ed @page. page->index is widened to loff_t before the shift so that the byte offset is not truncated to unsigned long on 32-bit machines */ + inode_file_plugin(inode)->flow_by_inode(inode, kmap(page), 0 /* not user space */ , + count, ((loff_t) page->index) << PAGE_CACHE_SHIFT, WRITE_OP, &f); + iplug = item_plugin_by_id(FORMATTING_ID); + while (f.length) { + hint_init_zero(&hint, &lh); + result = find_file_item(&hint, &f.key, ZNODE_WRITE_LOCK, CBK_UNIQUE | CBK_FOR_INSERT, 0/* ra_info */, 0/* inode */); + if (IS_CBKERR(result)) + break; + + assert("vs-957", ergo(result == CBK_COORD_NOTFOUND, get_key_offset(&f.key) == 0)); + assert("vs-958", ergo(result == CBK_COORD_FOUND, get_key_offset(&f.key) != 0)); + + coord = &hint.coord.base_coord; + coord_clear_iplug(coord); + result = zload(coord->node); + if (result) + break; + + loaded = coord->node; + result = iplug->s.file.write(inode, &f, &hint, 1/*grabbed*/, how_to_write(&hint.coord, &f.key)); + zrelse(loaded); + done_lh(&lh); + if (result == -E_REPEAT) + result = 0; + else if (result) + break; + } + + done_lh(&lh); + kunmap(page); + + /* result of write is 0 or error; -E_REPEAT from the item plugin was turned into 0 above so that the lookup is simply retried */ + assert("vs-589", result <= 0); + /* if result is 0 - all @count bytes is written completely */ + assert("vs-588", ergo(result == 0, f.length == 0)); + return result; +} + +/* flow insertion is limited by CARRY_FLOW_NEW_NODES_LIMIT of new nodes. Therefore, minimal number of bytes of flow + which can be put into tree by one insert_flow is number of bytes contained in CARRY_FLOW_NEW_NODES_LIMIT nodes if + they all are filled completely by one tail item. Fortunately, there is a one to one mapping between bytes of tail + items and bytes of flow.
If there were not, we would have to have special item plugin */ +reiser4_internal int min_bytes_per_flow(void) +{ + assert("vs-1103", current_tree->nplug && current_tree->nplug->max_item_size); + return CARRY_FLOW_NEW_NODES_LIMIT * current_tree->nplug->max_item_size(); +} + +static int +reserve_extent2tail_iteration(struct inode *inode) +{ + reiser4_tree *tree; + + tree = tree_by_inode(inode); + /* + * reserve blocks for (in this order): + * + * 1. removal of extent item + * + * 2. insertion of tail by insert_flow() + * + * 3. drilling to the leaf level by coord_by_key() + * + * 4. possible update of stat-data + * + * 5. removal of safe-link + */ + grab_space_enable(); + return reiser4_grab_space + (estimate_one_item_removal(tree) + + estimate_insert_flow(tree->height) + + 1 + estimate_one_insert_item(tree) + + inode_file_plugin(inode)->estimate.update(inode) + + safe_link_tograb(tree), + BA_CAN_COMMIT); +} + +/* for every page of file: read page, cut part of extent pointing to this page, + put data of page into tree by tail items. REISER4_PART_CONV marks the inode while a conversion is in progress; when it is already set on entry, conversion resumes at the first EXTENT_POINTER_ID item located by find_start(). Returns 0 on success, otherwise the conversion error or, failing that, the error of safe_link_del() */ +reiser4_internal int +extent2tail(unix_file_info_t *uf_info) +{ + int result; + int s_result; + struct inode *inode; + struct page *page; + unsigned long num_pages, i; + unsigned long start_page; + reiser4_key from; + reiser4_key to; + unsigned count; + __u64 offset; + int space_reserved; + + /* collect statistics on the number of extent2tail conversions */ + reiser4_stat_inc(file.extent2tail); + + inode = unix_file_info_to_inode(uf_info); + assert("nikita-3412", !IS_RDONLY(inode)); + + offset = 0; + if (inode_get_flag(inode, REISER4_PART_CONV)) { + /* find_start() doesn't need block reservation */ + result = find_start(inode, EXTENT_POINTER_ID, &offset); + if (result == -ENOENT) + /* no extent found, everything is converted */ + return 0; + else if (result != 0) + /* some other error */ + return result; + } + + result = safe_link_grab(tree_by_inode(inode), BA_CAN_COMMIT); + if (result == 0) { + result = safe_link_add(inode, SAFE_E2T); + if
(result != 0) + return result; + } else if (result != -EEXIST) + return result; + + /* number of pages still to be converted, counted from @offset */ + num_pages = + (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + start_page = offset >> PAGE_CACHE_SHIFT; + + key_by_inode_unix_file(inode, offset, &from); + to = from; + + result = 0; + space_reserved = 0; + for (i = 0; i < num_pages; i++) { + __u64 start_byte; + + all_grabbed2free(); + space_reserved = 0; + result = reserve_extent2tail_iteration(inode); + if (result != 0) + break; + space_reserved = 1; + if (i == 0) { + inode_set_flag(inode, REISER4_PART_CONV); + reiser4_update_sd(inode); + } + + page = read_cache_page(inode->i_mapping, + (unsigned) (i + start_page), + readpage_unix_file/*filler*/, 0); + if (IS_ERR(page)) { + result = PTR_ERR(page); + break; + } + + wait_on_page_locked(page); + + if (!PageUptodate(page)) { + page_cache_release(page); + result = RETERR(-EIO); + break; + } + + /* cut part of file we have read; @i is widened to __u64 before the shift so that the byte offset is not truncated to unsigned long on 32-bit machines */ + start_byte = ((__u64) i << PAGE_CACHE_SHIFT) + offset; + set_key_offset(&from, start_byte); + set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1); + /* + * cut_tree_object() returns -E_REPEAT to allow atom + * commits during over-long truncates. But + * extent->tail conversion should be performed in one + * transaction. + */ + result = cut_tree(tree_by_inode(inode), &from, &to, inode); + + if (result) { + page_cache_release(page); + break; + } + + /* put page data into tree via tail_write */ + count = PAGE_CACHE_SIZE; + if (i == num_pages - 1) + count = (inode->i_size & ~PAGE_CACHE_MASK) ? : PAGE_CACHE_SIZE; + result = write_page_by_tail(inode, page, count); + if (result) { + page_cache_release(page); + break; + } + + /* release page */ + lock_page(page); + /* page is already detached from jnode and mapping.
*/ + assert("vs-1086", page->mapping == NULL); + assert("nikita-2690", (!PagePrivate(page) && page->private == 0)); + /* waiting for writeback completion with page lock held is + * perfectly valid. */ + wait_on_page_writeback(page); + drop_page(page); + /* release reference taken by read_cache_page() above */ + page_cache_release(page); + } + + assert("vs-1260", reiser4_inode_data(inode)->eflushed == 0); + + if (i == num_pages) { + uf_info->container = UF_CONTAINER_TAILS; + assert("nikita-3471", space_reserved); + if (inode_get_flag(inode, REISER4_PART_CONV)) { + inode_clr_flag(inode, REISER4_PART_CONV); + reiser4_update_sd(inode); + } + } else { + /* conversion is not complete. Inode was already marked as + * REISER4_PART_CONV and stat-data were updated at the first + * iteration of the loop above. */ + warning("nikita-2282", + "Partial conversion of %llu: %lu of %lu: %i", + get_inode_oid(inode), i, num_pages, result); + print_inode("inode", inode); + } + if (space_reserved) { + s_result = safe_link_del(inode, SAFE_E2T); + if (s_result != 0) { + warning("nikita-3422", "Cannot kill safe-link %lli: %i", + get_inode_oid(inode), s_result); + } + } else + s_result = 0; + all_grabbed2free(); + return result ?
: s_result; +} +/* finish_conversion - if @inode still carries REISER4_PART_CONV, run tail2extent() on it and return its result; otherwise do nothing and return 0 */ +reiser4_internal int +finish_conversion(struct inode *inode) +{ + int result; + + if (inode_get_flag(inode, REISER4_PART_CONV)) + result = tail2extent(unix_file_inode_data(inode)); + else + result = 0; + return result; +} + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/flush/flush.alg linux-2.6.4-ck1/fs/reiser4/plugin/flush/flush.alg --- linux-2.6.4/fs/reiser4/plugin/flush/flush.alg 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/flush/flush.alg 2004-03-11 22:45:15.310506943 +1100 @@ -0,0 +1,515 @@ + + + + + + + + + + +/* + +The use of atomic commits dramatically impacts the use of LRU as the +basis for page cleaning (though using it for clean page discarding is +still effective.) + +The use of write clustering dramatically impacts the use of LRU as the +basis for page cleaning. + +ReiserFS v4 uses both. + +We will not use LRU in v4.0 of reiserfs, and then in later versions we +may gradually partially reintroduce it. + +Optimizations to make on flush: + +* block (re)allocation + +* tail conversion + +* extent formation + +* node repacking + +* wandering log definition + + +Memory Pressure: + +There are kinds of memory pressure: + +* general lack of memory for processes requesting it + +* too much dirty memory + +* dirty memory is too old and should be more permanently preserved on disk + +* particular page needs freeing for DMA setup + +[All programmers should understand that I expect strict observance of the +following taboo: you will not add an unnecessary copying of data, while coding +this. I cannot understand why there is resistance to this, but I keep seeing +code which ignores this.] + + +Unlike clean pages, dirty memory must be written to disk before being +freed for other use.
It also may require processing that will require +more memory before it can be cleaned by writing it to disk. This +processing makes it vulnerable to deadlocks in extreme cases. +Precisely reserving enough memory to allow that extra processing +without deadlock is often difficult. + +reiser4 limits its usage of dirty pages to 75%, which is enough to +ensure that the extra processing will not cause the system to run out +of memory. More safeguards are possible, including letting commit +choose to swap, but we will wait until this rather simple mechanism +has a problem in practice before elaborating it. + +reiser4 supports the reiserfs_flush(page) command for cleaning pages. + +If the Linux VM system was properly designed, it would be based upon +memory sub-managers each reflecting some common principles of +responding to pressure that is in proportion to their size. Linux +used to have multiple caches; these caches had no understanding of how +large they were, they made no attempt to proportionalize the pressure +upon each cache, and their management was generally badly designed +without any effective mechanism for ensuring that the caches did not +get out of balance with each other. From this it was concluded that +there should be only one unified cache, rather than designing an +effective mechanism for expressing to each subcache a sense of +pressure in proportion to the size of the subcache, and requiring that +the subcache embody some effective mechanism for responding to that +sense of pressure. + +The unified cache is indeed better than badly designed multiple +caches. It does however perform very poorly at handling caches of +objects that are not page sized. + +Linus says it already has a subcache manager design, we just need to +use writepage. Ok, fine, we will be the first subcache.
+ +So, understand that in reiserfs, writepage does not write pages, it +pressures the reiserfs memory manager, and understand that the place a +page has on the various mm lists does not determine when it gets +written out, it merely determines when it triggers the next pressure +on the reiserfs memory manager. + +What reiser4 does is interpret pressure on a page as pressure on a +subcache within reiserfs. + +Write clustering, transaction commits, objects whose value to cache is +out of proportion to the number of bytes consumed by them, caches +whose working set size and pattern of access are known to the +application, and those occasions when other factors knowable to the +filesystem or application but not the OS generally are important to +deciding what to eject, and objects much smaller than a page with no +correlation of references for objects on the same page, or larger than +a page with a complete correlation between their pages, are good examples of when cache +submanagers should be employed. + + */ + +/* You should read crypt.c and then return. */ +/* You should read block_alloc.c and then return. */ + +current_leaf = find_leftmost_leaf_in_slum(); +/* current_leaf is locked */ +parent = parent(current_leaf); +/* parent is locked */ + +if (is_relocate_set(current_leaf)) +{ + dirty(parent); +} + +if (is_dirty(parent)) +{ + squeeze_level_left(parent); + /* this can create an enormous recursive chain that could overflow + the kernel stack, hmmmm.....
*/ + flush_all_other_child_slums(parent, min_key(current_leaf)); +} +else +{ + unlock(parent); +} +/* parent is unlocked by squeeze_level_left, and squeezing may have + changed the parent of current_leaf */ + +parent = parent(current_leaf); +/* parent is locked */ +if (leftmost_child(parent) == current_leaf) + allocate(parent); + +/* ok, now we are ready to proceed through slum on the leaf level */ +next_leaf = get_right_neighbor_in_slum_level(current_leaf); +/* next_leaf is locked or null */ + +/* need to review locking in the below */ +while(next_leaf) +{ + if (is_formatted(current_leaf) && is_formatted(next_leaf)) + { + squeeze_left(current_leaf, next_leaf); + if (is_empty(next_leaf)) + { + delete_node(next_leaf); + next_leaf = get_right_neighbor_in_slum_level(current_leaf); + check_and_handle_parent_change(); + continue; + } + } + if (is_unformatted(current_leaf)) + /* allocate or reallocate */ + allocate_extent_in_parent_if_needed(current_leaf); + /* the above may change the parent */ + check_and_handle_parent_change(); + allocate(current_leaf); + next_leaf = get_right_neighbor_in_slum_level(current_leaf); + check_and_handle_parent_change(); + + +} + +/* this means squeeze it as well as allocate it */ +handle_non_leaf_end_of_slum(); + +check_and_handle_parent_change() +{ +if ( (new_parent = parent(current_leaf)) != parent) + squeeze_left(parent, new_parent(parent)); + else + return; + +/* the line above can change who the parent is so retest... */ + if((new_parent = parent(current_leaf)) != parent) + { + parent = new_parent; + if (leftmost_child(parent) != current_leaf) + reiser_panic("reiser-2071: this needs recoding to handle this case"); + allocate_node(parent); + /* allocating other ancestors left for josh */ + + /* our new parent might not be well packed, and we want + it to be well packed even if our slum never reaches its edge + so we... 
*/ + squeeze_left(parent, right_neighbor(parent)); + } +} + + +################################################################################ + + +The Problem: + +We need to know the relocate set in order to perform a left-to-right +parent-first allocate-and-squeeze traversal over a dirty sub-tree. We +could make this decision during the allocate-and-squeeze pass, but in +that case we would discover a node is dirty when we have already +passed over its position in the parent-first ordering. In other words, +we would discover this information too late to be useful. + +The Several-Pass Solution: + +It is possible to construct this relocate information at flush time by +scanning the tree, but it means at least two passes over the tree. +Using several passes has an advantage: we can then choose overwrite +when the "optimal location" is part of the atom's own preserve set. +This requires knowing the (partial) set of blocks being flushed before +allocation begins. This strategy was initially proposed by Hans, +before we realized it would require multiple passes. + +The Solution: + +Maintain an active count of dirty children for each node. This allows +us to mark a node dirty whenever its dirty count becomes >= 2 because +at that point overwriting the parent reduces the total number of +blocks written to disk. How much of a counter is needed? In order to +keep track of additions and subtractions to this count, a counter of +the same size as our znode's c_count field is needed. If this value +were only ever incremented, then we could use a single bit (0 = no +dirty children, 1 = single dirty child, otherwise mark dirty & +relocate). But since a node may have dirty children added while +flushes are active (it can happen, right?) this requires more than just +a bit. I worry about the complexity of maintaining this dirty count, +but I fear that the parent-first allocation policy will not succeed +without knowing beforehand all the dirty nodes it must consider.
+ +The Algorithm: + +We assume, as stated above, that nodes are marked dirty whenever they +should be relocated (i.e., that the flush algorithm does not make this +decision as part of its pass over the tree). + +Starting from some leaf node, find the greatest dirty ancestor, defined as the +least (i.e., lowest-level) ancestor of a node whose own parent is not dirty. The +greatest dirty ancestor will be overwritten, therefore its preceding node in +the parent-first order should not be considered. + + [Dead text: If the greatest dirty ancestor is NOT the leftmost child of + its own parent (and not the root node), there may be a dirty + parent-first-ordered node in a subtree to the left of this one. In those + cases, from the greatest dirty ancestor, find the leftmost in-memory + descendant. If the leftmost descendant is dirty, consider its left + neighbor. If the neighbor is also dirty, repeat the steps of this + paragraph starting at that node (i.e., find the new greatest dirty + ancestor).] + +Pseudo-code for this is: + +/* Starting from a node, find the greatest dirty ancestor. */ +jnode* greatest_dirty_ancestor (jnode *node) +{ + while (! is_root_node (node)) { + znode *parent = get_parent (node); + + if (! znode_is_dirty (parent)) { + break; + } + + node = parent; + } + return node; +} + +Now we have found the greatest dirty ancestor from which to begin allocating +and squeezing. From this point we will traverse all descendants of the +greatest dirty ancestor, in parent-first order, allocating blocks and +squeezing nodes in the following order. Squeezing must be performed in a +bottom-up, left-to-right order, whereas allocation occurs in parent-first +order.
The following pseudo-code accomplishes both at once: + +######################################################################## + +Problems with above to be addressed: + +Nikita suggests squeezing all the formatted nodes of a twig before allocating +its extents, thereby increasing room for extents to "inflate". + +######################################################################## + +/* A function to find the parent-first-preceder of a node, although + * there may not be enough nodes in memory to actually compute this. + * In that case, pick something else. If node is leftmost child of + * its parent, return its parent's block number. Otherwise if node + * is a leaf node, return its left neighbor. Finally, return the + * block number of the parent's left neighbor's rightmost descendent + * (which may not be in memory). In the actual implementation of the + * parent-first traversal below, we can optimize this (because we + * know the result most of the time). */ +blocknr parent_first_preceder_of (jnode *node) { ... } + +/* A parent-first recursive tree traversal, allocate and squeeze. + * This is called on the greatest dirty ancestor of a region to be + * flushed. + */ +void allocate_and_squeeze_parent_first (jnode *node) +{ + /* Stop recursion if its not dirty, meaning don't allocate children either. + * Children might be dirty but there is an overwrite below this level + * or else this node would be dirty. */ + if (! is_dirty (node)) { + return; + } + + /* Allocate (parent) first. */ + allocate_node (node, parent_first_preceder_of (node)); + + if (jnode_is_unformatted (node)) { + /* We got here because the parent (twig) of an unformatted node is + * not being relocated. Otherwise this recursion does not descend + * to unformatted nodes. 
*/ + return; + } + + /* Recursive case: */ + if (jnode_get_level (node) > LEAF_LEVEL) { + + for (each_item_left_to_right (node)) { + + if (is_extent_item (item) && extent_item_is_dirty (item)) { + allocate_extent_item (item); + } else if (is_internal_item (item) && jnode_is_dirty (internal_item_child (item))) { + allocate_and_squeeze_parent_first (internal_item_child (item)); + } + } + } + + /* Squeeze a node: note that this makes the "one big memcpy" + * approach somewhat more difficult, but its still possible. */ + while (not_empty (node) && jnode_is_formatted (node->right) && is_dirty (node->right)) { + + item = first_item_of (node->right); + + if (is_extent_item (item) && extent_item_is_dirty (item)) { + allocate_extent_item_into (item, node); + } else if (can_shift_into (item, node)) { + shift_item (item, node); + } + } +} + +######################################################################## +######################################################################## + +######################################################################## +######################################################################## + +Hans says: + +Relocate parent if leftmost child is also relocated + +Relocate if leftmost-child of parent. + +Ignore the "always relocate children if two children of a node are dirty" +idea. + +Rather than scan left at the leaf level, why not jump to parent, check +left-most child dirty, and stop? + +######################################################################## +######################################################################## + +Dead pseudo code, older stuff: + +######################################################################## +######################################################################## + + +Problem: The root of a subtree gets overwritten, so the subtree to the left +will not follow in parent-first order. That would simplify things. 
Killed +this code: + +/* Starting at a node, find the greatest dirty parent, then see if it + * has a preceding dirty node on the leaf of the subtree to its left. */ +void find_maximal_dirty_ancestor (jnode *node) +{ + repeat: + node = greatest_dirty_ancestor (node) + + /* End search at the root node or if the node is the leftmost child + * of its parent, in which case the left-of-leftmost-descendent does + * not precede it in parent first order, its parent does in that + * case. */ + if (! is_root_node (node) && ! leftmost_child_of_parent (node)) { + jnode *godown = node; + + while (jnode_get_level (godown) > LEAF_LEVEL) { + /* Iterate downward as long as leftmost nodes in memory (note: + * they don't have to be dirty). */ + jnode *child = leftmost_child (godown); + + if (child == NULL) { + return node; + } + + godown = child; + } + + /* Reached the leftmost descendant of the maximal dirty node, + * now see if its left is dirty. Otherwise return. */ + if ((godown = godown->left) == NULL || ! jnode_is_dirty (godown)) { + return node; + } + + /* At this point, "godown" precedes "node" in the parent-first + * traversal, so search for a new maximal dirty node. */ + node = godown; + goto repeat; + } +} + +/* Allocate and squeeze starting at the greatest dirty ancestor + * described above. Repeat in rightward direction for adjacent + * subtrees. + */ +void allocate_and_squeeze_parent_first (jnode *node) +{ + jnode *right; + +repeat: + /* Do one sub-tree */ + allocate_and_squeeze_parent_first_subtree (node); + + /* Now try to repeat to the right. */ + right = get_right_neighbor (node); + + if (right != NULL && jnode_is_dirty (right)) { + node = greatest_dirty_ancestor (right); + goto repeat; + } +} + + +/* The crap below was my first attempt to write this iteratively. 
*/ + + + + + +jnode *maximal_dirty_ancestor = ...; /* Computed using above algorithm */ +jnode *left_edge[MAX_LEVELS]; /* Vertical edge of left-to-right scan */ +int top_edge = jnode_get_level (node) - LEAF_LEVEL; /* Highest index to the left_edge array -- + * by subtracting LEAF_LEVEL it becomes 0-origin */ + +/* Initialize left_edge array entries to NULL, set top edge */ +left_edge[top_edge] = maximal_dirty_ancestor; + +/* For each node above the leaf level, set the child in left_edge */ +for (int level = top_edge; level >= 1; level -= 1) { + + jnode *parent = left_edge[level]; + + /* Find its leftmost dirty child. */ + jnode *child = leftmost_dirty_child (parent); + + /* Its possible that a dirty node could have no dirty children, + * in which case leave the lower edges NULL. */ + if (child == NULL) { break; } + + left_edge[level-1] = child; +} + +/* To store the lowest dirty entry in left_edge[]. */ +int current_level = 0; + +/* Allocate each node in the left edge. */ +for (int level = top_edge; level >= 0 && left_edge[level] != NULL; level -= 1) { + + jnode *node = left_edge[level]; + + /* Allocate this node... */ + allocate_node (node, parent_first_preceder_of (node)); + + current_level = level; +} + +/* Now starting with the current level, squeeze and allocate until finished. */ +while (current_level <= top_level) { + + jnode *current_node = left_edge[current_level]; + + if (jnode_is_formatted (current_node)) { + + do { + + /* Shift as much as possible. */ + while (node_has_room_to_shift_into (current_node)) { + if (is_twig_level (current_node)) { + shift_left (current_node, current_node->right); + } else { + allocate_extents_and_shift_left (current_node, current_node->right); + } + } + + /* Once it has been tightly packed, allocate it. 
*/ + allocate_node (current_node, parent_first_preceder_of (node)); + + current_node = current_node->right; + } + + current_level += 1; +} + diff -Naurp linux-2.6.4/fs/reiser4/plugin/hash.c linux-2.6.4-ck1/fs/reiser4/plugin/hash.c --- linux-2.6.4/fs/reiser4/plugin/hash.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/hash.c 2004-03-11 22:45:15.311506787 +1100 @@ -0,0 +1,312 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Hash functions */ + +#include "../debug.h" +#include "plugin_header.h" +#include "plugin.h" +#include "../super.h" + +#include + +/* old rupasov (yura) hash: the bytes of @name are treated as decimal digits (byte - 48) weighted by descending powers of ten, every index from max(@len, 40) up to 255 is then added, and the sum is shifted left by 7 bits. NOTE(review): pow is a plain int, so it overflows for names longer than 10 bytes - confirm whether existing hash values depend on that before changing its type. */ +static __u64 +hash_rupasov(const unsigned char *name /* name to hash */ , + int len /* @name's length */ ) +{ + int i; + int j; + int pow; + __u64 a; + __u64 c; + + assert("nikita-672", name != NULL); + assert("nikita-673", len >= 0); + + for (pow = 1, i = 1; i < len; ++i) + pow = pow * 10; + + if (len == 1) + a = name[0] - 48; + else + a = (name[0] - 48) * pow; + + for (i = 1; i < len; ++i) { + c = name[i] - 48; + for (pow = 1, j = i; j < len - 1; ++j) + pow = pow * 10; + a = a + c * pow; + } + for (; i < 40; ++i) { + c = '0' - 48; + for (pow = 1, j = i; j < len - 1; ++j) + pow = pow * 10; + a = a + c * pow; + } + + for (; i < 256; ++i) { + c = i; + for (pow = 1, j = i; j < len - 1; ++j) + pow = pow * 10; + a = a + c * pow; + } + + a = a << 7; + return a; +} + +/* r5 hash. @len is only checked by the assertion: the loop runs until the terminating NUL byte, so @name has to be NUL-terminated. */ +static __u64 +hash_r5(const unsigned char *name /* name to hash */ , + int len UNUSED_ARG /* @name's length */ ) +{ + __u64 a = 0; + + assert("nikita-674", name != NULL); + assert("nikita-675", len >= 0); + + while (*name) { + a += *name << 4; + a += *name >> 4; + a *= 11; + name++; + } + return a; +} + +/* Keyed 32-bit hash function using TEA in a Davies-Meyer function + H0 = Key + Hi = E Mi(Hi-1) + Hi-1 + + (see Applied Cryptography, 2nd edition, p448). + + Jeremy Fitzhardinge 1998 + + Jeremy has agreed to the contents of reiserfs/README.
-Hans + + This code was blindly upgraded to __u64 by s/__u32/__u64/g. +*/ +static __u64 +hash_tea(const unsigned char *name /* name to hash */ , + int len /* @name's length */ ) +{ + __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u }; + + __u64 h0 = k[0], h1 = k[1]; + __u64 a, b, c, d; + __u64 pad; + int i; + + assert("nikita-676", name != NULL); + assert("nikita-677", len >= 0); + +#define DELTA 0x9E3779B9u +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ +#define PARTROUNDS 6 /* 6 gets complete mixing */ + +/* a, b, c, d - data; h0, h1 - accumulated hash. TEACORE(PARTROUNDS) is applied to every full 16-byte block of @name, TEACORE(FULLROUNDS) once to the final block, which is built from the remaining (fewer than 16) bytes and a pad derived from @len */ +#define TEACORE(rounds) \ + do { \ + __u64 sum = 0; \ + int n = rounds; \ + __u64 b0, b1; \ + \ + b0 = h0; \ + b1 = h1; \ + \ + do \ + { \ + sum += DELTA; \ + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ + } while(--n); \ + \ + h0 += b0; \ + h1 += b1; \ + } while(0) + + pad = (__u64) len | ((__u64) len << 8); + pad |= pad << 16; + + while (len >= 16) { + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << 16 | (__u64) name[3] << 24; + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << 16 | (__u64) name[7] << 24; + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << 16 | (__u64) name[11] << 24; + d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14] << 16 | (__u64) name[15] << 24; + + TEACORE(PARTROUNDS); + + len -= 16; + name += 16; + } + + if (len >= 12) { + //assert(len < 16); + if (len >= 16) + *(int *) 0 = 0; + + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << 16 | (__u64) name[3] << 24; + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << 16 | (__u64) name[7] << 24; + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] << 16 | (__u64) name[11] << 24; + + d = pad; + for (i = 12; i < len; i++) { + d <<= 8; + d |= name[i]; + } + } else if (len >= 8) { + //assert(len < 12); + if (len >= 12) + *(int *) 0 = 0; + a = (__u64) name[0] |
(__u64) name[1] << 8 | (__u64) name[2] << 16 | (__u64) name[3] << 24; + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] << 16 | (__u64) name[7] << 24; + + c = d = pad; + for (i = 8; i < len; i++) { + c <<= 8; + c |= name[i]; + } + } else if (len >= 4) { + //assert(len < 8); + if (len >= 8) + *(int *) 0 = 0; + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] << 16 | (__u64) name[3] << 24; + + b = c = d = pad; + for (i = 4; i < len; i++) { + b <<= 8; + b |= name[i]; + } + } else { + //assert(len < 4); + if (len >= 4) + *(int *) 0 = 0; + a = b = c = d = pad; + for (i = 0; i < len; i++) { + a <<= 8; + a |= name[i]; + } + } + + TEACORE(FULLROUNDS); + +/* return 0;*/ + return h0 ^ h1; + +} + +/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash. + + See http://www.isthe.com/chongo/tech/comp/fnv/ for details. + + Excerpts: + + FNV hashes are designed to be fast while maintaining a low collision + rate. + + [This version also seems to preserve lexicographical order locally.] + + FNV hash algorithms and source code have been released into the public + domain.
+ +*/ +static __u64 +hash_fnv1(const unsigned char *name /* name to hash */ , + int len UNUSED_ARG /* @name's length */ ) +{ + unsigned long long a = 0xcbf29ce484222325ull; + const unsigned long long fnv_64_prime = 0x100000001b3ull; + + assert("nikita-678", name != NULL); + assert("nikita-679", len >= 0); + + /* FNV-1 hash each octet of @name up to the terminating NUL; @len does not bound the loop */ + for (; *name; ++name) { + /* multiply by the 64 bit FNV magic prime mod 2^64 */ + a *= fnv_64_prime; + /* xor the bottom with the current octet */ + a ^= (unsigned long long) (*name); + } + /* return our new hash value */ + return a; +} + +/* degenerate hash function used to simplify testing of non-unique key + handling: every name maps to the same constant */ +static __u64 +hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ , + int len UNUSED_ARG /* @name's length */ ) +{ + ON_TRACE(TRACE_DIR, "Hashing %s\n", name); + return 0xc0c0c0c010101010ull; +} + +/* hash plugins: descriptor table with one entry per hash id */ +hash_plugin hash_plugins[LAST_HASH_ID] = { + [RUPASOV_HASH_ID] = { + .h = { + .type_id = REISER4_HASH_PLUGIN_TYPE, + .id = RUPASOV_HASH_ID, + .pops = NULL, + .label = "rupasov", + .desc = "Original Yura's hash", + .linkage = TYPE_SAFE_LIST_LINK_ZERO} + , + .hash = hash_rupasov + }, + [R5_HASH_ID] = { + .h = { + .type_id = REISER4_HASH_PLUGIN_TYPE, + .id = R5_HASH_ID, + .pops = NULL, + .label = "r5", + .desc = "r5 hash", + .linkage = TYPE_SAFE_LIST_LINK_ZERO} + , + .hash = hash_r5 + }, + [TEA_HASH_ID] = { + .h = { + .type_id = REISER4_HASH_PLUGIN_TYPE, + .id = TEA_HASH_ID, + .pops = NULL, + .label = "tea", + .desc = "tea hash", + .linkage = TYPE_SAFE_LIST_LINK_ZERO} + , + .hash = hash_tea + }, + [FNV1_HASH_ID] = { + .h = { + .type_id = REISER4_HASH_PLUGIN_TYPE, + .id = FNV1_HASH_ID, + .pops = NULL, + .label = "fnv1", + .desc = "fnv1 hash", + .linkage = TYPE_SAFE_LIST_LINK_ZERO} + , + .hash = hash_fnv1 + }, + [DEGENERATE_HASH_ID] = { + .h = { + .type_id = REISER4_HASH_PLUGIN_TYPE, + .id = DEGENERATE_HASH_ID, + .pops = NULL, + .label = "degenerate hash", + .desc = 
"Degenerate hash: only for testing", + .linkage = TYPE_SAFE_LIST_LINK_ZERO} + , + .hash = hash_deg + } +}; + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/acl.h linux-2.6.4-ck1/fs/reiser4/plugin/item/acl.h --- linux-2.6.4/fs/reiser4/plugin/item/acl.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/acl.h 2004-03-11 22:45:15.312506632 +1100 @@ -0,0 +1,64 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Directory entry. */ + +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ) +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ + +#include "../../forward.h" +#include "../../dformat.h" +#include "../../kassign.h" +#include "../../key.h" + +#include +#include /* for struct dentry */ + +typedef struct directory_entry_format { + /* key of object stat-data. It's not necessary to store whole + key here, because it's always key of stat-data, so minor + packing locality and offset can be omitted here. But this + relies on particular key allocation scheme for stat-data, so, + for extensibility sake, whole key can be stored here. + + We store key as array of bytes, because we don't want 8-byte + alignment of dir entries. + */ + obj_key_id id; + /* file name. Null terminated string. 
*/ + d8 name[0]; +} directory_entry_format; + +void print_de(const char *prefix, coord_t * coord); +int extract_key_de(const coord_t * coord, reiser4_key * key); +int update_key_de(const coord_t * coord, const reiser4_key * key, lock_handle * lh); +char *extract_name_de(const coord_t * coord, char *buf); +unsigned extract_file_type_de(const coord_t * coord); +int add_entry_de(struct inode *dir, coord_t * coord, + lock_handle * lh, const struct dentry *name, reiser4_dir_entry_desc * entry); +int rem_entry_de(struct inode *dir, const struct qstr * name, coord_t * coord, lock_handle * lh, reiser4_dir_entry_desc * entry); +int max_name_len_de(const struct inode *dir); + + +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length); + +char *extract_dent_name(const coord_t * coord, + directory_entry_format *dent, char *buf); + +#if REISER4_LARGE_KEY +#define DE_NAME_BUF_LEN (24) +#else +#define DE_NAME_BUF_LEN (16) +#endif + +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */ +#endif + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/blackbox.c linux-2.6.4-ck1/fs/reiser4/plugin/item/blackbox.c --- linux-2.6.4/fs/reiser4/plugin/item/blackbox.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/blackbox.c 2004-03-11 22:45:15.312506632 +1100 @@ -0,0 +1,142 @@ +/* Copyright 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Black box item implementation */ + +#include "../../forward.h" +#include "../../debug.h" +#include "../../dformat.h" +#include "../../kassign.h" +#include "../../coord.h" +#include "../../tree.h" +#include "../../lock.h" + +#include "blackbox.h" +#include "item.h" +#include "../plugin.h" + + +reiser4_internal int +store_black_box(reiser4_tree *tree, + const reiser4_key *key, void *data, int length) +{ + int result; + reiser4_item_data idata; + coord_t coord; + lock_handle lh; + + xmemset(&idata, 0, sizeof idata); + + idata.data = data; + idata.user = 0; + idata.length = length; + idata.iplug = item_plugin_by_id(BLACK_BOX_ID); + + init_lh(&lh); + result = insert_by_key(tree, key, + &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE); + + assert("nikita-3413", + ergo(result == 0, + WITH_COORD(&coord, item_length_by_coord(&coord) == length))); + + done_lh(&lh); + return result; +} + +reiser4_internal int +load_black_box(reiser4_tree *tree, + reiser4_key *key, void *data, int length, int exact) +{ + int result; + coord_t coord; + lock_handle lh; + + init_lh(&lh); + result = coord_by_key(tree, key, + &coord, &lh, ZNODE_READ_LOCK, + exact ? 
FIND_EXACT : FIND_MAX_NOT_MORE_THAN, + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL); + + if (result == 0) { + int ilen; + + result = zload(coord.node); + if (result == 0) { + ilen = item_length_by_coord(&coord); + if (ilen <= length) { + xmemcpy(data, item_body_by_coord(&coord), ilen); + unit_key_by_coord(&coord, key); + } else if (exact) { + /* + * item is larger than buffer provided by the + * user. Only issue a warning if @exact is + * set. If @exact is false, we are iterating + * over all safe-links and here we are reaching + * the end of the iteration. + */ + warning("nikita-3415", + "Wrong black box length: %i > %i", + ilen, length); + result = RETERR(-EIO); + } + zrelse(coord.node); + } + } + + done_lh(&lh); + return result; + +} + +reiser4_internal int +update_black_box(reiser4_tree *tree, + const reiser4_key *key, void *data, int length) +{ + int result; + coord_t coord; + lock_handle lh; + + init_lh(&lh); + result = coord_by_key(tree, key, + &coord, &lh, ZNODE_READ_LOCK, + FIND_EXACT, + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL); + if (result == 0) { + int ilen; + + result = zload(coord.node); + if (result == 0) { + ilen = item_length_by_coord(&coord); + if (length <= ilen) { + xmemcpy(item_body_by_coord(&coord), data, length); + } else { + warning("nikita-3437", + "Wrong black box length: %i < %i", + ilen, length); + result = RETERR(-EIO); + } + zrelse(coord.node); + } + } + + done_lh(&lh); + return result; + +} + +reiser4_internal int kill_black_box(reiser4_tree *tree, const reiser4_key *key) +{ + return cut_tree(tree, key, key, NULL); +} + + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/blackbox.h linux-2.6.4-ck1/fs/reiser4/plugin/item/blackbox.h --- linux-2.6.4/fs/reiser4/plugin/item/blackbox.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/blackbox.h 2004-03-11 22:45:15.313506476 +1100 @@ -0,0 +1,33 @@ +/* Copyright 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* "Black box" item: a fixed-width entry that contains user-supplied data */ + +#if !defined( __FS_REISER4_BLACK_BOX_H__ ) +#define __FS_REISER4_BLACK_BOX_H__ + +#include "../../forward.h" +#include "../../dformat.h" +#include "../../kassign.h" +#include "../../key.h" + +extern int store_black_box(reiser4_tree *tree, + const reiser4_key *key, void *data, int length); +extern int load_black_box(reiser4_tree *tree, + reiser4_key *key, void *data, int length, int exact); +extern int kill_black_box(reiser4_tree *tree, const reiser4_key *key); +extern int update_black_box(reiser4_tree *tree, + const reiser4_key *key, void *data, int length); + +/* __FS_REISER4_BLACK_BOX_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/cde.c linux-2.6.4-ck1/fs/reiser4/plugin/item/cde.c --- linux-2.6.4/fs/reiser4/plugin/item/cde.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/cde.c 2004-03-11 22:45:15.315506165 +1100 @@ -0,0 +1,1073 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Directory entry implementation */ + +/* DESCRIPTION: + + This is "compound" directory item plugin implementation. 
This directory + item type is compound (as opposed to the "simple directory item" in + fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory + entries. + + The reason behind this decision is disk space efficiency: all directory + entries inside the same directory have an identical fragment in their + keys. This, of course, depends on key assignment policy. In our default key + assignment policy, all directory entries have the same locality which is + equal to the object id of their directory. + + Composing directory item out of several directory entries for the same + directory allows us to store said key fragment only once. That is, this is + some ad hoc form of key compression (stem compression) that is implemented + here, because general key compression is not supposed to be implemented in + v4.0. + + Another decision that was made regarding all directory item plugins is + that they will store entry keys unaligned. This is for the sake of disk + space efficiency again. + + It should be noted that storing keys unaligned increases CPU consumption, + at least on some architectures. + + Internal on-disk structure of the compound directory item is the following: + + HEADER cde_item_format. Here number of entries is stored. + ENTRY_HEADER_0 cde_unit_header. Here part of entry key and + ENTRY_HEADER_1 offset of entry body are stored. + ENTRY_HEADER_2 (basically two last parts of key) + ... + ENTRY_HEADER_N + ENTRY_BODY_0 directory_entry_format. Here part of stat data key and + ENTRY_BODY_1 NUL-terminated name are stored. + ENTRY_BODY_2 (part of stat-data key in the + sense that since all SDs have + zero offset, this offset is not + stored on disk). + ... + ENTRY_BODY_N + + When it comes to the balancing, each directory entry in compound directory + item is a unit, that is, something that can be cut from one item and pasted + into another item of the same type. Handling of unit cut and paste is the major + reason for the complexity of code below. 
+ +*/ + +#include "../../forward.h" +#include "../../debug.h" +#include "../../dformat.h" +#include "../../kassign.h" +#include "../../key.h" +#include "../../coord.h" +#include "sde.h" +#include "cde.h" +#include "item.h" +#include "../node/node.h" +#include "../plugin.h" +#include "../../znode.h" +#include "../../carry.h" +#include "../../tree.h" +#include "../../inode.h" + +#include /* for struct inode */ +#include /* for struct dentry */ +#include + +#if 0 +#define CHECKME(coord) \ +({ \ + const char *message; \ + coord_t dup; \ + \ + coord_dup_nocheck(&dup, (coord)); \ + dup.unit_pos = 0; \ + assert("nikita-2871", cde_check(&dup, &message) == 0); \ +}) +#else +#define CHECKME(coord) noop +#endif + + +/* return body of compound directory item at @coord */ +static inline cde_item_format * +formatted_at(const coord_t * coord) +{ + assert("nikita-1282", coord != NULL); + return item_body_by_coord(coord); +} + +/* return entry header at @coord */ +static inline cde_unit_header * +header_at(const coord_t * coord /* coord of item */ , + int idx /* index of unit */ ) +{ + assert("nikita-1283", coord != NULL); + return &formatted_at(coord)->entry[idx]; +} + +/* return number of units in compound directory item at @coord */ +static int +units(const coord_t * coord /* coord of item */ ) +{ + return d16tocpu(&formatted_at(coord)->num_of_entries); +} + +/* return offset of the body of @idx-th entry in @coord */ +static unsigned int +offset_of(const coord_t * coord /* coord of item */ , + int idx /* index of unit */ ) +{ + if (idx < units(coord)) + return d16tocpu(&header_at(coord, idx)->offset); + else if (idx == units(coord)) + return item_length_by_coord(coord); + else + impossible("nikita-1308", "Wrong idx"); + return 0; +} + +/* set offset of the body of @idx-th entry in @coord */ +static void +set_offset(const coord_t * coord /* coord of item */ , + int idx /* index of unit */ , + unsigned int offset /* new offset */ ) +{ + cputod16((__u16) offset, &header_at(coord, 
idx)->offset); +} + +static void +adj_offset(const coord_t * coord /* coord of item */ , + int idx /* index of unit */ , + int delta /* offset change */ ) +{ + d16 *doffset; + __u16 offset; + + doffset = &header_at(coord, idx)->offset; + offset = d16tocpu(doffset); + offset += delta; + cputod16((__u16) offset, doffset); +} + +/* return pointer to @offset-th byte from the beginning of @coord */ +static char * +address(const coord_t * coord /* coord of item */ , + int offset) +{ + return ((char *) item_body_by_coord(coord)) + offset; +} + +/* return pointer to the body of @idx-th entry in @coord */ +static directory_entry_format * +entry_at(const coord_t * coord /* coord of + * item */ , + int idx /* index of unit */ ) +{ + return (directory_entry_format *) address(coord, (int) offset_of(coord, idx)); +} + +/* return number of unit referenced by @coord */ +static int +idx_of(const coord_t * coord /* coord of item */ ) +{ + assert("nikita-1285", coord != NULL); + return coord->unit_pos; +} + +/* find position where entry with @entry_key would be inserted into @coord */ +static int +find(const coord_t * coord /* coord of item */ , + const reiser4_key * entry_key /* key to look for */ , + cmp_t * last /* result of last comparison */ ) +{ + int entries; + + int left; + int right; + + cde_unit_header *header; + + assert("nikita-1295", coord != NULL); + assert("nikita-1296", entry_key != NULL); + assert("nikita-1297", last != NULL); + + entries = units(coord); + left = 0; + right = entries - 1; + while (right - left >= REISER4_SEQ_SEARCH_BREAK) { + int median; + + median = (left + right) >> 1; + + header = header_at(coord, median); + *last = de_id_key_cmp(&header->hash, entry_key); + switch (*last) { + case LESS_THAN: + left = median; + break; + case GREATER_THAN: + right = median; + break; + case EQUAL_TO: { + do { + median --; + header --; + } while (median >= 0 && + de_id_key_cmp(&header->hash, + entry_key) == EQUAL_TO); + return median + 1; + } + } + } + header = 
header_at(coord, left); + for (; left < entries; ++ left, ++ header) { + prefetch(header + 1); + *last = de_id_key_cmp(&header->hash, entry_key); + if (*last != LESS_THAN) + break; + } + if (left < entries) + return left; + else + return RETERR(-ENOENT); + +} + +/* expand @coord so as to accommodate insertion of @no new entries starting + from @pos. @size is the length of the contents already in the item; @data_size bytes are opened for the new entry bodies. */ +static int +expand_item(const coord_t * coord /* coord of item */ , + int pos /* unit position */ , int no /* number of new + * units*/ , + int size /* length of item contents before expansion */ , + unsigned int data_size /* free space already reserved + * in the item for insertion */ ) +{ + int entries; + cde_unit_header *header; + char *dent; + int i; + + assert("nikita-1310", coord != NULL); + assert("nikita-1311", pos >= 0); + assert("nikita-1312", no > 0); + assert("nikita-1313", data_size >= no * sizeof (directory_entry_format)); + assert("nikita-1343", item_length_by_coord(coord) >= (int) (size + data_size + no * sizeof *header)); + + entries = units(coord); + + if (pos == entries) + dent = address(coord, size); + else + dent = (char *) entry_at(coord, pos); + /* place where new header will be in */ + header = header_at(coord, pos); + /* free space for new entry headers */ + xmemmove(header + no, header, (unsigned) (address(coord, size) - (char *) header)); + /* if adding to the end initialise first new header */ + if (pos == entries) { + set_offset(coord, pos, (unsigned) size); + } + + /* adjust entry pointer and size */ + dent = dent + no * sizeof *header; + size += no * sizeof *header; + /* free space for new entries */ + xmemmove(dent + data_size, dent, (unsigned) (address(coord, size) - dent)); + + /* increase counter */ + entries += no; + cputod16((__u16) entries, &formatted_at(coord)->num_of_entries); + + /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header ) + bytes. */ + for (i = 0; i <= pos; ++i) + adj_offset(coord, i, no * sizeof *header); + /* [ pos + no ... 
+\infty ) entries were shifted by ( no * + sizeof *header + data_size ) bytes */ + for (i = pos + no; i < entries; ++i) + adj_offset(coord, i, no * sizeof *header + data_size); + return 0; +} + +/* insert new @entry into item */ +static int +expand(const coord_t * coord /* coord of item */ , + cde_entry * entry /* entry to insert */ , + int len /* length of @entry data */ , + int *pos /* position to insert */ , + reiser4_dir_entry_desc * dir_entry /* parameters for new + * entry */ ) +{ + cmp_t cmp_res; + int datasize; + + *pos = find(coord, &dir_entry->key, &cmp_res); + if (*pos < 0) + *pos = units(coord); + + datasize = sizeof (directory_entry_format); + if (is_longname(entry->name->name, entry->name->len)) + datasize += entry->name->len + 1; + + expand_item(coord, *pos, 1, item_length_by_coord(coord) - len, datasize); + return 0; +} + +/* paste body of @entry into item */ +static int +paste_entry(const coord_t * coord /* coord of item */ , + cde_entry * entry /* new entry */ , + int pos /* position to insert */ , + reiser4_dir_entry_desc * dir_entry /* parameters for + * new entry */ ) +{ + cde_unit_header *header; + directory_entry_format *dent; + const char *name; + int len; + + header = header_at(coord, pos); + dent = entry_at(coord, pos); + + build_de_id_by_key(&dir_entry->key, &header->hash); + build_inode_key_id(entry->obj, &dent->id); + /* AUDIT unsafe strcpy() operation! 
It should be replaced with + much less CPU hungry + memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len ); + + Also a more major thing is that there should be a way to figure out + amount of space in dent -> name and be able to check that we are + not going to overwrite more than we supposed to */ + name = entry->name->name; + len = entry->name->len; + if (is_longname(name, len)) { + strcpy((unsigned char *) dent->name, name); + cputod8(0, &dent->name[len]); + } + return 0; +} + +/* estimate how much space is necessary in item to insert/paste set of entries + described in @data. */ +reiser4_internal int +estimate_cde(const coord_t * coord /* coord of item */ , + const reiser4_item_data * data /* parameters for new item */ ) +{ + cde_entry_data *e; + int result; + int i; + + e = (cde_entry_data *) data->data; + + assert("nikita-1288", e != NULL); + assert("nikita-1289", e->num_of_entries >= 0); + + if (coord == NULL) + /* insert */ + result = sizeof (cde_item_format); + else + /* paste */ + result = 0; + + result += e->num_of_entries * + (sizeof (cde_unit_header) + sizeof (directory_entry_format)); + for (i = 0; i < e->num_of_entries; ++i) { + const char *name; + int len; + + name = e->entry[i].name->name; + len = e->entry[i].name->len; + assert("nikita-2054", strlen(name) == len); + if (is_longname(name, len)) + result += len + 1; + } + ((reiser4_item_data *) data)->length = result; + return result; +} + +/* ->nr_units() method for this item plugin. */ +reiser4_internal pos_in_node_t +nr_units_cde(const coord_t * coord /* coord of item */ ) +{ + return units(coord); +} + +/* ->unit_key() method for this item plugin. 
*/ +reiser4_internal reiser4_key * +unit_key_cde(const coord_t * coord /* coord of item */ , + reiser4_key * key /* resulting key */ ) +{ + assert("nikita-1452", coord != NULL); + assert("nikita-1345", idx_of(coord) < units(coord)); + assert("nikita-1346", key != NULL); + + item_key_by_coord(coord, key); + extract_key_from_de_id(extract_dir_id_from_key(key), &header_at(coord, idx_of(coord))->hash, key); + return key; +} + +/* mergeable_cde(): implementation of ->mergeable() item method. + + Two directory items are mergeable iff they are from the same + directory. That simple. + +*/ +reiser4_internal int +mergeable_cde(const coord_t * p1 /* coord of first item */ , + const coord_t * p2 /* coord of second item */ ) +{ + reiser4_key k1; + reiser4_key k2; + + assert("nikita-1339", p1 != NULL); + assert("nikita-1340", p2 != NULL); + + return + (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) && + (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) == + extract_dir_id_from_key(item_key_by_coord(p2, &k2))); + +} + +/* ->max_key_inside() method for this item plugin. */ +reiser4_internal reiser4_key * +max_key_inside_cde(const coord_t * coord /* coord of item */ , + reiser4_key * result /* resulting key */) +{ + assert("nikita-1342", coord != NULL); + + item_key_by_coord(coord, result); + set_key_ordering(result, get_key_ordering(max_key())); + set_key_objectid(result, get_key_objectid(max_key())); + set_key_offset(result, get_key_offset(max_key())); + return result; +} + +/* @data contains data which are to be put into tree */ +reiser4_internal int +can_contain_key_cde(const coord_t * coord /* coord of item */ , + const reiser4_key * key /* key to check */ , + const reiser4_item_data * data /* parameters of new + * item/unit being + * created */ ) +{ + reiser4_key item_key; + + /* FIXME-VS: do not rely on anything but iplug field of @data. 
Only + data->iplug is initialized */ + assert("vs-457", data && data->iplug); +/* assert( "vs-553", data -> user == 0 );*/ + item_key_by_coord(coord, &item_key); + + return (item_plugin_by_coord(coord) == data->iplug) && + (extract_dir_id_from_key(&item_key) == extract_dir_id_from_key(key)); +} + +#if REISER4_DEBUG_OUTPUT +/* ->print() method for this item plugin. */ +reiser4_internal void +print_cde(const char *prefix /* prefix to print */ , + coord_t * coord /* coord of item to print */ ) +{ + assert("nikita-1077", prefix != NULL); + assert("nikita-1078", coord != NULL); + + if (item_length_by_coord(coord) < (int) sizeof (cde_item_format)) { + printk("%s: wrong size: %i < %i\n", prefix, item_length_by_coord(coord), sizeof (cde_item_format)); + } else { + char *name; + char *end; + char *start; + int i; + oid_t dirid; + reiser4_key key; + + start = address(coord, 0); + end = address(coord, item_length_by_coord(coord)); + item_key_by_coord(coord, &key); + dirid = extract_dir_id_from_key(&key); + + printk("%s: units: %i\n", prefix, nr_units_cde(coord)); + for (i = 0; i < units(coord); ++i) { + cde_unit_header *header; + + header = header_at(coord, i); + indent_znode(coord->node); + printk("\theader %i: ", i); + if ((char *) (header + 1) > end) { + printk("out of bounds: %p [%p, %p]\n", header, start, end); + } else { + extract_key_from_de_id(dirid, &header->hash, &key); + printk("%i: at %i, offset: %i, ", i, i * sizeof (*header), d16tocpu(&header->offset)); + print_key("key", &key); + } + } + for (i = 0; i < units(coord); ++i) { + directory_entry_format *entry; + char buf[DE_NAME_BUF_LEN]; + + entry = entry_at(coord, i); + indent_znode(coord->node); + printk("\tentry: %i: ", i); + if (((char *) (entry + 1) > end) || ((char *) entry < start)) { + printk("out of bounds: %p [%p, %p]\n", entry, start, end); + } else { + coord->unit_pos = i; + extract_key_cde(coord, &key); + name = extract_name_cde(coord, buf); + printk("at %i, name: %s, ", (char *) entry - start, name); 
+ print_key("sdkey", &key); + } + } + } +} +#endif + +#if REISER4_DEBUG +/* cde_check ->check() method for compressed directory items + + used for debugging, every item should have here the most complete + possible check of the consistency of the item that the inventor can + construct +*/ +reiser4_internal int +check_cde(const coord_t * coord /* coord of item to check */ , + const char **error /* where to store error message */ ) +{ + int i; + int result; + char *item_start; + char *item_end; + reiser4_key key; + + coord_t c; + + assert("nikita-1357", coord != NULL); + assert("nikita-1358", error != NULL); + + if (!ergo(coord->item_pos != 0, + is_dot_key(item_key_by_coord(coord, &key)))) { + *error = "CDE doesn't start with dot"; + return -1; + } + item_start = item_body_by_coord(coord); + item_end = item_start + item_length_by_coord(coord); + + coord_dup(&c, coord); + result = 0; + for (i = 0; i < units(coord); ++i) { + directory_entry_format *entry; + + if ((char *) (header_at(coord, i) + 1) > item_end - units(coord) * sizeof *entry) { + *error = "CDE header is out of bounds"; + result = -1; + break; + } + entry = entry_at(coord, i); + if ((char *) entry < item_start + sizeof (cde_item_format)) { + *error = "CDE header is too low"; + result = -1; + break; + } + if ((char *) (entry + 1) > item_end) { + *error = "CDE header is too high"; + result = -1; + break; + } + } + + return result; +} +#endif + +/* ->init() method for this item plugin. */ +reiser4_internal int +init_cde(coord_t * coord /* coord of item */ , + coord_t * from UNUSED_ARG, + reiser4_item_data * data /* structure used for insertion */ + UNUSED_ARG) +{ + cputod16(0u, &formatted_at(coord)->num_of_entries); + return 0; +} + +/* ->lookup() method for this item plugin. 
*/ +reiser4_internal lookup_result +lookup_cde(const reiser4_key * key /* key to search for */ , + lookup_bias bias /* search bias */ , + coord_t * coord /* coord of item to lookup in */ ) +{ + cmp_t last_comp; + int pos; + + reiser4_key utmost_key; + + assert("nikita-1293", coord != NULL); + assert("nikita-1294", key != NULL); + + CHECKME(coord); + + if (keygt(item_key_by_coord(coord, &utmost_key), key)) { + coord->unit_pos = 0; + coord->between = BEFORE_UNIT; + return CBK_COORD_NOTFOUND; + } + pos = find(coord, key, &last_comp); + if (pos >= 0) { + coord->unit_pos = (int) pos; + switch (last_comp) { + case EQUAL_TO: + coord->between = AT_UNIT; + return CBK_COORD_FOUND; + case GREATER_THAN: + coord->between = BEFORE_UNIT; + return RETERR(-ENOENT); + case LESS_THAN: + default: + impossible("nikita-1298", "Broken find"); + return RETERR(-EIO); + } + } else { + coord->unit_pos = units(coord) - 1; + coord->between = AFTER_UNIT; + return (bias == FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND; + } +} + +/* ->paste() method for this item plugin. */ +reiser4_internal int +paste_cde(coord_t * coord /* coord of item */ , + reiser4_item_data * data /* parameters of new unit being + * inserted */ , + carry_plugin_info * info UNUSED_ARG /* todo carry queue */ ) +{ + cde_entry_data *e; + int result; + int i; + + CHECKME(coord); + e = (cde_entry_data *) data->data; + + result = 0; + for (i = 0; i < e->num_of_entries; ++i) { + int pos; + int phantom_size; + + phantom_size = data->length; + if (units(coord) == 0) + phantom_size -= sizeof (cde_item_format); + + result = expand(coord, e->entry + i, phantom_size, &pos, data->arg); + if (result != 0) + break; + result = paste_entry(coord, e->entry + i, pos, data->arg); + if (result != 0) + break; + } + CHECKME(coord); + return result; +} + +/* amount of space occupied by all entries starting from @idx both headers and + bodies. 
*/ +static unsigned int +part_size(const coord_t * coord /* coord of item */ , + int idx /* index of unit */ ) +{ + assert("nikita-1299", coord != NULL); + assert("nikita-1300", idx < (int) units(coord)); + + return sizeof (cde_item_format) + + (idx + 1) * sizeof (cde_unit_header) + offset_of(coord, idx + 1) - offset_of(coord, 0); +} + +/* how many, but not more than @want, units of the item at @coord can be merged + with an item in @target node. If pend == SHIFT_LEFT we try to append the + first units of @coord to the last item of @target. If pend == SHIFT_RIGHT we + try to prepend the last units of @coord to the first item of @target. There + are @free_space bytes of free space. Returns the number of units that fit; + their total size is returned via @size */ +reiser4_internal int +can_shift_cde(unsigned free_space /* free space in item */ , + coord_t * coord /* coord of source item */ , + znode * target /* target node */ , + shift_direction pend /* shift direction */ , + unsigned *size /* resulting number of shifted bytes */ , + unsigned want /* maximal number of units to shift */ ) +{ + int shift; + + CHECKME(coord); + if (want == 0) { + *size = 0; + return 0; + } + + /* pend == SHIFT_LEFT <==> shifting to the left */ + if (pend == SHIFT_LEFT) { + for (shift = min((int) want - 1, units(coord)); shift >= 0; --shift) { + *size = part_size(coord, shift); + if (target != NULL) + *size -= sizeof (cde_item_format); + if (*size <= free_space) + break; + } + shift = shift + 1; + } else { + int total_size; + + assert("nikita-1301", pend == SHIFT_RIGHT); + + total_size = item_length_by_coord(coord); + for (shift = units(coord) - want - 1; shift < units(coord) - 1; ++shift) { + *size = total_size - part_size(coord, shift); + if (target == NULL) + *size += sizeof (cde_item_format); + if (*size <= free_space) + break; + } + shift = units(coord) - shift - 1; + } + if (shift == 0) + *size = 0; + CHECKME(coord); + return shift; +} + +/* ->copy_units() method for this item plugin. 
*/
+/* Copies @count units of @source, starting at unit @from, into @target.
+   With SHIFT_LEFT the units are appended after the last unit of @target;
+   otherwise the existing body of @target is first moved down by
+   @free_space bytes and the units are inserted at position 0. After the
+   entries and unit headers are copied, the offsets of the new units are
+   adjusted by the difference between their position in @target and in
+   @source. */
+reiser4_internal void
+copy_units_cde(coord_t * target /* coord of target item */ ,
+	       coord_t * source /* coord of source item */ ,
+	       unsigned from /* starting unit */ ,
+	       unsigned count /* how many units to copy */ ,
+	       shift_direction where_is_free_space /* shift direction */ ,
+	       unsigned free_space /* free space in item */ )
+{
+	char *header_from;
+	char *header_to;
+
+	char *entry_from;
+	char *entry_to;
+
+	int pos_in_target;
+	int data_size;
+	int data_delta;
+	int i;
+#if REISER4_TRACE && REISER4_DEBUG_OUTPUT
+	reiser4_key debug_key;
+#endif
+
+	assert("nikita-1303", target != NULL);
+	assert("nikita-1304", source != NULL);
+	assert("nikita-1305", (int) from < units(source));
+	assert("nikita-1307", (int) (from + count) <= units(source));
+
+	IF_TRACE(TRACE_DIR | TRACE_NODES, print_key("cde_copy source", item_key_by_coord(source, &debug_key)));
+	IF_TRACE(TRACE_DIR | TRACE_NODES, print_key("cde_copy target", item_key_by_coord(target, &debug_key)));
+
+	if (where_is_free_space == SHIFT_LEFT) {
+		/* appending: only a prefix of @source may be copied */
+		assert("nikita-1453", from == 0);
+		pos_in_target = units(target);
+	} else {
+		/* prepending: only a suffix of @source may be copied */
+		assert("nikita-1309", (int) (from + count) == units(source));
+		pos_in_target = 0;
+		/* pull the old body of @target back to the start of the item */
+		xmemmove(item_body_by_coord(target),
+			 (char *) item_body_by_coord(target) + free_space, item_length_by_coord(target) - free_space);
+	}
+
+	CHECKME(target);
+	CHECKME(source);
+
+	/* expand @target */
+	data_size = offset_of(source, (int) (from + count)) - offset_of(source, (int) from);
+
+	/* empty target: part of @free_space is taken by the item header */
+	if (units(target) == 0)
+		free_space -= sizeof (cde_item_format);
+
+	expand_item(target, pos_in_target, (int) count,
+		    (int) (item_length_by_coord(target) - free_space), (unsigned) data_size);
+
+	/* copy first @count units of @source into @target */
+	data_delta = offset_of(target, pos_in_target) - offset_of(source, (int) from);
+
+	/* copy entries */
+	entry_from = (char *) entry_at(source, (int) from);
+	entry_to = (char *) entry_at(source, (int) (from + count));
+	xmemmove(entry_at(target, pos_in_target), entry_from, (unsigned) (entry_to - entry_from));
+
+	/* copy headers */
+	header_from = (char *) header_at(source, (int) from);
+	header_to = (char *) header_at(source, (int) (from + count));
+	xmemmove(header_at(target, pos_in_target), header_from, (unsigned) (header_to - header_from));
+
+	/* update offsets */
+	for (i = pos_in_target; i < (int) (pos_in_target + count); ++i)
+		adj_offset(target, i, data_delta);
+	CHECKME(target);
+	CHECKME(source);
+}
+
+/* ->cut_units() method for this item plugin.
+
+   Removes units @from..@to (inclusive) from the item at @coord and returns
+   the number of bytes freed. When every unit is cut, nothing is moved and
+   the whole item length is returned. If @smallest_removed is not NULL it
+   receives the key of the unit @coord points to; if @new_first is not NULL
+   it receives the key of the unit following the one @coord points to. */
+reiser4_internal int
+cut_units_cde(coord_t * coord /* coord of item */ ,
+	      pos_in_node_t from /* start unit pos */ ,
+	      pos_in_node_t to /* stop unit pos */ ,
+	      struct carry_cut_data *cdata UNUSED_ARG, reiser4_key *smallest_removed,
+	      reiser4_key *new_first)
+{
+	char *header_from;
+	char *header_to;
+
+	char *entry_from;
+	char *entry_to;
+
+	int size;
+	int entry_delta;
+	int header_delta;
+	int i;
+
+	unsigned count;
+
+	CHECKME(coord);
+
+	/* @to is inclusive */
+	count = to - from + 1;
+
+	assert("nikita-1454", coord != NULL);
+	assert("nikita-1455", (int) (from + count) <= units(coord));
+
+	if (smallest_removed)
+		unit_key_by_coord(coord, smallest_removed);
+
+	if (new_first) {
+		coord_t next;
+
+		/* not everything is cut from item head */
+		assert("vs-1527", from == 0);
+		assert("vs-1528", to < units(coord) - 1);
+
+		coord_dup(&next, coord);
+		next.unit_pos ++;
+		unit_key_by_coord(&next, new_first);
+	}
+
+	size = item_length_by_coord(coord);
+	/* the whole item goes away: report its full length, move nothing */
+	if (count == (unsigned) units(coord)) {
+		return size;
+	}
+
+	header_from = (char *) header_at(coord, (int) from);
+	header_to = (char *) header_at(coord, (int) (from + count));
+
+	entry_from = (char *) entry_at(coord, (int) from);
+	entry_to = (char *) entry_at(coord, (int) (from + count));
+
+	/* move headers */
+	xmemmove(header_from, header_to, (unsigned) (address(coord, size) - header_to));
+
+	header_delta = header_to - header_from;
+
+	/* everything after the removed headers just moved by header_delta */
+	entry_from -= header_delta;
+	entry_to -= header_delta;
+	size -= header_delta;
+
+	/* copy entries */
+	xmemmove(entry_from, entry_to, (unsigned) (address(coord, size) - entry_to));
+
+	entry_delta = entry_to - entry_from;
+	size -= entry_delta;
+
+	/* update offsets */
+
+	/* units before the cut only lost the removed headers in front of them */
+	for (i = 0; i < (int) from; ++i)
+		adj_offset(coord, i, - header_delta);
+
+	/* units after the cut lost both the removed headers and entries */
+	for (i = from; i < units(coord) - (int) count; ++i)
+		adj_offset(coord, i, - header_delta - entry_delta);
+
+	cputod16((__u16) units(coord) - count, &formatted_at(coord)->num_of_entries);
+
+	if (from == 0) {
+		/* entries from head was removed - move remaining to right */
+		xmemmove((char *) item_body_by_coord(coord) +
+			 header_delta + entry_delta, item_body_by_coord(coord), (unsigned) size);
+		if (REISER4_DEBUG)
+			xmemset(item_body_by_coord(coord), 0, (unsigned) header_delta + entry_delta);
+	} else {
+		/* freed space is already at the end of item */
+		if (REISER4_DEBUG)
+			xmemset((char *) item_body_by_coord(coord) + size, 0, (unsigned) header_delta + entry_delta);
+	}
+
+	return header_delta + entry_delta;
+}
+
+/* ->kill_units() method for this item plugin: forwards to cut_units_cde()
+   with no cut data; @kdata is not used. */
+reiser4_internal int
+kill_units_cde(coord_t * coord /* coord of item */ ,
+	       pos_in_node_t from /* start unit pos */ ,
+	       pos_in_node_t to /* stop unit pos */ ,
+	       struct carry_kill_data *kdata UNUSED_ARG, reiser4_key *smallest_removed,
+	       reiser4_key *new_first)
+{
+	return cut_units_cde(coord, from, to, 0, smallest_removed, new_first);
+}
+
+/* ->s.dir.extract_key() method for this item plugin.
*/
+reiser4_internal int
+extract_key_cde(const coord_t * coord /* coord of item */ ,
+		reiser4_key * key /* resulting key */ )
+{
+	directory_entry_format *dent;
+
+	assert("nikita-1155", coord != NULL);
+	assert("nikita-1156", key != NULL);
+
+	dent = entry_at(coord, idx_of(coord));
+	return extract_key_from_id(&dent->id, key);
+}
+
+/* Replaces the object id stored in the directory entry at @coord with the
+   one built from @key and marks the node dirty. Returns 0 on success or
+   the error reported by build_obj_key_id(); on error the entry is left
+   untouched. */
+reiser4_internal int
+update_key_cde(const coord_t * coord, const reiser4_key * key, lock_handle * lh UNUSED_ARG)
+{
+	directory_entry_format *dent;
+	obj_key_id obj_id;
+	int result;
+
+	assert("nikita-2344", coord != NULL);
+	assert("nikita-2345", key != NULL);
+
+	dent = entry_at(coord, idx_of(coord));
+	result = build_obj_key_id(key, &obj_id);
+	if (result == 0) {
+		dent->id = obj_id;
+		znode_make_dirty(coord->node);
+	}
+	/* do not hide a failed key conversion from the caller */
+	return result;
+}
+
+/* ->s.dir.extract_name() method for this item plugin. */
+reiser4_internal char *
+extract_name_cde(const coord_t * coord /* coord of item */, char *buf)
+{
+	directory_entry_format *dent;
+
+	assert("nikita-1157", coord != NULL);
+
+	dent = entry_at(coord, idx_of(coord));
+	return extract_dent_name(coord, dent, buf);
+}
+
+/* number of bytes to charge for @data: its length, less the item header
+   when @pasting is zero */
+static int
+cde_bytes(int pasting, const reiser4_item_data * data)
+{
+	int result;
+
+	result = data->length;
+	if (!pasting)
+		result -= sizeof (cde_item_format);
+	return result;
+}
+
+/* ->s.dir.add_entry() method for this item plugin */
+reiser4_internal int
+add_entry_cde(struct inode *dir /* directory object */ ,
+	      coord_t * coord /* coord of item */ ,
+	      lock_handle * lh /* lock handle for insertion */ ,
+	      const struct dentry *name /* name to insert */ ,
+	      reiser4_dir_entry_desc * dir_entry /* parameters of new
+						  * directory entry */ )
+{
+	reiser4_item_data data;
+	cde_entry entry;
+	cde_entry_data edata;
+	int result;
+
+	assert("nikita-1656", coord->node == lh->node);
+	assert("nikita-1657", znode_is_write_locked(coord->node));
+
+	/* describe the single entry being added */
+	edata.num_of_entries = 1;
+	edata.entry = &entry;
+
+	entry.dir = dir;
+	entry.obj = dir_entry->obj;
+	entry.name = &name->d_name;
+
+	data.data = (char *) &edata;
+	data.user = 0;		/* &edata is not user space */
+	data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
+	data.arg = dir_entry;
+	assert("nikita-1302", data.iplug != NULL);
+
+	/* from here on @result tells whether the key is the "dot" key */
+	result = is_dot_key(&dir_entry->key);
+	data.length = estimate_cde(result ? coord : NULL, &data);
+
+	/* NOTE-NIKITA quota plugin? */
+	if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
+		return RETERR(-EDQUOT);
+
+	/* the "dot" entry is inserted as a new item, every other entry is
+	   pasted into the item at @coord */
+	if (result)
+		result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
+	else
+		result = resize_item(coord, &data, &dir_entry->key, lh, 0);
+	return result;
+}
+
+/* ->s.dir.rem_entry() */
+reiser4_internal int
+rem_entry_cde(struct inode *dir /* directory of item */ ,
+	      const struct qstr * name,
+	      coord_t * coord /* coord of item */ ,
+	      lock_handle * lh UNUSED_ARG	/* lock handle for
+						 * removal */ ,
+	      reiser4_dir_entry_desc * entry UNUSED_ARG	/* parameters of
+							 * directory entry
+							 * being removed */ )
+{
+	coord_t shadow;
+	int result;
+	int length;
+	ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
+
+	assert("nikita-2870", strlen(name->name) == name->len);
+	assert("nikita-2869", !strcmp(name->name, extract_name_cde(coord, buf)));
+
+	/* bytes that will be given back to quota once the entry is gone */
+	length = sizeof (directory_entry_format) + sizeof (cde_unit_header);
+	if (is_longname(name->name, name->len))
+		length += name->len + 1;
+
+	if (inode_get_bytes(dir) < length) {
+		warning("nikita-2628", "Dir is broke: %llu: %llu", get_inode_oid(dir), inode_get_bytes(dir));
+		return RETERR(-EIO);
+	}
+
+	/* cut_node() is supposed to take pointers to _different_
+	   coords, because it will modify them without respect to
+	   possible aliasing. To work around this, create temporary copy
+	   of @coord.
+	*/
+	coord_dup(&shadow, coord);
+	result = kill_node_content(coord, &shadow, NULL, NULL, NULL, 0, NULL, NULL);
+	if (result == 0) {
+		/* NOTE-NIKITA quota plugin? */
+		DQUOT_FREE_SPACE_NODIRTY(dir, length);
+	}
+	return result;
+}
+
+/* ->s.dir.max_name_len() method for this item plugin */
+reiser4_internal int
+max_name_len_cde(const struct inode *dir /* directory */ )
+{
+	return
+	    tree_by_inode(dir)->nplug->max_item_size() -
+	    sizeof (directory_entry_format) - sizeof (cde_item_format) - sizeof (cde_unit_header) - 2;
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/cde.h linux-2.6.4-ck1/fs/reiser4/plugin/item/cde.h
--- linux-2.6.4/fs/reiser4/plugin/item/cde.h	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/item/cde.h	2004-03-11 22:45:15.315506165 +1100
@@ -0,0 +1,78 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Compound directory item. See cde.c for description. */
+
+#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
+#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
+
+#include "../../forward.h"
+#include "../../kassign.h"
+#include "../../dformat.h"
+
+#include <linux/fs.h>		/* for struct inode */
+#include <linux/dcache.h>	/* for struct dentry, etc */
+
+typedef struct cde_unit_header {
+	de_id hash;
+	d16 offset;
+} cde_unit_header;
+
+typedef struct cde_item_format {
+	d16 num_of_entries;
+	cde_unit_header entry[0];
+} cde_item_format;
+
+typedef struct cde_entry {
+	const struct inode *dir;
+	const struct inode *obj;
+	const struct qstr *name;
+} cde_entry;
+
+typedef struct cde_entry_data {
+	int num_of_entries;
+	cde_entry *entry;
+} cde_entry_data;
+
+/* plugin->item.b.* */
+reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
+int can_contain_key_cde(const coord_t * coord, const reiser4_key * key, const reiser4_item_data *);
+int mergeable_cde(const coord_t * p1, const coord_t * p2);
+pos_in_node_t nr_units_cde(const coord_t * coord);
+reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
+int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
+void print_cde(const char *prefix, coord_t * coord);
+int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
+lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias, coord_t * coord);
+int paste_cde(coord_t * coord, reiser4_item_data * data, carry_plugin_info * info UNUSED_ARG);
+int can_shift_cde(unsigned free_space, coord_t * coord,
+		  znode * target, shift_direction pend, unsigned *size, unsigned want);
+void copy_units_cde(coord_t * target, coord_t * source,
+		    unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space);
+int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+		  struct carry_cut_data *, reiser4_key * smallest_removed, reiser4_key *new_first);
+int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+		   struct carry_kill_data *, reiser4_key * smallest_removed, reiser4_key *new_first);
+void print_cde(const char *prefix, coord_t * coord);
+int check_cde(const coord_t * coord, const char **error);
+
+/* plugin->u.item.s.dir.* */
+int extract_key_cde(const coord_t * coord, reiser4_key * key);
+int update_key_cde(const coord_t * coord, const reiser4_key * key, lock_handle * lh);
+char *extract_name_cde(const coord_t * coord, char *buf);
+int add_entry_cde(struct inode *dir, coord_t * coord,
+		  lock_handle * lh, const struct dentry *name, reiser4_dir_entry_desc * entry);
+int rem_entry_cde(struct inode *dir, const struct qstr * name, coord_t * coord, lock_handle * lh, reiser4_dir_entry_desc * entry);
+int max_name_len_cde(const struct inode *dir);
+
+/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
+#endif
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/ctail.c linux-2.6.4-ck1/fs/reiser4/plugin/item/ctail.c
--- linux-2.6.4/fs/reiser4/plugin/item/ctail.c	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/item/ctail.c	2004-03-11 22:45:15.317505854 +1100
@@ -0,0 +1,1139 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* ctails (aka "crypto tails") are items for cryptcompress objects */
+
+/* DESCRIPTION:
+
+Each cryptcompress object is stored on disk as a set of clusters sliced
+into ctails.
+
+Internal on-disk structure:
+
+        HEADER   (1)  Here stored disk cluster shift
+        BODY
+*/
+
+#include "../../forward.h"
+#include "../../debug.h"
+#include "../../dformat.h"
+#include "../../kassign.h"
+#include "../../key.h"
+#include "../../coord.h"
+#include "item.h"
+#include "../node/node.h"
+#include "../plugin.h"
+#include "../object.h"
+#include "../../znode.h"
+#include "../../carry.h"
+#include "../../tree.h"
+#include "../../inode.h"
+#include "../../super.h"
+#include "../../context.h"
+#include "../../page_cache.h"
+#include "../../flush.h"
+
+/* NOTE(review): the two system header names below were missing from this
+   copy of the patch; they are restored from the kernel types this file
+   uses (struct inode, struct pagevec) - confirm against the original */
+#include <linux/fs.h>
+#include <linux/pagevec.h>
+
+/* return body of ctail item at @coord */
+static ctail_item_format *
+ctail_formatted_at(const coord_t * coord)
+{
+	assert("edward-60", coord != NULL);
+	return item_body_by_coord(coord);
+}
+
+/* disk cluster shift stored in the header of the item at @coord */
+static __u8
+cluster_shift_by_coord(const coord_t * coord)
+{
+	return d8tocpu(&ctail_formatted_at(coord)->cluster_shift);
+}
+
+/* index of the page that the key offset of the item at @coord falls into */
+static unsigned long
+pg_by_coord(const coord_t * coord)
+{
+	reiser4_key key;
+
+	return get_key_offset(item_key_by_coord(coord, &key)) >> PAGE_CACHE_SHIFT;
+}
+
+/* index of the cluster that the key offset of the item at @coord falls into */
+static unsigned long
+clust_by_coord(const coord_t * coord)
+{
+	return pg_by_coord(coord) >> cluster_shift_by_coord(coord);
+}
+
+/* true if the offset of @key is a multiple of the cluster size of the item
+   at @coord, i.e. @key is the key of the first item of a cluster. The whole
+   expansion is parenthesized so that it can be used inside any expression. */
+#define cluster_key(key, coord) (!(get_key_offset(key) & ~(~0ULL << cluster_shift_by_coord(coord) << PAGE_CACHE_SHIFT)))
+
+/* address of the first unit (the first byte after the item header) */
+static char *
+first_unit(coord_t * coord)
+{
+	/* FIXME: warning: pointer of type `void *' used in arithmetic */
+	return (char *)item_body_by_coord(coord) + sizeof (ctail_item_format);
+}
+
+/* plugin->u.item.b.max_key_inside :
+   tail_max_key_inside */
+
+/* plugin->u.item.b.can_contain_key
+   @key can be added to the item at @coord only if it belongs to the same
+   object, directly follows the last unit of the item and does not start
+   a new cluster */
+reiser4_internal int
+can_contain_key_ctail(const coord_t *coord, const reiser4_key *key, const reiser4_item_data *data)
+{
+	reiser4_key item_key;
+
+	if (item_plugin_by_coord(coord) != data->iplug)
+		return 0;
+
+	item_key_by_coord(coord, &item_key);
+	if (get_key_locality(key) != get_key_locality(&item_key) ||
+	    get_key_objectid(key) != get_key_objectid(&item_key))
+		return 0;
+	if (get_key_offset(&item_key) + nr_units_ctail(coord) != get_key_offset(key))
+		return 0;
+	if (cluster_key(key, coord))
+		return 0;
+	return 1;
+}
+
+/* plugin->u.item.b.mergeable
+   c-tails of different clusters are not mergeable */
+reiser4_internal int
+mergeable_ctail(const coord_t * p1, const coord_t * p2)
+{
+	reiser4_key key1, key2;
+
+	assert("edward-61", item_type_by_coord(p1) == UNIX_FILE_METADATA_ITEM_TYPE);
+	assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
+
+	if (item_id_by_coord(p2) != CTAIL_ID) {
+		/* second item is of another type */
+		return 0;
+	}
+
+	item_key_by_coord(p1, &key1);
+	item_key_by_coord(p2, &key2);
+	if (get_key_locality(&key1) != get_key_locality(&key2) ||
+	    get_key_objectid(&key1) != get_key_objectid(&key2) ||
+	    get_key_type(&key1) != get_key_type(&key2)) {
+		/* items of different objects */
+		return 0;
+	}
+	if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
+		/* not adjacent items */
+		return 0;
+	/* @p2 starts a new cluster */
+	if (cluster_key(&key2, p2))
+		return 0;
+	return 1;
+}
+
+/* plugin->u.item.b.nr_units
+   units are the bytes of the item that follow the cluster shift header */
+reiser4_internal pos_in_node_t
+nr_units_ctail(const coord_t * coord)
+{
+	return (item_length_by_coord(coord) - sizeof(ctail_formatted_at(coord)->cluster_shift));
+}
+
+/* plugin->u.item.b.estimate:
+   estimate how much space
is needed to insert/paste @data->length bytes
+   into ctail at @coord */
+reiser4_internal int
+estimate_ctail(const coord_t * coord /* coord of item */,
+	       const reiser4_item_data * data /* parameters for new item */)
+{
+	if (coord == NULL)
+		/* insert */
+		return (sizeof(ctail_item_format) + data->length);
+	else
+		/* paste */
+		return data->length;
+}
+
+#if REISER4_DEBUG_OUTPUT
+/* disk cluster size in bytes, derived from the shift stored in the item */
+static unsigned
+cluster_size_by_coord(const coord_t * coord)
+{
+	return (PAGE_CACHE_SIZE << cluster_shift_by_coord(coord));
+}
+
+
+/* ->print() method for this item plugin.
+   Prints the disk cluster size of the item, or a diagnostic if the item is
+   too short to hold a ctail header. */
+reiser4_internal void
+print_ctail(const char *prefix /* prefix to print */ ,
+	    coord_t * coord /* coord of item to print */ )
+{
+	assert("edward-63", prefix != NULL);
+	assert("edward-64", coord != NULL);
+
+	if (item_length_by_coord(coord) < (int) sizeof (ctail_item_format))
+		/* sizeof yields size_t: cast it so that it matches %i */
+		printk("%s: wrong size: %i < %i\n", prefix, item_length_by_coord(coord), (int) sizeof (ctail_item_format));
+	else
+		/* cluster_size_by_coord() returns unsigned, hence %u */
+		printk("%s: disk cluster size: %u\n", prefix, cluster_size_by_coord(coord));
+}
+#endif
+
+/* ->init() method for this item plugin.
+   Stores the cluster shift in the header of the item at @to. When @data is
+   given, the shift is read from the byte @data->arg points to and the header
+   size is subtracted from @data->length; otherwise the shift is copied from
+   the item at @from. Always returns 0. */
+reiser4_internal int
+init_ctail(coord_t * to /* coord of item */,
+	   coord_t * from /* old_item */,
+	   reiser4_item_data * data /* structure used for insertion */)
+{
+	int cluster_shift; /* cpu value to convert */
+
+	if (data) {
+		assert("edward-463", data->length > sizeof(ctail_item_format));
+
+		cluster_shift = (int)(*((char *)(data->arg)));
+		data->length -= sizeof(ctail_item_format);
+	}
+	else {
+		assert("edward-464", from != NULL);
+
+		cluster_shift = (int)(cluster_shift_by_coord(from));
+	}
+	cputod8(cluster_shift, &ctail_formatted_at(to)->cluster_shift);
+
+	return 0;
+}
+
+/* plugin->u.item.b.lookup:
+   NULL.
(we are looking only for exact keys from item headers) */
+
+
+/* plugin->u.item.b.check */
+
+/* plugin->u.item.b.paste
+   Copies @data->length bytes of kernel-space data into the (already
+   expanded) item at @coord. Data goes either to the start of a freshly
+   created item or right behind its last old unit; any other position is
+   reported through impossible(). Always returns 0. */
+reiser4_internal int
+paste_ctail(coord_t * coord, reiser4_item_data * data, carry_plugin_info * info UNUSED_ARG)
+{
+	unsigned units_before;	/* units the item had before it was expanded */
+	int at_head;
+	int at_tail;
+
+	assert("edward-268", data->data != NULL);
+	/* the source must live in kernel space */
+	assert("edward-66", data->user == 0);
+
+	units_before = item_length_by_coord(coord) - sizeof(ctail_item_format) - data->length;
+
+	/* only the two ends of an item are valid paste positions */
+	at_head = (coord->between == AT_UNIT && coord->unit_pos == 0);
+	at_tail = (coord->between == AFTER_UNIT && coord->unit_pos == units_before - 1);
+
+	if (at_head) {
+		/* a new item was just created: it holds nothing but @data */
+		assert("edward-450", item_length_by_coord(coord) == data->length + sizeof(ctail_item_format));
+		assert("edward-451", units_before == 0);
+	} else if (at_tail) {
+		/* append: step behind the last old unit */
+		coord->unit_pos++;
+	} else {
+		impossible("edward-453", "bad paste position");
+	}
+
+	xmemcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
+
+	return 0;
+}
+
+/* plugin->u.item.b.fast_paste */
+
+/* plugin->u.item.b.can_shift
+   number of units is returned via return value, number of bytes via @size. For
For + ctail items they coincide */ +reiser4_internal int +can_shift_ctail(unsigned free_space, coord_t * source, + znode * target, shift_direction direction UNUSED_ARG, unsigned *size, unsigned want) +{ + /* make sure that that we do not want to shift more than we have */ + assert("edward-68", want > 0 && want <= nr_units_ctail(source)); + + *size = min(want, free_space); + + if (!target) { + /* new item will be created */ + if (*size <= sizeof(ctail_item_format)) { + *size = 0; + return 0; + } + return *size - sizeof(ctail_item_format); + } + return *size; +} + +/* plugin->u.item.b.copy_units */ +reiser4_internal void +copy_units_ctail(coord_t * target, coord_t * source, + unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space UNUSED_ARG) +{ + /* make sure that item @target is expanded already */ + assert("edward-69", (unsigned) item_length_by_coord(target) >= count); + assert("edward-70", free_space >= count); + + if (item_length_by_coord(target) == count) { + /* new item has been created */ + assert("edward-465", count > sizeof(ctail_item_format)); + + count--; + } + if (where_is_free_space == SHIFT_LEFT) { + /* append item @target with @count first bytes of @source: + this restriction came from ordinary tails */ + assert("edward-71", from == 0); + + xmemcpy(first_unit(target) + nr_units_ctail(target) - count, first_unit(source), count); + } else { + /* target item is moved to right already */ + reiser4_key key; + + assert("edward-72", nr_units_ctail(source) == from + count); + + xmemcpy(first_unit(target), first_unit(source) + from, count); + + /* new units are inserted before first unit in an item, + therefore, we have to update item key */ + item_key_by_coord(source, &key); + set_key_offset(&key, get_key_offset(&key) + from); + + node_plugin_by_node(target->node)->update_item_key(target, &key, 0 /*info */); + } +} + +/* plugin->u.item.b.create_hook */ +/* plugin->u.item.b.kill_hook */ +reiser4_internal int 
+kill_hook_ctail(const coord_t *coord, pos_in_node_t from, pos_in_node_t count, carry_kill_data *kdata)
+{
+	struct inode *inode;
+
+	assert("edward-291", znode_is_write_locked(coord->node));
+
+	inode = kdata->inode;
+	if (inode) {
+		reiser4_key key;
+		item_key_by_coord(coord, &key);
+
+		/* only when the kill starts at the first unit of the first
+		   item of a cluster: pass the page index of the item's key
+		   offset to truncate_pages_cryptcompress() */
+		if (from == 0 && cluster_key(&key, coord))
+			truncate_pages_cryptcompress(inode->i_mapping, off_to_pg(get_key_offset(&key)));
+	}
+	return 0;
+}
+
+/* for shift_hook_ctail(),
+   return true if the first disk cluster item has dirty child
+*/
+static int
+ctail_squeezable (const coord_t *coord)
+{
+	int result;
+	reiser4_key key;
+	jnode * child = NULL;
+
+	assert("edward-477", coord != NULL);
+	assert("edward-478", item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
+
+	/* look up the jnode of the page the item's key offset falls into */
+	item_key_by_coord(coord, &key);
+	child = jlookup(current_tree, get_key_objectid(&key), pg_by_coord(coord));
+
+	if (!child)
+		return 0;
+	/* sample the dirty state under the jnode lock */
+	LOCK_JNODE(child);
+	if (jnode_is_dirty(child))
+		result = 1;
+	else
+		result = 0;
+	UNLOCK_JNODE(child);
+	/* drop the reference on the jnode returned by jlookup() */
+	jput(child);
+	return result;
+}
+
+/* plugin->u.item.b.shift_hook
+   marks the node @item was shifted to as squeezable if the node it came
+   from was squeezable and the item has a dirty child. Always returns 0. */
+reiser4_internal int
+shift_hook_ctail(const coord_t * item /* coord of item */ ,
+		 unsigned from UNUSED_ARG /* start unit */ ,
+		 unsigned count UNUSED_ARG /* stop unit */ ,
+		 znode * old_node /* old parent */ )
+{
+	assert("edward-479", item != NULL);
+	assert("edward-480", item->node != old_node);
+
+	/* nothing to propagate, or the new node is marked already */
+	if (!znode_squeezable(old_node) || znode_squeezable(item->node))
+		return 0;
+	if (ctail_squeezable(item))
+		znode_set_squeezable(item->node);
+	return 0;
+}
+
+/* common part of ->cut_units() and ->kill_units(): removes units
+   @from..@to (inclusive) of the item at @coord and returns the number of
+   bytes freed. @cut selects plain cut (non-zero) or kill (zero); for kill
+   @p is handed to kill_hook_ctail() as the carry_kill_data. Optionally
+   stores the smallest removed key in @smallest_removed and the new first
+   key of the item in @new_first. */
+static int
+cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, int cut,
+			void *p, reiser4_key * smallest_removed, reiser4_key *new_first)
+{
+	pos_in_node_t count; /* number of units to cut */
+	char *item;
+
+	count = to - from + 1;
+	item = item_body_by_coord(coord);
+
+	/* When we cut from the end of item - we have nothing to do */
+	assert("edward-73", count < nr_units_ctail(coord));
+	assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
+
+	if (smallest_removed) {
+		/* store smallest key removed */
+		item_key_by_coord(coord, smallest_removed);
+		set_key_offset(smallest_removed, get_key_offset(smallest_removed) + from);
+	}
+
+	if (new_first) {
+		assert("vs-1531", from == 0);
+
+		/* the first surviving unit is the one right after the cut */
+		item_key_by_coord(coord, new_first);
+		set_key_offset(new_first, get_key_offset(new_first) + from + count);
+	}
+
+	if (!cut)
+		kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
+
+	if (from == 0) {
+		if (count != nr_units_ctail(coord)) {
+			/* part of item is removed, so move free space at the beginning
+			   of the item and update item key */
+			reiser4_key key;
+			/* re-create the item header right before the first
+			   surviving unit */
+			xmemcpy(item + to + 1, item, sizeof(ctail_item_format));
+			item_key_by_coord(coord, &key);
+			set_key_offset(&key, get_key_offset(&key) + count);
+			node_plugin_by_node(coord->node)->update_item_key(coord, &key, 0 /*info */ );
+		}
+		else {
+			impossible("vs-1532", "cut_units should not be called to cut evrything");
+			/* whole item is cut, so more then amount of space occupied
+			   by units got freed */
+			count += sizeof(ctail_item_format);
+		}
+		if (REISER4_DEBUG)
+			xmemset(item, 0, count);
+	}
+	else if (REISER4_DEBUG)
+		xmemset(item + sizeof(ctail_item_format) + from, 0, count);
+	return count;
+}
+
+/* plugin->u.item.b.cut_units */
+reiser4_internal int
+cut_units_ctail(coord_t *item, pos_in_node_t from, pos_in_node_t to,
+		carry_cut_data *cdata, reiser4_key *smallest_removed, reiser4_key *new_first)
+{
+	return cut_or_kill_ctail_units(item, from, to, 1, NULL, smallest_removed, new_first);
+}
+
+/* plugin->u.item.b.kill_units */
+reiser4_internal int
+kill_units_ctail(coord_t *item, pos_in_node_t from, pos_in_node_t to,
+		 struct carry_kill_data *kdata, reiser4_key *smallest_removed, reiser4_key *new_first)
+{
+	return cut_or_kill_ctail_units(item, from, to, 0, kdata, smallest_removed, new_first);
+}
+
+/* plugin->u.item.s.file.read
+   copies all units of the ctail at @hint's coord into the kernel buffer of
+   flow @f and advances the flow by that many bytes. Always returns 0. */
+reiser4_internal int
+read_ctail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
+{
+	uf_coord_t *uf_coord;
+	coord_t *coord;
+
+	uf_coord = &hint->coord;
+	coord = &uf_coord->base_coord;
+	assert("edward-127", f->user == 0);
+	assert("edward-128", f->data);
+	assert("edward-129", coord && coord->node);
+	assert("edward-130", coord_is_existing_unit(coord));
+//	assert("edward-131", znode_is_rlocked(coord->node));
+	assert("edward-132", znode_is_loaded(coord->node));
+
+	/* start read only from the beginning of ctail */
+	assert("edward-133", coord->unit_pos == 0);
+	/* read only whole ctails */
+	assert("edward-135", nr_units_ctail(coord) <= f->length);
+
+	assert("edward-136", schedulable());
+
+	memcpy(f->data, (char *)first_unit(coord), (size_t)nr_units_ctail(coord));
+
+	mark_page_accessed(znode_page(coord->node));
+	move_flow_forward(f, nr_units_ctail(coord));
+	/* NOTE(review): unit_pos is asserted to be 0 above, so this decrement
+	   steps below the first unit - confirm this is intended */
+	coord->unit_pos --; /* ?? */
+	coord->between = AFTER_UNIT;
+	return 0;
+}
+
+/* this reads one cluster form disk,
+   attaches buffer with decrypted and decompressed data.
+   Returns 0 on success; -ENOMEM if the buffer cannot be allocated; on a
+   find_cluster() or inflate_cluster() failure the cluster data is put and
+   that error is returned. */
+reiser4_internal int
+ctail_read_cluster (reiser4_cluster_t * clust, struct inode * inode, int write)
+{
+	int result;
+
+	assert("edward-139", clust->buf == NULL);
+	assert("edward-140", clust->stat != FAKE_CLUSTER);
+	assert("edward-145", inode_get_flag(inode, REISER4_CLUSTER_KNOWN));
+
+	/* allocate temporary buffer of disk cluster size */
+
+	clust->bsize = inode_scaled_offset(inode, fsize_to_count(clust, inode) +
+					   max_crypto_overhead(inode_crypto_plugin(inode), inode_crypto_stat(inode)));
+	/* never allocate more than one scaled cluster */
+	if (clust->bsize > inode_scaled_cluster_size(inode))
+		clust->bsize = inode_scaled_cluster_size(inode);
+
+	clust->buf = reiser4_kmalloc(clust->bsize, GFP_KERNEL);
+	if (!clust->buf)
+		return -ENOMEM;
+	result = find_cluster(clust, inode, 1 /* read */, write);
+	if (result)
+		goto out;
+	result = inflate_cluster(clust, inode);
+	if(result)
+		goto out;
+	return 0;
+ out:
+	put_cluster_data(clust, inode);
+	return result;
+}
+
+/* read one locked page */
+reiser4_internal int
+do_readpage_ctail(reiser4_cluster_t * clust, struct page *page)
+{
+	int ret;
+	unsigned cloff;
+	struct inode * inode;
+	char * data;
+	int release = 0;
+	size_t pgcnt;
+
+	assert("edward-212", PageLocked(page));
+
+	inode = page->mapping->host;
+
+	if (!cluster_is_uptodate(clust)) {
+		clust->index = pg_to_clust(page->index, inode);
+		/* the page is not kept locked while the cluster is read */
+		unlock_page(page);
+		ret = ctail_read_cluster(clust, inode, 0 /* do not write */);
+		lock_page(page);
+		if (ret)
+			return ret;
+		/* cluster was uptodated here, release it before exit */
+		release = 1;
+	}
+	if(PageUptodate(page))
+		/* Two possible reasons for it:
+		   1. page was filled by the caller,
+		   2. races with another read/write
+		*/
+		goto exit;
+	if (clust->stat == FAKE_CLUSTER) {
+		/* fill page by zeroes */
+		char *kaddr = kmap_atomic(page, KM_USER0);
+
+		assert("edward-119", clust->buf == NULL);
+
+		memset(kaddr, 0, PAGE_CACHE_SIZE);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+		SetPageUptodate(page);
+
+		ON_TRACE(TRACE_CTAIL, " - hole, OK\n");
+		return 0;
+	}
+	/* fill page by plain text from cluster handle */
+
+	assert("edward-120", clust->len <= inode_cluster_size(inode));
+
+	/* start page offset in the cluster */
+	cloff = pg_to_off_to_cloff(page->index, inode);
+	/* bytes in page */
+	pgcnt = off_to_pgcount(inode->i_size, page->index);
+	assert("edward-620", off_to_pgcount(inode->i_size, page->index) > 0);
+
+	data = kmap(page);
+	memcpy(data, clust->buf + cloff, pgcnt);
+	/* zero the rest of the page */
+	memset(data + pgcnt, 0, (size_t)PAGE_CACHE_SIZE - pgcnt);
+	kunmap(page);
+	SetPageUptodate(page);
+ exit:
+	if (release)
+		put_cluster_data(clust, inode);
+	return 0;
+}
+
+/* plugin->u.item.s.file.readpage
+   @vp is the cluster handle the page is to be filled from */
+reiser4_internal int readpage_ctail(void * vp, struct page * page)
+{
+	int result;
+	reiser4_cluster_t * clust = vp;
+
+	assert("edward-114", clust != NULL);
+	assert("edward-115", PageLocked(page));
+	assert("edward-116", !PageUptodate(page));
+	assert("edward-117", !jprivate(page) && !PagePrivate(page));
+	assert("edward-118", page->mapping && page->mapping->host);
+
+	result = do_readpage_ctail(clust, page);
+
+	assert("edward-213", PageLocked(page));
+	return result;
+}
+
+/* plugin->s.file.writepage */
+
+/* plugin->u.item.s.file.readpages
+   populate an address space with some pages, and start reads against them.
+   FIXME_EDWARD: this function should return errors
+*/
+reiser4_internal void
+readpages_ctail(void *coord UNUSED_ARG, struct address_space *mapping, struct list_head *pages)
+{
+	reiser4_cluster_t clust;
+	struct page *page;
+	struct pagevec lru_pvec;
+	int ret = 0;
+	struct inode * inode;
+
+	if (!list_empty(pages) && pages->next != pages->prev)
+		/* more then one pages in the list - make sure its order is right */
+		assert("edward-214", list_to_page(pages)->index < list_to_next_page(pages)->index);
+
+	pagevec_init(&lru_pvec, 0);
+	reiser4_cluster_init(&clust);
+	inode = mapping->host;
+
+	while (!list_empty(pages)) {
+		page = list_to_page(pages);
+		list_del(&page->list);
+		if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
+			/* could not add the page to the page cache: skip it */
+			page_cache_release(page);
+			continue;
+		}
+		/* update cluster handle if it is necessary */
+		if (!cluster_is_uptodate(&clust) || !page_of_cluster(page, &clust, inode)) {
+			put_cluster_data(&clust, inode);
+			clust.index = pg_to_clust(page->index, inode);
+			if (fsize_to_count(&clust, inode) <= PAGE_CACHE_SIZE) {
+				clust.pages = &page;
+				clust.nr_pages = 1;
+			}
+			unlock_page(page);
+			ret = ctail_read_cluster(&clust, inode, 0 /* do not write */);
+			if (ret)
+				goto exit;
+			lock_page(page);
+		}
+		ret = do_readpage_ctail(&clust, page);
+		if (!pagevec_add(&lru_pvec, page))
+			__pagevec_lru_add(&lru_pvec);
+		if (ret) {
+			impossible("edward-215", "do_readpage_ctail returned crap");
+			unlock_page(page);
+		exit:
+			/* give up: drop every page that is still on the list */
+			while (!list_empty(pages)) {
+				struct page *victim;
+
+				victim = list_to_page(pages);
+				list_del(&victim->list);
+				page_cache_release(victim);
+			}
+			break;
+		}
+		unlock_page(page);
+	}
+	put_cluster_data(&clust, inode);
+	pagevec_lru_add(&lru_pvec);
+	return;
+}
+
+/*
+   plugin->u.item.s.file.append_key
+*/
+reiser4_internal reiser4_key *
+append_key_ctail(const coord_t *coord, reiser4_key *key)
+{
+	return NULL;
+}
+
+/* key of the first item of the next cluster */
+reiser4_internal reiser4_key *
+append_cluster_key_ctail(const coord_t *coord, reiser4_key *key)
+{
+	item_key_by_coord(coord, key);
+	set_key_offset(key, ((__u64)(clust_by_coord(coord)) + 1) << cluster_shift_by_coord(coord) << PAGE_CACHE_SHIFT);
+	return key;
+}
+
+/* Inserts flow @f into the tree at @coord as ctail items by posting a
+   COP_INSERT_FLOW carry operation; @lh is tracked across the operation.
+   The cluster shift of @inode is handed to the item plugin through
+   data.arg. Returns the result of carry(), or an error if the carry
+   operation could not be posted. The carry pool is released on every
+   path. */
+static int
+insert_crc_flow(coord_t * coord, lock_handle * lh, flow_t * f, struct inode * inode)
+{
+	int result;
+	carry_pool pool;
+	carry_level lowest_level;
+	carry_op *op;
+	reiser4_item_data data;
+	__u8 cluster_shift = inode_cluster_shift(inode);
+
+	init_carry_pool(&pool);
+	init_carry_level(&lowest_level, &pool);
+
+	assert("edward-466", coord->between == AFTER_ITEM || coord->between == AFTER_UNIT ||
+	       coord->between == BEFORE_ITEM);
+
+	if (coord->between == AFTER_UNIT) {
+		coord->unit_pos = 0;
+		coord->between = AFTER_ITEM;
+	}
+	op = post_carry(&lowest_level, COP_INSERT_FLOW, coord->node, 0 /* operate directly on coord -> node */ );
+	if (IS_ERR(op) || (op == NULL)) {
+		/* the pool initialized above must not be leaked on failure */
+		done_carry_pool(&pool);
+		return RETERR(op ? PTR_ERR(op) : -EIO);
+	}
+	data.user = 0;
+	data.iplug = item_plugin_by_id(CTAIL_ID);
+	data.arg = &cluster_shift;
+
+	data.length = 0;
+	data.data = 0;
+
+	op->u.insert_flow.insert_point = coord;
+	op->u.insert_flow.flow = f;
+	op->u.insert_flow.data = &data;
+	op->u.insert_flow.new_nodes = 0;
+
+	lowest_level.track_type = CARRY_TRACK_CHANGE;
+	lowest_level.tracked = lh;
+
+	ON_STATS(lowest_level.level_no = znode_get_level(coord->node));
+	result = carry(&lowest_level, 0);
+	done_carry_pool(&pool);
+
+	return result;
+}
+
+/* Same as insert_crc_flow(), but works on copies of @coord and @lh, so
+   that the caller's coord and lock handle are left as they were. */
+static int
+insert_crc_flow_in_place(coord_t * coord, lock_handle * lh, flow_t * f, struct inode * inode)
+{
+	int ret;
+	coord_t point;
+	lock_handle lock;
+
+	assert("edward-484", coord->between == AT_UNIT ||
+	       coord->between == AFTER_UNIT || coord->between == AFTER_ITEM);
+
+	coord_dup (&point, coord);
+
+	if (coord->between == AT_UNIT) {
+		/* insert after the preceding item instead */
+		coord_prev_item(&point);
+
+		assert("edward-485", item_plugin_by_coord(&point) == item_plugin_by_id(CTAIL_ID));
+
+		point.between = AFTER_ITEM;
+	}
+
+	init_lh (&lock);
+	copy_lh(&lock, lh);
+
+	ret = insert_crc_flow(&point, &lock, f, inode);
+	done_lh(&lock);
+	return ret;
+}
+
+/* overwrite tail citem or its part */
+static int
+overwrite_ctail(coord_t * coord, flow_t * f)
+{
+	unsigned count;
+
+	assert("edward-269", f->user == 0);
+	assert("edward-270", f->data != NULL);
+	assert("edward-271", f->length > 0);
+	assert("edward-272", coord_is_existing_unit(coord));
+	assert("edward-273", coord->unit_pos == 0);
+	assert("edward-274", znode_is_write_locked(coord->node));
+	assert("edward-275", schedulable());
+	assert("edward-467", item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
+
+	count = nr_units_ctail(coord);
+
+	/* the flow may be shorter than the item */
+	if (count > f->length)
+		count = f->length;
+	xmemcpy(first_unit(coord), f->data, count);
+	move_flow_forward(f, count);
+	/* leave @coord right behind the overwritten units */
+	coord->unit_pos += count;
+	return 0;
+}
+
+/* cut ctail item or its tail subset */
+static int
+cut_ctail(coord_t * coord)
+{
+	coord_t stop;
+
+	assert("edward-435", coord->between == AT_UNIT &&
+	       coord->item_pos < coord_num_items(coord) &&
+	       coord->unit_pos <= coord_num_units(coord));
+
+	if(coord->unit_pos == coord_num_units(coord)) {
+		/* nothing to cut */
+		return 0;
+	}
+	/* cut from @coord up to the last unit of the item */
+	coord_dup(&stop, coord);
+	stop.unit_pos = coord_last_unit_pos(coord);
+
+	return cut_node_content(coord, &stop, NULL, NULL, NULL);
+}
+
+/* plugin->u.item.s.file.write ?
+   Performs one squeeze step on the ctail at @pos according to @mode:
+   inserts the flow as new item(s), or overwrites the item with the flow
+   and cuts what is left of it, or only cuts. Returns 0 or an error code. */
+reiser4_internal int
+write_ctail(flush_pos_t * pos, crc_write_mode_t mode)
+{
+	int result;
+	ctail_squeeze_info_t * info;
+
+	assert("edward-468", pos != NULL);
+	assert("edward-469", pos->idata != NULL);
+
+	info = &pos->idata->u.ctail_info;
+
+	switch (mode) {
+	case CRC_FIRST_ITEM:
+	case CRC_APPEND_ITEM:
+		result = insert_crc_flow_in_place(&pos->coord, &pos->lock, &info->flow, info->inode);
+		break;
+	case CRC_OVERWRITE_ITEM:
+		overwrite_ctail(&pos->coord, &info->flow);
+		/* fall through: cut whatever was not overwritten */
+	case CRC_CUT_ITEM:
+		result = cut_ctail(&pos->coord);
+		break;
+	default:
+		result = RETERR(-EIO);
+		impossible("edward-244", "wrong ctail write mode");
+	}
+	return result;
+}
+
+/* item plugin for a jnode of a cluster page: always the ctail plugin */
+reiser4_internal item_plugin *
+item_plugin_by_jnode(jnode * node)
+{
+	assert("edward-302", jnode_is_cluster_page(node));
+	return (item_plugin_by_id(CTAIL_ID));
+}
+
+/* jnode of the first page of the cluster that follows @clust, or whatever
+   jlookup() returns when there is none; @node is not used */
+static jnode *
+next_jnode_cluster(jnode * node, struct inode *inode, reiser4_cluster_t * clust)
+{
+	return jlookup(tree_by_inode(inode), get_inode_oid(inode), clust_to_pg(clust->index + 1, inode));
+}
+
+/* plugin->u.item.f.scan */
+/* Check if the cluster node we started from is not presented by any items
+   in the tree. If so, create the link by inserting prosessed cluster into
+   the tree. Don't care about scan counter since leftward scanning will be
+   continued from rightmost dirty node.
+*/
+/* Returns 0 on success (scan state becomes LINKED unless it already was),
+   or the error returned by flush_cluster_pages(), deflate_cluster() or
+   insert_crc_flow(). The jnode reference taken here is dropped on every
+   exit path. */
+reiser4_internal int scan_ctail(flush_scan * scan)
+{
+	int result;
+	struct page * page;
+	struct inode * inode;
+	reiser4_cluster_t clust;
+	flow_t f;
+	jnode * node = scan->node;
+	file_plugin * fplug;
+
+	reiser4_cluster_init(&clust);
+
+	assert("edward-227", scan->node != NULL);
+	assert("edward-228", jnode_is_cluster_page(scan->node));
+	assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
+
+	if (get_flush_scan_nstat(scan) == LINKED) {
+		/* nothing to do; no reference has been taken yet */
+		return 0;
+	}
+
+	/* hold the node while its cluster is processed; every exit from
+	   here on must jput() the current node */
+	jref(node);
+
+	do {
+		LOCK_JNODE(node);
+		if (!(jnode_is_dirty(node) &&
+		      (node->atom == ZJNODE(scan->parent_lock.node)->atom) &&
+		      JF_ISSET(node, JNODE_NEW))) {
+			/* don't touch! */
+			UNLOCK_JNODE(node);
+			jput(node);
+			break;
+		}
+		UNLOCK_JNODE(node);
+
+		reiser4_cluster_init(&clust);
+
+		page = jnode_page(node);
+
+		assert("edward-229", page != NULL);
+		assert("edward-230", page->mapping != NULL);
+		assert("edward-231", page->mapping->host != NULL);
+
+		inode = page->mapping->host;
+		fplug = inode_file_plugin(inode);
+
+		assert("edward-244", fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
+		assert("edward-232", inode_get_flag(inode, REISER4_CLUSTER_KNOWN));
+		assert("edward-233", scan->direction == LEFT_SIDE);
+
+		clust.index = pg_to_clust(page->index, inode);
+
+		/* remove jnode cluster from dirty list */
+		result = flush_cluster_pages(&clust, inode);
+		if (result) {
+			jput(node);
+			return result;
+		}
+		result = deflate_cluster(&clust, inode);
+		if (result)
+			goto error;
+
+		assert("edward-633", clust.len != 0);
+
+		fplug->flow_by_inode(inode, clust.buf, 0, clust.len, clust_to_off(clust.index, inode), WRITE, &f);
+		/* insert processed data */
+		result = insert_crc_flow(&scan->parent_coord, /* insert point */
+					 &scan->parent_lock, &f, inode);
+		if (result)
+			goto error;
+		assert("edward-234", f.length == 0);
+		JF_CLR(node, JNODE_NEW);
+		release_cluster_buf(&clust, inode);
+		jput(node);
+	}
+	while ((node = next_jnode_cluster(node, inode, &clust)));
+
+	/* now the child is linked to its parent,
+	   set appropriate status */
+	set_flush_scan_nstat(scan, LINKED);
+	return 0;
+ error:
+	release_cluster_buf(&clust, inode);
+	jput(node);
+	return result;
+}
+
+/* If true, this function attaches children */
+/* Looks up the leftmost child of the ctail item at pos->coord and stores it
+   in pos->child. Returns non-zero when that child is dirty and belongs to
+   the same atom as the node at pos->coord; otherwise returns 0 with
+   pos->child put and reset to NULL. */
+static int
+should_attach_squeeze_idata(flush_pos_t * pos)
+{
+	int result;
+	assert("edward-431", pos != NULL);
+	assert("edward-432", pos->child == NULL);
+	assert("edward-619", znode_is_write_locked(pos->coord.node));
+	assert("edward-470", item_plugin_by_coord(&pos->coord) == item_plugin_by_id(CTAIL_ID));
+
+	/* check for leftmost child */
+	utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
+
+	if (!pos->child)
+		return 0;
+	LOCK_JNODE(pos->child);
+	result = jnode_is_dirty(pos->child) &&
+		pos->child->atom == ZJNODE(pos->coord.node)->atom;
+	UNLOCK_JNODE(pos->child);
+	if (!result) {
+		/* existing child isn't to attach, clear up this one */
+		jput(pos->child);
+		pos->child = NULL;
+	}
+	return result;
+}
+
+/* Zero pos->idata and record in it the item plugin found at pos->coord. */
+reiser4_internal void
+init_squeeze_idata_ctail(flush_pos_t * pos)
+{
+	assert("edward-471", pos != NULL);
+	assert("edward-472", pos->idata != NULL);
+	assert("edward-473", item_plugin_by_coord(&pos->coord) == item_plugin_by_id(CTAIL_ID));
+
+	xmemset(pos->idata, 0, sizeof(*pos->idata));
+	pos->idata->iplug = item_plugin_by_coord(&pos->coord);
+}
+
+/* attach valid squeeze item data to the flush position */
+/* Allocates pos->idata and its cluster descriptor, flushes and deflates the
+   cluster that pos->child belongs to, and builds info->flow over the
+   cluster buffer.
+
+   pos->child is put on every path, success or failure; the caller is
+   expected to forget the pointer afterwards. On failure pos->idata is
+   freed and left NULL.
+
+   Returns 0, -ENOMEM, or the error from flush_cluster_pages() /
+   deflate_cluster(). */
+static int
+attach_squeeze_idata(flush_pos_t * pos, struct inode * inode)
+{
+	int ret = 0;
+	file_plugin * fplug;
+	ctail_squeeze_info_t * info;
+
+	assert("edward-248", pos != NULL);
+	assert("edward-249", pos->child != NULL);
+	assert("edward-250", pos->idata == NULL);
+	assert("edward-251", inode != NULL);
+
+	fplug = inode_file_plugin(inode);
+
+	assert("edward-252", fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
+
+	pos->idata = reiser4_kmalloc(sizeof(flush_squeeze_item_data_t), GFP_KERNEL);
+	if (pos->idata == NULL) {
+		ret = -ENOMEM;
+		goto exit3;
+	}
+	init_squeeze_idata_ctail(pos);
+	info = &pos->idata->u.ctail_info;
+	info->clust = reiser4_kmalloc(sizeof(reiser4_cluster_t), GFP_KERNEL);
+	if (info->clust == NULL) {
+		ret = -ENOMEM;
+		goto exit2;
+	}
+	reiser4_cluster_init(info->clust);
+	info->inode = inode;
+	info->clust->index = pg_to_clust(jnode_page(pos->child)->index, inode);
+
+	ret = flush_cluster_pages(info->clust, inode);
+	if (ret)
+		goto exit1;
+
+	ret = deflate_cluster(info->clust, inode);
+	if (ret)
+		goto exit1;
+	/* attach flow by cluster buffer */
+	fplug->flow_by_inode(info->inode, info->clust->buf, 0/* kernel space */, info->clust->len, clust_to_off(info->clust->index, inode), WRITE_OP, &info->flow);
+	jput(pos->child);
+	return 0;
+
+ exit1:
+	reiser4_kfree(info->clust);
+ exit2:
+	reiser4_kfree(pos->idata);
+	/* invalidate squeeze item info */
+	pos->idata = NULL;
+ exit3:
+	/* the child reference is dropped on failure as well: the caller
+	   clears pos->child regardless of the result */
+	jput(pos->child);
+	return ret;
+}
+
+/* Release the cluster buffer and the cluster descriptor referenced by
+   *idata, free *idata itself and set *idata to NULL. */
+static void
+detach_squeeze_idata(flush_squeeze_item_data_t ** idata)
+{
+
+	ctail_squeeze_info_t * info;
+
+	assert("edward-253", idata != NULL);
+	assert("edward-253", *idata != NULL);
+	info = &(*idata)->u.ctail_info;
+
+	assert("edward-254", info->clust != NULL);
+	assert("edward-255", info->inode != NULL);
+	assert("edward-256", info->clust->buf != NULL);
+
+	release_cluster_buf(info->clust, info->inode);
+	reiser4_kfree(info->clust);
+	/* @info points at a member embedded in *idata; what was allocated
+	   (see attach_squeeze_idata) is *idata, so that is what is freed */
+	reiser4_kfree(*idata);
+
+	*idata = NULL;
+}
+
+/* plugin->u.item.f.utmost_child */
+
+/* This function sets leftmost child for a first cluster item,
+   if the child exists, and NULL in other cases.
+   NOTE-EDWARD: Do not call this for RIGHT_SIDE
+
+   @coord: coord of a ctail item.
+   @side:  must be LEFT_SIDE.
+   @child: receives the jlookup() result for the page given by
+           pg_by_coord(), or NULL when cluster_key() rejects the item key.
+   Always returns 0. */
+
+reiser4_internal int
+utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
+{
+	reiser4_key key;
+
+	/* validate the arguments before @coord is first dereferenced */
+	assert("edward-257", coord != NULL);
+	assert("edward-258", child != NULL);
+	assert("edward-259", side == LEFT_SIDE);
+	assert("edward-260", item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
+
+	item_key_by_coord(coord, &key);
+
+	if (!cluster_key(&key, coord))
+		*child = NULL;
+	else
+		*child = jlookup(current_tree, get_key_objectid(item_key_by_coord(coord, &key)), pg_by_coord(coord));
+	return 0;
+}
+
+/* plugin->u.item.f.squeeze */
+/* write ctail in guessed mode */
+/* With no squeeze data attached yet: attach it from the leftmost child when
+   that child qualifies and write in CRC_OVERWRITE_ITEM mode; otherwise
+   return 0 (nothing to squeeze).
+   With squeeze data already attached, the mode is chosen from the remaining
+   flow length and the mergeable flag:
+     flow left, mergeable      -> CRC_OVERWRITE_ITEM
+     flow left, not mergeable  -> CRC_APPEND_ITEM
+     flow empty, mergeable     -> CRC_CUT_ITEM
+     flow empty, not mergeable -> detach and return -E_REPEAT
+   Returns 0, -E_REPEAT (also after a successful append), or the error from
+   attach_squeeze_idata() / write_ctail(). */
+reiser4_internal int
+squeeze_ctail(flush_pos_t * pos)
+{
+	int result;
+	crc_write_mode_t mode = CRC_OVERWRITE_ITEM;
+
+	assert("edward-261", pos != NULL);
+
+	if (pos->idata == NULL) {
+		if (should_attach_squeeze_idata(pos)) {
+			/* attach squeeze item info */
+			struct inode * inode;
+
+			assert("edward-264", pos->child != NULL);
+			assert("edward-265", jnode_page(pos->child) != NULL);
+			assert("edward-266", jnode_page(pos->child)->mapping != NULL);
+
+			inode = jnode_page(pos->child)->mapping->host;
+
+			assert("edward-267", inode != NULL);
+
+			/* attach item squeeze info by child and put the last one */
+			result = attach_squeeze_idata(pos, inode);
+			pos->child = NULL;
+			if (result != 0)
+				return result;
+		}
+		else
+			/* unsqueezable */
+			return 0;
+	}
+	else {
+		/* there is attached squeeze info, it can be still valid! */
+		ctail_squeeze_info_t * info = &pos->idata->u.ctail_info;
+
+		if (info->flow.length) {
+			/* append or overwrite */
+			if (pos->idata->mergeable) {
+				mode = CRC_OVERWRITE_ITEM;
+				pos->idata->mergeable = 0;
+			}
+			else
+				mode = CRC_APPEND_ITEM;
+		}
+		else {
+			/* cut or invalidate */
+			if (pos->idata->mergeable) {
+				mode = CRC_CUT_ITEM;
+				pos->idata->mergeable = 0;
+			}
+			else {
+				detach_squeeze_idata(&pos->idata);
+				return RETERR(-E_REPEAT);
+			}
+		}
+	}
+	assert("edward-433", pos->idata != NULL);
+	result = write_ctail(pos, mode);
+	if (result) {
+		detach_squeeze_idata(&pos->idata);
+		return result;
+	}
+
+	if (mode == CRC_APPEND_ITEM) {
+		/* detach squeeze info */
+		assert("edward-434", pos->idata->u.ctail_info.flow.length == 0);
+		detach_squeeze_idata(&pos->idata);
+		return RETERR(-E_REPEAT);
+	}
+	return 0;
+}
+
+
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/ctail.h linux-2.6.4-ck1/fs/reiser4/plugin/item/ctail.h
--- linux-2.6.4/fs/reiser4/plugin/item/ctail.h	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/item/ctail.h	2004-03-11 22:45:15.318505699 +1100
@@ -0,0 +1,110 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#if !defined( __FS_REISER4_CTAIL_H__ )
+#define __FS_REISER4_CTAIL_H__
+
+/* cryptcompress object item. See ctail.c for description.
*/ +#include "../cryptcompress.h" + +#include + +typedef struct ctail_item_format { + /* cluster shift */ + d8 cluster_shift; + /* ctail body */ + d8 body[0]; +} __attribute__((packed)) ctail_item_format; + +/* for flush squeeze */ +typedef struct ctail_squeeze_info { + struct inode * inode; + reiser4_cluster_t * clust; + flow_t flow; +} ctail_squeeze_info_t; + +#define CTAIL_MIN_BODY_SIZE MIN_CRYPTO_BLOCKSIZE + +#define list_to_page(head) (list_entry((head)->prev, struct page, list)) +#define list_to_next_page(head) (list_entry((head)->prev->prev, struct page, list)) + +struct cut_list; + +/* plugin->item.b.* */ +int can_contain_key_ctail(const coord_t *, const reiser4_key *, const reiser4_item_data *); +int mergeable_ctail(const coord_t * p1, const coord_t * p2); +pos_in_node_t nr_units_ctail(const coord_t * coord); +int estimate_ctail(const coord_t * coord, const reiser4_item_data * data); +void print_ctail(const char *prefix, coord_t * coord); +lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *); + +int paste_ctail(coord_t * coord, reiser4_item_data * data, carry_plugin_info * info UNUSED_ARG); +int init_ctail(coord_t *, coord_t *, reiser4_item_data *); +int can_shift_ctail(unsigned free_space, coord_t * coord, + znode * target, shift_direction pend, unsigned *size, unsigned want); +void copy_units_ctail(coord_t * target, coord_t * source, + unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space); +int cut_units_ctail(coord_t *coord, pos_in_node_t from, pos_in_node_t to, + carry_cut_data *, reiser4_key * smallest_removed, reiser4_key *new_first); +int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to, + carry_kill_data *, reiser4_key * smallest_removed, reiser4_key *new_first); + +/*int check_check(const coord_t * coord, const char **error);*/ + +/* plugin->u.item.s.* */ +int read_ctail(struct file *, flow_t *, hint_t *); +int readpage_ctail(void *, struct page *); +void 
readpages_ctail(void *, struct address_space *, struct list_head *); +reiser4_key *append_key_ctail(const coord_t *, reiser4_key *); +int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t, carry_kill_data *); +int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *); + +/* plugin->u.item.f */ +int utmost_child_ctail(const coord_t *, sideof, jnode **); +int scan_ctail(flush_scan *); +int squeeze_ctail(flush_pos_t *); +item_plugin * item_plugin_by_jnode(jnode *); + +__u8 inode_cluster_shift (struct inode *); +size_t inode_cluster_size (struct inode *); +crypto_stat_t * inode_crypto_stat(struct inode *); +unsigned long pg_to_clust(unsigned long, struct inode *); +loff_t clust_to_off(unsigned long, struct inode *); +unsigned long off_to_pg(loff_t); +unsigned long clust_to_pg(unsigned long, struct inode *); +unsigned off_to_pgoff(loff_t); +unsigned off_to_pgcount(loff_t, unsigned long); +unsigned pg_to_off_to_cloff(unsigned long, struct inode *); +unsigned fsize_to_count(reiser4_cluster_t *, struct inode *); + +void reiser4_cluster_init(reiser4_cluster_t *); +void put_cluster_data(reiser4_cluster_t *, struct inode *); +int cluster_is_uptodate (reiser4_cluster_t *); +void release_cluster_buf(reiser4_cluster_t *, struct inode *); +size_t inode_scaled_cluster_size(struct inode *); +loff_t inode_scaled_offset (struct inode *, const loff_t); +__u8 inode_cluster_shift (struct inode * inode); +int inode_cluster_pages (struct inode * inode); +inline unsigned long pg_to_clust_to_pg(unsigned long idx, struct inode *); +unsigned max_crypto_overhead(crypto_plugin *, crypto_stat_t *); + +int inflate_cluster(reiser4_cluster_t *, struct inode *); +int find_cluster_item(hint_t * hint, const reiser4_key *key, + znode_lock_mode lock_mode, ra_info_t *ra_info, + lookup_bias bias); +int page_of_cluster(struct page *, reiser4_cluster_t *, struct inode *); +int find_cluster(reiser4_cluster_t *, struct inode *, int read, int write); +int 
flush_cluster_pages(reiser4_cluster_t *, struct inode *); +int deflate_cluster(reiser4_cluster_t *, struct inode *); +void truncate_pages_cryptcompress(struct address_space * mapping, unsigned long index); + +#endif /* __FS_REISER4_CTAIL_H__ */ + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/extent.c linux-2.6.4-ck1/fs/reiser4/plugin/item/extent.c --- linux-2.6.4/fs/reiser4/plugin/item/extent.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/extent.c 2004-03-11 22:45:15.319505543 +1100 @@ -0,0 +1,180 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "item.h" +#include "../../key.h" +#include "../../super.h" +#include "../../carry.h" +#include "../../inode.h" +#include "../../page_cache.h" +#include "../../emergency_flush.h" +#include "../../prof.h" +#include "../../flush.h" +#include "../object.h" + + +/* prepare structure reiser4_item_data. 
It is used to put one extent unit into tree */
+/* Audited by: green(2002.06.13) */
+/* Fills @data so that it describes @nr_extents extent units stored at
+   @ext_unit (kernel-space memory) for the extent item plugin.
+   Returns @data. */
+reiser4_internal reiser4_item_data *
+init_new_extent(reiser4_item_data *data, void *ext_unit, int nr_extents)
+{
+	if (REISER4_ZERO_NEW_NODE)
+		memset(data, 0, sizeof(reiser4_item_data));
+
+	data->data = ext_unit;
+	/* data->data is kernel space */
+	data->user = 0;
+	data->length = sizeof(reiser4_extent) * nr_extents;
+	data->arg = 0;
+	data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
+	return data;
+}
+
+/* how many bytes are addressed by @nr first extents of the extent item */
+reiser4_internal reiser4_block_nr
+extent_size(const coord_t *coord, pos_in_node_t nr)
+{
+	pos_in_node_t i;
+	reiser4_block_nr blocks;
+	reiser4_extent *ext;
+
+	ext = item_body_by_coord(coord);
+	assert("vs-263", nr <= nr_units_extent(coord));
+
+	/* sum the widths (in blocks) of the first @nr units */
+	blocks = 0;
+	for (i = 0; i < nr; i++, ext++) {
+		blocks += extent_get_width(ext);
+	}
+
+	return blocks * current_blocksize;
+}
+
+/* Classify extent @ext by its start block: 0 is a hole, 1 is an unallocated
+   extent, any other value is an allocated extent.
+
+   The start is compared at its full width. Narrowing it to int first would
+   let an allocated extent whose start has all-zero low bits (or low bits
+   equal to 1) be mistaken for a hole (or an unallocated extent) whenever
+   reiser4_block_nr is wider than int. */
+reiser4_internal extent_state
+state_of_extent(reiser4_extent *ext)
+{
+	reiser4_block_nr start;
+
+	start = extent_get_start(ext);
+	if (start == 0)
+		return HOLE_EXTENT;
+	if (start == 1)
+		return UNALLOCATED_EXTENT;
+	return ALLOCATED_EXTENT;
+}
+
+/* Non-zero if the extent unit @item is set to is an unallocated extent. */
+reiser4_internal int
+extent_is_unallocated(const coord_t *item)
+{
+	assert("jmacd-5133", item_is_extent(item));
+
+	return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
+}
+
+/* Non-zero if the extent unit @item is set to is an allocated extent. */
+reiser4_internal int
+extent_is_allocated(const coord_t *item)
+{
+	assert("jmacd-5133", item_is_extent(item));
+
+	return state_of_extent(extent_by_coord(item)) == ALLOCATED_EXTENT;
+}
+
+/* set extent's start and width */
+reiser4_internal void
+set_extent(reiser4_extent *ext, reiser4_block_nr start, reiser4_block_nr width)
+{
+	extent_set_start(ext, start);
+	extent_set_width(ext, width);
+}
+
+/* used in split_allocate_extent, allocated2unallocated, extent_handle_relocate_in_place, plug_hole to insert 1 or 2
+   extent units after the one @un_extent is set to.
@un_extent itself is changed to @new_ext */ +reiser4_internal int +replace_extent(coord_t *un_extent, lock_handle *lh, + reiser4_key *key, reiser4_item_data *data, const reiser4_extent *new_ext, unsigned flags) +{ + int result; + coord_t coord_after; + lock_handle lh_after; + tap_t watch; + znode *orig_znode; + ON_DEBUG(reiser4_extent orig_ext); /* this is for debugging */ + + assert("vs-990", coord_is_existing_unit(un_extent)); + assert("vs-1375", znode_is_write_locked(un_extent->node)); + assert("vs-1426", extent_get_width(new_ext) != 0); + assert("vs-1427", extent_get_width((reiser4_extent *)data->data) != 0); + + coord_dup(&coord_after, un_extent); + init_lh(&lh_after); + copy_lh(&lh_after, lh); + tap_init(&watch, &coord_after, &lh_after, ZNODE_WRITE_LOCK); + tap_monitor(&watch); + + ON_DEBUG(orig_ext = *extent_by_coord(un_extent)); + orig_znode = un_extent->node; + + /* make sure that key is set properly */ + if (REISER4_DEBUG) { + reiser4_key tmp; + + unit_key_by_coord(un_extent, &tmp); + set_key_offset(&tmp, get_key_offset(&tmp) + extent_get_width(new_ext) * current_blocksize); + assert("vs-1080", keyeq(&tmp, key)); + } + + DISABLE_NODE_CHECK; + + /* set insert point after unit to be replaced */ + un_extent->between = AFTER_UNIT; + result = insert_into_item(un_extent, (flags == COPI_DONT_SHIFT_LEFT) ? 
0 : lh, key, data, flags); + if (!result) { + reiser4_extent *ext; + + if (coord_after.node != orig_znode) { + coord_clear_iplug(&coord_after); + result = zload(coord_after.node); + } + + if (likely(!result)) { + ext = extent_by_coord(&coord_after); + + assert("vs-987", znode_is_loaded(coord_after.node)); + assert("vs-988", !memcmp(ext, &orig_ext, sizeof (*ext))); + + *ext = *new_ext; + znode_make_dirty(coord_after.node); + + if (coord_after.node != orig_znode) + zrelse(coord_after.node); + if (flags == COPI_DONT_SHIFT_LEFT) { + /* set coord back to initial extent unit */ + *un_extent = coord_after; + assert("vs-1375", znode_is_write_locked(un_extent->node)); + } + } + } + tap_done(&watch); + + ENABLE_NODE_CHECK; + return result; +} + +reiser4_internal lock_handle * +znode_lh(znode *node) +{ + assert("vs-1371", znode_is_write_locked(node)); + assert("vs-1372", znode_is_wlocked_once(node)); + return owners_list_front(&node->lock.owners); +} + + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.4-ck1/fs/reiser4/plugin/item/extent_file_ops.c --- linux-2.6.4/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/extent_file_ops.c 2004-03-11 22:45:15.322505077 +1100 @@ -0,0 +1,1314 @@ +/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "item.h" +#include "../../inode.h" +#include "../../page_cache.h" +#include "../../flush.h" /* just for jnode_tostring */ +#include "../object.h" + +#include + + +#if REISER4_DEBUG +static int +coord_extension_is_ok(const uf_coord_t *uf_coord) +{ + const coord_t *coord; + const extent_coord_extension_t *ext_coord; + + coord = &uf_coord->base_coord; + ext_coord = &uf_coord->extension.extent; + + return WITH_DATA(coord->node, (uf_coord->valid == 1 && + 
coord_is_iplug_set(coord) && + item_is_extent(coord) && + ext_coord->nr_units == nr_units_extent(coord) && + ext_coord->ext == extent_by_coord(coord) && + ext_coord->width == extent_get_width(ext_coord->ext) && + coord->unit_pos < ext_coord->nr_units && + ext_coord->pos_in_unit < ext_coord->width && + extent_get_start(ext_coord->ext) == extent_get_start(&ext_coord->extent) && + extent_get_width(ext_coord->ext) == extent_get_width(&ext_coord->extent))); +} + +#endif + +/* @coord is set either to the end of last extent item of a file + (coord->node is a node on the twig level) or to a place where first + item of file has to be inserted to (coord->node is leaf + node). Calculate size of hole to be inserted. If that hole is too + big - only part of it is inserted */ +static int +add_hole(coord_t *coord, lock_handle *lh, const reiser4_key *key /* key of position in a file for write */) +{ + int result; + znode *loaded; + reiser4_extent *ext, new_ext; + reiser4_block_nr hole_width; + reiser4_item_data item; + reiser4_key hole_key; + + coord_clear_iplug(coord); + result = zload(coord->node); + if (result) + return result; + loaded = coord->node; + + if (znode_get_level(coord->node) == LEAF_LEVEL) { + /* there are no items of this file yet. 
First item will be + hole extent inserted here */ + + /* @coord must be set for inserting of new item */ + assert("vs-711", coord_is_between_items(coord)); + + hole_key = *key; + set_key_offset(&hole_key, 0ull); + + hole_width = ((get_key_offset(key) + current_blocksize - 1) >> + current_blocksize_bits); + assert("vs-710", hole_width > 0); + + /* compose body of hole extent */ + set_extent(&new_ext, HOLE_EXTENT_START, hole_width); + + result = insert_extent_by_coord(coord, init_new_extent(&item, &new_ext, 1), &hole_key, lh); + zrelse(loaded); + return result; + } + + /* last item of file may have to be appended with hole */ + assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL); + assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID); + + /* make sure we are at proper item */ + assert("vs-918", keylt(key, max_key_inside_extent(coord, &hole_key))); + + /* key of first byte which is not addressed by this extent */ + append_key_extent(coord, &hole_key); + + if (keyle(key, &hole_key)) { + /* there is already extent unit which contains position + specified by @key */ + zrelse(loaded); + return 0; + } + + /* extent item has to be appended with hole. Calculate length of that + hole */ + hole_width = ((get_key_offset(key) - get_key_offset(&hole_key) + + current_blocksize - 1) >> current_blocksize_bits); + assert("vs-954", hole_width > 0); + + /* set coord after last unit */ + coord_init_after_item_end(coord); + + /* get last extent in the item */ + ext = extent_by_coord(coord); + if (state_of_extent(ext) == HOLE_EXTENT) { + /* last extent of a file is hole extent. Widen that extent by + @hole_width blocks. 
Note that we do not worry about + overflowing - extent width is 64 bits */ + set_extent(ext, HOLE_EXTENT_START, extent_get_width(ext) + hole_width); + znode_make_dirty(coord->node); + zrelse(loaded); + return 0; + } + + /* append item with hole extent unit */ + assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT || state_of_extent(ext) == UNALLOCATED_EXTENT)); + + /* compose body of hole extent */ + set_extent(&new_ext, HOLE_EXTENT_START, hole_width); + + result = insert_into_item(coord, lh, &hole_key, init_new_extent(&item, &new_ext, 1), 0 /*flags */ ); + zrelse(loaded); + return result; +} + +/* insert extent item (containing one unallocated extent of width 1) to place + set by @coord */ +static int +insert_first_block(uf_coord_t *uf_coord, const reiser4_key *key, reiser4_block_nr *block) +{ + int result; + reiser4_extent ext; + reiser4_item_data unit; + + /* make sure that we really write to first block */ + assert("vs-240", get_key_offset(key) == 0); + + /* extent insertion starts at leaf level */ + assert("vs-719", znode_get_level(uf_coord->base_coord.node) == LEAF_LEVEL); + + set_extent(&ext, UNALLOCATED_EXTENT_START, 1); + result = insert_extent_by_coord(&uf_coord->base_coord, init_new_extent(&unit, &ext, 1), key, uf_coord->lh); + if (result) { + /* FIXME-VITALY: this is grabbed at file_write time. */ + /* grabbed2free ((__u64)1); */ + return result; + } + + *block = fake_blocknr_unformatted(); + + /* invalidate coordinate, research must be performed to continue because write will continue on twig level */ + uf_coord->valid = 0; + return 0; +} + +/* @coord is set to the end of extent item. 
Append it with pointer to one block - either by expanding last unallocated
+   extent or by appending a new one of width 1.
+
+   On success *@block is set to a fake block number for an unformatted node
+   and 0 is returned. When a new unit had to be inserted, uf_coord->valid is
+   cleared and an insert_into_item() failure is returned as is. An extent of
+   unknown state yields -EIO. */
+static int
+append_one_block(uf_coord_t *uf_coord, reiser4_key *key, reiser4_block_nr *block)
+{
+	int result;
+	reiser4_extent new_ext;
+	reiser4_item_data unit;
+	coord_t *coord;
+	extent_coord_extension_t *ext_coord;
+
+	coord = &uf_coord->base_coord;
+	ext_coord = &uf_coord->extension.extent;
+
+	/* check correctness of position in the item */
+	assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
+	assert("vs-1311", coord->between == AFTER_UNIT);
+	assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
+	assert("vs-883",
+	       ( {
+		reiser4_key next;
+		keyeq(key, append_key_extent(coord, &next));
+		}));
+
+	switch (state_of_extent(ext_coord->ext)) {
+	case UNALLOCATED_EXTENT:
+		/* widen the last unallocated extent by one block */
+		set_extent(ext_coord->ext, UNALLOCATED_EXTENT_START, extent_get_width(ext_coord->ext) + 1);
+		znode_make_dirty(coord->node);
+
+		/* update coord extension */
+		ext_coord->width ++;
+		break;
+
+	case HOLE_EXTENT:
+	case ALLOCATED_EXTENT:
+		/* append one unallocated extent of width 1 */
+		set_extent(&new_ext, UNALLOCATED_EXTENT_START, 1);
+		result = insert_into_item(coord, uf_coord->lh, key, init_new_extent(&unit, &new_ext, 1), 0 /* flags */ );
+		/* FIXME: for now */
+		uf_coord->valid = 0;
+		if (result)
+			return result;
+		break;
+	default:
+		assert("", 0);
+		/* do not hand out a block for an extent of unknown state
+		   when assertions are compiled out; same error as
+		   overwrite_one_block() reports for this case */
+		return RETERR(-EIO);
+	}
+
+	*block = fake_blocknr_unformatted();
+	return 0;
+}
+
+/* @coord is set to hole unit inside of extent item, replace hole unit with an
+   unit for unallocated extent of the width 1, and perhaps a hole unit before
+   the unallocated unit and perhaps a hole unit after the unallocated unit.
*/ +static int +plug_hole(uf_coord_t *uf_coord, reiser4_key *key) +{ + reiser4_extent *ext, new_exts[2], /* extents which will be added after original + * hole one */ + replace; /* extent original hole extent will be replaced + * with */ + reiser4_block_nr width, pos_in_unit; + reiser4_item_data item; + int count; + coord_t *coord; + extent_coord_extension_t *ext_coord; + + coord = &uf_coord->base_coord; + ext_coord = &uf_coord->extension.extent; + + ext = ext_coord->ext; + width = ext_coord->width; + pos_in_unit = ext_coord->pos_in_unit; + + if (width == 1) { + set_extent(ext, UNALLOCATED_EXTENT_START, 1); + znode_make_dirty(coord->node); + return 0; + } else if (pos_in_unit == 0) { + if (coord->unit_pos) { + if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) { + extent_set_width(ext - 1, extent_get_width(ext - 1) + 1); + extent_set_width(ext, width - 1); + znode_make_dirty(coord->node); + + /* update coord extension */ + coord->unit_pos --; + ext_coord->width = extent_get_width(ext - 1); + ext_coord->pos_in_unit = ext_coord->width - 1; + ext_coord->ext --; + ON_DEBUG(ext_coord->extent = *ext_coord->ext); + return 0; + } + } + /* extent for replace */ + set_extent(&replace, UNALLOCATED_EXTENT_START, 1); + /* extent to be inserted */ + set_extent(&new_exts[0], HOLE_EXTENT_START, width - 1); + count = 1; + } else if (pos_in_unit == width - 1) { + if (coord->unit_pos < nr_units_extent(coord) - 1) { + if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) { + extent_set_width(ext + 1, extent_get_width(ext + 1) + 1); + extent_set_width(ext, width - 1); + znode_make_dirty(coord->node); + + /* update coord extension */ + coord->unit_pos ++; + ext_coord->width = extent_get_width(ext + 1); + ext_coord->pos_in_unit = 0; + ext_coord->ext ++; + ON_DEBUG(ext_coord->extent = *ext_coord->ext); + return 0; + } + } + /* extent for replace */ + set_extent(&replace, HOLE_EXTENT_START, width - 1); + /* extent to be inserted */ + set_extent(&new_exts[0], UNALLOCATED_EXTENT_START, 1); + 
count = 1; + } else { + /* extent for replace */ + set_extent(&replace, HOLE_EXTENT_START, pos_in_unit); + /* extents to be inserted */ + set_extent(&new_exts[0], UNALLOCATED_EXTENT_START, 1); + set_extent(&new_exts[1], HOLE_EXTENT_START, width - pos_in_unit - 1); + count = 2; + } + + /* insert_into_item will insert new units after the one @coord is set + to. So, update key correspondingly */ + unit_key_by_coord(coord, key); /* FIXME-VS: how does it work without this? */ + set_key_offset(key, (get_key_offset(key) + extent_get_width(&replace) * coord->node->zjnode.tree->super->s_blocksize)); + + uf_coord->valid = 0; + return replace_extent(coord, uf_coord->lh, key, init_new_extent(&item, new_exts, count), &replace, 0 /* flags */); +} + +/* make unallocated node pointer in the position @uf_coord is set to */ +static int +overwrite_one_block(uf_coord_t *uf_coord, reiser4_key *key, reiser4_block_nr *block, int *created) +{ + int result; + extent_coord_extension_t *ext_coord; + + assert("vs-1312", uf_coord->base_coord.between == AT_UNIT); + + result = 0; + *created = 0; + ext_coord = &uf_coord->extension.extent; + switch (state_of_extent(ext_coord->ext)) { + case ALLOCATED_EXTENT: + *block = extent_get_start(ext_coord->ext) + ext_coord->pos_in_unit; + break; + + case HOLE_EXTENT: + result = plug_hole(uf_coord, key); + if (!result) { + *block = fake_blocknr_unformatted(); + *created = 1; + } + break; + + case UNALLOCATED_EXTENT: + break; + + default: + impossible("vs-238", "extent of unknown type found"); + result = RETERR(-EIO); + break; + } + + return result; +} + +#if REISER4_DEBUG + +/* after make extent uf_coord's lock handle must be set to node containing unit which was inserted/found */ +static void +check_make_extent_result(int result, reiser4_key *key, lock_handle *lh, reiser4_block_nr block) +{ + coord_t coord; + + if (result != 0) { + return; + } + + assert("vs-960", znode_is_write_locked(lh->node)); + zload(lh->node); + result = 
lh->node->nplug->lookup(lh->node, key, FIND_EXACT, &coord); + assert("vs-1502", result == NS_FOUND); + zrelse(lh->node); +} + +#endif + +static int +make_extent(reiser4_key *key, uf_coord_t *uf_coord, write_mode_t mode, reiser4_block_nr *block, int *created) +{ + int result; + + assert("vs-960", znode_is_write_locked(uf_coord->base_coord.node)); + assert("vs-1334", znode_is_loaded(uf_coord->base_coord.node)); + + DISABLE_NODE_CHECK; + + *block = 0; + switch (mode) { + case FIRST_ITEM: + /* create first item of the file */ + result = insert_first_block(uf_coord, key, block); + *created = 1; + break; + + case APPEND_ITEM: + item_plugin_by_coord(&uf_coord->base_coord); + assert("vs-1316", coord_extension_is_ok(uf_coord)); + result = append_one_block(uf_coord, key, block); + *created = 1; + break; + + case OVERWRITE_ITEM: + item_plugin_by_coord(&uf_coord->base_coord); + assert("vs-1316", coord_extension_is_ok(uf_coord)); + result = overwrite_one_block(uf_coord, key, block, created); + break; + + default: + assert("vs-1346", 0); + result = RETERR(-E_REPEAT); + break; + } + + ENABLE_NODE_CHECK; + ON_DEBUG(check_make_extent_result(result, key, uf_coord->lh, *block)); + return result; +} + +/* drop longterm znode lock before calling balance_dirty_pages. balance_dirty_pages may cause transaction to close, + therefore we have to update stat data if necessary */ +static int +extent_balance_dirty_pages(struct address_space *mapping, const flow_t *f, + hint_t *hint) +{ + return item_balance_dirty_pages(mapping, f, hint, 0, 0/* do not set hint */); +} + +/* estimate and reserve space which may be required for writing one page of file */ +static int +reserve_extent_write_iteration(struct inode *inode, reiser4_tree *tree) +{ + int result; + + grab_space_enable(); + /* one unformatted node and one insertion into tree and one stat data update may be involved */ + result = reiser4_grab_space(1 + /* Hans removed reservation for balancing here. 
*/ + /* if extent items will be ever used by plugins other than unix file plugin - estimate update should instead be taken by + inode_file_plugin(inode)->estimate.update(inode) + */ + estimate_update_common(inode), + 0/* flags */); + return result; +} + +static void +write_move_coord(coord_t *coord, uf_coord_t *uf_coord, write_mode_t mode, int full_page) +{ + extent_coord_extension_t *ext_coord; + + assert("vs-1339", ergo(mode == OVERWRITE_ITEM, coord->between == AT_UNIT)); + assert("vs-1341", ergo(mode == FIRST_ITEM, uf_coord->valid == 0)); + + if (uf_coord->valid == 0) + return; + + ext_coord = &uf_coord->extension.extent; + + if (mode == APPEND_ITEM) { + assert("vs-1340", coord->between == AFTER_UNIT); + assert("vs-1342", coord->unit_pos == ext_coord->nr_units - 1); + assert("vs-1343", ext_coord->pos_in_unit == ext_coord->width - 2); + assert("vs-1344", state_of_extent(ext_coord->ext) == UNALLOCATED_EXTENT); + ON_DEBUG(ext_coord->extent = *ext_coord->ext); + ext_coord->pos_in_unit ++; + if (!full_page) + coord->between = AT_UNIT; + return; + } + + assert("vs-1345", coord->between == AT_UNIT); + + if (!full_page) + return; + if (ext_coord->pos_in_unit == ext_coord->width - 1) { + /* last position in the unit */ + if (coord->unit_pos == ext_coord->nr_units - 1) { + /* last unit in the item */ + uf_coord->valid = 0; + } else { + /* move to the next unit */ + coord->unit_pos ++; + ext_coord->ext ++; + ON_DEBUG(ext_coord->extent = *ext_coord->ext); + ext_coord->width = extent_get_width(ext_coord->ext); + ext_coord->pos_in_unit = 0; + } + } else + ext_coord->pos_in_unit ++; +} + +static void +set_hint_unlock_node(hint_t *hint, flow_t *f, znode_lock_mode mode) +{ + if (hint->coord.valid) { + set_hint(hint, &f->key, mode); + } else { + unset_hint(hint); + } + longterm_unlock_znode(hint->coord.lh); +} + +static int +write_is_partial(struct inode *inode, loff_t file_off, unsigned page_off, unsigned count) +{ + if (count == inode->i_sb->s_blocksize) + return 0; + if 
(page_off == 0 && file_off + count >= inode->i_size) + return 0; + return 1; +} + +/* zero the parts of @page not covered by the write: the first @from bytes and everything from offset @from+@count up to PAGE_CACHE_SIZE */ +static void +zero_around(struct page *page, int from, int count) +{ + char *data; + + data = kmap_atomic(page, KM_USER0); + memset(data, 0, from); + memset(data + from + count, 0, PAGE_CACHE_SIZE - from - count); + flush_dcache_page(page); + kunmap_atomic(data, KM_USER0); +} + +/* write flow's data into file page by page: for every page make_extent() is called, then jnode and page are obtained, user data are copied into the page, jnode is captured and made dirty and the writer is throttled. The loop ends on error, when the flow is exhausted or when coord of @hint is no longer valid */ +static int +extent_write_flow(struct inode *inode, flow_t *flow, hint_t *hint, + int grabbed, /* 0 if space for operation is not reserved yet, 1 - otherwise */ + write_mode_t mode) +{ + int result; + loff_t file_off; + unsigned long page_nr; + unsigned long page_off, count; + struct page *page; + jnode *j; + uf_coord_t *uf_coord; + coord_t *coord; + oid_t oid; + reiser4_tree *tree; + reiser4_key page_key; + + assert("nikita-3139", !inode_get_flag(inode, REISER4_NO_SD)); + assert("vs-885", current_blocksize == PAGE_CACHE_SIZE); + assert("vs-700", flow->user == 1); + assert("vs-1352", flow->length > 0); + + + tree = tree_by_inode(inode); + oid = get_inode_oid(inode); + uf_coord = &hint->coord; + coord = &uf_coord->base_coord; + + /* position in a file to start write from */ + file_off = get_key_offset(&flow->key); + /* index of page containing that offset */ + page_nr = (unsigned long)(file_off >> PAGE_CACHE_SHIFT); + /* offset within the page */ + page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1)); + + /* key of first byte of page */ + page_key = flow->key; + set_key_offset(&page_key, (loff_t)page_nr << PAGE_CACHE_SHIFT); + do { + reiser4_block_nr blocknr; + int created; + + if (!grabbed) { + result = reserve_extent_write_iteration(inode, tree); + if (result) + break; + } + /* number of bytes to be written to page */ + count = PAGE_CACHE_SIZE - page_off; + if (count > flow->length) + count = flow->length; + + write_page_trace(inode->i_mapping, page_nr); + + result = make_extent(&page_key, uf_coord, 
mode, &blocknr, &created); + if (result) { + goto exit1; + } + /* look for jnode and create it if it does not exist yet */ + j = find_get_jnode(tree, inode->i_mapping, oid, page_nr); + if (IS_ERR(j)) { + result = PTR_ERR(j); + goto exit1; + } + LOCK_JNODE(j); + /* set block number of jnode if it does not have one yet */ + if (*jnode_get_block(j) == 0) { + /* jnode is not initialized */ + jnode_set_block(j, &blocknr); + } else { + assert("vs-1508", !blocknr_is_fake(&blocknr)); + assert("vs-1507", ergo(blocknr, *jnode_get_block(j) == blocknr)); + } + if (created) { + JF_SET(j, JNODE_CREATED); + /* new block is added to file. Update inode->i_blocks and inode->i_bytes. FIXME: + inode_set/get/add/sub_bytes used to be called by quota macros */ + inode_add_bytes(inode, PAGE_CACHE_SIZE); + } + UNLOCK_JNODE(j); + + move_flow_forward(flow, count); + write_move_coord(coord, uf_coord, mode, page_off + count == PAGE_CACHE_SIZE); + set_hint_unlock_node(hint, flow, ZNODE_WRITE_LOCK); + + fault_in_pages_readable(flow->data - count, count); /* move_flow_forward() above has presumably advanced flow->data by @count already, hence the subtraction */ + + page = jnode_get_page_locked(j, GFP_KERNEL); + if (IS_ERR(page)) { + result = PTR_ERR(page); + goto exit2; + } + page_cache_get(page); + assert("vs-1425", jnode_page(j) == page); + + if (!PageUptodate(page)) { + if (write_is_partial(inode, file_off, page_off, count)) { + /* page is not being overwritten completely, therefore we have to take care about data + which might be written to file already */ + if (JF_ISSET(j, JNODE_EFLUSH)) { + unlock_page(page); + result = jload(j); + lock_page(page); + if (result) + goto exit3; + jrelse(j); + /* locked page will prevent jnode from being eflushed */ + } else { + /* there is no need to jload */ + zero_around(page, page_off, count); + } + } else { + /* there is no need to jload */ + zero_around(page, page_off, count); + UNDER_SPIN_VOID(jnode, j, eflush_del(j, 1)); + } + } else { + /* make sure that jnode is not eflushed */ + UNDER_SPIN_VOID(jnode, j, eflush_del(j, 1)); + } + + assert("vs-1503", 
UNDER_SPIN(jnode, j, !JF_ISSET(j, JNODE_EFLUSH))); + assert("nikita-3033", schedulable()); + + /* copy user data into page */ + result = __copy_from_user((char *)kmap(page) + page_off, flow->data - count, count); + kunmap(page); + if (unlikely(result)) { + /* FIXME: write(fd, 0, 10); to empty file will write no data but file will get increased + size. */ + /*if (JF_ISSET(j, JNODE_NEW)) + JF_SET(j, JNODE_HEARD_BANSHEE);*/ + result = RETERR(-EFAULT); + goto exit3; + } + + set_page_dirty_internal(page); + SetPageUptodate(page); + if (!PageReferenced(page)) + SetPageReferenced(page); + + unlock_page(page); + page_cache_release(page); + + /* FIXME: possible optimization: if jnode is not dirty yet - it gets into clean list in try_capture and + then in jnode_mark_dirty gets moved to dirty list. So, it would be more optimal to put jnode directly + to dirty list */ + LOCK_JNODE(j); + result = try_capture(j, ZNODE_WRITE_LOCK, 0, 1/* can_coc */); + if (result) + goto exit2; + jnode_make_dirty_locked(j); + UNLOCK_JNODE(j); + + jput(j); + + /* throttle the writer */ + result = extent_balance_dirty_pages(inode->i_mapping, flow, hint); + if (!grabbed) + all_grabbed2free(); + if (result) { + reiser4_stat_inc(extent.bdp_caused_repeats); + break; + } + + page_off = 0; + page_nr ++; + file_off += count; + set_key_offset(&page_key, (loff_t)page_nr << PAGE_CACHE_SHIFT); + continue; + + exit3: + unlock_page(page); + page_cache_release(page); + exit2: + if (created) + inode_sub_bytes(inode, PAGE_CACHE_SIZE); + jput(j); + exit1: + if (!grabbed) + all_grabbed2free(); + break; + + /* hint is unset by make_page_extent when first extent of a + file was inserted: in that case we can not use coord anymore + because we are to continue on twig level but are still at + leaf level + */ + } while (flow->length && uf_coord->valid == 1); + +/* + if (flow->length) + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length); +*/ + return result; +} + +/* estimate and reserve space which may be required for 
appending file with hole stored in extent */ +static int +extent_hole_reserve(reiser4_tree *tree) +{ + /* adding hole may require adding a hole unit into extent item and stat data update */ + grab_space_enable(); + return reiser4_grab_space(estimate_one_insert_into_item(tree) * 2, 0); +} + +static int +extent_write_hole(struct inode *inode, flow_t *flow, hint_t *hint, int grabbed) +{ + int result; + loff_t new_size; + coord_t *coord; + + coord = &hint->coord.base_coord; + if (!grabbed) { + result = extent_hole_reserve(znode_get_tree(coord->node)); + if (result) + return result; + } + + new_size = get_key_offset(&flow->key) + flow->length; + set_key_offset(&flow->key, new_size); + flow->length = 0; + result = add_hole(coord, hint->coord.lh, &flow->key); + hint->coord.valid = 0; /* NOTE(review): done_lh() below runs only on success; on add_hole() failure the lock handle is presumably released by add_hole() or by the caller - confirm */ + if (!result) { + done_lh(hint->coord.lh); + INODE_SET_FIELD(inode, i_size, new_size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + result = reiser4_update_sd(inode); + } + if (!grabbed) + all_grabbed2free(); + return result; +} + +/* + plugin->s.file.write + It can be called in two modes: + 1. real write - to write data from flow to a file (@flow->data != 0) + 2. expanding truncate (@flow->data == 0) +*/ +reiser4_internal int +write_extent(struct inode *inode, flow_t *flow, hint_t *hint, + int grabbed, /* extent's write may be called from plain unix file write and from tail conversion. In first + case (grabbed == 0) space is not reserved beforehand, so, it must be done here. When it is + being called from tail conversion - space is reserved already for whole operation which may + involve several calls to item write. In this case space reservation will not be done + here */ + write_mode_t mode) +{ + if (flow->data) + /* real write */ + return extent_write_flow(inode, flow, hint, grabbed, mode); + + /* expanding truncate. add_hole requires flow->key to be set to new end of file */ + return extent_write_hole(inode, flow, hint, grabbed); +} + +/* move coord one page forward. 
Return 1 if coord is moved out of item */ +static int +read_move_coord(coord_t *coord, extent_coord_extension_t *ext_coord) +{ + if (ext_coord->pos_in_unit == ext_coord->width - 1) { + /* last position in the unit */ + if (coord->unit_pos == ext_coord->nr_units - 1) { + /* last unit in the item */ + return 1; + } else { + /* move to the next unit */ + coord->unit_pos ++; + ext_coord->ext ++; + ON_DEBUG(ext_coord->extent = *ext_coord->ext); + ext_coord->width = extent_get_width(ext_coord->ext); + ext_coord->pos_in_unit = 0; + } + } else + ext_coord->pos_in_unit ++; + return 0; +} + +static void +call_page_cache_readahead(struct address_space *mapping, struct file *file, unsigned long page_nr, + const uf_coord_t *uf_coord) +{ + reiser4_file_fsdata *fsdata; + uf_coord_t ra_coord; + + fsdata = reiser4_get_file_fsdata(file); + ra_coord = *uf_coord; + ra_coord.extension.extent.expected_page = page_nr; + fsdata->reg.coord = &ra_coord; /* on-stack copy of coord is reachable via fsdata only until it is reset to 0 right after page_cache_readahead() */ + + page_cache_readahead(mapping, &file->f_ra, file, page_nr); + fsdata->reg.coord = 0; +} + +#if REISER4_TRACE +static void +print_ext_coord(const char *s, uf_coord_t *uf_coord) +{ + reiser4_key key; + extent_coord_extension_t *ext_coord; + + item_key_by_coord(&uf_coord->base_coord, &key); + ext_coord = &uf_coord->extension.extent; + printk("%s: item key [%llu, %llu], nr_units %d, cur extent [%llu, %llu], unit_pos %d, pos_in_unit %Lu\n", + s, get_key_objectid(&key), get_key_offset(&key), + ext_coord->nr_units, + extent_get_start(ext_coord->ext), extent_get_width(ext_coord->ext), + uf_coord->base_coord.unit_pos, ext_coord->pos_in_unit); +} +#endif + +#if REISER4_DEBUG + +/* return 1 if offset @off is inside of extent unit pointed to by @coord,
return 0 otherwise. Nothing is modified here: @coord is const and + no pos_in_unit is set */ +static int +offset_is_in_unit(const coord_t *coord, loff_t off) +{ + reiser4_key unit_key; + __u64 unit_off; + reiser4_extent *ext; + + ext = extent_by_coord(coord); + + unit_key_extent(coord, &unit_key); + unit_off = get_key_offset(&unit_key); + if (off < unit_off) + return 0; + if (off >= (unit_off + (current_blocksize * extent_get_width(ext)))) + return 0; + return 1; +} + +static int +coord_matches_key_extent(const coord_t *coord, const reiser4_key *key) +{ + reiser4_key item_key; + + assert("vs-771", coord_is_existing_unit(coord)); + assert("vs-1258", keylt(key, append_key_extent(coord, &item_key))); + assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key))); + + return offset_is_in_unit(coord, get_key_offset(key)); +} + +#endif /* REISER4_DEBUG */ + +/* Implements plugin->u.item.s.file.read operation for extent items: copies data page by page from page cache to flow->data. Returns 0, -EFAULT if copying to user space fails, -EIO if a page is not up to date after reading, or the error of read_cache_page() */ +reiser4_internal int +read_extent(struct file *file, flow_t *flow, hint_t *hint) +{ + int result; + struct page *page; + unsigned long page_nr; + unsigned long page_off, count; + struct inode *inode; + __u64 file_off; + uf_coord_t *uf_coord; + coord_t *coord; + extent_coord_extension_t *ext_coord; + + uf_coord = &hint->coord; + assert("vs-1318", coord_extension_is_ok(uf_coord)); + + inode = file->f_dentry->d_inode; + coord = &uf_coord->base_coord; + ext_coord = &uf_coord->extension.extent; + + ON_TRACE(TRACE_EXTENTS, "read_extent start: ino %llu, size %llu, offset %llu, count %lld\n", + get_inode_oid(inode), inode->i_size, get_key_offset(&flow->key), flow->length); + IF_TRACE(TRACE_EXTENTS, print_ext_coord("read_extent start", uf_coord)); + + assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE); + assert("vs-572", flow->user == 1); + assert("vs-1351", flow->length > 0); + assert("vs-1119", znode_is_rlocked(coord->node)); + assert("vs-1120", znode_is_loaded(coord->node)); + assert("vs-1256", coord_matches_key_extent(coord, &flow->key)); + assert("vs-1355", 
get_key_offset(&flow->key) + flow->length <= inode->i_size); + + /* offset in a file to start read from */ + file_off = get_key_offset(&flow->key); + /* index of page containing that offset */ + page_nr = (unsigned long)(file_off >> PAGE_CACHE_SHIFT); + /* offset within the page */ + page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1)); + + count = PAGE_CACHE_SIZE - page_off; + + do { + call_page_cache_readahead(inode->i_mapping, file, page_nr, uf_coord); + + /* this will return page if it exists and is uptodate, otherwise it will allocate page and call + readpage_extent to fill it. NOTE(review): readpage_extent treats its argument as uf_coord_t * while @coord (&uf_coord->base_coord) is passed here; this presumably relies on base_coord being the first member of uf_coord_t - confirm */ + page = read_cache_page(inode->i_mapping, page_nr, readpage_extent, coord); + if (IS_ERR(page)) + return PTR_ERR(page); + + /* number of bytes which can be read from the page */ + if (count > flow->length) + count = flow->length; + move_flow_forward(flow, count); + if (page_off + count == PAGE_CACHE_SIZE) + if (read_move_coord(coord, ext_coord)) + uf_coord->valid = 0; + set_hint_unlock_node(hint, flow, ZNODE_READ_LOCK); + + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_detach_jnode(page, inode->i_mapping, page_nr); + page_cache_release(page); + warning("jmacd-97178", "extent_read: page is not up to date"); + return RETERR(-EIO); + } + + /* If users can be writing to this page using arbitrary virtual addresses, take care about potential + aliasing before reading the page on the kernel side. 
+ */ + if (!list_empty(&inode->i_mapping->i_mmap_shared)) + flush_dcache_page(page); + + assert("nikita-3034", schedulable()); + + + /* AUDIT: We must page-in/prepare user area first to avoid deadlocks */ + result = __copy_to_user(flow->data - count, (char *)kmap(page) + page_off, count); + kunmap(page); + + page_cache_release(page); + if (unlikely(result)) + return RETERR(-EFAULT); + + result = hint_validate(hint, &flow->key, 0/* do not check key */, ZNODE_READ_LOCK); + if (result) + break; /* NOTE(review): the hint_validate() error is not propagated, the function returns 0 below; presumably the caller repeats the lookup - confirm */ + assert("vs-1318", coord_extension_is_ok(uf_coord)); + assert("vs-1263", coord_matches_key_extent(coord, &flow->key)); + page_off = 0; + page_nr ++; + count = PAGE_CACHE_SIZE; + } while (flow->length && uf_coord->valid == 1); + + ON_TRACE(TRACE_EXTENTS, "read_extent done: left %lld\n", flow->length); + IF_TRACE(TRACE_EXTENTS, print_ext_coord("read_extent done", uf_coord)); + + return 0; +} + +static int +move_coord_pages(coord_t *coord, extent_coord_extension_t *ext_coord, unsigned count) +{ + ext_coord->expected_page += count; + + do { + if (ext_coord->pos_in_unit + count < ext_coord->width) { + ext_coord->pos_in_unit += count; + break; + } + + if (coord->unit_pos == ext_coord->nr_units - 1) { + coord->between = AFTER_UNIT; /* ran past the last unit of the item */ + return 1; + } + + /* shift to next unit, consuming the pages left in the current one */ + count -= (ext_coord->width - ext_coord->pos_in_unit); + coord->unit_pos ++; + ext_coord->pos_in_unit = 0; + ext_coord->ext ++; + ON_DEBUG(ext_coord->extent = *ext_coord->ext); + ext_coord->width = extent_get_width(ext_coord->ext); + } while (1); + + return 0; +} + +static inline void +zero_page(struct page *page) +{ + char *kaddr = kmap_atomic(page, KM_USER0); + + xmemset(kaddr, 0, PAGE_CACHE_SIZE); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + SetPageUptodate(page); + unlock_page(page); +} + +static int +do_readpage_extent(reiser4_extent *ext, reiser4_block_nr pos, struct page *page) +{ + jnode *j; + struct address_space *mapping; + unsigned long index; + oid_t oid; + + mapping = 
page->mapping; + oid = get_inode_oid(mapping->host); + index = page->index; + ON_TRACE(TRACE_EXTENTS, "readpage_extent: page (oid %llu, index %lu, count %d)..", + oid, index, page_count(page)); + + switch (state_of_extent(ext)) { + case HOLE_EXTENT: + ON_TRACE(TRACE_EXTENTS, "hole, OK\n"); + /* + * it is possible to have hole page with jnode, if page was + * eflushed previously. + */ + j = jlookup(current_tree, oid, index); + if (j == NULL) { + zero_page(page); + return 0; + } + LOCK_JNODE(j); + if (!jnode_page(j)) { + jnode_attach_page(j, page); + } else { + assert("vs-1504", jnode_page(j) == page); + } + + UNLOCK_JNODE(j); + break; + + case ALLOCATED_EXTENT: + j = jnode_of_page(page); + if (IS_ERR(j)) + return PTR_ERR(j); + if (*jnode_get_block(j) == 0) { + reiser4_block_nr blocknr; + + blocknr = extent_get_start(ext) + pos; + jnode_set_block(j, &blocknr); + } else + assert("vs-1403", j->blocknr == extent_get_start(ext) + pos); + break; + + case UNALLOCATED_EXTENT: + j = jlookup(current_tree, oid, index); + assert("nikita-2688", j); + assert("vs-1426", jnode_page(j) == NULL); + + UNDER_SPIN_VOID(jnode, j, jnode_attach_page(j, page)); + ON_TRACE(TRACE_EXTENTS, "jnode %s attached to page\n", jnode_tostring(j)); + + if (!JF_ISSET(j, JNODE_EFLUSH)) { + assert("", 0); + ON_TRACE(TRACE_EXTENTS, "page fault on not initialized page\n"); + zero_page(page); + jput(j); + return 0; + } + + break; + + default: + warning("vs-957", "extent_readpage: wrong extent\n"); + return RETERR(-EIO); + } + + BUG_ON(j == 0); + page_io(page, j, READ, GFP_NOIO); + jput(j); + return 0; +} + +static int +readahead_readpage_extent(void *vp, struct page *page) +{ + int result; + uf_coord_t *uf_coord; + coord_t *coord; + extent_coord_extension_t *ext_coord; + + uf_coord = vp; + coord = &uf_coord->base_coord; + + if (coord->between != AT_UNIT) { + unlock_page(page); + return RETERR(-EINVAL); + } + + ext_coord = &uf_coord->extension.extent; + if (ext_coord->expected_page != page->index) { + /* 
read_cache_pages skipped few pages. Try to adjust coord to page */ + assert("vs-1269", page->index > ext_coord->expected_page); + if (move_coord_pages(coord, ext_coord, page->index - ext_coord->expected_page)) { + /* extent pointing to this page is not here */ + unlock_page(page); + return RETERR(-EINVAL); + } + + assert("vs-1274", offset_is_in_unit(coord, + (loff_t)page->index << PAGE_CACHE_SHIFT)); + ext_coord->expected_page = page->index; + } + + assert("vs-1281", page->index == ext_coord->expected_page); + result = do_readpage_extent(ext_coord->ext, ext_coord->pos_in_unit, page); + if (!result) + move_coord_pages(coord, ext_coord, 1); + return result; +} + +/* + plugin->u.item.s.file.readpages +*/ +reiser4_internal void +readpages_extent(void *vp, struct address_space *mapping, struct list_head *pages) +{ + if (vp) + read_cache_pages(mapping, pages, readahead_readpage_extent, vp); +} + +/* + plugin->s.file.readpage + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage + or + filemap_nopage->reiser4_readpage->readpage_unix_file->->readpage_extent + + At the beginning: coord->node is read locked, zloaded, page is + locked, coord is set to existing unit inside of extent item (it is not necessary that coord matches to page->index) +*/ +reiser4_internal int +readpage_extent(void *vp, struct page *page) +{ + uf_coord_t *uf_coord = vp; + ON_DEBUG(coord_t *coord = &uf_coord->base_coord); + ON_DEBUG(reiser4_key key); + + assert("vs-1040", PageLocked(page)); + assert("vs-1050", !PageUptodate(page)); + assert("vs-757", !jprivate(page) && !PagePrivate(page)); + assert("vs-1039", page->mapping && page->mapping->host); + + assert("vs-1044", znode_is_loaded(coord->node)); + assert("vs-758", item_is_extent(coord)); + assert("vs-1046", coord_is_existing_unit(coord)); + assert("vs-1045", znode_is_rlocked(coord->node)); + assert("vs-1047", page->mapping->host->i_ino == get_key_objectid(item_key_by_coord(coord, &key))); + 
assert("vs-1320", coord_extension_is_ok(uf_coord)); + + return do_readpage_extent(uf_coord->extension.extent.ext, uf_coord->extension.extent.pos_in_unit, page); +} + +/* + plugin->s.file.capture + + At the beginning: coord.node is write locked, zloaded, page is not locked, coord is set to existing unit inside of + extent item. On every return path, successful or not, uf_coord->lh has been released with done_lh() +*/ +reiser4_internal int +capture_extent(reiser4_key *key, uf_coord_t *uf_coord, struct page *page, write_mode_t mode) +{ + jnode *j; + int result; + reiser4_block_nr blocknr; + int created; + + ON_TRACE(TRACE_EXTENTS, "WP: index %lu, count %d..", page->index, page_count(page)); + + assert("vs-1051", page->mapping && page->mapping->host); + assert("nikita-3139", !inode_get_flag(page->mapping->host, REISER4_NO_SD)); + assert("vs-864", znode_is_wlocked(uf_coord->base_coord.node)); + assert("vs-1398", get_key_objectid(key) == get_inode_oid(page->mapping->host)); + + result = make_extent(key, uf_coord, mode, &blocknr, &created); + if (result) { + done_lh(uf_coord->lh); + return result; + } + + lock_page(page); + j = jnode_of_page(page); + if (IS_ERR(j)) { + unlock_page(page); + done_lh(uf_coord->lh); + return PTR_ERR(j); + } + set_page_dirty_internal(page); + unlock_page(page); + + LOCK_JNODE(j); + if (created) { + /* extent corresponding to this jnode was just created, so the jnode must not have a block number yet */ + assert("vs-1504", *jnode_get_block(j) == 0); + JF_SET(j, JNODE_CREATED); + inode_add_bytes(page->mapping->host, PAGE_CACHE_SIZE); + } + + /* jnode_of_page() might create new jnode which requires setting its block number. It is not related to whether + the extent unit is new or not. 
+ */ + if (*jnode_get_block(j) == 0) + jnode_set_block(j, &blocknr); + + UNLOCK_JNODE(j); + done_lh(uf_coord->lh); + + LOCK_JNODE(j); + result = try_capture(j, ZNODE_WRITE_LOCK, 0, 1/* can_coc */); + if (result != 0) + reiser4_panic("nikita-3324", "Cannot capture jnode: %i", result); + jnode_make_dirty_locked(j); + UNLOCK_JNODE(j); + jput(j); + + if (created) { + reiser4_update_sd(page->mapping->host); + /* warning about failure of this is issued already */ + } + + + ON_TRACE(TRACE_EXTENTS, "OK\n"); + return 0; +} + +/* + plugin->u.item.s.file.get_block: store in bh->b_blocknr the block number of the position @uf_coord is set to, or 0 if the extent is not an allocated one. @block is not used. Always returns 0 +*/ +reiser4_internal int +get_block_address_extent(const uf_coord_t *uf_coord, sector_t block, struct buffer_head *bh) +{ + const extent_coord_extension_t *ext_coord; + + assert("vs-1321", coord_extension_is_ok(uf_coord)); + ext_coord = &uf_coord->extension.extent; + + if (state_of_extent(ext_coord->ext) != ALLOCATED_EXTENT) + bh->b_blocknr = 0; + else + bh->b_blocknr = extent_get_start(ext_coord->ext) + ext_coord->pos_in_unit; + return 0; +} + +/* + plugin->u.item.s.file.append_key + key of the first byte after the last byte addressed by this extent item; the result is stored in @key, which is also returned +*/ +reiser4_internal reiser4_key * +append_key_extent(const coord_t *coord, reiser4_key *key) +{ + item_key_by_coord(coord, key); + set_key_offset(key, get_key_offset(key) + extent_size(coord, nr_units_extent(coord))); + + assert("vs-610", get_key_offset(key) && (get_key_offset(key) & (current_blocksize - 1)) == 0); + return key; +} + +/* plugin->u.item.s.file.init_coord_extension: fill extent specific part of @uf_coord for the unit its base_coord is set to and mark @uf_coord valid. Nothing is done unless coord is AT_UNIT or AFTER_UNIT. @lookuped is the file offset which was looked for */ +reiser4_internal void +init_coord_extension_extent(uf_coord_t *uf_coord, loff_t lookuped) +{ + coord_t *coord; + extent_coord_extension_t *ext_coord; + reiser4_key key; + loff_t offset; + pos_in_node_t i; + + assert("vs-1295", uf_coord->valid == 0); + + coord = &uf_coord->base_coord; + assert("vs-1288", coord_is_iplug_set(coord)); + assert("vs-1327", znode_is_loaded(coord->node)); + + if (coord->between != AFTER_UNIT && coord->between != AT_UNIT) + return; + + ext_coord = 
&uf_coord->extension.extent; + ext_coord->nr_units = nr_units_extent(coord); + + if (coord->between == AFTER_UNIT) { + assert("vs-1330", coord->unit_pos == nr_units_extent(coord) - 1); + ext_coord->ext = extent_by_coord(coord); + ON_DEBUG(ext_coord->extent = *ext_coord->ext); + ext_coord->width = extent_get_width(ext_coord->ext); + ext_coord->pos_in_unit = ext_coord->width - 1; + uf_coord->valid = 1; + return; + } + + /* AT_UNIT */ + item_key_by_coord(coord, &key); + offset = get_key_offset(&key); + + /* FIXME: it would not be necessary if pos_in_unit were in coord_t */ + ext_coord->ext = extent_item(coord); + + for (i = 0; i < coord->unit_pos; i++, ext_coord->ext ++) + offset += (extent_get_width(ext_coord->ext) * current_blocksize); + ON_DEBUG(ext_coord->extent = *ext_coord->ext); + ext_coord->width = extent_get_width(ext_coord->ext); + + assert("vs-1328", offset <= lookuped); + assert("vs-1329", lookuped < offset + extent_get_width(ext_coord->ext) * current_blocksize); + ext_coord->pos_in_unit = ((lookuped - offset) >> current_blocksize_bits); + + uf_coord->valid = 1; +} + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.4-ck1/fs/reiser4/plugin/item/extent_flush_ops.c --- linux-2.6.4/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/extent_flush_ops.c 2004-03-11 22:45:15.325504611 +1100 @@ -0,0 +1,1092 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "item.h" +#include "../../tree.h" +#include "../../jnode.h" +#include "../../super.h" +#include "../../flush.h" +#include "../../carry.h" +#include "../object.h" + +#include + +/* Return either first or last extent (depending on @side) of the item + @coord is set to. 
Set @pos_in_unit either to first or to last block + of extent. */ +static reiser4_extent * +extent_utmost_ext(const coord_t *coord, sideof side, reiser4_block_nr *pos_in_unit) +{ + reiser4_extent *ext; + + if (side == LEFT_SIDE) { + /* get first extent of item */ + ext = extent_item(coord); + *pos_in_unit = 0; + } else { + /* get last extent of item and last position within it */ + assert("vs-363", side == RIGHT_SIDE); + ext = extent_item(coord) + coord_last_unit_pos(coord); + *pos_in_unit = extent_get_width(ext) - 1; + } + + return ext; +} + +/* item_plugin->f.utmost_child */ +/* Coord is set to extent item. Look up jnode corresponding either to first (LEFT_SIDE) or to last (RIGHT_SIDE) unformatted node pointed to by the item + and store it in *@childp: NULL if the utmost extent is a hole, result of jlookup() otherwise. Always returns 0 */ +reiser4_internal int +utmost_child_extent(const coord_t *coord, sideof side, jnode **childp) +{ + reiser4_extent *ext; + reiser4_block_nr pos_in_unit; + + ext = extent_utmost_ext(coord, side, &pos_in_unit); /* pos_in_unit is filled in but not used below */ + + switch (state_of_extent(ext)) { + case HOLE_EXTENT: + *childp = NULL; + return 0; + case ALLOCATED_EXTENT: + case UNALLOCATED_EXTENT: + break; + default: + /* this should never happen. NOTE(review): if the assertion is compiled out, an unknown extent state falls through to the lookup below - confirm this is acceptable */ + assert("vs-1417", 0); + } + + { + reiser4_key key; + reiser4_tree *tree; + unsigned long index; + + if (side == LEFT_SIDE) { + /* get key of first byte addressed by the extent */ + item_key_by_coord(coord, &key); + } else { + /* get key of byte which is next after last byte addressed by the extent */ + append_key_extent(coord, &key); + } + + assert("vs-544", (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul); + /* index of first or last (depending on @side) page addressed + by the extent */ + index = (unsigned long) (get_key_offset(&key) >> PAGE_CACHE_SHIFT); + if (side == RIGHT_SIDE) + index --; + + tree = coord->node->zjnode.tree; + *childp = jlookup(tree, get_key_objectid(&key), index); + } + + return 0; +} + +/* item_plugin->f.utmost_child_real_block */ +/* Return the child's block, if allocated. 
*/ +reiser4_internal int +utmost_child_real_block_extent(const coord_t *coord, sideof side, reiser4_block_nr *block) +{ + reiser4_extent *ext; + + ext = extent_by_coord(coord); + + switch (state_of_extent(ext)) { + case ALLOCATED_EXTENT: + *block = extent_get_start(ext); + if (side == RIGHT_SIDE) + *block += extent_get_width(ext) - 1; + break; + case HOLE_EXTENT: + case UNALLOCATED_EXTENT: + *block = 0; + break; + default: + /* this should never happen. NOTE(review): *@block is not assigned on this path */ + assert("vs-1418", 0); + } + + return 0; +} + +/* item_plugin->f.scan */ +/* Performs scanning in the direction of @scan (leftward or rightward, see scanning_left()) starting from an unformatted node and its parent coordinate. + This scan continues, advancing the parent coordinate, until either it encounters a + formatted child or it finishes scanning this node. + + If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm + not sure this last property (same atom) is enforced, but it should be the case since + one atom must write the parent and the others must read the parent, thus fusing?). In + any case, the code below used to assert this for unallocated extents (that assertion is now commented out as racy). Unallocated + extents are thus optimized because we can skip to the endpoint when scanning. + + It returns control to its caller (presumably scan_extents), which handles these terminating conditions, e.g., by + loading the next twig. +*/ +reiser4_internal int scan_extent(flush_scan * scan) +{ + coord_t coord; + jnode *neighbor; + unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist; + reiser4_block_nr unit_start; + /*struct inode *ino = NULL; */ + __u64 oid; + reiser4_key key; + /*struct page *pg; */ + int ret = 0, allocated, incr; + reiser4_tree *tree; + + if (!jnode_check_dirty(scan->node)) { + scan->stop = 1; + return 0; /* Race with truncate, this node is already + * truncated. 
*/ + } + + coord_dup(&coord, &scan->parent_coord); + + assert("jmacd-1404", !scan_finished(scan)); + assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL); + assert("jmacd-1406", jnode_is_unformatted(scan->node)); + + /* The scan_index variable corresponds to the current page index of the + unformatted block scan position. */ + scan_index = index_jnode(scan->node); + + assert("jmacd-7889", item_is_extent(&coord)); + + ON_TRACE(TRACE_FLUSH_VERB, "%s scan starts %lu: %s\n", + (scanning_left(scan) ? "left" : "right"), scan_index, jnode_tostring(scan->node)); + +repeat: + /* objectid of file */ + oid = get_key_objectid(item_key_by_coord(&coord, &key)); + + ON_TRACE(TRACE_FLUSH_VERB, "%s scan index %lu: parent %p oid %llu\n", + (scanning_left(scan) ? "left" : "right"), scan_index, coord.node, oid); + + allocated = !extent_is_unallocated(&coord); + /* Get the values of this extent unit: */ + unit_index = extent_unit_index(&coord); + unit_width = extent_unit_width(&coord); + unit_start = extent_unit_start(&coord); + + assert("jmacd-7187", unit_width > 0); + assert("jmacd-7188", scan_index >= unit_index); + assert("jmacd-7189", scan_index <= unit_index + unit_width - 1); + + /* Depending on the scan direction, we set different maximum values for scan_index + (scan_max) and the number of nodes that would be passed if the scan goes the + entire way (scan_dist). Incr is an integer reflecting the incremental + direction of scan_index. */ + if (scanning_left(scan)) { + scan_max = unit_index; + scan_dist = scan_index - unit_index; + incr = -1; + } else { + scan_max = unit_index + unit_width - 1; + scan_dist = scan_max - unit_index; /* NOTE(review): this equals unit_width - 1 wherever the scan starts; the left branch measures from scan_index, so by symmetry scan_max - scan_index might be intended - confirm */ + incr = +1; + } + + tree = coord.node->zjnode.tree; + + /* If the extent is allocated we have to check each of its blocks. If the extent + is unallocated we can skip to the scan_max. 
*/ + if (allocated) { + do { + neighbor = jlookup(tree, oid, scan_index); + if (neighbor == NULL) + goto stop_same_parent; + + ON_TRACE(TRACE_FLUSH_VERB, "alloc scan index %lu: %s\n", + scan_index, jnode_tostring(neighbor)); + + if (scan->node != neighbor && !scan_goto(scan, neighbor)) { + /* @neighbor was jput() by scan_goto(). */ + goto stop_same_parent; + } + + ret = scan_set_current(scan, neighbor, 1, &coord); + if (ret != 0) { + goto exit; + } + + /* reference to @neighbor is stored in @scan, no need + to jput(). */ + scan_index += incr; + + } while (incr + scan_max != scan_index); + + } else { + /* Optimized case for unallocated extents, skip to the end. */ + neighbor = jlookup(tree, oid, scan_max/*index*/); + if (neighbor == NULL) { + /* Race with truncate */ + scan->stop = 1; + ret = 0; + goto exit; + } + + ON_TRACE(TRACE_FLUSH_VERB, "unalloc scan index %lu: %s\n", scan_index, jnode_tostring(neighbor)); + + /* XXX commented assertion out, because it is inherently + * racy */ + /* assert("jmacd-3551", !jnode_check_flushprepped(neighbor) + && same_slum_check(neighbor, scan->node, 0, 0)); */ + + ret = scan_set_current(scan, neighbor, scan_dist, &coord); + if (ret != 0) { + goto exit; + } + } + + if (coord_sideof_unit(&coord, scan->direction) == 0 && item_is_extent(&coord)) { + /* Continue as long as there are more extent units. */ + + scan_index = + extent_unit_index(&coord) + (scanning_left(scan) ? extent_unit_width(&coord) - 1 : 0); + goto repeat; + } + + if (0) { +stop_same_parent: + + /* If we are scanning left and we stop in the middle of an allocated + extent, we know the preceder immediately.. */ + /* middle of extent is (scan_index - unit_index) != 0. */ + if (scanning_left(scan) && (scan_index - unit_index) != 0) { + /* FIXME(B): Someone should step-through and verify that this preceder + calculation is indeed correct. */ + /* @unit_start is starting block (number) of extent + unit. 
Flush stopped at the @scan_index block from + the beginning of the file, which is (scan_index - + unit_index) block within extent. + */ + if (unit_start) { + /* skip preceder update when we are at hole */ + scan->preceder_blk = unit_start + scan_index - unit_index; + check_preceder(scan->preceder_blk); + } + } + + /* In this case, we leave coord set to the parent of scan->node. */ + scan->stop = 1; + + } else { + /* In this case, we are still scanning, coord is set to the next item which is + either off-the-end of the node or not an extent. */ + assert("jmacd-8912", scan->stop == 0); + assert("jmacd-7812", (coord_is_after_sideof_unit(&coord, scan->direction) + || !item_is_extent(&coord))); + } + + ret = 0; +exit: + return ret; +} + +/* ask block allocator for some blocks */ +static void +extent_allocate_blocks(reiser4_blocknr_hint *preceder, + reiser4_block_nr wanted_count, reiser4_block_nr *first_allocated, reiser4_block_nr *allocated, block_stage_t block_stage) +{ + *allocated = wanted_count; + preceder->max_dist = 0; /* scan whole disk, if needed */ + + /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */ + preceder->block_stage = block_stage; + + /* FIXME: we do not handle errors here now */ + check_me("vs-420", reiser4_alloc_blocks (preceder, first_allocated, allocated, BA_PERMANENT) == 0); + /* update flush_pos's preceder to last allocated block number */ + preceder->blk = *first_allocated + *allocated - 1; +} + +/* when on flush time unallocated extent is to be replaced with allocated one it may happen that one unallocated extent + will have to be replaced with set of allocated extents. In this case insert_into_item will be called which may have + to add new nodes into tree. Space for that is taken from inviolable reserve (5%). 
*/ +static reiser4_block_nr +reserve_replace(void) +{ + reiser4_block_nr grabbed, needed; + + grabbed = get_current_context()->grabbed_blocks; + needed = estimate_one_insert_into_item(current_tree); + check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED)); + return grabbed; /* the grabbed_blocks value sampled before the extra reservation; callers hand it to free_replace_reserved() */ +} + +static void +free_replace_reserved(reiser4_block_nr grabbed) +{ /* passes to grabbed2free() whatever the current context holds beyond the @grabbed snapshot */ + reiser4_context *ctx; + + ctx = get_current_context(); + grabbed2free(ctx, get_super_private(ctx->super), + ctx->grabbed_blocks - grabbed); +} + +/* Block offset of first block addressed by unit: the unit key offset shifted right by current_blocksize_bits */ +reiser4_internal __u64 +extent_unit_index(const coord_t *item) +{ + reiser4_key key; + + assert("vs-648", coord_is_existing_unit(item)); + unit_key_by_coord(item, &key); + return get_key_offset(&key) >> current_blocksize_bits; +} + +/* AUDIT shouldn't return value be of reiser4_block_nr type? + Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */ +reiser4_internal __u64 +extent_unit_width(const coord_t *item) +{ + assert("vs-649", coord_is_existing_unit(item)); + return width_by_coord(item); +} + +/* Starting block location of this unit */ +reiser4_internal reiser4_block_nr +extent_unit_start(const coord_t *item) +{ + return extent_get_start(extent_by_coord(item)); +} + +/* replace allocated extent with two allocated extents: the first @pos_in_unit blocks stay in the unit @coord is set to, the remaining blocks go into a new unit. Returns the result of replace_extent() */ +static int +split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit) +{ + int result; + reiser4_extent *ext; + reiser4_extent replace_ext; + reiser4_extent append_ext; + reiser4_key key; + reiser4_item_data item; + reiser4_block_nr grabbed; + + ext = extent_by_coord(coord); + assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT); + assert("vs-1411", extent_get_width(ext) > pos_in_unit); + + set_extent(&replace_ext, extent_get_start(ext), pos_in_unit); + set_extent(&append_ext, extent_get_start(ext) + pos_in_unit, extent_get_width(ext) - pos_in_unit); + + /* insert_into_item will insert new unit after the one @coord is set to. 
So, update key correspondingly */ + unit_key_by_coord(coord, &key); + set_key_offset(&key, (get_key_offset(&key) + pos_in_unit * current_blocksize)); + + ON_TRACE(TRACE_EXTENT_ALLOC, + "split [%llu %llu] -> [%llu %llu][%llu %llu]\n", + extent_get_start(ext), extent_get_width(ext), + extent_get_start(&replace_ext), extent_get_width(&replace_ext), + extent_get_start(&append_ext), extent_get_width(&append_ext)); + + grabbed = reserve_replace(); + result = replace_extent(coord, znode_lh(coord->node), &key, init_new_extent(&item, &append_ext, 1), + &replace_ext, COPI_DONT_SHIFT_LEFT); + free_replace_reserved(grabbed); + return result; +} + +/* remove protection from e-flush: clears JNODE_EPROTECTED, which must be set, on a node that must not have JNODE_EFLUSH set */ +static inline void +junprotect (jnode * node) +{ + assert("zam-837", !JF_ISSET(node, JNODE_EFLUSH)); + /* when REISER4_USE_EFLUSH is not defined - EPROTECT bit does not get + * set */ + assert("zam-838", JF_ISSET(node, JNODE_EPROTECTED)); + + JF_CLR(node, JNODE_EPROTECTED); +} + +/* this is used to unprotect nodes which were protected before allocating but which will not be allocated either because + space allocator allocates less blocks than were protected and/or if allocation of those nodes failed. + The last @count nodes of @protected_nodes are unprotected, walking backwards from the tail of the list, and are then + spliced onto the atom's LEAF_LEVEL dirty list. @count must be greater than 0 and @protected_nodes must not be empty */ +static void +unprotect_extent_nodes(flush_pos_t *flush_pos, __u64 count, capture_list_head *protected_nodes) +{ + jnode *node, *tmp; + capture_list_head unprotected_nodes; + txn_atom *atom; + + capture_list_init(&unprotected_nodes); + + atom = atom_locked_by_fq(pos_fq(flush_pos)); + assert("vs-1468", atom); + + assert("vs-1469", !capture_list_empty(protected_nodes)); + assert("vs-1474", count > 0); + node = capture_list_back(protected_nodes); + do { + count --; + junprotect(node); + if (count == 0) { + break; + } + tmp = capture_list_prev(node); + node = tmp; + assert("vs-1470", !capture_list_end(protected_nodes, node)); + } while (1); + + /* move back to dirty list */ + capture_list_split(protected_nodes, &unprotected_nodes, node); + capture_list_splice(&atom->dirty_nodes[LEAF_LEVEL], 
&unprotected_nodes); + + UNLOCK_ATOM(atom); +} + +extern int getjevent(void); + +/* remove node from atom's list and put to the end of list @jnodes. Sets JNODE_EPROTECTED on @node. The assertions + require an unformatted node on the dirty list, with both its atom and the jnode itself spin-locked by the caller */ +static void +protect_reloc_node(capture_list_head *jnodes, jnode *node) +{ + assert("zam-836", !JF_ISSET(node, JNODE_EPROTECTED)); + assert("vs-1216", jnode_is_unformatted(node)); + assert("vs-1468", node->list == DIRTY_LIST); + assert("vs-1477", spin_atom_is_locked(node->atom)); + assert("nikita-3390", spin_jnode_is_locked(node)); + + JF_SET(node, JNODE_EPROTECTED); + capture_list_remove_clean(node); + capture_list_push_back(jnodes, node); +} + +#define JNODES_TO_UNFLUSH (16) + +/* @count nodes of file (objectid @oid) starting from @index are going to be allocated. Protect those nodes from + e-flushing. Nodes which are eflushed already will be un-eflushed. There will be not more than JNODES_TO_UNFLUSH + un-eflushed nodes. If a node is not found or flushprepped - stop protecting */ +/* FIXME: it is likely that not flushprepped jnodes are on dirty capture list in sequential order.. 
*/ /* On return *protected holds the number of nodes moved onto @protected_nodes (which must be empty on entry). Returns 0, or the first non-zero result of emergency_unflush(), in which case the nodes protected so far are unprotected again. */ +static int +protect_extent_nodes(flush_pos_t *flush_pos, oid_t oid, unsigned long index, reiser4_block_nr count, + reiser4_block_nr *protected, reiser4_extent *ext, + capture_list_head *protected_nodes) +{ + __u64 i; + __u64 j; + int result; + reiser4_tree *tree; + int eflushed; + jnode *buf[JNODES_TO_UNFLUSH]; + txn_atom *atom; + + assert("nikita-3394", capture_list_empty(protected_nodes)); + + tree = current_tree; + + atom = atom_locked_by_fq(pos_fq(flush_pos)); + assert("vs-1468", atom); + + assert("vs-1470", extent_get_width(ext) == count); + eflushed = 0; + *protected = 0; + for (i = 0; i < count; ++i, ++index) { + jnode *node; + + node = jlookup(tree, oid, index); + if (!node) + break; + + if (jnode_check_flushprepped(node)) { + atomic_dec(&node->x_count); + break; + } + + LOCK_JNODE(node); + assert("vs-1476", atomic_read(&node->x_count) > 1); + assert("nikita-3393", !JF_ISSET(node, JNODE_EPROTECTED)); + + if (JF_ISSET(node, JNODE_EFLUSH)) { + if (eflushed == JNODES_TO_UNFLUSH) { + /* buf[] is full: stop protecting here */ + UNLOCK_JNODE(node); + atomic_dec(&node->x_count); + break; + } + buf[eflushed] = node; /* x_count is not decremented on this path; jput() is called on every buf[] entry below */ + eflushed ++; + protect_reloc_node(protected_nodes, node); + UNLOCK_JNODE(node); + } else { + assert("nikita-3384", node->atom == atom); + protect_reloc_node(protected_nodes, node); + assert("nikita-3383", !JF_ISSET(node, JNODE_EFLUSH)); + UNLOCK_JNODE(node); + atomic_dec(&node->x_count); + } + + (*protected) ++; + } + UNLOCK_ATOM(atom); + + /* start io for eflushed nodes */ + for (j = 0; j < eflushed; ++ j) + jstartio(buf[j]); + + result = 0; + for (j = 0; j < eflushed; ++ j) { + /* after the first failure the remaining nodes are only jput() */ + if (result == 0) { + result = emergency_unflush(buf[j]); + if (result != 0) { + warning("nikita-3179", + "unflush failed: %i", result); + print_jnode("node", buf[j]); + } + } + jput(buf[j]); + } + if (result != 0) { + /* unprotect all the jnodes we have protected so far */ + unprotect_extent_nodes(flush_pos, i, protected_nodes); + } + return result; +} + +/* replace extent @ext by extent @replace. 
Try to merge @replace with previous extent of the item (if there is + one). Return 1 if it succeeded, 0 - otherwise. On success @coord is moved back to the previous unit */ +static int +try_to_merge_with_left(coord_t *coord, reiser4_extent *ext, reiser4_extent *replace) +{ + assert("vs-1415", extent_by_coord(coord) == ext); + + if (coord->unit_pos == 0 || state_of_extent(ext - 1) != ALLOCATED_EXTENT) + /* left neighbor of @ext either does not exist or is not allocated extent */ + return 0; + if (extent_get_start(ext - 1) + extent_get_width(ext - 1) != extent_get_start(replace)) + return 0; + + /* we can glue, widen previous unit */ + ON_TRACE(TRACE_EXTENT_ALLOC, + "wide previous [%llu %llu] ->", + extent_get_start(ext - 1), extent_get_width(ext - 1)); + + extent_set_width(ext - 1, extent_get_width(ext - 1) + extent_get_width(replace)); + + ON_TRACE(TRACE_EXTENT_ALLOC, " [%llu %llu] -> ", extent_get_start(ext - 1), extent_get_width(ext - 1)); + + if (extent_get_width(ext) != extent_get_width(replace)) { + /* make current extent narrower */ + ON_TRACE(TRACE_EXTENT_ALLOC, "narrow [%llu %llu] -> ", extent_get_start(ext), extent_get_width(ext)); + + if (state_of_extent(ext) == ALLOCATED_EXTENT) + extent_set_start(ext, extent_get_start(ext) + extent_get_width(replace)); + extent_set_width(ext, extent_get_width(ext) - extent_get_width(replace)); + + ON_TRACE(TRACE_EXTENT_ALLOC, "[%llu %llu]\n", extent_get_start(ext), extent_get_width(ext)); + } else { + /* current extent completely glued with its left neighbor, remove it */ + coord_t from, to; + + ON_TRACE(TRACE_EXTENT_ALLOC, "delete [%llu %llu]\n", extent_get_start(ext), extent_get_width(ext)); + + coord_dup(&from, coord); + from.unit_pos = nr_units_extent(coord) - 1; + coord_dup(&to, &from); + + /* currently cut from extent can cut either from the beginning or from the end. 
Move place which got + freed after unit removal to end of item */ + xmemmove(ext, ext + 1, (from.unit_pos - coord->unit_pos) * sizeof(reiser4_extent)); + /* wipe part of item which is going to be cut, so that node_check will not be confused */ + ON_DEBUG(xmemset(extent_item(coord) + from.unit_pos, 0, sizeof (reiser4_extent))); + cut_node_content(&from, &to, NULL, NULL, NULL); + } + znode_make_dirty(coord->node); + /* move coord back */ + coord->unit_pos --; + return 1; +} + +/* replace extent (unallocated or allocated) pointed by @coord with extent @replace (allocated). If @replace is shorter + than @coord - add padding extent. Returns 0 when the unit was merged with its left neighbor or overwritten in place, + otherwise the result of replace_extent() */ +static int +conv_extent(coord_t *coord, reiser4_extent *replace) +{ + int result; + reiser4_extent *ext; + reiser4_extent padd_ext; + reiser4_block_nr start, width, new_width; + reiser4_block_nr grabbed; + reiser4_item_data item; + reiser4_key key; + extent_state state; + + ext = extent_by_coord(coord); + state = state_of_extent(ext); + start = extent_get_start(ext); + width = extent_get_width(ext); + new_width = extent_get_width(replace); + + assert("vs-1458", state == UNALLOCATED_EXTENT || state == ALLOCATED_EXTENT); + assert("vs-1459", width >= new_width); + + if (try_to_merge_with_left(coord, ext, replace)) { + /* merged @replace with left neighbor. Current unit is either removed or narrowed */ + return 0; + } + + if (width == new_width) { + /* replace current extent with @replace */ + ON_TRACE(TRACE_EXTENT_ALLOC, "replace: [%llu %llu]->[%llu %llu]\n", + start, width, + extent_get_start(replace), extent_get_width(replace)); + + znode_make_dirty(coord->node); + *ext = *replace; + return 0; + } + + /* replace @ext with @replace and padding extent; the padding keeps the state of the original unit */ + set_extent(&padd_ext, state == ALLOCATED_EXTENT ? (start + new_width) : UNALLOCATED_EXTENT_START, + width - new_width); + + /* insert_into_item will insert new units after the one @coord is set to. 
So, update key correspondingly */ + unit_key_by_coord(coord, &key); + set_key_offset(&key, (get_key_offset(&key) + new_width * current_blocksize)); + + ON_TRACE(TRACE_EXTENT_ALLOC, + "replace: [%llu %llu]->[%llu %llu][%llu %llu]\n", + start, width, + extent_get_start(replace), extent_get_width(replace), + extent_get_start(&padd_ext), extent_get_width(&padd_ext)); + + grabbed = reserve_replace(); + result = replace_extent(coord, znode_lh(coord->node), &key, init_new_extent(&item, &padd_ext, 1), + replace, COPI_DONT_SHIFT_LEFT); + + free_replace_reserved(grabbed); + return result; +} +/* Give the nodes on @protected_nodes consecutive block numbers starting at @first, mark each one relocated via unformatted_make_reloc(), unprotect it, and splice the list onto the flush queue's prepped list. @count is not used in the body. */ +static void +assign_real_blocknrs(flush_pos_t *flush_pos, reiser4_block_nr first, reiser4_block_nr count, + extent_state state, capture_list_head *protected_nodes) +{ + jnode *node; + txn_atom *atom; + flush_queue_t *fq; + + fq = pos_fq(flush_pos); + atom = atom_locked_by_fq(fq); + assert("vs-1468", atom); + + for_all_type_safe_list(capture, protected_nodes, node) { + LOCK_JNODE(node); + assert("vs-1132", ergo(state == UNALLOCATED_EXTENT, blocknr_is_fake(jnode_get_block(node)))); + assert("vs-1475", node->atom == atom); + assert("vs-1476", atomic_read(&node->x_count) > 0); + assert("vs-1412", JF_ISSET(node, JNODE_EPROTECTED)); + assert("vs-1460", !JF_ISSET(node, JNODE_EFLUSH)); + JF_CLR(node, JNODE_FLUSH_RESERVED); + jnode_set_block(node, &first); + unformatted_make_reloc(node, fq); + junprotect(node); + ON_DEBUG(node->list = FQ_LIST); + UNLOCK_JNODE(node); + first ++; + } + + capture_list_splice(&fq->prepped, protected_nodes); + UNLOCK_ATOM(atom); +} +/* Under the jnode lock: set JNODE_OVRWR on @node (which must have neither JNODE_RELOC nor JNODE_OVRWR set) and move it to the end of list @jnodes. */ +static void +make_node_ovrwr(capture_list_head *jnodes, jnode *node) +{ + LOCK_JNODE(node); + + assert ("zam-917", !JF_ISSET(node, JNODE_RELOC)); + assert ("zam-918", !JF_ISSET(node, JNODE_OVRWR)); + + JF_SET(node, JNODE_OVRWR); + capture_list_remove_clean(node); + capture_list_push_back(jnodes, node); + ON_DEBUG(node->list = OVRWR_LIST); + + UNLOCK_JNODE(node); +} + +/* put nodes of one extent (file objectid @oid, extent width 
@width) to overwrite set. Starting from the one with index + @index. If end of slum is detected (node is not found or flushprepped) - stop iterating and set flush position's + state to POS_INVALID. The loop counter starts at flush_pos->pos_in_unit, so at most @width - pos_in_unit nodes are + visited; the collected nodes are spliced onto the atom's ovrwr_nodes list */ +static void +mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid, unsigned long index, reiser4_block_nr width) +{ + unsigned long i; + reiser4_tree *tree; + jnode *node; + txn_atom *atom; + capture_list_head jnodes; + + capture_list_init(&jnodes); + + tree = current_tree; + + atom = atom_locked_by_fq(pos_fq(flush_pos)); + assert("vs-1478", atom); + + for (i = flush_pos->pos_in_unit; i < width; i ++, index ++) { + node = jlookup(tree, oid, index); + if (!node) { + flush_pos->state = POS_INVALID; + + ON_TRACE(TRACE_EXTENT_ALLOC, "node not found: (oid %llu, index %lu)\n", oid, index); + + break; + } + if (jnode_check_flushprepped(node)) { + flush_pos->state = POS_INVALID; + atomic_dec(&node->x_count); + + ON_TRACE(TRACE_EXTENT_ALLOC, "flushprepped: (oid %llu, index %lu)\n", oid, index); + + break; + } + make_node_ovrwr(&jnodes, node); + atomic_dec(&node->x_count); + } + + capture_list_splice(&atom->ovrwr_nodes, &jnodes); + UNLOCK_ATOM(atom); +} + +/* this is called by handle_pos_on_twig to proceed extent unit flush_pos->coord is set to. It is to prepare for flushing + sequence of not flushprepped nodes (slum). It supposes that slum starts at flush_pos->pos_in_unit position within the + extent. 
Slum gets to relocate set if flush_pos->leaf_relocate is set to 1 and to overwrite set otherwise (an unallocated extent always takes the relocate path). Hole extents only set the flush position state to POS_INVALID. Returns 0 or an error code */ +reiser4_internal int +alloc_extent(flush_pos_t *flush_pos) +{ + coord_t *coord; + reiser4_extent *ext; + reiser4_extent replace_ext; + oid_t oid; + reiser4_block_nr protected; + reiser4_block_nr start; + __u64 index; + __u64 width; + extent_state state; + int result; + reiser4_block_nr first_allocated; + __u64 allocated; + reiser4_key key; + block_stage_t block_stage; + + + + assert("vs-1468", flush_pos->state == POS_ON_EPOINT); + assert("vs-1469", coord_is_existing_unit(&flush_pos->coord) && item_is_extent(&flush_pos->coord)); + + coord = &flush_pos->coord; + /* + * NOTE-NIKITA + * + * Extent items are marked as "frozen" at the beginning of + * extent->tail conversion (similarly, tail items are marked as + * "frozen" during tail->extent conversion), this is necessary so that + * number of items file being converted consists of doesn't change + * during conversion and so, space reservation remains valid. + * + * Initial design was that flush just skips frozen items on the + * pretext that they will be removed soon and it, hence, makes no + * sense to allocated them. + * + * But this leads to the following deadlock: + * + * extent->tail marks items as frozen and starts conversion. It then + * calls balance_dirty_pages() that can close current transaction + * handle. So, when extent->tail continues, frozen items are not part + * of its current transaction. + * + * At this moment, node N containing frozen items is locked by some + * other thread and captured into atom. That atom switches into + * CAPTURE_WAIT state and starts flushing. Flush cannot make progress, + * because it constantly hits frozen item and falls back to repeat, + * waiting for extent->tail to finish. + * + * extent->tail cannot make progress, because it is sleeping in + * capture_fuse() trying to lock and capture N. 
+ * + * NOTE: New extent->tail conversion doesn't use "frozen" items, so + * that comment above is only of historical interest. + */ + + ext = extent_by_coord(coord); + state = state_of_extent(ext); + if (state == HOLE_EXTENT) { + flush_pos->state = POS_INVALID; + return 0; + } + + item_key_by_coord(coord, &key); + oid = get_key_objectid(&key); + index = extent_unit_index(coord) + flush_pos->pos_in_unit; + start = extent_get_start(ext); + width = extent_get_width(ext); + + assert("vs-1457", width > flush_pos->pos_in_unit); + + if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) { + protected_jnodes jnodes; + + /* relocate */ + if (flush_pos->pos_in_unit) { + /* split extent unit into two; nothing is allocated on this call */ + result = split_allocated_extent(coord, flush_pos->pos_in_unit); + flush_pos->pos_in_unit = 0; + return result; + } + ON_TRACE(TRACE_EXTENT_ALLOC, + "ALLOC: relocate: (oid %llu, index %llu) [%llu %llu] - ", + oid, index, start, width); + + /* Prevent nodes from e-flushing before allocating disk space for them. Nodes which were eflushed will be + read from their temporary locations (but not more than certain limit: JNODES_TO_UNFLUSH) and that + disk space will be freed. */ + + protected_jnodes_init(&jnodes); + + result = protect_extent_nodes(flush_pos, oid, index, width, &protected, ext, &jnodes.nodes); + if (result) { + warning("vs-1469", "Failed to protect extent. 
Should not happen\n"); + protected_jnodes_done(&jnodes); + return result; + } + if (protected == 0) { + ON_TRACE(TRACE_EXTENT_ALLOC, "nothing todo\n"); + flush_pos->state = POS_INVALID; + flush_pos->pos_in_unit = 0; + protected_jnodes_done(&jnodes); + return 0; + } + + if (state == ALLOCATED_EXTENT) + /* all protected nodes are not flushprepped, therefore + * they are counted as flush_reserved */ + block_stage = BLOCK_FLUSH_RESERVED; + else + block_stage = BLOCK_UNALLOCATED; + + /* allocate new block numbers for protected nodes */ + extent_allocate_blocks(pos_hint(flush_pos), protected, &first_allocated, &allocated, block_stage); + + ON_TRACE(TRACE_EXTENT_ALLOC, "allocated: (first %llu, cound %llu) - ", first_allocated, allocated); + + if (allocated != protected) + /* unprotect nodes which will not be + * allocated/relocated on this iteration */ + unprotect_extent_nodes(flush_pos, protected - allocated, + &jnodes.nodes); + if (state == ALLOCATED_EXTENT) { + /* on relocating - free nodes which are going to be + * relocated */ + reiser4_dealloc_blocks(&start, &allocated, BLOCK_ALLOCATED, BA_DEFER); + } + + /* assign new block numbers to protected nodes */ + assign_real_blocknrs(flush_pos, first_allocated, allocated, state, &jnodes.nodes); + + protected_jnodes_done(&jnodes); + /* prepare extent which will replace current one */ + set_extent(&replace_ext, first_allocated, allocated); + + /* adjust extent item */ + result = conv_extent(coord, &replace_ext); + /* NOTE(review): a -ENOMEM result skips this branch and the function then returns 0 below - confirm this is intended */ + if (result != 0 && result != -ENOMEM) { + warning("vs-1461", "Failed to allocate extent. 
Should not happen\n"); + return result; + } + } else { + /* overwrite */ + ON_TRACE(TRACE_EXTENT_ALLOC, + "ALLOC: overwrite: (oid %llu, index %llu) [%llu %llu]\n", + oid, index, start, width); + mark_jnodes_overwrite(flush_pos, oid, index, width); + } + flush_pos->pos_in_unit = 0; + return 0; +} + +/* return 0 if @key is glueable to the item @coord is set to (an extent item whose append key equals @key), 1 if a new item must be inserted */ +static int +must_insert(const coord_t *coord, const reiser4_key *key) +{ + reiser4_key last; + + if (item_id_by_coord(coord) == EXTENT_POINTER_ID && keyeq(append_key_extent(coord, &last), key)) + return 0; + return 1; +} + + /* copy extent @copy_ext to the end of @node. It may have to either insert new item after the last one, or append last item, + or modify last unit of last item to have greater width. Returns 0 or -E_NODE_FULL (asserted below) */ +static int +put_unit_to_end(znode *node, const reiser4_key *key, reiser4_extent *copy_ext) +{ + int result; + coord_t coord; + cop_insert_flag flags; + reiser4_extent *last_ext; + reiser4_item_data data; + + /* set coord after last unit in an item */ + coord_init_last_unit(&coord, node); + coord.between = AFTER_UNIT; + + flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE; + if (must_insert(&coord, key)) { + result = insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1), key, 0 /*lh */ , flags); + + } else { + /* try to glue with last unit: its state must be non-zero (HOLE_EXTENT is the zero enumerator) and it must end exactly where @copy_ext starts */ + last_ext = extent_by_coord(&coord); + if (state_of_extent(last_ext) && + extent_get_start(last_ext) + extent_get_width(last_ext) == extent_get_start(copy_ext)) { + /* widen last unit of node */ + extent_set_width(last_ext, extent_get_width(last_ext) + extent_get_width(copy_ext)); + znode_make_dirty(node); + return 0; + } + + /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */ + result = insert_into_item(&coord, 0 /*lh */ , key, init_new_extent(&data, copy_ext, 1), flags); + } + + assert("vs-438", result == 0 || result == -E_NODE_FULL); + return result; +} + +/* @coord is set to extent unit */ +reiser4_internal 
squeeze_result +squalloc_extent(znode *left, const coord_t *coord, flush_pos_t *flush_pos, reiser4_key *stop_key) +{ /* Copy the extent unit @coord is set to onto the end of @left, either relocated to newly allocated blocks (leaf_relocate set or unit unallocated) or as it is (overwrite). Returns SQUEEZE_TARGET_FULL when @left has no room, SQUEEZE_CONTINUE after a successful copy with *stop_key set to the key just past the copied blocks; the result of protect_extent_nodes() is returned on failure and 0 when nothing could be protected. */ + reiser4_extent *ext; + __u64 index; + __u64 width; + reiser4_block_nr start; + extent_state state; + oid_t oid; + reiser4_block_nr first_allocated; + __u64 allocated; + __u64 protected; + reiser4_extent copy_extent; + reiser4_key key; + int result; + block_stage_t block_stage; + + assert("vs-1457", flush_pos->pos_in_unit == 0); + assert("vs-1467", coord_is_leftmost_unit(coord)); + assert("vs-1467", item_is_extent(coord)); + + ext = extent_by_coord(coord); + index = extent_unit_index(coord); + start = extent_unit_start(coord); + width = extent_unit_width(coord); + state = state_of_extent(ext); + unit_key_by_coord(coord, &key); + oid = get_key_objectid(&key); + + if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) { + protected_jnodes jnodes; + + ON_TRACE(TRACE_EXTENT_ALLOC, "SQUALLOC: relocate: (oid %llu, index %llu) [%llu %llu] - ", + oid, index, start, width); + + /* relocate */ + protected_jnodes_init(&jnodes); + result = protect_extent_nodes(flush_pos, oid, index, extent_get_width(ext), &protected, ext, &jnodes.nodes); + if (result) { + warning("vs-1469", "Failed to protect extent. 
Should not happen\n"); + protected_jnodes_done(&jnodes); + return result; + } + if (protected == 0) { + flush_pos->state = POS_INVALID; + protected_jnodes_done(&jnodes); + return 0; + } + + if (state == ALLOCATED_EXTENT) + /* all protected nodes are not flushprepped, therefore + * they are counted as flush_reserved */ + block_stage = BLOCK_FLUSH_RESERVED; + else + block_stage = BLOCK_UNALLOCATED; + + /* allocate new block numbers for protected nodes */ + extent_allocate_blocks(pos_hint(flush_pos), protected, &first_allocated, &allocated, block_stage); + ON_TRACE(TRACE_EXTENT_ALLOC, "allocated: (first %llu, cound %llu) - ", first_allocated, allocated); + if (allocated != protected) + unprotect_extent_nodes(flush_pos, protected - allocated, + &jnodes.nodes); + + /* prepare extent which will be copied to left */ + set_extent(&copy_extent, first_allocated, allocated); + + result = put_unit_to_end(left, &key, &copy_extent); + if (result == -E_NODE_FULL) { + int target_block_stage; + + /* free blocks which were just allocated */ + ON_TRACE(TRACE_EXTENT_ALLOC, + "left is full, free (first %llu, count %llu)\n", + first_allocated, allocated); + target_block_stage = (state == ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED : BLOCK_UNALLOCATED; + reiser4_dealloc_blocks(&first_allocated, &allocated, target_block_stage, BA_PERMANENT); + unprotect_extent_nodes(flush_pos, allocated, &jnodes.nodes); + + /* rewind the preceder. 
*/ + flush_pos->preceder.blk = first_allocated; + check_preceder(flush_pos->preceder.blk); + + protected_jnodes_done(&jnodes); + return SQUEEZE_TARGET_FULL; + } + + if (state == ALLOCATED_EXTENT) { + /* free nodes which were relocated */ + reiser4_dealloc_blocks(&start, &allocated, BLOCK_ALLOCATED, BA_DEFER); + } + + /* assign new block numbers to protected nodes */ + assign_real_blocknrs(flush_pos, first_allocated, allocated, state, &jnodes.nodes); + set_key_offset(&key, get_key_offset(&key) + (allocated << current_blocksize_bits)); + ON_TRACE(TRACE_EXTENT_ALLOC, + "copied to left: [%llu %llu]\n", first_allocated, allocated); + protected_jnodes_done(&jnodes); + } else { + /* overwrite */ + ON_TRACE(TRACE_EXTENT_ALLOC, + "SQUALLOC: overwrite: (oid %llu, index %llu) [%llu %llu] - ", oid, index, start, width); + + /* overwrite: try to copy unit as it is to left neighbor and make all first not flushprepped nodes + overwrite nodes */ + set_extent(&copy_extent, start, width); + result = put_unit_to_end(left, &key, &copy_extent); + if (result == -E_NODE_FULL) { + ON_TRACE(TRACE_EXTENT_ALLOC, "left is full\n"); + return SQUEEZE_TARGET_FULL; + } + mark_jnodes_overwrite(flush_pos, oid, index, width); + set_key_offset(&key, get_key_offset(&key) + (width << current_blocksize_bits)); + ON_TRACE(TRACE_EXTENT_ALLOC, "copied to left\n"); + } + *stop_key = key; + return SQUEEZE_CONTINUE; +} +/* Build @key for offset @off of @inode by delegating to key_by_inode_and_offset_common(). */ +reiser4_internal int +key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key *key) +{ + return key_by_inode_and_offset_common(inode, off, key); +} + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/extent.h linux-2.6.4-ck1/fs/reiser4/plugin/item/extent.h --- linux-2.6.4/fs/reiser4/plugin/item/extent.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/extent.h 2004-03-11 22:45:15.320505388 +1100 @@ -0,0 +1,176 @@ 
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#ifndef __REISER4_EXTENT_H__ +#define __REISER4_EXTENT_H__ + +/* on disk extent: start and width fields, read and written through dblock_to_cpu()/cpu_to_dblock() by the accessors below */ +typedef struct { + reiser4_dblock_nr start; + reiser4_dblock_nr width; +} reiser4_extent; + +typedef struct extent_stat { + int unallocated_units; + int unallocated_blocks; + int allocated_units; + int allocated_blocks; + int hole_units; + int hole_blocks; +} extent_stat; + +/* extents in an extent item can be either holes, or unallocated or allocated + extents */ +typedef enum { + HOLE_EXTENT, + UNALLOCATED_EXTENT, + ALLOCATED_EXTENT +} extent_state; +/* special values of the start field; start values 0 and 1 are exempt from the block-count range assertion in extent_set_start() */ +#define HOLE_EXTENT_START 0 +#define UNALLOCATED_EXTENT_START 1 +#define UNALLOCATED_EXTENT_START2 2 + +typedef struct { + reiser4_block_nr pos_in_unit; + reiser4_block_nr width; /* width of current unit */ + pos_in_node_t nr_units; /* number of units */ + reiser4_extent *ext; /* */ + unsigned long expected_page; +#if REISER4_DEBUG + reiser4_extent extent; +#endif +} extent_coord_extension_t; + +/* inline accessors to set/get fields of on-disk extent */ +static inline reiser4_block_nr +extent_get_start(const reiser4_extent * ext) +{ + return dblock_to_cpu(&ext->start); +} + +static inline reiser4_block_nr +extent_get_width(const reiser4_extent * ext) +{ + return dblock_to_cpu(&ext->width); +} + +extern __u64 reiser4_current_block_count(void); + +static inline void +extent_set_start(reiser4_extent * ext, reiser4_block_nr start) +{ + cassert(sizeof (ext->start) == 8); + assert("nikita-2510", ergo(start > 1, start < reiser4_current_block_count())); + cpu_to_dblock(start, &ext->start); +} + +static inline void +extent_set_width(reiser4_extent *ext, reiser4_block_nr width) +{ + cassert(sizeof (ext->width) == 8); + cpu_to_dblock(width, &ext->width); + /* checked after the store: for start values above 1, start + width must not exceed the current block count */ + assert("nikita-2511", + ergo(extent_get_start(ext) > 1, + extent_get_start(ext) + width <= reiser4_current_block_count())); +} +/* body of the item @coord is in, as an array of reiser4_extent; asserts that the item is an extent */ +#define extent_item(coord) \ +({ \ + assert("nikita-3143", item_is_extent(coord)); 
\ + ((reiser4_extent *)item_body_by_coord (coord)); \ +}) +/* pointer to the extent unit @coord is set to: extent_item(coord) + coord->unit_pos */ +#define extent_by_coord(coord) \ +({ \ + assert("nikita-3144", item_is_extent(coord)); \ + (extent_item (coord) + (coord)->unit_pos); \ +}) +/* width of the extent unit @coord is set to */ +#define width_by_coord(coord) \ +({ \ + assert("nikita-3145", item_is_extent(coord)); \ + extent_get_width (extent_by_coord(coord)); \ +}) + +struct carry_cut_data; +struct carry_kill_data; + +/* plugin->u.item.b.* */ +reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *); +int can_contain_key_extent(const coord_t * coord, const reiser4_key * key, const reiser4_item_data *); +int mergeable_extent(const coord_t * p1, const coord_t * p2); +pos_in_node_t nr_units_extent(const coord_t *); +lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *); +void init_coord_extent(coord_t *); +int init_extent(coord_t *, reiser4_item_data *); +int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *); +int can_shift_extent(unsigned free_space, + coord_t * source, znode * target, shift_direction, unsigned *size, unsigned want); +void copy_units_extent(coord_t * target, + coord_t * source, + unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space); +int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count, struct carry_kill_data *); +int create_hook_extent(const coord_t * coord, void *arg); +int cut_units_extent(coord_t *coord, pos_in_node_t from, pos_in_node_t to, + struct carry_cut_data *, reiser4_key *smallest_removed, reiser4_key *new_first); +int kill_units_extent(coord_t *coord, pos_in_node_t from, pos_in_node_t to, + struct carry_kill_data *, reiser4_key *smallest_removed, reiser4_key *new_first); +reiser4_key *unit_key_extent(const coord_t *, reiser4_key *); +reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *); +void print_extent(const char *, coord_t *); +void show_extent(struct seq_file *m, coord_t *coord); +int utmost_child_extent(const coord_t * coord, sideof side, 
jnode ** child); +int utmost_child_real_block_extent(const coord_t * coord, sideof side, reiser4_block_nr * block); +void item_stat_extent(const coord_t * coord, void *vp); +int check_extent(const coord_t * coord, const char **error); + +/* plugin->u.item.s.file.* */ +int write_extent(struct inode *, flow_t *, hint_t *, int grabbed, write_mode_t); +int read_extent(struct file *, flow_t *, hint_t *); +int readpage_extent(void *, struct page *); +void readpages_extent(void *, struct address_space *, struct list_head *pages); +int capture_extent(reiser4_key *, uf_coord_t *, struct page *, write_mode_t); +reiser4_key *append_key_extent(const coord_t *, reiser4_key *); +void init_coord_extension_extent(uf_coord_t *, loff_t offset); +int get_block_address_extent(const uf_coord_t *, sector_t block, struct buffer_head *); + + +/* these are used in flush.c + FIXME-VS: should they be somewhere in item_plugin? */ +int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos); +int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos, reiser4_key * stop_key); + +int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */ +int extent_is_allocated(const coord_t *); +__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */ +__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */ +reiser4_block_nr extent_unit_start(const coord_t * item); /* Starting block location of this unit. */ + +/* plugin->u.item.f. 
*/ +int scan_extent (flush_scan * scan); +extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *); + +reiser4_item_data *init_new_extent(reiser4_item_data *data, void *ext_unit, int nr_extents); +reiser4_block_nr extent_size(const coord_t *coord, pos_in_node_t nr); +extent_state state_of_extent(reiser4_extent *ext); +void set_extent(reiser4_extent *ext, reiser4_block_nr start, reiser4_block_nr width); +int replace_extent(coord_t *un_extent, lock_handle *lh, + reiser4_key *key, reiser4_item_data *data, const reiser4_extent *new_ext, unsigned flags); +lock_handle *znode_lh(znode *); + +/* the reiser4 repacker support */ +struct repacker_cursor; +extern int process_extent_backward_for_repacking (tap_t *, struct repacker_cursor *); +extern int mark_extent_for_repacking (tap_t *, int); + +/* __REISER4_EXTENT_H__ */ +#endif +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.4-ck1/fs/reiser4/plugin/item/extent_item_ops.c --- linux-2.6.4/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/extent_item_ops.c 2004-03-11 22:45:15.327504300 +1100 @@ -0,0 +1,838 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "item.h" +#include "../../inode.h" +#include "../../tree_walk.h" /* check_sibling_list() */ +#include "../../page_cache.h" +#include "../../carry.h" + +/* item_plugin->b.max_key_inside */ +reiser4_internal reiser4_key * +max_key_inside_extent(const coord_t *coord, reiser4_key *key) +{ + item_key_by_coord(coord, key); + set_key_offset(key, get_key_offset(max_key())); + return key; +} + +/* item_plugin->b.can_contain_key + this checks whether @key of @data is matching to position set by @coord */ +reiser4_internal int +can_contain_key_extent(const coord_t *coord, const reiser4_key 
*key, const reiser4_item_data *data) +{ + reiser4_key item_key; + + if (item_plugin_by_coord(coord) != data->iplug) + return 0; + + item_key_by_coord(coord, &item_key); + if (get_key_locality(key) != get_key_locality(&item_key) || + get_key_objectid(key) != get_key_objectid(&item_key) || + get_key_ordering(key) != get_key_ordering(&item_key)) return 0; + + return 1; +} + +/* item_plugin->b.mergeable + first item is of extent type */ +/* Audited by: green(2002.06.13) */ +reiser4_internal int +mergeable_extent(const coord_t *p1, const coord_t *p2) +{ + reiser4_key key1, key2; + + assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID); + /* FIXME-VS: Which is it? Assert or return 0 */ + if (item_id_by_coord(p2) != EXTENT_POINTER_ID) { + return 0; + } + + item_key_by_coord(p1, &key1); + item_key_by_coord(p2, &key2); + if (get_key_locality(&key1) != get_key_locality(&key2) || + get_key_objectid(&key1) != get_key_objectid(&key2) || + get_key_ordering(&key1) != get_key_ordering(&key2) || + get_key_type(&key1) != get_key_type(&key2)) + return 0; + if (get_key_offset(&key1) + extent_size(p1, nr_units_extent(p1)) != get_key_offset(&key2)) + return 0; + return 1; +} + +/* item_plugin->b.show */ +reiser4_internal void +show_extent(struct seq_file *m, coord_t *coord) +{ + reiser4_extent *ext; + ext = extent_by_coord(coord); + seq_printf(m, "%Lu %Lu", extent_get_start(ext), extent_get_width(ext)); +} + + +#if REISER4_DEBUG_OUTPUT + +/* Audited by: green(2002.06.13) */ +static const char * +state2label(extent_state state) +{ + const char *label; + + label = 0; + switch (state) { + case HOLE_EXTENT: + label = "hole"; + break; + + case UNALLOCATED_EXTENT: + label = "unalloc"; + break; + + case ALLOCATED_EXTENT: + label = "alloc"; + break; + } + assert("vs-376", label); + return label; +} + +/* item_plugin->b.print */ +reiser4_internal void +print_extent(const char *prefix, coord_t *coord) +{ + reiser4_extent *ext; + unsigned i, nr; + + if (prefix) + printk("%s:", prefix); + + 
nr = nr_units_extent(coord); + ext = (reiser4_extent *) item_body_by_coord(coord); + + printk("%u: ", nr); + for (i = 0; i < nr; i++, ext++) { + printk("[%Lu (%Lu) %s]", extent_get_start(ext), extent_get_width(ext), state2label(state_of_extent(ext))); + } + printk("\n"); +} + +/* item_plugin->b.item_stat */ +reiser4_internal void +item_stat_extent(const coord_t *coord, void *vp) +{ + reiser4_extent *ext; + struct extent_stat *ex_stat; + unsigned i, nr_units; + + ex_stat = (struct extent_stat *) vp; + + ext = extent_item(coord); + nr_units = nr_units_extent(coord); + + for (i = 0; i < nr_units; i++) { + switch (state_of_extent(ext + i)) { + case ALLOCATED_EXTENT: + ex_stat->allocated_units++; + ex_stat->allocated_blocks += extent_get_width(ext + i); + break; + case UNALLOCATED_EXTENT: + ex_stat->unallocated_units++; + ex_stat->unallocated_blocks += extent_get_width(ext + i); + break; + case HOLE_EXTENT: + ex_stat->hole_units++; + ex_stat->hole_blocks += extent_get_width(ext + i); + break; + default: + assert("vs-1419", 0); + } + } +} + +#endif /* REISER4_DEBUG_OUTPUT */ + +/* item_plugin->b.nr_units */ +reiser4_internal pos_in_node_t +nr_units_extent(const coord_t *coord) +{ + /* length of extent item has to be multiple of extent size */ + assert("vs-1424", (item_length_by_coord(coord) % sizeof (reiser4_extent)) == 0); + return item_length_by_coord(coord) / sizeof (reiser4_extent); +} + +/* item_plugin->b.lookup */ +reiser4_internal lookup_result +lookup_extent(const reiser4_key *key, lookup_bias bias UNUSED_ARG, coord_t *coord) +{ /* znode and item_pos are + set to an extent item to + look through */ + reiser4_key item_key; + reiser4_block_nr lookuped, offset; + unsigned i, nr_units; + reiser4_extent *ext; + unsigned blocksize; + unsigned char blocksize_bits; + + item_key_by_coord(coord, &item_key); + offset = get_key_offset(&item_key); + + /* key we are looking for must be greater than key of item @coord */ + assert("vs-414", keygt(key, &item_key)); + + 
assert("umka-99945", + !keygt(key, max_key_inside_extent(coord, &item_key))); + + ext = extent_item(coord); + assert("vs-1350", ext == coord->body); + + blocksize = current_blocksize; + blocksize_bits = current_blocksize_bits; + + /* offset we are looking for */ + lookuped = get_key_offset(key); + + nr_units = nr_units_extent(coord); + /* go through all extents until the one which address given offset */ + for (i = 0; i < nr_units; i++, ext++) { + offset += (extent_get_width(ext) << blocksize_bits); + if (offset > lookuped) { + /* desired byte is somewhere in this extent */ + coord->unit_pos = i; + coord->between = AT_UNIT; + return CBK_COORD_FOUND; + } + } + + /* set coord after last unit */ + coord->unit_pos = nr_units - 1; + coord->between = AFTER_UNIT; + return CBK_COORD_FOUND; +} + +/* item_plugin->b.paste + item @coord is set to has been appended with @data->length of free + space. data->data contains data to be pasted into the item in position + @coord->in_item.unit_pos. It must fit into that free space. + @coord must be set between units. 
+*/ +reiser4_internal int +paste_extent(coord_t *coord, reiser4_item_data *data, carry_plugin_info *info UNUSED_ARG) +{ + unsigned old_nr_units; + reiser4_extent *ext; + int item_length; + + ext = extent_item(coord); + item_length = item_length_by_coord(coord); + old_nr_units = (item_length - data->length) / sizeof (reiser4_extent); + + /* this is also used to copy extent into newly created item, so + old_nr_units could be 0 */ + assert("vs-260", item_length >= data->length); + + /* make sure that coord is set properly */ + assert("vs-35", ((!coord_is_existing_unit(coord)) || (!old_nr_units && !coord->unit_pos))); + + /* first unit to be moved */ + switch (coord->between) { + case AFTER_UNIT: + coord->unit_pos++; /* fall through */ + case BEFORE_UNIT: + coord->between = AT_UNIT; + break; + case AT_UNIT: + assert("vs-331", !old_nr_units && !coord->unit_pos); + break; + default: + impossible("vs-330", "coord is set improperly"); + } + + /* prepare space for new units */ + xmemmove(ext + coord->unit_pos + data->length / sizeof (reiser4_extent), + ext + coord->unit_pos, (old_nr_units - coord->unit_pos) * sizeof (reiser4_extent)); + + /* copy new data from kernel space */ + assert("vs-556", data->user == 0); + xmemcpy(ext + coord->unit_pos, data->data, (unsigned) data->length); + + /* after paste @coord is set to first of pasted units */ + assert("vs-332", coord_is_existing_unit(coord)); + assert("vs-333", !memcmp(data->data, extent_by_coord(coord), (unsigned) data->length)); + return 0; +} + +/* item_plugin->b.can_shift: stores in *size the byte count that may be shifted (whole units only, capped by @free_space and by @want units); returns it as a unit count */ +reiser4_internal int +can_shift_extent(unsigned free_space, coord_t *source, + znode *target UNUSED_ARG, shift_direction pend UNUSED_ARG, unsigned *size, unsigned want) +{ + *size = item_length_by_coord(source); + if (*size > free_space) + /* never split a unit of extent item */ + *size = free_space - free_space % sizeof (reiser4_extent); + + /* we can shift *size bytes, calculate how many do we want to shift */ + if (*size > want * sizeof (reiser4_extent)) + *size = 
want * sizeof (reiser4_extent); + + if (*size % sizeof (reiser4_extent) != 0) + impossible("vs-119", "Wrong extent size: %u %u", *size, (unsigned) sizeof (reiser4_extent)); + return *size / sizeof (reiser4_extent); + +} + +/* item_plugin->b.copy_units: copies @free_space bytes (@count units) of extents from @source into @target; unless shifting left, the target item key is updated first */ +reiser4_internal void +copy_units_extent(coord_t *target, coord_t *source, + unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space) +{ + char *from_ext, *to_ext; + + assert("vs-217", free_space == count * sizeof (reiser4_extent)); + + from_ext = item_body_by_coord(source); + to_ext = item_body_by_coord(target); + + if (where_is_free_space == SHIFT_LEFT) { + assert("vs-215", from == 0); + + /* At this moment, item length was already updated in the item + header by shifting code, hence nr_units_extent() will + return "new" number of units---one we obtain after copying + units. + */ + to_ext += (nr_units_extent(target) - count) * sizeof (reiser4_extent); + } else { + reiser4_key key; + coord_t coord; + + assert("vs-216", from + count == coord_last_unit_pos(source) + 1); + + from_ext += item_length_by_coord(source) - free_space; + + /* new units are inserted before first unit in an item, + therefore, we have to update item key */ + coord = *source; + coord.unit_pos = from; + unit_key_extent(&coord, &key); + + node_plugin_by_node(target->node)->update_item_key(target, &key, 0/*info */); + } + + xmemcpy(to_ext, from_ext, free_space); +} + +/* item_plugin->b.create_hook + @arg is znode of leaf node for which we need to update right delimiting key */ +reiser4_internal int +create_hook_extent(const coord_t *coord, void *arg) +{ + coord_t *child_coord; + znode *node; + reiser4_key key; + reiser4_tree *tree; + + if (!arg) + return 0; + + child_coord = arg; + tree = znode_get_tree(coord->node); + + assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL); + + WLOCK_DK(tree); + WLOCK_TREE(tree); + /* find a node on the left level for which right delimiting key has to + be updated */ + if 
(coord_wrt(child_coord) == COORD_ON_THE_LEFT) { + assert("vs-411", znode_is_left_connected(child_coord->node)); + node = child_coord->node->left; + } else { + assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT); + node = child_coord->node; + assert("nikita-3314", node != NULL); + } + + if (node != NULL) { + znode_set_rd_key(node, item_key_by_coord(coord, &key)); + + assert("nikita-3282", check_sibling_list(node)); + /* break sibling links */ + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) { + node->right->left = NULL; + node->right = NULL; + } + } + WUNLOCK_TREE(tree); + WUNLOCK_DK(tree); + return 0; +} + + +#define ITEM_TAIL_KILLED 0 +#define ITEM_HEAD_KILLED 1 +#define ITEM_KILLED 2 + +/* item_plugin->b.kill_hook + this is called when @count units starting from @from-th one are going to be removed + */ +reiser4_internal int +kill_hook_extent(const coord_t *coord, pos_in_node_t from, pos_in_node_t count, struct carry_kill_data *kdata) +{ + reiser4_extent *ext; + reiser4_block_nr start, length; + reiser4_key min_item_key, max_item_key; + reiser4_key from_key, to_key; + const reiser4_key *pfrom_key, *pto_key; + struct inode *inode; + reiser4_tree *tree; + znode *left; + znode *right; + pgoff_t from_off, to_off, offset, skip; + int retval; + + assert ("zam-811", znode_is_write_locked(coord->node)); + assert("nikita-3315", kdata != NULL); + + item_key_by_coord(coord, &min_item_key); + max_item_key_by_coord(coord, &max_item_key); + + if (kdata->params.from_key) { + pfrom_key = kdata->params.from_key; + pto_key = kdata->params.to_key; + } else { + coord_t dup; + + assert("vs-1549", from == coord->unit_pos); + unit_key_by_coord(coord, &from_key); + pfrom_key = &from_key; + + coord_dup(&dup, coord); + dup.unit_pos = from + count - 1; + max_unit_key_by_coord(&dup, &to_key); + pto_key = &to_key; + } + + if (!keylt(pto_key, &max_item_key)) { + if (!keygt(pfrom_key, &min_item_key)) { + /* item is to be removed completely */ + if (kdata->left != NULL) { 
+ assert("nikita-3316", kdata->right != NULL); + + left = kdata->left->node; + right = kdata->right->node; + + tree = current_tree; + WLOCK_TREE(tree); + link_left_and_right(left, right); + WUNLOCK_TREE(tree); + if (right != NULL) + UNDER_RW_VOID(dk, tree, write, update_znode_dkeys(left, right)); + } + from_off = get_key_offset(&min_item_key) >> PAGE_CACHE_SHIFT; + to_off = (get_key_offset(&max_item_key) + 1) >> PAGE_CACHE_SHIFT; + retval = ITEM_KILLED; + } else { + /* tail of item is to be removed */ + from_off = (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + to_off = (get_key_offset(&max_item_key) + 1) >> PAGE_CACHE_SHIFT; + retval = ITEM_TAIL_KILLED; + } + } else { + /* head of item is to be removed */ + assert("vs-1571", keyeq(pfrom_key, &min_item_key)); + assert("vs-1572", (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0); + assert("vs-1573", ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - 1)) == 0); + from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT; + to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT; + retval = ITEM_HEAD_KILLED; + } + + inode = kdata->inode; + assert("vs-1545", inode != NULL); + if (inode != NULL) + /* take care of pages and jnodes corresponding to part of item being killed */ + reiser4_invalidate_pages(inode->i_mapping, from_off, to_off - from_off); + + ext = extent_item(coord) + from; + offset = (get_key_offset(&min_item_key) + extent_size(coord, from)) >> PAGE_CACHE_SHIFT; + + assert("vs-1551", from_off >= offset); + assert("vs-1552", from_off - offset <= extent_get_width(ext)); + skip = from_off - offset; + offset = from_off; + + while (offset < to_off) { + length = extent_get_width(ext) - skip; + if (state_of_extent(ext) == HOLE_EXTENT) { + skip = 0; + offset += length; + ext ++; + continue; + } + + + if (offset + length > to_off) { + length = to_off - offset; + } + /* FIXME */ + inode_sub_bytes(inode, PAGE_CACHE_SIZE * length); + + if (state_of_extent(ext) == UNALLOCATED_EXTENT) { 
+ /* some jnodes corresponding to this unallocated extent */ + fake_allocated2free(length, + 0 /* unformatted */); + + skip = 0; + offset += length; + ext ++; + continue; + } + + assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT); + + if (length != 0) { + start = extent_get_start(ext) + skip; + + /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed + immediately */ + reiser4_dealloc_blocks(&start, &length, 0 /* not used */, + BA_DEFER/* unformatted with defer */); + } + skip = 0; + offset += length; + ext ++; + } + return retval; +} + +/* item_plugin->b.kill_units */ +reiser4_internal int +kill_units_extent(coord_t *coord, pos_in_node_t from, pos_in_node_t to, struct carry_kill_data *kdata, + reiser4_key *smallest_removed, reiser4_key *new_first) +{ + reiser4_extent *ext; + reiser4_key item_key; + pos_in_node_t count; + reiser4_key from_key, to_key; + const reiser4_key *pfrom_key, *pto_key; + loff_t off; + int result; + + assert("vs-1541", ((kdata->params.from_key == NULL && kdata->params.to_key == NULL) || + (kdata->params.from_key != NULL && kdata->params.to_key != NULL))); + + if (kdata->params.from_key) { + pfrom_key = kdata->params.from_key; + pto_key = kdata->params.to_key; + } else { + coord_t dup; + + /* calculate key range of kill */ + assert("vs-1549", from == coord->unit_pos); + unit_key_by_coord(coord, &from_key); + pfrom_key = &from_key; + + coord_dup(&dup, coord); + dup.unit_pos = to; + max_unit_key_by_coord(&dup, &to_key); + pto_key = &to_key; + } + + item_key_by_coord(coord, &item_key); + +#if REISER4_DEBUG + { + reiser4_key max_item_key; + + max_item_key_by_coord(coord, &max_item_key); + + if (new_first) { + /* head of item is to be cut */ + assert("vs-1542", keyeq(pfrom_key, &item_key)); + assert("vs-1538", keylt(pto_key, &max_item_key)); + } else { + /* tail of item is to be cut */ + assert("vs-1540", keygt(pfrom_key, &item_key)); + assert("vs-1543", !keylt(pto_key, &max_item_key)); + } + } 
+#endif + + if (smallest_removed) + *smallest_removed = *pfrom_key; + + if (new_first) { + /* item head is cut. Item key will change. This new key is calculated here */ + assert("vs-1556", (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == (PAGE_CACHE_SIZE - 1)); + *new_first = *pto_key; + set_key_offset(new_first, get_key_offset(new_first) + 1); + } + + count = to - from + 1; + result = kill_hook_extent(coord, from, count, kdata); + if (result == ITEM_TAIL_KILLED) { + assert("vs-1553", get_key_offset(pfrom_key) >= get_key_offset(&item_key) + extent_size(coord, from)); + off = get_key_offset(pfrom_key) - (get_key_offset(&item_key) + extent_size(coord, from)); + if (off) { + /* unit @from is to be cut partially. Its width decreases */ + ext = extent_item(coord) + from; + extent_set_width(ext, (off + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT); + count --; + } + } else { + __u64 max_to_offset; + __u64 rest; + + assert("vs-1575", result == ITEM_HEAD_KILLED); + assert("", from == 0); + assert("", ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE - 1)) == 0); + assert("", get_key_offset(pto_key) + 1 > get_key_offset(&item_key) + extent_size(coord, to)); + max_to_offset = get_key_offset(&item_key) + extent_size(coord, to + 1) - 1; + assert("", get_key_offset(pto_key) <= max_to_offset); + + rest = (max_to_offset - get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT; + if (rest) { + /* unit @to is to be cut partially */ + ext = extent_item(coord) + to; + + assert("", extent_get_width(ext) > rest); + + if (state_of_extent(ext) == ALLOCATED_EXTENT) + extent_set_start(ext, extent_get_start(ext) + (extent_get_width(ext) - rest)); + + extent_set_width(ext, rest); + count --; + } + } + return count * sizeof(reiser4_extent); +} + +/* item_plugin->b.cut_units + this is too similar to kill_units_extent */ +reiser4_internal int +cut_units_extent(coord_t *coord, pos_in_node_t from, pos_in_node_t to, struct carry_cut_data *cdata, + reiser4_key *smallest_removed, reiser4_key *new_first) +{ + 
reiser4_extent *ext; + reiser4_key item_key; + pos_in_node_t count; + reiser4_key from_key, to_key; + const reiser4_key *pfrom_key, *pto_key; + loff_t off; + + assert("vs-1541", ((cdata->params.from_key == NULL && cdata->params.to_key == NULL) || + (cdata->params.from_key != NULL && cdata->params.to_key != NULL))); + + if (cdata->params.from_key) { + pfrom_key = cdata->params.from_key; + pto_key = cdata->params.to_key; + } else { + coord_t dup; + + /* calculate key range of kill */ + coord_dup(&dup, coord); + dup.unit_pos = from; + unit_key_by_coord(&dup, &from_key); + + dup.unit_pos = to; + max_unit_key_by_coord(&dup, &to_key); + + pfrom_key = &from_key; + pto_key = &to_key; + } + + assert("vs-1555", (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0); + assert("vs-1556", (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) == (PAGE_CACHE_SIZE - 1)); + + item_key_by_coord(coord, &item_key); + +#if REISER4_DEBUG + { + reiser4_key max_item_key; + + assert("vs-1584", get_key_locality(pfrom_key) == get_key_locality(&item_key)); + assert("vs-1585", get_key_type(pfrom_key) == get_key_type(&item_key)); + assert("vs-1586", get_key_objectid(pfrom_key) == get_key_objectid(&item_key)); + assert("vs-1587", get_key_ordering(pfrom_key) == get_key_ordering(&item_key)); + + max_item_key_by_coord(coord, &max_item_key); + + if (new_first != NULL) { + /* head of item is to be cut */ + assert("vs-1542", keyeq(pfrom_key, &item_key)); + assert("vs-1538", keylt(pto_key, &max_item_key)); + } else { + /* tail of item is to be cut */ + assert("vs-1540", keygt(pfrom_key, &item_key)); + assert("vs-1543", keyeq(pto_key, &max_item_key)); + } + } +#endif + + if (smallest_removed) + *smallest_removed = *pfrom_key; + + if (new_first) { + /* item head is cut. Item key will change. 
This new key is calculated here */ + *new_first = *pto_key; + set_key_offset(new_first, get_key_offset(new_first) + 1); + } + + count = to - from + 1; + + assert("vs-1553", get_key_offset(pfrom_key) >= get_key_offset(&item_key) + extent_size(coord, from)); + off = get_key_offset(pfrom_key) - (get_key_offset(&item_key) + extent_size(coord, from)); + if (off) { + /* tail of unit @from is to be cut partially. Its width decreases */ + assert("vs-1582", new_first == NULL); + ext = extent_item(coord) + from; + extent_set_width(ext, off >> PAGE_CACHE_SHIFT); + count --; + } + + assert("vs-1554", get_key_offset(pto_key) <= get_key_offset(&item_key) + extent_size(coord, to + 1) - 1); + off = (get_key_offset(&item_key) + extent_size(coord, to + 1) - 1) - get_key_offset(pto_key); + if (off) { + /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased + and width decreased. */ + assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0); + ext = extent_item(coord) + to; + if (state_of_extent(ext) == ALLOCATED_EXTENT) + extent_set_start(ext, extent_get_start(ext) + (extent_get_width(ext) - (off >> PAGE_CACHE_SHIFT))); + + extent_set_width(ext, (off >> PAGE_CACHE_SHIFT)); + count --; + } + return count * sizeof(reiser4_extent); +} + +/* item_plugin->b.unit_key: fills @key with the item key whose offset is advanced by extent_size(coord, coord->unit_pos); returns @key */ +reiser4_internal reiser4_key * +unit_key_extent(const coord_t *coord, reiser4_key *key) +{ + assert("vs-300", coord_is_existing_unit(coord)); + + item_key_by_coord(coord, key); + set_key_offset(key, (get_key_offset(key) + extent_size(coord, coord->unit_pos))); + + return key; +} + +/* item_plugin->b.max_unit_key: fills @key with the item key whose offset is advanced by extent_size(coord, coord->unit_pos + 1) - 1; returns @key */ +reiser4_internal reiser4_key * +max_unit_key_extent(const coord_t *coord, reiser4_key *key) +{ + assert("vs-300", coord_is_existing_unit(coord)); + + item_key_by_coord(coord, key); + set_key_offset(key, (get_key_offset(key) + extent_size(coord, coord->unit_pos + 1) - 1)); + return key; +} + +/* item_plugin->b.estimate + item_plugin->b.item_data_by_flow */ + +#if 
REISER4_DEBUG + +/* item_plugin->b.check + used for debugging, every item should have here the most complete + possible check of the consistency of the item that the inventor can + construct +*/ +int +check_extent(const coord_t *coord /* coord of item to check */ , + const char **error /* where to store error message */ ) +{ + reiser4_extent *ext, *first; + unsigned i, j; + reiser4_block_nr start, width, blk_cnt; + unsigned num_units; + reiser4_tree *tree; + oid_t oid; + reiser4_key key; + coord_t scan; + + assert("vs-933", REISER4_DEBUG); + + if (znode_get_level(coord->node) != TWIG_LEVEL) { + *error = "Extent on the wrong level"; + return -1; + } + if (item_length_by_coord(coord) % sizeof (reiser4_extent) != 0) { + *error = "Wrong item size"; + return -1; + } + ext = first = extent_item(coord); + blk_cnt = reiser4_block_count(reiser4_get_current_sb()); + num_units = coord_num_units(coord); + tree = znode_get_tree(coord->node); + item_key_by_coord(coord, &key); + oid = get_key_objectid(&key); + coord_dup(&scan, coord); + + for (i = 0; i < num_units; ++i, ++ext) { + __u64 index; + + scan.unit_pos = i; + index = extent_unit_index(&scan); + +#if 0 + /* check that all jnodes are present for the unallocated + * extent */ + if (state_of_extent(ext) == UNALLOCATED_EXTENT) { + for (j = 0; j < extent_get_width(ext); j ++) { + jnode *node; + + node = jlookup(tree, oid, index + j); + if (node == NULL) { + print_coord("scan", &scan, 0); + *error = "Jnode missing"; + return -1; + } + jput(node); + } + } +#endif + + start = extent_get_start(ext); + if (start < 2) + continue; + /* extent is allocated one */ + width = extent_get_width(ext); + if (start >= blk_cnt) { + *error = "Start too large"; + return -1; + } + if (start + width > blk_cnt) { + *error = "End too large"; + return -1; + } + /* make sure that this extent does not overlap with other + allocated extents extents */ + for (j = 0; j < i; j++) { + if (state_of_extent(first + j) != ALLOCATED_EXTENT) + continue; + if 
(!((extent_get_start(ext) >= extent_get_start(first + j) + extent_get_width(first + j)) + || (extent_get_start(ext) + extent_get_width(ext) <= extent_get_start(first + j)))) { + *error = "Extent overlaps with others"; + return -1; + } + } + + } + + return 0; +} + +#endif /* REISER4_DEBUG */ + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/extent_repack_ops.c linux-2.6.4-ck1/fs/reiser4/plugin/item/extent_repack_ops.c --- linux-2.6.4/fs/reiser4/plugin/item/extent_repack_ops.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/extent_repack_ops.c 2004-03-11 22:45:15.328504144 +1100 @@ -0,0 +1,445 @@ +/* Copyright 2003 by Hans Reiser. */ + +#include "item.h" +#include "../../key.h" +#include "../../super.h" +#include "../../carry.h" +#include "../../inode.h" +#include "../../page_cache.h" +#include "../../emergency_flush.h" +#include "../../prof.h" +#include "../../flush.h" +#include "../../tap.h" +#include "../object.h" + +#include "../../repacker.h" +#include "extent.h" + +static int get_reiser4_inode_by_tap (struct inode ** result, tap_t * tap) +{ + reiser4_key ext_key; + + unit_key_by_coord(tap->coord, &ext_key); + return get_reiser4_inode_by_key(result, &ext_key); +} + +static jnode * get_jnode_by_mapping (struct inode * inode, long index) +{ + struct page * page; + jnode * node; + + page = grab_cache_page(inode->i_mapping, index); + if (page == NULL) + return ERR_PTR(-ENOMEM); + node = jnode_of_page(page); + unlock_page(page); + page_cache_release(page); + return node; +} + +static int mark_jnode_for_repacking (jnode * node) +{ + int ret = 0; + + LOCK_JNODE(node); + ret = try_capture(node, ZNODE_WRITE_LOCK, 0, 0/* no can_coc */); + if (ret) { + UNLOCK_JNODE(node); + return ret; + } + + jnode_make_dirty_locked(node); + UNLOCK_JNODE(node); + JF_SET(node, JNODE_REPACK); + + ret = 
jload(node); + if (ret == 0) { + struct page * page; + + page = jnode_page(node); + lock_page(page); + set_page_dirty_internal(page); + unlock_page(page); + jrelse(node); + } + + return ret; +} + +/* + Mark jnodes of given extent for repacking. + @tap : lock, coord and load status for the tree traversal position, + @max_nr_marked: a maximum number of nodes which can be marked for repacking, + @return: error code if < 0, number of marked nodes otherwise. +*/ +reiser4_internal int mark_extent_for_repacking (tap_t * tap, int max_nr_marked) +{ + coord_t * coord = tap->coord; + reiser4_extent *ext; + int nr_marked; + struct inode * inode; + unsigned long index, pos_in_extent; + reiser4_block_nr width, start; + int ret; + + ext = extent_by_coord(coord); + + if (state_of_extent(ext) == HOLE_EXTENT) + return 0; + + width = extent_get_width(ext); + start = extent_get_start(ext); + index = extent_unit_index(coord); + + ret = get_reiser4_inode_by_tap(&inode, tap); + if (ret) + return ret; + + for (nr_marked = 0, pos_in_extent = 0; + nr_marked < max_nr_marked && pos_in_extent < width; pos_in_extent ++) + { + jnode * node; + + node = get_jnode_by_mapping(inode, index + pos_in_extent); + if (IS_ERR(node)) { + ret = PTR_ERR(node); + break; + } + + /* Freshly created jnode has no block number set. */ + if (node->blocknr == 0) { + reiser4_block_nr block; + block = start + pos_in_extent; + jnode_set_block(node, &block); + + node->parent_item_id = EXTENT_POINTER_ID; + } + + if (!JF_ISSET(node, JNODE_REPACK)) { + do { + /* Check whether the node is already read. */ + if (!JF_ISSET(node, JNODE_PARSED)) { + ret = jstartio(node); + if (ret) + break; + } + ret = mark_jnode_for_repacking(node); + if (ret) + break; + nr_marked ++; + } while (0); + } + jput(node); + if (ret) + break; + } + + iput(inode); + if (ret) + return ret; + return nr_marked; +} + +/* Check should the repacker relocate this node. 
*/ +static int relocatable (jnode * check) +{ + return !JF_ISSET(check, JNODE_OVRWR) && !JF_ISSET(check, JNODE_RELOC); +} + +static int replace_end_of_extent (coord_t * coord, reiser4_block_nr end_part_start, + reiser4_block_nr end_part_width, int * all_replaced) +{ + reiser4_extent * ext; + reiser4_block_nr ext_start; + reiser4_block_nr ext_width; + + reiser4_item_data item; + reiser4_extent new_ext, replace_ext; + reiser4_block_nr replace_ext_width; + reiser4_key key; + + int ret; + + assert ("zam-959", item_is_extent(coord)); + + ext = extent_by_coord(coord); + ext_start = extent_get_start(ext); + ext_width = extent_get_width(ext); + + assert ("zam-960", end_part_width <= ext_width); + + replace_ext_width = ext_width - end_part_width; + if (replace_ext_width == 0) { + set_extent(ext, end_part_start, end_part_width); + znode_make_dirty(coord->node); + /* End part of extent is equal to the whole extent. */ + * all_replaced = 1; + return 0; + } + + set_extent(&replace_ext, ext_start, replace_ext_width); + set_extent(&new_ext, end_part_start, end_part_width); + + unit_key_by_coord(coord, &key); + set_key_offset(&key, get_key_offset(&key) + replace_ext_width * current_blocksize); + + { + reiser4_context * ctx = get_current_context(); + reiser4_super_info_data * sinfo = get_super_private(ctx->super); + __u64 estimated; + __u64 were_grabbed; + + were_grabbed = ctx->grabbed_blocks; + estimated = estimate_one_insert_item(&get_super_private(ctx->super)->tree); + + /* grab space for operations on internal levels. */ + ret = reiser4_grab_space( + estimated, BA_FORCE | BA_RESERVED | BA_PERMANENT | BA_FORMATTED); + if (ret) + return ret; + + ret = replace_extent( + coord, znode_lh(coord->node), &key, + init_new_extent(&item, &new_ext, 1), &replace_ext, COPI_DONT_SHIFT_LEFT); + + /* release grabbed space if it was not used. 
*/ + assert ("zam-988", ctx->grabbed_blocks >= were_grabbed); + grabbed2free(ctx, sinfo, ctx->grabbed_blocks - were_grabbed); + } + + return ret; +} + +static int make_new_extent_at_end (coord_t * coord, reiser4_block_nr width, int * all_replaced) +{ + reiser4_extent * ext; + reiser4_block_nr ext_start; + reiser4_block_nr ext_width; + reiser4_block_nr new_ext_start; + + assert ("zam-961", item_is_extent(coord)); + + ext = extent_by_coord(coord); + ext_start = extent_get_start(ext); + ext_width = extent_get_width(ext); + + assert ("zam-962", width < ext_width); + + if (state_of_extent(ext) == ALLOCATED_EXTENT) + new_ext_start = ext_start + ext_width - width; + else + new_ext_start = ext_start; + + return replace_end_of_extent(coord, new_ext_start, width, all_replaced); +} + +static void parse_extent(coord_t * coord, reiser4_block_nr * start, reiser4_block_nr * width, long * ind) +{ + reiser4_extent * ext; + + ext = extent_by_coord(coord); + *start = extent_get_start(ext); + *width = extent_get_width(ext); + *ind = extent_unit_index(coord); +} + +static int skip_not_relocatable_extent(struct inode * inode, coord_t * coord, int * done) +{ + reiser4_block_nr ext_width, ext_start; + long ext_index, reloc_start; + jnode * check = NULL; + int ret = 0; + + assert("zam-985", state_of_extent(extent_by_coord(coord))); + parse_extent(coord, &ext_start, &ext_width, &ext_index); + + for (reloc_start = ext_width - 1; reloc_start >= 0; reloc_start --) { + check = get_jnode_by_mapping(inode, reloc_start + ext_index); + if (IS_ERR(check)) + return PTR_ERR(check); + + if (check->blocknr == 0) { + reiser4_block_nr block; + block = ext_start + reloc_start; + jnode_set_block(check, &block); + + check->parent_item_id = EXTENT_POINTER_ID; + } + + if (relocatable(check)) { + jput(check); + if (reloc_start < ext_width - 1) + ret = make_new_extent_at_end(coord, ext_width - reloc_start - 1, done); + return ret; + } + jput(check); + } + *done = 1; + return 0; +} + + +static int relocate_extent 
(struct inode * inode, coord_t * coord, reiser4_blocknr_hint * hint, + int *done, reiser4_block_nr * len) +{ + reiser4_block_nr ext_width, ext_start; + long ext_index, reloc_ind; + reiser4_block_nr new_ext_width, new_ext_start, new_block; + int unallocated_flg; + int ret = 0; + + parse_extent(coord, &ext_start, &ext_width, &ext_index); + assert("zam-974", *len != 0); + + unallocated_flg = (state_of_extent(extent_by_coord(coord)) == UNALLOCATED_EXTENT); + hint->block_stage = unallocated_flg ? BLOCK_UNALLOCATED : BLOCK_FLUSH_RESERVED; + + new_ext_width = *len; + ret = reiser4_alloc_blocks(hint, &new_ext_start, &new_ext_width, BA_PERMANENT); + if (ret) + return ret; + + hint->blk = new_ext_start; + if (!unallocated_flg) { + reiser4_block_nr dealloc_ext_start; + + dealloc_ext_start = ext_start + ext_width - new_ext_width; + ret = reiser4_dealloc_blocks(&dealloc_ext_start, &new_ext_width, 0, + BA_DEFER | BA_PERMANENT); + if (ret) + return ret; + } + + new_block = new_ext_start; + for (reloc_ind = ext_width - new_ext_width; reloc_ind < ext_width; reloc_ind ++) + { + jnode * check; + + check = get_jnode_by_mapping(inode, ext_index + reloc_ind); + if (IS_ERR(check)) + return PTR_ERR(check); + + assert("zam-975", relocatable(check)); + assert("zam-986", check->blocknr != 0); + + jnode_set_block(check, &new_block); + check->parent_item_id = EXTENT_POINTER_ID; + new_block ++; + + JF_SET(check, JNODE_RELOC); + JF_SET(check, JNODE_REPACK); + + jput(check); + } + + ret = replace_end_of_extent(coord, new_ext_start, new_ext_width, done); + *len = new_ext_width; + return ret; +} + +static int find_relocatable_extent (struct inode * inode, coord_t * coord, + int * nr_reserved, reiser4_block_nr * len) +{ + reiser4_block_nr ext_width, ext_start; + long ext_index, reloc_end; + jnode * check = NULL; + int ret = 0; + + *len = 0; + parse_extent(coord, &ext_start, &ext_width, &ext_index); + + for (reloc_end = ext_width - 1; + reloc_end >= 0 && *nr_reserved > 0; reloc_end --) + { + 
assert("zam-980", get_current_context()->grabbed_blocks >= *nr_reserved); + + check = get_jnode_by_mapping(inode, reloc_end + ext_index); + if (IS_ERR(check)) + return PTR_ERR(check); + + if (check->blocknr == 0) { + reiser4_block_nr block; + block = ext_start + reloc_end; + jnode_set_block(check, &block); + } + + if (!relocatable(check)) { + assert("zam-973", reloc_end < ext_width - 1); + goto out; + } + /* add node to transaction. */ + ret = mark_jnode_for_repacking(check); + if (ret) + goto out; ; + jput(check); + + (*len) ++; + (*nr_reserved) --; + } + if (0) { + out: + jput(check); + } + return ret; +} + +static int find_and_relocate_end_of_extent ( + struct inode * inode, coord_t * coord, + struct repacker_cursor * cursor, int * done) +{ + reiser4_block_nr len; + int ret; + + ret = skip_not_relocatable_extent(inode, coord, done); + if (ret || (*done)) + return ret; + + ret = find_relocatable_extent(inode, coord, &cursor->count, &len); + if (ret) + return ret; + if (len == 0) { + *done = 1; + return 0; + } + + ret = relocate_extent(inode, coord, &cursor->hint, done, &len); + if (ret) + return ret; + cursor->stats.jnodes_dirtied += (long)len; + return 0; +} + +/* process (relocate) unformatted nodes in backward direction: from the end of extent to the its start. 
*/ +reiser4_internal int +process_extent_backward_for_repacking (tap_t * tap, struct repacker_cursor * cursor) +{ + coord_t * coord = tap->coord; + reiser4_extent *ext; + struct inode * inode = NULL; + int done = 0; + int ret; + + assert("zam-985", cursor->count > 0); + ext = extent_by_coord(coord); + if (state_of_extent(ext) == HOLE_EXTENT) + return 0; + + ret = get_reiser4_inode_by_tap(&inode, tap); + + while (!ret && !done) + ret = find_and_relocate_end_of_extent(inode, coord, cursor, &done); + + iput(inode); + return ret; +} + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/internal.c linux-2.6.4-ck1/fs/reiser4/plugin/item/internal.c --- linux-2.6.4/fs/reiser4/plugin/item/internal.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/internal.c 2004-03-11 22:45:15.329503989 +1100 @@ -0,0 +1,411 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Implementation of internal-item plugin methods. 
*/ + +#include "../../forward.h" +#include "../../debug.h" +#include "../../dformat.h" +#include "../../key.h" +#include "../../coord.h" +#include "internal.h" +#include "item.h" +#include "../node/node.h" +#include "../plugin.h" +#include "../../jnode.h" +#include "../../znode.h" +#include "../../tree_walk.h" +#include "../../tree_mod.h" +#include "../../tree.h" +#include "../../super.h" +#include "../../block_alloc.h" + +/* see internal.h for explanation */ + +/* plugin->u.item.b.mergeable */ +reiser4_internal int +mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ , + const coord_t * p2 UNUSED_ARG /* second item */ ) +{ + /* internal items are not mergeable */ + return 0; +} + +/* ->lookup() method for internal items */ +reiser4_internal lookup_result +lookup_internal(const reiser4_key * key /* key to look up */ , + lookup_bias bias UNUSED_ARG /* lookup bias */ , + coord_t * coord /* coord of item */ ) +{ + reiser4_key ukey; + + switch (keycmp(unit_key_by_coord(coord, &ukey), key)) { + default: + impossible("", "keycmp()?!"); + case LESS_THAN: + /* FIXME-VS: AFTER_ITEM used to be here. 
But with new coord + item plugin can not be taken using coord set this way */ + assert("vs-681", coord->unit_pos == 0); + coord->between = AFTER_UNIT; + case EQUAL_TO: + return CBK_COORD_FOUND; + case GREATER_THAN: + return CBK_COORD_NOTFOUND; + } +} + +/* return body of internal item at @coord */ +static internal_item_layout * +internal_at(const coord_t * coord /* coord of + * item */ ) +{ + assert("nikita-607", coord != NULL); + assert("nikita-1650", item_plugin_by_coord(coord) == item_plugin_by_id(NODE_POINTER_ID)); + return (internal_item_layout *) item_body_by_coord(coord); +} + +reiser4_internal void +update_internal(const coord_t * coord, const reiser4_block_nr * blocknr) +{ + internal_item_layout *item = internal_at(coord); + assert("nikita-2959", reiser4_blocknr_is_sane(blocknr)); + + cpu_to_dblock(*blocknr, &item->pointer); +} + +/* return child block number stored in the internal item at @coord */ +static reiser4_block_nr +pointer_at(const coord_t * coord /* coord of item */ ) +{ + assert("nikita-608", coord != NULL); + return dblock_to_cpu(&internal_at(coord)->pointer); +} + +/* get znode pointed to by internal @item */ +static znode * +znode_at(const coord_t * item /* coord of item */ , + znode * parent /* parent node */) +{ + return child_znode(item, parent, 1, 0); +} + +/* store pointer from internal item into "block". 
Implementation of
+   ->down_link() method */
+reiser4_internal void
+down_link_internal(const coord_t * coord /* coord of item */ ,
+		   const reiser4_key * key UNUSED_ARG /* key to get
+						       * pointer for */ ,
+		   reiser4_block_nr * block /* resulting block number */ )
+{
+	/* scratch key, needed by the "nikita-612" check only */
+	ON_DEBUG(reiser4_key item_key);
+
+	assert("nikita-609", coord != NULL);
+	assert("nikita-611", block != NULL);
+	assert("nikita-612",
+	       (key == NULL) ||
+	       /* twig horrors */
+	       (znode_get_level(coord->node) == TWIG_LEVEL) ||
+	       keyle(item_key_by_coord(coord, &item_key), key));
+
+	*block = pointer_at(coord);
+	assert("nikita-2960", reiser4_blocknr_is_sane(block));
+}
+
+/* Store the child's block number in @block; 0 is stored instead when that
+   number is a fake one, i.e. the block is unallocated. Always returns 0. */
+reiser4_internal int
+utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG, reiser4_block_nr * block)
+{
+	reiser4_block_nr child_blk;
+
+	assert("jmacd-2059", coord != NULL);
+
+	child_blk = pointer_at(coord);
+	*block = child_blk;
+	assert("nikita-2961", reiser4_blocknr_is_sane(block));
+
+	if (blocknr_is_fake(block))
+		*block = 0;
+
+	return 0;
+}
+
+/* Return the child.
*/ +reiser4_internal int +utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG, jnode ** childp) +{ + reiser4_block_nr block = pointer_at(coord); + znode *child; + + assert("jmacd-2059", childp != NULL); + assert("nikita-2962", reiser4_blocknr_is_sane(&block)); + + child = zlook(znode_get_tree(coord->node), &block); + + if (IS_ERR(child)) { + return PTR_ERR(child); + } + + *childp = ZJNODE(child); + + return 0; +} + +static void check_link(znode *left, znode *right) +{ + znode *scan; + + for (scan = left; scan != right; scan = scan->right) { + if (ZF_ISSET(scan, JNODE_RIP)) + break; + if (znode_is_right_connected(scan) && scan->right != NULL) { + if (ZF_ISSET(scan->right, JNODE_RIP)) + break; + assert("nikita-3285", + znode_is_left_connected(scan->right)); + assert("nikita-3265", + ergo(scan != left, + ZF_ISSET(scan, JNODE_HEARD_BANSHEE))); + assert("nikita-3284", scan->right->left == scan); + } else + break; + } +} + +reiser4_internal int check__internal(const coord_t * coord, const char **error) +{ + reiser4_block_nr blk; + znode *child; + coord_t cpy; + + blk = pointer_at(coord); + if (!reiser4_blocknr_is_sane(&blk)) { + *error = "Invalid pointer"; + return -1; + } + coord_dup(&cpy, coord); + child = znode_at(&cpy, cpy.node); + if (child != NULL) { + znode *left_child; + znode *right_child; + + left_child = right_child = NULL; + + assert("nikita-3256", znode_invariant(child)); + if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) { + left_child = znode_at(&cpy, cpy.node); + RLOCK_TREE(znode_get_tree(child)); + if (left_child != NULL) + check_link(left_child, child); + RUNLOCK_TREE(znode_get_tree(child)); + if (left_child != NULL) + zput(left_child); + } + coord_dup(&cpy, coord); + if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) { + right_child = znode_at(&cpy, cpy.node); + RLOCK_TREE(znode_get_tree(child)); + if (right_child != NULL) + check_link(child, right_child); + RUNLOCK_TREE(znode_get_tree(child)); + if (right_child != NULL) 
+ zput(right_child); + } + zput(child); + } + return 0; +} + +#if REISER4_DEBUG_OUTPUT +/* debugging aid: print human readable information about internal item at + @coord */ +reiser4_internal void +print_internal(const char *prefix /* prefix to print */ , + coord_t * coord /* coord of item to print */ ) +{ + reiser4_block_nr blk; + + blk = pointer_at(coord); + assert("nikita-2963", reiser4_blocknr_is_sane(&blk)); + printk("%s: internal: %s\n", prefix, sprint_address(&blk)); +} +#endif + +/* return true only if this item really points to "block" */ +/* Audited by: green(2002.06.14) */ +reiser4_internal int +has_pointer_to_internal(const coord_t * coord /* coord of item */ , + const reiser4_block_nr * block /* block number to + * check */ ) +{ + assert("nikita-613", coord != NULL); + assert("nikita-614", block != NULL); + + return pointer_at(coord) == *block; +} + +/* hook called by ->create_item() method of node plugin after new internal + item was just created. + + This is point where pointer to new node is inserted into tree. Initialize + parent pointer in child znode, insert child into sibling list and slum. 
+
+*/
+reiser4_internal int
+create_hook_internal(const coord_t * item /* coord of item */ ,
+		     void *arg /* child's left neighbor, if any */ )
+{
+	znode *child;
+	znode *left;
+	reiser4_tree *tree;
+
+	assert("nikita-1252", item != NULL);
+	assert("nikita-1253", item->node != NULL);
+	assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
+	assert("nikita-1450", item->unit_pos == 0);
+
+	child = znode_at(item, item->node);
+	/* child_znode() can also return NULL (shift_hook_internal() and
+	   check__internal() test for it). There is nothing to link then, and
+	   going on would dereference a NULL pointer, so report an error. */
+	if (child == NULL)
+		return RETERR(-EIO);
+	if (IS_ERR(child))
+		return PTR_ERR(child);
+
+	left = arg;
+	tree = znode_get_tree(item->node);
+	WLOCK_DK(tree);
+	WLOCK_TREE(tree);
+	assert("nikita-1400", (child->in_parent.node == NULL) || (znode_above_root(child->in_parent.node)));
+	/* parent gets one more child; child learns where its pointer is */
+	++ item->node->c_count;
+	coord_to_parent_coord(item, &child->in_parent);
+	sibling_list_insert_nolock(child, left);
+
+	assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
+	ZF_CLR(child, JNODE_ORPHAN);
+
+	ON_TRACE(TRACE_ZWEB, "create: %llx: %i [%llx]\n",
+		 *znode_get_block(item->node), item->node->c_count,
+		 *znode_get_block(child));
+
+	WUNLOCK_TREE(tree);
+	/* still under the dk lock: child takes over the right delimiting key
+	   of its left neighbor if the two differ */
+	if ((left != NULL) && !keyeq(znode_get_rd_key(left),
+				     znode_get_rd_key(child))) {
+		znode_set_rd_key(child, znode_get_rd_key(left));
+	}
+	WUNLOCK_DK(tree);
+	zput(child);
+	return 0;
+}
+
+/* hook called by ->cut_and_kill() method of node plugin just before internal
+   item is removed.
+
+   This is point where empty node is removed from the tree. Clear parent
+   pointer in child, and mark node for pending deletion.
+
+   Node will be actually deleted later and in several stages:
+
+    . when last lock on this node will be released, node will be removed from
+    the sibling list and its lock will be invalidated
+
+    . when last reference to this node will be dropped, bitmap will be updated
+    and node will be actually removed from the memory.
+
+
+*/
+reiser4_internal int
+kill_hook_internal(const coord_t * item /* coord of item */ ,
+		   pos_in_node_t from UNUSED_ARG /* start unit */ ,
+		   pos_in_node_t count UNUSED_ARG /* stop unit */,
+		   struct carry_kill_data *p UNUSED_ARG)
+{
+	znode *child;
+	reiser4_tree *tree;
+
+	assert("nikita-1222", item != NULL);
+	assert("nikita-1224", from == 0);
+	assert("nikita-1225", count == 1);
+
+	child = znode_at(item, item->node);
+	/* child_znode() can return NULL as well as an error pointer. Without
+	   a znode there is no parent pointer to clear, so do nothing, the same
+	   way shift_hook_internal() does. */
+	if (child == NULL)
+		return 0;
+	if (IS_ERR(child))
+		return PTR_ERR(child);
+
+	if (!node_is_empty(child)) {
+		warning("nikita-1223", "Cowardly refuse to remove link to non-empty node");
+		print_znode("parent", item->node);
+		print_znode("child", child);
+		zput(child);
+		return RETERR(-EIO);
+	}
+
+	assert("nikita-1397", znode_is_write_locked(child));
+	assert("nikita-1398", child->c_count == 0);
+	assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
+
+	/* detach child from its parent under the tree lock */
+	tree = znode_get_tree(item->node);
+	WLOCK_TREE(tree);
+	init_parent_coord(&child->in_parent, NULL);
+	-- item->node->c_count;
+	WUNLOCK_TREE(tree);
+	ON_TRACE(TRACE_ZWEB, "kill: %llx: %i [%llx]\n",
+		 *znode_get_block(item->node), item->node->c_count,
+		 *znode_get_block(child));
+
+	zput(child);
+	return 0;
+}
+
+/* hook called by ->shift() node plugin method when internal item was just
+   moved from one node to another.
+
+   Update parent pointer in child and c_counts in old and new parent
+
+*/
+reiser4_internal int
+shift_hook_internal(const coord_t * item /* coord of item */ ,
+		    unsigned from UNUSED_ARG /* start unit */ ,
+		    unsigned count UNUSED_ARG /* stop unit */ ,
+		    znode * old_node /* old parent */ )
+{
+	znode *child;
+	znode *new_node;
+	reiser4_tree *tree;
+
+	assert("nikita-1276", item != NULL);
+	assert("nikita-1277", from == 0);
+	assert("nikita-1278", count == 1);
+	assert("nikita-1451", item->unit_pos == 0);
+
+	new_node = item->node;
+	assert("nikita-2132", new_node != old_node);
+	tree = znode_get_tree(item->node);
+	child = child_znode(item, old_node, 1, 0);
+	/* no znode for the child: nothing to re-parent */
+	if (child == NULL)
+		return 0;
+	if (IS_ERR(child))
+		return PTR_ERR(child);
+
+	reiser4_stat_inc(tree.reparenting);
+	WLOCK_TREE(tree);
+	++ new_node->c_count;
+	assert("nikita-1395", znode_parent(child) == old_node);
+	assert("nikita-1396", old_node->c_count > 0);
+	coord_to_parent_coord(item, &child->in_parent);
+	assert("nikita-1781", znode_parent(child) == new_node);
+	assert("nikita-1782", check_tree_pointer(item, child) == NS_FOUND);
+	-- old_node->c_count;
+	WUNLOCK_TREE(tree);
+	/* trace while the reference on child is still held, zput() comes
+	   last; all block numbers are printed in hex */
+	ON_TRACE(TRACE_ZWEB, "shift: %llx: %i -> %llx: %i [%llx]\n",
+		 *znode_get_block(old_node),
+		 old_node->c_count, *znode_get_block(new_node),
+		 new_node->c_count, *znode_get_block(child));
+	zput(child);
+	return 0;
+}
+
+/* plugin->u.item.b.max_key_inside - not defined */
+
+/* plugin->u.item.b.nr_units - item.c:single_unit */
+
+/* Make Linus happy.
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/internal.h linux-2.6.4-ck1/fs/reiser4/plugin/item/internal.h --- linux-2.6.4/fs/reiser4/plugin/item/internal.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/internal.h 2004-03-11 22:45:15.329503989 +1100 @@ -0,0 +1,51 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ +/* Internal item contains down-link to the child of the internal/twig + node in a tree. It is internal items that are actually used during + tree traversal. */ + +#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ ) +#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ + +#include "../../forward.h" +#include "../../dformat.h" + +/* on-disk layout of internal item: a single block number in disk format, the pointer to the child */ +typedef struct internal_item_layout { + /* 0 */ reiser4_dblock_nr pointer; + /* 4 */ +} internal_item_layout; + +struct cut_list; + +int mergeable_internal(const coord_t * p1, const coord_t * p2); /* always 0: internal items are not mergeable */ +lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias, coord_t * coord); /* ->lookup() method for internal items */ +/* store pointer from internal item into "block".
Implementation of + ->down_link() method */ +extern void down_link_internal(const coord_t * coord, const reiser4_key * key, reiser4_block_nr * block); +extern int has_pointer_to_internal(const coord_t * coord, const reiser4_block_nr * block); /* true only if the item points to @block */ +extern int create_hook_internal(const coord_t * item, void *arg); /* @arg: child's left neighbor, if any */ +extern int kill_hook_internal(const coord_t * item, pos_in_node_t from, pos_in_node_t count, + struct carry_kill_data *); +extern int shift_hook_internal(const coord_t * item, unsigned from, unsigned count, znode * old_node); /* @old_node: old parent */ +extern void print_internal(const char *prefix, coord_t * coord); /* internal.c defines it only under REISER4_DEBUG_OUTPUT */ + +extern int utmost_child_internal(const coord_t * coord, sideof side, jnode ** child); +int utmost_child_real_block_internal(const coord_t * coord, sideof side, reiser4_block_nr * block); /* stores 0 in @block for a fake (unallocated) block number */ + +extern void update_internal(const coord_t * coord, + const reiser4_block_nr * blocknr); +/* FIXME: reiserfs has check_internal */ +extern int check__internal(const coord_t * coord, const char **error); + +/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/item.c linux-2.6.4-ck1/fs/reiser4/plugin/item/item.c --- linux-2.6.4/fs/reiser4/plugin/item/item.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/item.c 2004-03-11 22:45:15.331503678 +1100 @@ -0,0 +1,746 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* definition of item plugins.
*/ + +#include "../../forward.h" +#include "../../debug.h" +#include "../../key.h" +#include "../../coord.h" +#include "../plugin_header.h" +#include "sde.h" +/*#include "tail.h"*/ +#include "../cryptcompress.h" +#include "internal.h" +#include "item.h" +/*#include "extent.h"*/ +#include "static_stat.h" +#include "../plugin.h" +#include "../../znode.h" +#include "../../tree.h" +#include "ctail.h" + +/* return pointer to item body */ +reiser4_internal void +item_body_by_coord_hard(coord_t * coord /* coord to query */ ) +{ + assert("nikita-324", coord != NULL); + assert("nikita-325", coord->node != NULL); + assert("nikita-326", znode_is_loaded(coord->node)); + assert("nikita-3200", coord->body == NULL); + trace_stamp(TRACE_TREE); + + coord->body = node_plugin_by_node(coord->node)->item_by_coord(coord); +} + +reiser4_internal int item_body_is_valid(const coord_t * coord) +{ + return + coord->body == + node_plugin_by_node(coord->node)->item_by_coord(coord); +} + +/* return length of item at @coord */ +/* Audited by: green(2002.06.15) */ +void +check_contexts(void); +reiser4_internal pos_in_node_t +item_length_by_coord(const coord_t * coord /* coord to query */ ) +{ + int len; + + assert("nikita-327", coord != NULL); + assert("nikita-328", coord->node != NULL); + assert("nikita-329", znode_is_loaded(coord->node)); + trace_stamp(TRACE_TREE); + + len = node_plugin_by_node(coord->node)->length_by_coord(coord); + ON_DEBUG(check_contexts()); + return len; +} + +reiser4_internal void +obtain_item_plugin(const coord_t * coord) +{ + assert("nikita-330", coord != NULL); + assert("nikita-331", coord->node != NULL); + assert("nikita-332", znode_is_loaded(coord->node)); + trace_stamp(TRACE_TREE); + + coord_set_iplug((coord_t *) coord, + node_plugin_by_node(coord->node)->plugin_by_coord(coord)); + assert("nikita-2479", + coord_iplug(coord) == node_plugin_by_node(coord->node)->plugin_by_coord(coord)); +} + +/* return type of item at @coord */ +reiser4_internal item_type_id 
+item_type_by_coord(const coord_t * coord /* coord to query */ ) +{ + assert("nikita-333", coord != NULL); + assert("nikita-334", coord->node != NULL); + assert("nikita-335", znode_is_loaded(coord->node)); + assert("nikita-336", item_plugin_by_coord(coord) != NULL); + + trace_stamp(TRACE_TREE); + + return item_plugin_by_coord(coord)->b.item_type; +} + +/* return id of item */ +/* Audited by: green(2002.06.15) */ +reiser4_internal item_id +item_id_by_coord(const coord_t * coord /* coord to query */ ) +{ + assert("vs-539", coord != NULL); + assert("vs-538", coord->node != NULL); + assert("vs-537", znode_is_loaded(coord->node)); + assert("vs-536", item_plugin_by_coord(coord) != NULL); + + trace_stamp(TRACE_TREE); + + assert("vs-540", item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID); + return item_id_by_plugin(item_plugin_by_coord(coord)); +} + +/* return key of item at @coord */ +/* Audited by: green(2002.06.15) */ +reiser4_internal reiser4_key * +item_key_by_coord(const coord_t * coord /* coord to query */ , + reiser4_key * key /* result */ ) +{ + assert("nikita-338", coord != NULL); + assert("nikita-339", coord->node != NULL); + assert("nikita-340", znode_is_loaded(coord->node)); + trace_stamp(TRACE_TREE); + + return node_plugin_by_node(coord->node)->key_at(coord, key); +} + +/* this returns max key in the item */ +reiser4_internal reiser4_key * +max_item_key_by_coord(const coord_t *coord /* coord to query */ , + reiser4_key *key /* result */ ) +{ + coord_t last; + + assert("nikita-338", coord != NULL); + assert("nikita-339", coord->node != NULL); + assert("nikita-340", znode_is_loaded(coord->node)); + trace_stamp(TRACE_TREE); + + /* make coord pointing to last item's unit */ + coord_dup(&last, coord); + last.unit_pos = coord_num_units(&last) - 1; + assert("vs-1560", coord_is_existing_unit(&last)); + + max_unit_key_by_coord(&last, key); + return key; +} + +/* return key of unit at @coord */ +reiser4_internal reiser4_key * +unit_key_by_coord(const 
coord_t * coord /* coord to query */ , + reiser4_key * key /* result */ ) +{ + assert("nikita-772", coord != NULL); + assert("nikita-774", coord->node != NULL); + assert("nikita-775", znode_is_loaded(coord->node)); + trace_stamp(TRACE_TREE); + + if (item_plugin_by_coord(coord)->b.unit_key != NULL) + return item_plugin_by_coord(coord)->b.unit_key(coord, key); + else + return item_key_by_coord(coord, key); +} + +/* return the biggest key contained the unit @coord */ +reiser4_internal reiser4_key * +max_unit_key_by_coord(const coord_t * coord /* coord to query */ , + reiser4_key * key /* result */ ) +{ + assert("nikita-772", coord != NULL); + assert("nikita-774", coord->node != NULL); + assert("nikita-775", znode_is_loaded(coord->node)); + trace_stamp(TRACE_TREE); + + if (item_plugin_by_coord(coord)->b.max_unit_key != NULL) + return item_plugin_by_coord(coord)->b.max_unit_key(coord, key); + else + return unit_key_by_coord(coord, key); +} + + +/* ->max_key_inside() method for items consisting of exactly one key (like + stat-data) */ +static reiser4_key * +max_key_inside_single_key(const coord_t * coord /* coord of item */ , + reiser4_key * result /* resulting key */) +{ + assert("nikita-604", coord != NULL); + + /* coord -> key is starting key of this item and it has to be already + filled in */ + return unit_key_by_coord(coord, result); +} + +/* ->nr_units() method for items consisting of exactly one unit always */ +static pos_in_node_t +nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ ) +{ + return 1; +} + +static int +paste_no_paste(coord_t * coord UNUSED_ARG, + reiser4_item_data * data UNUSED_ARG, + carry_plugin_info * info UNUSED_ARG) +{ + return 0; +} + +/* default ->fast_paste() method */ +reiser4_internal int +agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ ) +{ + return 1; +} + +reiser4_internal int +item_can_contain_key(const coord_t * item /* coord of item */ , + const reiser4_key * key /* key to check */ , + 
const reiser4_item_data * data /* parameters of item + * being created */ ) +{ + item_plugin *iplug; + reiser4_key min_key_in_item; + reiser4_key max_key_in_item; + + assert("nikita-1658", item != NULL); + assert("nikita-1659", key != NULL); + + iplug = item_plugin_by_coord(item); + if (iplug->b.can_contain_key != NULL) + return iplug->b.can_contain_key(item, key, data); + else { + assert("nikita-1681", iplug->b.max_key_inside != NULL); + item_key_by_coord(item, &min_key_in_item); + iplug->b.max_key_inside(item, &max_key_in_item); + + /* can contain key if + min_key_in_item <= key && + key <= max_key_in_item + */ + return keyle(&min_key_in_item, key) && keyle(key, &max_key_in_item); + } +} + +/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */ +reiser4_internal int +are_items_mergeable(const coord_t * i1 /* coord of first item */ , + const coord_t * i2 /* coord of second item */ ) +{ + item_plugin *iplug; + reiser4_key k1; + reiser4_key k2; + + assert("nikita-1336", i1 != NULL); + assert("nikita-1337", i2 != NULL); + + iplug = item_plugin_by_coord(i1); + assert("nikita-1338", iplug != NULL); + + IF_TRACE(TRACE_NODES, print_key("k1", item_key_by_coord(i1, &k1))); + IF_TRACE(TRACE_NODES, print_key("k2", item_key_by_coord(i2, &k2))); + + /* NOTE-NIKITA are_items_mergeable() is also called by assertions in + shifting code when nodes are in "suspended" state. 
*/ + assert("nikita-1663", keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2))); + + if (iplug->b.mergeable != NULL) { + return iplug->b.mergeable(i1, i2); + } else if (iplug->b.max_key_inside != NULL) { + iplug->b.max_key_inside(i1, &k1); + item_key_by_coord(i2, &k2); + + /* mergeable if ->max_key_inside() >= key of i2; */ + return keyge(iplug->b.max_key_inside(i1, &k1), item_key_by_coord(i2, &k2)); + } else { + item_key_by_coord(i1, &k1); + item_key_by_coord(i2, &k2); + + return + (get_key_locality(&k1) == get_key_locality(&k2)) && + (get_key_objectid(&k1) == get_key_objectid(&k2)) && (iplug == item_plugin_by_coord(i2)); + } +} + +reiser4_internal int +item_is_extent(const coord_t * item) +{ + assert("vs-482", coord_is_existing_item(item)); + return item_id_by_coord(item) == EXTENT_POINTER_ID; +} + +reiser4_internal int +item_is_tail(const coord_t * item) +{ + assert("vs-482", coord_is_existing_item(item)); + return item_id_by_coord(item) == FORMATTING_ID; +} + +reiser4_internal int +item_is_statdata(const coord_t * item) +{ + assert("vs-516", coord_is_existing_item(item)); + return item_type_by_coord(item) == STAT_DATA_ITEM_TYPE; +} + +item_plugin item_plugins[LAST_ITEM_ID] = { + [STATIC_STAT_DATA_ID] = { + .h = { + .type_id = REISER4_ITEM_PLUGIN_TYPE, + .id = STATIC_STAT_DATA_ID, + .pops = NULL, + .label = "sd", + .desc = "stat-data", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .b = { + .item_type = STAT_DATA_ITEM_TYPE, + .max_key_inside = max_key_inside_single_key, + .can_contain_key = NULL, + .mergeable = NULL, + .nr_units = nr_units_single_unit, + .lookup = NULL, + .init = NULL, + .paste = paste_no_paste, + .fast_paste = NULL, + .can_shift = NULL, + .copy_units = NULL, + .create_hook = NULL, + .kill_hook = NULL, + .shift_hook = NULL, + .cut_units = NULL, + .kill_units = NULL, + .unit_key = NULL, + .max_unit_key = NULL, + .estimate = NULL, + .item_data_by_flow = NULL, +#if REISER4_DEBUG_OUTPUT + .print = print_sd, + .item_stat = 
item_stat_static_sd, +#endif +#if REISER4_DEBUG + .check = NULL +#endif + }, + .f = { + .utmost_child = NULL, + .utmost_child_real_block = NULL, + .update = NULL, + .scan = NULL, + .squeeze = NULL + }, + .s = { + .sd = { + .init_inode = init_inode_static_sd, + .save_len = save_len_static_sd, + .save = save_static_sd + } + } + }, + [SIMPLE_DIR_ENTRY_ID] = { + .h = { + .type_id = REISER4_ITEM_PLUGIN_TYPE, + .id = SIMPLE_DIR_ENTRY_ID, + .pops = NULL, + .label = "de", + .desc = "directory entry", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .b = { + .item_type = DIR_ENTRY_ITEM_TYPE, + .max_key_inside = max_key_inside_single_key, + .can_contain_key = NULL, + .mergeable = NULL, + .nr_units = nr_units_single_unit, + .lookup = NULL, + .init = NULL, + .paste = NULL, + .fast_paste = NULL, + .can_shift = NULL, + .copy_units = NULL, + .create_hook = NULL, + .kill_hook = NULL, + .shift_hook = NULL, + .cut_units = NULL, + .kill_units = NULL, + .unit_key = NULL, + .max_unit_key = NULL, + .estimate = NULL, + .item_data_by_flow = NULL, +#if REISER4_DEBUG_OUTPUT + .print = print_de, + .item_stat = NULL, +#endif +#if REISER4_DEBUG + .check = NULL +#endif + }, + .f = { + .utmost_child = NULL, + .utmost_child_real_block = NULL, + .update = NULL, + .scan = NULL, + .squeeze = NULL + }, + .s = { + .dir = { + .extract_key = extract_key_de, + .update_key = update_key_de, + .extract_name = extract_name_de, + .extract_file_type = extract_file_type_de, + .add_entry = add_entry_de, + .rem_entry = rem_entry_de, + .max_name_len = max_name_len_de + } + } + }, + [COMPOUND_DIR_ID] = { + .h = { + .type_id = REISER4_ITEM_PLUGIN_TYPE, + .id = COMPOUND_DIR_ID, + .pops = NULL, + .label = "cde", + .desc = "compressed directory entry", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .b = { + .item_type = DIR_ENTRY_ITEM_TYPE, + .max_key_inside = max_key_inside_cde, + .can_contain_key = can_contain_key_cde, + .mergeable = mergeable_cde, + .nr_units = nr_units_cde, + .lookup = lookup_cde, + .init = init_cde, + 
.paste = paste_cde, + .fast_paste = agree_to_fast_op, + .can_shift = can_shift_cde, + .copy_units = copy_units_cde, + .create_hook = NULL, + .kill_hook = NULL, + .shift_hook = NULL, + .cut_units = cut_units_cde, + .kill_units = kill_units_cde, + .unit_key = unit_key_cde, + .max_unit_key = unit_key_cde, + .estimate = estimate_cde, + .item_data_by_flow = NULL +#if REISER4_DEBUG_OUTPUT + , .print = print_cde, + .item_stat = NULL +#endif +#if REISER4_DEBUG + , .check = check_cde +#endif + }, + .f = { + .utmost_child = NULL, + .utmost_child_real_block = NULL, + .update = NULL, + .scan = NULL, + .squeeze = NULL + }, + .s = { + .dir = { + .extract_key = extract_key_cde, + .update_key = update_key_cde, + .extract_name = extract_name_cde, + .extract_file_type = extract_file_type_de, + .add_entry = add_entry_cde, + .rem_entry = rem_entry_cde, + .max_name_len = max_name_len_cde + } + } + }, + [NODE_POINTER_ID] = { + .h = { + .type_id = REISER4_ITEM_PLUGIN_TYPE, + .id = NODE_POINTER_ID, + .pops = NULL, + .label = "internal", + .desc = "internal item", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .b = { + .item_type = INTERNAL_ITEM_TYPE, + .max_key_inside = NULL, + .can_contain_key = NULL, + .mergeable = mergeable_internal, + .nr_units = nr_units_single_unit, + .lookup = lookup_internal, + .init = NULL, + .paste = NULL, + .fast_paste = NULL, + .can_shift = NULL, + .copy_units = NULL, + .create_hook = create_hook_internal, + .kill_hook = kill_hook_internal, + .shift_hook = shift_hook_internal, + .cut_units = NULL, + .kill_units = NULL, + .unit_key = NULL, + .max_unit_key = NULL, + .estimate = NULL, + .item_data_by_flow = NULL +#if REISER4_DEBUG_OUTPUT + , .print = print_internal, + .item_stat = NULL +#endif +#if REISER4_DEBUG + , .check = check__internal +#endif + }, + .f = { + .utmost_child = utmost_child_internal, + .utmost_child_real_block = utmost_child_real_block_internal, + .update = update_internal, + .scan = NULL, + .squeeze = NULL + }, + .s = { + .internal = { + 
.down_link = down_link_internal, + .has_pointer_to = has_pointer_to_internal + } + } + }, + [EXTENT_POINTER_ID] = { + .h = { + .type_id = REISER4_ITEM_PLUGIN_TYPE, + .id = EXTENT_POINTER_ID, + .pops = NULL, + .label = "extent", + .desc = "extent item", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .b = { + .item_type = UNIX_FILE_METADATA_ITEM_TYPE, + .max_key_inside = max_key_inside_extent, + .can_contain_key = can_contain_key_extent, + .mergeable = mergeable_extent, + .nr_units = nr_units_extent, + .lookup = lookup_extent, + .init = NULL, + .paste = paste_extent, + .fast_paste = agree_to_fast_op, + .can_shift = can_shift_extent, + .create_hook = create_hook_extent, + .copy_units = copy_units_extent, + .kill_hook = kill_hook_extent, + .shift_hook = NULL, + .cut_units = cut_units_extent, + .kill_units = kill_units_extent, + .unit_key = unit_key_extent, + .max_unit_key = max_unit_key_extent, + .estimate = NULL, + .item_data_by_flow = NULL, + .show = show_extent, +#if REISER4_DEBUG_OUTPUT + .print = print_extent, + .item_stat = item_stat_extent, +#endif +#if REISER4_DEBUG + .check = check_extent +#endif + }, + .f = { + .utmost_child = utmost_child_extent, + .utmost_child_real_block = utmost_child_real_block_extent, + .update = NULL, + .scan = scan_extent, + .squeeze = NULL, + .key_by_offset = key_by_offset_extent + }, + .s = { + .file = { + .write = write_extent, + .read = read_extent, + .readpage = readpage_extent, + .capture = capture_extent, + .get_block = get_block_address_extent, + .readpages = readpages_extent, + .append_key = append_key_extent, + .init_coord_extension = init_coord_extension_extent + } + } + }, + [FORMATTING_ID] = { + .h = { + .type_id = REISER4_ITEM_PLUGIN_TYPE, + .id = FORMATTING_ID, + .pops = NULL, + .label = "body", + .desc = "body (or tail?) 
item", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .b = { + .item_type = UNIX_FILE_METADATA_ITEM_TYPE, + .max_key_inside = max_key_inside_tail, + .can_contain_key = can_contain_key_tail, + .mergeable = mergeable_tail, + .nr_units = nr_units_tail, + .lookup = lookup_tail, + .init = NULL, + .paste = paste_tail, + .fast_paste = agree_to_fast_op, + .can_shift = can_shift_tail, + .create_hook = NULL, + .copy_units = copy_units_tail, + .kill_hook = kill_hook_tail, + .shift_hook = NULL, + .cut_units = cut_units_tail, + .kill_units = kill_units_tail, + .unit_key = unit_key_tail, + .max_unit_key = unit_key_tail, + .estimate = NULL, + .item_data_by_flow = NULL, + .show = show_tail, +#if REISER4_DEBUG_OUTPUT + .print = NULL, + .item_stat = NULL, +#endif +#if REISER4_DEBUG + .check = NULL +#endif + }, + .f = { + .utmost_child = NULL, + .utmost_child_real_block = NULL, + .update = NULL, + .scan = NULL, + .squeeze = NULL + }, + .s = { + .file = { + .write = write_tail, + .read = read_tail, + .readpage = readpage_tail, + .capture = NULL, + .get_block = NULL, + .readpages = NULL, + .append_key = append_key_tail, + .init_coord_extension = init_coord_extension_tail + } + } + }, + [CTAIL_ID] = { + .h = { + .type_id = REISER4_ITEM_PLUGIN_TYPE, + .id = CTAIL_ID, + .pops = NULL, + .label = "ctail", + .desc = "cryptcompress tail item", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .b = { + .item_type = UNIX_FILE_METADATA_ITEM_TYPE, + .max_key_inside = max_key_inside_tail, + .can_contain_key = can_contain_key_ctail, + .mergeable = mergeable_ctail, + .nr_units = nr_units_ctail, + .lookup = NULL, + .init = init_ctail, + .paste = paste_ctail, + .fast_paste = agree_to_fast_op, + .can_shift = can_shift_ctail, + .create_hook = NULL, + .copy_units = copy_units_ctail, + .kill_hook = kill_hook_ctail, + .shift_hook = shift_hook_ctail, + .cut_units = cut_units_ctail, + .kill_units = kill_units_ctail, + .unit_key = unit_key_tail, + .max_unit_key = unit_key_tail, + .estimate = estimate_ctail, + 
.item_data_by_flow = NULL +#if REISER4_DEBUG_OUTPUT + , .print = print_ctail, + .item_stat = NULL +#endif +#if REISER4_DEBUG + , .check = NULL +#endif + }, + .f = { + .utmost_child = utmost_child_ctail, + /* FIXME-EDWARD: write this */ + .utmost_child_real_block = NULL, + .update = NULL, + .scan = scan_ctail, + .squeeze = squeeze_ctail + }, + .s = { + .file = { + .write = NULL, + .read = read_ctail, + .readpage = readpage_ctail, + .capture = NULL, + .get_block = get_block_address_tail, + .readpages = readpages_ctail, + .append_key = append_key_ctail, + .init_coord_extension = init_coord_extension_tail + } + } + }, + [BLACK_BOX_ID] = { + .h = { + .type_id = REISER4_ITEM_PLUGIN_TYPE, + .id = BLACK_BOX_ID, + .pops = NULL, + .label = "blackbox", + .desc = "black box item", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .b = { + .item_type = OTHER_ITEM_TYPE, + .max_key_inside = NULL, + .can_contain_key = NULL, + .mergeable = NULL, + .nr_units = nr_units_single_unit, + /* to need for ->lookup method */ + .lookup = NULL, + .init = NULL, + .paste = NULL, + .fast_paste = NULL, + .can_shift = NULL, + .copy_units = NULL, + .create_hook = NULL, + .kill_hook = NULL, + .shift_hook = NULL, + .cut_units = NULL, + .kill_units = NULL, + .unit_key = NULL, + .max_unit_key = NULL, + .estimate = NULL, + .item_data_by_flow = NULL, +#if REISER4_DEBUG_OUTPUT + .print = NULL, + .item_stat = NULL, +#endif +#if REISER4_DEBUG + .check = NULL +#endif + } + } +}; + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/item.h linux-2.6.4-ck1/fs/reiser4/plugin/item/item.h --- linux-2.6.4/fs/reiser4/plugin/item/item.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/item.h 2004-03-11 22:45:15.332503522 +1100 @@ -0,0 +1,391 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* first read balance.c comments before reading this */ + +/* An item_plugin implements all of the operations required for + balancing that are item specific. */ + +/* an item plugin also implements other operations that are specific to that + item. These go into the item specific operations portion of the item + handler, and all of the item specific portions of the item handler are put + into a union. */ + +#if !defined( __REISER4_ITEM_H__ ) +#define __REISER4_ITEM_H__ + +#include "../../forward.h" +#include "../plugin_header.h" +#include "../../dformat.h" +#include "../../seal.h" +#include "../../plugin/file/file.h" + +#include /* for struct file, struct inode */ +#include /* for struct page */ +#include /* for struct dentry */ + +typedef enum { + STAT_DATA_ITEM_TYPE, + DIR_ENTRY_ITEM_TYPE, + INTERNAL_ITEM_TYPE, + UNIX_FILE_METADATA_ITEM_TYPE, + OTHER_ITEM_TYPE +} item_type_id; + + +/* this is the part of each item plugin that all items are expected to + support or at least explicitly fail to support by setting the + pointer to null. */ +typedef struct { + item_type_id item_type; + + /* operations called by balancing + + It is interesting to consider that some of these item + operations could be given sources or targets that are not + really items in nodes. This could be ok/useful. 
+ + */ + /* maximal key that can _possibly_ be occupied by this item + + When inserting, and node ->lookup() method (called by + coord_by_key()) reaches an item after binary search, + the ->max_key_inside() item plugin method is used to determine + whether new item should pasted into existing item + (new_key<=max_key_inside()) or new item has to be created + (new_key>max_key_inside()). + + For items that occupy exactly one key (like stat-data) + this method should return this key. For items that can + grow indefinitely (extent, directory item) this should + return max_key(). + + For example extent with the key + + (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks, + + ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff), and + */ + reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *); + + /* true if item @coord can merge data at @key. */ + int (*can_contain_key) (const coord_t *, const reiser4_key *, const reiser4_item_data *); + /* mergeable() - check items for mergeability + + Optional method. Returns true if two items can be merged. + + */ + int (*mergeable) (const coord_t *, const coord_t *); + + /* number of atomic things in an item */ + pos_in_node_t (*nr_units) (const coord_t *); + + /* search within item for a unit within the item, and return a + pointer to it. This can be used to calculate how many + bytes to shrink an item if you use pointer arithmetic and + compare to the start of the item body if the item's data + are continuous in the node, if the item's data are not + continuous in the node, all sorts of other things are maybe + going to break as well. 
*/
+	lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
+	/* method called by node_plugin->create_item() to initialise new
+	   item */
+	int (*init) (coord_t * target, coord_t * from, reiser4_item_data * data);
+	/* method called (e.g., by resize_item()) to place new data into
+	   item when it grows */
+	int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
+	/* return true if paste into @coord is allowed to skip
+	   carry. That is, if such paste would require any changes
+	   at the parent level
+	   NOTE(review): "would require" presumably should read "would
+	   not require" - confirm against the callers of ->fast_paste().
+	*/
+	int (*fast_paste) (const coord_t *);
+	/* how many but not more than @want units of @source can be
+	   shifted into @target node. If pend == append - we try to
+	   append last item of @target by first units of @source. If
+	   pend == prepend - we try to "prepend" first item in @target
+	   by last units of @source. @target node has @free_space
+	   bytes of free space. Total size of those units is returned
+	   via @size.
+
+	   @target is not NULL if shifting to the mergeable item and
+	   NULL if a new item will be created during shifting.
+	*/
+	int (*can_shift) (unsigned free_space, coord_t *,
+			  znode *, shift_direction, unsigned *size, unsigned want);
+
+	/* starting off @from-th unit of item @source append or
+	   prepend @count units to @target. @target has been already
+	   expanded by @free_space bytes. That must be exactly what is
+	   needed for those items in @target. If @where_is_free_space
+	   == SHIFT_LEFT - free space is at the end of @target item,
+	   otherwise - it is in the beginning of it.
*/ + void (*copy_units) (coord_t *, coord_t *, + unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space); + + int (*create_hook) (const coord_t *, void *); + /* do whatever is necessary to do when @count units starting + from @from-th one are removed from the tree */ + /* FIXME-VS: this is used to be here for, in particular, + extents and items of internal type to free blocks they point + to at the same time with removing items from a + tree. Problems start, however, when dealloc_block fails due + to some reason. Item gets removed, but blocks it pointed to + are not freed. It is not clear how to fix this for items of + internal type because a need to remove internal item may + appear in the middle of balancing, and there is no way to + undo changes made. OTOH, if space allocator involves + balancing to perform dealloc_block - this will probably + break balancing due to deadlock issues + */ + int (*kill_hook) (const coord_t *, pos_in_node_t from, pos_in_node_t count, struct carry_kill_data *); + int (*shift_hook) (const coord_t *, unsigned from, unsigned count, znode *_node); + + /* unit @*from contains @from_key. unit @*to contains + @to_key. Cut all keys between @from_key and @to_key + including boundaries. Set @from and @to to number of units + which were removed. When units are cut from item beginning - + move space which gets freed to head of item. When units are + cut from item end - move freed space to item end. When units + are cut from the middle of item - move freed space to item + head. Return amount of space which got freed. 
Save smallest + removed key if @smallest_removed is not 0 + */ + int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, struct carry_cut_data *, + reiser4_key *smallest_removed, reiser4_key *new_first_key); + + /* like cut_units, except that these units are removed from the + tree, not only from a node */ + int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to, struct carry_kill_data *, + reiser4_key *smallest_removed, reiser4_key *new_first); + + /* if @key_of_coord == 1 - returned key of coord, otherwise - + key of unit is returned. If @coord is not set to certain + unit - ERR_PTR(-ENOENT) is returned */ + reiser4_key *(*unit_key) (const coord_t *, reiser4_key *); + reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *); + /* estimate how much space is needed for paste @data into item at + @coord. if @coord==0 - estimate insertion, otherwise - estimate + pasting + */ + int (*estimate) (const coord_t *, const reiser4_item_data *); + + /* converts flow @f to item data. @coord == 0 on insert */ + int (*item_data_by_flow) (const coord_t *, const flow_t *, reiser4_item_data *); + + void (*show) (struct seq_file *, coord_t *); + +#if REISER4_DEBUG_OUTPUT + /* used for debugging only, prints an ascii description of the + item contents */ + void (*print) (const char *, coord_t *); + /* gather statistics */ + void (*item_stat) (const coord_t *, void *); +#endif + +#if REISER4_DEBUG + /* used for debugging, every item should have here the most + complete possible check of the consistency of the item that + the inventor can construct */ + int (*check) (const coord_t *, const char **error); +#endif + +} balance_ops; + +typedef struct { + /* return the right or left child of @coord, only if it is in memory */ + int (*utmost_child) (const coord_t *, sideof side, jnode ** child); + + /* return whether the right or left child of @coord has a non-fake + block number. 
*/ + int (*utmost_child_real_block) (const coord_t *, sideof side, reiser4_block_nr *); + /* relocate child at @coord to the @block */ + void (*update) (const coord_t *, const reiser4_block_nr *); + /* count unformatted nodes per item for leave relocation policy, etc.. */ + int (*scan) (flush_scan * scan); + /* squeeze by unformatted child */ + int (*squeeze) (flush_pos_t * pos); + /* backward mapping from jnode offset to a key. */ + int (*key_by_offset) (struct inode *, loff_t, reiser4_key *); +} flush_ops; + +/* operations specific to the directory item */ +typedef struct { + /* extract stat-data key from directory entry at @coord and place it + into @key. */ + int (*extract_key) (const coord_t *, reiser4_key * key); + /* update object key in item. */ + int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *); + /* extract name from directory entry at @coord and return it */ + char *(*extract_name) (const coord_t *, char *buf); + /* extract file type (DT_* stuff) from directory entry at @coord and + return it */ + unsigned (*extract_file_type) (const coord_t *); + int (*add_entry) (struct inode *dir, + coord_t *, lock_handle *, + const struct dentry *name, reiser4_dir_entry_desc *entry); + int (*rem_entry) (struct inode *dir, const struct qstr *name, + coord_t *, lock_handle *, + reiser4_dir_entry_desc *entry); + int (*max_name_len) (const struct inode *dir); +} dir_entry_ops; + +/* operations specific to items regular (unix) file metadata are built of */ +typedef struct { + int (*write)(struct inode *, flow_t *, hint_t *, int grabbed, write_mode_t); + int (*read)(struct file *, flow_t *, hint_t *); + int (*readpage) (void *, struct page *); + int (*capture) (reiser4_key *, uf_coord_t *, struct page *, write_mode_t); + int (*get_block) (const uf_coord_t *, sector_t, struct buffer_head *); + void (*readpages) (void *, struct address_space *, struct list_head *pages); + /* key of first byte which is not addressed by the item @coord is set to + For 
example extent with the key + + (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks, + + ->append_key is + + (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size) */ + /* FIXME: could be uf_coord also */ + reiser4_key *(*append_key) (const coord_t *, reiser4_key *); + + void (*init_coord_extension)(uf_coord_t *, loff_t); +} file_ops; + +/* operations specific to items of stat data type */ +typedef struct { + int (*init_inode) (struct inode * inode, char *sd, int len); + int (*save_len) (struct inode * inode); + int (*save) (struct inode * inode, char **area); +} sd_ops; + +/* operations specific to internal item */ +typedef struct { + /* all tree traversal want to know from internal item is where + to go next. */ + void (*down_link) (const coord_t * coord, + const reiser4_key * key, reiser4_block_nr * block); + /* check that given internal item contains given pointer. */ + int (*has_pointer_to) (const coord_t * coord, + const reiser4_block_nr * block); +} internal_item_ops; + +struct item_plugin { + /* generic fields */ + plugin_header h; + + /* methods common for all item types */ + balance_ops b; + /* methods used during flush */ + flush_ops f; + + /* methods specific to particular type of item */ + union { + dir_entry_ops dir; + file_ops file; + sd_ops sd; + internal_item_ops internal; + } s; + +}; + +static inline item_id +item_id_by_plugin(item_plugin * plugin) +{ + return plugin->h.id; +} + +static inline char +get_iplugid(item_plugin *iplug) +{ + assert("nikita-2838", iplug != NULL); + assert("nikita-2839", 0 <= iplug->h.id && iplug->h.id < 0xff); + return (char)item_id_by_plugin(iplug); +} + +static inline void +coord_set_iplug(coord_t * coord, item_plugin *iplug) +{ + assert("nikita-2837", coord != NULL); + assert("nikita-2838", iplug != NULL); + coord->iplugid = get_iplugid(iplug); +} + +static inline item_plugin * +coord_iplug(const coord_t * coord) +{ + assert("nikita-2833", coord != NULL); + assert("nikita-2834", coord->iplugid != 
INVALID_PLUGID); + return (item_plugin *)plugin_by_id(REISER4_ITEM_PLUGIN_TYPE, coord->iplugid); +} + +extern int item_can_contain_key(const coord_t * item, const reiser4_key * key, const reiser4_item_data *); +extern int are_items_mergeable(const coord_t * i1, const coord_t * i2); +extern int item_is_extent(const coord_t *); +extern int item_is_tail(const coord_t *); +extern int item_is_statdata(const coord_t * item); + +extern pos_in_node_t item_length_by_coord(const coord_t * coord); +extern item_type_id item_type_by_coord(const coord_t * coord); +extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ ); +extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key); +extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *); +extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key); +extern reiser4_key *max_unit_key_by_coord(const coord_t * coord, reiser4_key * key); + +extern void obtain_item_plugin(const coord_t * coord); + +#if defined(REISER4_DEBUG) || defined(REISER4_DEBUG_MODIFY) || defined(REISER4_DEBUG_OUTPUT) +extern int znode_is_loaded(const znode * node); +#endif + +/* return plugin of item at @coord */ +static inline item_plugin * +item_plugin_by_coord(const coord_t * coord /* coord to query */ ) +{ + assert("nikita-330", coord != NULL); + assert("nikita-331", coord->node != NULL); + assert("nikita-332", znode_is_loaded(coord->node)); + trace_stamp(TRACE_TREE); + + if (unlikely(!coord_is_iplug_set(coord))) + obtain_item_plugin(coord); + return coord_iplug(coord); +} + +/* this returns true if item is of internal type */ +static inline int +item_is_internal(const coord_t * item) +{ + assert("vs-483", coord_is_existing_item(item)); + return item_type_by_coord(item) == INTERNAL_ITEM_TYPE; +} + +extern void item_body_by_coord_hard(coord_t * coord); +extern int item_body_is_valid(const coord_t * coord); + +/* return pointer to item body */ +static inline void * 
+item_body_by_coord(const coord_t * coord /* coord to query */ )
+{
+	assert("nikita-324", coord != NULL);
+	assert("nikita-325", coord->node != NULL);
+	assert("nikita-326", znode_is_loaded(coord->node));
+	trace_stamp(TRACE_TREE);
+
+	if (coord->body == NULL)
+		item_body_by_coord_hard((coord_t *)coord);
+	assert("nikita-3201", item_body_is_valid(coord));
+	return coord->body;
+}
+
+/* __REISER4_ITEM_H__ */
+#endif
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   scroll-step: 1
+   End:
+*/
diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/sde.c linux-2.6.4-ck1/fs/reiser4/plugin/item/sde.c
--- linux-2.6.4/fs/reiser4/plugin/item/sde.c	1970-01-01 10:00:00.000000000 +1000
+++ linux-2.6.4-ck1/fs/reiser4/plugin/item/sde.c	2004-03-11 22:45:15.333503367 +1100
@@ -0,0 +1,211 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Directory entry implementation */
+#include "../../forward.h"
+#include "../../debug.h"
+#include "../../dformat.h"
+#include "../../kassign.h"
+#include "../../coord.h"
+#include "sde.h"
+#include "item.h"
+#include "../plugin.h"
+#include "../../znode.h"
+#include "../../carry.h"
+#include "../../tree.h"
+#include "../../inode.h"
+
+#include /* for struct inode */
+#include /* for struct dentry */
+#include
+
+#if REISER4_DEBUG_OUTPUT
+/* ->print() method of simple directory item plugin. Prints the name stored
+   in the entry at @coord and the stat-data key it refers to, or complains
+   when the item is too short to hold a directory_entry_format. */
+reiser4_internal void
+print_de(const char *prefix /* prefix to print */ ,
+	 coord_t * coord /* item to print */ )
+{
+	assert("nikita-1456", prefix != NULL);
+	assert("nikita-1457", coord != NULL);
+
+	if (item_length_by_coord(coord) < (int) sizeof (directory_entry_format)) {
+		/* sizeof yields size_t; cast it so that it matches %i */
+		printk("%s: wrong size: %i < %i\n", prefix, item_length_by_coord(coord), (int) sizeof (directory_entry_format));
+	} else {
+		reiser4_key sdkey;
+		char *name;
+		char buf[DE_NAME_BUF_LEN];
+
+		extract_key_de(coord, &sdkey);
+		name = extract_name_de(coord, buf);
+		printk("%s: name: %s\n", prefix, name);
+
print_key("\tsdkey", &sdkey);
+	}
+}
+#endif
+
+/* ->extract_key() method of simple directory item plugin. */
+reiser4_internal int
+extract_key_de(const coord_t * coord /* coord of item */ ,
+	       reiser4_key * key /* resulting key */ )
+{
+	directory_entry_format *dent;
+
+	assert("nikita-1458", coord != NULL);
+	assert("nikita-1459", key != NULL);
+
+	dent = (directory_entry_format *) item_body_by_coord(coord);
+	assert("nikita-1158", item_length_by_coord(coord) >= (int) sizeof *dent);
+	return extract_key_from_id(&dent->id, key);
+}
+
+/* ->update_key() method of simple directory item plugin. Rebuilds the object
+   key id stored in the directory entry at @coord from @key and marks the
+   node dirty. Returns 0 on success, or the error returned by
+   build_obj_key_id(), in which case the entry is left untouched. */
+reiser4_internal int
+update_key_de(const coord_t * coord, const reiser4_key * key, lock_handle * lh UNUSED_ARG)
+{
+	directory_entry_format *dent;
+	obj_key_id obj_id;
+	int result;
+
+	assert("nikita-2342", coord != NULL);
+	assert("nikita-2343", key != NULL);
+
+	dent = (directory_entry_format *) item_body_by_coord(coord);
+	result = build_obj_key_id(key, &obj_id);
+	if (result == 0) {
+		dent->id = obj_id;
+		znode_make_dirty(coord->node);
+	}
+	/* propagate the failure instead of reporting success for an entry
+	   that was not updated */
+	return result;
+}
+
+/* return the name of directory entry @dent located at @coord. When the key
+   of the unit is not a long-name key the name is recovered from the key
+   itself ("." for the dot key, otherwise decoded into @buf); for long names
+   the name stored in the entry body is returned. */
+reiser4_internal char *
+extract_dent_name(const coord_t * coord, directory_entry_format *dent, char *buf)
+{
+	reiser4_key key;
+
+	unit_key_by_coord(coord, &key);
+	if (!is_longname_key(&key)) {
+		if (is_dot_key(&key))
+			return (char *) ".";
+		else
+			return extract_name_from_key(&key, buf);
+	} else
+		return (char *) dent->name;
+}
+
+/* ->extract_name() method of simple directory item plugin. */
+reiser4_internal char *
+extract_name_de(const coord_t * coord /* coord of item */, char *buf)
+{
+	directory_entry_format *dent;
+
+	assert("nikita-1460", coord != NULL);
+
+	dent = (directory_entry_format *) item_body_by_coord(coord);
+	return extract_dent_name(coord, dent, buf);
+}
+
+/* ->extract_file_type() method of simple directory item plugin. */
+reiser4_internal unsigned
+extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
+							 * item */ )
+{
+	assert("nikita-1764", coord != NULL);
+	/* we don't store file type in the directory entry yet.
+
+	   But see comments at kassign.h:obj_key_id
+	*/
+	return DT_UNKNOWN;
+}
+
+/* ->add_entry() method of simple directory item plugin. Charges quota for
+   the new entry, inserts an item with key @entry->key at @coord and fills it
+   with the key id of @entry->obj and, for long names, the NUL-terminated
+   name taken from @de. Returns 0 on success, -EDQUOT when quota cannot be
+   allocated, or the error returned by insert_by_coord(); in the latter case
+   the quota charge is released again. */
+reiser4_internal int
+add_entry_de(struct inode *dir /* directory of item */ ,
+	     coord_t * coord /* coord of item */ ,
+	     lock_handle * lh /* insertion lock handle */ ,
+	     const struct dentry *de /* name to add */ ,
+	     reiser4_dir_entry_desc * entry /* parameters of new directory
+					     * entry */ )
+{
+	reiser4_item_data data;
+	directory_entry_format *dent;
+	int result;
+	const char *name;
+	int len;
+	int longname;
+
+	name = de->d_name.name;
+	len = de->d_name.len;
+	assert("nikita-1163", strlen(name) == len);
+
+	longname = is_longname(name, len);
+
+	/* only long names are stored in the item body, together with the
+	   terminating zero */
+	data.length = sizeof *dent;
+	if (longname)
+		data.length += len + 1;
+	data.data = NULL;
+	data.user = 0;
+	data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
+
+	/* NOTE-NIKITA quota plugin */
+	if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
+		return -EDQUOT;
+
+	result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
+	if (result != 0) {
+		/* nothing was inserted: give back the quota charged above,
+		   otherwise it would be accounted to @dir forever */
+		DQUOT_FREE_SPACE_NODIRTY(dir, data.length);
+		return result;
+	}
+
+	dent = (directory_entry_format *) item_body_by_coord(coord);
+	build_inode_key_id(entry->obj, &dent->id);
+	if (longname) {
+		xmemcpy(dent->name, name, len);
+		cputod8(0, &dent->name[len]);
+	}
+	return 0;
+}
+
+reiser4_internal int
+rem_entry_de(struct inode *dir /* directory of item */ ,
+	     const struct qstr * name UNUSED_ARG,
+	     coord_t * coord /* coord of item */ ,
+	     lock_handle * lh UNUSED_ARG /* lock handle for
+					  * removal */ ,
+	     reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
+							 * directory entry
+							 * being removed */ )
+{
+	coord_t shadow;
+	int result;
+	int length;
+
+	length = item_length_by_coord(coord);
+	if (inode_get_bytes(dir) < length) {
+		warning("nikita-2627", "Dir is broke: %llu: %llu", get_inode_oid(dir), inode_get_bytes(dir));
+		return RETERR(-EIO);
+	}
+
+	/* cut_node() is supposed to take pointers to _different_
+	   coords, because it will modify them without respect to
+	   possible aliasing.
To work around this, create temporary copy + of @coord. + */ + coord_dup(&shadow, coord); + result = kill_node_content(coord, &shadow, NULL, NULL, NULL, 0, NULL, NULL); + if (result == 0) { + /* NOTE-NIKITA quota plugin */ + DQUOT_FREE_SPACE_NODIRTY(dir, length); + } + return result; +} + +reiser4_internal int +max_name_len_de(const struct inode *dir) +{ + return tree_by_inode(dir)->nplug->max_item_size() - sizeof (directory_entry_format) - 2; +} + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/sde.h linux-2.6.4-ck1/fs/reiser4/plugin/item/sde.h --- linux-2.6.4/fs/reiser4/plugin/item/sde.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/sde.h 2004-03-11 22:45:15.334503211 +1100 @@ -0,0 +1,64 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Directory entry. */ + +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ ) +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ + +#include "../../forward.h" +#include "../../dformat.h" +#include "../../kassign.h" +#include "../../key.h" + +#include +#include /* for struct dentry */ + +typedef struct directory_entry_format { + /* key of object stat-data. It's not necessary to store whole + key here, because it's always key of stat-data, so minor + packing locality and offset can be omitted here. But this + relies on particular key allocation scheme for stat-data, so, + for extensibility sake, whole key can be stored here. + + We store key as array of bytes, because we don't want 8-byte + alignment of dir entries. + */ + obj_key_id id; + /* file name. Null terminated string. 
*/ + d8 name[0]; +} directory_entry_format; + +void print_de(const char *prefix, coord_t * coord); +int extract_key_de(const coord_t * coord, reiser4_key * key); +int update_key_de(const coord_t * coord, const reiser4_key * key, lock_handle * lh); +char *extract_name_de(const coord_t * coord, char *buf); +unsigned extract_file_type_de(const coord_t * coord); +int add_entry_de(struct inode *dir, coord_t * coord, + lock_handle * lh, const struct dentry *name, reiser4_dir_entry_desc * entry); +int rem_entry_de(struct inode *dir, const struct qstr * name, coord_t * coord, lock_handle * lh, reiser4_dir_entry_desc * entry); +int max_name_len_de(const struct inode *dir); + + +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length); + +char *extract_dent_name(const coord_t * coord, + directory_entry_format *dent, char *buf); + +#if REISER4_LARGE_KEY +#define DE_NAME_BUF_LEN (24) +#else +#define DE_NAME_BUF_LEN (16) +#endif + +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/static_stat.c linux-2.6.4-ck1/fs/reiser4/plugin/item/static_stat.c --- linux-2.6.4/fs/reiser4/plugin/item/static_stat.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/static_stat.c 2004-03-11 22:45:15.336502900 +1100 @@ -0,0 +1,1322 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* stat data manipulation. 
*/
+
+#include "../../forward.h"
+#include "../../super.h"
+#include "../../vfs_ops.h"
+#include "../../inode.h"
+#include "../../debug.h"
+#include "../../dformat.h"
+#include "../object.h"
+#include "../plugin.h"
+#include "../plugin_header.h"
+#include "static_stat.h"
+#include "item.h"
+
+#include
+#include
+
+/* see static_stat.h for explanation */
+
+/* helper function used while we are dumping/loading inode/plugin state
+   to/from the stat-data. */
+
+static void
+move_on(int *length /* space remaining in stat-data */ ,
+	char **area /* current coord in stat data */ ,
+	int size_of /* how many bytes to move forward */ )
+{
+	assert("nikita-615", length != NULL);
+	assert("nikita-616", area != NULL);
+
+	*length -= size_of;
+	*area += size_of;
+
+	assert("nikita-617", *length >= 0);
+}
+
+#if REISER4_DEBUG_OUTPUT
+/* ->print() method of static sd item. Prints human readable information about
+   sd at @coord: the extension mask, followed by the output of the ->print()
+   method of every extension plugin whose bit is set in the mask. */
+reiser4_internal void
+print_sd(const char *prefix /* prefix to print */ ,
+	 coord_t * coord /* coord of item */ )
+{
+	char *sd;
+	int len;
+	int bit;
+	int chunk;
+	__u16 mask;
+	reiser4_stat_data_base *sd_base;
+
+	assert("nikita-1254", prefix != NULL);
+	assert("nikita-1255", coord != NULL);
+
+	sd = item_body_by_coord(coord);
+	len = item_length_by_coord(coord);
+
+	sd_base = (reiser4_stat_data_base *) sd;
+	if (len < (int) sizeof *sd_base) {
+		/* sizeof yields size_t; cast it so that it matches %i. @len
+		   already holds the item length, no need to query it again */
+		printk("%s: wrong size: %i < %i\n", prefix, len, (int) sizeof *sd_base);
+		return;
+	}
+
+	mask = d16tocpu(&sd_base->extmask);
+	printk("%s: extmask: %x\n", prefix, mask);
+
+	move_on(&len, &sd, sizeof *sd_base);
+
+	for (bit = 0, chunk = 0; mask != 0; ++bit, mask >>= 1) {
+		if (((bit + 1) % 16) != 0) {
+			/* handle extension */
+			sd_ext_plugin *sdplug;
+
+			sdplug = sd_ext_plugin_by_id(bit);
+			if (sdplug == NULL) {
+				continue;
+			}
+			if ((mask & 1) && sdplug->print != NULL) {
+				/* alignment is not supported in node layout
+				   plugin yet.
+ result = align( inode, &len, &sd, + sdplug -> alignment ); + if( result != 0 ) + return result; */ + sdplug->print(prefix, &sd, &len); + } + } else if (mask & 1) { + /* next portion of bitmask */ + if (len < (int) sizeof (d16)) { + warning("nikita-2708", "No space for bitmap"); + break; + } + mask = d16tocpu((d16 *) sd); + move_on(&len, &sd, sizeof (d16)); + ++chunk; + if (chunk == 3) { + if (!(mask & 0x8000)) { + /* clear last bit */ + mask &= ~0x8000; + continue; + } + /* too much */ + warning("nikita-2709", "Too many extensions"); + break; + } + } else + /* bitmask exhausted */ + break; + } +} +#endif + +reiser4_internal void +item_stat_static_sd(const coord_t * coord, void *vp) +{ + reiser4_stat_data_base *sd; + mode_t mode; + sd_stat *stat; + + stat = (sd_stat *) vp; + sd = (reiser4_stat_data_base *) item_body_by_coord(coord); + mode = 0; // d16tocpu( &sd -> mode ); + + if (S_ISREG(mode)) + stat->files++; + else if (S_ISDIR(mode)) + stat->dirs++; + else + stat->others++; +} + +/* helper function used while loading inode/plugin state from stat-data. + Complain if there is less space in stat-data than was expected. + Can only happen on disk corruption. */ +static int +not_enough_space(struct inode *inode /* object being processed */ , + const char *where /* error message */ ) +{ + assert("nikita-618", inode != NULL); + + warning("nikita-619", "Not enough space in %llu while loading %s", get_inode_oid(inode), where); + return RETERR(-EINVAL); +} + +/* helper function used while loading inode/plugin state from + stat-data. Call it if invalid plugin id was found. */ +static int +unknown_plugin(reiser4_plugin_id id /* invalid id */ , + struct inode *inode /* object being processed */ ) +{ + warning("nikita-620", "Unknown plugin %i in %llu", id, get_inode_oid(inode)); + return RETERR(-EINVAL); +} + +#if 0 /* Item alignment is not yet supported */ + +/* helper function used while storing/loading inode/plugin data to/from + stat-data. 
Move current coord in stat-data ("area") to position + aligned up to "alignment" bytes. */ +static int +align(struct inode *inode /* object being processed */ , + int *length /* space remaining in stat-data */ , + char **area /* current coord in stat data */ , + int alignment /* required alignment */ ) +{ + int delta; + + assert("nikita-621", inode != NULL); + assert("nikita-622", length != NULL); + assert("nikita-623", area != NULL); + assert("nikita-624", alignment > 0); + + delta = round_up(*area, alignment) - *area; + if (delta > *length) + return not_enough_space(inode, "padding"); + if (delta > 0) + move_on(length, area, delta); + return 0; +} + +#endif /* 0 */ + +/* this is installed as ->init_inode() method of + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). + Copies data from on-disk stat-data format into inode. + Handles stat-data extensions. */ +/* was sd_load */ +reiser4_internal int +init_inode_static_sd(struct inode *inode /* object being processed */ , + char *sd /* stat-data body */ , + int len /* length of stat-data */ ) +{ + int result; + int bit; + int chunk; + __u16 mask; + __u64 bigmask; + reiser4_stat_data_base *sd_base; + reiser4_inode *state; + + assert("nikita-625", inode != NULL); + assert("nikita-626", sd != NULL); + + result = 0; + sd_base = (reiser4_stat_data_base *) sd; + state = reiser4_inode_data(inode); + mask = d16tocpu(&sd_base->extmask); + bigmask = mask; + inode_set_flag(inode, REISER4_SDLEN_KNOWN); + + move_on(&len, &sd, sizeof *sd_base); + for (bit = 0, chunk = 0; mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION; ++bit, mask >>= 1) { + if (((bit + 1) % 16) != 0) { + /* handle extension */ + sd_ext_plugin *sdplug; + + sdplug = sd_ext_plugin_by_id(bit); + if (sdplug == NULL) { + warning("nikita-627", "No such extension %i in inode %llu", bit, get_inode_oid(inode)); + result = RETERR(-EINVAL); + break; + } + if (mask & 1) { + assert("nikita-628", sdplug->present); + /* alignment is not supported in node 
layout
+				   plugin yet.
+				   result = align( inode, &len, &sd,
+				   sdplug -> alignment );
+				   if( result != 0 )
+				   return result; */
+				result = sdplug->present(inode, &sd, &len);
+			} else if (sdplug->absent != NULL)
+				result = sdplug->absent(inode);
+			if (result)
+				break;
+			/* else, we are looking at the last bit in 16-bit
+			   portion of bitmask */
+		} else if (mask & 1) {
+			/* next portion of bitmask */
+			if (len < (int) sizeof (d16)) {
+				warning("nikita-629", "No space for bitmap in inode %llu", get_inode_oid(inode));
+				result = RETERR(-EINVAL);
+				break;
+			}
+			mask = d16tocpu((d16 *) sd);
+			bigmask <<= 16;
+			bigmask |= mask;
+			move_on(&len, &sd, sizeof (d16));
+			++chunk;
+			if (chunk == 3) {
+				if (!(mask & 0x8000)) {
+					/* clear last bit */
+					mask &= ~0x8000;
+					continue;
+				}
+				/* too much */
+				warning("nikita-630", "Too many extensions in %llu", get_inode_oid(inode));
+				result = RETERR(-EINVAL);
+				break;
+			}
+		} else
+			/* bitmask exhausted */
+			break;
+	}
+	scint_pack(&state->extmask, bigmask, GFP_ATOMIC);
+	/* common initialisations */
+	inode->i_blksize = get_super_private(inode->i_sb)->optimal_io_size;
+	/* The subtrahend is cast to int on purpose: sizeof yields an
+	   unsigned type, so without the cast the whole difference is
+	   computed unsigned, a negative value wraps to a huge positive
+	   one and the warning below fires spuriously. */
+	if (len - (int) (sizeof (d16) * bit / 16) > 0)
+		/* alignment in save_len_static_sd() is taken into account
+		   -edward */
+		warning("nikita-631", "unused space in inode %llu", get_inode_oid(inode));
+	return result;
+}
+
+/* estimates size of stat-data required to store inode.
+   Installed as ->save_len() method of
+   item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). 
*/
+/* was sd_len */
+/* Returns sizeof (reiser4_stat_data_base) plus ->save_len() of every
+   extension whose bit is set in the inode's extension mask, plus
+   sizeof (d16) * bit / 16 bytes, where bit is one past the highest set
+   bit (presumably room for continuation bitmask words; see the matching
+   check at the end of init_inode_static_sd()). */
+reiser4_internal int
+save_len_static_sd(struct inode *inode /* object being processed */ )
+{
+	unsigned int result;
+	__u64 mask;
+	int bit;
+
+	assert("nikita-632", inode != NULL);
+
+	result = sizeof (reiser4_stat_data_base);
+	mask = scint_unpack(&reiser4_inode_data(inode)->extmask);
+	for (bit = 0; mask != 0; ++bit, mask >>= 1) {
+		if (mask & 1) {
+			sd_ext_plugin *sdplug;
+
+			sdplug = sd_ext_plugin_by_id(bit);
+			assert("nikita-633", sdplug != NULL);
+			/* no alignment support
+			   result +=
+			   round_up( result, sdplug -> alignment ) - result; */
+			result += sdplug->save_len(inode);
+		}
+	}
+	result += sizeof (d16) * bit / 16;
+	return result;
+}
+
+/* saves inode into stat-data.
+   Installed as ->save() method of
+   item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
+/* was sd_save */
+/* Stores the low 16 bits of the extension mask into the stat-data base at
+   *area and advances *area past the base. Then, for every set bit of the
+   mask: an ordinary bit calls the ->save() method of the corresponding
+   extension (which advances *area itself); every 16th bit is a
+   continuation flag, for which the next 16 bits of the mask are written
+   as a d16. Returns 0, or the first non-zero ->save() result, in which
+   case the remaining extensions are not written. */
+reiser4_internal int
+save_static_sd(struct inode *inode /* object being processed */ ,
+	       char **area /* where to save stat-data */ )
+{
+	int result;
+	__u64 emask;
+	int bit;
+	reiser4_stat_data_base *sd_base;
+
+	assert("nikita-634", inode != NULL);
+	assert("nikita-635", area != NULL);
+
+	result = 0;
+	emask = scint_unpack(&reiser4_inode_data(inode)->extmask);
+	sd_base = (reiser4_stat_data_base *) * area;
+	cputod16((unsigned) (emask & 0xffff), &sd_base->extmask);
+
+	*area += sizeof *sd_base;
+	for (bit = 0; emask != 0; ++bit, emask >>= 1) {
+		if (emask & 1) {
+			if ((bit + 1) % 16 != 0) {
+				sd_ext_plugin *sdplug;
+				sdplug = sd_ext_plugin_by_id(bit);
+				assert("nikita-636", sdplug != NULL);
+				/* no alignment support yet; when it is added a
+				   remaining-length counter has to be passed:
+				   align( inode, &len, area,
+				   sdplug -> alignment ); */
+				result = sdplug->save(inode, area);
+				if (result)
+					break;
+			} else {
+				cputod16((unsigned) (emask & 0xffff), (d16 *) * area);
+				*area += sizeof (d16);
+			}
+		}
+	}
+	return result;
+}
+
+/* stat-data extension handling functions. 
*/ + +static int +present_lw_sd(struct inode *inode /* object being processed */ , + char **area /* position in stat-data */ , + int *len /* remaining length */ ) +{ + if (*len >= (int) sizeof (reiser4_light_weight_stat)) { + reiser4_light_weight_stat *sd_lw; + + sd_lw = (reiser4_light_weight_stat *) * area; + + inode->i_mode = d16tocpu(&sd_lw->mode); + inode->i_nlink = d32tocpu(&sd_lw->nlink); + inode->i_size = d64tocpu(&sd_lw->size); + if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) { + inode->i_mode &= ~S_IFIFO; + inode_set_flag(inode, REISER4_PART_CONV); + } + move_on(len, area, sizeof *sd_lw); + return 0; + } else + return not_enough_space(inode, "lw sd"); +} + +static int +save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being + * processed */ ) +{ + return sizeof (reiser4_light_weight_stat); +} + +static int +save_lw_sd(struct inode *inode /* object being processed */ , + char **area /* position in stat-data */ ) +{ + reiser4_light_weight_stat *sd; + mode_t delta; + + assert("nikita-2705", inode != NULL); + assert("nikita-2706", area != NULL); + assert("nikita-2707", *area != NULL); + + sd = (reiser4_light_weight_stat *) * area; + + delta = inode_get_flag(inode, REISER4_PART_CONV) ? 
S_IFIFO : 0; + cputod16(inode->i_mode | delta, &sd->mode); + cputod32(inode->i_nlink, &sd->nlink); + cputod64((__u64) inode->i_size, &sd->size); + *area += sizeof *sd; + return 0; +} + +#if REISER4_DEBUG_OUTPUT +static void +print_lw_sd(const char *prefix, char **area /* position in stat-data */ , + int *len /* remaining length */ ) +{ + reiser4_light_weight_stat *sd; + + sd = (reiser4_light_weight_stat *) * area; + printk("%s: mode: %o, nlink: %i, size: %llu\n", prefix, + d16tocpu(&sd->mode), d32tocpu(&sd->nlink), d64tocpu(&sd->size)); + move_on(len, area, sizeof *sd); +} +#endif + +static int +present_unix_sd(struct inode *inode /* object being processed */ , + char **area /* position in stat-data */ , + int *len /* remaining length */ ) +{ + assert("nikita-637", inode != NULL); + assert("nikita-638", area != NULL); + assert("nikita-639", *area != NULL); + assert("nikita-640", len != NULL); + assert("nikita-641", *len > 0); + + if (*len >= (int) sizeof (reiser4_unix_stat)) { + reiser4_unix_stat *sd; + + sd = (reiser4_unix_stat *) * area; + + inode->i_uid = d32tocpu(&sd->uid); + inode->i_gid = d32tocpu(&sd->gid); + inode->i_atime.tv_sec = d32tocpu(&sd->atime); + inode->i_mtime.tv_sec = d32tocpu(&sd->mtime); + inode->i_ctime.tv_sec = d32tocpu(&sd->ctime); + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) + inode->i_rdev = d64tocpu(&sd->u.rdev); + else + inode_set_bytes(inode, (loff_t) d64tocpu(&sd->u.bytes)); + move_on(len, area, sizeof *sd); + return 0; + } else + return not_enough_space(inode, "unix sd"); +} + +static int +absent_unix_sd(struct inode *inode /* object being processed */ ) +{ + inode->i_uid = get_super_private(inode->i_sb)->default_uid; + inode->i_gid = get_super_private(inode->i_sb)->default_gid; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode_set_bytes(inode, inode->i_size); + /* mark inode as lightweight, so that caller (reiser4_lookup) will + complete initialisation by copying [ug]id from a parent. 
*/ + inode_set_flag(inode, REISER4_LIGHT_WEIGHT); + return 0; +} + +/* Audited by: green(2002.06.14) */ +static int +save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being + * processed */ ) +{ + return sizeof (reiser4_unix_stat); +} + +static int +save_unix_sd(struct inode *inode /* object being processed */ , + char **area /* position in stat-data */ ) +{ + reiser4_unix_stat *sd; + + assert("nikita-642", inode != NULL); + assert("nikita-643", area != NULL); + assert("nikita-644", *area != NULL); + + sd = (reiser4_unix_stat *) * area; + cputod32(inode->i_uid, &sd->uid); + cputod32(inode->i_gid, &sd->gid); + cputod32((__u32) inode->i_atime.tv_sec, &sd->atime); + cputod32((__u32) inode->i_ctime.tv_sec, &sd->ctime); + cputod32((__u32) inode->i_mtime.tv_sec, &sd->mtime); + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) + cputod64(inode->i_rdev, &sd->u.rdev); + else + cputod64((__u64) inode_get_bytes(inode), &sd->u.bytes); + *area += sizeof *sd; + return 0; +} + +#if REISER4_DEBUG_OUTPUT +static void +print_unix_sd(const char *prefix, char **area /* position in stat-data */ , + int *len /* remaining length */ ) +{ + reiser4_unix_stat *sd; + + sd = (reiser4_unix_stat *) * area; + printk("%s: uid: %i, gid: %i, atime: %i, mtime: %i, ctime: %i, " + "rdev: %llo, bytes: %llu\n", prefix, + d32tocpu(&sd->uid), + d32tocpu(&sd->gid), + d32tocpu(&sd->atime), + d32tocpu(&sd->mtime), d32tocpu(&sd->ctime), d64tocpu(&sd->u.rdev), d64tocpu(&sd->u.bytes)); + move_on(len, area, sizeof *sd); +} +#endif + +static int +present_large_times_sd(struct inode *inode /* object being processed */, + char **area /* position in stat-data */, + int *len /* remaining length */) +{ + if (*len >= (int) sizeof (reiser4_large_times_stat)) { + reiser4_large_times_stat *sd_lt; + + sd_lt = (reiser4_large_times_stat *) * area; + + inode->i_atime.tv_nsec = d32tocpu(&sd_lt->atime); + inode->i_mtime.tv_nsec = d32tocpu(&sd_lt->mtime); + inode->i_ctime.tv_nsec = d32tocpu(&sd_lt->ctime); + + 
move_on(len, area, sizeof *sd_lt); + return 0; + } else + return not_enough_space(inode, "large times sd"); +} + +static int +save_len_large_times_sd(struct inode *inode UNUSED_ARG /* object being processed */ ) +{ + return sizeof (reiser4_large_times_stat); +} + +static int +save_large_times_sd(struct inode *inode /* object being processed */ , + char **area /* position in stat-data */ ) +{ + reiser4_large_times_stat *sd; + + assert("nikita-2817", inode != NULL); + assert("nikita-2818", area != NULL); + assert("nikita-2819", *area != NULL); + + sd = (reiser4_large_times_stat *) * area; + + cputod32((__u32) inode->i_atime.tv_nsec, &sd->atime); + cputod32((__u32) inode->i_ctime.tv_nsec, &sd->ctime); + cputod32((__u32) inode->i_mtime.tv_nsec, &sd->mtime); + + *area += sizeof *sd; + return 0; +} + +#if REISER4_DEBUG_OUTPUT +static void +print_large_times_sd(const char *prefix, char **area /* position in stat-data */, + int *len /* remaining length */ ) +{ + reiser4_large_times_stat *sd; + + sd = (reiser4_large_times_stat *) * area; + printk("%s: nanotimes: a: %i, m: %i, c: %i\n", prefix, + d32tocpu(&sd->atime), d32tocpu(&sd->mtime), d32tocpu(&sd->ctime)); + move_on(len, area, sizeof *sd); +} +#endif + +/* symlink stat data extention */ + +/* allocate memory for symlink target and attach it to inode->u.generic_ip */ +static int +symlink_target_to_inode(struct inode *inode, const char *target, int len) +{ + assert("vs-845", inode->u.generic_ip == 0); + assert("vs-846", !inode_get_flag(inode, REISER4_GENERIC_PTR_USED)); + + /* FIXME-VS: this is prone to deadlock. Not more than other similar + places, though */ + inode->u.generic_ip = reiser4_kmalloc((size_t) len + 1, GFP_KERNEL); + if (!inode->u.generic_ip) + return RETERR(-ENOMEM); + + xmemcpy((char *) (inode->u.generic_ip), target, (size_t) len); + ((char *) (inode->u.generic_ip))[len] = 0; + inode_set_flag(inode, REISER4_GENERIC_PTR_USED); + return 0; +} + +/* this is called on read_inode. 
There is nothing to do actually, but some
+   sanity checks */
+/* Validates the symlink body stored at *area (inode->i_size bytes followed
+   by a terminating 0), attaches a copy of it to the inode via
+   symlink_target_to_inode() and advances *area / *len past the body and
+   its terminator. Returns 0, the "not enough space" error if the body plus
+   terminator does not fit into the remaining *len bytes, -EIO if the
+   terminator is missing, or the error of symlink_target_to_inode(). */
+static int
+present_symlink_sd(struct inode *inode, char **area, int *len)
+{
+	int result;
+	int length;
+	reiser4_symlink_stat *sd;
+
+	length = (int) inode->i_size;
+	/*
+	 * *len is number of bytes in stat data item from *area to the end of
+	 * item. It must be not less than size of symlink + 1 for ending 0,
+	 * hence ">=": with length == *len the terminator at (*area)[length]
+	 * would already lie outside of the item. A negative length (i_size
+	 * that does not fit into int) is rejected as well.
+	 */
+	if (length < 0 || length >= *len)
+		return not_enough_space(inode, "symlink");
+
+	if (*(*area + length) != 0) {
+		warning("vs-840", "Symlink is not zero terminated");
+		return RETERR(-EIO);
+	}
+
+	sd = (reiser4_symlink_stat *) * area;
+	result = symlink_target_to_inode(inode, sd->body, length);
+
+	move_on(len, area, length + 1);
+	return result;
+}
+
+/* space needed for the symlink body: i_size bytes plus the terminating 0 */
+static int
+save_len_symlink_sd(struct inode *inode)
+{
+	return inode->i_size + 1;
+}
+
+/* this is called on create and update stat data. Do nothing on update but
+   update @area */
+static int
+save_symlink_sd(struct inode *inode, char **area)
+{
+	int result;
+	int length;
+	reiser4_symlink_stat *sd;
+
+	length = (int) inode->i_size;
+	/* inode->i_size must be set already */
+	assert("vs-841", length);
+
+	result = 0;
+	sd = (reiser4_symlink_stat *) * area;
+	if (!inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
+		const char *target;
+
+		/* on create, generic_ip holds the caller's target string;
+		   it is replaced by a private copy */
+		target = (const char *) (inode->u.generic_ip);
+		inode->u.generic_ip = 0;
+
+		result = symlink_target_to_inode(inode, target, length);
+
+		/* copy symlink to stat data */
+		xmemcpy(sd->body, target, (size_t) length);
+		(*area)[length] = 0;
+	} else {
+		/* there is nothing to do in update but move area */
+		assert("vs-844", !memcmp(inode->u.generic_ip, sd->body, (size_t) length + 1));
+	}
+
+	*area += (length + 1);
+	return result;
+}
+
+#if REISER4_DEBUG_OUTPUT
+static void
+print_symlink_sd(const char *prefix, char **area /* position in stat-data */ ,
+		 int *len /* remaining length */ )
+{
+	reiser4_symlink_stat *sd;
+	int length;
+
+	sd = (reiser4_symlink_stat *) * area;
+	length = strlen(sd->body);
+	printk("%s: 
\"%s\"\n", prefix, sd->body); + move_on(len, area, length + 1); +} +#endif + +static int +present_flags_sd(struct inode *inode /* object being processed */ , + char **area /* position in stat-data */ , + int *len /* remaining length */ ) +{ + assert("nikita-645", inode != NULL); + assert("nikita-646", area != NULL); + assert("nikita-647", *area != NULL); + assert("nikita-648", len != NULL); + assert("nikita-649", *len > 0); + + if (*len >= (int) sizeof (reiser4_flags_stat)) { + reiser4_flags_stat *sd; + + sd = (reiser4_flags_stat *) * area; + inode->i_flags = d32tocpu(&sd->flags); + move_on(len, area, sizeof *sd); + return 0; + } else + return not_enough_space(inode, "generation and attrs"); +} + +/* Audited by: green(2002.06.14) */ +static int +save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being + * processed */ ) +{ + return sizeof (reiser4_flags_stat); +} + +static int +save_flags_sd(struct inode *inode /* object being processed */ , + char **area /* position in stat-data */ ) +{ + reiser4_flags_stat *sd; + + assert("nikita-650", inode != NULL); + assert("nikita-651", area != NULL); + assert("nikita-652", *area != NULL); + + sd = (reiser4_flags_stat *) * area; + cputod32(inode->i_flags, &sd->flags); + *area += sizeof *sd; + return 0; +} + +static int absent_plugin_sd(struct inode *inode); +static int +present_plugin_sd(struct inode *inode /* object being processed */ , + char **area /* position in stat-data */ , + int *len /* remaining length */ ) +{ + reiser4_plugin_stat *sd; + reiser4_plugin *plugin; + int i; + __u16 mask; + int result; + int num_of_plugins; + + assert("nikita-653", inode != NULL); + assert("nikita-654", area != NULL); + assert("nikita-655", *area != NULL); + assert("nikita-656", len != NULL); + assert("nikita-657", *len > 0); + + if (*len < (int) sizeof (reiser4_plugin_stat)) + return not_enough_space(inode, "plugin"); + + sd = (reiser4_plugin_stat *) * area; + + mask = 0; + num_of_plugins = d16tocpu(&sd->plugins_no); + 
move_on(len, area, sizeof *sd); + result = 0; + for (i = 0; i < num_of_plugins; ++i) { + reiser4_plugin_slot *slot; + + slot = (reiser4_plugin_slot *) * area; + if (*len < (int) sizeof *slot) + return not_enough_space(inode, "additional plugin"); + + plugin = plugin_by_disk_id(tree_by_inode(inode), + d16tocpu(&slot->type_id), &slot->id); + if (plugin == NULL) + return unknown_plugin(d16tocpu(&slot->id), inode); + + /* plugin is loaded into inode, mark this into inode's + bitmask of loaded non-standard plugins */ + if (!(mask & (1 << plugin->h.type_id))) { + mask |= (1 << plugin->h.type_id); + } else { + warning("nikita-658", "duplicate plugin for %llu", get_inode_oid(inode)); + print_plugin("plugin", plugin); + return RETERR(-EINVAL); + } + move_on(len, area, sizeof *slot); + if (plugin->h.pops == NULL) + continue; + /* + align(inode, len, area, plugin->h.pops->alignment); + + - commented because alignment policies in len_for() and save_plug() + are incompatible -edward */ + + /* load plugin data, if any */ + if (plugin->h.pops->load) { + result = plugin->h.pops->load(inode, plugin, area, len); + if (result != 0) + return result; + } + } + /* if object plugin wasn't loaded from stat-data, guess it by + mode bits */ + plugin = file_plugin_to_plugin(inode_file_plugin(inode)); + if (plugin == NULL) + result = absent_plugin_sd(inode); + + reiser4_inode_data(inode)->plugin_mask = mask; + return result; +} + +/* Audited by: green(2002.06.14) */ +static int +absent_plugin_sd(struct inode *inode /* object being processed */ ) +{ + int result; + + assert("nikita-659", inode != NULL); + + result = guess_plugin_by_mode(inode); + /* if mode was wrong, guess_plugin_by_mode() returns "regular file", + but setup_inode_ops() will call make_bad_inode(). + Another, more logical but bit more complex solution is to add + "bad-file plugin". 
*/ + /* FIXME-VS: activate was called here */ + return result; +} + +/* helper function for plugin_sd_save_len(): calculate how much space + required to save state of given plugin */ +/* Audited by: green(2002.06.14) */ +static int +len_for(reiser4_plugin * plugin /* plugin to save */ , + struct inode *inode /* object being processed */ , int len) +{ + reiser4_inode *info; + assert("nikita-661", inode != NULL); + + info = reiser4_inode_data(inode); + if (plugin != NULL && + (info->plugin_mask & (1 << (plugin->h.type_id)))) { + len += sizeof (reiser4_plugin_slot); + if (plugin->h.pops && plugin->h.pops->save_len != NULL) { + /* non-standard plugin, call method */ + /* commented as it is incompatible with alignment + * policy in save_plug() -edward */ + /* len = round_up(len, plugin->h.pops->alignment); */ + len += plugin->h.pops->save_len(inode, plugin); + } + } + return len; +} + +/* calculate how much space is required to save state of all plugins, + associated with inode */ +static int +save_len_plugin_sd(struct inode *inode /* object being processed */ ) +{ + int len; + reiser4_inode *state; + + assert("nikita-663", inode != NULL); + + state = reiser4_inode_data(inode); + /* common case: no non-standard plugins */ + if (state->plugin_mask == 0) + return 0; + len = sizeof (reiser4_plugin_stat); + /* AUDIT this looks really ugly. And are you going to add more plugins + here later hardwired??? + Why not simply get len_for() to return size of that exact plugin? + Addition can be performed here. 
Also probably some kind of loop + should be done through all plugins, not blind hardwiring of all + plugins known at compilation time */ + len = len_for(file_plugin_to_plugin(state->pset->file), inode, len); + len = len_for(perm_plugin_to_plugin(state->pset->perm), inode, len); + len = len_for(formatting_plugin_to_plugin(state->pset->formatting), inode, len); + len = len_for(hash_plugin_to_plugin(state->pset->hash), inode, len); + len = len_for(crypto_plugin_to_plugin(state->pset->crypto), inode, len); + len = len_for(digest_plugin_to_plugin(state->pset->digest), inode, len); + len = len_for(compression_plugin_to_plugin(state->pset->compression), inode, len); + assert("nikita-664", len > (int) sizeof (reiser4_plugin_stat)); + return len; +} + +/* helper function for plugin_sd_save(): save plugin, associated with + inode. */ +static int +save_plug(reiser4_plugin * plugin /* plugin to save */ , + struct inode *inode /* object being processed */ , + char **area /* position in stat-data */ , + int *count /* incremented if plugin were actually + * saved. 
*/ ) +{ + reiser4_plugin_slot *slot; + int fake_len; + int result; + + assert("nikita-665", inode != NULL); + assert("nikita-666", area != NULL); + assert("nikita-667", *area != NULL); + + if (plugin == NULL) + return 0; + if (!(reiser4_inode_data(inode)->plugin_mask & (1 << plugin->h.type_id))) + return 0; + slot = (reiser4_plugin_slot *) * area; + cputod16(plugin->h.type_id, &slot->type_id); + cputod16((unsigned) plugin->h.id, &slot->id); + fake_len = (int) 0xffff; + move_on(&fake_len, area, sizeof *slot); + ++*count; + result = 0; + if (plugin->h.pops != NULL) { +#if 0 + align(inode, &fake_len, area, plugin->h.pops->alignment); + /* + commented as it is incompatible with alignment policy + in len_for() -edward + */ +#endif + if (plugin->h.pops->save != NULL) + result = plugin->h.pops->save(inode, plugin, area); + } + return result; +} + +/* save state of all non-standard plugins associated with inode */ +static int +save_plugin_sd(struct inode *inode /* object being processed */ , + char **area /* position in stat-data */ ) +{ + int result; + int num_of_plugins; + reiser4_plugin_stat *sd; + reiser4_inode *state; + int fake_len; + + assert("nikita-669", inode != NULL); + assert("nikita-670", area != NULL); + assert("nikita-671", *area != NULL); + + state = reiser4_inode_data(inode); + if (state->plugin_mask == 0) + return 0; + sd = (reiser4_plugin_stat *) * area; + fake_len = (int) 0xffff; + move_on(&fake_len, area, sizeof *sd); + + num_of_plugins = 0; + /* for now, use hardcoded list of plugins that can be associated + with inode */ + /* AUDIT. 
Hardcoded list of plugins is bad */
+	/* Plugins are saved one after another and saving stops at the first
+	   failure. The calls are deliberately not chained with "||": that
+	   would collapse a negative error code returned by save_plug() into
+	   1 and hide the real error from the caller. */
+	result = save_plug(file_plugin_to_plugin(state->pset->file), inode, area, &num_of_plugins);
+	if (result == 0)
+		result = save_plug(perm_plugin_to_plugin(state->pset->perm), inode, area, &num_of_plugins);
+	if (result == 0)
+		result = save_plug(formatting_plugin_to_plugin(state->pset->formatting), inode, area, &num_of_plugins);
+	if (result == 0)
+		result = save_plug(hash_plugin_to_plugin(state->pset->hash), inode, area, &num_of_plugins);
+	if (result == 0)
+		result = save_plug(crypto_plugin_to_plugin(state->pset->crypto), inode, area, &num_of_plugins);
+	if (result == 0)
+		result = save_plug(digest_plugin_to_plugin(state->pset->digest), inode, area, &num_of_plugins);
+	if (result == 0)
+		result = save_plug(compression_plugin_to_plugin(state->pset->compression), inode, area, &num_of_plugins);
+
+	/* the counter is stored even on failure, as before */
+	cputod16((unsigned) num_of_plugins, &sd->plugins_no);
+	return result;
+}
+
+
+/* helper function for crypto_sd_present(), crypto_sd_save.
+   Allocates memory for crypto stat, keyid and attaches it to the inode.
+   Copies @size bytes of key id and the key size from @tmp. Returns 0 on
+   success or -ENOMEM if either allocation fails (nothing is attached to
+   the inode in that case). */
+
+static int crypto_stat_to_inode (struct inode *inode,
+				 crypto_stat_t * tmp,
+				 unsigned int size /* fingerprint size */)
+{
+	crypto_stat_t * stat;
+
+	assert ("edward-11", (reiser4_inode_data(inode))->crypt == NULL);
+	assert ("edward-33", !inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
+
+	stat = reiser4_kmalloc(sizeof(*stat), GFP_KERNEL);
+	if (!stat)
+		return RETERR(-ENOMEM);
+	stat->keyid = reiser4_kmalloc((size_t)size, GFP_KERNEL);
+	if (!stat->keyid) {
+		reiser4_kfree(stat);
+		return RETERR(-ENOMEM);
+	}
+	/* load inode crypto-stat */
+	stat->keysize = tmp->keysize;
+	xmemcpy(stat->keyid, tmp->keyid, (size_t)size);
+	reiser4_inode_data(inode)->crypt = stat;
+
+	inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
+	return 0;
+}
+
+/* crypto stat-data extension */
+
+static int present_crypto_sd(struct inode *inode, char **area, int *len)
+{
+	int result;
+	reiser4_crypto_stat *sd;
+	crypto_stat_t stat;
+	digest_plugin * dplug = inode_digest_plugin(inode);
+
+	unsigned int keyid_size;
+
+	assert("edward-06", dplug != NULL);
+	assert("edward-07", area != NULL);
+	
assert("edward-08", *area != NULL);
+	assert("edward-09", len != NULL);
+	assert("edward-10", *len > 0);
+
+	if (*len < (int) sizeof (reiser4_crypto_stat)) {
+		return not_enough_space(inode, "crypto-sd");
+	}
+	keyid_size = dplug->digestsize;
+	/* *len is number of bytes in stat data item from *area to the end of
+	   item. It must be not less than size of this extension. This is
+	   checked at run time rather than with an assertion: the length comes
+	   from disk, and with assertions compiled out a short item would
+	   otherwise be read past its end. */
+	if ((int) (sizeof(*sd) + keyid_size) > *len)
+		return not_enough_space(inode, "crypto-sd");
+
+	sd = (reiser4_crypto_stat *) * area;
+	stat.keysize = d16tocpu(&sd->keysize);
+	stat.keyid = (__u8 *)sd->keyid;
+
+	result = crypto_stat_to_inode(inode, &stat, keyid_size);
+	move_on(len, area, sizeof(*sd) + keyid_size);
+	return result;
+}
+
+/* installed as ->absent(): always fails with -EIO */
+static int absent_crypto_sd(struct inode * inode)
+{
+	return -EIO;
+}
+
+/* size of the extension: fixed part plus digestsize bytes of key id */
+static int save_len_crypto_sd(struct inode *inode)
+{
+	return (sizeof(reiser4_crypto_stat) + inode_digest_plugin(inode)->digestsize);
+}
+
+/* Stores key size and key id into stat-data at *area the first time it is
+   called for an inode (REISER4_CRYPTO_STAT_LOADED not yet set); on later
+   calls only advances *area. Always returns 0. */
+static int save_crypto_sd(struct inode *inode, char **area)
+{
+	int result = 0;
+	reiser4_crypto_stat *sd;
+	digest_plugin * dplug = inode_digest_plugin(inode);
+
+	assert("edward-12", dplug != NULL);
+	assert("edward-13", area != NULL);
+	assert("edward-14", *area != NULL);
+	assert("edward-76", reiser4_inode_data(inode) != NULL);
+
+	sd = (reiser4_crypto_stat *) *area;
+	if (!inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
+		/* file is just created */
+		crypto_stat_t * stat = reiser4_inode_data(inode)->crypt;
+
+		assert("edward-15", stat != NULL);
+
+		/* copy inode crypto-stat to the disk stat-data */
+		cputod16(stat->keysize, &sd->keysize);
+		xmemcpy(sd->keyid, stat->keyid, (size_t)dplug->digestsize);
+		inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
+	} else {
+		/* do nothing */
+	}
+	*area += (sizeof(*sd) + dplug->digestsize);
+	return result;
+}
+
+#if REISER4_DEBUG_OUTPUT
+static void
+print_crypto_sd(const char *prefix, char **area /* position in stat-data */ ,
+		int *len /* remaining length */ )
+{
+	/* FIXME-EDWARD Make sure we debug only with none digest plugin */
+	
digest_plugin * dplug = digest_plugin_by_id(NONE_DIGEST_ID); + reiser4_crypto_stat *sd = (reiser4_crypto_stat *) * area; + + printk("%s: keysize: %u keyid: \"%llx\"\n", prefix, d16tocpu(&sd->keysize), *(__u64 *)(sd->keyid)); + move_on(len, area, sizeof(*sd) + dplug->digestsize); +} +#endif + +/* cluster stat-data extension */ + +static int present_cluster_sd(struct inode *inode, char **area, int *len) +{ + reiser4_inode * info; + + assert("edward-77", inode != NULL); + assert("edward-78", area != NULL); + assert("edward-79", *area != NULL); + assert("edward-80", len != NULL); + assert("edward-81", !inode_get_flag(inode, REISER4_CLUSTER_KNOWN)); + + info = reiser4_inode_data(inode); + + assert("edward-82", info != NULL); + + if (*len >= (int) sizeof (reiser4_cluster_stat)) { + reiser4_cluster_stat *sd; + sd = (reiser4_cluster_stat *) * area; + info->cluster_shift = d8tocpu(&sd->cluster_shift); + inode_set_flag(inode, REISER4_CLUSTER_KNOWN); + move_on(len, area, sizeof *sd); + return 0; + } + else + return not_enough_space(inode, "cluster sd"); +} + +static int absent_cluster_sd(struct inode * inode) +{ + return -EIO; +} + +static int save_len_cluster_sd(struct inode *inode UNUSED_ARG) +{ + return sizeof (reiser4_cluster_stat); +} + +static int save_cluster_sd(struct inode *inode, char **area) +{ + reiser4_cluster_stat *sd; + + assert("edward-106", inode != NULL); + assert("edward-107", area != NULL); + assert("edward-108", *area != NULL); + + sd = (reiser4_cluster_stat *) * area; + if (!inode_get_flag(inode, REISER4_CLUSTER_KNOWN)) { + cputod8(reiser4_inode_data(inode)->cluster_shift, &sd->cluster_shift); + inode_set_flag(inode, REISER4_CLUSTER_KNOWN); + } + else { + /* do nothing */ + } + *area += sizeof *sd; + return 0; +} + +#if REISER4_DEBUG_OUTPUT +static void +print_cluster_sd(const char *prefix, char **area /* position in stat-data */, + int *len /* remaining length */ ) +{ + reiser4_cluster_stat *sd = (reiser4_cluster_stat *) * area; + + printk("%s: %u\n", 
prefix, d8tocpu(&sd->cluster_shift)); + move_on(len, area, sizeof *sd); +} +#endif + +static int eio(struct inode *inode, char **area, int *len) +{ + return RETERR(-EIO); +} + +sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = { + [LIGHT_WEIGHT_STAT] = { + .h = { + .type_id = REISER4_SD_EXT_PLUGIN_TYPE, + .id = LIGHT_WEIGHT_STAT, + .pops = NULL, + .label = "light-weight sd", + .desc = "sd for light-weight files", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .present = present_lw_sd, + .absent = NULL, + .save_len = save_len_lw_sd, + .save = save_lw_sd, +#if REISER4_DEBUG_OUTPUT + .print = print_lw_sd, +#endif + .alignment = 8 + }, + [UNIX_STAT] = { + .h = { + .type_id = REISER4_SD_EXT_PLUGIN_TYPE, + .id = UNIX_STAT, + .pops = NULL, + .label = "unix-sd", + .desc = "unix stat-data fields", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .present = present_unix_sd, + .absent = absent_unix_sd, + .save_len = save_len_unix_sd, + .save = save_unix_sd, +#if REISER4_DEBUG_OUTPUT + .print = print_unix_sd, +#endif + .alignment = 8 + }, + [LARGE_TIMES_STAT] = { + .h = { + .type_id = REISER4_SD_EXT_PLUGIN_TYPE, + .id = LARGE_TIMES_STAT, + .pops = NULL, + .label = "64time-sd", + .desc = "nanosecond resolution for times", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .present = present_large_times_sd, + .absent = NULL, + .save_len = save_len_large_times_sd, + .save = save_large_times_sd, +#if REISER4_DEBUG_OUTPUT + .print = print_large_times_sd, +#endif + .alignment = 8 + }, + [SYMLINK_STAT] = { + /* stat data of symlink has this extension */ + .h = { + .type_id = REISER4_SD_EXT_PLUGIN_TYPE, + .id = SYMLINK_STAT, + .pops = NULL, + .label = "symlink-sd", + .desc = "stat data is appended with symlink name", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .present = present_symlink_sd, + .absent = NULL, + .save_len = save_len_symlink_sd, + .save = save_symlink_sd, +#if REISER4_DEBUG_OUTPUT + .print = print_symlink_sd, +#endif + .alignment = 8 + }, + [PLUGIN_STAT] = { + .h = { + .type_id = 
REISER4_SD_EXT_PLUGIN_TYPE, + .id = PLUGIN_STAT, + .pops = NULL, + .label = "plugin-sd", + .desc = "plugin stat-data fields", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .present = present_plugin_sd, + .absent = absent_plugin_sd, + .save_len = save_len_plugin_sd, + .save = save_plugin_sd, +#if REISER4_DEBUG_OUTPUT + .print = NULL, +#endif + .alignment = 8 + }, + [FLAGS_STAT] = { + .h = { + .type_id = REISER4_SD_EXT_PLUGIN_TYPE, + .id = FLAGS_STAT, + .pops = NULL, + .label = "flags-sd", + .desc = "inode bit flags", + .linkage = TYPE_SAFE_LIST_LINK_ZERO} + , + .present = present_flags_sd, + .absent = NULL, + .save_len = save_len_flags_sd, + .save = save_flags_sd, +#if REISER4_DEBUG_OUTPUT + .print = NULL, +#endif + .alignment = 8 + }, + [CAPABILITIES_STAT] = { + .h = { + .type_id = REISER4_SD_EXT_PLUGIN_TYPE, + .id = CAPABILITIES_STAT, + .pops = NULL, + .label = "capabilities-sd", + .desc = "capabilities", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .present = eio, + .absent = NULL, + .save_len = save_len_flags_sd, + .save = save_flags_sd, +#if REISER4_DEBUG_OUTPUT + .print = NULL, +#endif + .alignment = 8 + }, + [CLUSTER_STAT] = { + .h = { + .type_id = REISER4_SD_EXT_PLUGIN_TYPE, + .id = CLUSTER_STAT, + .pops = NULL, + .label = "cluster-sd", + .desc = "cluster shift", + .linkage = TYPE_SAFE_LIST_LINK_ZERO} + , + .present = present_cluster_sd, + .absent = absent_cluster_sd, + /* return IO_ERROR if smthng is wrong */ + .save_len = save_len_cluster_sd, + .save = save_cluster_sd, +#if REISER4_DEBUG_OUTPUT + .print = print_cluster_sd, +#endif + .alignment = 8 + }, + [CRYPTO_STAT] = { + .h = { + .type_id = REISER4_SD_EXT_PLUGIN_TYPE, + .id = CRYPTO_STAT, + .pops = NULL, + .label = "crypto-sd", + .desc = "secret key size and id", + .linkage = TYPE_SAFE_LIST_LINK_ZERO} + , + .present = present_crypto_sd, + .absent = absent_crypto_sd, + /* return IO_ERROR if smthng is wrong */ + .save_len = save_len_crypto_sd, + .save = save_crypto_sd, +#if REISER4_DEBUG_OUTPUT + .print = 
print_crypto_sd, +#endif + .alignment = 8 + } +}; + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/static_stat.h linux-2.6.4-ck1/fs/reiser4/plugin/item/static_stat.h --- linux-2.6.4/fs/reiser4/plugin/item/static_stat.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/static_stat.h 2004-03-11 22:45:15.337502745 +1100 @@ -0,0 +1,220 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* This describes the static_stat item, used to hold all information needed by the stat() syscall. + +In the case where each file has not less than the fields needed by the +stat() syscall, it is more compact to store those fields in this +struct. + +If this item does not exist, then all stats are dynamically resolved. +At the moment, we either resolve all stats dynamically or all of them +statically. If you think this is not fully optimal, and the rest of +reiser4 is working, then fix it...:-) + +*/ + +#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ ) +#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ + +#include "../../forward.h" +#include "../../dformat.h" + +#include /* for struct inode */ + +/* Stat data layout: goals and implementation. + +We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to +them, including not having semantic metadata attached to them. + +There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically sized structure because the statically sized structure knows without recording it what the names and lengths of the attributes are. 
+ +This leads to a natural compromise, which is to special case those files which have simply the standard unix file +attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix file +in their use of file attributes. + +Yet this compromise deserves to be compromised a little. + +We accommodate the case where you have no more than the standard unix file attributes by using an "extension bitmask": +each bit in it indicates presence or absence of a particular stat data extension (see sd_ext_bits enum). + + If the first +bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited from parent +directory (as uid, gid) or initialised to some sane values. + + To capitalize on existing code infrastructure, extensions are + implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE. + Each stat-data extension plugin implements four methods: + + ->present() called by sd_load() when this extension is found in stat-data + ->absent() called by sd_load() when this extension is not found in stat-data + ->save_len() called by sd_len() to calculate total length of stat-data + ->save() called by sd_save() to store extension data into stat-data + + Implementation is in fs/reiser4/plugin/item/static_stat.c +*/ + +/* stat-data extension. Please order this by presumed frequency of use */ +typedef enum { + /* support for light-weight files */ + LIGHT_WEIGHT_STAT, + /* data required to implement unix stat(2) call. Layout is in + reiser4_unix_stat. If this is not present, file is light-weight */ + UNIX_STAT, + /* this contains additional set of 32bit [anc]time fields to implement + nanosecond resolution. Layout is in reiser4_large_times_stat. Usage + of this extension is governed by 32bittimes mount option.
*/ + LARGE_TIMES_STAT, + /* stat data has link name included */ + SYMLINK_STAT, + /* if this is present, file is controlled by non-standard + plugin (that is, plugin that cannot be deduced from file + mode bits), for example, aggregation, interpolation etc. */ + PLUGIN_STAT, + /* this extension contains persistent inode flags. These flags are + single bits: immutable, append-only, etc. Layout is in + reiser4_flags_stat. */ + FLAGS_STAT, + /* this extension contains capabilities sets, associated with this + file. Layout is in reiser4_capabilities_stat */ + CAPABILITIES_STAT, + /* this extension contains the information about minimal unit size for + file data processing. Layout is in reiser4_cluster_stat */ + CLUSTER_STAT, + /* this extension contains size and public id of the secret key. + Layout is in reiser4_crypto_stat */ + CRYPTO_STAT, + LAST_SD_EXTENSION, /* sentinel: one past CRYPTO_STAT, i.e. the number of extension ids listed above */ + /* + * init_inode_static_sd() iterates over extension mask until all + * non-zero bits are processed. This means, that neither ->present(), + * nor ->absent() methods will be called for stat-data extensions that + * go after last present extension. But for some basic extensions, we want + * either ->absent() or ->present() method to be called, because these + * extensions set up something in inode even when they are not + * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all + * extensions before and including LAST_IMPORTANT_SD_EXTENSION either + * ->present(), or ->absent() method will be called, independently of + * what other extensions are present. + */ + LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT, +} sd_ext_bits; + +/* minimal stat-data. This allows to support light-weight files.
*/ +typedef struct reiser4_stat_data_base { + /* 0 */ d16 extmask; /* extension bitmask; presumably one bit per sd_ext_bits value -- TODO confirm */ + /* 2 */ +} PACKED reiser4_stat_data_base; + +typedef struct reiser4_light_weight_stat { + /* 0 */ d16 mode; + /* 2 */ d32 nlink; + /* 6 */ d64 size; + /* size in bytes */ + /* 14; NOTE(review): offsets 6 and 14 assume PACKED leaves no padding and d16/d32/d64 are 2/4/8 bytes wide (these comments used to say 8 and 16) -- confirm against dformat.h */ +} PACKED reiser4_light_weight_stat; + +typedef struct reiser4_unix_stat { + /* owner id */ + /* 0 */ d32 uid; + /* group id */ + /* 4 */ d32 gid; + /* access time */ + /* 8 */ d32 atime; + /* modification time */ + /* 12 */ d32 mtime; + /* change time */ + /* 16 */ d32 ctime; + union { + /* minor:major for device files */ + /* 20 */ d64 rdev; + /* bytes used by file */ + /* 20 */ d64 bytes; + } u; + /* 28 */ +} PACKED reiser4_unix_stat; + +/* symlink stored as part of inode */ +typedef struct reiser4_symlink_stat { + char body[0]; +} PACKED reiser4_symlink_stat; + +typedef struct reiser4_plugin_slot { + /* 0 */ d16 type_id; + /* 2 */ d16 id; +/* 4 *//* here plugin stores its persistent state */ +} PACKED reiser4_plugin_slot; + +/* stat-data extension for files with non-standard plugin. */ +typedef struct reiser4_plugin_stat { + /* number of additional plugins, associated with this object */ + /* 0 */ d16 plugins_no; + /* 2 */ reiser4_plugin_slot slot[0]; + /* 2 */ +} PACKED reiser4_plugin_stat; + +/* stat-data extension for inode flags. Currently it is just fixed-width 32 + * bit mask. If need arise, this can be replaced with variable width + * bitmask.
*/ +typedef struct reiser4_flags_stat { + /* 0 */ d32 flags; + /* 4 */ +} PACKED reiser4_flags_stat; + +typedef struct reiser4_capabilities_stat { + /* 0 */ d32 effective; + /* 4 */ d32 permitted; + /* 8; NOTE(review): offsets assume PACKED and 4-byte d32 (these comments used to say 8 and 16) -- confirm against dformat.h */ +} PACKED reiser4_capabilities_stat; + +typedef struct reiser4_cluster_stat { +/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */ + /* 0 */ d8 cluster_shift; + /* 1 */ +} PACKED reiser4_cluster_stat; + +typedef struct reiser4_crypto_stat { + /* secret key size, bits */ + /* 0 */ d16 keysize; + /* secret key id */ + /* 2 */ d8 keyid[0]; + /* 2 */ +} PACKED reiser4_crypto_stat; + +typedef struct reiser4_large_times_stat { + /* access time */ + /* 0 */ d32 atime; + /* modification time */ + /* 4 */ d32 mtime; + /* change time */ + /* 8 */ d32 ctime; + /* 12; NOTE(review): offsets assume PACKED and 4-byte d32 (these comments used to say 8, 16 and 24) -- confirm against dformat.h */ +} PACKED reiser4_large_times_stat; + +/* this structure is filled by sd_item_stat */ +typedef struct sd_stat { + int dirs; + int files; + int others; +} sd_stat; + +/* plugin->item.common.* */ +extern void print_sd(const char *prefix, coord_t * coord); +extern void item_stat_static_sd(const coord_t * coord, void *vp); + +/* plugin->item.s.sd.* */ +extern int init_inode_static_sd(struct inode *inode, char *sd, int len); +extern int save_len_static_sd(struct inode *inode); +extern int save_static_sd(struct inode *inode, char **area); + +/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */ +#endif + +/* Make Linus happy.
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/item/tail.c linux-2.6.4-ck1/fs/reiser4/plugin/item/tail.c --- linux-2.6.4/fs/reiser4/plugin/item/tail.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/tail.c 2004-03-11 22:45:15.338502589 +1100 @@ -0,0 +1,688 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "item.h" +#include "../../inode.h" +#include "../../page_cache.h" +#include "../../carry.h" + +#include +#include +#include +#include + +/* plugin->u.item.b.max_key_inside: store in @key the key of the item at @coord with its offset replaced by the offset of max_key(); returns @key */ +reiser4_internal reiser4_key * +max_key_inside_tail(const coord_t *coord, reiser4_key *key) +{ + item_key_by_coord(coord, key); + set_key_offset(key, get_key_offset(max_key())); + return key; +} + +/* plugin->u.item.b.can_contain_key: returns 1 when the item at @coord uses the plugin @data->iplug and @key has the same locality and objectid as the item key, 0 otherwise */ +reiser4_internal int +can_contain_key_tail(const coord_t *coord, const reiser4_key *key, const reiser4_item_data *data) +{ + reiser4_key item_key; + + if (item_plugin_by_coord(coord) != data->iplug) + return 0; + + item_key_by_coord(coord, &item_key); + if (get_key_locality(key) != get_key_locality(&item_key) || + get_key_objectid(key) != get_key_objectid(&item_key)) return 0; + + return 1; +} + +/* plugin->u.item.b.mergeable + first item is of tail type */ +/* Audited by: green(2002.06.14) */ +reiser4_internal int +mergeable_tail(const coord_t *p1, const coord_t *p2) +{ + reiser4_key key1, key2; + + assert("vs-535", item_type_by_coord(p1) == UNIX_FILE_METADATA_ITEM_TYPE); + assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID); + + if (item_id_by_coord(p2) != FORMATTING_ID) { + /* second item is of another type */ + return 0; + } + + item_key_by_coord(p1, &key1); + item_key_by_coord(p2, &key2); + if (get_key_locality(&key1) != get_key_locality(&key2) || + get_key_objectid(&key1) != get_key_objectid(&key2) || get_key_type(&key1) != 
get_key_type(&key2)) { + /* items of different objects */ + return 0; + } + if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) { + /* not adjacent items: the offset of @p2 must equal the offset of @p1 plus the number of units in @p1 */ + return 0; + } + return 1; +} +/* print the length of the item at @coord into seq_file @m */ +reiser4_internal void show_tail(struct seq_file *m, coord_t *coord) +{ + seq_printf(m, "length: %i", item_length_by_coord(coord)); +} + +/* plugin->u.item.b.print + plugin->u.item.b.check */ + +/* plugin->u.item.b.nr_units: the unit count of a tail item is its length as reported by item_length_by_coord() */ +reiser4_internal pos_in_node_t +nr_units_tail(const coord_t *coord) +{ + return item_length_by_coord(coord); +} + +/* plugin->u.item.b.lookup: set @coord at the unit addressed by the offset of @key when that offset falls inside this item, otherwise after the last unit */ +reiser4_internal lookup_result +lookup_tail(const reiser4_key *key, lookup_bias bias, coord_t *coord) +{ + reiser4_key item_key; + __u64 lookuped, offset; + unsigned nr_units; + + item_key_by_coord(coord, &item_key); /* NOTE(review): looks redundant, the next statement makes the same call again */ + offset = get_key_offset(item_key_by_coord(coord, &item_key)); + nr_units = nr_units_tail(coord); + + /* key we are looking for must be greater than key of item @coord */ + assert("vs-416", keygt(key, &item_key)); + + /* offset we are looking for */ + lookuped = get_key_offset(key); + + if (lookuped >= offset && lookuped < offset + nr_units) { + /* byte we are looking for is in this item */ + coord->unit_pos = lookuped - offset; + coord->between = AT_UNIT; + return CBK_COORD_FOUND; + } + + /* set coord after last unit */ + coord->unit_pos = nr_units - 1; + coord->between = AFTER_UNIT; + return bias == FIND_MAX_NOT_MORE_THAN ?
CBK_COORD_FOUND : CBK_COORD_NOTFOUND; +} + +/* plugin->u.item.b.paste: fill the @data->length bytes by which the item at @coord has already been enlarged, from user space, from kernel space, or with zeroes when @data->data is NULL. Returns 0, or -EFAULT when the user-space source cannot be read */ +reiser4_internal int +paste_tail(coord_t *coord, reiser4_item_data *data, carry_plugin_info *info UNUSED_ARG) +{ + unsigned old_item_length; + char *item; + + /* length the item had before resizing has been performed */ + old_item_length = item_length_by_coord(coord) - data->length; + + /* tail items never get pasted in the middle */ + assert("vs-363", + (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) || + (coord->unit_pos == old_item_length - 1 && + coord->between == AFTER_UNIT) || + (coord->unit_pos == 0 && old_item_length == 0 && coord->between == AT_UNIT)); + + item = item_body_by_coord(coord); + if (coord->unit_pos == 0) + /* make space for pasted data when pasting at the beginning of + the item */ + xmemmove(item + data->length, item, old_item_length); + + if (coord->between == AFTER_UNIT) + coord->unit_pos++; + + if (data->data) { + assert("vs-554", data->user == 0 || data->user == 1); + if (data->user) { + assert("nikita-3035", schedulable()); + /* copy from user space; a non-zero result means some bytes could not be copied, same check as in overwrite_tail() */ + if (__copy_from_user(item + coord->unit_pos, data->data, (unsigned) data->length)) + return RETERR(-EFAULT); + } else + /* copy from kernel space */ + xmemcpy(item + coord->unit_pos, data->data, (unsigned) data->length); + } else { + xmemset(item + coord->unit_pos, 0, (unsigned) data->length); + } + return 0; +} + +/* plugin->u.item.b.fast_paste */ + +/* plugin->u.item.b.can_shift + number of units is returned via return value, number of bytes via @size.
For + tail items they coincide */ +reiser4_internal int +can_shift_tail(unsigned free_space, coord_t *source UNUSED_ARG, + znode *target UNUSED_ARG, shift_direction direction UNUSED_ARG, unsigned *size, unsigned want) +{ + /* make sure that we do not want to shift more than we have */ + assert("vs-364", want > 0 && want <= (unsigned) item_length_by_coord(source)); + + *size = min(want, free_space); + return *size; +} + +/* plugin->u.item.b.copy_units: copy @count bytes of @source, starting at byte @from, into @target, which the caller has expanded already */ +reiser4_internal void +copy_units_tail(coord_t *target, coord_t *source, + unsigned from, unsigned count, shift_direction where_is_free_space, unsigned free_space UNUSED_ARG) +{ + /* make sure that item @target is expanded already */ + assert("vs-366", (unsigned) item_length_by_coord(target) >= count); + assert("vs-370", free_space >= count); + + if (where_is_free_space == SHIFT_LEFT) { + /* append item @target with @count first bytes of @source */ + assert("vs-365", from == 0); + + xmemcpy((char *) item_body_by_coord(target) + + item_length_by_coord(target) - count, (char *) item_body_by_coord(source), count); + } else { + /* target item is moved to right already */ + reiser4_key key; + + assert("vs-367", (unsigned) item_length_by_coord(source) == from + count); + + xmemcpy((char *) item_body_by_coord(target), (char *) item_body_by_coord(source) + from, count); + + /* new units are inserted before first unit in an item, + therefore, we have to update item key */ + item_key_by_coord(source, &key); + set_key_offset(&key, get_key_offset(&key) + from); + + node_plugin_by_node(target->node)->update_item_key(target, &key, 0 /*info */); + } +} + +/* plugin->u.item.b.create_hook */ + + +/* item_plugin->b.kill_hook + this is called when @count units starting from @from-th one are going to be removed. + NOTE(review): @from is marked UNUSED_ARG although it is used to compute the start offset */ +reiser4_internal int +kill_hook_tail(const coord_t *coord, pos_in_node_t from UNUSED_ARG, + pos_in_node_t count, struct carry_kill_data *kdata) +{ + reiser4_key key; + loff_t start, end; + + assert("vs-1577", kdata);
+ assert("vs-1579", kdata->inode); + + item_key_by_coord(coord, &key); + start = get_key_offset(&key) + from; + end = start + count; + fake_kill_hook_tail(kdata->inode, start, end); + return 0; +} + +/* plugin->u.item.b.shift_hook */ + +/* helper for kill_units_tail and cut_units_tail: fills @smallest_removed and @new_first when they are not NULL and returns the number of units removed; the bytes themselves are only zeroed when REISER4_DEBUG is set */ +static int +do_cut_or_kill(coord_t *coord, pos_in_node_t from, pos_in_node_t to, + reiser4_key *smallest_removed, reiser4_key *new_first) +{ + pos_in_node_t count; + + /* this method is only called to remove part of item */ + assert("vs-374", (to - from + 1) < item_length_by_coord(coord)); + /* tail items are never cut from the middle of an item */ + assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord))); + assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord))); + + count = to - from + 1; + + if (smallest_removed) { + /* store smallest key removed */ + item_key_by_coord(coord, smallest_removed); + set_key_offset(smallest_removed, get_key_offset(smallest_removed) + from); + } + if (new_first) { + /* head of item is cut */ + assert("vs-1529", from == 0); + + item_key_by_coord(coord, new_first); + set_key_offset(new_first, get_key_offset(new_first) + from + count); + } + + if (REISER4_DEBUG) + xmemset((char *) item_body_by_coord(coord) + from, 0, count); + return count; +} + +/* plugin->u.item.b.cut_units: thin wrapper around do_cut_or_kill(); @cdata is not used */ +reiser4_internal int +cut_units_tail(coord_t *coord, pos_in_node_t from, pos_in_node_t to, + struct carry_cut_data *cdata UNUSED_ARG, reiser4_key *smallest_removed, reiser4_key *new_first) +{ + return do_cut_or_kill(coord, from, to, smallest_removed, new_first); +} + +/* plugin->u.item.b.kill_units: runs kill_hook_tail() for the removed range, then does the same as cut_units_tail() */ +reiser4_internal int +kill_units_tail(coord_t *coord, pos_in_node_t from, pos_in_node_t to, + struct carry_kill_data *kdata, reiser4_key *smallest_removed, reiser4_key *new_first) +{ + kill_hook_tail(coord, from, to - from + 1, kdata); + return do_cut_or_kill(coord, from, to, smallest_removed, new_first); +} + +/* plugin->u.item.b.unit_key */
+reiser4_internal reiser4_key * +unit_key_tail(const coord_t *coord, reiser4_key *key) +{ + assert("vs-375", coord_is_existing_unit(coord)); + + item_key_by_coord(coord, key); + set_key_offset(key, (get_key_offset(key) + coord->unit_pos)); + + return key; +} + +/* plugin->u.item.b.estimate + plugin->u.item.b.item_data_by_flow */ + +/* overwrite tail item or its part by use data */ +static int +overwrite_tail(coord_t *coord, flow_t *f) +{ + unsigned count; + + assert("vs-570", f->user == 1); + assert("vs-946", f->data); + assert("vs-947", coord_is_existing_unit(coord)); + assert("vs-948", znode_is_write_locked(coord->node)); + assert("nikita-3036", schedulable()); + + count = item_length_by_coord(coord) - coord->unit_pos; + if (count > f->length) + count = f->length; + + if (__copy_from_user((char *) item_body_by_coord(coord) + coord->unit_pos, f->data, count)) + return RETERR(-EFAULT); + + znode_make_dirty(coord->node); + + move_flow_forward(f, count); + return 0; +} + +/* tail redpage function. It is called from readpage_tail(). */ +reiser4_internal int do_readpage_tail(uf_coord_t *uf_coord, struct page *page) { + tap_t tap; + int result; + coord_t coord; + lock_handle lh; + + int count, mapped; + struct inode *inode; + + /* saving passed coord in order to do not move it by tap. */ + init_lh(&lh); + copy_lh(&lh, uf_coord->lh); + inode = page->mapping->host; + coord_dup(&coord, &uf_coord->base_coord); + + tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK); + + if ((result = tap_load(&tap))) + goto out_tap_done; + + /* lookup until page is filled up. */ + for (mapped = 0; mapped < PAGE_CACHE_SIZE; mapped += count) { + void *pagedata; + + /* number of bytes to be copied to page. */ + count = item_length_by_coord(&coord) - coord.unit_pos; + + if (count > PAGE_CACHE_SIZE - mapped) + count = PAGE_CACHE_SIZE - mapped; + + /* attaching @page to address space and getting data address. */ + pagedata = kmap_atomic(page, KM_USER0); + + /* copying tail body to page. 
*/ + xmemcpy((char *)(pagedata + mapped), + ((char *)item_body_by_coord(&coord) + coord.unit_pos), count); + + flush_dcache_page(page); + + /* dettaching page from address space. */ + kunmap_atomic(page, KM_USER0); + + /* Getting next tail item. */ + if (mapped + count < PAGE_CACHE_SIZE) { + + /* unlocking page in order to avoid keep it locked durring tree lookup, + which takes long term locks. */ + unlock_page(page); + + /* getting right neighbour. */ + result = go_dir_el(&tap, RIGHT_SIDE, 0); + + /* lock page back */ + lock_page(page); + + /* page is uptodate due to another thread made it up to date. Getting + out of here. */ + if (PageUptodate(page)) { + result = 0; + goto out_unlock_page; + } + + if (result) { + /* check if there is no neighbour node. */ + if (result == -E_NO_NEIGHBOR) { + result = 0; + goto out_update_page; + } else { + goto out_tap_relse; + } + } else { + /* check if found coord is not owned by file. */ + if (!inode_file_plugin(inode)->owns_item(inode, &coord)) { + result = 0; + goto out_update_page; + } + } + } + } + + /* making page up to date and releasing it. */ + SetPageUptodate(page); + unlock_page(page); + + /* releasing tap */ + tap_relse(&tap); + tap_done(&tap); + + return 0; + + out_update_page: + SetPageUptodate(page); + out_unlock_page: + unlock_page(page); + out_tap_relse: + tap_relse(&tap); + out_tap_done: + tap_done(&tap); + return result; +} + +/* + plugin->s.file.readpage + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail + or + filemap_nopage->reiser4_readpage->readpage_unix_file->->readpage_tail + + At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail + item. 
*/ +reiser4_internal int +readpage_tail(void *vp, struct page *page) +{ + uf_coord_t *uf_coord = vp; /* @vp is passed untyped and used as a uf_coord_t */ + ON_DEBUG(coord_t *coord = &uf_coord->base_coord); + ON_DEBUG(reiser4_key key); + + assert("umka-2515", PageLocked(page)); + assert("umka-2516", !PageUptodate(page)); + assert("umka-2517", !jprivate(page) && !PagePrivate(page)); + assert("umka-2518", page->mapping && page->mapping->host); + + assert("umka-2519", znode_is_loaded(coord->node)); + assert("umka-2520", item_is_tail(coord)); + assert("umka-2521", coord_is_existing_unit(coord)); + assert("umka-2522", znode_is_rlocked(coord->node)); + assert("umka-2523", page->mapping->host->i_ino == get_key_objectid(item_key_by_coord(coord, &key))); + + return do_readpage_tail(uf_coord, page); +} +/* bookkeeping after data described by @f went into the tree: when @do_set_hint is set, save (or drop) @hint and release its long term znode lock; grow i_size to the offset of @f->key if needed, stamp ctime/mtime and update stat data; when @back_to_dirty is set, move the inode to the superblock dirty list; call balance_dirty_page_unix_file() and re-validate @hint. Returns the reiser4_update_sd() error or the hint_validate() result */ +reiser4_internal int +item_balance_dirty_pages(struct address_space *mapping, const flow_t *f, + hint_t *hint, int back_to_dirty, int do_set_hint) +{ + int result; + struct inode *inode; + + if (do_set_hint) { + if (hint->coord.valid) + set_hint(hint, &f->key, ZNODE_WRITE_LOCK); + else + unset_hint(hint); + longterm_unlock_znode(hint->coord.lh); + } + + inode = mapping->host; + if (get_key_offset(&f->key) > inode->i_size) + INODE_SET_FIELD(inode, i_size, get_key_offset(&f->key)); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + result = reiser4_update_sd(inode); + if (result) + return result; + + /* FIXME-VS: this is temporary: the problem is that bdp takes inodes + from sb's dirty list and it looks like nobody puts there inodes of + files which are built of tails */ + if (back_to_dirty) { + mapping->dirtied_when = jiffies|1; + spin_lock(&inode_lock); + list_move(&inode->i_list, &inode->i_sb->s_dirty); + spin_unlock(&inode_lock); + } + + balance_dirty_page_unix_file(inode); + return hint_validate(hint, &f->key, 0/* do not check key */, ZNODE_WRITE_LOCK); +} + +/* drop longterm znode lock before calling balance_dirty_pages.
balance_dirty_pages may cause transaction to close, + therefore we have to update stat data if necessary */ +static int formatting_balance_dirty_pages(struct address_space *mapping, const flow_t *f, + hint_t *hint) +{ + return item_balance_dirty_pages(mapping, f, hint, 1, 1/* set hint */); +} + +/* calculate number of blocks which can be dirtied/added when flow is inserted and stat data gets updated and grab them. + FIXME-VS: we may want to call grab_space with BA_CAN_COMMIT flag but that would require all that complexity with + sealing coord, releasing long term lock and validating seal later */ +static int +insert_flow_reserve(reiser4_tree *tree) +{ + grab_space_enable(); + return reiser4_grab_space(estimate_insert_flow(tree->height) + estimate_one_insert_into_item(tree), 0); +} + +/* one block gets overwritten and stat data may get updated */ +static int +overwrite_reserve(reiser4_tree *tree) +{ + grab_space_enable(); + return reiser4_grab_space(1 + estimate_one_insert_into_item(tree), 0); +} + +/* plugin->u.item.s.file.write + access to data stored in tails goes directly through formatted nodes */ +reiser4_internal int +write_tail(struct inode *inode, flow_t *f, hint_t *hint, + int grabbed, /* tail's write may be called from plain unix file write and from tail conversion. In first + case (grabbed == 0) space is not reserved forehand, so, it must be done here. When it is + being called from tail conversion - space is reserved already for whole operation which may + involve several calls to item write.
In this case space reservation will not be done here */ + write_mode_t mode) +{ + int result; + coord_t *coord; + + assert("vs-1338", hint->coord.valid == 1); + + coord = &hint->coord.base_coord; + result = 0; + while (f->length && hint->coord.valid == 1) { + switch (mode) { + case FIRST_ITEM: + case APPEND_ITEM: + /* check quota before appending data */ + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, f->length)) { + result = RETERR(-EDQUOT); + break; + } + + if (!grabbed) + result = insert_flow_reserve(znode_get_tree(coord->node)); + if (!result) + result = insert_flow(coord, hint->coord.lh, f); + if (f->length) + DQUOT_FREE_SPACE_NODIRTY(inode, f->length); + break; + + case OVERWRITE_ITEM: + if (!grabbed) + result = overwrite_reserve(znode_get_tree(coord->node)); + if (!result) + result = overwrite_tail(coord, f); + break; + + default: + impossible("vs-1031", "does this ever happen?"); + result = RETERR(-EIO); + break; + + } + + if (result) { + if (!grabbed) + all_grabbed2free(); + break; + } + + /* FIXME: do not rely on a coord yet */ + hint->coord.valid = 0; + + /* throttle the writer */ + result = formatting_balance_dirty_pages(inode->i_mapping, f, hint); + if (!grabbed) + all_grabbed2free(); + if (result) { + // reiser4_stat_tail_add(bdp_caused_repeats); + break; + } + } + + return result; +} + +#if REISER4_DEBUG + +static int +coord_matches_key_tail(const coord_t *coord, const reiser4_key *key) +{ + reiser4_key item_key; + + assert("vs-1356", coord_is_existing_unit(coord)); + assert("vs-1354", keylt(key, append_key_tail(coord, &item_key))); + assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key))); + return get_key_offset(key) == get_key_offset(&item_key) + coord->unit_pos; + +} + +#endif + +/* plugin->u.item.s.file.read */ +reiser4_internal int +read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint) +{ + unsigned count; + int item_length; + coord_t *coord; + uf_coord_t *uf_coord; + + uf_coord = &hint->coord; + coord = &uf_coord->base_coord; + +
assert("vs-571", f->user == 1); + assert("vs-571", f->data); /* NOTE(review): reuses the assertion id "vs-571" of the previous line */ + assert("vs-967", coord && coord->node); + assert("vs-1117", znode_is_rlocked(coord->node)); + assert("vs-1118", znode_is_loaded(coord->node)); + + assert("nikita-3037", schedulable()); + assert("vs-1357", coord_matches_key_tail(coord, &f->key)); + + /* calculate number of bytes to read off the item */ + item_length = item_length_by_coord(coord); + count = item_length_by_coord(coord) - coord->unit_pos; + if (count > f->length) + count = f->length; + + + /* FIXME: unlock long term lock ! */ + + if (__copy_to_user(f->data, ((char *) item_body_by_coord(coord) + coord->unit_pos), count)) + return RETERR(-EFAULT); + + /* probably mark_page_accessed() should only be called if + * coord->unit_pos is zero. */ + mark_page_accessed(znode_page(coord->node)); + move_flow_forward(f, count); + + coord->unit_pos += count; + if (item_length == coord->unit_pos) { + coord->unit_pos --; + coord->between = AFTER_UNIT; + } + + return 0; +} + +/* + plugin->u.item.s.file.append_key + key of the first byte after the last byte addressed by this item +*/ +reiser4_internal reiser4_key * +append_key_tail(const coord_t *coord, reiser4_key *key) +{ + item_key_by_coord(coord, key); + set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord)); + return key; +} + +/* plugin->u.item.s.file.init_coord_extension: only marks @uf_coord valid; @lookuped is not used */ +reiser4_internal void +init_coord_extension_tail(uf_coord_t *uf_coord, loff_t lookuped) +{ + uf_coord->valid = 1; +} + +/* + plugin->u.item.s.file.get_block: stores in @bh->b_blocknr the block number of the node @uf_coord points into; @block is not used +*/ +reiser4_internal int +get_block_address_tail(const uf_coord_t *uf_coord, sector_t block, struct buffer_head *bh) +{ + assert("nikita-3252", + znode_get_level(uf_coord->base_coord.node) == LEAF_LEVEL); + + bh->b_blocknr = *znode_get_block(uf_coord->base_coord.node); + return 0; +} + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp
linux-2.6.4/fs/reiser4/plugin/item/tail.h linux-2.6.4-ck1/fs/reiser4/plugin/item/tail.h --- linux-2.6.4/fs/reiser4/plugin/item/tail.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/item/tail.h 2004-03-11 22:45:15.339502434 +1100 @@ -0,0 +1,56 @@ +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#if !defined( __REISER4_TAIL_H__ ) +#define __REISER4_TAIL_H__ + +typedef struct { + int not_used; /* placeholder member, as its name says */ +} tail_coord_extension_t; + +struct cut_list; + + +/* plugin->u.item.b.* */ +reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *); +int can_contain_key_tail(const coord_t * coord, const reiser4_key * key, const reiser4_item_data *); +int mergeable_tail(const coord_t * p1, const coord_t * p2); +pos_in_node_t nr_units_tail(const coord_t *); +lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *); +int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *); +int can_shift_tail(unsigned free_space, coord_t * source, + znode * target, shift_direction, unsigned *size, unsigned want); +void copy_units_tail(coord_t * target, coord_t * source, + unsigned from, unsigned count, shift_direction, unsigned free_space); +int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count, struct carry_kill_data *); +int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to, + struct carry_cut_data *, reiser4_key *smallest_removed, reiser4_key *new_first); +int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to, + struct carry_kill_data *, reiser4_key *smallest_removed, reiser4_key *new_first); +reiser4_key *unit_key_tail(const coord_t *, reiser4_key *); + +/* plugin->u.item.s.* */ +int write_tail(struct inode *, flow_t *, hint_t *, int grabbed, write_mode_t); +int read_tail(struct file *, flow_t *, hint_t *); +int readpage_tail(void *vp, struct page *page); +reiser4_key *append_key_tail(const coord_t *, reiser4_key *); +void init_coord_extension_tail(uf_coord_t *,
loff_t offset); +int get_block_address_tail(const uf_coord_t *uf_coord, + sector_t block, struct buffer_head *bh); + +void show_tail(struct seq_file *m, coord_t *coord); +int item_balance_dirty_pages(struct address_space *mapping, const flow_t *f, + hint_t *hint, int back_to_dirty, int set_hint); + +/* __REISER4_TAIL_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/name/invterp.c linux-2.6.4-ck1/fs/reiser4/plugin/name/invterp.c --- linux-2.6.4/fs/reiser4/plugin/name/invterp.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/name/invterp.c 2004-03-11 22:45:15.339502434 +1100 @@ -0,0 +1,11 @@ +/* Invterp is short for invertable interpolate, and interpolate means to +substitute in. + +Example: + +/filenameA/<> +will resolve to +/filenameA<-`The contents of filenameA' +wherever used. + +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/node/node40.c linux-2.6.4-ck1/fs/reiser4/plugin/node/node40.c --- linux-2.6.4/fs/reiser4/plugin/node/node40.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/node/node40.c 2004-03-11 22:45:15.347501190 +1100 @@ -0,0 +1,2860 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/*#include "../../forward.h"*/ +#include "../../debug.h" +#include "../../key.h" +#include "../../coord.h" +#include "../plugin_header.h" +#include "../item/item.h" +#include "node.h" +#include "node40.h" +#include "../plugin.h" +#include "../../jnode.h" +#include "../../znode.h" +#include "../../pool.h" +#include "../../carry.h" +#include "../../tap.h" +#include "../../tree.h" +#include "../../super.h" +#include "../../reiser4.h" + +#include +#include +#include + +/* leaf 40 format: + + [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. 
item_head 1, item head 0 ] + plugin_id (16) key + free_space (16) pluginid (16) + free_space_start (16) offset (16) + level (8) + num_items (16) + magic (32) + flush_time (32) +*/ + +/* magic number that is stored in ->magic field of node header */ +const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */ + +static int prepare_for_update(znode * left, znode * right, carry_plugin_info * info); + +/* header of node of reiser40 format is at the beginning of node */ +static inline node40_header * +node40_node_header(const znode * node /* node to + * query */ ) +{ + assert("nikita-567", node != NULL); + assert("nikita-568", znode_page(node) != NULL); + assert("nikita-569", zdata(node) != NULL); + return (node40_header *) zdata(node); +} + +/* functions to get/set fields of node40_header */ + +static __u32 +nh40_get_magic(node40_header * nh) +{ + return d32tocpu(&nh->magic); +} + +static void +nh40_set_magic(node40_header * nh, __u32 magic) +{ + cputod32(magic, &nh->magic); +} + +static void +nh40_set_free_space(node40_header * nh, unsigned value) +{ + cputod16(value, &nh->free_space); + /*node->free_space = value; */ +} + +static inline unsigned +nh40_get_free_space(node40_header * nh) +{ + return d16tocpu(&nh->free_space); +} + +static void +nh40_set_free_space_start(node40_header * nh, unsigned value) +{ + cputod16(value, &nh->free_space_start); +} + +static inline unsigned +nh40_get_free_space_start(node40_header * nh) +{ + return d16tocpu(&nh->free_space_start); +} + +static inline void +nh40_set_level(node40_header * nh, unsigned value) +{ + cputod8(value, &nh->level); +} + +static unsigned +nh40_get_level(node40_header * nh) +{ + return d8tocpu(&nh->level); +} + +static void +nh40_set_num_items(node40_header * nh, unsigned value) +{ + cputod16(value, &nh->nr_items); +} + +static inline unsigned +nh40_get_num_items(node40_header * nh) +{ + return d16tocpu(&nh->nr_items); +} + +static void +nh40_set_mkfs_id(node40_header * nh, __u32 id) +{ + 
cputod32(id, &nh->mkfs_id); +} + +static inline __u32 +nh40_get_mkfs_id(node40_header * nh) +{ + return d32tocpu(&nh->mkfs_id); +} + +#if 0 +static void +nh40_set_flush_id(node40_header * nh, __u64 id) +{ + cputod64(id, &nh->flush.flush_id); +} +#endif + +static inline __u64 +nh40_get_flush_id(node40_header * nh) +{ + return d64tocpu(&nh->flush_id); +} + +/* plugin field of node header should be read/set by + plugin_by_disk_id/save_disk_plugin */ + +/* array of item headers is at the end of node */ +static inline item_header40 * +node40_ih_at(const znode * node, unsigned pos) +{ + return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1; +} + +/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1 + */ +static inline item_header40 * +node40_ih_at_coord(const coord_t * coord) +{ + return (item_header40 *) (zdata(coord->node) + znode_size(coord->node)) - (coord->item_pos) - 1; +} + +/* functions to get/set fields of item_header40 */ +static void +ih40_set_offset(item_header40 * ih, unsigned offset) +{ + cputod16(offset, &ih->offset); +} + +static inline unsigned +ih40_get_offset(item_header40 * ih) +{ + return d16tocpu(&ih->offset); +} + +/* plugin field of item header should be read/set by + plugin_by_disk_id/save_disk_plugin */ + +/* plugin methods */ + +/* plugin->u.node.item_overhead + look for description of this method in plugin/node/node.h */ +reiser4_internal size_t +item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG) +{ + return sizeof (item_header40); +} + +/* plugin->u.node.free_space + look for description of this method in plugin/node/node.h */ +reiser4_internal size_t free_space_node40(znode * node) +{ + assert("nikita-577", node != NULL); + assert("nikita-578", znode_is_loaded(node)); + assert("nikita-579", zdata(node) != NULL); + trace_stamp(TRACE_NODES); + + return nh40_get_free_space(node40_node_header(node)); +} + +/* private inline version of node40_num_of_items() for use in this file. 
This + is necessary, because address of node40_num_of_items() is taken and it is + never inlined as a result. */ +static inline short +node40_num_of_items_internal(const znode * node) +{ + trace_stamp(TRACE_NODES); + return nh40_get_num_items(node40_node_header(node)); +} + +#if REISER4_DEBUG +static inline void check_num_items(const znode *node) +{ + assert("nikita-2749", + node40_num_of_items_internal(node) == node->nr_items); + assert("nikita-2746", znode_is_write_locked(node)); +} +#else +#define check_num_items(node) noop +#endif + +/* plugin->u.node.num_of_items + look for description of this method in plugin/node/node.h */ +reiser4_internal int +num_of_items_node40(const znode * node) +{ + trace_stamp(TRACE_NODES); + return node40_num_of_items_internal(node); +} + +static void +node40_set_num_items(znode * node, node40_header * nh, unsigned value) +{ + assert("nikita-2751", node != NULL); + assert("nikita-2750", nh == node40_node_header(node)); + + check_num_items(node); + nh40_set_num_items(nh, value); + node->nr_items = value; + check_num_items(node); +} + +/* plugin->u.node.item_by_coord + look for description of this method in plugin/node/node.h */ +reiser4_internal char * +item_by_coord_node40(const coord_t * coord) +{ + item_header40 *ih; + char *p; + + /* @coord is set to existing item */ + assert("nikita-596", coord != NULL); + assert("vs-255", coord_is_existing_item(coord)); + + ih = node40_ih_at_coord(coord); + p = zdata(coord->node) + ih40_get_offset(ih); + return p; +} + +/* plugin->u.node.length_by_coord + look for description of this method in plugin/node/node.h */ +reiser4_internal int +length_by_coord_node40(const coord_t * coord) +{ + item_header40 *ih; + int result; + + /* @coord is set to existing item */ + assert("vs-256", coord != NULL); + assert("vs-257", coord_is_existing_item(coord)); + + ih = node40_ih_at_coord(coord); + if ((int) coord->item_pos == node40_num_of_items_internal(coord->node) - 1) + result = 
nh40_get_free_space_start(node40_node_header(coord->node)) - ih40_get_offset(ih); + else + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih); + + return result; +} + +static pos_in_node_t +node40_item_length(const znode *node, pos_in_node_t item_pos) +{ + item_header40 *ih; + pos_in_node_t result; + + /* @coord is set to existing item */ + assert("vs-256", node != NULL); + assert("vs-257", node40_num_of_items_internal(node) > item_pos); + + ih = node40_ih_at(node, item_pos); + if (item_pos == node40_num_of_items_internal(node) - 1) + result = nh40_get_free_space_start(node40_node_header(node)) - ih40_get_offset(ih); + else + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih); + + return result; +} + +/* plugin->u.node.plugin_by_coord + look for description of this method in plugin/node/node.h */ +reiser4_internal item_plugin * +plugin_by_coord_node40(const coord_t * coord) +{ + item_header40 *ih; + item_plugin *result; + + /* @coord is set to existing item */ + assert("vs-258", coord != NULL); + assert("vs-259", coord_is_existing_item(coord)); + + ih = node40_ih_at_coord(coord); + /* pass NULL in stead of current tree. This is time critical call. 
*/ + result = item_plugin_by_disk_id(NULL, &ih->plugin_id); + return result; +} + +/* plugin->u.node.key_at + look for description of this method in plugin/node/node.h */ +reiser4_internal reiser4_key * +key_at_node40(const coord_t * coord, reiser4_key * key) +{ + item_header40 *ih; + + assert("nikita-1765", coord_is_existing_item(coord)); + + /* @coord is set to existing item */ + ih = node40_ih_at_coord(coord); + xmemcpy(key, &ih->key, sizeof (reiser4_key)); + return key; +} + +/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */ + +#define NODE_INCSTAT(n, counter) \ + reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter) + +#define NODE_ADDSTAT(n, counter, val) \ + reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val) + +/* plugin->u.node.lookup + look for description of this method in plugin/node/node.h */ +reiser4_internal node_search_result +lookup_node40(znode * node /* node to query */ , + const reiser4_key * key /* key to look for */ , + lookup_bias bias /* search bias */ , + coord_t * coord /* resulting coord */ ) +{ + int left; + int right; + int found; + int items; + + item_header40 *lefth; + item_header40 *righth; + + item_plugin *iplug; + item_header40 *bstop; + item_header40 *ih; + cmp_t order; + + assert("nikita-583", node != NULL); + assert("nikita-584", key != NULL); + assert("nikita-585", coord != NULL); + assert("nikita-2693", znode_is_any_locked(node)); + cassert(REISER4_SEQ_SEARCH_BREAK > 2); + + trace_stamp(TRACE_NODES); + + items = node_num_items(node); + NODE_INCSTAT(node, calls); + NODE_ADDSTAT(node, items, items); + + node_check(node, REISER4_NODE_DKEYS); + + if (unlikely(items == 0)) { + coord_init_first_unit(coord, node); + return NS_NOT_FOUND; + } + + /* binary search for item that can contain given key */ + left = 0; + right = items - 1; + coord->node = node; + coord_clear_iplug(coord); + found = 0; + + lefth = node40_ih_at(node, left); + righth = 
node40_ih_at(node, right); + + /* It is known that for small arrays sequential search is on average + more efficient than binary. This is because sequential search is + coded as tight loop that can be better optimized by compilers and + for small array size gain from this optimization makes sequential + search the winner. Another, maybe more important, reason for this, + is that sequential array is more CPU cache friendly, whereas binary + search effectively destroys CPU caching. + + Critical here is the notion of "smallness". Reasonable value of + REISER4_SEQ_SEARCH_BREAK can be found by playing with code in + fs/reiser4/ulevel/ulevel.c:test_search(). + + Don't try to further optimize sequential search by scanning from + right to left in attempt to use more efficient loop termination + condition (comparison with 0). This doesn't work. + + */ + + while (right - left >= REISER4_SEQ_SEARCH_BREAK) { + int median; + item_header40 *medianh; + + median = (left + right) / 2; + medianh = node40_ih_at(node, median); + + assert("nikita-1084", median >= 0); + assert("nikita-1085", median < items); + NODE_INCSTAT(node, binary); + switch (keycmp(key, &medianh->key)) { + case LESS_THAN: + right = median; + righth = medianh; + break; + default: + wrong_return_value("nikita-586", "keycmp"); + case GREATER_THAN: + left = median; + lefth = medianh; + break; + case EQUAL_TO: + do { + -- median; + /* headers are ordered from right to left */ + ++ medianh; + } while (median >= 0 && keyeq(key, &medianh->key)); + right = left = median + 1; + ih = lefth = righth = medianh - 1; + found = 1; + break; + } + } + /* sequential scan. Item headers, and, therefore, keys are stored at + the rightmost part of a node from right to left. We are trying to + access memory from left to right, and hence, scan in _descending_ + order of item numbers. 
+ */ + if (!found) { + for (left = right, ih = righth; left >= 0; ++ ih, -- left) { + cmp_t comparison; + + NODE_INCSTAT(node, seq); + prefetchkey(&(ih + 1)->key); + comparison = keycmp(&ih->key, key); + if (comparison == GREATER_THAN) + continue; + if (comparison == EQUAL_TO) { + found = 1; + do { + -- left; + ++ ih; + } while (left >= 0 && keyeq(&ih->key, key)); + ++ left; + -- ih; + } else { + assert("nikita-1256", comparison == LESS_THAN); + } + break; + } + if (unlikely(left < 0)) + left = 0; + } + + assert("nikita-3212", right >= left); + assert("nikita-3214", + equi(found, keyeq(&node40_ih_at(node, left)->key, key))); + +#if REISER4_STATS + NODE_ADDSTAT(node, found, !!found); + NODE_ADDSTAT(node, pos, left); + if (items > 1) + NODE_ADDSTAT(node, posrelative, (left << 10) / (items - 1)); + else + NODE_ADDSTAT(node, posrelative, 1 << 10); + if (left == node->last_lookup_pos) + NODE_INCSTAT(node, samepos); + if (left == node->last_lookup_pos + 1) + NODE_INCSTAT(node, nextpos); + node->last_lookup_pos = left; +#endif + + coord_set_item_pos(coord, left); + coord->unit_pos = 0; + coord->between = AT_UNIT; + + /* key < leftmost key in a mode or node is corrupted and keys + are not sorted */ + bstop = node40_ih_at(node, (unsigned) left); + order = keycmp(&bstop->key, key); + if (unlikely(order == GREATER_THAN)) { + if (unlikely(left != 0)) { + /* screw up */ + warning("nikita-587", "Key less than %i key in a node", left); + print_key("key", key); + print_key("min", &bstop->key); + print_znode("node", node); + print_coord_content("coord", coord); + return RETERR(-EIO); + } else { + coord->between = BEFORE_UNIT; + return NS_NOT_FOUND; + } + } + /* left <= key, ok */ + iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id); + + if (unlikely(iplug == NULL)) { + warning("nikita-588", "Unknown plugin %i", d16tocpu(&bstop->plugin_id)); + print_key("key", key); + print_znode("node", node); + print_coord_content("coord", coord); + return RETERR(-EIO); + } + 
+ + coord_set_iplug(coord, iplug); + + /* if exact key from item header was found by binary search, no + further checks are necessary. */ + if (found) { + assert("nikita-1259", order == EQUAL_TO); + return NS_FOUND; + } + if (iplug->b.max_key_inside != NULL) { + reiser4_key max_item_key; + + /* key > max_item_key --- outside of an item */ + if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) { + coord->unit_pos = 0; + coord->between = AFTER_ITEM; + /* FIXME-VS: key we are looking for does not fit into + found item. Return NS_NOT_FOUND then. Without that + the following case does not work: there is extent of + file 10000, 10001. File 10000, 10002 has been just + created. When writing to position 0 in that file - + traverse_tree will stop here on twig level. When we + want it to go down to leaf level + */ + return NS_NOT_FOUND; + } + } + + if (iplug->b.lookup != NULL) { + return iplug->b.lookup(key, bias, coord); + } else { + assert("nikita-1260", order == LESS_THAN); + coord->between = AFTER_UNIT; + return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND; + } +} + +#undef NODE_ADDSTAT +#undef NODE_INCSTAT + +/* plugin->u.node.estimate + look for description of this method in plugin/node/node.h. + Returns the free space of @node minus the size of one item header, or 0 + when the free space cannot hold even the header. */ +reiser4_internal size_t estimate_node40(znode * node) +{ + size_t result; + + assert("nikita-597", node != NULL); + + result = free_space_node40(node); + + /* size_t is unsigned: compare before subtracting, otherwise a node with + less free space than one item header would wrap around to a huge + estimate instead of 0 */ + return (result > sizeof(item_header40)) ? 
result - sizeof(item_header40) : 0; +} + +/* plugin->u.node.check + look for description of this method in plugin/node/node.h */ +reiser4_internal int +check_node40(const znode * node /* node to check */ , + __u32 flags /* check flags */ , + const char **error /* where to store error message */ ) +{ + int nr_items; + int i; + reiser4_key prev; + unsigned old_offset; + tree_level level; + coord_t coord; + + assert("nikita-580", node != NULL); + assert("nikita-581", error != NULL); + assert("nikita-2948", znode_is_loaded(node)); + trace_stamp(TRACE_NODES); + + + if (ZF_ISSET(node, JNODE_HEARD_BANSHEE)) + return 0; + + assert("nikita-582", zdata(node) != NULL); + + nr_items = node40_num_of_items_internal(node); + if (nr_items < 0) { + *error = "Negative number of items"; + return -1; + } + + if (flags & REISER4_NODE_DKEYS) + prev = node->ld_key; + else + prev = *min_key(); + + old_offset = 0; + coord_init_zero(&coord); + coord.node = (znode *) node; + coord.unit_pos = 0; + coord.between = AT_UNIT; + level = znode_get_level(node); + for (i = 0; i < nr_items; i++) { + item_header40 *ih; + reiser4_key unit_key; + unsigned j; + + ih = node40_ih_at(node, (unsigned) i); + coord_set_item_pos(&coord, i); + if ((ih40_get_offset(ih) >= + znode_size(node) - nr_items * sizeof (item_header40)) || + (ih40_get_offset(ih) < sizeof (node40_header))) { + *error = "Offset is out of bounds"; + return -1; + } + if (ih40_get_offset(ih) <= old_offset) { + *error = "Offsets are in wrong order"; + return -1; + } + if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) { + *error = "Wrong offset of first item"; + return -1; + } + old_offset = ih40_get_offset(ih); + + if (keygt(&prev, &ih->key)) { + *error = "Keys are in wrong order"; + return -1; + } + if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) { + *error = "Wrong key of first unit"; + return -1; + } + prev = ih->key; + for (j = 0; j < coord_num_units(&coord); ++j) { + coord.unit_pos = j; + unit_key_by_coord(&coord, &unit_key); + if 
(keygt(&prev, &unit_key)) { + *error = "Unit keys are in wrong order"; + return -1; + } + prev = unit_key; + } + coord.unit_pos = 0; + if (level != TWIG_LEVEL && + item_is_extent(&coord)) { + *error = "extent on the wrong level"; + return -1; + } + if (level == LEAF_LEVEL && + item_is_internal(&coord)) { + *error = "internal item on the wrong level"; + return -1; + } + if (level != LEAF_LEVEL && + !item_is_internal(&coord) && !item_is_extent(&coord)) { + *error = "wrong item on the internal level"; + return -1; + } + if (level > TWIG_LEVEL && + !item_is_internal(&coord)) { + *error = "non-internal item on the internal level"; + return -1; + } +#if REISER4_DEBUG + if (item_plugin_by_coord(&coord)->b.check && item_plugin_by_coord(&coord)->b.check(&coord, error)) + return -1; +#endif + if (i) { + coord_t prev_coord; + /* two neighboring items can not be mergeable */ + coord_dup(&prev_coord, &coord); + coord_prev_item(&prev_coord); + if (are_items_mergeable(&prev_coord, &coord)) { + *error = "mergeable items in one node"; + return -1; + } + + } + } + + /* the delimiting key checks below run under the dk lock and, further + down, the tree lock as well; every error return has to release + whatever it holds, in reverse order of acquisition */ + RLOCK_DK(current_tree); + if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) { + coord_t coord; + item_plugin *iplug; + + coord_init_last_unit(&coord, node); + iplug = item_plugin_by_coord(&coord); + if ((item_is_extent(&coord) || item_is_tail(&coord)) && + iplug->s.file.append_key != NULL) { + reiser4_key mkey; + + iplug->s.file.append_key(&coord, &mkey); + set_key_offset(&mkey, get_key_offset(&mkey) - 1); + if (keygt(&mkey, znode_get_rd_key((znode *) node))) { + *error = "key of rightmost item is too large"; + RUNLOCK_DK(current_tree); + return -1; + } + } + } + if (flags & REISER4_NODE_DKEYS) { + RLOCK_TREE(current_tree); + + flags |= REISER4_NODE_TREE_STABLE; + + if (keygt(&prev, &node->rd_key)) { + reiser4_stat_inc(tree.rd_key_skew); + if (flags & REISER4_NODE_TREE_STABLE) { + *error = "Last key is greater than rdkey"; + RUNLOCK_TREE(current_tree); + RUNLOCK_DK(current_tree); + return -1; + } + } + if (keygt(&node->ld_key, &node->rd_key)) { + *error = "ldkey is greater than rdkey"; + RUNLOCK_TREE(current_tree); + RUNLOCK_DK(current_tree); + return 
-1; + } + if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && + (node->left != NULL) && + !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) && + ergo(flags & REISER4_NODE_TREE_STABLE, + !keyeq(&node->left->rd_key, &node->ld_key)) && + ergo(!(flags & REISER4_NODE_TREE_STABLE), keygt(&node->left->rd_key, &node->ld_key))) { + *error = "left rdkey or ldkey is wrong"; + RUNLOCK_TREE(current_tree); + RUNLOCK_DK(current_tree); + return -1; + } + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && + (node->right != NULL) && + !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) && + ergo(flags & REISER4_NODE_TREE_STABLE, + !keyeq(&node->rd_key, &node->right->ld_key)) && + ergo(!(flags & REISER4_NODE_TREE_STABLE), keygt(&node->rd_key, &node->right->ld_key))) { + *error = "rdkey or right ldkey is wrong"; + RUNLOCK_TREE(current_tree); + RUNLOCK_DK(current_tree); + return -1; + } + + RUNLOCK_TREE(current_tree); + } + RUNLOCK_DK(current_tree); + + return 0; +} + +/* plugin->u.node.parse + look for description of this method in plugin/node/node.h */ +reiser4_internal int +parse_node40(znode * node /* node to parse */ ) +{ + node40_header *header; + int result; + + header = node40_node_header((znode *) node); + result = -EIO; + if (unlikely(((__u8) znode_get_level(node)) != nh40_get_level(header))) + warning("nikita-494", "Wrong level found in node: %i != %i", + znode_get_level(node), nh40_get_level(header)); + else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC)) + warning("nikita-495", + "Wrong magic in tree node: want %x, got %x", + REISER4_NODE_MAGIC, nh40_get_magic(header)); + else { + node->nr_items = node40_num_of_items_internal(node); + result = 0; + } + if (unlikely(result != 0)) + /* print_znode("node", node)*/; + return RETERR(result); +} + +/* plugin->u.node.init + look for description of this method in plugin/node/node.h */ +reiser4_internal int +init_node40(znode * node /* node to initialise */ ) +{ + node40_header *header; + + assert("nikita-570", node != NULL); + assert("nikita-572", zdata(node) != NULL); + + header = node40_node_header(node); + if (REISER4_ZERO_NEW_NODE) + xmemset(zdata(node), 0, 
(unsigned int) znode_size(node)); + else + xmemset(header, 0, sizeof (node40_header)); + nh40_set_free_space(header, znode_size(node) - sizeof (node40_header)); + nh40_set_free_space_start(header, sizeof (node40_header)); + /* sane hypothesis: 0 in CPU format is 0 in disk format */ + /* items: 0 */ + save_plugin_id(node_plugin_to_plugin(node->nplug), &header->common_header.plugin_id); + nh40_set_level(header, znode_get_level(node)); + nh40_set_magic(header, REISER4_NODE_MAGIC); + node->nr_items = 0; + nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb())); + + /* flags: 0 */ + return 0; +} + +/* returns non-zero when the header of @node carries REISER4_NODE_MAGIC and + the plugin id stored in it resolves to a plugin whose id is NODE40_ID. + NOTE(review): the pointer returned by plugin_by_disk_id() is dereferenced + without a NULL check, while lookup_node40() does check the result of + item_plugin_by_disk_id() for NULL ("Unknown plugin") - confirm that a + garbage plugin id in a block with valid magic cannot oops here */ +reiser4_internal int +guess_node40(const znode * node /* node to guess plugin of */ ) +{ + node40_header *nethack; + + assert("nikita-1058", node != NULL); + nethack = node40_node_header(node); + return + (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) && + (plugin_by_disk_id(znode_get_tree(node), + REISER4_NODE_PLUGIN_TYPE, &nethack->common_header.plugin_id)->h.id == NODE40_ID); +} + +#if REISER4_DEBUG_OUTPUT +/* print block number, free space, level and item count taken from the + node40 header of @node, prefixed with @prefix */ +reiser4_internal void +print_node40(const char *prefix, const znode * node /* node to print */ , + __u32 flags UNUSED_ARG /* print flags */ ) +{ + node40_header *header; + + header = node40_node_header(node); + printk("%s: BLOCKNR %Lu FREE_SPACE %u, LEVEL %u, ITEM_NUMBER %u\n", + prefix, + *znode_get_block(node), nh40_get_free_space(header), nh40_get_level(header), nh40_get_num_items(header)); +} +#endif + +/* plugin->u.node.change_item_size + look for description of this method in plugin/node/node.h */ +reiser4_internal void +change_item_size_node40(coord_t * coord, int by) +{ + node40_header *nh; + item_header40 *ih; + char *item_data; + int item_length; + unsigned i; + + node_check(coord->node, 0); + + /* make sure that @coord is coord of existing item */ + assert("vs-210", coord_is_existing_item(coord)); + + nh = node40_node_header(coord->node); + + item_data = item_by_coord_node40(coord); + item_length = length_by_coord_node40(coord); + + /* move item 
bodies */ + ih = node40_ih_at_coord(coord); + xmemmove(item_data + item_length + by, item_data + item_length, + nh40_get_free_space_start(node40_node_header(coord->node)) - (ih40_get_offset(ih) + item_length)); + + /* update offsets of moved items */ + for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) { + ih = node40_ih_at(coord->node, i); + ih40_set_offset(ih, ih40_get_offset(ih) + by); + } + + /* update node header */ + nh40_set_free_space(nh, nh40_get_free_space(nh) - by); + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by); +} + +static int +should_notify_parent(const znode * node) +{ + /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */ + return !disk_addr_eq(znode_get_block(node), &znode_get_tree(node)->root_block); +} + +/* plugin->u.node.create_item + look for description of this method in plugin/node/node.h */ +reiser4_internal int +create_item_node40(coord_t * target, const reiser4_key * key, reiser4_item_data * data, carry_plugin_info * info) +{ + node40_header *nh; + item_header40 *ih; + unsigned offset; + unsigned i; + + node_check(target->node, 0); + + nh = node40_node_header(target->node); + + assert("vs-212", coord_is_between_items(target)); + /* node must have enough free space */ + assert("vs-254", free_space_node40(target->node) >= data->length + sizeof(item_header40)); + assert("vs-1410", data->length >= 0); + + if (coord_set_to_right(target)) + /* there are not items to the right of @target, so, new item + will be inserted after last one */ + coord_set_item_pos(target, nh40_get_num_items(nh)); + + if (target->item_pos < nh40_get_num_items(nh)) { + /* there are items to be moved to prepare space for new + item */ + ih = node40_ih_at_coord(target); + /* new item will start at this offset */ + offset = ih40_get_offset(ih); + + xmemmove(zdata(target->node) + offset + data->length, + zdata(target->node) + offset, nh40_get_free_space_start(nh) - offset); + /* update headers of moved items */ + for (i 
= target->item_pos; i < nh40_get_num_items(nh); i++) { + ih = node40_ih_at(target->node, i); + ih40_set_offset(ih, ih40_get_offset(ih) + data->length); + } + + /* @ih is set to item header of the last item, move item headers */ + xmemmove(ih - 1, ih, sizeof (item_header40) * (nh40_get_num_items(nh) - target->item_pos)); + } else { + /* new item will start at this offset */ + offset = nh40_get_free_space_start(nh); + } + + /* make item header for the new item */ + ih = node40_ih_at_coord(target); + xmemcpy(&ih->key, key, sizeof (reiser4_key)); + ih40_set_offset(ih, offset); + save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id); + + /* update node header */ + nh40_set_free_space(nh, nh40_get_free_space(nh) - data->length - sizeof (item_header40)); + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + data->length); + node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1); + + /* FIXME: check how does create_item work when between is set to BEFORE_UNIT */ + target->unit_pos = 0; + target->between = AT_UNIT; + coord_clear_iplug(target); + + /* initialise item */ + if (data->iplug->b.init != NULL) { + data->iplug->b.init(target, NULL, data); + } + /* copy item body */ + if (data->iplug->b.paste != NULL) { + data->iplug->b.paste(target, data, info); + } else if (data->data != NULL) { + if (data->user) { + /* AUDIT: Are we really should not check that pointer + from userspace was valid and data bytes were + available? How will we return -EFAULT of some kind + without this check? 
*/ + assert("nikita-3038", schedulable()); + /* copy data from user space */ + __copy_from_user(zdata(target->node) + offset, data->data, (unsigned) data->length); + } else + /* copy from kernel space */ + xmemcpy(zdata(target->node) + offset, data->data, (unsigned) data->length); + } + + if (target->item_pos == 0) { + /* left delimiting key has to be updated */ + prepare_for_update(NULL, target->node, info); + } + + if (item_plugin_by_coord(target)->b.create_hook != NULL) { + item_plugin_by_coord(target)->b.create_hook(target, data->arg); + } + + node_check(target->node, 0); + return 0; +} + +/* plugin->u.node.update_item_key + look for description of this method in plugin/node/node.h */ +reiser4_internal void +update_item_key_node40(coord_t * target, const reiser4_key * key, carry_plugin_info * info) +{ + item_header40 *ih; + + ih = node40_ih_at_coord(target); + xmemcpy(&ih->key, key, sizeof (reiser4_key)); + + if (target->item_pos == 0) { + prepare_for_update(NULL, target->node, info); + } +} + +/* this bits encode cut mode */ +#define CMODE_TAIL 1 +#define CMODE_WHOLE 2 +#define CMODE_HEAD 4 + +struct cut40_info { + int mode; + pos_in_node_t tail_removed; /* position of item which gets tail removed */ + pos_in_node_t first_removed; /* position of first the leftmost item among items removed completely */ + pos_in_node_t removed_count; /* number of items removed completely */ + pos_in_node_t head_removed; /* position of item which gets head removed */ + + pos_in_node_t freed_space_start; + pos_in_node_t freed_space_end; + pos_in_node_t first_moved; + pos_in_node_t head_removed_location; +}; + +static void +init_cinfo(struct cut40_info *cinfo) +{ + cinfo->mode = 0; + cinfo->tail_removed = MAX_POS_IN_NODE; + cinfo->first_removed = MAX_POS_IN_NODE; + cinfo->removed_count = MAX_POS_IN_NODE; + cinfo->head_removed = MAX_POS_IN_NODE; + cinfo->freed_space_start = MAX_POS_IN_NODE; + cinfo->freed_space_end = MAX_POS_IN_NODE; + cinfo->first_moved = MAX_POS_IN_NODE; + 
cinfo->head_removed_location = MAX_POS_IN_NODE; +} + +/* complete cut_node40/kill_node40 content by removing the gap created by the + removal: item bodies located after the freed range [freed_space_start, + freed_space_end) are shifted down over it, offsets in the item headers + starting from @cinfo->first_moved are adjusted, headers of completely + removed items are dropped and the free space accounting in the node + header is updated */ +static void +compact(znode *node, struct cut40_info *cinfo) +{ + node40_header *nh; + item_header40 *ih; + pos_in_node_t freed; + pos_in_node_t pos, nr_items; + + assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE && + cinfo->freed_space_end != MAX_POS_IN_NODE && + cinfo->first_moved != MAX_POS_IN_NODE)); + assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start); + + nh = node40_node_header(node); + nr_items = nh40_get_num_items(nh); + + /* remove gap made up by removal */ + xmemmove(zdata(node) + cinfo->freed_space_start, zdata(node) + cinfo->freed_space_end, + nh40_get_free_space_start(nh) - cinfo->freed_space_end); + + /* update item headers of moved items - change their locations */ + pos = cinfo->first_moved; + ih = node40_ih_at(node, pos); + if (cinfo->head_removed_location != MAX_POS_IN_NODE) { + /* the item which got its head cut is given the explicit new + offset instead of being shifted by @freed */ + assert("vs-1580", pos == cinfo->head_removed); + ih40_set_offset(ih, cinfo->head_removed_location); + pos ++; + ih --; + } + + freed = cinfo->freed_space_end - cinfo->freed_space_start; + for (; pos < nr_items; pos ++, ih --) { + assert("vs-1581", ih == node40_ih_at(node, pos)); + ih40_set_offset(ih, (ih40_get_offset(ih) - freed)); + } + + /* free space start moved to the left (it decreases by @freed) */ + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed); + + if (cinfo->removed_count != MAX_POS_IN_NODE) { + /* number of items changed. 
Remove item headers of those items */ + ih = node40_ih_at(node, nr_items - 1); + xmemmove(ih + cinfo->removed_count, ih, + sizeof (item_header40) * (nr_items - cinfo->removed_count - cinfo->first_removed)); + freed += sizeof (item_header40) * cinfo->removed_count; + node40_set_num_items(node, nh, nr_items - cinfo->removed_count); + } + + /* total amount of free space increased */ + nh40_set_free_space(nh, nh40_get_free_space(nh) + freed); +} + +/* this is used by cut_node40 and kill_node40. It analyses input parameters and calculates cut mode. There are 2 types + of cut. First is when a unit is removed from the middle of an item. In this case this function returns 1. All the + rest fits into second case: 0 or 1 of items getting tail cut, 0 or more items removed completely and 0 or 1 item + getting head cut. Function returns 0 in this case */ +static int +parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params) +{ + reiser4_key left_key, right_key; + reiser4_key min_from_key, max_to_key; + const reiser4_key *from_key, *to_key; + + init_cinfo(cinfo); + + /* calculate minimal key stored in first item of items to be cut (params->from) */ + item_key_by_coord(params->from, &min_from_key); + /* and max key stored in last item of items to be cut (params->to) */ + max_item_key_by_coord(params->to, &max_to_key); + + /* if cut key range is not defined in input parameters - define it using cut coord range */ + if (params->from_key == NULL) { + assert("vs-1513", params->to_key == NULL); + unit_key_by_coord(params->from, &left_key); + from_key = &left_key; + max_unit_key_by_coord(params->to, &right_key); + to_key = &right_key; + } else { + from_key = params->from_key; + to_key = params->to_key; + } + + if (params->from->item_pos == params->to->item_pos) { + if (keylt(&min_from_key, from_key) && keylt(to_key, &max_to_key)) + return 1; + + if (keygt(from_key, &min_from_key)) { + /* tail of item is to be cut cut */ + cinfo->tail_removed = params->from->item_pos; + 
cinfo->mode |= CMODE_TAIL; + } else if (keylt(to_key, &max_to_key)) { + /* head of item is to be cut */ + cinfo->head_removed = params->from->item_pos; + cinfo->mode |= CMODE_HEAD; + } else { + /* item is removed completely */ + cinfo->first_removed = params->from->item_pos; + cinfo->removed_count = 1; + cinfo->mode |= CMODE_WHOLE; + } + } else { + cinfo->first_removed = params->from->item_pos + 1; + cinfo->removed_count = params->to->item_pos - params->from->item_pos - 1; + + if (keygt(from_key, &min_from_key)) { + /* first item is not cut completely */ + cinfo->tail_removed = params->from->item_pos; + cinfo->mode |= CMODE_TAIL; + } else { + cinfo->first_removed --; + cinfo->removed_count ++; + } + if (keylt(to_key, &max_to_key)) { + /* last item is not cut completely */ + cinfo->head_removed = params->to->item_pos; + cinfo->mode |= CMODE_HEAD; + } else { + cinfo->removed_count ++; + } + if (cinfo->removed_count) + cinfo->mode |= CMODE_WHOLE; + } + + return 0; +} + +static void +call_kill_hooks(znode *node, pos_in_node_t from, pos_in_node_t count, carry_kill_data *kdata) +{ + coord_t coord; + item_plugin *iplug; + pos_in_node_t pos; + + coord.node = node; + coord.unit_pos = 0; + coord.between = AT_UNIT; + for (pos = 0; pos < count; pos ++) { + coord_set_item_pos(&coord, from + pos); + coord.unit_pos = 0; + coord.between = AT_UNIT; + iplug = item_plugin_by_coord(&coord); + if (iplug->b.kill_hook) { + iplug->b.kill_hook(&coord, 0, coord_num_units(&coord), kdata); + } + } +} + +/* this is used to kill item partially */ +static pos_in_node_t +kill_units(coord_t *coord, pos_in_node_t from, pos_in_node_t to, void *data, reiser4_key *smallest_removed, + reiser4_key *new_first_key) +{ + struct carry_kill_data *kdata; + item_plugin *iplug; + + kdata = data; + iplug = item_plugin_by_coord(coord); + + assert("vs-1524", iplug->b.kill_units); + return iplug->b.kill_units(coord, from, to, kdata, smallest_removed, new_first_key); +} + +/* call item plugin to cut tail of file */ 
+static pos_in_node_t +kill_tail(coord_t *coord, void *data, reiser4_key *smallest_removed) +{ + struct carry_kill_data *kdata; + pos_in_node_t to; + + kdata = data; + to = coord_last_unit_pos(coord); /* kill from coord->unit_pos through the last unit of the item */ + return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed, 0); +} + +/* call item plugin to kill head of item: units 0 through coord->unit_pos */ +static pos_in_node_t +kill_head(coord_t *coord, void *data, reiser4_key *smallest_removed, reiser4_key *new_first_key) +{ + return kill_units(coord, 0, coord->unit_pos, data, smallest_removed, new_first_key); +} + +/* this is used to cut item partially: units @from..@to of the item at @coord are passed to the cut_units method of its item plugin (asserted to exist) and the result of that method is returned. @data is used as carry_cut_data */ +static pos_in_node_t +cut_units(coord_t *coord, pos_in_node_t from, pos_in_node_t to, void *data, + reiser4_key *smallest_removed, reiser4_key *new_first_key) +{ + carry_cut_data *cdata; + item_plugin *iplug; + + cdata = data; + iplug = item_plugin_by_coord(coord); + assert("vs-302", iplug->b.cut_units); + return iplug->b.cut_units(coord, from, to, cdata, smallest_removed, new_first_key); +} + +/* call item plugin to cut tail of item: units coord->unit_pos through the last unit */ +static pos_in_node_t +cut_tail(coord_t *coord, void *data, reiser4_key *smallest_removed) +{ + carry_cut_data *cdata; + pos_in_node_t to; + + cdata = data; + to = coord_last_unit_pos(cdata->params.from); /* NOTE(review): last unit is taken from cdata->params.from whereas kill_tail uses @coord; prepare_for_compact passes params->from as @coord - confirm both are always the same coord */ + return cut_units(coord, coord->unit_pos, to, data, smallest_removed, 0); +} + +/* call item plugin to cut head of item: units 0 through coord->unit_pos */ +static pos_in_node_t +cut_head(coord_t *coord, void *data, reiser4_key *smallest_removed, reiser4_key *new_first_key) +{ + return cut_units(coord, 0, coord->unit_pos, data, smallest_removed, new_first_key); +} + +/* this returns 1 if key of first item changed, 0 - if it did not. @cinfo is filled via parse_cut and by the code below; when @is_cut is non-zero the cut_* helpers are used, otherwise the kill_* helpers are used and kill hooks are called for items removed completely. @info is not referenced in the body */ +static int +prepare_for_compact(struct cut40_info *cinfo, const struct cut_kill_params *params, int is_cut, + void *data, carry_plugin_info *info) +{ + znode *node; + item_header40 *ih; + pos_in_node_t freed; + pos_in_node_t item_pos; + coord_t coord; + reiser4_key new_first_key; + pos_in_node_t (*kill_units_f)(coord_t *, pos_in_node_t, pos_in_node_t, void 
*, reiser4_key *, reiser4_key *); + pos_in_node_t (*kill_tail_f)(coord_t *, void *, reiser4_key *); + pos_in_node_t (*kill_head_f)(coord_t *, void *, reiser4_key *, reiser4_key *); + int retval; + + retval = 0; /* set to 1 below when the key of the first item of the node changes */ + + node = params->from->node; + + assert("vs-184", node == params->to->node); + assert("vs-312", !node_is_empty(node)); + assert("vs-297", coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT); + + if (is_cut) { /* cut_* and kill_* helpers have matching signatures, so the set is chosen once here */ + kill_units_f = cut_units; + kill_tail_f = cut_tail; + kill_head_f = cut_head; + } else { + kill_units_f = kill_units; + kill_tail_f = kill_tail; + kill_head_f = kill_head; + } + + if (parse_cut(cinfo, params) == 1) { + /* cut from the middle of item */ + freed = kill_units_f(params->from, params->from->unit_pos, params->to->unit_pos, data, params->smallest_removed, NULL); + + item_pos = params->from->item_pos; + ih = node40_ih_at(node, item_pos); + cinfo->freed_space_start = ih40_get_offset(ih) + node40_item_length(node, item_pos) - freed; /* freed bytes are accounted at the end of the item body */ + cinfo->freed_space_end = cinfo->freed_space_start + freed; + cinfo->first_moved = item_pos + 1; + } else { + assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE || + cinfo->first_removed != MAX_POS_IN_NODE || + cinfo->head_removed != MAX_POS_IN_NODE)); + + switch (cinfo->mode) { + case CMODE_TAIL: + /* one item gets cut partially from its end */ + assert("vs-1562", cinfo->tail_removed == params->from->item_pos); + + freed = kill_tail_f(params->from, data, params->smallest_removed); + + item_pos = cinfo->tail_removed; + ih = node40_ih_at(node, item_pos); + cinfo->freed_space_start = ih40_get_offset(ih) + node40_item_length(node, item_pos) - freed; + cinfo->freed_space_end = cinfo->freed_space_start + freed; + cinfo->first_moved = cinfo->tail_removed + 1; + break; + + case CMODE_WHOLE: + /* one or more items get removed completely */ + assert("vs-1563", cinfo->first_removed == params->from->item_pos); + assert("vs-1564", cinfo->removed_count > 0 && cinfo->removed_count != MAX_POS_IN_NODE); + + /* 
call kill hook for all items removed completely */ + if (is_cut == 0) + call_kill_hooks(node, cinfo->first_removed, cinfo->removed_count, data); + + item_pos = cinfo->first_removed; + ih = node40_ih_at(node, item_pos); + + if (params->smallest_removed) + xmemcpy(params->smallest_removed, &ih->key, sizeof (reiser4_key)); /* key of the first removed item */ + + cinfo->freed_space_start = ih40_get_offset(ih); + + item_pos += (cinfo->removed_count - 1); + ih -= (cinfo->removed_count - 1); /* keep ih in step with item_pos: advancing the item position goes with decrementing the header pointer */ + cinfo->freed_space_end = ih40_get_offset(ih) + node40_item_length(node, item_pos); + cinfo->first_moved = item_pos + 1; + if (cinfo->first_removed == 0) + /* key of first item of the node changes */ + retval = 1; + break; + + case CMODE_HEAD: + /* one item gets cut partially from its head */ + assert("vs-1565", cinfo->head_removed == params->from->item_pos); + + freed = kill_head_f(params->to, data, params->smallest_removed, &new_first_key); + + item_pos = cinfo->head_removed; + ih = node40_ih_at(node, item_pos); + cinfo->freed_space_start = ih40_get_offset(ih); + cinfo->freed_space_end = ih40_get_offset(ih) + freed; + cinfo->first_moved = cinfo->head_removed + 1; + + /* item head is removed, therefore, item key changed */ + coord.node = node; + coord_set_item_pos(&coord, item_pos); + coord.unit_pos = 0; + coord.between = AT_UNIT; + update_item_key_node40(&coord, &new_first_key, 0); + if (item_pos == 0) + /* key of first item of the node changes */ + retval = 1; + break; + + case CMODE_TAIL | CMODE_WHOLE: + /* one item gets cut from its end and one or more items get removed completely */ + assert("vs-1566", cinfo->tail_removed == params->from->item_pos); + assert("vs-1567", cinfo->first_removed == cinfo->tail_removed + 1); + assert("vs-1564", cinfo->removed_count > 0 && cinfo->removed_count != MAX_POS_IN_NODE); + + freed = kill_tail_f(params->from, data, params->smallest_removed); + + item_pos = cinfo->tail_removed; + ih = node40_ih_at(node, item_pos); + cinfo->freed_space_start = ih40_get_offset(ih) + 
node40_item_length(node, item_pos) - freed; + + /* call kill hook for all items removed completely */ + if (is_cut == 0) + call_kill_hooks(node, cinfo->first_removed, cinfo->removed_count, data); + + item_pos += cinfo->removed_count; + ih -= cinfo->removed_count; /* item_pos and ih now both refer to the last item removed completely */ + cinfo->freed_space_end = ih40_get_offset(ih) + node40_item_length(node, item_pos); + cinfo->first_moved = item_pos + 1; + break; + + case CMODE_WHOLE | CMODE_HEAD: + /* one or more items get removed completely and one item gets cut partially from its head */ + assert("vs-1568", cinfo->first_removed == params->from->item_pos); + assert("vs-1564", cinfo->removed_count > 0 && cinfo->removed_count != MAX_POS_IN_NODE); + assert("vs-1569", cinfo->head_removed == cinfo->first_removed + cinfo->removed_count); + + /* call kill hook for all items removed completely */ + if (is_cut == 0) + call_kill_hooks(node, cinfo->first_removed, cinfo->removed_count, data); + + item_pos = cinfo->first_removed; + ih = node40_ih_at(node, item_pos); + + if (params->smallest_removed) + xmemcpy(params->smallest_removed, &ih->key, sizeof (reiser4_key)); + + freed = kill_head_f(params->to, data, 0, &new_first_key); /* 0: smallest removed key was already copied from the first removed item above */ + + cinfo->freed_space_start = ih40_get_offset(ih); + + ih = node40_ih_at(node, cinfo->head_removed); + /* this is the most complex case. Item which got head removed and items which are to be moved + intact change their location differently. 
*/ + cinfo->freed_space_end = ih40_get_offset(ih) + freed; + cinfo->first_moved = cinfo->head_removed; + cinfo->head_removed_location = cinfo->freed_space_start; + + /* item head is removed, therefore, item key changed */ + coord.node = node; + coord_set_item_pos(&coord, cinfo->head_removed); + coord.unit_pos = 0; + coord.between = AT_UNIT; + update_item_key_node40(&coord, &new_first_key, 0); + + assert("vs-1579", cinfo->first_removed == 0); + /* key of first item of the node changes */ + retval = 1; + break; + + case CMODE_TAIL | CMODE_HEAD: + /* one item gets cut from its end and its neighbor gets cut from its head */ + impossible("vs-1576", "this can not happen currently"); + break; + + case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD: + impossible("vs-1577", "this can not happen currently"); + break; + default: + impossible("vs-1578", "unexpected cut mode"); + break; + } + } + return retval; +} + + +/* plugin->u.node.kill + return value is number of items removed completely (0 when cinfo.removed_count is still MAX_POS_IN_NODE). + NOTE(review): results of prepare_removal_node40 and prepare_for_update are not checked here */ +int +kill_node40(struct carry_kill_data *kdata, carry_plugin_info *info) +{ + znode *node; + struct cut40_info cinfo; + int first_key_changed; + + node = kdata->params.from->node; + node_check(node, 0); + + first_key_changed = prepare_for_compact(&cinfo, &kdata->params, 0/* not cut */, kdata, info); + compact(node, &cinfo); + + if (info) { + /* it is not called by node40_shift, so we have to take care + of changes on upper levels */ + if (node_is_empty(node) && !(kdata->flags & DELETE_RETAIN_EMPTY)) + /* all contents of node is deleted */ + prepare_removal_node40(node, info); + else if (first_key_changed) { + prepare_for_update(NULL, node, info); + } + } + + coord_clear_iplug(kdata->params.from); + coord_clear_iplug(kdata->params.to); + + node_check(node, 0); + znode_make_dirty(node); + return cinfo.removed_count == MAX_POS_IN_NODE ? 
0 : cinfo.removed_count ; +} + +/* plugin->u.node.cut + return value is number of items removed completely (0 when cinfo.removed_count is still MAX_POS_IN_NODE). + Unlike kill_node40 this does not look at any flags: an emptied node is always scheduled for removal when @info is given. + NOTE(review): results of prepare_removal_node40 and prepare_for_update are not checked here */ +int +cut_node40(struct carry_cut_data *cdata, carry_plugin_info *info) +{ + znode *node; + struct cut40_info cinfo; + int first_key_changed; + + node = cdata->params.from->node; + node_check(node, 0); + + first_key_changed = prepare_for_compact(&cinfo, &cdata->params, 1/* cut */, cdata, info); + compact(node, &cinfo); + + if (info) { + /* it is not called by node40_shift, so we have to take care + of changes on upper levels */ + if (node_is_empty(node)) + /* all contents of node is deleted */ + prepare_removal_node40(node, info); + else if (first_key_changed) { + prepare_for_update(NULL, node, info); + } + } + + coord_clear_iplug(cdata->params.from); + coord_clear_iplug(cdata->params.to); + + node_check(node, 0); + znode_make_dirty(node); + return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count ; +} + + +/* this structure is used by shift method of node40 plugin */ +struct shift_params { + shift_direction pend; /* when @pend == append - we are shifting to + left, when @pend == prepend - to right */ + coord_t wish_stop; /* when shifting to left this is last unit we + want shifted, when shifting to right - this + is set to unit we want to start shifting + from */ + znode *target; + int everything; /* it is set to 1 if everything we have to shift is + shifted, 0 - otherwise */ + + /* FIXME-VS: get rid of read_stop */ + + /* these are set by estimate_shift */ + coord_t real_stop; /* this will be set to last unit which will be + really shifted */ + + /* coordinate in source node before operation of unit which becomes + first after shift to left or last after shift to right */ + union { + coord_t future_first; + coord_t future_last; + } u; + + unsigned merging_units; /* number of units of first item which have to + be merged with last item of target node */ + unsigned merging_bytes; /* number of bytes in those units */ + + unsigned entire; 
/* items shifted in their entirety */ + unsigned entire_bytes; /* number of bytes in those items */ + + unsigned part_units; /* number of units of partially copied item */ + unsigned part_bytes; /* number of bytes in those units */ + + unsigned shift_bytes; /* total number of bytes in items shifted (item + headers not included) */ + +}; +/* returns what the item_overhead method of the node plugin of @item reports for item->node */ +static int +item_creation_overhead(coord_t * item) +{ + return node_plugin_by_coord(item)->item_overhead(item->node, 0); +} + +/* how many units are there in @source starting from source->unit_pos + but not further than @stop_coord. @source is asserted to be at unit 0 when shifting left and at its last unit when shifting right */ +static int +wanted_units(coord_t * source, coord_t * stop_coord, shift_direction pend) +{ + if (pend == SHIFT_LEFT) { + assert("vs-181", source->unit_pos == 0); + } else { + assert("vs-182", source->unit_pos == coord_last_unit_pos(source)); + } + + if (source->item_pos != stop_coord->item_pos) { + /* @source and @stop_coord are different items */ + return coord_last_unit_pos(source) + 1; + } + + if (pend == SHIFT_LEFT) { + return stop_coord->unit_pos + 1; + } else { + return source->unit_pos - stop_coord->unit_pos + 1; + } +} + +/* this calculates what can be copied from @shift->wish_stop.node to + @shift->target. Results are stored in @shift: real_stop, merging_units, merging_bytes, entire, entire_bytes, part_units, part_bytes, shift_bytes and everything. + The byte and item counters are only added to here, so presumably the caller zeroes @shift first - TODO confirm. @ctx is not referenced in the body */ +static void +estimate_shift(struct shift_params *shift, const reiser4_context *ctx) +{ + unsigned target_free_space, size; + pos_in_node_t stop_item; /* item which estimating should not consider */ + unsigned want; /* number of units of item we want shifted */ + coord_t source; /* item being estimated */ + item_plugin *iplug; + + /* shifting to left/right starts from first/last units of + @shift->wish_stop.node */ + if (shift->pend == SHIFT_LEFT) { + coord_init_first_unit(&source, shift->wish_stop.node); + } else { + coord_init_last_unit(&source, shift->wish_stop.node); + } + shift->real_stop = source; + + /* free space in target node and number of items in source */ + target_free_space = znode_free_space(shift->target); + + shift->everything = 0; + if (!node_is_empty(shift->target)) { 
+ /* target node is not empty, check for boundary items + mergeability */ + coord_t to; + + /* item we try to merge @source with */ + if (shift->pend == SHIFT_LEFT) { + coord_init_last_unit(&to, shift->target); + } else { + coord_init_first_unit(&to, shift->target); + } + + if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to, &source) : are_items_mergeable(&source, &to)) { + /* how many units of @source do we want to merge to + item @to */ + want = wanted_units(&source, &shift->wish_stop, shift->pend); + + /* how many units of @source we can merge to item + @to */ + iplug = item_plugin_by_coord(&source); + if (iplug->b.can_shift != NULL) + shift->merging_units = + iplug->b.can_shift(target_free_space, + &source, shift->target, shift->pend, &size, want); + else { + shift->merging_units = 0; + size = 0; + } + shift->merging_bytes = size; + shift->shift_bytes += size; + /* update stop coord to be set to last unit of @source + we can merge to @target */ + if (shift->merging_units) + /* at least one unit can be shifted */ + shift->real_stop.unit_pos = (shift->merging_units - source.unit_pos - 1) * shift->pend; /* shifting left (pend 1, source at unit 0): merging_units - 1; shifting right (pend -1, source at its last unit): last - merging_units + 1 */ + else { + /* nothing can be shifted */ + if (shift->pend == SHIFT_LEFT) + coord_init_before_first_item(&shift->real_stop, source.node); + else + coord_init_after_last_item(&shift->real_stop, source.node); + } + assert("nikita-2081", shift->real_stop.unit_pos + 1); + + if (shift->merging_units != want) { + /* we could not copy as many as we want, so, + there is no reason for estimating any + longer */ + return; + } + + target_free_space -= size; + coord_add_item_pos(&source, shift->pend); /* merge item is accounted for, go to the next item in shift direction */ + } + } + + /* number of item nothing of which we want to shift */ + stop_item = shift->wish_stop.item_pos + shift->pend; + + /* calculate how many items can be copied into given free + space as whole */ + for (; source.item_pos != stop_item; coord_add_item_pos(&source, shift->pend)) { + if (shift->pend == SHIFT_RIGHT) + source.unit_pos = coord_last_unit_pos(&source); + + 
/* how many units of @source do we want to copy */ + want = wanted_units(&source, &shift->wish_stop, shift->pend); + + if (want == coord_last_unit_pos(&source) + 1) { + /* we want this item to be copied entirely */ + size = item_length_by_coord(&source) + item_creation_overhead(&source); /* space needed in target includes the overhead; shift_bytes and entire_bytes below exclude it */ + if (size <= target_free_space) { + /* item fits into target node as whole */ + target_free_space -= size; + shift->shift_bytes += size - item_creation_overhead(&source); + shift->entire_bytes += size - item_creation_overhead(&source); + shift->entire++; + + /* update shift->real_stop coord to be set to + last unit of @source we can merge to + @target */ + shift->real_stop = source; + if (shift->pend == SHIFT_LEFT) + shift->real_stop.unit_pos = coord_last_unit_pos(&shift->real_stop); + else + shift->real_stop.unit_pos = 0; + continue; + } + } + + /* we reach here only for an item which does not fit into + target node in its entirety. This item may be either + partially shifted, or not shifted at all. We will have to + create new item in target node, so decrease amout of free + space by an item creation overhead. 
We can reach here also + if stop coord is in this item */ + if (target_free_space >= (unsigned) item_creation_overhead(&source)) { + target_free_space -= item_creation_overhead(&source); + iplug = item_plugin_by_coord(&source); + if (iplug->b.can_shift) { + shift->part_units = iplug->b.can_shift(target_free_space, &source, 0 /*target */ + , shift->pend, &size, want); + } else { + target_free_space = 0; + shift->part_units = 0; + size = 0; + } + } else { + target_free_space = 0; + shift->part_units = 0; + size = 0; + } + shift->part_bytes = size; + shift->shift_bytes += size; + + /* set @shift->real_stop to last unit of @source we can merge + to @shift->target */ + if (shift->part_units) { + shift->real_stop = source; + shift->real_stop.unit_pos = (shift->part_units - source.unit_pos - 1) * shift->pend; + assert("nikita-2082", shift->real_stop.unit_pos + 1); + } + + if (want != shift->part_units) + /* not everything wanted were shifted */ + return; + break; /* at most one item is shifted partially, so estimation ends here */ + } + + shift->everything = 1; +} +/* copy @count units of item @source, starting from unit @from, into item @target by calling the copy_units method of the item plugin of @source (asserted to be the plugin of @target too). When @dir is SHIFT_RIGHT the key of item @target is then set to the key of the unit at @target, whose unit_pos is asserted to be 0 */ +static void +copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count, shift_direction dir, unsigned free_space) +{ + item_plugin *iplug; + + assert("nikita-1463", target != NULL); + assert("nikita-1464", source != NULL); + assert("nikita-1465", from + count <= coord_num_units(source)); + + IF_TRACE(TRACE_COORDS, print_coord("copy_units source:", source, 0)); + + iplug = item_plugin_by_coord(source); + assert("nikita-1468", iplug == item_plugin_by_coord(target)); + iplug->b.copy_units(target, source, from, count, dir, free_space); + + if (dir == SHIFT_RIGHT) { + /* FIXME-VS: this looks not necessary. 
update_item_key was + called already by copy_units method */ + reiser4_key split_key; + + assert("nikita-1469", target->unit_pos == 0); + + unit_key_by_coord(target, &split_key); + node_plugin_by_coord(target)->update_item_key(target, &split_key, 0); + } +} + +/* copy part of @shift->real_stop.node starting either from its beginning or + from its end and ending at @shift->real_stop to either the end or the + beginning of @shift->target. Item headers, item bodies and the node header of @shift->target (free space, free space start, number of items) are updated here; the source node itself is not modified by this function */ +static void +copy(struct shift_params *shift) +{ + node40_header *nh; + coord_t from; + coord_t to; + item_header40 *from_ih, *to_ih; + int free_space_start; + int new_items; + unsigned old_items; + int old_offset; + unsigned i; + + nh = node40_node_header(shift->target); + free_space_start = nh40_get_free_space_start(nh); + old_items = nh40_get_num_items(nh); + new_items = shift->entire + (shift->part_units ? 1 : 0); /* merged units are not counted here: only entire items and a partially copied item add to the item count */ + assert("vs-185", shift->shift_bytes == shift->merging_bytes + shift->entire_bytes + shift->part_bytes); + + from = shift->wish_stop; + + IF_TRACE(TRACE_COORDS, print_coord("node40_copy from:", &from, 0)); + + coord_init_first_unit(&to, shift->target); + + /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty, + hence to.between is set to EMPTY_NODE above. Looks like we want it + to be AT_UNIT. + + Oh, wonders of ->betweeness... 
+ + */ + to.between = AT_UNIT; + + if (shift->pend == SHIFT_LEFT) { + /* copying to left */ + + coord_set_item_pos(&from, 0); + from_ih = node40_ih_at(from.node, 0); + + coord_set_item_pos(&to, node40_num_of_items_internal(to.node) - 1); + if (shift->merging_units) { + /* expand last item, so that plugin methods will see + correct data */ + free_space_start += shift->merging_bytes; + nh40_set_free_space_start(nh, (unsigned) free_space_start); + nh40_set_free_space(nh, nh40_get_free_space(nh) - shift->merging_bytes); + + IF_TRACE(TRACE_COORDS, print_coord("before copy_units from:", &from, 0)); + IF_TRACE(TRACE_COORDS, print_coord("before copy_units to:", &to, 0)); + + /* appending last item of @target */ + copy_units(&to, &from, 0, /* starting from 0-th unit */ + shift->merging_units, SHIFT_LEFT, shift->merging_bytes); + coord_inc_item_pos(&from); + from_ih--; /* keep from_ih in step with @from: the next item position goes with a decremented header pointer */ + coord_inc_item_pos(&to); + } + + to_ih = node40_ih_at(shift->target, old_items); + if (shift->entire) { + /* copy @entire items entirely */ + + /* copy item headers */ + xmemcpy(to_ih - shift->entire + 1, + from_ih - shift->entire + 1, shift->entire * sizeof (item_header40)); + /* update item header offset */ + old_offset = ih40_get_offset(from_ih); + /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */ + for (i = 0; i < shift->entire; i++, to_ih--, from_ih--) + ih40_set_offset(to_ih, ih40_get_offset(from_ih) - old_offset + free_space_start); /* rebase each offset from the source node to the free space start of target */ + + /* copy item bodies */ + xmemcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */ + shift->entire_bytes); + + coord_add_item_pos(&from, (int) shift->entire); + coord_add_item_pos(&to, (int) shift->entire); + } + + nh40_set_free_space_start(nh, free_space_start + shift->shift_bytes - shift->merging_bytes); + nh40_set_free_space(nh, + nh40_get_free_space(nh) - + (shift->shift_bytes - shift->merging_bytes + 
sizeof (item_header40) * new_items)); + + /* update node header */ + node40_set_num_items(shift->target, nh, old_items + new_items); + assert("vs-170", nh40_get_free_space(nh) < znode_size(shift->target)); + + if (shift->part_units) { + /* copy heading part (@part units) of @source item as + a new item into @target->node */ + + /* copy item header of partially copied item */ + coord_set_item_pos(&to, node40_num_of_items_internal(to.node) + - 1); + xmemcpy(to_ih, from_ih, sizeof (item_header40)); + ih40_set_offset(to_ih, nh40_get_free_space_start(nh) - shift->part_bytes); /* free space start was already advanced above, so the new item body is the last part_bytes before it */ + if (item_plugin_by_coord(&to)->b.init) + item_plugin_by_coord(&to)->b.init(&to, &from, 0); + copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT, shift->part_bytes); + } + + } else { + /* copying to right */ + + coord_set_item_pos(&from, node40_num_of_items_internal(from.node) - 1); + from_ih = node40_ih_at_coord(&from); + + coord_set_item_pos(&to, 0); + + /* prepare space for new items */ + xmemmove(zdata(to.node) + sizeof (node40_header) + + shift->shift_bytes, + zdata(to.node) + sizeof (node40_header), free_space_start - sizeof (node40_header)); /* everything between the node header and free space start moves up by shift_bytes */ + /* update item headers of moved items */ + to_ih = node40_ih_at(to.node, 0); + /* first item gets @merging_bytes longer. 
free space appears + at its beginning */ + if (!node_is_empty(to.node)) + ih40_set_offset(to_ih, ih40_get_offset(to_ih) + shift->shift_bytes - shift->merging_bytes); + + for (i = 1; i < old_items; i++) + ih40_set_offset(to_ih - i, ih40_get_offset(to_ih - i) + shift->shift_bytes); /* to_ih - i is used as the header of item i */ + + /* move item headers to make space for new items */ + xmemmove(to_ih - old_items + 1 - new_items, to_ih - old_items + 1, sizeof (item_header40) * old_items); + to_ih -= (new_items - 1); /* to_ih now points at the header slot of item new_items - 1, the last of the new items */ + + nh40_set_free_space_start(nh, free_space_start + shift->shift_bytes); + nh40_set_free_space(nh, + nh40_get_free_space(nh) - + (shift->shift_bytes + sizeof (item_header40) * new_items)); + + /* update node header */ + node40_set_num_items(shift->target, nh, old_items + new_items); + assert("vs-170", nh40_get_free_space(nh) < znode_size(shift->target)); + + if (shift->merging_units) { + coord_add_item_pos(&to, new_items); /* the item which was first in target before the shift */ + to.unit_pos = 0; + to.between = AT_UNIT; + /* prepend first item of @to */ + copy_units(&to, &from, + coord_last_unit_pos(&from) - + shift->merging_units + 1, shift->merging_units, SHIFT_RIGHT, shift->merging_bytes); + coord_dec_item_pos(&from); + from_ih++; + } + + if (shift->entire) { + /* copy @entire items entirely */ + + /* copy item headers */ + xmemcpy(to_ih, from_ih, shift->entire * sizeof (item_header40)); + + /* update item header offset */ + old_offset = ih40_get_offset(from_ih + shift->entire - 1); + /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. 
*/ + for (i = 0; i < shift->entire; i++, to_ih++, from_ih++) + ih40_set_offset(to_ih, + ih40_get_offset(from_ih) - + old_offset + sizeof (node40_header) + shift->part_bytes); + /* copy item bodies */ + coord_add_item_pos(&from, -(int) (shift->entire - 1)); /* step @from back to the first of the entirely copied items */ + xmemcpy(zdata(to.node) + sizeof (node40_header) + + shift->part_bytes, item_by_coord_node40(&from), + shift->entire_bytes); + coord_dec_item_pos(&from); + } + + if (shift->part_units) { + coord_set_item_pos(&to, 0); + to.unit_pos = 0; + to.between = AT_UNIT; + /* copy trailing part (last @part units) of @source item as + a new first item into @target->node */ + + /* copy item header of partially copied item */ + xmemcpy(to_ih, from_ih, sizeof (item_header40)); + ih40_set_offset(to_ih, sizeof (node40_header)); /* the new first item body starts right after the node header */ + if (item_plugin_by_coord(&to)->b.init) + item_plugin_by_coord(&to)->b.init(&to, &from, 0); + copy_units(&to, &from, + coord_last_unit_pos(&from) - + shift->part_units + 1, shift->part_units, SHIFT_RIGHT, shift->part_bytes); + } + } +} + +/* remove everything either before or after @fact_stop. 
Number of items + removed completely is returned */ +static int +delete_copied(struct shift_params *shift) +{ + coord_t from; + coord_t to; + struct carry_cut_data cdata; + + if (shift->pend == SHIFT_LEFT) { + /* we were shifting to left, remove everything from the + beginning of @shift->real_stop.node up to + @shift->real_stop */ + coord_init_first_unit(&from, shift->real_stop.node); + to = shift->real_stop; + + /* store old coordinate of unit which will be first after + shift to left */ + shift->u.future_first = to; + coord_next_unit(&shift->u.future_first); + } else { + /* we were shifting to right, remove everything from + @shift->real_stop up to the end of + @shift->real_stop.node */ + from = shift->real_stop; + coord_init_last_unit(&to, from.node); + + /* store old coordinate of unit which will be last after + shift to right */ + shift->u.future_last = from; + coord_prev_unit(&shift->u.future_last); + } + + cdata.params.from = &from; + cdata.params.to = &to; + cdata.params.from_key = 0; + cdata.params.to_key = 0; + cdata.params.smallest_removed = 0; + return cut_node40(&cdata, 0); /* info is 0: cut_node40 then posts no carry operations */ +} + +/* znode has left and right delimiting keys. We moved data between nodes, + therefore we must update delimiting keys of those znodes. @left may be NULL, in which case only the left delimiting key of @right is set */ +/* Audited by: green(2002.06.13) */ +reiser4_internal void +update_znode_dkeys(znode * left, znode * right) +{ + reiser4_key key; + + assert("nikita-1470", rw_dk_is_write_locked(znode_get_tree(right))); + + leftmost_key_in_node(right, &key); + + if (0) { + printk("update_znode_dkeys: %p(%s) %p(%s)\n", + left, left ? (node_is_empty(left) ? "e" : "o") : "n", + right, right ? (node_is_empty(right) ? 
"e" : "o") : "n"); + print_key("leftmost", &key); + } + + if (left == NULL) { + /* update left delimiting key of @right */ + znode_set_ld_key(right, &key); + return; + } else if (!node_is_empty(left) && !node_is_empty(right)) { + /* update right delimiting key of @left */ + znode_set_rd_key(left, &key); + /* update left delimiting key of @right */ + znode_set_ld_key(right, &key); + return; + } else if (node_is_empty(left) && node_is_empty(right)) + /* AUDIT: there are 2 checks below both stating that both nodes cannot be empty, yet we return success before we even had a chance to check for the error. Perhaps some typo is here? */ + return; + else if (node_is_empty(left)) { + assert("vs-186", !node_is_empty(right)); + + /* update right delimiting key of @left */ + znode_set_rd_key(left, znode_get_ld_key(left)); + + /* update left delimiting key of @right */ + znode_set_ld_key(right, &key); + return; + } + + if (node_is_empty(right)) { + assert("vs-187", !node_is_empty(left)); + + /* update right delimiting key of @left */ + znode_set_rd_key(left, znode_get_rd_key(right)); + + /* update left delimiting key of @right */ + znode_set_ld_key(right, znode_get_rd_key(right)); + return; + } + impossible("vs-188", "both nodes can not be empty"); /* NOTE(review): every combination of empty and non-empty @left and @right has returned above, so this looks unreachable */ +} + +/* something was moved between @left and @right. Add carry operation to @info + list to have carry to update delimiting key between them. Returns 0 on success (also when @info is NULL or parent should not be notified) or an error code */ +static int +prepare_for_update(znode * left, znode * right, carry_plugin_info * info) +{ + carry_op *op; + carry_node *cn; + + if (info == NULL) + /* nowhere to send operation to. */ + return 0; + + if (!should_notify_parent(right)) + return 0; + + op = node_post_carry(info, COP_UPDATE, right, 1); + if (IS_ERR(op) || op == NULL) + return op ? 
PTR_ERR(op) : -EIO; + + if (left != NULL) { + carry_node *reference; + + if (info->doing) + reference = insert_carry_node(info->doing, + info->todo, left); + else + reference = op->node; + assert("nikita-2992", reference != NULL); + cn = add_carry(info->todo, POOLO_BEFORE, reference); + if (IS_ERR(cn)) + return PTR_ERR(cn); + cn->parent = 1; + cn->node = left; + if (ZF_ISSET(left, JNODE_ORPHAN)) + cn->left_before = 1; + op->u.update.left = cn; + } else + op->u.update.left = NULL; + return 0; +} + +/* plugin->u.node.prepare_removal + to delete a pointer to @empty from the tree add corresponding carry + operation (delete) to @info list. Returns 0 on success, also when nothing had to be posted (parent should not be notified or JNODE_HEARD_BANSHEE is already set on @empty), or an error code */ +reiser4_internal int +prepare_removal_node40(znode * empty, carry_plugin_info * info) +{ + carry_op *op; + + if (!should_notify_parent(empty)) + return 0; + /* already on a road to Styx */ + if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE)) + return 0; + op = node_post_carry(info, COP_DELETE, empty, 1); + if (IS_ERR(op) || op == NULL) + return RETERR(op ? PTR_ERR(op) : -EIO); + + op->u.delete.child = 0; + op->u.delete.flags = 0; + + /* fare thee well */ + ZF_SET(empty, JNODE_HEARD_BANSHEE); /* makes a repeated call return early, see the check above */ + return 0; +} + +/* something was shifted from @insert_coord->node to @shift->target, update + @insert_coord correspondingly. @removed is used as the number of item positions to subtract from @insert_coord after a partial shift to left */ +static void +adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed, int including_insert_coord) +{ + /* item plugin was invalidated by shifting */ + coord_clear_iplug(insert_coord); + + if (node_is_empty(shift->wish_stop.node)) { + assert("vs-242", shift->everything); + if (including_insert_coord) { + if (shift->pend == SHIFT_RIGHT) { + /* set @insert_coord before first unit of + @shift->target node */ + coord_init_before_first_item(insert_coord, shift->target); + } else { + /* set @insert_coord after last in target node */ + coord_init_after_last_item(insert_coord, shift->target); + } + } else { + /* set @insert_coord inside of empty node. There is + only one possible coord within an empty + node. 
init_first_unit will set that coord */ + coord_init_first_unit(insert_coord, shift->wish_stop.node); + } + return; + } + + if (shift->pend == SHIFT_RIGHT) { + /* there was shifting to right */ + if (shift->everything) { + /* everything wanted was shifted */ + if (including_insert_coord) { + /* @insert_coord is set before first unit of + @to node */ + coord_init_before_first_item(insert_coord, shift->target); + insert_coord->between = BEFORE_UNIT; + } else { + /* @insert_coord is set after last unit of + @insert->node */ + coord_init_last_unit(insert_coord, shift->wish_stop.node); + insert_coord->between = AFTER_UNIT; + } + } + return; /* when not everything was shifted to right the position of @insert_coord is left as it was */ + } + + /* there was shifting to left */ + if (shift->everything) { + /* everything wanted was shifted */ + if (including_insert_coord) { + /* @insert_coord is set after last unit in @to node */ + coord_init_after_last_item(insert_coord, shift->target); + } else { + /* @insert_coord is set before first unit in the same + node */ + coord_init_before_first_item(insert_coord, shift->wish_stop.node); + } + return; + } + + /* FIXME-VS: the code below is complicated because with between == + AFTER_ITEM unit_pos is set to 0 */ + + if (!removed) { + /* no items were shifted entirely */ + assert("vs-195", shift->merging_units == 0 || shift->part_units == 0); + + if (shift->real_stop.item_pos == insert_coord->item_pos) { + if (shift->merging_units) { + if (insert_coord->between == AFTER_UNIT) { + assert("nikita-1441", insert_coord->unit_pos >= shift->merging_units); + insert_coord->unit_pos -= shift->merging_units; + } else if (insert_coord->between == BEFORE_UNIT) { + assert("nikita-2090", insert_coord->unit_pos > shift->merging_units); + insert_coord->unit_pos -= shift->merging_units; + } + + assert("nikita-2083", insert_coord->unit_pos + 1); + } else { + if (insert_coord->between == AFTER_UNIT) { + assert("nikita-1442", insert_coord->unit_pos >= shift->part_units); + insert_coord->unit_pos -= shift->part_units; + } else if 
(insert_coord->between == BEFORE_UNIT) { + assert("nikita-2089", insert_coord->unit_pos > shift->part_units); + insert_coord->unit_pos -= shift->part_units; + } + + assert("nikita-2084", insert_coord->unit_pos + 1); + } + } + return; + } + + /* we shifted to left and there was no enough space for everything */ + switch (insert_coord->between) { + case AFTER_UNIT: + case BEFORE_UNIT: + if (shift->real_stop.item_pos == insert_coord->item_pos) + insert_coord->unit_pos -= shift->part_units; + case AFTER_ITEM: + coord_add_item_pos(insert_coord, -removed); + break; + default: + impossible("nikita-2087", "not ready"); + } + assert("nikita-2085", insert_coord->unit_pos + 1); +} + +static int +call_shift_hooks(struct shift_params *shift) +{ + unsigned i, shifted; + coord_t coord; + item_plugin *iplug; + + assert("vs-275", !node_is_empty(shift->target)); + + /* number of items shift touches */ + shifted = shift->entire + (shift->merging_units ? 1 : 0) + (shift->part_units ? 1 : 0); + + if (shift->pend == SHIFT_LEFT) { + /* moved items are at the end */ + coord_init_last_unit(&coord, shift->target); + coord.unit_pos = 0; + + assert("vs-279", shift->pend == 1); + for (i = 0; i < shifted; i++) { + unsigned from, count; + + iplug = item_plugin_by_coord(&coord); + if (i == 0 && shift->part_units) { + assert("vs-277", coord_num_units(&coord) == shift->part_units); + count = shift->part_units; + from = 0; + } else if (i == shifted - 1 && shift->merging_units) { + count = shift->merging_units; + from = coord_num_units(&coord) - count; + } else { + count = coord_num_units(&coord); + from = 0; + } + + if (iplug->b.shift_hook) { + iplug->b.shift_hook(&coord, from, count, shift->wish_stop.node); + } + coord_add_item_pos(&coord, -shift->pend); + } + } else { + /* moved items are at the beginning */ + coord_init_first_unit(&coord, shift->target); + + assert("vs-278", shift->pend == -1); + for (i = 0; i < shifted; i++) { + unsigned from, count; + + iplug = item_plugin_by_coord(&coord); + 
if (i == 0 && shift->part_units) { + assert("vs-277", coord_num_units(&coord) == shift->part_units); + count = coord_num_units(&coord); + from = 0; + } else if (i == shifted - 1 && shift->merging_units) { + count = shift->merging_units; + from = 0; + } else { + count = coord_num_units(&coord); + from = 0; + } + + if (iplug->b.shift_hook) { + iplug->b.shift_hook(&coord, from, count, shift->wish_stop.node); + } + coord_add_item_pos(&coord, -shift->pend); + } + } + + return 0; +} + +/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */ +static int +unit_moved_left(const struct shift_params *shift, const coord_t * old) +{ + assert("vs-944", shift->real_stop.node == old->node); + + if (shift->real_stop.item_pos < old->item_pos) + return 0; + if (shift->real_stop.item_pos == old->item_pos) { + if (shift->real_stop.unit_pos < old->unit_pos) + return 0; + } + return 1; +} + +/* shift to right is completed. Return 1 if unit @old was moved to right + neighbor */ +static int +unit_moved_right(const struct shift_params *shift, const coord_t * old) +{ + assert("vs-944", shift->real_stop.node == old->node); + + if (shift->real_stop.item_pos > old->item_pos) + return 0; + if (shift->real_stop.item_pos == old->item_pos) { + if (shift->real_stop.unit_pos > old->unit_pos) + return 0; + } + return 1; +} + +/* coord @old was set in node from which shift was performed. What was shifted + is stored in @shift. Update @old correspondingly to performed shift */ +static coord_t * +adjust_coord2(const struct shift_params *shift, const coord_t * old, coord_t * new) +{ + coord_clear_iplug(new); + new->between = old->between; + + coord_clear_iplug(new); + if (old->node == shift->target) { + if (shift->pend == SHIFT_LEFT) { + /* coord which is set inside of left neighbor does not + change during shift to left */ + coord_dup(new, old); + return new; + } + new->node = old->node; + coord_set_item_pos(new, + old->item_pos + shift->entire + + (shift->part_units ? 
1 : 0)); + new->unit_pos = old->unit_pos; + if (old->item_pos == 0 && shift->merging_units) + new->unit_pos += shift->merging_units; + return new; + } + + assert("vs-977", old->node == shift->wish_stop.node); + if (shift->pend == SHIFT_LEFT) { + if (unit_moved_left(shift, old)) { + /* unit @old moved to left neighbor. Calculate its + coordinate there */ + new->node = shift->target; + coord_set_item_pos(new, + node_num_items(shift->target) - + shift->entire - + (shift->part_units ? 1 : 0) + + old->item_pos); + + new->unit_pos = old->unit_pos; + if (shift->merging_units) { + coord_dec_item_pos(new); + if (old->item_pos == 0) { + /* unit_pos only changes if item got + merged */ + new->unit_pos = coord_num_units(new) - (shift->merging_units - old->unit_pos); + } + } + } else { + /* unit @old did not move to left neighbor. + + Use _nocheck, because @old is outside of its node. + */ + coord_dup_nocheck(new, old); + coord_add_item_pos(new, -shift->u.future_first.item_pos); + if (new->item_pos == 0) + new->unit_pos -= shift->u.future_first.unit_pos; + } + } else { + if (unit_moved_right(shift, old)) { + /* unit @old moved to right neighbor */ + new->node = shift->target; + coord_set_item_pos(new, + old->item_pos - + shift->real_stop.item_pos); + /* @new->unit_pos is not set anywhere above on this + path, so start from the old unit position (as the + shift-to-left branch does) */ + new->unit_pos = old->unit_pos; + if (new->item_pos == 0) { + /* unit @old might change unit pos: adjust the + unit position, not the item position, by + the unit offset of @shift->real_stop. + unit_moved_right() returned 1, so + real_stop.unit_pos <= old->unit_pos here */ + new->unit_pos -= shift->real_stop.unit_pos; + } + } else { + /* unit @old did not move to right neighbor, therefore + it did not change */ + coord_dup(new, old); + } + } + coord_set_iplug(new, item_plugin_by_coord(new)); + return new; +} + +/* this is called when shift is completed (something of source node is copied + to target and deleted in source) to update all taps set in current + context */ +static void +update_taps(const struct shift_params *shift) +{ + tap_t *tap; + coord_t new; + + for_all_taps(tap) { + /* update only taps set to nodes participating in shift */ + if (tap->coord->node == shift->wish_stop.node || tap->coord->node 
== shift->target) + tap_to_coord(tap, adjust_coord2(shift, tap->coord, &new)); + } +} + +#if REISER4_DEBUG + +struct shift_check { + reiser4_key key; + __u16 plugin_id; + union { + __u64 bytes; + __u64 entries; + void *unused; + } u; +}; + +void * +shift_check_prepare(const znode *left, const znode *right) +{ + pos_in_node_t i, nr_items; + int mergeable; + struct shift_check *data; + item_header40 *ih; + + + if (node_is_empty(left) || node_is_empty(right)) + mergeable = 0; + else { + coord_t l, r; + + coord_init_last_unit(&l, left); + coord_init_first_unit(&r, right); + mergeable = are_items_mergeable(&l, &r); + } + nr_items = node40_num_of_items_internal(left) + node40_num_of_items_internal(right) - (mergeable ? 1 : 0); + data = reiser4_kmalloc(sizeof(struct shift_check) * nr_items, GFP_KERNEL); + if (data != NULL) { + coord_t coord; + pos_in_node_t item_pos; + + coord_init_first_unit(&coord, left); + i = 0; + + for (item_pos = 0; item_pos < node40_num_of_items_internal(left); item_pos ++) { + + coord_set_item_pos(&coord, item_pos); + ih = node40_ih_at_coord(&coord); + + data[i].key = ih->key; + data[i].plugin_id = d16tocpu(&ih->plugin_id); + switch(data[i].plugin_id) { + case FORMATTING_ID: + data[i].u.bytes = coord_num_units(&coord); + break; + case EXTENT_POINTER_ID: + data[i].u.bytes = extent_size(&coord, coord_num_units(&coord)); + break; + case COMPOUND_DIR_ID: + data[i].u.entries = coord_num_units(&coord); + break; + default: + data[i].u.unused = NULL; + break; + } + i ++; + } + + coord_init_first_unit(&coord, right); + + if (mergeable) { + assert("vs-1609", i != 0); + + ih = node40_ih_at_coord(&coord); + + assert("vs-1589", data[i - 1].plugin_id == d16tocpu(&ih->plugin_id)); + switch(data[i - 1].plugin_id) { + case FORMATTING_ID: + data[i - 1].u.bytes += coord_num_units(&coord); + break; + case EXTENT_POINTER_ID: + data[i - 1].u.bytes += extent_size(&coord, coord_num_units(&coord)); + break; + case COMPOUND_DIR_ID: + data[i - 1].u.entries += 
coord_num_units(&coord); + break; + default: + impossible("vs-1605", "wrong mergeable item"); + break; + } + item_pos = 1; + } else + item_pos = 0; + for (; item_pos < node40_num_of_items_internal(right); item_pos ++) { + + assert("vs-1604", i < nr_items); + coord_set_item_pos(&coord, item_pos); + ih = node40_ih_at_coord(&coord); + + data[i].key = ih->key; + data[i].plugin_id = d16tocpu(&ih->plugin_id); + switch(data[i].plugin_id) { + case FORMATTING_ID: + data[i].u.bytes = coord_num_units(&coord); + break; + case EXTENT_POINTER_ID: + data[i].u.bytes = extent_size(&coord, coord_num_units(&coord)); + break; + case COMPOUND_DIR_ID: + data[i].u.entries = coord_num_units(&coord); + break; + default: + data[i].u.unused = NULL; + break; + } + i ++; + } + assert("vs-1606", i == nr_items); + } + return data; +} + +void +shift_check(void *vp, const znode *left, const znode *right) +{ + pos_in_node_t i, nr_items; + coord_t coord; + __u64 last_bytes; + int mergeable; + item_header40 *ih; + pos_in_node_t item_pos; + struct shift_check *data; + + data = (struct shift_check *)vp; + + if (data == NULL) + return; + + if (node_is_empty(left) || node_is_empty(right)) + mergeable = 0; + else { + coord_t l, r; + + coord_init_last_unit(&l, left); + coord_init_first_unit(&r, right); + mergeable = are_items_mergeable(&l, &r); + } + + nr_items = node40_num_of_items_internal(left) + node40_num_of_items_internal(right) - (mergeable ? 
1 : 0); + + i = 0; + last_bytes = 0; + + coord_init_first_unit(&coord, left); + + for (item_pos = 0; item_pos < node40_num_of_items_internal(left); item_pos ++) { + + coord_set_item_pos(&coord, item_pos); + ih = node40_ih_at_coord(&coord); + + assert("vs-1611", i == item_pos); + assert("vs-1590", keyeq(&ih->key, &data[i].key)); + assert("vs-1591", d16tocpu(&ih->plugin_id) == data[i].plugin_id); + if ((i < (node40_num_of_items_internal(left) - 1)) || !mergeable) { + switch(data[i].plugin_id) { + case FORMATTING_ID: + assert("vs-1592", data[i].u.bytes == coord_num_units(&coord)); + break; + case EXTENT_POINTER_ID: + assert("vs-1593", data[i].u.bytes == extent_size(&coord, coord_num_units(&coord))); + break; + case COMPOUND_DIR_ID: + assert("vs-1594", data[i].u.entries == coord_num_units(&coord)); + break; + default: + break; + } + } + if (item_pos == (node40_num_of_items_internal(left) - 1) && mergeable) { + switch(data[i].plugin_id) { + case FORMATTING_ID: + last_bytes = coord_num_units(&coord); + break; + case EXTENT_POINTER_ID: + last_bytes = extent_size(&coord, coord_num_units(&coord)); + break; + case COMPOUND_DIR_ID: + last_bytes = coord_num_units(&coord); + break; + default: + impossible("vs-1595", "wrong mergeable item"); + break; + } + } + i ++; + } + + coord_init_first_unit(&coord, right); + if (mergeable) { + ih = node40_ih_at_coord(&coord); + + assert("vs-1589", data[i - 1].plugin_id == d16tocpu(&ih->plugin_id)); + assert("vs-1608", last_bytes != 0); + switch(data[i - 1].plugin_id) { + case FORMATTING_ID: + assert("vs-1596", data[i - 1].u.bytes == last_bytes + coord_num_units(&coord)); + break; + + case EXTENT_POINTER_ID: + assert("vs-1597", data[i - 1].u.bytes == last_bytes + extent_size(&coord, coord_num_units(&coord))); + break; + + case COMPOUND_DIR_ID: + assert("vs-1598", data[i - 1].u.bytes == last_bytes + coord_num_units(&coord)); + break; + default: + impossible("vs-1599", "wrong mergeable item"); + break; + } + item_pos = 1; + } else + item_pos = 
0; + + for (; item_pos < node40_num_of_items_internal(right); item_pos ++) { + + coord_set_item_pos(&coord, item_pos); + ih = node40_ih_at_coord(&coord); + + assert("vs-1612", keyeq(&ih->key, &data[i].key)); + assert("vs-1613", d16tocpu(&ih->plugin_id) == data[i].plugin_id); + switch(data[i].plugin_id) { + case FORMATTING_ID: + assert("vs-1600", data[i].u.bytes == coord_num_units(&coord)); + break; + case EXTENT_POINTER_ID: + assert("vs-1601", data[i].u.bytes == extent_size(&coord, coord_num_units(&coord))); + break; + case COMPOUND_DIR_ID: + assert("vs-1602", data[i].u.entries == coord_num_units(&coord)); + break; + default: + break; + } + i ++; + } + + assert("vs-1603", i == nr_items); + reiser4_kfree(data); +} + +#endif + +ON_DEBUG_MODIFY(extern __u32 znode_checksum(const znode * node);) + +/* plugin->u.node.shift + look for description of this method in plugin/node/node.h */ +reiser4_internal int +shift_node40(coord_t *from, znode *to, shift_direction pend, + int delete_child, /* if @from->node becomes empty - it will be deleted from the tree if this is set to + 1 */ + int including_stop_coord /* */ , + carry_plugin_info *info) +{ + struct shift_params shift; + int result; + znode *left, *right; + znode *source; + int target_empty; +#if REISER4_DEBUG + struct shift_check *check_data; +#endif + + assert("nikita-2161", coord_check(from)); + + ON_DEBUG_MODIFY(znode_set_checksum(ZJNODE(to), 0)); + + xmemset(&shift, 0, sizeof (shift)); + shift.pend = pend; + shift.wish_stop = *from; + shift.target = to; + + assert("nikita-1473", znode_is_write_locked(from->node)); + assert("nikita-1474", znode_is_write_locked(to)); + node_check(from->node, 0); + node_check(to, 0); + + source = from->node; + + /* set @shift.wish_stop to rightmost/leftmost unit among units we want + shifted */ + if (node_is_empty(shift.wish_stop.node)) + result = 1; + if (pend == SHIFT_LEFT) { + result = coord_set_to_left(&shift.wish_stop); + left = to; + right = from->node; + } else { + result = 
coord_set_to_right(&shift.wish_stop); + left = from->node; + right = to; + } + + if (result) { + /* move insertion coord even if there is nothing to move */ + if (including_stop_coord) { + /* move insertion coord (@from) */ + if (pend == SHIFT_LEFT) { + /* after last item in target node */ + coord_init_after_last_item(from, to); + } else { + /* before first item in target node */ + coord_init_before_first_item(from, to); + } + } + /* there is nothing to shift */ + assert("nikita-2078", coord_check(from)); + return 0; + } + + target_empty = node_is_empty(to); + + ON_DEBUG_MODIFY(assert("nikita-3427", to->cksum == znode_checksum(to))); + + /* when first node plugin with item body compression is implemented, + this must be changed to call node specific plugin */ + + /* shift->stop_coord is updated to last unit which really will be + shifted */ + estimate_shift(&shift, get_current_context()); + if (!shift.shift_bytes) { + /* we could not shift anything */ + assert("nikita-2079", coord_check(from)); + ON_DEBUG_MODIFY(assert("nikita-3433", + to->cksum == znode_checksum(to))); + return 0; + } + + ON_DEBUG(check_data = shift_check_prepare(left, right)); + + IF_TRACE(TRACE_COORDS, print_coord("shift->wish_stop before copy:", &shift.wish_stop, 0)); + + copy(&shift); + + /* result value of this is important. It is used by adjust_coord below */ + result = delete_copied(&shift); + + assert("vs-1610", result >= 0); + assert("vs-1471", ((reiser4_context *) current->fs_context)->magic == context_magic); + + /* item which has been moved from one node to another might want to do + something on that event. 
This can be done by item's shift_hook + method, which will be now called for every moved items */ + call_shift_hooks(&shift); + + assert("vs-1472", ((reiser4_context *) current->fs_context)->magic == context_magic); + + update_taps(&shift); + + assert("vs-1473", ((reiser4_context *) current->fs_context)->magic == context_magic); + + /* adjust @from pointer in accordance with @including_stop_coord flag + and amount of data which was really shifted */ + adjust_coord(from, &shift, result, including_stop_coord); + + if (target_empty) + /* + * items were shifted into empty node. Update delimiting key. + */ + result = prepare_for_update(NULL, left, info); + + /* add update operation to @info, which is the list of operations to + be performed on a higher level */ + result = prepare_for_update(left, right, info); + if (!result && node_is_empty(source) && delete_child) { + /* all contents of @from->node is moved to @to and @from->node + has to be removed from the tree, so, on higher level we + will be removing the pointer to node @from->node */ + result = prepare_removal_node40(source, info); + } + +#ifdef DEBUGGING_SHIFT + dinfo("SHIFT TO %s: merging %d, entire %d, part %d, size %d\n", + shift.pend == SHIFT_LEFT ? "LEFT" : "RIGHT", + shift.merging_units, shift.entire, shift.part_units, shift.shift_bytes); +#endif + ON_TRACE(TRACE_SHIFT, "shift: [%Li] %s--%s [%Li]: %i\n", + *znode_get_block(left), + (shift.pend == SHIFT_LEFT) ? "<" : "", + (shift.pend == SHIFT_LEFT) ? "" : ">", *znode_get_block(right), shift.shift_bytes); + + node_check(source, 0); + node_check(to, 0); + assert("nikita-2080", coord_check(from)); + + ON_DEBUG(shift_check(check_data, left, right)); + + return result ? 
result : (int) shift.shift_bytes; +} + +/* plugin->u.node.fast_insert() + look for description of this method in plugin/node/node.h */ +reiser4_internal int +fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) +{ + return 1; +} + +/* plugin->u.node.fast_paste() + look for description of this method in plugin/node/node.h */ +reiser4_internal int +fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) +{ + return 1; +} + +/* plugin->u.node.fast_cut() + look for description of this method in plugin/node/node.h */ +reiser4_internal int +fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ ) +{ + return 1; +} + +/* plugin->u.node.modify - not defined */ + +/* plugin->u.node.max_item_size */ +reiser4_internal int +max_item_size_node40(void) +{ + return reiser4_get_current_sb()->s_blocksize - sizeof (node40_header) - sizeof (item_header40); +} + +/* plugin->u.node.set_item_plugin */ +reiser4_internal int +set_item_plugin_node40(coord_t *coord, item_id id) +{ + item_header40 *ih; + + ih = node40_ih_at_coord(coord); + cputod16(id, &ih->plugin_id); + coord->iplugid = id; + return 0; +} + + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/node/node40.h linux-2.6.4-ck1/fs/reiser4/plugin/node/node40.h --- linux-2.6.4/fs/reiser4/plugin/node/node40.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/node/node40.h 2004-03-11 22:45:15.348501034 +1100 @@ -0,0 +1,116 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#if !defined( __REISER4_NODE40_H__ ) +#define __REISER4_NODE40_H__ + +#include "../../forward.h" +#include "../../dformat.h" +#include "node.h" + +#include + + +/* format of node header for 40 node layouts. Keep bloat out of this struct. */ +typedef struct node40_header { + /* identifier of node plugin. 
Must be located at the very beginning + of a node. */ + common_node_header common_header; /* this is 16 bits */ + /* number of items. Should be first element in the node header, + because we haven't yet finally decided whether it shouldn't go into + common_header. + */ + d16 nr_items; + /* free space in node measured in bytes */ + d16 free_space; + /* offset to start of free space in node */ + d16 free_space_start; + /* for reiser4_fsck. When information about what is a free + block is corrupted, and we try to recover everything even + if marked as freed, then old versions of data may + duplicate newer versions, and this field allows us to + restore the newer version. Also useful for when users + who don't have the new trashcan installed on their linux distro + delete the wrong files and send us desperate emails + offering $25 for them back. */ + + /* magic field we need to tell formatted nodes */ + d32 magic; + /* flushstamp is made of mk_id and write_counter. mk_id is an + id generated randomly at mkreiserfs time. So we can just + skip all nodes with different mk_id. write_counter is d64 + incrementing counter of writes on disk. It is used for + choosing the newest data at fsck time. */ + + d32 mkfs_id; + d64 flush_id; + /* node flags to be used by fsck (reiser4ck or reiser4fsck?) 
+ and repacker */ + d16 flags; + + /* 1 is leaf level, 2 is twig level, root is the numerically + largest level */ + d8 level; + + d8 pad; +} PACKED node40_header; + +/* item headers are not standard across all node layouts, pass + pos_in_node to functions instead */ +typedef struct item_header40 { + /* key of item */ + /* 0 */ reiser4_key key; + /* offset from start of a node measured in 8-byte chunks */ + /* 24 */ d16 offset; + /* 26 */ d16 flags; + /* 28 */ d16 plugin_id; +} PACKED item_header40; + +size_t item_overhead_node40(const znode * node, flow_t * aflow); +size_t free_space_node40(znode * node); +node_search_result lookup_node40(znode * node, const reiser4_key * key, lookup_bias bias, coord_t * coord); +int num_of_items_node40(const znode * node); +char *item_by_coord_node40(const coord_t * coord); +int length_by_coord_node40(const coord_t * coord); +item_plugin *plugin_by_coord_node40(const coord_t * coord); +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key); +size_t estimate_node40(znode * node); +int check_node40(const znode * node, __u32 flags, const char **error); +int parse_node40(znode * node); +#if REISER4_DEBUG_OUTPUT +void print_node40(const char *prefix, const znode * node, __u32 flags); +#endif +int init_node40(znode * node); +int guess_node40(const znode * node); +void change_item_size_node40(coord_t * coord, int by); +int create_item_node40(coord_t * target, const reiser4_key * key, reiser4_item_data * data, carry_plugin_info * info); +void update_item_key_node40(coord_t * target, const reiser4_key * key, carry_plugin_info * info); +int kill_node40(struct carry_kill_data *, carry_plugin_info *); +int cut_node40(struct carry_cut_data *, carry_plugin_info *); +int shift_node40(coord_t * from, znode * to, shift_direction pend, + /* if @from->node becomes + empty - it will be deleted from + the tree if this is set to 1 + */ + int delete_child, int including_stop_coord, carry_plugin_info * info); + +int 
fast_insert_node40(const coord_t * coord); +int fast_paste_node40(const coord_t * coord); +int fast_cut_node40(const coord_t * coord); +int max_item_size_node40(void); +int prepare_removal_node40(znode * empty, carry_plugin_info * info); +int set_item_plugin_node40(coord_t * coord, item_id id); + +void update_znode_dkeys(znode * left, znode * right); + +/* __REISER4_NODE40_H__ */ +#endif +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/node/node.c linux-2.6.4-ck1/fs/reiser4/plugin/node/node.c --- linux-2.6.4/fs/reiser4/plugin/node/node.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/node/node.c 2004-03-11 22:45:15.340502278 +1100 @@ -0,0 +1,394 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Node plugin interface. + + Description: The tree provides the abstraction of flows, which it + internally fragments into items which it stores in nodes. + + A key_atom is a piece of data bound to a single key. + + For reasonable space efficiency to be achieved it is often + necessary to store key_atoms in the nodes in the form of items, where + an item is a sequence of key_atoms of the same or similar type. It is + more space-efficient, because the item can implement (very) + efficient compression of key_atom's bodies using internal knowledge + about their semantics, and it can often avoid having a key for each + key_atom. Each type of item has specific operations implemented by its + item handler (see balance.c). + + Rationale: the rest of the code (specifically balancing routines) + accesses leaf level nodes through this interface. This way we can + implement various block layouts and even combine various layouts + within the same tree. 
Balancing/allocating algorithms should not + care about peculiarities of splitting/merging specific item types, + but rather should leave that to the item's item handler. + + Items, including those that provide the abstraction of flows, have + the property that if you move them in part or in whole to another + node, the balancing code invokes their is_left_mergeable() + item_operation to determine if they are mergeable with their new + neighbor in the node you have moved them to. For some items the + is_left_mergeable() function always returns null. + + When moving the bodies of items from one node to another: + + if a partial item is shifted to another node the balancing code invokes + an item handler method to handle the item splitting. + + if the balancing code needs to merge with an item in the node it + is shifting to, it will invoke an item handler method to handle + the item merging. + + if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy(), + adjusting the item headers after the move is done using the node handler. 
+*/ + +#include "../../forward.h" +#include "../../debug.h" +#include "../../key.h" +#include "../../coord.h" +#include "../plugin_header.h" +#include "../item/item.h" +#include "node.h" +#include "../plugin.h" +#include "../../znode.h" +#include "../../tree.h" +#include "../../super.h" +#include "../../reiser4.h" + +/* return starting key of the leftmost item in the @node */ +reiser4_internal reiser4_key * +leftmost_key_in_node(const znode * node /* node to query */ , + reiser4_key * key /* resulting key */ ) +{ + assert("nikita-1634", node != NULL); + assert("nikita-1635", key != NULL); + + if (!node_is_empty(node)) { + coord_t first_item; + + coord_init_first_unit(&first_item, (znode *) node); + item_key_by_coord(&first_item, key); + } else + *key = *max_key(); + return key; +} + +#if REISER4_DEBUG_OUTPUT +/* helper function: convert 4 bit integer to its hex representation */ +/* Audited by: green(2002.06.12) */ +static char +hex_to_ascii(const int hex /* hex digit */ ) +{ + assert("nikita-1081", (0 <= hex) && (hex < 0x10)); + + if (hex < 10) + return '0' + hex; + else + return 'a' + hex - 10; +} + +/* helper function used to indent output during recursive tree printing */ +/* Audited by: green(2002.06.12) */ +reiser4_internal void +indent(unsigned indentation) +{ + unsigned i; + + for (i = 0; i < indentation; ++i) + printk("%.1i........", indentation - i); +} + +/* helper function used to indent output for @node during recursive tree + printing */ +reiser4_internal void +indent_znode(const znode * node /* current node */ ) +{ + if (znode_get_tree(node)->height < znode_get_level(node)) + indent(0); + else + indent(znode_get_tree(node)->height - znode_get_level(node)); +} + +/* debugging aid: output human readable information about @node */ +reiser4_internal void +print_node_content(const char *prefix /* output prefix */ , + const znode * node /* node to print */ , + __u32 flags /* print flags */ ) +{ + unsigned short i; + coord_t coord; + item_plugin *iplug; + 
reiser4_key key; + + if (!znode_is_loaded(node)) { + print_znode("znode is not loaded\n", node); + return; + } + if ((flags & REISER4_NODE_PRINT_HEADER) && (node_plugin_by_node(node)->print != NULL)) { + indent_znode(node); + node_plugin_by_node(node)->print(prefix, node, flags); + + indent_znode(node); + print_key("LDKEY", &node->ld_key); + + indent_znode(node); + print_key("RDKEY", &node->rd_key); + } + + /*if( flags & REISER4_NODE_SILENT ) {return;} */ + + coord.node = (znode *) node; + coord.unit_pos = 0; + coord.between = AT_UNIT; + /*indent_znode (node); */ + for (i = 0; i < node_num_items(node); i++) { + indent_znode(node); + printk("%d: ", i); + + coord_set_item_pos(&coord, i); + + iplug = item_plugin_by_coord(&coord); + if (flags & REISER4_NODE_PRINT_PLUGINS) { + print_plugin("\titem plugin", item_plugin_to_plugin(iplug)); + indent_znode(node); + } + if (flags & REISER4_NODE_PRINT_KEYS) { + item_key_by_coord(&coord, &key); + print_key("\titem key", &key); + } + + if ((flags & REISER4_NODE_PRINT_ITEMS) && (iplug->b.print)) { + indent_znode(node); + printk("\tlength %d\n", item_length_by_coord(&coord)); + indent_znode(node); + iplug->b.print("\titem", &coord); + } + if (flags & REISER4_NODE_PRINT_DATA) { + int j; + int length; + char *data; + + data = item_body_by_coord(&coord); + length = item_length_by_coord(&coord); + indent_znode(node); + printk("\titem length: %i, offset: %i\n", length, data - zdata(node)); + for (j = 0; j < length; ++j) { + char datum; + + if ((j % 16) == 0) { + /* next 16 bytes */ + if (j == 0) { + indent_znode(node); + printk("\tdata % .2i: ", j); + } else { + printk("\n"); + indent_znode(node); + printk("\t % .2i: ", j); + } + } + datum = data[j]; + printk("%c", hex_to_ascii((datum & 0xf0) >> 4)); + printk("%c ", hex_to_ascii(datum & 0xf)); + } + printk("\n"); + indent_znode(node); + } + printk("======================\n"); + } + printk("\n"); +} + +/* debugging aid: output human readable information about @node + the same as the 
above, but items to be printed must be specified */ +reiser4_internal void +print_node_items(const char *prefix /* output prefix */ , + const znode * node /* node to print */ , + __u32 flags /* print flags */ , + unsigned from, unsigned count) +{ + unsigned i; + coord_t coord; + item_plugin *iplug; + reiser4_key key; + + if (!znode_is_loaded(node)) { + print_znode("znode is not loaded\n", node); + return; + } + if ((flags & REISER4_NODE_PRINT_HEADER) && (node_plugin_by_node(node)->print != NULL)) { + indent_znode(node); + node_plugin_by_node(node)->print(prefix, node, flags); + + indent_znode(node); + print_key("LDKEY", &node->ld_key); + + indent_znode(node); + print_key("RDKEY", &node->rd_key); + } + + /*if( flags & REISER4_NODE_SILENT ) {return;} */ + + coord.node = (znode *) node; + coord.unit_pos = 0; + coord.between = AT_UNIT; + /*indent_znode (node); */ + if (from >= node_num_items(node) || from + count > node_num_items(node)) { + printk("there are no those items (%u-%u) in the node (%u)\n", + from, from + count - 1, node_num_items(node)); + return; + } + + for (i = from; i < from + count; i++) { + indent_znode(node); + printk("%d: ", i); + + coord_set_item_pos(&coord, i); + + iplug = item_plugin_by_coord(&coord); + if (flags & REISER4_NODE_PRINT_PLUGINS) { + print_plugin("\titem plugin", item_plugin_to_plugin(iplug)); + indent_znode(node); + } + if (flags & REISER4_NODE_PRINT_KEYS) { + item_key_by_coord(&coord, &key); + print_key("\titem key", &key); + } + + if ((flags & REISER4_NODE_PRINT_ITEMS) && (iplug->b.print)) { + indent_znode(node); + printk("\tlength %d\n", item_length_by_coord(&coord)); + indent_znode(node); + iplug->b.print("\titem", &coord); + } + if (flags & REISER4_NODE_PRINT_DATA) { + int j; + int length; + char *data; + + data = item_body_by_coord(&coord); + length = item_length_by_coord(&coord); + indent_znode(node); + printk("\titem length: %i, offset: %i\n", length, data - zdata(node)); + for (j = 0; j < length; ++j) { + char datum; + + if 
((j % 16) == 0) { + /* next 16 bytes */ + if (j == 0) { + indent_znode(node); + printk("\tdata % .2i: ", j); + } else { + printk("\n"); + indent_znode(node); + printk("\t % .2i: ", j); + } + } + datum = data[j]; + printk("%c", hex_to_ascii((datum & 0xf0) >> 4)); + printk("%c ", hex_to_ascii(datum & 0xf)); + } + printk("\n"); + indent_znode(node); + } + printk("======================\n"); + } + printk("\n"); +} +#endif + +#if REISER4_DEBUG_NODE +/* debugging aid: check consistency of @node content */ +void +node_check(znode * node /* node to check */ , + __u32 flags /* check flags */ ) +{ + const char *mes; + int result; + reiser4_tree *tree; + + if (!reiser4_is_debugged(reiser4_get_current_sb(), REISER4_CHECK_NODE)) + return; + + if (get_current_context()->disable_node_check) + return; + tree = znode_get_tree(node); + + if (lock_counters()->rw_locked_dk > 0) + return; + if (lock_counters()->rw_locked_tree > 0) + return; + + if (znode_above_root(node)) + return; + if (znode_just_created(node)) + return; + + zload(node); + result = node_plugin_by_node(node)->check(node, flags, &mes); + if (result != 0) { + printk("%s\n", mes); + print_node_content("check", node, ~0u); + reiser4_panic("vs-273", "node corrupted"); + } + zrelse(node); +} +#endif + +node_plugin node_plugins[LAST_NODE_ID] = { + [NODE40_ID] = { + .h = { + .type_id = REISER4_NODE_PLUGIN_TYPE, + .id = NODE40_ID, + .pops = NULL, + .label = "unified", + .desc = "unified node layout", + .linkage = TYPE_SAFE_LIST_LINK_ZERO, + } + , + .item_overhead = item_overhead_node40, + .free_space = free_space_node40, + .lookup = lookup_node40, + .num_of_items = num_of_items_node40, + .item_by_coord = item_by_coord_node40, + .length_by_coord = length_by_coord_node40, + .plugin_by_coord = plugin_by_coord_node40, + .key_at = key_at_node40, + .estimate = estimate_node40, + .check = check_node40, + .parse = parse_node40, + .init = init_node40, +#ifdef GUESS_EXISTS + .guess = guess_node40, +#endif +#if REISER4_DEBUG_OUTPUT + 
.print = print_node40, +#endif + .change_item_size = change_item_size_node40, + .create_item = create_item_node40, + .update_item_key = update_item_key_node40, + .cut_and_kill = kill_node40, + .cut = cut_node40, + .shift = shift_node40, + .fast_insert = fast_insert_node40, + .fast_paste = fast_paste_node40, + .fast_cut = fast_cut_node40, +#ifdef MODIFY_EXISTS + .modify = NULL, +#endif + .max_item_size = max_item_size_node40, + .prepare_removal = prepare_removal_node40, + .set_item_plugin = set_item_plugin_node40 + } +}; + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/node/node.h linux-2.6.4-ck1/fs/reiser4/plugin/node/node.h --- linux-2.6.4/fs/reiser4/plugin/node/node.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/node/node.h 2004-03-11 22:45:15.341502123 +1100 @@ -0,0 +1,315 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* We need a definition of the default node layout here. */ + +/* Generally speaking, it is best to have free space in the middle of the + node so that two sets of things can grow towards it, and to have the + item bodies on the left so that the last one of them grows into free + space. We optimize for the case where we append new items to the end + of the node, or grow the last item, because it hurts nothing to so + optimize and it is a common special case to do massive insertions in + increasing key order (and one of cases more likely to have a real user + notice the delay time for). + + formatted leaf default layout: (leaf1) + + |node header:item bodies:free space:key + pluginid + item offset| + + We grow towards the middle, optimizing layout for the case where we + append new items to the end of the node. The node header is fixed + length. 
Keys, and item offsets plus pluginids for the items + corresponding to them are in increasing key order, and are fixed + length. Item offsets are relative to start of node (16 bits creating + a node size limit of 64k, 12 bits might be a better choice....). Item + bodies are in decreasing key order. Item bodies have a variable size. + There is a one to one to one mapping of keys to item offsets to item + bodies. Item offsets consist of pointers to the zeroth byte of the + item body. Item length equals the start of the next item minus the + start of this item, except the zeroth item whose length equals the end + of the node minus the start of that item (plus a byte). In other + words, the item length is not recorded anywhere, and it does not need + to be since it is computable. + + Leaf variable length items and keys layout : (lvar) + + |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies| + + We grow towards the middle, optimizing layout for the case where we + append new items to the end of the node. The node header is fixed + length. Keys and item offsets for the items corresponding to them are + in increasing key order, and keys are variable length. Item offsets + are relative to start of node (16 bits). Item bodies are in + decreasing key order. Item bodies have a variable size. There is a + one to one to one mapping of keys to item offsets to item bodies. + Item offsets consist of pointers to the zeroth byte of the item body. + Item length equals the start of the next item's key minus the start of + this item, except the zeroth item whose length equals the end of the + node minus the start of that item (plus a byte). + + leaf compressed keys layout: (lcomp) + + |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies| + + We grow towards the middle, optimizing layout for the case where we + append new items to the end of the node. The node header is fixed + length. 
Keys and item offsets for the items corresponding to them are + in increasing key order, and keys are variable length. The "key + inherit" field indicates how much of the key prefix is identical to + the previous key (stem compression as described in "Managing + Gigabytes" is used). key_inherit is a one byte integer. The + intra-node searches performed through this layout are linear searches, + and this is theorized to not hurt performance much due to the high + cost of processor stalls on modern CPUs, and the small number of keys + in a single node. Item offsets are relative to start of node (16 + bits). Item bodies are in decreasing key order. Item bodies have a + variable size. There is a one to one to one mapping of keys to item + offsets to item bodies. Item offsets consist of pointers to the + zeroth byte of the item body. Item length equals the start of the + next item minus the start of this item, except the zeroth item whose + length equals the end of the node minus the start of that item (plus a + byte). In other words, item length and key length are not recorded + anywhere, and do not need to be since they are computable. + + internal node default layout: (idef1) + + just like the formatted leaf default layout (leaf1) except that item + bodies are either blocknrs of children or extents, and moving them may + require updating parent pointers in the nodes that they point to. +*/ + +/* There is an inherent 3-way tradeoff between optimizing and + exchanging disks between different architectures and code + complexity. This is optimal and simple and inexchangeable. + Someone else can do the code for exchanging disks and make it + complex. It would not be that hard. Using other than the PAGE_SIZE + might be suboptimal.
+*/
+
+#if !defined( __REISER4_NODE_H__ )
+#define __REISER4_NODE_H__
+
+#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE	/* leaf node size is tied to the page cache page size */
+
+#include "../../dformat.h"
+#include "../plugin_header.h"
+
+#include	/* NOTE(review): the header name is missing on this line (probably lost when the patch was extracted) - restore it */
+
+typedef enum {
+	NS_FOUND = 0,	/* zero: the search succeeded */
+	NS_NOT_FOUND = -ENOENT	/* negative errno value */
+} node_search_result;	/* return type of the node plugin ->lookup() method */
+
+/* Maximal possible space overhead for creation of new item in a node */
+#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
+
+typedef enum {
+	REISER4_NODE_PRINT_HEADER = (1u << 0),	/* print_node_items(): call node plugin ->print() and output LDKEY/RDKEY */
+	REISER4_NODE_PRINT_KEYS = (1u << 1),	/* print_node_items(): output key of each item */
+	REISER4_NODE_PRINT_PLUGINS = (1u << 2),	/* print_node_items(): output item plugin of each item */
+	REISER4_NODE_PRINT_ITEMS = (1u << 3),	/* print_node_items(): output item length and call item ->b.print() */
+	REISER4_NODE_PRINT_DATA = (1u << 4),	/* print_node_items(): hex dump of item body */
+	REISER4_NODE_CHECK = (1u << 5),
+	REISER4_NODE_PANIC = (1u << 6),
+	REISER4_NODE_PRINT_ZNODE = (1u << 7),
+	REISER4_NODE_DKEYS = (1u << 8),
+	REISER4_NODE_TREE_STABLE = (1u << 9),
+	REISER4_NODE_DONT_DOT = (1u << 10),
+	REISER4_NODE_PRINT_BRIEF = (1u << 11),
+	REISER4_NODE_ONLY_INCORE = (1u << 12),
+	REISER4_NODE_SILENT = (1u << 13),
+	REISER4_COLLECT_STAT = (1u << 14),
+	REISER4_NODE_PRINT_ALL = ~0u	/* every bit set, including the bits not named above */
+} reiser4_node_print_flag;	/* bit flags; combined with | and passed around as __u32 */
+
+#define REISER4_TREE_CHECK     ( REISER4_NODE_CHECK | REISER4_NODE_ONLY_INCORE | REISER4_NODE_SILENT | REISER4_NODE_TREE_STABLE )
+#define REISER4_TREE_VERBOSE   ( REISER4_NODE_PRINT_ALL & ~REISER4_NODE_SILENT )	/* all flags except SILENT */
+#define REISER4_TREE_BRIEF     ( REISER4_NODE_PRINT_BRIEF )
+#define REISER4_TREE_CHECK_ALL ( REISER4_TREE_CHECK & ~REISER4_NODE_ONLY_INCORE )	/* TREE_CHECK without the ONLY_INCORE restriction */
+
+/* cut and cut_and_kill have too long list of parameters.
This structure is just to safe some space on stack */ +struct cut_list { + coord_t * from; + coord_t * to; + const reiser4_key * from_key; + const reiser4_key * to_key; + reiser4_key * smallest_removed; + carry_plugin_info * info; + __u32 flags; + struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */ + lock_handle *left; + lock_handle *right; +}; + +struct carry_cut_data; +struct carry_kill_data; + +/* The responsibility of the node plugin is to store and give access + to the sequence of items within the node. */ +typedef struct node_plugin { + /* generic plugin fields */ + plugin_header h; + + /* calculates the amount of space that will be required to store an + item which is in addition to the space consumed by the item body. + (the space consumed by the item body can be gotten by calling + item->estimate) */ + size_t(*item_overhead) (const znode * node, flow_t * f); + + /* returns free space by looking into node (i.e., without using + znode->free_space). */ + size_t(*free_space) (znode * node); + /* search within the node for the one item which might + contain the key, invoking item->search_within to search within + that item to see if it is in there */ + node_search_result(*lookup) (znode * node, const reiser4_key * key, lookup_bias bias, coord_t * coord); + /* number of items in node */ + int (*num_of_items) (const znode * node); + + /* store information about item in @coord in @data */ + /* break into several node ops, don't add any more uses of this before doing so */ + /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */ + char *(*item_by_coord) (const coord_t * coord); + int (*length_by_coord) (const coord_t * coord); + item_plugin *(*plugin_by_coord) (const coord_t * coord); + + /* store item key in @key */ + reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key); + /* conservatively estimate whether unit of what size can fit + into node. 
This estimation should be performed without + actually looking into the node's content (free space is saved in + znode). */ + size_t(*estimate) (znode * node); + + /* performs every consistency check the node plugin author could + imagine. Optional. */ + int (*check) (const znode * node, __u32 flags, const char **error); + + /* Called when node is read into memory and node plugin is + already detected. This should read some data into znode (like free + space counter) and, optionally, check data consistency. + */ + int (*parse) (znode * node); + /* This method is called on a new node to initialise plugin specific + data (header, etc.) */ + int (*init) (znode * node); + /* Check whether @node content conforms to this plugin format. + Probably only useful after support for old V3.x formats is added. + Uncomment after 4.0 only. + */ + /* int ( *guess )( const znode *node ); */ +#if REISER4_DEBUG_OUTPUT + void (*print) (const char *prefix, const znode * node, __u32 flags); +#endif + /* change size of @item by @by bytes. @item->node has enough free + space. When @by > 0 - free space is appended to end of item. When + @by < 0 - item is truncated - it is assumed that last @by bytes if + the item are freed already */ + void (*change_item_size) (coord_t * item, int by); + + /* create new item @length bytes long in coord @target */ + int (*create_item) (coord_t * target, const reiser4_key * key, + reiser4_item_data * data, carry_plugin_info * info); + + /* update key of item. 
*/ + void (*update_item_key) (coord_t * target, const reiser4_key * key, carry_plugin_info * info); + +#if 0 + /* remove data between @from and @to from the tree */ + int (*cut_and_kill1) (struct cut_list *); + + /* remove data between @from and @to from a node (when shifting from + one node to another, one cuts from a node but does not cut_and_kill + from the tree) */ + int (*cut1) (struct cut_list *); +#endif + int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *); + int (*cut) (struct carry_cut_data *, carry_plugin_info *); + + /* copy as much as possible but not more than up to @stop from + @stop->node to @target. If (pend == append) then data from beginning of + @stop->node are copied to the end of @target. If (pend == prepend) then + data from the end of @stop->node are copied to the beginning of + @target. Copied data are removed from @stop->node. Information + about what to do on upper level is stored in @todo */ + int (*shift) (coord_t * stop, znode * target, shift_direction pend, + int delete_node, int including_insert_coord, carry_plugin_info * info); + /* return true if this node allows skip carry() in some situations + (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format + emulation doesn't. + + This will speedup insertions that doesn't require updates to the + parent, by bypassing initialisation of carry() structures. It's + believed that majority of insertions will fit there. + + */ + int (*fast_insert) (const coord_t * coord); + int (*fast_paste) (const coord_t * coord); + int (*fast_cut) (const coord_t * coord); + /* this limits max size of item which can be inserted into a node and + number of bytes item in a node may be appended with */ + int (*max_item_size) (void); + int (*prepare_removal) (znode * empty, carry_plugin_info * info); + /* change plugin id of items which are in a node already. 
Currently it is Used in tail conversion for regular + * files */ + int (*set_item_plugin) (coord_t * coord, item_id); +} node_plugin; + +typedef enum { + /* standard unified node layout used for both leaf and internal + nodes */ + NODE40_ID, + LAST_NODE_ID +} reiser4_node_id; + +extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key); +#if REISER4_DEBUG_OUTPUT +extern void print_node_content(const char *prefix, const znode * node, __u32 flags); +extern void print_node_items(const char *prefix /* output prefix */ , + const znode * node /* node to print */ , + __u32 flags /* print flags */ , + unsigned from, unsigned count); +#else +#define print_node_content(p,n,f) noop +#endif + +extern void indent(unsigned indentation); +extern void indent_znode(const znode * node); + +#if REISER4_DEBUG_NODE +extern void node_check(znode * node, __u32 flags); +#define DISABLE_NODE_CHECK \ +({ \ + ++ get_current_context() -> disable_node_check; \ +}) + +#define ENABLE_NODE_CHECK \ +({ \ + -- get_current_context() -> disable_node_check; \ +}) + +#else +#define node_check( n, f ) noop +#define DISABLE_NODE_CHECK noop +#define ENABLE_NODE_CHECK noop +#endif + +extern void indent_znode(const znode * node); + +typedef struct common_node_header { + /* identifier of node plugin. Must be located at the very beginning + of a node. 
*/ + d16 plugin_id; +} common_node_header; +/* __REISER4_NODE_H__ */ +#endif +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/object.c linux-2.6.4-ck1/fs/reiser4/plugin/object.c --- linux-2.6.4/fs/reiser4/plugin/object.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/object.c 2004-03-11 22:45:15.350500724 +1100 @@ -0,0 +1,1612 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Examples of object plugins: file, directory, symlink, special file */ +/* Plugins associated with inode: + + Plugin of inode is plugin referenced by plugin-id field of on-disk + stat-data. How we store this plugin in in-core inode is not + important. Currently pointers are used, another variant is to store + offsets and do array lookup on each access. + + Now, each inode has one selected plugin: object plugin that + determines what type of file this object is: directory, regular etc. + + This main plugin can use other plugins that are thus subordinated to + it. Directory instance of object plugin uses hash; regular file + instance uses tail policy plugin. + + Object plugin is either taken from id in stat-data or guessed from + i_mode bits. Once it is established we ask it to install its + subordinate plugins, by looking again in stat-data or inheriting them + from parent. +*/ +/* How new inode is initialized during ->read_inode(): + 1 read stat-data and initialize inode fields: i_size, i_mode, + i_generation, capabilities etc. + 2 read plugin id from stat data or try to guess plugin id + from inode->i_mode bits if plugin id is missing. + 3 Call ->init_inode() method of stat-data plugin to initialise inode fields. + 4 Call ->activate() method of object's plugin. 
Plugin is either read from + from stat-data or guessed from mode bits + 5 Call ->inherit() method of object plugin to inherit as yet initialized + plugins from parent. + + Easy induction proves that on last step all plugins of inode would be + initialized. + + When creating new object: + 1 obtain object plugin id (see next period) + 2 ->install() this plugin + 3 ->inherit() the rest from the parent + +*/ +/* We need some examples of creating an object with default and + non-default plugin ids. Nikita, please create them. + +*/ + +#include "../forward.h" +#include "../debug.h" +#include "../key.h" +#include "../kassign.h" +#include "../coord.h" +#include "../seal.h" +#include "plugin_header.h" +#include "item/static_stat.h" +#include "file/file.h" +#include "file/pseudo.h" +#include "symlink.h" +#include "dir/dir.h" +#include "item/item.h" +#include "plugin.h" +#include "object.h" +#if defined(XATTR) +#include "xattr.h" +#endif +#include "../znode.h" +#include "../tap.h" +#include "../tree.h" +#include "../vfs_ops.h" +#include "../inode.h" +#include "../super.h" +#include "../reiser4.h" +#include "../prof.h" +#include "../safe_link.h" + +#include +#include +#include +#include +#include /* security_inode_delete() */ +#include /* wake_up_inode() */ +#include +#include + +/* helper function to print errors */ +static void +key_warning(const reiser4_key * key /* key to print */, + const struct inode *inode, + int code /* error code to print */) +{ + assert("nikita-716", key != NULL); + + if (code != -ENOMEM) { + warning("nikita-717", "Error for inode %llu (%i)", + get_key_objectid(key), code); + print_key("for key", key); + print_inode("inode", inode); + } +} + +#if REISER4_DEBUG +static void +check_inode_seal(const struct inode *inode, + const coord_t *coord, const reiser4_key *key) +{ + reiser4_key unit_key; + + unit_key_by_coord(coord, &unit_key); + assert("nikita-2752", + WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key))); + assert("nikita-2753", 
get_inode_oid(inode) == get_key_objectid(key)); +} + +static void +check_sd_coord(coord_t *coord, const reiser4_key *key) +{ + reiser4_key ukey; + + coord_clear_iplug(coord); + if (zload(coord->node)) + return; + + if (!coord_is_existing_unit(coord) || + !item_plugin_by_coord(coord) || + !keyeq(unit_key_by_coord(coord, &ukey), key) || + (znode_get_level(coord->node) != LEAF_LEVEL) || + !item_is_statdata(coord)) { + warning("nikita-1901", "Conspicuous seal"); + print_key("key", key); + print_coord("coord", coord, 1); + impossible("nikita-2877", "no way"); + } + zrelse(coord->node); +} + +#else +#define check_inode_seal(inode, coord, key) noop +#define check_sd_coord(coord, key) noop +#endif + +/* find sd of inode in a tree, deal with errors */ +reiser4_internal int +lookup_sd(struct inode *inode /* inode to look sd for */ , + znode_lock_mode lock_mode /* lock mode */ , + coord_t * coord /* resulting coord */ , + lock_handle * lh /* resulting lock handle */ , + const reiser4_key * key /* resulting key */, + int silent) +{ + int result; + __u32 flags; + + assert("nikita-1692", inode != NULL); + assert("nikita-1693", coord != NULL); + assert("nikita-1694", key != NULL); + + /* look for the object's stat data in a tree. + This returns in "node" pointer to a locked znode and in "pos" + position of an item found in node. Both are only valid if + coord_found is returned. */ + flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0; + flags |= CBK_UNIQUE; + /* + * traverse tree to find stat data. We cannot use vroot here, because + * it only covers _body_ of the file, and stat data don't belong + * there. + */ + result = coord_by_key(tree_by_inode(inode), + key, + coord, + lh, + lock_mode, + FIND_EXACT, + LEAF_LEVEL, + LEAF_LEVEL, + flags, + 0); + if (REISER4_DEBUG && result == 0) + check_sd_coord(coord, key); + + if (result != 0 && !silent) + key_warning(key, inode, result); + return result; +} + +/* insert new stat-data into tree. Called with inode state + locked. 
Return inode state locked.
+
+   Requires REISER4_NO_SD to be set on @inode. Allocates an object id,
+   inserts an item of the length reported by the stat-data plugin
+   ->save_len() and fills it with ->save(). On success REISER4_NO_SD is
+   cleared, REISER4_SDLEN_KNOWN is set and the stat-data seal and coord
+   are remembered in the reiser4 inode.
+
+   Returns 0 on success, -ENAMETOOLONG if stat-data is longer than the
+   node plugin ->max_item_size(), -EOVERFLOW if oid_allocate() returned
+   ABSOLUTE_MAX_OID, otherwise the error of the failed step (a ->save()
+   failure other than -ENOMEM is converted to -EIO).
+
+   NOTE(review): the inode spin lock is taken and released inside this
+   function, so "inode state locked" above presumably refers to some other
+   lock - confirm.
+   NOTE(review): no oid_release() is visible on the failure paths after
+   oid_allocate() - confirm that the allocated oid is not leaked. */
+static int
+insert_new_sd(struct inode *inode /* inode to create sd for */ )
+{
+	int result;
+	reiser4_key key;
+	coord_t coord;
+	reiser4_item_data data;
+	char *area;
+	reiser4_inode *ref;
+	lock_handle lh;
+	oid_t oid;
+
+	assert("nikita-723", inode != NULL);
+	assert("nikita-3406", inode_get_flag(inode, REISER4_NO_SD));
+
+	ref = reiser4_inode_data(inode);
+	spin_lock_inode(inode);
+	grab_plugin_from(ref, sd, inode_sd_plugin(inode));
+
+	/*
+	 * prepare specification of new item to be inserted
+	 */
+
+	data.iplug = ref->pset->sd;
+	data.length = data.iplug->s.sd.save_len(inode);
+	spin_unlock_inode(inode);
+
+	data.data = NULL;
+	data.user = 0;
+
+	if (data.length > tree_by_inode(inode)->nplug->max_item_size()) {
+		/* This is silly check, but we don't know actual node where
+		   insertion will go into. */
+		return RETERR(-ENAMETOOLONG);
+	}
+	oid = oid_allocate(inode->i_sb);
+	if (oid == ABSOLUTE_MAX_OID)
+		return RETERR(-EOVERFLOW);
+
+	set_inode_oid(inode, oid);
+
+	coord_init_zero(&coord);
+	init_lh(&lh);
+
+	result = insert_by_key(tree_by_inode(inode),
+			       build_sd_key(inode, &key),
+			       &data,
+			       &coord,
+			       &lh,
+			       /* stat data lives on a leaf level */
+			       LEAF_LEVEL,
+			       CBK_UNIQUE);
+
+	/* we don't want to re-check that somebody didn't insert
+	   stat-data while we were doing io, because if it did,
+	   insert_by_key() returned error. */
+	/* but what _is_ possible is that plugin for inode's stat-data,
+	   list of non-standard plugins or their state would change
+	   during io, so that stat-data wouldn't fit into sd. To avoid
+	   this race we keep inode_state lock. This lock has to be
+	   taken each time you access inode in a way that would cause
+	   changes in sd size: changing plugins etc.
+	 */
+
+	if (result == IBK_INSERT_OK) {
+		write_current_tracef("..sd i %#llx %#llx",
+				     get_inode_oid(inode), ref->locality_id);
+
+		coord_clear_iplug(&coord);
+		result = zload(coord.node);
+		if (result == 0) {
+			/* have we really inserted stat data? */
+			assert("nikita-725", item_is_statdata(&coord));
+
+			/* inode was just created. It is inserted into hash
+			   table, but no directory entry was yet inserted into
+			   parent. So, inode is inaccessible through
+			   ->lookup(). All places that directly grab inode
+			   from hash-table (like old knfsd), should check
+			   IMMUTABLE flag that is set by common_create_child.
+			 */
+			assert("nikita-3240", data.iplug != NULL);
+			assert("nikita-3241", data.iplug->s.sd.save != NULL);
+			area = item_body_by_coord(&coord);
+			result = data.iplug->s.sd.save(inode, &area);
+			znode_make_dirty(coord.node);
+			if (result == 0) {
+				/* object has stat-data now */
+				inode_clr_flag(inode, REISER4_NO_SD);
+				inode_set_flag(inode, REISER4_SDLEN_KNOWN);
+				/* initialise stat-data seal */
+				seal_init(&ref->sd_seal, &coord, &key);
+				ref->sd_coord = coord;
+				check_inode_seal(inode, &coord, &key);
+			} else if (result != -ENOMEM)
+				/*
+				 * convert any other error code to -EIO to
+				 * avoid confusing user level with unexpected
+				 * errors.
+				 */
+				result = RETERR(-EIO);
+			zrelse(coord.node);
+		}
+	}
+	done_lh(&lh);
+
+	if (result != 0)
+		key_warning(&key, inode, result);
+	else
+		oid_count_allocated();
+
+	return result;
+}
+
+
+/* update stat-data at @coord.
+
+   Loads the node @coord points to, resizes the stat-data item through
+   resize_item() when REISER4_SDLEN_KNOWN is not set and the length
+   reported by ->save_len() differs from the current item length, then
+   writes stat-data with ->save() and re-initialises the stat-data seal
+   and coord stored in the reiser4 inode.
+
+   @loaded tracks the node that is currently zload()-ed, because
+   resize_item() may move @coord to another node; every exit path
+   releases exactly that node.
+
+   Returns the result of ->save(), or the error of zload()/resize_item(). */
+static int
+update_sd_at(struct inode * inode, coord_t * coord, reiser4_key * key,
+	     lock_handle * lh)
+{
+	int result;
+	reiser4_item_data data;
+	char *area;
+	reiser4_inode *state;
+	znode *loaded;
+
+	state = reiser4_inode_data(inode);
+
+	coord_clear_iplug(coord);
+	result = zload(coord->node);
+	if (result != 0)
+		return result;
+	loaded = coord->node;
+
+	spin_lock_inode(inode);
+	assert("nikita-728", state->pset->sd != NULL);
+	data.iplug = state->pset->sd;
+
+	/* data.length is how much space to add to (or remove
+	   from if negative) sd */
+	if (!inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
+		/* recalculate stat-data length */
+		data.length =
+			data.iplug->s.sd.save_len(inode) -
+			item_length_by_coord(coord);
+		inode_set_flag(inode, REISER4_SDLEN_KNOWN);
+	} else
+		data.length = 0;
+	spin_unlock_inode(inode);
+
+	/*zrelse(coord->node);*/
+
+	/* if on-disk stat data is of different length than required
+	   for this inode, resize it */
+	if (0 != data.length) {
+		data.data = NULL;
+		data.user = 0;
+
+		/* insertion code requires that insertion point (coord) was
+		 * between units. */
+		coord->between = AFTER_UNIT;
+		result = resize_item(coord,
+				     &data, key, lh, COPI_DONT_SHIFT_LEFT);
+		if (result != 0) {
+			key_warning(key, inode, result);
+			zrelse(loaded);
+			return result;
+		}
+		if (loaded != coord->node) {
+			/* resize_item moved coord to another node. Zload it */
+			zrelse(loaded);
+			coord_clear_iplug(coord);
+			result = zload(coord->node);
+			if (result != 0)
+				return result;
+			loaded = coord->node;
+		}
+	}
+
+	area = item_body_by_coord(coord);
+	spin_lock_inode(inode);
+	result = data.iplug->s.sd.save(inode, &area);
+	znode_make_dirty(coord->node);
+
+	/* re-initialise stat-data seal */
+
+	/*
+	 * coord.between was possibly skewed from AT_UNIT when stat-data size
+	 * was changed and new extensions were pasted into item.
+	 */
+	coord->between = AT_UNIT;
+	seal_init(&state->sd_seal, coord, key);
+	state->sd_coord = *coord;
+	spin_unlock_inode(inode);
+	check_inode_seal(inode, coord, key);
+	zrelse(loaded);
+	return result;
+}
+
+/* Update existing stat-data in a tree. Called with inode state locked. Return
+   inode state locked.
+
+   The stat-data item is located through the seal and coord remembered in
+   the reiser4 inode when the seal is set; if the seal is not set or its
+   validation fails, a full lookup_sd() with a write lock is done instead.
+   The located item is then rewritten by update_sd_at().
+
+   Returns 0 on success or the error of the lookup/update. */
+static int
+update_sd(struct inode *inode /* inode to update sd for */ )
+{
+	int result;
+	reiser4_key key;
+	coord_t coord;
+	seal_t seal;
+	reiser4_inode *state;
+	lock_handle lh;
+
+	assert("nikita-726", inode != NULL);
+
+	/* no stat-data, nothing to update?! */
+	assert("nikita-726000", !inode_get_flag(inode, REISER4_NO_SD));
+
+	init_lh(&lh);
+
+	state = reiser4_inode_data(inode);
+	spin_lock_inode(inode);
+	coord = state->sd_coord;
+	coord_clear_iplug(&coord);
+	seal = state->sd_seal;
+	spin_unlock_inode(inode);
+
+	build_sd_key(inode, &key);
+	if (seal_is_set(&seal)) {
+		/* first, try to use seal */
+		result = seal_validate(&seal,
+				       &coord,
+				       &key,
+				       LEAF_LEVEL,
+				       &lh,
+				       FIND_EXACT,
+				       ZNODE_WRITE_LOCK,
+				       ZNODE_LOCK_LOPRI);
+		if (result == 0)
+			check_sd_coord(&coord, &key);
+	} else
+		result = -E_REPEAT;	/* no seal: force the lookup below */
+
+	if (result != 0) {
+		coord_init_zero(&coord);
+		result = lookup_sd(inode,
+				   ZNODE_WRITE_LOCK, &coord, &lh, &key, 0);
+	}
+
+	/* we don't want to re-check that somebody didn't remove stat-data
+	   while we were doing io, because if it did, lookup_sd returned
+	   error. */
+	if (result == 0)
+		result = update_sd_at(inode, &coord, &key, &lh);
+	done_lh(&lh);
+
+	return result;
+}
+
+/* save object's stat-data to disk: insert new stat-data if @inode has
+   REISER4_NO_SD set, update the existing one otherwise. Failures other than
+   -ENAMETOOLONG and -ENOMEM are logged. Returns the result of the
+   insert/update. */
+reiser4_internal int
+write_sd_by_inode_common(struct inode *inode /* object to save */)
+{
+	int result;
+
+	assert("nikita-730", inode != NULL);
+
+	mark_inode_update(inode, 1);
+
+	if (inode_get_flag(inode, REISER4_NO_SD))
+		/* object doesn't have stat-data yet */
+		result = insert_new_sd(inode);
+	else
+		result = update_sd(inode);
+	if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
+		/* Don't issue warnings about "name is too long" */
+		warning("nikita-2221", "Failed to save sd for %llu: %i",
+			get_inode_oid(inode), result);
+	return result;
+}
+
+/* checks whether yet another hard links to this object can be added.
+   Returns non-zero if the link counter can be incremented, 0 if the
+   increment would wrap around to 0. */
+reiser4_internal int
+can_add_link_common(const struct inode *object /* object to check */ )
+{
+	assert("nikita-732", object != NULL);
+
+	/* inode->i_nlink is unsigned int, so just check for integer
+	 * overflow */
+	return object->i_nlink + 1 != 0;
+}
+
+
+/* space for stat data removal is reserved.
+
+   If @inode has stat-data: releases quota, cuts the stat-data item out of
+   the tree, sets REISER4_NO_SD, releases the object id and removes the
+   SAFE_UNLINK safe-link; the first failing step stops the sequence and its
+   error is returned. Returns 0 if there is no stat-data.
+
+   NOTE(review): assertion label "nikita-1477" is also used by
+   delete_file_common() - labels are presumably meant to be unique. */
+reiser4_internal int
+common_file_delete_no_reserve(struct inode *inode /* object to remove */ )
+{
+	int result;
+
+	assert("nikita-1477", inode != NULL);
+
+	if (!inode_get_flag(inode, REISER4_NO_SD)) {
+		reiser4_key sd_key;
+
+		DQUOT_FREE_INODE(inode);
+		DQUOT_DROP(inode);
+
+		build_sd_key(inode, &sd_key);
+		write_current_tracef("..sd k %#llx", get_inode_oid(inode));
+		result = cut_tree(tree_by_inode(inode), &sd_key, &sd_key, NULL);
+		if (result == 0) {
+			inode_set_flag(inode, REISER4_NO_SD);
+			result = oid_release(inode->i_sb, get_inode_oid(inode));
+			if (result == 0) {
+				oid_count_released();
+
+				result = safe_link_del(inode, SAFE_UNLINK);
+			}
+		}
+	} else
+		result = 0;
+	return result;
+}
+
+/* delete_file_common() - delete object stat-data.
This is to be used when file deletion turns into stat data removal */ +reiser4_internal int +delete_file_common(struct inode *inode /* object to remove */ ) +{ + int result; + + assert("nikita-1477", inode != NULL); + assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode)); + assert("nikita-3421", inode->i_nlink == 0); + + if (!inode_get_flag(inode, REISER4_NO_SD)) { + reiser4_block_nr reserve; + + /* grab space which is needed to remove stat data and + * safe-link form the tree */ + reserve = 2 * estimate_one_item_removal(tree_by_inode(inode)); + if (reiser4_grab_space_force(reserve, + BA_RESERVED | BA_CAN_COMMIT)) { + warning("nikita-2847", + "Cannot delete unnamed sd of %lli. Run fsck", + get_inode_oid(inode)); + return RETERR(-ENOSPC); + } + result = common_file_delete_no_reserve(inode); + } else + result = 0; + return result; +} + +/* common directory consists of two items: stat data and one item containing "." and ".." */ +static int delete_directory_common(struct inode *inode) +{ + int result; + dir_plugin *dplug; + + dplug = inode_dir_plugin(inode); + assert("vs-1101", dplug && dplug->done); + + /* grab space enough for removing two items */ + if (reiser4_grab_space(2 * estimate_one_item_removal(tree_by_inode(inode)), BA_RESERVED | BA_CAN_COMMIT)) + return RETERR(-ENOSPC); + + result = dplug->done(inode); + if (!result) + result = common_file_delete_no_reserve(inode); + all_grabbed2free(); + return result; +} + +/* ->set_plug_in_inode() default method. */ +static int +set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ , + struct inode *parent /* parent object */ , + reiser4_object_create_data * data /* creational + * data */ ) +{ + __u64 mask; + + object->i_mode = data->mode; + /* this should be plugin decision */ + object->i_uid = current->fsuid; + object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME; + + /* support for BSD style group-id assignment. 
*/ + if (reiser4_is_set(object->i_sb, REISER4_BSD_GID)) + object->i_gid = parent->i_gid; + else if (parent->i_mode & S_ISGID) { + /* parent directory has sguid bit */ + object->i_gid = parent->i_gid; + if (S_ISDIR(object->i_mode)) + /* sguid is inherited by sub-directories */ + object->i_mode |= S_ISGID; + } else + object->i_gid = current->fsgid; + + /* this object doesn't have stat-data yet */ + inode_set_flag(object, REISER4_NO_SD); + /* setup inode and file-operations for this inode */ + setup_inode_ops(object, data); + object->i_nlink = 0; + seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL); + mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT); + if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES)) + mask |= (1 << LARGE_TIMES_STAT); + + scint_pack(&reiser4_inode_data(object)->extmask, mask, GFP_ATOMIC); + return 0; +} + +/* Determine object plugin for @inode based on i_mode. + + Most objects in reiser4 file system are controlled by standard object + plugins: regular file, directory, symlink, fifo, and so on. + + For such files we don't explicitly store plugin id in object stat + data. Rather required plugin is guessed from mode bits, where file "type" + is encoded (see stat(2)). 
+*/
+reiser4_internal int
+guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
+{
+	reiser4_inode *state;
+	int file_id;
+	int dir_id;
+
+	assert("nikita-736", inode != NULL);
+
+	/* negative id means "no plugin of this kind" */
+	file_id = -1;
+	dir_id = -1;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		file_id = UNIX_FILE_PLUGIN_ID;
+		break;
+	case S_IFDIR:
+		/* directory is the only type that also gets a dir plugin */
+		file_id = DIRECTORY_FILE_PLUGIN_ID;
+		dir_id = HASHED_DIR_PLUGIN_ID;
+		break;
+	case S_IFLNK:
+		file_id = SYMLINK_FILE_PLUGIN_ID;
+		break;
+	case S_IFSOCK:
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFIFO:
+		file_id = SPECIAL_FILE_PLUGIN_ID;
+		break;
+	default:
+		warning("nikita-737", "wrong file mode: %o", inode->i_mode);
+		return RETERR(-EIO);
+	}
+
+	state = reiser4_inode_data(inode);
+	if (file_id >= 0)
+		plugin_set_file(&state->pset, file_plugin_by_id(file_id));
+	else
+		plugin_set_file(&state->pset, NULL);
+	if (dir_id >= 0)
+		plugin_set_dir(&state->pset, dir_plugin_by_id(dir_id));
+	else
+		plugin_set_dir(&state->pset, NULL);
+	return 0;
+}
+
+/* common implementation of the create estimation function: usable when object creation involves insertion of a
+   single item (usually stat data) into the tree */
+static reiser4_block_nr estimate_create_file_common(struct inode *object)
+{
+	return estimate_one_insert_item(tree_by_inode(object));
+}
+
+/* this common implementation of create directory estimation function may be used when directory creation involves
+   insertion of two items (usually stat data and item containing "."
and "..") into tree */ +static reiser4_block_nr estimate_create_dir_common(struct inode *object) +{ + return 2 * estimate_one_insert_item(tree_by_inode(object)); +} + +/* ->create method of object plugin: grabs space for one item insertion (any failure is reported as -ENOSPC), then calls write_sd_by_inode_common() */ +static int +create_common(struct inode *object, struct inode *parent UNUSED_ARG, + reiser4_object_create_data * data UNUSED_ARG) +{ + reiser4_block_nr reserve; + assert("nikita-744", object != NULL); + assert("nikita-745", parent != NULL); + assert("nikita-747", data != NULL); + assert("nikita-748", inode_get_flag(object, REISER4_NO_SD)); + + reserve = estimate_create_file_common(object); + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) + return RETERR(-ENOSPC); + return write_sd_by_inode_common(object); +} + +/* standard implementation of ->owns_item() plugin method: compare objectids + of keys in inode and coord */ +reiser4_internal int +owns_item_common(const struct inode *inode /* object to check + * against */ , + const coord_t * coord /* coord to check */ ) +{ + reiser4_key item_key; + reiser4_key file_key; + + assert("nikita-760", inode != NULL); + assert("nikita-761", coord != NULL); + + return /*coord_is_in_node( coord ) && */ + coord_is_existing_item(coord) && + (get_key_objectid(build_sd_key(inode, &file_key)) == get_key_objectid(item_key_by_coord(coord, &item_key))); +} + +/* @count bytes of flow @f got written, update correspondingly f->length, + f->data and f->key */ +reiser4_internal void +move_flow_forward(flow_t * f, unsigned count) +{ + if (f->data) /* ->data may be NULL; then only length and key offset advance */ + f->data += count; + f->length -= count; + set_key_offset(&f->key, get_key_offset(&f->key) + count); +} + +/* default ->add_link() method of file plugin */ +static int +add_link_common(struct inode *object, struct inode *parent UNUSED_ARG) +{ + /* + * increment ->i_nlink and update ->i_ctime + */ + + INODE_INC_FIELD(object, i_nlink); + object->i_ctime = CURRENT_TIME; + return 0; +} + +/* default ->rem_link() method of file plugin */ +static int +rem_link_common(struct inode *object, struct inode
*parent UNUSED_ARG) +{ + assert("nikita-2021", object != NULL); + assert("nikita-2163", object->i_nlink > 0); + + /* + * decrement ->i_nlink and update ->i_ctime + */ + + INODE_DEC_FIELD(object, i_nlink); + assert("nikita-3407", inode_dir_plugin(object) == NULL || + object->i_nlink != 1 || object->i_size <= 1); + object->i_ctime = CURRENT_TIME; + return 0; +} + +/* ->not_linked() method for file plugins */ +static int +not_linked_common(const struct inode *inode) +{ + assert("nikita-2007", inode != NULL); + return (inode->i_nlink == 0); +} + +/* ->not_linked() method for the directory file plugin */ +static int +not_linked_dir(const struct inode *inode) +{ + assert("nikita-2008", inode != NULL); + /* one link from dot */ + return (inode->i_nlink == 1); +} + +/* ->adjust_to_parent() method for regular files; @root is used when @parent is NULL */ +static int +adjust_to_parent_common(struct inode *object /* new object */ , + struct inode *parent /* parent directory */ , + struct inode *root /* root directory */ ) +{ + reiser4_inode *self; + reiser4_inode *ancestor; + + assert("nikita-2165", object != NULL); + if (parent == NULL) + parent = root; + assert("nikita-2069", parent != NULL); + + self = reiser4_inode_data(object); + ancestor = reiser4_inode_data(parent); + + /* + * inherit missing plugins from parent + */ + + grab_plugin(self, ancestor, file); + grab_plugin(self, ancestor, sd); + grab_plugin(self, ancestor, formatting); + grab_plugin(self, ancestor, perm); + return 0; +} + +/* ->adjust_to_parent() method for directory files; @root is used when @parent is NULL */ +static int +adjust_to_parent_dir(struct inode *object /* new object */ , + struct inode *parent /* parent directory */ , + struct inode *root /* root directory */ ) +{ + reiser4_inode *self; + reiser4_inode *ancestor; + + assert("nikita-2166", object != NULL); + if (parent == NULL) + parent = root; + assert("nikita-2167", parent != NULL); + + self = reiser4_inode_data(object); + ancestor = reiser4_inode_data(parent); + + /* + * inherit missing plugins from parent + */ +
+ grab_plugin(self, ancestor, file); + grab_plugin(self, ancestor, dir); + grab_plugin(self, ancestor, sd); + grab_plugin(self, ancestor, hash); + grab_plugin(self, ancestor, formatting); + grab_plugin(self, ancestor, perm); + grab_plugin(self, ancestor, dir_item); + return 0; +} + +/* simplest implementation of ->getattr() method. Completely static: every field of @stat is copied or computed from the inode, always returns 0. */ +static int +getattr_common(struct vfsmount *mnt UNUSED_ARG, struct dentry *dentry, struct kstat *stat) +{ + struct inode *obj; + + assert("nikita-2298", dentry != NULL); + assert("nikita-2299", stat != NULL); + assert("nikita-2300", dentry->d_inode != NULL); + + obj = dentry->d_inode; + + stat->dev = obj->i_sb->s_dev; + stat->ino = oid_to_uino(get_inode_oid(obj)); + stat->mode = obj->i_mode; + /* don't confuse userland with huge nlink. This is not entirely + * correct, because nlink_t is not necessarily 16 bit signed. */ + stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink))0x7fff); + stat->uid = obj->i_uid; + stat->gid = obj->i_gid; + stat->rdev = obj->i_rdev; + stat->atime = obj->i_atime; + stat->mtime = obj->i_mtime; + stat->ctime = obj->i_ctime; + stat->size = obj->i_size; + stat->blocks = (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS; /* byte count rounded up to VFS_BLKSIZE units */ + /* "preferred" blocksize for efficient file system I/O */ + stat->blksize = get_super_private(obj->i_sb)->optimal_io_size; + + return 0; +} + +/* plugin->u.file.release */ +static int +release_dir(struct inode *inode, struct file *file) +{ + /* this is called when directory file descriptor is closed. */ + spin_lock_inode(inode); + /* remove directory from readdir list. See comment before + * readdir_common() for details. */ + if (file->private_data != NULL) + readdir_list_remove(reiser4_get_file_fsdata(file)); + spin_unlock_inode(inode); + return 0; +} + +/* + * seek method for directory. See comment before readdir_common() for + * explanation.
+ */ +static loff_t +seek_dir(struct file *file, loff_t off, int origin) +{ + loff_t result; + struct inode *inode; + + inode = file->f_dentry->d_inode; + ON_TRACE(TRACE_DIR | TRACE_VFS_OPS, "dir_seek: %s: %lli -> %lli/%i\n", + file->f_dentry->d_name.name, file->f_pos, off, origin); + down(&inode->i_sem); + /* update ->f_pos */ + result = default_llseek(file, off, origin); + if (result >= 0) { + int ff; + coord_t coord; + lock_handle lh; + tap_t tap; + readdir_pos *pos; + + coord_init_zero(&coord); + init_lh(&lh); + tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK); + + ff = dir_readdir_init(file, &tap, &pos); + if (ff != 0) /* a non-zero result from dir_readdir_init() replaces the seek result */ + result = (loff_t) ff; + tap_done(&tap); + } + up(&inode->i_sem); + return result; +} + +/* default implementation of ->bind() method of file plugin: ignores its arguments and returns 0. Also reused below as detach_common and, cast to void *, as the "cannot" stub */ +static int +bind_common(struct inode *child UNUSED_ARG, struct inode *parent UNUSED_ARG) +{ + return 0; +} + +#define detach_common bind_common +#define cannot ((void *)bind_common) + +static int +detach_dir(struct inode *child, struct inode *parent) +{ + dir_plugin *dplug; + + dplug = inode_dir_plugin(child); + assert("nikita-2883", dplug != NULL); + assert("nikita-2884", dplug->detach != NULL); + return dplug->detach(child, parent); +} + + +/* this common implementation of update estimation function may be used when stat data update does not do more than + inserting a unit into a stat data item which is probably true for most cases */ +reiser4_internal reiser4_block_nr +estimate_update_common(const struct inode *inode) +{ + return estimate_one_insert_into_item(tree_by_inode(inode)); +} + +static reiser4_block_nr +estimate_unlink_common(struct inode *object UNUSED_ARG, + struct inode *parent UNUSED_ARG) +{ + return 0; +} + +static reiser4_block_nr +estimate_unlink_dir_common(struct inode *object, struct inode *parent) +{ + dir_plugin *dplug; + + dplug = inode_dir_plugin(object); + assert("nikita-2888", dplug != NULL); + assert("nikita-2887", dplug->estimate.unlink != NULL); + return
dplug->estimate.unlink(object, parent); +} + +/* implementation of ->bind() method for file plugin of directory file */ +static int +bind_dir(struct inode *child, struct inode *parent) +{ + dir_plugin *dplug; + + dplug = inode_dir_plugin(child); + assert("nikita-2646", dplug != NULL); + return dplug->attach(child, parent); +} + +/* ->setattr() method. This is called when inode attribute is modified. + * This common version does not handle size changes: ATTR_SIZE must not be set (asserted). */ +reiser4_internal int +setattr_common(struct inode *inode /* Object to change attributes */, + struct iattr *attr /* change description */) +{ + int result; + __u64 tograb; + + assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE)); + + /* + * grab disk space and call standard inode_setattr(). + */ + + tograb = estimate_one_insert_into_item(tree_by_inode(inode)); + result = reiser4_grab_space(tograb, BA_CAN_COMMIT); + if (!result) { + result = inode_setattr(inode, attr); + if (!result) + /* "capture" inode */ + result = reiser4_mark_inode_dirty(inode); + all_grabbed2free(); + } + return result; +} + +/* doesn't seem to be exported in headers. */ +extern spinlock_t inode_lock; + +/* ->delete_inode() method. This is called by + * iput()->iput_final()->drop_inode() when last reference to inode is released + * and inode has no names. */ +static void delete_inode_common(struct inode *object) +{ + /* create context here. + * + * removal of inode from the hash table (done at the very beginning of + * generic_delete_inode()), truncate of pages, and removal of file's + * extents has to be performed in the same atom. Otherwise, it may so + * happen, that twig node with unallocated extent will be flushed to + * the disk.
+ */ + reiser4_context ctx; + + /* + * FIXME: this resembles generic_delete_inode. NOTE(review): inode_lock is dropped below without being taken in this function -- presumably the caller holds it; confirm. + */ + list_del_init(&object->i_list); + object->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + + init_context(&ctx, object->i_sb); + + uncapture_inode(object); + + if (!is_bad_inode(object)) { + file_plugin *fplug; + + /* truncate object body */ + fplug = inode_file_plugin(object); + if (fplug->pre_delete != NULL && fplug->pre_delete(object) != 0) + warning("vs-1216", "Failed to delete file body %llu", + get_inode_oid(object)); + else + assert("vs-1430", + reiser4_inode_data(object)->jnodes == 0); + } + + if (object->i_data.nrpages) { + warning("vs-1434", "nrpages %ld\n", object->i_data.nrpages); + truncate_inode_pages(&object->i_data, 0); + } + security_inode_delete(object); + if (!is_bad_inode(object)) + DQUOT_INIT(object); + + object->i_sb->s_op->delete_inode(object); + + spin_lock(&inode_lock); + hlist_del_init(&object->i_hash); + spin_unlock(&inode_lock); + wake_up_inode(object); + if (object->i_state != I_CLEAR) + BUG(); + destroy_inode(object); + (void)reiser4_exit_context(&ctx); +} + +/* + * ->forget_inode() method. Called by iput()->iput_final()->drop_inode() when + * last reference to inode with names is released + */ +static void forget_inode_common(struct inode *object) +{ + generic_forget_inode(object); +} + +/* ->drop_inode() method. Called by iput()->iput_final() when last reference + * to inode is released. Calls ->delete_inode() of the file plugin when ->not_linked() is true, ->forget_inode() otherwise; an inode without a file plugin is passed to forget_inode_common(). */ +static void drop_common(struct inode * object) +{ + file_plugin *fplug; + + assert("nikita-2643", object != NULL); + + /* -not- creating context in this method, because it is frequently + called and all existing ->not_linked() methods are one liners.
*/ + fplug = inode_file_plugin(object); + if (fplug == NULL) /* fplug is NULL for fake inode: there are no plugin methods to call, so do not dereference it */ + forget_inode_common(object); + else if (fplug->not_linked(object)) { + assert("nikita-3231", fplug->delete_inode != NULL); + fplug->delete_inode(object); + } else { + assert("nikita-3232", fplug->forget_inode != NULL); + fplug->forget_inode(object); + } +} + +static ssize_t +isdir(void) +{ + return RETERR(-EISDIR); +} + +#define eisdir ((void *)isdir) + +static ssize_t +perm(void) +{ + return RETERR(-EPERM); +} + +#define eperm ((void *)perm) + +static int +can_rem_dir(const struct inode * inode) +{ + /* is_dir_empty() returns 0 if dir is empty */ + return !is_dir_empty(inode); +} + +static int +process_truncate(struct inode *inode, __u64 size) +{ + int result; + struct iattr attr; + file_plugin *fplug; + reiser4_context ctx; + + init_context(&ctx, inode->i_sb); + + attr.ia_size = size; + attr.ia_valid = ATTR_SIZE | ATTR_CTIME; + fplug = inode_file_plugin(inode); + + down(&inode->i_sem); + result = fplug->setattr(inode, &attr); + up(&inode->i_sem); + + context_set_commit_async(&ctx); + reiser4_exit_context(&ctx); + + return result; +} + +reiser4_internal int +safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value) +{ + int result; + + if (link == SAFE_UNLINK) + /* nothing to do.
iput() in the caller (process_safelink) will + * finish with file */ + result = 0; + else if (link == SAFE_TRUNCATE) + result = process_truncate(object, value); + else { + warning("nikita-3438", "Unrecognized safe-link type: %i", link); + result = RETERR(-EIO); + } + return result; +} + +static void +clear_inode_common(struct inode *inode) +{ + perm_plugin *pplug; + + /* + * for now, only ACLs want to do something in ->clear_inode: the permission plugin's ->clear hook is called when both exist + */ + + pplug = inode_perm_plugin(inode); + if (pplug != NULL && pplug->clear != NULL) + pplug->clear(inode); +} + +reiser4_internal int prepare_write_common ( + struct file * file, struct page * page, unsigned from, unsigned to) +{ + int result; + file_plugin *fplug; + struct inode *inode; + + assert("umka-3099", file != NULL); + assert("umka-3100", page != NULL); + assert("umka-3095", PageLocked(page)); + + if (to - from == PAGE_CACHE_SIZE || PageUptodate(page)) /* range spans the whole page, or page is already up to date: nothing to read */ + return 0; + + inode = page->mapping->host; + fplug = inode_file_plugin(inode); + + if (fplug->readpage == NULL) + return RETERR(-EINVAL); + + result = fplug->readpage(file, page); + if (result != 0) { + SetPageError(page); + ClearPageUptodate(page); + /* All reiser4 readpage() implementations should return the + * page locked in case of error. */ + assert("nikita-3472", PageLocked(page)); + } else { + /* + * ->readpage() either: + * + * 1. starts IO against @page. @page is locked for IO in + * this case. + * + * 2. doesn't start IO. @page is unlocked. + * + * In either case, page should be locked. + */ + lock_page(page); + /* + * IO (if any) is completed at this point. Check for IO + * errors.
+ */ + if (!PageUptodate(page)) + result = RETERR(-EIO); + } + assert("umka-3098", PageLocked(page)); + return result; +} + +reiser4_internal int +key_by_inode_and_offset_common(struct inode *inode, loff_t off, reiser4_key *key) +{ + key_init(key); + set_key_locality(key, reiser4_inode_data(inode)->locality_id); + set_key_ordering(key, get_inode_ordering(inode)); + set_key_objectid(key, get_inode_oid(inode));/*FIXME: inode->i_ino */ + set_key_type(key, KEY_BODY_MINOR); + set_key_offset(key, (__u64) off); + return 0; +} + +#if defined(XATTR) +/* from xattr.c */ +extern xattr_list_head xattr_common_namespaces; +#endif + +/* + * Definitions of object plugins, indexed by file plugin id. + */ + +file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = { + [UNIX_FILE_PLUGIN_ID] = { + .h = { + .type_id = REISER4_FILE_PLUGIN_TYPE, + .id = UNIX_FILE_PLUGIN_ID, + .pops = NULL, + .label = "reg", + .desc = "regular file", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .open = NULL, + .truncate = truncate_unix_file, + .write_sd_by_inode = write_sd_by_inode_common, + .capturepage = capturepage_unix_file, + .readpage = readpage_unix_file, + .capture = capture_unix_file, + .read = read_unix_file, + .write = write_unix_file, + .release = release_unix_file, + .ioctl = ioctl_unix_file, + .mmap = mmap_unix_file, + .get_block = get_block_unix_file, + .flow_by_inode = flow_by_inode_unix_file, + .key_by_inode = key_by_inode_unix_file, + .set_plug_in_inode = set_plug_in_inode_common, + .adjust_to_parent = adjust_to_parent_common, + .create = create_common, + .delete = delete_file_common, + .add_link = add_link_common, + .rem_link = rem_link_common, + .owns_item = owns_item_unix_file, + .can_add_link = can_add_link_common, + .can_rem_link = NULL, + .not_linked = not_linked_common, + .setattr = setattr_unix_file, + .getattr = getattr_common, + .seek = NULL, + .detach = detach_common, + .bind = bind_common, + .safelink = safelink_unix_file, + .estimate = { + .create = estimate_create_file_common, + .update =
estimate_update_common, + .unlink = estimate_unlink_common + }, +#if defined(XATTR) + .xattr = { + .set = xattr_set_common, + .get = xattr_get_common, + .list = xattr_list_common, + .remove = xattr_remove_common, + .ns = &xattr_common_namespaces + }, +#endif + .readpages = readpages_unix_file, + .init_inode_data = init_inode_data_unix_file, + .pre_delete = pre_delete_unix_file, + .drop = drop_common, + .delete_inode = delete_inode_common, + .forget_inode = forget_inode_common, + .clear_inode = clear_inode_common, + .sendfile = sendfile_unix_file, + .prepare_write = prepare_write_unix_file + }, + [DIRECTORY_FILE_PLUGIN_ID] = { /* truncate, readpage, read, write, ioctl and mmap are the eisdir stub */ + .h = { + .type_id = REISER4_FILE_PLUGIN_TYPE, + .id = DIRECTORY_FILE_PLUGIN_ID, + .pops = NULL, + .label = "dir", + .desc = "directory", + .linkage = TYPE_SAFE_LIST_LINK_ZERO}, + .open = NULL, + .truncate = eisdir, + .write_sd_by_inode = write_sd_by_inode_common, + .capturepage = NULL, + .readpage = eisdir, + .capture = NULL, + .read = eisdir, + .write = eisdir, + .release = release_dir, + .ioctl = eisdir, + .mmap = eisdir, + .get_block = NULL, + .flow_by_inode = NULL, + .key_by_inode = NULL, + .set_plug_in_inode = set_plug_in_inode_common, + .adjust_to_parent = adjust_to_parent_dir, + .create = create_common, + .delete = delete_directory_common, + .add_link = add_link_common, + .rem_link = rem_link_common, + .owns_item = owns_item_hashed, + .can_add_link = can_add_link_common, + .can_rem_link = can_rem_dir, + .not_linked = not_linked_dir, + .setattr = setattr_common, + .getattr = getattr_common, + .seek = seek_dir, + .detach = detach_dir, + .bind = bind_dir, + .safelink = safelink_common, + .estimate = { + .create = estimate_create_dir_common, + .update = estimate_update_common, + .unlink = estimate_unlink_dir_common + }, +#if defined(XATTR) + .xattr = { + .set = xattr_set_common, + .get = xattr_get_common, + .list = xattr_list_common, + .remove = xattr_remove_common, + .ns = &xattr_common_namespaces + }, +#endif + .readpages = NULL, +
.init_inode_data = init_inode_ordering, + .pre_delete = NULL, + .drop = drop_common, + .delete_inode = delete_inode_common, + .forget_inode = forget_inode_common, + .clear_inode = clear_inode_common + }, + [SYMLINK_FILE_PLUGIN_ID] = { /* truncate, readpage, read, write, ioctl and mmap are the eperm stub */ + .h = { + .type_id = REISER4_FILE_PLUGIN_TYPE, + .id = SYMLINK_FILE_PLUGIN_ID, + .pops = NULL, + .label = "symlink", + .desc = "symbolic link", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .open = NULL, + .truncate = eperm, + .write_sd_by_inode = write_sd_by_inode_common, + .capturepage = NULL, + .readpage = eperm, + .capture = NULL, + .read = eperm, + .write = eperm, + .release = NULL, + .ioctl = eperm, + .mmap = eperm, + .get_block = NULL, + .flow_by_inode = NULL, + .key_by_inode = NULL, + .set_plug_in_inode = set_plug_in_inode_common, + .adjust_to_parent = adjust_to_parent_common, + .create = create_symlink, + /* FIXME-VS: symlink should probably have its own destroy + * method */ + .delete = delete_file_common, + .add_link = add_link_common, + .rem_link = rem_link_common, + .owns_item = NULL, + .can_add_link = can_add_link_common, + .can_rem_link = NULL, + .not_linked = not_linked_common, + .setattr = setattr_common, + .getattr = getattr_common, + .seek = NULL, + .detach = detach_common, + .bind = bind_common, + .safelink = safelink_common, + .estimate = { + .create = estimate_create_file_common, + .update = estimate_update_common, + .unlink = estimate_unlink_common + }, +#if defined(XATTR) + .xattr = { + .set = xattr_set_common, + .get = xattr_get_common, + .list = xattr_list_common, + .remove = xattr_remove_common, + .ns = &xattr_common_namespaces + }, +#endif + .readpages = NULL, + .init_inode_data = init_inode_ordering, + .pre_delete = NULL, + .drop = drop_common, + .delete_inode = delete_inode_common, + .forget_inode = forget_inode_common, + .clear_inode = clear_inode_common + }, + [SPECIAL_FILE_PLUGIN_ID] = { /* truncate, readpage, read, write, ioctl and mmap are the eperm stub */ + .h = { + .type_id = REISER4_FILE_PLUGIN_TYPE, + .id = SPECIAL_FILE_PLUGIN_ID, + .pops = NULL, + .label =
"special", + .desc = "special: fifo, device or socket", + .linkage = TYPE_SAFE_LIST_LINK_ZERO} + , + .open = NULL, + .truncate = eperm, + .create = create_common, + .write_sd_by_inode = write_sd_by_inode_common, + .capturepage = NULL, + .readpage = eperm, + .capture = NULL, + .read = eperm, + .write = eperm, + .release = NULL, + .ioctl = eperm, + .mmap = eperm, + .get_block = NULL, + .flow_by_inode = NULL, + .key_by_inode = NULL, + .set_plug_in_inode = set_plug_in_inode_common, + .adjust_to_parent = adjust_to_parent_common, + .delete = delete_file_common, + .add_link = add_link_common, + .rem_link = rem_link_common, + .owns_item = owns_item_common, + .can_add_link = can_add_link_common, + .can_rem_link = NULL, + .not_linked = not_linked_common, + .setattr = setattr_common, + .getattr = getattr_common, + .seek = NULL, + .detach = detach_common, + .bind = bind_common, + .safelink = safelink_common, + .estimate = { + .create = estimate_create_file_common, + .update = estimate_update_common, + .unlink = estimate_unlink_common + }, +#if defined(XATTR) + .xattr = { + .set = xattr_set_common, + .get = xattr_get_common, + .list = xattr_list_common, + .remove = xattr_remove_common, + .ns = &xattr_common_namespaces + }, +#endif + .readpages = NULL, + .init_inode_data = init_inode_ordering, + .pre_delete = NULL, + .drop = drop_common, + .delete_inode = delete_inode_common, + .forget_inode = forget_inode_common, + .clear_inode = clear_inode_common + }, + [PSEUDO_FILE_PLUGIN_ID] = { + .h = { + .type_id = REISER4_FILE_PLUGIN_TYPE, + .id = PSEUDO_FILE_PLUGIN_ID, + .pops = NULL, + .label = "pseudo", + .desc = "pseudo file", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .open = open_pseudo, + .truncate = eperm, + .write_sd_by_inode = eperm, + .readpage = eperm, + .capturepage = NULL, + .capture = NULL, + .read = read_pseudo, + .write = write_pseudo, + .release = release_pseudo, + .ioctl = eperm, + .mmap = eperm, + .get_block = eperm, + .flow_by_inode = NULL, + .key_by_inode = NULL,
+ .set_plug_in_inode = set_plug_in_inode_common, + .adjust_to_parent = NULL, + .create = NULL, + .delete = eperm, + .add_link = NULL, + .rem_link = NULL, + .owns_item = NULL, + .can_add_link = cannot, + .can_rem_link = cannot, + .not_linked = NULL, + .setattr = inode_setattr, + .getattr = getattr_common, + .seek = seek_pseudo, + .detach = detach_common, + .bind = bind_common, + .safelink = NULL, + .estimate = { + .create = NULL, + .update = NULL, + .unlink = NULL + }, +#if defined(XATTR) + .xattr = { + .set = NULL, + .get = NULL, + .list = NULL, + .remove = NULL, + .ns = NULL + }, +#endif + .readpages = NULL, + .init_inode_data = NULL, + .pre_delete = NULL, + .drop = drop_pseudo, + .delete_inode = NULL, + .forget_inode = NULL, + .clear_inode = NULL + }, + [CRC_FILE_PLUGIN_ID] = { + .h = { + .type_id = REISER4_FILE_PLUGIN_TYPE, + .id = CRC_FILE_PLUGIN_ID, + .pops = &cryptcompress_plugin_ops, + .label = "cryptcompress", + .desc = "cryptcompress file", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + /* FIXME: check which of these are really needed */ + .open = NULL, + .truncate = truncate_cryptcompress, + .write_sd_by_inode = write_sd_by_inode_common, + .readpage = readpage_cryptcompress, + .capturepage = NULL, + .capture = capture_cryptcompress, + .read = generic_file_read, + .write = write_cryptcompress, + .release = NULL, + .ioctl = NULL, + .mmap = generic_file_mmap, + .get_block = get_block_cryptcompress, + .flow_by_inode = flow_by_inode_cryptcompress, + .key_by_inode = key_by_inode_cryptcompress, + .set_plug_in_inode = set_plug_in_inode_common, + .adjust_to_parent = adjust_to_parent_common, + .create = create_cryptcompress, + .delete = delete_cryptcompress, + .add_link = add_link_common, + .rem_link = rem_link_common, + .owns_item = owns_item_common, + .can_add_link = can_add_link_common, + .can_rem_link = NULL, + .not_linked = not_linked_common, + .setattr = setattr_cryptcompress, + .getattr = getattr_common, + .seek = NULL, + .detach = detach_common, + .bind =
bind_common, + .safelink = safelink_common, + .estimate = { + .create = estimate_create_file_common, + .update = estimate_update_common, + .unlink = estimate_unlink_common + }, +#if defined(XATTR) + .xattr = { + .set = xattr_set_common, + .get = xattr_get_common, + .list = xattr_list_common, + .remove = xattr_remove_common, + .ns = &xattr_common_namespaces + }, +#endif + .readpages = readpages_cryptcompress, + .init_inode_data = NULL, + .pre_delete = pre_delete_cryptcompress, + .drop = drop_common, + .delete_inode = delete_inode_common, + .forget_inode = forget_inode_common, + .clear_inode = clear_inode_common, + .sendfile = sendfile_common, + .prepare_write = prepare_write_common + } +}; + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/object.h linux-2.6.4-ck1/fs/reiser4/plugin/object.h --- linux-2.6.4/fs/reiser4/plugin/object.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/object.h 2004-03-11 22:45:15.351500568 +1100 @@ -0,0 +1,42 @@ +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Declaration of object plugin functions.
*/ + +#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ ) +#define __FS_REISER4_PLUGIN_OBJECT_H__ + +#include "../forward.h" + +#include <linux/fs.h> /* for struct inode */ +#include <linux/types.h> /* NOTE(review): both header names were lost in extraction and are restored here; this one presumably supplied __u64 and loff_t -- confirm against upstream */ + +extern int lookup_sd(struct inode *inode, znode_lock_mode lock_mode, + coord_t * coord, lock_handle * lh, const reiser4_key * key, + int silent); +extern int guess_plugin_by_mode(struct inode *inode); + +extern int delete_file_common(struct inode *inode); +extern int write_sd_by_inode_common(struct inode *inode); +extern int owns_item_common(const struct inode *inode, + const coord_t * coord); +extern reiser4_block_nr estimate_update_common(const struct inode *inode); +extern int safelink_common(struct inode *object, + reiser4_safe_link_t link, __u64 value); +extern int prepare_write_common (struct file *, struct page *, unsigned, unsigned); +extern int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *); + +extern reiser4_plugin_ops cryptcompress_plugin_ops; + +/* __FS_REISER4_PLUGIN_OBJECT_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/plugin.c linux-2.6.4-ck1/fs/reiser4/plugin/plugin.c --- linux-2.6.4/fs/reiser4/plugin/plugin.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/plugin.c 2004-03-11 22:45:15.353500257 +1100 @@ -0,0 +1,656 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Basic plugin infrastructure, lookup etc. */ + +/* PLUGINS: + + Plugins are internal Reiser4 "modules" or "objects" used to increase + extensibility and allow external users to easily adapt reiser4 to + their needs. + + Plugins are classified into several disjoint "types". Plugins + belonging to the particular plugin type are termed "instances" of + this type. Currently the following types are present: + + . object plugin + . hash plugin + . tail plugin + .
perm plugin + . item plugin + . node layout plugin + + Object (file) plugin determines how given file-system object serves + standard VFS requests for read, write, seek, mmap etc. Instances of + file plugins are: regular file, directory, symlink. Another example + of file plugin is audit plugin, that optionally records accesses to + underlying object and forwards requests to it. + + Hash plugins compute hashes used by reiser4 to store and locate + files within directories. Instances of hash plugin type are: r5, + tea, rupasov. + + Tail plugins (or, more precisely, tail policy plugins) determine + when last part of the file should be stored in a direct item. + + Perm plugins control permissions granted for process accessing file. + + Scope and lookup: + + Each plugin type and each plugin has a label, such that pair ( type_label, plugin_label ) is unique. This + pair is a globally persistent and user-visible plugin + identifier. Internally kernel maintains plugins and plugin types in + arrays using an index into those arrays as plugin and plugin type + identifiers. File-system in turn, also maintains persistent + "dictionary" which is mapping from plugin label to numerical + identifier which is stored in file-system objects. That is, we + store the offset into the plugin array for that plugin type as the + plugin id in the stat data of the filesystem object. + + plugin_labels have meaning for the user interface that assigns + plugins to files, and may someday have meaning for dynamic loading of + plugins and for copying of plugins from one fs instance to + another by utilities like cp and tar. + + Internal kernel plugin type identifier (index in plugins[] array) is + of type reiser4_plugin_type. Set of available plugin types is + currently static, but dynamic loading doesn't seem to pose + insurmountable problems. + + Within each type plugins are addressed by the identifiers of type + reiser4_plugin_id (indices in + reiser4_plugin_type_data.builtin[]).
Such identifiers are only + required to be unique within one type, not globally. + + Thus, plugin in memory is uniquely identified by the pair (type_id, + id). Each plugin is either builtin, or dynamic. Builtin plugins are + the ones required to provide standard file-system semantics and are + hard-coded into kernel image, or reiser4 module. Dynamic plugins, on + the other hand, are loaded as modules on demand. + + NOTE: dynamic plugin loading will be deferred until some future version + or until we have enough time to implement it efficiently. + + Usage: + + There exists only one instance of each plugin instance, but this + single instance can be associated with many entities (file-system + objects, items, nodes, transactions, file-descriptors etc.). Entity + to which plugin of given type is attached is termed (due to the lack of + imagination) "subject" of this plugin type and, by abuse of + terminology, subject of particular instance of this type to which + it's attached currently. For example, inode is subject of object + plugin type. Inode representing directory is subject of directory + plugin, hash plugin type and some particular instance of hash plugin + type. Inode, representing regular file is subject of "regular file" + plugin, tail-policy plugin type etc. + + With each subject plugin possibly stores some state. For example, + state of directory plugin (instance of object plugin type) is pointer + to hash plugin (if directories always use hashing that is). State of + audit plugin is file descriptor (struct file) of log file or some + magic value to do logging through printk(). + + Interface: + + In addition to a scalar identifier, each plugin type and plugin + proper has a "label": short string and a "description"---longer + descriptive string. Labels and descriptions of plugin types are + hard-coded into plugins[] array, declared and defined in + plugin.c. Label and description of plugin are stored in .label and + .desc fields of reiser4_plugin_header respectively.
It's possible to + locate plugin by the pair of labels. This is used to implement "plug" + mount option and ioctl(REISER4_IOC_SETPLG). If plugin with given + pair of labels is not found, code tries to load a certain module. Name + of this module is determined by request_plugin() function. For + example, for hash plugin with label "thash", module name would be + "reiserplug-hash-thash". After module requesting, lookup by labels is + repeated, so that if module registers itself through + reiser4_register_plugin() it will be found. + + NOTE: dynamic plugin loading will be deferred until some future version + or until we have enough time to implement it efficiently. + + Features: + + . user-level plugin manipulations: + + reiser4("filename/..file_plugin<='audit'"); + + write(open("filename/..file_plugin"), "audit", 8); + + . user level utilities lsplug and chplug to manipulate plugins. + Utilities are not of primary priority. Possibly they will not be + working in v4.0 + + . mount option "plug" to set-up plugins of root-directory. + "plug=foo:bar" will set "bar" as default plugin of type "foo". + + Limitations: + + . each plugin type has to provide at least one builtin + plugin. This is a technical limitation and it can be lifted in the + future. + + TODO: + + New plugin types/plugins: + Things we should be able to separately choose to inherit: + + security plugins + + stat data + + file bodies + + file plugins + + dir plugins + + . perm:acl + + d audi---audit plugin intercepting and possibly logging all + accesses to object. Requires putting stub functions in file_operations + instead of generic_file_*. + + . over---handle hash overflows + + . sqnt---handle different access patterns and instruments read-ahead + + . hier---handle inheritance of plugins along file-system hierarchy + + Different kinds of inheritance: on creation vs. on access. + Compatible/incompatible plugins. + Inheritance for multi-linked files. + Layered plugins. + Notion of plugin context is abandoned.
Each file is associated + with one plugin and dependent plugins (hash, etc.) are stored as + main plugin state. Now, if we have plugins used for regular files + but not for directories, how such plugins would be inherited? + . always store them with directories also + . use inheritance hierarchy, independent of file-system namespace + +*/ + +#include "../debug.h" +#include "../dformat.h" +#include "plugin_header.h" +#include "item/static_stat.h" +#include "node/node.h" +#include "security/perm.h" +#include "space/space_allocator.h" +#include "disk_format/disk_format.h" +#include "plugin.h" +#include "../reiser4.h" +#include "../jnode.h" + +#include <linux/fs.h> /* for struct super_block */ + +/* public interface */ + +/* initialise plugin sub-system. Just call this once on reiser4 startup. */ +int init_plugins(void); +int handle_default_plugin_option(char *option, reiser4_plugin ** area); +int setup_plugins(struct super_block *super, reiser4_plugin ** area); +reiser4_plugin *lookup_plugin(const char *type_label, const char *plug_label); +reiser4_plugin *lookup_plugin_name(char *plug_label); +int locate_plugin(struct inode *inode, plugin_locator * loc); /* defined only #if NOT_YET */ + +/* internal functions. */ + +static reiser4_plugin_type find_type(const char *label); +static reiser4_plugin *find_plugin(reiser4_plugin_type_data * ptype, const char *label); + +/* initialise plugin sub-system. Just call this once on reiser4 startup.
*/ +reiser4_internal int +init_plugins(void) +{ + reiser4_plugin_type type_id; + + ON_TRACE(TRACE_PLUGINS, "Builtin plugins:\n"); + for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) { + reiser4_plugin_type_data *ptype; + int i; + + ptype = &plugins[type_id]; + plugin_list_init(&ptype->plugins_list); + ON_TRACE(TRACE_PLUGINS, + "Of type %s (%s):\n", ptype->label, ptype->desc); + for (i = 0; i < ptype->builtin_num; ++i) { + reiser4_plugin *plugin; + + plugin = plugin_at(ptype, i); + + if (plugin->h.label == NULL) + /* uninitialized slot encountered: it gets no id and is not put on the per-type list */ + continue; + assert("nikita-3445", plugin->h.type_id == type_id); + plugin->h.id = i; /* plugin id is its slot index in the builtin array */ + IF_TRACE(TRACE_PLUGINS, print_plugin("\t", plugin)); + if (plugin->h.pops != NULL && + plugin->h.pops->init != NULL) { + int result; + + result = plugin->h.pops->init(plugin); + if (result != 0) + return result; /* first failing ->init() ends init_plugins() with that result */ + } + plugin_list_clean(plugin); + plugin_list_push_back(&ptype->plugins_list, plugin); + } + } + return 0; +} + +/* parse mount time option and update root-directory plugin + appropriately. @option is split in place at its first colon. Returns 0 on success, or -EINVAL (through RETERR) when the option has no colon, names an unknown plugin, or @area already holds a plugin of that type. */ +reiser4_internal int +handle_default_plugin_option(char *option, /* Option should has form + "type:label", where "type" + is label of plugin type and + "label" is label of plugin + instance within this + type.
*/ + reiser4_plugin ** area /* per-type plugin array, indexed by plugin type id; + * the chosen plugin is stored here */ ) +{ + char *type_label; + char *plug_label; + reiser4_plugin *plugin; + + assert("nikita-538", option != NULL); + assert("nikita-539", area != NULL); + + type_label = option; + plug_label = strchr(option, ':'); + if (plug_label == NULL) { + printk("Use 'plug=type:label'\n"); + return RETERR(-EINVAL); + } + + *plug_label = '\0'; /* cut @option in two: type_label now ends where the colon was */ + ++plug_label; + + plugin = lookup_plugin(type_label, plug_label); + if (plugin == NULL) { + printk("Unknown plugin: %s:%s\n", type_label, plug_label); + return RETERR(-EINVAL); + } + if (area[plugin->h.type_id] != NULL) { /* a plugin of this type is already set: refuse to override it */ + printk("Plugin already set\n"); + print_plugin("existing", area[plugin->h.type_id]); + print_plugin("new", plugin); + return RETERR(-EINVAL); + } + area[plugin->h.type_id] = plugin; + return 0; +} + +/* lookup plugin by its label alone: scan the plugin types in id order and return the first plugin whose label matches, or NULL if none does */ +reiser4_internal reiser4_plugin * +lookup_plugin_name(char *plug_label /* label to search for */ ) +{ + reiser4_plugin_type type_id; + reiser4_plugin *plugin; + + assert("vova-001", plug_label != NULL); + + plugin = NULL; + + dinfo("lookup_plugin_name: %s\n", plug_label); + + for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) { + plugin = find_plugin(&plugins[type_id], plug_label); + if (plugin != NULL) + break; + } + return plugin; +} + +/* true if plugin type id is valid, i.e. below REISER4_PLUGIN_TYPES */ +reiser4_internal int +is_type_id_valid(reiser4_plugin_type type_id /* plugin type id */) +{ + /* "type_id" is unsigned, so no comparison with 0 is + necessary */ + return (type_id < REISER4_PLUGIN_TYPES); +} + +/* true if plugin id is valid, i.e. 0 <= id < number of builtin slots of this type; @type_id itself must already be valid (asserted) */ +reiser4_internal int +is_plugin_id_valid(reiser4_plugin_type type_id /* plugin type id */ , + reiser4_plugin_id id /* plugin id */) +{ + assert("nikita-1653", is_type_id_valid(type_id)); + return ((id < plugins[type_id].builtin_num) && (id >= 0)); +} + +/* lookup plugin by type label and plugin label; returns NULL, after a printk, when either label is unknown */ +reiser4_internal reiser4_plugin * +lookup_plugin(const char *type_label /* plugin type label */ , +
const char *plug_label /* plugin label */ ) +{ + reiser4_plugin *result; + reiser4_plugin_type type_id; + + assert("nikita-546", type_label != NULL); + assert("nikita-547", plug_label != NULL); + + result = NULL; + type_id = find_type(type_label); /* an unknown label yields REISER4_PLUGIN_TYPES, which fails the validity check below */ + if (is_type_id_valid(type_id)) { + result = find_plugin(&plugins[type_id], plug_label); + if (result == NULL) + printk("Unknown plugin: %s\n", plug_label); + } else + printk("Unknown plugin type '%s'\n", type_label); + return result; +} + +#if NOT_YET +/* convert string labels to in-memory identifiers and vice versa. + Required for proper interaction with user-land */ +/* takes loc->type_label and loc->plug_label and fills in loc->type_id and loc->id */ + /* it is not necessary to have a non-empty type label to find a plugin + by the plug_label, but loc->type_id must then already be valid. NOTE(review): this disabled code calls type_by_label(), which is not visible in this file (find_type() looks like the intended helper), and each strncpy() below leaves its destination unterminated when the label reaches the MAX_..._LABEL_LEN limit - fix both before enabling. */ + +int +locate_plugin(struct inode *inode, plugin_locator * loc) +{ + reiser4_plugin_type type_id; + + assert("nikita-548", inode != NULL); + assert("nikita-549", loc != NULL); + + if (loc->type_label[0] != '\0') + loc->type_id = type_by_label(loc->type_label); + type_id = loc->type_id; + if (is_type_id_valid(type_id)) { + reiser4_plugin *plugin; + + if (loc->plug_label[0] != '\0') + plugin = find_plugin(&plugins[type_id], loc->plug_label); + else + plugin = reiser4_get_plugin(inode, type_id); + if (plugin == NULL) + return -ENOENT; + + strncpy(loc->plug_label, plugin->h.label, min(MAX_PLUGIN_PLUG_LABEL_LEN, strlen(plugin->h.label) + 1)); + if (loc->type_label[0] == '\0') + strncpy(loc->type_label, + plugins[type_id].label, + min(MAX_PLUGIN_TYPE_LABEL_LEN, strlen(plugins[type_id].label) + 1)); + loc->id = plugin->h.id; + return 0; + } else + return RETERR(-EINVAL); + +} +#endif + +/* return plugin by its @type_id and @id. + + Both arguments are checked for validness: this is supposed to be called + from user-level.
+*/ +reiser4_internal reiser4_plugin * +plugin_by_unsafe_id(reiser4_plugin_type type_id /* plugin + * type id, + * unchecked */ , + reiser4_plugin_id id /* plugin id, + * unchecked */ ) +{ + if (is_type_id_valid(type_id)) { + if (is_plugin_id_valid(type_id, id)) + return plugin_at(&plugins[type_id], id); + else + /* id out of bounds */ + warning("nikita-2913", + "Invalid plugin id: [%i:%i]", type_id, id); + } else + /* type_id out of bounds */ + warning("nikita-2914", "Invalid type_id: %i", type_id); + return NULL; /* reached only after one of the two warnings above */ +} + +/* convert plugin id to the disk format: plugin->h.id is cast to __u16 and stored into @area through cputod16(). Always returns 0. */ +reiser4_internal int +save_plugin_id(reiser4_plugin * plugin /* plugin to convert */ , + d16 * area /* where to store result */ ) +{ + assert("nikita-1261", plugin != NULL); + assert("nikita-1262", area != NULL); + + cputod16((__u16) plugin->h.id, area); + return 0; +} + +/* list of all plugins of given type, i.e. the per-type list that init_plugins() fills; @type_id must be valid (asserted) */ +reiser4_internal plugin_list_head * +get_plugin_list(reiser4_plugin_type type_id /* plugin type + * id */ ) +{ + assert("nikita-1056", is_type_id_valid(type_id)); + return &plugins[type_id].plugins_list; +} + +#if REISER4_DEBUG_OUTPUT +/* print human readable plugin information: description, label and id, or "(nil)" for a NULL @plugin */ +reiser4_internal void +print_plugin(const char *prefix /* prefix to print */ , + reiser4_plugin * plugin /* plugin to print */ ) +{ + if (plugin != NULL) { + printk("%s: %s (%s:%i)\n", prefix, plugin->h.desc, plugin->h.label, plugin->h.id); + } else + printk("%s: (nil)\n", prefix); +} + +#endif + +/* find plugin type by label. Returns REISER4_PLUGIN_TYPES (not a valid type id) when no plugin type has this label. */ +static reiser4_plugin_type +find_type(const char *label /* plugin type + * label */ ) +{ + reiser4_plugin_type type_id; + + assert("nikita-550", label != NULL); + + for (type_id = 0; (type_id < REISER4_PLUGIN_TYPES) && strcmp(label, plugins[type_id].label); ++type_id) {; + } + return type_id; +} + +/* given plugin label find it within given plugin type by scanning + array.
Used to map user-visible symbolic name to internal kernel + id. Slots whose label is NULL (the uninitialized slots that init_plugins() also skips) never match and are not passed to strcmp(). Returns NULL when no plugin of this type has the given label. */ +static reiser4_plugin * +find_plugin(reiser4_plugin_type_data * ptype /* plugin + * type to + * find + * plugin + * within */ , + const char *label /* plugin label */ ) +{ + int i; + reiser4_plugin *result; + + assert("nikita-551", ptype != NULL); + assert("nikita-552", label != NULL); + + for (i = 0; i < ptype->builtin_num; ++i) { + result = plugin_at(ptype, i); + if (result->h.label != NULL && !strcmp(result->h.label, label)) + return result; + } + return NULL; +} + +/* defined in fs/reiser4/plugin/file.c */ +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID]; +/* defined in fs/reiser4/plugin/dir.c */ +extern dir_plugin dir_plugins[LAST_DIR_ID]; +/* defined in fs/reiser4/plugin/item/static_stat.c */ +extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION]; +/* defined in fs/reiser4/plugin/hash.c */ +extern hash_plugin hash_plugins[LAST_HASH_ID]; +/* defined in fs/reiser4/plugin/crypt.c */ +extern crypto_plugin crypto_plugins[LAST_CRYPTO_ID]; +/* defined in fs/reiser4/plugin/digest.c */ +extern digest_plugin digest_plugins[LAST_DIGEST_ID]; +/* defined in fs/reiser4/plugin/compress.c */ +extern compression_plugin compression_plugins[LAST_COMPRESSION_ID]; +/* defined in fs/reiser4/plugin/tail.c */ +extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID]; +/* defined in fs/reiser4/plugin/security/security.c */ +extern perm_plugin perm_plugins[LAST_PERM_ID]; +/* defined in fs/reiser4/plugin/item/item.c */ +extern item_plugin item_plugins[LAST_ITEM_ID]; +/* defined in fs/reiser4/plugin/node/node.c */ +extern node_plugin node_plugins[LAST_NODE_ID]; +/* defined in fs/reiser4/plugin/disk_format/disk_format.c */ +extern disk_format_plugin format_plugins[LAST_FORMAT_ID]; +/* defined in jnode.c */ +extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE]; +/* defined in plugin/pseudo.c */ +extern pseudo_plugin pseudo_plugins[LAST_PSEUDO_ID]; + +reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = { + /* C99 designated initializers */
+ [REISER4_FILE_PLUGIN_TYPE] = { + .type_id = REISER4_FILE_PLUGIN_TYPE, + .label = "file", /* type label; find_type() compares against it */ + .desc = "Object plugins", /* description, printed by the tracing in init_plugins() */ + .builtin_num = sizeof_array(file_plugins), /* number of builtin slots scanned by init_plugins() and find_plugin() */ + .builtin = file_plugins, /* array holding the builtin plugins of this type */ + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, /* initialised again by init_plugins() before it is filled */ + .size = sizeof (file_plugin) /* size of one element of ->builtin */ + }, + [REISER4_DIR_PLUGIN_TYPE] = { + .type_id = REISER4_DIR_PLUGIN_TYPE, + .label = "dir", + .desc = "Directory plugins", + .builtin_num = sizeof_array(dir_plugins), + .builtin = dir_plugins, + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, + .size = sizeof (dir_plugin) + }, + [REISER4_HASH_PLUGIN_TYPE] = { + .type_id = REISER4_HASH_PLUGIN_TYPE, + .label = "hash", + .desc = "Directory hashes", + .builtin_num = sizeof_array(hash_plugins), + .builtin = hash_plugins, + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, + .size = sizeof (hash_plugin) + }, + [REISER4_CRYPTO_PLUGIN_TYPE] = { + .type_id = REISER4_CRYPTO_PLUGIN_TYPE, + .label = "crypto", + .desc = "Crypto plugins", + .builtin_num = sizeof_array(crypto_plugins), + .builtin = crypto_plugins, + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, + .size = sizeof (crypto_plugin) + }, + [REISER4_DIGEST_PLUGIN_TYPE] = { + .type_id = REISER4_DIGEST_PLUGIN_TYPE, + .label = "digest", + .desc = "Digest plugins", + .builtin_num = sizeof_array(digest_plugins), + .builtin = digest_plugins, + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, + .size = sizeof (digest_plugin) + }, + [REISER4_COMPRESSION_PLUGIN_TYPE] = { + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE, + .label = "compression", + .desc = "Compression plugins", + .builtin_num = sizeof_array(compression_plugins), + .builtin = compression_plugins, + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, + .size = sizeof (compression_plugin) + }, + + [REISER4_FORMATTING_PLUGIN_TYPE] = { + .type_id = REISER4_FORMATTING_PLUGIN_TYPE, + .label = "tail", + .desc = "Tail inlining policies", + .builtin_num = sizeof_array(formatting_plugins), + .builtin = formatting_plugins, + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, + .size = sizeof
(formatting_plugin) + }, + [REISER4_PERM_PLUGIN_TYPE] = { + .type_id = REISER4_PERM_PLUGIN_TYPE, + .label = "perm", + .desc = "Permission checks", + .builtin_num = sizeof_array(perm_plugins), + .builtin = perm_plugins, + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, + .size = sizeof (perm_plugin) + }, + [REISER4_ITEM_PLUGIN_TYPE] = { + .type_id = REISER4_ITEM_PLUGIN_TYPE, + .label = "item", + .desc = "Item handlers", + .builtin_num = sizeof_array(item_plugins), + .builtin = item_plugins, + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, + .size = sizeof (item_plugin) + }, + [REISER4_NODE_PLUGIN_TYPE] = { + .type_id = REISER4_NODE_PLUGIN_TYPE, + .label = "node", + .desc = "node layout handlers", + .builtin_num = sizeof_array(node_plugins), + .builtin = node_plugins, + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, + .size = sizeof (node_plugin) + }, + [REISER4_SD_EXT_PLUGIN_TYPE] = { + .type_id = REISER4_SD_EXT_PLUGIN_TYPE, + .label = "sd_ext", + .desc = "Parts of stat-data", + .builtin_num = sizeof_array(sd_ext_plugins), + .builtin = sd_ext_plugins, + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, + .size = sizeof (sd_ext_plugin) + }, + [REISER4_FORMAT_PLUGIN_TYPE] = { + .type_id = REISER4_FORMAT_PLUGIN_TYPE, + .label = "disk layout", + .desc = "defines filesystem on disk layout", + .builtin_num = sizeof_array(format_plugins), + .builtin = format_plugins, + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, + .size = sizeof (disk_format_plugin) + }, + [REISER4_JNODE_PLUGIN_TYPE] = { + .type_id = REISER4_JNODE_PLUGIN_TYPE, + .label = "jnode flavor", + .desc = "defines kind of jnode", + .builtin_num = sizeof_array(jnode_plugins), + .builtin = jnode_plugins, + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, + .size = sizeof (jnode_plugin) + }, + [REISER4_PSEUDO_PLUGIN_TYPE] = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .label = "pseudo file", + .desc = "pseudo file", + .builtin_num = sizeof_array(pseudo_plugins), + .builtin = pseudo_plugins, + .plugins_list = TYPE_SAFE_LIST_HEAD_ZERO, + .size = sizeof
(pseudo_plugin) + } +}; + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/plugin.h linux-2.6.4-ck1/fs/reiser4/plugin/plugin.h --- linux-2.6.4/fs/reiser4/plugin/plugin.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/plugin.h 2004-03-11 22:45:15.355499946 +1100 @@ -0,0 +1,813 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Basic plugin data-types. + see fs/reiser4/plugin/plugin.c for details */ + +#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ ) +#define __FS_REISER4_PLUGIN_TYPES_H__ + +#include "../forward.h" +#include "../debug.h" +#include "../dformat.h" +#include "../key.h" +#include "../type_safe_list.h" +#include "plugin_header.h" +#include "item/static_stat.h" +#include "item/internal.h" +#include "item/sde.h" +#include "item/cde.h" +#include "file/file.h" +#include "pseudo/pseudo.h" +#include "symlink.h" +#include "dir/hashed_dir.h" +#include "dir/dir.h" +#include "item/item.h" +#include "node/node.h" +#include "node/node40.h" +#include "security/perm.h" + + +#include "space/bitmap.h" +#include "space/space_allocator.h" + +#include "disk_format/disk_format40.h" +#include "disk_format/disk_format.h" + +#if defined(XATTR) +#include "xattr.h" +#endif + +#include <linux/fs.h> /* for struct super_block, address_space */ +#include <linux/mm.h> /* for struct page */ +#include <linux/buffer_head.h> /* for struct buffer_head */ +#include <linux/dcache.h> /* for struct dentry */ +#include <linux/types.h> /* NOTE(review): the header name was lost from this directive; linux/types.h is a reconstruction - confirm against upstream */ + +/* a flow is a sequence of bytes being written to or read from the tree. The + tree will slice the flow into items while storing it into nodes, but all of + that is hidden from anything outside the tree.
*/ + +struct flow { + reiser4_key key; /* key of start of flow's sequence of bytes */ + loff_t length; /* length of flow's sequence of bytes */ + char *data; /* start of flow's sequence of bytes */ + int user; /* if 1 data is user space, 0 - kernel space */ + rw_op op; /* NOTE(review): undocumented; presumably tells a read from a write - confirm against the definition of rw_op */ +}; + +typedef ssize_t(*rw_f_type) (struct file * file, flow_t * a_flow, loff_t * off); /* function type taking a file, a flow and a file offset; NOTE(review): its users are not visible here */ + +/* File plugin. Defines the set of methods that file plugins implement, some of which are optional. + + A file plugin offers to the caller an interface for IO ( writing to and/or reading from) to what the caller sees as one + sequence of bytes. An IO to it may affect more than one physical sequence of bytes, or no physical sequence of bytes, + it may affect sequences of bytes offered by other file plugins to the semantic layer, and the file plugin may invoke + other plugins and delegate work to them, but its interface is structured for offering the caller the ability to read + and/or write what the caller sees as being a single sequence of bytes. + + The file plugin must present a sequence of bytes to the caller, but it does not necessarily have to store a sequence of + bytes, it does not necessarily have to support efficient tree traversal to any offset in the sequence of bytes (tail + and extent items, whose keys contain offsets, do however provide efficient non-sequential lookup of any offset in the + sequence of bytes). + + Directory plugins provide methods for selecting file plugins by resolving a name for them. + + The functionality other filesystems call an attribute, and rigidly tie together, we decompose into orthogonal + selectable features of files. Using the terminology we will define next, an attribute is a perhaps constrained, + perhaps static length, file whose parent has a uni-count-intra-link to it, which might be grandparent-major-packed, and + whose parent has a deletion method that deletes it. + + File plugins implement constraints. + + Files can be of variable length (e.g.
regular unix files), or of static length (e.g. static sized attributes). + + An object may have many sequences of bytes, and many file plugins, but, it has exactly one objectid. It is usually + desirable that an object has a deletion method which deletes every item with that objectid. Items cannot in general be + found by just their objectids. This means that an object must have either a method built into its deletion plugin + method for knowing what items need to be deleted, or links stored with the object that provide the plugin with a method + for finding those items. Deleting a file within an object may or may not have the effect of deleting the entire + object, depending on the file plugin's deletion method. + + LINK TAXONOMY: + + Many objects have a reference count, and when the reference count reaches 0 the object's deletion method is invoked. + Some links embody a reference count increase ("countlinks"), and others do not ("nocountlinks"). + + Some links are bi-directional links ("bilinks"), and some are uni-directional("unilinks"). + + Some links are between parts of the same object ("intralinks"), and some are between different objects ("interlinks"). + + PACKING TAXONOMY: + + Some items of an object are stored with a major packing locality based on their object's objectid (e.g. unix directory + items in plan A), and these are called "self-major-packed". + + Some items of an object are stored with a major packing locality based on their semantic parent object's objectid + (e.g. unix file bodies in plan A), and these are called "parent-major-packed". + + Some items of an object are stored with a major packing locality based on their semantic grandparent, and these are + called "grandparent-major-packed". Now carefully notice that we run into trouble with key length if we have to store a + 8 byte major+minor grandparent based packing locality, an 8 byte parent objectid, an 8 byte attribute objectid, and an + 8 byte offset, all in a 24 byte key. 
One of these fields must be sacrificed if an item is to be + grandparent-major-packed, and which to sacrifice is left to the item author choosing to make the item + grandparent-major-packed. You cannot make tail items and extent items grandparent-major-packed, though you could make + them self-major-packed (usually they are parent-major-packed). + + In the case of ACLs (which are composed of fixed length ACEs which consist of {subject-type, + subject, and permission bitmask} triples), it makes sense to not have an offset field in the ACE item key, and to allow + duplicate keys for ACEs. Thus, the set of ACES for a given file is found by looking for a key consisting of the + objectid of the grandparent (thus grouping all ACLs in a directory together), the minor packing locality of ACE, the + objectid of the file, and 0. + + IO involves moving data from one location to another, which means that two locations must be specified, source and + destination. + + This source and destination can be in the filesystem, or they can be a pointer in the user process address space plus a byte count. + + If both source and destination are in the filesystem, then at least one of them must be representable as a pure stream + of bytes (which we call a flow, and define as a struct containing a key, a data pointer, and a length). This may mean + converting one of them into a flow. We provide a generic cast_into_flow() method, which will work for any plugin + supporting read_flow(), though it is inefficiently implemented in that it temporarily stores the flow in a buffer + (Question: what to do with huge flows that cannot fit into memory? Answer: we must not convert them all at once. ) + + Performing a write requires resolving the write request into a flow defining the source, and a method that performs the write, and + a key that defines where in the tree the write is to go. 
+ + Performing a read requires resolving the read request into a flow defining the target, and a method that performs the + read, and a key that defines where in the tree the read is to come from. + + There will exist file plugins which have no pluginid stored on the disk for them, and which are only invoked by other + plugins. + +*/ +typedef struct file_plugin { + + /* generic fields */ + plugin_header h; + + /* file_operations->open is dispatched here */ + int (*open) (struct inode * inode, struct file * file); + + int (*truncate) (struct inode * inode, loff_t size); /* NOTE(review): undocumented; presumably sets the file size to @size - confirm */ + + /* save inode cached stat-data onto disk. It was called + reiserfs_update_sd() in 3.x */ + int (*write_sd_by_inode) (struct inode * inode); + int (*readpage) (void *, struct page *); + int (*prepare_write) (struct file *, struct page *, unsigned, unsigned); + + /* captures passed page to current atom and takes care of extent handling. + This is needed for loop-back device support and used from ->commit_write() */ + int (*capturepage) (struct page *); + /* + * add pages created through mmap into object. + */ + int (*capture) (struct inode *inode, struct writeback_control *wbc); + /* these should be implemented using body_read_flow and body_write_flow + builtins */ + ssize_t(*read) (struct file * file, char *buf, size_t size, loff_t * off); + ssize_t(*write) (struct file * file, const char *buf, size_t size, loff_t * off); + + int (*release) (struct inode *inode, struct file * file); + int (*ioctl) (struct inode *, struct file *, unsigned int cmd, unsigned long arg); + int (*mmap) (struct file * file, struct vm_area_struct * vma); + int (*get_block) (struct inode * inode, sector_t block, struct buffer_head * bh_result, int create); +/* private methods: These are optional. If used they will allow you to + minimize the amount of code needed to implement a deviation from some other + method that also uses them. */ + + /* Construct flow into @flow according to user-supplied data.
+ + This is used by read/write methods to construct a flow to + write/read. ->flow_by_inode() is a plugin method, rather than a single + global implementation, because the key in a flow used by a plugin may + depend on data in @buf. + */ + int (*flow_by_inode) (struct inode *, char *buf, int user, loff_t size, loff_t off, rw_op op, flow_t *); + + /* Return the key used to retrieve an offset of a file. It is used by + default implementation of ->flow_by_inode() method + (common_build_flow()) and, among other things, to get to the extent + from jnode of unformatted node. + */ + int (*key_by_inode) (struct inode * inode, loff_t off, reiser4_key * key); + + /* set the plugin for a file. Called during file creation in creat() + but not reiser4() unless an inode already exists for the file. */ + int (*set_plug_in_inode) (struct inode * inode, struct inode * parent, reiser4_object_create_data * data); + + /* set up plugins for new @object created in @parent. @root is root + directory. */ + int (*adjust_to_parent) (struct inode * object, struct inode * parent, struct inode * root); + /* this does whatever is necessary to do when object is created. For + instance, for ordinary files stat data is inserted */ + int (*create) (struct inode * object, struct inode * parent, + reiser4_object_create_data * data); + /* delete empty object. This method should check REISER4_NO_SD + and set REISER4_NO_SD on success. Deletion of empty object + at least includes removal of stat-data if any. For directories this + also includes removal of dot and dot-dot. + */ + int (*delete) (struct inode * object); + + /* add link from @parent to @object */ + int (*add_link) (struct inode * object, struct inode * parent); + + /* remove link from @parent to @object */ + int (*rem_link) (struct inode * object, struct inode * parent); + + /* return true if item addressed by @coord belongs to @inode.
+ This is used by read/write to properly slice flow into items + in presence of multiple key assignment policies, because + items of a file are not necessarily contiguous in a key space, + for example, in a plan-b. */ + int (*owns_item) (const struct inode * inode, const coord_t * coord); + + /* checks whether yet another hard link to this object can be + added */ + int (*can_add_link) (const struct inode * inode); + /* checks whether hard links to this object can be removed */ + int (*can_rem_link) (const struct inode * inode); + /* true if there is only one link (aka name) for this file */ + int (*not_linked) (const struct inode * inode); + + /* change inode attributes. */ + int (*setattr) (struct inode * inode, struct iattr * attr); + + /* obtain inode attributes */ + int (*getattr) (struct vfsmount * mnt UNUSED_ARG, struct dentry * dentry, struct kstat * stat); + + /* seek */ + loff_t(*seek) (struct file * f, loff_t offset, int origin); + + int (*detach)(struct inode *child, struct inode *parent); /* NOTE(review): undocumented; looks like the counterpart of ->bind() below - confirm */ + + /* called when @child was just looked up in the @parent */ + int (*bind) (struct inode * child, struct inode * parent); + + /* process safe-link during mount */ + int (*safelink)(struct inode *object, reiser4_safe_link_t link, + __u64 value); + + /* A set of estimate methods for file operations; each returns a reiser4_block_nr */ + struct { + reiser4_block_nr (*create) (struct inode *); + reiser4_block_nr (*update) (const struct inode *); + reiser4_block_nr (*unlink) (struct inode *, struct inode *); + } estimate; + void (*readpages)(struct file *file, struct address_space *mapping, + struct list_head *pages); + /* reiser4 specific part of inode has a union of structures which are specific to a plugin. This method is + called when inode is read (read_inode) and when file is created (common_create_child) so that file plugin + could initialize its inode data */ + void (*init_inode_data)(struct inode *, reiser4_object_create_data *, int); + + /* truncate file to zero size.
Called by reiser4_drop_inode before truncate_inode_pages */ + int (*pre_delete)(struct inode *); + + /* called from reiser4_drop_inode() */ + void (*drop)(struct inode *); + + /* called from ->drop() when there are no links, and object should be + * garbage collected. */ + void (*delete_inode)(struct inode *); + void (*forget_inode)(struct inode *); + void (*clear_inode)(struct inode *); + ssize_t (*sendfile)(struct file *, loff_t *, size_t, read_actor_t, void __user *); +#if defined(XATTR) + struct { /* extended-attribute methods; this member exists only when XATTR is defined */ + int (*set) (struct dentry*, const char*,const void *,size_t,int); + ssize_t (*get) (struct dentry *, const char *, void *, size_t); + ssize_t (*list) (struct dentry *, char *, size_t); + int (*remove) (struct dentry *, const char *); + xattr_list_head *ns; + } xattr; +#endif +} file_plugin; + +typedef struct dir_plugin { + /* generic fields */ + plugin_header h; + /* find a name in a directory and the key of the object the name points to */ + int (*lookup_name) (struct inode * parent, struct dentry *, reiser4_key *); + /* for use by open call, based on name supplied will install + appropriate plugin and state information, into the inode such that + subsequent VFS operations that supply a pointer to that inode + operate in a manner appropriate. Note that this may require storing + some state for the plugin, and that this state might even include + the name used by open.
*/ + int (*lookup) (struct inode * parent_inode, struct dentry **dentry); + /* VFS required/defined operations below this line */ + int (*unlink) (struct inode * parent, struct dentry * victim); + int (*link) (struct inode * parent, struct dentry * existing, struct dentry * where); + /* rename object named by @old entry in @old_dir to be named by @new + entry in @new_dir */ + int (*rename) (struct inode * old_dir, struct dentry * old, struct inode * new_dir, struct dentry * new); + + /* create new object described by @data and add it to the @parent + directory under the name described by @dentry. NOTE(review): @parent and @dentry are not parameters of this method; presumably they are carried inside @data, and the new inode comes back through @retobj - confirm */ + int (*create_child) (reiser4_object_create_data * data, + struct inode ** retobj); + + /* readdir implementation */ + int (*readdir) (struct file * f, void *cookie, filldir_t filldir); + + /* private methods: These are optional. If used they will allow you to + minimize the amount of code needed to implement a deviation from + some other method that uses them. You could logically argue that + they should be a separate type of plugin. */ + + /* check whether "name" is an acceptable name to be inserted into + this object. Optionally implemented by directory-like objects. + Can check for maximal length, reserved symbols etc */ + int (*is_name_acceptable) (const struct inode * inode, const char *name, int len); + + void (*build_entry_key) (const struct inode * dir /* directory where + * entry is (or will + * be) in.*/ , + const struct qstr * name /* name of file referenced + * by this entry */ , + reiser4_key * result /* resulting key of directory + * entry */ ); + int (*build_readdir_key) (struct file * dir, reiser4_key * result); + int (*add_entry) (struct inode * object, struct dentry * where, + reiser4_object_create_data * data, reiser4_dir_entry_desc * entry); + + int (*rem_entry) (struct inode * object, struct dentry * where, reiser4_dir_entry_desc * entry); + + /* initialize directory structure for newly created object.
For normal + unix directories, insert dot and dotdot. */ + int (*init) (struct inode * object, struct inode * parent, reiser4_object_create_data * data); + /* destroy directory */ + int (*done) (struct inode * child); + + /* called when @subdir was just looked up in the @dir */ + int (*attach) (struct inode * subdir, struct inode * dir); + int (*detach)(struct inode * subdir, struct inode * dir); /* NOTE(review): undocumented; looks like the counterpart of ->attach() - confirm */ + + struct { /* estimate methods; each returns a reiser4_block_nr */ + reiser4_block_nr (*add_entry) (struct inode *node); + reiser4_block_nr (*rem_entry) (struct inode *node); + reiser4_block_nr (*unlink) (struct inode *, struct inode *); + } estimate; +} dir_plugin; + +typedef struct formatting_plugin { + /* generic fields */ + plugin_header h; + /* returns non-zero iff file's tail has to be stored + in a direct item. */ + int (*have_tail) (const struct inode * inode, loff_t size); +} formatting_plugin; + +typedef struct hash_plugin { + /* generic fields */ + plugin_header h; + /* computes hash of the given name */ + __u64(*hash) (const unsigned char *name, int len); +} hash_plugin; + +typedef struct crypto_plugin { + /* generic fields */ + plugin_header h; + /* number of cpu expkey words */ + unsigned nr_keywords; + /* minimal input blocksize accepted by the crypto algorithm */ + size_t (*blocksize)(__u16 keysize); + /* Offset translator. For each offset this returns (k * offset), where + k (k >= 1) is a coefficient of expansion of the crypto algorithm. + For all symmetric algorithms k == 1. For asymmetric algorithms (which + inflate data) offset translation guarantees that all disk cluster's + units will have keys smaller than next cluster's one. + */ + loff_t (*scale)(struct inode * inode, size_t blocksize, loff_t src); + /* Crypto algorithms can accept data only by chunks of crypto block + size. This method is to align any flow up to crypto block size when + we pass it to crypto algorithm.
To align means to append padding of + special format specific to the crypto algorithm */ + int (*align_cluster)(__u8 *tail, int clust_size, int blocksize); + /* low-level key manager (check, install, etc..) */ + int (*set_key) (__u32 *expkey, const __u8 *key); + /* main text processing procedures */ + void (*encrypt) (__u32 *expkey, __u8 *dst, const __u8 *src); + void (*decrypt) (__u32 *expkey, __u8 *dst, const __u8 *src); +} crypto_plugin; + +typedef struct digest_plugin { + /* generic fields */ + plugin_header h; + /* input blocksize */ + unsigned int blksize; + /* digestsize */ + unsigned int digestsize; + /* alloc context */ + int (*alloc)(void *ctx); + /* free context */ + void (*free)(void *ctx); + /* main procedures */ + void (*init)(void *ctx); + void (*update)(void *ctx, /* context specific to particular + * type of digest algorithm */ + const __u8 *data, /* input data */ + unsigned int len /* input data size */); + void (*final)(void *ctx, __u8 *out /* destination digest */); +} digest_plugin; + +typedef struct compression_plugin { + /* generic fields */ + plugin_header h; + /* working memory size, bytes */ + unsigned mem_req; + /* the maximum number of bytes the size of the "compressed" data can + * exceed the uncompressed data. 
*/ + unsigned overrun; + /* main text processing procedures */ + void (*compress) (__u8 *buf, __u8 *src_first, unsigned src_len, + __u8 *dst_first, unsigned *dst_len); + void (*decompress) (__u8 *buf, __u8 *src_first, unsigned src_len, + __u8 *dst_first, unsigned *dst_len); +}compression_plugin; + +typedef struct sd_ext_plugin { + /* generic fields */ + plugin_header h; + int (*present) (struct inode * inode, char **area, int *len); + int (*absent) (struct inode * inode); + int (*save_len) (struct inode * inode); + int (*save) (struct inode * inode, char **area); +#if REISER4_DEBUG_OUTPUT + void (*print) (const char *prefix, char **area, int *len); +#endif + /* alignment requirement for this stat-data part */ + int alignment; +} sd_ext_plugin; + +/* this plugin contains methods to allocate objectid for newly created files, + to deallocate objectid when file gets removed, to report number of used and + free objectids */ +typedef struct oid_allocator_plugin { + /* generic fields */ + plugin_header h; + int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files, __u64 oids); + /* used to report statfs->f_files */ + __u64(*oids_used) (reiser4_oid_allocator * map); + /* get next oid to use */ + __u64(*next_oid) (reiser4_oid_allocator * map); + /* used to report statfs->f_ffree */ + __u64(*oids_free) (reiser4_oid_allocator * map); + /* allocate new objectid */ + int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *); + /* release objectid */ + int (*release_oid) (reiser4_oid_allocator * map, oid_t); + /* how many pages to reserve in transaction for allocation of new + objectid */ + int (*oid_reserve_allocate) (reiser4_oid_allocator * map); + /* how many pages to reserve in transaction for freeing of an + objectid */ + int (*oid_reserve_release) (reiser4_oid_allocator * map); + void (*print_info) (const char *, reiser4_oid_allocator *); +} oid_allocator_plugin; + +/* disk layout plugin: this specifies super block, journal, bitmap (if there + are any) 
locations, etc */ +typedef struct disk_format_plugin { + /* generic fields */ + plugin_header h; + /* replay journal, initialize super_info_data, etc */ + int (*get_ready) (struct super_block *, void *data); + + /* key of root directory stat data */ + const reiser4_key *(*root_dir_key) (const struct super_block *); + + int (*release) (struct super_block *); + jnode *(*log_super) (struct super_block *); + void (*print_info) (const struct super_block *); + int (*check_mount) (const struct super_block *); + int (*check_open) (const struct inode *object); +} disk_format_plugin; + +struct jnode_plugin { /* plugin for different jnode types (the .jnode member of union reiser4_plugin) */ + /* generic fields */ + plugin_header h; + int (*init) (jnode * node); + int (*parse) (jnode * node); + struct address_space *(*mapping) (const jnode * node); + unsigned long (*index) (const jnode * node); + jnode *(*clone) (jnode * node); +}; + +/* plugin instance. */ +/* */ +/* This is "wrapper" union for all types of plugins. Most of the code uses */ +/* plugins of particular type (file_plugin, dir_plugin, etc.) rather than */ +/* operating on pointers to reiser4_plugin. This union is only used in */ +/* some generic code in plugin/plugin.c that operates on all */ +/* plugins. Technically speaking, the purpose of this union is to add type */ +/* safety to said generic code: each plugin type (file_plugin, for */ +/* example) contains plugin_header as its first member. This first member */ +/* is located at the same place in memory as .h member of */ +/* reiser4_plugin. Generic code obtains a pointer to reiser4_plugin and */ +/* looks in the .h which is header of plugin type located in union. This */ +/* allows to avoid type-casts.
*/ +union reiser4_plugin { + /* generic fields */ + plugin_header h; + /* file plugin */ + file_plugin file; + /* directory plugin */ + dir_plugin dir; + /* hash plugin, used by directory plugin */ + hash_plugin hash; + /* crypto plugin, used by file plugin */ + crypto_plugin crypto; + /* digest plugin, used by file plugin */ + digest_plugin digest; + /* compression plugin, used by file plugin */ + compression_plugin compression; + /* tail plugin, used by file plugin */ + formatting_plugin formatting; + /* permission plugin */ + perm_plugin perm; + /* node plugin */ + node_plugin node; + /* item plugin */ + item_plugin item; + /* stat-data extension plugin */ + sd_ext_plugin sd_ext; + /* disk layout plugin */ + disk_format_plugin format; + /* object id allocator plugin */ + oid_allocator_plugin oid_allocator; + /* plugin for different jnode types */ + jnode_plugin jnode; + /* plugin for pseudo files */ + pseudo_plugin pseudo; + /* place-holder for new plugin types that can be registered + dynamically, and used by other dynamically loaded plugins. */ + void *generic; +}; + +struct reiser4_plugin_ops { + /* called when plugin is initialized */ + int (*init) (reiser4_plugin * plugin); + /* called when plugin is unloaded */ + int (*done) (reiser4_plugin * plugin); + /* load given plugin from disk */ + int (*load) (struct inode * inode, + reiser4_plugin * plugin, char **area, int *len); + /* how many space is required to store this plugin's state + in stat-data */ + int (*save_len) (struct inode * inode, reiser4_plugin * plugin); + /* save persistent plugin-data to disk */ + int (*save) (struct inode * inode, reiser4_plugin * plugin, char **area); + /* alignment requirement for on-disk state of this plugin + in number of bytes */ + int alignment; + /* install itself into given inode. This can return error + (e.g., you cannot change hash of non-empty directory). */ + int (*change) (struct inode * inode, reiser4_plugin * plugin); + /* install itself into given inode. 
This can return error + (e.g., you cannot change hash of non-empty directory). */ + int (*inherit) (struct inode * inode, struct inode * parent, + reiser4_plugin * plugin); +}; + +/* functions implemented in fs/reiser4/plugin/plugin.c */ + +/* stores plugin reference in reiser4-specific part of inode */ +extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id); +extern int handle_default_plugin_option(char *option, reiser4_plugin ** area); +extern int setup_plugins(struct super_block *super, reiser4_plugin ** area); +extern reiser4_plugin *lookup_plugin(const char *type_label, const char *plug_label); +extern int init_plugins(void); + +/* functions implemented in fs/reiser4/plugin/object.c */ +void move_flow_forward(flow_t * f, unsigned count); + +/* builtin plugins */ + +/* builtin file-plugins */ +typedef enum { + /* regular file */ + UNIX_FILE_PLUGIN_ID, + /* directory */ + DIRECTORY_FILE_PLUGIN_ID, + /* symlink */ + SYMLINK_FILE_PLUGIN_ID, + /* for objects completely handled by the VFS: fifos, devices, + sockets */ + SPECIAL_FILE_PLUGIN_ID, + /* Plugin id for crypto-compression objects */ + CRC_FILE_PLUGIN_ID, + /* pseudo file */ + PSEUDO_FILE_PLUGIN_ID, + /* number of file plugins. Used as size of arrays to hold + file plugins. 
*/ + LAST_FILE_PLUGIN_ID +} reiser4_file_id; + +/* builtin dir-plugins */ +typedef enum { + HASHED_DIR_PLUGIN_ID, + SEEKABLE_HASHED_DIR_PLUGIN_ID, + PSEUDO_DIR_PLUGIN_ID, + LAST_DIR_ID +} reiser4_dir_id; + +/* builtin hash-plugins */ + +typedef enum { + RUPASOV_HASH_ID, + R5_HASH_ID, + TEA_HASH_ID, + FNV1_HASH_ID, + DEGENERATE_HASH_ID, + LAST_HASH_ID +} reiser4_hash_id; + +/* builtin crypto-plugins */ + +typedef enum { + NONE_CRYPTO_ID, + LAST_CRYPTO_ID +} reiser4_crypto_id; + +/* builtin digest plugins */ + +typedef enum { + NONE_DIGEST_ID, + LAST_DIGEST_ID +} reiser4_digest_id; + +/* builtin compression plugins */ + +typedef enum { + NONE_COMPRESSION_ID, + LAST_COMPRESSION_ID +} reiser4_compression_id; + +/* builtin tail-plugins */ + +typedef enum { + NEVER_TAILS_FORMATTING_ID, + SUPPRESS_OLD_ID, + FOURK_FORMATTING_ID, + ALWAYS_TAILS_FORMATTING_ID, + SMALL_FILE_FORMATTING_ID, + LAST_TAIL_FORMATTING_ID +} reiser4_formatting_id; + +/* Encapsulations of crypto specific data */ +typedef struct crypto_data { + reiser4_crypto_id cra; /* id of the crypto algorithm */ + reiser4_digest_id dia; /* id of the digest algorithm */ + __u8 * key; /* secret key */ + __u16 keysize; /* key size, bits */ + __u8 * keyid; /* keyid */ + __u16 keyid_size; /* keyid size, bytes */ +} crypto_data_t; + +/* compression/clustering specific data */ +typedef reiser4_compression_id compression_data_t; /* id of the compression algorithm */ +typedef __u8 cluster_data_t; /* cluster info */ + +/* data type used to pack parameters that we pass to vfs + object creation function create_object() */ +struct reiser4_object_create_data { + /* plugin to control created object */ + reiser4_file_id id; + /* mode of regular file, directory or special file */ +/* what happens if some other sort of perm plugin is in use? 
*/ + int mode; + /* rdev of special file */ + dev_t rdev; + /* symlink target */ + const char *name; + /* add here something for non-standard objects you invent, like + query for interpolation file etc. */ + crypto_data_t * crypto; + compression_data_t * compression; + cluster_data_t * cluster; + + struct inode *parent; + struct dentry *dentry; +}; + +#define MAX_PLUGIN_TYPE_LABEL_LEN 32 +#define MAX_PLUGIN_PLUG_LABEL_LEN 32 + +/* used for interface with user-land: table-driven parsing in + reiser4(). */ +typedef struct plugin_locator { + reiser4_plugin_type type_id; + reiser4_plugin_id id; + char type_label[MAX_PLUGIN_TYPE_LABEL_LEN]; + char plug_label[MAX_PLUGIN_PLUG_LABEL_LEN]; +} plugin_locator; + +extern int locate_plugin(struct inode *inode, plugin_locator * loc); + +static inline reiser4_plugin * +plugin_by_id(reiser4_plugin_type type_id, reiser4_plugin_id id); + +static inline reiser4_plugin * +plugin_by_disk_id(reiser4_tree * tree, reiser4_plugin_type type_id, d16 * did); + +#define PLUGIN_BY_ID(TYPE,ID,FIELD) \ +static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \ +{ \ + reiser4_plugin *plugin = plugin_by_id ( ID, id ); \ + return plugin ? & plugin -> FIELD : NULL; \ +} \ +static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \ +{ \ + reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \ + return plugin ? & plugin -> FIELD : NULL; \ +} \ +static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \ +{ \ + reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \ + return plugin ? 
& plugin -> FIELD : NULL; \ +} \ +static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \ +{ \ + return ( reiser4_plugin * ) plugin; \ +} \ +static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \ +{ \ + return TYPE ## _to_plugin (plugin) -> h.id; \ +} \ +typedef struct { int foo; } TYPE ## _plugin_dummy + +PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item); +PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file); +PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir); +PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node); +PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext); +PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm); +PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash); +PLUGIN_BY_ID(crypto_plugin, REISER4_CRYPTO_PLUGIN_TYPE, crypto); +PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest); +PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression); +PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting); +PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format); +PLUGIN_BY_ID(oid_allocator_plugin, REISER4_OID_ALLOCATOR_PLUGIN_TYPE, oid_allocator); +PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode); +PLUGIN_BY_ID(pseudo_plugin, REISER4_PSEUDO_PLUGIN_TYPE, pseudo); + +extern int save_plugin_id(reiser4_plugin * plugin, d16 * area); + +#if REISER4_DEBUG_OUTPUT +extern void print_plugin(const char *prefix, reiser4_plugin * plugin); +#else +#define print_plugin( pr, pl ) noop +#endif + +TYPE_SAFE_LIST_DEFINE(plugin, reiser4_plugin, h.linkage); + +extern plugin_list_head *get_plugin_list(reiser4_plugin_type type_id); + +#define for_all_plugins( ptype, plugin ) \ +for( plugin = plugin_list_front( get_plugin_list( ptype ) ) ; \ + ! 
plugin_list_end( get_plugin_list( ptype ), plugin ) ; \ + plugin = plugin_list_next( plugin ) ) + + +#define grab_plugin(self, ancestor, plugin) \ + grab_plugin_from((self), plugin, (ancestor)->pset->plugin) + +/* if plugin in @self->field is not yet set, set it to be equal to @val */ +#define grab_plugin_from(self, field, val) \ +({ \ + typeof(val) __val; \ + struct inode *__inode; \ + reiser4_inode *__self; \ + int __result; \ + \ + __val = (val); \ + __self = (self); \ + __inode = inode_by_reiser4_inode(__self); \ + __result = 0; \ + if(__self->pset->field == NULL) { \ + if (__val->h.pops != NULL && \ + __val->h.pops->change != NULL) { \ + __result = __val->h.pops->change(__inode, \ + (reiser4_plugin *)__val); \ + } else \ + plugin_set_ ## field(&__self->pset, __val); \ + } \ + __result; \ +}) + +/* __FS_REISER4_PLUGIN_TYPES_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/plugin_hash.c linux-2.6.4-ck1/fs/reiser4/plugin/plugin_hash.c --- linux-2.6.4/fs/reiser4/plugin/plugin_hash.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/plugin_hash.c 2004-03-11 22:45:15.356499791 +1100 @@ -0,0 +1,173 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Generic hash table for use by external plugins. 
*/ + +#include "../debug.h" + +#include "plugin_hash.h" + +#include +#include + +TYPE_SAFE_LIST_DEFINE(phash, phash_user, link); + +static phash_list_head phash_anchor[PHASH_LAST]; +static spinlock_t phash_lock = SPIN_LOCK_UNLOCKED; + +/* hash table support */ + +#define PHASH_TABLE_SIZE (128) + +static inline phash_header *to_header(const phash_hash_link * anchor) +{ + return container_of(anchor, phash_header, link); +} + +static inline int +phasheq(const phash_hash_link * a1, const phash_hash_link * a2) +{ + phash_header *h1; + phash_header *h2; + + h1 = to_header(a1); + h2 = to_header(a2); + return (h1->user == h1->user) && (h1->object == h2->object); +} + +static inline unsigned long +hash(unsigned long user, unsigned long object) +{ + return (user >> 3) ^ (object >> 1); +} + +static inline unsigned long +phash_hash(phash_hash_table *table, const phash_hash_link * a) +{ + phash_header *h; + + h = to_header(a); + return hash((unsigned long)h->user, (unsigned long)h->object); +} + +/* The hash table definition */ +#define KMALLOC(size) kmalloc((size), GFP_KERNEL) +#define KFREE(ptr, size) kfree(ptr) +TYPE_SAFE_HASH_DEFINE(phash, phash_header, + phash_hash_link, link, link, phash_hash, phasheq); +#undef KFREE +#undef KMALLOC + +static phash_hash_table phash_table; + +#if REISER4_DEBUG +static int is_scope_valid(phash_scope scope) +{ + return (0 <= (int)scope) && (scope < PHASH_LAST); +} +#endif + +reiser4_internal int phash_user_register(phash_user *user) +{ + assert("nikita-2924", user != NULL); + assert("nikita-2925", is_scope_valid(user->scope)); + spin_lock(&phash_lock); + phash_list_push_back(&phash_anchor[user->scope], user); + spin_unlock(&phash_lock); + return 0; +} + +reiser4_internal void phash_user_unregister(phash_user *user) +{ + assert("nikita-2926", user != NULL); + assert("nikita-2927", is_scope_valid(user->scope)); + spin_lock(&phash_lock); + phash_list_remove_clean(user); + spin_unlock(&phash_lock); +} + +reiser4_internal phash_header 
*phash_get(phash_user *user, void *object) +{ + phash_header head; + phash_header *found; + + head.user = user; + head.object = object; + + spin_lock(&phash_lock); + found = phash_hash_find(&phash_table, &head.link); + spin_unlock(&phash_lock); + return found; +} + +reiser4_internal void +phash_set(phash_user *user, void *object, phash_header *value) +{ + value->user = user; + value->object = object; + + spin_lock(&phash_lock); + phash_hash_insert(&phash_table, value); + spin_unlock(&phash_lock); +} + +reiser4_internal int phash_destroy_hook(phash_scope scope, void *object) +{ + int result; + int called; + phash_user *user; + + assert("nikita-2928", is_scope_valid(scope)); + + result = 0; + do { + called = 0; + spin_lock(&phash_lock); + for_all_type_safe_list(phash, &phash_anchor[scope], user) { + phash_header *head; + + if (user->ops.destroy == NULL) + continue; + + head = phash_get(user, object); + if (head != NULL) { + int reply; + + spin_unlock(&phash_lock); + reply = user->ops.destroy(user, object, head); + if (reply != 0 && result == 0) + result = reply; + spin_lock(&phash_lock); + } + ++ called; + } + } while(called > 0); + spin_unlock(&phash_lock); + return result; +} + +reiser4_internal int phash_init(void) +{ + int i; + + for (i = 0 ; i < PHASH_LAST ; ++ i) + phash_list_init(&phash_anchor[i]); + + return phash_hash_init(&phash_table, PHASH_TABLE_SIZE, NULL); +} + +reiser4_internal void phash_done(void) +{ + phash_hash_done(&phash_table); +} + + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/plugin_hash.h linux-2.6.4-ck1/fs/reiser4/plugin/plugin_hash.h --- linux-2.6.4/fs/reiser4/plugin/plugin_hash.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/plugin_hash.h 2004-03-11 22:45:15.356499791 +1100 @@ -0,0 +1,90 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Generic hash table for use by external plugins. See + * fs/reiser4/plugin/plugin_hash.c for details */ + +#if !defined(__PLUGIN_HASH_H__) +#define __PLUGIN_HASH_H__ + +#include "plugin_header.h" +#include "../type_safe_list.h" +#include "../type_safe_hash.h" + +typedef enum phash_scope { /* kind of object a value is attached to */ + PHASH_INODE, + PHASH_JNODE, + PHASH_SUPER, + + PHASH_LAST /* number of scopes; sizes the per-scope user lists */ +} phash_scope; + +struct phash_header; +typedef struct phash_header phash_header; + +struct phash_user; +typedef struct phash_user phash_user; + +typedef struct phash_ops { + int (*destroy)(phash_user *user, void *object, phash_header *value); /* invoked from phash_destroy_hook() for a value found for @object */ +} phash_ops; + +TYPE_SAFE_LIST_DECLARE(phash); + +struct phash_user { + reiser4_plugin_type type_id; + reiser4_plugin_id id; + phash_ops ops; + phash_scope scope; /* selects the list this user is registered on */ + phash_list_link link; /* linkage into that list */ +}; + +TYPE_SAFE_HASH_DECLARE(phash, phash_header); + +struct phash_header { /* embedded into values stored in the table; (user, object) is the key */ + phash_hash_link link; + phash_user *user; + void *object; +}; + +extern int phash_user_register (phash_user *user); +extern void phash_user_unregister(phash_user *user); + +extern phash_header *phash_get(phash_user *user, void *object); +extern void phash_set(phash_user *user, void *object, phash_header *value); + +extern int phash_destroy_hook(phash_scope scope, void *object); + +extern int phash_init(void); +extern void phash_done(void); + +static inline int /* runs phash_destroy_hook() for @inode in PHASH_INODE scope */ +phash_inode_destroy(struct inode *inode) +{ + return phash_destroy_hook(PHASH_INODE, inode); +} + +static inline int /* runs phash_destroy_hook() for @node in PHASH_JNODE scope */ +phash_jnode_destroy(jnode *node) +{ +
return phash_destroy_hook(PHASH_JNODE, node); +} + +static inline int /* runs phash_destroy_hook() for @super in PHASH_SUPER scope */ +phash_super_destroy(struct super_block *super) +{ + return phash_destroy_hook(PHASH_SUPER, super); +} + +/* __PLUGIN_HASH_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ + diff -Naurp linux-2.6.4/fs/reiser4/plugin/plugin_header.h linux-2.6.4-ck1/fs/reiser4/plugin/plugin_header.h --- linux-2.6.4/fs/reiser4/plugin/plugin_header.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/plugin_header.h 2004-03-11 22:45:15.357499635 +1100 @@ -0,0 +1,133 @@ +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* plugin header. Data structures required by all plugin types. */ + +#if !defined( __PLUGIN_HEADER_H__ ) +#define __PLUGIN_HEADER_H__ + +/* plugin data-types and constants */ + +#include "../type_safe_list.h" +#include "../dformat.h" + +typedef enum { + REISER4_FILE_PLUGIN_TYPE, + REISER4_DIR_PLUGIN_TYPE, + REISER4_ITEM_PLUGIN_TYPE, + REISER4_NODE_PLUGIN_TYPE, + REISER4_HASH_PLUGIN_TYPE, + REISER4_FORMATTING_PLUGIN_TYPE, + REISER4_PERM_PLUGIN_TYPE, + REISER4_SD_EXT_PLUGIN_TYPE, + REISER4_FORMAT_PLUGIN_TYPE, + REISER4_OID_ALLOCATOR_PLUGIN_TYPE, + REISER4_JNODE_PLUGIN_TYPE, + REISER4_CRYPTO_PLUGIN_TYPE, + REISER4_DIGEST_PLUGIN_TYPE, + REISER4_COMPRESSION_PLUGIN_TYPE, + REISER4_PSEUDO_PLUGIN_TYPE, + REISER4_PLUGIN_TYPES /* number of plugin types; sizes the plugins[] array */ +} reiser4_plugin_type; + +struct reiser4_plugin_ops; +/* generic plugin operations, supported by each + plugin type. */ +typedef struct reiser4_plugin_ops reiser4_plugin_ops; + +TYPE_SAFE_LIST_DECLARE(plugin); + +/* common part of each plugin instance.
*/ +typedef struct plugin_header { + /* plugin type */ + reiser4_plugin_type type_id; + /* id of this plugin */ + reiser4_plugin_id id; + /* plugin operations */ + reiser4_plugin_ops *pops; + /* short label of this plugin */ + const char *label; + /* descriptive string. Put your copyright message here. */ + const char *desc; + /* list linkage */ + plugin_list_link linkage; +} plugin_header; + + +/* PRIVATE INTERFACES */ + +/* plugin type representation. */ +typedef struct reiser4_plugin_type_data { + /* internal plugin type identifier. Should coincide with + index of this item in plugins[] array. */ + reiser4_plugin_type type_id; + /* short symbolic label of this plugin type. Should be no longer + than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */ + const char *label; + /* plugin type description longer than .label */ + const char *desc; + /* number of built-in plugin instances of this type */ + int builtin_num; + /* array of built-in plugins */ + void *builtin; + plugin_list_head plugins_list; + size_t size; /* size in bytes of one element of ->builtin, see plugin_at() */ +} reiser4_plugin_type_data; + +extern reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES]; + +int is_type_id_valid(reiser4_plugin_type type_id); +int is_plugin_id_valid(reiser4_plugin_type type_id, reiser4_plugin_id id); + +static inline reiser4_plugin * /* address of the @i-th element of @ptype->builtin; elements are ->size bytes apart */ +plugin_at(reiser4_plugin_type_data * ptype, int i) +{ + char *builtin; + + builtin = ptype->builtin; + return (reiser4_plugin *) (builtin + i * ptype->size); +} + + +/* return plugin by its @type_id and @id */ +static inline reiser4_plugin * +plugin_by_id(reiser4_plugin_type type_id /* plugin type id */ , + reiser4_plugin_id id /* plugin id */ ) +{ + assert("nikita-1651", is_type_id_valid(type_id)); + assert("nikita-1652", is_plugin_id_valid(type_id, id)); + return plugin_at(&plugins[type_id], id); +} + +extern reiser4_plugin * +plugin_by_unsafe_id(reiser4_plugin_type type_id, reiser4_plugin_id id); + +/* get plugin whose id is stored in disk format */ +static inline reiser4_plugin *
+plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG /* tree, + * plugin + * belongs + * to */ , + reiser4_plugin_type type_id /* plugin type + * id */ , + d16 * did /* plugin id in disk format */ ) +{ + /* what we should do properly is to maintain within each + file-system a dictionary that maps on-disk plugin ids to + "universal" ids. This dictionary will be resolved at mount + time, so that this function will perform just one additional + array lookup. */ + return plugin_by_unsafe_id(type_id, d16tocpu(did)); +} + +/* __PLUGIN_HEADER_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/plugin_set.c linux-2.6.4-ck1/fs/reiser4/plugin/plugin_set.c --- linux-2.6.4/fs/reiser4/plugin/plugin_set.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/plugin_set.c 2004-03-11 22:45:15.358499480 +1100 @@ -0,0 +1,256 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* plugin-sets */ + +/* + * Each inode comes with a whole set of plugins: file plugin, directory + * plugin, hash plugin, tail policy plugin, security plugin, etc. + * + * Storing them (pointers to them, that is) in inode is a waste of + * space. Especially, given that on average file system plugins of vast + * majority of files will belong to few sets (e.g., one set for regular files, + * another set for standard directory, etc.) + * + * Plugin set (pset) is an object containing pointers to all plugins required + * by inode. Inode only stores a pointer to pset. psets are "interned", that + * is, different inodes with the same set of plugins point to the same + * pset. This is achieved by storing psets in global hash table. Races are + * avoided by simple (and efficient so far) solution of never recycling psets, + * even when last inode pointing to it is destroyed.
+ * + */ + +#include "../debug.h" + +#include "plugin_set.h" + +#include +#include + +/* slab for plugin sets */ +static kmem_cache_t *plugin_set_slab; + +static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = { + [0 ... 7] = SPIN_LOCK_UNLOCKED +}; + +/* hash table support */ + +#define PS_TABLE_SIZE (32) + +static inline plugin_set * +cast_to(const unsigned long * a) +{ + return container_of(a, plugin_set, hashval); +} + +static inline int +pseq(const unsigned long * a1, const unsigned long * a2) +{ + plugin_set *set1; + plugin_set *set2; + + /* make sure fields are not missed in the code below */ + cassert(sizeof *set1 == + + sizeof set1->hashval + + sizeof set1->link + + + sizeof set1->file + + sizeof set1->dir + + sizeof set1->perm + + sizeof set1->formatting + + sizeof set1->hash + + sizeof set1->sd + + sizeof set1->dir_item + + sizeof set1->crypto + + sizeof set1->digest + + sizeof set1->compression); + + set1 = cast_to(a1); + set2 = cast_to(a2); + return + set1->hashval == set2->hashval && + + set1->file == set2->file && + set1->dir == set2->dir && + set1->perm == set2->perm && + set1->formatting == set2->formatting && + set1->hash == set2->hash && + set1->sd == set2->sd && + set1->dir_item == set2->dir_item && + set1->crypto == set2->crypto && + set1->digest == set2->digest && + set1->compression == set2->compression; +} + +#define HASH_FIELD(hash, set, field) \ +({ \ + (hash) += (unsigned long)(set)->field >> 2; \ +}) + +static inline unsigned long calculate_hash(const plugin_set *set) +{ + unsigned long result; + + result = 0; + HASH_FIELD(result, set, file); + HASH_FIELD(result, set, dir); + HASH_FIELD(result, set, perm); + HASH_FIELD(result, set, formatting); + HASH_FIELD(result, set, hash); + HASH_FIELD(result, set, sd); + HASH_FIELD(result, set, dir_item); + HASH_FIELD(result, set, crypto); + HASH_FIELD(result, set, digest); + HASH_FIELD(result, set, compression); + return result & (PS_TABLE_SIZE - 1); +} + +static inline unsigned long 
+pshash(ps_hash_table *table, const unsigned long * a) +{ + return *a; +} + +/* The hash table definition */ +#define KMALLOC(size) kmalloc((size), GFP_KERNEL) +#define KFREE(ptr, size) kfree(ptr) +TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash, pseq); +#undef KFREE +#undef KMALLOC + +static ps_hash_table ps_table; +static plugin_set empty_set = { + .hashval = 0, + .file = NULL, + .dir = NULL, + .perm = NULL, + .formatting = NULL, + .hash = NULL, + .sd = NULL, + .dir_item = NULL, + .crypto = NULL, + .digest = NULL, + .compression = NULL, + .link = { NULL } +}; + +reiser4_internal plugin_set *plugin_set_get_empty(void) +{ + return &empty_set; +} + +reiser4_internal void plugin_set_put(plugin_set *set) +{ +} + +reiser4_internal plugin_set *plugin_set_clone(plugin_set *set) +{ + return set; +} + +static inline unsigned long * +pset_field(plugin_set *set, int offset) +{ + return (unsigned long *)(((char *)set) + offset); +} + +static int plugin_set_field(plugin_set **set, const unsigned long val, const int offset) +{ + unsigned long *spot; + spinlock_t *lock; + plugin_set replica; + plugin_set *twin; + plugin_set *psal; + plugin_set *orig; + + assert("nikita-2902", set != NULL); + assert("nikita-2904", *set != NULL); + + spot = pset_field(*set, offset); + if (unlikely(*spot == val)) + return 0; + + replica = *(orig = *set); + *pset_field(&replica, offset) = val; + replica.hashval = calculate_hash(&replica); + rcu_read_lock(); + twin = ps_hash_find(&ps_table, &replica.hashval); + if (unlikely(twin == NULL)) { + rcu_read_unlock(); + psal = kmem_cache_alloc(plugin_set_slab, GFP_KERNEL); + if (psal == NULL) + return RETERR(-ENOMEM); + *psal = replica; + lock = &plugin_set_lock[replica.hashval & 7]; + spin_lock(lock); + twin = ps_hash_find(&ps_table, &replica.hashval); + if (likely(twin == NULL)) { + *set = psal; + ps_hash_insert_rcu(&ps_table, psal); + } else { + *set = twin; + kmem_cache_free(plugin_set_slab, psal); + } + spin_unlock(lock); + } 
else { + rcu_read_unlock(); + *set = twin; + } + return 0; +} + +#define DEFINE_PLUGIN_SET(type, field) \ +reiser4_internal int plugin_set_ ## field(plugin_set **set, type *val) \ +{ \ + cassert(sizeof val == sizeof(unsigned long)); \ + return plugin_set_field(set, (unsigned long)val, \ + offsetof(plugin_set, field)); \ +} + +DEFINE_PLUGIN_SET(file_plugin, file) +DEFINE_PLUGIN_SET(dir_plugin, dir) +DEFINE_PLUGIN_SET(perm_plugin, perm) +DEFINE_PLUGIN_SET(formatting_plugin, formatting) +DEFINE_PLUGIN_SET(hash_plugin, hash) +DEFINE_PLUGIN_SET(item_plugin, sd) +DEFINE_PLUGIN_SET(item_plugin, dir_item) +DEFINE_PLUGIN_SET(crypto_plugin, crypto) +DEFINE_PLUGIN_SET(digest_plugin, digest) +DEFINE_PLUGIN_SET(compression_plugin, compression) + +reiser4_internal int plugin_set_init(void) +{ + int result; + + result = ps_hash_init(&ps_table, PS_TABLE_SIZE, NULL); + if (result == 0) { + plugin_set_slab = kmem_cache_create("plugin_set", + sizeof (plugin_set), 0, + SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (plugin_set_slab == NULL) + result = RETERR(-ENOMEM); + } + return result; +} + +reiser4_internal void plugin_set_done(void) +{ + /* NOTE: scan hash table and recycle all objects. */ + kmem_cache_destroy(plugin_set_slab); + ps_hash_done(&ps_table); +} + + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/plugin_set.h linux-2.6.4-ck1/fs/reiser4/plugin/plugin_set.h --- linux-2.6.4/fs/reiser4/plugin/plugin_set.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/plugin_set.h 2004-03-11 22:45:15.358499480 +1100 @@ -0,0 +1,72 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* plugin-sets. 
see fs/reiser4/plugin/plugin_set.c for details */ + +#if !defined( __PLUGIN_SET_H__ ) +#define __PLUGIN_SET_H__ + +#include "../type_safe_hash.h" +#include "plugin.h" + +#include + +struct plugin_set; +typedef struct plugin_set plugin_set; + +TYPE_SAFE_HASH_DECLARE(ps, plugin_set); + +struct plugin_set { + unsigned long hashval; + /* plugin of file */ + file_plugin *file; + /* plugin of dir */ + dir_plugin *dir; + /* perm plugin for this file */ + perm_plugin *perm; + /* tail policy plugin. Only meaningful for regular files */ + formatting_plugin *formatting; + /* hash plugin. Only meaningful for directories. */ + hash_plugin *hash; + /* plugin of stat-data */ + item_plugin *sd; + /* plugin of items a directory is built of */ + item_plugin *dir_item; + /* crypto plugin */ + crypto_plugin *crypto; + /* digest plugin */ + digest_plugin *digest; + /* compression plugin */ + compression_plugin *compression; + ps_hash_link link; +}; + +extern plugin_set *plugin_set_get_empty(void); +extern plugin_set *plugin_set_clone(plugin_set *set); +extern void plugin_set_put(plugin_set *set); + +extern int plugin_set_file (plugin_set **set, file_plugin *file); +extern int plugin_set_dir (plugin_set **set, dir_plugin *file); +extern int plugin_set_perm (plugin_set **set, perm_plugin *file); +extern int plugin_set_formatting (plugin_set **set, formatting_plugin *file); +extern int plugin_set_hash (plugin_set **set, hash_plugin *file); +extern int plugin_set_sd (plugin_set **set, item_plugin *file); +extern int plugin_set_dir_item (plugin_set **set, item_plugin *file); +extern int plugin_set_crypto (plugin_set **set, crypto_plugin *file); +extern int plugin_set_digest (plugin_set **set, digest_plugin *file); +extern int plugin_set_compression(plugin_set **set, compression_plugin *file); + +extern int plugin_set_init(void); +extern void plugin_set_done(void); + +/* __PLUGIN_SET_H__ */ +#endif + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/pseudo/pseudo.c linux-2.6.4-ck1/fs/reiser4/plugin/pseudo/pseudo.c --- linux-2.6.4/fs/reiser4/plugin/pseudo/pseudo.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/pseudo/pseudo.c 2004-03-11 22:45:15.361499013 +1100 @@ -0,0 +1,1427 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Handling of "pseudo" files representing unified access to meta data in + reiser4. */ + +/* + * See http://namesys.com/v4/v4.html, and especially + * http://namesys.com/v4/pseudo.html for basic information about reiser4 + * pseudo files, access to meta-data, reiser4() system call, etc. + * + * Pseudo files should be accessible from both reiser4() system call and + * normal POSIX calls. + * + * OVERVIEW + * + * Pseudo files provide access to various functionality through file + * system name space. As such they are similar to pseudo file systems + * already existing in UNIX and Linux: procfs, sysfs, etc. But pseudo + * files are embedded into name space of Reiser4---real block device based + * file system, and are more tightly integrated with it. In particular, + * some pseudo files are "attached" to other files (either "real" or also + * pseudo), by being accessible through path names of the form + * + * "a/b/c/..something" + * + * Here object accessible through "a/b/c/..something" is attached to the + * object accessible through "a/b/c" , and the latter is said to be the + * "host" object of the former. + * + * Object can have multiple pseudo files attached to it, distinguished by + * the last component of their names "..something", "..somethingelse", + * etc. + * + * (Note however, that ".." prefix is just a convention, and it is not + * necessary that all pseudo file names started with it.) 
+ * + * Moreover, in addition to the purely pseudo files (that is, file system + * objects whose content (as available through read(2) system call) is not + * backed by any kind of persistent storage), extended file attributes + * (see attr(5) on Linux, and http://acl.bestbits.at/) including security + * attributes such as ACLs are also available through file system name + * space. + * + * As a result each file system object has a sub-name-space rooted at it, + * which is in striking contrast with traditional UNIX file system, where + * only directories have sub-objects and all other types of files (regular, + * FIFO-s, devices, and symlinks) are leaves. + * + * For the sake of objectivity it should be mentioned that this is not + * a _completely_ new development in file system design, see + * http://docs.sun.com/db/doc/816-0220/6m6nkorp9?a=view + * + * In particular, as each object has sub-objects, name space tree is + * infinite in both extent (number of reachable objects) and depth. + * + * Some pseudo files are "built-in". They are present as sub-objects in + * each file system object, unless specifically disabled. + * + * Built-in pseudo files are implemented in this file and described at + * http://namesys.com/v4/pseudo.html + * + * IMPLEMENTATION + * + * Pseudo files are implemented as normal inodes, living in the same super + * block as other inodes for reiser4 file system. Their inode numbers are + * generated by fs/inode.c:new_inode() function and are not persistent (in + * the sense that they are not guaranteed to be the same after + * remount). To avoid clashes with "normal" inodes, all pseudo inodes are + * placed into otherwise unused locality (for example, 0), hence allowing + * reiser4_inode_find_actor() to tell them from normal inodes. + * + * All pseudo inodes share the same object plugin + * PSEUDO_FILE_PLUGIN_ID. In pseudo-inode specific part of reiser4_inode + * (pseudo_info), two things are stored: + * + * 1. 
pointer to the inode of the "host object" (for /a/b/c/..acl, + * /a/b/c is the host object) + * + * 2. pointer to pseudo plugin, used by PSEUDO_FILE_PLUGIN_ID to + * implement VFS operations. + * + * This design has the following advantages: + * + * 1. provides for easy addition of new pseudo files without going + * through writing a whole new object plugin. + * + * 2. allows sys_reiser4() to be implemented by directly invoking + * pseudo plugin methods. + * + */ + +#include "../../inode.h" +#include "../../debug.h" +#include "../plugin.h" + +#include "pseudo.h" + +static int init_pseudo(struct inode *parent, struct inode *pseudo, + pseudo_plugin *pplug, const char *name); + +static struct inode *add_pseudo(struct inode *parent, + pseudo_plugin *pplug, struct dentry **d); + +static void pseudo_set_datum(struct inode *pseudo, unsigned long datum) +{ + reiser4_inode_data(pseudo)->file_plugin_data.pseudo_info.datum = datum; +} + +/* + * try to look up built-in pseudo file by its name; returns 0 or -errno. + */ +reiser4_internal int +lookup_pseudo_file(struct inode *parent, struct dentry **dentry) +{ + reiser4_plugin *plugin; + const char *name; + struct inode *pseudo; + int result; + + assert("nikita-2999", parent != NULL); + assert("nikita-3000", dentry != NULL); + + /* if pseudo files are disabled for this file system bail out */ + if (reiser4_is_set(parent->i_sb, REISER4_NO_PSEUDO)) + return RETERR(-ENOENT); + + name = (*dentry)->d_name.name; + pseudo = ERR_PTR(-ENOENT); + /* scan all pseudo file plugins; the first whose ->try() accepts @name wins */ + for_all_plugins(REISER4_PSEUDO_PLUGIN_TYPE, plugin) { + pseudo_plugin *pplug; + + pplug = &plugin->pseudo; + if (pplug->try != NULL && pplug->try(pplug, parent, name)) { + pseudo = add_pseudo(parent, pplug, dentry); + break; + } + } + if (!IS_ERR(pseudo)) + result = 0; + else + result = PTR_ERR(pseudo); + return result; +} + +static struct inode *add_pseudo(struct inode *parent, + pseudo_plugin *pplug, struct dentry **d) +{ + struct inode *pseudo; + + pseudo = 
new_inode(parent->i_sb); + if (pseudo != NULL) { + int result; + + result = init_pseudo(parent, pseudo, pplug, (*d)->d_name.name); + if (result != 0) + pseudo = ERR_PTR(result); + else + *d = d_splice_alias(pseudo, *d); + } else + pseudo = ERR_PTR(RETERR(-ENOMEM)); + return pseudo; +} + + +/* + * initialize pseudo file @pseudo to be child of @parent, with plugin @pplug + * and name @name. + */ +static int +init_pseudo(struct inode *parent, struct inode *pseudo, + pseudo_plugin *pplug, const char *name) +{ + int result; + reiser4_inode *idata; + reiser4_object_create_data data; + static const oid_t pseudo_locality = 0x0ull; + + idata = reiser4_inode_data(pseudo); + idata->locality_id = pseudo_locality; + idata->file_plugin_data.pseudo_info.host = parent; + idata->file_plugin_data.pseudo_info.plugin = pplug; + + data.id = PSEUDO_FILE_PLUGIN_ID; + data.mode = pplug->lookup_mode; + + plugin_set_file(&idata->pset, file_plugin_by_id(PSEUDO_FILE_PLUGIN_ID)); + /* if plugin has a ->lookup method, it means that @pseudo should + * behave like directory. 
*/ + if (pplug->lookup != NULL) + plugin_set_dir(&idata->pset, + dir_plugin_by_id(PSEUDO_DIR_PLUGIN_ID)); + + result = inode_file_plugin(pseudo)->set_plug_in_inode(pseudo, + parent, &data); + if (result != 0) { + warning("nikita-3203", "Cannot install pseudo plugin"); + print_plugin("plugin", pseudo_plugin_to_plugin(pplug)); + return result; + } + + /* inherit permission plugin from parent */ + grab_plugin(idata, reiser4_inode_data(parent), perm); + + pseudo->i_nlink = 1; + /* insert inode into VFS hash table */ + insert_inode_hash(pseudo); + return 0; +} + +/* helper function: return host object of @inode pseudo file */ +static struct inode *get_inode_host(struct inode *inode) +{ + return reiser4_inode_data(inode)->file_plugin_data.pseudo_info.host; +} + +/* helper function: return host object by file descriptor */ +static struct inode *get_pseudo_host(struct file *file) +{ + struct inode *inode; + + inode = file->f_dentry->d_inode; + return get_inode_host(inode); +} + +/* helper function: return host object by seq_file */ +static struct inode *get_seq_pseudo_host(struct seq_file *seq) +{ + struct file *file; + + file = seq->private; + return get_pseudo_host(file); +} + + +static int try_by_label(pseudo_plugin *pplug, + const struct inode *parent, const char *name) +{ + return !strcmp(name, pplug->h.label); +} + +static int show_uid(struct seq_file *seq, void *cookie) +{ + seq_printf(seq, "%lu", (long unsigned)get_seq_pseudo_host(seq)->i_uid); + return 0; +} + +static int check_perm(struct inode *inode) +{ + if (IS_RDONLY(inode)) + return RETERR(-EROFS); + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return RETERR(-EPERM); + return 0; +} + +static int update_ugid(struct dentry *dentry, struct inode *inode, + uid_t uid, gid_t gid) +{ + int result; + + /* logic COPIED from fs/open.c:chown_common() */ + result = check_perm(inode); + if (result == 0) { + struct iattr newattrs; + + newattrs.ia_valid = ATTR_CTIME; + if (uid != (uid_t) -1) { + newattrs.ia_valid |= 
ATTR_UID; + newattrs.ia_uid = uid; + } + if (gid != (gid_t) -1) { /* (gid_t) -1: leave group unchanged, cf. get_uid() */ + newattrs.ia_valid |= ATTR_GID; + newattrs.ia_gid = gid; + } + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= ATTR_KILL_SUID|ATTR_KILL_SGID; + down(&inode->i_sem); + result = notify_change(dentry, &newattrs); + up(&inode->i_sem); + } + return result; +} + +static int get_uid(struct file *file, const char *buf) +{ + uid_t uid; + int result; + + if (sscanf(buf, "%i", &uid) == 1) { + struct inode *host; + + host = get_pseudo_host(file); + result = update_ugid(file->f_dentry->d_parent, host, uid, -1); + } else + result = RETERR(-EINVAL); + return result; +} + +static int show_gid(struct seq_file *seq, void *cookie) +{ + seq_printf(seq, "%lu", (long unsigned)get_seq_pseudo_host(seq)->i_gid); + return 0; +} + +static int get_gid(struct file *file, const char *buf) +{ + gid_t gid; + int result; + + if (sscanf(buf, "%i", &gid) == 1) { + struct inode *host; + + host = get_pseudo_host(file); + result = update_ugid(file->f_dentry->d_parent, host, -1, gid); + } else + result = RETERR(-EINVAL); + return result; +} + +static int show_oid(struct seq_file *seq, void *cookie) +{ + seq_printf(seq, "%llu", get_inode_oid(get_seq_pseudo_host(seq))); + return 0; +} + +static int show_key(struct seq_file *seq, void *cookie) +{ + char buf[KEY_BUF_LEN]; + reiser4_key key; + + sprintf_key(buf, build_sd_key(get_seq_pseudo_host(seq), &key)); + seq_printf(seq, "%s", buf); + return 0; +} + +static int show_size(struct seq_file *seq, void *cookie) +{ + seq_printf(seq, "%lli", get_seq_pseudo_host(seq)->i_size); + return 0; +} + +static int show_nlink(struct seq_file *seq, void *cookie) +{ + seq_printf(seq, "%u", get_seq_pseudo_host(seq)->i_nlink); + return 0; +} + +static int show_locality(struct seq_file *seq, void *cookie) +{ + seq_printf(seq, "%llu", + reiser4_inode_data(get_seq_pseudo_host(seq))->locality_id); + return 0; +} + +static int show_rwx(struct seq_file *seq, void *cookie) +{ + umode_t m; + + m = 
get_seq_pseudo_host(seq)->i_mode; + seq_printf(seq, "%#ho %c%c%c%c%c%c%c%c%c%c", + m, + + S_ISREG(m) ? '-' : + S_ISDIR(m) ? 'd' : + S_ISCHR(m) ? 'c' : + S_ISBLK(m) ? 'b' : + S_ISFIFO(m) ? 'p' : + S_ISLNK(m) ? 'l' : + S_ISSOCK(m) ? 's' : '?', + + m & S_IRUSR ? 'r' : '-', + m & S_IWUSR ? 'w' : '-', + m & S_IXUSR ? 'x' : '-', + + m & S_IRGRP ? 'r' : '-', + m & S_IWGRP ? 'w' : '-', + m & S_IXGRP ? 'x' : '-', + + m & S_IROTH ? 'r' : '-', + m & S_IWOTH ? 'w' : '-', + m & S_IXOTH ? 'x' : '-'); + return 0; +} + +static int get_rwx(struct file *file, const char *buf) +{ + umode_t rwx; + int result; + + if (sscanf(buf, "%hi", &rwx) == 1) { + struct inode *host; + + host = get_pseudo_host(file); + result = check_perm(host); + if (result == 0) { + struct iattr newattrs; + + down(&host->i_sem); + if (rwx == (umode_t) -1) /* "keep current mode" sentinel; rwx is umode_t, a wider (mode_t) -1 can never match */ + rwx = host->i_mode; + newattrs.ia_mode = + (rwx & S_IALLUGO) | (host->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + result = notify_change(file->f_dentry->d_parent, + &newattrs); + up(&host->i_sem); + } + } else + result = RETERR(-EINVAL); + return result; +} + +static unsigned long list_length(const struct list_head *head) +{ + struct list_head *scan; + unsigned long length; + + length = 0; + list_for_each(scan, head) + ++ length; + return length; +} + +static void * pseudos_start(struct seq_file *m, loff_t *pos) +{ + if (*pos >= LAST_PSEUDO_ID) + return NULL; + return pseudo_plugin_by_id(*pos); +} + +static void pseudos_stop(struct seq_file *m, void *v) +{ +} + +static void * pseudos_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++ (*pos); + return pseudos_start(m, pos); +} + +static int pseudos_show(struct seq_file *m, void *v) +{ + pseudo_plugin *pplug; + + pplug = v; + if (pplug->try != NULL) + seq_printf(m, "%s\n", pplug->h.label); + return 0; +} + +static void * bmap_start(struct seq_file *m, loff_t *pos) +{ + struct inode *host; + + host = get_seq_pseudo_host(m); + if (*pos << host->i_blkbits >= host->i_size) + return 
NULL; + else + return (void *)((unsigned long)*pos + 1); +} + +static void bmap_stop(struct seq_file *m, void *v) +{ +} + +static void * bmap_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++ (*pos); + return bmap_start(m, pos); +} + +extern int reiser4_lblock_to_blocknr(struct address_space *mapping, + sector_t lblock, reiser4_block_nr *blocknr); + + +static int bmap_show(struct seq_file *m, void *v) +{ + sector_t lblock; + int result; + reiser4_block_nr blocknr; + + lblock = ((sector_t)(unsigned long)v) - 1; + result = reiser4_lblock_to_blocknr(get_seq_pseudo_host(m)->i_mapping, + lblock, &blocknr); + if (result == 0) { + if (blocknr_is_fake(&blocknr)) + seq_printf(m, "%#llx\n", blocknr); + else + seq_printf(m, "%llu\n", blocknr); + } + return result; +} + +typedef struct readdir_cookie { + tap_t tap; + coord_t coord; + lock_handle lh; +} readdir_cookie; + +static int is_host_item(struct inode *host, coord_t *coord) +{ + if (item_type_by_coord(coord) != DIR_ENTRY_ITEM_TYPE) + return 0; + if (!inode_file_plugin(host)->owns_item(host, coord)) + return 0; + return 1; +} + +static void finish(readdir_cookie *c) +{ + if (c != NULL && !IS_ERR(c)) { + tap_done(&c->tap); + kfree(c); + } +} + +static void * readdir_start(struct seq_file *m, loff_t *pos) +{ + struct inode *host; + readdir_cookie *c; + dir_plugin *dplug; + reiser4_key dotkey; + struct qstr dotname; + int result; + loff_t entryno; + + host = get_seq_pseudo_host(m); + dplug = inode_dir_plugin(host); + + dotname.name = "."; + dotname.len = 1; + + down(&host->i_sem); + if (dplug == NULL) { + finish(NULL); + return NULL; + } + + dplug->build_entry_key(host, &dotname, &dotkey); + + c = kmalloc(sizeof *c, GFP_KERNEL); + if (c == NULL) { + finish(NULL); + return ERR_PTR(RETERR(-ENOMEM)); + } + + result = object_lookup(host, + &dotkey, + &c->coord, + &c->lh, + ZNODE_READ_LOCK, + FIND_EXACT, + LEAF_LEVEL, + LEAF_LEVEL, + CBK_READDIR_RA, + NULL); + + tap_init(&c->tap, &c->coord, &c->lh, ZNODE_READ_LOCK); + if 
(result == 0) + result = tap_load(&c->tap); { + if (result == 0) { + for (entryno = 0; entryno != *pos; ++ entryno) { + result = go_next_unit(&c->tap); + if (result == -E_NO_NEIGHBOR) { + finish(c); + return NULL; + } + if (result != 0) + break; + if (!is_host_item(host, c->tap.coord)) { + finish(c); + return NULL; + } + } + } + } + if (result != 0) { + finish(c); + return ERR_PTR(result); + } else + return c; +} + +static void readdir_stop(struct seq_file *m, void *v) +{ + up(&get_seq_pseudo_host(m)->i_sem); + finish(v); +} + +static void * readdir_next(struct seq_file *m, void *v, loff_t *pos) +{ + readdir_cookie *c; + struct inode *host; + int result; + + c = v; + ++ (*pos); + host = get_seq_pseudo_host(m); + result = go_next_unit(&c->tap); + if (result == 0) { + if (!is_host_item(host, c->tap.coord)) { + finish(c); + return NULL; + } else + return v; + } else { + finish(c); + return ERR_PTR(result); + } +} + +static int readdir_show(struct seq_file *m, void *v) +{ + readdir_cookie *c; + item_plugin *iplug; + char *name; + char buf[DE_NAME_BUF_LEN]; + + c = v; + iplug = item_plugin_by_coord(&c->coord); + + name = iplug->s.dir.extract_name(&c->coord, buf); + assert("nikita-3221", name != NULL); + seq_printf(m, "%s/", name); + return 0; +} + +typedef struct plugin_entry { + const char *name; + int offset; +} plugin_entry; + +#define PLUGIN_ENTRY(field) \ +{ \ + .name = #field, \ + .offset = offsetof(plugin_set, field) \ +} + +#define PSEUDO_ARRAY_ENTRY(idx, aname) \ +[idx] = { \ + .name = aname, \ + .offset = idx \ +} + +static plugin_entry pentry[] = { + PLUGIN_ENTRY(file), + PLUGIN_ENTRY(dir), + PLUGIN_ENTRY(perm), + PLUGIN_ENTRY(formatting), + PLUGIN_ENTRY(hash), + PLUGIN_ENTRY(sd), + PLUGIN_ENTRY(dir_item), + PLUGIN_ENTRY(crypto), + PLUGIN_ENTRY(digest), + PLUGIN_ENTRY(compression), + { + .name = NULL, + .offset = 0 + } +}; + +typedef enum { + PFIELD_TYPEID, + PFIELD_ID, + PFIELD_LABEL, + PFIELD_DESC +} plugin_field; + +static plugin_entry fentry[] = { + 
PSEUDO_ARRAY_ENTRY(PFIELD_TYPEID, "type_id"), + PSEUDO_ARRAY_ENTRY(PFIELD_ID, "id"), + PSEUDO_ARRAY_ENTRY(PFIELD_LABEL, "label"), + PSEUDO_ARRAY_ENTRY(PFIELD_DESC, "desc"), + { + .name = NULL, + .offset = 0 + }, +}; + +static int show_plugin(struct seq_file *seq, void *cookie) +{ + struct inode *host; + struct file *file; + struct inode *inode; + reiser4_plugin *plug; + plugin_entry *entry; + int idx; + plugin_set *pset; + + file = seq->private; + inode = file->f_dentry->d_inode; + + /* foo is grandparent of foo/..plugin/file */ + host = get_inode_host(get_inode_host(inode)); + idx = reiser4_inode_data(inode)->file_plugin_data.pseudo_info.datum; + entry = &pentry[idx]; + pset = reiser4_inode_data(host)->pset; + plug = *(reiser4_plugin **)(((char *)pset) + entry->offset); + + if (plug != NULL) + seq_printf(seq, "%i %s %s", + plug->h.id, plug->h.label, plug->h.desc); + return 0; +} + +static int array_lookup_pseudo(struct inode *parent, struct dentry ** dentry, + plugin_entry *array, pseudo_plugin *pplug) +{ + int result; + int idx; + struct inode *pseudo; + + pseudo = ERR_PTR(-ENOENT); + for (idx = 0; array[idx].name != NULL; ++ idx) { + if (!strcmp((*dentry)->d_name.name, array[idx].name)) { + pseudo = add_pseudo(parent, pplug, dentry); + break; + } + } + if (IS_ERR(pseudo)) + result = PTR_ERR(pseudo); + else { + result = 0; + pseudo_set_datum(pseudo, idx); + } + return result; +} + +static int array_readdir_pseudo(struct file *f, void *dirent, filldir_t filld, + plugin_entry *array, int size) +{ + loff_t off; + ino_t ino; + + off = f->f_pos; + if (off < 0) + return 0; + + /* for god's sake, why switch(loff_t) requires __cmpdi2? 
*/ + switch ((int)off) { + case 0: + ino = f->f_dentry->d_inode->i_ino; + if (filld(dirent, ".", 1, off, ino, DT_DIR) < 0) + break; + ++ off; + /* fallthrough */ + case 1: + ino = parent_ino(f->f_dentry); + if (filld(dirent, "..", 2, off, ino, DT_DIR) < 0) + break; + ++ off; + /* fallthrough */ + default: + for (; off < size + 1; ++ off) { + const char *name; + + name = array[off - 2].name; + if (filld(dirent, name, strlen(name), + off, off + (long)f, DT_REG) < 0) + break; + } + } + f->f_pos = off; + return 0; +} + + +static int lookup_plugin_field(struct inode *parent, struct dentry ** dentry) +{ + return array_lookup_pseudo(parent, dentry, fentry, + pseudo_plugin_by_id(PSEUDO_PLUGIN_FIELD_ID)); +} + +static int show_plugin_field(struct seq_file *seq, void *cookie) +{ + struct inode *host; + struct inode *parent; + struct file *file; + struct inode *inode; + reiser4_plugin *plug; + plugin_entry *entry; + int pidx; + int idx; + plugin_set *pset; + + file = seq->private; + inode = file->f_dentry->d_inode; + + parent = get_inode_host(inode); + /* foo is grand-grand-parent of foo/..plugin/hash/id */ + host = get_inode_host(get_inode_host(parent)); + pidx = reiser4_inode_data(parent)->file_plugin_data.pseudo_info.datum; + idx = reiser4_inode_data(inode)->file_plugin_data.pseudo_info.datum; + entry = &pentry[pidx]; + pset = reiser4_inode_data(host)->pset; + plug = *(reiser4_plugin **)(((char *)pset) + entry->offset); + + if (plug != NULL) { + switch (idx) { + case PFIELD_TYPEID: + seq_printf(seq, "%i", plug->h.type_id); + break; + case PFIELD_ID: + seq_printf(seq, "%i", plug->h.id); + break; + case PFIELD_LABEL: + seq_printf(seq, "%s", plug->h.label); + break; + case PFIELD_DESC: + seq_printf(seq, "%s", plug->h.desc); + break; + } + } + + return 0; +} + +static int readdir_plugin_field(struct file *f, void *dirent, filldir_t filld) +{ + return array_readdir_pseudo(f, dirent, filld, + fentry, sizeof_array(fentry)); +} + +static int lookup_plugins(struct inode *parent, 
struct dentry ** dentry) +{ + return array_lookup_pseudo(parent, dentry, pentry, + pseudo_plugin_by_id(PSEUDO_PLUGIN_ID)); +} + +static int readdir_plugins(struct file *f, void *dirent, filldir_t filld) +{ + return array_readdir_pseudo(f, dirent, filld, + pentry, sizeof_array(pentry)); +} + +typedef enum { + PAGECACHE_NRPAGES, + PAGECACHE_CLEAN, + PAGECACHE_DIRTY, + PAGECACHE_LOCKED, + PAGECACHE_IO +} pagecache_stat; + +static plugin_entry pagecache_entry[] = { + PSEUDO_ARRAY_ENTRY(PAGECACHE_NRPAGES, "nrpages"), + PSEUDO_ARRAY_ENTRY(PAGECACHE_CLEAN, "clean"), + PSEUDO_ARRAY_ENTRY(PAGECACHE_DIRTY, "dirty"), + PSEUDO_ARRAY_ENTRY(PAGECACHE_LOCKED, "locked"), + PSEUDO_ARRAY_ENTRY(PAGECACHE_IO, "io"), + { + .name = NULL, + .offset = 0 + }, +}; + +static int show_pagecache(struct seq_file *seq, void *cookie) +{ + struct inode *host; + struct address_space *as; + + unsigned long nrpages; + unsigned long clean; + unsigned long dirty; + unsigned long locked; + unsigned long io; + + host = get_seq_pseudo_host(seq); + + as = host->i_mapping; + spin_lock(&as->page_lock); + nrpages = as->nrpages; + clean = list_length(&as->clean_pages); + dirty = list_length(&as->dirty_pages); + locked = list_length(&as->locked_pages); + io = list_length(&as->io_pages); + spin_unlock(&as->page_lock); + + seq_printf(seq, "%lu %lu %lu %lu %lu", + nrpages, clean, dirty, locked, io); + return 0; +} + +static int readdir_pagecache(struct file *f, void *dirent, filldir_t filld) +{ + return array_readdir_pseudo(f, dirent, filld, + pagecache_entry, + sizeof_array(pagecache_entry)); +} + +static int lookup_pagecache(struct inode *parent, struct dentry ** dentry) +{ + return array_lookup_pseudo(parent, dentry, pagecache_entry, + pseudo_plugin_by_id(PSEUDO_PAGECACHE_STAT_ID)); +} + +static int show_pagecache_stat(struct seq_file *seq, void *cookie) +{ + struct inode *host; + struct file *file; + struct inode *inode; + int idx; + + struct address_space *as; + + file = seq->private; + inode = 
file->f_dentry->d_inode; + + /* foo is grand-parent of foo/..pagecache/dirty */ + host = get_inode_host(get_inode_host(inode)); + idx = reiser4_inode_data(inode)->file_plugin_data.pseudo_info.datum; + as = host->i_mapping; + spin_lock(&as->page_lock); + switch (idx) { + case PAGECACHE_NRPAGES: + seq_printf(seq, "%lu", as->nrpages); + break; + case PAGECACHE_CLEAN: + seq_printf(seq, "%lu", list_length(&as->clean_pages)); + break; + case PAGECACHE_DIRTY: + seq_printf(seq, "%lu", list_length(&as->dirty_pages)); + break; + case PAGECACHE_LOCKED: + seq_printf(seq, "%lu", list_length(&as->locked_pages)); + break; + case PAGECACHE_IO: + seq_printf(seq, "%lu", list_length(&as->io_pages)); + break; + } + spin_unlock(&as->page_lock); + + return 0; +} + +static void * items_start(struct seq_file *m, loff_t *pos) +{ + struct inode *host; + readdir_cookie *c; + file_plugin *fplug; + reiser4_key headkey; + int result; + loff_t entryno; + + host = get_seq_pseudo_host(m); + fplug = inode_file_plugin(host); + + down(&host->i_sem); + if (fplug->key_by_inode == NULL) { + finish(NULL); + return NULL; + } + + fplug->key_by_inode(host, 0, &headkey); + + c = kmalloc(sizeof *c, GFP_KERNEL); + if (c == NULL) { + finish(NULL); + return ERR_PTR(RETERR(-ENOMEM)); + } + + result = object_lookup(host, + &headkey, + &c->coord, + &c->lh, + ZNODE_READ_LOCK, + FIND_MAX_NOT_MORE_THAN, + TWIG_LEVEL, + LEAF_LEVEL, + 0, + NULL); + + tap_init(&c->tap, &c->coord, &c->lh, ZNODE_READ_LOCK); + if (result == 0) + result = tap_load(&c->tap); { + if (result == 0) { + for (entryno = 0; entryno != *pos; ++ entryno) { + result = go_next_unit(&c->tap); + if (result == -E_NO_NEIGHBOR) { + finish(c); + return NULL; + } + if (result != 0) + break; + if (!fplug->owns_item(host, c->tap.coord)) { + finish(c); + return NULL; + } + } + } + } + if (result != 0) { + finish(c); + return ERR_PTR(result); + } else + return c; +} + +static void items_stop(struct seq_file *m, void *v) +{ + up(&get_seq_pseudo_host(m)->i_sem); + 
finish(v); +} + +static void * items_next(struct seq_file *m, void *v, loff_t *pos) +{ + readdir_cookie *c; + struct inode *host; + int result; + + c = v; + ++ (*pos); + host = get_seq_pseudo_host(m); + result = go_next_unit(&c->tap); + if (result == 0) { + if (!inode_file_plugin(host)->owns_item(host, c->tap.coord)) { + finish(c); + return NULL; + } else + return v; + } else { + finish(c); + return ERR_PTR(result); + } +} + +static int items_show(struct seq_file *m, void *v) +{ + readdir_cookie *c; + item_plugin *iplug; + char buf[KEY_BUF_LEN]; + reiser4_key key; + + + c = v; + iplug = item_plugin_by_coord(&c->coord); + + sprintf_key(buf, unit_key_by_coord(&c->coord, &key)); + seq_printf(m, "%s %s ", buf, iplug->h.label); + if (iplug->b.show != NULL) + iplug->b.show(m, &c->coord); + seq_printf(m, "\n"); + return 0; +} + +static int get_new(struct file *file, const char *buf) +{ + int result; + + if (strchr(buf, '/') == NULL) { + result = RETERR(-ENOSYS); + } else + result = RETERR(-EINVAL); + return result; +} + +pseudo_plugin pseudo_plugins[LAST_PSEUDO_ID] = { + [PSEUDO_UID_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_UID_ID, + .pops = NULL, + .label = "..uid", + .desc = "returns owner", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IRUGO | S_IWUSR, + .read_type = PSEUDO_READ_SINGLE, + .read = { + .single_show = show_uid + }, + .write_type = PSEUDO_WRITE_STRING, + .write = { + .gets = get_uid + } + }, + [PSEUDO_GID_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_GID_ID, + .pops = NULL, + .label = "..gid", + .desc = "returns group", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IRUGO | S_IWUSR, + .read_type = PSEUDO_READ_SINGLE, + .read = { + .single_show = show_gid + }, + .write_type = PSEUDO_WRITE_STRING, + .write = { + .gets = get_gid + } + }, + [PSEUDO_RWX_ID] = { + .h = { + 
.type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_RWX_ID, + .pops = NULL, + .label = "..rwx", + .desc = "returns rwx permissions", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IRUGO | S_IWUSR, + .read_type = PSEUDO_READ_SINGLE, + .read = { + .single_show = show_rwx + }, + .write_type = PSEUDO_WRITE_STRING, + .write = { + .gets = get_rwx + } + }, + [PSEUDO_OID_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_OID_ID, + .pops = NULL, + .label = "..oid", + .desc = "returns object id", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IRUGO, + .read_type = PSEUDO_READ_SINGLE, + .read = { + .single_show = show_oid + }, + .write_type = PSEUDO_WRITE_NONE + }, + [PSEUDO_KEY_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_KEY_ID, + .pops = NULL, + .label = "..key", + .desc = "returns object's key", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IRUGO, + .read_type = PSEUDO_READ_SINGLE, + .read = { + .single_show = show_key + }, + .write_type = PSEUDO_WRITE_NONE + }, + [PSEUDO_SIZE_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_SIZE_ID, + .pops = NULL, + .label = "..size", + .desc = "returns object's size", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IRUGO, + .read_type = PSEUDO_READ_SINGLE, + .read = { + .single_show = show_size + }, + .write_type = PSEUDO_WRITE_NONE + }, + [PSEUDO_NLINK_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_NLINK_ID, + .pops = NULL, + .label = "..nlink", + .desc = "returns nlink count", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IRUGO, + .read_type = PSEUDO_READ_SINGLE, + .read = { + .single_show = show_nlink + }, + 
.write_type = PSEUDO_WRITE_NONE + }, + [PSEUDO_LOCALITY_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_LOCALITY_ID, + .pops = NULL, + .label = "..locality", + .desc = "returns object's locality", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IRUGO, + .read_type = PSEUDO_READ_SINGLE, + .read = { + .single_show = show_locality + }, + .write_type = PSEUDO_WRITE_NONE + }, + [PSEUDO_PAGECACHE_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_PAGECACHE_ID, + .pops = NULL, + .label = "..pagecache", + .desc = "returns page cache stats", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = lookup_pagecache, + .lookup_mode = S_IFREG | S_IRUGO | S_IXUGO, + .read_type = PSEUDO_READ_SINGLE, + .read = { + .single_show = show_pagecache + }, + .write_type = PSEUDO_WRITE_NONE, + .readdir = readdir_pagecache + }, + [PSEUDO_PAGECACHE_STAT_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_PAGECACHE_STAT_ID, + .pops = NULL, + .label = "pagecache stat", + .desc = "pagecache stat", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = NULL, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IRUGO, + .read_type = PSEUDO_READ_SINGLE, + .read = { + .single_show = show_pagecache_stat + }, + .write_type = PSEUDO_WRITE_NONE + }, + [PSEUDO_PSEUDOS_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_PSEUDOS_ID, + .pops = NULL, + .label = "..pseudo", + .desc = "returns a list of pseudo files", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IRUGO, + .read_type = PSEUDO_READ_SEQ, + .read = { + .ops = { + .start = pseudos_start, + .stop = pseudos_stop, + .next = pseudos_next, + .show = pseudos_show + } + }, + .write_type = PSEUDO_WRITE_NONE + }, + [PSEUDO_BMAP_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_BMAP_ID, + .pops = NULL, + 
.label = "..bmap", + .desc = "returns a list blocks for this file", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IRUGO, + .read_type = PSEUDO_READ_SEQ, + .read = { + .ops = { + .start = bmap_start, + .stop = bmap_stop, + .next = bmap_next, + .show = bmap_show + } + }, + .write_type = PSEUDO_WRITE_NONE + }, + [PSEUDO_READDIR_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_READDIR_ID, + .pops = NULL, + .label = "..readdir", + .desc = "returns a list of names in the dir", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IRUGO, + .read_type = PSEUDO_READ_SEQ, + .read = { + .ops = { + .start = readdir_start, + .stop = readdir_stop, + .next = readdir_next, + .show = readdir_show + } + }, + .write_type = PSEUDO_WRITE_NONE + }, + [PSEUDO_PLUGIN_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_PLUGIN_ID, + .pops = NULL, + .label = "plugin", + .desc = "plugin", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = NULL, + .lookup = lookup_plugin_field, + .lookup_mode = S_IFREG | S_IRUGO | S_IXUGO, + .read_type = PSEUDO_READ_SINGLE, + .read = { + .single_show = show_plugin + }, + .write_type = PSEUDO_WRITE_NONE, + .readdir = readdir_plugin_field + }, + [PSEUDO_PLUGINS_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_PLUGINS_ID, + .pops = NULL, + .label = "..plugin", + .desc = "list of plugins", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = lookup_plugins, + .lookup_mode = S_IFREG | S_IRUGO | S_IXUGO, + .read_type = PSEUDO_READ_NONE, + .write_type = PSEUDO_WRITE_NONE, + .readdir = readdir_plugins + }, + [PSEUDO_PLUGIN_FIELD_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_PLUGIN_FIELD_ID, /* ->h.id mirrors the array slot, as in every other entry */ + .pops = NULL, + .label = "plugin-field", + .desc = "plugin field", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = NULL, + .lookup = 
NULL, + .lookup_mode = S_IFREG | S_IRUGO, + .read_type = PSEUDO_READ_SINGLE, + .read = { + .single_show = show_plugin_field + }, + .write_type = PSEUDO_WRITE_NONE, + .readdir = NULL + }, + [PSEUDO_ITEMS_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_ITEMS_ID, + .pops = NULL, + .label = "..items", + .desc = "returns a list of items for this file", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IRUGO, + .read_type = PSEUDO_READ_SEQ, + .read = { + .ops = { + .start = items_start, + .stop = items_stop, + .next = items_next, + .show = items_show + } + }, + .write_type = PSEUDO_WRITE_NONE + }, + [PSEUDO_NEW_ID] = { + .h = { + .type_id = REISER4_PSEUDO_PLUGIN_TYPE, + .id = PSEUDO_NEW_ID, + .pops = NULL, + .label = "..new", + .desc = "creates new file in the host", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .try = try_by_label, + .lookup = NULL, + .lookup_mode = S_IFREG | S_IWUSR, + .read_type = PSEUDO_READ_NONE, + .read = { + .single_show = show_rwx + }, + .write_type = PSEUDO_WRITE_STRING, + .write = { + .gets = get_new + } + }, +}; + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/pseudo/pseudo.h linux-2.6.4-ck1/fs/reiser4/plugin/pseudo/pseudo.h --- linux-2.6.4/fs/reiser4/plugin/pseudo/pseudo.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/pseudo/pseudo.h 2004-03-11 22:45:15.362498858 +1100 @@ -0,0 +1,126 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by + * reiser4/README */ + +/* Handling of "pseudo" files representing unified access to meta data in + reiser4. See pseudo.c for more comments. 
*/ + +#if !defined( __REISER4_PSEUDO_H__ ) +#define __REISER4_PSEUDO_H__ + +#include "../plugin_header.h" +#include "../../key.h" + +#include +#include + +typedef enum { + PSEUDO_READ_NONE, + PSEUDO_READ_SEQ, + PSEUDO_READ_SINGLE, + PSEUDO_READ_FORWARD +} pseudo_read_type; + +typedef enum { + PSEUDO_WRITE_NONE, + PSEUDO_WRITE_STRING, + PSEUDO_WRITE_FORWARD +} pseudo_write_type; + +/* low level operations on the pseudo files. + + Methods from this interface are directly callable by reiser4 system call. + + This operation structure looks suspiciously like yet another plugin + type. Doing so would simplify some things. For example, there are already + functions to look up plugin by name, dynamic loading is planned, etc. + +*/ +struct pseudo_plugin; +typedef struct pseudo_plugin pseudo_plugin; +struct pseudo_plugin { + plugin_header h; + + int (*try) (pseudo_plugin *pplug, + const struct inode *parent, const char *name); + /* lookup method applicable to this pseudo file by method name. + + This is for something like "foo/..acl/dup", here "../acl" is the + name of a pseudo file, and "dup" is name of an operation (method) + applicable to "../acl". Once "..acl" is resolved to ACL object, + ->lookup( "dup" ) can be called to get operation. + + */ + int (*lookup)(struct inode *parent, struct dentry ** dentry); + + oid_t (*makeid)(void); + + umode_t lookup_mode; + + /* NOTE-NIKITA some other operations. Reiser4 syntax people should + add something here. 
*/ + + pseudo_read_type read_type; + union { + struct seq_operations ops; + int (*single_show) (struct seq_file *, void *); + ssize_t (*read)(struct file *, char __user *, size_t , loff_t *); + + } read; + + pseudo_write_type write_type; + union { + int (*gets)(struct file *, const char *); + ssize_t (*write)(struct file *, + const char __user *, size_t , loff_t *); + } write; + int (*readdir)(struct file *f, void *dirent, filldir_t filld); +}; + +/* portion of reiser4_inode specific for pseudo files */ +typedef struct pseudo_info { + /* pseudo file plugin controlling this file */ + pseudo_plugin *plugin; + /* host object, for /etc/passwd/..oid, this is pointer to inode of + * /etc/passwd */ + struct inode *host; + /* for private use of pseudo file plugin */ + unsigned long datum; +} pseudo_info_t; + +extern int lookup_pseudo_file(struct inode *parent, struct dentry **dentry); + +typedef enum { + PSEUDO_UID_ID, + PSEUDO_GID_ID, + PSEUDO_RWX_ID, + PSEUDO_OID_ID, + PSEUDO_KEY_ID, + PSEUDO_SIZE_ID, + PSEUDO_NLINK_ID, + PSEUDO_LOCALITY_ID, + PSEUDO_PAGECACHE_ID, + PSEUDO_PAGECACHE_STAT_ID, + PSEUDO_PSEUDOS_ID, + PSEUDO_BMAP_ID, + PSEUDO_READDIR_ID, + PSEUDO_PLUGIN_ID, + PSEUDO_PLUGINS_ID, + PSEUDO_PLUGIN_FIELD_ID, + PSEUDO_ITEMS_ID, + PSEUDO_NEW_ID, + LAST_PSEUDO_ID +} reiser4_pseudo_id; + +/* __REISER4_PSEUDO_H__ */ +#endif + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + scroll-step: 1 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/security/acl.c linux-2.6.4-ck1/fs/reiser4/plugin/security/acl.c --- linux-2.6.4/fs/reiser4/plugin/security/acl.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/security/acl.c 2004-03-11 22:45:15.363498702 +1100 @@ -0,0 +1,554 @@ +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "acl.h" + +#include "../../debug.h" +#include "../../dformat.h" +#include "../../inode.h" + +#include +#include + +static int +check_write(struct inode *inode, umode_t mode, int mask) +{ + int result; + + result = 0; + if (mask & MAY_WRITE) { + /* + * Nobody gets write access to a read-only fs. + */ + if (IS_RDONLY(inode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + result = -EROFS; + /* + * Nobody gets write access to an immutable file. + */ + else if (IS_IMMUTABLE(inode)) + result = -EACCES; + } + return result; +} + +static int +check_capabilities(umode_t mode, int mask) +{ + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable if at least one exec bit is set. + */ + /* + * Adjusted to match POSIX 1003.1e draft 17 more thoroughly. See + * http://marc.theaimsgroup.com/?l=linux-kernel&m=107253552623787&w=2 + */ + if (!(mask & MAY_EXEC) || (mode & S_IXUGO) || S_ISDIR(mode)) + if (capable(CAP_DAC_OVERRIDE)) + return 0; + /* + * Searching includes executable on directories, else just read. + */ + if (mask == MAY_READ || (S_ISDIR(mode) && !(mask & MAY_WRITE))) + if (capable(CAP_DAC_READ_SEARCH)) + return 0; + + return -EACCES; +} + +static int +check_mode(umode_t mode, int mask) +{ + /* + * If the DACs are ok we don't need any capability check. 
+	 */
+	if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask))
+		return 0;
+	else
+		return check_capabilities(mode, mask);
+}
+
+/*
+ * Check @mask against the group permission bits of @mode when the current
+ * task is a member of the inode's group (in_group_p()), and against the low
+ * three ("other") bits of @mode as passed in otherwise.
+ */
+static int
+check_group(struct inode *inode, umode_t mode, int mask)
+{
+	if (in_group_p(inode->i_gid))
+		mode >>= 3;
+	return check_mode(mode, mask);
+}
+
+/* return access ACL attached to @inode, or NULL if there is none */
+static struct posix_acl *
+inode_get_acl(struct inode *inode)
+{
+	return reiser4_inode_data(inode)->perm_plugin_data.acl_perm_info.access;
+}
+
+/*
+ * return default ACL attached to @inode, or NULL if there is none.
+ *
+ * This used to return the ->access pointer. Because of that
+ * inode_set_default_acl() released the *access* ACL (leaving a dangling
+ * pointer behind in ->access) every time default ACL was replaced, and
+ * get_acl(ACL_TYPE_DEFAULT) handed out access ACL.
+ */
+static struct posix_acl *
+inode_get_default_acl(struct inode *inode)
+{
+	return reiser4_inode_data(inode)->perm_plugin_data.acl_perm_info.dfault;
+}
+
+/*
+ * Install @acl (may be NULL) as access ACL of @inode. Previously installed
+ * ACL, if any, is released. @acl pointer is stored as is, no additional
+ * reference is taken here.
+ */
+static void
+inode_set_acl(struct inode *inode, struct posix_acl *acl)
+{
+	struct posix_acl *old;
+
+	old = inode_get_acl(inode);
+	if (old != NULL)
+		posix_acl_release(old);
+	reiser4_inode_data(inode)->perm_plugin_data.acl_perm_info.access = acl;
+}
+
+/*
+ * Install @acl (may be NULL) as default ACL of @inode. Previously installed
+ * default ACL, if any, is released. @acl pointer is stored as is, no
+ * additional reference is taken here.
+ */
+static void
+inode_set_default_acl(struct inode *inode, struct posix_acl *acl)
+{
+	struct posix_acl *old;
+
+	old = inode_get_default_acl(inode);
+	if (old != NULL)
+		posix_acl_release(old);
+	reiser4_inode_data(inode)->perm_plugin_data.acl_perm_info.dfault = acl;
+}
+
+/*
+ * return ACL of @inode selected by @type (ACL_TYPE_ACCESS or
+ * ACL_TYPE_DEFAULT). Any other @type is reported through
+ * wrong_return_value() and NULL is returned.
+ */
+static struct posix_acl *
+get_acl(struct inode *inode, int type)
+{
+	switch(type) {
+	case ACL_TYPE_ACCESS:
+		return inode_get_acl(inode);
+	case ACL_TYPE_DEFAULT:
+		return inode_get_default_acl(inode);
+	default:
+		wrong_return_value("nikita-3444", "type");
+	}
+	return NULL;
+}
+
+/*
+ * ->mask_ok() method of ACL perm plugin: check whether current task may
+ * access @inode in the way described by @mask (MAY_READ|MAY_WRITE|MAY_EXEC).
+ *
+ * Order of checks: check_write() first; then owner bits when ->fsuid matches
+ * ->i_uid; otherwise group/other bits, the result of which is *replaced* by
+ * posix_acl_permission() when inode has an access ACL, with
+ * check_capabilities() as the last resort on -EACCES.
+ *
+ * NOTE(review): on owner and group paths check_mode() receives mode already
+ * shifted right (by 6 or 3) and forwards that value to check_capabilities(),
+ * which applies S_ISDIR() and S_IXUGO to it -- confirm this is intended, the
+ * ACL path passes unshifted mode.
+ */
+reiser4_internal int
+mask_ok_acl(struct inode *inode, int mask)
+{
+	umode_t mode;
+	struct posix_acl *acl;
+	int result;
+
+	mode = inode->i_mode;
+
+	result = check_write(inode, mode, mask);
+	if (result == 0) {
+		if (current->fsuid == inode->i_uid)
+			result = check_mode(mode >> 6, mask);
+		else {
+			result = check_group(inode, mode, mask);
+			acl = inode_get_acl(inode);
+			if (acl != NULL) {
+				result = posix_acl_permission(inode, acl, mask);
+				if (result == -EACCES)
+					result = check_capabilities(mode, mask);
+			}
+		}
+	}
+	return result;
+}
+
+/* one ACL entry in the form written by save_acl() and parsed by read_ace() */
+typedef struct {
+	d16 tag;
+	d16 perm;
+
d32 id; +} reiser4_acl_entry; + +typedef struct { + d16 count; +} reiser4_acl_header; + + +static void +move_on(int *length, char **area, int size_of) +{ + *length -= size_of; + *area += size_of; + assert("nikita-617", *length >= 0); +} + +static int +read_ace(struct posix_acl *acl, int no, char **area, int *len) +{ + reiser4_acl_entry *ace; + + ace = (reiser4_acl_entry *)*area; + if (*len < sizeof *ace) + return -EIO; + + acl->a_entries[no].e_tag = d16tocpu(&ace->tag); + acl->a_entries[no].e_perm = d16tocpu(&ace->perm); + switch(acl->a_entries[no].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + acl->a_entries[no].e_id = ACL_UNDEFINED_ID; + break; + case ACL_USER: + case ACL_GROUP: + acl->a_entries[no].e_id = d32tocpu(&ace->id); + break; + default: + return RETERR(-EIO); + } + move_on(len, area, sizeof *ace); + return 0; +} + +static reiser4_xattr_plugin xattr_acl_handlers[]; +static reiser4_xattr_plugin xattr_acl_trigger_handlers[]; + +static xattr_namespace acl_trigger_namespace = { + .linkage = TYPE_SAFE_LIST_HEAD_INIT(acl_trigger_namespace.linkage), + .plug = xattr_acl_trigger_handlers +}; + +static int init_acl(reiser4_plugin *plugin) +{ + xattr_add_common_namespace(&acl_trigger_namespace); + return 0; +} + +/* this is called by ->present method of static_stat_data plugin when plugin + * extension is present that contains ACL plugin. 
*/
+/*
+ * Parse serialized ACL at *@area (at most *@len bytes): a reiser4_acl_header
+ * followed by header->count reiser4_acl_entry records, and install the
+ * result as access ACL of @inode through reiser4_set_acl(). *@area and *@len
+ * are advanced past the consumed bytes. @plugin is not used.
+ *
+ * Returns 0 on success, -EIO on truncated or malformed data, -ENOMEM if ACL
+ * cannot be allocated, or whatever reiser4_set_acl() returned. On any failure
+ * after allocation access ACL of @inode is reset to NULL.
+ */
+static int
+load_acl(struct inode * inode, reiser4_plugin * plugin, char **area, int *len)
+{
+	reiser4_acl_header *head;
+	int count;
+	struct posix_acl *acl;
+	int result;
+	int i;
+
+	head = (reiser4_acl_header *)*area;
+	/* compare as signed ints: with the sizeof operand left as size_t a
+	 * negative *len would be converted to a huge unsigned value and
+	 * slip through the check */
+	if (*len < (int) sizeof *head)
+		return RETERR(-EIO);
+	count = d16tocpu(&head->count);
+	move_on(len, area, sizeof *head);
+
+	acl = posix_acl_alloc(count, GFP_KERNEL);
+	if (acl == NULL)
+		return RETERR(-ENOMEM);
+
+	for (i = 0, result = 0; i < count && result == 0; ++ i)
+		result = read_ace(acl, i, area, len);
+	if (result == 0)
+		result = reiser4_set_acl(inode, ACL_TYPE_ACCESS, acl);
+	else
+		/* parsing failed, @acl was never handed over to anybody:
+		 * drop it here, it used to be leaked */
+		posix_acl_release(acl);
+	if (result != 0)
+		inode_set_acl(inode, NULL);
+	return result;
+}
+
+/*
+ * Number of bytes save_acl() will write for @inode: header plus one entry
+ * record per ACL entry, 0 if inode has no access ACL. @plugin is not used.
+ */
+static int
+save_len_acl(struct inode * inode, reiser4_plugin * plugin)
+{
+	struct posix_acl *acl;
+
+	acl = inode_get_acl(inode);
+	if (acl != NULL)
+		return
+			sizeof(reiser4_acl_header) +
+			acl->a_count * sizeof(reiser4_acl_entry);
+	else
+		return 0;
+}
+
+/*
+ * Serialize access ACL of @inode at *@area, advancing *@area past the bytes
+ * written. Nothing is written when inode has no access ACL. No bound on the
+ * output is checked here; save_len_acl() reports the size that is going to
+ * be used. Always returns 0. @plugin is not used.
+ */
+static int
+save_acl(struct inode * inode, reiser4_plugin * plugin, char **area)
+{
+	struct posix_acl *acl;
+
+	acl = inode_get_acl(inode);
+	if (acl != NULL) {
+		reiser4_acl_header *head;
+		int i;
+
+		head = (reiser4_acl_header *)*area;
+		cputod16(acl->a_count, &head->count);
+		*area += sizeof *head;
+		for (i = 0; i < acl->a_count; ++i) {
+			reiser4_acl_entry *ace;
+
+			ace = (reiser4_acl_entry *)*area;
+			cputod32(acl->a_entries[i].e_id, &ace->id);
+			cputod16(acl->a_entries[i].e_tag, &ace->tag);
+			cputod16(acl->a_entries[i].e_perm, &ace->perm);
+			*area += sizeof *ace;
+		}
+	}
+	return 0;
+}
+
+/*
+ * ->change() method: switch @inode to ACL perm plugin @plugin. If inode has
+ * no perm plugin, has a perm plugin other than ACL_PERM_ID, or has no access
+ * ACL, reiser4_set_acl() is called with NULL ACL and, on success, @plugin is
+ * recorded in inode's plugin set; -EOPNOTSUPP from reiser4_set_acl() is
+ * treated as success. Otherwise nothing is done.
+ */
+static int
+change_acl(struct inode * inode, reiser4_plugin * plugin)
+{
+	int result;
+
+	if (inode_perm_plugin(inode) == NULL ||
+	    inode_perm_plugin(inode)->h.id != ACL_PERM_ID ||
+	    inode_get_acl(inode) == NULL) {
+		result = reiser4_set_acl(inode, ACL_TYPE_ACCESS, NULL);
+		if (result == 0)
+			plugin_set_perm(&reiser4_inode_data(inode)->pset,
+					&plugin->perm);
+		else if (result == -EOPNOTSUPP)
+			result = 0;
+	} else
+		result = 0;
+	return
result; +} + +void +clear_acl(struct inode *inode) +{ + inode_set_acl(inode, NULL); +} + +reiser4_plugin_ops acl_plugin_ops = { + .init = init_acl, + .load = load_acl, + .save_len = save_len_acl, + .save = save_acl, + .change = change_acl +}; + +static int +set_acl_plugin(struct inode *inode) +{ + int result; + + if (inode_perm_plugin(inode) == NULL || + inode_perm_plugin(inode)->h.id != ACL_PERM_ID) { + reiser4_inode *info; + perm_plugin *acl_plug; + + info = reiser4_inode_data(inode); + acl_plug = perm_plugin_by_id(ACL_PERM_ID); + result = plugin_set_perm(&info->pset, acl_plug); + if (result == 0) { + result = xattr_add_namespace(inode, xattr_acl_handlers); + if (result == 0) + inode_set_plugin(inode, + perm_plugin_to_plugin(acl_plug)); + } + } else + result = 0; + return result; +} + +int +reiser4_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + int result; + + result = 0; + if (!S_ISLNK(inode->i_mode)) { + switch(type) { + case ACL_TYPE_ACCESS: + if (acl != NULL) { + mode_t mode; + + mode = inode->i_mode; + result = posix_acl_equiv_mode(acl, &mode); + if (result >= 0) { + inode->i_mode = mode; + if (result == 0) + acl = NULL; + } else + return result; + } + inode_set_acl(inode, acl); + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? 
RETERR(-EACCES) : 0; + inode_set_default_acl(inode, acl); + break; + default: + return RETERR(-EINVAL); + } + result = set_acl_plugin(inode); + } else + result = RETERR(-EOPNOTSUPP); + return result; +} + +/* below is an interface to the xattr API */ + +static int +xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +{ + struct posix_acl *acl; + int result; + + acl = get_acl(inode, type); + if (!IS_ERR(acl)) { + if (acl != NULL) + result = posix_acl_to_xattr(acl, buffer, size); + else + result = RETERR(-ENODATA); + } else + result = RETERR(PTR_ERR(acl)); + return result; +} + +static int +xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) +{ + struct posix_acl *acl; + int result; + + result = 0; + if (current->fsuid == inode->i_uid || capable(CAP_FOWNER)) { + if (value != NULL) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return RETERR(PTR_ERR(acl)); + else if (acl != NULL) { + result = posix_acl_valid(acl); + } + } else + acl = NULL; + if (result == 0) { + result = reiser4_set_acl(inode, type, acl); + if (result == 0) { + file_plugin *fplug; + __u64 tograb; + + fplug = inode_file_plugin(inode); + tograb = fplug->estimate.update(inode); + result = reiser4_grab_space(tograb, + BA_CAN_COMMIT); + if (result == 0) + result = reiser4_update_sd(inode); + } + } + } else + result = RETERR(-EPERM); + return result; +} + + +static +size_t reiser4_xattr_list_acl_access(char *list, struct inode *inode, + const char *name, int name_len) +{ + const size_t size = sizeof(XATTR_NAME_ACL_ACCESS); + + if (list != NULL) + memcpy(list, XATTR_NAME_ACL_ACCESS, size); + return size; +} + +static +int reiser4_xattr_get_acl_access(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); + else + return RETERR(-EINVAL); +} + +static +int reiser4_xattr_set_acl_access(struct inode *inode, const char *name, + const void 
*value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); + else + return RETERR(-EINVAL); +} + +static +size_t reiser4_xattr_list_acl_default(char *list, struct inode *inode, + const char *name, int name_len) +{ + const size_t size = sizeof(XATTR_NAME_ACL_DEFAULT); + + if (list != NULL) + memcpy(list, XATTR_NAME_ACL_DEFAULT, size); + return size; +} + +static +int reiser4_xattr_get_acl_default(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); + else + return RETERR(-EINVAL); +} + +static +int reiser4_xattr_set_acl_default(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); + else + return RETERR(-EINVAL); +} + +static reiser4_xattr_plugin xattr_acl_handlers[] = { + [0] = { + .prefix = XATTR_NAME_ACL_ACCESS, + .list = reiser4_xattr_list_acl_access, + .get = reiser4_xattr_get_acl_access, + .set = reiser4_xattr_set_acl_access + }, + [1] = { + .prefix = XATTR_NAME_ACL_DEFAULT, + .list = reiser4_xattr_list_acl_default, + .get = reiser4_xattr_get_acl_default, + .set = reiser4_xattr_set_acl_default + }, + [2] = { + .prefix = NULL, + .list = NULL, + .get = NULL, + .set = NULL + } +}; + +static int eopnotsupp(void) +{ + return RETERR(-EOPNOTSUPP); +} + +static reiser4_xattr_plugin xattr_acl_trigger_handlers[] = { + [0] = { + .prefix = XATTR_NAME_ACL_ACCESS, + .list = (void *)eopnotsupp, + .get = (void *)eopnotsupp, + .set = reiser4_xattr_set_acl_access + }, + [1] = { + .prefix = XATTR_NAME_ACL_DEFAULT, + .list = (void *)eopnotsupp, + .get = (void *)eopnotsupp, + .set = reiser4_xattr_set_acl_default + }, + [2] = { + .prefix = NULL, + .list = NULL, + .get = NULL, + .set = NULL + } +}; + + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ + diff -Naurp linux-2.6.4/fs/reiser4/plugin/security/acl.h linux-2.6.4-ck1/fs/reiser4/plugin/security/acl.h --- linux-2.6.4/fs/reiser4/plugin/security/acl.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/security/acl.h 2004-03-11 22:45:15.363498702 +1100 @@ -0,0 +1,39 @@ +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Directory entry. */ + +#if !defined(__FS_REISER4_PLUGIN_SECURITY_ACL_H__) +#define __FS_REISER4_PLUGIN_SECURITY_ACL_H__ + +#include "../../forward.h" + +#include "../plugin.h" + +#if defined (XATTR) +#include "../xattr.h" +#endif + +typedef struct acl_perm_info { + struct posix_acl *access; + struct posix_acl *dfault; +} acl_perm_info_t; + +int mask_ok_acl(struct inode *inode, int mask); +void clear_acl(struct inode *inode); + +extern reiser4_plugin_ops acl_plugin_ops; + +int reiser4_set_acl(struct inode *inode, int type, struct posix_acl *acl); + +/* __FS_REISER4_PLUGIN_SECURITY_ACL_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/security/perm.c linux-2.6.4-ck1/fs/reiser4/plugin/security/perm.c --- linux-2.6.4/fs/reiser4/plugin/security/perm.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/security/perm.c 2004-03-11 22:45:15.364498547 +1100 @@ -0,0 +1,102 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* definition of item plugins. 
*/ + +#include "../plugin.h" +#include "../plugin_header.h" +#include "../../debug.h" + +#include "acl.h" + +#include +#include /* for struct dentry */ +#include + +static int +mask_ok_common(struct inode *inode, int mask) +{ + return vfs_permission(inode, mask); +} + +static int +setattr_ok_common(struct dentry *dentry, struct iattr *attr) +{ + int result; + struct inode *inode; + + assert("nikita-2272", dentry != NULL); + assert("nikita-2273", attr != NULL); + + inode = dentry->d_inode; + assert("nikita-2274", inode != NULL); + + result = inode_change_ok(inode, attr); + if (result == 0) { + unsigned int valid; + + valid = attr->ia_valid; + if ((valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (valid & ATTR_GID && attr->ia_gid != inode->i_gid)) + result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; + } + return result; +} + +perm_plugin perm_plugins[LAST_PERM_ID] = { + [RWX_PERM_ID] = { + .h = { + .type_id = REISER4_PERM_PLUGIN_TYPE, + .id = RWX_PERM_ID, + .pops = NULL, + .label = "rwx", + .desc = "standard UNIX permissions", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .read_ok = NULL, + .write_ok = NULL, + .lookup_ok = NULL, + .create_ok = NULL, + .link_ok = NULL, + .unlink_ok = NULL, + .delete_ok = NULL, + .mask_ok = mask_ok_common, + .setattr_ok = setattr_ok_common, + .getattr_ok = NULL, + .rename_ok = NULL, + .clear = NULL + }, +#if defined(XATTR) + [ACL_PERM_ID] = { + .h = { + .type_id = REISER4_PERM_PLUGIN_TYPE, + .id = ACL_PERM_ID, + .pops = &acl_plugin_ops, + .label = "acl", + .desc = "POSIX acls", + .linkage = TYPE_SAFE_LIST_LINK_ZERO + }, + .read_ok = NULL, + .write_ok = NULL, + .lookup_ok = NULL, + .create_ok = NULL, + .link_ok = NULL, + .unlink_ok = NULL, + .delete_ok = NULL, + .mask_ok = mask_ok_acl, + .setattr_ok = setattr_ok_common, + .getattr_ok = NULL, + .rename_ok = NULL, + .clear = clear_acl + } +#endif +}; + +/* Make Linus happy. 
+ Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/security/perm.h linux-2.6.4-ck1/fs/reiser4/plugin/security/perm.h --- linux-2.6.4/fs/reiser4/plugin/security/perm.h 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/security/perm.h 2004-03-11 22:45:15.364498547 +1100 @@ -0,0 +1,90 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +/* Perm (short for "permissions") plugins common stuff. */ + +#if !defined( __REISER4_PERM_H__ ) +#define __REISER4_PERM_H__ + +#include "../../forward.h" +#include "../plugin_header.h" + +#include +#include /* for struct file */ +#include /* for struct dentry */ + +/* interface for perm plugin. + + Perm plugin method can be implemented through: + + 1. consulting ->i_mode bits in stat data + + 2. obtaining acl from the tree and inspecting it + + 3. asking some kernel module or user-level program to authorize access. + + This allows for integration with things like capabilities, SELinux-style + secutiry contexts, etc. 
+ +*/ +typedef struct perm_plugin { + /* generic fields */ + plugin_header h; + + /* check permissions for read/write */ + int (*read_ok) (struct file * file, const char *buf, size_t size, loff_t * off); + int (*write_ok) (struct file * file, const char *buf, size_t size, loff_t * off); + + /* check permissions for lookup */ + int (*lookup_ok) (struct inode * parent, struct dentry * dentry); + + /* check permissions for create */ + int (*create_ok) (struct inode * parent, struct dentry * dentry, reiser4_object_create_data * data); + + /* check permissions for linking @where to @existing */ + int (*link_ok) (struct dentry * existing, struct inode * parent, struct dentry * where); + + /* check permissions for unlinking @victim from @parent */ + int (*unlink_ok) (struct inode * parent, struct dentry * victim); + + /* check permissions for deletion of @object whose last reference is + by @parent */ + int (*delete_ok) (struct inode * parent, struct dentry * victim); + int (*mask_ok) (struct inode * inode, int mask); + /* check whether attribute change is acceptable */ + int (*setattr_ok) (struct dentry * dentry, struct iattr * attr); + + /* check whether stat(2) is allowed */ + int (*getattr_ok) (struct vfsmount * mnt UNUSED_ARG, struct dentry * dentry, struct kstat * stat); + /* check whether rename(2) is allowed */ + int (*rename_ok) (struct inode * old_dir, struct dentry * old, + struct inode * new_dir, struct dentry * new); + /* called when inode is thrown out of memory (by + * reiser4_clear_inode()->clear_inode_common() */ + void (*clear) (struct inode * inode); +} perm_plugin; + +/* call ->check_ok method of perm plugin for inode */ +#define perm_chk(inode, check, ...) \ +({ \ + perm_plugin *perm; \ + \ + perm = inode_perm_plugin(inode); \ + (perm == NULL || perm->check ## _ok == NULL) ? 
\ + 0 : \ + perm->check ## _ok(__VA_ARGS__); \ +}) + +typedef enum { RWX_PERM_ID, ACL_PERM_ID, LAST_PERM_ID } reiser4_perm_id; + +/* __REISER4_PERM_H__ */ +#endif + +/* Make Linus happy. + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 120 + End: +*/ diff -Naurp linux-2.6.4/fs/reiser4/plugin/space/bitmap.c linux-2.6.4-ck1/fs/reiser4/plugin/space/bitmap.c --- linux-2.6.4/fs/reiser4/plugin/space/bitmap.c 1970-01-01 10:00:00.000000000 +1000 +++ linux-2.6.4-ck1/fs/reiser4/plugin/space/bitmap.c 2004-03-11 22:45:15.367498080 +1100 @@ -0,0 +1,1616 @@ +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */ + +#include "../../debug.h" +#include "../../dformat.h" +#include "../../txnmgr.h" +#include "../../jnode.h" +#include "../../block_alloc.h" +#include "../../tree.h" +#include "../../super.h" +#include "../../lib.h" + +#include "../plugin.h" +#include "../../diskmap.h" + +#include "space_allocator.h" +#include "bitmap.h" + +#include +#include /* for struct super_block */ +#include + +/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap + * blocks + + A useful optimization of reiser4 bitmap handling would be dynamic bitmap + blocks loading/unloading which is different from v3.x where all bitmap + blocks are loaded at mount time. + + To implement bitmap blocks unloading we need to count bitmap block usage + and detect currently unused blocks allowing them to be unloaded. It is not + a simple task since we allow several threads to modify one bitmap block + simultaneously. + + Briefly speaking, the following schema is proposed: we count in special + variable associated with each bitmap block. That is for counting of block + alloc/dealloc operations on that bitmap block. With a deferred block + deallocation feature of reiser4 all those operation will be represented in + atom dirty/deleted lists as jnodes for freshly allocated or deleted + nodes. 
+
+   So, we increment the usage counter for each new node allocated or deleted,
+   and decrement it at atom commit, once for each node on the atom's
+   dirty/deleted lists. Of course, deleting a freshly allocated node and
+   reusing a node from the atom's deleted list (if we do so) should decrement
+   the bitmap usage counter also.
+
+   This scheme seems to be workable, but that reference counting is not easy
+   to debug. I think we should agree with Hans and not implement it in
+   v4.0. Current code implements "on-demand" bitmap block loading only.
+
+   For simplicity, either all bitmap nodes (both commit and working bitmap
+   blocks) are loaded into memory at fs mount time, or each bitmap node is
+   loaded at the first access to it; the "dont_load_bitmap" mount option
+   controls whether bitmap nodes should be loaded at mount time. Dynamic
+   unloading of bitmap nodes is currently not supported. */
+
+/* number of bytes at the beginning of each bitmap block that are not part of
+ * the bit array: bnode_commit_crc() reads a 32 bit checksum from there, and
+ * bnode_{working,commit}_data() skip them */
+#define CHECKSUM_SIZE    4
+
+#define BYTES_PER_LONG (sizeof(long))
+
+/* log2(BITS_PER_LONG): shifting a bit offset right by it gives the index of
+ * the long word containing that bit */
+#if BITS_PER_LONG == 64
+#  define LONG_INT_SHIFT (6)
+#else
+#  define LONG_INT_SHIFT (5)
+#endif
+
+/* and-ing a bit offset with it gives the bit number within its long word */
+#define LONG_INT_MASK (BITS_PER_LONG - 1)
+
+typedef unsigned long ulong_t;
+
+
+/* bytes of a block of @blocksize bytes left for the bit array */
+#define bmap_size(blocksize)	    ((blocksize) - CHECKSUM_SIZE)
+/* the same in bits, 8 per byte */
+#define bmap_bit_count(blocksize)   (bmap_size(blocksize) << 3)
+
+/* Block allocation/deallocation are done through special bitmap objects which
+   are allocated in an array at fs mount. */
+struct bitmap_node {
+	struct semaphore sema;	/* long term lock object */
+
+	jnode *wjnode;		/* j-nodes for WORKING ... */
+	jnode *cjnode;		/* ...
and COMMIT bitmap blocks */ + + bmap_off_t first_zero_bit; /* for skip_busy option implementation */ + + atomic_t loaded; /* a flag which shows that bnode is loaded + * already */ +}; + +static inline char * +bnode_working_data(struct bitmap_node *bnode) +{ + char *data; + + data = jdata(bnode->wjnode); + assert("zam-429", data != NULL); + + return data + CHECKSUM_SIZE; +} + +static inline char * +bnode_commit_data(const struct bitmap_node *bnode) +{ + char *data; + + data = jdata(bnode->cjnode); + assert("zam-430", data != NULL); + + return data + CHECKSUM_SIZE; +} + +static inline __u32 +bnode_commit_crc(const struct bitmap_node *bnode) +{ + char *data; + + data = jdata(bnode->cjnode); + assert("vpf-261", data != NULL); + + return d32tocpu((d32 *) data); +} + +static inline void +bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc) +{ + char *data; + + data = jdata(bnode->cjnode); + assert("vpf-261", data != NULL); + + cputod32(crc, (d32 *) data); +} + +/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having + * written the code, does this added abstraction still have */ +/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the + * reiser4_space_allocator structure) */ +/* ZAM-FIXME-HANS: I don't understand your english in comment above. */ +/* FIXME-HANS(Zam): I don't understand the questions like "might be a union + * someday?". What they about? If there is a reason to have a union, it should + * be a union, if not, it should not be a union. "..might be someday" means no + * reason. 
*/ +struct bitmap_allocator_data { + /* an array for bitmap blocks direct access */ + struct bitmap_node *bitmap; +}; + +#define get_barray(super) \ +(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap) + +#define get_bnode(super, i) (get_barray(super) + i) + +/* allocate and initialize jnode with JNODE_BITMAP type */ +static jnode * +bnew(void) +{ + jnode *jal = jalloc(); + + if (jal) + jnode_init(jal, current_tree, JNODE_BITMAP); + + return jal; +} + +/* this file contains: + - bitmap based implementation of space allocation plugin + - all the helper functions like set bit, find_first_zero_bit, etc */ + +/* Audited by: green(2002.06.12) */ +static int +find_next_zero_bit_in_word(ulong_t word, int start_bit) +{ + unsigned int mask = 1 << start_bit; + int i = start_bit; + + while ((word & mask) != 0) { + mask <<= 1; + if (++i >= BITS_PER_LONG) + break; + } + + return i; +} + +#include + +#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr) +#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr) +#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr) + +#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \ +ext2_find_next_zero_bit(addr, maxoffset, offset) + +/* Search for a set bit in the bit array [@start_offset, @max_offset[, offsets + * are counted from @addr, return the offset of the first bit if it is found, + * @maxoffset otherwise. */ +static bmap_off_t reiser4_find_next_set_bit( + void *addr, bmap_off_t max_offset, bmap_off_t start_offset) +{ + ulong_t *base = addr; + /* start_offset is in bits, convert it to byte offset within bitmap. */ + int word_nr = start_offset >> LONG_INT_SHIFT; + /* bit number within the byte. */ + int bit_nr = start_offset & LONG_INT_MASK; + int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT; + + assert("zam-387", max_offset != 0); + + /* Unaligned @start_offset case. 
*/ + if (bit_nr != 0) { + bmap_nr_t nr; + + nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr); + + if (nr < BITS_PER_LONG) + return (word_nr << LONG_INT_SHIFT) + nr; + + ++word_nr; + } + + /* Fast scan trough aligned words. */ + while (word_nr <= max_word_nr) { + if (base[word_nr] != 0) { + return (word_nr << LONG_INT_SHIFT) + + find_next_zero_bit_in_word(~(base[word_nr]), 0); + } + + ++word_nr; + } + + return max_offset; +} + +/* search for the first set bit in single word. */ +static int find_last_set_bit_in_word (ulong_t word, int start_bit) +{ + unsigned bit_mask; + int nr = start_bit; + + assert ("zam-965", start_bit < BITS_PER_LONG); + assert ("zam-966", start_bit >= 0); + + bit_mask = (1 << nr); + + while (bit_mask != 0) { + if (bit_mask & word) + return nr; + bit_mask >>= 1; + nr --; + } + return BITS_PER_LONG; +} + +/* Search bitmap for a set bit in backward direction from the end to the + * beginning of given region + * + * @result: result offset of the last set bit + * @addr: base memory address, + * @low_off: low end of the search region, edge bit included into the region, + * @high_off: high end of the search region, edge bit included into the region, + * + * @return: 0 - set bit was found, -1 otherwise. 
+ */ +static int +reiser4_find_last_set_bit (bmap_off_t * result, void * addr, bmap_off_t low_off, bmap_off_t high_off) +{ + ulong_t * base = addr; + int last_word; + int first_word; + int last_bit; + int nr; + + assert ("zam-961", high_off >= 0); + assert ("zam-962", high_off >= low_off); + + last_word = high_off >> LONG_INT_SHIFT; + last_bit = high_off & LONG_INT_MASK; + first_word = low_off >> LONG_INT_SHIFT; + + if (last_bit < BITS_PER_LONG) { + nr = find_last_set_bit_in_word(base[last_word], last_bit); + if (nr < BITS_PER_LONG) { + *result = (last_word << LONG_INT_SHIFT) + nr; + return 0; + } + -- last_word; + } + while (last_word >= first_word) { + if (base[last_word] != 0x0) { + last_bit = find_last_set_bit_in_word(base[last_word], BITS_PER_LONG - 1); + assert ("zam-972", last_bit < BITS_PER_LONG); + *result = (last_word << LONG_INT_SHIFT) + last_bit; + return 0; + } + -- last_word; + } + + return -1; /* set bit not found */ +} + +/* Search bitmap for a clear bit in backward direction from the end to the + * beginning of given region */ +static int +reiser4_find_last_zero_bit (bmap_off_t * result, void * addr, bmap_off_t low_off, bmap_off_t high_off) +{ + ulong_t * base = addr; + int last_word; + int first_word; + int last_bit; + int nr; + + last_word = high_off >> LONG_INT_SHIFT; + last_bit = high_off & LONG_INT_MASK; + first_word = low_off >> LONG_INT_SHIFT; + + if (last_bit < BITS_PER_LONG) { + nr = find_last_set_bit_in_word(~base[last_word], last_bit); + if (nr < BITS_PER_LONG) { + *result = (last_word << LONG_INT_SHIFT) + nr; + return 0; + } + -- last_word; + } + while (last_word >= first_word) { + if (base[last_word] != (ulong_t)(-1)) { + *result = (last_word << LONG_INT_SHIFT) + + find_last_set_bit_in_word(~base[last_word], BITS_PER_LONG - 1); + return 0; + } + -- last_word; + } + + return -1; /* zero bit not found */ +} + +/* Audited by: green(2002.06.12) */ +static void +reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end) +{ + int 
first_byte;	/* index of the byte holding bit @start */
+	int last_byte;	/* index of the byte holding bit @end - 1 */
+	/* Both masks start with all bits set and are narrowed below to the
+	 * bits that must be preserved in the two edge bytes. */
+	unsigned char first_byte_mask = 0xFF;
+	unsigned char last_byte_mask = 0xFF;
+
+	assert("zam-410", start < end);
+
+	first_byte = start >> 3;
+	last_byte = (end - 1) >> 3;
+	/* Bytes strictly between the two edge bytes are zeroed wholesale
+	 * (xmemset() is presumably a memset() wrapper -- defined elsewhere). */
+	if (last_byte > first_byte + 1)
+		xmemset(addr + first_byte + 1, 0, (size_t) (last_byte - first_byte - 1));
+	/* Keep the bits below @start in the first byte and the bits above
+	 * @end - 1 in the last byte. */
+	first_byte_mask >>= 8 - (start & 0x7);
+	last_byte_mask <<= ((end - 1) & 0x7) + 1;
+	/* Range inside one byte: preserve the bits on both sides of it. */
+	if (first_byte == last_byte) {
+		addr[first_byte] &= (first_byte_mask | last_byte_mask);
+	} else {
+		addr[first_byte] &= first_byte_mask;
+		addr[last_byte] &= last_byte_mask;
+	}
+}
+
+/* Audited by: green(2002.06.12) */
+/* Set the bits with offsets in [@start, @end) of the bitmap at @addr;
+ * @start must be less than @end.  Bit N is bit (N & 7), counted from the
+ * least significant bit, of byte (N >> 3). */
+static void
+reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
+{
+	int first_byte;	/* index of the byte holding bit @start */
+	int last_byte;	/* index of the byte holding bit @end - 1 */
+	/* Both masks start with all bits set and are narrowed below to the
+	 * bits that must be set in the two edge bytes. */
+	unsigned char first_byte_mask = 0xFF;
+	unsigned char last_byte_mask = 0xFF;
+
+	assert("zam-386", start < end);
+
+	first_byte = start >> 3;
+	last_byte = (end - 1) >> 3;
+	/* Bytes strictly between the two edge bytes are filled with ones
+	 * (xmemset() is presumably a memset() wrapper -- defined elsewhere). */
+	if (last_byte > first_byte + 1)
+		xmemset(addr + first_byte + 1, 0xFF, (size_t) (last_byte - first_byte - 1));
+	/* Select the bits at or above @start in the first byte and the bits at
+	 * or below @end - 1 in the last byte. */
+	first_byte_mask <<= start & 0x7;
+	last_byte_mask >>= 7 - ((end - 1) & 0x7);
+	/* Range inside one byte: set only the bits selected by both masks. */
+	if (first_byte == last_byte) {
+		addr[first_byte] |= (first_byte_mask & last_byte_mask);
+	} else {
+		addr[first_byte] |= first_byte_mask;
+		addr[last_byte] |= last_byte_mask;
+	}
+}
+
+#define ADLER_BASE 65521	/* largest prime smaller than 65536 (per zlib) */
+#define ADLER_NMAX 5552	/* max bytes summed between modulo reductions (per zlib) */
+
+/* Calculates the adler32 checksum for the data pointed by `data` of the
+   length `len`. This function was originally taken from zlib, version 1.1.3,
+   July 9th, 1998.
+
+   Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
+
+   This software is provided 'as-is', without any express or implied
+   warranty. In no event will the authors be held liable for any damages
+   arising from the use of this software.
+ + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + + The above comment applies only to the adler32 function. +*/ + +static __u32 +adler32(char *data, __u32 len) +{ + unsigned char *t = data; + __u32 s1 = 1; + __u32 s2 = 0; + int k; + + while (len > 0) { + k = len < ADLER_NMAX ? len : ADLER_NMAX; + len -= k; + + while (k--) { + s1 += *t++; + s2 += s1; + } + + s1 %= ADLER_BASE; + s2 %= ADLER_BASE;