# This is a BitKeeper generated patch for the following project: # Project Name: Linux kernel tree # This patch format is intended for GNU patch command version 2.5 or higher. # This patch includes the following deltas: # ChangeSet 1.744 -> 1.745 # fs/jbd/journal.c 1.5 -> 1.6 # fs/jbd/recovery.c 1.3 -> 1.4 # fs/jbd/transaction.c 1.4 -> 1.5 # Documentation/DocBook/Makefile 1.13 -> 1.14 # include/linux/jbd.h 1.5 -> 1.6 # (new) -> 1.1 Documentation/DocBook/journal-api.tmpl # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 02/08/14 roger@zuse.computer-surgery.co.uk 1.745 # Add DocBook style documentation for the jbd # layers client api. # -------------------------------------------- # diff -Nru a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile --- a/Documentation/DocBook/Makefile Wed Aug 14 22:13:58 2002 +++ b/Documentation/DocBook/Makefile Wed Aug 14 22:13:58 2002 @@ -1,7 +1,8 @@ BOOKS := wanbook.sgml z8530book.sgml mcabook.sgml videobook.sgml \ kernel-api.sgml parportbook.sgml kernel-hacking.sgml \ kernel-locking.sgml via-audio.sgml mousedrivers.sgml sis900.sgml \ - deviceiobook.sgml procfs-guide.sgml tulip-user.sgml + deviceiobook.sgml procfs-guide.sgml tulip-user.sgml \ + journal-api.sgml PS := $(patsubst %.sgml, %.ps, $(BOOKS)) PDF := $(patsubst %.sgml, %.pdf, $(BOOKS)) @@ -137,6 +138,17 @@ parportbook.ps: $(EPS-parportbook) parportbook.sgml: parportbook.tmpl $(TOPDIR)/drivers/parport/init.c $(TOPDIR)/scripts/docgen $(TOPDIR)/drivers/parport/init.c <$< >$@ + + +JBDSOURCES := $(TOPDIR)/include/linux/jbd.h \ + $(TOPDIR)/fs/jbd/journal.c \ + $(TOPDIR)/fs/jbd/recovery.c \ + $(TOPDIR)/fs/jbd/transaction.c + +journal-api.sgml: journal-api.tmpl $(JBDSOURCES) + $(TOPDIR)/scripts/docgen $(JBDSOURCES) \ + <journal-api.tmpl >journal-api.sgml + DVI := $(patsubst %.sgml, %.dvi, $(BOOKS)) AUX := $(patsubst %.sgml, %.aux, $(BOOKS)) diff -Nru a/Documentation/DocBook/journal-api.tmpl b/Documentation/DocBook/journal-api.tmpl --- 
/dev/null Wed Dec 31 16:00:00 1969 +++ b/Documentation/DocBook/journal-api.tmpl Wed Aug 14 22:13:58 2002 @@ -0,0 +1,297 @@ + + + + The Linux Journalling API + + + Roger + Gammans + +
+ rgammans@computer-surgery.co.uk +
+
+
+
+ + + + Stephen + Tweedie + +
+ sct@redhat.com +
+
+
+
+ + + 2002 + Roger Gammans + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + + Overview + + Details + +The journalling layer is easy to use. You need to +first of all create a journal_t data structure. There are +two calls to do this, depending on how you decide to allocate the physical +media on which the journal resides. The journal_init_inode() call +is for journals stored in filesystem inodes, or the journal_init_dev() +call can be used for journals stored on a raw device (in a continuous range +of blocks). A journal_t is a typedef for a struct pointer, so when +you are finally finished make sure you call journal_destroy() on it +to free up any used kernel memory. + + + +Once you have got your journal_t object you need to 'mount' or load the journal +file, unless of course you haven't initialised it yet - in which case you +need to call journal_create(). + + + +Most of the time however your journal file will already have been created, but +before you load it you must call journal_wipe() to empty the journal file. +Hang on, you say, what if the filesystem wasn't cleanly umount()'d? Well, it is the +job of the client file system to detect this and skip the call to journal_wipe(). + + + +In either case the next call should be to journal_load() which prepares the +journal file for use. Note that journal_wipe(..,0) calls journal_skip_recovery() +for you if it detects any outstanding transactions in the journal and similarly +journal_load() will call journal_recover() if necessary. +I would advise reading fs/ext3/super.c for examples on this stage. +[RGG: Why is the journal_wipe() call necessary - doesn't this needlessly +complicate the API? Or isn't it a good idea for the journal layer to hide +dirty mounts from the client fs?] + + + +Now you can go ahead and start modifying the underlying +filesystem. Almost. + + + + + +You still need to actually journal your filesystem changes; this +is done by wrapping them into transactions. 
Additionally you +also need to wrap the modification of each of the buffers +with calls to the journal layer, so it knows what the modifications +you are actually making are. To do this use journal_start() which +returns a transaction handle. + + + +journal_start() +and its counterpart journal_stop(), which indicates the end of a transaction, +are nestable calls, so you can reenter a transaction if necessary, +but remember you must call journal_stop() the same number of times as +journal_start() before the transaction is completed (or more accurately +leaves the update phase). Ext3/VFS makes use of this feature to simplify +quota support. + + + +Inside each transaction you need to wrap the modifications to the +individual buffers (blocks). Before you start to modify a buffer you +need to call journal_get_{create,write,undo}_access() as appropriate; +this allows the journalling layer to copy the unmodified data if it +needs to. After all, the buffer may be part of a previously uncommitted +transaction. +At this point you are at last ready to modify a buffer, and once +you have done so you need to call journal_dirty_{meta,}data(). +Or if you've asked for access to a buffer you now know is no longer +required to be pushed back on the device you can call journal_forget() +in much the same way as you might have used bforget() in the past. + + + + + + +A journal_flush() may be called at any time to commit and checkpoint +all your transactions. + + + +Then at umount time, in your put_super() (2.4) or write_super() (2.5) +you can then call journal_destroy() to clean up your in-core journal object. + + + + +Unfortunately there are a couple of ways the journal layer can cause a deadlock. +The first thing to note is that each task can only have +a single outstanding transaction at any one time; remember nothing +commits until the outermost journal_stop(). This means +you must complete the transaction at the end of each file/inode/address +etc. 
operation you perform, so that the journalling system isn't re-entered +on another journal, since transactions can't be nested/batched +across differing journals, and another filesystem other than +yours (say ext3) may be modified in a later syscall. + + + +The second case to bear in mind is that journal_start() can +block if there isn't enough space in the journal for your transaction +(based on the passed nblocks param) - when it blocks it merely(!) needs to +wait for transactions to complete and be committed from other tasks, +so essentially we are waiting for journal_stop(). So to avoid +deadlocks you must treat journal_start/stop() as if they +were semaphores and include them in your semaphore ordering rules to prevent +deadlocks. Note that journal_extend() has similar blocking behaviour to +journal_start() so you can deadlock here just as easily as on journal_start(). + + + +Try to reserve the right number of blocks the first time. ;-). + + +Another wriggle to watch out for is your on-disk block allocation strategy. +Why? Because, if you undo a delete, you need to ensure you haven't reused any +of the freed blocks in a later transaction. One simple way of doing this +is to make sure any blocks you allocate only have checkpointed transactions +listed against them. Ext3 does this in ext3_test_allocatable(). + + + +Locking is also provided through journal_{un,}lock_updates(); +ext3 uses this when it wants a window with a clean and stable fs for a moment. +e.g. + + + journal_lock_updates() //stop new stuff happening.. + journal_flush() // checkpoint everything. + ..do stuff on stable fs + journal_unlock_updates() // carry on with filesystem use. + + +The opportunities for abuse and DOS attacks with this should be obvious, +if you allow unprivileged userspace to trigger codepaths containing these +calls. 
+ + + + +Summary + +Using the journal is a matter of wrapping the different context changes, +being each mount, each modification (transaction) and each changed buffer +to tell the journalling layer about them. + +Here is some pseudo code to give you an idea of how it works, as +an example. + + journal_t* my_jrnl = journal_init_{dev,inode}(...); + if (new_journal) journal_create(my_jrnl); + if (clean) journal_wipe(my_jrnl,...); + journal_load(my_jrnl); + + foreach(transaction) { /*transactions must be + completed before + a syscall returns to + userspace*/ + + handle_t * xact=journal_start(my_jrnl,nblocks); + foreach(bh) { + journal_get_{create,write,undo}_access(xact,bh); + if ( myfs_modify(bh) ) { /* returns true + if makes changes */ + journal_dirty_{meta,}data(xact,bh); + } else { + journal_forget(xact,bh); + } + } + journal_stop(xact); + } + journal_destroy(my_jrnl); + + + + + + Data Types + + The journalling layer uses typedefs to 'hide' the concrete definitions + of the structures used. As a client of the JBD layer you can + just rely on using the pointer as a magic cookie of some sort. + + Obviously the hiding is not enforced as this is 'C'. + + Structures +!Iinclude/linux/jbd.h + + + + + Functions + + The functions here are split into two groups: those that + affect a journal as a whole, and those which are used to + manage transactions. + + Journal Level +!Efs/jbd/journal.c +!Efs/jbd/recovery.c + + Transaction Level +!Efs/jbd/transaction.c + + + + See also + + + + Journaling the Linux ext2fs Filesystem,LinuxExpo 98, Stephen Tweedie + + + + + + + Ext3 Journalling FileSystem , OLS 2000, Dr. Stephen Tweedie + + + + + +
diff -Nru a/fs/jbd/journal.c b/fs/jbd/journal.c --- a/fs/jbd/journal.c Wed Aug 14 22:13:58 2002 +++ b/fs/jbd/journal.c Wed Aug 14 22:13:58 2002 @@ -730,14 +730,21 @@ * need to set up all of the mapping information to tell the journaling * system where the journal blocks are. * - * journal_init_dev creates a journal which maps a fixed contiguous - * range of blocks on an arbitrary block device. - * - * journal_init_inode creates a journal which maps an on-disk inode as - * the journal. The inode must exist already, must support bmap() and - * must have all data blocks preallocated. */ + /** + * journal_t * journal_init_dev() - creates and initialises a journal structure + * @dev: Block device on which to create the journal + * @fs_dev: Device which holds journalled filesystem for this journal. + * @start: Block nr Start of journal. + * @len: Length of the journal in blocks. + * @blocksize: blocksize of journalling device + * @returns: a newly created journal_t * + * + * journal_init_dev creates a journal which maps a fixed contiguous + * range of blocks on an arbitrary block device. + * + */ journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev, int start, int len, int blocksize) { @@ -760,7 +767,15 @@ return journal; } - + +/** + * journal_t * journal_init_inode () - creates a journal which maps to an inode. + * @inode: An inode to create the journal in + * + * journal_init_inode creates a journal which maps an on-disk inode as + * the journal. The inode must exist already, must support bmap() and + * must have all data blocks preallocated. + */ journal_t * journal_init_inode (struct inode *inode) { struct buffer_head *bh; @@ -850,12 +865,15 @@ return 0; } -/* +/** + * int journal_create() - Initialise the new journal file + * @journal: Journal to create. This structure must have been initialised + * * Given a journal_t structure which tells us which disk blocks we can * use, create a new journal superblock and initialise all of the - * journal fields from scratch. 
*/ - -int journal_create (journal_t *journal) + * journal fields from scratch. + **/ +int journal_create(journal_t *journal) { unsigned long blocknr; struct buffer_head *bh; @@ -916,11 +934,14 @@ return journal_reset(journal); } -/* +/** + * void journal_update_superblock() - Update journal sb on disk. + * @journal: The journal to update. + * @wait: Set to '0' if you don't want to wait for IO completion. + * * Update a journal's dynamic superblock fields and write it to disk, * optionally waiting for the IO to complete. -*/ - + */ void journal_update_superblock(journal_t *journal, int wait) { journal_superblock_t *sb = journal->j_superblock; @@ -1036,12 +1057,14 @@ } -/* +/** + * int journal_load() - Read journal from disk. + * @journal: Journal to act on. + * * Given a journal_t structure which tells us which disk blocks contain * a journal, read the journal from disk to initialise the in-memory * structures. */ - int journal_load(journal_t *journal) { int err; @@ -1086,11 +1109,13 @@ return -EIO; } -/* +/** + * void journal_destroy() - Release a journal_t structure. + * @journal: Journal to act on. +* * Release a journal_t structure once it is no longer in use by the * journaled object. */ - void journal_destroy (journal_t *journal) { /* Wait for the commit thread to wake up and die. */ @@ -1128,8 +1153,12 @@ } -/* Published API: Check whether the journal uses all of a given set of - * features. Return true (non-zero) if it does. */ +/** + *int journal_check_used_features () - Check if features specified are used. + * + * Check whether the journal uses all of a given set of + * features. Return true (non-zero) if it does. 
+ **/ int journal_check_used_features (journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) @@ -1151,7 +1180,10 @@ return 0; } -/* Published API: Check whether the journaling code supports the use of +/** + * int journal_check_available_features() - Check feature set in journalling layer + * + * Check whether the journaling code supports the use of * all of a given set of features on this journal. Return true * (non-zero) if it can. */ @@ -1180,8 +1212,13 @@ return 0; } -/* Published API: Mark a given journal feature as present on the - * superblock. Returns true if the requested features could be set. */ +/** + * int journal_set_features () - Mark a given journal feature in the superblock + * + * Mark a given journal feature as present on the + * superblock. Returns true if the requested features could be set. + * + */ int journal_set_features (journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) @@ -1207,12 +1244,12 @@ } -/* - * Published API: +/** + * int journal_update_format () - Update on-disk journal structure. + * * Given an initialised but unloaded journal struct, poke about in the * on-disk structure to update it to the most recent supported version. */ - int journal_update_format (journal_t *journal) { journal_superblock_t *sb; @@ -1262,7 +1299,10 @@ } -/* +/** + * int journal_flush () - Flush journal + * @journal: Journal to act on. + * * Flush all data for a given journal to disk and empty the journal. * Filesystems can use this when remounting readonly to ensure that * recovery does not need to happen on remount. @@ -1316,12 +1356,16 @@ return err; } -/* +/** + * int journal_wipe() - Wipe journal contents + * @journal: Journal to act on. + * @write: flag (see below) + * * Wipe out all of the contents of a journal, safely. This will produce * a warning if the journal contains any valid recovery information. * Must be called between journal_init_*() and journal_load(). 
* - * If (write) is non-zero, then we wipe out the journal on disk; otherwise + * If 'write' is non-zero, then we wipe out the journal on disk; otherwise * we merely suppress recovery. */ @@ -1370,43 +1414,11 @@ } /* - * journal_abort: perform a complete, immediate shutdown of the ENTIRE - * journal (not of a single transaction). This operation cannot be - * undone without closing and reopening the journal. - * - * The journal_abort function is intended to support higher level error - * recovery mechanisms such as the ext2/ext3 remount-readonly error - * mode. - * - * Journal abort has very specific semantics. Any existing dirty, - * unjournaled buffers in the main filesystem will still be written to - * disk by bdflush, but the journaling mechanism will be suspended - * immediately and no further transaction commits will be honoured. - * - * Any dirty, journaled buffers will be written back to disk without - * hitting the journal. Atomicity cannot be guaranteed on an aborted - * filesystem, but we _do_ attempt to leave as much data as possible - * behind for fsck to use for cleanup. - * - * Any attempt to get a new transaction handle on a journal which is in - * ABORT state will just result in an -EROFS error return. A - * journal_stop on an existing handle will return -EIO if we have - * entered abort state during the update. - * - * Recursive transactions are not disturbed by journal abort until the - * final journal_stop, which will receive the -EIO error. - * - * Finally, the journal_abort call allows the caller to supply an errno - * which will be recored (if possible) in the journal superblock. This - * allows a client to record failure conditions in the middle of a - * transaction without having to complete the transaction to record the - * failure to disk. ext3_error, for example, now uses this - * functionality. + * Journal abort has very specific semantics, which we describe + * for journal abort. 
* - * Errors which originate from within the journaling layer will NOT - * supply an errno; a null errno implies that absolutely no further - * writes are done to the journal (unless there are any already in - * progress). + * Two internal function, which provide abort to te jbd layer + * itself are here. */ /* Quick version for internal journal use (doesn't lock the journal). @@ -1444,7 +1456,52 @@ journal_update_superblock(journal, 1); } -/* Full version for external use */ +/** + * void journal_abort () - Shutdown the journal immediately. + * @journal: the journal to shutdown. + * @errno: an error number to record in the journal indicating + * the reason for the shutdown. + * + * Perform a complete, immediate shutdown of the ENTIRE + * journal (not of a single transaction). This operation cannot be + * undone without closing and reopening the journal. + * + * The journal_abort function is intended to support higher level error + * recovery mechanisms such as the ext2/ext3 remount-readonly error + * mode. + * + * Journal abort has very specific semantics. Any existing dirty, + * unjournaled buffers in the main filesystem will still be written to + * disk by bdflush, but the journaling mechanism will be suspended + * immediately and no further transaction commits will be honoured. + * + * Any dirty, journaled buffers will be written back to disk without + * hitting the journal. Atomicity cannot be guaranteed on an aborted + * filesystem, but we _do_ attempt to leave as much data as possible + * behind for fsck to use for cleanup. + * + * Any attempt to get a new transaction handle on a journal which is in + * ABORT state will just result in an -EROFS error return. A + * journal_stop on an existing handle will return -EIO if we have + * entered abort state during the update. + * + * Recursive transactions are not disturbed by journal abort until the + * final journal_stop, which will receive the -EIO error. 
+ * + * Finally, the journal_abort call allows the caller to supply an errno + * which will be recorded (if possible) in the journal superblock. This + * allows a client to record failure conditions in the middle of a + * transaction without having to complete the transaction to record the + * failure to disk. ext3_error, for example, now uses this + * functionality. + * + * Errors which originate from within the journaling layer will NOT + * supply an errno; a null errno implies that absolutely no further + * writes are done to the journal (unless there are any already in + * progress). + * + */ + void journal_abort (journal_t *journal, int errno) { lock_journal(journal); @@ -1452,6 +1509,17 @@ unlock_journal(journal); } +/** + * int journal_errno () - returns the journal's error state. + * @journal: journal to examine. + * + * This is the errno number set with journal_abort(), the last + * time the journal was mounted - if the journal was stopped + * without calling abort this will be 0. + * + * If the journal has been aborted on this mount, -EROFS will + * be returned. + */ int journal_errno (journal_t *journal) { int err; @@ -1465,6 +1533,14 @@ return err; } + + +/** + * int journal_clear_err () - clears the journal's error state + * + * An error must be cleared or Acked to take a FS out of readonly + * mode. + */ int journal_clear_err (journal_t *journal) { int err = 0; @@ -1478,6 +1554,13 @@ return err; } + +/** + * void journal_ack_err() - Ack journal err. + * + * An error must be cleared or Acked to take a FS out of readonly + * mode. 
+ */ void journal_ack_err (journal_t *journal) { lock_journal(journal); diff -Nru a/fs/jbd/recovery.c b/fs/jbd/recovery.c --- a/fs/jbd/recovery.c Wed Aug 14 22:13:58 2002 +++ b/fs/jbd/recovery.c Wed Aug 14 22:13:58 2002 @@ -207,20 +207,22 @@ var -= ((journal)->j_last - (journal)->j_first); \ } while (0) -/* - * journal_recover - * +/** + * int journal_recover(journal_t *journal) - recovers an on-disk journal + * @journal: the journal to recover + * * The primary function for recovering the log contents when mounting a * journaled device. - * + */ +int journal_recover(journal_t *journal) +{ +/* * Recovery is done in three passes. In the first pass, we look for the * end of the log. In the second, we assemble the list of revoke * blocks. In the third and final pass, we replay any un-revoked blocks * in the log. */ -int journal_recover(journal_t *journal) -{ int err; journal_superblock_t * sb; @@ -264,20 +266,23 @@ return err; } -/* - * journal_skip_recovery - * +/** + * int journal_skip_recovery() - Start journal and wipe existing records + * @journal: journal to start up + * * Locate any valid recovery information from the journal and set up the * journal structures in memory to ignore it (presumably because the * caller has evidence that it is out of date). - * + * This function doesn't appear to be exported. + */ +int journal_skip_recovery(journal_t *journal) +{ +/* * We perform one pass over the journal to allow us to tell the user how * much recovery information is being erased, and to let us initialise * the journal transaction sequence numbers to the next unused ID. */ -int journal_skip_recovery(journal_t *journal) -{ int err; journal_superblock_t * sb; diff -Nru a/fs/jbd/transaction.c b/fs/jbd/transaction.c --- a/fs/jbd/transaction.c Wed Aug 14 22:13:58 2002 +++ b/fs/jbd/transaction.c Wed Aug 14 22:13:58 2002 @@ -201,19 +201,20 @@ return 0; } -/* - * Obtain a new handle. +/** + * handle_t *journal_start() - Obtain a new handle. 
+ * @journal: Journal to start transaction on. + * @nblocks: number of block buffer we might modify * * We make sure that the transaction can guarantee at least nblocks of * modified buffers in the log. We block until the log can guarantee * that much space. * - * This function is visible to journal users (like ext2fs), so is not + * This function is visible to journal users (like ext3fs), so is not * called with the journal already locked. * * Return a pointer to a newly allocated handle, or NULL on failure */ - handle_t *journal_start(journal_t *journal, int nblocks) { handle_t *handle = journal_current_handle(); @@ -306,7 +307,11 @@ return ret; } -/* +/** + * handle_t *journal_try_start() - Don't block, but try and get a handle + * @journal: Journal to start transaction on. + * @nblocks: number of block buffer we might modify + * * Try to start a handle, but non-blockingly. If we weren't able * to, return an ERR_PTR value. */ @@ -353,16 +358,18 @@ return handle; } -/* - * journal_extend: extend buffer credits. - * +/** + * int journal_extend() - extend buffer credits. + * @handle: handle to 'extend' + * @nblocks: nr blocks to try to extend by. + * * Some transactions, such as large extends and truncates, can be done * atomically all at once or in several stages. The operation requests * a credit for a number of buffer modications in advance, but can * extend its credit if it needs more. * * journal_extend tries to give the running handle more buffer credits. - * It does not guarantee that allocation: this is a best-effort only. + * It does not guarantee that allocation - this is a best-effort only. * The calling process MUST be able to deal cleanly with a failure to * extend here. * @@ -371,7 +378,6 @@ * return code < 0 implies an error * return code > 0 implies normal transaction-full status. 
*/ - int journal_extend (handle_t *handle, int nblocks) { transaction_t *transaction = handle->h_transaction; @@ -420,8 +426,12 @@ } -/* - * journal_restart: restart a handle for a multi-transaction filesystem +/** + * int journal_restart() - restart a handle. + * @handle: handle to restart + * @nblocks: nr credits requested + * + * Restart a handle for a multi-transaction filesystem * operation. * * If the journal_extend() call above fails to grant new buffer credits @@ -463,8 +473,9 @@ } -/* - * Barrier operation: establish a transaction barrier. +/** + * void journal_lock_updates () - establish a transaction barrier. + * @journal: Journal to establish a barrier on. * * This locks out any further updates from being started, and blocks * until all existing updates have completed, returning only once the @@ -472,7 +483,6 @@ * * The journal lock should not be held on entry. */ - void journal_lock_updates (journal_t *journal) { lock_journal(journal); @@ -500,12 +510,14 @@ down(&journal->j_barrier); } -/* +/** + * void journal_unlock_updates (journal_t* journal) - release barrier + * @journal: Journal to release the barrier on. + * * Release a transaction barrier obtained with journal_lock_updates(). * * Should be called without the journal lock held. */ - void journal_unlock_updates (journal_t *journal) { lock_journal(journal); @@ -519,23 +531,14 @@ } /* - * journal_get_write_access: notify intent to modify a buffer for metadata - * (not data) update. - * - * If the buffer is already part of the current transaction, then there - * is nothing we need to do. If it is already part of a prior + * If the buffer is already part of the current transaction, then there + * is nothing we need to do. If it is already part of a prior * transaction which we are still committing to disk, then we need to * make sure that we do not overwrite the old copy: we do copy-out to - * preserve the copy going to disk. We also account the buffer against + * preserve the copy going to disk. 
We also account the buffer against * the handle's metadata buffer credits (unless the buffer is already * part of the transaction, that is). - * - * Returns an error code or 0 on success. - * - * In full data journalling mode the buffer may be of type BJ_AsyncData, - * because we're write()ing a buffer which is also part of a shared mapping. */ - static int do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) { @@ -749,6 +752,17 @@ return error; } +/** + * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. + * @handle: transaction to add buffer modifications to + * @bh: bh to be used for metadata writes + * + * Returns an error code or 0 on success. + * + * In full data journalling mode the buffer may be of type BJ_AsyncData, + * because we're write()ing a buffer which is also part of a shared mapping. + */ + int journal_get_write_access (handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; @@ -779,6 +793,13 @@ * There is no lock ranking violation: it was a newly created, * unlocked buffer beforehand. */ +/** + * int journal_get_create_access () - notify intent to use newly created bh + * @handle: transaction to add new buffer to + * @bh: new buffer. + * + * Call this if you create a new bh. + */ int journal_get_create_access (handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; @@ -840,13 +861,14 @@ -/* - * journal_get_undo_access: Notify intent to modify metadata with non- - * rewindable consequences - * +/** + * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences + * @handle: transaction + * @bh: buffer to undo + * * Sometimes there is a need to distinguish between metadata which has * been committed to disk and that which has not. 
The ext3fs code uses - * this for freeing and allocating space: we have to make sure that we + * this for freeing and allocating space, we have to make sure that we * do not reuse freed space until the deallocation has been committed, * since if we overwrote that space we would make the delete * un-rewindable in case of a crash. @@ -858,13 +880,12 @@ * as we know that the buffer has definitely been committed to disk. * * We never need to know which transaction the committed data is part - * of: buffers touched here are guaranteed to be dirtied later and so + * of, buffers touched here are guaranteed to be dirtied later and so * will be committed to a new transaction in due course, at which point * we can discard the old committed data pointer. * * Returns error number or 0 on success. */ - int journal_get_undo_access (handle_t *handle, struct buffer_head *bh) { journal_t *journal = handle->h_transaction->t_journal; @@ -906,10 +927,12 @@ return err; } -/* - * journal_dirty_data: mark a buffer as containing dirty data which - * needs to be flushed before we can commit the current transaction. - * +/** + * int journal_dirty_data() - mark a buffer as containing dirty data which needs to be flushed before we can commit the current transaction. + * @handle: transaction + * @bh: bufferhead to mark + * @async: flag + * * The buffer is placed on the transaction's data list and is marked as * belonging to the transaction. * @@ -918,7 +941,10 @@ * t_async_datalist. * * Returns error number or 0 on success. - * + */ +int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async) +{ +/* * journal_dirty_data() can be called via page_launder->ext3_writepage * by kswapd. So it cannot block. Happily, there's nothing here * which needs lock_journal if `async' is set. @@ -927,9 +953,6 @@ * between BJ_AsyncData and BJ_SyncData according to who tried to * change its state last. 
*/ - -int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async) -{ journal_t *journal = handle->h_transaction->t_journal; int need_brelse = 0; int wanted_jlist = async ? BJ_AsyncData : BJ_SyncData; @@ -1072,24 +1095,28 @@ return 0; } -/* - * journal_dirty_metadata: mark a buffer as containing dirty metadata - * which needs to be journaled as part of the current transaction. +/** + * int journal_dirty_metadata() - mark a buffer as containing dirty metadata + * @handle: transaction to add buffer to. + * @bh: buffer to mark + * + * mark dirty metadata which needs to be journaled as part of the current transaction. * * The buffer is placed on the transaction's metadata list and is marked * as belonging to the transaction. * + * Returns error number or 0 on success. + */ +int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh) +{ +/* * Special care needs to be taken if the buffer already belongs to the * current committing transaction (in which case we should have frozen * data present for that commit). In that case, we don't relink the * buffer: that only gets done when the old transaction finally * completes its commit. * - * Returns error number or 0 on success. */ - -int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh) -{ transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; struct journal_head *jh = bh2jh(bh); @@ -1175,9 +1202,12 @@ } #endif -/* - * journal_forget: bforget() for potentially-journaled buffers. We can - * only do the bforget if there are no commits pending against the +/** + * void journal_forget() - bforget() for potentially-journaled buffers. + * @handle: transaction handle + * @bh: bh to 'forget' + * + * We can only do the bforget if there are no commits pending against the * buffer. If the buffer is dirty in the current running transaction we * can safely unlink it. 
* @@ -1189,7 +1219,6 @@ * Allow this call even if the handle has aborted --- it may be part of * the caller's cleanup after an abort. */ - void journal_forget (handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; @@ -1328,7 +1357,10 @@ } #endif -/* +/** + * int journal_stop() - complete a transaction + * @handle: transaction to complete. + * * All done for a particular handle. * * There is not much action needed here. We just return any remaining @@ -1341,7 +1373,6 @@ * return -EIO if a journal_abort has been executed since the * transaction began. */ - int journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; @@ -1425,8 +1456,10 @@ return err; } -/* - * For synchronous operations: force any uncommitted trasnactions +/**int journal_force_commit() - force any uncommitted transactions + * @journal: journal to force + * + * For synchronous operations: force any uncommitted transactions * to disk. May seem kludgy, but it reuses all the handle batching * code in a very simple manner. */ @@ -1630,6 +1663,26 @@ return 0; } + +/** + * int journal_try_to_free_buffers() - try to free page buffers. + * @journal: journal for operation + * @page: to try and free + * @gfp_mask: 'IO' mode for try_to_free_buffers() + * + * + * For all the buffers on this page, + * if they are fully written out ordered data, move them onto BUF_CLEAN + * so try_to_free_buffers() can reap them. + * + * This function returns non-zero if we wish try_to_free_buffers() + * to be called. We do this if the page is releasable by try_to_free_buffers(). + * We also do it if the page has locked or dirty buffers and the caller wants + * us to perform sync or async writeout. + */ +int journal_try_to_free_buffers(journal_t *journal, + struct page *page, int gfp_mask) +{ /* * journal_try_to_free_buffers(). 
For all the buffers on this page, * if they are fully written out ordered data, move them onto BUF_CLEAN @@ -1654,14 +1707,7 @@ * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? * - * This function returns non-zero if we wish try_to_free_buffers() - * to be called. We do this is the page is releasable by try_to_free_buffers(). - * We also do it if the page has locked or dirty buffers and the caller wants - * us to perform sync or async writeout. */ -int journal_try_to_free_buffers(journal_t *journal, - struct page *page, int gfp_mask) -{ struct buffer_head *bh; struct buffer_head *tmp; int locked_or_dirty = 0; @@ -1872,8 +1918,15 @@ return may_free; } -/* - * Return non-zero if the page's buffers were successfully reaped +/** + * int journal_flushpage() + * @journal: journal to use for flush... + * @page: page to flush + * @offset: length of page to flush. + * + * Reap page buffers containing data after offset in page. + * + * Return non-zero if the page's buffers were successfully reaped. */ int journal_flushpage(journal_t *journal, struct page *page, diff -Nru a/include/linux/jbd.h b/include/linux/jbd.h --- a/include/linux/jbd.h Wed Aug 14 22:13:58 2002 +++ b/include/linux/jbd.h Wed Aug 14 22:13:58 2002 @@ -62,7 +62,38 @@ #define JFS_MIN_JOURNAL_BLOCKS 1024 #ifdef __KERNEL__ + +/** + * typedef handle_t - The handle_t type represents a single atomic update being performed by some process. + * + * All filesystem modifications made by the process go + * through this handle. Recursive operations (such as quota operations) + * are gathered into a single update. + * + * The buffer credits field is used to account for journaled buffers + * being modified by the running process. To ensure that there is + * enough log space for all outstanding operations, we need to limit the + * number of outstanding buffers possible at any time. 
When the + * operation completes, any buffer credits not used are credited back to + * the transaction, so that at all times we know how many buffers the + * outstanding updates on a transaction might possibly touch. + * + * This is an opaque datatype. + **/ typedef struct handle_s handle_t; /* Atomic operation type */ + + +/** + * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem. + * + * journal_t is linked to from the fs superblock structure. + * + * We use the journal_t to keep track of all outstanding transaction + * activity on the filesystem, and to manage the state of the log + * writing process. + * + * This is an opaque datatype. + **/ typedef struct journal_s journal_t; /* Journal control structure */ #endif @@ -251,18 +282,20 @@ struct jbd_revoke_table_s; -/* The handle_t type represents a single atomic update being performed - * by some process. All filesystem modifications made by the process go - * through this handle. Recursive operations (such as quota operations) - * are gathered into a single update. - * - * The buffer credits field is used to account for journaled buffers - * being modified by the running process. To ensure that there is - * enough log space for all outstanding operations, we need to limit the - * number of outstanding buffers possible at any time. When the - * operation completes, any buffer credits not used are credited back to - * the transaction, so that at all times we know how many buffers the - * outstanding updates on a transaction might possibly touch. */ +/** + * struct handle_s - The handle_s type is the concrete type associated with handle_t. + * @h_transaction: Which compound transaction is this update a part of? + * @h_buffer_credits: Number of remaining buffers we are allowed to dirty. 
+ * @h_ref: Reference count on this handle + * @h_err: Field for caller's use to track errors through large fs operations + * @h_sync: flag for sync-on-close + * @h_jdata: flag to force data journaling + * @h_aborted: flag indicating fatal error on handle + **/ + +/* Docbook can't yet cope with the bit fields, but will leave the documentation + * in so it can be fixed later. + */ struct handle_s { @@ -275,8 +308,8 @@ /* Reference count on this handle */ int h_ref; - /* Field for caller's use to track errors through large fs - operations */ + /* Field for caller's use to track errors through large fs */ + /* operations */ int h_err; /* Flags */ @@ -400,21 +433,58 @@ int t_handle_count; }; - -/* The journal_t maintains all of the journaling state information for a - * single filesystem. It is linked to from the fs superblock structure. - * - * We use the journal_t to keep track of all outstanding transaction - * activity on the filesystem, and to manage the state of the log - * writing process. */ +/** + * struct journal_s - The journal_s type is the concrete type associated with journal_t. + * @j_flags: General journaling state flags + * @j_errno: Is there an outstanding uncleared error on the journal (from a prior abort)? + * @j_sb_buffer: First part of superblock buffer + * @j_superblock: Second part of superblock buffer + * @j_format_version: Version of the superblock format + * @j_barrier_count: Number of processes waiting to create a barrier lock + * @j_barrier: The barrier lock itself + * @j_running_transaction: The current running transaction. 
+ * @j_committing_transaction: the transaction we are pushing to disk + * @j_checkpoint_transactions: a linked circular list of all transactions waiting for checkpointing + * @j_wait_transaction_locked: Wait queue for waiting for a locked transaction to start committing, or for a barrier lock to be released + * @j_wait_logspace: Wait queue for waiting for checkpointing to complete + * @j_wait_done_commit: Wait queue for waiting for commit to complete + * @j_wait_checkpoint: Wait queue to trigger checkpointing + * @j_wait_commit: Wait queue to trigger commit + * @j_wait_updates: Wait queue to wait for updates to complete + * @j_checkpoint_sem: Semaphore for locking against concurrent checkpoints + * @j_sem: The main journal lock, used by lock_journal() + * @j_head: Journal head - identifies the first unused block in the journal + * @j_tail: Journal tail - identifies the oldest still-used block in the journal. + * @j_free: Journal free - how many free blocks are there in the journal? + * @j_first: The block number of the first usable block + * @j_last: The block number one beyond the last usable block + * @j_dev: Device where we store the journal + * @j_blocksize: blocksize for the location where we store the journal. + * @j_blk_offset: starting block offset into the device where we store the journal + * @j_fs_dev: Device which holds the client fs. For internal journal this will be equal to j_dev + * @j_maxlen: Total maximum capacity of the journal region on disk. + * @j_inode: Optional inode where we store the journal. If present, all journal block numbers are mapped into this inode via bmap(). + * @j_tail_sequence: Sequence number of the oldest transaction in the log + * @j_transaction_sequence: Sequence number of the next transaction to grant + * @j_commit_sequence: Sequence number of the most recently committed transaction + * @j_commit_request: Sequence number of the most recent transaction wanting commit + * @j_uuid: Uuid of client object. 
+ * @j_task: Pointer to the current commit thread for this journal + * @j_max_transaction_buffers: Maximum number of metadata buffers to allow in a single compound commit transaction + * @j_commit_interval: What is the maximum transaction lifetime before we begin a commit? + * @j_commit_timer: The timer used to wakeup the commit thread + * @j_commit_timer_active: Timer flag + * @j_all_journals: Link all journals together - system-wide + * @j_revoke: The revoke table - maintains the list of revoked blocks in the current transaction. + **/ struct journal_s { /* General journaling state flags */ unsigned long j_flags; - /* Is there an outstanding uncleared error on the journal (from - * a prior abort)? */ + /* Is there an outstanding uncleared error on the journal (from */ + /* a prior abort)? */ int j_errno; /* The superblock buffer */ @@ -436,13 +506,13 @@ /* ... the transaction we are pushing to disk ... */ transaction_t * j_committing_transaction; - /* ... and a linked circular list of all transactions waiting - * for checkpointing. */ + /* ... and a linked circular list of all transactions waiting */ + /* for checkpointing. */ /* Protected by journal_datalist_lock */ transaction_t * j_checkpoint_transactions; - /* Wait queue for waiting for a locked transaction to start - committing, or for a barrier lock to be released */ + /* Wait queue for waiting for a locked transaction to start */ + /* committing, or for a barrier lock to be released */ wait_queue_head_t j_wait_transaction_locked; /* Wait queue for waiting for checkpointing to complete */ @@ -469,33 +539,33 @@ /* Journal head: identifies the first unused block in the journal. */ unsigned long j_head; - /* Journal tail: identifies the oldest still-used block in the - * journal. */ + /* Journal tail: identifies the oldest still-used block in the */ + /* journal. */ unsigned long j_tail; /* Journal free: how many free blocks are there in the journal? 
*/ unsigned long j_free; - /* Journal start and end: the block numbers of the first usable - * block and one beyond the last usable block in the journal. */ + /* Journal start and end: the block numbers of the first usable */ + /* block and one beyond the last usable block in the journal. */ unsigned long j_first, j_last; - /* Device, blocksize and starting block offset for the location - * where we store the journal. */ + /* Device, blocksize and starting block offset for the location */ + /* where we store the journal. */ kdev_t j_dev; int j_blocksize; unsigned int j_blk_offset; - /* Device which holds the client fs. For internal journal this - * will be equal to j_dev. */ + /* Device which holds the client fs. For internal journal this */ + /* will be equal to j_dev. */ kdev_t j_fs_dev; /* Total maximum capacity of the journal region on disk. */ unsigned int j_maxlen; - /* Optional inode where we store the journal. If present, all - * journal block numbers are mapped into this inode via - * bmap(). */ + /* Optional inode where we store the journal. If present, all */ + /* journal block numbers are mapped into this inode via */ + /* bmap(). */ struct inode * j_inode; /* Sequence number of the oldest transaction in the log */ @@ -507,23 +577,23 @@ /* Sequence number of the most recent transaction wanting commit */ tid_t j_commit_request; - /* Journal uuid: identifies the object (filesystem, LVM volume - * etc) backed by this journal. This will eventually be - * replaced by an array of uuids, allowing us to index multiple - * devices within a single journal and to perform atomic updates - * across them. */ + /* Journal uuid: identifies the object (filesystem, LVM volume */ + /* etc) backed by this journal. This will eventually be */ + /* replaced by an array of uuids, allowing us to index multiple */ + /* devices within a single journal and to perform atomic updates */ + /* across them. 
*/ __u8 j_uuid[16]; /* Pointer to the current commit thread for this journal */ struct task_struct * j_task; - /* Maximum number of metadata buffers to allow in a single - * compound commit transaction */ + /* Maximum number of metadata buffers to allow in a single */ + /* compound commit transaction */ int j_max_transaction_buffers; - /* What is the maximum transaction lifetime before we begin a - * commit? */ + /* What is the maximum transaction lifetime before we begin a */ + /* commit? */ unsigned long j_commit_interval; /* The timer used to wakeup the commit thread: */ @@ -533,8 +603,8 @@ /* Link all journals together - system-wide */ struct list_head j_all_journals; - /* The revoke table: maintains the list of revoked blocks in the - current transaction. */ + /* The revoke table: maintains the list of revoked blocks in the */ + /* current transaction. */ struct jbd_revoke_table_s *j_revoke; };