2 * Copyright (c) International Business Machines Corp., 2000-2003
3 * Portions Copyright (c) Christoph Hellwig, 2001-2002
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 * jfs_logmgr.c: log manager
23 * for related information, see transaction manager (jfs_txnmgr.c), and
24 * recovery manager (jfs_logredo.c).
26 * note: for detail, RTFS.
29 * special purpose buffer manager supporting log i/o requirements.
30 * per log serial pageout of logpage
31 * queuing i/o requests and redrive i/o at iodone
32 * maintain current logpage buffer
33 * no caching since append only
34 * appropriate jfs buffer cache buffers as needed
37 * transactions which wrote COMMIT records in the same in-memory
38 * log page during the pageout of previous/current log page(s) are
39 * committed together by the pageout of the page.
42 * transactions are committed asynchronously when the log page
43 * containing its COMMIT is paged out when it becomes full;
46 * . a per log lock serialize log write.
47 * . a per log lock serialize group commit.
48 * . a per log lock serialize log open/close;
51 * careful-write (ping-pong) of last logpage to recover from crash
53 * detection of split (out-of-order) write of physical sectors
54 * of last logpage via timestamp at end of each sector
55 * with its mirror data array at trailer).
58 * lsn - 64-bit monotonically increasing integer vs
59 * 32-bit lspn and page eor.
63 #include <linux/locks.h>
64 #include <linux/blkdev.h>
65 #include <linux/interrupt.h>
66 #include <linux/smp_lock.h>
67 #include <linux/completion.h>
68 #include "jfs_incore.h"
69 #include "jfs_filsys.h"
70 #include "jfs_metapage.h"
71 #include "jfs_txnmgr.h"
72 #include "jfs_debug.h"
76 * lbuf's ready to be redriven. Protected by log_redrive_lock (jfsIO thread)
78 static struct lbuf *log_redrive_list;
79 static spinlock_t log_redrive_lock = SPIN_LOCK_UNLOCKED;
80 DECLARE_WAIT_QUEUE_HEAD(jfs_IO_thread_wait);
84 * log read/write serialization (per log)
86 #define LOG_LOCK_INIT(log) init_MUTEX(&(log)->loglock)
87 #define LOG_LOCK(log) down(&((log)->loglock))
88 #define LOG_UNLOCK(log) up(&((log)->loglock))
92 * log group commit serialization (per log)
95 #define LOGGC_LOCK_INIT(log) spin_lock_init(&(log)->gclock)
96 #define LOGGC_LOCK(log) spin_lock_irq(&(log)->gclock)
97 #define LOGGC_UNLOCK(log) spin_unlock_irq(&(log)->gclock)
98 #define LOGGC_WAKEUP(tblk) wake_up_all(&(tblk)->gcwait)
101 * log sync serialization (per log)
103 #define LOGSYNC_DELTA(logsize) min((logsize)/8, 128*LOGPSIZE)
104 #define LOGSYNC_BARRIER(logsize) ((logsize)/4)
106 #define LOGSYNC_DELTA(logsize) min((logsize)/4, 256*LOGPSIZE)
107 #define LOGSYNC_BARRIER(logsize) ((logsize)/2)
112 * log buffer cache synchronization
114 static spinlock_t jfsLCacheLock = SPIN_LOCK_UNLOCKED;
116 #define LCACHE_LOCK(flags) spin_lock_irqsave(&jfsLCacheLock, flags)
117 #define LCACHE_UNLOCK(flags) spin_unlock_irqrestore(&jfsLCacheLock, flags)
120 * See __SLEEP_COND in jfs_locks.h
122 #define LCACHE_SLEEP_COND(wq, cond, flags) \
126 __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \
129 #define LCACHE_WAKEUP(event) wake_up(event)
133 * lbuf buffer cache (lCache) control
135 /* log buffer manager pageout control (cumulative, inclusive) */
136 #define lbmREAD 0x0001
137 #define lbmWRITE 0x0002 /* enqueue at tail of write queue;
138 * init pageout if at head of queue;
140 #define lbmRELEASE 0x0004 /* remove from write queue
141 * at completion of pageout;
142 * do not free/recycle it yet:
143 * caller will free it;
145 #define lbmSYNC 0x0008 /* do not return to freelist
146 * when removed from write queue;
148 #define lbmFREE 0x0010 /* return to freelist
149 * at completion of pageout;
150 * the buffer may be recycled;
152 #define lbmDONE 0x0020
153 #define lbmERROR 0x0040
154 #define lbmGC 0x0080 /* lbmIODone to perform post-GC processing
157 #define lbmDIRECT 0x0100
160 * external references
162 extern void txLazyUnlock(struct tblock * tblk);
163 extern int jfs_stop_threads;
164 extern struct completion jfsIOwait;
169 static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk,
170 struct lrd * lrd, struct tlock * tlck);
172 static int lmNextPage(struct jfs_log * log);
173 static int lmLogFileSystem(struct jfs_log * log, char *uuid, int activate);
175 static int lbmLogInit(struct jfs_log * log);
176 static void lbmLogShutdown(struct jfs_log * log);
177 static struct lbuf *lbmAllocate(struct jfs_log * log, int);
178 static void lbmFree(struct lbuf * bp);
179 static void lbmfree(struct lbuf * bp);
180 static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp);
181 static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag,
183 static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag);
184 static int lbmIOWait(struct lbuf * bp, int flag);
185 static void lbmIODone(struct buffer_head *bh, int);
186 static void lbmStartIO(struct lbuf * bp);
187 static void lmGCwrite(struct jfs_log * log, int cant_block);
188 static int lmLogSync(struct jfs_log * log, int nosyncwait);
194 #ifdef CONFIG_JFS_STATISTICS
196 uint commit; /* # of commit */
197 uint pagedone; /* # of page written */
198 uint submitted; /* # of pages submitted */
199 uint full_page; /* # of full pages submitted */
200 uint partial_page; /* # of partial pages submitted */
208 * FUNCTION: write a log record;
212 * RETURN: lsn - offset to the next log record to write (end-of-log);
215 * note: todo: log error handler
/*
 * lmLog() - write one log record on behalf of a transaction.
 *
 * Visible behavior: maintains recovery lsn ordering (page first-write lsn,
 * tblock lsn inheriting the oldest page lsn, both kept ordered on
 * log->synclist), writes the record via lmWriteRecord(), then forwards the
 * log sync point via lmLogSync() once nextsync worth of log has been written.
 * Returns the end-of-log lsn.
 * NOTE(review): serialization is presumably LOG_LOCK per the file header's
 * "a per log lock serialize log write" — confirm against full source.
 */
217 int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
222 struct metapage *mp = NULL;
224 jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p",
225 log, tblk, lrd, tlck);
229 /* log by (out-of-transaction) JFS ? */
233 /* log from page ? */
235 tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL)
239 * initialize/update page/transaction recovery lsn
246 * initialize page lsn if first log write of the page
253 /* insert page at tail of logsynclist */
254 list_add_tail(&mp->synclist, &log->synclist);
258 * initialize/update lsn of tblock of the page
260 * transaction inherits oldest lsn of pages associated
261 * with allocation/deallocation of resources (their
262 * log records are used to reconstruct allocation map
263 * at recovery time: inode for inode allocation map,
264 * B+-tree index of extent descriptors for block
266 * allocation map pages inherit transaction lsn at
267 * commit time to allow forwarding log syncpt past log
268 * records associated with allocation/deallocation of
269 * resources only after persistent map of these map pages
270 * have been updated and propagated to home.
273 * initialize transaction lsn:
275 if (tblk->lsn == 0) {
276 /* inherit lsn of its first page logged */
280 /* insert tblock after the page on logsynclist */
281 list_add(&tblk->synclist, &mp->synclist);
284 * update transaction lsn:
287 /* inherit oldest/smallest lsn of page */
/* logdiff() compares lsns modulo log wrap; smaller diff == older */
288 logdiff(diffp, mp->lsn, log);
289 logdiff(difft, tblk->lsn, log);
291 /* update tblock lsn with page lsn */
294 /* move tblock after page on logsynclist */
295 list_del(&tblk->synclist);
296 list_add(&tblk->synclist, &mp->synclist);
303 * write the log record
306 lsn = lmWriteRecord(log, tblk, lrd, tlck);
309 * forward log syncpt if log reached next syncpt trigger
311 logdiff(diffp, lsn, log);
312 if (diffp >= log->nextsync)
313 lsn = lmLogSync(log, 0);
315 /* update end-of-log lsn */
320 /* return end-of-log address */
326 * NAME: lmWriteRecord()
328 * FUNCTION: move the log record to current log page
330 * PARAMETER: cd - commit descriptor
332 * RETURN: end-of-log address
334 * serialization: LOG_LOCK() held on entry/exit
/*
 * lmWriteRecord() - copy a log record (data vectors + descriptors + lrd)
 * into the current in-memory log page, advancing to the next page via
 * lmNextPage() whenever dstoffset reaches LOGPSIZE - LOGPTLRSIZE.
 *
 * For a COMMIT record, the tblock is additionally stamped with its page
 * number / eor and enqueued at the tail of log->cqueue for group commit.
 * Returns the end-of-log lsn of the record written.
 * Serialization: LOG_LOCK() held on entry/exit (per header comment above).
 */
337 lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
340 int lsn = 0; /* end-of-log address */
341 struct lbuf *bp; /* dst log page buffer */
342 struct logpage *lp; /* dst log page */
343 caddr_t dst; /* destination address in log page */
344 int dstoffset; /* end-of-log offset in log page */
345 int freespace; /* free space in log page */
346 caddr_t p; /* src meta-data page */
349 int nbytes; /* number of bytes to move */
352 struct linelock *linelock;
359 /* retrieve destination log page to write */
360 bp = (struct lbuf *) log->bp;
361 lp = (struct logpage *) bp->l_ldata;
362 dstoffset = log->eor;
364 /* any log data to write ? */
369 * move log record data
371 /* retrieve source meta-data page to log */
372 if (tlck->flag & tlckPAGELOCK) {
373 p = (caddr_t) (tlck->mp->data);
374 linelock = (struct linelock *) & tlck->lock;
376 /* retrieve source in-memory inode to log */
377 else if (tlck->flag & tlckINODELOCK) {
378 if (tlck->type & tlckDTREE)
379 p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot;
381 p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot;
382 linelock = (struct linelock *) & tlck->lock;
385 else if (tlck->flag & tlckINLINELOCK) {
/* NOTE(review): casting &tlck (pointer-to-pointer) rather than
 * &tlck->lock looks wrong, but this branch is inside _JFS_WIP
 * (see #endif below) and is presumably compiled out — verify. */
387 inlinelock = (struct inlinelock *) & tlck;
388 p = (caddr_t) & inlinelock->pxd;
389 linelock = (struct linelock *) & tlck;
391 #endif /* _JFS_WIP */
393 jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck);
394 return 0; /* Probably should trap */
396 l2linesize = linelock->l2linesize;
399 ASSERT(linelock->index <= linelock->maxcnt);
/* walk the linelock vectors; each lv describes one span to log */
402 for (i = 0; i < linelock->index; i++, lv++) {
407 if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) {
408 /* page become full: move on to next page */
412 lp = (struct logpage *) bp->l_ldata;
413 dstoffset = LOGPHDRSIZE;
417 * move log vector data
419 src = (u8 *) p + (lv->offset << l2linesize);
420 srclen = lv->length << l2linesize;
/* copy at most the free space of the current page per iteration */
423 freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
424 nbytes = min(freespace, srclen);
425 dst = (caddr_t) lp + dstoffset;
426 memcpy(dst, src, nbytes);
429 /* is page not full ? */
430 if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
433 /* page become full: move on to next page */
436 bp = (struct lbuf *) log->bp;
437 lp = (struct logpage *) bp->l_ldata;
438 dstoffset = LOGPHDRSIZE;
445 * move log vector descriptor
448 lvd = (struct lvd *) ((caddr_t) lp + dstoffset);
449 lvd->offset = cpu_to_le16(lv->offset);
450 lvd->length = cpu_to_le16(lv->length);
452 jfs_info("lmWriteRecord: lv offset:%d length:%d",
453 lv->offset, lv->length);
/* chained linelocks: continue with the next linelock in the chain */
456 if ((i = linelock->next)) {
457 linelock = (struct linelock *) lid_to_tlock(i);
462 * move log record descriptor
465 lrd->length = cpu_to_le16(len);
471 freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
472 nbytes = min(freespace, srclen);
473 dst = (caddr_t) lp + dstoffset;
474 memcpy(dst, src, nbytes);
479 /* are there more to move than freespace of page ? */
484 * end of log record descriptor
487 /* update last log record eor */
488 log->eor = dstoffset;
489 bp->l_eor = dstoffset;
490 lsn = (log->page << L2LOGPSIZE) + dstoffset;
492 if (lrd->type & cpu_to_le16(LOG_COMMIT)) {
494 jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn,
497 INCREMENT(lmStat.commit); /* # of commit */
500 * enqueue tblock for group commit:
502 * enqueue tblock of non-trivial/synchronous COMMIT
503 * at tail of group commit queue
504 * (trivial/asynchronous COMMITs are ignored by
509 /* init tblock gc state */
510 tblk->flag = tblkGC_QUEUE;
512 tblk->pn = log->page;
513 tblk->eor = log->eor;
515 /* enqueue transaction to commit queue */
517 if (log->cqueue.head) {
518 log->cqueue.tail->cqnext = tblk;
519 log->cqueue.tail = tblk;
521 log->cqueue.head = log->cqueue.tail = tblk;
526 jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x",
527 le16_to_cpu(lrd->type), log->bp, log->page, dstoffset);
529 /* page not full ? */
530 if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
534 /* page become full: move on to next page */
537 bp = (struct lbuf *) log->bp;
538 lp = (struct logpage *) bp->l_ldata;
539 dstoffset = LOGPHDRSIZE;
550 * FUNCTION: write current page and allocate next page.
556 * serialization: LOG_LOCK() held on entry/exit
/*
 * lmNextPage() - finalize and write (or queue) the current, full log page
 * and allocate/initialize the next log page buffer.
 *
 * If the tail tblock on the commit queue has its COMMIT on this page, the
 * page is only enqueued (group commit will write it); otherwise it is
 * written and released immediately.  The page number wraps to 2 because
 * page 0 is unused and page 1 holds the log superblock.
 * Serialization: LOG_LOCK() held on entry/exit (per header comment above).
 */
558 static int lmNextPage(struct jfs_log * log)
561 int lspn; /* log sequence page number */
562 int pn; /* current page number */
567 /* get current log page number and log sequence page number */
570 lp = (struct logpage *) bp->l_ldata;
571 lspn = le32_to_cpu(lp->h.page);
576 * write or queue the full page at the tail of write queue
578 /* get the tail tblk on commit queue */
579 tblk = log->cqueue.tail;
581 /* every tblk who has COMMIT record on the current page,
582 * and has not been committed, must be on commit queue
583 * since tblk is queued at commit queue at the time
584 * of writing its COMMIT record on the page before
585 * page becomes full (even though the tblk thread
586 * who wrote COMMIT record may have been suspended
590 /* is page bound with outstanding tail tblk ? */
591 if (tblk && tblk->pn == pn) {
592 /* mark tblk for end-of-page */
593 tblk->flag |= tblkGC_EOP;
595 if (log->cflag & logGC_PAGEOUT) {
596 /* if page is not already on write queue,
597 * just enqueue (no lbmWRITE to prevent redrive)
598 * buffer to wqueue to ensure correct serial order
599 * of the pages since log pages will be added
602 if (bp->l_wqnext == NULL)
603 lbmWrite(log, bp, 0, 0);
606 * No current GC leader, initiate group commit
608 log->cflag |= logGC_PAGEOUT;
612 /* page is not bound with outstanding tblk:
613 * init write or mark it to be redriven (lbmWRITE)
616 /* finalize the page */
617 bp->l_ceor = bp->l_eor;
618 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
619 lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0);
624 * allocate/initialize next page
626 /* if log wraps, the first data page of log is 2
627 * (0 never used, 1 is superblock).
629 log->page = (pn == log->size - 1) ? 2 : pn + 1;
630 log->eor = LOGPHDRSIZE; /* ? valid page empty/full at logRedo() */
632 /* allocate/initialize next log page buffer */
633 nextbp = lbmAllocate(log, log->page);
634 nextbp->l_eor = log->eor;
637 /* initialize next log page */
/* head and trailer carry the same lspn/eor for torn-write detection */
638 lp = (struct logpage *) nextbp->l_ldata;
639 lp->h.page = lp->t.page = cpu_to_le32(lspn + 1);
640 lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);
647 * NAME: lmGroupCommit()
649 * FUNCTION: group commit
650 * initiate pageout of the pages with COMMIT in the order of
651 * page number - redrive pageout of the page at the head of
652 * pageout queue until full page has been written.
657 * LOGGC_LOCK serializes log group commit queue, and
658 * transaction blocks on the commit queue.
659 * N.B. LOG_LOCK is NOT held during lmGroupCommit().
/*
 * lmGroupCommit() - wait for (or lead) the group commit of tblk.
 *
 * Fast-exits if the tblock is already committed.  If no pageout is in
 * progress this caller becomes the group leader (sets logGC_PAGEOUT and,
 * per the visible flow, drives lmGCwrite).  Lazy transactions return
 * without sleeping; others sleep on tblk->gcwait until tblkGC_COMMITTED.
 * Serialization: LOGGC_LOCK; LOG_LOCK is NOT held (per header above).
 */
661 int lmGroupCommit(struct jfs_log * log, struct tblock * tblk)
667 /* group committed already ? */
668 if (tblk->flag & tblkGC_COMMITTED) {
669 if (tblk->flag & tblkGC_ERROR)
675 jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc);
677 if (tblk->xflag & COMMIT_LAZY)
678 tblk->flag |= tblkGC_LAZY;
680 if ((!(log->cflag & logGC_PAGEOUT)) && log->cqueue.head &&
681 (!(tblk->xflag & COMMIT_LAZY) || test_bit(log_FLUSH, &log->flag))) {
683 * No pageout in progress
685 * start group commit as its group leader.
687 log->cflag |= logGC_PAGEOUT;
692 if (tblk->xflag & COMMIT_LAZY) {
694 * Lazy transactions can leave now
700 /* lmGCwrite gives up LOGGC_LOCK, check again */
702 if (tblk->flag & tblkGC_COMMITTED) {
703 if (tblk->flag & tblkGC_ERROR)
710 /* upcount transaction waiting for completion
713 tblk->flag |= tblkGC_READY;
/* sleep until lmPostGC() marks us COMMITTED and wakes gcwait */
715 __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED),
716 LOGGC_LOCK(log), LOGGC_UNLOCK(log));
718 /* removed from commit queue */
719 if (tblk->flag & tblkGC_ERROR)
729 * FUNCTION: group commit write
730 * initiate write of log page, building a group of all transactions
731 * with commit records on that page.
736 * LOGGC_LOCK must be held by caller.
737 * N.B. LOG_LOCK is NOT held during lmGroupCommit().
/*
 * lmGCwrite() - build the commit group for one log page and start its
 * pageout.
 *
 * Scans the commit queue from the head, marking every tblock whose COMMIT
 * record lives on the same page (pn == gcpn) with tblkGC_COMMIT.  A page
 * already marked end-of-page (tblkGC_EOP) is written with RELEASE|FREE and
 * counted as a full page; otherwise a partial-page write is issued with
 * lbmGC so lmPostGC() runs at I/O completion.
 * Serialization: caller holds LOGGC_LOCK (per header comment above).
 */
739 static void lmGCwrite(struct jfs_log * log, int cant_write)
743 int gcpn; /* group commit page number */
745 struct tblock *xtblk;
748 * build the commit group of a log page
750 * scan commit queue and make a commit group of all
751 * transactions with COMMIT records on the same log page.
753 /* get the head tblk on the commit queue */
754 tblk = xtblk = log->cqueue.head;
757 while (tblk && tblk->pn == gcpn) {
760 /* state transition: (QUEUE, READY) -> COMMIT */
761 tblk->flag |= tblkGC_COMMIT;
764 tblk = xtblk; /* last tblk of the page */
767 * pageout to commit transactions on the log page.
769 bp = (struct lbuf *) tblk->bp;
770 lp = (struct logpage *) bp->l_ldata;
771 /* is page already full ? */
772 if (tblk->flag & tblkGC_EOP) {
773 /* mark page to free at end of group commit of the page */
774 tblk->flag &= ~tblkGC_EOP;
775 tblk->flag |= tblkGC_FREE;
776 bp->l_ceor = bp->l_eor;
777 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
778 lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC,
780 INCREMENT(lmStat.full_page);
782 /* page is not yet full */
/* commit eor (l_ceor) is the last COMMIT's eor, not the page eor */
784 bp->l_ceor = tblk->eor; /* ? bp->l_ceor = bp->l_eor; */
785 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
786 lbmWrite(log, bp, lbmWRITE | lbmGC, cant_write);
787 INCREMENT(lmStat.partial_page);
794 * FUNCTION: group commit post-processing
795 * Processes transactions after their commit records have been written
796 * to disk, redriving log I/O if necessary.
801 * This routine is called at interrupt time by lbmIODone
/*
 * lmPostGC() - group-commit completion processing, run from lbmIODone
 * at interrupt time under log->gclock.
 *
 * Dequeues every tblock marked tblkGC_COMMIT (its COMMIT record is now on
 * disk), records errors from the buffer, clears the log_FLUSH target when
 * reached, hands non-forced commits to the lazy-commit path, marks the
 * rest COMMITTED and wakes their waiters.  Then either finalizes/redrives
 * the page, elects a new group leader via lmGCwrite(), or clears
 * logGC_PAGEOUT when no transaction is ready.
 */
803 static void lmPostGC(struct lbuf * bp)
806 struct jfs_log *log = bp->l_log;
811 spin_lock_irqsave(&log->gclock, flags);
813 * current pageout of group commit completed.
815 * remove/wakeup transactions from commit queue who were
816 * group committed with the current log page
818 while ((tblk = log->cqueue.head) && (tblk->flag & tblkGC_COMMIT)) {
819 /* if transaction was marked GC_COMMIT then
820 * it has been shipped in the current pageout
821 * and made it to disk - it is committed.
824 if (bp->l_flag & lbmERROR)
825 tblk->flag |= tblkGC_ERROR;
827 /* remove it from the commit queue */
828 log->cqueue.head = tblk->cqnext;
829 if (log->cqueue.head == NULL)
830 log->cqueue.tail = NULL;
831 tblk->flag &= ~tblkGC_QUEUE;
834 if (tblk == log->flush_tblk) {
835 /* we can stop flushing the log now */
836 clear_bit(log_FLUSH, &log->flag);
837 log->flush_tblk = NULL;
840 jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk,
843 if (!(tblk->xflag & COMMIT_FORCE))
845 * Hand tblk over to lazy commit thread
849 /* state transition: COMMIT -> COMMITTED */
850 tblk->flag |= tblkGC_COMMITTED;
852 if (tblk->flag & tblkGC_READY)
858 /* was page full before pageout ?
859 * (and this is the last tblk bound with the page)
861 if (tblk->flag & tblkGC_FREE)
863 /* did page become full after pageout ?
864 * (and this is the last tblk bound with the page)
866 else if (tblk->flag & tblkGC_EOP) {
867 /* finalize the page */
868 lp = (struct logpage *) bp->l_ldata;
869 bp->l_ceor = bp->l_eor;
870 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
871 jfs_info("lmPostGC: calling lbmWrite");
872 lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE,
878 /* are there any transactions who have entered lmGroupCommit()
879 * (whose COMMITs are after that of the last log page written.
880 * They are waiting for new group commit (above at (SLEEP 1))
881 * or lazy transactions are on a full (queued) log page,
882 * select the latest ready transaction as new group leader and
883 * wake her up to lead her group.
885 if ((tblk = log->cqueue.head) &&
886 ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) ||
887 test_bit(log_FLUSH, &log->flag)))
889 * Call lmGCwrite with new group leader
893 /* no transaction are ready yet (transactions are only just
894 * queued (GC_QUEUE) and not entered for group commit yet).
895 * the first transaction entering group commit
896 * will elect herself as new group leader.
899 log->cflag &= ~logGC_PAGEOUT;
902 spin_unlock_irqrestore(&log->gclock, flags);
909 * FUNCTION: write log SYNCPT record for specified log
910 * if new sync address is available
911 * (normally the case if sync() is executed by back-ground
913 * if not, explicitly run jfs_blogsync() to initiate
914 * getting of new sync address.
915 * calculate new value of i_nextsync which determines when
916 * this code is called again.
918 * this is called only from lmLog().
920 * PARAMETER: ip - pointer to logs inode.
924 * serialization: LOG_LOCK() held on entry/exit
/*
 * lmLogSync() - forward the log sync point and compute the next trigger.
 *
 * If sync == syncpt, a new sync value is derived from the oldest entry on
 * log->synclist (or log->lsn when the list is empty).  If sync has moved,
 * dirty metapage data is fsync'd and a SYNCPT record is written.  The next
 * trigger is written + min(free/2, LOGSYNC_DELTA); when less than two log
 * pages of headroom remain the log is considered wrapped.  A sync-barrier
 * is raised when more than LOGSYNC_BARRIER of the log is consumed.
 * Serialization: LOG_LOCK() held on entry/exit (per header comment above).
 */
926 static int lmLogSync(struct jfs_log * log, int nosyncwait)
929 int written; /* written since last syncpt */
930 int free; /* free space left available */
931 int delta; /* additional delta to write normally */
932 int more; /* additional write granted */
935 struct logsyncblk *lp;
940 /* if last sync is same as last syncpt,
941 * invoke sync point forward processing to update sync.
944 if (log->sync == log->syncpt) {
946 /* ToDo: push dirty metapages out to disk */
949 if (list_empty(&log->synclist))
950 log->sync = log->lsn;
952 lp = list_entry(log->synclist.next,
953 struct logsyncblk, synclist);
960 /* if sync is different from last syncpt,
961 * write a SYNCPT record with syncpt = sync.
962 * reset syncpt = sync
964 if (log->sync != log->syncpt) {
965 struct super_block *sb = log->sb;
966 struct jfs_sb_info *sbi = JFS_SBI(sb);
969 * We need to make sure all of the "written" metapages
970 * actually make it to disk
972 fsync_inode_data_buffers(sbi->ipbmap);
973 fsync_inode_data_buffers(sbi->ipimap);
974 fsync_inode_data_buffers(sb->s_bdev->bd_inode);
978 lrd.type = cpu_to_le16(LOG_SYNCPT);
980 lrd.log.syncpt.sync = cpu_to_le32(log->sync);
981 lsn = lmWriteRecord(log, NULL, &lrd, NULL);
983 log->syncpt = log->sync;
988 * setup next syncpt trigger (SWAG)
990 logsize = log->logsize;
992 logdiff(written, lsn, log);
993 free = logsize - written;
994 delta = LOGSYNC_DELTA(logsize);
995 more = min(free / 2, delta);
996 if (more < 2 * LOGPSIZE) {
997 jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n");
1001 * option 1 - panic ? No.!
1002 * option 2 - shutdown file systems
1003 * associated with log ?
1004 * option 3 - extend log ?
1007 * option 4 - second chance
1009 * mark log wrapped, and continue.
1010 * when all active transactions are completed,
1011 * mark log valid for recovery.
1012 * if crashed during invalid state, log state
1013 * implies invalid log, forcing fsck().
1015 /* mark log state log wrap in log superblock */
1016 /* log->state = LOGWRAP; */
1018 /* reset sync point computation */
1019 log->syncpt = log->sync = lsn;
1020 log->nextsync = delta;
1022 /* next syncpt trigger = written + more */
1023 log->nextsync = written + more;
1025 /* return if lmLogSync() from outside of transaction, e.g., sync() */
1029 /* if number of bytes written from last sync point is more
1030 * than 1/4 of the log size, stop new transactions from
1031 * starting until all current transactions are completed
1032 * by setting syncbarrier flag.
1034 if (written > LOGSYNC_BARRIER(logsize) && logsize > 32 * LOGPSIZE) {
1035 set_bit(log_SYNCBARRIER, &log->flag);
1036 jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn,
1039 * We may have to initiate group commit
1041 jfs_flush_journal(log, 0);
1051 * FUNCTION: open the log on first open;
1052 * insert filesystem in the active list of the log.
1054 * PARAMETER: ipmnt - file system mount inode
1055 * iplog - log inode (out)
/*
 * lmLogOpen() - open the log for a filesystem at first mount.
 *
 * Allocates and zeroes a jfs_log, then either configures an inline log
 * from the host filesystem's logpxd (1:1 with the fs) or opens an
 * external log device via bdget()/blkdev_get() (may be shared n:1),
 * runs lmLogInit(), and registers the filesystem as active on the log.
 * Error unwinding: shutdown -> close -> free labels below release, in
 * order, the lbm state, the external block device, and the descriptor.
 */
1061 int lmLogOpen(struct super_block *sb, struct jfs_log ** logptr)
1064 struct block_device *bdev;
1065 struct jfs_log *log;
1067 if (!(log = kmalloc(sizeof(struct jfs_log), GFP_KERNEL)))
1069 memset(log, 0, sizeof(struct jfs_log));
1070 init_waitqueue_head(&log->syncwait);
1072 log->sb = sb; /* This should be a list */
1074 if (!(JFS_SBI(sb)->mntflag & JFS_INLINELOG))
1078 * in-line log in host file system
1080 * file system to log have 1-to-1 relationship;
1083 set_bit(log_INLINELOG, &log->flag);
1084 log->bdev = sb->s_bdev;
1085 log->base = addressPXD(&JFS_SBI(sb)->logpxd);
1086 log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
1087 (L2LOGPSIZE - sb->s_blocksize_bits);
1088 log->l2bsize = sb->s_blocksize_bits;
1089 ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits);
1094 if ((rc = lmLogInit(log)))
1099 * external log as separate logical volume
1101 * file systems to log may have n-to-1 relationship;
1106 * TODO: Check for already opened log devices
1109 if (!(bdev = bdget(kdev_t_to_nr(JFS_SBI(sb)->logdev)))) {
1114 if ((rc = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS))) {
1119 memcpy(log->uuid, JFS_SBI(sb)->loguuid, sizeof(log->uuid));
1124 if ((rc = lmLogInit(log)))
1128 * add file system to log active file system list
1130 if ((rc = lmLogFileSystem(log, JFS_SBI(sb)->uuid, 1)))
1140 shutdown: /* unwind lbmLogInit() */
1141 lbmLogShutdown(log);
1143 close: /* close external log device */
1144 blkdev_put(bdev, BDEV_FS);
1146 free: /* free log descriptor */
1149 jfs_warn("lmLogOpen: exit(%d)", rc);
1157 * FUNCTION: log initialization at first log open.
1159 * logredo() (or logformat()) should have been run previously.
1160 * initialize the log inode from log superblock.
1161 * set the log state in the superblock to LOGMOUNT and
1162 * write SYNCPT log record.
1164 * PARAMETER: log - log structure
1167 * -EINVAL - bad log magic number or superblock dirty
1168 * error returned from logwait()
1170 * serialization: single first open thread
/*
 * lmLogInit() - initialize the log at first open.
 *
 * Validates the on-disk log superblock (magic LOGMAGIC, state LOGREDONE —
 * i.e. logredo must have completed), loads geometry (size, l2bsize,
 * current page/eor), honors JFS_NOINTEGRITY by remembering the current
 * page/eor, establishes the current end-of-log page buffer, writes the
 * first SYNCPT record synchronously, initializes logsync parameters and
 * locks, then bumps the serial number and writes the superblock with
 * state LOGMOUNT.  errout30/20/10 labels unwind in reverse order.
 * Serialization: single first-open thread (per header comment above).
 */
1172 int lmLogInit(struct jfs_log * log)
1176 struct logsuper *logsuper;
1177 struct lbuf *bpsuper;
1182 jfs_info("lmLogInit: log:0x%p", log);
1185 * log inode is overlaid on generic inode where
1186 * dinode have been zeroed out by iRead();
1190 * initialize log i/o
1192 if ((rc = lbmLogInit(log)))
1196 * validate log superblock
1200 if (!test_bit(log_INLINELOG, &log->flag))
1201 log->l2bsize = 12; /* XXX kludge alert XXX */
1202 if ((rc = lbmRead(log, 1, &bpsuper)))
1205 logsuper = (struct logsuper *) bpsuper->l_ldata;
1207 if (logsuper->magic != cpu_to_le32(LOGMAGIC)) {
1208 jfs_warn("*** Log Format Error ! ***");
1213 /* logredo() should have been run successfully. */
1214 if (logsuper->state != cpu_to_le32(LOGREDONE)) {
1215 jfs_warn("*** Log Is Dirty ! ***");
1220 /* initialize log inode from log superblock */
1221 if (test_bit(log_INLINELOG,&log->flag)) {
1222 if (log->size != le32_to_cpu(logsuper->size)) {
1226 jfs_info("lmLogInit: inline log:0x%p base:0x%Lx size:0x%x",
1227 log, (unsigned long long) log->base, log->size);
/* external log: uuid must match the one recorded at mkfs/logformat */
1229 if (memcmp(logsuper->uuid, log->uuid, 16)) {
1230 jfs_warn("wrong uuid on JFS log device");
1233 log->size = le32_to_cpu(logsuper->size);
1234 log->l2bsize = le32_to_cpu(logsuper->l2bsize);
1235 jfs_info("lmLogInit: external log:0x%p base:0x%Lx size:0x%x",
1236 log, (unsigned long long) log->base, log->size);
1239 log->page = le32_to_cpu(logsuper->end) / LOGPSIZE;
1240 log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page);
1242 /* check for disabled journaling to disk */
1243 if (JFS_SBI(log->sb)->flag & JFS_NOINTEGRITY) {
1244 log->no_integrity = 1;
1245 log->ni_page = log->page;
1246 log->ni_eor = log->eor;
1249 log->no_integrity = 0;
1252 * initialize for log append write mode
1254 /* establish current/end-of-log page/buffer */
1255 if ((rc = lbmRead(log, log->page, &bp)))
1258 lp = (struct logpage *) bp->l_ldata;
1260 jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d",
1261 le32_to_cpu(logsuper->end), log->page, log->eor,
1262 le16_to_cpu(lp->h.eor));
1264 // ASSERT(log->eor == lp->h.eor);
1267 bp->l_pn = log->page;
1268 bp->l_eor = log->eor;
1270 /* initialize the group commit serialization lock */
1271 LOGGC_LOCK_INIT(log);
1273 /* if current page is full, move on to next page */
1274 if (log->eor >= LOGPSIZE - LOGPTLRSIZE)
1277 /* allocate/initialize the log write serialization lock */
1281 * initialize log syncpoint
1284 * write the first SYNCPT record with syncpoint = 0
1285 * (i.e., log redo up to HERE !);
1286 * remove current page from lbm write queue at end of pageout
1287 * (to write log superblock update), but do not release to freelist;
1291 lrd.type = cpu_to_le16(LOG_SYNCPT);
1293 lrd.log.syncpt.sync = 0;
1294 lsn = lmWriteRecord(log, NULL, &lrd, NULL);
1296 bp->l_ceor = bp->l_eor;
1297 lp = (struct logpage *) bp->l_ldata;
1298 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
1299 lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0);
1300 if ((rc = lbmIOWait(bp, 0)))
1303 /* initialize logsync parameters */
/* usable log excludes page 0 (unused) and page 1 (superblock) */
1304 log->logsize = (log->size - 2) << L2LOGPSIZE;
1307 log->sync = log->syncpt;
1308 log->nextsync = LOGSYNC_DELTA(log->logsize);
1310 jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x",
1311 log->lsn, log->syncpt, log->sync);
1313 LOGSYNC_LOCK_INIT(log);
1315 INIT_LIST_HEAD(&log->synclist);
1317 log->cqueue.head = log->cqueue.tail = NULL;
1318 log->flush_tblk = NULL;
1323 * initialize for lazy/group commit
1328 * update/write superblock
1330 logsuper->state = cpu_to_le32(LOGMOUNT);
1331 log->serial = le32_to_cpu(logsuper->serial) + 1;
1332 logsuper->serial = cpu_to_le32(log->serial);
1333 lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
1334 if ((rc = lbmIOWait(bpsuper, lbmFREE)))
1342 errout30: /* release log page */
1344 bp->l_wqnext = NULL;
1347 errout20: /* release log superblock */
1350 errout10: /* unwind lbmLogInit() */
1351 lbmLogShutdown(log);
1353 jfs_warn("lmLogInit: exit(%d)", rc);
1359 * NAME: lmLogClose()
1361 * FUNCTION: remove file system <ipmnt> from active list of log <iplog>
1362 * and close it on last close.
1364 * PARAMETER: sb - superblock
1367 * RETURN: errors from subroutines
/*
 * lmLogClose() - detach filesystem sb from the log and shut the log down.
 *
 * Inline log: just lmLogShutdown().  External log: deregister the fs from
 * the log's active list (lmLogFileSystem(..., 0)), shut down, and release
 * the log block device with blkdev_put().  Returns lmLogShutdown()'s rc.
 */
1371 int lmLogClose(struct super_block *sb, struct jfs_log * log)
1375 jfs_info("lmLogClose: log:0x%p", log);
1377 if (!test_bit(log_INLINELOG, &log->flag))
1381 * in-line log in host file system
1383 rc = lmLogShutdown(log);
1387 * external log as separate logical volume
1390 lmLogFileSystem(log, JFS_SBI(sb)->uuid, 0);
1391 rc = lmLogShutdown(log);
1392 blkdev_put(log->bdev, BDEV_FS);
1396 jfs_info("lmLogClose: exit(%d)", rc);
1402 * NAME: jfs_flush_journal()
1404 * FUNCTION: initiate write of any outstanding transactions to the journal
1405 * and optionally wait until they are all written to disk
1407 * wait == 0 flush until latest txn is committed, don't wait
1408 * wait == 1 flush until latest txn is committed, wait
1409 * wait > 1 flush until all txn's are complete, wait
/*
 * jfs_flush_journal() - push outstanding transactions to the journal.
 *
 * wait == 0: initiate flush up to the latest queued transaction, no wait;
 * wait == 1: also sleep (uninterruptibly) until that transaction commits;
 * wait >  1: flush until ALL activity completes (flush_tblk = NULL), then
 * poll (up to 800 * HZ/4 ticks) for the commit queue and synclist to
 * drain before asserting both are empty and clearing log_FLUSH.
 */
1411 void jfs_flush_journal(struct jfs_log *log, int wait)
1414 struct tblock *target;
1417 /* jfs_write_inode may call us during read-only mount */
1420 jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait);
1424 target = log->cqueue.head;
1428 * This ensures that we will keep writing to the journal as long
1429 * as there are unwritten commit records
1432 if (test_bit(log_FLUSH, &log->flag)) {
1434 * We're already flushing.
1435 * if flush_tblk is NULL, we are flushing everything,
1436 * so leave it that way. Otherwise, update it to the
1437 * latest transaction
1439 if (log->flush_tblk)
1440 log->flush_tblk = target;
1442 /* Only flush until latest transaction is committed */
1443 log->flush_tblk = target;
1444 set_bit(log_FLUSH, &log->flag);
1447 * Initiate I/O on outstanding transactions
1449 if (!(log->cflag & logGC_PAGEOUT)) {
1450 log->cflag |= logGC_PAGEOUT;
1455 if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) {
1456 /* Flush until all activity complete */
1457 set_bit(log_FLUSH, &log->flag);
1458 log->flush_tblk = NULL;
1461 if (wait && target && !(target->flag & tblkGC_COMMITTED)) {
/* open-coded sleep on target->gcwait until lmPostGC commits target */
1462 DECLARE_WAITQUEUE(__wait, current);
1464 add_wait_queue(&target->gcwait, &__wait);
1465 set_current_state(TASK_UNINTERRUPTIBLE);
1468 current->state = TASK_RUNNING;
1470 remove_wait_queue(&target->gcwait, &__wait);
1478 * If there was recent activity, we may need to wait
1479 * for the lazycommit thread to catch up
1481 if (log->cqueue.head || !list_empty(&log->synclist)) {
1482 for (i = 0; i < 800; i++) { /* Too much? */
1483 current->state = TASK_INTERRUPTIBLE;
1484 schedule_timeout(HZ / 4);
1485 if ((log->cqueue.head == NULL) &&
1486 list_empty(&log->synclist))
1490 assert(log->cqueue.head == NULL);
1491 assert(list_empty(&log->synclist));
1492 clear_bit(log_FLUSH, &log->flag);
1496 * NAME: lmLogShutdown()
1498 * FUNCTION: log shutdown at last LogClose().
1500 * write log syncpt record.
1501 * update super block to set redone flag to 0.
1503 * PARAMETER: log - log inode
1505 * RETURN: 0 - success
1507 * serialization: single last close thread
1509 int lmLogShutdown(struct jfs_log * log)
1514 struct logsuper *logsuper;
1515 struct lbuf *bpsuper;
1519 jfs_info("lmLogShutdown: log:0x%p", log);
/* force every queued commit record out and wait for all log activity */
1521 jfs_flush_journal(log, 2);
1524 * We need to make sure all of the "written" metapages
1525 * actually make it to disk
1527 fsync_no_super(log->sb->s_dev);
1530 * write the last SYNCPT record with syncpoint = 0
1531 * (i.e., log redo up to HERE !)
1535 lrd.type = cpu_to_le16(LOG_SYNCPT);
1537 lrd.log.syncpt.sync = 0;
1539 /* check for disabled journaling to disk */
1540 if (JFS_SBI(log->sb)->flag & JFS_NOINTEGRITY) {
/* restore the real on-disk log position saved while integrity was off */
1541 log->no_integrity = 0;
1542 log->page = log->ni_page;
1543 log->eor = log->ni_eor;
1546 lsn = lmWriteRecord(log, NULL, &lrd, NULL);
1548 lp = (struct logpage *) bp->l_ldata;
1549 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
/* synchronously push the final log page and release its buffer */
1550 lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0);
1551 lbmIOWait(log->bp, lbmFREE);
1554 * synchronous update log superblock
1555 * mark log state as shutdown cleanly
1556 * (i.e., Log does not need to be replayed).
1558 if ((rc = lbmRead(log, 1, &bpsuper)))
1561 logsuper = (struct logsuper *) bpsuper->l_ldata;
1562 logsuper->state = cpu_to_le32(LOGREDONE);
1563 logsuper->end = cpu_to_le32(lsn);
1564 lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
1565 rc = lbmIOWait(bpsuper, lbmFREE);
1567 jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d",
1568 lsn, log->page, log->eor);
1572 * shutdown per log i/o
1574 lbmLogShutdown(log);
1577 jfs_warn("lmLogShutdown: exit(%d)", rc);
1584 * NAME: lmLogFileSystem()
1586 * FUNCTION: insert (<activate> = true)/remove (<activate> = false)
1587 * file system into/from log active file system list.
1589 * PARAMETER: log - pointer to log's inode.
1590 * fsdev - kdev_t of filesystem.
1591 * serial - pointer to returned log serial number
1592 * activate - insert/remove device from active list.
1594 * RETURN: 0 - success
1595 * errors returned by vms_iowait().
1597 static int lmLogFileSystem(struct jfs_log * log, char *uuid, int activate)
1601 struct logsuper *logsuper;
1602 struct lbuf *bpsuper;
1605 * insert/remove file system device to log active file system list.
1607 if ((rc = lbmRead(log, 1, &bpsuper)))
1610 logsuper = (struct logsuper *) bpsuper->l_ldata;
/* activate path: claim the first free (NULL_UUID) slot in the active list
 * (the if (activate) branch statement is elided from this view) */
1612 for (i = 0; i < MAX_ACTIVE; i++)
1613 if (!memcmp(logsuper->active[i].uuid, NULL_UUID, 16)) {
1614 memcpy(logsuper->active[i].uuid, uuid, 16);
1617 if (i == MAX_ACTIVE) {
1618 jfs_warn("Too many file systems sharing journal!");
1620 return -EMFILE; /* Is there a better rc? */
/* deactivate path: find this fs's uuid and clear its slot to NULL_UUID */
1623 for (i = 0; i < MAX_ACTIVE; i++)
1624 if (!memcmp(logsuper->active[i].uuid, uuid, 16)) {
1625 memcpy(logsuper->active[i].uuid, NULL_UUID, 16);
1628 if (i == MAX_ACTIVE) {
/* uuid was not in the active list: superblock is inconsistent */
1629 jfs_warn("Somebody stomped on the journal!");
1637 * synchronous write log superblock:
1639 * write sidestream bypassing write queue:
1640 * at file system mount, log super block is updated for
1641 * activation of the file system before any log record
1642 * (MOUNT record) of the file system, and at file system
1643 * unmount, all meta data for the file system has been
1644 * flushed before log super block is updated for deactivation
1645 * of the file system.
1647 lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
1648 rc = lbmIOWait(bpsuper, lbmFREE);
1654 * log buffer manager (lbm)
1655 * ------------------------
1657 * special purpose buffer manager supporting log i/o requirements.
1659 * per log write queue:
1660 * log pageout occurs in serial order by fifo write queue and
1661 restricting to a single i/o in progress at any one time.
1662 * a circular singly-linked list
1663 * (log->wrqueue points to the tail, and buffers are linked via
1664 * bp->wrqueue field), and
1665 maintains log page in pageout or waiting for pageout in serial pageout.
1671 * initialize per log I/O setup at lmLogInit()
1673 static int lbmLogInit(struct jfs_log * log)
1678 jfs_info("lbmLogInit: log:0x%p", log);
1680 /* initialize current buffer cursor */
1683 /* initialize log device write queue */
1687 * Each log has its own buffer pages allocated to it. These are
1688 * not managed by the page cache. This ensures that a transaction
1689 * writing to the log does not block trying to allocate a page from
1690 * the page cache (for the log). This would be bad, since page
1691 * allocation waits on the kswapd thread that may be committing inodes
1692 * which would cause log activity. Was that clear? I'm trying to
1693 * avoid deadlock here.
1695 init_waitqueue_head(&log->free_wait);
1697 log->lbuf_free = NULL;
/* build a LOGPAGES-deep freelist of private lbufs, one zeroed page each */
1699 for (i = 0; i < LOGPAGES; i++) {
1700 lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL);
1703 lbuf->l_bh.b_data = lbuf->l_ldata =
1704 (char *) get_zeroed_page(GFP_KERNEL);
1705 if (lbuf->l_ldata == 0) {
1710 init_waitqueue_head(&lbuf->l_ioevent);
/* pre-wire the buffer_head so the I/O paths only fill per-request fields */
1712 lbuf->l_bh.b_size = LOGPSIZE;
1713 lbuf->l_bh.b_dev = to_kdev_t(log->bdev->bd_dev);
1714 lbuf->l_bh.b_end_io = lbmIODone;
1715 lbuf->l_bh.b_private = lbuf;
1716 lbuf->l_bh.b_page = virt_to_page(lbuf->l_ldata);
1717 lbuf->l_bh.b_state = 0;
1718 init_waitqueue_head(&lbuf->l_bh.b_wait);
/* push onto head of the log's freelist */
1720 lbuf->l_freelist = log->lbuf_free;
1721 log->lbuf_free = lbuf;
/* error path: release whatever was allocated so far */
1727 lbmLogShutdown(log);
1735 * finalize per log I/O setup at lmLogShutdown()
1737 static void lbmLogShutdown(struct jfs_log * log)
1741 jfs_info("lbmLogShutdown: log:0x%p", log);
/* walk the freelist releasing each lbuf's data page; the loop construct
 * and the kfree of the lbuf itself are elided from this view */
1743 lbuf = log->lbuf_free;
1745 struct lbuf *next = lbuf->l_freelist;
1746 free_page((unsigned long) lbuf->l_ldata);
1758 * allocate an empty log buffer
1760 static struct lbuf *lbmAllocate(struct jfs_log * log, int pn)
1763 unsigned long flags;
1766 * recycle from log buffer freelist if any
/* sleep under the LCACHE lock until a buffer appears on the freelist,
 * then pop it off the head */
1769 LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags);
1770 log->lbuf_free = bp->l_freelist;
1771 LCACHE_UNLOCK(flags);
1775 bp->l_wqnext = NULL;
1776 bp->l_freelist = NULL;
/* map log page number pn to its on-device block address */
1779 bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize));
1780 bp->l_bh.b_blocknr = bp->l_blkno;
1790 * release a log buffer to freelist
1792 static void lbmFree(struct lbuf * bp)
1794 unsigned long flags;
/* locked wrapper around lbmfree(); the LCACHE_LOCK and the lbmfree()
 * call itself are elided from this view */
1800 LCACHE_UNLOCK(flags);
1803 static void lbmfree(struct lbuf * bp)
1805 struct jfs_log *log = bp->l_log;
/* a buffer still linked on the write queue must never be freed */
1807 assert(bp->l_wqnext == NULL);
1810 * return the buffer to head of freelist
1812 bp->l_freelist = log->lbuf_free;
1813 log->lbuf_free = bp;
/* wake any lbmAllocate() sleeper waiting for a free buffer */
1815 wake_up(&log->free_wait);
1823 * FUNCTION: add a log buffer to the log redrive list
1829 * Takes log_redrive_lock.
1831 static inline void lbmRedrive(struct lbuf *bp)
1833 unsigned long flags;
/* push bp onto the global redrive list under the spinlock ... */
1835 spin_lock_irqsave(&log_redrive_lock, flags);
1836 bp->l_redrive_next = log_redrive_list;
1837 log_redrive_list = bp;
1838 spin_unlock_irqrestore(&log_redrive_lock, flags);
/* ... and let the jfsIO thread issue the I/O in process context */
1840 wake_up(&jfs_IO_thread_wait);
1847 static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
1852 * allocate a log buffer
1854 *bpp = bp = lbmAllocate(log, pn);
1855 jfs_info("lbmRead: bp:0x%p pn:0x%x", bp, pn);
1857 bp->l_flag |= lbmREAD;
/* prepare the buffer_head for a synchronous read of log page pn */
1858 bp->l_bh.b_reqnext = NULL;
1859 clear_bit(BH_Uptodate, &bp->l_bh.b_state);
1860 lock_buffer(&bp->l_bh);
1861 set_bit(BH_Mapped, &bp->l_bh.b_state);
1862 set_bit(BH_Req, &bp->l_bh.b_state);
1863 bp->l_bh.b_rdev = bp->l_bh.b_dev;
1864 bp->l_bh.b_rsector = bp->l_blkno << (log->l2bsize - 9);
1865 generic_make_request(READ, &bp->l_bh);
1866 run_task_queue(&tq_disk);
/* lbmIODone() clears lbmREAD on completion, releasing this wait */
1868 wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
1877 * buffer at head of pageout queue stays after completion of
1878 * partial-page pageout and redriven by explicit initiation of
1879 * pageout by caller until full-page pageout is completed and
1882 * device driver i/o done redrives pageout of new buffer at
1883 * head of pageout queue when current buffer at head of pageout
1884 * queue is released at the completion of its full-page pageout.
1886 * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit().
1887 * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone()
1889 static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag,
1893 unsigned long flags;
1895 jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn);
1897 /* map the logical block address to physical block address */
1899 log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
1901 LCACHE_LOCK(flags); /* disable+lock */
1904 * initialize buffer for device driver
1909 * insert bp at tail of write queue associated with log
1911 * (request is either for bp already/currently at head of queue
1912 * or new bp to be inserted at tail)
1916 /* is buffer not already on write queue ? */
1917 if (bp->l_wqnext == NULL) {
1918 /* insert at tail of wqueue */
/* circular singly-linked list: tail->l_wqnext points at the head */
1924 bp->l_wqnext = tail->l_wqnext;
1925 tail->l_wqnext = bp;
1931 /* is buffer at head of wqueue and for write ? */
1932 if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) {
/* not this buffer's turn yet: lbmIODone() will redrive it later */
1933 LCACHE_UNLOCK(flags); /* unlock+enable */
1937 LCACHE_UNLOCK(flags); /* unlock+enable */
1941 else if (flag & lbmSYNC)
1954 * initiate pageout bypassing write queue for sidestream
1955 * (e.g., log superblock) write;
/* Sidestream write (e.g. log superblock): bypasses the serial write queue. */
1957 static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag)
1959 jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x",
1960 bp, flag, bp->l_pn);
1963 * initialize buffer for device driver
1965 bp->l_flag = flag | lbmDIRECT;
1967 /* map the logical block address to physical block address */
1969 log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
1972 * initiate pageout of the page
1979 * NAME: lbmStartIO()
1981 * FUNCTION: Interface to DD strategy routine
1985 * serialization: LCACHE_LOCK() is NOT held during log i/o;
1987 static void lbmStartIO(struct lbuf * bp)
1989 jfs_info("lbmStartIO");
/* hand-rolled, non-sleeping buffer lock: the buffer must be unlocked here
 * (lock_buffer() could sleep, hence the commented-out call below) */
1991 bp->l_bh.b_reqnext = NULL;
1992 set_bit(BH_Dirty, &bp->l_bh.b_state);
1993 // lock_buffer(&bp->l_bh);
1994 assert(!test_bit(BH_Lock, &bp->l_bh.b_state));
1995 set_bit(BH_Lock, &bp->l_bh.b_state);
1997 set_bit(BH_Mapped, &bp->l_bh.b_state);
1998 set_bit(BH_Req, &bp->l_bh.b_state);
1999 bp->l_bh.b_rdev = bp->l_bh.b_dev;
2000 bp->l_bh.b_rsector = bp->l_blkno << (bp->l_log->l2bsize - 9);
/* journaling disabled: fake a successful completion instead of real I/O */
2002 if (bp->l_log->no_integrity)
2003 /* don't really do I/O */
2004 lbmIODone(&bp->l_bh, 1);
2006 generic_make_request(WRITE, &bp->l_bh);
2008 INCREMENT(lmStat.submitted);
2009 run_task_queue(&tq_disk);
/* Wait for I/O completion on bp; returns -EIO on error, 0 on success. */
2016 static int lbmIOWait(struct lbuf * bp, int flag)
2018 unsigned long flags;
2021 jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);
2023 LCACHE_LOCK(flags); /* disable+lock */
/* sleep until lbmIODone() sets lbmDONE */
2025 LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), flags);
2027 rc = (bp->l_flag & lbmERROR) ? -EIO : 0;
2032 LCACHE_UNLOCK(flags); /* unlock+enable */
2034 jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);
2041 * executed at INTIODONE level
/*
 * I/O completion handler for log buffers (runs at interrupt time).
 * NOTE(review): several branch/brace lines are elided from this view;
 * comments describe only the visible statements.
 */
2043 static void lbmIODone(struct buffer_head *bh, int uptodate)
2045 struct lbuf *bp = bh->b_private;
2046 struct lbuf *nextbp, *tail;
2047 struct jfs_log *log;
2048 unsigned long flags;
2051 * get back jfs buffer bound to the i/o buffer
2053 jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag);
2055 LCACHE_LOCK(flags); /* disable+lock */
2057 unlock_buffer(&bp->l_bh);
2058 bp->l_flag |= lbmDONE;
/* error path (the !uptodate test is elided from this view) */
2061 bp->l_flag |= lbmERROR;
2063 jfs_err("lbmIODone: I/O error in JFS log");
/* read completion: clear lbmREAD and wake the synchronous waiter */
2069 if (bp->l_flag & lbmREAD) {
2070 bp->l_flag &= ~lbmREAD;
2072 LCACHE_UNLOCK(flags); /* unlock+enable */
2074 /* wakeup I/O initiator */
2075 LCACHE_WAKEUP(&bp->l_ioevent);
2081 * pageout completion
2083 * the bp at the head of write queue has completed pageout.
2085 * if single-commit/full-page pageout, remove the current buffer
2086 * from head of pageout queue, and redrive pageout with
2087 * the new buffer at head of pageout queue;
2088 * otherwise, the partial-page pageout buffer stays at
2089 * the head of pageout queue to be redriven for pageout
2090 * by lmGroupCommit() until full-page pageout is completed.
2092 bp->l_flag &= ~lbmWRITE;
2093 INCREMENT(lmStat.pagedone);
2095 /* update committed lsn */
2097 log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor;
/* sidestream (direct) write: nothing queued, just wake the waiter */
2099 if (bp->l_flag & lbmDIRECT) {
2100 LCACHE_WAKEUP(&bp->l_ioevent);
2101 LCACHE_UNLOCK(flags);
2107 /* single element queue */
2109 /* remove head buffer of full-page pageout
2110 * from log device write queue
2112 if (bp->l_flag & lbmRELEASE) {
2114 bp->l_wqnext = NULL;
2117 /* multi element queue */
2119 /* remove head buffer of full-page pageout
2120 * from log device write queue
2122 if (bp->l_flag & lbmRELEASE) {
2123 nextbp = tail->l_wqnext = bp->l_wqnext;
2124 bp->l_wqnext = NULL;
2127 * redrive pageout of next page at head of write queue:
2128 * redrive next page without any bound tblk
2129 * (i.e., page w/o any COMMIT records), or
2130 * first page of new group commit which has been
2131 * queued after current page (subsequent pageout
2132 * is performed synchronously, except page without
2133 * any COMMITs) by lmGroupCommit() as indicated
/* defer the next page's I/O to the jfsIO thread (lbmRedrive call elided) */
2136 if (nextbp->l_flag & lbmWRITE) {
2138 * We can't do the I/O at interrupt time.
2139 * The jfsIO thread can do it
2147 * synchronous pageout:
2149 * buffer has not necessarily been removed from write queue
2150 * (e.g., synchronous write of partial-page with COMMIT):
2151 * leave buffer for i/o initiator to dispose
2153 if (bp->l_flag & lbmSYNC) {
2154 LCACHE_UNLOCK(flags); /* unlock+enable */
2156 /* wakeup I/O initiator */
2157 LCACHE_WAKEUP(&bp->l_ioevent);
2161 * Group Commit pageout:
2163 else if (bp->l_flag & lbmGC) {
2164 LCACHE_UNLOCK(flags);
2169 * asynchronous pageout:
2171 * buffer must have been removed from write queue:
2172 * insert buffer at head of freelist where it can be recycled
2175 assert(bp->l_flag & lbmRELEASE);
2176 assert(bp->l_flag & lbmFREE);
2179 LCACHE_UNLOCK(flags); /* unlock+enable */
/*
 * NAME:	jfsIOWait()
 *
 * FUNCTION:	kernel thread that drains the global log_redrive_list,
 *		issuing queued log-buffer I/O in process context (lbmIODone()
 *		cannot start I/O at interrupt time). Loops until
 *		jfs_stop_threads is set, then exits via complete_and_exit().
 *
 * NOTE(review): the original text contained HTML-entity mojibake ("¤t"
 * from "&current" -> "&curren;t"); the intended "&current->..." expressions
 * are restored below. No other code changed.
 */
2183 int jfsIOWait(void *arg)
/* daemonize: detach from controlling tty and rename the thread */
2190 current->tty = NULL;
2191 strcpy(current->comm, "jfsIO");
/* block all signals: this kernel daemon must not be killed by userspace */
2195 spin_lock_irq(&current->sigmask_lock);
2196 sigfillset(&current->blocked);
2197 recalc_sigpending(current);
2198 spin_unlock_irq(&current->sigmask_lock);
/* signal the creator that startup is complete */
2200 complete(&jfsIOwait);
2203 DECLARE_WAITQUEUE(wq, current);
/* pop redrive entries one at a time, dropping the lock to do the I/O */
2205 spin_lock_irq(&log_redrive_lock);
2206 while ((bp = log_redrive_list)) {
2207 log_redrive_list = bp->l_redrive_next;
2208 bp->l_redrive_next = NULL;
2209 spin_unlock_irq(&log_redrive_lock);
2211 spin_lock_irq(&log_redrive_lock);
/* list drained: sleep until lbmRedrive() wakes us again */
2213 add_wait_queue(&jfs_IO_thread_wait, &wq);
2214 set_current_state(TASK_INTERRUPTIBLE);
2215 spin_unlock_irq(&log_redrive_lock);
2217 current->state = TASK_RUNNING;
2218 remove_wait_queue(&jfs_IO_thread_wait, &wq);
2219 } while (!jfs_stop_threads);
2221 jfs_info("jfsIOWait being killed!");
2222 complete_and_exit(&jfsIOwait, 0);
2226 * NAME: lmLogFormat()/jfs_logform()
2228 * FUNCTION: format file system log
2232 * logAddress - start address of log space in FS block
2233 * logSize - length of log space in FS block;
2235 * RETURN: 0 - success
2238 * XXX: We're synchronously writing one page at a time. This needs to
2239 * be improved by writing multiple pages at once.
2241 int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2244 struct jfs_sb_info *sbi = JFS_SBI(log->sb);
2245 struct logsuper *logsuper;
2247 int lspn; /* log sequence page number */
2248 struct lrd *lrd_ptr;
2252 jfs_info("lmLogFormat: logAddress:%Ld logSize:%d",
2253 (long long)logAddress, logSize);
2255 /* allocate a log buffer */
2256 bp = lbmAllocate(log, 1);
/* total log pages = FS blocks / blocks-per-page */
2258 npages = logSize >> sbi->l2nbperpage;
2263 * page 0 - reserved;
2264 * page 1 - log superblock;
2265 * page 2 - log data page: A SYNC log record is written
2266 * into this page at logform time;
2267 * pages 3-N - log data page: set to empty log data pages;
2270 * init log superblock: log page 1
2272 logsuper = (struct logsuper *) bp->l_ldata;
2274 logsuper->magic = cpu_to_le32(LOGMAGIC);
2275 logsuper->version = cpu_to_le32(LOGVERSION);
2276 logsuper->state = cpu_to_le32(LOGREDONE);
2277 logsuper->flag = cpu_to_le32(sbi->mntflag); /* ? */
2278 logsuper->size = cpu_to_le32(npages);
2279 logsuper->bsize = cpu_to_le32(sbi->bsize);
2280 logsuper->l2bsize = cpu_to_le32(sbi->l2bsize);
2281 logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE);
/* synchronous sidestream write of the superblock page */
2283 bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
2284 bp->l_blkno = logAddress + sbi->nbperpage;
2286 if ((rc = lbmIOWait(bp, 0)))
2290 * init pages 2 to npages-1 as log data pages:
2292 * log page sequence number (lpsn) initialization:
2295 * +-----+-----+=====+=====+===.....===+=====+
2297 * <--- N page circular file ---->
2299 * the N (= npages-2) data pages of the log is maintained as
2300 * a circular file for the log records;
2301 * lpsn grows by 1 monotonically as each log page is written
2302 * to the circular file of the log;
2303 * and setLogpage() will not reset the page number even if
2304 * the eor is equal to LOGPHDRSIZE. In order for binary search
2305 * still work in find log end process, we have to simulate the
2306 * log wrap situation at the log format time.
2307 * The 1st log page written will have the highest lpsn. Then
2308 * the succeeding log pages will have ascending order of
2309 * the lspn starting from 0, ... (N-2)
2311 lp = (struct logpage *) bp->l_ldata;
2313 * initialize 1st log page to be written: lpsn = N - 1,
2314 * write a SYNCPT log record is written to this page
2316 lp->h.page = lp->t.page = cpu_to_le32(npages - 3);
2317 lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE);
/* single SYNCPT record with sync address 0 */
2319 lrd_ptr = (struct lrd *) &lp->data;
2320 lrd_ptr->logtid = 0;
2321 lrd_ptr->backchain = 0;
2322 lrd_ptr->type = cpu_to_le16(LOG_SYNCPT);
2323 lrd_ptr->length = 0;
2324 lrd_ptr->log.syncpt.sync = 0;
2326 bp->l_blkno += sbi->nbperpage;
2327 bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
2329 if ((rc = lbmIOWait(bp, 0)))
2333 * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2)
/* one synchronous page write per iteration (see XXX above re batching) */
2335 for (lspn = 0; lspn < npages - 3; lspn++) {
2336 lp->h.page = lp->t.page = cpu_to_le32(lspn);
2337 lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);
2339 bp->l_blkno += sbi->nbperpage;
2340 bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
2342 if ((rc = lbmIOWait(bp, 0)))
2351 /* release the buffer */
2357 #ifdef CONFIG_JFS_STATISTICS
/* /proc read handler: format the global lmStat counters into <buffer>. */
2358 int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
2359 int *eof, void *data)
2364 len += sprintf(buffer,
2365 "JFS Logmgr stats\n"
2366 "================\n"
2368 "writes submitted = %d\n"
2369 "writes completed = %d\n"
2370 "full pages submitted = %d\n"
2371 "partial pages submitted = %d\n",
2376 lmStat.partial_page);
/* standard proc_read windowing: return the slice the caller asked for */
2379 *start = buffer + begin;
2392 #endif /* CONFIG_JFS_STATISTICS */