port enough sendpoint support for DSM-G600 from D-Link 2.4.21-pre4
[linux-2.4.git] / fs / jbd / checkpoint.c
1 /*
2  * linux/fs/jbd/checkpoint.c
3  * 
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5  *
6  * Copyright 1999 Red Hat Software --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Checkpoint routines for the generic filesystem journaling code.  
13  * Part of the ext2fs journaling system.  
14  *
15  * Checkpointing is the process of ensuring that a section of the log is
16  * committed fully to disk, so that that portion of the log can be
17  * reused.
18  */
19
20 #include <linux/sched.h>
21 #include <linux/fs.h>
22 #include <linux/jbd.h>
23 #include <linux/errno.h>
24 #include <linux/slab.h>
25 #include <linux/locks.h>
26
27 extern spinlock_t journal_datalist_lock;
28
29 /*
30  * Unlink a buffer from a transaction. 
31  *
32  * Called with journal_datalist_lock held.
33  */
34
35 static inline void __buffer_unlink(struct journal_head *jh)
36 {
37         transaction_t *transaction;
38
39         transaction = jh->b_cp_transaction;
40         jh->b_cp_transaction = NULL;
41
42         jh->b_cpnext->b_cpprev = jh->b_cpprev;
43         jh->b_cpprev->b_cpnext = jh->b_cpnext;
44         if (transaction->t_checkpoint_list == jh)
45                 transaction->t_checkpoint_list = jh->b_cpnext;
46         if (transaction->t_checkpoint_list == jh)
47                 transaction->t_checkpoint_list = NULL;
48 }
49
50 /*
51  * Try to release a checkpointed buffer from its transaction.
52  * Returns 1 if we released it.
53  * Requires journal_datalist_lock
54  */
55 static int __try_to_free_cp_buf(struct journal_head *jh)
56 {
57         int ret = 0;
58         struct buffer_head *bh = jh2bh(jh);
59
60         if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
61                 JBUFFER_TRACE(jh, "remove from checkpoint list");
62                 __journal_remove_checkpoint(jh);
63                 __journal_remove_journal_head(bh);
64                 BUFFER_TRACE(bh, "release");
65                 /* BUF_LOCKED -> BUF_CLEAN (fwiw) */
66                 refile_buffer(bh);
67                 __brelse(bh);
68                 ret = 1;
69         }
70         return ret;
71 }
72
73 /*
74  * log_wait_for_space: wait until there is space in the journal.
75  *
76  * Called with the journal already locked, but it will be unlocked if we have
77  * to wait for a checkpoint to free up some space in the log.
78  */
79
80 void log_wait_for_space(journal_t *journal, int nblocks)
81 {
82         while (log_space_left(journal) < nblocks) {
83                 if (journal->j_flags & JFS_ABORT)
84                         return;
85                 unlock_journal(journal);
86                 down(&journal->j_checkpoint_sem);
87                 lock_journal(journal);
88                 
89                 /* Test again, another process may have checkpointed
90                  * while we were waiting for the checkpoint lock */
91                 if (log_space_left(journal) < nblocks) {
92                         log_do_checkpoint(journal, nblocks);
93                 }
94                 up(&journal->j_checkpoint_sem);
95         }
96 }
97
98 /*
99  * Clean up a transaction's checkpoint list.  
100  *
101  * We wait for any pending IO to complete and make sure any clean
102  * buffers are removed from the transaction. 
103  *
104  * Return 1 if we performed any actions which might have destroyed the
105  * checkpoint.  (journal_remove_checkpoint() deletes the transaction when
106  * the last checkpoint buffer is cleansed)
107  *
108  * Called with the journal locked.
109  * Called with journal_datalist_lock held.
110  */
111 static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
112 {
113         struct journal_head *jh, *next_jh, *last_jh;
114         struct buffer_head *bh;
115         int ret = 0;
116
117         assert_spin_locked(&journal_datalist_lock);
118         jh = transaction->t_checkpoint_list;
119         if (!jh)
120                 return 0;
121
122         last_jh = jh->b_cpprev;
123         next_jh = jh;
124         do {
125                 jh = next_jh;
126                 bh = jh2bh(jh);
127                 if (buffer_locked(bh)) {
128                         atomic_inc(&bh->b_count);
129                         spin_unlock(&journal_datalist_lock);
130                         unlock_journal(journal);
131                         wait_on_buffer(bh);
132                         /* the journal_head may have gone by now */
133                         BUFFER_TRACE(bh, "brelse");
134                         __brelse(bh);
135                         goto out_return_1;
136                 }
137                 
138                 if (jh->b_transaction != NULL) {
139                         transaction_t *transaction = jh->b_transaction;
140                         tid_t tid = transaction->t_tid;
141
142                         spin_unlock(&journal_datalist_lock);
143                         log_start_commit(journal, transaction);
144                         unlock_journal(journal);
145                         log_wait_commit(journal, tid);
146                         goto out_return_1;
147                 }
148
149                 /*
150                  * We used to test for (jh->b_list != BUF_CLEAN) here.
151                  * But unmap_underlying_metadata() can place buffer onto
152                  * BUF_CLEAN. Since refile_buffer() no longer takes buffers
153                  * off checkpoint lists, we cope with it here
154                  */
155                 /*
156                  * AKPM: I think the buffer_jdirty test is redundant - it
157                  * shouldn't have NULL b_transaction?
158                  */
159                 next_jh = jh->b_cpnext;
160                 if (!buffer_dirty(bh) && !buffer_jdirty(bh)) {
161                         BUFFER_TRACE(bh, "remove from checkpoint");
162                         __journal_remove_checkpoint(jh);
163                         __journal_remove_journal_head(bh);
164                         refile_buffer(bh);
165                         __brelse(bh);
166                         ret = 1;
167                 }
168                 
169                 jh = next_jh;
170         } while (jh != last_jh);
171
172         return ret;
173 out_return_1:
174         lock_journal(journal);
175         spin_lock(&journal_datalist_lock);
176         return 1;
177 }
178
179 #define NR_BATCH        64
180
181 static void __flush_batch(struct buffer_head **bhs, int *batch_count)
182 {
183         int i;
184
185         spin_unlock(&journal_datalist_lock);
186         ll_rw_block(WRITE, *batch_count, bhs);
187         run_task_queue(&tq_disk);
188         spin_lock(&journal_datalist_lock);
189         for (i = 0; i < *batch_count; i++) {
190                 struct buffer_head *bh = bhs[i];
191                 clear_bit(BH_JWrite, &bh->b_state);
192                 BUFFER_TRACE(bh, "brelse");
193                 __brelse(bh);
194         }
195         *batch_count = 0;
196 }
197
198 /*
199  * Try to flush one buffer from the checkpoint list to disk.
200  *
201  * Return 1 if something happened which requires us to abort the current
202  * scan of the checkpoint list.  
203  *
204  * Called with journal_datalist_lock held.
205  */
206 static int __flush_buffer(journal_t *journal, struct journal_head *jh,
207                         struct buffer_head **bhs, int *batch_count,
208                         int *drop_count)
209 {
210         struct buffer_head *bh = jh2bh(jh);
211         int ret = 0;
212
213         if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
214                 J_ASSERT_JH(jh, jh->b_transaction == NULL);
215                 
216                 /*
217                  * Important: we are about to write the buffer, and
218                  * possibly block, while still holding the journal lock.
219                  * We cannot afford to let the transaction logic start
220                  * messing around with this buffer before we write it to
221                  * disk, as that would break recoverability.  
222                  */
223                 BUFFER_TRACE(bh, "queue");
224                 atomic_inc(&bh->b_count);
225                 J_ASSERT_BH(bh, !test_bit(BH_JWrite, &bh->b_state));
226                 set_bit(BH_JWrite, &bh->b_state);
227                 bhs[*batch_count] = bh;
228                 (*batch_count)++;
229                 if (*batch_count == NR_BATCH) {
230                         __flush_batch(bhs, batch_count);
231                         ret = 1;
232                 }
233         } else {
234                 int last_buffer = 0;
235                 if (jh->b_cpnext == jh) {
236                         /* We may be about to drop the transaction.  Tell the
237                          * caller that the lists have changed.
238                          */
239                         last_buffer = 1;
240                 }
241                 if (__try_to_free_cp_buf(jh)) {
242                         (*drop_count)++;
243                         ret = last_buffer;
244                 }
245         }
246         return ret;
247 }
248
249         
250 /*
251  * Perform an actual checkpoint.  We don't write out only enough to
252  * satisfy the current blocked requests: rather we submit a reasonably
253  * sized chunk of the outstanding data to disk at once for
254  * efficiency.  log_wait_for_space() will retry if we didn't free enough.
255  * 
256  * However, we _do_ take into account the amount requested so that once
257  * the IO has been queued, we can return as soon as enough of it has
258  * completed to disk.  
259  *
260  * The journal should be locked before calling this function.
261  */
262
263 /* @@@ `nblocks' is unused.  Should it be used? */
264 int log_do_checkpoint (journal_t *journal, int nblocks)
265 {
266         transaction_t *transaction, *last_transaction, *next_transaction;
267         int result;
268         int target;
269         int batch_count = 0;
270         struct buffer_head *bhs[NR_BATCH];
271
272         jbd_debug(1, "Start checkpoint\n");
273
274         /* 
275          * First thing: if there are any transactions in the log which
276          * don't need checkpointing, just eliminate them from the
277          * journal straight away.  
278          */
279         result = cleanup_journal_tail(journal);
280         jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
281         if (result <= 0)
282                 return result;
283
284         /*
285          * OK, we need to start writing disk blocks.  Try to free up a
286          * quarter of the log in a single checkpoint if we can.
287          */
288         /*
289          * AKPM: check this code.  I had a feeling a while back that it
290          * degenerates into a busy loop at unmount time.
291          */
292         target = (journal->j_last - journal->j_first) / 4;
293
294         spin_lock(&journal_datalist_lock);
295 repeat:
296         transaction = journal->j_checkpoint_transactions;
297         if (transaction == NULL)
298                 goto done;
299         last_transaction = transaction->t_cpprev;
300         next_transaction = transaction;
301
302         do {
303                 struct journal_head *jh, *last_jh, *next_jh;
304                 int drop_count = 0;
305                 int cleanup_ret, retry = 0;
306
307                 transaction = next_transaction;
308                 next_transaction = transaction->t_cpnext;
309                 jh = transaction->t_checkpoint_list;
310                 last_jh = jh->b_cpprev;
311                 next_jh = jh;
312                 do {
313                         jh = next_jh;
314                         next_jh = jh->b_cpnext;
315                         retry = __flush_buffer(journal, jh, bhs, &batch_count,
316                                                 &drop_count);
317                 } while (jh != last_jh && !retry);
318                 if (batch_count) {
319                         __flush_batch(bhs, &batch_count);
320                         goto repeat;
321                 }
322                 if (retry)
323                         goto repeat;
324                 /*
325                  * We have walked the whole transaction list without
326                  * finding anything to write to disk.  We had better be
327                  * able to make some progress or we are in trouble. 
328                  */
329                 cleanup_ret = __cleanup_transaction(journal, transaction);
330                 J_ASSERT(drop_count != 0 || cleanup_ret != 0);
331                 goto repeat;    /* __cleanup may have dropped lock */
332         } while (transaction != last_transaction);
333
334 done:
335         spin_unlock(&journal_datalist_lock);
336         result = cleanup_journal_tail(journal);
337         if (result < 0)
338                 return result;
339         
340         return 0;
341 }
342
343 /*
344  * Check the list of checkpoint transactions for the journal to see if
345  * we have already got rid of any since the last update of the log tail
346  * in the journal superblock.  If so, we can instantly roll the
347  * superblock forward to remove those transactions from the log.
348  * 
349  * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
350  * 
351  * Called with the journal lock held.
352  *
353  * This is the only part of the journaling code which really needs to be
354  * aware of transaction aborts.  Checkpointing involves writing to the
355  * main filesystem area rather than to the journal, so it can proceed
356  * even in abort state, but we must not update the journal superblock if
357  * we have an abort error outstanding.
358  */
359
360 int cleanup_journal_tail(journal_t *journal)
361 {
362         transaction_t * transaction;
363         tid_t           first_tid;
364         unsigned long   blocknr, freed;
365
366         /* OK, work out the oldest transaction remaining in the log, and
367          * the log block it starts at. 
368          * 
369          * If the log is now empty, we need to work out which is the
370          * next transaction ID we will write, and where it will
371          * start. */
372
373         /* j_checkpoint_transactions needs locking */
374         spin_lock(&journal_datalist_lock);
375         transaction = journal->j_checkpoint_transactions;
376         if (transaction) {
377                 first_tid = transaction->t_tid;
378                 blocknr = transaction->t_log_start;
379         } else if ((transaction = journal->j_committing_transaction) != NULL) {
380                 first_tid = transaction->t_tid;
381                 blocknr = transaction->t_log_start;
382         } else if ((transaction = journal->j_running_transaction) != NULL) {
383                 first_tid = transaction->t_tid;
384                 blocknr = journal->j_head;
385         } else {
386                 first_tid = journal->j_transaction_sequence;
387                 blocknr = journal->j_head;
388         }
389         spin_unlock(&journal_datalist_lock);
390         J_ASSERT (blocknr != 0);
391
392         /* If the oldest pinned transaction is at the tail of the log
393            already then there's not much we can do right now. */
394         if (journal->j_tail_sequence == first_tid)
395                 return 1;
396
397         /* OK, update the superblock to recover the freed space.
398          * Physical blocks come first: have we wrapped beyond the end of
399          * the log?  */
400         freed = blocknr - journal->j_tail;
401         if (blocknr < journal->j_tail)
402                 freed = freed + journal->j_last - journal->j_first;
403
404         jbd_debug(1,
405                   "Cleaning journal tail from %d to %d (offset %lu), "
406                   "freeing %lu\n",
407                   journal->j_tail_sequence, first_tid, blocknr, freed);
408
409         journal->j_free += freed;
410         journal->j_tail_sequence = first_tid;
411         journal->j_tail = blocknr;
412         if (!(journal->j_flags & JFS_ABORT))
413                 journal_update_superblock(journal, 1);
414         return 0;
415 }
416
417
418 /* Checkpoint list management */
419
420 /*
421  * journal_clean_checkpoint_list
422  *
423  * Find all the written-back checkpoint buffers in the journal and release them.
424  *
425  * Called with the journal locked.
426  * Called with journal_datalist_lock held.
427  * Returns number of bufers reaped (for debug)
428  */
429
430 int __journal_clean_checkpoint_list(journal_t *journal)
431 {
432         transaction_t *transaction, *last_transaction, *next_transaction;
433         int ret = 0;
434
435         transaction = journal->j_checkpoint_transactions;
436         if (transaction == 0)
437                 goto out;
438
439         last_transaction = transaction->t_cpprev;
440         next_transaction = transaction;
441         do {
442                 struct journal_head *jh;
443
444                 transaction = next_transaction;
445                 next_transaction = transaction->t_cpnext;
446                 jh = transaction->t_checkpoint_list;
447                 if (jh) {
448                         struct journal_head *last_jh = jh->b_cpprev;
449                         struct journal_head *next_jh = jh;
450                         do {
451                                 jh = next_jh;
452                                 next_jh = jh->b_cpnext;
453                                 ret += __try_to_free_cp_buf(jh);
454                         } while (jh != last_jh);
455                 }
456         } while (transaction != last_transaction);
457 out:
458         return ret;
459 }
460
461 /* 
462  * journal_remove_checkpoint: called after a buffer has been committed
463  * to disk (either by being write-back flushed to disk, or being
464  * committed to the log).
465  *
466  * We cannot safely clean a transaction out of the log until all of the
467  * buffer updates committed in that transaction have safely been stored
468  * elsewhere on disk.  To achieve this, all of the buffers in a
469  * transaction need to be maintained on the transaction's checkpoint
470  * list until they have been rewritten, at which point this function is
471  * called to remove the buffer from the existing transaction's
472  * checkpoint list.  
473  *
474  * This function is called with the journal locked.
475  * This function is called with journal_datalist_lock held.
476  */
477
478 void __journal_remove_checkpoint(struct journal_head *jh)
479 {
480         transaction_t *transaction;
481         journal_t *journal;
482
483         JBUFFER_TRACE(jh, "entry");
484         
485         if ((transaction = jh->b_cp_transaction) == NULL) {
486                 JBUFFER_TRACE(jh, "not on transaction");
487                 goto out;
488         }
489
490         journal = transaction->t_journal;
491
492         __buffer_unlink(jh);
493
494         if (transaction->t_checkpoint_list != NULL)
495                 goto out;
496         JBUFFER_TRACE(jh, "transaction has no more buffers");
497
498         /* There is one special case to worry about: if we have just
499            pulled the buffer off a committing transaction's forget list,
500            then even if the checkpoint list is empty, the transaction
501            obviously cannot be dropped! */
502
503         if (transaction == journal->j_committing_transaction) {
504                 JBUFFER_TRACE(jh, "belongs to committing transaction");
505                 goto out;
506         }
507
508         /* OK, that was the last buffer for the transaction: we can now
509            safely remove this transaction from the log */
510
511         __journal_drop_transaction(journal, transaction);
512
513         /* Just in case anybody was waiting for more transactions to be
514            checkpointed... */
515         wake_up(&journal->j_wait_logspace);
516 out:
517         JBUFFER_TRACE(jh, "exit");
518 }
519
520 void journal_remove_checkpoint(struct journal_head *jh)
521 {
522         spin_lock(&journal_datalist_lock);
523         __journal_remove_checkpoint(jh);
524         spin_unlock(&journal_datalist_lock);
525 }
526
527 /*
528  * journal_insert_checkpoint: put a committed buffer onto a checkpoint
529  * list so that we know when it is safe to clean the transaction out of
530  * the log.
531  *
532  * Called with the journal locked.
533  * Called with journal_datalist_lock held.
534  */
535 void __journal_insert_checkpoint(struct journal_head *jh, 
536                                transaction_t *transaction)
537 {
538         JBUFFER_TRACE(jh, "entry");
539         J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jdirty(jh2bh(jh)));
540         J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
541
542         assert_spin_locked(&journal_datalist_lock);
543         jh->b_cp_transaction = transaction;
544
545         if (!transaction->t_checkpoint_list) {
546                 jh->b_cpnext = jh->b_cpprev = jh;
547         } else {
548                 jh->b_cpnext = transaction->t_checkpoint_list;
549                 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
550                 jh->b_cpprev->b_cpnext = jh;
551                 jh->b_cpnext->b_cpprev = jh;
552         }
553         transaction->t_checkpoint_list = jh;
554 }
555
556 void journal_insert_checkpoint(struct journal_head *jh, 
557                                transaction_t *transaction)
558 {
559         spin_lock(&journal_datalist_lock);
560         __journal_insert_checkpoint(jh, transaction);
561         spin_unlock(&journal_datalist_lock);
562 }
563
564 /*
565  * We've finished with this transaction structure: adios...
566  * 
567  * The transaction must have no links except for the checkpoint by this
568  * point.
569  *
570  * Called with the journal locked.
571  * Called with journal_datalist_lock held.
572  */
573
574 void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
575 {
576         assert_spin_locked(&journal_datalist_lock);
577         if (transaction->t_cpnext) {
578                 transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
579                 transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
580                 if (journal->j_checkpoint_transactions == transaction)
581                         journal->j_checkpoint_transactions =
582                                 transaction->t_cpnext;
583                 if (journal->j_checkpoint_transactions == transaction)
584                         journal->j_checkpoint_transactions = NULL;
585         }
586
587         J_ASSERT (transaction->t_ilist == NULL);
588         J_ASSERT (transaction->t_buffers == NULL);
589         J_ASSERT (transaction->t_sync_datalist == NULL);
590         J_ASSERT (transaction->t_async_datalist == NULL);
591         J_ASSERT (transaction->t_forget == NULL);
592         J_ASSERT (transaction->t_iobuf_list == NULL);
593         J_ASSERT (transaction->t_shadow_list == NULL);
594         J_ASSERT (transaction->t_log_list == NULL);
595         J_ASSERT (transaction->t_checkpoint_list == NULL);
596         J_ASSERT (transaction->t_updates == 0);
597         J_ASSERT (list_empty(&transaction->t_jcb));
598
599         J_ASSERT (transaction->t_journal->j_committing_transaction !=
600                                         transaction);
601         
602         jbd_debug (1, "Dropping transaction %d, all done\n", 
603                    transaction->t_tid);
604         kfree (transaction);
605 }
606