/*
 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *               2000 - 2002 Heinz Mauelshagen, Sistina Software
 *
 * LVM snapshot driver is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * LVM snapshot driver is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU CC; see the file COPYING.  If not, write to
 * the Free Software Foundation, 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/*
 * Changelog
 *
 *    05/07/2000 - implemented persistent snapshot support
 *    23/11/2000 - used cpu_to_le64 rather than my own macro
 *    25/01/2001 - Put LockPage back in
 *    01/02/2001 - A dropped snapshot is now set as inactive
 *    14/02/2001 - tidied debug statements
 *    19/02/2001 - changed rawio calls to pass in preallocated buffer_heads
 *    26/02/2001 - introduced __brw_kiovec to remove a lot of conditional
 *                 compiles
 *    07/03/2001 - fixed COW exception table not persistent on 2.2 (HM)
 *    12/03/2001 - lvm_pv_get_number changes:
 *                 o renamed it to _pv_get_number
 *                 o pv number is returned in new uint * arg
 *                 o -1 returned on error
 *                 lvm_snapshot_fill_COW_table has a return value too.
 *    15/10/2001 - fix snapshot alignment problem [CM]
 *               - fix snapshot full oops (always check lv_block_exception) [CM]
 *    26/06/2002 - support for new list_move macro [patch@luckynet.dynu.com]
 */

#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/smp_lock.h>
#include <linux/types.h>
#include <linux/iobuf.h>
#include <linux/lvm.h>
#include <linux/devfs_fs_kernel.h>

#include "lvm-internal.h"

static char *lvm_snap_version __attribute__ ((unused)) =
	"LVM "LVM_RELEASE_NAME" snapshot code ("LVM_RELEASE_DATE")\n";

extern const char *const lvm_name;
extern int lvm_blocksizes[];

void lvm_snapshot_release(lv_t *);

static int _write_COW_table_block(vg_t *vg, lv_t *lv, int idx,
				  const char **reason);
static void _disable_snapshot(vg_t *vg, lv_t *lv);

/* trivial wrapper so the callers need no conditional compiles around rawio */
static inline int __brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
			       kdev_t dev, unsigned long b[], int size,
			       lv_t *lv)
{
	return brw_kiovec(rw, nr, iovec, dev, b, size);
}

/* look up the PV number for rdev; returns 0 on success, -1 on error */
static int _pv_get_number(vg_t * vg, kdev_t rdev, uint *pvn)
{
	uint p;

	for (p = 0; p < vg->pv_max; p++) {
		if (vg->pv[p] == NULL)
			continue;
		if (vg->pv[p]->pv_dev == rdev)
			break;
	}

	if (p >= vg->pv_max) {
		/* bad news, the snapshot COW table is probably corrupt */
		printk(KERN_ERR
		       "%s -- _pv_get_number failed for rdev = %u\n",
		       lvm_name, rdev);
		return -1;
	}

	*pvn = vg->pv[p]->pv_number;
	return 0;
}

#define hashfn(dev,block,mask,chunk_size) \
	((HASHDEV(dev)^((block)/(chunk_size))) & (mask))
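
/*
 * Worked example of the hash above (illustrative numbers): with a
 * 4-bucket table (mask = 3) and chunk_size = 8, a fault on block 21
 * hashes to (HASHDEV(dev) ^ (21/8)) & 3 = (HASHDEV(dev) ^ 2) & 3, so
 * every block of the chunk containing block 21 probes the same bucket.
 */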

static inline lv_block_exception_t *
lvm_find_exception_table(kdev_t org_dev, unsigned long org_start, lv_t * lv)
{
	struct list_head * hash_table = lv->lv_snapshot_hash_table, * next;
	unsigned long mask = lv->lv_snapshot_hash_mask;
	int chunk_size = lv->lv_chunk_size;
	lv_block_exception_t * ret;
	int i = 0;

	hash_table = &hash_table[hashfn(org_dev, org_start, mask, chunk_size)];
	ret = NULL;
	for (next = hash_table->next; next != hash_table; next = next->next)
	{
		lv_block_exception_t * exception;

		exception = list_entry(next, lv_block_exception_t, hash);
		if (exception->rsector_org == org_start &&
		    exception->rdev_org == org_dev)
		{
			if (i)
			{
				/* move the hit to the front of its bucket:
				   cheap MRU caching.  fun, isn't it? :) */
#ifdef list_move
				list_move(next, hash_table);
#else
				list_del(next);
				list_add(next, hash_table);
#endif
			}
			ret = exception;
			break;
		}
		i++;
	}
	return ret;
}

inline void lvm_hash_link(lv_block_exception_t * exception,
			  kdev_t org_dev, unsigned long org_start,
			  lv_t * lv)
{
	struct list_head * hash_table = lv->lv_snapshot_hash_table;
	unsigned long mask = lv->lv_snapshot_hash_mask;
	int chunk_size = lv->lv_chunk_size;

	if (!hash_table)
		BUG();

	hash_table = &hash_table[hashfn(org_dev, org_start, mask, chunk_size)];
	list_add(&exception->hash, hash_table);
}

/*
 * Determine if we already have a snapshot chunk for this block.
 * Return: 1 if the chunk already exists
 *         0 if we need to COW this block and allocate a new chunk
 *        -1 if the snapshot was disabled because it ran out of space
 *
 * We need to be holding at least a read lock on lv->lv_lock.
 */
int lvm_snapshot_remap_block(kdev_t * org_dev, unsigned long * org_sector,
			     unsigned long pe_start, lv_t * lv)
{
	int ret;
	unsigned long pe_off, pe_adjustment, __org_start;
	kdev_t __org_dev;
	int chunk_size = lv->lv_chunk_size;
	lv_block_exception_t * exception;

	if (!lv->lv_block_exception)
		return -1;

	pe_off = pe_start % chunk_size;
	pe_adjustment = (*org_sector-pe_off) % chunk_size;
	__org_start = *org_sector - pe_adjustment;
	__org_dev = *org_dev;
	ret = 0;
	exception = lvm_find_exception_table(__org_dev, __org_start, lv);
	if (exception)
	{
		*org_dev = exception->rdev_new;
		*org_sector = exception->rsector_new + pe_adjustment;
		ret = 1;
	}
	return ret;
}
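
/*
 * Worked example of the chunk arithmetic above (illustrative numbers):
 * with chunk_size = 8 sectors and pe_start = 100, pe_off = 4, so chunks
 * start at sectors 4, 12, 20, ...  A fault at *org_sector = 17 gives
 * pe_adjustment = (17-4) % 8 = 5 and __org_start = 12, the first sector
 * of the faulting chunk; on a hit the caller is redirected to
 * rsector_new + 5 inside the already-copied snapshot chunk.
 */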

void lvm_drop_snapshot(vg_t *vg, lv_t *lv_snap, const char *reason)
{
	kdev_t last_dev;
	int i;

	/* no exception storage space available for this snapshot
	   or error on this snapshot --> release it */
	invalidate_buffers(lv_snap->lv_dev);

	/* wipe the snapshot since it's inconsistent now */
	_disable_snapshot(vg, lv_snap);

	for (i = last_dev = 0; i < lv_snap->lv_remap_ptr; i++) {
		if ( lv_snap->lv_block_exception[i].rdev_new != last_dev) {
			last_dev = lv_snap->lv_block_exception[i].rdev_new;
			invalidate_buffers(last_dev);
		}
	}

	lvm_snapshot_release(lv_snap);
	lv_snap->lv_status &= ~LV_ACTIVE;

	printk(KERN_INFO
	       "%s -- giving up on snapshot %s of %s: %s\n",
	       lvm_name, lv_snap->lv_name, lv_snap->lv_snapshot_org->lv_name,
	       reason);
}

static inline int lvm_snapshot_prepare_blocks(unsigned long *blocks,
					      unsigned long start,
					      int nr_sectors,
					      int blocksize)
{
	int i, sectors_per_block, nr_blocks;

	sectors_per_block = blocksize / SECTOR_SIZE;

	/* start must be block aligned */
	if (start & (sectors_per_block - 1))
		return 0;

	nr_blocks = nr_sectors / sectors_per_block;
	start /= sectors_per_block;

	for (i = 0; i < nr_blocks; i++)
		blocks[i] = start++;

	return 1;
}
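
/*
 * Example of the conversion above (illustrative): with blocksize = 1024
 * (2 sectors per block), start = 64 and nr_sectors = 8 fill blocks[]
 * with 32, 33, 34, 35; start = 65 fails the alignment test and the
 * function returns 0.
 */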

inline int lvm_get_blksize(kdev_t dev)
{
	int correct_size = BLOCK_SIZE, i, major;

	major = MAJOR(dev);
	if (blksize_size[major])
	{
		i = blksize_size[major][MINOR(dev)];
		if (i)
			correct_size = i;
	}

	return correct_size;
}

#ifdef DEBUG_SNAPSHOT
static inline void invalidate_snap_cache(unsigned long start, unsigned long nr,
					 kdev_t dev)
{
	struct buffer_head * bh;
	int sectors_per_block, i, blksize, minor;

	minor = MINOR(dev);
	blksize = lvm_blocksizes[minor];
	sectors_per_block = blksize >> 9;
	nr /= sectors_per_block;
	start /= sectors_per_block;

	for (i = 0; i < nr; i++)
	{
		bh = get_hash_table(dev, start++, blksize);
		if (bh)
			bforget(bh);
	}
}
#endif

int lvm_snapshot_fill_COW_page(vg_t * vg, lv_t * lv_snap)
{
	int id = 0, is = lv_snap->lv_remap_ptr;
	ulong blksize_snap;
	lv_COW_table_disk_t * lv_COW_table = (lv_COW_table_disk_t *)
		page_address(lv_snap->lv_COW_table_iobuf->maplist[0]);

	if (is == 0)
		return 0;

	/* rewind to the start of the on-disk table block holding
	   the last used exception entry */
	is--;
	blksize_snap =
		lvm_get_blksize(lv_snap->lv_block_exception[is].rdev_new);
	is -= is % (blksize_snap / sizeof(lv_COW_table_disk_t));

	memset(lv_COW_table, 0, blksize_snap);
	for ( ; is < lv_snap->lv_remap_ptr; is++, id++) {
		/* store new COW_table entry */
		lv_block_exception_t *be = lv_snap->lv_block_exception + is;
		uint pvn;

		if (_pv_get_number(vg, be->rdev_org, &pvn))
			goto bad;
		lv_COW_table[id].pv_org_number = cpu_to_le64(pvn);
		lv_COW_table[id].pv_org_rsector = cpu_to_le64(be->rsector_org);

		if (_pv_get_number(vg, be->rdev_new, &pvn))
			goto bad;
		lv_COW_table[id].pv_snap_number = cpu_to_le64(pvn);
		lv_COW_table[id].pv_snap_rsector = cpu_to_le64(be->rsector_new);
	}

	return 0;

 bad:
	printk(KERN_ERR "%s -- lvm_snapshot_fill_COW_page failed", lvm_name);
	return -1;
}
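
/*
 * Example of the rewind above (illustrative): with 32 COW entries per
 * snapshot block and lv_remap_ptr = 40, is starts at 39 and is rewound
 * to 32, so entries 32..39 of the partially filled second table block
 * are regenerated into the COW page.
 */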

/*
 * writes a COW exception table sector to disk (HM)
 *
 * We need to hold a write lock on lv_snap->lv_lock.
 */
int lvm_write_COW_table_block(vg_t * vg, lv_t *lv_snap)
{
	int r;
	const char *err;

	if((r = _write_COW_table_block(vg, lv_snap,
				       lv_snap->lv_remap_ptr - 1, &err)))
		lvm_drop_snapshot(vg, lv_snap, err);
	return r;
}

/*
 * copy on write handler for one snapshot logical volume
 *
 * read the original blocks and store them on the snapshot volume.
 * if no exception storage space is free any longer --> release snapshot.
 *
 * this routine gets called for each _first_ write to a physical chunk.
 *
 * We need to hold a write lock on lv_snap->lv_lock.  It is assumed that
 * lv->lv_block_exception is non-NULL (checked by lvm_snapshot_remap_block())
 * when this function is called.
 */
int lvm_snapshot_COW(kdev_t org_phys_dev,
		     unsigned long org_phys_sector,
		     unsigned long org_pe_start,
		     unsigned long org_virt_sector,
		     vg_t *vg, lv_t* lv_snap)
{
	const char * reason;
	unsigned long org_start, snap_start, snap_phys_dev, virt_start, pe_off;
	unsigned long phys_start;
	int idx = lv_snap->lv_remap_ptr, chunk_size = lv_snap->lv_chunk_size;
	struct kiobuf * iobuf = lv_snap->lv_iobuf;
	unsigned long *blocks = iobuf->blocks;
	int blksize_snap, blksize_org, min_blksize, max_blksize;
	int max_sectors, nr_sectors;

	/* check if we are out of snapshot space */
	if (idx >= lv_snap->lv_remap_end)
		goto fail_out_of_space;

	/* calculate physical boundaries of source chunk */
	pe_off = org_pe_start % chunk_size;
	org_start = org_phys_sector - ((org_phys_sector-pe_off) % chunk_size);
	virt_start = org_virt_sector - (org_phys_sector - org_start);

	/* calculate physical boundaries of destination chunk */
	snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new;
	snap_start = lv_snap->lv_block_exception[idx].rsector_new;

#ifdef DEBUG_SNAPSHOT
	printk(KERN_INFO
	       "%s -- COW: "
	       "org %s faulting %lu start %lu, snap %s start %lu, "
	       "size %d, pe_start %lu pe_off %lu, virt_sec %lu\n",
	       lvm_name,
	       kdevname(org_phys_dev), org_phys_sector, org_start,
	       kdevname(snap_phys_dev), snap_start,
	       chunk_size,
	       org_pe_start, pe_off,
	       org_virt_sector);
#endif

	blksize_org = lvm_sectsize(org_phys_dev);
	blksize_snap = lvm_sectsize(snap_phys_dev);
	max_blksize = max(blksize_org, blksize_snap);
	min_blksize = min(blksize_org, blksize_snap);
	max_sectors = KIO_MAX_SECTORS * (min_blksize>>9);

	/* a chunk must be an exact multiple of the larger block size */
	if (chunk_size % (max_blksize>>9))
		goto fail_blksize;

	/* Don't change org_start, we need it to fill in the exception table */
	phys_start = org_start;

	while (chunk_size)
	{
		nr_sectors = min(chunk_size, max_sectors);
		chunk_size -= nr_sectors;

		iobuf->length = nr_sectors << 9;

		if (!lvm_snapshot_prepare_blocks(blocks, phys_start,
						 nr_sectors, blksize_org))
			goto fail_prepare;

		if (__brw_kiovec(READ, 1, &iobuf, org_phys_dev, blocks,
				 blksize_org, lv_snap) != (nr_sectors<<9))
			goto fail_raw_read;

		if (!lvm_snapshot_prepare_blocks(blocks, snap_start,
						 nr_sectors, blksize_snap))
			goto fail_prepare;

		if (__brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, blocks,
				 blksize_snap, lv_snap) != (nr_sectors<<9))
			goto fail_raw_write;

		phys_start += nr_sectors;
		snap_start += nr_sectors;
	}

#ifdef DEBUG_SNAPSHOT
	/* invalidate the logical snapshot buffer cache */
	invalidate_snap_cache(virt_start, lv_snap->lv_chunk_size,
			      lv_snap->lv_dev);
#endif

	/* the original chunk is now stored on the snapshot volume
	   so update the exception table */
	lv_snap->lv_block_exception[idx].rdev_org = org_phys_dev;
	lv_snap->lv_block_exception[idx].rsector_org = org_start;

	lvm_hash_link(lv_snap->lv_block_exception + idx,
		      org_phys_dev, org_start, lv_snap);
	lv_snap->lv_remap_ptr = idx + 1;
	if (lv_snap->lv_snapshot_use_rate > 0) {
		if (lv_snap->lv_remap_ptr * 100 / lv_snap->lv_remap_end >= lv_snap->lv_snapshot_use_rate)
			wake_up_interruptible(&lv_snap->lv_snapshot_wait);
	}
	return 0;

	/* slow path */
 out:
	lvm_drop_snapshot(vg, lv_snap, reason);
	return 1;

 fail_out_of_space:
	reason = "out of space";
	goto out;
 fail_raw_read:
	reason = "read error";
	goto out;
 fail_raw_write:
	reason = "write error";
	goto out;
 fail_blksize:
	reason = "blocksize error";
	goto out;
 fail_prepare:
	reason = "couldn't prepare kiovec blocks "
		"(start probably isn't block aligned)";
	goto out;
}
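
/*
 * Illustrative walk through the copy loop above: if max_sectors covers a
 * whole chunk (say a 256-sector chunk with max_sectors >= 256), one
 * read/write pass suffices; with max_sectors = 128 the loop runs twice,
 * advancing phys_start and snap_start by 128 sectors per pass.  The
 * numbers are examples only; max_sectors really depends on
 * KIO_MAX_SECTORS and the smaller of the two block sizes.
 */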

int lvm_snapshot_alloc_iobuf_pages(struct kiobuf * iobuf, int sectors)
{
	int bytes, nr_pages, err, i;

	bytes = sectors * SECTOR_SIZE;
	nr_pages = (bytes + ~PAGE_MASK) >> PAGE_SHIFT;
	err = expand_kiobuf(iobuf, nr_pages);
	if (err) goto out;

	err = -ENOMEM;
	iobuf->locked = 1;
	iobuf->nr_pages = 0;
	for (i = 0; i < nr_pages; i++)
	{
		struct page * page;

		page = alloc_page(GFP_KERNEL);
		if (!page) goto out;

		iobuf->maplist[i] = page;
		LockPage(page);
		iobuf->nr_pages++;
	}
	iobuf->offset = 0;

	err = 0;

 out:
	return err;
}
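
/*
 * Example of the page rounding above (illustrative, assuming 4K pages):
 * sectors = 24 gives bytes = 12288 and nr_pages = (12288 + 4095) >> 12
 * = 3; ~PAGE_MASK == PAGE_SIZE-1 supplies the round-up term.
 */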

/* allow about 2% of physical memory for hash table buckets */
static int calc_max_buckets(void)
{
	unsigned long mem;

	mem = num_physpages << PAGE_SHIFT;
	mem /= 100;
	mem *= 2;
	mem /= sizeof(struct list_head);

	return mem;
}

int lvm_snapshot_alloc_hash_table(lv_t * lv)
{
	int err;
	unsigned long buckets, max_buckets, size;
	struct list_head * hash;

	buckets = lv->lv_remap_end;
	max_buckets = calc_max_buckets();
	buckets = min(buckets, max_buckets);

	/* round down to a power of two by clearing the lowest set bit */
	while (buckets & (buckets-1))
		buckets &= (buckets-1);

	size = buckets * sizeof(struct list_head);

	err = -ENOMEM;
	hash = vmalloc(size);
	lv->lv_snapshot_hash_table = hash;
	if (!hash)
		goto out;

	lv->lv_snapshot_hash_table_size = size;
	lv->lv_snapshot_hash_mask = buckets-1;
	while (buckets--)
		INIT_LIST_HEAD(hash+buckets);
	err = 0;

 out:
	return err;
}
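
/*
 * Example of the sizing above (illustrative): lv_remap_end = 5000
 * buckets rounds down to 4096 by repeatedly clearing the lowest set
 * bit, so the table gets 4096 buckets and lv_snapshot_hash_mask = 4095,
 * letting hashfn() use a cheap bitwise AND instead of a modulo.
 */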

int lvm_snapshot_alloc(lv_t * lv_snap)
{
	int ret, max_sectors;

	/* allocate kiovec to do chunk io */
	ret = alloc_kiovec(1, &lv_snap->lv_iobuf);
	if (ret) goto out;

	max_sectors = KIO_MAX_SECTORS << (PAGE_SHIFT-9);

	ret = lvm_snapshot_alloc_iobuf_pages(lv_snap->lv_iobuf, max_sectors);
	if (ret) goto out_free_kiovec;

	/* allocate kiovec to do exception table io */
	ret = alloc_kiovec(1, &lv_snap->lv_COW_table_iobuf);
	if (ret) goto out_free_kiovec;

	ret = lvm_snapshot_alloc_iobuf_pages(lv_snap->lv_COW_table_iobuf,
					     PAGE_SIZE/SECTOR_SIZE);
	if (ret) goto out_free_both_kiovecs;

	ret = lvm_snapshot_alloc_hash_table(lv_snap);
	if (ret) goto out_free_both_kiovecs;

 out:
	return ret;

 out_free_both_kiovecs:
	unmap_kiobuf(lv_snap->lv_COW_table_iobuf);
	free_kiovec(1, &lv_snap->lv_COW_table_iobuf);
	lv_snap->lv_COW_table_iobuf = NULL;

 out_free_kiovec:
	unmap_kiobuf(lv_snap->lv_iobuf);
	free_kiovec(1, &lv_snap->lv_iobuf);
	lv_snap->lv_iobuf = NULL;
	vfree(lv_snap->lv_snapshot_hash_table);
	lv_snap->lv_snapshot_hash_table = NULL;
	goto out;
}

void lvm_snapshot_release(lv_t * lv)
{
	if (lv->lv_block_exception)
	{
		vfree(lv->lv_block_exception);
		lv->lv_block_exception = NULL;
	}
	if (lv->lv_snapshot_hash_table)
	{
		vfree(lv->lv_snapshot_hash_table);
		lv->lv_snapshot_hash_table = NULL;
		lv->lv_snapshot_hash_table_size = 0;
	}
	if (lv->lv_iobuf)
	{
		kiobuf_wait_for_io(lv->lv_iobuf);
		unmap_kiobuf(lv->lv_iobuf);
		free_kiovec(1, &lv->lv_iobuf);
		lv->lv_iobuf = NULL;
	}
	if (lv->lv_COW_table_iobuf)
	{
		kiobuf_wait_for_io(lv->lv_COW_table_iobuf);
		unmap_kiobuf(lv->lv_COW_table_iobuf);
		free_kiovec(1, &lv->lv_COW_table_iobuf);
		lv->lv_COW_table_iobuf = NULL;
	}
}

static int _write_COW_table_block(vg_t *vg, lv_t *lv_snap,
				  int idx, const char **reason) {
	int blksize_snap;
	int end_of_table;
	int idx_COW_table;
	uint pvn;
	ulong snap_pe_start, COW_table_sector_offset,
	      COW_entries_per_pe, COW_chunks_per_pe, COW_entries_per_block;
	ulong blocks[1];
	kdev_t snap_phys_dev;
	lv_block_exception_t *be;
	struct kiobuf *COW_table_iobuf = lv_snap->lv_COW_table_iobuf;
	lv_COW_table_disk_t * lv_COW_table = (lv_COW_table_disk_t *)
		page_address(lv_snap->lv_COW_table_iobuf->maplist[0]);

	COW_chunks_per_pe = LVM_GET_COW_TABLE_CHUNKS_PER_PE(vg, lv_snap);
	COW_entries_per_pe = LVM_GET_COW_TABLE_ENTRIES_PER_PE(vg, lv_snap);

	/* get physical address of destination chunk */
	snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new;
	snap_pe_start = lv_snap->lv_block_exception[idx - (idx % COW_entries_per_pe)].rsector_new - lv_snap->lv_chunk_size;

	blksize_snap = lvm_sectsize(snap_phys_dev);

	COW_entries_per_block = blksize_snap / sizeof(lv_COW_table_disk_t);
	idx_COW_table = idx % COW_entries_per_pe % COW_entries_per_block;

	if ( idx_COW_table == 0) memset(lv_COW_table, 0, blksize_snap);

	/* sector offset into the on disk COW table */
	COW_table_sector_offset = (idx % COW_entries_per_pe) / (SECTOR_SIZE / sizeof(lv_COW_table_disk_t));

	/* COW table block to write next */
	blocks[0] = (snap_pe_start + COW_table_sector_offset) >> (blksize_snap >> 10);

	/* store new COW_table entry */
	be = lv_snap->lv_block_exception + idx;
	if(_pv_get_number(vg, be->rdev_org, &pvn))
		goto fail_pv_get_number;
	lv_COW_table[idx_COW_table].pv_org_number = cpu_to_le64(pvn);
	lv_COW_table[idx_COW_table].pv_org_rsector =
		cpu_to_le64(be->rsector_org);

	if(_pv_get_number(vg, snap_phys_dev, &pvn))
		goto fail_pv_get_number;
	lv_COW_table[idx_COW_table].pv_snap_number = cpu_to_le64(pvn);
	lv_COW_table[idx_COW_table].pv_snap_rsector =
		cpu_to_le64(be->rsector_new);

	COW_table_iobuf->length = blksize_snap;
	/* COW_table_iobuf->nr_pages = 1; */

	if (__brw_kiovec(WRITE, 1, &COW_table_iobuf, snap_phys_dev,
			 blocks, blksize_snap, lv_snap) != blksize_snap)
		goto fail_raw_write;

	/* initialization of next COW exception table block with zeroes */
	end_of_table = idx % COW_entries_per_pe == COW_entries_per_pe - 1;
	if (idx_COW_table % COW_entries_per_block == COW_entries_per_block - 1 || end_of_table)
	{
		/* don't go beyond the end */
		if (idx + 1 >= lv_snap->lv_remap_end) goto out;

		memset(lv_COW_table, 0, blksize_snap);

		if (end_of_table)
		{
			idx++;

			/* get physical address of destination chunk */
			snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new;
			snap_pe_start = lv_snap->lv_block_exception[idx - (idx % COW_entries_per_pe)].rsector_new - lv_snap->lv_chunk_size;
			blksize_snap = lvm_sectsize(snap_phys_dev);
			blocks[0] = snap_pe_start >> (blksize_snap >> 10);
		} else blocks[0]++;

		if (__brw_kiovec(WRITE, 1, &COW_table_iobuf, snap_phys_dev,
				 blocks, blksize_snap, lv_snap) !=
		    blksize_snap)
			goto fail_raw_write;
	}

 out:
	return 0;

 fail_raw_write:
	*reason = "write error";
	return 1;

 fail_pv_get_number:
	*reason = "_pv_get_number failed";
	return 1;
}
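
/*
 * Illustrative layout numbers for the function above (assuming a
 * 1024-byte snapshot block size and a 32-byte lv_COW_table_disk_t):
 * there are 32 COW entries per on-disk table block and 16 per sector,
 * so exception idx = 40 within its PE lands at idx_COW_table = 40 % 32
 * = 8, i.e. the ninth slot of the second table block, at a sector
 * offset of 40/16 = 2 into the on-disk COW table.
 */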

/*
 * FIXME_1.2
 * This function is a bit of a hack; we need to ensure that the
 * snapshot is never made active again, because it will surely be
 * corrupt.  At the moment we do not have access to the LVM metadata
 * from within the kernel.  So we set the first exception to point to
 * sector 1 (which will always be within the metadata, and as such
 * invalid).  User land tools will check for this when they are asked
 * to activate the snapshot and prevent this from happening.
 */
static void _disable_snapshot(vg_t *vg, lv_t *lv) {
	const char *err;

	lv->lv_block_exception[0].rsector_org = LVM_SNAPSHOT_DROPPED_SECTOR;
	/* _write_COW_table_block() returns non-zero (not negative) on error */
	if(_write_COW_table_block(vg, lv, 0, &err)) {
		printk(KERN_ERR "%s -- couldn't disable snapshot: %s\n",
		       lvm_name, err);
	}
}