fs/ocfs2/aops.c

   1 /* -*- mode: c; c-basic-offset: 8; -*-
   2  * vim: noexpandtab sw=8 ts=8 sts=0:
   3  *
   4  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public
  17  * License along with this program; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 021110-1307, USA.
  20  */
  21
  22 #include <linux/fs.h>
  23 #include <linux/slab.h>
  24 #include <linux/highmem.h>
  25 #include <linux/pagemap.h>
  26 #include <asm/byteorder.h>
  27 #include <linux/swap.h>
  28 #include <linux/pipe_fs_i.h>
  29
  30 #define MLOG_MASK_PREFIX ML_FILE_IO
  31 #include <cluster/masklog.h>
  32
  33 #include "ocfs2.h"
  34
  35 #include "alloc.h"
  36 #include "aops.h"
  37 #include "dlmglue.h"
  38 #include "extent_map.h"
  39 #include "file.h"
  40 #include "inode.h"
  41 #include "journal.h"
  42 #include "suballoc.h"
  43 #include "super.h"
  44 #include "symlink.h"
  45
  46 #include "buffer_head_io.h"
  47
  48 static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
  49                                    struct buffer_head *bh_result, int create)
  50 {
  51         int err = -EIO;
  52         int status;
  53         struct ocfs2_dinode *fe = NULL;
  54         struct buffer_head *bh = NULL;
  55         struct buffer_head *buffer_cache_bh = NULL;
  56         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  57         void *kaddr;
  58
  59         mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
  60                    (unsigned long long)iblock, bh_result, create);
  61
  62         BUG_ON(ocfs2_inode_is_fast_symlink(inode));
  63
  64         if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
  65                 mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
  66                      (unsigned long long)iblock);
  67                 goto bail;
  68         }
  69
  70         status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
  71                                   OCFS2_I(inode)->ip_blkno,
  72                                   &bh, OCFS2_BH_CACHED, inode);
  73         if (status < 0) {
  74                 mlog_errno(status);
  75                 goto bail;
  76         }
  77         fe = (struct ocfs2_dinode *) bh->b_data;
  78
  79         if (!OCFS2_IS_VALID_DINODE(fe)) {
  80                 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
  81                      (unsigned long long)fe->i_blkno, 7, fe->i_signature);
  82                 goto bail;
  83         }
  84
  85         if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
  86                                                     le32_to_cpu(fe->i_clusters))) {
  87                 mlog(ML_ERROR, "block offset is outside the allocated size: "
  88                      "%llu\n", (unsigned long long)iblock);
  89                 goto bail;
  90         }
  91
  92         /* We don't use the page cache to create symlink data, so if
  93          * need be, copy it over from the buffer cache. */
  94         if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
  95                 u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
  96                             iblock;
  97                 buffer_cache_bh = sb_getblk(osb->sb, blkno);
  98                 if (!buffer_cache_bh) {
  99                         mlog(ML_ERROR, "couldn't getblock for symlink!\n");
 100                         goto bail;
 101                 }
 102
 103                 /* we haven't locked out transactions, so a commit
 104                  * could've happened. Since we've got a reference on
 105                  * the bh, even if it commits while we're doing the
 106                  * copy, the data is still good. */
 107                 if (buffer_jbd(buffer_cache_bh)
 108                     && ocfs2_inode_is_new(inode)) {
 109                         kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
 110                         if (!kaddr) {
 111                                 mlog(ML_ERROR, "couldn't kmap!\n");
 112                                 goto bail;
 113                         }
 114                         memcpy(kaddr + (bh_result->b_size * iblock),
 115                                buffer_cache_bh->b_data,
 116                                bh_result->b_size);
 117                         kunmap_atomic(kaddr, KM_USER0);
 118                         set_buffer_uptodate(bh_result);
 119                 }
 120                 brelse(buffer_cache_bh);
 121         }
 122
 123         map_bh(bh_result, inode->i_sb,
 124                le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
 125
 126         err = 0;
 127
 128 bail:
 129         if (bh)
 130                 brelse(bh);
 131
 132         mlog_exit(err);
 133         return err;
 134 }
 135
 136 static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 137                            struct buffer_head *bh_result, int create)
 138 {
 139         int err = 0;
 140         u64 p_blkno, past_eof;
 141         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 142
 143         mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
 144                    (unsigned long long)iblock, bh_result, create);
 145
 146         if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
 147                 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
 148                      inode, inode->i_ino);
 149
 150         if (S_ISLNK(inode->i_mode)) {
 151                 /* this always does I/O for some reason. */
 152                 err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
 153                 goto bail;
 154         }
 155
 156         err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL);
 157         if (err) {
 158                 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
 159                      "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
 160                      (unsigned long long)p_blkno);
 161                 goto bail;
 162         }
 163
 164         /*
 165          * ocfs2 never allocates in this function - the only time we
 166          * need to use BH_New is when we're extending i_size on a file
 167          * system which doesn't support holes, in which case BH_New
 168          * allows block_prepare_write() to zero.
 169          */
 170         mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
 171                         "ino %lu, iblock %llu\n", inode->i_ino,
 172                         (unsigned long long)iblock);
 173
 174         if (p_blkno)
 175                 map_bh(bh_result, inode->i_sb, p_blkno);
 176
 177         if (!ocfs2_sparse_alloc(osb)) {
 178                 if (p_blkno == 0) {
 179                         err = -EIO;
 180                         mlog(ML_ERROR,
 181                              "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
 182                              (unsigned long long)iblock,
 183                              (unsigned long long)p_blkno,
 184                              (unsigned long long)OCFS2_I(inode)->ip_blkno);
 185                         mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
 186                         dump_stack();
 187                 }
 188
 189                 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 190                 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
 191                      (unsigned long long)past_eof);
 192
 193                 if (create && (iblock >= past_eof))
 194                         set_buffer_new(bh_result);
 195         }
 196
 197 bail:
 198         if (err < 0)
 199                 err = -EIO;
 200
 201         mlog_exit(err);
 202         return err;
 203 }
 204
 205 static int ocfs2_readpage(struct file *file, struct page *page)
 206 {
 207         struct inode *inode = page->mapping->host;
 208         loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
 209         int ret, unlock = 1;
 210
 211         mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
 212
 213         ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
 214         if (ret != 0) {
 215                 if (ret == AOP_TRUNCATED_PAGE)
 216                         unlock = 0;
 217                 mlog_errno(ret);
 218                 goto out;
 219         }
 220
 221         down_read(&OCFS2_I(inode)->ip_alloc_sem);
 222
 223         /*
 224          * i_size might have just been updated as we grabed the meta lock.  We
 225          * might now be discovering a truncate that hit on another node.
 226          * block_read_full_page->get_block freaks out if it is asked to read
 227          * beyond the end of a file, so we check here.  Callers
 228          * (generic_file_read, fault->nopage) are clever enough to check i_size
 229          * and notice that the page they just read isn't needed.
 230          *
 231          * XXX sys_readahead() seems to get that wrong?
 232          */
 233         if (start >= i_size_read(inode)) {
 234                 char *addr = kmap(page);
 235                 memset(addr, 0, PAGE_SIZE);
 236                 flush_dcache_page(page);
 237                 kunmap(page);
 238                 SetPageUptodate(page);
 239                 ret = 0;
 240                 goto out_alloc;
 241         }
 242
 243         ret = ocfs2_data_lock_with_page(inode, 0, page);
 244         if (ret != 0) {
 245                 if (ret == AOP_TRUNCATED_PAGE)
 246                         unlock = 0;
 247                 mlog_errno(ret);
 248                 goto out_alloc;
 249         }
 250
 251         ret = block_read_full_page(page, ocfs2_get_block);
 252         unlock = 0;
 253
 254         ocfs2_data_unlock(inode, 0);
 255 out_alloc:
 256         up_read(&OCFS2_I(inode)->ip_alloc_sem);
 257         ocfs2_meta_unlock(inode, 0);
 258 out:
 259         if (unlock)
 260                 unlock_page(page);
 261         mlog_exit(ret);
 262         return ret;
 263 }
 264
 265 /* Note: Because we don't support holes, our allocation has
 266  * already happened (allocation writes zeros to the file data)
 267  * so we don't have to worry about ordered writes in
 268  * ocfs2_writepage.
 269  *
 270  * ->writepage is called during the process of invalidating the page cache
 271  * during blocked lock processing.  It can't block on any cluster locks
 272  * to during block mapping.  It's relying on the fact that the block
 273  * mapping can't have disappeared under the dirty pages that it is
 274  * being asked to write back.
 275  */
 276 static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 277 {
 278         int ret;
 279
 280         mlog_entry("(0x%p)\n", page);
 281
 282         ret = block_write_full_page(page, ocfs2_get_block, wbc);
 283
 284         mlog_exit(ret);
 285
 286         return ret;
 287 }
 288
 289 /*
 290  * This is called from ocfs2_write_zero_page() which has handled it's
 291  * own cluster locking and has ensured allocation exists for those
 292  * blocks to be written.
 293  */
 294 int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
 295                                unsigned from, unsigned to)
 296 {
 297         int ret;
 298
 299         down_read(&OCFS2_I(inode)->ip_alloc_sem);
 300
 301         ret = block_prepare_write(page, from, to, ocfs2_get_block);
 302
 303         up_read(&OCFS2_I(inode)->ip_alloc_sem);
 304
 305         return ret;
 306 }
 307
 308 /* Taken from ext3. We don't necessarily need the full blown
 309  * functionality yet, but IMHO it's better to cut and paste the whole
 310  * thing so we can avoid introducing our own bugs (and easily pick up
 311  * their fixes when they happen) --Mark */
 312 int walk_page_buffers(  handle_t *handle,
 313                         struct buffer_head *head,
 314                         unsigned from,
 315                         unsigned to,
 316                         int *partial,
 317                         int (*fn)(      handle_t *handle,
 318                                         struct buffer_head *bh))
 319 {
 320         struct buffer_head *bh;
 321         unsigned block_start, block_end;
 322         unsigned blocksize = head->b_size;
 323         int err, ret = 0;
 324         struct buffer_head *next;
 325
 326         for (   bh = head, block_start = 0;
 327                 ret == 0 && (bh != head || !block_start);
 328                 block_start = block_end, bh = next)
 329         {
 330                 next = bh->b_this_page;
 331                 block_end = block_start + blocksize;
 332                 if (block_end <= from || block_start >= to) {
 333                         if (partial && !buffer_uptodate(bh))
 334                                 *partial = 1;
 335                         continue;
 336                 }
 337                 err = (*fn)(handle, bh);
 338                 if (!ret)
 339                         ret = err;
 340         }
 341         return ret;
 342 }
 343
 344 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 345                                                          struct page *page,
 346                                                          unsigned from,
 347                                                          unsigned to)
 348 {
 349         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 350         handle_t *handle = NULL;
 351         int ret = 0;
 352
 353         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 354         if (!handle) {
 355                 ret = -ENOMEM;
 356                 mlog_errno(ret);
 357                 goto out;
 358         }
 359
 360         if (ocfs2_should_order_data(inode)) {
 361                 ret = walk_page_buffers(handle,
 362                                         page_buffers(page),
 363                                         from, to, NULL,
 364                                         ocfs2_journal_dirty_data);
 365                 if (ret < 0)
 366                         mlog_errno(ret);
 367         }
 368 out:
 369         if (ret) {
 370                 if (handle)
 371                         ocfs2_commit_trans(osb, handle);
 372                 handle = ERR_PTR(ret);
 373         }
 374         return handle;
 375 }
 376
 377 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 378 {
 379         sector_t status;
 380         u64 p_blkno = 0;
 381         int err = 0;
 382         struct inode *inode = mapping->host;
 383
 384         mlog_entry("(block = %llu)\n", (unsigned long long)block);
 385
 386         /* We don't need to lock journal system files, since they aren't
 387          * accessed concurrently from multiple nodes.
 388          */
 389         if (!INODE_JOURNAL(inode)) {
 390                 err = ocfs2_meta_lock(inode, NULL, 0);
 391                 if (err) {
 392                         if (err != -ENOENT)
 393                                 mlog_errno(err);
 394                         goto bail;
 395                 }
 396                 down_read(&OCFS2_I(inode)->ip_alloc_sem);
 397         }
 398
 399         err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL);
 400
 401         if (!INODE_JOURNAL(inode)) {
 402                 up_read(&OCFS2_I(inode)->ip_alloc_sem);
 403                 ocfs2_meta_unlock(inode, 0);
 404         }
 405
 406         if (err) {
 407                 mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
 408                      (unsigned long long)block);
 409                 mlog_errno(err);
 410                 goto bail;
 411         }
 412
 413
 414 bail:
 415         status = err ? 0 : p_blkno;
 416
 417         mlog_exit((int)status);
 418
 419         return status;
 420 }
 421
 422 /*
 423  * TODO: Make this into a generic get_blocks function.
 424  *
 425  * From do_direct_io in direct-io.c:
 426  *  "So what we do is to permit the ->get_blocks function to populate
 427  *   bh.b_size with the size of IO which is permitted at this offset and
 428  *   this i_blkbits."
 429  *
 430  * This function is called directly from get_more_blocks in direct-io.c.
 431  *
 432  * called like this: dio->get_blocks(dio->inode, fs_startblk,
 433  *                                      fs_count, map_bh, dio->rw == WRITE);
 434  */
 435 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 436                                      struct buffer_head *bh_result, int create)
 437 {
 438         int ret;
 439         u64 p_blkno, inode_blocks;
 440         int contig_blocks;
 441         unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 442         unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
 443
 444         /* This function won't even be called if the request isn't all
 445          * nicely aligned and of the right size, so there's no need
 446          * for us to check any of that. */
 447
 448         inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 449
 450         /*
 451          * Any write past EOF is not allowed because we'd be extending.
 452          */
 453         if (create && (iblock + max_blocks) > inode_blocks) {
 454                 ret = -EIO;
 455                 goto bail;
 456         }
 457
 458         /* This figures out the size of the next contiguous block, and
 459          * our logical offset */
 460         ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
 461                                           &contig_blocks);
 462         if (ret) {
 463                 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
 464                      (unsigned long long)iblock);
 465                 ret = -EIO;
 466                 goto bail;
 467         }
 468
 469         if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
 470                 ocfs2_error(inode->i_sb,
 471                             "Inode %llu has a hole at block %llu\n",
 472                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
 473                             (unsigned long long)iblock);
 474                 ret = -EROFS;
 475                 goto bail;
 476         }
 477
 478         /*
 479          * get_more_blocks() expects us to describe a hole by clearing
 480          * the mapped bit on bh_result().
 481          */
 482         if (p_blkno)
 483                 map_bh(bh_result, inode->i_sb, p_blkno);
 484         else {
 485                 /*
 486                  * ocfs2_prepare_inode_for_write() should have caught
 487                  * the case where we'd be filling a hole and triggered
 488                  * a buffered write instead.
 489                  */
 490                 if (create) {
 491                         ret = -EIO;
 492                         mlog_errno(ret);
 493                         goto bail;
 494                 }
 495
 496                 clear_buffer_mapped(bh_result);
 497         }
 498
 499         /* make sure we don't map more than max_blocks blocks here as
 500            that's all the kernel will handle at this point. */
 501         if (max_blocks < contig_blocks)
 502                 contig_blocks = max_blocks;
 503         bh_result->b_size = contig_blocks << blocksize_bits;
 504 bail:
 505         return ret;
 506 }
 507
 508 /*
 509  * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
 510  * particularly interested in the aio/dio case.  Like the core uses
 511  * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
 512  * truncation on another.
 513  */
 514 static void ocfs2_dio_end_io(struct kiocb *iocb,
 515                              loff_t offset,
 516                              ssize_t bytes,
 517                              void *private)
 518 {
 519         struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 520
 521         /* this io's submitter should not have unlocked this before we could */
 522         BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
 523         ocfs2_iocb_clear_rw_locked(iocb);
 524         up_read(&inode->i_alloc_sem);
 525         ocfs2_rw_unlock(inode, 0);
 526 }
 527
 528 /*
 529  * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
 530  * from ext3.  PageChecked() bits have been removed as OCFS2 does not
 531  * do journalled data.
 532  */
 533 static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
 534 {
 535         journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
 536
 537         journal_invalidatepage(journal, page, offset);
 538 }
 539
 540 static int ocfs2_releasepage(struct page *page, gfp_t wait)
 541 {
 542         journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
 543
 544         if (!page_has_buffers(page))
 545                 return 0;
 546         return journal_try_to_free_buffers(journal, page, wait);
 547 }
 548
 549 static ssize_t ocfs2_direct_IO(int rw,
 550                                struct kiocb *iocb,
 551                                const struct iovec *iov,
 552                                loff_t offset,
 553                                unsigned long nr_segs)
 554 {
 555         struct file *file = iocb->ki_filp;
 556         struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
 557         int ret;
 558
 559         mlog_entry_void();
 560
 561         if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
 562                 /*
 563                  * We get PR data locks even for O_DIRECT.  This
 564                  * allows concurrent O_DIRECT I/O but doesn't let
 565                  * O_DIRECT with extending and buffered zeroing writes
 566                  * race.  If they did race then the buffered zeroing
 567                  * could be written back after the O_DIRECT I/O.  It's
 568                  * one thing to tell people not to mix buffered and
 569                  * O_DIRECT writes, but expecting them to understand
 570                  * that file extension is also an implicit buffered
 571                  * write is too much.  By getting the PR we force
 572                  * writeback of the buffered zeroing before
 573                  * proceeding.
 574                  */
 575                 ret = ocfs2_data_lock(inode, 0);
 576                 if (ret < 0) {
 577                         mlog_errno(ret);
 578                         goto out;
 579                 }
 580                 ocfs2_data_unlock(inode, 0);
 581         }
 582
 583         ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
 584                                             inode->i_sb->s_bdev, iov, offset,
 585                                             nr_segs,
 586                                             ocfs2_direct_IO_get_blocks,
 587                                             ocfs2_dio_end_io);
 588 out:
 589         mlog_exit(ret);
 590         return ret;
 591 }
 592
 593 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
 594                                             u32 cpos,
 595                                             unsigned int *start,
 596                                             unsigned int *end)
 597 {
 598         unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
 599
 600         if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
 601                 unsigned int cpp;
 602
 603                 cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
 604
 605                 cluster_start = cpos % cpp;
 606                 cluster_start = cluster_start << osb->s_clustersize_bits;
 607
 608                 cluster_end = cluster_start + osb->s_clustersize;
 609         }
 610
 611         BUG_ON(cluster_start > PAGE_SIZE);
 612         BUG_ON(cluster_end > PAGE_SIZE);
 613
 614         if (start)
 615                 *start = cluster_start;
 616         if (end)
 617                 *end = cluster_end;
 618 }
 619
 620 /*
 621  * 'from' and 'to' are the region in the page to avoid zeroing.
 622  *
 623  * If pagesize > clustersize, this function will avoid zeroing outside
 624  * of the cluster boundary.
 625  *
 626  * from == to == 0 is code for "zero the entire cluster region"
 627  */
 628 static void ocfs2_clear_page_regions(struct page *page,
 629                                      struct ocfs2_super *osb, u32 cpos,
 630                                      unsigned from, unsigned to)
 631 {
 632         void *kaddr;
 633         unsigned int cluster_start, cluster_end;
 634
 635         ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
 636
 637         kaddr = kmap_atomic(page, KM_USER0);
 638
 639         if (from || to) {
 640                 if (from > cluster_start)
 641                         memset(kaddr + cluster_start, 0, from - cluster_start);
 642                 if (to < cluster_end)
 643                         memset(kaddr + to, 0, cluster_end - to);
 644         } else {
 645                 memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
 646         }
 647
 648         kunmap_atomic(kaddr, KM_USER0);
 649 }
 650
 651 /*
 652  * Some of this taken from block_prepare_write(). We already have our
 653  * mapping by now though, and the entire write will be allocating or
 654  * it won't, so not much need to use BH_New.
 655  *
 656  * This will also skip zeroing, which is handled externally.
 657  */
 658 int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 659                           struct inode *inode, unsigned int from,
 660                           unsigned int to, int new)
 661 {
 662         int ret = 0;
 663         struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
 664         unsigned int block_end, block_start;
 665         unsigned int bsize = 1 << inode->i_blkbits;
 666
 667         if (!page_has_buffers(page))
 668                 create_empty_buffers(page, bsize, 0);
 669
 670         head = page_buffers(page);
 671         for (bh = head, block_start = 0; bh != head || !block_start;
 672              bh = bh->b_this_page, block_start += bsize) {
 673                 block_end = block_start + bsize;
 674
 675                 /*
 676                  * Ignore blocks outside of our i/o range -
 677                  * they may belong to unallocated clusters.
 678                  */
 679                 if (block_start >= to || block_end <= from) {
 680                         if (PageUptodate(page))
 681                                 set_buffer_uptodate(bh);
 682                         continue;
 683                 }
 684
 685                 /*
 686                  * For an allocating write with cluster size >= page
 687                  * size, we always write the entire page.
 688                  */
 689
 690                 if (buffer_new(bh))
 691                         clear_buffer_new(bh);
 692
 693                 if (!buffer_mapped(bh)) {
 694                         map_bh(bh, inode->i_sb, *p_blkno);
 695                         unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
 696                 }
 697
 698                 if (PageUptodate(page)) {
 699                         if (!buffer_uptodate(bh))
 700                                 set_buffer_uptodate(bh);
 701                 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
 702                      (block_start < from || block_end > to)) {
 703                         ll_rw_block(READ, 1, &bh);
 704                         *wait_bh++=bh;
 705                 }
 706
 707                 *p_blkno = *p_blkno + 1;
 708         }
 709
 710         /*
 711          * If we issued read requests - let them complete.
 712          */
 713         while(wait_bh > wait) {
 714                 wait_on_buffer(*--wait_bh);
 715                 if (!buffer_uptodate(*wait_bh))
 716                         ret = -EIO;
 717         }
 718
 719         if (ret == 0 || !new)
 720                 return ret;
 721
 722         /*
 723          * If we get -EIO above, zero out any newly allocated blocks
 724          * to avoid exposing stale data.
 725          */
 726         bh = head;
 727         block_start = 0;
 728         do {
 729                 void *kaddr;
 730
 731                 block_end = block_start + bsize;
 732                 if (block_end <= from)
 733                         goto next_bh;
 734                 if (block_start >= to)
 735                         break;
 736
 737                 kaddr = kmap_atomic(page, KM_USER0);
 738                 memset(kaddr+block_start, 0, bh->b_size);
 739                 flush_dcache_page(page);
 740                 kunmap_atomic(kaddr, KM_USER0);
 741                 set_buffer_uptodate(bh);
 742                 mark_buffer_dirty(bh);
 743
 744 next_bh:
 745                 block_start = block_end;
 746                 bh = bh->b_this_page;
 747         } while (bh != head);
 748
 749         return ret;
 750 }
 751
 752 /*
 753  * This will copy user data from the buffer page in the splice
 754  * context.
 755  *
 756  * For now, we ignore SPLICE_F_MOVE as that would require some extra
 757  * communication out all the way to ocfs2_write().
 758  */
 759 int ocfs2_map_and_write_splice_data(struct inode *inode,
 760                                   struct ocfs2_write_ctxt *wc, u64 *p_blkno,
 761                                   unsigned int *ret_from, unsigned int *ret_to)
 762 {
 763         int ret;
 764         unsigned int to, from, cluster_start, cluster_end;
 765         char *src, *dst;
 766         struct ocfs2_splice_write_priv *sp = wc->w_private;
 767         struct pipe_buffer *buf = sp->s_buf;
 768         unsigned long bytes, src_from;
 769         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 770
 771         ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
 772                                         &cluster_end);
 773
 774         from = sp->s_offset;
 775         src_from = sp->s_buf_offset;
 776         bytes = wc->w_count;
 777
 778         if (wc->w_large_pages) {
 779                 /*
 780                  * For cluster size < page size, we have to
 781                  * calculate pos within the cluster and obey
 782                  * the rightmost boundary.
 783                  */
 784                 bytes = min(bytes, (unsigned long)(osb->s_clustersize
 785                                    - (wc->w_pos & (osb->s_clustersize - 1))));
 786         }
 787         to = from + bytes;
 788
 789         if (wc->w_this_page_new)
 790                 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
 791                                             cluster_start, cluster_end, 1);
 792         else
 793                 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
 794                                             from, to, 0);
 795         if (ret) {
 796                 mlog_errno(ret);
 797                 goto out;
 798         }
 799
 800         BUG_ON(from > PAGE_CACHE_SIZE);
 801         BUG_ON(to > PAGE_CACHE_SIZE);
 802         BUG_ON(from > osb->s_clustersize);
 803         BUG_ON(to > osb->s_clustersize);
 804
 805         src = buf->ops->map(sp->s_pipe, buf, 1);
 806         dst = kmap_atomic(wc->w_this_page, KM_USER1);
 807         memcpy(dst + from, src + src_from, bytes);
 808         kunmap_atomic(wc->w_this_page, KM_USER1);
 809         buf->ops->unmap(sp->s_pipe, buf, src);
 810
 811         wc->w_finished_copy = 1;
 812
 813         *ret_from = from;
 814         *ret_to = to;
 815 out:
 816
 817         return bytes ? (unsigned int)bytes : ret;
 818 }
 819
 820 /*
 821  * This will copy user data from the iovec in the buffered write
 822  * context.
 823  */
 824 int ocfs2_map_and_write_user_data(struct inode *inode,
 825                                   struct ocfs2_write_ctxt *wc, u64 *p_blkno,
 826                                   unsigned int *ret_from, unsigned int *ret_to)
 827 {
 828         int ret;
 829         unsigned int to, from, cluster_start, cluster_end;
 830         unsigned long bytes, src_from;
 831         char *dst;
 832         struct ocfs2_buffered_write_priv *bp = wc->w_private;
 833         const struct iovec *cur_iov = bp->b_cur_iov;
 834         char __user *buf;
 835         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 836
 837         ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
 838                                         &cluster_end);
 839
 840         buf = cur_iov->iov_base + bp->b_cur_off;
 841         src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
 842
 843         from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
 844
 845         /*
 846          * This is a lot of comparisons, but it reads quite
 847          * easily, which is important here.
 848          */
 849         /* Stay within the src page */
 850         bytes = PAGE_SIZE - src_from;
 851         /* Stay within the vector */
 852         bytes = min(bytes,
 853                     (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
 854         /* Stay within count */
 855         bytes = min(bytes, (unsigned long)wc->w_count);
 856         /*
 857          * For clustersize > page size, just stay within
 858          * target page, otherwise we have to calculate pos
 859          * within the cluster and obey the rightmost
 860          * boundary.
 861          */
 862         if (wc->w_large_pages) {
 863                 /*
 864                  * For cluster size < page size, we have to
 865                  * calculate pos within the cluster and obey
 866                  * the rightmost boundary.
 867                  */
 868                 bytes = min(bytes, (unsigned long)(osb->s_clustersize
 869                                    - (wc->w_pos & (osb->s_clustersize - 1))));
 870         } else {
 871                 /*
 872                  * cluster size > page size is the most common
 873                  * case - we just stay within the target page
 874                  * boundary.
 875                  */
 876                 bytes = min(bytes, PAGE_CACHE_SIZE - from);
 877         }
 878
 879         to = from + bytes;
 880
 881         if (wc->w_this_page_new)
 882                 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
 883                                             cluster_start, cluster_end, 1);
 884         else
 885                 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
 886                                             from, to, 0);
 887         if (ret) {
 888                 mlog_errno(ret);
 889                 goto out;
 890         }
 891
 892         BUG_ON(from > PAGE_CACHE_SIZE);
 893         BUG_ON(to > PAGE_CACHE_SIZE);
 894         BUG_ON(from > osb->s_clustersize);
 895         BUG_ON(to > osb->s_clustersize);
 896
 897         dst = kmap(wc->w_this_page);
 898         memcpy(dst + from, bp->b_src_buf + src_from, bytes);
 899         kunmap(wc->w_this_page);
 900
 901         /*
 902          * XXX: This is slow, but simple. The caller of
 903          * ocfs2_buffered_write_cluster() is responsible for
 904          * passing through the iovecs, so it's difficult to
 905          * predict what our next step is in here after our
 906          * initial write. A future version should be pushing
 907          * that iovec manipulation further down.
 908          *
 909          * By setting this, we indicate that a copy from user
 910          * data was done, and subsequent calls for this
 911          * cluster will skip copying more data.
 912          */
 913         wc->w_finished_copy = 1;
 914
 915         *ret_from = from;
 916         *ret_to = to;
 917 out:
 918
 919         return bytes ? (unsigned int)bytes : ret;
 920 }
 921
 922 /*
 923  * Map, fill and write a page to disk.
 924  *
 925  * The work of copying data is done via callback.  Newly allocated
 926  * pages which don't take user data will be zero'd (set 'new' to
 927  * indicate an allocating write)
 928  *
 929  * Returns a negative error code or the number of bytes copied into
 930  * the page.
 931  */
 932 int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
 933                           u64 *p_blkno, struct page *page,
 934                           struct ocfs2_write_ctxt *wc, int new)
 935 {
 936         int ret, copied = 0;
 937         unsigned int from = 0, to = 0;
 938         unsigned int cluster_start, cluster_end;
 939         unsigned int zero_from = 0, zero_to = 0;
 940
 941         ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
 942                                         &cluster_start, &cluster_end);
 943
 944         if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
 945             && !wc->w_finished_copy) {
 946
 947                 wc->w_this_page = page;
 948                 wc->w_this_page_new = new;
 949                 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
 950                 if (ret < 0) {
 951                         mlog_errno(ret);
 952                         goto out;
 953                 }
 954
 955                 copied = ret;
 956
 957                 zero_from = from;
 958                 zero_to = to;
 959                 if (new) {
 960                         from = cluster_start;
 961                         to = cluster_end;
 962                 }
 963         } else {
 964                 /*
 965                  * If we haven't allocated the new page yet, we
 966                  * shouldn't be writing it out without copying user
 967                  * data. This is likely a math error from the caller.
 968                  */
 969                 BUG_ON(!new);
 970
 971                 from = cluster_start;
 972                 to = cluster_end;
 973
 974                 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
 975                                             cluster_start, cluster_end, 1);
 976                 if (ret) {
 977                         mlog_errno(ret);
 978                         goto out;
 979                 }
 980         }
 981
 982         /*
 983          * Parts of newly allocated pages need to be zero'd.
 984          *
 985          * Above, we have also rewritten 'to' and 'from' - as far as
 986          * the rest of the function is concerned, the entire cluster
 987          * range inside of a page needs to be written.
 988          *
 989          * We can skip this if the page is up to date - it's already
 990          * been zero'd from being read in as a hole.
 991          */
 992         if (new && !PageUptodate(page))
 993                 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
 994                                          wc->w_cpos, zero_from, zero_to);
 995
 996         flush_dcache_page(page);
 997
 998         if (ocfs2_should_order_data(inode)) {
 999                 ret = walk_page_buffers(handle,
1000                                         page_buffers(page),
1001                                         from, to, NULL,
1002                                         ocfs2_journal_dirty_data);
1003                 if (ret < 0)
1004                         mlog_errno(ret);
1005         }
1006
1007         /*
1008          * We don't use generic_commit_write() because we need to
1009          * handle our own i_size update.
1010          */
1011         ret = block_commit_write(page, from, to);
1012         if (ret)
1013                 mlog_errno(ret);
1014 out:
1015
1016         return copied ? copied : ret;
1017 }
1018
1019 /*
1020  * Do the actual write of some data into an inode. Optionally allocate
1021  * in order to fulfill the write.
1022  *
1023  * cpos is the logical cluster offset within the file to write at
1024  *
1025  * 'phys' is the physical mapping of that offset. a 'phys' value of
1026  * zero indicates that allocation is required. In this case, data_ac
1027  * and meta_ac should be valid (meta_ac can be null if metadata
1028  * allocation isn't required).
1029  */
1030 static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
1031                            struct buffer_head *di_bh,
1032                            struct ocfs2_alloc_context *data_ac,
1033                            struct ocfs2_alloc_context *meta_ac,
1034                            struct ocfs2_write_ctxt *wc)
1035 {
1036         int ret, i, numpages = 1, new;
1037         unsigned int copied = 0;
1038         u32 tmp_pos;
1039         u64 v_blkno, p_blkno;
1040         struct address_space *mapping = file->f_mapping;
1041         struct inode *inode = mapping->host;
1042         unsigned long index, start;
1043         struct page **cpages;
1044
1045         new = phys == 0 ? 1 : 0;
1046
1047         /*
1048          * Figure out how many pages we'll be manipulating here. For
1049          * non allocating write, we just change the one
1050          * page. Otherwise, we'll need a whole clusters worth.
1051          */
1052         if (new)
1053                 numpages = ocfs2_pages_per_cluster(inode->i_sb);
1054
1055         cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
1056         if (!cpages) {
1057                 ret = -ENOMEM;
1058                 mlog_errno(ret);
1059                 return ret;
1060         }
1061
1062         /*
1063          * Fill our page array first. That way we've grabbed enough so
1064          * that we can zero and flush if we error after adding the
1065          * extent.
1066          */
1067         if (new) {
1068                 start = ocfs2_align_clusters_to_page_index(inode->i_sb,
1069                                                            wc->w_cpos);
1070                 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
1071         } else {
1072                 start = wc->w_pos >> PAGE_CACHE_SHIFT;
1073                 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
1074         }
1075
1076         for(i = 0; i < numpages; i++) {
1077                 index = start + i;
1078
1079                 cpages[i] = grab_cache_page(mapping, index);
1080                 if (!cpages[i]) {
1081                         ret = -ENOMEM;
1082                         mlog_errno(ret);
1083                         goto out;
1084                 }
1085         }
1086
1087         if (new) {
1088                 /*
1089                  * This is safe to call with the page locks - it won't take
1090                  * any additional semaphores or cluster locks.
1091                  */
1092                 tmp_pos = wc->w_cpos;
1093                 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1094                                                  &tmp_pos, 1, di_bh, handle,
1095                                                  data_ac, meta_ac, NULL);
1096                 /*
1097                  * This shouldn't happen because we must have already
1098                  * calculated the correct meta data allocation required. The
1099                  * internal tree allocation code should know how to increase
1100                  * transaction credits itself.
1101                  *
1102                  * If need be, we could handle -EAGAIN for a
1103                  * RESTART_TRANS here.
1104                  */
1105                 mlog_bug_on_msg(ret == -EAGAIN,
1106                                 "Inode %llu: EAGAIN return during allocation.\n",
1107                                 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1108                 if (ret < 0) {
1109                         mlog_errno(ret);
1110                         goto out;
1111                 }
1112         }
1113
1114         ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL);
1115         if (ret < 0) {
1116
1117                 /*
1118                  * XXX: Should we go readonly here?
1119                  */
1120
1121                 mlog_errno(ret);
1122                 goto out;
1123         }
1124
1125         BUG_ON(p_blkno == 0);
1126
1127         for(i = 0; i < numpages; i++) {
1128                 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
1129                                             wc, new);
1130                 if (ret < 0) {
1131                         mlog_errno(ret);
1132                         goto out;
1133                 }
1134
1135                 copied += ret;
1136         }
1137
1138 out:
1139         for(i = 0; i < numpages; i++) {
1140                 unlock_page(cpages[i]);
1141                 mark_page_accessed(cpages[i]);
1142                 page_cache_release(cpages[i]);
1143         }
1144         kfree(cpages);
1145
1146         return copied ? copied : ret;
1147 }
1148
1149 static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
1150                                   struct ocfs2_super *osb, loff_t pos,
1151                                   size_t count, ocfs2_page_writer *cb,
1152                                   void *cb_priv)
1153 {
1154         wc->w_count = count;
1155         wc->w_pos = pos;
1156         wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
1157         wc->w_finished_copy = 0;
1158
1159         if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1160                 wc->w_large_pages = 1;
1161         else
1162                 wc->w_large_pages = 0;
1163
1164         wc->w_write_data_page = cb;
1165         wc->w_private = cb_priv;
1166 }
1167
1168 /*
1169  * Write a cluster to an inode. The cluster may not be allocated yet,
1170  * in which case it will be. This only exists for buffered writes -
1171  * O_DIRECT takes a more "traditional" path through the kernel.
1172  *
1173  * The caller is responsible for incrementing pos, written counts, etc
1174  *
1175  * For file systems that don't support sparse files, pre-allocation
1176  * and page zeroing up until cpos should be done prior to this
1177  * function call.
1178  *
1179  * Callers should be holding i_sem, and the rw cluster lock.
1180  *
1181  * Returns the number of user bytes written, or less than zero for
1182  * error.
1183  */
1184 ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
1185                                      size_t count, ocfs2_page_writer *actor,
1186                                      void *priv)
1187 {
1188         int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1189         ssize_t written = 0;
1190         u32 phys;
1191         struct inode *inode = file->f_mapping->host;
1192         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1193         struct buffer_head *di_bh = NULL;
1194         struct ocfs2_dinode *di;
1195         struct ocfs2_alloc_context *data_ac = NULL;
1196         struct ocfs2_alloc_context *meta_ac = NULL;
1197         handle_t *handle;
1198         struct ocfs2_write_ctxt wc;
1199
1200         ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
1201
1202         ret = ocfs2_meta_lock(inode, &di_bh, 1);
1203         if (ret) {
1204                 mlog_errno(ret);
1205                 goto out;
1206         }
1207         di = (struct ocfs2_dinode *)di_bh->b_data;
1208
1209         /*
1210          * Take alloc sem here to prevent concurrent lookups. That way
1211          * the mapping, zeroing and tree manipulation within
1212          * ocfs2_write() will be safe against ->readpage(). This
1213          * should also serve to lock out allocation from a shared
1214          * writeable region.
1215          */
1216         down_write(&OCFS2_I(inode)->ip_alloc_sem);
1217
1218         ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL);
1219         if (ret) {
1220                 mlog_errno(ret);
1221                 goto out_meta;
1222         }
1223
1224         /* phys == 0 means that allocation is required. */
1225         if (phys == 0) {
1226                 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
1227                 if (ret) {
1228                         mlog_errno(ret);
1229                         goto out_meta;
1230                 }
1231
1232                 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
1233         }
1234
1235         ret = ocfs2_data_lock(inode, 1);
1236         if (ret) {
1237                 mlog_errno(ret);
1238                 goto out_meta;
1239         }
1240
1241         handle = ocfs2_start_trans(osb, credits);
1242         if (IS_ERR(handle)) {
1243                 ret = PTR_ERR(handle);
1244                 mlog_errno(ret);
1245                 goto out_data;
1246         }
1247
1248         written = ocfs2_write(file, phys, handle, di_bh, data_ac,
1249                               meta_ac, &wc);
1250         if (written < 0) {
1251                 ret = written;
1252                 mlog_errno(ret);
1253                 goto out_commit;
1254         }
1255
1256         ret = ocfs2_journal_access(handle, inode, di_bh,
1257                                    OCFS2_JOURNAL_ACCESS_WRITE);
1258         if (ret) {
1259                 mlog_errno(ret);
1260                 goto out_commit;
1261         }
1262
1263         pos += written;
1264         if (pos > inode->i_size) {
1265                 i_size_write(inode, pos);
1266                 mark_inode_dirty(inode);
1267         }
1268         inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
1269         di->i_size = cpu_to_le64((u64)i_size_read(inode));
1270         inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1271         di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1272         di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1273
1274         ret = ocfs2_journal_dirty(handle, di_bh);
1275         if (ret)
1276                 mlog_errno(ret);
1277
1278 out_commit:
1279         ocfs2_commit_trans(osb, handle);
1280
1281 out_data:
1282         ocfs2_data_unlock(inode, 1);
1283
1284 out_meta:
1285         up_write(&OCFS2_I(inode)->ip_alloc_sem);
1286         ocfs2_meta_unlock(inode, 1);
1287
1288 out:
1289         brelse(di_bh);
1290         if (data_ac)
1291                 ocfs2_free_alloc_context(data_ac);
1292         if (meta_ac)
1293                 ocfs2_free_alloc_context(meta_ac);
1294
1295         return written ? written : ret;
1296 }
1297
1298 const struct address_space_operations ocfs2_aops = {
1299         .readpage       = ocfs2_readpage,
1300         .writepage      = ocfs2_writepage,
1301         .bmap           = ocfs2_bmap,
1302         .sync_page      = block_sync_page,
1303         .direct_IO      = ocfs2_direct_IO,
1304         .invalidatepage = ocfs2_invalidatepage,
1305         .releasepage    = ocfs2_releasepage,
1306         .migratepage    = buffer_migrate_page,
1307 };