Merge git://git.infradead.org/battery-2.6
[powerpc.git] / fs / ocfs2 / file.c
index 781ba6c..f92fe91 100644 (file)
@@ -397,6 +397,15 @@ static int ocfs2_truncate_file(struct inode *inode,
        unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
        truncate_inode_pages(inode->i_mapping, new_i_size);
 
+       if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+               status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
+                                              i_size_read(inode), 0);
+               if (status)
+                       mlog_errno(status);
+
+               goto bail_unlock_data;
+       }
+
        /* alright, we're going to need to do a full blown alloc size
         * change. Orphan the inode so that recovery can complete the
         * truncate if necessary. This does the task of marking
@@ -908,7 +917,8 @@ static int ocfs2_extend_file(struct inode *inode,
                             struct buffer_head *di_bh,
                             u64 new_i_size)
 {
-       int ret = 0;
+       int ret = 0, data_locked = 0;
+       struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
        BUG_ON(!di_bh);
 
@@ -920,7 +930,17 @@ static int ocfs2_extend_file(struct inode *inode,
                goto out;
        BUG_ON(new_i_size < i_size_read(inode));
 
-       if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+       /*
+        * Fall through for converting inline data, even if the fs
+        * supports sparse files.
+        *
+        * The check for inline data here is legal - nobody can add
+        * the feature since we have i_mutex. We must check it again
+        * after acquiring ip_alloc_sem though, as paths like mmap
+        * might have raced us to converting the inode to extents.
+        */
+       if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+           && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
                goto out_update_size;
 
        /* 
@@ -935,6 +955,7 @@ static int ocfs2_extend_file(struct inode *inode,
                mlog_errno(ret);
                goto out;
        }
+       data_locked = 1;
 
        /*
         * The alloc sem blocks people in read/write from reading our
@@ -942,9 +963,31 @@ static int ocfs2_extend_file(struct inode *inode,
         * i_mutex to block other extend/truncate calls while we're
         * here.
         */
-       down_write(&OCFS2_I(inode)->ip_alloc_sem);
-       ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
-       up_write(&OCFS2_I(inode)->ip_alloc_sem);
+       down_write(&oi->ip_alloc_sem);
+
+       if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+               /*
+                * We can optimize small extends by keeping the inode's
+                * inline data.
+                */
+               if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
+                       up_write(&oi->ip_alloc_sem);
+                       goto out_update_size;
+               }
+
+               ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
+               if (ret) {
+                       up_write(&oi->ip_alloc_sem);
+
+                       mlog_errno(ret);
+                       goto out_unlock;
+               }
+       }
+
+       if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+               ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
+
+       up_write(&oi->ip_alloc_sem);
 
        if (ret < 0) {
                mlog_errno(ret);
@@ -957,7 +1000,7 @@ out_update_size:
                mlog_errno(ret);
 
 out_unlock:
-       if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+       if (data_locked)
                ocfs2_data_unlock(inode, 1);
 
 out:
@@ -1231,6 +1274,31 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
 {
        int ret;
        u32 cpos, phys_cpos, clusters, alloc_size;
+       u64 end = start + len;
+       struct buffer_head *di_bh = NULL;
+
+       if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+               ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+                                      OCFS2_I(inode)->ip_blkno, &di_bh,
+                                      OCFS2_BH_CACHED, inode);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               /*
+                * Nothing to do if the requested reservation range
+                * fits within the inode.
+                */
+               if (ocfs2_size_fits_inline_data(di_bh, end))
+                       goto out;
+
+               ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
 
        /*
         * We consider both start and len to be inclusive.
@@ -1276,6 +1344,8 @@ next:
 
        ret = 0;
 out:
+
+       brelse(di_bh);
        return ret;
 }
 
@@ -1457,6 +1527,14 @@ static int ocfs2_remove_inode_range(struct inode *inode,
        if (byte_len == 0)
                return 0;
 
+       if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+               ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
+                                           byte_start + byte_len, 1);
+               if (ret)
+                       mlog_errno(ret);
+               return ret;
+       }
+
        trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
        trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
        if (trunc_len >= trunc_start)
@@ -1758,6 +1836,15 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                if (!direct_io || !(*direct_io))
                        break;
 
+               /*
+                * There's no sane way to do direct writes to an inode
+                * with inline data.
+                */
+               if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+                       *direct_io = 0;
+                       break;
+               }
+
                /*
                 * Allowing concurrent direct writes means
                 * i_size changes wouldn't be synchronized, so
@@ -1794,143 +1881,13 @@ out:
        return ret;
 }
 
-static inline void
-ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
-{
-       const struct iovec *iov = *iovp;
-       size_t base = *basep;
-
-       do {
-               int copy = min(bytes, iov->iov_len - base);
-
-               bytes -= copy;
-               base += copy;
-               if (iov->iov_len == base) {
-                       iov++;
-                       base = 0;
-               }
-       } while (bytes);
-       *iovp = iov;
-       *basep = base;
-}
-
-static struct page * ocfs2_get_write_source(char **ret_src_buf,
-                                           const struct iovec *cur_iov,
-                                           size_t iov_offset)
-{
-       int ret;
-       char *buf = cur_iov->iov_base + iov_offset;
-       struct page *src_page = NULL;
-       unsigned long off;
-
-       off = (unsigned long)(buf) & ~PAGE_CACHE_MASK;
-
-       if (!segment_eq(get_fs(), KERNEL_DS)) {
-               /*
-                * Pull in the user page. We want to do this outside
-                * of the meta data locks in order to preserve locking
-                * order in case of page fault.
-                */
-               ret = get_user_pages(current, current->mm,
-                                    (unsigned long)buf & PAGE_CACHE_MASK, 1,
-                                    0, 0, &src_page, NULL);
-               if (ret == 1)
-                       *ret_src_buf = kmap(src_page) + off;
-               else
-                       src_page = ERR_PTR(-EFAULT);
-       } else {
-               *ret_src_buf = buf;
-       }
-
-       return src_page;
-}
-
-static void ocfs2_put_write_source(struct page *page)
-{
-       if (page) {
-               kunmap(page);
-               page_cache_release(page);
-       }
-}
-
-static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
-                                        const struct iovec *iov,
-                                        unsigned long nr_segs,
-                                        size_t count,
-                                        ssize_t o_direct_written)
-{
-       int ret = 0;
-       ssize_t copied, total = 0;
-       size_t iov_offset = 0, bytes;
-       loff_t pos;
-       const struct iovec *cur_iov = iov;
-       struct page *user_page, *page;
-       char * uninitialized_var(buf);
-       char *dst;
-       void *fsdata;
-
-       /*
-        * handle partial DIO write.  Adjust cur_iov if needed.
-        */
-       ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
-
-       do {
-               pos = *ppos;
-
-               user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset);
-               if (IS_ERR(user_page)) {
-                       ret = PTR_ERR(user_page);
-                       goto out;
-               }
-
-               /* Stay within our page boundaries */
-               bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)),
-                           (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK)));
-               /* Stay within the vector boundary */
-               bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset);
-               /* Stay within count */
-               bytes = min(bytes, count);
-
-               page = NULL;
-               ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0,
-                                       &page, &fsdata);
-               if (ret) {
-                       mlog_errno(ret);
-                       goto out;
-               }
-
-               dst = kmap_atomic(page, KM_USER0);
-               memcpy(dst + (pos & (loff_t)(PAGE_CACHE_SIZE - 1)), buf, bytes);
-               kunmap_atomic(dst, KM_USER0);
-               flush_dcache_page(page);
-               ocfs2_put_write_source(user_page);
-
-               copied = ocfs2_write_end(file, file->f_mapping, pos, bytes,
-                                        bytes, page, fsdata);
-               if (copied < 0) {
-                       mlog_errno(copied);
-                       ret = copied;
-                       goto out;
-               }
-
-               total += copied;
-               *ppos = pos + copied;
-               count -= copied;
-
-               ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
-       } while(count);
-
-out:
-       return total ? total : ret;
-}
-
 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
                                    const struct iovec *iov,
                                    unsigned long nr_segs,
                                    loff_t pos)
 {
        int ret, direct_io, appending, rw_level, have_alloc_sem  = 0;
-       int can_do_direct, sync = 0;
+       int can_do_direct;
        ssize_t written = 0;
        size_t ocount;          /* original count */
        size_t count;           /* after file limit checks */
@@ -1946,12 +1903,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        if (iocb->ki_left == 0)
                return 0;
 
-       ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
-       if (ret)
-               return ret;
-
-       count = ocount;
-
        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 
        appending = file->f_flags & O_APPEND ? 1 : 0;
@@ -1995,33 +1946,23 @@ relock:
                rw_level = -1;
 
                direct_io = 0;
-               sync = 1;
                goto relock;
        }
 
-       if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
-               sync = 1;
-
-       /*
-        * XXX: Is it ok to execute these checks a second time?
-        */
-       ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
-       if (ret)
-               goto out;
-
-       /*
-        * Set pos so that sync_page_range_nolock() below understands
-        * where to start from. We might've moved it around via the
-        * calls above. The range we want to actually sync starts from
-        * *ppos here.
-        *
-        */
-       pos = *ppos;
-
        /* communicate with ocfs2_dio_end_io */
        ocfs2_iocb_set_rw_locked(iocb, rw_level);
 
        if (direct_io) {
+               ret = generic_segment_checks(iov, &nr_segs, &ocount,
+                                            VERIFY_READ);
+               if (ret)
+                       goto out_dio;
+
+               ret = generic_write_checks(file, ppos, &count,
+                                          S_ISBLK(inode->i_mode));
+               if (ret)
+                       goto out_dio;
+
                written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
                                                    ppos, count, ocount);
                if (written < 0) {
@@ -2029,14 +1970,8 @@ relock:
                        goto out_dio;
                }
        } else {
-               written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
-                                                   count, written);
-               if (written < 0) {
-                       ret = written;
-                       if (ret != -EFAULT || ret != -ENOSPC)
-                               mlog_errno(ret);
-                       goto out;
-               }
+               written = generic_file_aio_write_nolock(iocb, iov, nr_segs,
+                                                       *ppos);
        }
 
 out_dio:
@@ -2066,97 +2001,12 @@ out_sems:
        if (have_alloc_sem)
                up_read(&inode->i_alloc_sem);
 
-       if (written > 0 && sync) {
-               ssize_t err;
-
-               err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
-               if (err < 0)
-                       written = err;
-       }
-
        mutex_unlock(&inode->i_mutex);
 
        mlog_exit(ret);
        return written ? written : ret;
 }
 
-static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
-                                   struct pipe_buffer *buf,
-                                   struct splice_desc *sd)
-{
-       int ret, count;
-       ssize_t copied = 0;
-       struct file *file = sd->u.file;
-       unsigned int offset;
-       struct page *page = NULL;
-       void *fsdata;
-       char *src, *dst;
-
-       ret = buf->ops->confirm(pipe, buf);
-       if (ret)
-               goto out;
-
-       offset = sd->pos & ~PAGE_CACHE_MASK;
-       count = sd->len;
-       if (count + offset > PAGE_CACHE_SIZE)
-               count = PAGE_CACHE_SIZE - offset;
-
-       ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0,
-                               &page, &fsdata);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
-       }
-
-       src = buf->ops->map(pipe, buf, 1);
-       dst = kmap_atomic(page, KM_USER1);
-       memcpy(dst + offset, src + buf->offset, count);
-       kunmap_atomic(dst, KM_USER1);
-       buf->ops->unmap(pipe, buf, src);
-
-       copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count,
-                                page, fsdata);
-       if (copied < 0) {
-               mlog_errno(copied);
-               ret = copied;
-               goto out;
-       }
-out:
-
-       return copied ? copied : ret;
-}
-
-static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
-                                        struct file *out,
-                                        loff_t *ppos,
-                                        size_t len,
-                                        unsigned int flags)
-{
-       int ret, err;
-       struct address_space *mapping = out->f_mapping;
-       struct inode *inode = mapping->host;
-       struct splice_desc sd = {
-               .total_len = len,
-               .flags = flags,
-               .pos = *ppos,
-               .u.file = out,
-       };
-
-       ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor);
-       if (ret > 0) {
-               *ppos += ret;
-
-               if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
-                       err = generic_osync_inode(inode, mapping,
-                                                 OSYNC_METADATA|OSYNC_DATA);
-                       if (err)
-                               ret = err;
-               }
-       }
-
-       return ret;
-}
-
 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
                                       struct file *out,
                                       loff_t *ppos,
@@ -2186,8 +2036,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
                goto out_unlock;
        }
 
-       /* ok, we're done with i_size and alloc work */
-       ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
+       ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
 
 out_unlock:
        ocfs2_rw_unlock(inode, 1);