aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c3
-rw-r--r--fs/9p/vfs_dir.c6
-rw-r--r--fs/9p/vfs_inode.c9
-rw-r--r--fs/9p/vfs_super.c20
-rw-r--r--fs/aio.c13
-rw-r--r--fs/binfmt_aout.c4
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/bio-integrity.c4
-rw-r--r--fs/ceph/Kconfig1
-rw-r--r--fs/ceph/addr.c19
-rw-r--r--fs/ceph/auth_x.c15
-rw-r--r--fs/ceph/caps.c90
-rw-r--r--fs/ceph/debugfs.c4
-rw-r--r--fs/ceph/dir.c12
-rw-r--r--fs/ceph/export.c21
-rw-r--r--fs/ceph/file.c2
-rw-r--r--fs/ceph/inode.c16
-rw-r--r--fs/ceph/locks.c14
-rw-r--r--fs/ceph/mds_client.c103
-rw-r--r--fs/ceph/mds_client.h3
-rw-r--r--fs/ceph/osd_client.c4
-rw-r--r--fs/ceph/pagelist.c12
-rw-r--r--fs/ceph/snap.c177
-rw-r--r--fs/ceph/super.h16
-rw-r--r--fs/ceph/xattr.c1
-rw-r--r--fs/char_dev.c4
-rw-r--r--fs/cifs/cifs_unicode.h18
-rw-r--r--fs/cifs/cifs_uniupr.h16
-rw-r--r--fs/cifs/cifsencrypt.c57
-rw-r--r--fs/cifs/cifsproto.h5
-rw-r--r--fs/cifs/cifssmb.c49
-rw-r--r--fs/cifs/connect.c68
-rw-r--r--fs/cifs/dir.c157
-rw-r--r--fs/cifs/file.c3
-rw-r--r--fs/cifs/inode.c34
-rw-r--r--fs/cifs/netmisc.c22
-rw-r--r--fs/coda/psdev.c4
-rw-r--r--fs/compat.c2
-rw-r--r--fs/direct-io.c4
-rw-r--r--fs/ecryptfs/crypto.c3
-rw-r--r--fs/ecryptfs/inode.c31
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/ecryptfs/kthread.c2
-rw-r--r--fs/ecryptfs/messaging.c2
-rw-r--r--fs/ecryptfs/miscdev.c2
-rw-r--r--fs/exec.c52
-rw-r--r--fs/exofs/inode.c8
-rw-r--r--fs/fcntl.c10
-rw-r--r--fs/fs-writeback.c14
-rw-r--r--fs/fuse/dev.c44
-rw-r--r--fs/fuse/file.c8
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/minix/namei.c2
-rw-r--r--fs/namespace.c23
-rw-r--r--fs/nfs/Kconfig1
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/file.c4
-rw-r--r--fs/nfs/super.c8
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/nfs4state.c28
-rw-r--r--fs/nfsd/nfsfh.h2
-rw-r--r--fs/nfsd/state.h14
-rw-r--r--fs/nfsd/vfs.c14
-rw-r--r--fs/nilfs2/the_nilfs.c1
-rw-r--r--fs/notify/Kconfig2
-rw-r--r--fs/notify/fanotify/fanotify.c3
-rw-r--r--fs/notify/fanotify/fanotify_user.c29
-rw-r--r--fs/notify/fsnotify.c68
-rw-r--r--fs/ocfs2/acl.c3
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/blockcheck.c4
-rw-r--r--fs/ocfs2/cluster/tcp.c2
-rw-r--r--fs/ocfs2/dir.c24
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h1
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c9
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c1
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c40
-rw-r--r--fs/ocfs2/dlmglue.h1
-rw-r--r--fs/ocfs2/file.c15
-rw-r--r--fs/ocfs2/inode.c6
-rw-r--r--fs/ocfs2/mmap.c8
-rw-r--r--fs/ocfs2/namei.c302
-rw-r--r--fs/ocfs2/ocfs2_fs.h37
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h8
-rw-r--r--fs/ocfs2/refcounttree.c10
-rw-r--r--fs/ocfs2/reservations.c22
-rw-r--r--fs/ocfs2/suballoc.c223
-rw-r--r--fs/ocfs2/suballoc.h21
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/ocfs2/xattr.c4
-rw-r--r--fs/proc/base.c4
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/proc/task_mmu.c7
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/reiserfs/ioctl.c7
-rw-r--r--fs/sysfs/file.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c13
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c61
-rw-r--r--fs/xfs/xfs_bmap.c14
-rw-r--r--fs/xfs/xfs_fs.h4
-rw-r--r--fs/xfs/xfs_fsops.c31
-rw-r--r--fs/xfs/xfs_fsops.h2
-rw-r--r--fs/xfs/xfs_ialloc.c16
-rw-r--r--fs/xfs/xfs_inode.c49
-rw-r--r--fs/xfs/xfs_log.c7
-rw-r--r--fs/xfs/xfs_log_cil.c271
-rw-r--r--fs/xfs/xfs_log_priv.h50
-rw-r--r--fs/xfs/xfs_trans.c5
-rw-r--r--fs/xfs/xfs_trans_priv.h3
-rw-r--r--fs/xfs/xfs_vnodeops.c13
115 files changed, 1740 insertions, 996 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 358563689064..6406f896bf95 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -242,7 +242,8 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
242 } 242 }
243 kfree(wnames); 243 kfree(wnames);
244fid_out: 244fid_out:
245 v9fs_fid_add(dentry, fid); 245 if (!IS_ERR(fid))
246 v9fs_fid_add(dentry, fid);
246err_out: 247err_out:
247 up_read(&v9ses->rename_sem); 248 up_read(&v9ses->rename_sem);
248 return fid; 249 return fid;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 16c8a2a98c1b..899f168fd19c 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -292,9 +292,11 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
292 292
293 fid = filp->private_data; 293 fid = filp->private_data;
294 P9_DPRINTK(P9_DEBUG_VFS, 294 P9_DPRINTK(P9_DEBUG_VFS,
295 "inode: %p filp: %p fid: %d\n", inode, filp, fid->fid); 295 "v9fs_dir_release: inode: %p filp: %p fid: %d\n",
296 inode, filp, fid ? fid->fid : -1);
296 filemap_write_and_wait(inode->i_mapping); 297 filemap_write_and_wait(inode->i_mapping);
297 p9_client_clunk(fid); 298 if (fid)
299 p9_client_clunk(fid);
298 return 0; 300 return 0;
299} 301}
300 302
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c7c23eab9440..9e670d527646 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -730,7 +730,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
730 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 730 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
731 goto error; 731 goto error;
732 } 732 }
733 dentry->d_op = &v9fs_cached_dentry_operations; 733 if (v9ses->cache)
734 dentry->d_op = &v9fs_cached_dentry_operations;
735 else
736 dentry->d_op = &v9fs_dentry_operations;
734 d_instantiate(dentry, inode); 737 d_instantiate(dentry, inode);
735 err = v9fs_fid_add(dentry, fid); 738 err = v9fs_fid_add(dentry, fid);
736 if (err < 0) 739 if (err < 0)
@@ -1128,6 +1131,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1128 v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); 1131 v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb);
1129 generic_fillattr(dentry->d_inode, stat); 1132 generic_fillattr(dentry->d_inode, stat);
1130 1133
1134 p9stat_free(st);
1131 kfree(st); 1135 kfree(st);
1132 return 0; 1136 return 0;
1133} 1137}
@@ -1489,6 +1493,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1489 1493
1490 retval = strnlen(buffer, buflen); 1494 retval = strnlen(buffer, buflen);
1491done: 1495done:
1496 p9stat_free(st);
1492 kfree(st); 1497 kfree(st);
1493 return retval; 1498 return retval;
1494} 1499}
@@ -1942,7 +1947,7 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1942 .unlink = v9fs_vfs_unlink, 1947 .unlink = v9fs_vfs_unlink,
1943 .mkdir = v9fs_vfs_mkdir, 1948 .mkdir = v9fs_vfs_mkdir,
1944 .rmdir = v9fs_vfs_rmdir, 1949 .rmdir = v9fs_vfs_rmdir,
1945 .mknod = v9fs_vfs_mknod_dotl, 1950 .mknod = v9fs_vfs_mknod,
1946 .rename = v9fs_vfs_rename, 1951 .rename = v9fs_vfs_rename,
1947 .getattr = v9fs_vfs_getattr, 1952 .getattr = v9fs_vfs_getattr,
1948 .setattr = v9fs_vfs_setattr, 1953 .setattr = v9fs_vfs_setattr,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index f9311077de68..1d12ba0ed3db 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -122,6 +122,10 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
122 fid = v9fs_session_init(v9ses, dev_name, data); 122 fid = v9fs_session_init(v9ses, dev_name, data);
123 if (IS_ERR(fid)) { 123 if (IS_ERR(fid)) {
124 retval = PTR_ERR(fid); 124 retval = PTR_ERR(fid);
125 /*
126 * we need to call session_close to tear down some
127 * of the data structure setup by session_init
128 */
125 goto close_session; 129 goto close_session;
126 } 130 }
127 131
@@ -144,7 +148,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
144 retval = -ENOMEM; 148 retval = -ENOMEM;
145 goto release_sb; 149 goto release_sb;
146 } 150 }
147
148 sb->s_root = root; 151 sb->s_root = root;
149 152
150 if (v9fs_proto_dotl(v9ses)) { 153 if (v9fs_proto_dotl(v9ses)) {
@@ -152,7 +155,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
152 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 155 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
153 if (IS_ERR(st)) { 156 if (IS_ERR(st)) {
154 retval = PTR_ERR(st); 157 retval = PTR_ERR(st);
155 goto clunk_fid; 158 goto release_sb;
156 } 159 }
157 160
158 v9fs_stat2inode_dotl(st, root->d_inode); 161 v9fs_stat2inode_dotl(st, root->d_inode);
@@ -162,7 +165,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
162 st = p9_client_stat(fid); 165 st = p9_client_stat(fid);
163 if (IS_ERR(st)) { 166 if (IS_ERR(st)) {
164 retval = PTR_ERR(st); 167 retval = PTR_ERR(st);
165 goto clunk_fid; 168 goto release_sb;
166 } 169 }
167 170
168 root->d_inode->i_ino = v9fs_qid2ino(&st->qid); 171 root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
@@ -174,19 +177,24 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
174 177
175 v9fs_fid_add(root, fid); 178 v9fs_fid_add(root, fid);
176 179
177P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); 180 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
178 simple_set_mnt(mnt, sb); 181 simple_set_mnt(mnt, sb);
179 return 0; 182 return 0;
180 183
181clunk_fid: 184clunk_fid:
182 p9_client_clunk(fid); 185 p9_client_clunk(fid);
183
184close_session: 186close_session:
185 v9fs_session_close(v9ses); 187 v9fs_session_close(v9ses);
186 kfree(v9ses); 188 kfree(v9ses);
187 return retval; 189 return retval;
188
189release_sb: 190release_sb:
191 /*
192 * we will do the session_close and root dentry release
193 * in the below call. But we need to clunk fid, because we haven't
194 * attached the fid to dentry so it won't get clunked
195 * automatically.
196 */
197 p9_client_clunk(fid);
190 deactivate_locked_super(sb); 198 deactivate_locked_super(sb);
191 return retval; 199 return retval;
192} 200}
diff --git a/fs/aio.c b/fs/aio.c
index 3006b5bc33d6..250b0a73c8a8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -712,8 +712,16 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
712 */ 712 */
713 ret = retry(iocb); 713 ret = retry(iocb);
714 714
715 if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) 715 if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
716 /*
717 * There's no easy way to restart the syscall since other AIO's
718 * may be already running. Just fail this IO with EINTR.
719 */
720 if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
721 ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
722 ret = -EINTR;
716 aio_complete(iocb, ret, 0); 723 aio_complete(iocb, ret, 0);
724 }
717out: 725out:
718 spin_lock_irq(&ctx->ctx_lock); 726 spin_lock_irq(&ctx->ctx_lock);
719 727
@@ -1659,6 +1667,9 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1659 if (unlikely(nr < 0)) 1667 if (unlikely(nr < 0))
1660 return -EINVAL; 1668 return -EINVAL;
1661 1669
1670 if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
1671 nr = LONG_MAX/sizeof(*iocbpp);
1672
1662 if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) 1673 if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
1663 return -EFAULT; 1674 return -EFAULT;
1664 1675
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f96eff04e11a..a6395bdb26ae 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm)
134 if (!dump_write(file, dump_start, dump_size)) 134 if (!dump_write(file, dump_start, dump_size))
135 goto end_coredump; 135 goto end_coredump;
136 } 136 }
137/* Finally dump the task struct. Not be used by gdb, but could be useful */
138 set_fs(KERNEL_DS);
139 if (!dump_write(file, current, sizeof(*current)))
140 goto end_coredump;
141end_coredump: 137end_coredump:
142 set_fs(fs); 138 set_fs(fs);
143 return has_dumped; 139 return has_dumped;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index a7528b913936..fd0cc0bf9a40 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -724,7 +724,7 @@ static int __init init_misc_binfmt(void)
724{ 724{
725 int err = register_filesystem(&bm_fs_type); 725 int err = register_filesystem(&bm_fs_type);
726 if (!err) { 726 if (!err) {
727 err = register_binfmt(&misc_format); 727 err = insert_binfmt(&misc_format);
728 if (err) 728 if (err)
729 unregister_filesystem(&bm_fs_type); 729 unregister_filesystem(&bm_fs_type);
730 } 730 }
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 612a5c38d3c1..4d0ff5ee27b8 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -413,10 +413,10 @@ int bio_integrity_prep(struct bio *bio)
413 413
414 /* Allocate kernel buffer for protection data */ 414 /* Allocate kernel buffer for protection data */
415 len = sectors * blk_integrity_tuple_size(bi); 415 len = sectors * blk_integrity_tuple_size(bi);
416 buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); 416 buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
417 if (unlikely(buf == NULL)) { 417 if (unlikely(buf == NULL)) {
418 printk(KERN_ERR "could not allocate integrity buffer\n"); 418 printk(KERN_ERR "could not allocate integrity buffer\n");
419 return -EIO; 419 return -ENOMEM;
420 } 420 }
421 421
422 end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 422 end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index bc87b9c1d27e..0fcd2640c23f 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -3,6 +3,7 @@ config CEPH_FS
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C 4 select LIBCRC32C
5 select CRYPTO_AES 5 select CRYPTO_AES
6 select CRYPTO
6 help 7 help
7 Choose Y or M here to include support for mounting the 8 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely 9 experimental Ceph distributed file system. Ceph is an extremely
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5598a0d02295..efbc604001c8 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page)
87 87
88 /* dirty the head */ 88 /* dirty the head */
89 spin_lock(&inode->i_lock); 89 spin_lock(&inode->i_lock);
90 if (ci->i_wrbuffer_ref_head == 0) 90 if (ci->i_head_snapc == NULL)
91 ci->i_head_snapc = ceph_get_snap_context(snapc); 91 ci->i_head_snapc = ceph_get_snap_context(snapc);
92 ++ci->i_wrbuffer_ref_head; 92 ++ci->i_wrbuffer_ref_head;
93 if (ci->i_wrbuffer_ref == 0) 93 if (ci->i_wrbuffer_ref == 0)
@@ -105,13 +105,7 @@ static int ceph_set_page_dirty(struct page *page)
105 spin_lock_irq(&mapping->tree_lock); 105 spin_lock_irq(&mapping->tree_lock);
106 if (page->mapping) { /* Race with truncate? */ 106 if (page->mapping) { /* Race with truncate? */
107 WARN_ON_ONCE(!PageUptodate(page)); 107 WARN_ON_ONCE(!PageUptodate(page));
108 108 account_page_dirtied(page, page->mapping);
109 if (mapping_cap_account_dirty(mapping)) {
110 __inc_zone_page_state(page, NR_FILE_DIRTY);
111 __inc_bdi_stat(mapping->backing_dev_info,
112 BDI_RECLAIMABLE);
113 task_io_account_write(PAGE_CACHE_SIZE);
114 }
115 radix_tree_tag_set(&mapping->page_tree, 109 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY); 110 page_index(page), PAGECACHE_TAG_DIRTY);
117 111
@@ -352,7 +346,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
352 break; 346 break;
353 } 347 }
354 } 348 }
355 if (!snapc && ci->i_head_snapc) { 349 if (!snapc && ci->i_wrbuffer_ref_head) {
356 snapc = ceph_get_snap_context(ci->i_head_snapc); 350 snapc = ceph_get_snap_context(ci->i_head_snapc);
357 dout(" head snapc %p has %d dirty pages\n", 351 dout(" head snapc %p has %d dirty pages\n",
358 snapc, ci->i_wrbuffer_ref_head); 352 snapc, ci->i_wrbuffer_ref_head);
@@ -417,8 +411,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
417 if (i_size < page_off + len) 411 if (i_size < page_off + len)
418 len = i_size - page_off; 412 len = i_size - page_off;
419 413
420 dout("writepage %p page %p index %lu on %llu~%u\n", 414 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
421 inode, page, page->index, page_off, len); 415 inode, page, page->index, page_off, len, snapc);
422 416
423 writeback_stat = atomic_long_inc_return(&client->writeback_count); 417 writeback_stat = atomic_long_inc_return(&client->writeback_count);
424 if (writeback_stat > 418 if (writeback_stat >
@@ -772,7 +766,8 @@ get_more_pages:
772 /* ok */ 766 /* ok */
773 if (locked_pages == 0) { 767 if (locked_pages == 0) {
774 /* prepare async write request */ 768 /* prepare async write request */
775 offset = page->index << PAGE_CACHE_SHIFT; 769 offset = (unsigned long long)page->index
770 << PAGE_CACHE_SHIFT;
776 len = wsize; 771 len = wsize;
777 req = ceph_osdc_new_request(&client->osdc, 772 req = ceph_osdc_new_request(&client->osdc,
778 &ci->i_layout, 773 &ci->i_layout,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 582e0b2caf8a..a2d002cbdec2 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -376,7 +376,7 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
376 376
377 th = get_ticket_handler(ac, service); 377 th = get_ticket_handler(ac, service);
378 378
379 if (!th) { 379 if (IS_ERR(th)) {
380 *pneed |= service; 380 *pneed |= service;
381 continue; 381 continue;
382 } 382 }
@@ -399,6 +399,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
399 struct ceph_x_ticket_handler *th = 399 struct ceph_x_ticket_handler *th =
400 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); 400 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
401 401
402 if (IS_ERR(th))
403 return PTR_ERR(th);
404
402 ceph_x_validate_tickets(ac, &need); 405 ceph_x_validate_tickets(ac, &need);
403 406
404 dout("build_request want %x have %x need %x\n", 407 dout("build_request want %x have %x need %x\n",
@@ -450,7 +453,6 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
450 return -ERANGE; 453 return -ERANGE;
451 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); 454 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
452 455
453 BUG_ON(!th);
454 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); 456 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
455 if (ret) 457 if (ret)
456 return ret; 458 return ret;
@@ -505,7 +507,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
505 507
506 case CEPHX_GET_PRINCIPAL_SESSION_KEY: 508 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
507 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); 509 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
508 BUG_ON(!th); 510 if (IS_ERR(th))
511 return PTR_ERR(th);
509 ret = ceph_x_proc_ticket_reply(ac, &th->session_key, 512 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
510 buf + sizeof(*head), end); 513 buf + sizeof(*head), end);
511 break; 514 break;
@@ -563,8 +566,8 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
563 void *end = p + sizeof(au->reply_buf); 566 void *end = p + sizeof(au->reply_buf);
564 567
565 th = get_ticket_handler(ac, au->service); 568 th = get_ticket_handler(ac, au->service);
566 if (!th) 569 if (IS_ERR(th))
567 return -EIO; /* hrm! */ 570 return PTR_ERR(th);
568 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply)); 571 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
569 if (ret < 0) 572 if (ret < 0)
570 return ret; 573 return ret;
@@ -626,7 +629,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
626 struct ceph_x_ticket_handler *th; 629 struct ceph_x_ticket_handler *th;
627 630
628 th = get_ticket_handler(ac, peer_type); 631 th = get_ticket_handler(ac, peer_type);
629 if (th && !IS_ERR(th)) 632 if (!IS_ERR(th))
630 remove_ticket_handler(ac, th); 633 remove_ticket_handler(ac, th);
631} 634}
632 635
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7bf182b03973..5e9da996a151 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -814,7 +814,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
814 used |= CEPH_CAP_PIN; 814 used |= CEPH_CAP_PIN;
815 if (ci->i_rd_ref) 815 if (ci->i_rd_ref)
816 used |= CEPH_CAP_FILE_RD; 816 used |= CEPH_CAP_FILE_RD;
817 if (ci->i_rdcache_ref || ci->i_rdcache_gen) 817 if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
818 used |= CEPH_CAP_FILE_CACHE; 818 used |= CEPH_CAP_FILE_CACHE;
819 if (ci->i_wr_ref) 819 if (ci->i_wr_ref)
820 used |= CEPH_CAP_FILE_WR; 820 used |= CEPH_CAP_FILE_WR;
@@ -1082,6 +1082,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1082 gid_t gid; 1082 gid_t gid;
1083 struct ceph_mds_session *session; 1083 struct ceph_mds_session *session;
1084 u64 xattr_version = 0; 1084 u64 xattr_version = 0;
1085 struct ceph_buffer *xattr_blob = NULL;
1085 int delayed = 0; 1086 int delayed = 0;
1086 u64 flush_tid = 0; 1087 u64 flush_tid = 0;
1087 int i; 1088 int i;
@@ -1142,6 +1143,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1142 for (i = 0; i < CEPH_CAP_BITS; i++) 1143 for (i = 0; i < CEPH_CAP_BITS; i++)
1143 if (flushing & (1 << i)) 1144 if (flushing & (1 << i))
1144 ci->i_cap_flush_tid[i] = flush_tid; 1145 ci->i_cap_flush_tid[i] = flush_tid;
1146
1147 follows = ci->i_head_snapc->seq;
1148 } else {
1149 follows = 0;
1145 } 1150 }
1146 1151
1147 keep = cap->implemented; 1152 keep = cap->implemented;
@@ -1155,14 +1160,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1155 mtime = inode->i_mtime; 1160 mtime = inode->i_mtime;
1156 atime = inode->i_atime; 1161 atime = inode->i_atime;
1157 time_warp_seq = ci->i_time_warp_seq; 1162 time_warp_seq = ci->i_time_warp_seq;
1158 follows = ci->i_snap_realm->cached_context->seq;
1159 uid = inode->i_uid; 1163 uid = inode->i_uid;
1160 gid = inode->i_gid; 1164 gid = inode->i_gid;
1161 mode = inode->i_mode; 1165 mode = inode->i_mode;
1162 1166
1163 if (dropping & CEPH_CAP_XATTR_EXCL) { 1167 if (flushing & CEPH_CAP_XATTR_EXCL) {
1164 __ceph_build_xattrs_blob(ci); 1168 __ceph_build_xattrs_blob(ci);
1165 xattr_version = ci->i_xattrs.version + 1; 1169 xattr_blob = ci->i_xattrs.blob;
1170 xattr_version = ci->i_xattrs.version;
1166 } 1171 }
1167 1172
1168 spin_unlock(&inode->i_lock); 1173 spin_unlock(&inode->i_lock);
@@ -1170,9 +1175,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1170 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1175 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1171 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, 1176 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1172 size, max_size, &mtime, &atime, time_warp_seq, 1177 size, max_size, &mtime, &atime, time_warp_seq,
1173 uid, gid, mode, 1178 uid, gid, mode, xattr_version, xattr_blob,
1174 xattr_version,
1175 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1176 follows); 1179 follows);
1177 if (ret < 0) { 1180 if (ret < 0) {
1178 dout("error sending cap msg, must requeue %p\n", inode); 1181 dout("error sending cap msg, must requeue %p\n", inode);
@@ -1192,10 +1195,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1192 * asynchronously back to the MDS once sync writes complete and dirty 1195 * asynchronously back to the MDS once sync writes complete and dirty
1193 * data is written out. 1196 * data is written out.
1194 * 1197 *
1198 * Unless @again is true, skip cap_snaps that were already sent to
1199 * the MDS (i.e., during this session).
1200 *
1195 * Called under i_lock. Takes s_mutex as needed. 1201 * Called under i_lock. Takes s_mutex as needed.
1196 */ 1202 */
1197void __ceph_flush_snaps(struct ceph_inode_info *ci, 1203void __ceph_flush_snaps(struct ceph_inode_info *ci,
1198 struct ceph_mds_session **psession) 1204 struct ceph_mds_session **psession,
1205 int again)
1199 __releases(ci->vfs_inode->i_lock) 1206 __releases(ci->vfs_inode->i_lock)
1200 __acquires(ci->vfs_inode->i_lock) 1207 __acquires(ci->vfs_inode->i_lock)
1201{ 1208{
@@ -1224,7 +1231,7 @@ retry:
1224 * pages to be written out. 1231 * pages to be written out.
1225 */ 1232 */
1226 if (capsnap->dirty_pages || capsnap->writing) 1233 if (capsnap->dirty_pages || capsnap->writing)
1227 continue; 1234 break;
1228 1235
1229 /* 1236 /*
1230 * if cap writeback already occurred, we should have dropped 1237 * if cap writeback already occurred, we should have dropped
@@ -1237,6 +1244,13 @@ retry:
1237 dout("no auth cap (migrating?), doing nothing\n"); 1244 dout("no auth cap (migrating?), doing nothing\n");
1238 goto out; 1245 goto out;
1239 } 1246 }
1247
1248 /* only flush each capsnap once */
1249 if (!again && !list_empty(&capsnap->flushing_item)) {
1250 dout("already flushed %p, skipping\n", capsnap);
1251 continue;
1252 }
1253
1240 mds = ci->i_auth_cap->session->s_mds; 1254 mds = ci->i_auth_cap->session->s_mds;
1241 mseq = ci->i_auth_cap->mseq; 1255 mseq = ci->i_auth_cap->mseq;
1242 1256
@@ -1273,8 +1287,8 @@ retry:
1273 &session->s_cap_snaps_flushing); 1287 &session->s_cap_snaps_flushing);
1274 spin_unlock(&inode->i_lock); 1288 spin_unlock(&inode->i_lock);
1275 1289
1276 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n", 1290 dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
1277 inode, capsnap, next_follows, capsnap->size); 1291 inode, capsnap, capsnap->follows, capsnap->flush_tid);
1278 send_cap_msg(session, ceph_vino(inode).ino, 0, 1292 send_cap_msg(session, ceph_vino(inode).ino, 0,
1279 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, 1293 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1280 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, 1294 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
@@ -1282,7 +1296,7 @@ retry:
1282 &capsnap->mtime, &capsnap->atime, 1296 &capsnap->mtime, &capsnap->atime,
1283 capsnap->time_warp_seq, 1297 capsnap->time_warp_seq,
1284 capsnap->uid, capsnap->gid, capsnap->mode, 1298 capsnap->uid, capsnap->gid, capsnap->mode,
1285 0, NULL, 1299 capsnap->xattr_version, capsnap->xattr_blob,
1286 capsnap->follows); 1300 capsnap->follows);
1287 1301
1288 next_follows = capsnap->follows + 1; 1302 next_follows = capsnap->follows + 1;
@@ -1311,7 +1325,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1311 struct inode *inode = &ci->vfs_inode; 1325 struct inode *inode = &ci->vfs_inode;
1312 1326
1313 spin_lock(&inode->i_lock); 1327 spin_lock(&inode->i_lock);
1314 __ceph_flush_snaps(ci, NULL); 1328 __ceph_flush_snaps(ci, NULL, 0);
1315 spin_unlock(&inode->i_lock); 1329 spin_unlock(&inode->i_lock);
1316} 1330}
1317 1331
@@ -1332,7 +1346,11 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1332 ceph_cap_string(was | mask)); 1346 ceph_cap_string(was | mask));
1333 ci->i_dirty_caps |= mask; 1347 ci->i_dirty_caps |= mask;
1334 if (was == 0) { 1348 if (was == 0) {
1335 dout(" inode %p now dirty\n", &ci->vfs_inode); 1349 if (!ci->i_head_snapc)
1350 ci->i_head_snapc = ceph_get_snap_context(
1351 ci->i_snap_realm->cached_context);
1352 dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
1353 ci->i_head_snapc);
1336 BUG_ON(!list_empty(&ci->i_dirty_item)); 1354 BUG_ON(!list_empty(&ci->i_dirty_item));
1337 spin_lock(&mdsc->cap_dirty_lock); 1355 spin_lock(&mdsc->cap_dirty_lock);
1338 list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1356 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -1470,7 +1488,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1470 1488
1471 /* flush snaps first time around only */ 1489 /* flush snaps first time around only */
1472 if (!list_empty(&ci->i_cap_snaps)) 1490 if (!list_empty(&ci->i_cap_snaps))
1473 __ceph_flush_snaps(ci, &session); 1491 __ceph_flush_snaps(ci, &session, 0);
1474 goto retry_locked; 1492 goto retry_locked;
1475retry: 1493retry:
1476 spin_lock(&inode->i_lock); 1494 spin_lock(&inode->i_lock);
@@ -1887,7 +1905,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1887 if (cap && cap->session == session) { 1905 if (cap && cap->session == session) {
1888 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, 1906 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1889 cap, capsnap); 1907 cap, capsnap);
1890 __ceph_flush_snaps(ci, &session); 1908 __ceph_flush_snaps(ci, &session, 1);
1891 } else { 1909 } else {
1892 pr_err("%p auth cap %p not mds%d ???\n", inode, 1910 pr_err("%p auth cap %p not mds%d ???\n", inode,
1893 cap, session->s_mds); 1911 cap, session->s_mds);
@@ -2190,7 +2208,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2190 2208
2191 if (ci->i_head_snapc == snapc) { 2209 if (ci->i_head_snapc == snapc) {
2192 ci->i_wrbuffer_ref_head -= nr; 2210 ci->i_wrbuffer_ref_head -= nr;
2193 if (!ci->i_wrbuffer_ref_head) { 2211 if (ci->i_wrbuffer_ref_head == 0 &&
2212 ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
2213 BUG_ON(!ci->i_head_snapc);
2194 ceph_put_snap_context(ci->i_head_snapc); 2214 ceph_put_snap_context(ci->i_head_snapc);
2195 ci->i_head_snapc = NULL; 2215 ci->i_head_snapc = NULL;
2196 } 2216 }
@@ -2263,7 +2283,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2263{ 2283{
2264 struct ceph_inode_info *ci = ceph_inode(inode); 2284 struct ceph_inode_info *ci = ceph_inode(inode);
2265 int mds = session->s_mds; 2285 int mds = session->s_mds;
2266 int seq = le32_to_cpu(grant->seq); 2286 unsigned seq = le32_to_cpu(grant->seq);
2287 unsigned issue_seq = le32_to_cpu(grant->issue_seq);
2267 int newcaps = le32_to_cpu(grant->caps); 2288 int newcaps = le32_to_cpu(grant->caps);
2268 int issued, implemented, used, wanted, dirty; 2289 int issued, implemented, used, wanted, dirty;
2269 u64 size = le64_to_cpu(grant->size); 2290 u64 size = le64_to_cpu(grant->size);
@@ -2275,8 +2296,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2275 int revoked_rdcache = 0; 2296 int revoked_rdcache = 0;
2276 int queue_invalidate = 0; 2297 int queue_invalidate = 0;
2277 2298
2278 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 2299 dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
2279 inode, cap, mds, seq, ceph_cap_string(newcaps)); 2300 inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
2280 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2301 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2281 inode->i_size); 2302 inode->i_size);
2282 2303
@@ -2372,6 +2393,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2372 } 2393 }
2373 2394
2374 cap->seq = seq; 2395 cap->seq = seq;
2396 cap->issue_seq = issue_seq;
2375 2397
2376 /* file layout may have changed */ 2398 /* file layout may have changed */
2377 ci->i_layout = grant->layout; 2399 ci->i_layout = grant->layout;
@@ -2483,6 +2505,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2483 dout(" inode %p now clean\n", inode); 2505 dout(" inode %p now clean\n", inode);
2484 BUG_ON(!list_empty(&ci->i_dirty_item)); 2506 BUG_ON(!list_empty(&ci->i_dirty_item));
2485 drop = 1; 2507 drop = 1;
2508 if (ci->i_wrbuffer_ref_head == 0) {
2509 BUG_ON(!ci->i_head_snapc);
2510 ceph_put_snap_context(ci->i_head_snapc);
2511 ci->i_head_snapc = NULL;
2512 }
2486 } else { 2513 } else {
2487 BUG_ON(list_empty(&ci->i_dirty_item)); 2514 BUG_ON(list_empty(&ci->i_dirty_item));
2488 } 2515 }
@@ -2749,15 +2776,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2749 if (op == CEPH_CAP_OP_IMPORT) 2776 if (op == CEPH_CAP_OP_IMPORT)
2750 __queue_cap_release(session, vino.ino, cap_id, 2777 __queue_cap_release(session, vino.ino, cap_id,
2751 mseq, seq); 2778 mseq, seq);
2752 2779 goto flush_cap_releases;
2753 /*
2754 * send any full release message to try to move things
2755 * along for the mds (who clearly thinks we still have this
2756 * cap).
2757 */
2758 ceph_add_cap_releases(mdsc, session);
2759 ceph_send_cap_releases(mdsc, session);
2760 goto done;
2761 } 2780 }
2762 2781
2763 /* these will work even if we don't have a cap yet */ 2782 /* these will work even if we don't have a cap yet */
@@ -2785,7 +2804,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2785 dout(" no cap on %p ino %llx.%llx from mds%d\n", 2804 dout(" no cap on %p ino %llx.%llx from mds%d\n",
2786 inode, ceph_ino(inode), ceph_snap(inode), mds); 2805 inode, ceph_ino(inode), ceph_snap(inode), mds);
2787 spin_unlock(&inode->i_lock); 2806 spin_unlock(&inode->i_lock);
2788 goto done; 2807 goto flush_cap_releases;
2789 } 2808 }
2790 2809
2791 /* note that each of these drops i_lock for us */ 2810 /* note that each of these drops i_lock for us */
@@ -2809,6 +2828,17 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2809 ceph_cap_op_name(op)); 2828 ceph_cap_op_name(op));
2810 } 2829 }
2811 2830
2831 goto done;
2832
2833flush_cap_releases:
2834 /*
2835 * send any full release message to try to move things
2836 * along for the mds (who clearly thinks we still have this
2837 * cap).
2838 */
2839 ceph_add_cap_releases(mdsc, session);
2840 ceph_send_cap_releases(mdsc, session);
2841
2812done: 2842done:
2813 mutex_unlock(&session->s_mutex); 2843 mutex_unlock(&session->s_mutex);
2814done_unlocked: 2844done_unlocked:
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 360c4f22718d..6fd8b20a8611 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -171,6 +171,8 @@ static int mdsc_show(struct seq_file *s, void *p)
171 } else if (req->r_dentry) { 171 } else if (req->r_dentry) {
172 path = ceph_mdsc_build_path(req->r_dentry, &pathlen, 172 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
173 &pathbase, 0); 173 &pathbase, 0);
174 if (IS_ERR(path))
175 path = NULL;
174 spin_lock(&req->r_dentry->d_lock); 176 spin_lock(&req->r_dentry->d_lock);
175 seq_printf(s, " #%llx/%.*s (%s)", 177 seq_printf(s, " #%llx/%.*s (%s)",
176 ceph_ino(req->r_dentry->d_parent->d_inode), 178 ceph_ino(req->r_dentry->d_parent->d_inode),
@@ -187,6 +189,8 @@ static int mdsc_show(struct seq_file *s, void *p)
187 if (req->r_old_dentry) { 189 if (req->r_old_dentry) {
188 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, 190 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
189 &pathbase, 0); 191 &pathbase, 0);
192 if (IS_ERR(path))
193 path = NULL;
190 spin_lock(&req->r_old_dentry->d_lock); 194 spin_lock(&req->r_old_dentry->d_lock);
191 seq_printf(s, " #%llx/%.*s (%s)", 195 seq_printf(s, " #%llx/%.*s (%s)",
192 ceph_ino(req->r_old_dentry->d_parent->d_inode), 196 ceph_ino(req->r_old_dentry->d_parent->d_inode),
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 67bbb41d5526..a1986eb52045 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -46,7 +46,7 @@ int ceph_init_dentry(struct dentry *dentry)
46 else 46 else
47 dentry->d_op = &ceph_snap_dentry_ops; 47 dentry->d_op = &ceph_snap_dentry_ops;
48 48
49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS); 49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
50 if (!di) 50 if (!di)
51 return -ENOMEM; /* oh well */ 51 return -ENOMEM; /* oh well */
52 52
@@ -1021,11 +1021,15 @@ out_touch:
1021static void ceph_dentry_release(struct dentry *dentry) 1021static void ceph_dentry_release(struct dentry *dentry)
1022{ 1022{
1023 struct ceph_dentry_info *di = ceph_dentry(dentry); 1023 struct ceph_dentry_info *di = ceph_dentry(dentry);
1024 struct inode *parent_inode = dentry->d_parent->d_inode; 1024 struct inode *parent_inode = NULL;
1025 u64 snapid = ceph_snap(parent_inode); 1025 u64 snapid = CEPH_NOSNAP;
1026 1026
1027 if (!IS_ROOT(dentry)) {
1028 parent_inode = dentry->d_parent->d_inode;
1029 if (parent_inode)
1030 snapid = ceph_snap(parent_inode);
1031 }
1027 dout("dentry_release %p parent %p\n", dentry, parent_inode); 1032 dout("dentry_release %p parent %p\n", dentry, parent_inode);
1028
1029 if (parent_inode && snapid != CEPH_SNAPDIR) { 1033 if (parent_inode && snapid != CEPH_SNAPDIR) {
1030 struct ceph_inode_info *ci = ceph_inode(parent_inode); 1034 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1031 1035
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 4480cb1c63e7..e38423e82f2e 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -42,32 +42,37 @@ struct ceph_nfs_confh {
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, 42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable) 43 int connectable)
44{ 44{
45 int type;
45 struct ceph_nfs_fh *fh = (void *)rawfh; 46 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh; 47 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent; 48 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode; 49 struct inode *inode = dentry->d_inode;
49 int type; 50 int connected_handle_length = sizeof(*cfh)/4;
51 int handle_length = sizeof(*fh)/4;
50 52
51 /* don't re-export snaps */ 53 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP) 54 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL; 55 return -EINVAL;
54 56
55 if (*max_len >= sizeof(*cfh)) { 57 if (*max_len >= connected_handle_length) {
56 dout("encode_fh %p connectable\n", dentry); 58 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode); 59 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode); 60 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash; 61 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh); 62 *max_len = connected_handle_length;
61 type = 2; 63 type = 2;
62 } else if (*max_len > sizeof(*fh)) { 64 } else if (*max_len >= handle_length) {
63 if (connectable) 65 if (connectable) {
64 return -ENOSPC; 66 *max_len = connected_handle_length;
67 return 255;
68 }
65 dout("encode_fh %p\n", dentry); 69 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode); 70 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh); 71 *max_len = handle_length;
68 type = 1; 72 type = 1;
69 } else { 73 } else {
70 return -ENOSPC; 74 *max_len = handle_length;
75 return 255;
71 } 76 }
72 return type; 77 return type;
73} 78}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8c044a4f0457..66e4da6dba22 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -697,7 +697,7 @@ more:
697 * start_request so that a tid has been assigned. 697 * start_request so that a tid has been assigned.
698 */ 698 */
699 spin_lock(&ci->i_unsafe_lock); 699 spin_lock(&ci->i_unsafe_lock);
700 list_add(&ci->i_unsafe_writes, &req->r_unsafe_item); 700 list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
701 spin_unlock(&ci->i_unsafe_lock); 701 spin_unlock(&ci->i_unsafe_lock);
702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
703 } 703 }
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5d893d31e399..62377ec37edf 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -677,6 +677,7 @@ static int fill_inode(struct inode *inode,
677 if (ci->i_files == 0 && ci->i_subdirs == 0 && 677 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
678 ceph_snap(inode) == CEPH_NOSNAP && 678 ceph_snap(inode) == CEPH_NOSNAP &&
679 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && 679 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
680 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
680 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 681 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
681 dout(" marking %p complete (empty)\n", inode); 682 dout(" marking %p complete (empty)\n", inode);
682 ci->i_ceph_flags |= CEPH_I_COMPLETE; 683 ci->i_ceph_flags |= CEPH_I_COMPLETE;
@@ -844,7 +845,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
844 * the caller) if we fail. 845 * the caller) if we fail.
845 */ 846 */
846static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, 847static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
847 bool *prehash) 848 bool *prehash, bool set_offset)
848{ 849{
849 struct dentry *realdn; 850 struct dentry *realdn;
850 851
@@ -876,7 +877,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
876 } 877 }
877 if ((!prehash || *prehash) && d_unhashed(dn)) 878 if ((!prehash || *prehash) && d_unhashed(dn))
878 d_rehash(dn); 879 d_rehash(dn);
879 ceph_set_dentry_offset(dn); 880 if (set_offset)
881 ceph_set_dentry_offset(dn);
880out: 882out:
881 return dn; 883 return dn;
882} 884}
@@ -1061,7 +1063,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1061 d_delete(dn); 1063 d_delete(dn);
1062 goto done; 1064 goto done;
1063 } 1065 }
1064 dn = splice_dentry(dn, in, &have_lease); 1066 dn = splice_dentry(dn, in, &have_lease, true);
1065 if (IS_ERR(dn)) { 1067 if (IS_ERR(dn)) {
1066 err = PTR_ERR(dn); 1068 err = PTR_ERR(dn);
1067 goto done; 1069 goto done;
@@ -1104,7 +1106,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1104 goto done; 1106 goto done;
1105 } 1107 }
1106 dout(" linking snapped dir %p to dn %p\n", in, dn); 1108 dout(" linking snapped dir %p to dn %p\n", in, dn);
1107 dn = splice_dentry(dn, in, NULL); 1109 dn = splice_dentry(dn, in, NULL, true);
1108 if (IS_ERR(dn)) { 1110 if (IS_ERR(dn)) {
1109 err = PTR_ERR(dn); 1111 err = PTR_ERR(dn);
1110 goto done; 1112 goto done;
@@ -1229,14 +1231,14 @@ retry_lookup:
1229 in = dn->d_inode; 1231 in = dn->d_inode;
1230 } else { 1232 } else {
1231 in = ceph_get_inode(parent->d_sb, vino); 1233 in = ceph_get_inode(parent->d_sb, vino);
1232 if (in == NULL) { 1234 if (IS_ERR(in)) {
1233 dout("new_inode badness\n"); 1235 dout("new_inode badness\n");
1234 d_delete(dn); 1236 d_delete(dn);
1235 dput(dn); 1237 dput(dn);
1236 err = -ENOMEM; 1238 err = PTR_ERR(in);
1237 goto out; 1239 goto out;
1238 } 1240 }
1239 dn = splice_dentry(dn, in, NULL); 1241 dn = splice_dentry(dn, in, NULL, false);
1240 if (IS_ERR(dn)) 1242 if (IS_ERR(dn))
1241 dn = NULL; 1243 dn = NULL;
1242 } 1244 }
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ae85af06454f..ff4e753aae92 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -82,7 +82,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
82 length = fl->fl_end - fl->fl_start + 1; 82 length = fl->fl_end - fl->fl_start + 1;
83 83
84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, 84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
85 (u64)fl->fl_pid, (u64)fl->fl_nspid, 85 (u64)fl->fl_pid,
86 (u64)(unsigned long)fl->fl_nspid,
86 lock_cmd, fl->fl_start, 87 lock_cmd, fl->fl_start,
87 length, wait); 88 length, wait);
88 if (!err) { 89 if (!err) {
@@ -92,7 +93,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
92 /* undo! This should only happen if the kernel detects 93 /* undo! This should only happen if the kernel detects
93 * local deadlock. */ 94 * local deadlock. */
94 ceph_lock_message(CEPH_LOCK_FCNTL, op, file, 95 ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
95 (u64)fl->fl_pid, (u64)fl->fl_nspid, 96 (u64)fl->fl_pid,
97 (u64)(unsigned long)fl->fl_nspid,
96 CEPH_LOCK_UNLOCK, fl->fl_start, 98 CEPH_LOCK_UNLOCK, fl->fl_start,
97 length, 0); 99 length, 0);
98 dout("got %d on posix_lock_file, undid lock", err); 100 dout("got %d on posix_lock_file, undid lock", err);
@@ -132,7 +134,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
132 length = fl->fl_end - fl->fl_start + 1; 134 length = fl->fl_end - fl->fl_start + 1;
133 135
134 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, 136 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
135 file, (u64)fl->fl_pid, (u64)fl->fl_nspid, 137 file, (u64)fl->fl_pid,
138 (u64)(unsigned long)fl->fl_nspid,
136 lock_cmd, fl->fl_start, 139 lock_cmd, fl->fl_start,
137 length, wait); 140 length, wait);
138 if (!err) { 141 if (!err) {
@@ -141,7 +144,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
141 ceph_lock_message(CEPH_LOCK_FLOCK, 144 ceph_lock_message(CEPH_LOCK_FLOCK,
142 CEPH_MDS_OP_SETFILELOCK, 145 CEPH_MDS_OP_SETFILELOCK,
143 file, (u64)fl->fl_pid, 146 file, (u64)fl->fl_pid,
144 (u64)fl->fl_nspid, 147 (u64)(unsigned long)fl->fl_nspid,
145 CEPH_LOCK_UNLOCK, fl->fl_start, 148 CEPH_LOCK_UNLOCK, fl->fl_start,
146 length, 0); 149 length, 0);
147 dout("got %d on flock_lock_file_wait, undid lock", err); 150 dout("got %d on flock_lock_file_wait, undid lock", err);
@@ -235,7 +238,8 @@ int lock_to_ceph_filelock(struct file_lock *lock,
235 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); 238 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
236 cephlock->client = cpu_to_le64(0); 239 cephlock->client = cpu_to_le64(0);
237 cephlock->pid = cpu_to_le64(lock->fl_pid); 240 cephlock->pid = cpu_to_le64(lock->fl_pid);
238 cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid); 241 cephlock->pid_namespace =
242 cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
239 243
240 switch (lock->fl_type) { 244 switch (lock->fl_type) {
241 case F_RDLCK: 245 case F_RDLCK:
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a75ddbf9fe37..fad95f8f2608 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -560,6 +560,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
560 * 560 *
561 * Called under mdsc->mutex. 561 * Called under mdsc->mutex.
562 */ 562 */
563struct dentry *get_nonsnap_parent(struct dentry *dentry)
564{
565 while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
566 dentry = dentry->d_parent;
567 return dentry;
568}
569
563static int __choose_mds(struct ceph_mds_client *mdsc, 570static int __choose_mds(struct ceph_mds_client *mdsc,
564 struct ceph_mds_request *req) 571 struct ceph_mds_request *req)
565{ 572{
@@ -590,14 +597,29 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
590 if (req->r_inode) { 597 if (req->r_inode) {
591 inode = req->r_inode; 598 inode = req->r_inode;
592 } else if (req->r_dentry) { 599 } else if (req->r_dentry) {
593 if (req->r_dentry->d_inode) { 600 struct inode *dir = req->r_dentry->d_parent->d_inode;
601
602 if (dir->i_sb != mdsc->client->sb) {
603 /* not this fs! */
604 inode = req->r_dentry->d_inode;
605 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
606 /* direct snapped/virtual snapdir requests
607 * based on parent dir inode */
608 struct dentry *dn =
609 get_nonsnap_parent(req->r_dentry->d_parent);
610 inode = dn->d_inode;
611 dout("__choose_mds using nonsnap parent %p\n", inode);
612 } else if (req->r_dentry->d_inode) {
613 /* dentry target */
594 inode = req->r_dentry->d_inode; 614 inode = req->r_dentry->d_inode;
595 } else { 615 } else {
596 inode = req->r_dentry->d_parent->d_inode; 616 /* dir + name */
617 inode = dir;
597 hash = req->r_dentry->d_name.hash; 618 hash = req->r_dentry->d_name.hash;
598 is_hash = true; 619 is_hash = true;
599 } 620 }
600 } 621 }
622
601 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, 623 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
602 (int)hash, mode); 624 (int)hash, mode);
603 if (!inode) 625 if (!inode)
@@ -2208,7 +2230,7 @@ static void handle_session(struct ceph_mds_session *session,
2208 pr_info("mds%d reconnect denied\n", session->s_mds); 2230 pr_info("mds%d reconnect denied\n", session->s_mds);
2209 remove_session_caps(session); 2231 remove_session_caps(session);
2210 wake = 1; /* for good measure */ 2232 wake = 1; /* for good measure */
2211 complete_all(&mdsc->session_close_waiters); 2233 wake_up_all(&mdsc->session_close_wq);
2212 kick_requests(mdsc, mds); 2234 kick_requests(mdsc, mds);
2213 break; 2235 break;
2214 2236
@@ -2302,7 +2324,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2302 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); 2324 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2303 if (IS_ERR(path)) { 2325 if (IS_ERR(path)) {
2304 err = PTR_ERR(path); 2326 err = PTR_ERR(path);
2305 BUG_ON(err); 2327 goto out_dput;
2306 } 2328 }
2307 } else { 2329 } else {
2308 path = NULL; 2330 path = NULL;
@@ -2310,7 +2332,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2310 } 2332 }
2311 err = ceph_pagelist_encode_string(pagelist, path, pathlen); 2333 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2312 if (err) 2334 if (err)
2313 goto out; 2335 goto out_free;
2314 2336
2315 spin_lock(&inode->i_lock); 2337 spin_lock(&inode->i_lock);
2316 cap->seq = 0; /* reset cap seq */ 2338 cap->seq = 0; /* reset cap seq */
@@ -2352,10 +2374,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2352 num_fcntl_locks, 2374 num_fcntl_locks,
2353 num_flock_locks); 2375 num_flock_locks);
2354 unlock_kernel(); 2376 unlock_kernel();
2377 } else {
2378 err = ceph_pagelist_append(pagelist, &rec, reclen);
2355 } 2379 }
2356 2380
2357out: 2381out_free:
2358 kfree(path); 2382 kfree(path);
2383out_dput:
2359 dput(dentry); 2384 dput(dentry);
2360 return err; 2385 return err;
2361} 2386}
@@ -2876,7 +2901,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2876 return -ENOMEM; 2901 return -ENOMEM;
2877 2902
2878 init_completion(&mdsc->safe_umount_waiters); 2903 init_completion(&mdsc->safe_umount_waiters);
2879 init_completion(&mdsc->session_close_waiters); 2904 init_waitqueue_head(&mdsc->session_close_wq);
2880 INIT_LIST_HEAD(&mdsc->waiting_for_map); 2905 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2881 mdsc->sessions = NULL; 2906 mdsc->sessions = NULL;
2882 mdsc->max_sessions = 0; 2907 mdsc->max_sessions = 0;
@@ -3021,6 +3046,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3021 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); 3046 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
3022} 3047}
3023 3048
3049/*
3050 * true if all sessions are closed, or we force unmount
3051 */
3052bool done_closing_sessions(struct ceph_mds_client *mdsc)
3053{
3054 int i, n = 0;
3055
3056 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
3057 return true;
3058
3059 mutex_lock(&mdsc->mutex);
3060 for (i = 0; i < mdsc->max_sessions; i++)
3061 if (mdsc->sessions[i])
3062 n++;
3063 mutex_unlock(&mdsc->mutex);
3064 return n == 0;
3065}
3024 3066
3025/* 3067/*
3026 * called after sb is ro. 3068 * called after sb is ro.
@@ -3029,45 +3071,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3029{ 3071{
3030 struct ceph_mds_session *session; 3072 struct ceph_mds_session *session;
3031 int i; 3073 int i;
3032 int n;
3033 struct ceph_client *client = mdsc->client; 3074 struct ceph_client *client = mdsc->client;
3034 unsigned long started, timeout = client->mount_args->mount_timeout * HZ; 3075 unsigned long timeout = client->mount_args->mount_timeout * HZ;
3035 3076
3036 dout("close_sessions\n"); 3077 dout("close_sessions\n");
3037 3078
3038 mutex_lock(&mdsc->mutex);
3039
3040 /* close sessions */ 3079 /* close sessions */
3041 started = jiffies; 3080 mutex_lock(&mdsc->mutex);
3042 while (time_before(jiffies, started + timeout)) { 3081 for (i = 0; i < mdsc->max_sessions; i++) {
3043 dout("closing sessions\n"); 3082 session = __ceph_lookup_mds_session(mdsc, i);
3044 n = 0; 3083 if (!session)
3045 for (i = 0; i < mdsc->max_sessions; i++) { 3084 continue;
3046 session = __ceph_lookup_mds_session(mdsc, i);
3047 if (!session)
3048 continue;
3049 mutex_unlock(&mdsc->mutex);
3050 mutex_lock(&session->s_mutex);
3051 __close_session(mdsc, session);
3052 mutex_unlock(&session->s_mutex);
3053 ceph_put_mds_session(session);
3054 mutex_lock(&mdsc->mutex);
3055 n++;
3056 }
3057 if (n == 0)
3058 break;
3059
3060 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
3061 break;
3062
3063 dout("waiting for sessions to close\n");
3064 mutex_unlock(&mdsc->mutex); 3085 mutex_unlock(&mdsc->mutex);
3065 wait_for_completion_timeout(&mdsc->session_close_waiters, 3086 mutex_lock(&session->s_mutex);
3066 timeout); 3087 __close_session(mdsc, session);
3088 mutex_unlock(&session->s_mutex);
3089 ceph_put_mds_session(session);
3067 mutex_lock(&mdsc->mutex); 3090 mutex_lock(&mdsc->mutex);
3068 } 3091 }
3092 mutex_unlock(&mdsc->mutex);
3093
3094 dout("waiting for sessions to close\n");
3095 wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
3096 timeout);
3069 3097
3070 /* tear down remaining sessions */ 3098 /* tear down remaining sessions */
3099 mutex_lock(&mdsc->mutex);
3071 for (i = 0; i < mdsc->max_sessions; i++) { 3100 for (i = 0; i < mdsc->max_sessions; i++) {
3072 if (mdsc->sessions[i]) { 3101 if (mdsc->sessions[i]) {
3073 session = get_session(mdsc->sessions[i]); 3102 session = get_session(mdsc->sessions[i]);
@@ -3080,9 +3109,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3080 mutex_lock(&mdsc->mutex); 3109 mutex_lock(&mdsc->mutex);
3081 } 3110 }
3082 } 3111 }
3083
3084 WARN_ON(!list_empty(&mdsc->cap_delay_list)); 3112 WARN_ON(!list_empty(&mdsc->cap_delay_list));
3085
3086 mutex_unlock(&mdsc->mutex); 3113 mutex_unlock(&mdsc->mutex);
3087 3114
3088 ceph_cleanup_empty_realms(mdsc); 3115 ceph_cleanup_empty_realms(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ab7e89f5e344..c98267ce6d2a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -234,7 +234,8 @@ struct ceph_mds_client {
234 struct mutex mutex; /* all nested structures */ 234 struct mutex mutex; /* all nested structures */
235 235
236 struct ceph_mdsmap *mdsmap; 236 struct ceph_mdsmap *mdsmap;
237 struct completion safe_umount_waiters, session_close_waiters; 237 struct completion safe_umount_waiters;
238 wait_queue_head_t session_close_wq;
238 struct list_head waiting_for_map; 239 struct list_head waiting_for_map;
239 240
240 struct ceph_mds_session **sessions; /* NULL for mds if no session */ 241 struct ceph_mds_session **sessions; /* NULL for mds if no session */
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index bed6391e52c7..3b5571b8ce22 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -549,7 +549,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
549 */ 549 */
550static void __cancel_request(struct ceph_osd_request *req) 550static void __cancel_request(struct ceph_osd_request *req)
551{ 551{
552 if (req->r_sent) { 552 if (req->r_sent && req->r_osd) {
553 ceph_con_revoke(&req->r_osd->o_con, req->r_request); 553 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
554 req->r_sent = 0; 554 req->r_sent = 0;
555 } 555 }
@@ -661,7 +661,7 @@ static int __send_request(struct ceph_osd_client *osdc,
661 reqhead->reassert_version = req->r_reassert_version; 661 reqhead->reassert_version = req->r_reassert_version;
662 662
663 req->r_stamp = jiffies; 663 req->r_stamp = jiffies;
664 list_move_tail(&osdc->req_lru, &req->r_req_lru_item); 664 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
665 665
666 ceph_msg_get(req->r_request); /* send consumes a ref */ 666 ceph_msg_get(req->r_request); /* send consumes a ref */
667 ceph_con_send(&req->r_osd->o_con, req->r_request); 667 ceph_con_send(&req->r_osd->o_con, req->r_request);
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index b6859f47d364..46a368b6dce5 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -5,10 +5,18 @@
5 5
6#include "pagelist.h" 6#include "pagelist.h"
7 7
8static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
9{
10 struct page *page = list_entry(pl->head.prev, struct page,
11 lru);
12 kunmap(page);
13}
14
8int ceph_pagelist_release(struct ceph_pagelist *pl) 15int ceph_pagelist_release(struct ceph_pagelist *pl)
9{ 16{
10 if (pl->mapped_tail) 17 if (pl->mapped_tail)
11 kunmap(pl->mapped_tail); 18 ceph_pagelist_unmap_tail(pl);
19
12 while (!list_empty(&pl->head)) { 20 while (!list_empty(&pl->head)) {
13 struct page *page = list_first_entry(&pl->head, struct page, 21 struct page *page = list_first_entry(&pl->head, struct page,
14 lru); 22 lru);
@@ -26,7 +34,7 @@ static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
26 pl->room += PAGE_SIZE; 34 pl->room += PAGE_SIZE;
27 list_add_tail(&page->lru, &pl->head); 35 list_add_tail(&page->lru, &pl->head);
28 if (pl->mapped_tail) 36 if (pl->mapped_tail)
29 kunmap(pl->mapped_tail); 37 ceph_pagelist_unmap_tail(pl);
30 pl->mapped_tail = kmap(page); 38 pl->mapped_tail = kmap(page);
31 return 0; 39 return 0;
32} 40}
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index c0b26b6badba..190b6c4a6f2b 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -119,6 +119,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
119 INIT_LIST_HEAD(&realm->children); 119 INIT_LIST_HEAD(&realm->children);
120 INIT_LIST_HEAD(&realm->child_item); 120 INIT_LIST_HEAD(&realm->child_item);
121 INIT_LIST_HEAD(&realm->empty_item); 121 INIT_LIST_HEAD(&realm->empty_item);
122 INIT_LIST_HEAD(&realm->dirty_item);
122 INIT_LIST_HEAD(&realm->inodes_with_caps); 123 INIT_LIST_HEAD(&realm->inodes_with_caps);
123 spin_lock_init(&realm->inodes_with_caps_lock); 124 spin_lock_init(&realm->inodes_with_caps_lock);
124 __insert_snap_realm(&mdsc->snap_realms, realm); 125 __insert_snap_realm(&mdsc->snap_realms, realm);
@@ -435,7 +436,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
435{ 436{
436 struct inode *inode = &ci->vfs_inode; 437 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap; 438 struct ceph_cap_snap *capsnap;
438 int used; 439 int used, dirty;
439 440
440 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); 441 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
441 if (!capsnap) { 442 if (!capsnap) {
@@ -445,6 +446,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
445 446
446 spin_lock(&inode->i_lock); 447 spin_lock(&inode->i_lock);
447 used = __ceph_caps_used(ci); 448 used = __ceph_caps_used(ci);
449 dirty = __ceph_caps_dirty(ci);
448 if (__ceph_have_pending_cap_snap(ci)) { 450 if (__ceph_have_pending_cap_snap(ci)) {
449 /* there is no point in queuing multiple "pending" cap_snaps, 451 /* there is no point in queuing multiple "pending" cap_snaps,
450 as no new writes are allowed to start when pending, so any 452 as no new writes are allowed to start when pending, so any
@@ -452,27 +454,37 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
452 cap_snap. lucky us. */ 454 cap_snap. lucky us. */
453 dout("queue_cap_snap %p already pending\n", inode); 455 dout("queue_cap_snap %p already pending\n", inode);
454 kfree(capsnap); 456 kfree(capsnap);
455 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { 457 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) ||
458 (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
459 CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
456 struct ceph_snap_context *snapc = ci->i_head_snapc; 460 struct ceph_snap_context *snapc = ci->i_head_snapc;
457 461
462 dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
463 capsnap, snapc);
458 igrab(inode); 464 igrab(inode);
459 465
460 atomic_set(&capsnap->nref, 1); 466 atomic_set(&capsnap->nref, 1);
461 capsnap->ci = ci; 467 capsnap->ci = ci;
462 INIT_LIST_HEAD(&capsnap->ci_item); 468 INIT_LIST_HEAD(&capsnap->ci_item);
463 INIT_LIST_HEAD(&capsnap->flushing_item); 469 INIT_LIST_HEAD(&capsnap->flushing_item);
464 470
465 capsnap->follows = snapc->seq - 1; 471 capsnap->follows = snapc->seq;
466 capsnap->issued = __ceph_caps_issued(ci, NULL); 472 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci); 473 capsnap->dirty = dirty;
468 474
469 capsnap->mode = inode->i_mode; 475 capsnap->mode = inode->i_mode;
470 capsnap->uid = inode->i_uid; 476 capsnap->uid = inode->i_uid;
471 capsnap->gid = inode->i_gid; 477 capsnap->gid = inode->i_gid;
472 478
473 /* fixme? */ 479 if (dirty & CEPH_CAP_XATTR_EXCL) {
474 capsnap->xattr_blob = NULL; 480 __ceph_build_xattrs_blob(ci);
475 capsnap->xattr_len = 0; 481 capsnap->xattr_blob =
482 ceph_buffer_get(ci->i_xattrs.blob);
483 capsnap->xattr_version = ci->i_xattrs.version;
484 } else {
485 capsnap->xattr_blob = NULL;
486 capsnap->xattr_version = 0;
487 }
476 488
477 /* dirty page count moved from _head to this cap_snap; 489 /* dirty page count moved from _head to this cap_snap;
478 all subsequent writes page dirties occur _after_ this 490 all subsequent writes page dirties occur _after_ this
@@ -480,7 +492,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head; 492 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0; 493 ci->i_wrbuffer_ref_head = 0;
482 capsnap->context = snapc; 494 capsnap->context = snapc;
483 ci->i_head_snapc = NULL; 495 ci->i_head_snapc =
496 ceph_get_snap_context(ci->i_snap_realm->cached_context);
497 dout(" new snapc is %p\n", ci->i_head_snapc);
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); 498 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485 499
486 if (used & CEPH_CAP_FILE_WR) { 500 if (used & CEPH_CAP_FILE_WR) {
@@ -539,6 +553,41 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
539 return 1; /* caller may want to ceph_flush_snaps */ 553 return 1; /* caller may want to ceph_flush_snaps */
540} 554}
541 555
556/*
557 * Queue cap_snaps for snap writeback for this realm and its children.
558 * Called under snap_rwsem, so realm topology won't change.
559 */
560static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
561{
562 struct ceph_inode_info *ci;
563 struct inode *lastinode = NULL;
564 struct ceph_snap_realm *child;
565
566 dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
567
568 spin_lock(&realm->inodes_with_caps_lock);
569 list_for_each_entry(ci, &realm->inodes_with_caps,
570 i_snap_realm_item) {
571 struct inode *inode = igrab(&ci->vfs_inode);
572 if (!inode)
573 continue;
574 spin_unlock(&realm->inodes_with_caps_lock);
575 if (lastinode)
576 iput(lastinode);
577 lastinode = inode;
578 ceph_queue_cap_snap(ci);
579 spin_lock(&realm->inodes_with_caps_lock);
580 }
581 spin_unlock(&realm->inodes_with_caps_lock);
582 if (lastinode)
583 iput(lastinode);
584
585 dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
586 list_for_each_entry(child, &realm->children, child_item)
587 queue_realm_cap_snaps(child);
588
589 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
590}
542 591
543/* 592/*
544 * Parse and apply a snapblob "snap trace" from the MDS. This specifies 593 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
@@ -556,6 +605,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
556 struct ceph_snap_realm *realm; 605 struct ceph_snap_realm *realm;
557 int invalidate = 0; 606 int invalidate = 0;
558 int err = -ENOMEM; 607 int err = -ENOMEM;
608 LIST_HEAD(dirty_realms);
559 609
560 dout("update_snap_trace deletion=%d\n", deletion); 610 dout("update_snap_trace deletion=%d\n", deletion);
561more: 611more:
@@ -578,45 +628,6 @@ more:
578 } 628 }
579 } 629 }
580 630
581 if (le64_to_cpu(ri->seq) > realm->seq) {
582 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
583 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
584 /*
585 * if the realm seq has changed, queue a cap_snap for every
586 * inode with open caps. we do this _before_ we update
587 * the realm info so that we prepare for writeback under the
588 * _previous_ snap context.
589 *
590 * ...unless it's a snap deletion!
591 */
592 if (!deletion) {
593 struct ceph_inode_info *ci;
594 struct inode *lastinode = NULL;
595
596 spin_lock(&realm->inodes_with_caps_lock);
597 list_for_each_entry(ci, &realm->inodes_with_caps,
598 i_snap_realm_item) {
599 struct inode *inode = igrab(&ci->vfs_inode);
600 if (!inode)
601 continue;
602 spin_unlock(&realm->inodes_with_caps_lock);
603 if (lastinode)
604 iput(lastinode);
605 lastinode = inode;
606 ceph_queue_cap_snap(ci);
607 spin_lock(&realm->inodes_with_caps_lock);
608 }
609 spin_unlock(&realm->inodes_with_caps_lock);
610 if (lastinode)
611 iput(lastinode);
612 dout("update_snap_trace cap_snaps queued\n");
613 }
614
615 } else {
616 dout("update_snap_trace %llx %p seq %lld unchanged\n",
617 realm->ino, realm, realm->seq);
618 }
619
620 /* ensure the parent is correct */ 631 /* ensure the parent is correct */
621 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); 632 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
622 if (err < 0) 633 if (err < 0)
@@ -624,6 +635,8 @@ more:
624 invalidate += err; 635 invalidate += err;
625 636
626 if (le64_to_cpu(ri->seq) > realm->seq) { 637 if (le64_to_cpu(ri->seq) > realm->seq) {
638 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
639 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
627 /* update realm parameters, snap lists */ 640 /* update realm parameters, snap lists */
628 realm->seq = le64_to_cpu(ri->seq); 641 realm->seq = le64_to_cpu(ri->seq);
629 realm->created = le64_to_cpu(ri->created); 642 realm->created = le64_to_cpu(ri->created);
@@ -641,9 +654,17 @@ more:
641 if (err < 0) 654 if (err < 0)
642 goto fail; 655 goto fail;
643 656
657 /* queue realm for cap_snap creation */
658 list_add(&realm->dirty_item, &dirty_realms);
659
644 invalidate = 1; 660 invalidate = 1;
645 } else if (!realm->cached_context) { 661 } else if (!realm->cached_context) {
662 dout("update_snap_trace %llx %p seq %lld new\n",
663 realm->ino, realm, realm->seq);
646 invalidate = 1; 664 invalidate = 1;
665 } else {
666 dout("update_snap_trace %llx %p seq %lld unchanged\n",
667 realm->ino, realm, realm->seq);
647 } 668 }
648 669
649 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, 670 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
@@ -656,6 +677,14 @@ more:
656 if (invalidate) 677 if (invalidate)
657 rebuild_snap_realms(realm); 678 rebuild_snap_realms(realm);
658 679
680 /*
681 * queue cap snaps _after_ we've built the new snap contexts,
682 * so that i_head_snapc can be set appropriately.
683 */
684 list_for_each_entry(realm, &dirty_realms, dirty_item) {
685 queue_realm_cap_snaps(realm);
686 }
687
659 __cleanup_empty_realms(mdsc); 688 __cleanup_empty_realms(mdsc);
660 return 0; 689 return 0;
661 690
@@ -688,7 +717,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
688 igrab(inode); 717 igrab(inode);
689 spin_unlock(&mdsc->snap_flush_lock); 718 spin_unlock(&mdsc->snap_flush_lock);
690 spin_lock(&inode->i_lock); 719 spin_lock(&inode->i_lock);
691 __ceph_flush_snaps(ci, &session); 720 __ceph_flush_snaps(ci, &session, 0);
692 spin_unlock(&inode->i_lock); 721 spin_unlock(&inode->i_lock);
693 iput(inode); 722 iput(inode);
694 spin_lock(&mdsc->snap_flush_lock); 723 spin_lock(&mdsc->snap_flush_lock);
@@ -789,6 +818,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
789 }; 818 };
790 struct inode *inode = ceph_find_inode(sb, vino); 819 struct inode *inode = ceph_find_inode(sb, vino);
791 struct ceph_inode_info *ci; 820 struct ceph_inode_info *ci;
821 struct ceph_snap_realm *oldrealm;
792 822
793 if (!inode) 823 if (!inode)
794 continue; 824 continue;
@@ -814,18 +844,19 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
814 dout(" will move %p to split realm %llx %p\n", 844 dout(" will move %p to split realm %llx %p\n",
815 inode, realm->ino, realm); 845 inode, realm->ino, realm);
816 /* 846 /*
817 * Remove the inode from the realm's inode 847 * Move the inode to the new realm
818 * list, but don't add it to the new realm
819 * yet. We don't want the cap_snap to be
820 * queued (again) by ceph_update_snap_trace()
821 * below. Queue it _now_, under the old context.
822 */ 848 */
823 spin_lock(&realm->inodes_with_caps_lock); 849 spin_lock(&realm->inodes_with_caps_lock);
824 list_del_init(&ci->i_snap_realm_item); 850 list_del_init(&ci->i_snap_realm_item);
851 list_add(&ci->i_snap_realm_item,
852 &realm->inodes_with_caps);
853 oldrealm = ci->i_snap_realm;
854 ci->i_snap_realm = realm;
825 spin_unlock(&realm->inodes_with_caps_lock); 855 spin_unlock(&realm->inodes_with_caps_lock);
826 spin_unlock(&inode->i_lock); 856 spin_unlock(&inode->i_lock);
827 857
828 ceph_queue_cap_snap(ci); 858 ceph_get_snap_realm(mdsc, realm);
859 ceph_put_snap_realm(mdsc, oldrealm);
829 860
830 iput(inode); 861 iput(inode);
831 continue; 862 continue;
@@ -853,43 +884,9 @@ skip_inode:
853 ceph_update_snap_trace(mdsc, p, e, 884 ceph_update_snap_trace(mdsc, p, e,
854 op == CEPH_SNAP_OP_DESTROY); 885 op == CEPH_SNAP_OP_DESTROY);
855 886
856 if (op == CEPH_SNAP_OP_SPLIT) { 887 if (op == CEPH_SNAP_OP_SPLIT)
857 /*
858 * ok, _now_ add the inodes into the new realm.
859 */
860 for (i = 0; i < num_split_inos; i++) {
861 struct ceph_vino vino = {
862 .ino = le64_to_cpu(split_inos[i]),
863 .snap = CEPH_NOSNAP,
864 };
865 struct inode *inode = ceph_find_inode(sb, vino);
866 struct ceph_inode_info *ci;
867
868 if (!inode)
869 continue;
870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock);
872 if (list_empty(&ci->i_snap_realm_item)) {
873 struct ceph_snap_realm *oldrealm =
874 ci->i_snap_realm;
875
876 dout(" moving %p to split realm %llx %p\n",
877 inode, realm->ino, realm);
878 spin_lock(&realm->inodes_with_caps_lock);
879 list_add(&ci->i_snap_realm_item,
880 &realm->inodes_with_caps);
881 ci->i_snap_realm = realm;
882 spin_unlock(&realm->inodes_with_caps_lock);
883 ceph_get_snap_realm(mdsc, realm);
884 ceph_put_snap_realm(mdsc, oldrealm);
885 }
886 spin_unlock(&inode->i_lock);
887 iput(inode);
888 }
889
890 /* we took a reference when we created the realm, above */ 888 /* we took a reference when we created the realm, above */
891 ceph_put_snap_realm(mdsc, realm); 889 ceph_put_snap_realm(mdsc, realm);
892 }
893 890
894 __cleanup_empty_realms(mdsc); 891 __cleanup_empty_realms(mdsc);
895 892
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2482d696f0de..b87638e84c4b 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -216,8 +216,7 @@ struct ceph_cap_snap {
216 uid_t uid; 216 uid_t uid;
217 gid_t gid; 217 gid_t gid;
218 218
219 void *xattr_blob; 219 struct ceph_buffer *xattr_blob;
220 int xattr_len;
221 u64 xattr_version; 220 u64 xattr_version;
222 221
223 u64 size; 222 u64 size;
@@ -229,8 +228,11 @@ struct ceph_cap_snap {
229 228
230static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) 229static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
231{ 230{
232 if (atomic_dec_and_test(&capsnap->nref)) 231 if (atomic_dec_and_test(&capsnap->nref)) {
232 if (capsnap->xattr_blob)
233 ceph_buffer_put(capsnap->xattr_blob);
233 kfree(capsnap); 234 kfree(capsnap);
235 }
234} 236}
235 237
236/* 238/*
@@ -342,7 +344,8 @@ struct ceph_inode_info {
342 unsigned i_cap_exporting_issued; 344 unsigned i_cap_exporting_issued;
343 struct ceph_cap_reservation i_cap_migration_resv; 345 struct ceph_cap_reservation i_cap_migration_resv;
344 struct list_head i_cap_snaps; /* snapped state pending flush to mds */ 346 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
345 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */ 347 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
348 dirty|flushing caps */
346 unsigned i_snap_caps; /* cap bits for snapped files */ 349 unsigned i_snap_caps; /* cap bits for snapped files */
347 350
348 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 351 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
@@ -687,6 +690,8 @@ struct ceph_snap_realm {
687 690
688 struct list_head empty_item; /* if i have ref==0 */ 691 struct list_head empty_item; /* if i have ref==0 */
689 692
693 struct list_head dirty_item; /* if realm needs new context */
694
690 /* the current set of snaps for this realm */ 695 /* the current set of snaps for this realm */
691 struct ceph_snap_context *cached_context; 696 struct ceph_snap_context *cached_context;
692 697
@@ -823,7 +828,8 @@ extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
823extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, 828extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
824 struct ceph_snap_context *snapc); 829 struct ceph_snap_context *snapc);
825extern void __ceph_flush_snaps(struct ceph_inode_info *ci, 830extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
826 struct ceph_mds_session **psession); 831 struct ceph_mds_session **psession,
832 int again);
827extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, 833extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
828 struct ceph_mds_session *session); 834 struct ceph_mds_session *session);
829extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); 835extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 097a2654c00f..9578af610b73 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -485,6 +485,7 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
485 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; 485 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
486 ci->i_xattrs.prealloc_blob = NULL; 486 ci->i_xattrs.prealloc_blob = NULL;
487 ci->i_xattrs.dirty = false; 487 ci->i_xattrs.dirty = false;
488 ci->i_xattrs.version++;
488 } 489 }
489} 490}
490 491
diff --git a/fs/char_dev.c b/fs/char_dev.c
index f80a4f25123c..143d393881cb 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -40,7 +40,9 @@ struct backing_dev_info directly_mappable_cdev_bdi = {
40#endif 40#endif
41 /* permit direct mmap, for read, write or exec */ 41 /* permit direct mmap, for read, write or exec */
42 BDI_CAP_MAP_DIRECT | 42 BDI_CAP_MAP_DIRECT |
43 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP), 43 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
44 /* no writeback happens */
45 BDI_CAP_NO_ACCT_AND_WRITEBACK),
44}; 46};
45 47
46static struct kobj_map *cdev_map; 48static struct kobj_map *cdev_map;
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 650638275a6f..7fe6b52df507 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -30,6 +30,8 @@
30 * This is a compressed table of upper and lower case conversion. 30 * This is a compressed table of upper and lower case conversion.
31 * 31 *
32 */ 32 */
33#ifndef _CIFS_UNICODE_H
34#define _CIFS_UNICODE_H
33 35
34#include <asm/byteorder.h> 36#include <asm/byteorder.h>
35#include <linux/types.h> 37#include <linux/types.h>
@@ -67,8 +69,8 @@ extern const struct UniCaseRange CifsUniUpperRange[];
67#endif /* UNIUPR_NOUPPER */ 69#endif /* UNIUPR_NOUPPER */
68 70
69#ifndef UNIUPR_NOLOWER 71#ifndef UNIUPR_NOLOWER
70extern signed char UniLowerTable[512]; 72extern signed char CifsUniLowerTable[512];
71extern struct UniCaseRange UniLowerRange[]; 73extern const struct UniCaseRange CifsUniLowerRange[];
72#endif /* UNIUPR_NOLOWER */ 74#endif /* UNIUPR_NOLOWER */
73 75
74#ifdef __KERNEL__ 76#ifdef __KERNEL__
@@ -337,15 +339,15 @@ UniStrupr(register wchar_t *upin)
337 * UniTolower: Convert a unicode character to lower case 339 * UniTolower: Convert a unicode character to lower case
338 */ 340 */
339static inline wchar_t 341static inline wchar_t
340UniTolower(wchar_t uc) 342UniTolower(register wchar_t uc)
341{ 343{
342 register struct UniCaseRange *rp; 344 register const struct UniCaseRange *rp;
343 345
344 if (uc < sizeof(UniLowerTable)) { 346 if (uc < sizeof(CifsUniLowerTable)) {
345 /* Latin characters */ 347 /* Latin characters */
346 return uc + UniLowerTable[uc]; /* Use base tables */ 348 return uc + CifsUniLowerTable[uc]; /* Use base tables */
347 } else { 349 } else {
348 rp = UniLowerRange; /* Use range tables */ 350 rp = CifsUniLowerRange; /* Use range tables */
349 while (rp->start) { 351 while (rp->start) {
350 if (uc < rp->start) /* Before start of range */ 352 if (uc < rp->start) /* Before start of range */
351 return uc; /* Uppercase = input */ 353 return uc; /* Uppercase = input */
@@ -374,3 +376,5 @@ UniStrlwr(register wchar_t *upin)
374} 376}
375 377
376#endif 378#endif
379
380#endif /* _CIFS_UNICODE_H */
diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h
index 18a9d978e519..0ac7c5a8633a 100644
--- a/fs/cifs/cifs_uniupr.h
+++ b/fs/cifs/cifs_uniupr.h
@@ -140,7 +140,7 @@ const struct UniCaseRange CifsUniUpperRange[] = {
140/* 140/*
141 * Latin lower case 141 * Latin lower case
142 */ 142 */
143static signed char CifsUniLowerTable[512] = { 143signed char CifsUniLowerTable[512] = {
144 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ 144 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
145 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ 145 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
146 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ 146 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
@@ -242,12 +242,12 @@ static signed char UniCaseRangeLff20[27] = {
242/* 242/*
243 * Lower Case Range 243 * Lower Case Range
244 */ 244 */
245static const struct UniCaseRange CifsUniLowerRange[] = { 245const struct UniCaseRange CifsUniLowerRange[] = {
246 0x0380, 0x03ab, UniCaseRangeL0380, 246 {0x0380, 0x03ab, UniCaseRangeL0380},
247 0x0400, 0x042f, UniCaseRangeL0400, 247 {0x0400, 0x042f, UniCaseRangeL0400},
248 0x0490, 0x04cb, UniCaseRangeL0490, 248 {0x0490, 0x04cb, UniCaseRangeL0490},
249 0x1e00, 0x1ff7, UniCaseRangeL1e00, 249 {0x1e00, 0x1ff7, UniCaseRangeL1e00},
250 0xff20, 0xff3a, UniCaseRangeLff20, 250 {0xff20, 0xff3a, UniCaseRangeLff20},
251 0, 0, 0 251 {0}
252}; 252};
253#endif 253#endif
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 847628dfdc44..35042d8f7338 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -223,63 +223,6 @@ int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
223 return 0; 223 return 0;
224} 224}
225 225
226int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses,
227 const struct nls_table *nls_info)
228{
229 char temp_hash[16];
230 struct HMACMD5Context ctx;
231 char *ucase_buf;
232 __le16 *unicode_buf;
233 unsigned int i, user_name_len, dom_name_len;
234
235 if (ses == NULL)
236 return -EINVAL;
237
238 E_md4hash(ses->password, temp_hash);
239
240 hmac_md5_init_limK_to_64(temp_hash, 16, &ctx);
241 user_name_len = strlen(ses->userName);
242 if (user_name_len > MAX_USERNAME_SIZE)
243 return -EINVAL;
244 if (ses->domainName == NULL)
245 return -EINVAL; /* BB should we use CIFS_LINUX_DOM */
246 dom_name_len = strlen(ses->domainName);
247 if (dom_name_len > MAX_USERNAME_SIZE)
248 return -EINVAL;
249
250 ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL);
251 if (ucase_buf == NULL)
252 return -ENOMEM;
253 unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL);
254 if (unicode_buf == NULL) {
255 kfree(ucase_buf);
256 return -ENOMEM;
257 }
258
259 for (i = 0; i < user_name_len; i++)
260 ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]];
261 ucase_buf[i] = 0;
262 user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf,
263 MAX_USERNAME_SIZE*2, nls_info);
264 unicode_buf[user_name_len] = 0;
265 user_name_len++;
266
267 for (i = 0; i < dom_name_len; i++)
268 ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]];
269 ucase_buf[i] = 0;
270 dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf,
271 MAX_USERNAME_SIZE*2, nls_info);
272
273 unicode_buf[user_name_len + dom_name_len] = 0;
274 hmac_md5_update((const unsigned char *) unicode_buf,
275 (user_name_len+dom_name_len)*2, &ctx);
276
277 hmac_md5_final(ses->server->ntlmv2_hash, &ctx);
278 kfree(ucase_buf);
279 kfree(unicode_buf);
280 return 0;
281}
282
283#ifdef CONFIG_CIFS_WEAK_PW_HASH 226#ifdef CONFIG_CIFS_WEAK_PW_HASH
284void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, 227void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
285 char *lnm_session_key) 228 char *lnm_session_key)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1f5450814087..1d60c655e3e0 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -87,8 +87,9 @@ extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
87extern int decode_negTokenInit(unsigned char *security_blob, int length, 87extern int decode_negTokenInit(unsigned char *security_blob, int length,
88 struct TCP_Server_Info *server); 88 struct TCP_Server_Info *server);
89extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); 89extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
90extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port);
90extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, 91extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
91 unsigned short int port); 92 const unsigned short int port);
92extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); 93extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
93extern void header_assemble(struct smb_hdr *, char /* command */ , 94extern void header_assemble(struct smb_hdr *, char /* command */ ,
94 const struct cifsTconInfo *, int /* length of 95 const struct cifsTconInfo *, int /* length of
@@ -365,8 +366,6 @@ extern int cifs_verify_signature(struct smb_hdr *,
365 __u32 expected_sequence_number); 366 __u32 expected_sequence_number);
366extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn, 367extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
367 const char *pass); 368 const char *pass);
368extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *,
369 const struct nls_table *);
370extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *); 369extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
371extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, 370extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
372 const struct nls_table *); 371 const struct nls_table *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c65c3419dd37..7e83b356cc9e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -232,7 +232,7 @@ static int
232small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 232small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
233 void **request_buf) 233 void **request_buf)
234{ 234{
235 int rc = 0; 235 int rc;
236 236
237 rc = cifs_reconnect_tcon(tcon, smb_command); 237 rc = cifs_reconnect_tcon(tcon, smb_command);
238 if (rc) 238 if (rc)
@@ -250,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
250 if (tcon != NULL) 250 if (tcon != NULL)
251 cifs_stats_inc(&tcon->num_smbs_sent); 251 cifs_stats_inc(&tcon->num_smbs_sent);
252 252
253 return rc; 253 return 0;
254} 254}
255 255
256int 256int
@@ -281,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct,
281 281
282/* If the return code is zero, this function must fill in request_buf pointer */ 282/* If the return code is zero, this function must fill in request_buf pointer */
283static int 283static int
284smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 284__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
285 void **request_buf /* returned */ , 285 void **request_buf, void **response_buf)
286 void **response_buf /* returned */ )
287{ 286{
288 int rc = 0;
289
290 rc = cifs_reconnect_tcon(tcon, smb_command);
291 if (rc)
292 return rc;
293
294 *request_buf = cifs_buf_get(); 287 *request_buf = cifs_buf_get();
295 if (*request_buf == NULL) { 288 if (*request_buf == NULL) {
296 /* BB should we add a retry in here if not a writepage? */ 289 /* BB should we add a retry in here if not a writepage? */
@@ -309,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
309 if (tcon != NULL) 302 if (tcon != NULL)
310 cifs_stats_inc(&tcon->num_smbs_sent); 303 cifs_stats_inc(&tcon->num_smbs_sent);
311 304
312 return rc; 305 return 0;
306}
307
308/* If the return code is zero, this function must fill in request_buf pointer */
309static int
310smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
311 void **request_buf, void **response_buf)
312{
313 int rc;
314
315 rc = cifs_reconnect_tcon(tcon, smb_command);
316 if (rc)
317 return rc;
318
319 return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
320}
321
322static int
323smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
324 void **request_buf, void **response_buf)
325{
326 if (tcon->ses->need_reconnect || tcon->need_reconnect)
327 return -EHOSTDOWN;
328
329 return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
313} 330}
314 331
315static int validate_t2(struct smb_t2_rsp *pSMB) 332static int validate_t2(struct smb_t2_rsp *pSMB)
@@ -4534,8 +4551,8 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
4534 4551
4535 cFYI(1, "In QFSUnixInfo"); 4552 cFYI(1, "In QFSUnixInfo");
4536QFSUnixRetry: 4553QFSUnixRetry:
4537 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4554 rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
4538 (void **) &pSMBr); 4555 (void **) &pSMB, (void **) &pSMBr);
4539 if (rc) 4556 if (rc)
4540 return rc; 4557 return rc;
4541 4558
@@ -4604,8 +4621,8 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
4604 cFYI(1, "In SETFSUnixInfo"); 4621 cFYI(1, "In SETFSUnixInfo");
4605SETFSUnixRetry: 4622SETFSUnixRetry:
4606 /* BB switch to small buf init to save memory */ 4623 /* BB switch to small buf init to save memory */
4607 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4624 rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
4608 (void **) &pSMBr); 4625 (void **) &pSMB, (void **) &pSMBr);
4609 if (rc) 4626 if (rc)
4610 return rc; 4627 return rc;
4611 4628
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 95c2ea67edfb..88c84a38bccb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -400,7 +400,9 @@ incomplete_rcv:
400 cFYI(1, "call to reconnect done"); 400 cFYI(1, "call to reconnect done");
401 csocket = server->ssocket; 401 csocket = server->ssocket;
402 continue; 402 continue;
403 } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) { 403 } else if (length == -ERESTARTSYS ||
404 length == -EAGAIN ||
405 length == -EINTR) {
404 msleep(1); /* minimum sleep to prevent looping 406 msleep(1); /* minimum sleep to prevent looping
405 allowing socket to clear and app threads to set 407 allowing socket to clear and app threads to set
406 tcpStatus CifsNeedReconnect if server hung */ 408 tcpStatus CifsNeedReconnect if server hung */
@@ -414,18 +416,6 @@ incomplete_rcv:
414 } else 416 } else
415 continue; 417 continue;
416 } else if (length <= 0) { 418 } else if (length <= 0) {
417 if (server->tcpStatus == CifsNew) {
418 cFYI(1, "tcp session abend after SMBnegprot");
419 /* some servers kill the TCP session rather than
420 returning an SMB negprot error, in which
421 case reconnecting here is not going to help,
422 and so simply return error to mount */
423 break;
424 }
425 if (!try_to_freeze() && (length == -EINTR)) {
426 cFYI(1, "cifsd thread killed");
427 break;
428 }
429 cFYI(1, "Reconnect after unexpected peek error %d", 419 cFYI(1, "Reconnect after unexpected peek error %d",
430 length); 420 length);
431 cifs_reconnect(server); 421 cifs_reconnect(server);
@@ -466,27 +456,19 @@ incomplete_rcv:
466 an error on SMB negprot response */ 456 an error on SMB negprot response */
467 cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", 457 cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
468 pdu_length); 458 pdu_length);
469 if (server->tcpStatus == CifsNew) { 459 /* give server a second to clean up */
470 /* if nack on negprot (rather than 460 msleep(1000);
471 ret of smb negprot error) reconnecting 461 /* always try 445 first on reconnect since we get NACK
472 not going to help, ret error to mount */ 462 * on some if we ever connected to port 139 (the NACK
473 break; 463 * is since we do not begin with RFC1001 session
474 } else { 464 * initialize frame)
475 /* give server a second to 465 */
476 clean up before reconnect attempt */ 466 cifs_set_port((struct sockaddr *)
477 msleep(1000); 467 &server->addr.sockAddr, CIFS_PORT);
478 /* always try 445 first on reconnect 468 cifs_reconnect(server);
479 since we get NACK on some if we ever 469 csocket = server->ssocket;
480 connected to port 139 (the NACK is 470 wake_up(&server->response_q);
481 since we do not begin with RFC1001 471 continue;
482 session initialize frame) */
483 server->addr.sockAddr.sin_port =
484 htons(CIFS_PORT);
485 cifs_reconnect(server);
486 csocket = server->ssocket;
487 wake_up(&server->response_q);
488 continue;
489 }
490 } else if (temp != (char) 0) { 472 } else if (temp != (char) 0) {
491 cERROR(1, "Unknown RFC 1002 frame"); 473 cERROR(1, "Unknown RFC 1002 frame");
492 cifs_dump_mem(" Received Data: ", (char *)smb_buffer, 474 cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
@@ -522,8 +504,7 @@ incomplete_rcv:
522 total_read += length) { 504 total_read += length) {
523 length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, 505 length = kernel_recvmsg(csocket, &smb_msg, &iov, 1,
524 pdu_length - total_read, 0); 506 pdu_length - total_read, 0);
525 if ((server->tcpStatus == CifsExiting) || 507 if (server->tcpStatus == CifsExiting) {
526 (length == -EINTR)) {
527 /* then will exit */ 508 /* then will exit */
528 reconnect = 2; 509 reconnect = 2;
529 break; 510 break;
@@ -534,8 +515,9 @@ incomplete_rcv:
534 /* Now we will reread sock */ 515 /* Now we will reread sock */
535 reconnect = 1; 516 reconnect = 1;
536 break; 517 break;
537 } else if ((length == -ERESTARTSYS) || 518 } else if (length == -ERESTARTSYS ||
538 (length == -EAGAIN)) { 519 length == -EAGAIN ||
520 length == -EINTR) {
539 msleep(1); /* minimum sleep to prevent looping, 521 msleep(1); /* minimum sleep to prevent looping,
540 allowing socket to clear and app 522 allowing socket to clear and app
541 threads to set tcpStatus 523 threads to set tcpStatus
@@ -1673,7 +1655,9 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1673 MAX_USERNAME_SIZE)) 1655 MAX_USERNAME_SIZE))
1674 continue; 1656 continue;
1675 if (strlen(vol->username) != 0 && 1657 if (strlen(vol->username) != 0 &&
1676 strncmp(ses->password, vol->password, 1658 ses->password != NULL &&
1659 strncmp(ses->password,
1660 vol->password ? vol->password : "",
1677 MAX_PASSWORD_SIZE)) 1661 MAX_PASSWORD_SIZE))
1678 continue; 1662 continue;
1679 } 1663 }
@@ -1722,9 +1706,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1722 if (ses) { 1706 if (ses) {
1723 cFYI(1, "Existing smb sess found (status=%d)", ses->status); 1707 cFYI(1, "Existing smb sess found (status=%d)", ses->status);
1724 1708
1725 /* existing SMB ses has a server reference already */
1726 cifs_put_tcp_session(server);
1727
1728 mutex_lock(&ses->session_mutex); 1709 mutex_lock(&ses->session_mutex);
1729 rc = cifs_negotiate_protocol(xid, ses); 1710 rc = cifs_negotiate_protocol(xid, ses);
1730 if (rc) { 1711 if (rc) {
@@ -1747,6 +1728,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1747 } 1728 }
1748 } 1729 }
1749 mutex_unlock(&ses->session_mutex); 1730 mutex_unlock(&ses->session_mutex);
1731
1732 /* existing SMB ses has a server reference already */
1733 cifs_put_tcp_session(server);
1750 FreeXid(xid); 1734 FreeXid(xid);
1751 return ses; 1735 return ses;
1752 } 1736 }
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 578d88c5b46e..f9ed0751cc12 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -305,8 +305,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
305 full_path = build_path_from_dentry(direntry); 305 full_path = build_path_from_dentry(direntry);
306 if (full_path == NULL) { 306 if (full_path == NULL) {
307 rc = -ENOMEM; 307 rc = -ENOMEM;
308 FreeXid(xid); 308 goto cifs_create_out;
309 return rc;
310 } 309 }
311 310
312 if (oplockEnabled) 311 if (oplockEnabled)
@@ -365,9 +364,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
365 364
366 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 365 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
367 if (buf == NULL) { 366 if (buf == NULL) {
368 kfree(full_path); 367 rc = -ENOMEM;
369 FreeXid(xid); 368 goto cifs_create_out;
370 return -ENOMEM;
371 } 369 }
372 370
373 /* 371 /*
@@ -496,6 +494,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
496 struct cifsTconInfo *pTcon; 494 struct cifsTconInfo *pTcon;
497 char *full_path = NULL; 495 char *full_path = NULL;
498 struct inode *newinode = NULL; 496 struct inode *newinode = NULL;
497 int oplock = 0;
498 u16 fileHandle;
499 FILE_ALL_INFO *buf = NULL;
500 unsigned int bytes_written;
501 struct win_dev *pdev;
499 502
500 if (!old_valid_dev(device_number)) 503 if (!old_valid_dev(device_number))
501 return -EINVAL; 504 return -EINVAL;
@@ -506,9 +509,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
506 pTcon = cifs_sb->tcon; 509 pTcon = cifs_sb->tcon;
507 510
508 full_path = build_path_from_dentry(direntry); 511 full_path = build_path_from_dentry(direntry);
509 if (full_path == NULL) 512 if (full_path == NULL) {
510 rc = -ENOMEM; 513 rc = -ENOMEM;
511 else if (pTcon->unix_ext) { 514 goto mknod_out;
515 }
516
517 if (pTcon->unix_ext) {
512 struct cifs_unix_set_info_args args = { 518 struct cifs_unix_set_info_args args = {
513 .mode = mode & ~current_umask(), 519 .mode = mode & ~current_umask(),
514 .ctime = NO_CHANGE_64, 520 .ctime = NO_CHANGE_64,
@@ -527,87 +533,78 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
527 cifs_sb->local_nls, 533 cifs_sb->local_nls,
528 cifs_sb->mnt_cifs_flags & 534 cifs_sb->mnt_cifs_flags &
529 CIFS_MOUNT_MAP_SPECIAL_CHR); 535 CIFS_MOUNT_MAP_SPECIAL_CHR);
536 if (rc)
537 goto mknod_out;
530 538
531 if (!rc) { 539 rc = cifs_get_inode_info_unix(&newinode, full_path,
532 rc = cifs_get_inode_info_unix(&newinode, full_path,
533 inode->i_sb, xid); 540 inode->i_sb, xid);
534 if (pTcon->nocase) 541 if (pTcon->nocase)
535 direntry->d_op = &cifs_ci_dentry_ops; 542 direntry->d_op = &cifs_ci_dentry_ops;
536 else 543 else
537 direntry->d_op = &cifs_dentry_ops; 544 direntry->d_op = &cifs_dentry_ops;
538 if (rc == 0)
539 d_instantiate(direntry, newinode);
540 }
541 } else {
542 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
543 int oplock = 0;
544 u16 fileHandle;
545 FILE_ALL_INFO *buf;
546 545
547 cFYI(1, "sfu compat create special file"); 546 if (rc == 0)
547 d_instantiate(direntry, newinode);
548 goto mknod_out;
549 }
548 550
549 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 551 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
550 if (buf == NULL) { 552 goto mknod_out;
551 kfree(full_path);
552 rc = -ENOMEM;
553 FreeXid(xid);
554 return rc;
555 }
556 553
557 rc = CIFSSMBOpen(xid, pTcon, full_path, 554
558 FILE_CREATE, /* fail if exists */ 555 cFYI(1, "sfu compat create special file");
559 GENERIC_WRITE /* BB would 556
560 WRITE_OWNER | WRITE_DAC be better? */, 557 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
561 /* Create a file and set the 558 if (buf == NULL) {
562 file attribute to SYSTEM */ 559 kfree(full_path);
563 CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, 560 rc = -ENOMEM;
564 &fileHandle, &oplock, buf, 561 FreeXid(xid);
565 cifs_sb->local_nls, 562 return rc;
566 cifs_sb->mnt_cifs_flags &
567 CIFS_MOUNT_MAP_SPECIAL_CHR);
568
569 /* BB FIXME - add handling for backlevel servers
570 which need legacy open and check for all
571 calls to SMBOpen for fallback to SMBLeagcyOpen */
572 if (!rc) {
573 /* BB Do not bother to decode buf since no
574 local inode yet to put timestamps in,
575 but we can reuse it safely */
576 unsigned int bytes_written;
577 struct win_dev *pdev;
578 pdev = (struct win_dev *)buf;
579 if (S_ISCHR(mode)) {
580 memcpy(pdev->type, "IntxCHR", 8);
581 pdev->major =
582 cpu_to_le64(MAJOR(device_number));
583 pdev->minor =
584 cpu_to_le64(MINOR(device_number));
585 rc = CIFSSMBWrite(xid, pTcon,
586 fileHandle,
587 sizeof(struct win_dev),
588 0, &bytes_written, (char *)pdev,
589 NULL, 0);
590 } else if (S_ISBLK(mode)) {
591 memcpy(pdev->type, "IntxBLK", 8);
592 pdev->major =
593 cpu_to_le64(MAJOR(device_number));
594 pdev->minor =
595 cpu_to_le64(MINOR(device_number));
596 rc = CIFSSMBWrite(xid, pTcon,
597 fileHandle,
598 sizeof(struct win_dev),
599 0, &bytes_written, (char *)pdev,
600 NULL, 0);
601 } /* else if(S_ISFIFO */
602 CIFSSMBClose(xid, pTcon, fileHandle);
603 d_drop(direntry);
604 }
605 kfree(buf);
606 /* add code here to set EAs */
607 }
608 } 563 }
609 564
565 /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */
566 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
567 GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
568 &fileHandle, &oplock, buf, cifs_sb->local_nls,
569 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
570 if (rc)
571 goto mknod_out;
572
573 /* BB Do not bother to decode buf since no local inode yet to put
574 * timestamps in, but we can reuse it safely */
575
576 pdev = (struct win_dev *)buf;
577 if (S_ISCHR(mode)) {
578 memcpy(pdev->type, "IntxCHR", 8);
579 pdev->major =
580 cpu_to_le64(MAJOR(device_number));
581 pdev->minor =
582 cpu_to_le64(MINOR(device_number));
583 rc = CIFSSMBWrite(xid, pTcon,
584 fileHandle,
585 sizeof(struct win_dev),
586 0, &bytes_written, (char *)pdev,
587 NULL, 0);
588 } else if (S_ISBLK(mode)) {
589 memcpy(pdev->type, "IntxBLK", 8);
590 pdev->major =
591 cpu_to_le64(MAJOR(device_number));
592 pdev->minor =
593 cpu_to_le64(MINOR(device_number));
594 rc = CIFSSMBWrite(xid, pTcon,
595 fileHandle,
596 sizeof(struct win_dev),
597 0, &bytes_written, (char *)pdev,
598 NULL, 0);
599 } /* else if (S_ISFIFO) */
600 CIFSSMBClose(xid, pTcon, fileHandle);
601 d_drop(direntry);
602
603 /* FIXME: add code here to set EAs */
604
605mknod_out:
610 kfree(full_path); 606 kfree(full_path);
607 kfree(buf);
611 FreeXid(xid); 608 FreeXid(xid);
612 return rc; 609 return rc;
613} 610}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index db11fdef0e92..de748c652d11 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -242,8 +242,7 @@ int cifs_open(struct inode *inode, struct file *file)
242 full_path = build_path_from_dentry(file->f_path.dentry); 242 full_path = build_path_from_dentry(file->f_path.dentry);
243 if (full_path == NULL) { 243 if (full_path == NULL) {
244 rc = -ENOMEM; 244 rc = -ENOMEM;
245 FreeXid(xid); 245 goto out;
246 return rc;
247 } 246 }
248 247
249 cFYI(1, "inode = 0x%p file flags are 0x%x for %s", 248 cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4bc47e5b5f29..53cce8cc2224 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -801,6 +801,8 @@ retry_iget5_locked:
801 inode->i_flags |= S_NOATIME | S_NOCMTIME; 801 inode->i_flags |= S_NOATIME | S_NOCMTIME;
802 if (inode->i_state & I_NEW) { 802 if (inode->i_state & I_NEW) {
803 inode->i_ino = hash; 803 inode->i_ino = hash;
804 if (S_ISREG(inode->i_mode))
805 inode->i_data.backing_dev_info = sb->s_bdi;
804#ifdef CONFIG_CIFS_FSCACHE 806#ifdef CONFIG_CIFS_FSCACHE
805 /* initialize per-inode cache cookie pointer */ 807 /* initialize per-inode cache cookie pointer */
806 CIFS_I(inode)->fscache = NULL; 808 CIFS_I(inode)->fscache = NULL;
@@ -834,7 +836,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
834 xid, NULL); 836 xid, NULL);
835 837
836 if (!inode) 838 if (!inode)
837 return ERR_PTR(-ENOMEM); 839 return ERR_PTR(rc);
838 840
839#ifdef CONFIG_CIFS_FSCACHE 841#ifdef CONFIG_CIFS_FSCACHE
840 /* populate tcon->resource_id */ 842 /* populate tcon->resource_id */
@@ -1462,29 +1464,18 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1462{ 1464{
1463 char *fromName = NULL; 1465 char *fromName = NULL;
1464 char *toName = NULL; 1466 char *toName = NULL;
1465 struct cifs_sb_info *cifs_sb_source; 1467 struct cifs_sb_info *cifs_sb;
1466 struct cifs_sb_info *cifs_sb_target;
1467 struct cifsTconInfo *tcon; 1468 struct cifsTconInfo *tcon;
1468 FILE_UNIX_BASIC_INFO *info_buf_source = NULL; 1469 FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
1469 FILE_UNIX_BASIC_INFO *info_buf_target; 1470 FILE_UNIX_BASIC_INFO *info_buf_target;
1470 int xid, rc, tmprc; 1471 int xid, rc, tmprc;
1471 1472
1472 cifs_sb_target = CIFS_SB(target_dir->i_sb); 1473 cifs_sb = CIFS_SB(source_dir->i_sb);
1473 cifs_sb_source = CIFS_SB(source_dir->i_sb); 1474 tcon = cifs_sb->tcon;
1474 tcon = cifs_sb_source->tcon;
1475 1475
1476 xid = GetXid(); 1476 xid = GetXid();
1477 1477
1478 /* 1478 /*
1479 * BB: this might be allowed if same server, but different share.
1480 * Consider adding support for this
1481 */
1482 if (tcon != cifs_sb_target->tcon) {
1483 rc = -EXDEV;
1484 goto cifs_rename_exit;
1485 }
1486
1487 /*
1488 * we already have the rename sem so we do not need to 1479 * we already have the rename sem so we do not need to
1489 * grab it again here to protect the path integrity 1480 * grab it again here to protect the path integrity
1490 */ 1481 */
@@ -1519,17 +1510,16 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1519 info_buf_target = info_buf_source + 1; 1510 info_buf_target = info_buf_source + 1;
1520 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName, 1511 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName,
1521 info_buf_source, 1512 info_buf_source,
1522 cifs_sb_source->local_nls, 1513 cifs_sb->local_nls,
1523 cifs_sb_source->mnt_cifs_flags & 1514 cifs_sb->mnt_cifs_flags &
1524 CIFS_MOUNT_MAP_SPECIAL_CHR); 1515 CIFS_MOUNT_MAP_SPECIAL_CHR);
1525 if (tmprc != 0) 1516 if (tmprc != 0)
1526 goto unlink_target; 1517 goto unlink_target;
1527 1518
1528 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, 1519 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, toName,
1529 toName, info_buf_target, 1520 info_buf_target,
1530 cifs_sb_target->local_nls, 1521 cifs_sb->local_nls,
1531 /* remap based on source sb */ 1522 cifs_sb->mnt_cifs_flags &
1532 cifs_sb_source->mnt_cifs_flags &
1533 CIFS_MOUNT_MAP_SPECIAL_CHR); 1523 CIFS_MOUNT_MAP_SPECIAL_CHR);
1534 1524
1535 if (tmprc == 0 && (info_buf_source->UniqueId == 1525 if (tmprc == 0 && (info_buf_source->UniqueId ==
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index f97851119e6c..9aad47a2d62f 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -206,26 +206,30 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
206} 206}
207 207
208int 208int
209cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, 209cifs_set_port(struct sockaddr *addr, const unsigned short int port)
210 const unsigned short int port)
211{ 210{
212 if (!cifs_convert_address(dst, src, len)) 211 switch (addr->sa_family) {
213 return 0;
214
215 switch (dst->sa_family) {
216 case AF_INET: 212 case AF_INET:
217 ((struct sockaddr_in *)dst)->sin_port = htons(port); 213 ((struct sockaddr_in *)addr)->sin_port = htons(port);
218 break; 214 break;
219 case AF_INET6: 215 case AF_INET6:
220 ((struct sockaddr_in6 *)dst)->sin6_port = htons(port); 216 ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
221 break; 217 break;
222 default: 218 default:
223 return 0; 219 return 0;
224 } 220 }
225
226 return 1; 221 return 1;
227} 222}
228 223
224int
225cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
226 const unsigned short int port)
227{
228 if (!cifs_convert_address(dst, src, len))
229 return 0;
230 return cifs_set_port(dst, port);
231}
232
229/***************************************************************************** 233/*****************************************************************************
230convert a NT status code to a dos class/code 234convert a NT status code to a dos class/code
231 *****************************************************************************/ 235 *****************************************************************************/
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index de89645777c7..116af7546cf0 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -184,8 +184,8 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
184 } 184 }
185 185
186 /* adjust outsize. is this useful ?? */ 186 /* adjust outsize. is this useful ?? */
187 req->uc_outSize = nbytes; 187 req->uc_outSize = nbytes;
188 req->uc_flags |= REQ_WRITE; 188 req->uc_flags |= CODA_REQ_WRITE;
189 count = nbytes; 189 count = nbytes;
190 190
191 /* Convert filedescriptor into a file handle */ 191 /* Convert filedescriptor into a file handle */
diff --git a/fs/compat.c b/fs/compat.c
index 718c7062aec1..0644a154672b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1153,7 +1153,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1153{ 1153{
1154 compat_ssize_t tot_len; 1154 compat_ssize_t tot_len;
1155 struct iovec iovstack[UIO_FASTIOV]; 1155 struct iovec iovstack[UIO_FASTIOV];
1156 struct iovec *iov; 1156 struct iovec *iov = iovstack;
1157 ssize_t ret; 1157 ssize_t ret;
1158 io_fn_t fn; 1158 io_fn_t fn;
1159 iov_fn_t fnv; 1159 iov_fn_t fnv;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 51f270b479b6..48d74c7391d1 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -634,7 +634,7 @@ static int dio_send_cur_page(struct dio *dio)
634 int ret = 0; 634 int ret = 0;
635 635
636 if (dio->bio) { 636 if (dio->bio) {
637 loff_t cur_offset = dio->block_in_file << dio->blkbits; 637 loff_t cur_offset = dio->cur_page_fs_offset;
638 loff_t bio_next_offset = dio->logical_offset_in_bio + 638 loff_t bio_next_offset = dio->logical_offset_in_bio +
639 dio->bio->bi_size; 639 dio->bio->bi_size;
640 640
@@ -659,7 +659,7 @@ static int dio_send_cur_page(struct dio *dio)
659 * Submit now if the underlying fs is about to perform a 659 * Submit now if the underlying fs is about to perform a
660 * metadata read 660 * metadata read
661 */ 661 */
662 if (dio->boundary) 662 else if (dio->boundary)
663 dio_bio_submit(dio); 663 dio_bio_submit(dio);
664 } 664 }
665 665
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index a2e3b562e65d..cbadc1bee6e7 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1793,7 +1793,7 @@ struct kmem_cache *ecryptfs_key_tfm_cache;
1793static struct list_head key_tfm_list; 1793static struct list_head key_tfm_list;
1794struct mutex key_tfm_list_mutex; 1794struct mutex key_tfm_list_mutex;
1795 1795
1796int ecryptfs_init_crypto(void) 1796int __init ecryptfs_init_crypto(void)
1797{ 1797{
1798 mutex_init(&key_tfm_list_mutex); 1798 mutex_init(&key_tfm_list_mutex);
1799 INIT_LIST_HEAD(&key_tfm_list); 1799 INIT_LIST_HEAD(&key_tfm_list);
@@ -2169,7 +2169,6 @@ int ecryptfs_encrypt_and_encode_filename(
2169 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 2169 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2170 + encoded_name_no_prefix_size); 2170 + encoded_name_no_prefix_size);
2171 (*encoded_name)[(*encoded_name_size)] = '\0'; 2171 (*encoded_name)[(*encoded_name_size)] = '\0';
2172 (*encoded_name_size)++;
2173 } else { 2172 } else {
2174 rc = -EOPNOTSUPP; 2173 rc = -EOPNOTSUPP;
2175 } 2174 }
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 6c55113e7222..3fbc94203380 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -349,7 +349,7 @@ out:
349 349
350/** 350/**
351 * ecryptfs_new_lower_dentry 351 * ecryptfs_new_lower_dentry
352 * @ename: The name of the new dentry. 352 * @name: The name of the new dentry.
353 * @lower_dir_dentry: Parent directory of the new dentry. 353 * @lower_dir_dentry: Parent directory of the new dentry.
354 * @nd: nameidata from last lookup. 354 * @nd: nameidata from last lookup.
355 * 355 *
@@ -386,20 +386,19 @@ ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
386 * ecryptfs_lookup_one_lower 386 * ecryptfs_lookup_one_lower
387 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up 387 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
388 * @lower_dir_dentry: lower parent directory 388 * @lower_dir_dentry: lower parent directory
389 * @name: lower file name
389 * 390 *
390 * Get the lower dentry from vfs. If lower dentry does not exist yet, 391 * Get the lower dentry from vfs. If lower dentry does not exist yet,
391 * create it. 392 * create it.
392 */ 393 */
393static struct dentry * 394static struct dentry *
394ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry, 395ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
395 struct dentry *lower_dir_dentry) 396 struct dentry *lower_dir_dentry, struct qstr *name)
396{ 397{
397 struct nameidata nd; 398 struct nameidata nd;
398 struct vfsmount *lower_mnt; 399 struct vfsmount *lower_mnt;
399 struct qstr *name;
400 int err; 400 int err;
401 401
402 name = &ecryptfs_dentry->d_name;
403 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt( 402 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
404 ecryptfs_dentry->d_parent)); 403 ecryptfs_dentry->d_parent));
405 err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd); 404 err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
@@ -434,6 +433,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
434 size_t encrypted_and_encoded_name_size; 433 size_t encrypted_and_encoded_name_size;
435 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; 434 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
436 struct dentry *lower_dir_dentry, *lower_dentry; 435 struct dentry *lower_dir_dentry, *lower_dentry;
436 struct qstr lower_name;
437 int rc = 0; 437 int rc = 0;
438 438
439 ecryptfs_dentry->d_op = &ecryptfs_dops; 439 ecryptfs_dentry->d_op = &ecryptfs_dops;
@@ -444,9 +444,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
444 goto out_d_drop; 444 goto out_d_drop;
445 } 445 }
446 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 446 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
447 447 lower_name.name = ecryptfs_dentry->d_name.name;
448 lower_name.len = ecryptfs_dentry->d_name.len;
449 lower_name.hash = ecryptfs_dentry->d_name.hash;
450 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
451 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
452 &lower_name);
453 if (rc < 0)
454 goto out_d_drop;
455 }
448 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry, 456 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
449 lower_dir_dentry); 457 lower_dir_dentry, &lower_name);
450 if (IS_ERR(lower_dentry)) { 458 if (IS_ERR(lower_dentry)) {
451 rc = PTR_ERR(lower_dentry); 459 rc = PTR_ERR(lower_dentry);
452 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " 460 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
@@ -471,8 +479,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
471 "filename; rc = [%d]\n", __func__, rc); 479 "filename; rc = [%d]\n", __func__, rc);
472 goto out_d_drop; 480 goto out_d_drop;
473 } 481 }
482 lower_name.name = encrypted_and_encoded_name;
483 lower_name.len = encrypted_and_encoded_name_size;
484 lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
485 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
486 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
487 &lower_name);
488 if (rc < 0)
489 goto out_d_drop;
490 }
474 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry, 491 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
475 lower_dir_dentry); 492 lower_dir_dentry, &lower_name);
476 if (IS_ERR(lower_dentry)) { 493 if (IS_ERR(lower_dentry)) {
477 rc = PTR_ERR(lower_dentry); 494 rc = PTR_ERR(lower_dentry);
478 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " 495 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 89c5476506ef..73811cfa2ea4 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -515,6 +515,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
515 if (!s) { 515 if (!s) {
516 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 516 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
517 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 517 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
518 rc = -ENOMEM;
518 goto out; 519 goto out;
519 } 520 }
520 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 521 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -806,6 +807,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
806 if (!s) { 807 if (!s) {
807 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 808 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
808 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 809 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
810 rc = -ENOMEM;
809 goto out; 811 goto out;
810 } 812 }
811 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 813 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index d8c3a373aafa..0851ab6980f5 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -86,7 +86,7 @@ out:
86 return 0; 86 return 0;
87} 87}
88 88
89int ecryptfs_init_kthread(void) 89int __init ecryptfs_init_kthread(void)
90{ 90{
91 int rc = 0; 91 int rc = 0;
92 92
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index bcb68c0cb1f0..ab2248090515 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -473,7 +473,7 @@ sleep:
473 return rc; 473 return rc;
474} 474}
475 475
476int ecryptfs_init_messaging(void) 476int __init ecryptfs_init_messaging(void)
477{ 477{
478 int i; 478 int i;
479 int rc = 0; 479 int rc = 0;
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 3745f612bcd4..00208c3d7e92 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -500,7 +500,7 @@ static struct miscdevice ecryptfs_miscdev = {
500 * 500 *
501 * Returns zero on success; non-zero otherwise 501 * Returns zero on success; non-zero otherwise
502 */ 502 */
503int ecryptfs_init_ecryptfs_miscdev(void) 503int __init ecryptfs_init_ecryptfs_miscdev(void)
504{ 504{
505 int rc; 505 int rc;
506 506
diff --git a/fs/exec.c b/fs/exec.c
index 2d9455282744..03278c984ba0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -376,6 +376,9 @@ static int count(const char __user * const __user * argv, int max)
376 argv++; 376 argv++;
377 if (i++ >= max) 377 if (i++ >= max)
378 return -E2BIG; 378 return -E2BIG;
379
380 if (fatal_signal_pending(current))
381 return -ERESTARTNOHAND;
379 cond_resched(); 382 cond_resched();
380 } 383 }
381 } 384 }
@@ -419,6 +422,12 @@ static int copy_strings(int argc, const char __user *const __user *argv,
419 while (len > 0) { 422 while (len > 0) {
420 int offset, bytes_to_copy; 423 int offset, bytes_to_copy;
421 424
425 if (fatal_signal_pending(current)) {
426 ret = -ERESTARTNOHAND;
427 goto out;
428 }
429 cond_resched();
430
422 offset = pos % PAGE_SIZE; 431 offset = pos % PAGE_SIZE;
423 if (offset == 0) 432 if (offset == 0)
424 offset = PAGE_SIZE; 433 offset = PAGE_SIZE;
@@ -594,6 +603,11 @@ int setup_arg_pages(struct linux_binprm *bprm,
594#else 603#else
595 stack_top = arch_align_stack(stack_top); 604 stack_top = arch_align_stack(stack_top);
596 stack_top = PAGE_ALIGN(stack_top); 605 stack_top = PAGE_ALIGN(stack_top);
606
607 if (unlikely(stack_top < mmap_min_addr) ||
608 unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
609 return -ENOMEM;
610
597 stack_shift = vma->vm_end - stack_top; 611 stack_shift = vma->vm_end - stack_top;
598 612
599 bprm->p -= stack_shift; 613 bprm->p -= stack_shift;
@@ -2000,3 +2014,41 @@ fail_creds:
2000fail: 2014fail:
2001 return; 2015 return;
2002} 2016}
2017
2018/*
2019 * Core dumping helper functions. These are the only things you should
2020 * do on a core-file: use only these functions to write out all the
2021 * necessary info.
2022 */
2023int dump_write(struct file *file, const void *addr, int nr)
2024{
2025 return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
2026}
2027
2028int dump_seek(struct file *file, loff_t off)
2029{
2030 int ret = 1;
2031
2032 if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
2033 if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
2034 return 0;
2035 } else {
2036 char *buf = (char *)get_zeroed_page(GFP_KERNEL);
2037
2038 if (!buf)
2039 return 0;
2040 while (off > 0) {
2041 unsigned long n = off;
2042
2043 if (n > PAGE_SIZE)
2044 n = PAGE_SIZE;
2045 if (!dump_write(file, buf, n)) {
2046 ret = 0;
2047 break;
2048 }
2049 off -= n;
2050 }
2051 free_page((unsigned long)buf);
2052 }
2053 return ret;
2054}
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index eb7368ebd8cd..3eadd97324b1 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -54,6 +54,9 @@ struct page_collect {
54 unsigned nr_pages; 54 unsigned nr_pages;
55 unsigned long length; 55 unsigned long length;
56 loff_t pg_first; /* keep 64bit also in 32-arches */ 56 loff_t pg_first; /* keep 64bit also in 32-arches */
57 bool read_4_write; /* This means two things: that the read is sync
58 * And the pages should not be unlocked.
59 */
57}; 60};
58 61
59static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, 62static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -71,6 +74,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
71 pcol->nr_pages = 0; 74 pcol->nr_pages = 0;
72 pcol->length = 0; 75 pcol->length = 0;
73 pcol->pg_first = -1; 76 pcol->pg_first = -1;
77 pcol->read_4_write = false;
74} 78}
75 79
76static void _pcol_reset(struct page_collect *pcol) 80static void _pcol_reset(struct page_collect *pcol)
@@ -347,7 +351,8 @@ static int readpage_strip(void *data, struct page *page)
347 if (PageError(page)) 351 if (PageError(page))
348 ClearPageError(page); 352 ClearPageError(page);
349 353
350 unlock_page(page); 354 if (!pcol->read_4_write)
355 unlock_page(page);
351 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 356 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
352 " splitting\n", inode->i_ino, page->index); 357 " splitting\n", inode->i_ino, page->index);
353 358
@@ -428,6 +433,7 @@ static int _readpage(struct page *page, bool is_sync)
428 /* readpage_strip might call read_exec(,is_sync==false) at several 433 /* readpage_strip might call read_exec(,is_sync==false) at several
429 * places but not if we have a single page. 434 * places but not if we have a single page.
430 */ 435 */
436 pcol.read_4_write = is_sync;
431 ret = readpage_strip(&pcol, page); 437 ret = readpage_strip(&pcol, page);
432 if (ret) { 438 if (ret) {
433 EXOFS_ERR("_readpage => %d\n", ret); 439 EXOFS_ERR("_readpage => %d\n", ret);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 6769fd0f35b8..f8cc34f542c3 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -769,11 +769,15 @@ EXPORT_SYMBOL(kill_fasync);
769 769
770static int __init fcntl_init(void) 770static int __init fcntl_init(void)
771{ 771{
772 /* please add new bits here to ensure allocation uniqueness */ 772 /*
773 BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 773 * Please add new bits here to ensure allocation uniqueness.
774 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
775 * is defined as O_NONBLOCK on some platforms and not on others.
776 */
777 BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
774 O_RDONLY | O_WRONLY | O_RDWR | 778 O_RDONLY | O_WRONLY | O_RDWR |
775 O_CREAT | O_EXCL | O_NOCTTY | 779 O_CREAT | O_EXCL | O_NOCTTY |
776 O_TRUNC | O_APPEND | O_NONBLOCK | 780 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
777 __O_SYNC | O_DSYNC | FASYNC | 781 __O_SYNC | O_DSYNC | FASYNC |
778 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 782 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
779 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 783 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7d9d06ba184b..ab38fef1c9a1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -52,8 +52,6 @@ struct wb_writeback_work {
52#define CREATE_TRACE_POINTS 52#define CREATE_TRACE_POINTS
53#include <trace/events/writeback.h> 53#include <trace/events/writeback.h>
54 54
55#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
56
57/* 55/*
58 * We don't actually have pdflush, but this one is exported though /proc... 56 * We don't actually have pdflush, but this one is exported though /proc...
59 */ 57 */
@@ -71,6 +69,16 @@ int writeback_in_progress(struct backing_dev_info *bdi)
71 return test_bit(BDI_writeback_running, &bdi->state); 69 return test_bit(BDI_writeback_running, &bdi->state);
72} 70}
73 71
72static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
73{
74 struct super_block *sb = inode->i_sb;
75
76 if (strcmp(sb->s_type->name, "bdev") == 0)
77 return inode->i_mapping->backing_dev_info;
78
79 return sb->s_bdi;
80}
81
74static void bdi_queue_work(struct backing_dev_info *bdi, 82static void bdi_queue_work(struct backing_dev_info *bdi,
75 struct wb_writeback_work *work) 83 struct wb_writeback_work *work)
76{ 84{
@@ -808,7 +816,7 @@ int bdi_writeback_thread(void *data)
808 wb->last_active = jiffies; 816 wb->last_active = jiffies;
809 817
810 set_current_state(TASK_INTERRUPTIBLE); 818 set_current_state(TASK_INTERRUPTIBLE);
811 if (!list_empty(&bdi->work_list)) { 819 if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
812 __set_current_state(TASK_RUNNING); 820 __set_current_state(TASK_RUNNING);
813 continue; 821 continue;
814 } 822 }
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 69ad053ffd78..cde755cca564 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -276,7 +276,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
276 * Called with fc->lock, unlocks it 276 * Called with fc->lock, unlocks it
277 */ 277 */
278static void request_end(struct fuse_conn *fc, struct fuse_req *req) 278static void request_end(struct fuse_conn *fc, struct fuse_req *req)
279__releases(&fc->lock) 279__releases(fc->lock)
280{ 280{
281 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; 281 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
282 req->end = NULL; 282 req->end = NULL;
@@ -306,8 +306,8 @@ __releases(&fc->lock)
306 306
307static void wait_answer_interruptible(struct fuse_conn *fc, 307static void wait_answer_interruptible(struct fuse_conn *fc,
308 struct fuse_req *req) 308 struct fuse_req *req)
309__releases(&fc->lock) 309__releases(fc->lock)
310__acquires(&fc->lock) 310__acquires(fc->lock)
311{ 311{
312 if (signal_pending(current)) 312 if (signal_pending(current))
313 return; 313 return;
@@ -325,8 +325,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
325} 325}
326 326
327static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) 327static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
328__releases(&fc->lock) 328__releases(fc->lock)
329__acquires(&fc->lock) 329__acquires(fc->lock)
330{ 330{
331 if (!fc->no_interrupt) { 331 if (!fc->no_interrupt) {
332 /* Any signal may interrupt this */ 332 /* Any signal may interrupt this */
@@ -905,8 +905,8 @@ static int request_pending(struct fuse_conn *fc)
905 905
906/* Wait until a request is available on the pending list */ 906/* Wait until a request is available on the pending list */
907static void request_wait(struct fuse_conn *fc) 907static void request_wait(struct fuse_conn *fc)
908__releases(&fc->lock) 908__releases(fc->lock)
909__acquires(&fc->lock) 909__acquires(fc->lock)
910{ 910{
911 DECLARE_WAITQUEUE(wait, current); 911 DECLARE_WAITQUEUE(wait, current);
912 912
@@ -934,7 +934,7 @@ __acquires(&fc->lock)
934 */ 934 */
935static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, 935static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
936 size_t nbytes, struct fuse_req *req) 936 size_t nbytes, struct fuse_req *req)
937__releases(&fc->lock) 937__releases(fc->lock)
938{ 938{
939 struct fuse_in_header ih; 939 struct fuse_in_header ih;
940 struct fuse_interrupt_in arg; 940 struct fuse_interrupt_in arg;
@@ -1354,7 +1354,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
1354 loff_t file_size; 1354 loff_t file_size;
1355 unsigned int num; 1355 unsigned int num;
1356 unsigned int offset; 1356 unsigned int offset;
1357 size_t total_len; 1357 size_t total_len = 0;
1358 1358
1359 req = fuse_get_req(fc); 1359 req = fuse_get_req(fc);
1360 if (IS_ERR(req)) 1360 if (IS_ERR(req))
@@ -1720,8 +1720,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
1720 * This function releases and reacquires fc->lock 1720 * This function releases and reacquires fc->lock
1721 */ 1721 */
1722static void end_requests(struct fuse_conn *fc, struct list_head *head) 1722static void end_requests(struct fuse_conn *fc, struct list_head *head)
1723__releases(&fc->lock) 1723__releases(fc->lock)
1724__acquires(&fc->lock) 1724__acquires(fc->lock)
1725{ 1725{
1726 while (!list_empty(head)) { 1726 while (!list_empty(head)) {
1727 struct fuse_req *req; 1727 struct fuse_req *req;
@@ -1744,8 +1744,8 @@ __acquires(&fc->lock)
1744 * locked). 1744 * locked).
1745 */ 1745 */
1746static void end_io_requests(struct fuse_conn *fc) 1746static void end_io_requests(struct fuse_conn *fc)
1747__releases(&fc->lock) 1747__releases(fc->lock)
1748__acquires(&fc->lock) 1748__acquires(fc->lock)
1749{ 1749{
1750 while (!list_empty(&fc->io)) { 1750 while (!list_empty(&fc->io)) {
1751 struct fuse_req *req = 1751 struct fuse_req *req =
@@ -1769,6 +1769,16 @@ __acquires(&fc->lock)
1769 } 1769 }
1770} 1770}
1771 1771
1772static void end_queued_requests(struct fuse_conn *fc)
1773__releases(fc->lock)
1774__acquires(fc->lock)
1775{
1776 fc->max_background = UINT_MAX;
1777 flush_bg_queue(fc);
1778 end_requests(fc, &fc->pending);
1779 end_requests(fc, &fc->processing);
1780}
1781
1772/* 1782/*
1773 * Abort all requests. 1783 * Abort all requests.
1774 * 1784 *
@@ -1795,8 +1805,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
1795 fc->connected = 0; 1805 fc->connected = 0;
1796 fc->blocked = 0; 1806 fc->blocked = 0;
1797 end_io_requests(fc); 1807 end_io_requests(fc);
1798 end_requests(fc, &fc->pending); 1808 end_queued_requests(fc);
1799 end_requests(fc, &fc->processing);
1800 wake_up_all(&fc->waitq); 1809 wake_up_all(&fc->waitq);
1801 wake_up_all(&fc->blocked_waitq); 1810 wake_up_all(&fc->blocked_waitq);
1802 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 1811 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -1811,8 +1820,9 @@ int fuse_dev_release(struct inode *inode, struct file *file)
1811 if (fc) { 1820 if (fc) {
1812 spin_lock(&fc->lock); 1821 spin_lock(&fc->lock);
1813 fc->connected = 0; 1822 fc->connected = 0;
1814 end_requests(fc, &fc->pending); 1823 fc->blocked = 0;
1815 end_requests(fc, &fc->processing); 1824 end_queued_requests(fc);
1825 wake_up_all(&fc->blocked_waitq);
1816 spin_unlock(&fc->lock); 1826 spin_unlock(&fc->lock);
1817 fuse_conn_put(fc); 1827 fuse_conn_put(fc);
1818 } 1828 }
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 147c1f71bdb9..c8224587123f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1144,8 +1144,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1144 1144
1145/* Called under fc->lock, may release and reacquire it */ 1145/* Called under fc->lock, may release and reacquire it */
1146static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) 1146static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1147__releases(&fc->lock) 1147__releases(fc->lock)
1148__acquires(&fc->lock) 1148__acquires(fc->lock)
1149{ 1149{
1150 struct fuse_inode *fi = get_fuse_inode(req->inode); 1150 struct fuse_inode *fi = get_fuse_inode(req->inode);
1151 loff_t size = i_size_read(req->inode); 1151 loff_t size = i_size_read(req->inode);
@@ -1183,8 +1183,8 @@ __acquires(&fc->lock)
1183 * Called with fc->lock 1183 * Called with fc->lock
1184 */ 1184 */
1185void fuse_flush_writepages(struct inode *inode) 1185void fuse_flush_writepages(struct inode *inode)
1186__releases(&fc->lock) 1186__releases(fc->lock)
1187__acquires(&fc->lock) 1187__acquires(fc->lock)
1188{ 1188{
1189 struct fuse_conn *fc = get_fuse_conn(inode); 1189 struct fuse_conn *fc = get_fuse_conn(inode);
1190 struct fuse_inode *fi = get_fuse_inode(inode); 1190 struct fuse_inode *fi = get_fuse_inode(inode);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 9c65170e932e..eb01f3575e10 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -923,7 +923,7 @@ int gfs2_logd(void *data)
923 923
924 do { 924 do {
925 prepare_to_wait(&sdp->sd_logd_waitq, &wait, 925 prepare_to_wait(&sdp->sd_logd_waitq, &wait,
926 TASK_UNINTERRUPTIBLE); 926 TASK_INTERRUPTIBLE);
927 if (!gfs2_ail_flush_reqd(sdp) && 927 if (!gfs2_ail_flush_reqd(sdp) &&
928 !gfs2_jrnl_flush_reqd(sdp) && 928 !gfs2_jrnl_flush_reqd(sdp) &&
929 !kthread_should_stop()) 929 !kthread_should_stop())
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index e20ee85955d1..f3f3578393a4 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -115,7 +115,7 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
115 115
116 inode_inc_link_count(dir); 116 inode_inc_link_count(dir);
117 117
118 inode = minix_new_inode(dir, mode, &err); 118 inode = minix_new_inode(dir, S_IFDIR | mode, &err);
119 if (!inode) 119 if (!inode)
120 goto out_dir; 120 goto out_dir;
121 121
diff --git a/fs/namespace.c b/fs/namespace.c
index de402eb6eafb..a72eaabfe8f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1484,13 +1484,30 @@ out_unlock:
1484} 1484}
1485 1485
1486/* 1486/*
1487 * Sanity check the flags to change_mnt_propagation.
1488 */
1489
1490static int flags_to_propagation_type(int flags)
1491{
1492 int type = flags & ~MS_REC;
1493
1494 /* Fail if any non-propagation flags are set */
1495 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
1496 return 0;
1497 /* Only one propagation flag should be set */
1498 if (!is_power_of_2(type))
1499 return 0;
1500 return type;
1501}
1502
1503/*
1487 * recursively change the type of the mountpoint. 1504 * recursively change the type of the mountpoint.
1488 */ 1505 */
1489static int do_change_type(struct path *path, int flag) 1506static int do_change_type(struct path *path, int flag)
1490{ 1507{
1491 struct vfsmount *m, *mnt = path->mnt; 1508 struct vfsmount *m, *mnt = path->mnt;
1492 int recurse = flag & MS_REC; 1509 int recurse = flag & MS_REC;
1493 int type = flag & ~MS_REC; 1510 int type;
1494 int err = 0; 1511 int err = 0;
1495 1512
1496 if (!capable(CAP_SYS_ADMIN)) 1513 if (!capable(CAP_SYS_ADMIN))
@@ -1499,6 +1516,10 @@ static int do_change_type(struct path *path, int flag)
1499 if (path->dentry != path->mnt->mnt_root) 1516 if (path->dentry != path->mnt->mnt_root)
1500 return -EINVAL; 1517 return -EINVAL;
1501 1518
1519 type = flags_to_propagation_type(flag);
1520 if (!type)
1521 return -EINVAL;
1522
1502 down_write(&namespace_sem); 1523 down_write(&namespace_sem);
1503 if (type == MS_SHARED) { 1524 if (type == MS_SHARED) {
1504 err = invent_group_ids(mnt, recurse); 1525 err = invent_group_ids(mnt, recurse);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 6c2aad49d731..f7e13db613cb 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -63,6 +63,7 @@ config NFS_V3_ACL
63config NFS_V4 63config NFS_V4
64 bool "NFS client support for NFS version 4" 64 bool "NFS client support for NFS version 4"
65 depends on NFS_FS 65 depends on NFS_FS
66 select SUNRPC_GSS
66 help 67 help
67 This option enables support for version 4 of the NFS protocol 68 This option enables support for version 4 of the NFS protocol
68 (RFC 3530) in the kernel's NFS client. 69 (RFC 3530) in the kernel's NFS client.
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 4e7df2adb212..e7340729af89 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -275,7 +275,7 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
275 sin1->sin6_scope_id != sin2->sin6_scope_id) 275 sin1->sin6_scope_id != sin2->sin6_scope_id)
276 return 0; 276 return 0;
277 277
278 return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr); 278 return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
279} 279}
280#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ 280#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
281static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, 281static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index eb51bd6201da..05bf3c0dc751 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -723,10 +723,6 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
723 default: 723 default:
724 BUG(); 724 BUG();
725 } 725 }
726 if (res < 0)
727 dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager"
728 " - error %d!\n",
729 __func__, res);
730 return res; 726 return res;
731} 727}
732 728
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ec3966e4706b..f4cbf0c306c6 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -431,7 +431,15 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
431 goto out_err; 431 goto out_err;
432 432
433 error = server->nfs_client->rpc_ops->statfs(server, fh, &res); 433 error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
434 if (unlikely(error == -ESTALE)) {
435 struct dentry *pd_dentry;
434 436
437 pd_dentry = dget_parent(dentry);
438 if (pd_dentry != NULL) {
439 nfs_zap_caches(pd_dentry->d_inode);
440 dput(pd_dentry);
441 }
442 }
435 nfs_free_fattr(res.fattr); 443 nfs_free_fattr(res.fattr);
436 if (error < 0) 444 if (error < 0)
437 goto out_err; 445 goto out_err;
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 95932f523aef..4264377552e2 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -69,6 +69,7 @@ config NFSD_V4
69 depends on NFSD && PROC_FS && EXPERIMENTAL 69 depends on NFSD && PROC_FS && EXPERIMENTAL
70 select NFSD_V3 70 select NFSD_V3
71 select FS_POSIX_ACL 71 select FS_POSIX_ACL
72 select SUNRPC_GSS
72 help 73 help
73 This option enables support in your system's NFS server for 74 This option enables support in your system's NFS server for
74 version 4 of the NFS protocol (RFC 3530). 75 version 4 of the NFS protocol (RFC 3530).
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2e7357104cfd..cf0d2ffb3c84 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -440,7 +440,7 @@ test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
440 440
441static int nfs4_access_to_omode(u32 access) 441static int nfs4_access_to_omode(u32 access)
442{ 442{
443 switch (access) { 443 switch (access & NFS4_SHARE_ACCESS_BOTH) {
444 case NFS4_SHARE_ACCESS_READ: 444 case NFS4_SHARE_ACCESS_READ:
445 return O_RDONLY; 445 return O_RDONLY;
446 case NFS4_SHARE_ACCESS_WRITE: 446 case NFS4_SHARE_ACCESS_WRITE:
@@ -2450,14 +2450,13 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
2450static __be32 2450static __be32
2451nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) 2451nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
2452{ 2452{
2453 u32 op_share_access, new_access; 2453 u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
2454 bool new_access;
2454 __be32 status; 2455 __be32 status;
2455 2456
2456 set_access(&new_access, stp->st_access_bmap); 2457 new_access = !test_bit(op_share_access, &stp->st_access_bmap);
2457 new_access = (~new_access) & open->op_share_access & ~NFS4_SHARE_WANT_MASK;
2458
2459 if (new_access) { 2458 if (new_access) {
2460 status = nfs4_get_vfs_file(rqstp, fp, cur_fh, new_access); 2459 status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access);
2461 if (status) 2460 if (status)
2462 return status; 2461 return status;
2463 } 2462 }
@@ -2470,7 +2469,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
2470 return status; 2469 return status;
2471 } 2470 }
2472 /* remember the open */ 2471 /* remember the open */
2473 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
2474 __set_bit(op_share_access, &stp->st_access_bmap); 2472 __set_bit(op_share_access, &stp->st_access_bmap);
2475 __set_bit(open->op_share_deny, &stp->st_deny_bmap); 2473 __set_bit(open->op_share_deny, &stp->st_deny_bmap);
2476 2474
@@ -2983,7 +2981,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2983 *filpp = find_readable_file(stp->st_file); 2981 *filpp = find_readable_file(stp->st_file);
2984 else 2982 else
2985 *filpp = find_writeable_file(stp->st_file); 2983 *filpp = find_writeable_file(stp->st_file);
2986 BUG_ON(!*filpp); /* assured by check_openmode */
2987 } 2984 }
2988 } 2985 }
2989 status = nfs_ok; 2986 status = nfs_ok;
@@ -3561,7 +3558,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3561 struct nfs4_stateowner *open_sop = NULL; 3558 struct nfs4_stateowner *open_sop = NULL;
3562 struct nfs4_stateowner *lock_sop = NULL; 3559 struct nfs4_stateowner *lock_sop = NULL;
3563 struct nfs4_stateid *lock_stp; 3560 struct nfs4_stateid *lock_stp;
3564 struct file *filp; 3561 struct nfs4_file *fp;
3562 struct file *filp = NULL;
3565 struct file_lock file_lock; 3563 struct file_lock file_lock;
3566 struct file_lock conflock; 3564 struct file_lock conflock;
3567 __be32 status = 0; 3565 __be32 status = 0;
@@ -3591,7 +3589,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3591 * lock stateid. 3589 * lock stateid.
3592 */ 3590 */
3593 struct nfs4_stateid *open_stp = NULL; 3591 struct nfs4_stateid *open_stp = NULL;
3594 struct nfs4_file *fp;
3595 3592
3596 status = nfserr_stale_clientid; 3593 status = nfserr_stale_clientid;
3597 if (!nfsd4_has_session(cstate) && 3594 if (!nfsd4_has_session(cstate) &&
@@ -3634,6 +3631,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3634 if (status) 3631 if (status)
3635 goto out; 3632 goto out;
3636 lock_sop = lock->lk_replay_owner; 3633 lock_sop = lock->lk_replay_owner;
3634 fp = lock_stp->st_file;
3637 } 3635 }
3638 /* lock->lk_replay_owner and lock_stp have been created or found */ 3636 /* lock->lk_replay_owner and lock_stp have been created or found */
3639 3637
@@ -3648,13 +3646,19 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3648 switch (lock->lk_type) { 3646 switch (lock->lk_type) {
3649 case NFS4_READ_LT: 3647 case NFS4_READ_LT:
3650 case NFS4_READW_LT: 3648 case NFS4_READW_LT:
3651 filp = find_readable_file(lock_stp->st_file); 3649 if (find_readable_file(lock_stp->st_file)) {
3650 nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ);
3651 filp = find_readable_file(lock_stp->st_file);
3652 }
3652 file_lock.fl_type = F_RDLCK; 3653 file_lock.fl_type = F_RDLCK;
3653 cmd = F_SETLK; 3654 cmd = F_SETLK;
3654 break; 3655 break;
3655 case NFS4_WRITE_LT: 3656 case NFS4_WRITE_LT:
3656 case NFS4_WRITEW_LT: 3657 case NFS4_WRITEW_LT:
3657 filp = find_writeable_file(lock_stp->st_file); 3658 if (find_writeable_file(lock_stp->st_file)) {
3659 nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE);
3660 filp = find_writeable_file(lock_stp->st_file);
3661 }
3658 file_lock.fl_type = F_WRLCK; 3662 file_lock.fl_type = F_WRLCK;
3659 cmd = F_SETLK; 3663 cmd = F_SETLK;
3660 break; 3664 break;
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index cdfb8c6a4206..c16f8d8331b5 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -196,8 +196,6 @@ fh_lock(struct svc_fh *fhp)
196static inline void 196static inline void
197fh_unlock(struct svc_fh *fhp) 197fh_unlock(struct svc_fh *fhp)
198{ 198{
199 BUG_ON(!fhp->fh_dentry);
200
201 if (fhp->fh_locked) { 199 if (fhp->fh_locked) {
202 fill_post_wcc(fhp); 200 fill_post_wcc(fhp);
203 mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); 201 mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 7731a75971dd..322518c88e4b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -363,23 +363,23 @@ struct nfs4_file {
363 * at all? */ 363 * at all? */
364static inline struct file *find_writeable_file(struct nfs4_file *f) 364static inline struct file *find_writeable_file(struct nfs4_file *f)
365{ 365{
366 if (f->fi_fds[O_RDWR]) 366 if (f->fi_fds[O_WRONLY])
367 return f->fi_fds[O_RDWR]; 367 return f->fi_fds[O_WRONLY];
368 return f->fi_fds[O_WRONLY]; 368 return f->fi_fds[O_RDWR];
369} 369}
370 370
371static inline struct file *find_readable_file(struct nfs4_file *f) 371static inline struct file *find_readable_file(struct nfs4_file *f)
372{ 372{
373 if (f->fi_fds[O_RDWR]) 373 if (f->fi_fds[O_RDONLY])
374 return f->fi_fds[O_RDWR]; 374 return f->fi_fds[O_RDONLY];
375 return f->fi_fds[O_RDONLY]; 375 return f->fi_fds[O_RDWR];
376} 376}
377 377
378static inline struct file *find_any_file(struct nfs4_file *f) 378static inline struct file *find_any_file(struct nfs4_file *f)
379{ 379{
380 if (f->fi_fds[O_RDWR]) 380 if (f->fi_fds[O_RDWR])
381 return f->fi_fds[O_RDWR]; 381 return f->fi_fds[O_RDWR];
382 else if (f->fi_fds[O_RDWR]) 382 else if (f->fi_fds[O_WRONLY])
383 return f->fi_fds[O_WRONLY]; 383 return f->fi_fds[O_WRONLY];
384 else 384 else
385 return f->fi_fds[O_RDONLY]; 385 return f->fi_fds[O_RDONLY];
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 96360a83cb91..661a6cf8e826 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2033,15 +2033,17 @@ out:
2033__be32 2033__be32
2034nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) 2034nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
2035{ 2035{
2036 struct path path = {
2037 .mnt = fhp->fh_export->ex_path.mnt,
2038 .dentry = fhp->fh_dentry,
2039 };
2040 __be32 err; 2036 __be32 err;
2041 2037
2042 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); 2038 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
2043 if (!err && vfs_statfs(&path, stat)) 2039 if (!err) {
2044 err = nfserr_io; 2040 struct path path = {
2041 .mnt = fhp->fh_export->ex_path.mnt,
2042 .dentry = fhp->fh_dentry,
2043 };
2044 if (vfs_statfs(&path, stat))
2045 err = nfserr_io;
2046 }
2045 return err; 2047 return err;
2046} 2048}
2047 2049
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index d97310f07bef..d27715103376 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -446,6 +446,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
446 nilfs_mdt_destroy(nilfs->ns_cpfile); 446 nilfs_mdt_destroy(nilfs->ns_cpfile);
447 nilfs_mdt_destroy(nilfs->ns_sufile); 447 nilfs_mdt_destroy(nilfs->ns_sufile);
448 nilfs_mdt_destroy(nilfs->ns_dat); 448 nilfs_mdt_destroy(nilfs->ns_dat);
449 nilfs_mdt_destroy(nilfs->ns_gc_dat);
449 450
450 failed: 451 failed:
451 nilfs_clear_recovery_info(&ri); 452 nilfs_clear_recovery_info(&ri);
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 22c629eedd82..b388443c3a09 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,4 +3,4 @@ config FSNOTIFY
3 3
4source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
5source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
6source "fs/notify/fanotify/Kconfig" 6#source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 756566fe8449..85366c78cc37 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -165,9 +165,6 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
165 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, 165 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
166 inode_mark, vfsmnt_mark, event_mask, data, data_type); 166 inode_mark, vfsmnt_mark, event_mask, data, data_type);
167 167
168 pr_debug("%s: group=%p vfsmount_mark=%p inode_mark=%p mask=%x\n",
169 __func__, group, vfsmnt_mark, inode_mark, event_mask);
170
171 /* sorry, fanotify only gives a damn about files and dirs */ 168 /* sorry, fanotify only gives a damn about files and dirs */
172 if (!S_ISREG(to_tell->i_mode) && 169 if (!S_ISREG(to_tell->i_mode) &&
173 !S_ISDIR(to_tell->i_mode)) 170 !S_ISDIR(to_tell->i_mode))
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 032b837fcd11..5ed8e58d7bfc 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -195,6 +195,14 @@ static int prepare_for_access_response(struct fsnotify_group *group,
195 re->fd = fd; 195 re->fd = fd;
196 196
197 mutex_lock(&group->fanotify_data.access_mutex); 197 mutex_lock(&group->fanotify_data.access_mutex);
198
199 if (group->fanotify_data.bypass_perm) {
200 mutex_unlock(&group->fanotify_data.access_mutex);
201 kmem_cache_free(fanotify_response_event_cache, re);
202 event->response = FAN_ALLOW;
203 return 0;
204 }
205
198 list_add_tail(&re->list, &group->fanotify_data.access_list); 206 list_add_tail(&re->list, &group->fanotify_data.access_list);
199 mutex_unlock(&group->fanotify_data.access_mutex); 207 mutex_unlock(&group->fanotify_data.access_mutex);
200 208
@@ -364,9 +372,28 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
364static int fanotify_release(struct inode *ignored, struct file *file) 372static int fanotify_release(struct inode *ignored, struct file *file)
365{ 373{
366 struct fsnotify_group *group = file->private_data; 374 struct fsnotify_group *group = file->private_data;
375 struct fanotify_response_event *re, *lre;
367 376
368 pr_debug("%s: file=%p group=%p\n", __func__, file, group); 377 pr_debug("%s: file=%p group=%p\n", __func__, file, group);
369 378
379#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
380 mutex_lock(&group->fanotify_data.access_mutex);
381
382 group->fanotify_data.bypass_perm = true;
383
384 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
385 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
386 re, re->event);
387
388 list_del_init(&re->list);
389 re->event->response = FAN_ALLOW;
390
391 kmem_cache_free(fanotify_response_event_cache, re);
392 }
393 mutex_unlock(&group->fanotify_data.access_mutex);
394
395 wake_up(&group->fanotify_data.access_waitq);
396#endif
370 /* matches the fanotify_init->fsnotify_alloc_group */ 397 /* matches the fanotify_init->fsnotify_alloc_group */
371 fsnotify_put_group(group); 398 fsnotify_put_group(group);
372 399
@@ -614,7 +641,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
614 __func__, flags, event_f_flags); 641 __func__, flags, event_f_flags);
615 642
616 if (!capable(CAP_SYS_ADMIN)) 643 if (!capable(CAP_SYS_ADMIN))
617 return -EACCES; 644 return -EPERM;
618 645
619 if (flags & ~FAN_ALL_INIT_FLAGS) 646 if (flags & ~FAN_ALL_INIT_FLAGS)
620 return -EINVAL; 647 return -EINVAL;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 3970392b2722..36802420d69a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -148,13 +148,14 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
148 const unsigned char *file_name, 148 const unsigned char *file_name,
149 struct fsnotify_event **event) 149 struct fsnotify_event **event)
150{ 150{
151 struct fsnotify_group *group = inode_mark->group; 151 struct fsnotify_group *group = NULL;
152 __u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); 152 __u32 inode_test_mask = 0;
153 __u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); 153 __u32 vfsmount_test_mask = 0;
154 154
155 pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p" 155 if (unlikely(!inode_mark && !vfsmount_mark)) {
156 " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell, 156 BUG();
157 mnt, inode_mark, mask, data, data_is, cookie, *event); 157 return 0;
158 }
158 159
159 /* clear ignored on inode modification */ 160 /* clear ignored on inode modification */
160 if (mask & FS_MODIFY) { 161 if (mask & FS_MODIFY) {
@@ -168,18 +169,29 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
168 169
169 /* does the inode mark tell us to do something? */ 170 /* does the inode mark tell us to do something? */
170 if (inode_mark) { 171 if (inode_mark) {
172 group = inode_mark->group;
173 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
171 inode_test_mask &= inode_mark->mask; 174 inode_test_mask &= inode_mark->mask;
172 inode_test_mask &= ~inode_mark->ignored_mask; 175 inode_test_mask &= ~inode_mark->ignored_mask;
173 } 176 }
174 177
175 /* does the vfsmount_mark tell us to do something? */ 178 /* does the vfsmount_mark tell us to do something? */
176 if (vfsmount_mark) { 179 if (vfsmount_mark) {
180 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
181 group = vfsmount_mark->group;
177 vfsmount_test_mask &= vfsmount_mark->mask; 182 vfsmount_test_mask &= vfsmount_mark->mask;
178 vfsmount_test_mask &= ~vfsmount_mark->ignored_mask; 183 vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
179 if (inode_mark) 184 if (inode_mark)
180 vfsmount_test_mask &= ~inode_mark->ignored_mask; 185 vfsmount_test_mask &= ~inode_mark->ignored_mask;
181 } 186 }
182 187
188 pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
189 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
190 " data=%p data_is=%d cookie=%d event=%p\n",
191 __func__, group, to_tell, mnt, mask, inode_mark,
192 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
193 data_is, cookie, *event);
194
183 if (!inode_test_mask && !vfsmount_test_mask) 195 if (!inode_test_mask && !vfsmount_test_mask)
184 return 0; 196 return 0;
185 197
@@ -207,13 +219,12 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
207int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, 219int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
208 const unsigned char *file_name, u32 cookie) 220 const unsigned char *file_name, u32 cookie)
209{ 221{
210 struct hlist_node *inode_node, *vfsmount_node; 222 struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
211 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; 223 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
212 struct fsnotify_group *inode_group, *vfsmount_group; 224 struct fsnotify_group *inode_group, *vfsmount_group;
213 struct fsnotify_event *event = NULL; 225 struct fsnotify_event *event = NULL;
214 struct vfsmount *mnt; 226 struct vfsmount *mnt;
215 int idx, ret = 0; 227 int idx, ret = 0;
216 bool used_inode = false, used_vfsmount = false;
217 /* global tests shouldn't care about events on child only the specific event */ 228 /* global tests shouldn't care about events on child only the specific event */
218 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); 229 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
219 230
@@ -238,57 +249,50 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
238 (test_mask & to_tell->i_fsnotify_mask)) 249 (test_mask & to_tell->i_fsnotify_mask))
239 inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, 250 inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
240 &fsnotify_mark_srcu); 251 &fsnotify_mark_srcu);
241 else
242 inode_node = NULL;
243 252
244 if (mnt) { 253 if (mnt && ((mask & FS_MODIFY) ||
245 if ((mask & FS_MODIFY) || 254 (test_mask & mnt->mnt_fsnotify_mask))) {
246 (test_mask & mnt->mnt_fsnotify_mask)) 255 vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
247 vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, 256 &fsnotify_mark_srcu);
248 &fsnotify_mark_srcu); 257 inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
249 else 258 &fsnotify_mark_srcu);
250 vfsmount_node = NULL;
251 } else {
252 mnt = NULL;
253 vfsmount_node = NULL;
254 } 259 }
255 260
256 while (inode_node || vfsmount_node) { 261 while (inode_node || vfsmount_node) {
262 inode_group = vfsmount_group = NULL;
263
257 if (inode_node) { 264 if (inode_node) {
258 inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), 265 inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
259 struct fsnotify_mark, i.i_list); 266 struct fsnotify_mark, i.i_list);
260 inode_group = inode_mark->group; 267 inode_group = inode_mark->group;
261 } else 268 }
262 inode_group = (void *)-1;
263 269
264 if (vfsmount_node) { 270 if (vfsmount_node) {
265 vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), 271 vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
266 struct fsnotify_mark, m.m_list); 272 struct fsnotify_mark, m.m_list);
267 vfsmount_group = vfsmount_mark->group; 273 vfsmount_group = vfsmount_mark->group;
268 } else 274 }
269 vfsmount_group = (void *)-1;
270 275
271 if (inode_group < vfsmount_group) { 276 if (inode_group > vfsmount_group) {
272 /* handle inode */ 277 /* handle inode */
273 send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, 278 send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
274 data_is, cookie, file_name, &event); 279 data_is, cookie, file_name, &event);
275 used_inode = true; 280 /* we didn't use the vfsmount_mark */
276 } else if (vfsmount_group < inode_group) { 281 vfsmount_group = NULL;
282 } else if (vfsmount_group > inode_group) {
277 send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, 283 send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
278 data_is, cookie, file_name, &event); 284 data_is, cookie, file_name, &event);
279 used_vfsmount = true; 285 inode_group = NULL;
280 } else { 286 } else {
281 send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, 287 send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
282 mask, data, data_is, cookie, file_name, 288 mask, data, data_is, cookie, file_name,
283 &event); 289 &event);
284 used_vfsmount = true;
285 used_inode = true;
286 } 290 }
287 291
288 if (used_inode) 292 if (inode_group)
289 inode_node = srcu_dereference(inode_node->next, 293 inode_node = srcu_dereference(inode_node->next,
290 &fsnotify_mark_srcu); 294 &fsnotify_mark_srcu);
291 if (used_vfsmount) 295 if (vfsmount_group)
292 vfsmount_node = srcu_dereference(vfsmount_node->next, 296 vfsmount_node = srcu_dereference(vfsmount_node->next,
293 &fsnotify_mark_srcu); 297 &fsnotify_mark_srcu);
294 } 298 }
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index a76e0aa5cd3f..391915093fe1 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -209,7 +209,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
209 } 209 }
210 210
211 inode->i_mode = new_mode; 211 inode->i_mode = new_mode;
212 inode->i_ctime = CURRENT_TIME;
212 di->i_mode = cpu_to_le16(inode->i_mode); 213 di->i_mode = cpu_to_le16(inode->i_mode);
214 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
215 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
213 216
214 ocfs2_journal_dirty(handle, di_bh); 217 ocfs2_journal_dirty(handle, di_bh);
215 218
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 215e12ce1d85..592fae5007d1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6672,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
6672 last_page_bytes = PAGE_ALIGN(end); 6672 last_page_bytes = PAGE_ALIGN(end);
6673 index = start >> PAGE_CACHE_SHIFT; 6673 index = start >> PAGE_CACHE_SHIFT;
6674 do { 6674 do {
6675 pages[numpages] = grab_cache_page(mapping, index); 6675 pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
6676 if (!pages[numpages]) { 6676 if (!pages[numpages]) {
6677 ret = -ENOMEM; 6677 ret = -ENOMEM;
6678 mlog_errno(ret); 6678 mlog_errno(ret);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index ec6d12339593..c7ee03c22226 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
439 439
440 ocfs2_blockcheck_inc_failure(stats); 440 ocfs2_blockcheck_inc_failure(stats);
441 mlog(ML_ERROR, 441 mlog(ML_ERROR,
442 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", 442 "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
443 (unsigned int)check.bc_crc32e, (unsigned int)crc); 443 (unsigned int)check.bc_crc32e, (unsigned int)crc);
444 444
445 /* Ok, try ECC fixups */ 445 /* Ok, try ECC fixups */
@@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
453 goto out; 453 goto out;
454 } 454 }
455 455
456 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", 456 mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
457 (unsigned int)check.bc_crc32e, (unsigned int)crc); 457 (unsigned int)check.bc_crc32e, (unsigned int)crc);
458 458
459 rc = -EIO; 459 rc = -EIO;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1361997cf205..cbe2f057cc28 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -977,7 +977,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
977int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, 977int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
978 size_t caller_veclen, u8 target_node, int *status) 978 size_t caller_veclen, u8 target_node, int *status)
979{ 979{
980 int ret; 980 int ret = 0;
981 struct o2net_msg *msg = NULL; 981 struct o2net_msg *msg = NULL;
982 size_t veclen, caller_bytes = 0; 982 size_t veclen, caller_bytes = 0;
983 struct kvec *vec = NULL; 983 struct kvec *vec = NULL;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f04ebcfffc4a..c49f6de0e7ab 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3931,6 +3931,15 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3931 goto out_commit; 3931 goto out_commit;
3932 } 3932 }
3933 3933
3934 cpos = split_hash;
3935 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3936 data_ac, meta_ac, new_dx_leaves,
3937 num_dx_leaves);
3938 if (ret) {
3939 mlog_errno(ret);
3940 goto out_commit;
3941 }
3942
3934 for (i = 0; i < num_dx_leaves; i++) { 3943 for (i = 0; i < num_dx_leaves; i++) {
3935 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), 3944 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3936 orig_dx_leaves[i], 3945 orig_dx_leaves[i],
@@ -3939,15 +3948,14 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3939 mlog_errno(ret); 3948 mlog_errno(ret);
3940 goto out_commit; 3949 goto out_commit;
3941 } 3950 }
3942 }
3943 3951
3944 cpos = split_hash; 3952 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3945 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, 3953 new_dx_leaves[i],
3946 data_ac, meta_ac, new_dx_leaves, 3954 OCFS2_JOURNAL_ACCESS_WRITE);
3947 num_dx_leaves); 3955 if (ret) {
3948 if (ret) { 3956 mlog_errno(ret);
3949 mlog_errno(ret); 3957 goto out_commit;
3950 goto out_commit; 3958 }
3951 } 3959 }
3952 3960
3953 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, 3961 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4b6ae2c13b47..765298908f1d 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1030,6 +1030,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
1030 struct dlm_lock_resource *res); 1030 struct dlm_lock_resource *res);
1031void dlm_clean_master_list(struct dlm_ctxt *dlm, 1031void dlm_clean_master_list(struct dlm_ctxt *dlm,
1032 u8 dead_node); 1032 u8 dead_node);
1033void dlm_force_free_mles(struct dlm_ctxt *dlm);
1033int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); 1034int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
1034int __dlm_lockres_has_locks(struct dlm_lock_resource *res); 1035int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
1035int __dlm_lockres_unused(struct dlm_lock_resource *res); 1036int __dlm_lockres_unused(struct dlm_lock_resource *res);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 5efdd37dfe48..901ca52bf86b 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -636,8 +636,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
636 spin_lock(&dlm->track_lock); 636 spin_lock(&dlm->track_lock);
637 if (oldres) 637 if (oldres)
638 track_list = &oldres->tracking; 638 track_list = &oldres->tracking;
639 else 639 else {
640 track_list = &dlm->tracking_list; 640 track_list = &dlm->tracking_list;
641 if (list_empty(track_list)) {
642 dl = NULL;
643 spin_unlock(&dlm->track_lock);
644 goto bail;
645 }
646 }
641 647
642 list_for_each_entry(res, track_list, tracking) { 648 list_for_each_entry(res, track_list, tracking) {
643 if (&res->tracking == &dlm->tracking_list) 649 if (&res->tracking == &dlm->tracking_list)
@@ -660,6 +666,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
660 } else 666 } else
661 dl = NULL; 667 dl = NULL;
662 668
669bail:
663 /* passed to seq_show */ 670 /* passed to seq_show */
664 return dl; 671 return dl;
665} 672}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 153abb5abef0..11a5c87fd7f7 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -693,6 +693,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
693 693
694 dlm_mark_domain_leaving(dlm); 694 dlm_mark_domain_leaving(dlm);
695 dlm_leave_domain(dlm); 695 dlm_leave_domain(dlm);
696 dlm_force_free_mles(dlm);
696 dlm_complete_dlm_shutdown(dlm); 697 dlm_complete_dlm_shutdown(dlm);
697 } 698 }
698 dlm_put(dlm); 699 dlm_put(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ffb4c68dafa4..f564b0e5f80d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -3433,3 +3433,43 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
3433 wake_up(&res->wq); 3433 wake_up(&res->wq);
3434 wake_up(&dlm->migration_wq); 3434 wake_up(&dlm->migration_wq);
3435} 3435}
3436
3437void dlm_force_free_mles(struct dlm_ctxt *dlm)
3438{
3439 int i;
3440 struct hlist_head *bucket;
3441 struct dlm_master_list_entry *mle;
3442 struct hlist_node *tmp, *list;
3443
3444 /*
3445 * We notified all other nodes that we are exiting the domain and
3446 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
3447 * around we force free them and wake any processes that are waiting
3448 * on the mles
3449 */
3450 spin_lock(&dlm->spinlock);
3451 spin_lock(&dlm->master_lock);
3452
3453 BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
3454 BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
3455
3456 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3457 bucket = dlm_master_hash(dlm, i);
3458 hlist_for_each_safe(list, tmp, bucket) {
3459 mle = hlist_entry(list, struct dlm_master_list_entry,
3460 master_hash_node);
3461 if (mle->type != DLM_MLE_BLOCK) {
3462 mlog(ML_ERROR, "bad mle: %p\n", mle);
3463 dlm_print_one_mle(mle);
3464 }
3465 atomic_set(&mle->woken, 1);
3466 wake_up(&mle->wq);
3467
3468 __dlm_unlink_mle(dlm, mle);
3469 __dlm_mle_detach_hb_events(dlm, mle);
3470 __dlm_put_mle(mle);
3471 }
3472 }
3473 spin_unlock(&dlm->master_lock);
3474 spin_unlock(&dlm->spinlock);
3475}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index d1ce48e1b3d6..1d596d8c4a4a 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -84,6 +84,7 @@ enum {
84 OI_LS_PARENT, 84 OI_LS_PARENT,
85 OI_LS_RENAME1, 85 OI_LS_RENAME1,
86 OI_LS_RENAME2, 86 OI_LS_RENAME2,
87 OI_LS_REFLINK_TARGET,
87}; 88};
88 89
89int ocfs2_dlm_init(struct ocfs2_super *osb); 90int ocfs2_dlm_init(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 81296b4e3646..9a03c151b5ce 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -36,6 +36,7 @@
36#include <linux/writeback.h> 36#include <linux/writeback.h>
37#include <linux/falloc.h> 37#include <linux/falloc.h>
38#include <linux/quotaops.h> 38#include <linux/quotaops.h>
39#include <linux/blkdev.h>
39 40
40#define MLOG_MASK_PREFIX ML_INODE 41#define MLOG_MASK_PREFIX ML_INODE
41#include <cluster/masklog.h> 42#include <cluster/masklog.h>
@@ -190,8 +191,16 @@ static int ocfs2_sync_file(struct file *file, int datasync)
190 if (err) 191 if (err)
191 goto bail; 192 goto bail;
192 193
193 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 194 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
195 /*
196 * We still have to flush drive's caches to get data to the
197 * platter
198 */
199 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
200 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
201 NULL, BLKDEV_IFL_WAIT);
194 goto bail; 202 goto bail;
203 }
195 204
196 journal = osb->journal->j_journal; 205 journal = osb->journal->j_journal;
197 err = jbd2_journal_force_commit(journal); 206 err = jbd2_journal_force_commit(journal);
@@ -774,7 +783,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
774 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); 783 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
775 BUG_ON(abs_from & (inode->i_blkbits - 1)); 784 BUG_ON(abs_from & (inode->i_blkbits - 1));
776 785
777 page = grab_cache_page(mapping, index); 786 page = find_or_create_page(mapping, index, GFP_NOFS);
778 if (!page) { 787 if (!page) {
779 ret = -ENOMEM; 788 ret = -ENOMEM;
780 mlog_errno(ret); 789 mlog_errno(ret);
@@ -2329,7 +2338,7 @@ out_dio:
2329 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2338 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2330 2339
2331 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || 2340 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2332 ((file->f_flags & O_DIRECT) && has_refcount)) { 2341 ((file->f_flags & O_DIRECT) && !direct_io)) {
2333 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2342 ret = filemap_fdatawrite_range(file->f_mapping, pos,
2334 pos + count - 1); 2343 pos + count - 1);
2335 if (ret < 0) 2344 if (ret < 0)
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0492464916b1..eece3e05d9d0 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -488,7 +488,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
488 OCFS2_BH_IGNORE_CACHE); 488 OCFS2_BH_IGNORE_CACHE);
489 } else { 489 } else {
490 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); 490 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
491 if (!status) 491 /*
492 * If buffer is in jbd, then its checksum may not have been
493 * computed as yet.
494 */
495 if (!status && !buffer_jbd(bh))
492 status = ocfs2_validate_inode_block(osb->sb, bh); 496 status = ocfs2_validate_inode_block(osb->sb, bh);
493 } 497 }
494 if (status < 0) { 498 if (status < 0) {
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af2b8fe1f139..4c18f4ad93b4 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -74,9 +74,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
74 /* 74 /*
75 * Another node might have truncated while we were waiting on 75 * Another node might have truncated while we were waiting on
76 * cluster locks. 76 * cluster locks.
77 * We don't check size == 0 before the shift. This is borrowed
78 * from do_generic_file_read.
77 */ 79 */
78 last_index = size >> PAGE_CACHE_SHIFT; 80 last_index = (size - 1) >> PAGE_CACHE_SHIFT;
79 if (page->index > last_index) { 81 if (unlikely(!size || page->index > last_index)) {
80 ret = -EINVAL; 82 ret = -EINVAL;
81 goto out; 83 goto out;
82 } 84 }
@@ -107,7 +109,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
107 * because the "write" would invalidate their data. 109 * because the "write" would invalidate their data.
108 */ 110 */
109 if (page->index == last_index) 111 if (page->index == last_index)
110 len = size & ~PAGE_CACHE_MASK; 112 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
111 113
112 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, 114 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
113 &fsdata, di_bh, page); 115 &fsdata, di_bh, page);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f171b51a74f7..a00dda2e4f16 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -472,32 +472,23 @@ leave:
472 return status; 472 return status;
473} 473}
474 474
475static int ocfs2_mknod_locked(struct ocfs2_super *osb, 475static int __ocfs2_mknod_locked(struct inode *dir,
476 struct inode *dir, 476 struct inode *inode,
477 struct inode *inode, 477 dev_t dev,
478 dev_t dev, 478 struct buffer_head **new_fe_bh,
479 struct buffer_head **new_fe_bh, 479 struct buffer_head *parent_fe_bh,
480 struct buffer_head *parent_fe_bh, 480 handle_t *handle,
481 handle_t *handle, 481 struct ocfs2_alloc_context *inode_ac,
482 struct ocfs2_alloc_context *inode_ac) 482 u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
483{ 483{
484 int status = 0; 484 int status = 0;
485 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
485 struct ocfs2_dinode *fe = NULL; 486 struct ocfs2_dinode *fe = NULL;
486 struct ocfs2_extent_list *fel; 487 struct ocfs2_extent_list *fel;
487 u64 suballoc_loc, fe_blkno = 0;
488 u16 suballoc_bit;
489 u16 feat; 488 u16 feat;
490 489
491 *new_fe_bh = NULL; 490 *new_fe_bh = NULL;
492 491
493 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
494 inode_ac, &suballoc_loc,
495 &suballoc_bit, &fe_blkno);
496 if (status < 0) {
497 mlog_errno(status);
498 goto leave;
499 }
500
501 /* populate as many fields early on as possible - many of 492 /* populate as many fields early on as possible - many of
502 * these are used by the support functions here and in 493 * these are used by the support functions here and in
503 * callers. */ 494 * callers. */
@@ -591,6 +582,34 @@ leave:
591 return status; 582 return status;
592} 583}
593 584
585static int ocfs2_mknod_locked(struct ocfs2_super *osb,
586 struct inode *dir,
587 struct inode *inode,
588 dev_t dev,
589 struct buffer_head **new_fe_bh,
590 struct buffer_head *parent_fe_bh,
591 handle_t *handle,
592 struct ocfs2_alloc_context *inode_ac)
593{
594 int status = 0;
595 u64 suballoc_loc, fe_blkno = 0;
596 u16 suballoc_bit;
597
598 *new_fe_bh = NULL;
599
600 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
601 inode_ac, &suballoc_loc,
602 &suballoc_bit, &fe_blkno);
603 if (status < 0) {
604 mlog_errno(status);
605 return status;
606 }
607
608 return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
609 parent_fe_bh, handle, inode_ac,
610 fe_blkno, suballoc_loc, suballoc_bit);
611}
612
594static int ocfs2_mkdir(struct inode *dir, 613static int ocfs2_mkdir(struct inode *dir,
595 struct dentry *dentry, 614 struct dentry *dentry,
596 int mode) 615 int mode)
@@ -1852,61 +1871,117 @@ bail:
1852 return status; 1871 return status;
1853} 1872}
1854 1873
1855static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 1874static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
1856 struct inode **ret_orphan_dir, 1875 struct inode **ret_orphan_dir,
1857 u64 blkno, 1876 struct buffer_head **ret_orphan_dir_bh)
1858 char *name,
1859 struct ocfs2_dir_lookup_result *lookup)
1860{ 1877{
1861 struct inode *orphan_dir_inode; 1878 struct inode *orphan_dir_inode;
1862 struct buffer_head *orphan_dir_bh = NULL; 1879 struct buffer_head *orphan_dir_bh = NULL;
1863 int status = 0; 1880 int ret = 0;
1864
1865 status = ocfs2_blkno_stringify(blkno, name);
1866 if (status < 0) {
1867 mlog_errno(status);
1868 return status;
1869 }
1870 1881
1871 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 1882 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1872 ORPHAN_DIR_SYSTEM_INODE, 1883 ORPHAN_DIR_SYSTEM_INODE,
1873 osb->slot_num); 1884 osb->slot_num);
1874 if (!orphan_dir_inode) { 1885 if (!orphan_dir_inode) {
1875 status = -ENOENT; 1886 ret = -ENOENT;
1876 mlog_errno(status); 1887 mlog_errno(ret);
1877 return status; 1888 return ret;
1878 } 1889 }
1879 1890
1880 mutex_lock(&orphan_dir_inode->i_mutex); 1891 mutex_lock(&orphan_dir_inode->i_mutex);
1881 1892
1882 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); 1893 ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
1883 if (status < 0) { 1894 if (ret < 0) {
1884 mlog_errno(status); 1895 mutex_unlock(&orphan_dir_inode->i_mutex);
1885 goto leave; 1896 iput(orphan_dir_inode);
1897
1898 mlog_errno(ret);
1899 return ret;
1886 } 1900 }
1887 1901
1888 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 1902 *ret_orphan_dir = orphan_dir_inode;
1889 orphan_dir_bh, name, 1903 *ret_orphan_dir_bh = orphan_dir_bh;
1890 OCFS2_ORPHAN_NAMELEN, lookup);
1891 if (status < 0) {
1892 ocfs2_inode_unlock(orphan_dir_inode, 1);
1893 1904
1894 mlog_errno(status); 1905 return 0;
1895 goto leave; 1906}
1907
1908static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
1909 struct buffer_head *orphan_dir_bh,
1910 u64 blkno,
1911 char *name,
1912 struct ocfs2_dir_lookup_result *lookup)
1913{
1914 int ret;
1915 struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
1916
1917 ret = ocfs2_blkno_stringify(blkno, name);
1918 if (ret < 0) {
1919 mlog_errno(ret);
1920 return ret;
1921 }
1922
1923 ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
1924 orphan_dir_bh, name,
1925 OCFS2_ORPHAN_NAMELEN, lookup);
1926 if (ret < 0) {
1927 mlog_errno(ret);
1928 return ret;
1929 }
1930
1931 return 0;
1932}
1933
1934/**
1935 * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for
1936 * insertion of an orphan.
1937 * @osb: ocfs2 file system
1938 * @ret_orphan_dir: Orphan dir inode - returned locked!
1939 * @blkno: Actual block number of the inode to be inserted into orphan dir.
1940 * @lookup: dir lookup result, to be passed back into functions like
1941 * ocfs2_orphan_add
1942 *
1943 * Returns zero on success and the ret_orphan_dir, name and lookup
1944 * fields will be populated.
1945 *
1946 * Returns non-zero on failure.
1947 */
1948static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1949 struct inode **ret_orphan_dir,
1950 u64 blkno,
1951 char *name,
1952 struct ocfs2_dir_lookup_result *lookup)
1953{
1954 struct inode *orphan_dir_inode = NULL;
1955 struct buffer_head *orphan_dir_bh = NULL;
1956 int ret = 0;
1957
1958 ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode,
1959 &orphan_dir_bh);
1960 if (ret < 0) {
1961 mlog_errno(ret);
1962 return ret;
1963 }
1964
1965 ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
1966 blkno, name, lookup);
1967 if (ret < 0) {
1968 mlog_errno(ret);
1969 goto out;
1896 } 1970 }
1897 1971
1898 *ret_orphan_dir = orphan_dir_inode; 1972 *ret_orphan_dir = orphan_dir_inode;
1899 1973
1900leave: 1974out:
1901 if (status) { 1975 brelse(orphan_dir_bh);
1976
1977 if (ret) {
1978 ocfs2_inode_unlock(orphan_dir_inode, 1);
1902 mutex_unlock(&orphan_dir_inode->i_mutex); 1979 mutex_unlock(&orphan_dir_inode->i_mutex);
1903 iput(orphan_dir_inode); 1980 iput(orphan_dir_inode);
1904 } 1981 }
1905 1982
1906 brelse(orphan_dir_bh); 1983 mlog_exit(ret);
1907 1984 return ret;
1908 mlog_exit(status);
1909 return status;
1910} 1985}
1911 1986
1912static int ocfs2_orphan_add(struct ocfs2_super *osb, 1987static int ocfs2_orphan_add(struct ocfs2_super *osb,
@@ -2053,6 +2128,99 @@ leave:
2053 return status; 2128 return status;
2054} 2129}
2055 2130
2131/**
2132 * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to recieve a newly
2133 * allocated file. This is different from the typical 'add to orphan dir'
2134 * operation in that the inode does not yet exist. This is a problem because
2135 * the orphan dir stringifies the inode block number to come up with it's
2136 * dirent. Obviously if the inode does not yet exist we have a chicken and egg
2137 * problem. This function works around it by calling deeper into the orphan
2138 * and suballoc code than other callers. Use this only by necessity.
2139 * @dir: The directory which this inode will ultimately wind up under - not the
2140 * orphan dir!
2141 * @dir_bh: buffer_head the @dir inode block
2142 * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled
2143 * with the string to be used for orphan dirent. Pass back to the orphan dir
2144 * code.
2145 * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan
2146 * dir code.
2147 * @ret_di_blkno: block number where the new inode will be allocated.
2148 * @orphan_insert: Dir insert context to be passed back into orphan dir code.
2149 * @ret_inode_ac: Inode alloc context to be passed back to the allocator.
2150 *
2151 * Returns zero on success and the ret_orphan_dir, name and lookup
2152 * fields will be populated.
2153 *
2154 * Returns non-zero on failure.
2155 */
2156static int ocfs2_prep_new_orphaned_file(struct inode *dir,
2157 struct buffer_head *dir_bh,
2158 char *orphan_name,
2159 struct inode **ret_orphan_dir,
2160 u64 *ret_di_blkno,
2161 struct ocfs2_dir_lookup_result *orphan_insert,
2162 struct ocfs2_alloc_context **ret_inode_ac)
2163{
2164 int ret;
2165 u64 di_blkno;
2166 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2167 struct inode *orphan_dir = NULL;
2168 struct buffer_head *orphan_dir_bh = NULL;
2169 struct ocfs2_alloc_context *inode_ac = NULL;
2170
2171 ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh);
2172 if (ret < 0) {
2173 mlog_errno(ret);
2174 return ret;
2175 }
2176
2177 /* reserve an inode spot */
2178 ret = ocfs2_reserve_new_inode(osb, &inode_ac);
2179 if (ret < 0) {
2180 if (ret != -ENOSPC)
2181 mlog_errno(ret);
2182 goto out;
2183 }
2184
2185 ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac,
2186 &di_blkno);
2187 if (ret) {
2188 mlog_errno(ret);
2189 goto out;
2190 }
2191
2192 ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
2193 di_blkno, orphan_name, orphan_insert);
2194 if (ret < 0) {
2195 mlog_errno(ret);
2196 goto out;
2197 }
2198
2199out:
2200 if (ret == 0) {
2201 *ret_orphan_dir = orphan_dir;
2202 *ret_di_blkno = di_blkno;
2203 *ret_inode_ac = inode_ac;
2204 /*
2205 * orphan_name and orphan_insert are already up to
2206 * date via prepare_orphan_dir
2207 */
2208 } else {
2209 /* Unroll reserve_new_inode* */
2210 if (inode_ac)
2211 ocfs2_free_alloc_context(inode_ac);
2212
2213 /* Unroll orphan dir locking */
2214 mutex_unlock(&orphan_dir->i_mutex);
2215 ocfs2_inode_unlock(orphan_dir, 1);
2216 iput(orphan_dir);
2217 }
2218
2219 brelse(orphan_dir_bh);
2220
2221 return 0;
2222}
2223
2056int ocfs2_create_inode_in_orphan(struct inode *dir, 2224int ocfs2_create_inode_in_orphan(struct inode *dir,
2057 int mode, 2225 int mode,
2058 struct inode **new_inode) 2226 struct inode **new_inode)
@@ -2068,6 +2236,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2068 struct buffer_head *new_di_bh = NULL; 2236 struct buffer_head *new_di_bh = NULL;
2069 struct ocfs2_alloc_context *inode_ac = NULL; 2237 struct ocfs2_alloc_context *inode_ac = NULL;
2070 struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; 2238 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
2239 u64 uninitialized_var(di_blkno), suballoc_loc;
2240 u16 suballoc_bit;
2071 2241
2072 status = ocfs2_inode_lock(dir, &parent_di_bh, 1); 2242 status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
2073 if (status < 0) { 2243 if (status < 0) {
@@ -2076,20 +2246,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2076 return status; 2246 return status;
2077 } 2247 }
2078 2248
2079 /* 2249 status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh,
2080 * We give the orphan dir the root blkno to fake an orphan name, 2250 orphan_name, &orphan_dir,
2081 * and allocate enough space for our insertion. 2251 &di_blkno, &orphan_insert, &inode_ac);
2082 */
2083 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
2084 osb->root_blkno,
2085 orphan_name, &orphan_insert);
2086 if (status < 0) {
2087 mlog_errno(status);
2088 goto leave;
2089 }
2090
2091 /* reserve an inode spot */
2092 status = ocfs2_reserve_new_inode(osb, &inode_ac);
2093 if (status < 0) { 2252 if (status < 0) {
2094 if (status != -ENOSPC) 2253 if (status != -ENOSPC)
2095 mlog_errno(status); 2254 mlog_errno(status);
@@ -2116,17 +2275,20 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2116 goto leave; 2275 goto leave;
2117 did_quota_inode = 1; 2276 did_quota_inode = 1;
2118 2277
2119 inode->i_nlink = 0; 2278 status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
2120 /* do the real work now. */ 2279 &suballoc_loc,
2121 status = ocfs2_mknod_locked(osb, dir, inode, 2280 &suballoc_bit, di_blkno);
2122 0, &new_di_bh, parent_di_bh, handle,
2123 inode_ac);
2124 if (status < 0) { 2281 if (status < 0) {
2125 mlog_errno(status); 2282 mlog_errno(status);
2126 goto leave; 2283 goto leave;
2127 } 2284 }
2128 2285
2129 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name); 2286 inode->i_nlink = 0;
2287 /* do the real work now. */
2288 status = __ocfs2_mknod_locked(dir, inode,
2289 0, &new_di_bh, parent_di_bh, handle,
2290 inode_ac, di_blkno, suballoc_loc,
2291 suballoc_bit);
2130 if (status < 0) { 2292 if (status < 0) {
2131 mlog_errno(status); 2293 mlog_errno(status);
2132 goto leave; 2294 goto leave;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 33f1c9a8258d..fa31d05e41b7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -235,18 +235,31 @@
235#define OCFS2_HAS_REFCOUNT_FL (0x0010) 235#define OCFS2_HAS_REFCOUNT_FL (0x0010)
236 236
237/* Inode attributes, keep in sync with EXT2 */ 237/* Inode attributes, keep in sync with EXT2 */
238#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ 238#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */
239#define OCFS2_UNRM_FL (0x00000002) /* Undelete */ 239#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */
240#define OCFS2_COMPR_FL (0x00000004) /* Compress file */ 240#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */
241#define OCFS2_SYNC_FL (0x00000008) /* Synchronous updates */ 241#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */
242#define OCFS2_IMMUTABLE_FL (0x00000010) /* Immutable file */ 242#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */
243#define OCFS2_APPEND_FL (0x00000020) /* writes to file may only append */ 243#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */
244#define OCFS2_NODUMP_FL (0x00000040) /* do not dump file */ 244#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */
245#define OCFS2_NOATIME_FL (0x00000080) /* do not update atime */ 245#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */
246#define OCFS2_DIRSYNC_FL (0x00010000) /* dirsync behaviour (directories only) */ 246/* Reserved for compression usage... */
247 247#define OCFS2_DIRTY_FL FS_DIRTY_FL
248#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */ 248#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */
249#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ 249#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */
250#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */
251/* End compression flags --- maybe not all used */
252#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */
253#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */
254#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */
255#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */
256#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */
257#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */
258#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/
259#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */
260
261#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */
262#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */
250 263
251/* 264/*
252 * Extent record flags (e_node.leaf.flags) 265 * Extent record flags (e_node.leaf.flags)
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 2d3420af1a83..5d241505690b 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -23,10 +23,10 @@
23/* 23/*
24 * ioctl commands 24 * ioctl commands
25 */ 25 */
26#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) 26#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS
27#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long) 27#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS
28#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int) 28#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
29#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) 29#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
30 30
31/* 31/*
32 * Space reservation / allocation / free ioctls and argument structure 32 * Space reservation / allocation / free ioctls and argument structure
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 73a11ccfd4c2..efdd75607406 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2960,7 +2960,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2960 if (map_end & (PAGE_CACHE_SIZE - 1)) 2960 if (map_end & (PAGE_CACHE_SIZE - 1))
2961 to = map_end & (PAGE_CACHE_SIZE - 1); 2961 to = map_end & (PAGE_CACHE_SIZE - 1);
2962 2962
2963 page = grab_cache_page(mapping, page_index); 2963 page = find_or_create_page(mapping, page_index, GFP_NOFS);
2964 2964
2965 /* 2965 /*
2966 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page 2966 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
@@ -3179,7 +3179,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3179 if (map_end > end) 3179 if (map_end > end)
3180 map_end = end; 3180 map_end = end;
3181 3181
3182 page = grab_cache_page(context->inode->i_mapping, page_index); 3182 page = find_or_create_page(context->inode->i_mapping,
3183 page_index, GFP_NOFS);
3183 BUG_ON(!page); 3184 BUG_ON(!page);
3184 3185
3185 wait_on_page_writeback(page); 3186 wait_on_page_writeback(page);
@@ -4200,8 +4201,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
4200 goto out; 4201 goto out;
4201 } 4202 }
4202 4203
4203 mutex_lock(&new_inode->i_mutex); 4204 mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
4204 ret = ocfs2_inode_lock(new_inode, &new_bh, 1); 4205 ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
4206 OI_LS_REFLINK_TARGET);
4205 if (ret) { 4207 if (ret) {
4206 mlog_errno(ret); 4208 mlog_errno(ret);
4207 goto out_unlock; 4209 goto out_unlock;
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index d8b6e4259b80..3e78db361bc7 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -732,25 +732,23 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
732 struct ocfs2_alloc_reservation *resv, 732 struct ocfs2_alloc_reservation *resv,
733 int *cstart, int *clen) 733 int *cstart, int *clen)
734{ 734{
735 unsigned int wanted = *clen;
736
737 if (resv == NULL || ocfs2_resmap_disabled(resmap)) 735 if (resv == NULL || ocfs2_resmap_disabled(resmap))
738 return -ENOSPC; 736 return -ENOSPC;
739 737
740 spin_lock(&resv_lock); 738 spin_lock(&resv_lock);
741 739
742 /*
743 * We don't want to over-allocate for temporary
744 * windows. Otherwise, we run the risk of fragmenting the
745 * allocation space.
746 */
747 wanted = ocfs2_resv_window_bits(resmap, resv);
748 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
749 wanted = *clen;
750
751 if (ocfs2_resv_empty(resv)) { 740 if (ocfs2_resv_empty(resv)) {
752 mlog(0, "empty reservation, find new window\n"); 741 /*
742 * We don't want to over-allocate for temporary
743 * windows. Otherwise, we run the risk of fragmenting the
744 * allocation space.
745 */
746 unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
753 747
748 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
749 wanted = *clen;
750
751 mlog(0, "empty reservation, find new window\n");
754 /* 752 /*
755 * Try to get a window here. If it works, we must fall 753 * Try to get a window here. If it works, we must fall
756 * through and test the bitmap . This avoids some 754 * through and test the bitmap . This avoids some
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a8e6a95a353f..849c2f0e0a0e 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -57,11 +57,28 @@ struct ocfs2_suballoc_result {
57 u64 sr_bg_blkno; /* The bg we allocated from. Set 57 u64 sr_bg_blkno; /* The bg we allocated from. Set
58 to 0 when a block group is 58 to 0 when a block group is
59 contiguous. */ 59 contiguous. */
60 u64 sr_bg_stable_blkno; /*
61 * Doesn't change, always
62 * set to target block
63 * group descriptor
64 * block.
65 */
60 u64 sr_blkno; /* The first allocated block */ 66 u64 sr_blkno; /* The first allocated block */
61 unsigned int sr_bit_offset; /* The bit in the bg */ 67 unsigned int sr_bit_offset; /* The bit in the bg */
62 unsigned int sr_bits; /* How many bits we claimed */ 68 unsigned int sr_bits; /* How many bits we claimed */
63}; 69};
64 70
71static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
72{
73 if (res->sr_blkno == 0)
74 return 0;
75
76 if (res->sr_bg_blkno)
77 return res->sr_bg_blkno;
78
79 return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
80}
81
65static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 82static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
66static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 83static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
67static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 84static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -138,6 +155,10 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
138 brelse(ac->ac_bh); 155 brelse(ac->ac_bh);
139 ac->ac_bh = NULL; 156 ac->ac_bh = NULL;
140 ac->ac_resv = NULL; 157 ac->ac_resv = NULL;
158 if (ac->ac_find_loc_priv) {
159 kfree(ac->ac_find_loc_priv);
160 ac->ac_find_loc_priv = NULL;
161 }
141} 162}
142 163
143void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 164void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -336,7 +357,7 @@ out:
336static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, 357static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
337 struct ocfs2_group_desc *bg, 358 struct ocfs2_group_desc *bg,
338 struct ocfs2_chain_list *cl, 359 struct ocfs2_chain_list *cl,
339 u64 p_blkno, u32 clusters) 360 u64 p_blkno, unsigned int clusters)
340{ 361{
341 struct ocfs2_extent_list *el = &bg->bg_list; 362 struct ocfs2_extent_list *el = &bg->bg_list;
342 struct ocfs2_extent_rec *rec; 363 struct ocfs2_extent_rec *rec;
@@ -348,7 +369,7 @@ static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
348 rec->e_blkno = cpu_to_le64(p_blkno); 369 rec->e_blkno = cpu_to_le64(p_blkno);
349 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / 370 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
350 le16_to_cpu(cl->cl_bpc)); 371 le16_to_cpu(cl->cl_bpc));
351 rec->e_leaf_clusters = cpu_to_le32(clusters); 372 rec->e_leaf_clusters = cpu_to_le16(clusters);
352 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); 373 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
353 le16_add_cpu(&bg->bg_free_bits_count, 374 le16_add_cpu(&bg->bg_free_bits_count,
354 clusters * le16_to_cpu(cl->cl_bpc)); 375 clusters * le16_to_cpu(cl->cl_bpc));
@@ -1678,6 +1699,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1678 if (!ret) 1699 if (!ret)
1679 ocfs2_bg_discontig_fix_result(ac, gd, res); 1700 ocfs2_bg_discontig_fix_result(ac, gd, res);
1680 1701
1702 /*
1703 * sr_bg_blkno might have been changed by
1704 * ocfs2_bg_discontig_fix_result
1705 */
1706 res->sr_bg_stable_blkno = group_bh->b_blocknr;
1707
1708 if (ac->ac_find_loc_only)
1709 goto out_loc_only;
1710
1681 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1711 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1682 res->sr_bits, 1712 res->sr_bits,
1683 le16_to_cpu(gd->bg_chain)); 1713 le16_to_cpu(gd->bg_chain));
@@ -1691,6 +1721,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1691 if (ret < 0) 1721 if (ret < 0)
1692 mlog_errno(ret); 1722 mlog_errno(ret);
1693 1723
1724out_loc_only:
1694 *bits_left = le16_to_cpu(gd->bg_free_bits_count); 1725 *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1695 1726
1696out: 1727out:
@@ -1708,7 +1739,6 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1708{ 1739{
1709 int status; 1740 int status;
1710 u16 chain; 1741 u16 chain;
1711 u32 tmp_used;
1712 u64 next_group; 1742 u64 next_group;
1713 struct inode *alloc_inode = ac->ac_inode; 1743 struct inode *alloc_inode = ac->ac_inode;
1714 struct buffer_head *group_bh = NULL; 1744 struct buffer_head *group_bh = NULL;
@@ -1770,6 +1800,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1770 if (!status) 1800 if (!status)
1771 ocfs2_bg_discontig_fix_result(ac, bg, res); 1801 ocfs2_bg_discontig_fix_result(ac, bg, res);
1772 1802
1803 /*
1804 * sr_bg_blkno might have been changed by
1805 * ocfs2_bg_discontig_fix_result
1806 */
1807 res->sr_bg_stable_blkno = group_bh->b_blocknr;
1773 1808
1774 /* 1809 /*
1775 * Keep track of previous block descriptor read. When 1810 * Keep track of previous block descriptor read. When
@@ -1796,22 +1831,17 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1796 } 1831 }
1797 } 1832 }
1798 1833
1799 /* Ok, claim our bits now: set the info on dinode, chainlist 1834 if (ac->ac_find_loc_only)
1800 * and then the group */ 1835 goto out_loc_only;
1801 status = ocfs2_journal_access_di(handle, 1836
1802 INODE_CACHE(alloc_inode), 1837 status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1803 ac->ac_bh, 1838 ac->ac_bh, res->sr_bits,
1804 OCFS2_JOURNAL_ACCESS_WRITE); 1839 chain);
1805 if (status < 0) { 1840 if (status) {
1806 mlog_errno(status); 1841 mlog_errno(status);
1807 goto bail; 1842 goto bail;
1808 } 1843 }
1809 1844
1810 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1811 fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
1812 le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
1813 ocfs2_journal_dirty(handle, ac->ac_bh);
1814
1815 status = ocfs2_block_group_set_bits(handle, 1845 status = ocfs2_block_group_set_bits(handle,
1816 alloc_inode, 1846 alloc_inode,
1817 bg, 1847 bg,
@@ -1826,6 +1856,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1826 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, 1856 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1827 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1857 (unsigned long long)le64_to_cpu(fe->i_blkno));
1828 1858
1859out_loc_only:
1829 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1860 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1830bail: 1861bail:
1831 brelse(group_bh); 1862 brelse(group_bh);
@@ -1845,6 +1876,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1845 int status; 1876 int status;
1846 u16 victim, i; 1877 u16 victim, i;
1847 u16 bits_left = 0; 1878 u16 bits_left = 0;
1879 u64 hint = ac->ac_last_group;
1848 struct ocfs2_chain_list *cl; 1880 struct ocfs2_chain_list *cl;
1849 struct ocfs2_dinode *fe; 1881 struct ocfs2_dinode *fe;
1850 1882
@@ -1872,7 +1904,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1872 goto bail; 1904 goto bail;
1873 } 1905 }
1874 1906
1875 res->sr_bg_blkno = ac->ac_last_group; 1907 res->sr_bg_blkno = hint;
1876 if (res->sr_bg_blkno) { 1908 if (res->sr_bg_blkno) {
1877 /* Attempt to short-circuit the usual search mechanism 1909 /* Attempt to short-circuit the usual search mechanism
1878 * by jumping straight to the most recently used 1910 * by jumping straight to the most recently used
@@ -1896,8 +1928,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1896 1928
1897 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1929 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1898 res, &bits_left); 1930 res, &bits_left);
1899 if (!status) 1931 if (!status) {
1932 hint = ocfs2_group_from_res(res);
1900 goto set_hint; 1933 goto set_hint;
1934 }
1901 if (status < 0 && status != -ENOSPC) { 1935 if (status < 0 && status != -ENOSPC) {
1902 mlog_errno(status); 1936 mlog_errno(status);
1903 goto bail; 1937 goto bail;
@@ -1920,8 +1954,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1920 ac->ac_chain = i; 1954 ac->ac_chain = i;
1921 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1955 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1922 res, &bits_left); 1956 res, &bits_left);
1923 if (!status) 1957 if (!status) {
1958 hint = ocfs2_group_from_res(res);
1924 break; 1959 break;
1960 }
1925 if (status < 0 && status != -ENOSPC) { 1961 if (status < 0 && status != -ENOSPC) {
1926 mlog_errno(status); 1962 mlog_errno(status);
1927 goto bail; 1963 goto bail;
@@ -1936,7 +1972,7 @@ set_hint:
1936 if (bits_left < min_bits) 1972 if (bits_left < min_bits)
1937 ac->ac_last_group = 0; 1973 ac->ac_last_group = 0;
1938 else 1974 else
1939 ac->ac_last_group = res->sr_bg_blkno; 1975 ac->ac_last_group = hint;
1940 } 1976 }
1941 1977
1942bail: 1978bail:
@@ -2016,6 +2052,136 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2016 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2052 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2017} 2053}
2018 2054
2055int ocfs2_find_new_inode_loc(struct inode *dir,
2056 struct buffer_head *parent_fe_bh,
2057 struct ocfs2_alloc_context *ac,
2058 u64 *fe_blkno)
2059{
2060 int ret;
2061 handle_t *handle = NULL;
2062 struct ocfs2_suballoc_result *res;
2063
2064 BUG_ON(!ac);
2065 BUG_ON(ac->ac_bits_given != 0);
2066 BUG_ON(ac->ac_bits_wanted != 1);
2067 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2068
2069 res = kzalloc(sizeof(*res), GFP_NOFS);
2070 if (res == NULL) {
2071 ret = -ENOMEM;
2072 mlog_errno(ret);
2073 goto out;
2074 }
2075
2076 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2077
2078 /*
2079 * The handle started here is for chain relink. Alternatively,
2080 * we could just disable relink for these calls.
2081 */
2082 handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2083 if (IS_ERR(handle)) {
2084 ret = PTR_ERR(handle);
2085 handle = NULL;
2086 mlog_errno(ret);
2087 goto out;
2088 }
2089
2090 /*
2091 * This will instruct ocfs2_claim_suballoc_bits and
2092 * ocfs2_search_one_group to search but save actual allocation
2093 * for later.
2094 */
2095 ac->ac_find_loc_only = 1;
2096
2097 ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2098 if (ret < 0) {
2099 mlog_errno(ret);
2100 goto out;
2101 }
2102
2103 ac->ac_find_loc_priv = res;
2104 *fe_blkno = res->sr_blkno;
2105
2106out:
2107 if (handle)
2108 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2109
2110 if (ret)
2111 kfree(res);
2112
2113 return ret;
2114}
2115
2116int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2117 struct inode *dir,
2118 struct ocfs2_alloc_context *ac,
2119 u64 *suballoc_loc,
2120 u16 *suballoc_bit,
2121 u64 di_blkno)
2122{
2123 int ret;
2124 u16 chain;
2125 struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2126 struct buffer_head *bg_bh = NULL;
2127 struct ocfs2_group_desc *bg;
2128 struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2129
2130 /*
2131 * Since di_blkno is being passed back in, we check for any
2132 * inconsistencies which may have happened between
2133 * calls. These are code bugs as di_blkno is not expected to
2134 * change once returned from ocfs2_find_new_inode_loc()
2135 */
2136 BUG_ON(res->sr_blkno != di_blkno);
2137
2138 ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2139 res->sr_bg_stable_blkno, &bg_bh);
2140 if (ret) {
2141 mlog_errno(ret);
2142 goto out;
2143 }
2144
2145 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2146 chain = le16_to_cpu(bg->bg_chain);
2147
2148 ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2149 ac->ac_bh, res->sr_bits,
2150 chain);
2151 if (ret) {
2152 mlog_errno(ret);
2153 goto out;
2154 }
2155
2156 ret = ocfs2_block_group_set_bits(handle,
2157 ac->ac_inode,
2158 bg,
2159 bg_bh,
2160 res->sr_bit_offset,
2161 res->sr_bits);
2162 if (ret < 0) {
2163 mlog_errno(ret);
2164 goto out;
2165 }
2166
2167 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
2168 (unsigned long long)di_blkno);
2169
2170 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2171
2172 BUG_ON(res->sr_bits != 1);
2173
2174 *suballoc_loc = res->sr_bg_blkno;
2175 *suballoc_bit = res->sr_bit_offset;
2176 ac->ac_bits_given++;
2177 ocfs2_save_inode_ac_group(dir, ac);
2178
2179out:
2180 brelse(bg_bh);
2181
2182 return ret;
2183}
2184
2019int ocfs2_claim_new_inode(handle_t *handle, 2185int ocfs2_claim_new_inode(handle_t *handle,
2020 struct inode *dir, 2186 struct inode *dir,
2021 struct buffer_head *parent_fe_bh, 2187 struct buffer_head *parent_fe_bh,
@@ -2567,7 +2733,8 @@ out:
2567 * suballoc_bit. 2733 * suballoc_bit.
2568 */ 2734 */
2569static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, 2735static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2570 u16 *suballoc_slot, u16 *suballoc_bit) 2736 u16 *suballoc_slot, u64 *group_blkno,
2737 u16 *suballoc_bit)
2571{ 2738{
2572 int status; 2739 int status;
2573 struct buffer_head *inode_bh = NULL; 2740 struct buffer_head *inode_bh = NULL;
@@ -2604,6 +2771,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2604 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); 2771 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2605 if (suballoc_bit) 2772 if (suballoc_bit)
2606 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); 2773 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2774 if (group_blkno)
2775 *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2607 2776
2608bail: 2777bail:
2609 brelse(inode_bh); 2778 brelse(inode_bh);
@@ -2621,7 +2790,8 @@ bail:
2621 */ 2790 */
2622static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, 2791static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2623 struct inode *suballoc, 2792 struct inode *suballoc,
2624 struct buffer_head *alloc_bh, u64 blkno, 2793 struct buffer_head *alloc_bh,
2794 u64 group_blkno, u64 blkno,
2625 u16 bit, int *res) 2795 u16 bit, int *res)
2626{ 2796{
2627 struct ocfs2_dinode *alloc_di; 2797 struct ocfs2_dinode *alloc_di;
@@ -2642,10 +2812,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2642 goto bail; 2812 goto bail;
2643 } 2813 }
2644 2814
2645 if (alloc_di->i_suballoc_loc) 2815 bg_blkno = group_blkno ? group_blkno :
2646 bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc); 2816 ocfs2_which_suballoc_group(blkno, bit);
2647 else
2648 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2649 status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, 2817 status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2650 &group_bh); 2818 &group_bh);
2651 if (status < 0) { 2819 if (status < 0) {
@@ -2680,6 +2848,7 @@ bail:
2680int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) 2848int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2681{ 2849{
2682 int status; 2850 int status;
2851 u64 group_blkno = 0;
2683 u16 suballoc_bit = 0, suballoc_slot = 0; 2852 u16 suballoc_bit = 0, suballoc_slot = 0;
2684 struct inode *inode_alloc_inode; 2853 struct inode *inode_alloc_inode;
2685 struct buffer_head *alloc_bh = NULL; 2854 struct buffer_head *alloc_bh = NULL;
@@ -2687,7 +2856,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2687 mlog_entry("blkno: %llu", (unsigned long long)blkno); 2856 mlog_entry("blkno: %llu", (unsigned long long)blkno);
2688 2857
2689 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, 2858 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2690 &suballoc_bit); 2859 &group_blkno, &suballoc_bit);
2691 if (status < 0) { 2860 if (status < 0) {
2692 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); 2861 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2693 goto bail; 2862 goto bail;
@@ -2715,7 +2884,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2715 } 2884 }
2716 2885
2717 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, 2886 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2718 blkno, suballoc_bit, res); 2887 group_blkno, blkno, suballoc_bit, res);
2719 if (status < 0) 2888 if (status < 0)
2720 mlog(ML_ERROR, "test suballoc bit failed %d\n", status); 2889 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2721 2890
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a017dd3ee7d9..b8afabfeede4 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,9 @@ struct ocfs2_alloc_context {
56 u64 ac_max_block; /* Highest block number to allocate. 0 is 56 u64 ac_max_block; /* Highest block number to allocate. 0 is
57 is the same as ~0 - unlimited */ 57 is the same as ~0 - unlimited */
58 58
59 int ac_find_loc_only; /* hack for reflink operation ordering */
60 struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
61
59 struct ocfs2_alloc_reservation *ac_resv; 62 struct ocfs2_alloc_reservation *ac_resv;
60}; 63};
61 64
@@ -197,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
197 struct ocfs2_alloc_context **meta_ac); 200 struct ocfs2_alloc_context **meta_ac);
198 201
199int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); 202int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
203
204
205
206/*
207 * The following two interfaces are for ocfs2_create_inode_in_orphan().
208 */
209int ocfs2_find_new_inode_loc(struct inode *dir,
210 struct buffer_head *parent_fe_bh,
211 struct ocfs2_alloc_context *ac,
212 u64 *fe_blkno);
213
214int ocfs2_claim_new_inode_at_loc(handle_t *handle,
215 struct inode *dir,
216 struct ocfs2_alloc_context *ac,
217 u64 *suballoc_loc,
218 u16 *suballoc_bit,
219 u64 di_blkno);
220
200#endif /* _CHAINALLOC_H_ */ 221#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 32499d213fc4..9975457c981f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
128 } 128 }
129 129
130 /* Fast symlinks can't be large */ 130 /* Fast symlinks can't be large */
131 len = strlen(target); 131 len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
132 link = kzalloc(len + 1, GFP_NOFS); 132 link = kzalloc(len + 1, GFP_NOFS);
133 if (!link) { 133 if (!link) {
134 status = -ENOMEM; 134 status = -ENOMEM;
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d03469f61801..06fa5e77c40e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1286,13 +1286,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
1286 xis.inode_bh = xbs.inode_bh = di_bh; 1286 xis.inode_bh = xbs.inode_bh = di_bh;
1287 di = (struct ocfs2_dinode *)di_bh->b_data; 1287 di = (struct ocfs2_dinode *)di_bh->b_data;
1288 1288
1289 down_read(&oi->ip_xattr_sem);
1290 ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, 1289 ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
1291 buffer_size, &xis); 1290 buffer_size, &xis);
1292 if (ret == -ENODATA && di->i_xattr_loc) 1291 if (ret == -ENODATA && di->i_xattr_loc)
1293 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, 1292 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
1294 buffer_size, &xbs); 1293 buffer_size, &xbs);
1295 up_read(&oi->ip_xattr_sem);
1296 1294
1297 return ret; 1295 return ret;
1298} 1296}
@@ -1316,8 +1314,10 @@ static int ocfs2_xattr_get(struct inode *inode,
1316 mlog_errno(ret); 1314 mlog_errno(ret);
1317 return ret; 1315 return ret;
1318 } 1316 }
1317 down_read(&OCFS2_I(inode)->ip_xattr_sem);
1319 ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, 1318 ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
1320 name, buffer, buffer_size); 1319 name, buffer, buffer_size);
1320 up_read(&OCFS2_I(inode)->ip_xattr_sem);
1321 1321
1322 ocfs2_inode_unlock(inode, 0); 1322 ocfs2_inode_unlock(inode, 0);
1323 1323
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a1c43e7c8a7b..8e4addaa5424 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2675,7 +2675,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2675 INF("auxv", S_IRUSR, proc_pid_auxv), 2675 INF("auxv", S_IRUSR, proc_pid_auxv),
2676 ONE("status", S_IRUGO, proc_pid_status), 2676 ONE("status", S_IRUGO, proc_pid_status),
2677 ONE("personality", S_IRUSR, proc_pid_personality), 2677 ONE("personality", S_IRUSR, proc_pid_personality),
2678 INF("limits", S_IRUSR, proc_pid_limits), 2678 INF("limits", S_IRUGO, proc_pid_limits),
2679#ifdef CONFIG_SCHED_DEBUG 2679#ifdef CONFIG_SCHED_DEBUG
2680 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2680 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2681#endif 2681#endif
@@ -3011,7 +3011,7 @@ static const struct pid_entry tid_base_stuff[] = {
3011 INF("auxv", S_IRUSR, proc_pid_auxv), 3011 INF("auxv", S_IRUSR, proc_pid_auxv),
3012 ONE("status", S_IRUGO, proc_pid_status), 3012 ONE("status", S_IRUGO, proc_pid_status),
3013 ONE("personality", S_IRUSR, proc_pid_personality), 3013 ONE("personality", S_IRUSR, proc_pid_personality),
3014 INF("limits", S_IRUSR, proc_pid_limits), 3014 INF("limits", S_IRUGO, proc_pid_limits),
3015#ifdef CONFIG_SCHED_DEBUG 3015#ifdef CONFIG_SCHED_DEBUG
3016 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 3016 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
3017#endif 3017#endif
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 180cf5a0bd67..3b8b45660331 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -146,7 +146,7 @@ u64 stable_page_flags(struct page *page)
146 u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); 146 u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
147#endif 147#endif
148 148
149#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR 149#ifdef CONFIG_ARCH_USES_PG_UNCACHED
150 u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); 150 u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
151#endif 151#endif
152 152
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 439fc1f1c1c4..1dbca4e8cc16 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -224,7 +224,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
224 /* We don't show the stack guard page in /proc/maps */ 224 /* We don't show the stack guard page in /proc/maps */
225 start = vma->vm_start; 225 start = vma->vm_start;
226 if (vma->vm_flags & VM_GROWSDOWN) 226 if (vma->vm_flags & VM_GROWSDOWN)
227 start += PAGE_SIZE; 227 if (!vma_stack_continue(vma->vm_prev, vma->vm_start))
228 start += PAGE_SIZE;
228 229
229 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", 230 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
230 start, 231 start,
@@ -362,13 +363,13 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
362 mss->referenced += PAGE_SIZE; 363 mss->referenced += PAGE_SIZE;
363 mapcount = page_mapcount(page); 364 mapcount = page_mapcount(page);
364 if (mapcount >= 2) { 365 if (mapcount >= 2) {
365 if (pte_dirty(ptent)) 366 if (pte_dirty(ptent) || PageDirty(page))
366 mss->shared_dirty += PAGE_SIZE; 367 mss->shared_dirty += PAGE_SIZE;
367 else 368 else
368 mss->shared_clean += PAGE_SIZE; 369 mss->shared_clean += PAGE_SIZE;
369 mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; 370 mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
370 } else { 371 } else {
371 if (pte_dirty(ptent)) 372 if (pte_dirty(ptent) || PageDirty(page))
372 mss->private_dirty += PAGE_SIZE; 373 mss->private_dirty += PAGE_SIZE;
373 else 374 else
374 mss->private_clean += PAGE_SIZE; 375 mss->private_clean += PAGE_SIZE;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 91c817ff02c3..2367fb3f70bc 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -163,7 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
163 163
164static const struct file_operations proc_vmcore_operations = { 164static const struct file_operations proc_vmcore_operations = {
165 .read = read_vmcore, 165 .read = read_vmcore,
166 .llseek = generic_file_llseek, 166 .llseek = default_llseek,
167}; 167};
168 168
169static struct vmcore* __init get_new_element(void) 169static struct vmcore* __init get_new_element(void)
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index f53505de0712..5cbb81e134ac 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -170,6 +170,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
170int reiserfs_unpack(struct inode *inode, struct file *filp) 170int reiserfs_unpack(struct inode *inode, struct file *filp)
171{ 171{
172 int retval = 0; 172 int retval = 0;
173 int depth;
173 int index; 174 int index;
174 struct page *page; 175 struct page *page;
175 struct address_space *mapping; 176 struct address_space *mapping;
@@ -188,8 +189,8 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
188 /* we need to make sure nobody is changing the file size beneath 189 /* we need to make sure nobody is changing the file size beneath
189 ** us 190 ** us
190 */ 191 */
191 mutex_lock(&inode->i_mutex); 192 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
192 reiserfs_write_lock(inode->i_sb); 193 depth = reiserfs_write_lock_once(inode->i_sb);
193 194
194 write_from = inode->i_size & (blocksize - 1); 195 write_from = inode->i_size & (blocksize - 1);
195 /* if we are on a block boundary, we are already unpacked. */ 196 /* if we are on a block boundary, we are already unpacked. */
@@ -224,6 +225,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
224 225
225 out: 226 out:
226 mutex_unlock(&inode->i_mutex); 227 mutex_unlock(&inode->i_mutex);
227 reiserfs_write_unlock(inode->i_sb); 228 reiserfs_write_unlock_once(inode->i_sb, depth);
228 return retval; 229 return retval;
229} 230}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1b27b5688f62..da3fefe91a8f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
340 char *p; 340 char *p;
341 341
342 p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); 342 p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
343 if (p) 343 if (!IS_ERR(p))
344 memmove(last_sysfs_file, p, strlen(p) + 1); 344 memmove(last_sysfs_file, p, strlen(p) + 1);
345 345
346 /* need attr_sd for attr and ops, its parent for kobj */ 346 /* need attr_sd for attr and ops, its parent for kobj */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 15412fe15c3a..b552f816de15 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -852,8 +852,8 @@ xfs_convert_page(
852 SetPageUptodate(page); 852 SetPageUptodate(page);
853 853
854 if (count) { 854 if (count) {
855 wbc->nr_to_write--; 855 if (--wbc->nr_to_write <= 0 &&
856 if (wbc->nr_to_write <= 0) 856 wbc->sync_mode == WB_SYNC_NONE)
857 done = 1; 857 done = 1;
858 } 858 }
859 xfs_start_page_writeback(page, !page_dirty, count); 859 xfs_start_page_writeback(page, !page_dirty, count);
@@ -1068,7 +1068,7 @@ xfs_vm_writepage(
1068 * by themselves. 1068 * by themselves.
1069 */ 1069 */
1070 if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) 1070 if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
1071 goto out_fail; 1071 goto redirty;
1072 1072
1073 /* 1073 /*
1074 * We need a transaction if there are delalloc or unwritten buffers 1074 * We need a transaction if there are delalloc or unwritten buffers
@@ -1080,7 +1080,7 @@ xfs_vm_writepage(
1080 */ 1080 */
1081 xfs_count_page_state(page, &delalloc, &unwritten); 1081 xfs_count_page_state(page, &delalloc, &unwritten);
1082 if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) 1082 if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
1083 goto out_fail; 1083 goto redirty;
1084 1084
1085 /* Is this page beyond the end of the file? */ 1085 /* Is this page beyond the end of the file? */
1086 offset = i_size_read(inode); 1086 offset = i_size_read(inode);
@@ -1245,12 +1245,15 @@ error:
1245 if (iohead) 1245 if (iohead)
1246 xfs_cancel_ioend(iohead); 1246 xfs_cancel_ioend(iohead);
1247 1247
1248 if (err == -EAGAIN)
1249 goto redirty;
1250
1248 xfs_aops_discard_page(page); 1251 xfs_aops_discard_page(page);
1249 ClearPageUptodate(page); 1252 ClearPageUptodate(page);
1250 unlock_page(page); 1253 unlock_page(page);
1251 return err; 1254 return err;
1252 1255
1253out_fail: 1256redirty:
1254 redirty_page_for_writepage(wbc, page); 1257 redirty_page_for_writepage(wbc, page);
1255 unlock_page(page); 1258 unlock_page(page);
1256 return 0; 1259 return 0;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index b93ea3342281..1846a0dd7035 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -440,12 +440,7 @@ _xfs_buf_find(
440 ASSERT(btp == bp->b_target); 440 ASSERT(btp == bp->b_target);
441 if (bp->b_file_offset == range_base && 441 if (bp->b_file_offset == range_base &&
442 bp->b_buffer_length == range_length) { 442 bp->b_buffer_length == range_length) {
443 /*
444 * If we look at something, bring it to the
445 * front of the list for next time.
446 */
447 atomic_inc(&bp->b_hold); 443 atomic_inc(&bp->b_hold);
448 list_move(&bp->b_hash_list, &hash->bh_list);
449 goto found; 444 goto found;
450 } 445 }
451 } 446 }
@@ -1431,8 +1426,7 @@ xfs_alloc_bufhash(
1431{ 1426{
1432 unsigned int i; 1427 unsigned int i;
1433 1428
1434 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1429 btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */
1435 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1436 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * 1430 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
1437 sizeof(xfs_bufhash_t)); 1431 sizeof(xfs_bufhash_t));
1438 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1432 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
@@ -1926,7 +1920,8 @@ xfs_buf_init(void)
1926 if (!xfs_buf_zone) 1920 if (!xfs_buf_zone)
1927 goto out; 1921 goto out;
1928 1922
1929 xfslogd_workqueue = create_workqueue("xfslogd"); 1923 xfslogd_workqueue = alloc_workqueue("xfslogd",
1924 WQ_RESCUER | WQ_HIGHPRI, 1);
1930 if (!xfslogd_workqueue) 1925 if (!xfslogd_workqueue)
1931 goto out_free_buf_zone; 1926 goto out_free_buf_zone;
1932 1927
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index d533d64e2c3e..9d021c73ea52 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,7 +128,6 @@ typedef struct xfs_buftarg {
128 size_t bt_smask; 128 size_t bt_smask;
129 129
130 /* per device buffer hash table */ 130 /* per device buffer hash table */
131 uint bt_hashmask;
132 uint bt_hashshift; 131 uint bt_hashshift;
133 xfs_bufhash_t *bt_hash; 132 xfs_bufhash_t *bt_hash;
134 133
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 237f5ffb2ee8..3b9e626f7cd1 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -785,6 +785,8 @@ xfs_ioc_fsgetxattr(
785{ 785{
786 struct fsxattr fa; 786 struct fsxattr fa;
787 787
788 memset(&fa, 0, sizeof(struct fsxattr));
789
788 xfs_ilock(ip, XFS_ILOCK_SHARED); 790 xfs_ilock(ip, XFS_ILOCK_SHARED);
789 fa.fsx_xflags = xfs_ip2xflags(ip); 791 fa.fsx_xflags = xfs_ip2xflags(ip);
790 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; 792 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
@@ -907,6 +909,13 @@ xfs_ioctl_setattr(
907 return XFS_ERROR(EIO); 909 return XFS_ERROR(EIO);
908 910
909 /* 911 /*
912 * Disallow 32bit project ids because on-disk structure
913 * is 16bit only.
914 */
915 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1))
916 return XFS_ERROR(EINVAL);
917
918 /*
910 * If disk quotas is on, we make sure that the dquots do exist on disk, 919 * If disk quotas is on, we make sure that the dquots do exist on disk,
911 * before we start any other transactions. Trying to do this later 920 * before we start any other transactions. Trying to do this later
912 * is messy. We don't care to take a readlock to look at the ids 921 * is messy. We don't care to take a readlock to look at the ids
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 68be25dcd301..b1fc2a6bfe83 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -664,7 +664,7 @@ xfs_vn_fiemap(
664 fieinfo->fi_extents_max + 1; 664 fieinfo->fi_extents_max + 1;
665 bm.bmv_count = min_t(__s32, bm.bmv_count, 665 bm.bmv_count = min_t(__s32, bm.bmv_count,
666 (PAGE_SIZE * 16 / sizeof(struct getbmapx))); 666 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
667 bm.bmv_iflags = BMV_IF_PREALLOC; 667 bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
668 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) 668 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
669 bm.bmv_iflags |= BMV_IF_ATTRFORK; 669 bm.bmv_iflags |= BMV_IF_ATTRFORK;
670 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) 670 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 5fa7a30cc3f0..08fd3102128c 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1225,6 +1225,7 @@ xfs_fs_statfs(
1225 struct xfs_inode *ip = XFS_I(dentry->d_inode); 1225 struct xfs_inode *ip = XFS_I(dentry->d_inode);
1226 __uint64_t fakeinos, id; 1226 __uint64_t fakeinos, id;
1227 xfs_extlen_t lsize; 1227 xfs_extlen_t lsize;
1228 __int64_t ffree;
1228 1229
1229 statp->f_type = XFS_SB_MAGIC; 1230 statp->f_type = XFS_SB_MAGIC;
1230 statp->f_namelen = MAXNAMELEN - 1; 1231 statp->f_namelen = MAXNAMELEN - 1;
@@ -1248,7 +1249,11 @@ xfs_fs_statfs(
1248 statp->f_files = min_t(typeof(statp->f_files), 1249 statp->f_files = min_t(typeof(statp->f_files),
1249 statp->f_files, 1250 statp->f_files,
1250 mp->m_maxicount); 1251 mp->m_maxicount);
1251 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1252
1253 /* make sure statp->f_ffree does not underflow */
1254 ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
1255 statp->f_ffree = max_t(__int64_t, ffree, 0);
1256
1252 spin_unlock(&mp->m_sb_lock); 1257 spin_unlock(&mp->m_sb_lock);
1253 1258
1254 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || 1259 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
@@ -1401,7 +1406,7 @@ xfs_fs_freeze(
1401 1406
1402 xfs_save_resvblks(mp); 1407 xfs_save_resvblks(mp);
1403 xfs_quiesce_attr(mp); 1408 xfs_quiesce_attr(mp);
1404 return -xfs_fs_log_dummy(mp); 1409 return -xfs_fs_log_dummy(mp, SYNC_WAIT);
1405} 1410}
1406 1411
1407STATIC int 1412STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index dfcbd98d1599..81976ffed7d6 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -34,6 +34,7 @@
34#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
35#include "xfs_quota.h" 35#include "xfs_quota.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_fsops.h"
37 38
38#include <linux/kthread.h> 39#include <linux/kthread.h>
39#include <linux/freezer.h> 40#include <linux/freezer.h>
@@ -341,38 +342,6 @@ xfs_sync_attr(
341} 342}
342 343
343STATIC int 344STATIC int
344xfs_commit_dummy_trans(
345 struct xfs_mount *mp,
346 uint flags)
347{
348 struct xfs_inode *ip = mp->m_rootip;
349 struct xfs_trans *tp;
350 int error;
351
352 /*
353 * Put a dummy transaction in the log to tell recovery
354 * that all others are OK.
355 */
356 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
357 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
358 if (error) {
359 xfs_trans_cancel(tp, 0);
360 return error;
361 }
362
363 xfs_ilock(ip, XFS_ILOCK_EXCL);
364
365 xfs_trans_ijoin(tp, ip);
366 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
367 error = xfs_trans_commit(tp, 0);
368 xfs_iunlock(ip, XFS_ILOCK_EXCL);
369
370 /* the log force ensures this transaction is pushed to disk */
371 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
372 return error;
373}
374
375STATIC int
376xfs_sync_fsdata( 345xfs_sync_fsdata(
377 struct xfs_mount *mp) 346 struct xfs_mount *mp)
378{ 347{
@@ -432,7 +401,7 @@ xfs_quiesce_data(
432 401
433 /* mark the log as covered if needed */ 402 /* mark the log as covered if needed */
434 if (xfs_log_need_covered(mp)) 403 if (xfs_log_need_covered(mp))
435 error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT); 404 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
436 405
437 /* flush data-only devices */ 406 /* flush data-only devices */
438 if (mp->m_rtdev_targp) 407 if (mp->m_rtdev_targp)
@@ -563,7 +532,7 @@ xfs_flush_inodes(
563/* 532/*
564 * Every sync period we need to unpin all items, reclaim inodes and sync 533 * Every sync period we need to unpin all items, reclaim inodes and sync
565 * disk quotas. We might need to cover the log to indicate that the 534 * disk quotas. We might need to cover the log to indicate that the
566 * filesystem is idle. 535 * filesystem is idle and not frozen.
567 */ 536 */
568STATIC void 537STATIC void
569xfs_sync_worker( 538xfs_sync_worker(
@@ -577,8 +546,9 @@ xfs_sync_worker(
577 xfs_reclaim_inodes(mp, 0); 546 xfs_reclaim_inodes(mp, 0);
578 /* dgc: errors ignored here */ 547 /* dgc: errors ignored here */
579 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 548 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
580 if (xfs_log_need_covered(mp)) 549 if (mp->m_super->s_frozen == SB_UNFROZEN &&
581 error = xfs_commit_dummy_trans(mp, 0); 550 xfs_log_need_covered(mp))
551 error = xfs_fs_log_dummy(mp, 0);
582 } 552 }
583 mp->m_sync_seq++; 553 mp->m_sync_seq++;
584 wake_up(&mp->m_wait_single_sync_task); 554 wake_up(&mp->m_wait_single_sync_task);
@@ -698,14 +668,11 @@ xfs_inode_set_reclaim_tag(
698 xfs_perag_put(pag); 668 xfs_perag_put(pag);
699} 669}
700 670
701void 671STATIC void
702__xfs_inode_clear_reclaim_tag( 672__xfs_inode_clear_reclaim(
703 xfs_mount_t *mp,
704 xfs_perag_t *pag, 673 xfs_perag_t *pag,
705 xfs_inode_t *ip) 674 xfs_inode_t *ip)
706{ 675{
707 radix_tree_tag_clear(&pag->pag_ici_root,
708 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
709 pag->pag_ici_reclaimable--; 676 pag->pag_ici_reclaimable--;
710 if (!pag->pag_ici_reclaimable) { 677 if (!pag->pag_ici_reclaimable) {
711 /* clear the reclaim tag from the perag radix tree */ 678 /* clear the reclaim tag from the perag radix tree */
@@ -719,6 +686,17 @@ __xfs_inode_clear_reclaim_tag(
719 } 686 }
720} 687}
721 688
689void
690__xfs_inode_clear_reclaim_tag(
691 xfs_mount_t *mp,
692 xfs_perag_t *pag,
693 xfs_inode_t *ip)
694{
695 radix_tree_tag_clear(&pag->pag_ici_root,
696 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
697 __xfs_inode_clear_reclaim(pag, ip);
698}
699
722/* 700/*
723 * Inodes in different states need to be treated differently, and the return 701 * Inodes in different states need to be treated differently, and the return
724 * value of xfs_iflush is not sufficient to get this right. The following table 702 * value of xfs_iflush is not sufficient to get this right. The following table
@@ -868,6 +846,7 @@ reclaim:
868 if (!radix_tree_delete(&pag->pag_ici_root, 846 if (!radix_tree_delete(&pag->pag_ici_root,
869 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 847 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
870 ASSERT(0); 848 ASSERT(0);
849 __xfs_inode_clear_reclaim(pag, ip);
871 write_unlock(&pag->pag_ici_lock); 850 write_unlock(&pag->pag_ici_lock);
872 851
873 /* 852 /*
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 23f14e595c18..f90dadd5a968 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5533,12 +5533,24 @@ xfs_getbmap(
5533 map[i].br_startblock)) 5533 map[i].br_startblock))
5534 goto out_free_map; 5534 goto out_free_map;
5535 5535
5536 nexleft--;
5537 bmv->bmv_offset = 5536 bmv->bmv_offset =
5538 out[cur_ext].bmv_offset + 5537 out[cur_ext].bmv_offset +
5539 out[cur_ext].bmv_length; 5538 out[cur_ext].bmv_length;
5540 bmv->bmv_length = 5539 bmv->bmv_length =
5541 max_t(__int64_t, 0, bmvend - bmv->bmv_offset); 5540 max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
5541
5542 /*
5543 * In case we don't want to return the hole,
5544 * don't increase cur_ext so that we can reuse
5545 * it in the next loop.
5546 */
5547 if ((iflags & BMV_IF_NO_HOLES) &&
5548 map[i].br_startblock == HOLESTARTBLOCK) {
5549 memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
5550 continue;
5551 }
5552
5553 nexleft--;
5542 bmv->bmv_entries++; 5554 bmv->bmv_entries++;
5543 cur_ext++; 5555 cur_ext++;
5544 } 5556 }
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 7cf7220e7d5f..87c2e9d02288 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -114,8 +114,10 @@ struct getbmapx {
114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ 114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ 115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
116#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ 116#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
117#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */
117#define BMV_IF_VALID \ 118#define BMV_IF_VALID \
118 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) 119 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \
120 BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
119 121
120/* bmv_oflags values - returned for each non-header segment */ 122/* bmv_oflags values - returned for each non-header segment */
121#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ 123#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index dbca5f5c37ba..43b1d5699335 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -604,31 +604,36 @@ out:
604 return 0; 604 return 0;
605} 605}
606 606
607/*
608 * Dump a transaction into the log that contains no real change. This is needed
609 * to be able to make the log dirty or stamp the current tail LSN into the log
610 * during the covering operation.
611 *
612 * We cannot use an inode here for this - that will push dirty state back up
613 * into the VFS and then periodic inode flushing will prevent log covering from
614 * making progress. Hence we log a field in the superblock instead.
615 */
607int 616int
608xfs_fs_log_dummy( 617xfs_fs_log_dummy(
609 xfs_mount_t *mp) 618 xfs_mount_t *mp,
619 int flags)
610{ 620{
611 xfs_trans_t *tp; 621 xfs_trans_t *tp;
612 xfs_inode_t *ip;
613 int error; 622 int error;
614 623
615 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); 624 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
616 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); 625 error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
626 XFS_DEFAULT_LOG_COUNT);
617 if (error) { 627 if (error) {
618 xfs_trans_cancel(tp, 0); 628 xfs_trans_cancel(tp, 0);
619 return error; 629 return error;
620 } 630 }
621 631
622 ip = mp->m_rootip; 632 /* log the UUID because it is an unchanging field */
623 xfs_ilock(ip, XFS_ILOCK_EXCL); 633 xfs_mod_sb(tp, XFS_SB_UUID);
624 634 if (flags & SYNC_WAIT)
625 xfs_trans_ijoin(tp, ip); 635 xfs_trans_set_sync(tp);
626 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 636 return xfs_trans_commit(tp, 0);
627 xfs_trans_set_sync(tp);
628 error = xfs_trans_commit(tp, 0);
629
630 xfs_iunlock(ip, XFS_ILOCK_EXCL);
631 return error;
632} 637}
633 638
634int 639int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 88435e0a77c9..a786c5212c1e 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(xfs_mount_t *mp); 28extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index abf80ae1e95b..5371d2dc360e 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1213,7 +1213,6 @@ xfs_imap_lookup(
1213 struct xfs_inobt_rec_incore rec; 1213 struct xfs_inobt_rec_incore rec;
1214 struct xfs_btree_cur *cur; 1214 struct xfs_btree_cur *cur;
1215 struct xfs_buf *agbp; 1215 struct xfs_buf *agbp;
1216 xfs_agino_t startino;
1217 int error; 1216 int error;
1218 int i; 1217 int i;
1219 1218
@@ -1227,13 +1226,13 @@ xfs_imap_lookup(
1227 } 1226 }
1228 1227
1229 /* 1228 /*
1230 * derive and lookup the exact inode record for the given agino. If the 1229 * Lookup the inode record for the given agino. If the record cannot be
1231 * record cannot be found, then it's an invalid inode number and we 1230 * found, then it's an invalid inode number and we should abort. Once
1232 * should abort. 1231 * we have a record, we need to ensure it contains the inode number
1232 * we are looking up.
1233 */ 1233 */
1234 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1234 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1235 startino = agino & ~(XFS_IALLOC_INODES(mp) - 1); 1235 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
1236 error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i);
1237 if (!error) { 1236 if (!error) {
1238 if (i) 1237 if (i)
1239 error = xfs_inobt_get_rec(cur, &rec, &i); 1238 error = xfs_inobt_get_rec(cur, &rec, &i);
@@ -1246,6 +1245,11 @@ xfs_imap_lookup(
1246 if (error) 1245 if (error)
1247 return error; 1246 return error;
1248 1247
1248 /* check that the returned record contains the required inode */
1249 if (rec.ir_startino > agino ||
1250 rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
1251 return EINVAL;
1252
1249 /* for untrusted inodes check it is allocated first */ 1253 /* for untrusted inodes check it is allocated first */
1250 if ((flags & XFS_IGET_UNTRUSTED) && 1254 if ((flags & XFS_IGET_UNTRUSTED) &&
1251 (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) 1255 (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 68415cb4f23c..34798f391c49 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1914,6 +1914,11 @@ xfs_iunlink_remove(
1914 return 0; 1914 return 0;
1915} 1915}
1916 1916
1917/*
1918 * A big issue when freeing the inode cluster is is that we _cannot_ skip any
1919 * inodes that are in memory - they all must be marked stale and attached to
1920 * the cluster buffer.
1921 */
1917STATIC void 1922STATIC void
1918xfs_ifree_cluster( 1923xfs_ifree_cluster(
1919 xfs_inode_t *free_ip, 1924 xfs_inode_t *free_ip,
@@ -1945,8 +1950,6 @@ xfs_ifree_cluster(
1945 } 1950 }
1946 1951
1947 for (j = 0; j < nbufs; j++, inum += ninodes) { 1952 for (j = 0; j < nbufs; j++, inum += ninodes) {
1948 int found = 0;
1949
1950 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 1953 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1951 XFS_INO_TO_AGBNO(mp, inum)); 1954 XFS_INO_TO_AGBNO(mp, inum));
1952 1955
@@ -1965,7 +1968,9 @@ xfs_ifree_cluster(
1965 /* 1968 /*
1966 * Walk the inodes already attached to the buffer and mark them 1969 * Walk the inodes already attached to the buffer and mark them
1967 * stale. These will all have the flush locks held, so an 1970 * stale. These will all have the flush locks held, so an
1968 * in-memory inode walk can't lock them. 1971 * in-memory inode walk can't lock them. By marking them all
1972 * stale first, we will not attempt to lock them in the loop
1973 * below as the XFS_ISTALE flag will be set.
1969 */ 1974 */
1970 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 1975 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1971 while (lip) { 1976 while (lip) {
@@ -1977,11 +1982,11 @@ xfs_ifree_cluster(
1977 &iip->ili_flush_lsn, 1982 &iip->ili_flush_lsn,
1978 &iip->ili_item.li_lsn); 1983 &iip->ili_item.li_lsn);
1979 xfs_iflags_set(iip->ili_inode, XFS_ISTALE); 1984 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1980 found++;
1981 } 1985 }
1982 lip = lip->li_bio_list; 1986 lip = lip->li_bio_list;
1983 } 1987 }
1984 1988
1989
1985 /* 1990 /*
1986 * For each inode in memory attempt to add it to the inode 1991 * For each inode in memory attempt to add it to the inode
1987 * buffer and set it up for being staled on buffer IO 1992 * buffer and set it up for being staled on buffer IO
@@ -1993,6 +1998,7 @@ xfs_ifree_cluster(
1993 * even trying to lock them. 1998 * even trying to lock them.
1994 */ 1999 */
1995 for (i = 0; i < ninodes; i++) { 2000 for (i = 0; i < ninodes; i++) {
2001retry:
1996 read_lock(&pag->pag_ici_lock); 2002 read_lock(&pag->pag_ici_lock);
1997 ip = radix_tree_lookup(&pag->pag_ici_root, 2003 ip = radix_tree_lookup(&pag->pag_ici_root,
1998 XFS_INO_TO_AGINO(mp, (inum + i))); 2004 XFS_INO_TO_AGINO(mp, (inum + i)));
@@ -2003,38 +2009,36 @@ xfs_ifree_cluster(
2003 continue; 2009 continue;
2004 } 2010 }
2005 2011
2006 /* don't try to lock/unlock the current inode */ 2012 /*
2013 * Don't try to lock/unlock the current inode, but we
2014 * _cannot_ skip the other inodes that we did not find
2015 * in the list attached to the buffer and are not
2016 * already marked stale. If we can't lock it, back off
2017 * and retry.
2018 */
2007 if (ip != free_ip && 2019 if (ip != free_ip &&
2008 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2020 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2009 read_unlock(&pag->pag_ici_lock); 2021 read_unlock(&pag->pag_ici_lock);
2010 continue; 2022 delay(1);
2023 goto retry;
2011 } 2024 }
2012 read_unlock(&pag->pag_ici_lock); 2025 read_unlock(&pag->pag_ici_lock);
2013 2026
2014 if (!xfs_iflock_nowait(ip)) { 2027 xfs_iflock(ip);
2015 if (ip != free_ip)
2016 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2017 continue;
2018 }
2019
2020 xfs_iflags_set(ip, XFS_ISTALE); 2028 xfs_iflags_set(ip, XFS_ISTALE);
2021 if (xfs_inode_clean(ip)) {
2022 ASSERT(ip != free_ip);
2023 xfs_ifunlock(ip);
2024 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2025 continue;
2026 }
2027 2029
2030 /*
2031 * we don't need to attach clean inodes or those only
2032 * with unlogged changes (which we throw away, anyway).
2033 */
2028 iip = ip->i_itemp; 2034 iip = ip->i_itemp;
2029 if (!iip) { 2035 if (!iip || xfs_inode_clean(ip)) {
2030 /* inode with unlogged changes only */
2031 ASSERT(ip != free_ip); 2036 ASSERT(ip != free_ip);
2032 ip->i_update_core = 0; 2037 ip->i_update_core = 0;
2033 xfs_ifunlock(ip); 2038 xfs_ifunlock(ip);
2034 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2039 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2035 continue; 2040 continue;
2036 } 2041 }
2037 found++;
2038 2042
2039 iip->ili_last_fields = iip->ili_format.ilf_fields; 2043 iip->ili_last_fields = iip->ili_format.ilf_fields;
2040 iip->ili_format.ilf_fields = 0; 2044 iip->ili_format.ilf_fields = 0;
@@ -2049,8 +2053,7 @@ xfs_ifree_cluster(
2049 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2053 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2050 } 2054 }
2051 2055
2052 if (found) 2056 xfs_trans_stale_inode_buf(tp, bp);
2053 xfs_trans_stale_inode_buf(tp, bp);
2054 xfs_trans_binval(tp, bp); 2057 xfs_trans_binval(tp, bp);
2055 } 2058 }
2056 2059
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 430a8fc02c1f..ba8e36e0b4e7 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3002,7 +3002,8 @@ _xfs_log_force(
3002 3002
3003 XFS_STATS_INC(xs_log_force); 3003 XFS_STATS_INC(xs_log_force);
3004 3004
3005 xlog_cil_push(log, 1); 3005 if (log->l_cilp)
3006 xlog_cil_force(log);
3006 3007
3007 spin_lock(&log->l_icloglock); 3008 spin_lock(&log->l_icloglock);
3008 3009
@@ -3154,7 +3155,7 @@ _xfs_log_force_lsn(
3154 XFS_STATS_INC(xs_log_force); 3155 XFS_STATS_INC(xs_log_force);
3155 3156
3156 if (log->l_cilp) { 3157 if (log->l_cilp) {
3157 lsn = xlog_cil_push_lsn(log, lsn); 3158 lsn = xlog_cil_force_lsn(log, lsn);
3158 if (lsn == NULLCOMMITLSN) 3159 if (lsn == NULLCOMMITLSN)
3159 return 0; 3160 return 0;
3160 } 3161 }
@@ -3711,7 +3712,7 @@ xfs_log_force_umount(
3711 * call below. 3712 * call below.
3712 */ 3713 */
3713 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) 3714 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
3714 xlog_cil_push(log, 1); 3715 xlog_cil_force(log);
3715 3716
3716 /* 3717 /*
3717 * We must hold both the GRANT lock and the LOG lock, 3718 * We must hold both the GRANT lock and the LOG lock,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 31e4ea2d19ac..7e206fc1fa36 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -68,6 +68,7 @@ xlog_cil_init(
68 ctx->sequence = 1; 68 ctx->sequence = 1;
69 ctx->cil = cil; 69 ctx->cil = cil;
70 cil->xc_ctx = ctx; 70 cil->xc_ctx = ctx;
71 cil->xc_current_sequence = ctx->sequence;
71 72
72 cil->xc_log = log; 73 cil->xc_log = log;
73 log->l_cilp = cil; 74 log->l_cilp = cil;
@@ -269,15 +270,10 @@ xlog_cil_insert(
269static void 270static void
270xlog_cil_format_items( 271xlog_cil_format_items(
271 struct log *log, 272 struct log *log,
272 struct xfs_log_vec *log_vector, 273 struct xfs_log_vec *log_vector)
273 struct xlog_ticket *ticket,
274 xfs_lsn_t *start_lsn)
275{ 274{
276 struct xfs_log_vec *lv; 275 struct xfs_log_vec *lv;
277 276
278 if (start_lsn)
279 *start_lsn = log->l_cilp->xc_ctx->sequence;
280
281 ASSERT(log_vector); 277 ASSERT(log_vector);
282 for (lv = log_vector; lv; lv = lv->lv_next) { 278 for (lv = log_vector; lv; lv = lv->lv_next) {
283 void *ptr; 279 void *ptr;
@@ -301,9 +297,24 @@ xlog_cil_format_items(
301 ptr += vec->i_len; 297 ptr += vec->i_len;
302 } 298 }
303 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); 299 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
300 }
301}
304 302
303static void
304xlog_cil_insert_items(
305 struct log *log,
306 struct xfs_log_vec *log_vector,
307 struct xlog_ticket *ticket,
308 xfs_lsn_t *start_lsn)
309{
310 struct xfs_log_vec *lv;
311
312 if (start_lsn)
313 *start_lsn = log->l_cilp->xc_ctx->sequence;
314
315 ASSERT(log_vector);
316 for (lv = log_vector; lv; lv = lv->lv_next)
305 xlog_cil_insert(log, ticket, lv->lv_item, lv); 317 xlog_cil_insert(log, ticket, lv->lv_item, lv);
306 }
307} 318}
308 319
309static void 320static void
@@ -321,80 +332,6 @@ xlog_cil_free_logvec(
321} 332}
322 333
323/* 334/*
324 * Commit a transaction with the given vector to the Committed Item List.
325 *
326 * To do this, we need to format the item, pin it in memory if required and
327 * account for the space used by the transaction. Once we have done that we
328 * need to release the unused reservation for the transaction, attach the
329 * transaction to the checkpoint context so we carry the busy extents through
330 * to checkpoint completion, and then unlock all the items in the transaction.
331 *
332 * For more specific information about the order of operations in
333 * xfs_log_commit_cil() please refer to the comments in
334 * xfs_trans_commit_iclog().
335 *
336 * Called with the context lock already held in read mode to lock out
337 * background commit, returns without it held once background commits are
338 * allowed again.
339 */
340int
341xfs_log_commit_cil(
342 struct xfs_mount *mp,
343 struct xfs_trans *tp,
344 struct xfs_log_vec *log_vector,
345 xfs_lsn_t *commit_lsn,
346 int flags)
347{
348 struct log *log = mp->m_log;
349 int log_flags = 0;
350 int push = 0;
351
352 if (flags & XFS_TRANS_RELEASE_LOG_RES)
353 log_flags = XFS_LOG_REL_PERM_RESERV;
354
355 if (XLOG_FORCED_SHUTDOWN(log)) {
356 xlog_cil_free_logvec(log_vector);
357 return XFS_ERROR(EIO);
358 }
359
360 /* lock out background commit */
361 down_read(&log->l_cilp->xc_ctx_lock);
362 xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
363
364 /* check we didn't blow the reservation */
365 if (tp->t_ticket->t_curr_res < 0)
366 xlog_print_tic_res(log->l_mp, tp->t_ticket);
367
368 /* attach the transaction to the CIL if it has any busy extents */
369 if (!list_empty(&tp->t_busy)) {
370 spin_lock(&log->l_cilp->xc_cil_lock);
371 list_splice_init(&tp->t_busy,
372 &log->l_cilp->xc_ctx->busy_extents);
373 spin_unlock(&log->l_cilp->xc_cil_lock);
374 }
375
376 tp->t_commit_lsn = *commit_lsn;
377 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
378 xfs_trans_unreserve_and_mod_sb(tp);
379
380 /* check for background commit before unlock */
381 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
382 push = 1;
383 up_read(&log->l_cilp->xc_ctx_lock);
384
385 /*
386 * We need to push CIL every so often so we don't cache more than we
387 * can fit in the log. The limit really is that a checkpoint can't be
388 * more than half the log (the current checkpoint is not allowed to
389 * overwrite the previous checkpoint), but commit latency and memory
390 * usage limit this to a smaller size in most cases.
391 */
392 if (push)
393 xlog_cil_push(log, 0);
394 return 0;
395}
396
397/*
398 * Mark all items committed and clear busy extents. We free the log vector 335 * Mark all items committed and clear busy extents. We free the log vector
399 * chains in a separate pass so that we unpin the log items as quickly as 336 * chains in a separate pass so that we unpin the log items as quickly as
400 * possible. 337 * possible.
@@ -427,13 +364,23 @@ xlog_cil_committed(
427} 364}
428 365
429/* 366/*
430 * Push the Committed Item List to the log. If the push_now flag is not set, 367 * Push the Committed Item List to the log. If @push_seq flag is zero, then it
431 * then it is a background flush and so we can chose to ignore it. 368 * is a background flush and so we can chose to ignore it. Otherwise, if the
369 * current sequence is the same as @push_seq we need to do a flush. If
370 * @push_seq is less than the current sequence, then it has already been
371 * flushed and we don't need to do anything - the caller will wait for it to
372 * complete if necessary.
373 *
374 * @push_seq is a value rather than a flag because that allows us to do an
375 * unlocked check of the sequence number for a match. Hence we can allows log
376 * forces to run racily and not issue pushes for the same sequence twice. If we
377 * get a race between multiple pushes for the same sequence they will block on
378 * the first one and then abort, hence avoiding needless pushes.
432 */ 379 */
433int 380STATIC int
434xlog_cil_push( 381xlog_cil_push(
435 struct log *log, 382 struct log *log,
436 int push_now) 383 xfs_lsn_t push_seq)
437{ 384{
438 struct xfs_cil *cil = log->l_cilp; 385 struct xfs_cil *cil = log->l_cilp;
439 struct xfs_log_vec *lv; 386 struct xfs_log_vec *lv;
@@ -453,12 +400,20 @@ xlog_cil_push(
453 if (!cil) 400 if (!cil)
454 return 0; 401 return 0;
455 402
403 ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
404
456 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 405 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
457 new_ctx->ticket = xlog_cil_ticket_alloc(log); 406 new_ctx->ticket = xlog_cil_ticket_alloc(log);
458 407
459 /* lock out transaction commit, but don't block on background push */ 408 /*
409 * Lock out transaction commit, but don't block for background pushes
410 * unless we are well over the CIL space limit. See the definition of
411 * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
412 * used here.
413 */
460 if (!down_write_trylock(&cil->xc_ctx_lock)) { 414 if (!down_write_trylock(&cil->xc_ctx_lock)) {
461 if (!push_now) 415 if (!push_seq &&
416 cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
462 goto out_free_ticket; 417 goto out_free_ticket;
463 down_write(&cil->xc_ctx_lock); 418 down_write(&cil->xc_ctx_lock);
464 } 419 }
@@ -469,7 +424,11 @@ xlog_cil_push(
469 goto out_skip; 424 goto out_skip;
470 425
471 /* check for spurious background flush */ 426 /* check for spurious background flush */
472 if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) 427 if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
428 goto out_skip;
429
430 /* check for a previously pushed seqeunce */
431 if (push_seq && push_seq < cil->xc_ctx->sequence)
473 goto out_skip; 432 goto out_skip;
474 433
475 /* 434 /*
@@ -515,6 +474,13 @@ xlog_cil_push(
515 cil->xc_ctx = new_ctx; 474 cil->xc_ctx = new_ctx;
516 475
517 /* 476 /*
477 * mirror the new sequence into the cil structure so that we can do
478 * unlocked checks against the current sequence in log forces without
479 * risking deferencing a freed context pointer.
480 */
481 cil->xc_current_sequence = new_ctx->sequence;
482
483 /*
518 * The switch is now done, so we can drop the context lock and move out 484 * The switch is now done, so we can drop the context lock and move out
519 * of a shared context. We can't just go straight to the commit record, 485 * of a shared context. We can't just go straight to the commit record,
520 * though - we need to synchronise with previous and future commits so 486 * though - we need to synchronise with previous and future commits so
@@ -626,6 +592,102 @@ out_abort:
626} 592}
627 593
628/* 594/*
595 * Commit a transaction with the given vector to the Committed Item List.
596 *
597 * To do this, we need to format the item, pin it in memory if required and
598 * account for the space used by the transaction. Once we have done that we
599 * need to release the unused reservation for the transaction, attach the
600 * transaction to the checkpoint context so we carry the busy extents through
601 * to checkpoint completion, and then unlock all the items in the transaction.
602 *
603 * For more specific information about the order of operations in
604 * xfs_log_commit_cil() please refer to the comments in
605 * xfs_trans_commit_iclog().
606 *
607 * Called with the context lock already held in read mode to lock out
608 * background commit, returns without it held once background commits are
609 * allowed again.
610 */
611int
612xfs_log_commit_cil(
613 struct xfs_mount *mp,
614 struct xfs_trans *tp,
615 struct xfs_log_vec *log_vector,
616 xfs_lsn_t *commit_lsn,
617 int flags)
618{
619 struct log *log = mp->m_log;
620 int log_flags = 0;
621 int push = 0;
622
623 if (flags & XFS_TRANS_RELEASE_LOG_RES)
624 log_flags = XFS_LOG_REL_PERM_RESERV;
625
626 if (XLOG_FORCED_SHUTDOWN(log)) {
627 xlog_cil_free_logvec(log_vector);
628 return XFS_ERROR(EIO);
629 }
630
631 /*
632 * do all the hard work of formatting items (including memory
633 * allocation) outside the CIL context lock. This prevents stalling CIL
634 * pushes when we are low on memory and a transaction commit spends a
635 * lot of time in memory reclaim.
636 */
637 xlog_cil_format_items(log, log_vector);
638
639 /* lock out background commit */
640 down_read(&log->l_cilp->xc_ctx_lock);
641 xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
642
643 /* check we didn't blow the reservation */
644 if (tp->t_ticket->t_curr_res < 0)
645 xlog_print_tic_res(log->l_mp, tp->t_ticket);
646
647 /* attach the transaction to the CIL if it has any busy extents */
648 if (!list_empty(&tp->t_busy)) {
649 spin_lock(&log->l_cilp->xc_cil_lock);
650 list_splice_init(&tp->t_busy,
651 &log->l_cilp->xc_ctx->busy_extents);
652 spin_unlock(&log->l_cilp->xc_cil_lock);
653 }
654
655 tp->t_commit_lsn = *commit_lsn;
656 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
657 xfs_trans_unreserve_and_mod_sb(tp);
658
659 /*
660 * Once all the items of the transaction have been copied to the CIL,
661 * the items can be unlocked and freed.
662 *
663 * This needs to be done before we drop the CIL context lock because we
664 * have to update state in the log items and unlock them before they go
665 * to disk. If we don't, then the CIL checkpoint can race with us and
666 * we can run checkpoint completion before we've updated and unlocked
667 * the log items. This affects (at least) processing of stale buffers,
668 * inodes and EFIs.
669 */
670 xfs_trans_free_items(tp, *commit_lsn, 0);
671
672 /* check for background commit before unlock */
673 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
674 push = 1;
675
676 up_read(&log->l_cilp->xc_ctx_lock);
677
678 /*
679 * We need to push CIL every so often so we don't cache more than we
680 * can fit in the log. The limit really is that a checkpoint can't be
681 * more than half the log (the current checkpoint is not allowed to
682 * overwrite the previous checkpoint), but commit latency and memory
683 * usage limit this to a smaller size in most cases.
684 */
685 if (push)
686 xlog_cil_push(log, 0);
687 return 0;
688}
689
690/*
629 * Conditionally push the CIL based on the sequence passed in. 691 * Conditionally push the CIL based on the sequence passed in.
630 * 692 *
631 * We only need to push if we haven't already pushed the sequence 693 * We only need to push if we haven't already pushed the sequence
@@ -639,39 +701,34 @@ out_abort:
639 * commit lsn is there. It'll be empty, so this is broken for now. 701 * commit lsn is there. It'll be empty, so this is broken for now.
640 */ 702 */
641xfs_lsn_t 703xfs_lsn_t
642xlog_cil_push_lsn( 704xlog_cil_force_lsn(
643 struct log *log, 705 struct log *log,
644 xfs_lsn_t push_seq) 706 xfs_lsn_t sequence)
645{ 707{
646 struct xfs_cil *cil = log->l_cilp; 708 struct xfs_cil *cil = log->l_cilp;
647 struct xfs_cil_ctx *ctx; 709 struct xfs_cil_ctx *ctx;
648 xfs_lsn_t commit_lsn = NULLCOMMITLSN; 710 xfs_lsn_t commit_lsn = NULLCOMMITLSN;
649 711
650restart: 712 ASSERT(sequence <= cil->xc_current_sequence);
651 down_write(&cil->xc_ctx_lock); 713
652 ASSERT(push_seq <= cil->xc_ctx->sequence); 714 /*
653 715 * check to see if we need to force out the current context.
654 /* check to see if we need to force out the current context */ 716 * xlog_cil_push() handles racing pushes for the same sequence,
655 if (push_seq == cil->xc_ctx->sequence) { 717 * so no need to deal with it here.
656 up_write(&cil->xc_ctx_lock); 718 */
657 xlog_cil_push(log, 1); 719 if (sequence == cil->xc_current_sequence)
658 goto restart; 720 xlog_cil_push(log, sequence);
659 }
660 721
661 /* 722 /*
662 * See if we can find a previous sequence still committing. 723 * See if we can find a previous sequence still committing.
663 * We can drop the flush lock as soon as we have the cil lock
664 * because we are now only comparing contexts protected by
665 * the cil lock.
666 *
667 * We need to wait for all previous sequence commits to complete 724 * We need to wait for all previous sequence commits to complete
668 * before allowing the force of push_seq to go ahead. Hence block 725 * before allowing the force of push_seq to go ahead. Hence block
669 * on commits for those as well. 726 * on commits for those as well.
670 */ 727 */
728restart:
671 spin_lock(&cil->xc_cil_lock); 729 spin_lock(&cil->xc_cil_lock);
672 up_write(&cil->xc_ctx_lock);
673 list_for_each_entry(ctx, &cil->xc_committing, committing) { 730 list_for_each_entry(ctx, &cil->xc_committing, committing) {
674 if (ctx->sequence > push_seq) 731 if (ctx->sequence > sequence)
675 continue; 732 continue;
676 if (!ctx->commit_lsn) { 733 if (!ctx->commit_lsn) {
677 /* 734 /*
@@ -681,7 +738,7 @@ restart:
681 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 738 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
682 goto restart; 739 goto restart;
683 } 740 }
684 if (ctx->sequence != push_seq) 741 if (ctx->sequence != sequence)
685 continue; 742 continue;
686 /* found it! */ 743 /* found it! */
687 commit_lsn = ctx->commit_lsn; 744 commit_lsn = ctx->commit_lsn;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8c072618965c..edcdfe01617f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -422,16 +422,17 @@ struct xfs_cil {
422 struct rw_semaphore xc_ctx_lock; 422 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing; 423 struct list_head xc_committing;
424 sv_t xc_commit_wait; 424 sv_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence;
425}; 426};
426 427
427/* 428/*
428 * The amount of log space we should the CIL to aggregate is difficult to size. 429 * The amount of log space we allow the CIL to aggregate is difficult to size.
429 * Whatever we chose we have to make we can get a reservation for the log space 430 * Whatever we choose, we have to make sure we can get a reservation for the
430 * effectively, that it is large enough to capture sufficient relogging to 431 * log space effectively, that it is large enough to capture sufficient
431 * reduce log buffer IO significantly, but it is not too large for the log or 432 * relogging to reduce log buffer IO significantly, but it is not too large for
432 * induces too much latency when writing out through the iclogs. We track both 433 * the log or induces too much latency when writing out through the iclogs. We
433 * space consumed and the number of vectors in the checkpoint context, so we 434 * track both space consumed and the number of vectors in the checkpoint
434 * need to decide which to use for limiting. 435 * context, so we need to decide which to use for limiting.
435 * 436 *
436 * Every log buffer we write out during a push needs a header reserved, which 437 * Every log buffer we write out during a push needs a header reserved, which
437 * is at least one sector and more for v2 logs. Hence we need a reservation of 438 * is at least one sector and more for v2 logs. Hence we need a reservation of
@@ -458,16 +459,21 @@ struct xfs_cil {
458 * checkpoint transaction ticket is specific to the checkpoint context, rather 459 * checkpoint transaction ticket is specific to the checkpoint context, rather
459 * than the CIL itself. 460 * than the CIL itself.
460 * 461 *
461 * With dynamic reservations, we can basically make up arbitrary limits for the 462 * With dynamic reservations, we can effectively make up arbitrary limits for
462 * checkpoint size so long as they don't violate any other size rules. Hence 463 * the checkpoint size so long as they don't violate any other size rules.
463 * the initial maximum size for the checkpoint transaction will be set to a 464 * Recovery imposes a rule that no transaction exceed half the log, so we are
464 * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit 465 * limited by that. Furthermore, the log transaction reservation subsystem
465 * right now based on the latency of writing out a large amount of data through 466 * tries to keep 25% of the log free, so we need to keep below that limit or we
466 * the circular iclog buffers. 467 * risk running out of free log space to start any new transactions.
468 *
469 * In order to keep background CIL push efficient, we will set a lower
470 * threshold at which background pushing is attempted without blocking current
471 * transaction commits. A separate, higher bound defines when CIL pushes are
472 * enforced to ensure we stay within our maximum checkpoint size bounds.
473 * threshold, yet give us plenty of space for aggregation on large logs.
467 */ 474 */
468 475#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
469#define XLOG_CIL_SPACE_LIMIT(log) \ 476#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
470 (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
471 477
472/* 478/*
473 * The reservation head lsn is not made up of a cycle number and block number. 479 * The reservation head lsn is not made up of a cycle number and block number.
@@ -562,8 +568,16 @@ int xlog_cil_init(struct log *log);
562void xlog_cil_init_post_recovery(struct log *log); 568void xlog_cil_init_post_recovery(struct log *log);
563void xlog_cil_destroy(struct log *log); 569void xlog_cil_destroy(struct log *log);
564 570
565int xlog_cil_push(struct log *log, int push_now); 571/*
566xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); 572 * CIL force routines
573 */
574xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
575
576static inline void
577xlog_cil_force(struct log *log)
578{
579 xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
580}
567 581
568/* 582/*
569 * Unmount record type is used as a pseudo transaction type for the ticket. 583 * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fdca7416c754..1c47edaea0d2 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1167,7 +1167,7 @@ xfs_trans_del_item(
1167 * Unlock all of the items of a transaction and free all the descriptors 1167 * Unlock all of the items of a transaction and free all the descriptors
1168 * of that transaction. 1168 * of that transaction.
1169 */ 1169 */
1170STATIC void 1170void
1171xfs_trans_free_items( 1171xfs_trans_free_items(
1172 struct xfs_trans *tp, 1172 struct xfs_trans *tp,
1173 xfs_lsn_t commit_lsn, 1173 xfs_lsn_t commit_lsn,
@@ -1653,9 +1653,6 @@ xfs_trans_commit_cil(
1653 return error; 1653 return error;
1654 1654
1655 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1655 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1656
1657 /* xfs_trans_free_items() unlocks them first */
1658 xfs_trans_free_items(tp, *commit_lsn, 0);
1659 xfs_trans_free(tp); 1656 xfs_trans_free(tp);
1660 return 0; 1657 return 0;
1661} 1658}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index e2d93d8ead7b..62da86c90de5 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -25,7 +25,8 @@ struct xfs_trans;
25 25
26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27void xfs_trans_del_item(struct xfs_log_item *); 27void xfs_trans_del_item(struct xfs_log_item *);
28 28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29 int flags);
29void xfs_trans_item_committed(struct xfs_log_item *lip, 30void xfs_trans_item_committed(struct xfs_log_item *lip,
30 xfs_lsn_t commit_lsn, int aborted); 31 xfs_lsn_t commit_lsn, int aborted);
31void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 66d585c6917c..4c7c7bfb2b2f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2299,15 +2299,22 @@ xfs_alloc_file_space(
2299 e = allocatesize_fsb; 2299 e = allocatesize_fsb;
2300 } 2300 }
2301 2301
2302 /*
2303 * The transaction reservation is limited to a 32-bit block
2304 * count, hence we need to limit the number of blocks we are
2305 * trying to reserve to avoid an overflow. We can't allocate
2306 * more than @nimaps extents, and an extent is limited on disk
2307 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
2308 */
2309 resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
2302 if (unlikely(rt)) { 2310 if (unlikely(rt)) {
2303 resrtextents = qblocks = (uint)(e - s); 2311 resrtextents = qblocks = resblks;
2304 resrtextents /= mp->m_sb.sb_rextsize; 2312 resrtextents /= mp->m_sb.sb_rextsize;
2305 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); 2313 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
2306 quota_flag = XFS_QMOPT_RES_RTBLKS; 2314 quota_flag = XFS_QMOPT_RES_RTBLKS;
2307 } else { 2315 } else {
2308 resrtextents = 0; 2316 resrtextents = 0;
2309 resblks = qblocks = \ 2317 resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
2310 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
2311 quota_flag = XFS_QMOPT_RES_REGBLKS; 2318 quota_flag = XFS_QMOPT_RES_REGBLKS;
2312 } 2319 }
2313 2320