Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 3
-rw-r--r--  fs/afs/Makefile | 1
-rw-r--r--  fs/afs/afs.h | 8
-rw-r--r--  fs/afs/afs_fs.h | 3
-rw-r--r--  fs/afs/callback.c | 3
-rw-r--r--  fs/afs/dir.c | 1
-rw-r--r--  fs/afs/file.c | 2
-rw-r--r--  fs/afs/flock.c | 558
-rw-r--r--  fs/afs/fsclient.c | 155
-rw-r--r--  fs/afs/internal.h | 30
-rw-r--r--  fs/afs/main.c | 1
-rw-r--r--  fs/afs/misc.c | 1
-rw-r--r--  fs/afs/proc.c | 81
-rw-r--r--  fs/afs/super.c | 3
-rw-r--r--  fs/afs/vnode.c | 132
-rw-r--r--  fs/anon_inodes.c | 10
-rw-r--r--  fs/binfmt_elf.c | 109
-rw-r--r--  fs/block_dev.c | 63
-rw-r--r--  fs/buffer.c | 63
-rw-r--r--  fs/cifs/cifsfs.c | 1
-rw-r--r--  fs/cifs/connect.c | 1
-rw-r--r--  fs/cifs/export.c | 1
-rw-r--r--  fs/compat_ioctl.c | 4
-rw-r--r--  fs/configfs/configfs_internal.h | 7
-rw-r--r--  fs/configfs/dir.c | 289
-rw-r--r--  fs/configfs/file.c | 28
-rw-r--r--  fs/configfs/item.c | 29
-rw-r--r--  fs/dcache.c | 7
-rw-r--r--  fs/dlm/config.c | 20
-rw-r--r--  fs/dquot.c | 7
-rw-r--r--  fs/drop_caches.c | 2
-rw-r--r--  fs/ecryptfs/inode.c | 4
-rw-r--r--  fs/efs/namei.c | 32
-rw-r--r--  fs/efs/super.c | 2
-rw-r--r--  fs/exportfs/expfs.c | 439
-rw-r--r--  fs/ext2/file.c | 6
-rw-r--r--  fs/ext2/super.c | 21
-rw-r--r--  fs/ext3/inode.c | 2
-rw-r--r--  fs/ext3/namei.c | 10
-rw-r--r--  fs/ext3/super.c | 52
-rw-r--r--  fs/ext4/balloc.c | 2
-rw-r--r--  fs/ext4/namei.c | 10
-rw-r--r--  fs/ext4/super.c | 49
-rw-r--r--  fs/fat/dir.c | 31
-rw-r--r--  fs/fat/fatent.c | 7
-rw-r--r--  fs/fat/inode.c | 4
-rw-r--r--  fs/freevxfs/vxfs_dir.h | 2
-rw-r--r--  fs/gfs2/eaops.c | 1
-rw-r--r--  fs/gfs2/ops_export.c | 1
-rw-r--r--  fs/hfsplus/btree.c | 4
-rw-r--r--  fs/hfsplus/dir.c | 2
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 4
-rw-r--r--  fs/hfsplus/inode.c | 5
-rw-r--r--  fs/hfsplus/super.c | 4
-rw-r--r--  fs/hfsplus/unicode.c | 230
-rw-r--r--  fs/hugetlbfs/inode.c | 96
-rw-r--r--  fs/inode.c | 17
-rw-r--r--  fs/ioctl.c | 22
-rw-r--r--  fs/isofs/dir.c | 87
-rw-r--r--  fs/isofs/inode.c | 417
-rw-r--r--  fs/isofs/isofs.h | 1
-rw-r--r--  fs/isofs/joliet.c | 10
-rw-r--r--  fs/isofs/namei.c | 26
-rw-r--r--  fs/jbd/commit.c | 3
-rw-r--r--  fs/jbd/revoke.c | 5
-rw-r--r--  fs/jbd2/commit.c | 3
-rw-r--r--  fs/jbd2/revoke.c | 5
-rw-r--r--  fs/jffs2/background.c | 1
-rw-r--r--  fs/jfs/jfs_inode.h | 1
-rw-r--r--  fs/jfs/namei.c | 32
-rw-r--r--  fs/jfs/super.c | 2
-rw-r--r--  fs/lockd/svc.c | 29
-rw-r--r--  fs/mbcache.c | 9
-rw-r--r--  fs/namespace.c | 23
-rw-r--r--  fs/ncpfs/file.c | 2
-rw-r--r--  fs/nfs/callback.c | 2
-rw-r--r--  fs/nfs/client.c | 54
-rw-r--r--  fs/nfs/super.c | 10
-rw-r--r--  fs/nfsd/auth.c | 18
-rw-r--r--  fs/nfsd/export.c | 289
-rw-r--r--  fs/nfsd/lockd.c | 1
-rw-r--r--  fs/nfsd/nfs4acl.c | 12
-rw-r--r--  fs/nfsd/nfs4callback.c | 2
-rw-r--r--  fs/nfsd/nfs4idmap.c | 13
-rw-r--r--  fs/nfsd/nfs4proc.c | 35
-rw-r--r--  fs/nfsd/nfs4state.c | 46
-rw-r--r--  fs/nfsd/nfs4xdr.c | 101
-rw-r--r--  fs/nfsd/nfsctl.c | 3
-rw-r--r--  fs/nfsd/nfsfh.c | 51
-rw-r--r--  fs/nfsd/nfsproc.c | 3
-rw-r--r--  fs/nfsd/nfssvc.c | 12
-rw-r--r--  fs/nfsd/vfs.c | 110
-rw-r--r--  fs/nls/Makefile | 2
-rw-r--r--  fs/ntfs/namei.c | 1
-rw-r--r--  fs/ocfs2/alloc.c | 2676
-rw-r--r--  fs/ocfs2/alloc.h | 43
-rw-r--r--  fs/ocfs2/aops.c | 1015
-rw-r--r--  fs/ocfs2/aops.h | 61
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 96
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 6
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 42
-rw-r--r--  fs/ocfs2/cluster/nodemanager.h | 5
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 21
-rw-r--r--  fs/ocfs2/dir.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 40
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 79
-rw-r--r--  fs/ocfs2/dlmglue.c | 6
-rw-r--r--  fs/ocfs2/endian.h | 5
-rw-r--r--  fs/ocfs2/export.h | 2
-rw-r--r--  fs/ocfs2/extent_map.c | 41
-rw-r--r--  fs/ocfs2/file.c | 702
-rw-r--r--  fs/ocfs2/file.h | 10
-rw-r--r--  fs/ocfs2/heartbeat.c | 10
-rw-r--r--  fs/ocfs2/ioctl.c | 15
-rw-r--r--  fs/ocfs2/journal.c | 6
-rw-r--r--  fs/ocfs2/journal.h | 2
-rw-r--r--  fs/ocfs2/mmap.c | 167
-rw-r--r--  fs/ocfs2/namei.c | 2
-rw-r--r--  fs/ocfs2/ocfs2.h | 14
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 33
-rw-r--r--  fs/ocfs2/slot_map.c | 12
-rw-r--r--  fs/ocfs2/suballoc.c | 46
-rw-r--r--  fs/ocfs2/suballoc.h | 17
-rw-r--r--  fs/ocfs2/super.c | 27
-rw-r--r--  fs/ocfs2/super.h | 2
-rw-r--r--  fs/open.c | 14
-rw-r--r--  fs/partitions/acorn.c | 9
-rw-r--r--  fs/partitions/ldm.c | 137
-rw-r--r--  fs/partitions/ldm.h | 2
-rw-r--r--  fs/proc/array.c | 68
-rw-r--r--  fs/proc/base.c | 85
-rw-r--r--  fs/proc/generic.c | 52
-rw-r--r--  fs/proc/inode.c | 254
-rw-r--r--  fs/proc/proc_misc.c | 7
-rw-r--r--  fs/proc/proc_tty.c | 15
-rw-r--r--  fs/quota.c | 118
-rw-r--r--  fs/ramfs/inode.c | 1
-rw-r--r--  fs/reiserfs/file.c | 1
-rw-r--r--  fs/reiserfs/inode.c | 1
-rw-r--r--  fs/reiserfs/super.c | 1
-rw-r--r--  fs/seq_file.c | 18
-rw-r--r--  fs/splice.c | 4
-rw-r--r--  fs/super.c | 1
-rw-r--r--  fs/udf/crc.c | 4
-rw-r--r--  fs/udf/ialloc.c | 9
-rw-r--r--  fs/udf/inode.c | 51
-rw-r--r--  fs/ufs/super.c | 5
-rw-r--r--  fs/utimes.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 14
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 2
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 10
153 files changed, 7910 insertions, 2292 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 94b9d861bf9b..613df554728d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -991,7 +991,7 @@ config TMPFS_POSIX_ACL
 
 config HUGETLBFS
 	bool "HugeTLB file system support"
-	depends on X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
+	depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || BROKEN
 	help
 	  hugetlbfs is a filesystem backing for HugeTLB pages, based on
 	  ramfs. For architectures that support it, say Y here and read
@@ -1675,6 +1675,7 @@ config NFSD_V3_ACL
 config NFSD_V4
 	bool "Provide NFSv4 server support (EXPERIMENTAL)"
 	depends on NFSD_V3 && EXPERIMENTAL
+	select RPCSEC_GSS_KRB5
 	help
 	  If you would like to include the NFSv4 server as well as the NFSv2
 	  and NFSv3 servers, say Y here.  This feature is experimental, and
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 73ce561f3ea0..a66671082cfb 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -8,6 +8,7 @@ kafs-objs := \
 	cmservice.o \
 	dir.o \
 	file.o \
+	flock.o \
 	fsclient.o \
 	inode.o \
 	main.o \
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index 245257948140..c548aa346f0d 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -37,6 +37,13 @@ typedef enum {
 	AFS_FTYPE_SYMLINK	= 3,
 } afs_file_type_t;
 
+typedef enum {
+	AFS_LOCK_READ		= 0,	/* read lock request */
+	AFS_LOCK_WRITE		= 1,	/* write lock request */
+} afs_lock_type_t;
+
+#define AFS_LOCKWAIT	(5 * 60) /* time until a lock times out (seconds) */
+
 /*
  * AFS file identifier
  */
@@ -120,6 +127,7 @@ struct afs_file_status {
 	struct afs_fid		parent;		/* parent dir ID for non-dirs only */
 	time_t			mtime_client;	/* last time client changed data */
 	time_t			mtime_server;	/* last time server changed data */
+	s32			lock_count;	/* file lock count (0=UNLK, -1=WRLCK, +ve=#RDLCK) */
 };
 
 /*
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
index a18c374ebe08..eb647323d8f0 100644
--- a/fs/afs/afs_fs.h
+++ b/fs/afs/afs_fs.h
@@ -31,6 +31,9 @@ enum AFS_FS_Operations {
 	FSGETVOLUMEINFO		= 148,	/* AFS Get information about a volume */
 	FSGETVOLUMESTATUS	= 149,	/* AFS Get volume status information */
 	FSGETROOTVOLUME		= 151,	/* AFS Get root volume name */
+	FSSETLOCK		= 156,	/* AFS Request a file lock */
+	FSEXTENDLOCK		= 157,	/* AFS Extend a file lock */
+	FSRELEASELOCK		= 158,	/* AFS Release a file lock */
 	FSLOOKUP		= 161,	/* AFS lookup file in directory */
 	FSFETCHDATA64		= 65537, /* AFS Fetch file data */
 	FSSTOREDATA64		= 65538, /* AFS Store file data */
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index bacf518c6fa8..b8243945818d 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -125,6 +125,9 @@ static void afs_break_callback(struct afs_server *server,
 		spin_unlock(&server->cb_lock);
 
 		queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+		if (list_empty(&vnode->granted_locks) &&
+		    !list_empty(&vnode->pending_locks))
+			afs_lock_may_be_available(vnode);
 		spin_unlock(&vnode->lock);
 	}
 }
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 546c59522eb1..33fe39ad4e03 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -44,6 +44,7 @@ const struct file_operations afs_dir_file_operations = {
 	.open		= afs_dir_open,
 	.release	= afs_release,
 	.readdir	= afs_readdir,
+	.lock		= afs_lock,
 };
 
 const struct inode_operations afs_dir_inode_operations = {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index aede7eb66dd4..525f7c56e068 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -34,6 +34,8 @@ const struct file_operations afs_file_operations = {
 	.mmap		= generic_file_readonly_mmap,
 	.splice_read	= generic_file_splice_read,
 	.fsync		= afs_fsync,
+	.lock		= afs_lock,
+	.flock		= afs_flock,
 };
 
 const struct inode_operations afs_file_inode_operations = {
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
new file mode 100644
index 000000000000..8f07f8d1bfa9
--- /dev/null
+++ b/fs/afs/flock.c
@@ -0,0 +1,558 @@
+/* AFS file locking support
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/smp_lock.h>
+#include "internal.h"
+
+#define AFS_LOCK_GRANTED	0
+#define AFS_LOCK_PENDING	1
+
+static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl);
+static void afs_fl_release_private(struct file_lock *fl);
+
+static struct workqueue_struct *afs_lock_manager;
+
+static struct file_lock_operations afs_lock_ops = {
+	.fl_copy_lock		= afs_fl_copy_lock,
+	.fl_release_private	= afs_fl_release_private,
+};
+
+/*
+ * initialise the lock manager thread if it isn't already running
+ */
+static int afs_init_lock_manager(void)
+{
+	if (!afs_lock_manager) {
+		afs_lock_manager = create_singlethread_workqueue("kafs_lockd");
+		if (!afs_lock_manager)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+/*
+ * destroy the lock manager thread if it's running
+ */
+void __exit afs_kill_lock_manager(void)
+{
+	if (afs_lock_manager)
+		destroy_workqueue(afs_lock_manager);
+}
+
+/*
+ * if the callback is broken on this vnode, then the lock may now be available
+ */
+void afs_lock_may_be_available(struct afs_vnode *vnode)
+{
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
+	queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0);
+}
+
+/*
+ * the lock will time out in 5 minutes unless we extend it, so schedule
+ * extension in a bit less than that time
+ */
+static void afs_schedule_lock_extension(struct afs_vnode *vnode)
+{
+	queue_delayed_work(afs_lock_manager, &vnode->lock_work,
+			   AFS_LOCKWAIT * HZ / 2);
+}
+
+/*
+ * do work for a lock, including:
+ * - probing for a lock we're waiting on but didn't get immediately
+ * - extending a lock that's close to timing out
+ */
+void afs_lock_work(struct work_struct *work)
+{
+	struct afs_vnode *vnode =
+		container_of(work, struct afs_vnode, lock_work.work);
+	struct file_lock *fl;
+	afs_lock_type_t type;
+	struct key *key;
+	int ret;
+
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
+	spin_lock(&vnode->lock);
+
+	if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) {
+		_debug("unlock");
+		spin_unlock(&vnode->lock);
+
+		/* attempt to release the server lock; if it fails, we just
+		 * wait 5 minutes and it'll time out anyway */
+		ret = afs_vnode_release_lock(vnode, vnode->unlock_key);
+		if (ret < 0)
+			printk(KERN_WARNING "AFS:"
+			       " Failed to release lock on {%x:%x} error %d\n",
+			       vnode->fid.vid, vnode->fid.vnode, ret);
+
+		spin_lock(&vnode->lock);
+		key_put(vnode->unlock_key);
+		vnode->unlock_key = NULL;
+		clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags);
+	}
+
+	/* if we've got a lock, then it must be time to extend that lock as AFS
+	 * locks time out after 5 minutes */
+	if (!list_empty(&vnode->granted_locks)) {
+		_debug("extend");
+
+		if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags))
+			BUG();
+		fl = list_entry(vnode->granted_locks.next,
+				struct file_lock, fl_u.afs.link);
+		key = key_get(fl->fl_file->private_data);
+		spin_unlock(&vnode->lock);
+
+		ret = afs_vnode_extend_lock(vnode, key);
+		clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
+		key_put(key);
+		switch (ret) {
+		case 0:
+			afs_schedule_lock_extension(vnode);
+			break;
+		default:
+			/* ummm... we failed to extend the lock - retry
+			 * extension shortly */
+			printk(KERN_WARNING "AFS:"
+			       " Failed to extend lock on {%x:%x} error %d\n",
+			       vnode->fid.vid, vnode->fid.vnode, ret);
+			queue_delayed_work(afs_lock_manager, &vnode->lock_work,
+					   HZ * 10);
+			break;
+		}
+		_leave(" [extend]");
+		return;
+	}
+
+	/* if we don't have a granted lock, then we must've been called back by
+	 * the server, and so it might be possible to get a lock we're
+	 * currently waiting for */
+	if (!list_empty(&vnode->pending_locks)) {
+		_debug("get");
+
+		if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags))
+			BUG();
+		fl = list_entry(vnode->pending_locks.next,
+				struct file_lock, fl_u.afs.link);
+		key = key_get(fl->fl_file->private_data);
+		type = (fl->fl_type == F_RDLCK) ?
+			AFS_LOCK_READ : AFS_LOCK_WRITE;
+		spin_unlock(&vnode->lock);
+
+		ret = afs_vnode_set_lock(vnode, key, type);
+		clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
+		switch (ret) {
+		case -EWOULDBLOCK:
+			_debug("blocked");
+			break;
+		case 0:
+			_debug("acquired");
+			if (type == AFS_LOCK_READ)
+				set_bit(AFS_VNODE_READLOCKED, &vnode->flags);
+			else
+				set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
+			ret = AFS_LOCK_GRANTED;
+		default:
+			spin_lock(&vnode->lock);
+			/* the pending lock may have been withdrawn due to a
+			 * signal */
+			if (list_entry(vnode->pending_locks.next,
+				       struct file_lock, fl_u.afs.link) == fl) {
+				fl->fl_u.afs.state = ret;
+				if (ret == AFS_LOCK_GRANTED)
+					list_move_tail(&fl->fl_u.afs.link,
+						       &vnode->granted_locks);
+				else
+					list_del_init(&fl->fl_u.afs.link);
+				wake_up(&fl->fl_wait);
+				spin_unlock(&vnode->lock);
+			} else {
+				_debug("withdrawn");
+				clear_bit(AFS_VNODE_READLOCKED, &vnode->flags);
+				clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
+				spin_unlock(&vnode->lock);
+				afs_vnode_release_lock(vnode, key);
+				if (!list_empty(&vnode->pending_locks))
+					afs_lock_may_be_available(vnode);
+			}
+			break;
+		}
+		key_put(key);
+		_leave(" [pend]");
+		return;
+	}
+
+	/* looks like the lock request was withdrawn on a signal */
+	spin_unlock(&vnode->lock);
+	_leave(" [no locks]");
+}
+
+/*
+ * pass responsibility for the unlocking of a vnode on the server to the
+ * manager thread, lest a pending signal in the calling thread interrupt
+ * AF_RXRPC
+ * - the caller must hold the vnode lock
+ */
+static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
+{
+	cancel_delayed_work(&vnode->lock_work);
+	if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) &&
+	    !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags))
+		BUG();
+	if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags))
+		BUG();
+	vnode->unlock_key = key_get(key);
+	afs_lock_may_be_available(vnode);
+}
+
+/*
+ * request a lock on a file on the server
+ */
+static int afs_do_setlk(struct file *file, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	afs_lock_type_t type;
+	struct key *key = file->private_data;
+	int ret;
+
+	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+
+	/* only whole-file locks are supported */
+	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
+		return -EINVAL;
+
+	ret = afs_init_lock_manager();
+	if (ret < 0)
+		return ret;
+
+	fl->fl_ops = &afs_lock_ops;
+	INIT_LIST_HEAD(&fl->fl_u.afs.link);
+	fl->fl_u.afs.state = AFS_LOCK_PENDING;
+
+	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
+
+	lock_kernel();
+
+	/* make sure we've got a callback on this file and that our view of the
+	 * data version is up to date */
+	ret = afs_vnode_fetch_status(vnode, NULL, key);
+	if (ret < 0)
+		goto error;
+
+	if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) {
+		ret = -EAGAIN;
+		goto error;
+	}
+
+	spin_lock(&vnode->lock);
+
+	if (list_empty(&vnode->pending_locks)) {
+		/* if there's no-one else with a lock on this vnode, then we
+		 * need to ask the server for a lock */
+		if (list_empty(&vnode->granted_locks)) {
+			_debug("not locked");
+			ASSERTCMP(vnode->flags &
+				  ((1 << AFS_VNODE_LOCKING) |
+				   (1 << AFS_VNODE_READLOCKED) |
+				   (1 << AFS_VNODE_WRITELOCKED)), ==, 0);
+			list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
+			set_bit(AFS_VNODE_LOCKING, &vnode->flags);
+			spin_unlock(&vnode->lock);
+
+			ret = afs_vnode_set_lock(vnode, key, type);
+			clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
+			switch (ret) {
+			case 0:
+				goto acquired_server_lock;
+			case -EWOULDBLOCK:
+				spin_lock(&vnode->lock);
+				ASSERT(list_empty(&vnode->granted_locks));
+				ASSERTCMP(vnode->pending_locks.next, ==,
+					  &fl->fl_u.afs.link);
+				goto wait;
+			default:
+				spin_lock(&vnode->lock);
+				list_del_init(&fl->fl_u.afs.link);
+				spin_unlock(&vnode->lock);
+				goto error;
+			}
+		}
+
+		/* if we've already got a readlock on the server and no waiting
+		 * writelocks, then we might be able to instantly grant another
+		 * readlock */
+		if (type == AFS_LOCK_READ &&
+		    vnode->flags & (1 << AFS_VNODE_READLOCKED)) {
+			_debug("instant readlock");
+			ASSERTCMP(vnode->flags &
+				  ((1 << AFS_VNODE_LOCKING) |
+				   (1 << AFS_VNODE_WRITELOCKED)), ==, 0);
+			ASSERT(!list_empty(&vnode->granted_locks));
+			goto sharing_existing_lock;
+		}
+	}
+
+	/* otherwise, we need to wait for a local lock to become available */
+	_debug("wait local");
+	list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
+wait:
+	if (!(fl->fl_flags & FL_SLEEP)) {
+		_debug("noblock");
+		ret = -EAGAIN;
+		goto abort_attempt;
+	}
+	spin_unlock(&vnode->lock);
+
+	/* now we need to sleep and wait for the lock manager thread to get the
+	 * lock from the server */
+	_debug("sleep");
+	ret = wait_event_interruptible(fl->fl_wait,
+				       fl->fl_u.afs.state <= AFS_LOCK_GRANTED);
+	if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
+		ret = fl->fl_u.afs.state;
+		if (ret < 0)
+			goto error;
+		spin_lock(&vnode->lock);
+		goto given_lock;
+	}
+
+	/* we were interrupted, but someone may still be in the throes of
+	 * giving us the lock */
+	_debug("intr");
+	ASSERTCMP(ret, ==, -ERESTARTSYS);
+
+	spin_lock(&vnode->lock);
+	if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
+		ret = fl->fl_u.afs.state;
+		if (ret < 0) {
+			spin_unlock(&vnode->lock);
+			goto error;
+		}
+		goto given_lock;
+	}
+
+abort_attempt:
+	/* we aren't going to get the lock, either because we're unwilling to
+	 * wait, or because some signal happened */
+	_debug("abort");
+	if (list_empty(&vnode->granted_locks) &&
+	    vnode->pending_locks.next == &fl->fl_u.afs.link) {
+		if (vnode->pending_locks.prev != &fl->fl_u.afs.link) {
+			/* kick the next pending lock into having a go */
+			list_del_init(&fl->fl_u.afs.link);
+			afs_lock_may_be_available(vnode);
+		}
+	} else {
+		list_del_init(&fl->fl_u.afs.link);
+	}
+	spin_unlock(&vnode->lock);
+	goto error;
+
+acquired_server_lock:
+	/* we've acquired a server lock, but it needs to be renewed after 5
+	 * mins */
+	spin_lock(&vnode->lock);
+	afs_schedule_lock_extension(vnode);
+	if (type == AFS_LOCK_READ)
+		set_bit(AFS_VNODE_READLOCKED, &vnode->flags);
+	else
+		set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
+sharing_existing_lock:
+	/* the lock has been granted as far as we're concerned... */
+	fl->fl_u.afs.state = AFS_LOCK_GRANTED;
+	list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
+given_lock:
+	/* ... but we do still need to get the VFS's blessing */
+	ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING)));
+	ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) |
+				(1 << AFS_VNODE_WRITELOCKED))) != 0);
+	ret = posix_lock_file(file, fl, NULL);
+	if (ret < 0)
+		goto vfs_rejected_lock;
+	spin_unlock(&vnode->lock);
+
+	/* again, make sure we've got a callback on this file and, again, make
+	 * sure that our view of the data version is up to date (we ignore
+	 * errors incurred here and deal with the consequences elsewhere) */
+	afs_vnode_fetch_status(vnode, NULL, key);
+
+error:
+	unlock_kernel();
+	_leave(" = %d", ret);
+	return ret;
+
+vfs_rejected_lock:
+	/* the VFS rejected the lock we just obtained, so we have to discard
+	 * what we just got */
+	_debug("vfs refused %d", ret);
+	list_del_init(&fl->fl_u.afs.link);
+	if (list_empty(&vnode->granted_locks))
+		afs_defer_unlock(vnode, key);
+	spin_unlock(&vnode->lock);
+	goto abort_attempt;
+}
+
+/*
+ * unlock on a file on the server
+ */
+static int afs_do_unlk(struct file *file, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	struct key *key = file->private_data;
+	int ret;
+
+	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+
+	/* only whole-file unlocks are supported */
+	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
+		return -EINVAL;
+
+	fl->fl_ops = &afs_lock_ops;
+	INIT_LIST_HEAD(&fl->fl_u.afs.link);
+	fl->fl_u.afs.state = AFS_LOCK_PENDING;
+
+	spin_lock(&vnode->lock);
+	ret = posix_lock_file(file, fl, NULL);
+	if (ret < 0) {
+		spin_unlock(&vnode->lock);
+		_leave(" = %d [vfs]", ret);
+		return ret;
+	}
+
+	/* discard the server lock only if all granted locks are gone */
+	if (list_empty(&vnode->granted_locks))
+		afs_defer_unlock(vnode, key);
+	spin_unlock(&vnode->lock);
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * return information about a lock we currently hold, if indeed we hold one
+ */
+static int afs_do_getlk(struct file *file, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	struct key *key = file->private_data;
+	int ret, lock_count;
+
+	_enter("");
+
+	fl->fl_type = F_UNLCK;
+
+	mutex_lock(&vnode->vfs_inode.i_mutex);
+
+	/* check local lock records first */
+	ret = 0;
+	if (posix_test_lock(file, fl) == 0) {
+		/* no local locks; consult the server */
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto error;
+		lock_count = vnode->status.lock_count;
+		if (lock_count) {
+			if (lock_count > 0)
+				fl->fl_type = F_RDLCK;
+			else
+				fl->fl_type = F_WRLCK;
+			fl->fl_start = 0;
+			fl->fl_end = OFFSET_MAX;
+		}
+	}
+
+error:
+	mutex_unlock(&vnode->vfs_inode.i_mutex);
+	_leave(" = %d [%hd]", ret, fl->fl_type);
+	return ret;
+}
+
+/*
+ * manage POSIX locks on a file
+ */
+int afs_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+
+	_enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
+	       vnode->fid.vid, vnode->fid.vnode, cmd,
+	       fl->fl_type, fl->fl_flags,
+	       (long long) fl->fl_start, (long long) fl->fl_end);
+
+	/* AFS doesn't support mandatory locks */
+	if ((vnode->vfs_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
+	    fl->fl_type != F_UNLCK)
+		return -ENOLCK;
+
+	if (IS_GETLK(cmd))
+		return afs_do_getlk(file, fl);
+	if (fl->fl_type == F_UNLCK)
+		return afs_do_unlk(file, fl);
+	return afs_do_setlk(file, fl);
+}
+
+/*
+ * manage FLOCK locks on a file
+ */
+int afs_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+
+	_enter("{%x:%u},%d,{t=%x,fl=%x}",
+	       vnode->fid.vid, vnode->fid.vnode, cmd,
+	       fl->fl_type, fl->fl_flags);
+
+	/*
+	 * No BSD flocks over NFS allowed.
+	 * Note: we could try to fake a POSIX lock request here by
+	 * using ((u32) filp | 0x80000000) or some such as the pid.
+	 * Not sure whether that would be unique, though, or whether
+	 * that would break in other places.
+	 */
+	if (!(fl->fl_flags & FL_FLOCK))
+		return -ENOLCK;
+
+	/* we're simulating flock() locks using posix locks on the server */
+	fl->fl_owner = (fl_owner_t) file;
+	fl->fl_start = 0;
+	fl->fl_end = OFFSET_MAX;
+
+	if (fl->fl_type == F_UNLCK)
+		return afs_do_unlk(file, fl);
+	return afs_do_setlk(file, fl);
+}
+
+/*
+ * the POSIX lock management core VFS code copies the lock record and adds the
+ * copy into its own list, so we need to add that copy to the vnode's lock
+ * queue in the same place as the original (which will be deleted shortly
+ * after)
+ */
+static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
+{
+	_enter("");
+
+	list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link);
+}
+
+/*
+ * need to remove this lock from the vnode queue when it's removed from the
+ * VFS's list
+ */
+static void afs_fl_release_private(struct file_lock *fl)
+{
+	_enter("");
+
+	list_del_init(&fl->fl_u.afs.link);
+}
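
[Note on usage: since afs_do_setlk() and afs_do_unlk() above return -EINVAL for anything other than the 0..OFFSET_MAX range, the only pattern this code serves from userspace is a whole-file fcntl(2) lock. A minimal, hypothetical userspace sketch follows; the /afs path is invented and the fcntl usage is plain POSIX, nothing AFS-specific is assumed.]

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* hypothetical file on an AFS mount */
            int fd = open("/afs/example.com/data/shared.db", O_RDWR);
            struct flock fl = {
                    .l_type   = F_WRLCK,    /* maps to AFS_LOCK_WRITE */
                    .l_whence = SEEK_SET,
                    .l_start  = 0,          /* whole file: fl_start 0 ... */
                    .l_len    = 0,          /* ... fl_end OFFSET_MAX in the kernel */
            };

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* F_SETLKW sets FL_SLEEP, so the kernel may block on the lock manager */
            if (fcntl(fd, F_SETLKW, &fl) < 0)
                    perror("fcntl(F_SETLKW)");

            /* ... I/O under the lock; kafs renews the server lock every ~2.5 min ... */

            fl.l_type = F_UNLCK;
            fcntl(fd, F_SETLK, &fl);
            close(fd);
            return 0;
    }
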
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 5dff1308b6f0..023b95b0d9d7 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -67,7 +67,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	EXTRACT(status->group);
 	bp++; /* sync counter */
 	data_version |= (u64) ntohl(*bp++) << 32;
-	bp++; /* lock count */
+	EXTRACT(status->lock_count);
 	size |= (u64) ntohl(*bp++) << 32;
 	bp++; /* spare 4 */
 	*_bp = bp;
@@ -1748,3 +1748,156 @@ int afs_fs_get_volume_status(struct afs_server *server,
 
 	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
 }
+
+/*
+ * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock
+ */
+static int afs_deliver_fs_xxxx_lock(struct afs_call *call,
+				    struct sk_buff *skb, bool last)
+{
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.SetLock operation type
+ */
+static const struct afs_call_type afs_RXFSSetLock = {
+	.name		= "FS.SetLock",
+	.deliver	= afs_deliver_fs_xxxx_lock,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * FS.ExtendLock operation type
+ */
+static const struct afs_call_type afs_RXFSExtendLock = {
+	.name		= "FS.ExtendLock",
+	.deliver	= afs_deliver_fs_xxxx_lock,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * FS.ReleaseLock operation type
+ */
+static const struct afs_call_type afs_RXFSReleaseLock = {
+	.name		= "FS.ReleaseLock",
+	.deliver	= afs_deliver_fs_xxxx_lock,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * get a lock on a file
+ */
+int afs_fs_set_lock(struct afs_server *server,
+		    struct key *key,
+		    struct afs_vnode *vnode,
+		    afs_lock_type_t type,
+		    const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSETLOCK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(type);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * extend a lock on a file
+ */
+int afs_fs_extend_lock(struct afs_server *server,
+		       struct key *key,
+		       struct afs_vnode *vnode,
+		       const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSEXTENDLOCK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * release a lock on a file
+ */
+int afs_fs_release_lock(struct afs_server *server,
+			struct key *key,
+			struct afs_vnode *vnode,
+			const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSRELEASELOCK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
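
[Note on the wire format: the three RPCs marshalled above are each a short run of big-endian 32-bit words, with FS.SetLock additionally carrying the lock type. A standalone sketch of the same layout in plain C follows; it is illustrative only, the struct and function names are invented, and htonl() is the userspace one from <arpa/inet.h>.]

    #include <arpa/inet.h>
    #include <stdint.h>

    #define FSSETLOCK 156  /* operation code, as defined in afs_fs.h above */

    struct example_fid { uint32_t vid, vnode, unique; };

    /* mirrors the marshalling in afs_fs_set_lock(): 5 x 4 bytes */
    static void marshal_set_lock(uint32_t *bp, const struct example_fid *fid,
                                 uint32_t type /* 0 = read, 1 = write */)
    {
            *bp++ = htonl(FSSETLOCK);    /* opcode */
            *bp++ = htonl(fid->vid);     /* volume ID */
            *bp++ = htonl(fid->vnode);   /* vnode number */
            *bp++ = htonl(fid->unique);  /* uniquifier */
            *bp++ = htonl(type);         /* AFS_LOCK_READ / AFS_LOCK_WRITE */
    }
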
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 2c55dd94a1de..6306438f331f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -351,10 +351,18 @@ struct afs_vnode {
 #define AFS_VNODE_ZAP_DATA	3	/* set if vnode's data should be invalidated */
 #define AFS_VNODE_DELETED	4	/* set if vnode deleted on server */
 #define AFS_VNODE_MOUNTPOINT	5	/* set if vnode is a mountpoint symlink */
+#define AFS_VNODE_LOCKING	6	/* set if waiting for lock on vnode */
+#define AFS_VNODE_READLOCKED	7	/* set if vnode is read-locked on the server */
+#define AFS_VNODE_WRITELOCKED	8	/* set if vnode is write-locked on the server */
+#define AFS_VNODE_UNLOCKING	9	/* set if vnode is being unlocked on the server */
 
 	long			acl_order;	/* ACL check count (callback break count) */
 
 	struct list_head	writebacks;	/* alterations in pagecache that need writing */
+	struct list_head	pending_locks;	/* locks waiting to be granted */
+	struct list_head	granted_locks;	/* locks granted on this file */
+	struct delayed_work	lock_work;	/* work to be done in locking */
+	struct key		*unlock_key;	/* key to be used in unlocking */
 
 	/* outstanding callback notification on this file */
 	struct rb_node		server_rb;	/* link in server->fs_vnodes */
@@ -474,6 +482,15 @@ extern int afs_open(struct inode *, struct file *);
 extern int afs_release(struct inode *, struct file *);
 
 /*
+ * flock.c
+ */
+extern void __exit afs_kill_lock_manager(void);
+extern void afs_lock_work(struct work_struct *);
+extern void afs_lock_may_be_available(struct afs_vnode *);
+extern int afs_lock(struct file *, int, struct file_lock *);
+extern int afs_flock(struct file *, int, struct file_lock *);
+
+/*
  * fsclient.c
  */
 extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
@@ -513,6 +530,15 @@ extern int afs_fs_get_volume_status(struct afs_server *, struct key *,
 				    struct afs_vnode *,
 				    struct afs_volume_status *,
 				    const struct afs_wait_mode *);
+extern int afs_fs_set_lock(struct afs_server *, struct key *,
+			   struct afs_vnode *, afs_lock_type_t,
+			   const struct afs_wait_mode *);
+extern int afs_fs_extend_lock(struct afs_server *, struct key *,
+			      struct afs_vnode *,
+			      const struct afs_wait_mode *);
+extern int afs_fs_release_lock(struct afs_server *, struct key *,
+			       struct afs_vnode *,
+			       const struct afs_wait_mode *);
 
 /*
  * inode.c
@@ -681,6 +707,10 @@ extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t,
 extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *);
 extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *,
 				       struct afs_volume_status *);
+extern int afs_vnode_set_lock(struct afs_vnode *, struct key *,
+			      afs_lock_type_t);
+extern int afs_vnode_extend_lock(struct afs_vnode *, struct key *);
+extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
 
 /*
  * volume.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index cd21195bbb24..0f60f6b35769 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -168,6 +168,7 @@ static void __exit afs_exit(void)
 	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n");
 
 	afs_fs_exit();
+	afs_kill_lock_manager();
 	afs_close_socket();
 	afs_purge_servers();
 	afs_callback_update_kill();
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index d1a889c40742..2d33a5f7d218 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -35,6 +35,7 @@ int afs_abort_to_error(u32 abort_code)
 	case VOVERQUOTA:	return -EDQUOT;
 	case VBUSY:		return -EBUSY;
 	case VMOVED:		return -ENXIO;
+	case 0x2f6df0a:		return -EWOULDBLOCK;
 	case 0x2f6df0c:		return -EACCES;
 	case 0x2f6df0f:		return -EBUSY;
 	case 0x2f6df10:		return -EEXIST;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 13df512aea9e..6edb56683b9a 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -201,23 +201,9 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
  */
 static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
 {
-	struct list_head *_p;
-	loff_t pos = *_pos;
-
 	/* lock the list against modification */
 	down_read(&afs_proc_cells_sem);
-
-	/* allow for the header line */
-	if (!pos)
-		return (void *) 1;
-	pos--;
-
-	/* find the n'th element in the list */
-	list_for_each(_p, &afs_proc_cells)
-		if (!pos--)
-			break;
-
-	return _p != &afs_proc_cells ? _p : NULL;
+	return seq_list_start_head(&afs_proc_cells, *_pos);
 }
 
 /*
@@ -225,14 +211,7 @@ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
  */
 static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos)
 {
-	struct list_head *_p;
-
-	(*pos)++;
-
-	_p = v;
-	_p = v == (void *) 1 ? afs_proc_cells.next : _p->next;
-
-	return _p != &afs_proc_cells ? _p : NULL;
+	return seq_list_next(v, &afs_proc_cells, pos);
 }
 
 /*
@@ -250,7 +229,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
 {
 	struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
 
-	if (v == (void *) 1) {
+	if (v == &afs_proc_cells) {
 		/* display header on line 1 */
 		seq_puts(m, "USE NAME\n");
 		return 0;
@@ -503,26 +482,13 @@ static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file)
  */
 static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
 {
-	struct list_head *_p;
 	struct afs_cell *cell = m->private;
-	loff_t pos = *_pos;
 
 	_enter("cell=%p pos=%Ld", cell, *_pos);
 
 	/* lock the list against modification */
 	down_read(&cell->vl_sem);
-
-	/* allow for the header line */
-	if (!pos)
-		return (void *) 1;
-	pos--;
-
-	/* find the n'th element in the list */
-	list_for_each(_p, &cell->vl_list)
-		if (!pos--)
-			break;
-
-	return _p != &cell->vl_list ? _p : NULL;
+	return seq_list_start_head(&cell->vl_list, *_pos);
 }
 
 /*
@@ -531,17 +497,10 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
 static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
 					loff_t *_pos)
 {
-	struct list_head *_p;
 	struct afs_cell *cell = p->private;
 
 	_enter("cell=%p pos=%Ld", cell, *_pos);
-
-	(*_pos)++;
-
-	_p = v;
-	_p = (v == (void *) 1) ? cell->vl_list.next : _p->next;
-
-	return (_p != &cell->vl_list) ? _p : NULL;
+	return seq_list_next(v, &cell->vl_list, _pos);
 }
 
 /*
@@ -569,11 +528,12 @@ const char afs_vlocation_states[][4] = {
  */
 static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
 {
+	struct afs_cell *cell = m->private;
 	struct afs_vlocation *vlocation =
 		list_entry(v, struct afs_vlocation, link);
 
 	/* display header on line 1 */
-	if (v == (void *) 1) {
+	if (v == &cell->vl_list) {
 		seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n");
 		return 0;
 	}
@@ -734,26 +694,13 @@ static int afs_proc_cell_servers_release(struct inode *inode,
 static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
 	__acquires(m->private->servers_lock)
 {
-	struct list_head *_p;
 	struct afs_cell *cell = m->private;
-	loff_t pos = *_pos;
 
 	_enter("cell=%p pos=%Ld", cell, *_pos);
 
 	/* lock the list against modification */
 	read_lock(&cell->servers_lock);
-
-	/* allow for the header line */
-	if (!pos)
-		return (void *) 1;
-	pos--;
-
-	/* find the n'th element in the list */
-	list_for_each(_p, &cell->servers)
-		if (!pos--)
-			break;
-
-	return _p != &cell->servers ? _p : NULL;
+	return seq_list_start_head(&cell->servers, *_pos);
 }
 
 /*
@@ -762,17 +709,10 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
 static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
 					loff_t *_pos)
 {
-	struct list_head *_p;
 	struct afs_cell *cell = p->private;
 
 	_enter("cell=%p pos=%Ld", cell, *_pos);
-
-	(*_pos)++;
-
-	_p = v;
-	_p = v == (void *) 1 ? cell->servers.next : _p->next;
-
-	return _p != &cell->servers ? _p : NULL;
+	return seq_list_next(v, &cell->servers, _pos);
 }
 
 /*
@@ -791,11 +731,12 @@ static void afs_proc_cell_servers_stop(struct seq_file *p, void *v)
  */
 static int afs_proc_cell_servers_show(struct seq_file *m, void *v)
 {
+	struct afs_cell *cell = m->private;
 	struct afs_server *server = list_entry(v, struct afs_server, link);
 	char ipaddr[20];
 
 	/* display header on line 1 */
-	if (v == (void *) 1) {
+	if (v == &cell->servers) {
 		seq_puts(m, "USE ADDR STATE\n");
 		return 0;
 	}
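
[Note on the pattern: every conversion in this file has the same shape - the hand-rolled list walk and the (void *) 1 header sentinel give way to seq_list_start_head()/seq_list_next(), which hand back the list head itself as the header sentinel. A compact sketch of the resulting pattern for a hypothetical list follows; my_list, my_lock and struct my_item are invented, while the seq_list_* helpers are the real <linux/seq_file.h> ones used above.]

    #include <linux/list.h>
    #include <linux/mutex.h>
    #include <linux/seq_file.h>

    static LIST_HEAD(my_list);
    static DEFINE_MUTEX(my_lock);

    struct my_item {
            struct list_head link;
            int value;
    };

    static void *my_seq_start(struct seq_file *m, loff_t *pos)
    {
            mutex_lock(&my_lock);
            /* at *pos == 0 this returns &my_list itself, the header sentinel */
            return seq_list_start_head(&my_list, *pos);
    }

    static void *my_seq_next(struct seq_file *m, void *v, loff_t *pos)
    {
            return seq_list_next(v, &my_list, pos);
    }

    static void my_seq_stop(struct seq_file *m, void *v)
    {
            mutex_unlock(&my_lock);
    }

    static int my_seq_show(struct seq_file *m, void *v)
    {
            if (v == &my_list) {            /* header line */
                    seq_puts(m, "VALUE\n");
                    return 0;
            }
            seq_printf(m, "%d\n", list_entry(v, struct my_item, link)->value);
            return 0;
    }
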
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 2e8496ba1205..993cdf1cce3a 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -460,6 +460,9 @@ static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep,
 	spin_lock_init(&vnode->writeback_lock);
 	spin_lock_init(&vnode->lock);
 	INIT_LIST_HEAD(&vnode->writebacks);
+	INIT_LIST_HEAD(&vnode->pending_locks);
+	INIT_LIST_HEAD(&vnode->granted_locks);
+	INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work);
 	INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
 }
465 468
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 232c55dc245d..2f05c4fc2a70 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -561,7 +561,7 @@ no_server:
 /*
  * create a hard link
  */
-extern int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
+int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
 		   struct key *key, const char *name)
 {
 	struct afs_server *server;
@@ -887,11 +887,6 @@ int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
 	       vnode->fid.unique,
 	       key_serial(key));
 
-	/* this op will fetch the status */
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-
 	do {
 		/* pick a server to query */
 		server = afs_volume_pick_fileserver(vnode);
@@ -905,20 +900,127 @@ int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
 	/* adjust the flags */
-	if (ret == 0) {
-		afs_vnode_finalise_status_update(vnode, server);
+	if (ret == 0)
+		afs_put_server(server);
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	return PTR_ERR(server);
+}
+
+/*
+ * get a lock on a file
+ */
+int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key,
+		       afs_lock_type_t type)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,%u",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key), type);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_set_lock(server, key, vnode, type, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0)
+		afs_put_server(server);
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	return PTR_ERR(server);
+}
+
+/*
+ * extend a lock on a file
+ */
+int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_extend_lock(server, key, vnode, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0)
+		afs_put_server(server);
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	return PTR_ERR(server);
+}
+
+/*
+ * release a lock on a file
+ */
+int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_release_lock(server, key, vnode, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0)
 		afs_put_server(server);
-	} else {
-		afs_vnode_status_update_failed(vnode, ret);
-	}
 
 	_leave(" = %d", ret);
 	return ret;
 
 no_server:
-	spin_lock(&vnode->lock);
-	vnode->update_cnt--;
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-	spin_unlock(&vnode->lock);
 	return PTR_ERR(server);
 }
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index edc67486238f..b4a75880f6fd 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -53,7 +53,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = {
 };
 
 /**
- * anon_inode_getfd - creates a new file instance by hooking it up to and
+ * anon_inode_getfd - creates a new file instance by hooking it up to an
  *                    anonymous inode, and a dentry that describe the "class"
  *                    of the file
  *
@@ -66,7 +66,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = {
  *
  * Creates a new file by hooking it on a single inode. This is useful for files
  * that do not need to have a full-fledged inode in order to operate correctly.
- * All the files created with anon_inode_getfd() will share a single inode, by
+ * All the files created with anon_inode_getfd() will share a single inode,
  * hence saving memory and avoiding code duplication for the file/inode/dentry
  * setup.
  */
@@ -142,9 +142,9 @@ err_put_filp:
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
 
 /*
- * A single inode exist for all anon_inode files. Contrary to pipes,
- * anon_inode inodes has no per-instance data associated, so we can avoid
- * the allocation of multiple of them.
+ * A single inode exists for all anon_inode files. Contrary to pipes,
+ * anon_inode inodes have no associated per-instance data, so we need
+ * only allocate one of them.
  */
 static struct inode *anon_inode_mkinode(void)
 {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 08e4414b8374..a27e42bf3400 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -45,7 +45,7 @@
 
 static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
 static int load_elf_library(struct file *);
-static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int);
+static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long);
 
 /*
  * If we don't support core dumping, then supply a NULL so we
@@ -80,7 +80,7 @@ static struct linux_binfmt elf_format = {
 	.hasvdso	= 1
 };
 
-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
+#define BAD_ADDR(x) IS_ERR_VALUE(x)
 
 static int set_brk(unsigned long start, unsigned long end)
 {
@@ -285,33 +285,70 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 #ifndef elf_map
 
 static unsigned long elf_map(struct file *filep, unsigned long addr,
-		struct elf_phdr *eppnt, int prot, int type)
+		struct elf_phdr *eppnt, int prot, int type,
+		unsigned long total_size)
 {
 	unsigned long map_addr;
-	unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr);
+	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
+	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
+	addr = ELF_PAGESTART(addr);
+	size = ELF_PAGEALIGN(size);
 
-	down_write(&current->mm->mmap_sem);
 	/* mmap() will return -EINVAL if given a zero size, but a
 	 * segment with zero filesize is perfectly valid */
-	if (eppnt->p_filesz + pageoffset)
-		map_addr = do_mmap(filep, ELF_PAGESTART(addr),
-				   eppnt->p_filesz + pageoffset, prot, type,
-				   eppnt->p_offset - pageoffset);
-	else
-		map_addr = ELF_PAGESTART(addr);
+	if (!size)
+		return addr;
+
+	down_write(&current->mm->mmap_sem);
+	/*
+	 * total_size is the size of the ELF (interpreter) image.
+	 * The _first_ mmap needs to know the full size, otherwise
+	 * randomization might put this image into an overlapping
+	 * position with the ELF binary image. (since size < total_size)
+	 * So we first map the 'big' image - and unmap the remainder at
+	 * the end. (which unmap is needed for ELF images with holes.)
+	 */
+	if (total_size) {
+		total_size = ELF_PAGEALIGN(total_size);
+		map_addr = do_mmap(filep, addr, total_size, prot, type, off);
+		if (!BAD_ADDR(map_addr))
+			do_munmap(current->mm, map_addr+size, total_size-size);
+	} else
+		map_addr = do_mmap(filep, addr, size, prot, type, off);
+
 	up_write(&current->mm->mmap_sem);
 	return(map_addr);
 }
 
 #endif /* !elf_map */
 
+static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+{
+	int i, first_idx = -1, last_idx = -1;
+
+	for (i = 0; i < nr; i++) {
+		if (cmds[i].p_type == PT_LOAD) {
+			last_idx = i;
+			if (first_idx == -1)
+				first_idx = i;
+		}
+	}
+	if (first_idx == -1)
+		return 0;
+
+	return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
+				ELF_PAGESTART(cmds[first_idx].p_vaddr);
+}
+
+
 /* This is much more generalized than the library routine read function,
    so we keep this separate.  Technically the library read function
    is only provided so that we can read a.out libraries that have
    an ELF header */
 
 static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
-		struct file *interpreter, unsigned long *interp_load_addr)
+		struct file *interpreter, unsigned long *interp_map_addr,
+		unsigned long no_base)
 {
 	struct elf_phdr *elf_phdata;
 	struct elf_phdr *eppnt;
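
[Note: a quick worked check of total_mapping_size() introduced above - it returns the span from the first PT_LOAD's page-started base to the end of the last PT_LOAD, which is exactly what the single 'big' do_mmap() in elf_map() must reserve so that the later MAP_FIXED segment maps land inside it. A hypothetical userspace test of the same arithmetic follows; Elf64_Phdr comes from <elf.h> and the layout values are invented.]

    #include <assert.h>
    #include <elf.h>

    /* same arithmetic as total_mapping_size(); assume the first segment's
     * p_vaddr is page-aligned so ELF_PAGESTART() would be a no-op here */
    static unsigned long span(const Elf64_Phdr *p, int nr)
    {
            int i, first = -1, last = -1;

            for (i = 0; i < nr; i++)
                    if (p[i].p_type == PT_LOAD) {
                            last = i;
                            if (first == -1)
                                    first = i;
                    }
            return first == -1 ? 0 :
                    p[last].p_vaddr + p[last].p_memsz - p[first].p_vaddr;
    }

    int main(void)
    {
            /* invented PIE layout: text at 0x0, data at 0x2000 */
            Elf64_Phdr phdrs[2] = {
                    { .p_type = PT_LOAD, .p_vaddr = 0x0,    .p_memsz = 0x1000 },
                    { .p_type = PT_LOAD, .p_vaddr = 0x2000, .p_memsz = 0x500  },
            };
            assert(span(phdrs, 2) == 0x2500);   /* 0x2000 + 0x500 - 0x0 */
            return 0;
    }
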
@@ -319,6 +356,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 	int load_addr_set = 0;
 	unsigned long last_bss = 0, elf_bss = 0;
 	unsigned long error = ~0UL;
+	unsigned long total_size;
 	int retval, i, size;
 
 	/* First of all, some simple consistency checks */
@@ -357,6 +395,12 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		goto out_close;
 	}
 
+	total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum);
+	if (!total_size) {
+		error = -EINVAL;
+		goto out_close;
+	}
+
 	eppnt = elf_phdata;
 	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
 		if (eppnt->p_type == PT_LOAD) {
@@ -374,9 +418,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 			vaddr = eppnt->p_vaddr;
 			if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
 				elf_type |= MAP_FIXED;
+			else if (no_base && interp_elf_ex->e_type == ET_DYN)
+				load_addr = -vaddr;
 
 			map_addr = elf_map(interpreter, load_addr + vaddr,
-					   eppnt, elf_prot, elf_type);
+					   eppnt, elf_prot, elf_type, total_size);
+			total_size = 0;
+			if (!*interp_map_addr)
+				*interp_map_addr = map_addr;
 			error = map_addr;
 			if (BAD_ADDR(map_addr))
 				goto out_close;
@@ -442,8 +491,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
442 goto out_close; 491 goto out_close;
443 } 492 }
444 493
445 *interp_load_addr = load_addr; 494 error = load_addr;
446 error = ((unsigned long)interp_elf_ex->e_entry) + load_addr;
447 495
448out_close: 496out_close:
449 kfree(elf_phdata); 497 kfree(elf_phdata);
@@ -540,7 +588,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
540 int elf_exec_fileno; 588 int elf_exec_fileno;
541 int retval, i; 589 int retval, i;
542 unsigned int size; 590 unsigned int size;
543 unsigned long elf_entry, interp_load_addr = 0; 591 unsigned long elf_entry;
592 unsigned long interp_load_addr = 0;
544 unsigned long start_code, end_code, start_data, end_data; 593 unsigned long start_code, end_code, start_data, end_data;
545 unsigned long reloc_func_desc = 0; 594 unsigned long reloc_func_desc = 0;
546 char passed_fileno[6]; 595 char passed_fileno[6];
@@ -808,9 +857,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
808 current->mm->start_stack = bprm->p; 857 current->mm->start_stack = bprm->p;
809 858
810 /* Now we do a little grungy work by mmaping the ELF image into 859 /* Now we do a little grungy work by mmaping the ELF image into
811 the correct location in memory. At this point, we assume that 860 the correct location in memory. */
812 the image should be loaded at fixed address, not at a variable
813 address. */
814 for(i = 0, elf_ppnt = elf_phdata; 861 for(i = 0, elf_ppnt = elf_phdata;
815 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { 862 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
816 int elf_prot = 0, elf_flags; 863 int elf_prot = 0, elf_flags;
@@ -864,11 +911,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
864 * default mmap base, as well as whatever program they 911 * default mmap base, as well as whatever program they
865 * might try to exec. This is because the brk will 912 * might try to exec. This is because the brk will
866 * follow the loader, and is not movable. */ 913 * follow the loader, and is not movable. */
914#ifdef CONFIG_X86
915 load_bias = 0;
916#else
867 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 917 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
918#endif
868 } 919 }
869 920
870 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, 921 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
871 elf_prot, elf_flags); 922 elf_prot, elf_flags, 0);
872 if (BAD_ADDR(error)) { 923 if (BAD_ADDR(error)) {
873 send_sig(SIGKILL, current, 0); 924 send_sig(SIGKILL, current, 0);
874 retval = IS_ERR((void *)error) ? 925 retval = IS_ERR((void *)error) ?
@@ -944,13 +995,25 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
944 } 995 }
945 996
946 if (elf_interpreter) { 997 if (elf_interpreter) {
947 if (interpreter_type == INTERPRETER_AOUT) 998 if (interpreter_type == INTERPRETER_AOUT) {
948 elf_entry = load_aout_interp(&loc->interp_ex, 999 elf_entry = load_aout_interp(&loc->interp_ex,
949 interpreter); 1000 interpreter);
950 else 1001 } else {
1002 unsigned long uninitialized_var(interp_map_addr);
1003
951 elf_entry = load_elf_interp(&loc->interp_elf_ex, 1004 elf_entry = load_elf_interp(&loc->interp_elf_ex,
952 interpreter, 1005 interpreter,
953 &interp_load_addr); 1006 &interp_map_addr,
1007 load_bias);
1008 if (!BAD_ADDR(elf_entry)) {
1009 /*
1010 * load_elf_interp() returns relocation
1011 * adjustment
1012 */
1013 interp_load_addr = elf_entry;
1014 elf_entry += loc->interp_elf_ex.e_entry;
1015 }
1016 }
954 if (BAD_ADDR(elf_entry)) { 1017 if (BAD_ADDR(elf_entry)) {
955 force_sig(SIGSEGV, current); 1018 force_sig(SIGSEGV, current);
956 retval = IS_ERR((void *)elf_entry) ? 1019 retval = IS_ERR((void *)elf_entry) ?
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b3e9bfa748cf..3635315e3b99 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -588,12 +588,10 @@ EXPORT_SYMBOL(bdget);
588 588
589long nr_blockdev_pages(void) 589long nr_blockdev_pages(void)
590{ 590{
591 struct list_head *p; 591 struct block_device *bdev;
592 long ret = 0; 592 long ret = 0;
593 spin_lock(&bdev_lock); 593 spin_lock(&bdev_lock);
594 list_for_each(p, &all_bdevs) { 594 list_for_each_entry(bdev, &all_bdevs, bd_list) {
595 struct block_device *bdev;
596 bdev = list_entry(p, struct block_device, bd_list);
597 ret += bdev->bd_inode->i_mapping->nrpages; 595 ret += bdev->bd_inode->i_mapping->nrpages;
598 } 596 }
599 spin_unlock(&bdev_lock); 597 spin_unlock(&bdev_lock);
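The nr_blockdev_pages() change above is the standard list_for_each() to list_for_each_entry() conversion, which folds the explicit list_entry() call into the iterator. A stripped-down before/after sketch with a hypothetical node type:

#include <linux/list.h>

struct my_node {
        struct list_head list;
        long pages;
};

/* Before: open-coded iteration plus list_entry(). */
static long count_before(struct list_head *head)
{
        struct list_head *p;
        long ret = 0;

        list_for_each(p, head) {
                struct my_node *n = list_entry(p, struct my_node, list);
                ret += n->pages;
        }
        return ret;
}

/* After: list_for_each_entry() hides the container lookup. */
static long count_after(struct list_head *head)
{
        struct my_node *n;
        long ret = 0;

        list_for_each_entry(n, head, list)
                ret += n->pages;
        return ret;
}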
@@ -874,7 +872,7 @@ static struct bd_holder *find_bd_holder(struct block_device *bdev,
874 */ 872 */
875static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) 873static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
876{ 874{
877 int ret; 875 int err;
878 876
879 if (!bo) 877 if (!bo)
880 return -EINVAL; 878 return -EINVAL;
@@ -882,15 +880,18 @@ static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
882 if (!bd_holder_grab_dirs(bdev, bo)) 880 if (!bd_holder_grab_dirs(bdev, bo))
883 return -EBUSY; 881 return -EBUSY;
884 882
885 ret = add_symlink(bo->sdir, bo->sdev); 883 err = add_symlink(bo->sdir, bo->sdev);
886 if (ret == 0) { 884 if (err)
887 ret = add_symlink(bo->hdir, bo->hdev); 885 return err;
888 if (ret) 886
889 del_symlink(bo->sdir, bo->sdev); 887 err = add_symlink(bo->hdir, bo->hdev);
888 if (err) {
889 del_symlink(bo->sdir, bo->sdev);
890 return err;
890 } 891 }
891 if (ret == 0) 892
892 list_add_tail(&bo->list, &bdev->bd_holder_list); 893 list_add_tail(&bo->list, &bdev->bd_holder_list);
893 return ret; 894 return 0;
894} 895}
895 896
896/** 897/**
@@ -948,7 +949,7 @@ static struct bd_holder *del_bd_holder(struct block_device *bdev,
948static int bd_claim_by_kobject(struct block_device *bdev, void *holder, 949static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
949 struct kobject *kobj) 950 struct kobject *kobj)
950{ 951{
951 int res; 952 int err;
952 struct bd_holder *bo, *found; 953 struct bd_holder *bo, *found;
953 954
954 if (!kobj) 955 if (!kobj)
@@ -959,21 +960,24 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
959 return -ENOMEM; 960 return -ENOMEM;
960 961
961 mutex_lock(&bdev->bd_mutex); 962 mutex_lock(&bdev->bd_mutex);
962 res = bd_claim(bdev, holder);
963 if (res == 0) {
964 found = find_bd_holder(bdev, bo);
965 if (found == NULL) {
966 res = add_bd_holder(bdev, bo);
967 if (res)
968 bd_release(bdev);
969 }
970 }
971 963
972 if (res || found) 964 err = bd_claim(bdev, holder);
973 free_bd_holder(bo); 965 if (err)
974 mutex_unlock(&bdev->bd_mutex); 966 goto fail;
975 967
976 return res; 968 found = find_bd_holder(bdev, bo);
969 if (found)
970 goto fail;
971
972 err = add_bd_holder(bdev, bo);
973 if (err)
974 bd_release(bdev);
975 else
976 bo = NULL;
977fail:
978 mutex_unlock(&bdev->bd_mutex);
979 free_bd_holder(bo);
980 return err;
977} 981}
978 982
979/** 983/**
@@ -987,15 +991,12 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
987static void bd_release_from_kobject(struct block_device *bdev, 991static void bd_release_from_kobject(struct block_device *bdev,
988 struct kobject *kobj) 992 struct kobject *kobj)
989{ 993{
990 struct bd_holder *bo;
991
992 if (!kobj) 994 if (!kobj)
993 return; 995 return;
994 996
995 mutex_lock(&bdev->bd_mutex); 997 mutex_lock(&bdev->bd_mutex);
996 bd_release(bdev); 998 bd_release(bdev);
997 if ((bo = del_bd_holder(bdev, kobj))) 999 free_bd_holder(del_bd_holder(bdev, kobj));
998 free_bd_holder(bo);
999 mutex_unlock(&bdev->bd_mutex); 1000 mutex_unlock(&bdev->bd_mutex);
1000} 1001}
1001 1002
diff --git a/fs/buffer.c b/fs/buffer.c
index aa68206bd517..0f9006714230 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -356,7 +356,7 @@ static void free_more_memory(void)
356 for_each_online_pgdat(pgdat) { 356 for_each_online_pgdat(pgdat) {
357 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; 357 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
358 if (*zones) 358 if (*zones)
359 try_to_free_pages(zones, GFP_NOFS); 359 try_to_free_pages(zones, 0, GFP_NOFS);
360 } 360 }
361} 361}
362 362
@@ -676,6 +676,39 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
676EXPORT_SYMBOL(mark_buffer_dirty_inode); 676EXPORT_SYMBOL(mark_buffer_dirty_inode);
677 677
678/* 678/*
679 * Mark the page dirty, tag it dirty in the radix tree, and mark the inode
680 * dirty.
681 *
682 * If warn is true, then emit a warning if the page is not uptodate and has
683 * not been truncated.
684 */
685static int __set_page_dirty(struct page *page,
686 struct address_space *mapping, int warn)
687{
688 if (unlikely(!mapping))
689 return !TestSetPageDirty(page);
690
691 if (TestSetPageDirty(page))
692 return 0;
693
694 write_lock_irq(&mapping->tree_lock);
695 if (page->mapping) { /* Race with truncate? */
696 WARN_ON_ONCE(warn && !PageUptodate(page));
697
698 if (mapping_cap_account_dirty(mapping)) {
699 __inc_zone_page_state(page, NR_FILE_DIRTY);
700 task_io_account_write(PAGE_CACHE_SIZE);
701 }
702 radix_tree_tag_set(&mapping->page_tree,
703 page_index(page), PAGECACHE_TAG_DIRTY);
704 }
705 write_unlock_irq(&mapping->tree_lock);
706 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
707
708 return 1;
709}
710
711/*
679 * Add a page to the dirty page list. 712 * Add a page to the dirty page list.
680 * 713 *
681 * It is a sad fact of life that this function is called from several places 714 * It is a sad fact of life that this function is called from several places
@@ -702,7 +735,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
702 */ 735 */
703int __set_page_dirty_buffers(struct page *page) 736int __set_page_dirty_buffers(struct page *page)
704{ 737{
705 struct address_space * const mapping = page_mapping(page); 738 struct address_space *mapping = page_mapping(page);
706 739
707 if (unlikely(!mapping)) 740 if (unlikely(!mapping))
708 return !TestSetPageDirty(page); 741 return !TestSetPageDirty(page);
@@ -719,21 +752,7 @@ int __set_page_dirty_buffers(struct page *page)
719 } 752 }
720 spin_unlock(&mapping->private_lock); 753 spin_unlock(&mapping->private_lock);
721 754
722 if (TestSetPageDirty(page)) 755 return __set_page_dirty(page, mapping, 1);
723 return 0;
724
725 write_lock_irq(&mapping->tree_lock);
726 if (page->mapping) { /* Race with truncate? */
727 if (mapping_cap_account_dirty(mapping)) {
728 __inc_zone_page_state(page, NR_FILE_DIRTY);
729 task_io_account_write(PAGE_CACHE_SIZE);
730 }
731 radix_tree_tag_set(&mapping->page_tree,
732 page_index(page), PAGECACHE_TAG_DIRTY);
733 }
734 write_unlock_irq(&mapping->tree_lock);
735 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
736 return 1;
737} 756}
738EXPORT_SYMBOL(__set_page_dirty_buffers); 757EXPORT_SYMBOL(__set_page_dirty_buffers);
739 758
@@ -982,7 +1001,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
982 struct buffer_head *bh; 1001 struct buffer_head *bh;
983 1002
984 page = find_or_create_page(inode->i_mapping, index, 1003 page = find_or_create_page(inode->i_mapping, index,
985 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 1004 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
986 if (!page) 1005 if (!page)
987 return NULL; 1006 return NULL;
988 1007
@@ -1026,11 +1045,6 @@ failed:
1026/* 1045/*
1027 * Create buffers for the specified block device block's page. If 1046 * Create buffers for the specified block device block's page. If
1028 * that page was dirty, the buffers are set dirty also. 1047 * that page was dirty, the buffers are set dirty also.
1029 *
1030 * Except that's a bug. Attaching dirty buffers to a dirty
1031 * blockdev's page can result in filesystem corruption, because
1032 * some of those buffers may be aliases of filesystem data.
1033 * grow_dev_page() will go BUG() if this happens.
1034 */ 1048 */
1035static int 1049static int
1036grow_buffers(struct block_device *bdev, sector_t block, int size) 1050grow_buffers(struct block_device *bdev, sector_t block, int size)
@@ -1137,8 +1151,9 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
1137 */ 1151 */
1138void fastcall mark_buffer_dirty(struct buffer_head *bh) 1152void fastcall mark_buffer_dirty(struct buffer_head *bh)
1139{ 1153{
1154 WARN_ON_ONCE(!buffer_uptodate(bh));
1140 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) 1155 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1141 __set_page_dirty_nobuffers(bh->b_page); 1156 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
1142} 1157}
1143 1158
1144/* 1159/*
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8b0cbf4a4ad0..bd0f2f2353ce 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -849,6 +849,7 @@ static int cifs_oplock_thread(void * dummyarg)
849 __u16 netfid; 849 __u16 netfid;
850 int rc; 850 int rc;
851 851
852 set_freezable();
852 do { 853 do {
853 if (try_to_freeze()) 854 if (try_to_freeze())
854 continue; 855 continue;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f4e92661b223..0a1b8bd1dfcb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -363,6 +363,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
363 GFP_KERNEL); 363 GFP_KERNEL);
364 } 364 }
365 365
366 set_freezable();
366 while (!kthread_should_stop()) { 367 while (!kthread_should_stop()) {
367 if (try_to_freeze()) 368 if (try_to_freeze())
368 continue; 369 continue;
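Both cifs kernel threads gain the same two lines. Kernel threads are created non-freezable by default, so without set_freezable() the try_to_freeze() call in the loop never parks the thread across suspend. A minimal sketch of the idiom (hypothetical thread body, not cifs code):

#include <linux/freezer.h>
#include <linux/kthread.h>

static int my_service_thread(void *unused)
{
        set_freezable();        /* kthreads are non-freezable by default */
        while (!kthread_should_stop()) {
                if (try_to_freeze())
                        continue;       /* we were frozen; re-check state */
                /* ... service one unit of work, then sleep ... */
        }
        return 0;
}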
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 1d716392c3aa..96df1d51fdc3 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -29,6 +29,7 @@
29 */ 29 */
30 30
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/exportfs.h>
32 33
33#ifdef CONFIG_CIFS_EXPERIMENTAL 34#ifdef CONFIG_CIFS_EXPERIMENTAL
34 35
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6b44cdc96fac..e440a7b95d02 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -63,6 +63,7 @@
63#include <linux/wireless.h> 63#include <linux/wireless.h>
64#include <linux/atalk.h> 64#include <linux/atalk.h>
65#include <linux/blktrace_api.h> 65#include <linux/blktrace_api.h>
66#include <linux/loop.h>
66 67
67#include <net/bluetooth/bluetooth.h> 68#include <net/bluetooth/bluetooth.h>
68#include <net/bluetooth/hci.h> 69#include <net/bluetooth/hci.h>
@@ -3489,6 +3490,9 @@ HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
3489 3490
3490IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) 3491IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
3491IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) 3492IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
3493
3494/* loop */
3495IGNORE_IOCTL(LOOP_CLR_FD)
3492}; 3496};
3493 3497
3494#define IOCTL_HASHSIZE 256 3498#define IOCTL_HASHSIZE 256
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 7b48c034b312..3b0185fdf9a4 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -29,10 +29,11 @@
29 29
30struct configfs_dirent { 30struct configfs_dirent {
31 atomic_t s_count; 31 atomic_t s_count;
32 int s_dependent_count;
32 struct list_head s_sibling; 33 struct list_head s_sibling;
33 struct list_head s_children; 34 struct list_head s_children;
34 struct list_head s_links; 35 struct list_head s_links;
35 void * s_element; 36 void * s_element;
36 int s_type; 37 int s_type;
37 umode_t s_mode; 38 umode_t s_mode;
38 struct dentry * s_dentry; 39 struct dentry * s_dentry;
@@ -41,8 +42,8 @@ struct configfs_dirent {
41 42
42#define CONFIGFS_ROOT 0x0001 43#define CONFIGFS_ROOT 0x0001
43#define CONFIGFS_DIR 0x0002 44#define CONFIGFS_DIR 0x0002
44#define CONFIGFS_ITEM_ATTR 0x0004 45#define CONFIGFS_ITEM_ATTR 0x0004
45#define CONFIGFS_ITEM_LINK 0x0020 46#define CONFIGFS_ITEM_LINK 0x0020
46#define CONFIGFS_USET_DIR 0x0040 47#define CONFIGFS_USET_DIR 0x0040
47#define CONFIGFS_USET_DEFAULT 0x0080 48#define CONFIGFS_USET_DEFAULT 0x0080
48#define CONFIGFS_USET_DROPPING 0x0100 49#define CONFIGFS_USET_DROPPING 0x0100
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5e6e37e58f36..2f436d4f1d6d 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -355,6 +355,10 @@ static int configfs_detach_prep(struct dentry *dentry)
355 /* Mark that we've taken i_mutex */ 355 /* Mark that we've taken i_mutex */
356 sd->s_type |= CONFIGFS_USET_DROPPING; 356 sd->s_type |= CONFIGFS_USET_DROPPING;
357 357
358 /*
359 * Yup, recursive. If there's a problem, blame
360 * deep nesting of default_groups
361 */
358 ret = configfs_detach_prep(sd->s_dentry); 362 ret = configfs_detach_prep(sd->s_dentry);
359 if (!ret) 363 if (!ret)
360 continue; 364 continue;
@@ -562,7 +566,7 @@ static int populate_groups(struct config_group *group)
562 566
563/* 567/*
564 * All of link_obj/unlink_obj/link_group/unlink_group require that 568 * All of link_obj/unlink_obj/link_group/unlink_group require that
565 * subsys->su_sem is held. 569 * subsys->su_mutex is held.
566 */ 570 */
567 571
568static void unlink_obj(struct config_item *item) 572static void unlink_obj(struct config_item *item)
@@ -714,6 +718,28 @@ static void configfs_detach_group(struct config_item *item)
714} 718}
715 719
716/* 720/*
721 * After the item has been detached from the filesystem view, we are
722 * ready to tear it out of the hierarchy. Notify the client before
723 * we do that so they can perform any cleanup that requires
724 * navigating the hierarchy. A client does not need to provide this
725 * callback. The subsystem mutex MUST be held by the caller, and
726 * references must be valid for both items. It also assumes the
727 * caller has validated ci_type.
728 */
729static void client_disconnect_notify(struct config_item *parent_item,
730 struct config_item *item)
731{
732 struct config_item_type *type;
733
734 type = parent_item->ci_type;
735 BUG_ON(!type);
736
737 if (type->ct_group_ops && type->ct_group_ops->disconnect_notify)
738 type->ct_group_ops->disconnect_notify(to_config_group(parent_item),
739 item);
740}
741
742/*
717 * Drop the initial reference from make_item()/make_group() 743 * Drop the initial reference from make_item()/make_group()
718 * This function assumes that reference is held on item 744 * This function assumes that reference is held on item
719 * and that item holds a valid reference to the parent. Also, it 745 * and that item holds a valid reference to the parent. Also, it
@@ -733,11 +759,244 @@ static void client_drop_item(struct config_item *parent_item,
733 */ 759 */
734 if (type->ct_group_ops && type->ct_group_ops->drop_item) 760 if (type->ct_group_ops && type->ct_group_ops->drop_item)
735 type->ct_group_ops->drop_item(to_config_group(parent_item), 761 type->ct_group_ops->drop_item(to_config_group(parent_item),
736 item); 762 item);
737 else 763 else
738 config_item_put(item); 764 config_item_put(item);
739} 765}
740 766
767#ifdef DEBUG
768static void configfs_dump_one(struct configfs_dirent *sd, int level)
769{
770 printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd));
771
772#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type);
773 type_print(CONFIGFS_ROOT);
774 type_print(CONFIGFS_DIR);
775 type_print(CONFIGFS_ITEM_ATTR);
776 type_print(CONFIGFS_ITEM_LINK);
777 type_print(CONFIGFS_USET_DIR);
778 type_print(CONFIGFS_USET_DEFAULT);
779 type_print(CONFIGFS_USET_DROPPING);
780#undef type_print
781}
782
783static int configfs_dump(struct configfs_dirent *sd, int level)
784{
785 struct configfs_dirent *child_sd;
786 int ret = 0;
787
788 configfs_dump_one(sd, level);
789
790 if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT)))
791 return 0;
792
793 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
794 ret = configfs_dump(child_sd, level + 2);
795 if (ret)
796 break;
797 }
798
799 return ret;
800}
801#endif
802
803
804/*
805 * configfs_depend_item() and configfs_undepend_item()
806 *
807 * WARNING: Do not call these from a configfs callback!
808 *
809 * This describes these functions and their helpers.
810 *
811 * Allow another kernel system to depend on a config_item. If this
812 * happens, the item cannot go away until the dependent can live without
813 * it. The idea is to give client modules as simple an interface as
814 * possible. When a system asks them to depend on an item, they just
815 * call configfs_depend_item(). If the item is live and the client
816 * driver is in good shape, we'll happily do the work for them.
817 *
818 * Why is the locking complex? Because configfs uses the VFS to handle
819 * all locking, but this function is called outside the normal
820 * VFS->configfs path. So it must take VFS locks to prevent the
821 * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is
822 * why you can't call these functions underneath configfs callbacks.
823 *
824 * Note, btw, that this can be called at *any* time, even when a configfs
825 * subsystem isn't registered, or when configfs is loading or unloading.
826 * Just like configfs_register_subsystem(). So we take the same
827 * precautions. We pin the filesystem. We lock each i_mutex _in_order_
828 * on our way down the tree. If we can find the target item in the
829 * configfs tree, it must be part of the subsystem tree as well, so we
830 * do not need the subsystem mutex. Holding the i_mutex chain locks
831 * out mkdir() and rmdir(), who might be racing us.
832 */
833
834/*
835 * configfs_depend_prep()
836 *
837 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
838 * attributes. This is similar to, but not the same as, configfs_detach_prep().
839 * Note that configfs_detach_prep() expects the parent to be locked when it
840 * is called, but we lock the parent *inside* configfs_depend_prep(). We
841 * do that so we can unlock it if we find nothing.
842 *
843 * Here we do a depth-first search of the dentry hierarchy looking for
844 * our object. We take i_mutex on each step of the way down. IT IS
845 * ESSENTIAL THAT i_mutex LOCKING IS ORDERED. If we come back up a branch,
846 * we'll drop the i_mutex.
847 *
848 * If the target is not found, -ENOENT is bubbled up and we have released
849 * all locks. If the target was found, the locks will be cleared by
850 * configfs_depend_rollback().
851 *
852 * This adds a requirement that all config_items be unique!
853 *
854 * This is recursive because the locking traversal is tricky. There isn't
855 * much on the stack, though, so folks that need this function - be careful
856 * about your stack! Patches will be accepted to make it iterative.
857 */
858static int configfs_depend_prep(struct dentry *origin,
859 struct config_item *target)
860{
861 struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
862 int ret = 0;
863
864 BUG_ON(!origin || !sd);
865
866 /* Lock this guy on the way down */
867 mutex_lock(&sd->s_dentry->d_inode->i_mutex);
868 if (sd->s_element == target) /* Boo-yah */
869 goto out;
870
871 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
872 if (child_sd->s_type & CONFIGFS_DIR) {
873 ret = configfs_depend_prep(child_sd->s_dentry,
874 target);
875 if (!ret)
876 goto out; /* Child path boo-yah */
877 }
878 }
879
880 /* We looped all our children and didn't find target */
881 mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
882 ret = -ENOENT;
883
884out:
885 return ret;
886}
887
888/*
889 * This is ONLY called if configfs_depend_prep() did its job. So we can
890 * trust the entire path from item back up to origin.
891 *
892 * We walk backwards from item, unlocking each i_mutex. We finish by
893 * unlocking origin.
894 */
895static void configfs_depend_rollback(struct dentry *origin,
896 struct config_item *item)
897{
898 struct dentry *dentry = item->ci_dentry;
899
900 while (dentry != origin) {
901 mutex_unlock(&dentry->d_inode->i_mutex);
902 dentry = dentry->d_parent;
903 }
904
905 mutex_unlock(&origin->d_inode->i_mutex);
906}
907
908int configfs_depend_item(struct configfs_subsystem *subsys,
909 struct config_item *target)
910{
911 int ret;
912 struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
913 struct config_item *s_item = &subsys->su_group.cg_item;
914
915 /*
916 * Pin the configfs filesystem. This means we can safely access
917 * the root of the configfs filesystem.
918 */
919 ret = configfs_pin_fs();
920 if (ret)
921 return ret;
922
923 /*
924 * Next, lock the root directory. We're going to check that the
925 * subsystem is really registered, and so we need to lock out
926 * configfs_[un]register_subsystem().
927 */
928 mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
929
930 root_sd = configfs_sb->s_root->d_fsdata;
931
932 list_for_each_entry(p, &root_sd->s_children, s_sibling) {
933 if (p->s_type & CONFIGFS_DIR) {
934 if (p->s_element == s_item) {
935 subsys_sd = p;
936 break;
937 }
938 }
939 }
940
941 if (!subsys_sd) {
942 ret = -ENOENT;
943 goto out_unlock_fs;
944 }
945
946 /* Ok, now we can trust subsys/s_item */
947
948 /* Scan the tree, locking i_mutex recursively, return 0 if found */
949 ret = configfs_depend_prep(subsys_sd->s_dentry, target);
950 if (ret)
951 goto out_unlock_fs;
952
953 /* We hold all i_mutexes from the subsystem down to the target */
954 p = target->ci_dentry->d_fsdata;
955 p->s_dependent_count += 1;
956
957 configfs_depend_rollback(subsys_sd->s_dentry, target);
958
959out_unlock_fs:
960 mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
961
962 /*
963 * If we succeeded, the fs is pinned via other methods. If not,
964 * we're done with it anyway. So release_fs() is always right.
965 */
966 configfs_release_fs();
967
968 return ret;
969}
970EXPORT_SYMBOL(configfs_depend_item);
971
972/*
973 * Release the dependent linkage. This is much simpler than
974 * configfs_depend_item() because we know that the client driver is
975 * pinned, thus the subsystem is pinned, and therefore configfs is pinned.
976 */
977void configfs_undepend_item(struct configfs_subsystem *subsys,
978 struct config_item *target)
979{
980 struct configfs_dirent *sd;
981
982 /*
983 * Since we can trust everything is pinned, we just need i_mutex
984 * on the item.
985 */
986 mutex_lock(&target->ci_dentry->d_inode->i_mutex);
987
988 sd = target->ci_dentry->d_fsdata;
989 BUG_ON(sd->s_dependent_count < 1);
990
991 sd->s_dependent_count -= 1;
992
993 /*
994 * After this unlock, we cannot trust the item to stay alive!
995 * DO NOT REFERENCE item after this unlock.
996 */
997 mutex_unlock(&target->ci_dentry->d_inode->i_mutex);
998}
999EXPORT_SYMBOL(configfs_undepend_item);
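A hypothetical client of the new interface could look like the sketch below (my_use_item and the work in the middle are assumptions, not part of this patch). Per the warning above, the pin must be taken from outside any configfs callback, and an error return simply means the target item is already gone:

#include <linux/configfs.h>

static int my_use_item(struct configfs_subsystem *my_subsys,
                       struct config_item *target)
{
        int ret;

        ret = configfs_depend_item(my_subsys, target);  /* may return -ENOENT */
        if (ret)
                return ret;

        /* target cannot be rmdir()ed here; safe to use it. */

        configfs_undepend_item(my_subsys, target);
        return 0;
}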
741 1000
742static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 1001static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
743{ 1002{
@@ -783,7 +1042,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
783 1042
784 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); 1043 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
785 1044
786 down(&subsys->su_sem); 1045 mutex_lock(&subsys->su_mutex);
787 group = NULL; 1046 group = NULL;
788 item = NULL; 1047 item = NULL;
789 if (type->ct_group_ops->make_group) { 1048 if (type->ct_group_ops->make_group) {
@@ -797,7 +1056,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
797 if (item) 1056 if (item)
798 link_obj(parent_item, item); 1057 link_obj(parent_item, item);
799 } 1058 }
800 up(&subsys->su_sem); 1059 mutex_unlock(&subsys->su_mutex);
801 1060
802 kfree(name); 1061 kfree(name);
803 if (!item) { 1062 if (!item) {
@@ -841,13 +1100,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
841out_unlink: 1100out_unlink:
842 if (ret) { 1101 if (ret) {
843 /* Tear down everything we built up */ 1102 /* Tear down everything we built up */
844 down(&subsys->su_sem); 1103 mutex_lock(&subsys->su_mutex);
1104
1105 client_disconnect_notify(parent_item, item);
845 if (group) 1106 if (group)
846 unlink_group(group); 1107 unlink_group(group);
847 else 1108 else
848 unlink_obj(item); 1109 unlink_obj(item);
849 client_drop_item(parent_item, item); 1110 client_drop_item(parent_item, item);
850 up(&subsys->su_sem); 1111
1112 mutex_unlock(&subsys->su_mutex);
851 1113
852 if (module_got) 1114 if (module_got)
853 module_put(owner); 1115 module_put(owner);
@@ -881,6 +1143,13 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
881 if (sd->s_type & CONFIGFS_USET_DEFAULT) 1143 if (sd->s_type & CONFIGFS_USET_DEFAULT)
882 return -EPERM; 1144 return -EPERM;
883 1145
1146 /*
1147 * Here's where we check for dependents. We're protected by
1148 * i_mutex.
1149 */
1150 if (sd->s_dependent_count)
1151 return -EBUSY;
1152
884 /* Get a working ref until we have the child */ 1153 /* Get a working ref until we have the child */
885 parent_item = configfs_get_config_item(dentry->d_parent); 1154 parent_item = configfs_get_config_item(dentry->d_parent);
886 subsys = to_config_group(parent_item)->cg_subsys; 1155 subsys = to_config_group(parent_item)->cg_subsys;
@@ -910,17 +1179,19 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
910 if (sd->s_type & CONFIGFS_USET_DIR) { 1179 if (sd->s_type & CONFIGFS_USET_DIR) {
911 configfs_detach_group(item); 1180 configfs_detach_group(item);
912 1181
913 down(&subsys->su_sem); 1182 mutex_lock(&subsys->su_mutex);
1183 client_disconnect_notify(parent_item, item);
914 unlink_group(to_config_group(item)); 1184 unlink_group(to_config_group(item));
915 } else { 1185 } else {
916 configfs_detach_item(item); 1186 configfs_detach_item(item);
917 1187
918 down(&subsys->su_sem); 1188 mutex_lock(&subsys->su_mutex);
1189 client_disconnect_notify(parent_item, item);
919 unlink_obj(item); 1190 unlink_obj(item);
920 } 1191 }
921 1192
922 client_drop_item(parent_item, item); 1193 client_drop_item(parent_item, item);
923 up(&subsys->su_sem); 1194 mutex_unlock(&subsys->su_mutex);
924 1195
925 /* Drop our reference from above */ 1196 /* Drop our reference from above */
926 config_item_put(item); 1197 config_item_put(item);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 3527c7c6def8..a3658f9a082c 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -27,19 +27,26 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/mutex.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/semaphore.h>
32 32
33#include <linux/configfs.h> 33#include <linux/configfs.h>
34#include "configfs_internal.h" 34#include "configfs_internal.h"
35 35
36/*
37 * A simple attribute can only be 4096 characters. Why 4k? Because the
38 * original code limited it to PAGE_SIZE. That's a bad idea, though,
39 * because an attribute of 16k on ia64 won't work on x86. So we limit to
40 * 4k, our minimum common page size.
41 */
42#define SIMPLE_ATTR_SIZE 4096
36 43
37struct configfs_buffer { 44struct configfs_buffer {
38 size_t count; 45 size_t count;
39 loff_t pos; 46 loff_t pos;
40 char * page; 47 char * page;
41 struct configfs_item_operations * ops; 48 struct configfs_item_operations * ops;
42 struct semaphore sem; 49 struct mutex mutex;
43 int needs_read_fill; 50 int needs_read_fill;
44}; 51};
45 52
@@ -69,7 +76,7 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
69 76
70 count = ops->show_attribute(item,attr,buffer->page); 77 count = ops->show_attribute(item,attr,buffer->page);
71 buffer->needs_read_fill = 0; 78 buffer->needs_read_fill = 0;
72 BUG_ON(count > (ssize_t)PAGE_SIZE); 79 BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
73 if (count >= 0) 80 if (count >= 0)
74 buffer->count = count; 81 buffer->count = count;
75 else 82 else
@@ -102,7 +109,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
102 struct configfs_buffer * buffer = file->private_data; 109 struct configfs_buffer * buffer = file->private_data;
103 ssize_t retval = 0; 110 ssize_t retval = 0;
104 111
105 down(&buffer->sem); 112 mutex_lock(&buffer->mutex);
106 if (buffer->needs_read_fill) { 113 if (buffer->needs_read_fill) {
107 if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) 114 if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
108 goto out; 115 goto out;
@@ -112,7 +119,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
112 retval = simple_read_from_buffer(buf, count, ppos, buffer->page, 119 retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
113 buffer->count); 120 buffer->count);
114out: 121out:
115 up(&buffer->sem); 122 mutex_unlock(&buffer->mutex);
116 return retval; 123 return retval;
117} 124}
118 125
@@ -137,8 +144,8 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
137 if (!buffer->page) 144 if (!buffer->page)
138 return -ENOMEM; 145 return -ENOMEM;
139 146
140 if (count >= PAGE_SIZE) 147 if (count >= SIMPLE_ATTR_SIZE)
141 count = PAGE_SIZE - 1; 148 count = SIMPLE_ATTR_SIZE - 1;
142 error = copy_from_user(buffer->page,buf,count); 149 error = copy_from_user(buffer->page,buf,count);
143 buffer->needs_read_fill = 1; 150 buffer->needs_read_fill = 1;
144 /* if buf is assumed to contain a string, terminate it by \0, 151 /* if buf is assumed to contain a string, terminate it by \0,
@@ -193,13 +200,13 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
193 struct configfs_buffer * buffer = file->private_data; 200 struct configfs_buffer * buffer = file->private_data;
194 ssize_t len; 201 ssize_t len;
195 202
196 down(&buffer->sem); 203 mutex_lock(&buffer->mutex);
197 len = fill_write_buffer(buffer, buf, count); 204 len = fill_write_buffer(buffer, buf, count);
198 if (len > 0) 205 if (len > 0)
199 len = flush_write_buffer(file->f_path.dentry, buffer, count); 206 len = flush_write_buffer(file->f_path.dentry, buffer, count);
200 if (len > 0) 207 if (len > 0)
201 *ppos += len; 208 *ppos += len;
202 up(&buffer->sem); 209 mutex_unlock(&buffer->mutex);
203 return len; 210 return len;
204} 211}
205 212
@@ -253,7 +260,7 @@ static int check_perm(struct inode * inode, struct file * file)
253 error = -ENOMEM; 260 error = -ENOMEM;
254 goto Enomem; 261 goto Enomem;
255 } 262 }
256 init_MUTEX(&buffer->sem); 263 mutex_init(&buffer->mutex);
257 buffer->needs_read_fill = 1; 264 buffer->needs_read_fill = 1;
258 buffer->ops = ops; 265 buffer->ops = ops;
259 file->private_data = buffer; 266 file->private_data = buffer;
@@ -292,6 +299,7 @@ static int configfs_release(struct inode * inode, struct file * filp)
292 if (buffer) { 299 if (buffer) {
293 if (buffer->page) 300 if (buffer->page)
294 free_page((unsigned long)buffer->page); 301 free_page((unsigned long)buffer->page);
302 mutex_destroy(&buffer->mutex);
295 kfree(buffer); 303 kfree(buffer);
296 } 304 }
297 return 0; 305 return 0;
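The buffer locking change above is the routine semaphore-to-mutex conversion: init_MUTEX() becomes mutex_init(), down()/up() become mutex_lock()/mutex_unlock(), and the release path gains a mutex_destroy(). In outline, for a hypothetical structure:

#include <linux/mutex.h>
#include <linux/slab.h>

struct my_buffer {
        struct mutex mutex;             /* was: struct semaphore sem */
        char *page;
};

static struct my_buffer *my_buffer_alloc(void)
{
        struct my_buffer *b = kzalloc(sizeof(*b), GFP_KERNEL);

        if (b)
                mutex_init(&b->mutex);  /* was: init_MUTEX(&b->sem) */
        return b;
}

static void my_buffer_free(struct my_buffer *b)
{
        mutex_destroy(&b->mutex);       /* mutexes have an explicit destroy */
        kfree(b);
}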
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 24421209f854..76dc4c3e5d51 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -62,7 +62,6 @@ void config_item_init(struct config_item * item)
62 * dynamically allocated string that @item->ci_name points to. 62 * dynamically allocated string that @item->ci_name points to.
63 * Otherwise, use the static @item->ci_namebuf array. 63 * Otherwise, use the static @item->ci_namebuf array.
64 */ 64 */
65
66int config_item_set_name(struct config_item * item, const char * fmt, ...) 65int config_item_set_name(struct config_item * item, const char * fmt, ...)
67{ 66{
68 int error = 0; 67 int error = 0;
@@ -139,12 +138,7 @@ struct config_item * config_item_get(struct config_item * item)
139 return item; 138 return item;
140} 139}
141 140
142/** 141static void config_item_cleanup(struct config_item * item)
143 * config_item_cleanup - free config_item resources.
144 * @item: item.
145 */
146
147void config_item_cleanup(struct config_item * item)
148{ 142{
149 struct config_item_type * t = item->ci_type; 143 struct config_item_type * t = item->ci_type;
150 struct config_group * s = item->ci_group; 144 struct config_group * s = item->ci_group;
@@ -179,39 +173,35 @@ void config_item_put(struct config_item * item)
179 kref_put(&item->ci_kref, config_item_release); 173 kref_put(&item->ci_kref, config_item_release);
180} 174}
181 175
182
183/** 176/**
184 * config_group_init - initialize a group for use 177 * config_group_init - initialize a group for use
185 * @k: group 178 * @k: group
186 */ 179 */
187
188void config_group_init(struct config_group *group) 180void config_group_init(struct config_group *group)
189{ 181{
190 config_item_init(&group->cg_item); 182 config_item_init(&group->cg_item);
191 INIT_LIST_HEAD(&group->cg_children); 183 INIT_LIST_HEAD(&group->cg_children);
192} 184}
193 185
194
195/** 186/**
196 * config_group_find_obj - search for item in group. 187 * config_group_find_item - search for item in group.
197 * @group: group we're looking in. 188 * @group: group we're looking in.
198 * @name: item's name. 189 * @name: item's name.
199 * 190 *
200 * Lock group via @group->cg_subsys, and iterate over @group->cg_list, 191 * Iterate over @group->cg_children, looking for a matching config_item.
201 * looking for a matching config_item. If matching item is found 192 * If a matching item is found, take a reference and return the item.
202 * take a reference and return the item. 193 * Caller must have locked the group via @group->cg_subsys->su_mutex.
203 */ 194 */
204 195struct config_item *config_group_find_item(struct config_group *group,
205struct config_item * config_group_find_obj(struct config_group * group, const char * name) 196 const char *name)
206{ 197{
207 struct list_head * entry; 198 struct list_head * entry;
208 struct config_item * ret = NULL; 199 struct config_item * ret = NULL;
209 200
210 /* XXX LOCKING! */
211 list_for_each(entry,&group->cg_children) { 201 list_for_each(entry,&group->cg_children) {
212 struct config_item * item = to_item(entry); 202 struct config_item * item = to_item(entry);
213 if (config_item_name(item) && 203 if (config_item_name(item) &&
214 !strcmp(config_item_name(item), name)) { 204 !strcmp(config_item_name(item), name)) {
215 ret = config_item_get(item); 205 ret = config_item_get(item);
216 break; 206 break;
217 } 207 }
@@ -219,9 +209,8 @@ struct config_item * config_group_find_obj(struct config_group * group, const ch
219 return ret; 209 return ret;
220} 210}
221 211
222
223EXPORT_SYMBOL(config_item_init); 212EXPORT_SYMBOL(config_item_init);
224EXPORT_SYMBOL(config_group_init); 213EXPORT_SYMBOL(config_group_init);
225EXPORT_SYMBOL(config_item_get); 214EXPORT_SYMBOL(config_item_get);
226EXPORT_SYMBOL(config_item_put); 215EXPORT_SYMBOL(config_item_put);
227EXPORT_SYMBOL(config_group_find_obj); 216EXPORT_SYMBOL(config_group_find_item);
diff --git a/fs/dcache.c b/fs/dcache.c
index 0e73aa0a0e8b..cb9d05056b54 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -883,6 +883,11 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
883 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 883 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
884} 884}
885 885
886static struct shrinker dcache_shrinker = {
887 .shrink = shrink_dcache_memory,
888 .seeks = DEFAULT_SEEKS,
889};
890
886/** 891/**
887 * d_alloc - allocate a dcache entry 892 * d_alloc - allocate a dcache entry
888 * @parent: parent of entry to allocate 893 * @parent: parent of entry to allocate
@@ -2115,7 +2120,7 @@ static void __init dcache_init(unsigned long mempages)
2115 dentry_cache = KMEM_CACHE(dentry, 2120 dentry_cache = KMEM_CACHE(dentry,
2116 SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); 2121 SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
2117 2122
2118 set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory); 2123 register_shrinker(&dcache_shrinker);
2119 2124
2120 /* Hash may have been set up in dcache_init_early */ 2125 /* Hash may have been set up in dcache_init_early */
2121 if (!hashdist) 2126 if (!hashdist)
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 5069b2cb5a1f..2f8e3c81bc19 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -133,14 +133,6 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
133 return len; 133 return len;
134} 134}
135 135
136#define __CONFIGFS_ATTR(_name,_mode,_read,_write) { \
137 .attr = { .ca_name = __stringify(_name), \
138 .ca_mode = _mode, \
139 .ca_owner = THIS_MODULE }, \
140 .show = _read, \
141 .store = _write, \
142}
143
144#define CLUSTER_ATTR(name, check_zero) \ 136#define CLUSTER_ATTR(name, check_zero) \
145static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \ 137static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \
146{ \ 138{ \
@@ -615,7 +607,7 @@ static struct clusters clusters_root = {
615int dlm_config_init(void) 607int dlm_config_init(void)
616{ 608{
617 config_group_init(&clusters_root.subsys.su_group); 609 config_group_init(&clusters_root.subsys.su_group);
618 init_MUTEX(&clusters_root.subsys.su_sem); 610 mutex_init(&clusters_root.subsys.su_mutex);
619 return configfs_register_subsystem(&clusters_root.subsys); 611 return configfs_register_subsystem(&clusters_root.subsys);
620} 612}
621 613
@@ -759,9 +751,9 @@ static struct space *get_space(char *name)
759 if (!space_list) 751 if (!space_list)
760 return NULL; 752 return NULL;
761 753
762 down(&space_list->cg_subsys->su_sem); 754 mutex_lock(&space_list->cg_subsys->su_mutex);
763 i = config_group_find_obj(space_list, name); 755 i = config_group_find_item(space_list, name);
764 up(&space_list->cg_subsys->su_sem); 756 mutex_unlock(&space_list->cg_subsys->su_mutex);
765 757
766 return to_space(i); 758 return to_space(i);
767} 759}
@@ -780,7 +772,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
780 if (!comm_list) 772 if (!comm_list)
781 return NULL; 773 return NULL;
782 774
783 down(&clusters_root.subsys.su_sem); 775 mutex_lock(&clusters_root.subsys.su_mutex);
784 776
785 list_for_each_entry(i, &comm_list->cg_children, ci_entry) { 777 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
786 cm = to_comm(i); 778 cm = to_comm(i);
@@ -800,7 +792,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
800 break; 792 break;
801 } 793 }
802 } 794 }
803 up(&clusters_root.subsys.su_sem); 795 mutex_unlock(&clusters_root.subsys.su_mutex);
804 796
805 if (!found) 797 if (!found)
806 cm = NULL; 798 cm = NULL;
diff --git a/fs/dquot.c b/fs/dquot.c
index 8819d281500c..7e273151f589 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -538,6 +538,11 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
538 return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure; 538 return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure;
539} 539}
540 540
541static struct shrinker dqcache_shrinker = {
542 .shrink = shrink_dqcache_memory,
543 .seeks = DEFAULT_SEEKS,
544};
545
541/* 546/*
542 * Put reference to dquot 547 * Put reference to dquot
543 * NOTE: If you change this function please check whether dqput_blocks() works right... 548 * NOTE: If you change this function please check whether dqput_blocks() works right...
@@ -1870,7 +1875,7 @@ static int __init dquot_init(void)
1870 printk("Dquot-cache hash table entries: %ld (order %ld, %ld bytes)\n", 1875 printk("Dquot-cache hash table entries: %ld (order %ld, %ld bytes)\n",
1871 nr_hash, order, (PAGE_SIZE << order)); 1876 nr_hash, order, (PAGE_SIZE << order));
1872 1877
1873 set_shrinker(DEFAULT_SEEKS, shrink_dqcache_memory); 1878 register_shrinker(&dqcache_shrinker);
1874 1879
1875 return 0; 1880 return 0;
1876} 1881}
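Both fs/dcache.c and fs/dquot.c above move from set_shrinker() to the struct-based API, which also makes the shrinker unregisterable again. A hypothetical cache would follow the same shape (my_cache_objects and the reclaim work are placeholders):

#include <linux/dcache.h>        /* sysctl_vfs_cache_pressure */
#include <linux/mm.h>
#include <linux/module.h>

static unsigned long my_cache_objects;  /* hypothetical population count */

static int my_cache_shrink(int nr, gfp_t gfp_mask)
{
        if (nr) {
                /* free up to 'nr' entries here, honouring gfp_mask */
        }
        /* Scale the remaining count the way the VFS caches do. */
        return (my_cache_objects / 100) * sysctl_vfs_cache_pressure;
}

static struct shrinker my_shrinker = {
        .shrink = my_cache_shrink,
        .seeks  = DEFAULT_SEEKS,
};

static int __init my_cache_init(void)
{
        register_shrinker(&my_shrinker);
        return 0;
}

static void __exit my_cache_exit(void)
{
        unregister_shrinker(&my_shrinker);
}

module_init(my_cache_init);
module_exit(my_cache_exit);
MODULE_LICENSE("GPL");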
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 03ea7696fe39..59375efcf39d 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -20,7 +20,7 @@ static void drop_pagecache_sb(struct super_block *sb)
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE)) 21 if (inode->i_state & (I_FREEING|I_WILL_FREE))
22 continue; 22 continue;
23 invalidate_mapping_pages(inode->i_mapping, 0, -1); 23 __invalidate_mapping_pages(inode->i_mapping, 0, -1, true);
24 } 24 }
25 spin_unlock(&inode_lock); 25 spin_unlock(&inode_lock);
26} 26}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 83e94fedd4e9..e77a2ec71aa5 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -282,7 +282,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
282 struct dentry *lower_dentry; 282 struct dentry *lower_dentry;
283 struct vfsmount *lower_mnt; 283 struct vfsmount *lower_mnt;
284 char *encoded_name; 284 char *encoded_name;
285 unsigned int encoded_namelen; 285 int encoded_namelen;
286 struct ecryptfs_crypt_stat *crypt_stat = NULL; 286 struct ecryptfs_crypt_stat *crypt_stat = NULL;
287 struct ecryptfs_mount_crypt_stat *mount_crypt_stat; 287 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
288 char *page_virt = NULL; 288 char *page_virt = NULL;
@@ -473,7 +473,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
473 struct dentry *lower_dir_dentry; 473 struct dentry *lower_dir_dentry;
474 umode_t mode; 474 umode_t mode;
475 char *encoded_symname; 475 char *encoded_symname;
476 unsigned int encoded_symlen; 476 int encoded_symlen;
477 struct ecryptfs_crypt_stat *crypt_stat = NULL; 477 struct ecryptfs_crypt_stat *crypt_stat = NULL;
478 478
479 lower_dentry = ecryptfs_dentry_to_lower(dentry); 479 lower_dentry = ecryptfs_dentry_to_lower(dentry);
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index ed4a207fe22a..5276b19423c1 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -75,6 +75,38 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
75 return NULL; 75 return NULL;
76} 76}
77 77
78struct dentry *efs_get_dentry(struct super_block *sb, void *vobjp)
79{
80 __u32 *objp = vobjp;
81 unsigned long ino = objp[0];
82 __u32 generation = objp[1];
83 struct inode *inode;
84 struct dentry *result;
85
86 if (ino == 0)
87 return ERR_PTR(-ESTALE);
88 inode = iget(sb, ino);
89 if (inode == NULL)
90 return ERR_PTR(-ENOMEM);
91
92 if (is_bad_inode(inode) ||
93 (generation && inode->i_generation != generation)) {
94 result = ERR_PTR(-ESTALE);
95 goto out_iput;
96 }
97
98 result = d_alloc_anon(inode);
99 if (!result) {
100 result = ERR_PTR(-ENOMEM);
101 goto out_iput;
102 }
103 return result;
104
105 out_iput:
106 iput(inode);
107 return result;
108}
109
78struct dentry *efs_get_parent(struct dentry *child) 110struct dentry *efs_get_parent(struct dentry *child)
79{ 111{
80 struct dentry *parent; 112 struct dentry *parent;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index e0a6839e68ae..d360c81f3a72 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -11,6 +11,7 @@
11#include <linux/efs_fs.h> 11#include <linux/efs_fs.h>
12#include <linux/efs_vh.h> 12#include <linux/efs_vh.h>
13#include <linux/efs_fs_sb.h> 13#include <linux/efs_fs_sb.h>
14#include <linux/exportfs.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
16#include <linux/vfs.h> 17#include <linux/vfs.h>
@@ -113,6 +114,7 @@ static const struct super_operations efs_superblock_operations = {
113}; 114};
114 115
115static struct export_operations efs_export_ops = { 116static struct export_operations efs_export_ops = {
117 .get_dentry = efs_get_dentry,
116 .get_parent = efs_get_parent, 118 .get_parent = efs_get_parent,
117}; 119};
118 120
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e98f6cd7200c..8adb32a9387a 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -1,15 +1,45 @@
1 1
2#include <linux/exportfs.h>
2#include <linux/fs.h> 3#include <linux/fs.h>
3#include <linux/file.h> 4#include <linux/file.h>
4#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/mount.h>
5#include <linux/namei.h> 7#include <linux/namei.h>
6 8
7struct export_operations export_op_default; 9#define dprintk(fmt, args...) do{}while(0)
8 10
9#define CALL(ops,fun) ((ops->fun)?(ops->fun):export_op_default.fun)
10 11
11#define dprintk(fmt, args...) do{}while(0) 12static int get_name(struct dentry *dentry, char *name,
13 struct dentry *child);
14
15
16static struct dentry *exportfs_get_dentry(struct super_block *sb, void *obj)
17{
18 struct dentry *result = ERR_PTR(-ESTALE);
19
20 if (sb->s_export_op->get_dentry) {
21 result = sb->s_export_op->get_dentry(sb, obj);
22 if (!result)
23 result = ERR_PTR(-ESTALE);
24 }
25
26 return result;
27}
28
29static int exportfs_get_name(struct dentry *dir, char *name,
30 struct dentry *child)
31{
32 struct export_operations *nop = dir->d_sb->s_export_op;
12 33
34 if (nop->get_name)
35 return nop->get_name(dir, name, child);
36 else
37 return get_name(dir, name, child);
38}
39
40/*
41 * Check if the dentry or any of its aliases is acceptable.
42 */
13static struct dentry * 43static struct dentry *
14find_acceptable_alias(struct dentry *result, 44find_acceptable_alias(struct dentry *result,
15 int (*acceptable)(void *context, struct dentry *dentry), 45 int (*acceptable)(void *context, struct dentry *dentry),
@@ -17,6 +47,9 @@ find_acceptable_alias(struct dentry *result,
17{ 47{
18 struct dentry *dentry, *toput = NULL; 48 struct dentry *dentry, *toput = NULL;
19 49
50 if (acceptable(context, result))
51 return result;
52
20 spin_lock(&dcache_lock); 53 spin_lock(&dcache_lock);
21 list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) { 54 list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) {
22 dget_locked(dentry); 55 dget_locked(dentry);
@@ -37,130 +70,50 @@ find_acceptable_alias(struct dentry *result,
37 return NULL; 70 return NULL;
38} 71}
39 72
40/** 73/*
41 * find_exported_dentry - helper routine to implement export_operations->decode_fh 74 * Find root of a disconnected subtree and return a reference to it.
42 * @sb: The &super_block identifying the filesystem
43 * @obj: An opaque identifier of the object to be found - passed to
44 * get_inode
45 * @parent: An optional opaque identifier of the parent of the object.
46 * @acceptable: A function used to test possible &dentries to see if they are
47 * acceptable
48 * @context: A parameter to @acceptable so that it knows on what basis to
49 * judge.
50 *
51 * find_exported_dentry is the central helper routine to enable file systems
52 * to provide the decode_fh() export_operation. Its main task is to take
53 * an &inode, find or create an appropriate &dentry structure, and possibly
54 * splice this into the dcache in the correct place.
55 *
56 * The decode_fh() operation provided by the filesystem should call
57 * find_exported_dentry() with the same parameters that it received except
58 * that instead of the file handle fragment, pointers to opaque identifiers
59 * for the object and optionally its parent are passed. The default decode_fh
60 * routine passes one pointer to the start of the filehandle fragment, and
61 * one 8 bytes into the fragment. It is expected that most filesystems will
62 * take this approach, though the offset to the parent identifier may well be
63 * different.
64 *
65 * find_exported_dentry() will call get_dentry to get a dentry pointer from
66 * the file system. If any &dentry in the d_alias list is acceptable, it will
67 * be returned. Otherwise find_exported_dentry() will attempt to splice a new
68 * &dentry into the dcache using get_name() and get_parent() to find the
69 * appropriate place.
70 */ 75 */
71 76static struct dentry *
72struct dentry * 77find_disconnected_root(struct dentry *dentry)
73find_exported_dentry(struct super_block *sb, void *obj, void *parent,
74 int (*acceptable)(void *context, struct dentry *de),
75 void *context)
76{ 78{
77 struct dentry *result = NULL; 79 dget(dentry);
78 struct dentry *target_dir; 80 spin_lock(&dentry->d_lock);
79 int err; 81 while (!IS_ROOT(dentry) &&
80 struct export_operations *nops = sb->s_export_op; 82 (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) {
81 struct dentry *alias; 83 struct dentry *parent = dentry->d_parent;
82 int noprogress; 84 dget(parent);
83 char nbuf[NAME_MAX+1]; 85 spin_unlock(&dentry->d_lock);
84 86 dput(dentry);
85 /* 87 dentry = parent;
86 * Attempt to find the inode. 88 spin_lock(&dentry->d_lock);
87 */
88 result = CALL(sb->s_export_op,get_dentry)(sb,obj);
89 err = -ESTALE;
90 if (result == NULL)
91 goto err_out;
92 if (IS_ERR(result)) {
93 err = PTR_ERR(result);
94 goto err_out;
95 } 89 }
96 if (S_ISDIR(result->d_inode->i_mode) && 90 spin_unlock(&dentry->d_lock);
97 (result->d_flags & DCACHE_DISCONNECTED)) { 91 return dentry;
98 /* it is an unconnected directory, we must connect it */ 92}
99 ;
100 } else {
101 if (acceptable(context, result))
102 return result;
103 if (S_ISDIR(result->d_inode->i_mode)) {
104 err = -EACCES;
105 goto err_result;
106 }
107 93
108 alias = find_acceptable_alias(result, acceptable, context);
109 if (alias)
110 return alias;
111 }
112
113 /* It's a directory, or we are required to confirm the file's
114 * location in the tree based on the parent information
115 */
116 dprintk("find_exported_dentry: need to look harder for %s/%d\n",sb->s_id,*(int*)obj);
117 if (S_ISDIR(result->d_inode->i_mode))
118 target_dir = dget(result);
119 else {
120 if (parent == NULL)
121 goto err_result;
122 94
123 target_dir = CALL(sb->s_export_op,get_dentry)(sb,parent); 95/*
124 if (IS_ERR(target_dir)) 96 * Make sure target_dir is fully connected to the dentry tree.
125 err = PTR_ERR(target_dir); 97 *
126 if (target_dir == NULL || IS_ERR(target_dir)) 98 * It may already be, as the flag isn't always updated when connection happens.
127 goto err_result; 99 */
128 } 100static int
129 /* 101reconnect_path(struct super_block *sb, struct dentry *target_dir)
130 * Now we need to make sure that target_dir is properly connected. 102{
131 * It may already be, as the flag isn't always updated when connection 103 char nbuf[NAME_MAX+1];
132 * happens. 104 int noprogress = 0;
133 * So, we walk up parent links until we find a connected directory, 105 int err = -ESTALE;
134 * or we run out of directories. Then we find the parent, find
135 * the name of the child in that parent, and do a lookup.
136 * This should connect the child into the parent
137 * We then repeat.
138 */
139 106
140 /* it is possible that a confused file system might not let us complete 107 /*
108 * It is possible that a confused file system might not let us complete
141 * the path to the root. For example, if get_parent returns a directory 109 * the path to the root. For example, if get_parent returns a directory
142 * in which we cannot find a name for the child. While this implies a 110 * in which we cannot find a name for the child. While this implies a
143 * very sick filesystem we don't want it to cause knfsd to spin. Hence 111 * very sick filesystem we don't want it to cause knfsd to spin. Hence
144 * the noprogress counter. If we go through the loop 10 times (2 is 112 * the noprogress counter. If we go through the loop 10 times (2 is
145 * probably enough) without getting anywhere, we just give up 113 * probably enough) without getting anywhere, we just give up
146 */ 114 */
147 noprogress= 0;
148 while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) { 115 while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) {
149 struct dentry *pd = target_dir; 116 struct dentry *pd = find_disconnected_root(target_dir);
150
151 dget(pd);
152 spin_lock(&pd->d_lock);
153 while (!IS_ROOT(pd) &&
154 (pd->d_parent->d_flags&DCACHE_DISCONNECTED)) {
155 struct dentry *parent = pd->d_parent;
156
157 dget(parent);
158 spin_unlock(&pd->d_lock);
159 dput(pd);
160 pd = parent;
161 spin_lock(&pd->d_lock);
162 }
163 spin_unlock(&pd->d_lock);
164 117
165 if (!IS_ROOT(pd)) { 118 if (!IS_ROOT(pd)) {
166 /* must have found a connected parent - great */ 119 /* must have found a connected parent - great */
@@ -175,29 +128,40 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
175 spin_unlock(&pd->d_lock); 128 spin_unlock(&pd->d_lock);
176 noprogress = 0; 129 noprogress = 0;
177 } else { 130 } else {
178 /* we have hit the top of a disconnected path. Try 131 /*
179 * to find parent and connect 132 * We have hit the top of a disconnected path, try to
180 * note: racing with some other process renaming a 133 * find parent and connect.
181 * directory isn't much of a problem here. If someone 134 *
182 * renames the directory, it will end up properly 135 * Racing with some other process renaming a directory
183 * connected, which is what we want 136 * isn't much of a problem here. If someone renames
137 * the directory, it will end up properly connected,
138 * which is what we want
139 *
140 * Getting the parent can't be supported generically,
141 * the locking is too icky.
142 *
143 * Instead we just return EACCES. If server reboots
144 * or inodes get flushed, you lose
184 */ 145 */
185 struct dentry *ppd; 146 struct dentry *ppd = ERR_PTR(-EACCES);
186 struct dentry *npd; 147 struct dentry *npd;
187 148
188 mutex_lock(&pd->d_inode->i_mutex); 149 mutex_lock(&pd->d_inode->i_mutex);
189 ppd = CALL(nops,get_parent)(pd); 150 if (sb->s_export_op->get_parent)
151 ppd = sb->s_export_op->get_parent(pd);
190 mutex_unlock(&pd->d_inode->i_mutex); 152 mutex_unlock(&pd->d_inode->i_mutex);
191 153
192 if (IS_ERR(ppd)) { 154 if (IS_ERR(ppd)) {
193 err = PTR_ERR(ppd); 155 err = PTR_ERR(ppd);
194 dprintk("find_exported_dentry: get_parent of %ld failed, err %d\n", 156 dprintk("%s: get_parent of %ld failed, err %d\n",
195 pd->d_inode->i_ino, err); 157 __FUNCTION__, pd->d_inode->i_ino, err);
196 dput(pd); 158 dput(pd);
197 break; 159 break;
198 } 160 }
199 dprintk("find_exported_dentry: find name of %lu in %lu\n", pd->d_inode->i_ino, ppd->d_inode->i_ino); 161
200 err = CALL(nops,get_name)(ppd, nbuf, pd); 162 dprintk("%s: find name of %lu in %lu\n", __FUNCTION__,
163 pd->d_inode->i_ino, ppd->d_inode->i_ino);
164 err = exportfs_get_name(ppd, nbuf, pd);
201 if (err) { 165 if (err) {
202 dput(ppd); 166 dput(ppd);
203 dput(pd); 167 dput(pd);
@@ -208,13 +172,14 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
208 continue; 172 continue;
209 break; 173 break;
210 } 174 }
211 dprintk("find_exported_dentry: found name: %s\n", nbuf); 175 dprintk("%s: found name: %s\n", __FUNCTION__, nbuf);
212 mutex_lock(&ppd->d_inode->i_mutex); 176 mutex_lock(&ppd->d_inode->i_mutex);
213 npd = lookup_one_len(nbuf, ppd, strlen(nbuf)); 177 npd = lookup_one_len(nbuf, ppd, strlen(nbuf));
214 mutex_unlock(&ppd->d_inode->i_mutex); 178 mutex_unlock(&ppd->d_inode->i_mutex);
215 if (IS_ERR(npd)) { 179 if (IS_ERR(npd)) {
216 err = PTR_ERR(npd); 180 err = PTR_ERR(npd);
217 dprintk("find_exported_dentry: lookup failed: %d\n", err); 181 dprintk("%s: lookup failed: %d\n",
182 __FUNCTION__, err);
218 dput(ppd); 183 dput(ppd);
219 dput(pd); 184 dput(pd);
220 break; 185 break;
@@ -227,7 +192,7 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
227 if (npd == pd) 192 if (npd == pd)
228 noprogress = 0; 193 noprogress = 0;
229 else 194 else
230 printk("find_exported_dentry: npd != pd\n"); 195 printk("%s: npd != pd\n", __FUNCTION__);
231 dput(npd); 196 dput(npd);
232 dput(ppd); 197 dput(ppd);
233 if (IS_ROOT(pd)) { 198 if (IS_ROOT(pd)) {
@@ -243,15 +208,101 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
243 /* something went wrong - oh-well */ 208 /* something went wrong - oh-well */
244 if (!err) 209 if (!err)
245 err = -ESTALE; 210 err = -ESTALE;
246 goto err_target; 211 return err;
247 } 212 }
248 	/* if we weren't after a directory, have one more step to go */
249 	if (result != target_dir) {
250 		struct dentry *nresult;
251 		err = CALL(nops,get_name)(target_dir, nbuf, result);
213
214 	return 0;
215 }
216
217/**
218 * find_exported_dentry - helper routine to implement export_operations->decode_fh
219 * @sb: The &super_block identifying the filesystem
220 * @obj: An opaque identifier of the object to be found - passed to
221 * get_inode
222 * @parent: An optional opaque identifier of the parent of the object.
223 * @acceptable: A function used to test possible &dentries to see if they are
224 * acceptable
225 * @context: A parameter to @acceptable so that it knows on what basis to
226 * judge.
227 *
228 * find_exported_dentry is the central helper routine to enable file systems
229 * to provide the decode_fh() export_operation. Its main task is to take
230 * an &inode, find or create an appropriate &dentry structure, and possibly
231 * splice this into the dcache in the correct place.
232 *
233 * The decode_fh() operation provided by the filesystem should call
234 * find_exported_dentry() with the same parameters that it received except
235 * that instead of the file handle fragment, pointers to opaque identifiers
236 * for the object and optionally its parent are passed. The default decode_fh
237 * routine passes one pointer to the start of the filehandle fragment, and
238 * one 8 bytes into the fragment. It is expected that most filesystems will
239 * take this approach, though the offset to the parent identifier may well be
240 * different.
241 *
242 * find_exported_dentry() will call get_dentry to get a dentry pointer from
243 * the file system. If any &dentry in the d_alias list is acceptable, it will
244 * be returned. Otherwise find_exported_dentry() will attempt to splice a new
245 * &dentry into the dcache using get_name() and get_parent() to find the
246 * appropriate place.
247 */
248
249struct dentry *
250find_exported_dentry(struct super_block *sb, void *obj, void *parent,
251 int (*acceptable)(void *context, struct dentry *de),
252 void *context)
253{
254 struct dentry *result, *alias;
255 int err = -ESTALE;
256
257 /*
258 * Attempt to find the inode.
259 */
260 result = exportfs_get_dentry(sb, obj);
261 if (IS_ERR(result))
262 return result;
263
264 if (S_ISDIR(result->d_inode->i_mode)) {
265 if (!(result->d_flags & DCACHE_DISCONNECTED)) {
266 if (acceptable(context, result))
267 return result;
268 err = -EACCES;
269 goto err_result;
270 }
271
272 err = reconnect_path(sb, result);
273 if (err)
274 goto err_result;
275 } else {
276 struct dentry *target_dir, *nresult;
277 char nbuf[NAME_MAX+1];
278
279 alias = find_acceptable_alias(result, acceptable, context);
280 if (alias)
281 return alias;
282
283 if (parent == NULL)
284 goto err_result;
285
286 target_dir = exportfs_get_dentry(sb,parent);
287 if (IS_ERR(target_dir)) {
288 err = PTR_ERR(target_dir);
289 goto err_result;
290 }
291
292 err = reconnect_path(sb, target_dir);
293 if (err) {
294 dput(target_dir);
295 goto err_result;
296 }
297
298 /*
299 * As we weren't after a directory, have one more step to go.
300 */
301 err = exportfs_get_name(target_dir, nbuf, result);
252 if (!err) { 302 if (!err) {
253 mutex_lock(&target_dir->d_inode->i_mutex); 303 mutex_lock(&target_dir->d_inode->i_mutex);
254 nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); 304 nresult = lookup_one_len(nbuf, target_dir,
305 strlen(nbuf));
255 mutex_unlock(&target_dir->d_inode->i_mutex); 306 mutex_unlock(&target_dir->d_inode->i_mutex);
256 if (!IS_ERR(nresult)) { 307 if (!IS_ERR(nresult)) {
257 if (nresult->d_inode) { 308 if (nresult->d_inode) {
@@ -261,11 +312,8 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
261 dput(nresult); 312 dput(nresult);
262 } 313 }
263 } 314 }
315 dput(target_dir);
264 } 316 }
265 dput(target_dir);
266 /* now result is properly connected, it is our best bet */
267 if (acceptable(context, result))
268 return result;
269 317
270 alias = find_acceptable_alias(result, acceptable, context); 318 alias = find_acceptable_alias(result, acceptable, context);
271 if (alias) 319 if (alias)
@@ -275,32 +323,16 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
275 dput(result); 323 dput(result);
276 /* It might be justifiable to return ESTALE here, 324 /* It might be justifiable to return ESTALE here,
277 * but the filehandle at least looks reasonably good 325 * but the filehandle at least looks reasonably good
278 * and it just be a permission problem, so returning 326 * and it may just be a permission problem, so returning
279 * -EACCES is safer 327 * -EACCES is safer
280 */ 328 */
281 return ERR_PTR(-EACCES); 329 return ERR_PTR(-EACCES);
282 330
283 err_target:
284 dput(target_dir);
285 err_result: 331 err_result:
286 dput(result); 332 dput(result);
287 err_out:
288 return ERR_PTR(err); 333 return ERR_PTR(err);
289} 334}
290 335
291
292
293static struct dentry *get_parent(struct dentry *child)
294{
295 /* get_parent cannot be supported generically, the locking
296 * is too icky.
297 * instead, we just return EACCES. If server reboots or inodes
298 * get flushed, you lose
299 */
300 return ERR_PTR(-EACCES);
301}
302
303
304struct getdents_callback { 336struct getdents_callback {
305 	char *name;	/* name that was found. It already points to a 337 	char *name;	/* name that was found. It already points to a
306 	 * buffer NAME_MAX+1 is size */ 338 	 * buffer NAME_MAX+1 is size */
@@ -390,61 +422,6 @@ out:
390 return error; 422 return error;
391} 423}
392 424
393
394static struct dentry *export_iget(struct super_block *sb, unsigned long ino, __u32 generation)
395{
396
397 /* iget isn't really right if the inode is currently unallocated!!
398 * This should really all be done inside each filesystem
399 *
400 * ext2fs' read_inode has been strengthened to return a bad_inode if
401 * the inode had been deleted.
402 *
403 * Currently we don't know the generation for parent directory, so
404 * a generation of 0 means "accept any"
405 */
406 struct inode *inode;
407 struct dentry *result;
408 if (ino == 0)
409 return ERR_PTR(-ESTALE);
410 inode = iget(sb, ino);
411 if (inode == NULL)
412 return ERR_PTR(-ENOMEM);
413 if (is_bad_inode(inode)
414 || (generation && inode->i_generation != generation)
415 ) {
416 /* we didn't find the right inode.. */
417 dprintk("fh_verify: Inode %lu, Bad count: %d %d or version %u %u\n",
418 inode->i_ino,
419 inode->i_nlink, atomic_read(&inode->i_count),
420 inode->i_generation,
421 generation);
422
423 iput(inode);
424 return ERR_PTR(-ESTALE);
425 }
426 /* now to find a dentry.
427 * If possible, get a well-connected one
428 */
429 result = d_alloc_anon(inode);
430 if (!result) {
431 iput(inode);
432 return ERR_PTR(-ENOMEM);
433 }
434 return result;
435}
436
437
438static struct dentry *get_object(struct super_block *sb, void *vobjp)
439{
440 __u32 *objp = vobjp;
441 unsigned long ino = objp[0];
442 __u32 generation = objp[1];
443
444 return export_iget(sb, ino, generation);
445}
446
447
448/** 425/**
449 * export_encode_fh - default export_operations->encode_fh function 426 * export_encode_fh - default export_operations->encode_fh function
450 * @dentry: the dentry to encode 427 * @dentry: the dentry to encode
@@ -517,16 +494,40 @@ static struct dentry *export_decode_fh(struct super_block *sb, __u32 *fh, int fh
517 acceptable, context); 494 acceptable, context);
518} 495}
519 496
520 struct export_operations export_op_default = {
521 	.decode_fh	= export_decode_fh,
522 	.encode_fh	= export_encode_fh,
523
524 	.get_name	= get_name,
525 	.get_parent	= get_parent,
526 	.get_dentry	= get_object,
527 };
528
529 EXPORT_SYMBOL(export_op_default);
497 int exportfs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len,
498 		int connectable)
499 {
500 	struct export_operations *nop = dentry->d_sb->s_export_op;
501 	int error;
502
503 	if (nop->encode_fh)
504 		error = nop->encode_fh(dentry, fh, max_len, connectable);
505 	else
506 		error = export_encode_fh(dentry, fh, max_len, connectable);
507
508 	return error;
509 }
510 EXPORT_SYMBOL_GPL(exportfs_encode_fh);
511
512 struct dentry *exportfs_decode_fh(struct vfsmount *mnt, __u32 *fh, int fh_len,
513 		int fileid_type, int (*acceptable)(void *, struct dentry *),
514 		void *context)
515 {
516 	struct export_operations *nop = mnt->mnt_sb->s_export_op;
517 	struct dentry *result;
518
519 	if (nop->decode_fh) {
520 		result = nop->decode_fh(mnt->mnt_sb, fh, fh_len, fileid_type,
521 				acceptable, context);
522 	} else {
523 		result = export_decode_fh(mnt->mnt_sb, fh, fh_len, fileid_type,
524 				acceptable, context);
525 	}
526
527 	return result;
528 }
529 EXPORT_SYMBOL_GPL(exportfs_decode_fh);
530
530 EXPORT_SYMBOL(find_exported_dentry); 531 EXPORT_SYMBOL(find_exported_dentry);
531  532
532 MODULE_LICENSE("GPL"); 533 MODULE_LICENSE("GPL");
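
[Aside: the reconnection walk that this file refactors into reconnect_path() is easier to follow outside the dcache machinery. Below is a minimal user-space sketch, assuming a hypothetical node structure in place of struct dentry; find_disconnected_root() comes from the patch, but the struct, the connected flag, and main() are invented for illustration, and the get_parent()/get_name() round trip is elided.]

#include <stdbool.h>
#include <stdio.h>

struct node {
	struct node *parent;
	bool connected;
	const char *name;
};

/* climb to the topmost still-disconnected ancestor, like find_disconnected_root() */
static struct node *find_disconnected_root(struct node *n)
{
	while (n->parent && !n->parent->connected)
		n = n->parent;
	return n;
}

static int reconnect(struct node *target)
{
	int noprogress = 0;

	while (!target->connected && noprogress++ < 10) {
		struct node *top = find_disconnected_root(target);

		if (top->parent && top->parent->connected) {
			/* found a connected parent: the whole chain below is reachable */
			for (struct node *n = target; n != top->parent; n = n->parent)
				n->connected = true;
			noprogress = 0;
		}
		/* else: ask the filesystem for the parent; omitted in this sketch */
	}
	return target->connected ? 0 : -1;
}

int main(void)
{
	struct node root = { NULL, true, "/" };
	struct node a = { &root, false, "a" };
	struct node b = { &a, false, "b" };

	printf("reconnect: %d\n", reconnect(&b));	/* prints 0 */
	return 0;
}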
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 04afeecaaef3..ab7961260c49 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -24,9 +24,9 @@
24#include "acl.h" 24#include "acl.h"
25 25
26/* 26/*
27 * Called when an inode is released. Note that this is different 27 * Called when filp is released. This happens when all file descriptors
28 * from ext2_open_file: open gets called at every open, but release 28 * for a single struct file are closed. Note that different open() calls
29 * gets called only when /all/ the files are closed. 29 * for the same file yield different struct file structures.
30 */ 30 */
31static int ext2_release_file (struct inode * inode, struct file * filp) 31static int ext2_release_file (struct inode * inode, struct file * filp)
32{ 32{
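
[Aside: the rewritten comment deserves a concrete illustration. In this user-space sketch, dup() copies the descriptor but not the open file description, so the kernel-side ->release() runs only once, when the last copy is closed; a second open() creates an independent struct file. The file path here is arbitrary.]

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd1 = open("/etc/hostname", O_RDONLY);
	if (fd1 < 0)
		return 1;
	int fd2 = dup(fd1);                        /* same struct file underneath */
	int fd3 = open("/etc/hostname", O_RDONLY); /* a second struct file */

	lseek(fd1, 3, SEEK_SET);
	printf("fd2 offset: %ld\n", (long)lseek(fd2, 0, SEEK_CUR)); /* 3: shared  */
	printf("fd3 offset: %ld\n", (long)lseek(fd3, 0, SEEK_CUR)); /* 0: private */

	close(fd1);	/* description still open via fd2 */
	close(fd2);	/* last reference: ->release() fires here */
	close(fd3);
	return 0;
}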
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 5de5061eb331..3eefa97fe204 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -25,6 +25,7 @@
25#include <linux/parser.h> 25#include <linux/parser.h>
26#include <linux/random.h> 26#include <linux/random.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/exportfs.h>
28#include <linux/smp_lock.h> 29#include <linux/smp_lock.h>
29#include <linux/vfs.h> 30#include <linux/vfs.h>
30#include <linux/seq_file.h> 31#include <linux/seq_file.h>
@@ -1099,15 +1100,18 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1099 struct super_block *sb = dentry->d_sb; 1100 struct super_block *sb = dentry->d_sb;
1100 struct ext2_sb_info *sbi = EXT2_SB(sb); 1101 struct ext2_sb_info *sbi = EXT2_SB(sb);
1101 struct ext2_super_block *es = sbi->s_es; 1102 struct ext2_super_block *es = sbi->s_es;
1102 unsigned long overhead;
1103 int i;
1104 u64 fsid; 1103 u64 fsid;
1105 1104
1106 if (test_opt (sb, MINIX_DF)) 1105 if (test_opt (sb, MINIX_DF))
1107 overhead = 0; 1106 sbi->s_overhead_last = 0;
1108 else { 1107 else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
1108 unsigned long i, overhead = 0;
1109 smp_rmb();
1110
1109 /* 1111 /*
1110 * Compute the overhead (FS structures) 1112 * Compute the overhead (FS structures). This is constant
1113 * for a given filesystem unless the number of block groups
1114 * changes so we cache the previous value until it does.
1111 */ 1115 */
1112 1116
1113 /* 1117 /*
@@ -1131,17 +1135,22 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1131 */ 1135 */
1132 overhead += (sbi->s_groups_count * 1136 overhead += (sbi->s_groups_count *
1133 (2 + sbi->s_itb_per_group)); 1137 (2 + sbi->s_itb_per_group));
1138 sbi->s_overhead_last = overhead;
1139 smp_wmb();
1140 sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
1134 } 1141 }
1135 1142
1136 buf->f_type = EXT2_SUPER_MAGIC; 1143 buf->f_type = EXT2_SUPER_MAGIC;
1137 buf->f_bsize = sb->s_blocksize; 1144 buf->f_bsize = sb->s_blocksize;
1138 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; 1145 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
1139 buf->f_bfree = ext2_count_free_blocks(sb); 1146 buf->f_bfree = ext2_count_free_blocks(sb);
1147 es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
1140 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); 1148 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
1141 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) 1149 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
1142 buf->f_bavail = 0; 1150 buf->f_bavail = 0;
1143 buf->f_files = le32_to_cpu(es->s_inodes_count); 1151 buf->f_files = le32_to_cpu(es->s_inodes_count);
1144 buf->f_ffree = ext2_count_free_inodes(sb); 1152 buf->f_ffree = ext2_count_free_inodes(sb);
1153 es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
1145 buf->f_namelen = EXT2_NAME_LEN; 1154 buf->f_namelen = EXT2_NAME_LEN;
1146 fsid = le64_to_cpup((void *)es->s_uuid) ^ 1155 fsid = le64_to_cpup((void *)es->s_uuid) ^
1147 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 1156 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
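
[Aside: the statfs change caches the overhead computation, keyed on the block count. A stripped-down sketch of the pattern, with field names mirroring the patch; the smp_rmb()/smp_wmb() pairing and the real per-group walk are omitted.]

#include <stdio.h>

struct fs_stats {
	unsigned long blocks_last;	/* input the cache was computed for */
	unsigned long overhead_last;	/* cached result */
};

static unsigned long compute_overhead(unsigned long blocks)
{
	return blocks / 100;	/* stand-in for walking all block groups */
}

static unsigned long get_overhead(struct fs_stats *st, unsigned long blocks)
{
	if (st->blocks_last != blocks) {
		st->overhead_last = compute_overhead(blocks);
		st->blocks_last = blocks;
	}
	return st->overhead_last;
}

int main(void)
{
	struct fs_stats st = { 0, 0 };

	printf("%lu\n", get_overhead(&st, 1000));	/* computed   */
	printf("%lu\n", get_overhead(&st, 1000));	/* cached     */
	printf("%lu\n", get_overhead(&st, 2000));	/* recomputed */
	return 0;
}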
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2a85ddee4740..de4e3161e479 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3195,7 +3195,7 @@ int ext3_change_inode_journal_flag(struct inode *inode, int val)
3195 */ 3195 */
3196 3196
3197 journal = EXT3_JOURNAL(inode); 3197 journal = EXT3_JOURNAL(inode);
3198 if (is_journal_aborted(journal) || IS_RDONLY(inode)) 3198 if (is_journal_aborted(journal))
3199 return -EROFS; 3199 return -EROFS;
3200 3200
3201 journal_lock_updates(journal); 3201 journal_lock_updates(journal);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 9bb046df827a..1586807b8177 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1019,6 +1019,11 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
1019 1019
1020 if (!inode) 1020 if (!inode)
1021 return ERR_PTR(-EACCES); 1021 return ERR_PTR(-EACCES);
1022
1023 if (is_bad_inode(inode)) {
1024 iput(inode);
1025 return ERR_PTR(-ENOENT);
1026 }
1022 } 1027 }
1023 return d_splice_alias(inode, dentry); 1028 return d_splice_alias(inode, dentry);
1024} 1029}
@@ -1054,6 +1059,11 @@ struct dentry *ext3_get_parent(struct dentry *child)
1054 if (!inode) 1059 if (!inode)
1055 return ERR_PTR(-EACCES); 1060 return ERR_PTR(-EACCES);
1056 1061
1062 if (is_bad_inode(inode)) {
1063 iput(inode);
1064 return ERR_PTR(-ENOENT);
1065 }
1066
1057 parent = d_alloc_anon(inode); 1067 parent = d_alloc_anon(inode);
1058 if (!parent) { 1068 if (!parent) {
1059 iput(inode); 1069 iput(inode);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6e3062913a92..4f84dc86628a 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -29,12 +29,14 @@
29#include <linux/parser.h> 29#include <linux/parser.h>
30#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
32#include <linux/exportfs.h>
32#include <linux/vfs.h> 33#include <linux/vfs.h>
33#include <linux/random.h> 34#include <linux/random.h>
34#include <linux/mount.h> 35#include <linux/mount.h>
35#include <linux/namei.h> 36#include <linux/namei.h>
36#include <linux/quotaops.h> 37#include <linux/quotaops.h>
37#include <linux/seq_file.h> 38#include <linux/seq_file.h>
39#include <linux/log2.h>
38 40
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
40 42
@@ -459,6 +461,14 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
459 461
460static void ext3_destroy_inode(struct inode *inode) 462static void ext3_destroy_inode(struct inode *inode)
461{ 463{
464 if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
465 printk("EXT3 Inode %p: orphan list check failed!\n",
466 EXT3_I(inode));
467 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
468 EXT3_I(inode), sizeof(struct ext3_inode_info),
469 false);
470 dump_stack();
471 }
462 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); 472 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
463} 473}
464 474
@@ -1566,7 +1576,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1566 sbi->s_inode_size = le16_to_cpu(es->s_inode_size); 1576 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
1567 sbi->s_first_ino = le32_to_cpu(es->s_first_ino); 1577 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
1568 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || 1578 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
1569 (sbi->s_inode_size & (sbi->s_inode_size - 1)) || 1579 (!is_power_of_2(sbi->s_inode_size)) ||
1570 (sbi->s_inode_size > blocksize)) { 1580 (sbi->s_inode_size > blocksize)) {
1571 printk (KERN_ERR 1581 printk (KERN_ERR
1572 "EXT3-fs: unsupported inode size: %d\n", 1582 "EXT3-fs: unsupported inode size: %d\n",
@@ -2075,6 +2085,7 @@ static int ext3_create_journal(struct super_block * sb,
2075 unsigned int journal_inum) 2085 unsigned int journal_inum)
2076{ 2086{
2077 journal_t *journal; 2087 journal_t *journal;
2088 int err;
2078 2089
2079 if (sb->s_flags & MS_RDONLY) { 2090 if (sb->s_flags & MS_RDONLY) {
2080 printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to " 2091 printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
@@ -2082,13 +2093,15 @@ static int ext3_create_journal(struct super_block * sb,
2082 return -EROFS; 2093 return -EROFS;
2083 } 2094 }
2084 2095
2085 if (!(journal = ext3_get_journal(sb, journal_inum))) 2096 journal = ext3_get_journal(sb, journal_inum);
2097 if (!journal)
2086 return -EINVAL; 2098 return -EINVAL;
2087 2099
2088 printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n", 2100 printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n",
2089 journal_inum); 2101 journal_inum);
2090 2102
2091 if (journal_create(journal)) { 2103 err = journal_create(journal);
2104 if (err) {
2092 printk(KERN_ERR "EXT3-fs: error creating journal.\n"); 2105 printk(KERN_ERR "EXT3-fs: error creating journal.\n");
2093 journal_destroy(journal); 2106 journal_destroy(journal);
2094 return -EIO; 2107 return -EIO;
@@ -2139,12 +2152,14 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
2139 2152
2140 journal_lock_updates(journal); 2153 journal_lock_updates(journal);
2141 journal_flush(journal); 2154 journal_flush(journal);
2155 lock_super(sb);
2142 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && 2156 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
2143 sb->s_flags & MS_RDONLY) { 2157 sb->s_flags & MS_RDONLY) {
2144 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); 2158 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2145 sb->s_dirt = 0; 2159 sb->s_dirt = 0;
2146 ext3_commit_super(sb, es, 1); 2160 ext3_commit_super(sb, es, 1);
2147 } 2161 }
2162 unlock_super(sb);
2148 journal_unlock_updates(journal); 2163 journal_unlock_updates(journal);
2149} 2164}
2150 2165
@@ -2333,7 +2348,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2333 (sbi->s_mount_state & EXT3_VALID_FS)) 2348 (sbi->s_mount_state & EXT3_VALID_FS))
2334 es->s_state = cpu_to_le16(sbi->s_mount_state); 2349 es->s_state = cpu_to_le16(sbi->s_mount_state);
2335 2350
2351 /*
2352 * We have to unlock super so that we can wait for
2353 * transactions.
2354 */
2355 unlock_super(sb);
2336 ext3_mark_recovery_complete(sb, es); 2356 ext3_mark_recovery_complete(sb, es);
2357 lock_super(sb);
2337 } else { 2358 } else {
2338 __le32 ret; 2359 __le32 ret;
2339 if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, 2360 if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
@@ -2406,19 +2427,19 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2406 struct super_block *sb = dentry->d_sb; 2427 struct super_block *sb = dentry->d_sb;
2407 struct ext3_sb_info *sbi = EXT3_SB(sb); 2428 struct ext3_sb_info *sbi = EXT3_SB(sb);
2408 struct ext3_super_block *es = sbi->s_es; 2429 struct ext3_super_block *es = sbi->s_es;
2409 ext3_fsblk_t overhead;
2410 int i;
2411 u64 fsid; 2430 u64 fsid;
2412 2431
2413 if (test_opt (sb, MINIX_DF)) 2432 if (test_opt(sb, MINIX_DF)) {
2414 overhead = 0; 2433 sbi->s_overhead_last = 0;
2415 else { 2434 } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
2416 unsigned long ngroups; 2435 unsigned long ngroups = sbi->s_groups_count, i;
2417 ngroups = EXT3_SB(sb)->s_groups_count; 2436 ext3_fsblk_t overhead = 0;
2418 smp_rmb(); 2437 smp_rmb();
2419 2438
2420 /* 2439 /*
2421 * Compute the overhead (FS structures) 2440 * Compute the overhead (FS structures). This is constant
2441 * for a given filesystem unless the number of block groups
2442 * changes so we cache the previous value until it does.
2422 */ 2443 */
2423 2444
2424 /* 2445 /*
@@ -2442,18 +2463,23 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2442 * Every block group has an inode bitmap, a block 2463 * Every block group has an inode bitmap, a block
2443 * bitmap, and an inode table. 2464 * bitmap, and an inode table.
2444 */ 2465 */
2445 overhead += (ngroups * (2 + EXT3_SB(sb)->s_itb_per_group)); 2466 overhead += ngroups * (2 + sbi->s_itb_per_group);
2467 sbi->s_overhead_last = overhead;
2468 smp_wmb();
2469 sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
2446 } 2470 }
2447 2471
2448 buf->f_type = EXT3_SUPER_MAGIC; 2472 buf->f_type = EXT3_SUPER_MAGIC;
2449 buf->f_bsize = sb->s_blocksize; 2473 buf->f_bsize = sb->s_blocksize;
2450 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; 2474 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
2451 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter); 2475 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
2476 es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
2452 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); 2477 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
2453 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) 2478 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
2454 buf->f_bavail = 0; 2479 buf->f_bavail = 0;
2455 buf->f_files = le32_to_cpu(es->s_inodes_count); 2480 buf->f_files = le32_to_cpu(es->s_inodes_count);
2456 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter); 2481 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
2482 es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
2457 buf->f_namelen = EXT3_NAME_LEN; 2483 buf->f_namelen = EXT3_NAME_LEN;
2458 fsid = le64_to_cpup((void *)es->s_uuid) ^ 2484 fsid = le64_to_cpup((void *)es->s_uuid) ^
2459 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 2485 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
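
[Aside: is_power_of_2(), introduced above in place of the open-coded test, relies on the usual bit trick: a power of two has exactly one bit set, so n & (n - 1) clears it to zero. A self-contained check of the equivalence:]

#include <assert.h>
#include <stdbool.h>

static bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	assert(is_power_of_2(128));	/* a valid inode size */
	assert(!is_power_of_2(0));	/* zero is excluded   */
	assert(!is_power_of_2(96));
	return 0;
}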
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 3b64bb16c727..9de54ae48dee 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1585,7 +1585,7 @@ allocated:
1585 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no); 1585 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
1586 1586
1587 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) || 1587 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
1588 in_range(ext4_block_bitmap(sb, gdp), ret_block, num) || 1588 in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
1589 in_range(ret_block, ext4_inode_table(sb, gdp), 1589 in_range(ret_block, ext4_inode_table(sb, gdp),
1590 EXT4_SB(sb)->s_itb_per_group) || 1590 EXT4_SB(sb)->s_itb_per_group) ||
1591 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp), 1591 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
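
[Aside: this one-line fix replaces a duplicated block-bitmap test with the intended inode-bitmap test. A sketch of the overlap check, with in_range() written out the way ext4 defines it — treat the macro body as an assumption of this sketch:]

#include <stdio.h>

#define in_range(b, first, len)  ((b) >= (first) && (b) <= (first) + (len) - 1)

int main(void)
{
	unsigned long block_bitmap = 100, inode_bitmap = 101, ret_block = 101;

	/* the old code tested block_bitmap twice, missing this overlap */
	if (in_range(block_bitmap, ret_block, 1) ||
	    in_range(inode_bitmap, ret_block, 1))
		printf("allocation overlaps fs metadata\n");
	return 0;
}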
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2811e5720ad0..2de339dd7554 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1017,6 +1017,11 @@ static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, str
1017 1017
1018 if (!inode) 1018 if (!inode)
1019 return ERR_PTR(-EACCES); 1019 return ERR_PTR(-EACCES);
1020
1021 if (is_bad_inode(inode)) {
1022 iput(inode);
1023 return ERR_PTR(-ENOENT);
1024 }
1020 } 1025 }
1021 return d_splice_alias(inode, dentry); 1026 return d_splice_alias(inode, dentry);
1022} 1027}
@@ -1052,6 +1057,11 @@ struct dentry *ext4_get_parent(struct dentry *child)
1052 if (!inode) 1057 if (!inode)
1053 return ERR_PTR(-EACCES); 1058 return ERR_PTR(-EACCES);
1054 1059
1060 if (is_bad_inode(inode)) {
1061 iput(inode);
1062 return ERR_PTR(-ENOENT);
1063 }
1064
1055 parent = d_alloc_anon(inode); 1065 parent = d_alloc_anon(inode);
1056 if (!parent) { 1066 if (!parent) {
1057 iput(inode); 1067 iput(inode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 175b68c60968..b806e689c4aa 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -29,6 +29,7 @@
29#include <linux/parser.h> 29#include <linux/parser.h>
30#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
32#include <linux/exportfs.h>
32#include <linux/vfs.h> 33#include <linux/vfs.h>
33#include <linux/random.h> 34#include <linux/random.h>
34#include <linux/mount.h> 35#include <linux/mount.h>
@@ -510,6 +511,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
510 511
511static void ext4_destroy_inode(struct inode *inode) 512static void ext4_destroy_inode(struct inode *inode)
512{ 513{
514 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
515 printk("EXT4 Inode %p: orphan list check failed!\n",
516 EXT4_I(inode));
517 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
518 EXT4_I(inode), sizeof(struct ext4_inode_info),
519 true);
520 dump_stack();
521 }
513 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); 522 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
514} 523}
515 524
@@ -2150,6 +2159,7 @@ static int ext4_create_journal(struct super_block * sb,
2150 unsigned int journal_inum) 2159 unsigned int journal_inum)
2151{ 2160{
2152 journal_t *journal; 2161 journal_t *journal;
2162 int err;
2153 2163
2154 if (sb->s_flags & MS_RDONLY) { 2164 if (sb->s_flags & MS_RDONLY) {
2155 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to " 2165 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
@@ -2157,13 +2167,15 @@ static int ext4_create_journal(struct super_block * sb,
2157 return -EROFS; 2167 return -EROFS;
2158 } 2168 }
2159 2169
2160 if (!(journal = ext4_get_journal(sb, journal_inum))) 2170 journal = ext4_get_journal(sb, journal_inum);
2171 if (!journal)
2161 return -EINVAL; 2172 return -EINVAL;
2162 2173
2163 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n", 2174 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
2164 journal_inum); 2175 journal_inum);
2165 2176
2166 if (jbd2_journal_create(journal)) { 2177 err = jbd2_journal_create(journal);
2178 if (err) {
2167 printk(KERN_ERR "EXT4-fs: error creating journal.\n"); 2179 printk(KERN_ERR "EXT4-fs: error creating journal.\n");
2168 jbd2_journal_destroy(journal); 2180 jbd2_journal_destroy(journal);
2169 return -EIO; 2181 return -EIO;
@@ -2214,12 +2226,14 @@ static void ext4_mark_recovery_complete(struct super_block * sb,
2214 2226
2215 jbd2_journal_lock_updates(journal); 2227 jbd2_journal_lock_updates(journal);
2216 jbd2_journal_flush(journal); 2228 jbd2_journal_flush(journal);
2229 lock_super(sb);
2217 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && 2230 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
2218 sb->s_flags & MS_RDONLY) { 2231 sb->s_flags & MS_RDONLY) {
2219 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 2232 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2220 sb->s_dirt = 0; 2233 sb->s_dirt = 0;
2221 ext4_commit_super(sb, es, 1); 2234 ext4_commit_super(sb, es, 1);
2222 } 2235 }
2236 unlock_super(sb);
2223 jbd2_journal_unlock_updates(journal); 2237 jbd2_journal_unlock_updates(journal);
2224} 2238}
2225 2239
@@ -2408,7 +2422,13 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
2408 (sbi->s_mount_state & EXT4_VALID_FS)) 2422 (sbi->s_mount_state & EXT4_VALID_FS))
2409 es->s_state = cpu_to_le16(sbi->s_mount_state); 2423 es->s_state = cpu_to_le16(sbi->s_mount_state);
2410 2424
2425 /*
2426 * We have to unlock super so that we can wait for
2427 * transactions.
2428 */
2429 unlock_super(sb);
2411 ext4_mark_recovery_complete(sb, es); 2430 ext4_mark_recovery_complete(sb, es);
2431 lock_super(sb);
2412 } else { 2432 } else {
2413 __le32 ret; 2433 __le32 ret;
2414 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 2434 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -2481,19 +2501,19 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
2481 struct super_block *sb = dentry->d_sb; 2501 struct super_block *sb = dentry->d_sb;
2482 struct ext4_sb_info *sbi = EXT4_SB(sb); 2502 struct ext4_sb_info *sbi = EXT4_SB(sb);
2483 struct ext4_super_block *es = sbi->s_es; 2503 struct ext4_super_block *es = sbi->s_es;
2484 ext4_fsblk_t overhead;
2485 int i;
2486 u64 fsid; 2504 u64 fsid;
2487 2505
2488 if (test_opt (sb, MINIX_DF)) 2506 if (test_opt(sb, MINIX_DF)) {
2489 overhead = 0; 2507 sbi->s_overhead_last = 0;
2490 else { 2508 } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
2491 unsigned long ngroups; 2509 unsigned long ngroups = sbi->s_groups_count, i;
2492 ngroups = EXT4_SB(sb)->s_groups_count; 2510 ext4_fsblk_t overhead = 0;
2493 smp_rmb(); 2511 smp_rmb();
2494 2512
2495 /* 2513 /*
2496 * Compute the overhead (FS structures) 2514 * Compute the overhead (FS structures). This is constant
2515 * for a given filesystem unless the number of block groups
2516 * changes so we cache the previous value until it does.
2497 */ 2517 */
2498 2518
2499 /* 2519 /*
@@ -2517,18 +2537,23 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
2517 * Every block group has an inode bitmap, a block 2537 * Every block group has an inode bitmap, a block
2518 * bitmap, and an inode table. 2538 * bitmap, and an inode table.
2519 */ 2539 */
2520 overhead += (ngroups * (2 + EXT4_SB(sb)->s_itb_per_group)); 2540 overhead += ngroups * (2 + sbi->s_itb_per_group);
2541 sbi->s_overhead_last = overhead;
2542 smp_wmb();
2543 sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
2521 } 2544 }
2522 2545
2523 buf->f_type = EXT4_SUPER_MAGIC; 2546 buf->f_type = EXT4_SUPER_MAGIC;
2524 buf->f_bsize = sb->s_blocksize; 2547 buf->f_bsize = sb->s_blocksize;
2525 buf->f_blocks = ext4_blocks_count(es) - overhead; 2548 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
2526 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter); 2549 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
2550 es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
2527 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 2551 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
2528 if (buf->f_bfree < ext4_r_blocks_count(es)) 2552 if (buf->f_bfree < ext4_r_blocks_count(es))
2529 buf->f_bavail = 0; 2553 buf->f_bavail = 0;
2530 buf->f_files = le32_to_cpu(es->s_inodes_count); 2554 buf->f_files = le32_to_cpu(es->s_inodes_count);
2531 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter); 2555 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
2556 es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
2532 buf->f_namelen = EXT4_NAME_LEN; 2557 buf->f_namelen = EXT4_NAME_LEN;
2533 fsid = le64_to_cpup((void *)es->s_uuid) ^ 2558 fsid = le64_to_cpup((void *)es->s_uuid) ^
2534 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 2559 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index ccf161dffb63..72cbcd61bd95 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -313,7 +313,7 @@ int fat_search_long(struct inode *inode, const unsigned char *name,
313 wchar_t bufuname[14]; 313 wchar_t bufuname[14];
314 unsigned char xlate_len, nr_slots; 314 unsigned char xlate_len, nr_slots;
315 wchar_t *unicode = NULL; 315 wchar_t *unicode = NULL;
316 unsigned char work[8], bufname[260]; /* 256 + 4 */ 316 unsigned char work[MSDOS_NAME], bufname[260]; /* 256 + 4 */
317 int uni_xlate = sbi->options.unicode_xlate; 317 int uni_xlate = sbi->options.unicode_xlate;
318 int utf8 = sbi->options.utf8; 318 int utf8 = sbi->options.utf8;
319 int anycase = (sbi->options.name_check != 's'); 319 int anycase = (sbi->options.name_check != 's');
@@ -351,7 +351,8 @@ parse_record:
351 if (work[0] == 0x05) 351 if (work[0] == 0x05)
352 work[0] = 0xE5; 352 work[0] = 0xE5;
353 for (i = 0, j = 0, last_u = 0; i < 8;) { 353 for (i = 0, j = 0, last_u = 0; i < 8;) {
354 if (!work[i]) break; 354 if (!work[i])
355 break;
355 chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, 356 chl = fat_shortname2uni(nls_disk, &work[i], 8 - i,
356 &bufuname[j++], opt_shortname, 357 &bufuname[j++], opt_shortname,
357 de->lcase & CASE_LOWER_BASE); 358 de->lcase & CASE_LOWER_BASE);
@@ -365,13 +366,15 @@ parse_record:
365 } 366 }
366 j = last_u; 367 j = last_u;
367 fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); 368 fat_short2uni(nls_disk, ".", 1, &bufuname[j++]);
368 for (i = 0; i < 3;) { 369 for (i = 8; i < MSDOS_NAME;) {
369 if (!de->ext[i]) break; 370 if (!work[i])
370 chl = fat_shortname2uni(nls_disk, &de->ext[i], 3 - i, 371 break;
372 chl = fat_shortname2uni(nls_disk, &work[i],
373 MSDOS_NAME - i,
371 &bufuname[j++], opt_shortname, 374 &bufuname[j++], opt_shortname,
372 de->lcase & CASE_LOWER_EXT); 375 de->lcase & CASE_LOWER_EXT);
373 if (chl <= 1) { 376 if (chl <= 1) {
374 if (de->ext[i] != ' ') 377 if (work[i] != ' ')
375 last_u = j; 378 last_u = j;
376 } else { 379 } else {
377 last_u = j; 380 last_u = j;
@@ -445,7 +448,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
445 int fill_len; 448 int fill_len;
446 wchar_t bufuname[14]; 449 wchar_t bufuname[14];
447 wchar_t *unicode = NULL; 450 wchar_t *unicode = NULL;
448 unsigned char c, work[8], bufname[56], *ptname = bufname; 451 unsigned char c, work[MSDOS_NAME], bufname[56], *ptname = bufname;
449 unsigned long lpos, dummy, *furrfu = &lpos; 452 unsigned long lpos, dummy, *furrfu = &lpos;
450 int uni_xlate = sbi->options.unicode_xlate; 453 int uni_xlate = sbi->options.unicode_xlate;
451 int isvfat = sbi->options.isvfat; 454 int isvfat = sbi->options.isvfat;
@@ -527,7 +530,8 @@ parse_record:
527 if (work[0] == 0x05) 530 if (work[0] == 0x05)
528 work[0] = 0xE5; 531 work[0] = 0xE5;
529 for (i = 0, j = 0, last = 0, last_u = 0; i < 8;) { 532 for (i = 0, j = 0, last = 0, last_u = 0; i < 8;) {
530 if (!(c = work[i])) break; 533 if (!(c = work[i]))
534 break;
531 chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, 535 chl = fat_shortname2uni(nls_disk, &work[i], 8 - i,
532 &bufuname[j++], opt_shortname, 536 &bufuname[j++], opt_shortname,
533 de->lcase & CASE_LOWER_BASE); 537 de->lcase & CASE_LOWER_BASE);
@@ -549,9 +553,10 @@ parse_record:
549 j = last_u; 553 j = last_u;
550 fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); 554 fat_short2uni(nls_disk, ".", 1, &bufuname[j++]);
551 ptname[i++] = '.'; 555 ptname[i++] = '.';
552 for (i2 = 0; i2 < 3;) { 556 for (i2 = 8; i2 < MSDOS_NAME;) {
553 if (!(c = de->ext[i2])) break; 557 if (!(c = work[i2]))
554 chl = fat_shortname2uni(nls_disk, &de->ext[i2], 3 - i2, 558 break;
559 chl = fat_shortname2uni(nls_disk, &work[i2], MSDOS_NAME - i2,
555 &bufuname[j++], opt_shortname, 560 &bufuname[j++], opt_shortname,
556 de->lcase & CASE_LOWER_EXT); 561 de->lcase & CASE_LOWER_EXT);
557 if (chl <= 1) { 562 if (chl <= 1) {
@@ -563,8 +568,8 @@ parse_record:
563 } 568 }
564 } else { 569 } else {
565 last_u = j; 570 last_u = j;
566 for (chi = 0; chi < chl && i2 < 3; chi++) { 571 for (chi = 0; chi < chl && i2 < MSDOS_NAME; chi++) {
567 ptname[i++] = de->ext[i2++]; 572 ptname[i++] = work[i2++];
568 last = i; 573 last = i;
569 } 574 }
570 } 575 }
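
[Aside: the dir.c changes drop the separate de->ext pointer and treat the on-disk short name as one 11-byte work[MSDOS_NAME] buffer: eight space-padded base characters followed by three extension characters, with no dot stored. An illustrative decoder — plain C, ASCII names only; real entries also need the 0x05/0xE5 first-byte fixup shown in the hunk:]

#include <stdio.h>

#define MSDOS_NAME 11

static void fat_pretty_name(const unsigned char work[MSDOS_NAME], char *out)
{
	int i, n = 0;

	for (i = 0; i < 8 && work[i] != ' '; i++)
		out[n++] = work[i];
	if (work[8] != ' ') {
		out[n++] = '.';
		for (i = 8; i < MSDOS_NAME && work[i] != ' '; i++)
			out[n++] = work[i];
	}
	out[n] = '\0';
}

int main(void)
{
	char buf[13];	/* 8 + '.' + 3 + NUL */

	fat_pretty_name((const unsigned char *)"README  TXT", buf);
	printf("%s\n", buf);	/* README.TXT */
	return 0;
}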
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index ab171ea8e869..2c1b73fb82ae 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -17,6 +17,8 @@ struct fatent_operations {
17 int (*ent_next)(struct fat_entry *); 17 int (*ent_next)(struct fat_entry *);
18}; 18};
19 19
20static DEFINE_SPINLOCK(fat12_entry_lock);
21
20static void fat12_ent_blocknr(struct super_block *sb, int entry, 22static void fat12_ent_blocknr(struct super_block *sb, int entry,
21 int *offset, sector_t *blocknr) 23 int *offset, sector_t *blocknr)
22{ 24{
@@ -116,10 +118,13 @@ static int fat12_ent_get(struct fat_entry *fatent)
116 u8 **ent12_p = fatent->u.ent12_p; 118 u8 **ent12_p = fatent->u.ent12_p;
117 int next; 119 int next;
118 120
121 spin_lock(&fat12_entry_lock);
119 if (fatent->entry & 1) 122 if (fatent->entry & 1)
120 next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4); 123 next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4);
121 else 124 else
122 next = (*ent12_p[1] << 8) | *ent12_p[0]; 125 next = (*ent12_p[1] << 8) | *ent12_p[0];
126 spin_unlock(&fat12_entry_lock);
127
123 next &= 0x0fff; 128 next &= 0x0fff;
124 if (next >= BAD_FAT12) 129 if (next >= BAD_FAT12)
125 next = FAT_ENT_EOF; 130 next = FAT_ENT_EOF;
@@ -151,6 +156,7 @@ static void fat12_ent_put(struct fat_entry *fatent, int new)
151 if (new == FAT_ENT_EOF) 156 if (new == FAT_ENT_EOF)
152 new = EOF_FAT12; 157 new = EOF_FAT12;
153 158
159 spin_lock(&fat12_entry_lock);
154 if (fatent->entry & 1) { 160 if (fatent->entry & 1) {
155 *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f); 161 *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f);
156 *ent12_p[1] = new >> 4; 162 *ent12_p[1] = new >> 4;
@@ -158,6 +164,7 @@ static void fat12_ent_put(struct fat_entry *fatent, int new)
158 *ent12_p[0] = new & 0xff; 164 *ent12_p[0] = new & 0xff;
159 *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8); 165 *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8);
160 } 166 }
167 spin_unlock(&fat12_entry_lock);
161 168
162 mark_buffer_dirty(fatent->bhs[0]); 169 mark_buffer_dirty(fatent->bhs[0]);
163 if (fatent->nr_bhs == 2) 170 if (fatent->nr_bhs == 2)
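
[Aside: fat12_entry_lock exists because FAT12 packs two 12-bit entries into three bytes, so neighbouring entries share a byte and an unserialized reader can observe a half-updated value. A user-space model of the same packing; the offset math is the standard entry * 1.5, where the kernel reaches the bytes via its ent12_p[] pointers instead:]

#include <stdio.h>

/* entries i and i+1 occupy 3 bytes; the middle byte is shared */
static unsigned fat12_get(const unsigned char *fat, unsigned entry)
{
	unsigned off = entry + (entry / 2);	/* entry * 1.5 */

	if (entry & 1)
		return (fat[off] >> 4) | (fat[off + 1] << 4);
	return fat[off] | ((fat[off + 1] & 0x0f) << 8);
}

static void fat12_put(unsigned char *fat, unsigned entry, unsigned val)
{
	unsigned off = entry + (entry / 2);

	if (entry & 1) {
		fat[off] = (val << 4) | (fat[off] & 0x0f);
		fat[off + 1] = val >> 4;
	} else {
		fat[off] = val & 0xff;
		fat[off + 1] = (fat[off + 1] & 0xf0) | (val >> 8);
	}
}

int main(void)
{
	unsigned char fat[3] = { 0 };

	fat12_put(fat, 0, 0xABC);
	fat12_put(fat, 1, 0x123);
	printf("%03X %03X\n", fat12_get(fat, 0), fat12_get(fat, 1)); /* ABC 123 */
	return 0;
}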
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 479722d89667..0a7ddb39a593 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -20,6 +20,7 @@
20#include <linux/pagemap.h> 20#include <linux/pagemap.h>
21#include <linux/mpage.h> 21#include <linux/mpage.h>
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/exportfs.h>
23#include <linux/mount.h> 24#include <linux/mount.h>
24#include <linux/vfs.h> 25#include <linux/vfs.h>
25#include <linux/parser.h> 26#include <linux/parser.h>
@@ -354,8 +355,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
354 } else { /* not a directory */ 355 } else { /* not a directory */
355 inode->i_generation |= 1; 356 inode->i_generation |= 1;
356 inode->i_mode = MSDOS_MKMODE(de->attr, 357 inode->i_mode = MSDOS_MKMODE(de->attr,
357 ((sbi->options.showexec && 358 ((sbi->options.showexec && !is_exec(de->name + 8))
358 !is_exec(de->ext))
359 ? S_IRUGO|S_IWUGO : S_IRWXUGO) 359 ? S_IRUGO|S_IWUGO : S_IRWXUGO)
360 & ~sbi->options.fs_fmask) | S_IFREG; 360 & ~sbi->options.fs_fmask) | S_IFREG;
361 MSDOS_I(inode)->i_start = le16_to_cpu(de->start); 361 MSDOS_I(inode)->i_start = le16_to_cpu(de->start);
diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h
index 8a4dfef1ddad..3c96d6e63978 100644
--- a/fs/freevxfs/vxfs_dir.h
+++ b/fs/freevxfs/vxfs_dir.h
@@ -80,7 +80,7 @@ struct vxfs_direct {
80 * a d_name with size len. 80 * a d_name with size len.
81 */ 81 */
82#define VXFS_DIRPAD 4 82#define VXFS_DIRPAD 4
83#define VXFS_NAMEMIN ((int)((struct vxfs_direct *)0)->d_name) 83#define VXFS_NAMEMIN offsetof(struct vxfs_direct, d_name)
84#define VXFS_DIRROUND(len) ((VXFS_DIRPAD + (len) - 1) & ~(VXFS_DIRPAD -1)) 84#define VXFS_DIRROUND(len) ((VXFS_DIRPAD + (len) - 1) & ~(VXFS_DIRPAD -1))
85#define VXFS_DIRLEN(len) (VXFS_DIRROUND(VXFS_NAMEMIN + (len))) 85#define VXFS_DIRLEN(len) (VXFS_DIRROUND(VXFS_NAMEMIN + (len)))
86 86
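
[Aside: the VXFS_NAMEMIN change swaps a null-pointer dereference trick for offsetof(), which yields the same constant without undefined behaviour. A minimal demonstration using a stand-in struct, not the real vxfs_direct layout:]

#include <stddef.h>
#include <stdio.h>

struct direct {
	unsigned int	d_ino;
	unsigned short	d_reclen;
	unsigned short	d_namelen;
	char		d_name[1];
};

int main(void)
{
	/* 8 on common ABIs: 4 + 2 + 2 bytes precede d_name */
	printf("name offset: %zu\n", offsetof(struct direct, d_name));
	return 0;
}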
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index c1f44009853f..1ab3e9d73886 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -11,6 +11,7 @@
11#include <linux/spinlock.h> 11#include <linux/spinlock.h>
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/capability.h>
14#include <linux/xattr.h> 15#include <linux/xattr.h>
15#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h> 17#include <linux/lm_interface.h>
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 99ea5659bc2c..b8312edee0e4 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -11,6 +11,7 @@
11#include <linux/spinlock.h> 11#include <linux/spinlock.h>
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/exportfs.h>
14#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 16#include <linux/crc32.h>
16#include <linux/lm_interface.h> 17#include <linux/lm_interface.h>
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 90ebab753d30..050d29c0a5b5 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -62,8 +62,10 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
62 if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) && 62 if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) &&
63 (head->key_type == HFSPLUS_KEY_BINARY)) 63 (head->key_type == HFSPLUS_KEY_BINARY))
64 tree->keycmp = hfsplus_cat_bin_cmp_key; 64 tree->keycmp = hfsplus_cat_bin_cmp_key;
65 else 65 else {
66 tree->keycmp = hfsplus_cat_case_cmp_key; 66 tree->keycmp = hfsplus_cat_case_cmp_key;
67 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD;
68 }
67 } else { 69 } else {
68 printk(KERN_ERR "hfs: unknown B*Tree requested\n"); 70 printk(KERN_ERR "hfs: unknown B*Tree requested\n");
69 goto fail_page; 71 goto fail_page;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 80b5682a2273..1955ee61251c 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -36,6 +36,8 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
36 u16 type; 36 u16 type;
37 37
38 sb = dir->i_sb; 38 sb = dir->i_sb;
39
40 dentry->d_op = &hfsplus_dentry_operations;
39 dentry->d_fsdata = NULL; 41 dentry->d_fsdata = NULL;
40 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 42 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
41 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); 43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 3915635b4470..d9f5eda6d039 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -150,6 +150,7 @@ struct hfsplus_sb_info {
150#define HFSPLUS_SB_NODECOMPOSE 0x0002 150#define HFSPLUS_SB_NODECOMPOSE 0x0002
151#define HFSPLUS_SB_FORCE 0x0004 151#define HFSPLUS_SB_FORCE 0x0004
152#define HFSPLUS_SB_HFSX 0x0008 152#define HFSPLUS_SB_HFSX 0x0008
153#define HFSPLUS_SB_CASEFOLD 0x0010
153 154
154 155
155struct hfsplus_inode_info { 156struct hfsplus_inode_info {
@@ -321,6 +322,7 @@ void hfsplus_file_truncate(struct inode *);
321/* inode.c */ 322/* inode.c */
322extern const struct address_space_operations hfsplus_aops; 323extern const struct address_space_operations hfsplus_aops;
323extern const struct address_space_operations hfsplus_btree_aops; 324extern const struct address_space_operations hfsplus_btree_aops;
325extern struct dentry_operations hfsplus_dentry_operations;
324 326
325void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *); 327void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *);
326void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *); 328void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *);
@@ -353,6 +355,8 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unist
353int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); 355int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *);
354int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); 356int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *);
355int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int); 357int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int);
358int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str);
359int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2);
356 360
357/* wrapper.c */ 361/* wrapper.c */
358int hfsplus_read_wrapper(struct super_block *); 362int hfsplus_read_wrapper(struct super_block *);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 409ce5429c91..6f7c662174db 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -131,6 +131,11 @@ const struct address_space_operations hfsplus_aops = {
131 .writepages = hfsplus_writepages, 131 .writepages = hfsplus_writepages,
132}; 132};
133 133
134struct dentry_operations hfsplus_dentry_operations = {
135 .d_hash = hfsplus_hash_dentry,
136 .d_compare = hfsplus_compare_dentry,
137};
138
134static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry, 139static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry,
135 struct nameidata *nd) 140 struct nameidata *nd)
136{ 141{
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index ebd1b380cbbc..6d87a2a9534d 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -283,11 +283,10 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
283 struct nls_table *nls = NULL; 283 struct nls_table *nls = NULL;
284 int err = -EINVAL; 284 int err = -EINVAL;
285 285
286 sbi = kmalloc(sizeof(struct hfsplus_sb_info), GFP_KERNEL); 286 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
287 if (!sbi) 287 if (!sbi)
288 return -ENOMEM; 288 return -ENOMEM;
289 289
290 memset(sbi, 0, sizeof(HFSPLUS_SB(sb)));
291 sb->s_fs_info = sbi; 290 sb->s_fs_info = sbi;
292 INIT_HLIST_HEAD(&sbi->rsrc_inodes); 291 INIT_HLIST_HEAD(&sbi->rsrc_inodes);
293 hfsplus_fill_defaults(sbi); 292 hfsplus_fill_defaults(sbi);
@@ -381,6 +380,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
381 iput(root); 380 iput(root);
382 goto cleanup; 381 goto cleanup;
383 } 382 }
383 sb->s_root->d_op = &hfsplus_dentry_operations;
384 384
385 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 385 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
386 str.name = HFSP_HIDDENDIR_NAME; 386 str.name = HFSP_HIDDENDIR_NAME;
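
[Aside: kzalloc(sizeof(*sbi), ...) allocates and zeroes in one step and ties the size to the pointed-to type, so it cannot drift from the allocation the way a separate memset() size expression can. A user-space analogue of the idiom:]

#include <stdlib.h>

struct sb_info { int flags; /* ... */ };

int main(void)
{
	struct sb_info *sbi = calloc(1, sizeof(*sbi)); /* like kzalloc() */
	if (!sbi)
		return 1;
	free(sbi);
	return 0;
}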
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 689c8bd721fb..9e10f9444b64 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -239,61 +239,201 @@ out:
239 return res; 239 return res;
240} 240}
241 241
242int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, const char *astr, int len) 242/*
243 * Convert one or more ASCII characters into a single unicode character.
244 * Returns the number of ASCII characters corresponding to the unicode char.
245 */
246static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
247 wchar_t *uc)
243{ 248{
244 struct nls_table *nls = HFSPLUS_SB(sb).nls; 249 int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc);
245 int size, off, decompose; 250 if (size <= 0) {
251 *uc = '?';
252 size = 1;
253 }
254 switch (*uc) {
255 case 0x2400:
256 *uc = 0;
257 break;
258 case ':':
259 *uc = '/';
260 break;
261 }
262 return size;
263}
264
265/* Decomposes a single unicode character. */
266static inline u16 *decompose_unichar(wchar_t uc, int *size)
267{
268 int off;
269
270 off = hfsplus_decompose_table[(uc >> 12) & 0xf];
271 if (off == 0 || off == 0xffff)
272 return NULL;
273
274 off = hfsplus_decompose_table[off + ((uc >> 8) & 0xf)];
275 if (!off)
276 return NULL;
277
278 off = hfsplus_decompose_table[off + ((uc >> 4) & 0xf)];
279 if (!off)
280 return NULL;
281
282 off = hfsplus_decompose_table[off + (uc & 0xf)];
283 *size = off & 3;
284 if (*size == 0)
285 return NULL;
286 return hfsplus_decompose_table + (off / 4);
287}
288
289int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
290 const char *astr, int len)
291{
292 int size, dsize, decompose;
293 u16 *dstr, outlen = 0;
246 wchar_t c; 294 wchar_t c;
247 u16 outlen = 0;
248 295
249 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 296 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
250
251 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { 297 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
252 size = nls->char2uni(astr, len, &c); 298 size = asc2unichar(sb, astr, len, &c);
253 if (size <= 0) { 299
254 c = '?'; 300 if (decompose && (dstr = decompose_unichar(c, &dsize))) {
255 size = 1; 301 if (outlen + dsize > HFSPLUS_MAX_STRLEN)
256 }
257 astr += size;
258 len -= size;
259 switch (c) {
260 case 0x2400:
261 c = 0;
262 break;
263 case ':':
264 c = '/';
265 break;
266 }
267 if (c >= 0xc0 && decompose) {
268 off = hfsplus_decompose_table[(c >> 12) & 0xf];
269 if (!off)
270 goto done;
271 if (off == 0xffff) {
272 goto done;
273 }
274 off = hfsplus_decompose_table[off + ((c >> 8) & 0xf)];
275 if (!off)
276 goto done;
277 off = hfsplus_decompose_table[off + ((c >> 4) & 0xf)];
278 if (!off)
279 goto done;
280 off = hfsplus_decompose_table[off + (c & 0xf)];
281 size = off & 3;
282 if (!size)
283 goto done;
284 off /= 4;
285 if (outlen + size > HFSPLUS_MAX_STRLEN)
286 break; 302 break;
287 do { 303 do {
288 ustr->unicode[outlen++] = cpu_to_be16(hfsplus_decompose_table[off++]); 304 ustr->unicode[outlen++] = cpu_to_be16(*dstr++);
289 } while (--size > 0); 305 } while (--dsize > 0);
290 continue; 306 } else
291 } 307 ustr->unicode[outlen++] = cpu_to_be16(c);
292 done: 308
293 ustr->unicode[outlen++] = cpu_to_be16(c); 309 astr += size;
310 len -= size;
294 } 311 }
295 ustr->length = cpu_to_be16(outlen); 312 ustr->length = cpu_to_be16(outlen);
296 if (len > 0) 313 if (len > 0)
297 return -ENAMETOOLONG; 314 return -ENAMETOOLONG;
298 return 0; 315 return 0;
299} 316}
317
318/*
319 * Hash a string to an integer as appropriate for the HFS+ filesystem.
320 * Composed unicode characters are decomposed and case-folding is performed
321 * if the appropriate bits are (un)set on the superblock.
322 */
323int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
324{
325 struct super_block *sb = dentry->d_sb;
326 const char *astr;
327 const u16 *dstr;
328 int casefold, decompose, size, dsize, len;
329 unsigned long hash;
330 wchar_t c;
331 u16 c2;
332
333 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
334 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
335 hash = init_name_hash();
336 astr = str->name;
337 len = str->len;
338 while (len > 0) {
339 size = asc2unichar(sb, astr, len, &c);
340 astr += size;
341 len -= size;
342
343 if (decompose && (dstr = decompose_unichar(c, &dsize))) {
344 do {
345 c2 = *dstr++;
346 if (!casefold || (c2 = case_fold(c2)))
347 hash = partial_name_hash(c2, hash);
348 } while (--dsize > 0);
349 } else {
350 c2 = c;
351 if (!casefold || (c2 = case_fold(c2)))
352 hash = partial_name_hash(c2, hash);
353 }
354 }
355 str->hash = end_name_hash(hash);
356
357 return 0;
358}
359
360/*
361 * Compare strings with HFS+ filename ordering.
362 * Composed unicode characters are decomposed and case-folding is performed
363 * if the appropriate bits are (un)set on the superblock.
364 */
365int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2)
366{
367 struct super_block *sb = dentry->d_sb;
368 int casefold, decompose, size;
369 int dsize1, dsize2, len1, len2;
370 const u16 *dstr1, *dstr2;
371 const char *astr1, *astr2;
372 u16 c1, c2;
373 wchar_t c;
374
375 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
376 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
377 astr1 = s1->name;
378 len1 = s1->len;
379 astr2 = s2->name;
380 len2 = s2->len;
381 dsize1 = dsize2 = 0;
382 dstr1 = dstr2 = NULL;
383
384 while (len1 > 0 && len2 > 0) {
385 if (!dsize1) {
386 size = asc2unichar(sb, astr1, len1, &c);
387 astr1 += size;
388 len1 -= size;
389
390 if (!decompose || !(dstr1 = decompose_unichar(c, &dsize1))) {
391 c1 = c;
392 dstr1 = &c1;
393 dsize1 = 1;
394 }
395 }
396
397 if (!dsize2) {
398 size = asc2unichar(sb, astr2, len2, &c);
399 astr2 += size;
400 len2 -= size;
401
402 if (!decompose || !(dstr2 = decompose_unichar(c, &dsize2))) {
403 c2 = c;
404 dstr2 = &c2;
405 dsize2 = 1;
406 }
407 }
408
409 c1 = *dstr1;
410 c2 = *dstr2;
411 if (casefold) {
412 if (!(c1 = case_fold(c1))) {
413 dstr1++;
414 dsize1--;
415 continue;
416 }
417 if (!(c2 = case_fold(c2))) {
418 dstr2++;
419 dsize2--;
420 continue;
421 }
422 }
423 if (c1 < c2)
424 return -1;
425 else if (c1 > c2)
426 return 1;
427
428 dstr1++;
429 dsize1--;
430 dstr2++;
431 dsize2--;
432 }
433
434 if (len1 < len2)
435 return -1;
436 if (len1 > len2)
437 return 1;
438 return 0;
439}
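
[Aside: decompose_unichar() above walks a four-level table indexed by successive nibbles of the character; zero means "no decomposition", and the final word packs a table offset in its high bits with a length in its low two bits. The sketch below rebuilds that walk over a toy table — the data is invented for illustration and is not real HFS+ decomposition data:]

#include <stdio.h>

static const unsigned short table[66] = {
	[1]  = 16,	/* level 0, indexed by (uc >> 12) & 0xf	*/
	[16] = 32,	/* level 1, indexed by (uc >> 8) & 0xf	*/
	[32] = 48,	/* level 2, indexed by (uc >> 4) & 0xf	*/
	[48] = 258,	/* level 3: 258/4 = 64, 258&3 = len 2	*/
	[64] = 0x0041,	/* decomposition: 'A' + combining acute	*/
	[65] = 0x0301,
};

static const unsigned short *decompose(unsigned uc, int *size)
{
	int off = table[(uc >> 12) & 0xf];
	if (off == 0 || off == 0xffff)
		return NULL;
	off = table[off + ((uc >> 8) & 0xf)];
	if (!off)
		return NULL;
	off = table[off + ((uc >> 4) & 0xf)];
	if (!off)
		return NULL;
	off = table[off + (uc & 0xf)];
	*size = off & 3;
	if (*size == 0)
		return NULL;
	return table + (off / 4);
}

int main(void)
{
	int n;
	const unsigned short *d = decompose(0x1000, &n);

	if (d)
		printf("%d code points: %04X %04X\n", n, d[0], d[1]);
	return 0;
}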
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e6b46b3ac2fe..d145cb79c30a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -13,15 +13,18 @@
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/mount.h> 14#include <linux/mount.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/kernel.h>
16#include <linux/writeback.h> 17#include <linux/writeback.h>
17#include <linux/pagemap.h> 18#include <linux/pagemap.h>
18#include <linux/highmem.h> 19#include <linux/highmem.h>
19#include <linux/init.h> 20#include <linux/init.h>
20#include <linux/string.h> 21#include <linux/string.h>
21#include <linux/capability.h> 22#include <linux/capability.h>
23#include <linux/ctype.h>
22#include <linux/backing-dev.h> 24#include <linux/backing-dev.h>
23#include <linux/hugetlb.h> 25#include <linux/hugetlb.h>
24#include <linux/pagevec.h> 26#include <linux/pagevec.h>
27#include <linux/parser.h>
25#include <linux/mman.h> 28#include <linux/mman.h>
26#include <linux/quotaops.h> 29#include <linux/quotaops.h>
27#include <linux/slab.h> 30#include <linux/slab.h>
@@ -47,6 +50,21 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = {
47 50
48int sysctl_hugetlb_shm_group; 51int sysctl_hugetlb_shm_group;
49 52
53enum {
54 Opt_size, Opt_nr_inodes,
55 Opt_mode, Opt_uid, Opt_gid,
56 Opt_err,
57};
58
59static match_table_t tokens = {
60 {Opt_size, "size=%s"},
61 {Opt_nr_inodes, "nr_inodes=%s"},
62 {Opt_mode, "mode=%o"},
63 {Opt_uid, "uid=%u"},
64 {Opt_gid, "gid=%u"},
65 {Opt_err, NULL},
66};
67
50static void huge_pagevec_release(struct pagevec *pvec) 68static void huge_pagevec_release(struct pagevec *pvec)
51{ 69{
52 int i; 70 int i;
@@ -594,46 +612,73 @@ static const struct super_operations hugetlbfs_ops = {
594static int 612static int
595hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) 613hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
596{ 614{
597 char *opt, *value, *rest; 615 char *p, *rest;
616 substring_t args[MAX_OPT_ARGS];
617 int option;
598 618
599 if (!options) 619 if (!options)
600 return 0; 620 return 0;
601 while ((opt = strsep(&options, ",")) != NULL) { 621
602 if (!*opt) 622 while ((p = strsep(&options, ",")) != NULL) {
623 int token;
624 if (!*p)
603 continue; 625 continue;
604 626
605 value = strchr(opt, '='); 627 token = match_token(p, tokens, args);
606 if (!value || !*value) 628 switch (token) {
607 return -EINVAL; 629 case Opt_uid:
608 else 630 if (match_int(&args[0], &option))
609 *value++ = '\0'; 631 goto bad_val;
610 632 pconfig->uid = option;
611 if (!strcmp(opt, "uid")) 633 break;
612 pconfig->uid = simple_strtoul(value, &value, 0); 634
613 else if (!strcmp(opt, "gid")) 635 case Opt_gid:
614 pconfig->gid = simple_strtoul(value, &value, 0); 636 if (match_int(&args[0], &option))
615 else if (!strcmp(opt, "mode")) 637 goto bad_val;
616 pconfig->mode = simple_strtoul(value,&value,0) & 0777U; 638 pconfig->gid = option;
617 else if (!strcmp(opt, "size")) { 639 break;
618 unsigned long long size = memparse(value, &rest); 640
641 case Opt_mode:
642 if (match_octal(&args[0], &option))
643 goto bad_val;
644 pconfig->mode = option & 0777U;
645 break;
646
647 case Opt_size: {
648 unsigned long long size;
649 /* memparse() will accept a K/M/G without a digit */
650 if (!isdigit(*args[0].from))
651 goto bad_val;
652 size = memparse(args[0].from, &rest);
619 if (*rest == '%') { 653 if (*rest == '%') {
620 size <<= HPAGE_SHIFT; 654 size <<= HPAGE_SHIFT;
621 size *= max_huge_pages; 655 size *= max_huge_pages;
622 do_div(size, 100); 656 do_div(size, 100);
623 rest++;
624 } 657 }
625 pconfig->nr_blocks = (size >> HPAGE_SHIFT); 658 pconfig->nr_blocks = (size >> HPAGE_SHIFT);
626 value = rest; 659 break;
627 } else if (!strcmp(opt,"nr_inodes")) { 660 }
628 pconfig->nr_inodes = memparse(value, &rest); 661
629 value = rest; 662 case Opt_nr_inodes:
630 } else 663 /* memparse() will accept a K/M/G without a digit */
631 return -EINVAL; 664 if (!isdigit(*args[0].from))
665 goto bad_val;
666 pconfig->nr_inodes = memparse(args[0].from, &rest);
667 break;
632 668
633 if (*value) 669 default:
670 printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n",
671 p);
634 return -EINVAL; 672 return -EINVAL;
673 break;
674 }
635 } 675 }
636 return 0; 676 return 0;
677
678bad_val:
679 printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
680 args[0].from, p);
681 return 1;
637} 682}
638 683
639static int 684static int
@@ -651,7 +696,6 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
651 config.gid = current->fsgid; 696 config.gid = current->fsgid;
652 config.mode = 0755; 697 config.mode = 0755;
653 ret = hugetlbfs_parse_options(data, &config); 698 ret = hugetlbfs_parse_options(data, &config);
654
655 if (ret) 699 if (ret)
656 return ret; 700 return ret;
657 701
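
The hugetlbfs_parse_options() rewrite above is the standard lib/parser conversion: a match_table_t maps option patterns to tokens, match_token() does the splitting and pattern matching, and match_int()/match_octal() replace open-coded simple_strtoul() calls. A hedged sketch of the same pattern for a hypothetical filesystem with a single uid= option follows; the myfs names are invented, while the parser calls are the real <linux/parser.h> API.

/*
 * Hedged sketch of the lib/parser pattern adopted above, for a
 * hypothetical filesystem with one "uid=%u" mount option.
 */
#include <linux/parser.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/types.h>

struct myfs_config { uid_t uid; };

enum { Opt_uid, Opt_err };

static match_table_t myfs_tokens = {
	{Opt_uid, "uid=%u"},
	{Opt_err, NULL},
};

static int myfs_parse_options(char *options, struct myfs_config *cfg)
{
	substring_t args[MAX_OPT_ARGS];
	char *p;
	int option;

	if (!options)
		return 0;

	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;

		switch (match_token(p, myfs_tokens, args)) {
		case Opt_uid:
			/* match_int() returns non-zero on a malformed value */
			if (match_int(&args[0], &option))
				return -EINVAL;
			cfg->uid = option;
			break;
		default:
			printk(KERN_ERR "myfs: unknown option \"%s\"\n", p);
			return -EINVAL;
		}
	}
	return 0;
}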
diff --git a/fs/inode.c b/fs/inode.c
index 9a012cc5b6cd..320e088d0b28 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -145,7 +145,7 @@ static struct inode *alloc_inode(struct super_block *sb)
145 mapping->a_ops = &empty_aops; 145 mapping->a_ops = &empty_aops;
146 mapping->host = inode; 146 mapping->host = inode;
147 mapping->flags = 0; 147 mapping->flags = 0;
148 mapping_set_gfp_mask(mapping, GFP_HIGHUSER); 148 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
149 mapping->assoc_mapping = NULL; 149 mapping->assoc_mapping = NULL;
150 mapping->backing_dev_info = &default_backing_dev_info; 150 mapping->backing_dev_info = &default_backing_dev_info;
151 151
@@ -462,6 +462,11 @@ static int shrink_icache_memory(int nr, gfp_t gfp_mask)
462 return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 462 return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
463} 463}
464 464
465static struct shrinker icache_shrinker = {
466 .shrink = shrink_icache_memory,
467 .seeks = DEFAULT_SEEKS,
468};
469
465static void __wait_on_freeing_inode(struct inode *inode); 470static void __wait_on_freeing_inode(struct inode *inode);
466/* 471/*
467 * Called with the inode lock held. 472 * Called with the inode lock held.
@@ -519,7 +524,13 @@ repeat:
519 * new_inode - obtain an inode 524 * new_inode - obtain an inode
520 * @sb: superblock 525 * @sb: superblock
521 * 526 *
522 * Allocates a new inode for given superblock. 527 * Allocates a new inode for given superblock. The default gfp_mask
528 * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE.
529 * If HIGHMEM pages are unsuitable or it is known that pages allocated
530 * for the page cache are not reclaimable or migratable,
531 * mapping_set_gfp_mask() must be called with suitable flags on the
532 * newly created inode's mapping
533 *
523 */ 534 */
524struct inode *new_inode(struct super_block *sb) 535struct inode *new_inode(struct super_block *sb)
525{ 536{
@@ -1379,7 +1390,7 @@ void __init inode_init(unsigned long mempages)
1379 SLAB_MEM_SPREAD), 1390 SLAB_MEM_SPREAD),
1380 init_once, 1391 init_once,
1381 NULL); 1392 NULL);
1382 set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); 1393 register_shrinker(&icache_shrinker);
1383 1394
1384 /* Hash may have been set up in inode_init_early */ 1395 /* Hash may have been set up in inode_init_early */
1385 if (!hashdist) 1396 if (!hashdist)
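
The fs/inode.c hunks above convert the icache from the old set_shrinker(seeks, fn) registration to a caller-owned struct shrinker. A hedged sketch of the new API in module form; the my_* names and the object counter are invented, but the callback contract (return the population when nr is 0, otherwise scan and return the remainder) is the one shrink_icache_memory() follows.

/*
 * Hedged sketch of the struct shrinker registration used above.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>

static int my_cached_objects;	/* invented stand-in for a real cache */

static int my_cache_shrink(int nr, gfp_t gfp_mask)
{
	if (nr) {
		/* ...drop up to nr cached objects here... */
		my_cached_objects -= min(nr, my_cached_objects);
	}
	return my_cached_objects;
}

static struct shrinker my_shrinker = {
	.shrink	= my_cache_shrink,
	.seeks	= DEFAULT_SEEKS,
};

static int __init my_init(void)
{
	register_shrinker(&my_shrinker);
	return 0;
}

static void __exit my_exit(void)
{
	unregister_shrinker(&my_shrinker);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");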
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 8c90cbc903fa..c2a773e8620b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -12,7 +12,6 @@
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/security.h> 13#include <linux/security.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/kallsyms.h>
16 15
17#include <asm/uaccess.h> 16#include <asm/uaccess.h>
18#include <asm/ioctls.h> 17#include <asm/ioctls.h>
@@ -21,7 +20,6 @@ static long do_ioctl(struct file *filp, unsigned int cmd,
21 unsigned long arg) 20 unsigned long arg)
22{ 21{
23 int error = -ENOTTY; 22 int error = -ENOTTY;
24 void *f;
25 23
26 if (!filp->f_op) 24 if (!filp->f_op)
27 goto out; 25 goto out;
@@ -31,16 +29,10 @@ static long do_ioctl(struct file *filp, unsigned int cmd,
31 if (error == -ENOIOCTLCMD) 29 if (error == -ENOIOCTLCMD)
32 error = -EINVAL; 30 error = -EINVAL;
33 goto out; 31 goto out;
34 } else if ((f = filp->f_op->ioctl)) { 32 } else if (filp->f_op->ioctl) {
35 lock_kernel(); 33 lock_kernel();
36 if (!filp->f_op->ioctl) { 34 error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
37 printk("%s: ioctl %p disappeared\n", __FUNCTION__, f); 35 filp, cmd, arg);
38 print_symbol("symbol: %s\n", (unsigned long)f);
39 dump_stack();
40 } else {
41 error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
42 filp, cmd, arg);
43 }
44 unlock_kernel(); 36 unlock_kernel();
45 } 37 }
46 38
@@ -182,11 +174,3 @@ asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
182 out: 174 out:
183 return error; 175 return error;
184} 176}
185
186/*
187 * Platforms implementing 32 bit compatibility ioctl handlers in
188 * modules need this exported
189 */
190#ifdef CONFIG_COMPAT
191EXPORT_SYMBOL(sys_ioctl);
192#endif
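
With the "ioctl disappeared" debugging gone, do_ioctl() above simply calls the legacy ->ioctl() method under the BKL. For reference, a hedged sketch of the handler shape this path dispatches to; MYFS_IOC_GETCOUNT and the counter are invented, and unknown commands conventionally return -ENOTTY.

/*
 * Hedged sketch of a legacy ->ioctl() handler. Runs under the BKL
 * when reached via the path shown above.
 */
#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/ioctl.h>
#include <asm/uaccess.h>

#define MYFS_IOC_GETCOUNT	_IOR('M', 1, int)	/* invented command */

static int myfs_count;

static int myfs_ioctl(struct inode *inode, struct file *filp,
		      unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case MYFS_IOC_GETCOUNT:
		if (put_user(myfs_count, (int __user *)arg))
			return -EFAULT;
		return 0;
	default:
		return -ENOTTY;
	}
}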
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 0e94c31cad9b..1ba407c64df1 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -7,34 +7,18 @@
7 * 7 *
8 * Steve Beynon : Missing last directory entries fixed 8 * Steve Beynon : Missing last directory entries fixed
9 * (stephen@askone.demon.co.uk) : 21st June 1996 9 * (stephen@askone.demon.co.uk) : 21st June 1996
10 * 10 *
11 * isofs directory handling functions 11 * isofs directory handling functions
12 */ 12 */
13#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
14#include "isofs.h" 14#include "isofs.h"
15 15
16static int isofs_readdir(struct file *, void *, filldir_t);
17
18const struct file_operations isofs_dir_operations =
19{
20 .read = generic_read_dir,
21 .readdir = isofs_readdir,
22};
23
24/*
25 * directories can handle most operations...
26 */
27const struct inode_operations isofs_dir_inode_operations =
28{
29 .lookup = isofs_lookup,
30};
31
32int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode) 16int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
33{ 17{
34 char * old = de->name; 18 char * old = de->name;
35 int len = de->name_len[0]; 19 int len = de->name_len[0];
36 int i; 20 int i;
37 21
38 for (i = 0; i < len; i++) { 22 for (i = 0; i < len; i++) {
39 unsigned char c = old[i]; 23 unsigned char c = old[i];
40 if (!c) 24 if (!c)
@@ -62,22 +46,27 @@ int isofs_name_translate(struct iso_directory_record *de, char *new, struct inod
62} 46}
63 47
64/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */ 48/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */
65int get_acorn_filename(struct iso_directory_record * de, 49int get_acorn_filename(struct iso_directory_record *de,
66 char * retname, struct inode * inode) 50 char *retname, struct inode *inode)
67{ 51{
68 int std; 52 int std;
69 unsigned char * chr; 53 unsigned char *chr;
70 int retnamlen = isofs_name_translate(de, retname, inode); 54 int retnamlen = isofs_name_translate(de, retname, inode);
71 if (retnamlen == 0) return 0; 55
56 if (retnamlen == 0)
57 return 0;
72 std = sizeof(struct iso_directory_record) + de->name_len[0]; 58 std = sizeof(struct iso_directory_record) + de->name_len[0];
73 if (std & 1) std++; 59 if (std & 1)
74 if ((*((unsigned char *) de) - std) != 32) return retnamlen; 60 std++;
61 if ((*((unsigned char *) de) - std) != 32)
62 return retnamlen;
75 chr = ((unsigned char *) de) + std; 63 chr = ((unsigned char *) de) + std;
76 if (strncmp(chr, "ARCHIMEDES", 10)) return retnamlen; 64 if (strncmp(chr, "ARCHIMEDES", 10))
77 if ((*retname == '_') && ((chr[19] & 1) == 1)) *retname = '!'; 65 return retnamlen;
66 if ((*retname == '_') && ((chr[19] & 1) == 1))
67 *retname = '!';
78 if (((de->flags[0] & 2) == 0) && (chr[13] == 0xff) 68 if (((de->flags[0] & 2) == 0) && (chr[13] == 0xff)
79 && ((chr[12] & 0xf0) == 0xf0)) 69 && ((chr[12] & 0xf0) == 0xf0)) {
80 {
81 retname[retnamlen] = ','; 70 retname[retnamlen] = ',';
82 sprintf(retname+retnamlen+1, "%3.3x", 71 sprintf(retname+retnamlen+1, "%3.3x",
83 ((chr[12] & 0xf) << 8) | chr[11]); 72 ((chr[12] & 0xf) << 8) | chr[11]);
@@ -91,7 +80,7 @@ int get_acorn_filename(struct iso_directory_record * de,
91 */ 80 */
92static int do_isofs_readdir(struct inode *inode, struct file *filp, 81static int do_isofs_readdir(struct inode *inode, struct file *filp,
93 void *dirent, filldir_t filldir, 82 void *dirent, filldir_t filldir,
94 char * tmpname, struct iso_directory_record * tmpde) 83 char *tmpname, struct iso_directory_record *tmpde)
95{ 84{
96 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 85 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
97 unsigned char bufbits = ISOFS_BUFFER_BITS(inode); 86 unsigned char bufbits = ISOFS_BUFFER_BITS(inode);
@@ -121,9 +110,11 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
121 110
122 de_len = *(unsigned char *) de; 111 de_len = *(unsigned char *) de;
123 112
124 /* If the length byte is zero, we should move on to the next 113 /*
125 CDROM sector. If we are at the end of the directory, we 114 * If the length byte is zero, we should move on to the next
126 kick out of the while loop. */ 115 * CDROM sector. If we are at the end of the directory, we
116 * kick out of the while loop.
117 */
127 118
128 if (de_len == 0) { 119 if (de_len == 0) {
129 brelse(bh); 120 brelse(bh);
@@ -157,11 +148,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
157 148
158 if (first_de) { 149 if (first_de) {
159 isofs_normalize_block_and_offset(de, 150 isofs_normalize_block_and_offset(de,
160 &block_saved, 151 &block_saved,
161 &offset_saved); 152 &offset_saved);
162 inode_number = isofs_get_ino(block_saved, 153 inode_number = isofs_get_ino(block_saved,
163 offset_saved, 154 offset_saved, bufbits);
164 bufbits);
165 } 155 }
166 156
167 if (de->flags[-sbi->s_high_sierra] & 0x80) { 157 if (de->flags[-sbi->s_high_sierra] & 0x80) {
@@ -199,7 +189,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
199 */ 189 */
200 if ((sbi->s_hide == 'y' && 190 if ((sbi->s_hide == 'y' &&
201 (de->flags[-sbi->s_high_sierra] & 1)) || 191 (de->flags[-sbi->s_high_sierra] & 1)) ||
202 (sbi->s_showassoc =='n' && 192 (sbi->s_showassoc =='n' &&
203 (de->flags[-sbi->s_high_sierra] & 4))) { 193 (de->flags[-sbi->s_high_sierra] & 4))) {
204 filp->f_pos += de_len; 194 filp->f_pos += de_len;
205 continue; 195 continue;
@@ -240,7 +230,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
240 230
241 continue; 231 continue;
242 } 232 }
243 if (bh) brelse(bh); 233 if (bh)
234 brelse(bh);
244 return 0; 235 return 0;
245} 236}
246 237
@@ -253,8 +244,8 @@ static int isofs_readdir(struct file *filp,
253 void *dirent, filldir_t filldir) 244 void *dirent, filldir_t filldir)
254{ 245{
255 int result; 246 int result;
256 char * tmpname; 247 char *tmpname;
257 struct iso_directory_record * tmpde; 248 struct iso_directory_record *tmpde;
258 struct inode *inode = filp->f_path.dentry->d_inode; 249 struct inode *inode = filp->f_path.dentry->d_inode;
259 250
260 tmpname = (char *)__get_free_page(GFP_KERNEL); 251 tmpname = (char *)__get_free_page(GFP_KERNEL);
@@ -270,3 +261,19 @@ static int isofs_readdir(struct file *filp,
270 unlock_kernel(); 261 unlock_kernel();
271 return result; 262 return result;
272} 263}
264
265const struct file_operations isofs_dir_operations =
266{
267 .read = generic_read_dir,
268 .readdir = isofs_readdir,
269};
270
271/*
272 * directories can handle most operations...
273 */
274const struct inode_operations isofs_dir_inode_operations =
275{
276 .lookup = isofs_lookup,
277};
278
279
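
The reshuffle above is purely about declaration order: with isofs_dir_operations and isofs_dir_inode_operations now defined after isofs_readdir(), the static forward declaration at the top of the file can go. The pattern in miniature, with invented names:

#include <linux/fs.h>

static int myfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	/* A real implementation would walk the directory here. */
	return 0;
}

/* Defined after the function it points to: no prototype needed. */
const struct file_operations myfs_dir_operations = {
	.read		= generic_read_dir,
	.readdir	= myfs_readdir,
};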
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 5c3eecf7542e..4f5418be0590 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -73,20 +73,20 @@ static void isofs_destroy_inode(struct inode *inode)
73 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); 73 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
74} 74}
75 75
76static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags) 76static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
77{ 77{
78 struct iso_inode_info *ei = foo; 78 struct iso_inode_info *ei = foo;
79 79
80 inode_init_once(&ei->vfs_inode); 80 inode_init_once(&ei->vfs_inode);
81} 81}
82 82
83static int init_inodecache(void) 83static int init_inodecache(void)
84{ 84{
85 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", 85 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
86 sizeof(struct iso_inode_info), 86 sizeof(struct iso_inode_info),
87 0, (SLAB_RECLAIM_ACCOUNT| 87 0, (SLAB_RECLAIM_ACCOUNT|
88 SLAB_MEM_SPREAD), 88 SLAB_MEM_SPREAD),
89 init_once, NULL); 89 init_once, NULL);
90 if (isofs_inode_cachep == NULL) 90 if (isofs_inode_cachep == NULL)
91 return -ENOMEM; 91 return -ENOMEM;
92 return 0; 92 return 0;
@@ -150,9 +150,9 @@ struct iso9660_options{
150 uid_t uid; 150 uid_t uid;
151 char *iocharset; 151 char *iocharset;
152 unsigned char utf8; 152 unsigned char utf8;
153 /* LVE */ 153 /* LVE */
154 s32 session; 154 s32 session;
155 s32 sbsector; 155 s32 sbsector;
156}; 156};
157 157
158/* 158/*
@@ -197,7 +197,7 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms)
197 hash = init_name_hash(); 197 hash = init_name_hash();
198 while (len--) { 198 while (len--) {
199 c = tolower(*name++); 199 c = tolower(*name++);
200 hash = partial_name_hash(tolower(c), hash); 200 hash = partial_name_hash(c, hash);
201 } 201 }
202 qstr->hash = end_name_hash(hash); 202 qstr->hash = end_name_hash(hash);
203 203
@@ -360,10 +360,12 @@ static int parse_options(char *options, struct iso9660_options *popt)
360 popt->check = 'u'; /* unset */ 360 popt->check = 'u'; /* unset */
361 popt->nocompress = 0; 361 popt->nocompress = 0;
362 popt->blocksize = 1024; 362 popt->blocksize = 1024;
363 popt->mode = S_IRUGO | S_IXUGO; /* r-x for all. The disc could 363 popt->mode = S_IRUGO | S_IXUGO; /*
364 be shared with DOS machines so 364 * r-x for all. The disc could
365 virtually anything could be 365 * be shared with DOS machines so
366 a valid executable. */ 366 * virtually anything could be
367 * a valid executable.
368 */
367 popt->gid = 0; 369 popt->gid = 0;
368 popt->uid = 0; 370 popt->uid = 0;
369 popt->iocharset = NULL; 371 popt->iocharset = NULL;
@@ -503,30 +505,30 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
503 Te.cdte_format=CDROM_LBA; 505 Te.cdte_format=CDROM_LBA;
504 i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te); 506 i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te);
505 if (!i) { 507 if (!i) {
506 printk(KERN_DEBUG "Session %d start %d type %d\n", 508 printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n",
507 session, Te.cdte_addr.lba, 509 session, Te.cdte_addr.lba,
508 Te.cdte_ctrl&CDROM_DATA_TRACK); 510 Te.cdte_ctrl&CDROM_DATA_TRACK);
509 if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4) 511 if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4)
510 return Te.cdte_addr.lba; 512 return Te.cdte_addr.lba;
511 } 513 }
512 514
513 printk(KERN_ERR "Invalid session number or type of track\n"); 515 printk(KERN_ERR "ISOFS: Invalid session number or type of track\n");
514 } 516 }
515 i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info); 517 i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info);
516 if (session > 0) 518 if (session > 0)
517 printk(KERN_ERR "Invalid session number\n"); 519 printk(KERN_ERR "ISOFS: Invalid session number\n");
518#if 0 520#if 0
519 printk("isofs.inode: CDROMMULTISESSION: rc=%d\n",i); 521 printk(KERN_DEBUG "isofs.inode: CDROMMULTISESSION: rc=%d\n",i);
520 if (i==0) { 522 if (i==0) {
521 printk("isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no"); 523 printk(KERN_DEBUG "isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no");
522 printk("isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba); 524 printk(KERN_DEBUG "isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba);
523 } 525 }
524#endif 526#endif
525 if (i==0) 527 if (i==0)
526#if WE_OBEY_THE_WRITTEN_STANDARDS 528#if WE_OBEY_THE_WRITTEN_STANDARDS
527 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ 529 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
528#endif 530#endif
529 vol_desc_start=ms_info.addr.lba; 531 vol_desc_start=ms_info.addr.lba;
530 return vol_desc_start; 532 return vol_desc_start;
531} 533}
532 534
@@ -538,20 +540,20 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
538 */ 540 */
539static int isofs_fill_super(struct super_block *s, void *data, int silent) 541static int isofs_fill_super(struct super_block *s, void *data, int silent)
540{ 542{
541 struct buffer_head * bh = NULL, *pri_bh = NULL; 543 struct buffer_head *bh = NULL, *pri_bh = NULL;
542 struct hs_primary_descriptor * h_pri = NULL; 544 struct hs_primary_descriptor *h_pri = NULL;
543 struct iso_primary_descriptor * pri = NULL; 545 struct iso_primary_descriptor *pri = NULL;
544 struct iso_supplementary_descriptor *sec = NULL; 546 struct iso_supplementary_descriptor *sec = NULL;
545 struct iso_directory_record * rootp; 547 struct iso_directory_record *rootp;
546 int joliet_level = 0; 548 struct inode *inode;
547 int iso_blknum, block; 549 struct iso9660_options opt;
548 int orig_zonesize; 550 struct isofs_sb_info *sbi;
549 int table; 551 unsigned long first_data_zone;
550 unsigned int vol_desc_start; 552 int joliet_level = 0;
551 unsigned long first_data_zone; 553 int iso_blknum, block;
552 struct inode * inode; 554 int orig_zonesize;
553 struct iso9660_options opt; 555 int table;
554 struct isofs_sb_info * sbi; 556 unsigned int vol_desc_start;
555 557
556 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 558 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
557 if (!sbi) 559 if (!sbi)
@@ -577,72 +579,73 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
577 vol_desc_start = (opt.sbsector != -1) ? 579 vol_desc_start = (opt.sbsector != -1) ?
578 opt.sbsector : isofs_get_last_session(s,opt.session); 580 opt.sbsector : isofs_get_last_session(s,opt.session);
579 581
580 for (iso_blknum = vol_desc_start+16; 582 for (iso_blknum = vol_desc_start+16;
581 iso_blknum < vol_desc_start+100; iso_blknum++) 583 iso_blknum < vol_desc_start+100; iso_blknum++) {
582 { 584 struct hs_volume_descriptor *hdp;
583 struct hs_volume_descriptor * hdp; 585 struct iso_volume_descriptor *vdp;
584 struct iso_volume_descriptor * vdp; 586
585 587 block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits);
586 block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits); 588 if (!(bh = sb_bread(s, block)))
587 if (!(bh = sb_bread(s, block))) 589 goto out_no_read;
588 goto out_no_read; 590
589 591 vdp = (struct iso_volume_descriptor *)bh->b_data;
590 vdp = (struct iso_volume_descriptor *)bh->b_data; 592 hdp = (struct hs_volume_descriptor *)bh->b_data;
591 hdp = (struct hs_volume_descriptor *)bh->b_data; 593
592 594 /*
593 /* Due to the overlapping physical location of the descriptors, 595 * Due to the overlapping physical location of the descriptors,
594 * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure 596 * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure
595 * proper identification in this case, we first check for ISO. 597 * proper identification in this case, we first check for ISO.
596 */ 598 */
597 if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) { 599 if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) {
598 if (isonum_711 (vdp->type) == ISO_VD_END) 600 if (isonum_711(vdp->type) == ISO_VD_END)
599 break; 601 break;
600 if (isonum_711 (vdp->type) == ISO_VD_PRIMARY) { 602 if (isonum_711(vdp->type) == ISO_VD_PRIMARY) {
601 if (pri == NULL) { 603 if (pri == NULL) {
602 pri = (struct iso_primary_descriptor *)vdp; 604 pri = (struct iso_primary_descriptor *)vdp;
603 /* Save the buffer in case we need it ... */ 605 /* Save the buffer in case we need it ... */
604 pri_bh = bh; 606 pri_bh = bh;
605 bh = NULL; 607 bh = NULL;
606 } 608 }
607 } 609 }
608#ifdef CONFIG_JOLIET 610#ifdef CONFIG_JOLIET
609 else if (isonum_711 (vdp->type) == ISO_VD_SUPPLEMENTARY) { 611 else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) {
610 sec = (struct iso_supplementary_descriptor *)vdp; 612 sec = (struct iso_supplementary_descriptor *)vdp;
611 if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { 613 if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) {
612 if (opt.joliet == 'y') { 614 if (opt.joliet == 'y') {
613 if (sec->escape[2] == 0x40) { 615 if (sec->escape[2] == 0x40)
614 joliet_level = 1; 616 joliet_level = 1;
615 } else if (sec->escape[2] == 0x43) { 617 else if (sec->escape[2] == 0x43)
616 joliet_level = 2; 618 joliet_level = 2;
617 } else if (sec->escape[2] == 0x45) { 619 else if (sec->escape[2] == 0x45)
618 joliet_level = 3; 620 joliet_level = 3;
619 } 621
620 printk(KERN_DEBUG"ISO 9660 Extensions: Microsoft Joliet Level %d\n", 622 printk(KERN_DEBUG "ISO 9660 Extensions: "
621 joliet_level); 623 "Microsoft Joliet Level %d\n",
624 joliet_level);
625 }
626 goto root_found;
627 } else {
628 /* Unknown supplementary volume descriptor */
629 sec = NULL;
630 }
622 } 631 }
623 goto root_found;
624 } else {
625 /* Unknown supplementary volume descriptor */
626 sec = NULL;
627 }
628 }
629#endif 632#endif
630 } else { 633 } else {
631 if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) { 634 if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) {
632 if (isonum_711 (hdp->type) != ISO_VD_PRIMARY) 635 if (isonum_711(hdp->type) != ISO_VD_PRIMARY)
633 goto out_freebh; 636 goto out_freebh;
634 637
635 sbi->s_high_sierra = 1; 638 sbi->s_high_sierra = 1;
636 opt.rock = 'n'; 639 opt.rock = 'n';
637 h_pri = (struct hs_primary_descriptor *)vdp; 640 h_pri = (struct hs_primary_descriptor *)vdp;
638 goto root_found; 641 goto root_found;
642 }
639 } 643 }
640 }
641 644
642 /* Just skip any volume descriptors we don't recognize */ 645 /* Just skip any volume descriptors we don't recognize */
643 646
644 brelse(bh); 647 brelse(bh);
645 bh = NULL; 648 bh = NULL;
646 } 649 }
647 /* 650 /*
648 * If we fall through, either no volume descriptor was found, 651 * If we fall through, either no volume descriptor was found,
@@ -657,24 +660,24 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
657root_found: 660root_found:
658 661
659 if (joliet_level && (pri == NULL || opt.rock == 'n')) { 662 if (joliet_level && (pri == NULL || opt.rock == 'n')) {
660 /* This is the case of Joliet with the norock mount flag. 663 /* This is the case of Joliet with the norock mount flag.
661 * A disc with both Joliet and Rock Ridge is handled later 664 * A disc with both Joliet and Rock Ridge is handled later
662 */ 665 */
663 pri = (struct iso_primary_descriptor *) sec; 666 pri = (struct iso_primary_descriptor *) sec;
664 } 667 }
665 668
666 if(sbi->s_high_sierra){ 669 if(sbi->s_high_sierra){
667 rootp = (struct iso_directory_record *) h_pri->root_directory_record; 670 rootp = (struct iso_directory_record *) h_pri->root_directory_record;
668 sbi->s_nzones = isonum_733 (h_pri->volume_space_size); 671 sbi->s_nzones = isonum_733(h_pri->volume_space_size);
669 sbi->s_log_zone_size = isonum_723 (h_pri->logical_block_size); 672 sbi->s_log_zone_size = isonum_723(h_pri->logical_block_size);
670 sbi->s_max_size = isonum_733(h_pri->volume_space_size); 673 sbi->s_max_size = isonum_733(h_pri->volume_space_size);
671 } else { 674 } else {
672 if (!pri) 675 if (!pri)
673 goto out_freebh; 676 goto out_freebh;
674 rootp = (struct iso_directory_record *) pri->root_directory_record; 677 rootp = (struct iso_directory_record *) pri->root_directory_record;
675 sbi->s_nzones = isonum_733 (pri->volume_space_size); 678 sbi->s_nzones = isonum_733(pri->volume_space_size);
676 sbi->s_log_zone_size = isonum_723 (pri->logical_block_size); 679 sbi->s_log_zone_size = isonum_723(pri->logical_block_size);
677 sbi->s_max_size = isonum_733(pri->volume_space_size); 680 sbi->s_max_size = isonum_733(pri->volume_space_size);
678 } 681 }
679 682
680 sbi->s_ninodes = 0; /* No way to figure this out easily */ 683 sbi->s_ninodes = 0; /* No way to figure this out easily */
@@ -687,42 +690,43 @@ root_found:
687 * blocks that were 512 bytes (which should only very rarely 690 * blocks that were 512 bytes (which should only very rarely
688 * happen.) 691 * happen.)
689 */ 692 */
690 if(orig_zonesize < opt.blocksize) 693 if (orig_zonesize < opt.blocksize)
691 goto out_bad_size; 694 goto out_bad_size;
692 695
693 /* RDE: convert log zone size to bit shift */ 696 /* RDE: convert log zone size to bit shift */
694 switch (sbi->s_log_zone_size) 697 switch (sbi->s_log_zone_size) {
695 { case 512: sbi->s_log_zone_size = 9; break; 698 case 512: sbi->s_log_zone_size = 9; break;
696 case 1024: sbi->s_log_zone_size = 10; break; 699 case 1024: sbi->s_log_zone_size = 10; break;
697 case 2048: sbi->s_log_zone_size = 11; break; 700 case 2048: sbi->s_log_zone_size = 11; break;
698 701
699 default: 702 default:
700 goto out_bad_zone_size; 703 goto out_bad_zone_size;
701 } 704 }
702 705
703 s->s_magic = ISOFS_SUPER_MAGIC; 706 s->s_magic = ISOFS_SUPER_MAGIC;
704 s->s_maxbytes = 0xffffffff; /* We can handle files up to 4 GB */ 707 s->s_maxbytes = 0xffffffff; /* We can handle files up to 4 GB */
705 708
706 /* The CDROM is read-only, has no nodes (devices) on it, and since 709 /*
707 all of the files appear to be owned by root, we really do not want 710 * The CDROM is read-only, has no nodes (devices) on it, and since
708 to allow suid. (suid or devices will not show up unless we have 711 * all of the files appear to be owned by root, we really do not want
709 Rock Ridge extensions) */ 712 * to allow suid. (suid or devices will not show up unless we have
713 * Rock Ridge extensions)
714 */
710 715
711 s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */; 716 s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */;
712 717
713 /* Set this for reference. Its not currently used except on write 718 /* Set this for reference. Its not currently used except on write
714 which we don't have .. */ 719 which we don't have .. */
715 720
716 first_data_zone = isonum_733 (rootp->extent) + 721 first_data_zone = isonum_733(rootp->extent) +
717 isonum_711 (rootp->ext_attr_length); 722 isonum_711(rootp->ext_attr_length);
718 sbi->s_firstdatazone = first_data_zone; 723 sbi->s_firstdatazone = first_data_zone;
719#ifndef BEQUIET 724#ifndef BEQUIET
720 printk(KERN_DEBUG "Max size:%ld Log zone size:%ld\n", 725 printk(KERN_DEBUG "ISOFS: Max size:%ld Log zone size:%ld\n",
721 sbi->s_max_size, 726 sbi->s_max_size, 1UL << sbi->s_log_zone_size);
722 1UL << sbi->s_log_zone_size); 727 printk(KERN_DEBUG "ISOFS: First datazone:%ld\n", sbi->s_firstdatazone);
723 printk(KERN_DEBUG "First datazone:%ld\n", sbi->s_firstdatazone);
724 if(sbi->s_high_sierra) 728 if(sbi->s_high_sierra)
725 printk(KERN_DEBUG "Disc in High Sierra format.\n"); 729 printk(KERN_DEBUG "ISOFS: Disc in High Sierra format.\n");
726#endif 730#endif
727 731
728 /* 732 /*
@@ -737,8 +741,8 @@ root_found:
737 pri = (struct iso_primary_descriptor *) sec; 741 pri = (struct iso_primary_descriptor *) sec;
738 rootp = (struct iso_directory_record *) 742 rootp = (struct iso_directory_record *)
739 pri->root_directory_record; 743 pri->root_directory_record;
740 first_data_zone = isonum_733 (rootp->extent) + 744 first_data_zone = isonum_733(rootp->extent) +
741 isonum_711 (rootp->ext_attr_length); 745 isonum_711(rootp->ext_attr_length);
742 } 746 }
743 747
744 /* 748 /*
@@ -771,7 +775,7 @@ root_found:
771 775
772#ifdef CONFIG_JOLIET 776#ifdef CONFIG_JOLIET
773 if (joliet_level && opt.utf8 == 0) { 777 if (joliet_level && opt.utf8 == 0) {
774 char * p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT; 778 char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT;
775 sbi->s_nls_iocharset = load_nls(p); 779 sbi->s_nls_iocharset = load_nls(p);
776 if (! sbi->s_nls_iocharset) { 780 if (! sbi->s_nls_iocharset) {
777 /* Fail only if explicit charset specified */ 781 /* Fail only if explicit charset specified */
@@ -821,7 +825,7 @@ root_found:
821 sbi->s_rock = 0; 825 sbi->s_rock = 0;
822 if (sbi->s_firstdatazone != first_data_zone) { 826 if (sbi->s_firstdatazone != first_data_zone) {
823 sbi->s_firstdatazone = first_data_zone; 827 sbi->s_firstdatazone = first_data_zone;
824 printk(KERN_DEBUG 828 printk(KERN_DEBUG
825 "ISOFS: changing to secondary root\n"); 829 "ISOFS: changing to secondary root\n");
826 iput(inode); 830 iput(inode);
827 inode = isofs_iget(s, sbi->s_firstdatazone, 0); 831 inode = isofs_iget(s, sbi->s_firstdatazone, 0);
@@ -830,8 +834,10 @@ root_found:
830 834
831 if (opt.check == 'u') { 835 if (opt.check == 'u') {
832 /* Only Joliet is case insensitive by default */ 836 /* Only Joliet is case insensitive by default */
833 if (joliet_level) opt.check = 'r'; 837 if (joliet_level)
834 else opt.check = 's'; 838 opt.check = 'r';
839 else
840 opt.check = 's';
835 } 841 }
836 sbi->s_joliet_level = joliet_level; 842 sbi->s_joliet_level = joliet_level;
837 843
@@ -846,8 +852,10 @@ root_found:
846 goto out_no_root; 852 goto out_no_root;
847 853
848 table = 0; 854 table = 0;
849 if (joliet_level) table += 2; 855 if (joliet_level)
850 if (opt.check == 'r') table++; 856 table += 2;
857 if (opt.check == 'r')
858 table++;
851 s->s_root->d_op = &isofs_dentry_ops[table]; 859 s->s_root->d_op = &isofs_dentry_ops[table];
852 860
853 kfree(opt.iocharset); 861 kfree(opt.iocharset);
@@ -858,10 +866,10 @@ root_found:
858 * Display error messages and free resources. 866 * Display error messages and free resources.
859 */ 867 */
860out_bad_root: 868out_bad_root:
861 printk(KERN_WARNING "isofs_fill_super: root inode not initialized\n"); 869 printk(KERN_WARNING "%s: root inode not initialized\n", __func__);
862 goto out_iput; 870 goto out_iput;
863out_no_root: 871out_no_root:
864 printk(KERN_WARNING "isofs_fill_super: get root inode failed\n"); 872 printk(KERN_WARNING "%s: get root inode failed\n", __func__);
865out_iput: 873out_iput:
866 iput(inode); 874 iput(inode);
867#ifdef CONFIG_JOLIET 875#ifdef CONFIG_JOLIET
@@ -870,21 +878,20 @@ out_iput:
870#endif 878#endif
871 goto out_freesbi; 879 goto out_freesbi;
872out_no_read: 880out_no_read:
873 printk(KERN_WARNING "isofs_fill_super: " 881 printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n",
874 "bread failed, dev=%s, iso_blknum=%d, block=%d\n", 882 __func__, s->s_id, iso_blknum, block);
875 s->s_id, iso_blknum, block);
876 goto out_freesbi; 883 goto out_freesbi;
877out_bad_zone_size: 884out_bad_zone_size:
878 printk(KERN_WARNING "Bad logical zone size %ld\n", 885 printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n",
879 sbi->s_log_zone_size); 886 sbi->s_log_zone_size);
880 goto out_freebh; 887 goto out_freebh;
881out_bad_size: 888out_bad_size:
882 printk(KERN_WARNING "Logical zone size(%d) < hardware blocksize(%u)\n", 889 printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n",
883 orig_zonesize, opt.blocksize); 890 orig_zonesize, opt.blocksize);
884 goto out_freebh; 891 goto out_freebh;
885out_unknown_format: 892out_unknown_format:
886 if (!silent) 893 if (!silent)
887 printk(KERN_WARNING "Unable to identify CD-ROM format.\n"); 894 printk(KERN_WARNING "ISOFS: Unable to identify CD-ROM format.\n");
888 895
889out_freebh: 896out_freebh:
890 brelse(bh); 897 brelse(bh);
@@ -902,7 +909,7 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
902 buf->f_type = ISOFS_SUPER_MAGIC; 909 buf->f_type = ISOFS_SUPER_MAGIC;
903 buf->f_bsize = sb->s_blocksize; 910 buf->f_bsize = sb->s_blocksize;
904 buf->f_blocks = (ISOFS_SB(sb)->s_nzones 911 buf->f_blocks = (ISOFS_SB(sb)->s_nzones
905 << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits)); 912 << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits));
906 buf->f_bfree = 0; 913 buf->f_bfree = 0;
907 buf->f_bavail = 0; 914 buf->f_bavail = 0;
908 buf->f_files = ISOFS_SB(sb)->s_ninodes; 915 buf->f_files = ISOFS_SB(sb)->s_ninodes;
@@ -931,20 +938,20 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
931 938
932 rv = 0; 939 rv = 0;
933 if (iblock < 0 || iblock != iblock_s) { 940 if (iblock < 0 || iblock != iblock_s) {
934 printk("isofs_get_blocks: block number too large\n"); 941 printk(KERN_DEBUG "%s: block number too large\n", __func__);
935 goto abort; 942 goto abort;
936 } 943 }
937 944
938 b_off = iblock; 945 b_off = iblock;
939 946
940 offset = 0; 947 offset = 0;
941 firstext = ei->i_first_extent; 948 firstext = ei->i_first_extent;
942 sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode); 949 sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode);
943 nextblk = ei->i_next_section_block; 950 nextblk = ei->i_next_section_block;
944 nextoff = ei->i_next_section_offset; 951 nextoff = ei->i_next_section_offset;
945 section = 0; 952 section = 0;
946 953
947 while ( nblocks ) { 954 while (nblocks) {
948 /* If we are *way* beyond the end of the file, print a message. 955 /* If we are *way* beyond the end of the file, print a message.
949 * Access beyond the end of the file up to the next page boundary 956 * Access beyond the end of the file up to the next page boundary
950 * is normal, however because of the way the page cache works. 957 * is normal, however because of the way the page cache works.
@@ -953,11 +960,11 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
953 * I/O errors. 960 * I/O errors.
954 */ 961 */
955 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { 962 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
956 printk("isofs_get_blocks: block >= EOF (%ld, %ld)\n", 963 printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n",
957 iblock, (unsigned long) inode->i_size); 964 __func__, iblock, (unsigned long) inode->i_size);
958 goto abort; 965 goto abort;
959 } 966 }
960 967
961 /* On the last section, nextblk == 0, section size is likely to 968 /* On the last section, nextblk == 0, section size is likely to
962 * exceed sect_size by a partial block, and access beyond the 969 * exceed sect_size by a partial block, and access beyond the
963 * end of the file will reach beyond the section size, too. 970 * end of the file will reach beyond the section size, too.
@@ -976,20 +983,21 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
976 iput(ninode); 983 iput(ninode);
977 984
978 if (++section > 100) { 985 if (++section > 100) {
979 printk("isofs_get_blocks: More than 100 file sections ?!?, aborting...\n"); 986 printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
980 printk("isofs_get_blocks: block=%ld firstext=%u sect_size=%u " 987 " aborting...\n", __func__);
981 "nextblk=%lu nextoff=%lu\n", 988 printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u "
982 iblock, firstext, (unsigned) sect_size, 989 "nextblk=%lu nextoff=%lu\n", __func__,
983 nextblk, nextoff); 990 iblock, firstext, (unsigned) sect_size,
991 nextblk, nextoff);
984 goto abort; 992 goto abort;
985 } 993 }
986 } 994 }
987 995
988 if ( *bh ) { 996 if (*bh) {
989 map_bh(*bh, inode->i_sb, firstext + b_off - offset); 997 map_bh(*bh, inode->i_sb, firstext + b_off - offset);
990 } else { 998 } else {
991 *bh = sb_getblk(inode->i_sb, firstext+b_off-offset); 999 *bh = sb_getblk(inode->i_sb, firstext+b_off-offset);
992 if ( !*bh ) 1000 if (!*bh)
993 goto abort; 1001 goto abort;
994 } 1002 }
995 bh++; /* Next buffer head */ 1003 bh++; /* Next buffer head */
@@ -1010,7 +1018,7 @@ static int isofs_get_block(struct inode *inode, sector_t iblock,
1010 struct buffer_head *bh_result, int create) 1018 struct buffer_head *bh_result, int create)
1011{ 1019{
1012 if (create) { 1020 if (create) {
1013 printk("isofs_get_block: Kernel tries to allocate a block\n"); 1021 printk(KERN_DEBUG "%s: Kernel tries to allocate a block\n", __func__);
1014 return -EROFS; 1022 return -EROFS;
1015 } 1023 }
1016 1024
@@ -1070,11 +1078,11 @@ static int isofs_read_level3_size(struct inode *inode)
1070{ 1078{
1071 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 1079 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
1072 int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra; 1080 int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra;
1073 struct buffer_head * bh = NULL; 1081 struct buffer_head *bh = NULL;
1074 unsigned long block, offset, block_saved, offset_saved; 1082 unsigned long block, offset, block_saved, offset_saved;
1075 int i = 0; 1083 int i = 0;
1076 int more_entries = 0; 1084 int more_entries = 0;
1077 struct iso_directory_record * tmpde = NULL; 1085 struct iso_directory_record *tmpde = NULL;
1078 struct iso_inode_info *ei = ISOFS_I(inode); 1086 struct iso_inode_info *ei = ISOFS_I(inode);
1079 1087
1080 inode->i_size = 0; 1088 inode->i_size = 0;
@@ -1089,7 +1097,7 @@ static int isofs_read_level3_size(struct inode *inode)
1089 offset = ei->i_iget5_offset; 1097 offset = ei->i_iget5_offset;
1090 1098
1091 do { 1099 do {
1092 struct iso_directory_record * de; 1100 struct iso_directory_record *de;
1093 unsigned int de_len; 1101 unsigned int de_len;
1094 1102
1095 if (!bh) { 1103 if (!bh) {
@@ -1163,10 +1171,9 @@ out_noread:
1163 return -EIO; 1171 return -EIO;
1164 1172
1165out_toomany: 1173out_toomany:
1166 printk(KERN_INFO "isofs_read_level3_size: " 1174 printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n"
1167 "More than 100 file sections ?!?, aborting...\n" 1175 "isofs_read_level3_size: inode=%lu\n",
1168 "isofs_read_level3_size: inode=%lu\n", 1176 __func__, inode->i_ino);
1169 inode->i_ino);
1170 goto out; 1177 goto out;
1171} 1178}
1172 1179
@@ -1177,9 +1184,9 @@ static void isofs_read_inode(struct inode *inode)
1177 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 1184 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
1178 unsigned long block; 1185 unsigned long block;
1179 int high_sierra = sbi->s_high_sierra; 1186 int high_sierra = sbi->s_high_sierra;
1180 struct buffer_head * bh = NULL; 1187 struct buffer_head *bh = NULL;
1181 struct iso_directory_record * de; 1188 struct iso_directory_record *de;
1182 struct iso_directory_record * tmpde = NULL; 1189 struct iso_directory_record *tmpde = NULL;
1183 unsigned int de_len; 1190 unsigned int de_len;
1184 unsigned long offset; 1191 unsigned long offset;
1185 struct iso_inode_info *ei = ISOFS_I(inode); 1192 struct iso_inode_info *ei = ISOFS_I(inode);
@@ -1199,7 +1206,7 @@ static void isofs_read_inode(struct inode *inode)
1199 1206
1200 tmpde = kmalloc(de_len, GFP_KERNEL); 1207 tmpde = kmalloc(de_len, GFP_KERNEL);
1201 if (tmpde == NULL) { 1208 if (tmpde == NULL) {
1202 printk(KERN_INFO "isofs_read_inode: out of memory\n"); 1209 printk(KERN_INFO "%s: out of memory\n", __func__);
1203 goto fail; 1210 goto fail;
1204 } 1211 }
1205 memcpy(tmpde, bh->b_data + offset, frag1); 1212 memcpy(tmpde, bh->b_data + offset, frag1);
@@ -1212,24 +1219,26 @@ static void isofs_read_inode(struct inode *inode)
1212 } 1219 }
1213 1220
1214 inode->i_ino = isofs_get_ino(ei->i_iget5_block, 1221 inode->i_ino = isofs_get_ino(ei->i_iget5_block,
1215 ei->i_iget5_offset, 1222 ei->i_iget5_offset,
1216 ISOFS_BUFFER_BITS(inode)); 1223 ISOFS_BUFFER_BITS(inode));
1217 1224
1218 /* Assume it is a normal-format file unless told otherwise */ 1225 /* Assume it is a normal-format file unless told otherwise */
1219 ei->i_file_format = isofs_file_normal; 1226 ei->i_file_format = isofs_file_normal;
1220 1227
1221 if (de->flags[-high_sierra] & 2) { 1228 if (de->flags[-high_sierra] & 2) {
1222 inode->i_mode = S_IRUGO | S_IXUGO | S_IFDIR; 1229 inode->i_mode = S_IRUGO | S_IXUGO | S_IFDIR;
1223 inode->i_nlink = 1; /* Set to 1. We know there are 2, but 1230 inode->i_nlink = 1; /*
1224 the find utility tries to optimize 1231 * Set to 1. We know there are 2, but
1225 if it is 2, and it screws up. It is 1232 * the find utility tries to optimize
1226 easier to give 1 which tells find to 1233 * if it is 2, and it screws up. It is
1227 do it the hard way. */ 1234 * easier to give 1 which tells find to
1235 * do it the hard way.
1236 */
1228 } else { 1237 } else {
1229 /* Everybody gets to read the file. */ 1238 /* Everybody gets to read the file. */
1230 inode->i_mode = sbi->s_mode; 1239 inode->i_mode = sbi->s_mode;
1231 inode->i_nlink = 1; 1240 inode->i_nlink = 1;
1232 inode->i_mode |= S_IFREG; 1241 inode->i_mode |= S_IFREG;
1233 } 1242 }
1234 inode->i_uid = sbi->s_uid; 1243 inode->i_uid = sbi->s_uid;
1235 inode->i_gid = sbi->s_gid; 1244 inode->i_gid = sbi->s_gid;
@@ -1239,13 +1248,14 @@ static void isofs_read_inode(struct inode *inode)
1239 ei->i_format_parm[1] = 0; 1248 ei->i_format_parm[1] = 0;
1240 ei->i_format_parm[2] = 0; 1249 ei->i_format_parm[2] = 0;
1241 1250
1242 ei->i_section_size = isonum_733 (de->size); 1251 ei->i_section_size = isonum_733(de->size);
1243 if (de->flags[-high_sierra] & 0x80) { 1252 if (de->flags[-high_sierra] & 0x80) {
1244 if(isofs_read_level3_size(inode)) goto fail; 1253 if(isofs_read_level3_size(inode))
1254 goto fail;
1245 } else { 1255 } else {
1246 ei->i_next_section_block = 0; 1256 ei->i_next_section_block = 0;
1247 ei->i_next_section_offset = 0; 1257 ei->i_next_section_offset = 0;
1248 inode->i_size = isonum_733 (de->size); 1258 inode->i_size = isonum_733(de->size);
1249 } 1259 }
1250 1260
1251 /* 1261 /*
@@ -1258,23 +1268,24 @@ static void isofs_read_inode(struct inode *inode)
1258 inode->i_size &= 0x00ffffff; 1268 inode->i_size &= 0x00ffffff;
1259 1269
1260 if (de->interleave[0]) { 1270 if (de->interleave[0]) {
1261 printk("Interleaved files not (yet) supported.\n"); 1271 printk(KERN_DEBUG "ISOFS: Interleaved files not (yet) supported.\n");
1262 inode->i_size = 0; 1272 inode->i_size = 0;
1263 } 1273 }
1264 1274
1265 /* I have no idea what file_unit_size is used for, so 1275 /* I have no idea what file_unit_size is used for, so
1266 we will flag it for now */ 1276 we will flag it for now */
1267 if (de->file_unit_size[0] != 0) { 1277 if (de->file_unit_size[0] != 0) {
1268 printk("File unit size != 0 for ISO file (%ld).\n", 1278 printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n",
1269 inode->i_ino); 1279 inode->i_ino);
1270 } 1280 }
1271 1281
1272 /* I have no idea what other flag bits are used for, so 1282 /* I have no idea what other flag bits are used for, so
1273 we will flag it for now */ 1283 we will flag it for now */
1274#ifdef DEBUG 1284#ifdef DEBUG
1275 if((de->flags[-high_sierra] & ~2)!= 0){ 1285 if((de->flags[-high_sierra] & ~2)!= 0){
1276 printk("Unusual flag settings for ISO file (%ld %x).\n", 1286 printk(KERN_DEBUG "ISOFS: Unusual flag settings for ISO file "
1277 inode->i_ino, de->flags[-high_sierra]); 1287 "(%ld %x).\n",
1288 inode->i_ino, de->flags[-high_sierra]);
1278 } 1289 }
1279#endif 1290#endif
1280 1291
@@ -1285,11 +1296,11 @@ static void isofs_read_inode(struct inode *inode)
1285 inode->i_atime.tv_nsec = 1296 inode->i_atime.tv_nsec =
1286 inode->i_ctime.tv_nsec = 0; 1297 inode->i_ctime.tv_nsec = 0;
1287 1298
1288 ei->i_first_extent = (isonum_733 (de->extent) + 1299 ei->i_first_extent = (isonum_733(de->extent) +
1289 isonum_711 (de->ext_attr_length)); 1300 isonum_711(de->ext_attr_length));
1290 1301
1291 /* Set the number of blocks for stat() - should be done before RR */ 1302 /* Set the number of blocks for stat() - should be done before RR */
1292 inode->i_blocks = (inode->i_size + 511) >> 9; 1303 inode->i_blocks = (inode->i_size + 511) >> 9;
1293 1304
1294 /* 1305 /*
1295 * Now test for possible Rock Ridge extensions which will override 1306 * Now test for possible Rock Ridge extensions which will override
@@ -1306,7 +1317,7 @@ static void isofs_read_inode(struct inode *inode)
1306 /* Install the inode operations vector */ 1317 /* Install the inode operations vector */
1307 if (S_ISREG(inode->i_mode)) { 1318 if (S_ISREG(inode->i_mode)) {
1308 inode->i_fop = &generic_ro_fops; 1319 inode->i_fop = &generic_ro_fops;
1309 switch ( ei->i_file_format ) { 1320 switch (ei->i_file_format) {
1310#ifdef CONFIG_ZISOFS 1321#ifdef CONFIG_ZISOFS
1311 case isofs_file_compressed: 1322 case isofs_file_compressed:
1312 inode->i_data.a_ops = &zisofs_aops; 1323 inode->i_data.a_ops = &zisofs_aops;
@@ -1350,7 +1361,7 @@ static int isofs_iget5_test(struct inode *ino, void *data)
1350 struct isofs_iget5_callback_data *d = 1361 struct isofs_iget5_callback_data *d =
1351 (struct isofs_iget5_callback_data*)data; 1362 (struct isofs_iget5_callback_data*)data;
1352 return (i->i_iget5_block == d->block) 1363 return (i->i_iget5_block == d->block)
1353 && (i->i_iget5_offset == d->offset); 1364 && (i->i_iget5_offset == d->offset);
1354} 1365}
1355 1366
1356static int isofs_iget5_set(struct inode *ino, void *data) 1367static int isofs_iget5_set(struct inode *ino, void *data)
@@ -1384,7 +1395,7 @@ struct inode *isofs_iget(struct super_block *sb,
1384 hashval = (block << sb->s_blocksize_bits) | offset; 1395 hashval = (block << sb->s_blocksize_bits) | offset;
1385 1396
1386 inode = iget5_locked(sb, hashval, &isofs_iget5_test, 1397 inode = iget5_locked(sb, hashval, &isofs_iget5_test,
1387 &isofs_iget5_set, &data); 1398 &isofs_iget5_set, &data);
1388 1399
1389 if (inode && (inode->i_state & I_NEW)) { 1400 if (inode && (inode->i_state & I_NEW)) {
1390 sb->s_op->read_inode(inode); 1401 sb->s_op->read_inode(inode);
@@ -1398,7 +1409,7 @@ static int isofs_get_sb(struct file_system_type *fs_type,
1398 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1409 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
1399{ 1410{
1400 return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super, 1411 return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super,
1401 mnt); 1412 mnt);
1402} 1413}
1403 1414
1404static struct file_system_type iso9660_fs_type = { 1415static struct file_system_type iso9660_fs_type = {
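
Most of the isofs/inode.c churn above is message hygiene: every printk() gains a severity level plus either an "ISOFS:" prefix or __func__, so the logs identify both the subsystem and the reporting function. The target style in miniature; the MYFS names are placeholders.

#include <linux/kernel.h>

static void myfs_report(unsigned long blk)
{
	/* severity + subsystem prefix + __func__ for local diagnostics */
	printk(KERN_DEBUG "MYFS: %s: reading block %lu\n", __func__, blk);
	printk(KERN_WARNING "MYFS: bad block %lu\n", blk);
}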
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index efe2872cd4e3..a07e67b1ea7f 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -1,5 +1,6 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/buffer_head.h> 2#include <linux/buffer_head.h>
3#include <linux/exportfs.h>
3#include <linux/iso_fs.h> 4#include <linux/iso_fs.h>
4#include <asm/unaligned.h> 5#include <asm/unaligned.h>
5 6
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index fb8fe7a9ddc6..92c14b850e9c 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -80,22 +80,20 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
80 80
81 if (utf8) { 81 if (utf8) {
82 len = wcsntombs_be(outname, de->name, 82 len = wcsntombs_be(outname, de->name,
83 de->name_len[0] >> 1, PAGE_SIZE); 83 de->name_len[0] >> 1, PAGE_SIZE);
84 } else { 84 } else {
85 len = uni16_to_x8(outname, (__be16 *) de->name, 85 len = uni16_to_x8(outname, (__be16 *) de->name,
86 de->name_len[0] >> 1, nls); 86 de->name_len[0] >> 1, nls);
87 } 87 }
88 if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1')) { 88 if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1'))
89 len -= 2; 89 len -= 2;
90 }
91 90
92 /* 91 /*
93 * Windows doesn't like periods at the end of a name, 92 * Windows doesn't like periods at the end of a name,
94 * so neither do we 93 * so neither do we
95 */ 94 */
96 while (len >= 2 && (outname[len-1] == '.')) { 95 while (len >= 2 && (outname[len-1] == '.'))
97 len--; 96 len--;
98 }
99 97
100 return len; 98 return len;
101} 99}
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index c04b3a14a3e9..c8c7e5138a01 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -15,7 +15,7 @@
15 * some sanity tests. 15 * some sanity tests.
16 */ 16 */
17static int 17static int
18isofs_cmp(struct dentry * dentry, const char * compare, int dlen) 18isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
19{ 19{
20 struct qstr qstr; 20 struct qstr qstr;
21 21
@@ -48,24 +48,24 @@ isofs_cmp(struct dentry * dentry, const char * compare, int dlen)
48 */ 48 */
49static unsigned long 49static unsigned long
50isofs_find_entry(struct inode *dir, struct dentry *dentry, 50isofs_find_entry(struct inode *dir, struct dentry *dentry,
51 unsigned long *block_rv, unsigned long* offset_rv, 51 unsigned long *block_rv, unsigned long *offset_rv,
52 char * tmpname, struct iso_directory_record * tmpde) 52 char *tmpname, struct iso_directory_record *tmpde)
53{ 53{
54 unsigned long bufsize = ISOFS_BUFFER_SIZE(dir); 54 unsigned long bufsize = ISOFS_BUFFER_SIZE(dir);
55 unsigned char bufbits = ISOFS_BUFFER_BITS(dir); 55 unsigned char bufbits = ISOFS_BUFFER_BITS(dir);
56 unsigned long block, f_pos, offset, block_saved, offset_saved; 56 unsigned long block, f_pos, offset, block_saved, offset_saved;
57 struct buffer_head * bh = NULL; 57 struct buffer_head *bh = NULL;
58 struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb); 58 struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb);
59 59
60 if (!ISOFS_I(dir)->i_first_extent) 60 if (!ISOFS_I(dir)->i_first_extent)
61 return 0; 61 return 0;
62 62
63 f_pos = 0; 63 f_pos = 0;
64 offset = 0; 64 offset = 0;
65 block = 0; 65 block = 0;
66 66
67 while (f_pos < dir->i_size) { 67 while (f_pos < dir->i_size) {
68 struct iso_directory_record * de; 68 struct iso_directory_record *de;
69 int de_len, match, i, dlen; 69 int de_len, match, i, dlen;
70 char *dpnt; 70 char *dpnt;
71 71
@@ -114,7 +114,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
114 114
115 if (sbi->s_rock && 115 if (sbi->s_rock &&
116 ((i = get_rock_ridge_filename(de, tmpname, dir)))) { 116 ((i = get_rock_ridge_filename(de, tmpname, dir)))) {
117 dlen = i; /* possibly -1 */ 117 dlen = i; /* possibly -1 */
118 dpnt = tmpname; 118 dpnt = tmpname;
119#ifdef CONFIG_JOLIET 119#ifdef CONFIG_JOLIET
120 } else if (sbi->s_joliet_level) { 120 } else if (sbi->s_joliet_level) {
@@ -145,8 +145,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
145 isofs_normalize_block_and_offset(de, 145 isofs_normalize_block_and_offset(de,
146 &block_saved, 146 &block_saved,
147 &offset_saved); 147 &offset_saved);
148 *block_rv = block_saved; 148 *block_rv = block_saved;
149 *offset_rv = offset_saved; 149 *offset_rv = offset_saved;
150 brelse(bh); 150 brelse(bh);
151 return 1; 151 return 1;
152 } 152 }
@@ -155,7 +155,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
155 return 0; 155 return 0;
156} 156}
157 157
158struct dentry *isofs_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) 158struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
159{ 159{
160 int found; 160 int found;
161 unsigned long block, offset; 161 unsigned long block, offset;
@@ -170,9 +170,9 @@ struct dentry *isofs_lookup(struct inode * dir, struct dentry * dentry, struct n
170 170
171 lock_kernel(); 171 lock_kernel();
172 found = isofs_find_entry(dir, dentry, 172 found = isofs_find_entry(dir, dentry,
173 &block, &offset, 173 &block, &offset,
174 page_address(page), 174 page_address(page),
175 1024 + page_address(page)); 175 1024 + page_address(page));
176 __free_page(page); 176 __free_page(page);
177 177
178 inode = NULL; 178 inode = NULL;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 1facfaff97cb..a003d50edcdb 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -887,7 +887,8 @@ restart_loop:
887 journal->j_committing_transaction = NULL; 887 journal->j_committing_transaction = NULL;
888 spin_unlock(&journal->j_state_lock); 888 spin_unlock(&journal->j_state_lock);
889 889
890 if (commit_transaction->t_checkpoint_list == NULL) { 890 if (commit_transaction->t_checkpoint_list == NULL &&
891 commit_transaction->t_checkpoint_io_list == NULL) {
891 __journal_drop_transaction(journal, commit_transaction); 892 __journal_drop_transaction(journal, commit_transaction);
892 } else { 893 } else {
893 if (journal->j_checkpoint_transactions == NULL) { 894 if (journal->j_checkpoint_transactions == NULL) {
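
The one-line fix above (mirrored in jbd2 below) tightens the condition for dropping a committed transaction: it must also wait for the checkpoint I/O list to empty, or buffers still queued for checkpoint writeback would lose their transaction. The invariant, reduced to a sketch with stand-in types:

#include <stddef.h>

/* Stand-in for transaction_t; only the two checkpoint lists matter here. */
struct fake_transaction {
	void *t_checkpoint_list;
	void *t_checkpoint_io_list;
};

/* A transaction may be dropped only when both lists have drained. */
static int can_drop_transaction(const struct fake_transaction *t)
{
	return t->t_checkpoint_list == NULL &&
	       t->t_checkpoint_io_list == NULL;
}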
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 824e3b7d4ec1..8db2fa25170b 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -68,6 +68,7 @@
68#include <linux/list.h> 68#include <linux/list.h>
69#include <linux/init.h> 69#include <linux/init.h>
70#endif 70#endif
71#include <linux/log2.h>
71 72
72static struct kmem_cache *revoke_record_cache; 73static struct kmem_cache *revoke_record_cache;
73static struct kmem_cache *revoke_table_cache; 74static struct kmem_cache *revoke_table_cache;
@@ -211,7 +212,7 @@ int journal_init_revoke(journal_t *journal, int hash_size)
211 journal->j_revoke = journal->j_revoke_table[0]; 212 journal->j_revoke = journal->j_revoke_table[0];
212 213
213 /* Check that the hash_size is a power of two */ 214 /* Check that the hash_size is a power of two */
214 J_ASSERT ((hash_size & (hash_size-1)) == 0); 215 J_ASSERT(is_power_of_2(hash_size));
215 216
216 journal->j_revoke->hash_size = hash_size; 217 journal->j_revoke->hash_size = hash_size;
217 218
@@ -238,7 +239,7 @@ int journal_init_revoke(journal_t *journal, int hash_size)
238 journal->j_revoke = journal->j_revoke_table[1]; 239 journal->j_revoke = journal->j_revoke_table[1];
239 240
240 /* Check that the hash_size is a power of two */ 241 /* Check that the hash_size is a power of two */
241 J_ASSERT ((hash_size & (hash_size-1)) == 0); 242 J_ASSERT(is_power_of_2(hash_size));
242 243
243 journal->j_revoke->hash_size = hash_size; 244 journal->j_revoke->hash_size = hash_size;
244 245
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 2856e1100a5f..c0f59d1b13dc 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -896,7 +896,8 @@ restart_loop:
896 journal->j_committing_transaction = NULL; 896 journal->j_committing_transaction = NULL;
897 spin_unlock(&journal->j_state_lock); 897 spin_unlock(&journal->j_state_lock);
898 898
899 if (commit_transaction->t_checkpoint_list == NULL) { 899 if (commit_transaction->t_checkpoint_list == NULL &&
900 commit_transaction->t_checkpoint_io_list == NULL) {
900 __jbd2_journal_drop_transaction(journal, commit_transaction); 901 __jbd2_journal_drop_transaction(journal, commit_transaction);
901 } else { 902 } else {
902 if (journal->j_checkpoint_transactions == NULL) { 903 if (journal->j_checkpoint_transactions == NULL) {
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 9246e763da78..28cac049a56b 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -68,6 +68,7 @@
 #include <linux/list.h>
 #include <linux/init.h>
 #endif
+#include <linux/log2.h>
 
 static struct kmem_cache *jbd2_revoke_record_cache;
 static struct kmem_cache *jbd2_revoke_table_cache;
@@ -212,7 +213,7 @@ int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
 	journal->j_revoke = journal->j_revoke_table[0];
 
 	/* Check that the hash_size is a power of two */
-	J_ASSERT ((hash_size & (hash_size-1)) == 0);
+	J_ASSERT(is_power_of_2(hash_size));
 
 	journal->j_revoke->hash_size = hash_size;
 
@@ -239,7 +240,7 @@ int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
 	journal->j_revoke = journal->j_revoke_table[1];
 
 	/* Check that the hash_size is a power of two */
-	J_ASSERT ((hash_size & (hash_size-1)) == 0);
+	J_ASSERT(is_power_of_2(hash_size));
 
 	journal->j_revoke->hash_size = hash_size;
 
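The four J_ASSERT conversions above are behavior-preserving except at zero: the open-coded test accepts hash_size == 0, while the helper rejects it. For reference, is_power_of_2() as provided by the newly included <linux/log2.h>:

static inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
	/* true for 1, 2, 4, 8, ...; false for 0 and everything else */
	return (n != 0 && ((n & (n - 1)) == 0));
}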
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 0c82dfcfd246..143c5530caf3 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -81,6 +81,7 @@ static int jffs2_garbage_collect_thread(void *_c)
 
 	set_user_nice(current, 10);
 
+	set_freezable();
 	for (;;) {
 		allow_signal(SIGHUP);
 
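Kernel threads are nonfreezable by default in this series, so every daemon that should stop during suspend must now opt in. A minimal sketch of the pattern (the thread body here is illustrative, not taken from this patch):

#include <linux/freezer.h>
#include <linux/kthread.h>

static int example_daemon(void *data)
{
	set_freezable();		/* opt back in to the freezer */
	while (!kthread_should_stop()) {
		try_to_freeze();	/* park here across suspend/resume */
		/* ... perform one unit of background work ... */
	}
	return 0;
}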
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 2374b595f2e1..f0ec72b263f1 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -32,6 +32,7 @@ extern void jfs_truncate_nolock(struct inode *, loff_t);
 extern void jfs_free_zero_link(struct inode *);
 extern struct dentry *jfs_get_parent(struct dentry *dentry);
 extern void jfs_get_inode_flags(struct jfs_inode_info *);
+extern struct dentry *jfs_get_dentry(struct super_block *sb, void *vobjp);
 extern void jfs_set_inode_flags(struct inode *);
 extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 25161c4121e4..932797ba433b 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1477,6 +1477,38 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
 	return dentry;
 }
 
+struct dentry *jfs_get_dentry(struct super_block *sb, void *vobjp)
+{
+	__u32 *objp = vobjp;
+	unsigned long ino = objp[0];
+	__u32 generation = objp[1];
+	struct inode *inode;
+	struct dentry *result;
+
+	if (ino == 0)
+		return ERR_PTR(-ESTALE);
+	inode = iget(sb, ino);
+	if (inode == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	if (is_bad_inode(inode) ||
+	    (generation && inode->i_generation != generation)) {
+		result = ERR_PTR(-ESTALE);
+		goto out_iput;
+	}
+
+	result = d_alloc_anon(inode);
+	if (!result) {
+		result = ERR_PTR(-ENOMEM);
+		goto out_iput;
+	}
+	return result;
+
+ out_iput:
+	iput(inode);
+	return result;
+}
+
 struct dentry *jfs_get_parent(struct dentry *dentry)
 {
 	struct super_block *sb = dentry->d_inode->i_sb;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 20e4ac1c79a3..929fceca7999 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -27,6 +27,7 @@
 #include <linux/kthread.h>
 #include <linux/posix_acl.h>
 #include <linux/buffer_head.h>
+#include <linux/exportfs.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
 
@@ -737,6 +738,7 @@ static const struct super_operations jfs_super_operations = {
 };
 
 static struct export_operations jfs_export_operations = {
+	.get_dentry	= jfs_get_dentry,
 	.get_parent	= jfs_get_parent,
 };
 
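For orientation: exportfs passes get_dentry the (inode number, generation) pair that the default filehandle encoder packed on the way out. A hedged sketch of the decode-side call (caller code assumed, not part of this patch):

	__u32 obj[2] = { ino, generation };	/* as packed at encode time */
	struct dentry *de = jfs_get_dentry(sb, obj);

	if (IS_ERR(de))
		return PTR_ERR(de);	/* -ESTALE if the inode was freed or reused */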
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 26809325469c..82e2192a0d5c 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -25,6 +25,7 @@
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/mutex.h>
+#include <linux/freezer.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/stats.h>
@@ -75,18 +76,31 @@ static const int nlm_port_min = 0, nlm_port_max = 65535;
 
 static struct ctl_table_header * nlm_sysctl_table;
 
-static unsigned long set_grace_period(void)
+static unsigned long get_lockd_grace_period(void)
 {
-	unsigned long grace_period;
-
 	/* Note: nlm_timeout should always be nonzero */
 	if (nlm_grace_period)
-		grace_period = ((nlm_grace_period + nlm_timeout - 1)
-				/ nlm_timeout) * nlm_timeout * HZ;
+		return roundup(nlm_grace_period, nlm_timeout) * HZ;
 	else
-		grace_period = nlm_timeout * 5 * HZ;
+		return nlm_timeout * 5 * HZ;
+}
+
+unsigned long get_nfs_grace_period(void)
+{
+	unsigned long lockdgrace = get_lockd_grace_period();
+	unsigned long nfsdgrace = 0;
+
+	if (nlmsvc_ops)
+		nfsdgrace = nlmsvc_ops->get_grace_period();
+
+	return max(lockdgrace, nfsdgrace);
+}
+EXPORT_SYMBOL(get_nfs_grace_period);
+
+static unsigned long set_grace_period(void)
+{
 	nlmsvc_grace_period = 1;
-	return grace_period + jiffies;
+	return get_nfs_grace_period() + jiffies;
 }
 
 static inline void clear_grace_period(void)
@@ -119,6 +133,7 @@ lockd(struct svc_rqst *rqstp)
 	complete(&lockd_start_done);
 
 	daemonize("lockd");
+	set_freezable();
 
 	/* Process request with signals blocked, but allow SIGKILL. */
 	allow_signal(SIGKILL);
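roundup() from <linux/kernel.h> rounds its first argument up to the next multiple of the second, which is exactly what the removed arithmetic spelled out long-hand:

#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

/* Example: nlm_grace_period = 13, nlm_timeout = 5:
 *   roundup(13, 5) = ((13 + 4) / 5) * 5 = 15 seconds, then scaled by HZ. */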
diff --git a/fs/mbcache.c b/fs/mbcache.c
index deeb9dc062d9..fbb1d02f8791 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -100,7 +100,6 @@ struct mb_cache {
 static LIST_HEAD(mb_cache_list);
 static LIST_HEAD(mb_cache_lru_list);
 static DEFINE_SPINLOCK(mb_cache_spinlock);
-static struct shrinker *mb_shrinker;
 
 static inline int
 mb_cache_indexes(struct mb_cache *cache)
@@ -118,6 +117,10 @@ mb_cache_indexes(struct mb_cache *cache)
 
 static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
 
+static struct shrinker mb_cache_shrinker = {
+	.shrink = mb_cache_shrink_fn,
+	.seeks = DEFAULT_SEEKS,
+};
 
 static inline int
 __mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
@@ -662,13 +665,13 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
 
 static int __init init_mbcache(void)
 {
-	mb_shrinker = set_shrinker(DEFAULT_SEEKS, mb_cache_shrink_fn);
+	register_shrinker(&mb_cache_shrinker);
 	return 0;
 }
 
 static void __exit exit_mbcache(void)
 {
-	remove_shrinker(mb_shrinker);
+	unregister_shrinker(&mb_cache_shrinker);
 }
 
 module_init(init_mbcache)
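The register_shrinker() conversion moves the struct shrinker into the caller, so registration no longer performs a hidden allocation that can fail. The shape of the new API, with illustrative names:

static int example_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	/* free up to nr_to_scan cached objects; return how many remain */
	return 0;
}

static struct shrinker example_shrinker = {
	.shrink	= example_shrink,
	.seeks	= DEFAULT_SEEKS,
};

/* register_shrinker(&example_shrinker) at module init pairs with
 * unregister_shrinker(&example_shrinker) at exit; neither allocates. */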
diff --git a/fs/namespace.c b/fs/namespace.c
index b696e3a0d18f..4198003d7e18 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -28,6 +28,7 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include "pnode.h"
+#include "internal.h"
 
 /* spinlock for vfsmount related operations, inplace of dcache_lock */
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
@@ -320,22 +321,16 @@ EXPORT_SYMBOL(mnt_unpin);
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct mnt_namespace *n = m->private;
-	struct list_head *p;
-	loff_t l = *pos;
 
 	down_read(&namespace_sem);
-	list_for_each(p, &n->list)
-		if (!l--)
-			return list_entry(p, struct vfsmount, mnt_list);
-	return NULL;
+	return seq_list_start(&n->list, *pos);
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct mnt_namespace *n = m->private;
-	struct list_head *p = ((struct vfsmount *)v)->mnt_list.next;
-	(*pos)++;
-	return p == &n->list ? NULL : list_entry(p, struct vfsmount, mnt_list);
+
+	return seq_list_next(v, &n->list, pos);
 }
 
 static void m_stop(struct seq_file *m, void *v)
@@ -350,7 +345,7 @@ static inline void mangle(struct seq_file *m, const char *s)
 
 static int show_vfsmnt(struct seq_file *m, void *v)
 {
-	struct vfsmount *mnt = v;
+	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
 	int err = 0;
 	static struct proc_fs_info {
 		int flag;
@@ -405,7 +400,7 @@ struct seq_operations mounts_op = {
 
 static int show_vfsstat(struct seq_file *m, void *v)
 {
-	struct vfsmount *mnt = v;
+	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
 	int err = 0;
 
 	/* device */
@@ -1457,7 +1452,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 
 	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
 	if (!new_ns)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	atomic_set(&new_ns->count, 1);
 	INIT_LIST_HEAD(&new_ns->list);
@@ -1471,7 +1466,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	if (!new_ns->root) {
 		up_write(&namespace_sem);
 		kfree(new_ns);
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	}
 	spin_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
@@ -1515,7 +1510,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	return new_ns;
 }
 
-struct mnt_namespace *copy_mnt_ns(int flags, struct mnt_namespace *ns,
+struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 		struct fs_struct *new_fs)
 {
 	struct mnt_namespace *new_ns;
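The seq_list_* helpers replace the open-coded list cursors removed above. Simplified versions of the fs/seq_file.c implementations, for reference:

struct list_head *seq_list_start(struct list_head *head, loff_t pos)
{
	struct list_head *lh;

	list_for_each(lh, head)
		if (pos-- == 0)
			return lh;	/* the pos'th entry */
	return NULL;			/* ran off the end of the list */
}

struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
{
	struct list_head *lh = ((struct list_head *)v)->next;

	++*ppos;
	return lh == head ? NULL : lh;	/* NULL terminates the walk */
}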
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index d3152f8d95c6..2b145de45b39 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -203,7 +203,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 
 	if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
 		if (pos >= MAX_NON_LFS) {
-			send_sig(SIGXFSZ, current, 0);
 			return -EFBIG;
 		}
 		if (count > MAX_NON_LFS - (u32)pos) {
@@ -212,7 +211,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 	}
 	if (pos >= inode->i_sb->s_maxbytes) {
 		if (count || pos > inode->i_sb->s_maxbytes) {
-			send_sig(SIGXFSZ, current, 0);
 			return -EFBIG;
 		}
 	}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 75f309c8741a..a796be5051bf 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -14,6 +14,7 @@
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfs_fs.h>
 #include <linux/mutex.h>
+#include <linux/freezer.h>
 
 #include <net/inet_sock.h>
 
@@ -67,6 +68,7 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
 	daemonize("nfsv4-svc");
 	/* Process request with signals blocked, but allow SIGKILL. */
 	allow_signal(SIGKILL);
+	set_freezable();
 
 	complete(&nfs_callback_info.started);
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ccb455053ee4..a49f9feff776 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1206,23 +1206,9 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
  */
 static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
 {
-	struct list_head *_p;
-	loff_t pos = *_pos;
-
 	/* lock the list against modification */
 	spin_lock(&nfs_client_lock);
-
-	/* allow for the header line */
-	if (!pos)
-		return SEQ_START_TOKEN;
-	pos--;
-
-	/* find the n'th element in the list */
-	list_for_each(_p, &nfs_client_list)
-		if (!pos--)
-			break;
-
-	return _p != &nfs_client_list ? _p : NULL;
+	return seq_list_start_head(&nfs_client_list, *_pos);
 }
 
 /*
@@ -1230,14 +1216,7 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
  */
 static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
 {
-	struct list_head *_p;
-
-	(*pos)++;
-
-	_p = v;
-	_p = (v == SEQ_START_TOKEN) ? nfs_client_list.next : _p->next;
-
-	return _p != &nfs_client_list ? _p : NULL;
+	return seq_list_next(v, &nfs_client_list, pos);
 }
 
 /*
@@ -1256,7 +1235,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
 	struct nfs_client *clp;
 
 	/* display header on line 1 */
-	if (v == SEQ_START_TOKEN) {
+	if (v == &nfs_client_list) {
 		seq_puts(m, "NV SERVER PORT USE HOSTNAME\n");
 		return 0;
 	}
@@ -1297,23 +1276,9 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
  */
 static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
 {
-	struct list_head *_p;
-	loff_t pos = *_pos;
-
 	/* lock the list against modification */
 	spin_lock(&nfs_client_lock);
-
-	/* allow for the header line */
-	if (!pos)
-		return SEQ_START_TOKEN;
-	pos--;
-
-	/* find the n'th element in the list */
-	list_for_each(_p, &nfs_volume_list)
-		if (!pos--)
-			break;
-
-	return _p != &nfs_volume_list ? _p : NULL;
+	return seq_list_start_head(&nfs_volume_list, *_pos);
 }
 
 /*
@@ -1321,14 +1286,7 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
  */
 static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
 {
-	struct list_head *_p;
-
-	(*pos)++;
-
-	_p = v;
-	_p = (v == SEQ_START_TOKEN) ? nfs_volume_list.next : _p->next;
-
-	return _p != &nfs_volume_list ? _p : NULL;
+	return seq_list_next(v, &nfs_volume_list, pos);
 }
 
 /*
@@ -1349,7 +1307,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 	char dev[8], fsid[17];
 
 	/* display header on line 1 */
-	if (v == SEQ_START_TOKEN) {
+	if (v == &nfs_volume_list) {
 		seq_puts(m, "NV SERVER PORT DEV FSID\n");
 		return 0;
 	}
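seq_list_start_head() differs from seq_list_start() only in returning the list head itself at position 0; ->show() then treats v == &list as the header token, which is why SEQ_START_TOKEN disappears above. Simplified:

struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
{
	if (!pos)
		return head;		/* position 0: caller prints the header */
	return seq_list_start(head, pos - 1);
}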
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a2b1af89ca1a..adffe1615c51 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -300,7 +300,10 @@ static const struct super_operations nfs4_sops = {
 };
 #endif
 
-static struct shrinker *acl_shrinker;
+static struct shrinker acl_shrinker = {
+	.shrink = nfs_access_cache_shrinker,
+	.seeks = DEFAULT_SEEKS,
+};
 
 /*
  * Register the NFS filesystems
@@ -321,7 +324,7 @@ int __init register_nfs_fs(void)
 	if (ret < 0)
 		goto error_2;
 #endif
-	acl_shrinker = set_shrinker(DEFAULT_SEEKS, nfs_access_cache_shrinker);
+	register_shrinker(&acl_shrinker);
 	return 0;
 
 #ifdef CONFIG_NFS_V4
@@ -339,8 +342,7 @@ error_0:
  */
 void __exit unregister_nfs_fs(void)
 {
-	if (acl_shrinker != NULL)
-		remove_shrinker(acl_shrinker);
+	unregister_shrinker(&acl_shrinker);
 #ifdef CONFIG_NFS_V4
 	unregister_filesystem(&nfs4_fs_type);
 	nfs_unregister_sysctl();
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 6e92b0fe5323..cf61dc8ae942 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -12,17 +12,31 @@
 
 #define	CAP_NFSD_MASK (CAP_FS_MASK|CAP_TO_MASK(CAP_SYS_RESOURCE))
 
+static int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
+{
+	struct exp_flavor_info *f;
+	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+
+	for (f = exp->ex_flavors; f < end; f++) {
+		if (f->pseudoflavor == rqstp->rq_flavor)
+			return f->flags;
+	}
+	return exp->ex_flags;
+
+}
+
 int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 {
 	struct svc_cred	cred = rqstp->rq_cred;
 	int i;
+	int flags = nfsexp_flags(rqstp, exp);
 	int ret;
 
-	if (exp->ex_flags & NFSEXP_ALLSQUASH) {
+	if (flags & NFSEXP_ALLSQUASH) {
 		cred.cr_uid = exp->ex_anon_uid;
 		cred.cr_gid = exp->ex_anon_gid;
 		cred.cr_group_info = groups_alloc(0);
-	} else if (exp->ex_flags & NFSEXP_ROOTSQUASH) {
+	} else if (flags & NFSEXP_ROOTSQUASH) {
 		struct group_info *gi;
 		if (!cred.cr_uid)
 			cred.cr_uid = exp->ex_anon_uid;
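With secinfo entries attached to an export, squashing can now vary by authentication flavor. Illustrative only — the flavor/flags pairing below is an assumed example, not taken from this patch:

/* Suppose the export carries
 *	ex_flavors[0] = { .pseudoflavor = RPC_AUTH_GSS_KRB5, .flags = 0 }
 * while ex_flags has NFSEXP_ROOTSQUASH set.  A krb5 request matches the
 * secinfo entry and escapes root-squashing; an AUTH_UNIX request falls
 * through to exp->ex_flags and is squashed as before. */
int flags = nfsexp_flags(rqstp, exp);

if ((flags & NFSEXP_ROOTSQUASH) && !cred.cr_uid)
	cred.cr_uid = exp->ex_anon_uid;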
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 79bd03b8bbf8..c7bbf460b009 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -26,12 +26,15 @@
 #include <linux/mount.h>
 #include <linux/hash.h>
 #include <linux/module.h>
+#include <linux/exportfs.h>
 
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/nfsfh.h>
 #include <linux/nfsd/syscall.h>
 #include <linux/lockd/bind.h>
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/gss_api.h>
 
 #define NFSDDBG_FACILITY	NFSDDBG_EXPORT
 
@@ -451,8 +454,48 @@ out_free_all:
 	return err;
 }
 
+static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp)
+{
+	int listsize, err;
+	struct exp_flavor_info *f;
+
+	err = get_int(mesg, &listsize);
+	if (err)
+		return err;
+	if (listsize < 0 || listsize > MAX_SECINFO_LIST)
+		return -EINVAL;
+
+	for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) {
+		err = get_int(mesg, &f->pseudoflavor);
+		if (err)
+			return err;
+		/*
+		 * Just a quick sanity check; we could also try to check
+		 * whether this pseudoflavor is supported, but at worst
+		 * an unsupported pseudoflavor on the export would just
+		 * be a pseudoflavor that won't match the flavor of any
+		 * authenticated request.  The administrator will
+		 * probably discover the problem when someone fails to
+		 * authenticate.
+		 */
+		if (f->pseudoflavor < 0)
+			return -EINVAL;
+		err = get_int(mesg, &f->flags);
+		if (err)
+			return err;
+		/* Only some flags are allowed to differ between flavors: */
+		if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags))
+			return -EINVAL;
+	}
+	exp->ex_nflavors = listsize;
+	return 0;
+}
+
 #else /* CONFIG_NFSD_V4 */
-static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { return 0; }
+static inline int
+fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;}
+static inline int
+secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; }
 #endif
 
 static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
@@ -476,6 +519,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	exp.ex_uuid = NULL;
 
+	/* secinfo */
+	exp.ex_nflavors = 0;
+
 	if (mesg[mlen-1] != '\n')
 		return -EINVAL;
 	mesg[mlen-1] = 0;
@@ -553,7 +599,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 				if (exp.ex_uuid == NULL)
 					err = -ENOMEM;
 			}
-		} else
+		} else if (strcmp(buf, "secinfo") == 0)
+			err = secinfo_parse(&mesg, buf, &exp);
+		else
 			/* quietly ignore unknown words and anything
 			 * following. Newer user-space can try to set
 			 * new values, then see what the result was.
@@ -593,6 +641,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 static void exp_flags(struct seq_file *m, int flag, int fsid,
 		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs);
+static void show_secinfo(struct seq_file *m, struct svc_export *exp);
 
 static int svc_export_show(struct seq_file *m,
 			   struct cache_detail *cd,
@@ -622,6 +671,7 @@ static int svc_export_show(struct seq_file *m,
 				seq_printf(m, "%02x", exp->ex_uuid[i]);
 			}
 		}
+		show_secinfo(m, exp);
 	}
 	seq_puts(m, ")\n");
 	return 0;
@@ -654,6 +704,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 {
 	struct svc_export *new = container_of(cnew, struct svc_export, h);
 	struct svc_export *item = container_of(citem, struct svc_export, h);
+	int i;
 
 	new->ex_flags = item->ex_flags;
 	new->ex_anon_uid = item->ex_anon_uid;
@@ -669,6 +720,10 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	item->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = item->ex_fslocs.migrated;
 	item->ex_fslocs.migrated = 0;
+	new->ex_nflavors = item->ex_nflavors;
+	for (i = 0; i < MAX_SECINFO_LIST; i++) {
+		new->ex_flavors[i] = item->ex_flavors[i];
+	}
 }
 
 static struct cache_head *svc_export_alloc(void)
@@ -738,16 +793,18 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
 	int err;
 
 	if (!clp)
-		return NULL;
+		return ERR_PTR(-ENOENT);
 
 	key.ek_client = clp;
 	key.ek_fsidtype = fsid_type;
 	memcpy(key.ek_fsid, fsidv, key_len(fsid_type));
 
 	ek = svc_expkey_lookup(&key);
-	if (ek != NULL)
-		if ((err = cache_check(&svc_expkey_cache, &ek->h, reqp)))
-			ek = ERR_PTR(err);
+	if (ek == NULL)
+		return ERR_PTR(-ENOMEM);
+	err = cache_check(&svc_expkey_cache, &ek->h, reqp);
+	if (err)
+		return ERR_PTR(err);
 	return ek;
 }
 
@@ -808,30 +865,21 @@ exp_get_by_name(svc_client *clp, struct vfsmount *mnt, struct dentry *dentry,
 		struct cache_req *reqp)
 {
 	struct svc_export *exp, key;
+	int err;
 
 	if (!clp)
-		return NULL;
+		return ERR_PTR(-ENOENT);
 
 	key.ex_client = clp;
 	key.ex_mnt = mnt;
 	key.ex_dentry = dentry;
 
 	exp = svc_export_lookup(&key);
-	if (exp != NULL) {
-		int err;
-
-		err = cache_check(&svc_export_cache, &exp->h, reqp);
-		switch (err) {
-		case 0: break;
-		case -EAGAIN:
-		case -ETIMEDOUT:
-			exp = ERR_PTR(err);
-			break;
-		default:
-			exp = NULL;
-		}
-	}
-
+	if (exp == NULL)
+		return ERR_PTR(-ENOMEM);
+	err = cache_check(&svc_export_cache, &exp->h, reqp);
+	if (err)
+		return ERR_PTR(err);
 	return exp;
 }
 
@@ -847,7 +895,7 @@ exp_parent(svc_client *clp, struct vfsmount *mnt, struct dentry *dentry,
 	dget(dentry);
 	exp = exp_get_by_name(clp, mnt, dentry, reqp);
 
-	while (exp == NULL && !IS_ROOT(dentry)) {
+	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
 		struct dentry *parent;
 
 		parent = dget_parent(dentry);
@@ -900,7 +948,7 @@ static void exp_fsid_unhash(struct svc_export *exp)
 		return;
 
 	ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
-	if (ek && !IS_ERR(ek)) {
+	if (!IS_ERR(ek)) {
 		ek->h.expiry_time = get_seconds()-1;
 		cache_put(&ek->h, &svc_expkey_cache);
 	}
@@ -938,7 +986,7 @@ static void exp_unhash(struct svc_export *exp)
 	struct inode *inode = exp->ex_dentry->d_inode;
 
 	ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
-	if (ek && !IS_ERR(ek)) {
+	if (!IS_ERR(ek)) {
 		ek->h.expiry_time = get_seconds()-1;
 		cache_put(&ek->h, &svc_expkey_cache);
 	}
@@ -989,13 +1037,12 @@ exp_export(struct nfsctl_export *nxp)
 
 	/* must make sure there won't be an ex_fsid clash */
 	if ((nxp->ex_flags & NFSEXP_FSID) &&
-	    (fsid_key = exp_get_fsid_key(clp, nxp->ex_dev)) &&
-	    !IS_ERR(fsid_key) &&
+	    (!IS_ERR(fsid_key = exp_get_fsid_key(clp, nxp->ex_dev))) &&
 	    fsid_key->ek_mnt &&
 	    (fsid_key->ek_mnt != nd.mnt || fsid_key->ek_dentry != nd.dentry) )
 		goto finish;
 
-	if (exp) {
+	if (!IS_ERR(exp)) {
 		/* just a flags/id/fsid update */
 
 		exp_fsid_unhash(exp);
@@ -1104,7 +1151,7 @@ exp_unexport(struct nfsctl_export *nxp)
 	err = -EINVAL;
 	exp = exp_get_by_name(dom, nd.mnt, nd.dentry, NULL);
 	path_release(&nd);
-	if (!exp)
+	if (IS_ERR(exp))
 		goto out_domain;
 
 	exp_do_unexport(exp);
@@ -1149,10 +1196,6 @@ exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize)
 		err = PTR_ERR(exp);
 		goto out;
 	}
-	if (!exp) {
-		dprintk("nfsd: exp_rootfh export not found.\n");
-		goto out;
-	}
 
 	/*
 	 * fh must be initialized before calling fh_compose
@@ -1176,17 +1219,130 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
 {
 	struct svc_export *exp;
 	struct svc_expkey *ek = exp_find_key(clp, fsid_type, fsidv, reqp);
-	if (!ek || IS_ERR(ek))
+	if (IS_ERR(ek))
 		return ERR_PTR(PTR_ERR(ek));
 
 	exp = exp_get_by_name(clp, ek->ek_mnt, ek->ek_dentry, reqp);
 	cache_put(&ek->h, &svc_expkey_cache);
 
-	if (!exp || IS_ERR(exp))
+	if (IS_ERR(exp))
 		return ERR_PTR(PTR_ERR(exp));
 	return exp;
 }
 
+__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
+{
+	struct exp_flavor_info *f;
+	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+
+	/* legacy gss-only clients are always OK: */
+	if (exp->ex_client == rqstp->rq_gssclient)
+		return 0;
+	/* ip-address based client; check sec= export option: */
+	for (f = exp->ex_flavors; f < end; f++) {
+		if (f->pseudoflavor == rqstp->rq_flavor)
+			return 0;
+	}
+	/* defaults in absence of sec= options: */
+	if (exp->ex_nflavors == 0) {
+		if (rqstp->rq_flavor == RPC_AUTH_NULL ||
+		    rqstp->rq_flavor == RPC_AUTH_UNIX)
+			return 0;
+	}
+	return nfserr_wrongsec;
+}
+
+/*
+ * Uses rq_client and rq_gssclient to find an export; uses rq_client (an
+ * auth_unix client) if it's available and has secinfo information;
+ * otherwise, will try to use rq_gssclient.
+ *
+ * Called from functions that handle requests; functions that do work on
+ * behalf of mountd are passed a single client name to use, and should
+ * use exp_get_by_name() or exp_find().
+ */
+struct svc_export *
+rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
+		struct dentry *dentry)
+{
+	struct svc_export *gssexp, *exp = NULL;
+
+	if (rqstp->rq_client == NULL)
+		goto gss;
+
+	/* First try the auth_unix client: */
+	exp = exp_get_by_name(rqstp->rq_client, mnt, dentry,
+						&rqstp->rq_chandle);
+	if (PTR_ERR(exp) == -ENOENT)
+		goto gss;
+	if (IS_ERR(exp))
+		return exp;
+	/* If it has secinfo, assume there are no gss/... clients */
+	if (exp->ex_nflavors > 0)
+		return exp;
+gss:
+	/* Otherwise, try falling back on gss client */
+	if (rqstp->rq_gssclient == NULL)
+		return exp;
+	gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry,
+						&rqstp->rq_chandle);
+	if (PTR_ERR(gssexp) == -ENOENT)
+		return exp;
+	if (exp && !IS_ERR(exp))
+		exp_put(exp);
+	return gssexp;
+}
+
+struct svc_export *
+rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv)
+{
+	struct svc_export *gssexp, *exp = NULL;
+
+	if (rqstp->rq_client == NULL)
+		goto gss;
+
+	/* First try the auth_unix client: */
+	exp = exp_find(rqstp->rq_client, fsid_type, fsidv, &rqstp->rq_chandle);
+	if (PTR_ERR(exp) == -ENOENT)
+		goto gss;
+	if (IS_ERR(exp))
+		return exp;
+	/* If it has secinfo, assume there are no gss/... clients */
+	if (exp->ex_nflavors > 0)
+		return exp;
+gss:
+	/* Otherwise, try falling back on gss client */
+	if (rqstp->rq_gssclient == NULL)
+		return exp;
+	gssexp = exp_find(rqstp->rq_gssclient, fsid_type, fsidv,
+						&rqstp->rq_chandle);
+	if (PTR_ERR(gssexp) == -ENOENT)
+		return exp;
+	if (exp && !IS_ERR(exp))
+		exp_put(exp);
+	return gssexp;
+}
+
+struct svc_export *
+rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt,
+		struct dentry *dentry)
+{
+	struct svc_export *exp;
+
+	dget(dentry);
+	exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
+
+	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
+		struct dentry *parent;
+
+		parent = dget_parent(dentry);
+		dput(dentry);
+		dentry = parent;
+		exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
+	}
+	dput(dentry);
+	return exp;
+}
 
 /*
  * Called when we need the filehandle for the root of the pseudofs,
@@ -1194,8 +1350,7 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
 * export point with fsid==0
 */
 __be32
-exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
-	       struct cache_req *creq)
+exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
 {
 	struct svc_export *exp;
 	__be32 rv;
@@ -1203,12 +1358,16 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
 
 	mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
 
-	exp = exp_find(clp, FSID_NUM, fsidv, creq);
+	exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
+	if (PTR_ERR(exp) == -ENOENT)
+		return nfserr_perm;
 	if (IS_ERR(exp))
 		return nfserrno(PTR_ERR(exp));
-	if (exp == NULL)
-		return nfserr_perm;
 	rv = fh_compose(fhp, exp, exp->ex_dentry, NULL);
+	if (rv)
+		goto out;
+	rv = check_nfsd_access(exp, rqstp);
+out:
 	exp_put(exp);
 	return rv;
 }
@@ -1296,28 +1455,62 @@ static struct flags {
 	{ 0, {"", ""}}
 };
 
-static void exp_flags(struct seq_file *m, int flag, int fsid,
-		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc)
+static void show_expflags(struct seq_file *m, int flags, int mask)
 {
-	int first = 0;
 	struct flags *flg;
+	int state, first = 0;
 
 	for (flg = expflags; flg->flag; flg++) {
-		int state = (flg->flag & flag)?0:1;
+		if (flg->flag & ~mask)
+			continue;
+		state = (flg->flag & flags) ? 0 : 1;
 		if (*flg->name[state])
 			seq_printf(m, "%s%s", first++?",":"", flg->name[state]);
 	}
+}
+
+static void show_secinfo_flags(struct seq_file *m, int flags)
+{
+	seq_printf(m, ",");
+	show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
+}
+
+static void show_secinfo(struct seq_file *m, struct svc_export *exp)
+{
+	struct exp_flavor_info *f;
+	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+	int lastflags = 0, first = 0;
+
+	if (exp->ex_nflavors == 0)
+		return;
+	for (f = exp->ex_flavors; f < end; f++) {
+		if (first || f->flags != lastflags) {
+			if (!first)
+				show_secinfo_flags(m, lastflags);
+			seq_printf(m, ",sec=%d", f->pseudoflavor);
+			lastflags = f->flags;
+		} else {
+			seq_printf(m, ":%d", f->pseudoflavor);
+		}
+	}
+	show_secinfo_flags(m, lastflags);
+}
+
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc)
+{
+	show_expflags(m, flag, NFSEXP_ALLFLAGS);
 	if (flag & NFSEXP_FSID)
-		seq_printf(m, "%sfsid=%d", first++?",":"", fsid);
+		seq_printf(m, ",fsid=%d", fsid);
 	if (anonu != (uid_t)-2 && anonu != (0x10000-2))
-		seq_printf(m, "%sanonuid=%d", first++?",":"", anonu);
+		seq_printf(m, ",anonuid=%d", anonu);
 	if (anong != (gid_t)-2 && anong != (0x10000-2))
-		seq_printf(m, "%sanongid=%d", first++?",":"", anong);
+		seq_printf(m, ",anongid=%d", anong);
 	if (fsloc && fsloc->locations_count > 0) {
 		char *loctype = (fsloc->migrated) ? "refer" : "replicas";
 		int i;
 
-		seq_printf(m, "%s%s=", first++?",":"", loctype);
+		seq_printf(m, ",%s=", loctype);
 		seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\");
 		seq_putc(m, '@');
 		seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\");
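Much of this file's churn is one convention change: export lookups no longer return NULL for "not found"; every failure, including -ENOENT, travels in the pointer itself. The <linux/err.h> idiom, as a minimal sketch:

#include <linux/err.h>

static long demo(struct svc_export *exp)	/* hypothetical helper */
{
	if (IS_ERR(exp))		/* true for any ERR_PTR(-errno) value */
		return PTR_ERR(exp);	/* recover the errno, e.g. -ENOENT */
	return 0;			/* a real pointer: success */
}

Callers that used to test for NULL now test PTR_ERR(exp) == -ENOENT, which is exactly what the exp_parent() and rqst_exp_parent() loops above do.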
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 221acd1f11f6..9e4a568a5013 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -65,6 +65,7 @@ nlm_fclose(struct file *filp)
 static struct nlmsvc_binding	nfsd_nlm_ops = {
 	.fopen		= nlm_fopen,		/* open file for locking */
 	.fclose		= nlm_fclose,		/* close file */
+	.get_grace_period = get_nfs4_grace_period,
 };
 
 void
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index cc3b7badd486..b6ed38380ab8 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -183,8 +183,13 @@ static void
 summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas)
 {
 	struct posix_acl_entry *pa, *pe;
-	pas->users = 0;
-	pas->groups = 0;
+
+	/*
+	 * Only pas.users and pas.groups need initialization; previous
+	 * posix_acl_valid() calls ensure that the other fields will be
+	 * initialized in the following loop.  But, just to placate gcc:
+	 */
+	memset(pas, 0, sizeof(*pas));
 	pas->mask = 07;
 
 	pe = acl->a_entries + acl->a_count;
@@ -732,13 +737,16 @@ int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
 	*pacl = posix_state_to_acl(&effective_acl_state, flags);
 	if (IS_ERR(*pacl)) {
 		ret = PTR_ERR(*pacl);
+		*pacl = NULL;
 		goto out_dstate;
 	}
 	*dpacl = posix_state_to_acl(&default_acl_state,
 				flags | NFS4_ACL_TYPE_DEFAULT);
 	if (IS_ERR(*dpacl)) {
 		ret = PTR_ERR(*dpacl);
+		*dpacl = NULL;
 		posix_acl_release(*pacl);
+		*pacl = NULL;
 		goto out_dstate;
 	}
 	sort_pacl(*pacl);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 5443c52b57aa..31d6633c7fe4 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -75,7 +75,7 @@ enum nfs_cb_opnum4 {
 #define op_enc_sz			1
 #define op_dec_sz			2
 #define enc_nfs4_fh_sz			(1 + (NFS4_FHSIZE >> 2))
-#define enc_stateid_sz			16
+#define enc_stateid_sz			(NFS4_STATEID_SIZE >> 2)
 #define NFS4_enc_cb_recall_sz		(cb_compound_enc_hdr_sz + \
 					1 + enc_stateid_sz + \
 					enc_nfs4_fh_sz)
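(These enc_*_sz constants count 32-bit XDR words — note enc_nfs4_fh_sz just above. NFS4_STATEID_SIZE is 16 bytes, so the new expression evaluates to 4 words; the old literal 16 appears to have over-reserved the stateid by a factor of four.)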
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 45aa21ce6784..2cf9a9a2d89c 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -587,6 +587,15 @@ idmap_lookup(struct svc_rqst *rqstp,
 	return ret;
 }
 
+static char *
+rqst_authname(struct svc_rqst *rqstp)
+{
+	struct auth_domain *clp;
+
+	clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client;
+	return clp->name;
+}
+
 static int
 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
 		uid_t *id)
@@ -600,7 +609,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
 		return -EINVAL;
 	memcpy(key.name, name, namelen);
 	key.name[namelen] = '\0';
-	strlcpy(key.authname, rqstp->rq_client->name, sizeof(key.authname));
+	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
 	ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
 	if (ret == -ENOENT)
 		ret = -ESRCH; /* nfserr_badname */
@@ -620,7 +629,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
 	};
 	int ret;
 
-	strlcpy(key.authname, rqstp->rq_client->name, sizeof(key.authname));
+	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
 	ret = idmap_lookup(rqstp, idtoname_lookup, &key, &idtoname_cache, &item);
 	if (ret == -ENOENT)
 		return sprintf(name, "%u", id);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8522729830db..3c627128e205 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -47,6 +47,7 @@
 #include <linux/nfsd/state.h>
 #include <linux/nfsd/xdr4.h>
 #include <linux/nfs4_acl.h>
+#include <linux/sunrpc/gss_api.h>
 
 #define NFSDDBG_FACILITY		NFSDDBG_PROC
 
@@ -286,8 +287,7 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	__be32 status;
 
 	fh_put(&cstate->current_fh);
-	status = exp_pseudoroot(rqstp->rq_client, &cstate->current_fh,
-			      &rqstp->rq_chandle);
+	status = exp_pseudoroot(rqstp, &cstate->current_fh);
 	return status;
 }
 
@@ -474,8 +474,8 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	__be32 ret;
 
 	fh_init(&tmp_fh, NFS4_FHSIZE);
-	if((ret = exp_pseudoroot(rqstp->rq_client, &tmp_fh,
-			      &rqstp->rq_chandle)) != 0)
+	ret = exp_pseudoroot(rqstp, &tmp_fh);
+	if (ret)
 		return ret;
 	if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) {
 		fh_put(&tmp_fh);
@@ -611,6 +611,30 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 }
 
 static __be32
+nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      struct nfsd4_secinfo *secinfo)
+{
+	struct svc_fh resfh;
+	struct svc_export *exp;
+	struct dentry *dentry;
+	__be32 err;
+
+	fh_init(&resfh, NFS4_FHSIZE);
+	err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
+				    secinfo->si_name, secinfo->si_namelen,
+				    &exp, &dentry);
+	if (err)
+		return err;
+	if (dentry->d_inode == NULL) {
+		exp_put(exp);
+		err = nfserr_noent;
+	} else
+		secinfo->si_exp = exp;
+	dput(dentry);
+	return err;
+}
+
+static __be32
 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	      struct nfsd4_setattr *setattr)
 {
@@ -1009,6 +1033,9 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
 	[OP_SAVEFH] = {
 		.op_func = (nfsd4op_func)nfsd4_savefh,
 	},
+	[OP_SECINFO] = {
+		.op_func = (nfsd4op_func)nfsd4_secinfo,
+	},
 	[OP_SETATTR] = {
 		.op_func = (nfsd4op_func)nfsd4_setattr,
 	},
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8c52913d7cb6..e4a4c87ec8c6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,8 +49,10 @@
 #include <linux/nfsd/state.h>
 #include <linux/nfsd/xdr4.h>
 #include <linux/namei.h>
+#include <linux/swap.h>
 #include <linux/mutex.h>
 #include <linux/lockd/bind.h>
+#include <linux/module.h>
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -149,6 +151,7 @@ get_nfs4_file(struct nfs4_file *fi)
 }
 
 static int num_delegations;
+unsigned int max_delegations;
 
 /*
  * Open owner state (share locks)
@@ -192,7 +195,9 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback;
 
 	dprintk("NFSD alloc_init_deleg\n");
-	if (num_delegations > STATEID_HASH_SIZE * 4)
+	if (fp->fi_had_conflict)
+		return NULL;
+	if (num_delegations > max_delegations)
 		return NULL;
 	dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL);
 	if (dp == NULL)
@@ -999,6 +1004,7 @@ alloc_init_file(struct inode *ino)
 		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
+		fp->fi_had_conflict = false;
 		return fp;
 	}
 	return NULL;
@@ -1325,6 +1331,7 @@ do_recall(void *__dp)
 {
 	struct nfs4_delegation *dp = __dp;
 
+	dp->dl_file->fi_had_conflict = true;
 	nfsd4_cb_recall(dp);
 	return 0;
 }
@@ -3190,20 +3197,49 @@ nfsd4_load_reboot_recovery_data(void)
 		printk("NFSD: Failure reading reboot recovery data\n");
 }
 
+unsigned long
+get_nfs4_grace_period(void)
+{
+	return max(user_lease_time, lease_time) * HZ;
+}
+
+/*
+ * Since the lifetime of a delegation isn't limited to that of an open, a
+ * client may quite reasonably hang on to a delegation as long as it has
+ * the inode cached.  This becomes an obvious problem the first time a
+ * client's inode cache approaches the size of the server's total memory.
+ *
+ * For now we avoid this problem by imposing a hard limit on the number
+ * of delegations, which varies according to the server's memory size.
+ */
+static void
+set_max_delegations(void)
+{
+	/*
+	 * Allow at most 4 delegations per megabyte of RAM.  Quick
+	 * estimates suggest that in the worst case (where every delegation
+	 * is for a different inode), a delegation could take about 1.5K,
+	 * giving a worst case usage of about 6% of memory.
+	 */
+	max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT);
+}
+
 /* initialization to perform when the nfsd service is started: */
 
 static void
 __nfs4_state_start(void)
 {
-	time_t grace_time;
+	unsigned long grace_time;
 
 	boot_time = get_seconds();
-	grace_time = max(user_lease_time, lease_time);
+	grace_time = get_nfs_grace_period();
 	lease_time = user_lease_time;
 	in_grace = 1;
-	printk("NFSD: starting %ld-second grace period\n", grace_time);
+	printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
+	       grace_time/HZ);
 	laundry_wq = create_singlethread_workqueue("nfsd4");
-	queue_delayed_work(laundry_wq, &laundromat_work, grace_time*HZ);
+	queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
+	set_max_delegations();
 }
 
 int
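The shift in set_max_delegations() is pure unit conversion: nr_free_buffer_pages() counts pages of 2^PAGE_SHIFT bytes, dividing by 2^20 converts to megabytes, and subtracting 2 from the shift multiplies by 4. With 4 KiB pages (PAGE_SHIFT = 12) the shift is 20 - 2 - 12 = 6, so 1 GiB of page cache (262144 pages) allows 262144 >> 6 = 4096 delegations — the advertised 4 per megabyte.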
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15809dfd88a5..b3d55c6747fd 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -56,6 +56,8 @@
56#include <linux/nfsd_idmap.h> 56#include <linux/nfsd_idmap.h>
57#include <linux/nfs4.h> 57#include <linux/nfs4.h>
58#include <linux/nfs4_acl.h> 58#include <linux/nfs4_acl.h>
59#include <linux/sunrpc/gss_api.h>
60#include <linux/sunrpc/svcauth_gss.h>
59 61
60#define NFSDDBG_FACILITY NFSDDBG_XDR 62#define NFSDDBG_FACILITY NFSDDBG_XDR
61 63
@@ -819,6 +821,23 @@ nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
819} 821}
820 822
821static __be32 823static __be32
824nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
825 struct nfsd4_secinfo *secinfo)
826{
827 DECODE_HEAD;
828
829 READ_BUF(4);
830 READ32(secinfo->si_namelen);
831 READ_BUF(secinfo->si_namelen);
832 SAVEMEM(secinfo->si_name, secinfo->si_namelen);
833 status = check_filename(secinfo->si_name, secinfo->si_namelen,
834 nfserr_noent);
835 if (status)
836 return status;
837 DECODE_TAIL;
838}
839
840static __be32
822nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 841nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
823{ 842{
824 DECODE_HEAD; 843 DECODE_HEAD;
@@ -1131,6 +1150,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1131 case OP_SAVEFH: 1150 case OP_SAVEFH:
1132 op->status = nfs_ok; 1151 op->status = nfs_ok;
1133 break; 1152 break;
1153 case OP_SECINFO:
1154 op->status = nfsd4_decode_secinfo(argp, &op->u.secinfo);
1155 break;
1134 case OP_SETATTR: 1156 case OP_SETATTR:
1135 op->status = nfsd4_decode_setattr(argp, &op->u.setattr); 1157 op->status = nfsd4_decode_setattr(argp, &op->u.setattr);
1136 break; 1158 break;
@@ -1296,7 +1318,7 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *
1296 char *path, *rootpath; 1318 char *path, *rootpath;
1297 1319
1298 fh_init(&tmp_fh, NFS4_FHSIZE); 1320 fh_init(&tmp_fh, NFS4_FHSIZE);
1299 *stat = exp_pseudoroot(rqstp->rq_client, &tmp_fh, &rqstp->rq_chandle); 1321 *stat = exp_pseudoroot(rqstp, &tmp_fh);
1300 if (*stat) 1322 if (*stat)
1301 return NULL; 1323 return NULL;
1302 rootpath = tmp_fh.fh_export->ex_path; 1324 rootpath = tmp_fh.fh_export->ex_path;
@@ -1847,11 +1869,19 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
1847 if (d_mountpoint(dentry)) { 1869 if (d_mountpoint(dentry)) {
1848 int err; 1870 int err;
1849 1871
1872 /*
1873 * Why the heck aren't we just using nfsd_lookup??
1874 * Different "."/".." handling? Something else?
1875 * At least, add a comment here to explain....
1876 */
1850 err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp); 1877 err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp);
1851 if (err) { 1878 if (err) {
1852 nfserr = nfserrno(err); 1879 nfserr = nfserrno(err);
1853 goto out_put; 1880 goto out_put;
1854 } 1881 }
1882 nfserr = check_nfsd_access(exp, cd->rd_rqstp);
1883 if (nfserr)
1884 goto out_put;
1855 1885
1856 } 1886 }
1857 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, 1887 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
@@ -2419,6 +2449,72 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2419 } 2449 }
2420} 2450}
2421 2451
2452static void
2453nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, int nfserr,
2454 struct nfsd4_secinfo *secinfo)
2455{
2456 int i = 0;
2457 struct svc_export *exp = secinfo->si_exp;
2458 u32 nflavs;
2459 struct exp_flavor_info *flavs;
2460 struct exp_flavor_info def_flavs[2];
2461 ENCODE_HEAD;
2462
2463 if (nfserr)
2464 goto out;
2465 if (exp->ex_nflavors) {
2466 flavs = exp->ex_flavors;
2467 nflavs = exp->ex_nflavors;
2468 } else { /* Handling of some defaults in absence of real secinfo: */
2469 flavs = def_flavs;
2470 if (exp->ex_client->flavour->flavour == RPC_AUTH_UNIX) {
2471 nflavs = 2;
2472 flavs[0].pseudoflavor = RPC_AUTH_UNIX;
2473 flavs[1].pseudoflavor = RPC_AUTH_NULL;
2474 } else if (exp->ex_client->flavour->flavour == RPC_AUTH_GSS) {
2475 nflavs = 1;
2476 flavs[0].pseudoflavor
2477 = svcauth_gss_flavor(exp->ex_client);
2478 } else {
2479 nflavs = 1;
2480 flavs[0].pseudoflavor
2481 = exp->ex_client->flavour->flavour;
2482 }
2483 }
2484
2485 RESERVE_SPACE(4);
2486 WRITE32(nflavs);
2487 ADJUST_ARGS();
2488 for (i = 0; i < nflavs; i++) {
2489 u32 flav = flavs[i].pseudoflavor;
2490 struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav);
2491
2492 if (gm) {
2493 RESERVE_SPACE(4);
2494 WRITE32(RPC_AUTH_GSS);
2495 ADJUST_ARGS();
2496 RESERVE_SPACE(4 + gm->gm_oid.len);
2497 WRITE32(gm->gm_oid.len);
2498 WRITEMEM(gm->gm_oid.data, gm->gm_oid.len);
2499 ADJUST_ARGS();
2500 RESERVE_SPACE(4);
2501 WRITE32(0); /* qop */
2502 ADJUST_ARGS();
2503 RESERVE_SPACE(4);
2504 WRITE32(gss_pseudoflavor_to_service(gm, flav));
2505 ADJUST_ARGS();
2506 gss_mech_put(gm);
2507 } else {
2508 RESERVE_SPACE(4);
2509 WRITE32(flav);
2510 ADJUST_ARGS();
2511 }
2512 }
2513out:
2514 if (exp)
2515 exp_put(exp);
2516}
2517
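For reference, the wire format produced by nfsd4_encode_secinfo() above is: a 32-bit flavor count, then per flavor either a bare 32-bit pseudoflavor or, for GSS, an RPC_AUTH_GSS tag followed by { mechanism OID, qop, service }. A minimal userspace sketch of that layout follows; put32() and encode_gss_entry() are illustrative names, not kernel interfaces, and XDR's 4-byte padding of the opaque OID is noted but omitted for brevity.

    /*
     * Userspace sketch of one SECINFO reply, illustration only; the
     * kernel builds this with RESERVE_SPACE/WRITE32/WRITEMEM instead.
     */
    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>              /* htonl() */

    static uint8_t *put32(uint8_t *p, uint32_t v)
    {
        uint32_t be = htonl(v);
        memcpy(p, &be, 4);
        return p + 4;
    }

    /* One GSS entry: flavor tag, OID as an opaque<>, qop, service. */
    static uint8_t *encode_gss_entry(uint8_t *p, const uint8_t *oid,
                                     uint32_t oid_len, uint32_t service)
    {
        p = put32(p, 6);                /* RPC_AUTH_GSS */
        p = put32(p, oid_len);
        memcpy(p, oid, oid_len);        /* real XDR pads this to 4 bytes */
        p += oid_len;
        p = put32(p, 0);                /* qop is always zero here */
        return put32(p, service);
    }

    int main(void)
    {
        /* krb5 mechanism OID bytes, as carried in gm_oid by the kernel. */
        uint8_t oid[] = { 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12,
                          0x01, 0x02, 0x02 };
        uint8_t buf[64], *p = buf;

        p = put32(p, 2);                /* nflavs */
        p = encode_gss_entry(p, oid, sizeof(oid), 1);  /* e.g. svc none */
        p = put32(p, 1);                /* plain entry: RPC_AUTH_UNIX */
        return !(p - buf);              /* 33 bytes written */
    }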
2422/* 2518/*
2423 * The SETATTR encode routine is special -- it always encodes a bitmap, 2519 * The SETATTR encode routine is special -- it always encodes a bitmap,
2424 * regardless of the error status. 2520 * regardless of the error status.
@@ -2559,6 +2655,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2559 break; 2655 break;
2560 case OP_SAVEFH: 2656 case OP_SAVEFH:
2561 break; 2657 break;
2658 case OP_SECINFO:
2659 nfsd4_encode_secinfo(resp, op->status, &op->u.secinfo);
2660 break;
2562 case OP_SETATTR: 2661 case OP_SETATTR:
2563 nfsd4_encode_setattr(resp, op->status, &op->u.setattr); 2662 nfsd4_encode_setattr(resp, op->status, &op->u.setattr);
2564 break; 2663 break;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 71c686dc7257..baac89d917ca 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -35,7 +35,6 @@
35#include <linux/nfsd/cache.h> 35#include <linux/nfsd/cache.h>
36#include <linux/nfsd/xdr.h> 36#include <linux/nfsd/xdr.h>
37#include <linux/nfsd/syscall.h> 37#include <linux/nfsd/syscall.h>
38#include <linux/nfsd/interface.h>
39 38
40#include <asm/uaccess.h> 39#include <asm/uaccess.h>
41 40
@@ -245,7 +244,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
245 } 244 }
246 exp_readunlock(); 245 exp_readunlock();
247 if (err == 0) 246 if (err == 0)
248 err = res->fh_size + (int)&((struct knfsd_fh*)0)->fh_base; 247 err = res->fh_size + offsetof(struct knfsd_fh, fh_base);
249 out: 248 out:
250 return err; 249 return err;
251} 250}
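The nfsctl.c hunk above swaps the hand-rolled null-pointer cast for offsetof(). The old expression `(int)&((struct knfsd_fh*)0)->fh_base` is the classic pre-standard idiom: it is formally undefined behavior and truncates the offset to int on 64-bit targets, while offsetof() from <stddef.h> is the well-defined, size_t-wide equivalent. A tiny standalone illustration (the struct is a stand-in, not the real struct knfsd_fh):

    #include <stddef.h>
    #include <stdio.h>

    struct fh_like {                /* stand-in for struct knfsd_fh */
        unsigned int fh_size;
        char fh_base[64];
    };

    int main(void)
    {
        /* Well-defined, and correctly sized on every platform. */
        printf("fh_base starts at byte %zu\n",
               offsetof(struct fh_like, fh_base));
        return 0;
    }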
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 6ca2d24fc216..0eb464a39aae 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -15,10 +15,12 @@
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/dcache.h> 17#include <linux/dcache.h>
18#include <linux/exportfs.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
19 20
20#include <linux/sunrpc/clnt.h> 21#include <linux/sunrpc/clnt.h>
21#include <linux/sunrpc/svc.h> 22#include <linux/sunrpc/svc.h>
23#include <linux/sunrpc/svcauth_gss.h>
22#include <linux/nfsd/nfsd.h> 24#include <linux/nfsd/nfsd.h>
23 25
24#define NFSDDBG_FACILITY NFSDDBG_FH 26#define NFSDDBG_FACILITY NFSDDBG_FH
@@ -27,10 +29,6 @@
27static int nfsd_nr_verified; 29static int nfsd_nr_verified;
28static int nfsd_nr_put; 30static int nfsd_nr_put;
29 31
30extern struct export_operations export_op_default;
31
32#define CALL(ops,fun) ((ops->fun)?(ops->fun):export_op_default.fun)
33
34/* 32/*
35 * our acceptability function. 33 * our acceptability function.
36 * if NOSUBTREECHECK, accept anything 34 * if NOSUBTREECHECK, accept anything
@@ -123,8 +121,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
123 int data_left = fh->fh_size/4; 121 int data_left = fh->fh_size/4;
124 122
125 error = nfserr_stale; 123 error = nfserr_stale;
126 if (rqstp->rq_client == NULL)
127 goto out;
128 if (rqstp->rq_vers > 2) 124 if (rqstp->rq_vers > 2)
129 error = nfserr_badhandle; 125 error = nfserr_badhandle;
130 if (rqstp->rq_vers == 4 && fh->fh_size == 0) 126 if (rqstp->rq_vers == 4 && fh->fh_size == 0)
@@ -148,7 +144,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
148 fh->fh_fsid[1] = fh->fh_fsid[2]; 144 fh->fh_fsid[1] = fh->fh_fsid[2];
149 } 145 }
150 if ((data_left -= len)<0) goto out; 146 if ((data_left -= len)<0) goto out;
151 exp = exp_find(rqstp->rq_client, fh->fh_fsid_type, datap, &rqstp->rq_chandle); 147 exp = rqst_exp_find(rqstp, fh->fh_fsid_type, datap);
152 datap += len; 148 datap += len;
153 } else { 149 } else {
154 dev_t xdev; 150 dev_t xdev;
@@ -159,19 +155,17 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
159 xdev = old_decode_dev(fh->ofh_xdev); 155 xdev = old_decode_dev(fh->ofh_xdev);
160 xino = u32_to_ino_t(fh->ofh_xino); 156 xino = u32_to_ino_t(fh->ofh_xino);
161 mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL); 157 mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL);
162 exp = exp_find(rqstp->rq_client, FSID_DEV, tfh, 158 exp = rqst_exp_find(rqstp, FSID_DEV, tfh);
163 &rqstp->rq_chandle);
164 } 159 }
165 160
166 if (IS_ERR(exp) && (PTR_ERR(exp) == -EAGAIN 161 error = nfserr_stale;
167 || PTR_ERR(exp) == -ETIMEDOUT)) { 162 if (PTR_ERR(exp) == -ENOENT)
168 error = nfserrno(PTR_ERR(exp));
169 goto out; 163 goto out;
170 }
171 164
172 error = nfserr_stale; 165 if (IS_ERR(exp)) {
173 if (!exp || IS_ERR(exp)) 166 error = nfserrno(PTR_ERR(exp));
174 goto out; 167 goto out;
168 }
175 169
176 /* Check if the request originated from a secure port. */ 170 /* Check if the request originated from a secure port. */
177 error = nfserr_perm; 171 error = nfserr_perm;
@@ -211,11 +205,9 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
211 if (fileid_type == 0) 205 if (fileid_type == 0)
212 dentry = dget(exp->ex_dentry); 206 dentry = dget(exp->ex_dentry);
213 else { 207 else {
214 struct export_operations *nop = exp->ex_mnt->mnt_sb->s_export_op; 208 dentry = exportfs_decode_fh(exp->ex_mnt, datap,
215 dentry = CALL(nop,decode_fh)(exp->ex_mnt->mnt_sb, 209 data_left, fileid_type,
216 datap, data_left, 210 nfsd_acceptable, exp);
217 fileid_type,
218 nfsd_acceptable, exp);
219 } 211 }
220 if (dentry == NULL) 212 if (dentry == NULL)
221 goto out; 213 goto out;
@@ -257,8 +249,19 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
257 if (error) 249 if (error)
258 goto out; 250 goto out;
259 251
252 if (!(access & MAY_LOCK)) {
253 /*
254 * pseudoflavor restrictions are not enforced on NLM,
255 * which clients virtually always use auth_sys for,
256 * even while using RPCSEC_GSS for NFS.
257 */
258 error = check_nfsd_access(exp, rqstp);
259 if (error)
260 goto out;
261 }
262
260 /* Finally, check access permissions. */ 263 /* Finally, check access permissions. */
261 error = nfsd_permission(exp, dentry, access); 264 error = nfsd_permission(rqstp, exp, dentry, access);
262 265
263 if (error) { 266 if (error) {
264 dprintk("fh_verify: %s/%s permission failure, " 267 dprintk("fh_verify: %s/%s permission failure, "
@@ -286,15 +289,13 @@ out:
286static inline int _fh_update(struct dentry *dentry, struct svc_export *exp, 289static inline int _fh_update(struct dentry *dentry, struct svc_export *exp,
287 __u32 *datap, int *maxsize) 290 __u32 *datap, int *maxsize)
288{ 291{
289 struct export_operations *nop = exp->ex_mnt->mnt_sb->s_export_op;
290
291 if (dentry == exp->ex_dentry) { 292 if (dentry == exp->ex_dentry) {
292 *maxsize = 0; 293 *maxsize = 0;
293 return 0; 294 return 0;
294 } 295 }
295 296
296 return CALL(nop,encode_fh)(dentry, datap, maxsize, 297 return exportfs_encode_fh(dentry, datap, maxsize,
297 !(exp->ex_flags&NFSEXP_NOSUBTREECHECK)); 298 !(exp->ex_flags & NFSEXP_NOSUBTREECHECK));
298} 299}
299 300
300/* 301/*
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index b2c7147aa921..977a71f64e19 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -278,7 +278,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
278 * echo thing > device-special-file-or-pipe 278 * echo thing > device-special-file-or-pipe
279 * by doing a CREATE with type==0 279 * by doing a CREATE with type==0
280 */ 280 */
281 nfserr = nfsd_permission(newfhp->fh_export, 281 nfserr = nfsd_permission(rqstp,
282 newfhp->fh_export,
282 newfhp->fh_dentry, 283 newfhp->fh_dentry,
283 MAY_WRITE|MAY_LOCAL_ACCESS); 284 MAY_WRITE|MAY_LOCAL_ACCESS);
284 if (nfserr && nfserr != nfserr_rofs) 285 if (nfserr && nfserr != nfserr_rofs)
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ff55950efb43..a8c89ae4c743 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -19,6 +19,7 @@
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/smp.h> 20#include <linux/smp.h>
21#include <linux/smp_lock.h> 21#include <linux/smp_lock.h>
22#include <linux/freezer.h>
22#include <linux/fs_struct.h> 23#include <linux/fs_struct.h>
23 24
24#include <linux/sunrpc/types.h> 25#include <linux/sunrpc/types.h>
@@ -432,6 +433,7 @@ nfsd(struct svc_rqst *rqstp)
432 * dirty pages. 433 * dirty pages.
433 */ 434 */
434 current->flags |= PF_LESS_THROTTLE; 435 current->flags |= PF_LESS_THROTTLE;
436 set_freezable();
435 437
436 /* 438 /*
437 * The main request loop 439 * The main request loop
@@ -492,6 +494,15 @@ out:
492 module_put_and_exit(0); 494 module_put_and_exit(0);
493} 495}
494 496
497static __be32 map_new_errors(u32 vers, __be32 nfserr)
498{
499 if (nfserr == nfserr_jukebox && vers == 2)
500 return nfserr_dropit;
501 if (nfserr == nfserr_wrongsec && vers < 4)
502 return nfserr_acces;
503 return nfserr;
504}
505
495int 506int
496nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) 507nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
497{ 508{
@@ -534,6 +545,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
534 545
535 /* Now call the procedure handler, and encode NFS status. */ 546 /* Now call the procedure handler, and encode NFS status. */
536 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 547 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
548 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
537 if (nfserr == nfserr_jukebox && rqstp->rq_vers == 2) 549 if (nfserr == nfserr_jukebox && rqstp->rq_vers == 2)
538 nfserr = nfserr_dropit; 550 nfserr = nfserr_dropit;
539 if (nfserr == nfserr_dropit) { 551 if (nfserr == nfserr_dropit) {
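map_new_errors() above centralizes the protocol downgrades for errors that post-date a given NFS version: v2 has no way to say "try again later", so jukebox becomes an internal drop (the client simply retransmits), and wrongsec exists only in v4, so older clients get a plain access error. A quick userspace check of that table; the numeric values are illustrative stand-ins for the kernel's __be32 constants:

    #include <assert.h>
    #include <stdint.h>

    enum {                          /* stand-in error codes */
        nfserr_acces    = 13,
        nfserr_jukebox  = 10008,
        nfserr_wrongsec = 10016,
        nfserr_dropit   = 30000,
    };

    static uint32_t map_new_errors(uint32_t vers, uint32_t nfserr)
    {
        if (nfserr == nfserr_jukebox && vers == 2)
            return nfserr_dropit;   /* v2 cannot express "retry later" */
        if (nfserr == nfserr_wrongsec && vers < 4)
            return nfserr_acces;    /* wrongsec is NFSv4-only */
        return nfserr;
    }

    int main(void)
    {
        assert(map_new_errors(2, nfserr_jukebox) == nfserr_dropit);
        assert(map_new_errors(3, nfserr_wrongsec) == nfserr_acces);
        assert(map_new_errors(4, nfserr_wrongsec) == nfserr_wrongsec);
        return 0;
    }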
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 945b1cedde2b..e90f4a8a1d01 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -113,7 +113,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
113 113
114 while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); 114 while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
115 115
116 exp2 = exp_get_by_name(exp->ex_client, mnt, mounts, &rqstp->rq_chandle); 116 exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts);
117 if (IS_ERR(exp2)) { 117 if (IS_ERR(exp2)) {
118 err = PTR_ERR(exp2); 118 err = PTR_ERR(exp2);
119 dput(mounts); 119 dput(mounts);
@@ -135,21 +135,10 @@ out:
135 return err; 135 return err;
136} 136}
137 137
138/*
139 * Look up one component of a pathname.
140 * N.B. After this call _both_ fhp and resfh need an fh_put
141 *
142 * If the lookup would cross a mountpoint, and the mounted filesystem
143 * is exported to the client with NFSEXP_NOHIDE, then the lookup is
144 * accepted as it stands and the mounted directory is
145 * returned. Otherwise the covered directory is returned.
146 * NOTE: this mountpoint crossing is not supported properly by all
147 * clients and is explicitly disallowed for NFSv3
148 * NeilBrown <neilb@cse.unsw.edu.au>
149 */
150__be32 138__be32
151nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, 139nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
152 int len, struct svc_fh *resfh) 140 const char *name, int len,
141 struct svc_export **exp_ret, struct dentry **dentry_ret)
153{ 142{
154 struct svc_export *exp; 143 struct svc_export *exp;
155 struct dentry *dparent; 144 struct dentry *dparent;
@@ -168,8 +157,6 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
168 exp = fhp->fh_export; 157 exp = fhp->fh_export;
169 exp_get(exp); 158 exp_get(exp);
170 159
171 err = nfserr_acces;
172
173 /* Lookup the name, but don't follow links */ 160 /* Lookup the name, but don't follow links */
174 if (isdotent(name, len)) { 161 if (isdotent(name, len)) {
175 if (len==1) 162 if (len==1)
@@ -190,17 +177,15 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
190 dput(dentry); 177 dput(dentry);
191 dentry = dp; 178 dentry = dp;
192 179
193 exp2 = exp_parent(exp->ex_client, mnt, dentry, 180 exp2 = rqst_exp_parent(rqstp, mnt, dentry);
194 &rqstp->rq_chandle); 181 if (PTR_ERR(exp2) == -ENOENT) {
195 if (IS_ERR(exp2)) { 182 dput(dentry);
183 dentry = dget(dparent);
184 } else if (IS_ERR(exp2)) {
196 host_err = PTR_ERR(exp2); 185 host_err = PTR_ERR(exp2);
197 dput(dentry); 186 dput(dentry);
198 mntput(mnt); 187 mntput(mnt);
199 goto out_nfserr; 188 goto out_nfserr;
200 }
201 if (!exp2) {
202 dput(dentry);
203 dentry = dget(dparent);
204 } else { 189 } else {
205 exp_put(exp); 190 exp_put(exp);
206 exp = exp2; 191 exp = exp2;
@@ -223,6 +208,41 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
223 } 208 }
224 } 209 }
225 } 210 }
211 *dentry_ret = dentry;
212 *exp_ret = exp;
213 return 0;
214
215out_nfserr:
216 exp_put(exp);
217 return nfserrno(host_err);
218}
219
220/*
221 * Look up one component of a pathname.
222 * N.B. After this call _both_ fhp and resfh need an fh_put
223 *
224 * If the lookup would cross a mountpoint, and the mounted filesystem
225 * is exported to the client with NFSEXP_NOHIDE, then the lookup is
226 * accepted as it stands and the mounted directory is
227 * returned. Otherwise the covered directory is returned.
228 * NOTE: this mountpoint crossing is not supported properly by all
229 * clients and is explicitly disallowed for NFSv3
230 * NeilBrown <neilb@cse.unsw.edu.au>
231 */
232__be32
233nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
234 int len, struct svc_fh *resfh)
235{
236 struct svc_export *exp;
237 struct dentry *dentry;
238 __be32 err;
239
240 err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
241 if (err)
242 return err;
243 err = check_nfsd_access(exp, rqstp);
244 if (err)
245 goto out;
226 /* 246 /*
227 * Note: we compose the file handle now, but as the 247 * Note: we compose the file handle now, but as the
228 * dentry may be negative, it may need to be updated. 248 * dentry may be negative, it may need to be updated.
@@ -230,16 +250,13 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
230 err = fh_compose(resfh, exp, dentry, fhp); 250 err = fh_compose(resfh, exp, dentry, fhp);
231 if (!err && !dentry->d_inode) 251 if (!err && !dentry->d_inode)
232 err = nfserr_noent; 252 err = nfserr_noent;
233 dput(dentry);
234out: 253out:
254 dput(dentry);
235 exp_put(exp); 255 exp_put(exp);
236 return err; 256 return err;
237
238out_nfserr:
239 err = nfserrno(host_err);
240 goto out;
241} 257}
242 258
259
243/* 260/*
244 * Set various file attributes. 261 * Set various file attributes.
245 * N.B. After this call fhp needs an fh_put 262 * N.B. After this call fhp needs an fh_put
@@ -311,7 +328,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
311 /* The size case is special. It changes the file as well as the attributes. */ 328 /* The size case is special. It changes the file as well as the attributes. */
312 if (iap->ia_valid & ATTR_SIZE) { 329 if (iap->ia_valid & ATTR_SIZE) {
313 if (iap->ia_size < inode->i_size) { 330 if (iap->ia_size < inode->i_size) {
314 err = nfsd_permission(fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); 331 err = nfsd_permission(rqstp, fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE);
315 if (err) 332 if (err)
316 goto out; 333 goto out;
317 } 334 }
@@ -435,7 +452,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
435 /* Get inode */ 452 /* Get inode */
436 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR); 453 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR);
437 if (error) 454 if (error)
438 goto out; 455 return error;
439 456
440 dentry = fhp->fh_dentry; 457 dentry = fhp->fh_dentry;
441 inode = dentry->d_inode; 458 inode = dentry->d_inode;
@@ -444,33 +461,25 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
444 461
445 host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); 462 host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
446 if (host_error == -EINVAL) { 463 if (host_error == -EINVAL) {
447 error = nfserr_attrnotsupp; 464 return nfserr_attrnotsupp;
448 goto out;
449 } else if (host_error < 0) 465 } else if (host_error < 0)
450 goto out_nfserr; 466 goto out_nfserr;
451 467
452 host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); 468 host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
453 if (host_error < 0) 469 if (host_error < 0)
454 goto out_nfserr; 470 goto out_release;
455 471
456 if (S_ISDIR(inode->i_mode)) { 472 if (S_ISDIR(inode->i_mode))
457 host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); 473 host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
458 if (host_error < 0)
459 goto out_nfserr;
460 }
461
462 error = nfs_ok;
463 474
464out: 475out_release:
465 posix_acl_release(pacl); 476 posix_acl_release(pacl);
466 posix_acl_release(dpacl); 477 posix_acl_release(dpacl);
467 return (error);
468out_nfserr: 478out_nfserr:
469 if (host_error == -EOPNOTSUPP) 479 if (host_error == -EOPNOTSUPP)
470 error = nfserr_attrnotsupp; 480 return nfserr_attrnotsupp;
471 else 481 else
472 error = nfserrno(host_error); 482 return nfserrno(host_error);
473 goto out;
474} 483}
475 484
476static struct posix_acl * 485static struct posix_acl *
@@ -607,7 +616,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
607 616
608 sresult |= map->access; 617 sresult |= map->access;
609 618
610 err2 = nfsd_permission(export, dentry, map->how); 619 err2 = nfsd_permission(rqstp, export, dentry, map->how);
611 switch (err2) { 620 switch (err2) {
612 case nfs_ok: 621 case nfs_ok:
613 result |= map->access; 622 result |= map->access;
@@ -1034,7 +1043,7 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1034 __be32 err; 1043 __be32 err;
1035 1044
1036 if (file) { 1045 if (file) {
1037 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1046 err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
1038 MAY_READ|MAY_OWNER_OVERRIDE); 1047 MAY_READ|MAY_OWNER_OVERRIDE);
1039 if (err) 1048 if (err)
1040 goto out; 1049 goto out;
@@ -1063,7 +1072,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1063 __be32 err = 0; 1072 __be32 err = 0;
1064 1073
1065 if (file) { 1074 if (file) {
1066 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1075 err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
1067 MAY_WRITE|MAY_OWNER_OVERRIDE); 1076 MAY_WRITE|MAY_OWNER_OVERRIDE);
1068 if (err) 1077 if (err)
1069 goto out; 1078 goto out;
@@ -1792,7 +1801,8 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
1792 * Check for a user's access permissions to this inode. 1801 * Check for a user's access permissions to this inode.
1793 */ 1802 */
1794__be32 1803__be32
1795nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc) 1804nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
1805 struct dentry *dentry, int acc)
1796{ 1806{
1797 struct inode *inode = dentry->d_inode; 1807 struct inode *inode = dentry->d_inode;
1798 int err; 1808 int err;
@@ -1823,7 +1833,7 @@ nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
1823 */ 1833 */
1824 if (!(acc & MAY_LOCAL_ACCESS)) 1834 if (!(acc & MAY_LOCAL_ACCESS))
1825 if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { 1835 if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
1826 if (EX_RDONLY(exp) || IS_RDONLY(inode)) 1836 if (EX_RDONLY(exp, rqstp) || IS_RDONLY(inode))
1827 return nfserr_rofs; 1837 return nfserr_rofs;
1828 if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode)) 1838 if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
1829 return nfserr_perm; 1839 return nfserr_perm;
diff --git a/fs/nls/Makefile b/fs/nls/Makefile
index a7ade138d684..f499dd7c3905 100644
--- a/fs/nls/Makefile
+++ b/fs/nls/Makefile
@@ -36,11 +36,9 @@ obj-$(CONFIG_NLS_ISO8859_6) += nls_iso8859-6.o
36obj-$(CONFIG_NLS_ISO8859_7) += nls_iso8859-7.o 36obj-$(CONFIG_NLS_ISO8859_7) += nls_iso8859-7.o
37obj-$(CONFIG_NLS_ISO8859_8) += nls_cp1255.o 37obj-$(CONFIG_NLS_ISO8859_8) += nls_cp1255.o
38obj-$(CONFIG_NLS_ISO8859_9) += nls_iso8859-9.o 38obj-$(CONFIG_NLS_ISO8859_9) += nls_iso8859-9.o
39obj-$(CONFIG_NLS_ISO8859_10) += nls_iso8859-10.o
40obj-$(CONFIG_NLS_ISO8859_13) += nls_iso8859-13.o 39obj-$(CONFIG_NLS_ISO8859_13) += nls_iso8859-13.o
41obj-$(CONFIG_NLS_ISO8859_14) += nls_iso8859-14.o 40obj-$(CONFIG_NLS_ISO8859_14) += nls_iso8859-14.o
42obj-$(CONFIG_NLS_ISO8859_15) += nls_iso8859-15.o 41obj-$(CONFIG_NLS_ISO8859_15) += nls_iso8859-15.o
43obj-$(CONFIG_NLS_KOI8_R) += nls_koi8-r.o 42obj-$(CONFIG_NLS_KOI8_R) += nls_koi8-r.o
44obj-$(CONFIG_NLS_KOI8_U) += nls_koi8-u.o nls_koi8-ru.o 43obj-$(CONFIG_NLS_KOI8_U) += nls_koi8-u.o nls_koi8-ru.o
45obj-$(CONFIG_NLS_ABC) += nls_abc.o
46obj-$(CONFIG_NLS_UTF8) += nls_utf8.o 44obj-$(CONFIG_NLS_UTF8) += nls_utf8.o
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index bff01a54675a..e93c6142b23c 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/exportfs.h>
24#include <linux/security.h> 25#include <linux/security.h>
25 26
26#include "attrib.h" 27#include "attrib.h"
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19712a7d145f..f5e11f4fa952 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,6 +50,8 @@
50#include "buffer_head_io.h" 50#include "buffer_head_io.h"
51 51
52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
54 struct ocfs2_extent_block *eb);
53 55
54/* 56/*
55 * Structures which describe a path through a btree, and functions to 57 * Structures which describe a path through a btree, and functions to
@@ -117,6 +119,31 @@ static void ocfs2_free_path(struct ocfs2_path *path)
117} 119}
118 120
119/* 121/*
122 * Copy all the elements of src into dest. After this call, src could be
123 * freed without affecting dest.
124 *
125 * Both paths should have the same root. Any non-root elements of dest
126 * will be freed.
127 */
128static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
129{
130 int i;
131
132 BUG_ON(path_root_bh(dest) != path_root_bh(src));
133 BUG_ON(path_root_el(dest) != path_root_el(src));
134
135 ocfs2_reinit_path(dest, 1);
136
137 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
138 dest->p_node[i].bh = src->p_node[i].bh;
139 dest->p_node[i].el = src->p_node[i].el;
140
141 if (dest->p_node[i].bh)
142 get_bh(dest->p_node[i].bh);
143 }
144}
145
146/*
120 * Make the *dest path the same as src and re-initialize src path to 147 * Make the *dest path the same as src and re-initialize src path to
121 * have a root only. 148 * have a root only.
122 */ 149 */
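Note the get_bh() in the copy loop of ocfs2_cp_path() above: after the copy, both paths reference the same buffer_heads, so every shared block needs an extra reference; ocfs2_mv_path() by contrast transfers ownership and takes none. A toy userspace refcount model of the same contract (struct buf and get/put are stand-ins, not kernel APIs):

    #include <assert.h>

    struct buf { int refs; };

    static void get(struct buf *b) { b->refs++; }
    static void put(struct buf *b) { assert(b->refs > 0); b->refs--; }

    int main(void)
    {
        struct buf blk = { .refs = 1 };      /* held by src */
        struct buf *src = &blk, *dest;

        dest = src;                          /* "cp": share the block... */
        get(dest);                           /* ...so take a new reference */
        assert(blk.refs == 2);

        put(src);                            /* dropping src's reference... */
        assert(blk.refs == 1);               /* ...leaves dest's copy valid */
        return 0;
    }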
@@ -212,10 +239,41 @@ out:
212 return ret; 239 return ret;
213} 240}
214 241
242/*
243 * Return the index of the extent record which contains cluster #v_cluster.
244 * -1 is returned if it was not found.
245 *
246 * Should work fine on interior and exterior nodes.
247 */
248int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
249{
250 int ret = -1;
251 int i;
252 struct ocfs2_extent_rec *rec;
253 u32 rec_end, rec_start, clusters;
254
255 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
256 rec = &el->l_recs[i];
257
258 rec_start = le32_to_cpu(rec->e_cpos);
259 clusters = ocfs2_rec_clusters(el, rec);
260
261 rec_end = rec_start + clusters;
262
263 if (v_cluster >= rec_start && v_cluster < rec_end) {
264 ret = i;
265 break;
266 }
267 }
268
269 return ret;
270}
271
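The same search in runnable userspace form, over a simplified record type; a virtual cluster that falls in a hole between records yields -1, just as above:

    #include <stdio.h>
    #include <stdint.h>

    struct rec { uint32_t cpos, clusters; };

    /* Index of the record containing v_cluster, or -1 if none does. */
    static int search_extent_list(const struct rec *recs, int n, uint32_t v)
    {
        for (int i = 0; i < n; i++)
            if (v >= recs[i].cpos &&
                v < recs[i].cpos + recs[i].clusters)
                return i;
        return -1;
    }

    int main(void)
    {
        struct rec el[] = { { 0, 8 }, { 8, 4 }, { 16, 16 } };

        printf("%d %d %d\n",
               search_extent_list(el, 3, 9),    /*  1 */
               search_extent_list(el, 3, 13),   /* -1: hole at 12-15 */
               search_extent_list(el, 3, 31));  /*  2 */
        return 0;
    }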
215enum ocfs2_contig_type { 272enum ocfs2_contig_type {
216 CONTIG_NONE = 0, 273 CONTIG_NONE = 0,
217 CONTIG_LEFT, 274 CONTIG_LEFT,
218 CONTIG_RIGHT 275 CONTIG_RIGHT,
276 CONTIG_LEFTRIGHT,
219}; 277};
220 278
221 279
@@ -253,6 +311,14 @@ static enum ocfs2_contig_type
253{ 311{
254 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 312 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
255 313
314 /*
315 * Refuse to coalesce extent records with different flag
316 * fields - we don't want to mix unwritten extents with user
317 * data.
318 */
319 if (ext->e_flags != insert_rec->e_flags)
320 return CONTIG_NONE;
321
256 if (ocfs2_extents_adjacent(ext, insert_rec) && 322 if (ocfs2_extents_adjacent(ext, insert_rec) &&
257 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 323 ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
258 return CONTIG_RIGHT; 324 return CONTIG_RIGHT;
@@ -277,7 +343,14 @@ enum ocfs2_append_type {
277 APPEND_TAIL, 343 APPEND_TAIL,
278}; 344};
279 345
346enum ocfs2_split_type {
347 SPLIT_NONE = 0,
348 SPLIT_LEFT,
349 SPLIT_RIGHT,
350};
351
280struct ocfs2_insert_type { 352struct ocfs2_insert_type {
353 enum ocfs2_split_type ins_split;
281 enum ocfs2_append_type ins_appending; 354 enum ocfs2_append_type ins_appending;
282 enum ocfs2_contig_type ins_contig; 355 enum ocfs2_contig_type ins_contig;
283 int ins_contig_index; 356 int ins_contig_index;
@@ -285,6 +358,13 @@ struct ocfs2_insert_type {
285 int ins_tree_depth; 358 int ins_tree_depth;
286}; 359};
287 360
361struct ocfs2_merge_ctxt {
362 enum ocfs2_contig_type c_contig_type;
363 int c_has_empty_extent;
364 int c_split_covers_rec;
365 int c_used_tail_recs;
366};
367
288/* 368/*
289 * How many free extents have we got before we need more meta data? 369 * How many free extents have we got before we need more meta data?
290 */ 370 */
@@ -384,13 +464,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
384 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 464 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
385 eb->h_blkno = cpu_to_le64(first_blkno); 465 eb->h_blkno = cpu_to_le64(first_blkno);
386 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 466 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
387
388#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
389 /* we always use slot zero's suballocator */
390 eb->h_suballoc_slot = 0;
391#else
392 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 467 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
393#endif
394 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 468 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
395 eb->h_list.l_count = 469 eb->h_list.l_count =
396 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 470 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -461,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
461 struct inode *inode, 535 struct inode *inode,
462 struct buffer_head *fe_bh, 536 struct buffer_head *fe_bh,
463 struct buffer_head *eb_bh, 537 struct buffer_head *eb_bh,
464 struct buffer_head *last_eb_bh, 538 struct buffer_head **last_eb_bh,
465 struct ocfs2_alloc_context *meta_ac) 539 struct ocfs2_alloc_context *meta_ac)
466{ 540{
467 int status, new_blocks, i; 541 int status, new_blocks, i;
@@ -476,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
476 550
477 mlog_entry_void(); 551 mlog_entry_void();
478 552
479 BUG_ON(!last_eb_bh); 553 BUG_ON(!last_eb_bh || !*last_eb_bh);
480 554
481 fe = (struct ocfs2_dinode *) fe_bh->b_data; 555 fe = (struct ocfs2_dinode *) fe_bh->b_data;
482 556
@@ -507,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
507 goto bail; 581 goto bail;
508 } 582 }
509 583
510 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; 584 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
511 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 585 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
512 586
513 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 587 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -568,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
568 * journal_dirty erroring as it won't unless we've aborted the 642 * journal_dirty erroring as it won't unless we've aborted the
569 * handle (in which case we would never be here) so reserving 643 * handle (in which case we would never be here) so reserving
570 * the write with journal_access is all we need to do. */ 644 * the write with journal_access is all we need to do. */
571 status = ocfs2_journal_access(handle, inode, last_eb_bh, 645 status = ocfs2_journal_access(handle, inode, *last_eb_bh,
572 OCFS2_JOURNAL_ACCESS_WRITE); 646 OCFS2_JOURNAL_ACCESS_WRITE);
573 if (status < 0) { 647 if (status < 0) {
574 mlog_errno(status); 648 mlog_errno(status);
@@ -601,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
601 * next_leaf on the previously last-extent-block. */ 675 * next_leaf on the previously last-extent-block. */
602 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 676 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
603 677
604 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 678 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
605 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 679 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
606 680
607 status = ocfs2_journal_dirty(handle, last_eb_bh); 681 status = ocfs2_journal_dirty(handle, *last_eb_bh);
608 if (status < 0) 682 if (status < 0)
609 mlog_errno(status); 683 mlog_errno(status);
610 status = ocfs2_journal_dirty(handle, fe_bh); 684 status = ocfs2_journal_dirty(handle, fe_bh);
@@ -616,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
616 mlog_errno(status); 690 mlog_errno(status);
617 } 691 }
618 692
693 /*
694 * Some callers want to track the rightmost leaf so pass it
695 * back here.
696 */
697 brelse(*last_eb_bh);
698 get_bh(new_eb_bhs[0]);
699 *last_eb_bh = new_eb_bhs[0];
700
619 status = 0; 701 status = 0;
620bail: 702bail:
621 if (new_eb_bhs) { 703 if (new_eb_bhs) {
@@ -829,6 +911,87 @@ bail:
829} 911}
830 912
831/* 913/*
914 * Grow a b-tree so that it has more records.
915 *
916 * We might shift the tree depth in which case existing paths should
917 * be considered invalid.
918 *
919 * Tree depth after the grow is returned via *final_depth.
920 *
921 * *last_eb_bh will be updated by ocfs2_add_branch().
922 */
923static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
924 struct buffer_head *di_bh, int *final_depth,
925 struct buffer_head **last_eb_bh,
926 struct ocfs2_alloc_context *meta_ac)
927{
928 int ret, shift;
929 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
930 int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
931 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
932 struct buffer_head *bh = NULL;
933
934 BUG_ON(meta_ac == NULL);
935
936 shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
937 if (shift < 0) {
938 ret = shift;
939 mlog_errno(ret);
940 goto out;
941 }
942
943 /* We traveled all the way to the bottom of the allocation tree
944 * and didn't find room for any more extents - we need to add
945 * another tree level */
946 if (shift) {
947 BUG_ON(bh);
948 mlog(0, "need to shift tree depth (current = %d)\n", depth);
949
950 /* ocfs2_shift_tree_depth will return us a buffer with
951 * the new extent block (so we can pass that to
952 * ocfs2_add_branch). */
953 ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
954 meta_ac, &bh);
955 if (ret < 0) {
956 mlog_errno(ret);
957 goto out;
958 }
959 depth++;
960 if (depth == 1) {
961 /*
962 * Special case: we have room now if we shifted from
963 * tree_depth 0, so no more work needs to be done.
964 *
965 * We won't be calling add_branch, so pass
966 * back *last_eb_bh as the new leaf. At depth
967 * zero, it should always be null so there's
968 * no reason to brelse.
969 */
970 BUG_ON(*last_eb_bh);
971 get_bh(bh);
972 *last_eb_bh = bh;
973 goto out;
974 }
975 }
976
977 /* call ocfs2_add_branch to add the final part of the tree with
978 * the new data. */
979 mlog(0, "add branch. bh = %p\n", bh);
980 ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
981 meta_ac);
982 if (ret < 0) {
983 mlog_errno(ret);
984 goto out;
985 }
986
987out:
988 if (final_depth)
989 *final_depth = depth;
990 brelse(bh);
991 return ret;
992}
993
994/*
832 * This is only valid for leaf nodes, which are the only ones that can 995 * This is only valid for leaf nodes, which are the only ones that can
833 * have empty extents anyway. 996 * have empty extents anyway.
834 */ 997 */
@@ -934,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
934 1097
935} 1098}
936 1099
1100static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1101{
1102 int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1103
1104 BUG_ON(num_recs == 0);
1105
1106 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1107 num_recs--;
1108 size = num_recs * sizeof(struct ocfs2_extent_rec);
1109 memmove(&el->l_recs[0], &el->l_recs[1], size);
1110 memset(&el->l_recs[num_recs], 0,
1111 sizeof(struct ocfs2_extent_rec));
1112 el->l_next_free_rec = cpu_to_le16(num_recs);
1113 }
1114}
1115
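ocfs2_remove_empty_extent() compacts the leaf in place: drop the empty record from slot 0, shift the remaining records left one slot, and zero the vacated tail slot. A userspace sketch of the same shift, modeling an "empty extent" as clusters == 0:

    #include <stdio.h>
    #include <string.h>

    struct rec { unsigned cpos, clusters; };

    static void remove_empty_extent(struct rec *recs, unsigned *next_free)
    {
        /* An empty extent can only live in slot 0. */
        if (*next_free && recs[0].clusters == 0) {
            (*next_free)--;
            memmove(&recs[0], &recs[1], *next_free * sizeof(recs[0]));
            memset(&recs[*next_free], 0, sizeof(recs[0]));
        }
    }

    int main(void)
    {
        struct rec el[3] = { { 0, 0 }, { 4, 8 }, { 12, 4 } };
        unsigned next_free = 3;

        remove_empty_extent(el, &next_free);
        printf("next_free=%u first={%u,%u}\n",
               next_free, el[0].cpos, el[0].clusters);  /* 2 and {4,8} */
        return 0;
    }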
937/* 1116/*
938 * Create an empty extent record . 1117 * Create an empty extent record .
939 * 1118 *
@@ -1211,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1211 * immediately to their right. 1390 * immediately to their right.
1212 */ 1391 */
1213 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1392 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1393 if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1394 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1395 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1396 }
1214 left_clusters -= le32_to_cpu(left_rec->e_cpos); 1397 left_clusters -= le32_to_cpu(left_rec->e_cpos);
1215 left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1398 left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1216 1399
@@ -1531,10 +1714,16 @@ out:
1531 return ret; 1714 return ret;
1532} 1715}
1533 1716
1717/*
1718 * Extend the transaction by enough credits to complete the rotation,
1719 * and still leave at least the original number of credits allocated
1720 * to this transaction.
1721 */
1534static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 1722static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1723 int op_credits,
1535 struct ocfs2_path *path) 1724 struct ocfs2_path *path)
1536{ 1725{
1537 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; 1726 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1538 1727
1539 if (handle->h_buffer_credits < credits) 1728 if (handle->h_buffer_credits < credits)
1540 return ocfs2_extend_trans(handle, credits); 1729 return ocfs2_extend_trans(handle, credits);
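Worked example of the credit formula above: rotating at a subtree rooted one level below the root of a depth-3 tree, while keeping 20 credits reserved for the caller's own operation, needs (3 - 1) * 2 + 1 + 20 = 25 buffer credits, roughly a left and a right block at each level below the subtree root, plus the subtree root itself, plus op_credits. The numbers are made up for illustration:

    /* Credit math from ocfs2_extend_rotate_transaction() (sketch). */
    #include <stdio.h>

    int main(void)
    {
        int tree_depth = 3, subtree_depth = 1, op_credits = 20;
        int credits = (tree_depth - subtree_depth) * 2 + 1 + op_credits;

        printf("credits = %d\n", credits);   /* 25 */
        return 0;
    }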
@@ -1568,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1568 return 0; 1757 return 0;
1569} 1758}
1570 1759
1760static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1761{
1762 int next_free = le16_to_cpu(el->l_next_free_rec);
1763 unsigned int range;
1764 struct ocfs2_extent_rec *rec;
1765
1766 if (next_free == 0)
1767 return 0;
1768
1769 rec = &el->l_recs[0];
1770 if (ocfs2_is_empty_extent(rec)) {
1771 /* Empty list. */
1772 if (next_free == 1)
1773 return 0;
1774 rec = &el->l_recs[1];
1775 }
1776
1777 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1778 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1779 return 1;
1780 return 0;
1781}
1782
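ocfs2_leftmost_rec_contains() in userspace form: skip a possible empty extent in slot 0, then range-test the first real record (simplified types, illustration only):

    #include <stdio.h>

    struct rec { unsigned cpos, clusters; };

    static int leftmost_contains(const struct rec *recs, int next_free,
                                 unsigned cpos)
    {
        const struct rec *rec;

        if (next_free == 0)
            return 0;
        rec = &recs[0];
        if (rec->clusters == 0) {       /* empty extent in slot 0 */
            if (next_free == 1)
                return 0;               /* nothing but the empty extent */
            rec = &recs[1];
        }
        return cpos >= rec->cpos && cpos < rec->cpos + rec->clusters;
    }

    int main(void)
    {
        struct rec el[] = { { 0, 0 }, { 16, 8 } };  /* empty + one record */

        printf("%d %d\n",
               leftmost_contains(el, 2, 20),        /* 1 */
               leftmost_contains(el, 2, 24));       /* 0 */
        return 0;
    }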
1571/* 1783/*
1572 * Rotate all the records in a btree right one record, starting at insert_cpos. 1784 * Rotate all the records in a btree right one record, starting at insert_cpos.
1573 * 1785 *
@@ -1586,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1586 */ 1798 */
1587static int ocfs2_rotate_tree_right(struct inode *inode, 1799static int ocfs2_rotate_tree_right(struct inode *inode,
1588 handle_t *handle, 1800 handle_t *handle,
1801 enum ocfs2_split_type split,
1589 u32 insert_cpos, 1802 u32 insert_cpos,
1590 struct ocfs2_path *right_path, 1803 struct ocfs2_path *right_path,
1591 struct ocfs2_path **ret_left_path) 1804 struct ocfs2_path **ret_left_path)
1592{ 1805{
1593 int ret, start; 1806 int ret, start, orig_credits = handle->h_buffer_credits;
1594 u32 cpos; 1807 u32 cpos;
1595 struct ocfs2_path *left_path = NULL; 1808 struct ocfs2_path *left_path = NULL;
1596 1809
@@ -1657,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1657 (unsigned long long) 1870 (unsigned long long)
1658 path_leaf_bh(left_path)->b_blocknr); 1871 path_leaf_bh(left_path)->b_blocknr);
1659 1872
1660 if (ocfs2_rotate_requires_path_adjustment(left_path, 1873 if (split == SPLIT_NONE &&
1874 ocfs2_rotate_requires_path_adjustment(left_path,
1661 insert_cpos)) { 1875 insert_cpos)) {
1662 mlog(0, "Path adjustment required\n");
1663 1876
1664 /* 1877 /*
1665 * We've rotated the tree as much as we 1878 * We've rotated the tree as much as we
@@ -1687,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1687 right_path->p_tree_depth); 1900 right_path->p_tree_depth);
1688 1901
1689 ret = ocfs2_extend_rotate_transaction(handle, start, 1902 ret = ocfs2_extend_rotate_transaction(handle, start,
1690 right_path); 1903 orig_credits, right_path);
1691 if (ret) { 1904 if (ret) {
1692 mlog_errno(ret); 1905 mlog_errno(ret);
1693 goto out; 1906 goto out;
@@ -1700,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1700 goto out; 1913 goto out;
1701 } 1914 }
1702 1915
1916 if (split != SPLIT_NONE &&
1917 ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
1918 insert_cpos)) {
1919 /*
1920 * A rotate moves the rightmost left leaf
1921 * record over to the leftmost right leaf
1922 * slot. If we're doing an extent split
1923 * instead of a real insert, then we have to
1924 * check that the extent to be split wasn't
1925 * just moved over. If it was, then we can
1926 * exit here, passing left_path back -
1927 * ocfs2_split_extent() is smart enough to
1928 * search both leaves.
1929 */
1930 *ret_left_path = left_path;
1931 goto out_ret_path;
1932 }
1933
1703 /* 1934 /*
1704 * There is no need to re-read the next right path 1935 * There is no need to re-read the next right path
1705 * as we know that it'll be our current left 1936 * as we know that it'll be our current left
@@ -1722,6 +1953,1031 @@ out_ret_path:
1722 return ret; 1953 return ret;
1723} 1954}
1724 1955
1956static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
1957 struct ocfs2_path *path)
1958{
1959 int i, idx;
1960 struct ocfs2_extent_rec *rec;
1961 struct ocfs2_extent_list *el;
1962 struct ocfs2_extent_block *eb;
1963 u32 range;
1964
1965 /* Path should always be rightmost. */
1966 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
1967 BUG_ON(eb->h_next_leaf_blk != 0ULL);
1968
1969 el = &eb->h_list;
1970 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1971 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1972 rec = &el->l_recs[idx];
1973 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1974
1975 for (i = 0; i < path->p_tree_depth; i++) {
1976 el = path->p_node[i].el;
1977 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1978 rec = &el->l_recs[idx];
1979
1980 rec->e_int_clusters = cpu_to_le32(range);
1981 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
1982
1983 ocfs2_journal_dirty(handle, path->p_node[i].bh);
1984 }
1985}
1986
1987static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
1988 struct ocfs2_cached_dealloc_ctxt *dealloc,
1989 struct ocfs2_path *path, int unlink_start)
1990{
1991 int ret, i;
1992 struct ocfs2_extent_block *eb;
1993 struct ocfs2_extent_list *el;
1994 struct buffer_head *bh;
1995
1996 for(i = unlink_start; i < path_num_items(path); i++) {
1997 bh = path->p_node[i].bh;
1998
1999 eb = (struct ocfs2_extent_block *)bh->b_data;
2000 /*
2001 * Not all nodes might have had their final count
2002 * decremented by the caller - handle this here.
2003 */
2004 el = &eb->h_list;
2005 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2006 mlog(ML_ERROR,
2007 "Inode %llu, attempted to remove extent block "
2008 "%llu with %u records\n",
2009 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2010 (unsigned long long)le64_to_cpu(eb->h_blkno),
2011 le16_to_cpu(el->l_next_free_rec));
2012
2013 ocfs2_journal_dirty(handle, bh);
2014 ocfs2_remove_from_cache(inode, bh);
2015 continue;
2016 }
2017
2018 el->l_next_free_rec = 0;
2019 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2020
2021 ocfs2_journal_dirty(handle, bh);
2022
2023 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2024 if (ret)
2025 mlog_errno(ret);
2026
2027 ocfs2_remove_from_cache(inode, bh);
2028 }
2029}
2030
2031static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2032 struct ocfs2_path *left_path,
2033 struct ocfs2_path *right_path,
2034 int subtree_index,
2035 struct ocfs2_cached_dealloc_ctxt *dealloc)
2036{
2037 int i;
2038 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2039 struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2040 struct ocfs2_extent_list *el;
2041 struct ocfs2_extent_block *eb;
2042
2043 el = path_leaf_el(left_path);
2044
2045 eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2046
2047 for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2048 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2049 break;
2050
2051 BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2052
2053 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2054 le16_add_cpu(&root_el->l_next_free_rec, -1);
2055
2056 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2057 eb->h_next_leaf_blk = 0;
2058
2059 ocfs2_journal_dirty(handle, root_bh);
2060 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2061
2062 ocfs2_unlink_path(inode, handle, dealloc, right_path,
2063 subtree_index + 1);
2064}
2065
2066static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2067 struct ocfs2_path *left_path,
2068 struct ocfs2_path *right_path,
2069 int subtree_index,
2070 struct ocfs2_cached_dealloc_ctxt *dealloc,
2071 int *deleted)
2072{
2073 int ret, i, del_right_subtree = 0, right_has_empty = 0;
2074 struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
2075 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2076 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2077 struct ocfs2_extent_block *eb;
2078
2079 *deleted = 0;
2080
2081 right_leaf_el = path_leaf_el(right_path);
2082 left_leaf_el = path_leaf_el(left_path);
2083 root_bh = left_path->p_node[subtree_index].bh;
2084 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2085
2086 if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2087 return 0;
2088
2089 eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2090 if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2091 /*
2092 * It's legal for us to proceed if the right leaf is
2093 * the rightmost one and it has an empty extent. There
2094 * are two cases to handle - whether the leaf will be
2095 * empty after removal or not. If the leaf isn't empty
2096 * then just remove the empty extent up front. The
2097 * next block will handle empty leaves by flagging
2098 * them for unlink.
2099 *
2100 * Non-rightmost leaves will throw -EAGAIN and the
2101 * caller can manually move the subtree and retry.
2102 */
2103
2104 if (eb->h_next_leaf_blk != 0ULL)
2105 return -EAGAIN;
2106
2107 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2108 ret = ocfs2_journal_access(handle, inode,
2109 path_leaf_bh(right_path),
2110 OCFS2_JOURNAL_ACCESS_WRITE);
2111 if (ret) {
2112 mlog_errno(ret);
2113 goto out;
2114 }
2115
2116 ocfs2_remove_empty_extent(right_leaf_el);
2117 } else
2118 right_has_empty = 1;
2119 }
2120
2121 if (eb->h_next_leaf_blk == 0ULL &&
2122 le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2123 /*
2124 * We have to update i_last_eb_blk during the meta
2125 * data delete.
2126 */
2127 ret = ocfs2_journal_access(handle, inode, di_bh,
2128 OCFS2_JOURNAL_ACCESS_WRITE);
2129 if (ret) {
2130 mlog_errno(ret);
2131 goto out;
2132 }
2133
2134 del_right_subtree = 1;
2135 }
2136
2137 /*
2138 * Getting here with an empty extent in the right path implies
2139 * that it's the rightmost path and will be deleted.
2140 */
2141 BUG_ON(right_has_empty && !del_right_subtree);
2142
2143 ret = ocfs2_journal_access(handle, inode, root_bh,
2144 OCFS2_JOURNAL_ACCESS_WRITE);
2145 if (ret) {
2146 mlog_errno(ret);
2147 goto out;
2148 }
2149
2150 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2151 ret = ocfs2_journal_access(handle, inode,
2152 right_path->p_node[i].bh,
2153 OCFS2_JOURNAL_ACCESS_WRITE);
2154 if (ret) {
2155 mlog_errno(ret);
2156 goto out;
2157 }
2158
2159 ret = ocfs2_journal_access(handle, inode,
2160 left_path->p_node[i].bh,
2161 OCFS2_JOURNAL_ACCESS_WRITE);
2162 if (ret) {
2163 mlog_errno(ret);
2164 goto out;
2165 }
2166 }
2167
2168 if (!right_has_empty) {
2169 /*
2170 * Only do this if we're moving a real
2171 * record. Otherwise, the action is delayed until
2172 * after removal of the right path in which case we
2173 * can do a simple shift to remove the empty extent.
2174 */
2175 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2176 memset(&right_leaf_el->l_recs[0], 0,
2177 sizeof(struct ocfs2_extent_rec));
2178 }
2179 if (eb->h_next_leaf_blk == 0ULL) {
2180 /*
2181 * Move recs over to get rid of empty extent, decrease
2182 * next_free. This is allowed to remove the last
2183 * extent in our leaf (setting l_next_free_rec to
2184 * zero) - the delete code below won't care.
2185 */
2186 ocfs2_remove_empty_extent(right_leaf_el);
2187 }
2188
2189 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2190 if (ret)
2191 mlog_errno(ret);
2192 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2193 if (ret)
2194 mlog_errno(ret);
2195
2196 if (del_right_subtree) {
2197 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2198 subtree_index, dealloc);
2199 ocfs2_update_edge_lengths(inode, handle, left_path);
2200
2201 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2202 di->i_last_eb_blk = eb->h_blkno;
2203
2204 /*
2205 * Removal of the extent in the left leaf was skipped
2206 * above so we could delete the right path
2207 * first.
2208 */
2209 if (right_has_empty)
2210 ocfs2_remove_empty_extent(left_leaf_el);
2211
2212 ret = ocfs2_journal_dirty(handle, di_bh);
2213 if (ret)
2214 mlog_errno(ret);
2215
2216 *deleted = 1;
2217 } else
2218 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2219 subtree_index);
2220
2221out:
2222 return ret;
2223}
2224
2225/*
2226 * Given a full path, determine what cpos value would return us a path
2227 * containing the leaf immediately to the right of the current one.
2228 *
2229 * Will return zero if the path passed in is already the rightmost path.
2230 *
2231 * This looks similar, but is subtly different to
2232 * ocfs2_find_cpos_for_left_leaf().
2233 */
2234static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2235 struct ocfs2_path *path, u32 *cpos)
2236{
2237 int i, j, ret = 0;
2238 u64 blkno;
2239 struct ocfs2_extent_list *el;
2240
2241 *cpos = 0;
2242
2243 if (path->p_tree_depth == 0)
2244 return 0;
2245
2246 blkno = path_leaf_bh(path)->b_blocknr;
2247
2248 /* Start at the tree node just above the leaf and work our way up. */
2249 i = path->p_tree_depth - 1;
2250 while (i >= 0) {
2251 int next_free;
2252
2253 el = path->p_node[i].el;
2254
2255 /*
2256 * Find the extent record just after the one in our
2257 * path.
2258 */
2259 next_free = le16_to_cpu(el->l_next_free_rec);
2260 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2261 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2262 if (j == (next_free - 1)) {
2263 if (i == 0) {
2264 /*
2265 * We've determined that the
2266 * path specified is already
2267 * the rightmost one - return a
2268 * cpos of zero.
2269 */
2270 goto out;
2271 }
2272 /*
2273 * The rightmost record points to our
2274 * leaf - we need to travel up the
2275 * tree one level.
2276 */
2277 goto next_node;
2278 }
2279
2280 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2281 goto out;
2282 }
2283 }
2284
2285 /*
2286 * If we got here, we never found a valid node where
2287 * the tree indicated one should be.
2288 */
2289 ocfs2_error(sb,
2290 "Invalid extent tree at extent block %llu\n",
2291 (unsigned long long)blkno);
2292 ret = -EROFS;
2293 goto out;
2294
2295next_node:
2296 blkno = path->p_node[i].bh->b_blocknr;
2297 i--;
2298 }
2299
2300out:
2301 return ret;
2302}
2303
2304static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2305 handle_t *handle,
2306 struct buffer_head *bh,
2307 struct ocfs2_extent_list *el)
2308{
2309 int ret;
2310
2311 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2312 return 0;
2313
2314 ret = ocfs2_journal_access(handle, inode, bh,
2315 OCFS2_JOURNAL_ACCESS_WRITE);
2316 if (ret) {
2317 mlog_errno(ret);
2318 goto out;
2319 }
2320
2321 ocfs2_remove_empty_extent(el);
2322
2323 ret = ocfs2_journal_dirty(handle, bh);
2324 if (ret)
2325 mlog_errno(ret);
2326
2327out:
2328 return ret;
2329}
2330
2331static int __ocfs2_rotate_tree_left(struct inode *inode,
2332 handle_t *handle, int orig_credits,
2333 struct ocfs2_path *path,
2334 struct ocfs2_cached_dealloc_ctxt *dealloc,
2335 struct ocfs2_path **empty_extent_path)
2336{
2337 int ret, subtree_root, deleted;
2338 u32 right_cpos;
2339 struct ocfs2_path *left_path = NULL;
2340 struct ocfs2_path *right_path = NULL;
2341
2342 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2343
2344 *empty_extent_path = NULL;
2345
2346 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2347 &right_cpos);
2348 if (ret) {
2349 mlog_errno(ret);
2350 goto out;
2351 }
2352
2353 left_path = ocfs2_new_path(path_root_bh(path),
2354 path_root_el(path));
2355 if (!left_path) {
2356 ret = -ENOMEM;
2357 mlog_errno(ret);
2358 goto out;
2359 }
2360
2361 ocfs2_cp_path(left_path, path);
2362
2363 right_path = ocfs2_new_path(path_root_bh(path),
2364 path_root_el(path));
2365 if (!right_path) {
2366 ret = -ENOMEM;
2367 mlog_errno(ret);
2368 goto out;
2369 }
2370
2371 while (right_cpos) {
2372 ret = ocfs2_find_path(inode, right_path, right_cpos);
2373 if (ret) {
2374 mlog_errno(ret);
2375 goto out;
2376 }
2377
2378 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2379 right_path);
2380
2381 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2382 subtree_root,
2383 (unsigned long long)
2384 right_path->p_node[subtree_root].bh->b_blocknr,
2385 right_path->p_tree_depth);
2386
2387 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2388 orig_credits, left_path);
2389 if (ret) {
2390 mlog_errno(ret);
2391 goto out;
2392 }
2393
2394 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2395 right_path, subtree_root,
2396 dealloc, &deleted);
2397 if (ret == -EAGAIN) {
2398 /*
2399 * The rotation has to temporarily stop due to
2400 * the right subtree having an empty
2401 * extent. Pass it back to the caller for a
2402 * fixup.
2403 */
2404 *empty_extent_path = right_path;
2405 right_path = NULL;
2406 goto out;
2407 }
2408 if (ret) {
2409 mlog_errno(ret);
2410 goto out;
2411 }
2412
2413 /*
2414 * The subtree rotate might have removed records on
2415 * the rightmost edge. If so, then rotation is
2416 * complete.
2417 */
2418 if (deleted)
2419 break;
2420
2421 ocfs2_mv_path(left_path, right_path);
2422
2423 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2424 &right_cpos);
2425 if (ret) {
2426 mlog_errno(ret);
2427 goto out;
2428 }
2429 }
2430
2431out:
2432 ocfs2_free_path(right_path);
2433 ocfs2_free_path(left_path);
2434
2435 return ret;
2436}
2437
2438static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2439 struct ocfs2_path *path,
2440 struct ocfs2_cached_dealloc_ctxt *dealloc)
2441{
2442 int ret, subtree_index;
2443 u32 cpos;
2444 struct ocfs2_path *left_path = NULL;
2445 struct ocfs2_dinode *di;
2446 struct ocfs2_extent_block *eb;
2447 struct ocfs2_extent_list *el;
2448
2449 /*
2450 * XXX: This code assumes that the root is an inode, which is
2451 * true for now but may change as tree code gets generic.
2452 */
2453 di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2454 if (!OCFS2_IS_VALID_DINODE(di)) {
2455 ret = -EIO;
2456 ocfs2_error(inode->i_sb,
2457 "Inode %llu has invalid path root",
2458 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2459 goto out;
2460 }
2461
2462 /*
2463 * There are two ways we handle this, depending on
2464 * whether path is the only existing one.
2465 */
2466 ret = ocfs2_extend_rotate_transaction(handle, 0,
2467 handle->h_buffer_credits,
2468 path);
2469 if (ret) {
2470 mlog_errno(ret);
2471 goto out;
2472 }
2473
2474 ret = ocfs2_journal_access_path(inode, handle, path);
2475 if (ret) {
2476 mlog_errno(ret);
2477 goto out;
2478 }
2479
2480 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2481 if (ret) {
2482 mlog_errno(ret);
2483 goto out;
2484 }
2485
2486 if (cpos) {
2487 /*
2488 * We have a path to the left of this one - it needs
2489 * an update too.
2490 */
2491 left_path = ocfs2_new_path(path_root_bh(path),
2492 path_root_el(path));
2493 if (!left_path) {
2494 ret = -ENOMEM;
2495 mlog_errno(ret);
2496 goto out;
2497 }
2498
2499 ret = ocfs2_find_path(inode, left_path, cpos);
2500 if (ret) {
2501 mlog_errno(ret);
2502 goto out;
2503 }
2504
2505 ret = ocfs2_journal_access_path(inode, handle, left_path);
2506 if (ret) {
2507 mlog_errno(ret);
2508 goto out;
2509 }
2510
2511 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2512
2513 ocfs2_unlink_subtree(inode, handle, left_path, path,
2514 subtree_index, dealloc);
2515 ocfs2_update_edge_lengths(inode, handle, left_path);
2516
2517 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2518 di->i_last_eb_blk = eb->h_blkno;
2519 } else {
2520 /*
2521 * 'path' is also the leftmost path which
2522 * means it must be the only one. This gets
2523 * handled differently because we want to
2524 * revert the inode back to having extents
2525 * in-line.
2526 */
2527 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2528
2529 el = &di->id2.i_list;
2530 el->l_tree_depth = 0;
2531 el->l_next_free_rec = 0;
2532 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2533
2534 di->i_last_eb_blk = 0;
2535 }
2536
2537 ocfs2_journal_dirty(handle, path_root_bh(path));
2538
2539out:
2540 ocfs2_free_path(left_path);
2541 return ret;
2542}
2543
2544/*
2545 * Left rotation of btree records.
2546 *
2547 * In many ways, this is (unsurprisingly) the opposite of right
2548 * rotation. We start at some non-rightmost path containing an empty
2549 * extent in the leaf block. The code works its way to the rightmost
2550 * path by rotating records to the left in every subtree.
2551 *
2552 * This is used by any code which reduces the number of extent records
2553 * in a leaf. After removal, an empty record should be placed in the
2554 * leftmost list position.
2555 *
2556 * This won't handle a length update of the rightmost path records if
2557 * the rightmost tree leaf record is removed, so the caller is
2558 * responsible for detecting and correcting that.
2559 */
2560static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2561 struct ocfs2_path *path,
2562 struct ocfs2_cached_dealloc_ctxt *dealloc)
2563{
2564 int ret, orig_credits = handle->h_buffer_credits;
2565 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2566 struct ocfs2_extent_block *eb;
2567 struct ocfs2_extent_list *el;
2568
2569 el = path_leaf_el(path);
2570 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2571 return 0;
2572
2573 if (path->p_tree_depth == 0) {
2574rightmost_no_delete:
2575 /*
2576 * In-inode extents. This is trivially handled, so do
2577 * it up front.
2578 */
2579 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2580 path_leaf_bh(path),
2581 path_leaf_el(path));
2582 if (ret)
2583 mlog_errno(ret);
2584 goto out;
2585 }
2586
2587 /*
2588 * Handle the rightmost branch now. There are several cases:
2589 * 1) simple rotation leaving records in there. That's trivial.
2590 * 2) rotation requiring a branch delete - there's no more
2591 * records left. Two cases of this:
2592 * a) There are branches to the left.
2593 * b) This is also the leftmost (the only) branch.
2594 *
2595 * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
2596 * 2a) we need the left branch so that we can update it with the unlink
2597 * 2b) we need to bring the inode back to inline extents.
2598 */
2599
2600 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2601 el = &eb->h_list;
2602 if (eb->h_next_leaf_blk == 0) {
2603 /*
2604 * This gets a bit tricky if we're going to delete the
2605 * rightmost path. Get the other cases out of the way
2606 * first.
2607 */
2608 if (le16_to_cpu(el->l_next_free_rec) > 1)
2609 goto rightmost_no_delete;
2610
2611 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2612 ret = -EIO;
2613 ocfs2_error(inode->i_sb,
2614 "Inode %llu has empty extent block at %llu",
2615 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2616 (unsigned long long)le64_to_cpu(eb->h_blkno));
2617 goto out;
2618 }
2619
2620 /*
2621 * XXX: The caller cannot trust "path" any more after
2622 * this as it will have been deleted. What do we do?
2623 *
2624 * In theory the rotate-for-merge code will never get
2625 * here because it'll always ask for a rotate in a
2626 * nonempty list.
2627 */
2628
2629 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2630 dealloc);
2631 if (ret)
2632 mlog_errno(ret);
2633 goto out;
2634 }
2635
2636 /*
2637 * Now we can loop, remembering the path we get from -EAGAIN
2638 * and restarting from there.
2639 */
2640try_rotate:
2641 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2642 dealloc, &restart_path);
2643 if (ret && ret != -EAGAIN) {
2644 mlog_errno(ret);
2645 goto out;
2646 }
2647
2648 while (ret == -EAGAIN) {
2649 tmp_path = restart_path;
2650 restart_path = NULL;
2651
2652 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2653 tmp_path, dealloc,
2654 &restart_path);
2655 if (ret && ret != -EAGAIN) {
2656 mlog_errno(ret);
2657 goto out;
2658 }
2659
2660 ocfs2_free_path(tmp_path);
2661 tmp_path = NULL;
2662
2663 if (ret == 0)
2664 goto try_rotate;
2665 }
2666
2667out:
2668 ocfs2_free_path(tmp_path);
2669 ocfs2_free_path(restart_path);
2670 return ret;
2671}
2672
2673static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2674 int index)
2675{
2676 struct ocfs2_extent_rec *rec = &el->l_recs[index];
2677 unsigned int size;
2678
2679 if (rec->e_leaf_clusters == 0) {
2680 /*
2681 * We consumed all of the merged-from record. An empty
2682 * extent cannot exist anywhere but the 1st array
2683 * position, so move things over if the merged-from
2684 * record doesn't occupy that position.
2685 *
2686 * This creates a new empty extent so the caller
2687 * should be smart enough to have removed any existing
2688 * ones.
2689 */
2690 if (index > 0) {
2691 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2692 size = index * sizeof(struct ocfs2_extent_rec);
2693 memmove(&el->l_recs[1], &el->l_recs[0], size);
2694 }
2695
2696 /*
2697 * Always memset - the caller doesn't check whether it
2698 * created an empty extent, so there could be junk in
2699 * the other fields.
2700 */
2701 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2702 }
2703}
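
[Editor's note: the slot shuffle above is compact; a minimal userspace model may help. Plain types stand in for the little-endian on-disk fields; struct rec is hypothetical and is reused by the later sketches.]

#include <string.h>

struct rec { unsigned int cpos, clusters; unsigned long long blkno; };

/* Model of ocfs2_cleanup_merge(): if the merged-from record at 'index'
 * was fully consumed, slide records 0..index-1 up by one slot and
 * leave the empty record in position 0. */
static void cleanup_merge(struct rec *recs, int index)
{
	if (recs[index].clusters == 0) {
		if (index > 0)
			memmove(&recs[1], &recs[0],
				index * sizeof(struct rec));
		memset(&recs[0], 0, sizeof(struct rec));
	}
}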
2704
2705/*
2706 * Remove split_rec clusters from the record at index and merge them
2707 * onto the beginning of the record at index + 1.
2708 */
2709static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
2710 handle_t *handle,
2711 struct ocfs2_extent_rec *split_rec,
2712 struct ocfs2_extent_list *el, int index)
2713{
2714 int ret;
2715 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2716 struct ocfs2_extent_rec *left_rec;
2717 struct ocfs2_extent_rec *right_rec;
2718
2719 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2720
2721 left_rec = &el->l_recs[index];
2722 right_rec = &el->l_recs[index + 1];
2723
2724 ret = ocfs2_journal_access(handle, inode, bh,
2725 OCFS2_JOURNAL_ACCESS_WRITE);
2726 if (ret) {
2727 mlog_errno(ret);
2728 goto out;
2729 }
2730
2731 le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
2732
2733 le32_add_cpu(&right_rec->e_cpos, -split_clusters);
2734 le64_add_cpu(&right_rec->e_blkno,
2735 -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2736 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
2737
2738 ocfs2_cleanup_merge(el, index);
2739
2740 ret = ocfs2_journal_dirty(handle, bh);
2741 if (ret)
2742 mlog_errno(ret);
2743
2744out:
2745 return ret;
2746}
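
[Editor's note: continuing the model, the merge-right arithmetic transfers split_clusters from the tail of the left record to the head of the right one. The numbers in the comment are purely illustrative, assuming 8 blocks per cluster.]

/* Donate 'split' clusters from the tail of 'left' to the head of
 * 'right'; the two regions must be logically and physically adjacent. */
static void merge_right(struct rec *left, struct rec *right,
			unsigned int split, unsigned int blks_per_clu)
{
	left->clusters -= split;
	right->cpos -= split;
	right->blkno -= (unsigned long long)split * blks_per_clu;
	right->clusters += split;
}

/*
 * e.g. left  = {cpos 0, 8 clusters, blkno 1000}
 *      right = {cpos 8, 4 clusters, blkno 1064}   (8 blocks/cluster)
 * merge_right(&left, &right, 2, 8) yields
 *      left  = {cpos 0, 6 clusters, blkno 1000}
 *      right = {cpos 6, 6 clusters, blkno 1048}
 */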
2747
2748/*
2749 * Remove split_rec clusters from the record at index and merge them
2750 * onto the tail of the record at index - 1.
2751 */
2752static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2753 handle_t *handle,
2754 struct ocfs2_extent_rec *split_rec,
2755 struct ocfs2_extent_list *el, int index)
2756{
2757 int ret, has_empty_extent = 0;
2758 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2759 struct ocfs2_extent_rec *left_rec;
2760 struct ocfs2_extent_rec *right_rec;
2761
2762 BUG_ON(index <= 0);
2763
2764 left_rec = &el->l_recs[index - 1];
2765 right_rec = &el->l_recs[index];
2766 if (ocfs2_is_empty_extent(&el->l_recs[0]))
2767 has_empty_extent = 1;
2768
2769 ret = ocfs2_journal_access(handle, inode, bh,
2770 OCFS2_JOURNAL_ACCESS_WRITE);
2771 if (ret) {
2772 mlog_errno(ret);
2773 goto out;
2774 }
2775
2776 if (has_empty_extent && index == 1) {
2777 /*
2778 * The easy case - we can just plop the record right in.
2779 */
2780 *left_rec = *split_rec;
2781
2782 has_empty_extent = 0;
2783 } else {
2784 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
2785 }
2786
2787 le32_add_cpu(&right_rec->e_cpos, split_clusters);
2788 le64_add_cpu(&right_rec->e_blkno,
2789 ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2790 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
2791
2792 ocfs2_cleanup_merge(el, index);
2793
2794 ret = ocfs2_journal_dirty(handle, bh);
2795 if (ret)
2796 mlog_errno(ret);
2797
2798out:
2799 return ret;
2800}
2801
2802static int ocfs2_try_to_merge_extent(struct inode *inode,
2803 handle_t *handle,
2804 struct ocfs2_path *left_path,
2805 int split_index,
2806 struct ocfs2_extent_rec *split_rec,
2807 struct ocfs2_cached_dealloc_ctxt *dealloc,
2808 struct ocfs2_merge_ctxt *ctxt)
2809
2810{
2811 int ret = 0, delete_tail_recs = 0;
2812 struct ocfs2_extent_list *el = path_leaf_el(left_path);
2813 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2814
2815 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
2816
2817 if (ctxt->c_split_covers_rec) {
2818 delete_tail_recs++;
2819
2820 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT ||
2821 ctxt->c_has_empty_extent)
2822 delete_tail_recs++;
2823
2824 if (ctxt->c_has_empty_extent) {
2825 /*
2826 * The merge code will need to create an empty
2827 * extent to take the place of the newly
2828 * emptied slot. Remove any pre-existing empty
2829 * extents - having more than one in a leaf is
2830 * illegal.
2831 */
2832 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2833 dealloc);
2834 if (ret) {
2835 mlog_errno(ret);
2836 goto out;
2837 }
2838 split_index--;
2839 rec = &el->l_recs[split_index];
2840 }
2841 }
2842
2843 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
2844 /*
2845 * Left-right contig implies this.
2846 */
2847 BUG_ON(!ctxt->c_split_covers_rec);
2848 BUG_ON(split_index == 0);
2849
2850 /*
2851 * Since the leftright insert always covers the entire
2852 * extent, this call will delete the insert record
2853 * entirely, resulting in an empty extent record added to
2854 * the extent block.
2855 *
2856 * Since the adding of an empty extent shifts
2857 * everything back to the right, there's no need to
2858 * update split_index here.
2859 */
2860 ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
2861 handle, split_rec, el, split_index);
2862 if (ret) {
2863 mlog_errno(ret);
2864 goto out;
2865 }
2866
2867 /*
2868 * We can only get this from logic error above.
2869 */
2870 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2871
2872 /*
2873 * The left merge left us with an empty extent, remove
2874 * it.
2875 */
2876 ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
2877 if (ret) {
2878 mlog_errno(ret);
2879 goto out;
2880 }
2881 split_index--;
2882 rec = &el->l_recs[split_index];
2883
2884 /*
2885 * Note that we don't pass split_rec here on purpose -
2886 * we've merged it into the left side.
2887 */
2888 ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
2889 handle, rec, el, split_index);
2890 if (ret) {
2891 mlog_errno(ret);
2892 goto out;
2893 }
2894
2895 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2896
2897 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2898 dealloc);
2899 /*
2900 * Error from this last rotate is not critical, so
2901 * print but don't bubble it up.
2902 */
2903 if (ret)
2904 mlog_errno(ret);
2905 ret = 0;
2906 } else {
2907 /*
2908 * Merge a record to the left or right.
2909 *
2910 * 'contig_type' is relative to the existing record,
2911 * so for example, if we're "right contig", it's to
2912 * the record on the left (hence the left merge).
2913 */
2914 if (ctxt->c_contig_type == CONTIG_RIGHT) {
2915 ret = ocfs2_merge_rec_left(inode,
2916 path_leaf_bh(left_path),
2917 handle, split_rec, el,
2918 split_index);
2919 if (ret) {
2920 mlog_errno(ret);
2921 goto out;
2922 }
2923 } else {
2924 ret = ocfs2_merge_rec_right(inode,
2925 path_leaf_bh(left_path),
2926 handle, split_rec, el,
2927 split_index);
2928 if (ret) {
2929 mlog_errno(ret);
2930 goto out;
2931 }
2932 }
2933
2934 if (ctxt->c_split_covers_rec) {
2935 /*
2936 * The merge may have left an empty extent in
2937 * our leaf. Try to rotate it away.
2938 */
2939 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2940 dealloc);
2941 if (ret)
2942 mlog_errno(ret);
2943 ret = 0;
2944 }
2945 }
2946
2947out:
2948 return ret;
2949}
2950
2951static void ocfs2_subtract_from_rec(struct super_block *sb,
2952 enum ocfs2_split_type split,
2953 struct ocfs2_extent_rec *rec,
2954 struct ocfs2_extent_rec *split_rec)
2955{
2956 u64 len_blocks;
2957
2958 len_blocks = ocfs2_clusters_to_blocks(sb,
2959 le16_to_cpu(split_rec->e_leaf_clusters));
2960
2961 if (split == SPLIT_LEFT) {
2962 /*
2963 * Region is on the left edge of the existing
2964 * record.
2965 */
2966 le32_add_cpu(&rec->e_cpos,
2967 le16_to_cpu(split_rec->e_leaf_clusters));
2968 le64_add_cpu(&rec->e_blkno, len_blocks);
2969 le16_add_cpu(&rec->e_leaf_clusters,
2970 -le16_to_cpu(split_rec->e_leaf_clusters));
2971 } else {
2972 /*
2973 * Region is on the right edge of the existing
2974 * record.
2975 */
2976 le16_add_cpu(&rec->e_leaf_clusters,
2977 -le16_to_cpu(split_rec->e_leaf_clusters));
2978 }
2979}
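
[Editor's note: in the same terms, ocfs2_subtract_from_rec() is the split-side primitive; a sketch using the struct rec model from above.]

/* SPLIT_LEFT carves the split region off the front of 'rec', so its
 * logical and physical start both advance... */
static void subtract_from(struct rec *rec, unsigned int split,
			  unsigned int blks_per_clu, int split_left)
{
	if (split_left) {
		rec->cpos += split;
		rec->blkno += (unsigned long long)split * blks_per_clu;
	}
	/* ...while SPLIT_RIGHT only trims the length. */
	rec->clusters -= split;
}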
2980
1725 2981/*
1726 2982 * Do the final bits of extent record insertion at the target leaf
1727 2983 * list. If this leaf is part of an allocation tree, it is assumed
@@ -1738,6 +2994,15 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1738 2994
1739 2995 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1740 2996
2997 if (insert->ins_split != SPLIT_NONE) {
2998 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
2999 BUG_ON(i == -1);
3000 rec = &el->l_recs[i];
3001 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3002 insert_rec);
3003 goto rotate;
3004 }
3005
1741 3006 /*
1742 3007 * Contiguous insert - either left or right.
1743 3008 */
@@ -1792,6 +3057,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1792 3057 return;
1793 3058 }
1794 3059
3060rotate:
1795 3061 /*
1796 3062 * Ok, we have to rotate.
1797 3063 *
@@ -1815,13 +3081,53 @@ static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1815 3081 spin_unlock(&OCFS2_I(inode)->ip_lock);
1816 3082}
1817 3083
3084static void ocfs2_adjust_rightmost_records(struct inode *inode,
3085 handle_t *handle,
3086 struct ocfs2_path *path,
3087 struct ocfs2_extent_rec *insert_rec)
3088{
3089 int ret, i, next_free;
3090 struct buffer_head *bh;
3091 struct ocfs2_extent_list *el;
3092 struct ocfs2_extent_rec *rec;
3093
3094 /*
3095 * Update everything except the leaf block.
3096 */
3097 for (i = 0; i < path->p_tree_depth; i++) {
3098 bh = path->p_node[i].bh;
3099 el = path->p_node[i].el;
3100
3101 next_free = le16_to_cpu(el->l_next_free_rec);
3102 if (next_free == 0) {
3103 ocfs2_error(inode->i_sb,
3104 "Dinode %llu has a bad extent list",
3105 (unsigned long long)OCFS2_I(inode)->ip_blkno);
3106 ret = -EIO;
3107 return;
3108 }
3109
3110 rec = &el->l_recs[next_free - 1];
3111
3112 rec->e_int_clusters = insert_rec->e_cpos;
3113 le32_add_cpu(&rec->e_int_clusters,
3114 le16_to_cpu(insert_rec->e_leaf_clusters));
3115 le32_add_cpu(&rec->e_int_clusters,
3116 -le32_to_cpu(rec->e_cpos));
3117
3118 ret = ocfs2_journal_dirty(handle, bh);
3119 if (ret)
3120 mlog_errno(ret);
3121
3122 }
3123}
3124
1818 3125static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1819 3126 struct ocfs2_extent_rec *insert_rec,
1820 3127 struct ocfs2_path *right_path,
1821 3128 struct ocfs2_path **ret_left_path)
1822 3129{
1823 int ret, i, next_free;
3130 int ret, next_free;
1824 struct buffer_head *bh;
1825 3131 struct ocfs2_extent_list *el;
1826 3132 struct ocfs2_path *left_path = NULL;
1827 3133
@@ -1887,40 +3193,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1887 3193 goto out;
1888 3194 }
1889 3195
1890 el = path_root_el(right_path);
3196 ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
1891 bh = path_root_bh(right_path);
1892 i = 0;
1893 while (1) {
1894 struct ocfs2_extent_rec *rec;
1895
1896 next_free = le16_to_cpu(el->l_next_free_rec);
1897 if (next_free == 0) {
1898 ocfs2_error(inode->i_sb,
1899 "Dinode %llu has a bad extent list",
1900 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1901 ret = -EIO;
1902 goto out;
1903 }
1904
1905 rec = &el->l_recs[next_free - 1];
1906
1907 rec->e_int_clusters = insert_rec->e_cpos;
1908 le32_add_cpu(&rec->e_int_clusters,
1909 le16_to_cpu(insert_rec->e_leaf_clusters));
1910 le32_add_cpu(&rec->e_int_clusters,
1911 -le32_to_cpu(rec->e_cpos));
1912
1913 ret = ocfs2_journal_dirty(handle, bh);
1914 if (ret)
1915 mlog_errno(ret);
1916
1917 /* Don't touch the leaf node */
1918 if (++i >= right_path->p_tree_depth)
1919 break;
1920
1921 bh = right_path->p_node[i].bh;
1922 el = right_path->p_node[i].el;
1923 }
1924 3197
1925 3198 *ret_left_path = left_path;
1926 3199 ret = 0;
@@ -1931,6 +3204,83 @@ out:
1931 3204 return ret;
1932 3205}
1933 3206
3207static void ocfs2_split_record(struct inode *inode,
3208 struct ocfs2_path *left_path,
3209 struct ocfs2_path *right_path,
3210 struct ocfs2_extent_rec *split_rec,
3211 enum ocfs2_split_type split)
3212{
3213 int index;
3214 u32 cpos = le32_to_cpu(split_rec->e_cpos);
3215 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3216 struct ocfs2_extent_rec *rec, *tmprec;
3217
3218 right_el = path_leaf_el(right_path);
3219 if (left_path)
3220 left_el = path_leaf_el(left_path);
3221
3222 el = right_el;
3223 insert_el = right_el;
3224 index = ocfs2_search_extent_list(el, cpos);
3225 if (index != -1) {
3226 if (index == 0 && left_path) {
3227 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3228
3229 /*
3230 * This typically means that the record
3231 * started in the left path but moved to the
3232 * right as a result of rotation. We either
3233 * move the existing record to the left, or we
3234 * do the later insert there.
3235 *
3236 * In this case, the left path should always
3237 * exist as the rotate code will have passed
3238 * it back for a post-insert update.
3239 */
3240
3241 if (split == SPLIT_LEFT) {
3242 /*
3243 * It's a left split. Since we know
3244 * that the rotate code gave us an
3245 * empty extent in the left path, we
3246 * can just do the insert there.
3247 */
3248 insert_el = left_el;
3249 } else {
3250 /*
3251 * Right split - we have to move the
3252 * existing record over to the left
3253 * leaf. The insert will be into the
3254 * newly created empty extent in the
3255 * right leaf.
3256 */
3257 tmprec = &right_el->l_recs[index];
3258 ocfs2_rotate_leaf(left_el, tmprec);
3259 el = left_el;
3260
3261 memset(tmprec, 0, sizeof(*tmprec));
3262 index = ocfs2_search_extent_list(left_el, cpos);
3263 BUG_ON(index == -1);
3264 }
3265 }
3266 } else {
3267 BUG_ON(!left_path);
3268 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3269 /*
3270 * Left path is easy - we can just allow the insert to
3271 * happen.
3272 */
3273 el = left_el;
3274 insert_el = left_el;
3275 index = ocfs2_search_extent_list(el, cpos);
3276 BUG_ON(index == -1);
3277 }
3278
3279 rec = &el->l_recs[index];
3280 ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3281 ocfs2_rotate_leaf(insert_el, split_rec);
3282}
3283
1934 3284/*
1935 3285 * This function only does inserts on an allocation b-tree. For dinode
1936 3286 * lists, ocfs2_insert_at_leaf() is called directly.
@@ -1948,7 +3298,6 @@ static int ocfs2_insert_path(struct inode *inode,
1948 3298{
1949 3299 int ret, subtree_index;
1950 3300 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1951 struct ocfs2_extent_list *el;
1952 3301
1953 3302 /*
1954 3303 * Pass both paths to the journal. The majority of inserts
@@ -1984,9 +3333,18 @@ static int ocfs2_insert_path(struct inode *inode,
1984 3333 }
1985 3334 }
1986 3335
1987 el = path_leaf_el(right_path);
3336 if (insert->ins_split != SPLIT_NONE) {
3337 /*
3338 * We could call ocfs2_insert_at_leaf() for some types
3339 * of splits, but it's easier to just let one separate
3340 * function sort it all out.
3341 */
3342 ocfs2_split_record(inode, left_path, right_path,
3343 insert_rec, insert->ins_split);
3344 } else
3345 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3346 insert, inode);
1988 3347
1989 ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1990 3348 ret = ocfs2_journal_dirty(handle, leaf_bh);
1991 3349 if (ret)
1992 3350 mlog_errno(ret);
@@ -2075,7 +3433,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2075 3433 * can wind up skipping both of these two special cases...
2076 3434 */
2077 3435 if (rotate) {
2078 ret = ocfs2_rotate_tree_right(inode, handle,
3436 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
2079 3437 le32_to_cpu(insert_rec->e_cpos),
2080 3438 right_path, &left_path);
2081 3439 if (ret) {
@@ -2100,8 +3458,9 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2100 3458 }
2101 3459
2102 3460out_update_clusters:
2103 ocfs2_update_dinode_clusters(inode, di,
2104 le16_to_cpu(insert_rec->e_leaf_clusters));
3461 if (type->ins_split == SPLIT_NONE)
3462 ocfs2_update_dinode_clusters(inode, di,
3463 le16_to_cpu(insert_rec->e_leaf_clusters));
2105 3464
2106 3465 ret = ocfs2_journal_dirty(handle, di_bh);
2107 3466 if (ret)
@@ -2114,6 +3473,44 @@ out:
2114 3473 return ret;
2115 3474}
2116 3475
3476static enum ocfs2_contig_type
3477ocfs2_figure_merge_contig_type(struct inode *inode,
3478 struct ocfs2_extent_list *el, int index,
3479 struct ocfs2_extent_rec *split_rec)
3480{
3481 struct ocfs2_extent_rec *rec;
3482 enum ocfs2_contig_type ret = CONTIG_NONE;
3483
3484 /*
3485 * We're careful to check for an empty extent record here -
3486 * the merge code will know what to do if it sees one.
3487 */
3488
3489 if (index > 0) {
3490 rec = &el->l_recs[index - 1];
3491 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3492 if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3493 ret = CONTIG_RIGHT;
3494 } else {
3495 ret = ocfs2_extent_contig(inode, rec, split_rec);
3496 }
3497 }
3498
3499 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
3500 enum ocfs2_contig_type contig_type;
3501
3502 rec = &el->l_recs[index + 1];
3503 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3504
3505 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
3506 ret = CONTIG_LEFTRIGHT;
3507 else if (ret == CONTIG_NONE)
3508 ret = contig_type;
3509 }
3510
3511 return ret;
3512}
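
[Editor's note: ignoring the empty-extent special case and the physical block-contiguity test that ocfs2_extent_contig() also performs, the classification reduces to a check against the logical neighbours. A rough standalone sketch, reusing struct rec from earlier.]

enum contig { NONE, LEFT, RIGHT, LEFTRIGHT };

/* 'prev'/'next' may be NULL at the leaf edges. RIGHT means the split
 * record continues the record on its left, and vice versa. */
static enum contig classify(const struct rec *prev, const struct rec *next,
			     const struct rec *split)
{
	int after_prev = prev && prev->cpos + prev->clusters == split->cpos;
	int before_next = next && split->cpos + split->clusters == next->cpos;

	if (after_prev && before_next)
		return LEFTRIGHT;
	if (after_prev)
		return RIGHT;
	if (before_next)
		return LEFT;
	return NONE;
}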
3513
2117 3514static void ocfs2_figure_contig_type(struct inode *inode,
2118 3515 struct ocfs2_insert_type *insert,
2119 3516 struct ocfs2_extent_list *el,
@@ -2205,6 +3602,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
2205 3602 struct ocfs2_path *path = NULL;
2206 3603 struct buffer_head *bh = NULL;
2207 3604
3605 insert->ins_split = SPLIT_NONE;
3606
2208 3607 el = &di->id2.i_list;
2209 3608 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2210 3609
@@ -2327,9 +3726,10 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2327 3726 u32 cpos,
2328 3727 u64 start_blk,
2329 3728 u32 new_clusters,
3729 u8 flags,
2330 3730 struct ocfs2_alloc_context *meta_ac)
2331 3731{
2332 int status, shift;
3732 int status;
2333 3733 struct buffer_head *last_eb_bh = NULL;
2334 3734 struct buffer_head *bh = NULL;
2335 3735 struct ocfs2_insert_type insert = {0, };
@@ -2350,6 +3750,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2350 3750 rec.e_cpos = cpu_to_le32(cpos);
2351 3751 rec.e_blkno = cpu_to_le64(start_blk);
2352 3752 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
3753 rec.e_flags = flags;
2353 3754
2354 3755 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2355 3756 &insert);
@@ -2364,55 +3765,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2364 3765 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2365 3766 insert.ins_free_records, insert.ins_tree_depth);
2366 3767
2367 /*
2368 * Avoid growing the tree unless we're out of records and the
2369 * insert type requres one.
2370 */
2371 if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
3768 if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) {
3769 status = ocfs2_grow_tree(inode, handle, fe_bh,
3770 &insert.ins_tree_depth, &last_eb_bh,
3771 meta_ac);
3772 if (status) {
2372 goto out_add;
2373
2374 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
2375 if (shift < 0) {
2376 status = shift;
2377 mlog_errno(status);
2378 goto bail;
2379 }
2380
2381 /* We traveled all the way to the bottom of the allocation tree
2382 * and didn't find room for any more extents - we need to add
2383 * another tree level */
2384 if (shift) {
2385 BUG_ON(bh);
2386 mlog(0, "need to shift tree depth "
2387 "(current = %d)\n", insert.ins_tree_depth);
2388
2389 /* ocfs2_shift_tree_depth will return us a buffer with
2390 * the new extent block (so we can pass that to
2391 * ocfs2_add_branch). */
2392 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
2393 meta_ac, &bh);
2394 if (status < 0) {
2395 3773 mlog_errno(status);
2396 3774 goto bail;
2397 3775 }
2398 insert.ins_tree_depth++;
2399 /* Special case: we have room now if we shifted from
2400 * tree_depth 0 */
2401 if (insert.ins_tree_depth == 1)
2402 goto out_add;
2403 }
2404
2405 /* call ocfs2_add_branch to add the final part of the tree with
2406 * the new data. */
2407 mlog(0, "add branch. bh = %p\n", bh);
2408 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
2409 meta_ac);
2410 if (status < 0) {
2411 mlog_errno(status);
2412 goto bail;
2413 3776 }
2414 3777
2415out_add:
2416 3778 /* Finally, we can add clusters. This might rotate the tree for us. */
2417 3779 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
2418 3780 if (status < 0)
@@ -2431,7 +3793,720 @@ bail:
2431 3793 return status;
2432 3794}
2433 3795
2434static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
3796static void ocfs2_make_right_split_rec(struct super_block *sb,
3797 struct ocfs2_extent_rec *split_rec,
3798 u32 cpos,
3799 struct ocfs2_extent_rec *rec)
3800{
3801 u32 rec_cpos = le32_to_cpu(rec->e_cpos);
3802 u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
3803
3804 memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
3805
3806 split_rec->e_cpos = cpu_to_le32(cpos);
3807 split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
3808
3809 split_rec->e_blkno = rec->e_blkno;
3810 le64_add_cpu(&split_rec->e_blkno,
3811 ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
3812
3813 split_rec->e_flags = rec->e_flags;
3814}
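
[Editor's note: a worked example with hypothetical numbers - splitting rec = {cpos 100, 8 clusters, blkno B} at cpos 104 leaves the right-hand remainder:]

/*
 * split_rec.e_cpos          = 104
 * split_rec.e_leaf_clusters = (100 + 8) - 104 = 4
 * split_rec.e_blkno         = B + ocfs2_clusters_to_blocks(sb, 104 - 100)
 * split_rec.e_flags         = rec.e_flags
 */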
3815
3816static int ocfs2_split_and_insert(struct inode *inode,
3817 handle_t *handle,
3818 struct ocfs2_path *path,
3819 struct buffer_head *di_bh,
3820 struct buffer_head **last_eb_bh,
3821 int split_index,
3822 struct ocfs2_extent_rec *orig_split_rec,
3823 struct ocfs2_alloc_context *meta_ac)
3824{
3825 int ret = 0, depth;
3826 unsigned int insert_range, rec_range, do_leftright = 0;
3827 struct ocfs2_extent_rec tmprec;
3828 struct ocfs2_extent_list *rightmost_el;
3829 struct ocfs2_extent_rec rec;
3830 struct ocfs2_extent_rec split_rec = *orig_split_rec;
3831 struct ocfs2_insert_type insert;
3832 struct ocfs2_extent_block *eb;
3833 struct ocfs2_dinode *di;
3834
3835leftright:
3836 /*
3837 * Store a copy of the record on the stack - it might move
3838 * around as the tree is manipulated below.
3839 */
3840 rec = path_leaf_el(path)->l_recs[split_index];
3841
3842 di = (struct ocfs2_dinode *)di_bh->b_data;
3843 rightmost_el = &di->id2.i_list;
3844
3845 depth = le16_to_cpu(rightmost_el->l_tree_depth);
3846 if (depth) {
3847 BUG_ON(!(*last_eb_bh));
3848 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
3849 rightmost_el = &eb->h_list;
3850 }
3851
3852 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
3853 le16_to_cpu(rightmost_el->l_count)) {
3854 int old_depth = depth;
3855
3856 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
3857 meta_ac);
3858 if (ret) {
3859 mlog_errno(ret);
3860 goto out;
3861 }
3862
3863 if (old_depth != depth) {
3864 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
3865 rightmost_el = &eb->h_list;
3866 }
3867 }
3868
3869 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
3870 insert.ins_appending = APPEND_NONE;
3871 insert.ins_contig = CONTIG_NONE;
3872 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
3873 - le16_to_cpu(rightmost_el->l_next_free_rec);
3874 insert.ins_tree_depth = depth;
3875
3876 insert_range = le32_to_cpu(split_rec.e_cpos) +
3877 le16_to_cpu(split_rec.e_leaf_clusters);
3878 rec_range = le32_to_cpu(rec.e_cpos) +
3879 le16_to_cpu(rec.e_leaf_clusters);
3880
3881 if (split_rec.e_cpos == rec.e_cpos) {
3882 insert.ins_split = SPLIT_LEFT;
3883 } else if (insert_range == rec_range) {
3884 insert.ins_split = SPLIT_RIGHT;
3885 } else {
3886 /*
3887 * Left/right split. We fake this as a right split
3888 * first and then make a second pass as a left split.
3889 */
3890 insert.ins_split = SPLIT_RIGHT;
3891
3892 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
3893 &rec);
3894
3895 split_rec = tmprec;
3896
3897 BUG_ON(do_leftright);
3898 do_leftright = 1;
3899 }
3900
3901 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
3902 &insert);
3903 if (ret) {
3904 mlog_errno(ret);
3905 goto out;
3906 }
3907
3908 if (do_leftright == 1) {
3909 u32 cpos;
3910 struct ocfs2_extent_list *el;
3911
3912 do_leftright++;
3913 split_rec = *orig_split_rec;
3914
3915 ocfs2_reinit_path(path, 1);
3916
3917 cpos = le32_to_cpu(split_rec.e_cpos);
3918 ret = ocfs2_find_path(inode, path, cpos);
3919 if (ret) {
3920 mlog_errno(ret);
3921 goto out;
3922 }
3923
3924 el = path_leaf_el(path);
3925 split_index = ocfs2_search_extent_list(el, cpos);
3926 goto leftright;
3927 }
3928out:
3929
3930 return ret;
3931}
3932
3933/*
3934 * Mark part or all of the extent record at split_index in the leaf
3935 * pointed to by path as written. This removes the unwritten
3936 * extent flag.
3937 *
3938 * Care is taken to handle contiguousness so as to not grow the tree.
3939 *
3940 * meta_ac is not strictly necessary - we only truly need it if growth
3941 * of the tree is required. All other cases will degrade into a less
3942 * optimal tree layout.
3943 *
3944 * last_eb_bh should be the rightmost leaf block for any inode with a
3945 * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
3946 *
3947 * This code is optimized for readability - several passes might be
3948 * made over certain portions of the tree. All of those blocks will
3949 * have been brought into cache (and pinned via the journal), so the
3950 * extra overhead is not expressed in terms of disk reads.
3951 */
3952static int __ocfs2_mark_extent_written(struct inode *inode,
3953 struct buffer_head *di_bh,
3954 handle_t *handle,
3955 struct ocfs2_path *path,
3956 int split_index,
3957 struct ocfs2_extent_rec *split_rec,
3958 struct ocfs2_alloc_context *meta_ac,
3959 struct ocfs2_cached_dealloc_ctxt *dealloc)
3960{
3961 int ret = 0;
3962 struct ocfs2_extent_list *el = path_leaf_el(path);
3963 struct buffer_head *eb_bh, *last_eb_bh = NULL;
3964 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3965 struct ocfs2_merge_ctxt ctxt;
3966 struct ocfs2_extent_list *rightmost_el;
3967
3968 if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
3969 ret = -EIO;
3970 mlog_errno(ret);
3971 goto out;
3972 }
3973
3974 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
3975 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
3976 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
3977 ret = -EIO;
3978 mlog_errno(ret);
3979 goto out;
3980 }
3981
3982 eb_bh = path_leaf_bh(path);
3983 ret = ocfs2_journal_access(handle, inode, eb_bh,
3984 OCFS2_JOURNAL_ACCESS_WRITE);
3985 if (ret) {
3986 mlog_errno(ret);
3987 goto out;
3988 }
3989
3990 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
3991 split_index,
3992 split_rec);
3993
3994 /*
3995 * The core merge / split code wants to know how much room is
3996 * left in this inode's allocation tree, so we pass the
3997 * rightmost extent list.
3998 */
3999 if (path->p_tree_depth) {
4000 struct ocfs2_extent_block *eb;
4001 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4002
4003 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4004 le64_to_cpu(di->i_last_eb_blk),
4005 &last_eb_bh, OCFS2_BH_CACHED, inode);
4006 if (ret) {
4007 mlog_errno(ret);
4008 goto out;
4009 }
4010
4011 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4012 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4013 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4014 ret = -EROFS;
4015 goto out;
4016 }
4017
4018 rightmost_el = &eb->h_list;
4019 } else
4020 rightmost_el = path_root_el(path);
4021
4022 ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
4023 if (ctxt.c_used_tail_recs > 0 &&
4024 ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
4025 ctxt.c_used_tail_recs--;
4026
4027 if (rec->e_cpos == split_rec->e_cpos &&
4028 rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4029 ctxt.c_split_covers_rec = 1;
4030 else
4031 ctxt.c_split_covers_rec = 0;
4032
4033 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4034
4035 mlog(0, "index: %d, contig: %u, used_tail_recs: %u, "
4036 "has_empty: %u, split_covers: %u\n", split_index,
4037 ctxt.c_contig_type, ctxt.c_used_tail_recs,
4038 ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
4039
4040 if (ctxt.c_contig_type == CONTIG_NONE) {
4041 if (ctxt.c_split_covers_rec)
4042 el->l_recs[split_index] = *split_rec;
4043 else
4044 ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
4045 &last_eb_bh, split_index,
4046 split_rec, meta_ac);
4047 if (ret)
4048 mlog_errno(ret);
4049 } else {
4050 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4051 split_index, split_rec,
4052 dealloc, &ctxt);
4053 if (ret)
4054 mlog_errno(ret);
4055 }
4056
4057 ocfs2_journal_dirty(handle, eb_bh);
4058
4059out:
4060 brelse(last_eb_bh);
4061 return ret;
4062}
4063
4064/*
4065 * Mark the already-existing extent at cpos as written for len clusters.
4066 *
4067 * If the existing extent is larger than the request, initiate a
4068 * split. An attempt will be made at merging with adjacent extents.
4069 *
4070 * The caller is responsible for passing down meta_ac if we'll need it.
4071 */
4072int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4073 handle_t *handle, u32 cpos, u32 len, u32 phys,
4074 struct ocfs2_alloc_context *meta_ac,
4075 struct ocfs2_cached_dealloc_ctxt *dealloc)
4076{
4077 int ret, index;
4078 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4079 struct ocfs2_extent_rec split_rec;
4080 struct ocfs2_path *left_path = NULL;
4081 struct ocfs2_extent_list *el;
4082
4083 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4084 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4085
4086 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4087 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4088 "that are being written to, but the feature bit "
4089 "is not set in the super block.",
4090 (unsigned long long)OCFS2_I(inode)->ip_blkno);
4091 ret = -EROFS;
4092 goto out;
4093 }
4094
4095 /*
4096 * XXX: This should be fixed up so that we just re-insert the
4097 * next extent records.
4098 */
4099 ocfs2_extent_map_trunc(inode, 0);
4100
4101 left_path = ocfs2_new_inode_path(di_bh);
4102 if (!left_path) {
4103 ret = -ENOMEM;
4104 mlog_errno(ret);
4105 goto out;
4106 }
4107
4108 ret = ocfs2_find_path(inode, left_path, cpos);
4109 if (ret) {
4110 mlog_errno(ret);
4111 goto out;
4112 }
4113 el = path_leaf_el(left_path);
4114
4115 index = ocfs2_search_extent_list(el, cpos);
4116 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4117 ocfs2_error(inode->i_sb,
4118 "Inode %llu has an extent at cpos %u which can no "
4119 "longer be found.\n",
4120 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4121 ret = -EROFS;
4122 goto out;
4123 }
4124
4125 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4126 split_rec.e_cpos = cpu_to_le32(cpos);
4127 split_rec.e_leaf_clusters = cpu_to_le16(len);
4128 split_rec.e_blkno = cpu_to_le64(start_blkno);
4129 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4130 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4131
4132 ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
4133 index, &split_rec, meta_ac, dealloc);
4134 if (ret)
4135 mlog_errno(ret);
4136
4137out:
4138 ocfs2_free_path(left_path);
4139 return ret;
4140}
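
[Editor's note: a sketch of the expected call pattern, assuming the caller has already taken the inode meta lock and computed 'credits' and 'meta_ac'; ocfs2_init_dealloc_ctxt() is the small initializer this patch pairs with ocfs2_run_deallocs().]

	struct ocfs2_cached_dealloc_ctxt dealloc;
	handle_t *handle;
	int ret;

	ocfs2_init_dealloc_ctxt(&dealloc);

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* Clear the unwritten flag on len clusters at cpos / phys. */
	ret = ocfs2_mark_extent_written(inode, di_bh, handle, cpos, len,
					phys, meta_ac, &dealloc);
	if (ret)
		mlog_errno(ret);

	ocfs2_commit_trans(osb, handle);

	/* Deferred suballocator frees happen outside the transaction. */
	ocfs2_run_deallocs(osb, &dealloc);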
4141
4142static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4143 handle_t *handle, struct ocfs2_path *path,
4144 int index, u32 new_range,
4145 struct ocfs2_alloc_context *meta_ac)
4146{
4147 int ret, depth, credits = handle->h_buffer_credits;
4148 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4149 struct buffer_head *last_eb_bh = NULL;
4150 struct ocfs2_extent_block *eb;
4151 struct ocfs2_extent_list *rightmost_el, *el;
4152 struct ocfs2_extent_rec split_rec;
4153 struct ocfs2_extent_rec *rec;
4154 struct ocfs2_insert_type insert;
4155
4156 /*
4157 * Setup the record to split before we grow the tree.
4158 */
4159 el = path_leaf_el(path);
4160 rec = &el->l_recs[index];
4161 ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4162
4163 depth = path->p_tree_depth;
4164 if (depth > 0) {
4165 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4166 le64_to_cpu(di->i_last_eb_blk),
4167 &last_eb_bh, OCFS2_BH_CACHED, inode);
4168 if (ret < 0) {
4169 mlog_errno(ret);
4170 goto out;
4171 }
4172
4173 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4174 rightmost_el = &eb->h_list;
4175 } else
4176 rightmost_el = path_leaf_el(path);
4177
4178 credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
4179 ret = ocfs2_extend_trans(handle, credits);
4180 if (ret) {
4181 mlog_errno(ret);
4182 goto out;
4183 }
4184
4185 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4186 le16_to_cpu(rightmost_el->l_count)) {
4187 int old_depth = depth;
4188
4189 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
4190 meta_ac);
4191 if (ret) {
4192 mlog_errno(ret);
4193 goto out;
4194 }
4195
4196 if (old_depth != depth) {
4197 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
4198 rightmost_el = &eb->h_list;
4199 }
4200 }
4201
4202 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4203 insert.ins_appending = APPEND_NONE;
4204 insert.ins_contig = CONTIG_NONE;
4205 insert.ins_split = SPLIT_RIGHT;
4206 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
4207 - le16_to_cpu(rightmost_el->l_next_free_rec);
4208 insert.ins_tree_depth = depth;
4209
4210 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
4211 if (ret)
4212 mlog_errno(ret);
4213
4214out:
4215 brelse(last_eb_bh);
4216 return ret;
4217}
4218
4219static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4220 struct ocfs2_path *path, int index,
4221 struct ocfs2_cached_dealloc_ctxt *dealloc,
4222 u32 cpos, u32 len)
4223{
4224 int ret;
4225 u32 left_cpos, rec_range, trunc_range;
4226 int wants_rotate = 0, is_rightmost_tree_rec = 0;
4227 struct super_block *sb = inode->i_sb;
4228 struct ocfs2_path *left_path = NULL;
4229 struct ocfs2_extent_list *el = path_leaf_el(path);
4230 struct ocfs2_extent_rec *rec;
4231 struct ocfs2_extent_block *eb;
4232
4233 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4234 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4235 if (ret) {
4236 mlog_errno(ret);
4237 goto out;
4238 }
4239
4240 index--;
4241 }
4242
4243 if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
4244 path->p_tree_depth) {
4245 /*
4246 * Check whether this is the rightmost tree record. If
4247 * we remove all of this record or part of its right
4248 * edge then an update of the record lengths above it
4249 * will be required.
4250 */
4251 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
4252 if (eb->h_next_leaf_blk == 0)
4253 is_rightmost_tree_rec = 1;
4254 }
4255
4256 rec = &el->l_recs[index];
4257 if (index == 0 && path->p_tree_depth &&
4258 le32_to_cpu(rec->e_cpos) == cpos) {
4259 /*
4260 * Changing the leftmost offset (via partial or whole
4261 * record truncate) of an interior (or rightmost) path
4262 * means we have to update the subtree that is formed
4263 * by this leaf and the one to its left.
4264 *
4265 * There are two cases we can skip:
4266 * 1) Path is the leftmost one in our inode tree.
4267 * 2) The leaf is rightmost and will be empty after
4268 * we remove the extent record - the rotate code
4269 * knows how to update the newly formed edge.
4270 */
4271
4272 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
4273 &left_cpos);
4274 if (ret) {
4275 mlog_errno(ret);
4276 goto out;
4277 }
4278
4279 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
4280 left_path = ocfs2_new_path(path_root_bh(path),
4281 path_root_el(path));
4282 if (!left_path) {
4283 ret = -ENOMEM;
4284 mlog_errno(ret);
4285 goto out;
4286 }
4287
4288 ret = ocfs2_find_path(inode, left_path, left_cpos);
4289 if (ret) {
4290 mlog_errno(ret);
4291 goto out;
4292 }
4293 }
4294 }
4295
4296 ret = ocfs2_extend_rotate_transaction(handle, 0,
4297 handle->h_buffer_credits,
4298 path);
4299 if (ret) {
4300 mlog_errno(ret);
4301 goto out;
4302 }
4303
4304 ret = ocfs2_journal_access_path(inode, handle, path);
4305 if (ret) {
4306 mlog_errno(ret);
4307 goto out;
4308 }
4309
4310 ret = ocfs2_journal_access_path(inode, handle, left_path);
4311 if (ret) {
4312 mlog_errno(ret);
4313 goto out;
4314 }
4315
4316 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4317 trunc_range = cpos + len;
4318
4319 if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
4320 int next_free;
4321
4322 memset(rec, 0, sizeof(*rec));
4323 ocfs2_cleanup_merge(el, index);
4324 wants_rotate = 1;
4325
4326 next_free = le16_to_cpu(el->l_next_free_rec);
4327 if (is_rightmost_tree_rec && next_free > 1) {
4328 /*
4329 * We skip the edge update if this path will
4330 * be deleted by the rotate code.
4331 */
4332 rec = &el->l_recs[next_free - 1];
4333 ocfs2_adjust_rightmost_records(inode, handle, path,
4334 rec);
4335 }
4336 } else if (le32_to_cpu(rec->e_cpos) == cpos) {
4337 /* Remove leftmost portion of the record. */
4338 le32_add_cpu(&rec->e_cpos, len);
4339 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
4340 le16_add_cpu(&rec->e_leaf_clusters, -len);
4341 } else if (rec_range == trunc_range) {
4342 /* Remove rightmost portion of the record */
4343 le16_add_cpu(&rec->e_leaf_clusters, -len);
4344 if (is_rightmost_tree_rec)
4345 ocfs2_adjust_rightmost_records(inode, handle, path, rec);
4346 } else {
4347 /* Caller should have trapped this. */
4348 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
4349 "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
4350 le32_to_cpu(rec->e_cpos),
4351 le16_to_cpu(rec->e_leaf_clusters), cpos, len);
4352 BUG();
4353 }
4354
4355 if (left_path) {
4356 int subtree_index;
4357
4358 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
4359 ocfs2_complete_edge_insert(inode, handle, left_path, path,
4360 subtree_index);
4361 }
4362
4363 ocfs2_journal_dirty(handle, path_leaf_bh(path));
4364
4365 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4366 if (ret) {
4367 mlog_errno(ret);
4368 goto out;
4369 }
4370
4371out:
4372 ocfs2_free_path(left_path);
4373 return ret;
4374}
4375
4376int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4377 u32 cpos, u32 len, handle_t *handle,
4378 struct ocfs2_alloc_context *meta_ac,
4379 struct ocfs2_cached_dealloc_ctxt *dealloc)
4380{
4381 int ret, index;
4382 u32 rec_range, trunc_range;
4383 struct ocfs2_extent_rec *rec;
4384 struct ocfs2_extent_list *el;
4385 struct ocfs2_path *path;
4386
4387 ocfs2_extent_map_trunc(inode, 0);
4388
4389 path = ocfs2_new_inode_path(di_bh);
4390 if (!path) {
4391 ret = -ENOMEM;
4392 mlog_errno(ret);
4393 goto out;
4394 }
4395
4396 ret = ocfs2_find_path(inode, path, cpos);
4397 if (ret) {
4398 mlog_errno(ret);
4399 goto out;
4400 }
4401
4402 el = path_leaf_el(path);
4403 index = ocfs2_search_extent_list(el, cpos);
4404 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4405 ocfs2_error(inode->i_sb,
4406 "Inode %llu has an extent at cpos %u which can no "
4407 "longer be found.\n",
4408 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4409 ret = -EROFS;
4410 goto out;
4411 }
4412
4413 /*
4414 * We have 3 cases of extent removal:
4415 * 1) Range covers the entire extent rec
4416 * 2) Range begins or ends on one edge of the extent rec
4417 * 3) Range is in the middle of the extent rec (no shared edges)
4418 *
4419 * For case 1 we remove the extent rec and left rotate to
4420 * fill the hole.
4421 *
4422 * For case 2 we just shrink the existing extent rec, with a
4423 * tree update if the shrinking edge is also the edge of an
4424 * extent block.
4425 *
4426 * For case 3 we do a right split to turn the extent rec into
4427 * something case 2 can handle.
4428 */
4429 rec = &el->l_recs[index];
4430 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4431 trunc_range = cpos + len;
4432
4433 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
4434
4435 mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
4436 "(cpos %u, len %u)\n",
4437 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
4438 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
4439
4440 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
4441 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4442 cpos, len);
4443 if (ret) {
4444 mlog_errno(ret);
4445 goto out;
4446 }
4447 } else {
4448 ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
4449 trunc_range, meta_ac);
4450 if (ret) {
4451 mlog_errno(ret);
4452 goto out;
4453 }
4454
4455 /*
4456 * The split could have manipulated the tree enough to
4457 * move the record location, so we have to look for it again.
4458 */
4459 ocfs2_reinit_path(path, 1);
4460
4461 ret = ocfs2_find_path(inode, path, cpos);
4462 if (ret) {
4463 mlog_errno(ret);
4464 goto out;
4465 }
4466
4467 el = path_leaf_el(path);
4468 index = ocfs2_search_extent_list(el, cpos);
4469 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4470 ocfs2_error(inode->i_sb,
4471 "Inode %llu: split at cpos %u lost record.",
4472 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4473 cpos);
4474 ret = -EROFS;
4475 goto out;
4476 }
4477
4478 /*
4479 * Double check our values here. If anything is fishy,
4480 * it's easier to catch it at the top level.
4481 */
4482 rec = &el->l_recs[index];
4483 rec_range = le32_to_cpu(rec->e_cpos) +
4484 ocfs2_rec_clusters(el, rec);
4485 if (rec_range != trunc_range) {
4486 ocfs2_error(inode->i_sb,
4487 "Inode %llu: error after split at cpos %u"
4488 "trunc len %u, existing record is (%u,%u)",
4489 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4490 cpos, len, le32_to_cpu(rec->e_cpos),
4491 ocfs2_rec_clusters(el, rec));
4492 ret = -EROFS;
4493 goto out;
4494 }
4495
4496 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4497 cpos, len);
4498 if (ret) {
4499 mlog_errno(ret);
4500 goto out;
4501 }
4502 }
4503
4504out:
4505 ocfs2_free_path(path);
4506 return ret;
4507}
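
[Editor's note: the three-case comment above boils down to comparing rec_range and trunc_range; a small sketch of the classification, mirroring the test in the function.]

/* rec covers [cpos, cpos + clusters); the request removes
 * [r_cpos, r_cpos + r_len), which must lie inside the record. */
static const char *removal_case(unsigned int cpos, unsigned int clusters,
				unsigned int r_cpos, unsigned int r_len)
{
	unsigned int rec_range = cpos + clusters;
	unsigned int trunc_range = r_cpos + r_len;

	if (r_cpos == cpos && trunc_range == rec_range)
		return "case 1: whole record - remove and rotate left";
	if (r_cpos == cpos || trunc_range == rec_range)
		return "case 2: shared edge - shrink the record";
	return "case 3: interior - right split, then case 2";
}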
4508
4509int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
2435 4510{
2436 4511 struct buffer_head *tl_bh = osb->osb_tl_bh;
2437 4512 struct ocfs2_dinode *di;
@@ -2464,10 +4539,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
2464 4539 return current_tail == new_start;
2465 4540}
2466 4541
2467static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
4542int ocfs2_truncate_log_append(struct ocfs2_super *osb,
2468 4543 handle_t *handle,
2469 4544 u64 start_blk,
2470 4545 unsigned int num_clusters)
2471 4546{
2472 4547 int status, index;
2473 4548 unsigned int start_cluster, tl_count;
@@ -2623,7 +4698,7 @@ bail:
2623 4698}
2624 4699
2625 4700/* Expects you to already be holding tl_inode->i_mutex */
2626static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4701int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2627 4702{
2628 4703 int status;
2629 4704 unsigned int num_to_flush;
@@ -2957,6 +5032,219 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
2957 5032 return status;
2958 5033}
2959 5034
5035/*
5036 * Delayed de-allocation of suballocator blocks.
5037 *
5038 * Some sets of block de-allocations might involve multiple suballocator inodes.
5039 *
5040 * The locking for this can get extremely complicated, especially when
5041 * the suballocator inodes to delete from aren't known until deep
5042 * within an unrelated codepath.
5043 *
5044 * ocfs2_extent_block structures are a good example of this - an inode
5045 * btree could have been grown by any number of nodes each allocating
5046 * out of their own suballoc inode.
5047 *
5048 * These structures allow the delay of block de-allocation until a
5049 * later time, when locking of multiple cluster inodes won't cause
5050 * deadlock.
5051 */
5052
5053/*
5054 * Describes a single block free from a suballocator
5055 */
5056struct ocfs2_cached_block_free {
5057 struct ocfs2_cached_block_free *free_next;
5058 u64 free_blk;
5059 unsigned int free_bit;
5060};
5061
5062struct ocfs2_per_slot_free_list {
5063 struct ocfs2_per_slot_free_list *f_next_suballocator;
5064 int f_inode_type;
5065 int f_slot;
5066 struct ocfs2_cached_block_free *f_first;
5067};
5068
5069static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5070 int sysfile_type,
5071 int slot,
5072 struct ocfs2_cached_block_free *head)
5073{
5074 int ret;
5075 u64 bg_blkno;
5076 handle_t *handle;
5077 struct inode *inode;
5078 struct buffer_head *di_bh = NULL;
5079 struct ocfs2_cached_block_free *tmp;
5080
5081 inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5082 if (!inode) {
5083 ret = -EINVAL;
5084 mlog_errno(ret);
5085 goto out;
5086 }
5087
5088 mutex_lock(&inode->i_mutex);
5089
5090 ret = ocfs2_meta_lock(inode, &di_bh, 1);
5091 if (ret) {
5092 mlog_errno(ret);
5093 goto out_mutex;
5094 }
5095
5096 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5097 if (IS_ERR(handle)) {
5098 ret = PTR_ERR(handle);
5099 mlog_errno(ret);
5100 goto out_unlock;
5101 }
5102
5103 while (head) {
5104 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5105 head->free_bit);
5106 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5107 head->free_bit, (unsigned long long)head->free_blk);
5108
5109 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5110 head->free_bit, bg_blkno, 1);
5111 if (ret) {
5112 mlog_errno(ret);
5113 goto out_journal;
5114 }
5115
5116 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5117 if (ret) {
5118 mlog_errno(ret);
5119 goto out_journal;
5120 }
5121
5122 tmp = head;
5123 head = head->free_next;
5124 kfree(tmp);
5125 }
5126
5127out_journal:
5128 ocfs2_commit_trans(osb, handle);
5129
5130out_unlock:
5131 ocfs2_meta_unlock(inode, 1);
5132 brelse(di_bh);
5133out_mutex:
5134 mutex_unlock(&inode->i_mutex);
5135 iput(inode);
5136out:
5137 while (head) {
5138 /* Premature exit may have left some dangling items. */
5139 tmp = head;
5140 head = head->free_next;
5141 kfree(tmp);
5142 }
5143
5144 return ret;
5145}
5146
5147int ocfs2_run_deallocs(struct ocfs2_super *osb,
5148 struct ocfs2_cached_dealloc_ctxt *ctxt)
5149{
5150 int ret = 0, ret2;
5151 struct ocfs2_per_slot_free_list *fl;
5152
5153 if (!ctxt)
5154 return 0;
5155
5156 while (ctxt->c_first_suballocator) {
5157 fl = ctxt->c_first_suballocator;
5158
5159 if (fl->f_first) {
5160 mlog(0, "Free items: (type %u, slot %d)\n",
5161 fl->f_inode_type, fl->f_slot);
5162 ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5163 fl->f_slot, fl->f_first);
5164 if (ret2)
5165 mlog_errno(ret2);
5166 if (!ret)
5167 ret = ret2;
5168 }
5169
5170 ctxt->c_first_suballocator = fl->f_next_suballocator;
5171 kfree(fl);
5172 }
5173
5174 return ret;
5175}
5176
5177static struct ocfs2_per_slot_free_list *
5178ocfs2_find_per_slot_free_list(int type,
5179 int slot,
5180 struct ocfs2_cached_dealloc_ctxt *ctxt)
5181{
5182 struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5183
5184 while (fl) {
5185 if (fl->f_inode_type == type && fl->f_slot == slot)
5186 return fl;
5187
5188 fl = fl->f_next_suballocator;
5189 }
5190
5191 fl = kmalloc(sizeof(*fl), GFP_NOFS);
5192 if (fl) {
5193 fl->f_inode_type = type;
5194 fl->f_slot = slot;
5195 fl->f_first = NULL;
5196 fl->f_next_suballocator = ctxt->c_first_suballocator;
5197
5198 ctxt->c_first_suballocator = fl;
5199 }
5200 return fl;
5201}
5202
5203static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5204 int type, int slot, u64 blkno,
5205 unsigned int bit)
5206{
5207 int ret;
5208 struct ocfs2_per_slot_free_list *fl;
5209 struct ocfs2_cached_block_free *item;
5210
5211 fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5212 if (fl == NULL) {
5213 ret = -ENOMEM;
5214 mlog_errno(ret);
5215 goto out;
5216 }
5217
5218 item = kmalloc(sizeof(*item), GFP_NOFS);
5219 if (item == NULL) {
5220 ret = -ENOMEM;
5221 mlog_errno(ret);
5222 goto out;
5223 }
5224
5225 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5226 type, slot, bit, (unsigned long long)blkno);
5227
5228 item->free_blk = blkno;
5229 item->free_bit = bit;
5230 item->free_next = fl->f_first;
5231
5232 fl->f_first = item;
5233
5234 ret = 0;
5235out:
5236 return ret;
5237}
5238
5239static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5240 struct ocfs2_extent_block *eb)
5241{
5242 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5243 le16_to_cpu(eb->h_suballoc_slot),
5244 le64_to_cpu(eb->h_blkno),
5245 le16_to_cpu(eb->h_suballoc_bit));
5246}
5247
2960 5248/* This function will figure out whether the currently last extent
2961 5249 * block will be deleted, and if it will, what the new last extent
2962 5250 * block will be so we can update his h_next_leaf_blk field, as well
@@ -3238,27 +5526,10 @@ delete:
3238 5526 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
3239 5527 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
3240 5528
3241 if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
3242 /*
3243 * This code only understands how to
3244 * lock the suballocator in slot 0,
3245 * which is fine because allocation is
3246 * only ever done out of that
3247 * suballocator too. A future version
3248 * might change that however, so avoid
3249 * a free if we don't know how to
3250 * handle it. This way an fs incompat
3251 * bit will not be necessary.
3252 */
3253 ret = ocfs2_free_extent_block(handle,
3254 tc->tc_ext_alloc_inode,
3255 tc->tc_ext_alloc_bh,
3256 eb);
3257
3258 /* An error here is not fatal. */
3259 if (ret < 0)
3260 mlog_errno(ret);
3261 }
5529 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
5530 /* An error here is not fatal. */
5531 if (ret < 0)
5532 mlog_errno(ret);
3262 5533 } else {
3263 5534 deleted_eb = 0;
3264 5535 }
@@ -3397,9 +5668,9 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
3397 5668 return ocfs2_journal_dirty_data(handle, bh);
3398 5669}
3399 5670
3400static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3401 struct page **pages, int numpages,
3402 u64 phys, handle_t *handle)
5671static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
5672 loff_t end, struct page **pages,
5673 int numpages, u64 phys, handle_t *handle)
3403 5674{
3404 5675 int i, ret, partial = 0;
3405 5676 void *kaddr;
@@ -3412,26 +5683,14 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3412 5683 if (numpages == 0)
3413 5684 goto out;
3414 5685
3415 from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
5686 to = PAGE_CACHE_SIZE;
3416 if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
3417 /*
3418 * Since 'from' has been capped to a value below page
3419 * size, this calculation won't be able to overflow
3420 * 'to'
3421 */
3422 to = ocfs2_align_bytes_to_clusters(sb, from);
3423
3424 /*
3425 * The truncate tail in this case should never contain
3426 * more than one page at maximum. The loop below also
3427 * assumes this.
3428 */
3429 BUG_ON(numpages != 1);
3430 }
3431
3432 for(i = 0; i < numpages; i++) { 5687 for(i = 0; i < numpages; i++) {
3433 page = pages[i]; 5688 page = pages[i];
3434 5689
5690 from = start & (PAGE_CACHE_SIZE - 1);
5691 if ((end >> PAGE_CACHE_SHIFT) == page->index)
5692 to = end & (PAGE_CACHE_SIZE - 1);
5693
3435 5694 BUG_ON(from > PAGE_CACHE_SIZE);
3436 5695 BUG_ON(to > PAGE_CACHE_SIZE);
3437 5696
@@ -3468,10 +5727,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3468 5727
3469 5728 flush_dcache_page(page);
3470 5729
3471 /*
5730 start = (page->index + 1) << PAGE_CACHE_SHIFT;
3472 * Every page after the 1st one should be completely zero'd.
3473 */
3474 from = 0;
3475 5731 }
3476 5732out:
3477 5733 if (pages) {
@@ -3484,24 +5740,26 @@ out:
3484 5740 }
3485 5741}
3486 5742
3487static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
3488 int *num, u64 *phys)
5743static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
5744 struct page **pages, int *num, u64 *phys)
3489 5745{
3490 5746 int i, numpages = 0, ret = 0;
3491 unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
3492 5747 unsigned int ext_flags;
3493 5748 struct super_block *sb = inode->i_sb;
3494 5749 struct address_space *mapping = inode->i_mapping;
3495 5750 unsigned long index;
3496 u64 next_cluster_bytes;
5751 loff_t last_page_bytes;
3497 5752
3498 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 5753 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
5754 BUG_ON(start > end);
3499 5755
3500 /* Cluster boundary, so we don't need to grab any pages. */ 5756 if (start == end)
3501 if ((isize & (csize - 1)) == 0)
3502 goto out; 5757 goto out;
3503 5758
3504 ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, 5759 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
5760 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
5761
5762 ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits,
3505 phys, NULL, &ext_flags); 5763 phys, NULL, &ext_flags);
3506 if (ret) { 5764 if (ret) {
3507 mlog_errno(ret); 5765 mlog_errno(ret);
@@ -3517,8 +5775,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3517 if (ext_flags & OCFS2_EXT_UNWRITTEN) 5775 if (ext_flags & OCFS2_EXT_UNWRITTEN)
3518 goto out; 5776 goto out;
3519 5777
3520 next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); 5778 last_page_bytes = PAGE_ALIGN(end);
3521 index = isize >> PAGE_CACHE_SHIFT; 5779 index = start >> PAGE_CACHE_SHIFT;
3522 do { 5780 do {
3523 pages[numpages] = grab_cache_page(mapping, index); 5781 pages[numpages] = grab_cache_page(mapping, index);
3524 if (!pages[numpages]) { 5782 if (!pages[numpages]) {
@@ -3529,7 +5787,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3529 5787
3530 numpages++; 5788 numpages++;
3531 index++; 5789 index++;
3532 } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); 5790 } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
3533 5791
3534out: 5792out:
3535 if (ret != 0) { 5793 if (ret != 0) {
@@ -3558,11 +5816,10 @@ out:
3558 * otherwise block_write_full_page() will skip writeout of pages past 5816 * otherwise block_write_full_page() will skip writeout of pages past
3559 * i_size. The new_i_size parameter is passed for this reason. 5817 * i_size. The new_i_size parameter is passed for this reason.
3560 */ 5818 */
3561int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 5819int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
3562 u64 new_i_size) 5820 u64 range_start, u64 range_end)
3563{ 5821{
3564 int ret, numpages; 5822 int ret, numpages;
3565 loff_t endbyte;
3566 struct page **pages = NULL; 5823 struct page **pages = NULL;
3567 u64 phys; 5824 u64 phys;
3568 5825
@@ -3581,7 +5838,8 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3581 goto out; 5838 goto out;
3582 } 5839 }
3583 5840
3584 ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); 5841 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
5842 &numpages, &phys);
3585 if (ret) { 5843 if (ret) {
3586 mlog_errno(ret); 5844 mlog_errno(ret);
3587 goto out; 5845 goto out;
@@ -3590,17 +5848,16 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3590 if (numpages == 0) 5848 if (numpages == 0)
3591 goto out; 5849 goto out;
3592 5850
3593 ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, 5851 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
3594 handle); 5852 numpages, phys, handle);
3595 5853
3596 /* 5854 /*
3597 * Initiate writeout of the pages we zero'd here. We don't 5855 * Initiate writeout of the pages we zero'd here. We don't
3598 * wait on them - the truncate_inode_pages() call later will 5856 * wait on them - the truncate_inode_pages() call later will
3599 * do that for us. 5857 * do that for us.
3600 */ 5858 */
3601 endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 5859 ret = do_sync_mapping_range(inode->i_mapping, range_start,
3602 ret = do_sync_mapping_range(inode->i_mapping, new_i_size, 5860 range_end - 1, SYNC_FILE_RANGE_WRITE);
3603 endbyte - 1, SYNC_FILE_RANGE_WRITE);
3604 if (ret) 5861 if (ret)
3605 mlog_errno(ret); 5862 mlog_errno(ret);
3606 5863
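ocfs2_zero_range_for_truncate() now takes an explicit byte range instead of deriving one from new_i_size. A hedged sketch of a caller (the real caller lives in the truncate path outside this excerpt) zeroing from the new EOF to its cluster boundary:

	u64 start = new_i_size;
	u64 end = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);

	if (end > start) {
		ret = ocfs2_zero_range_for_truncate(inode, handle,
						    start, end);
		if (ret)
			mlog_errno(ret);
	}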
@@ -3631,8 +5888,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
3631 5888
3632 mlog_entry_void(); 5889 mlog_entry_void();
3633 5890
3634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
3635
3636 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 5891 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
3637 i_size_read(inode)); 5892 i_size_read(inode));
3638 5893
@@ -3754,7 +6009,6 @@ start:
3754 goto start; 6009 goto start;
3755 6010
3756bail: 6011bail:
3757 up_write(&OCFS2_I(inode)->ip_alloc_sem);
3758 6012
3759 ocfs2_schedule_truncate_log_flush(osb, 1); 6013 ocfs2_schedule_truncate_log_flush(osb, 1);
3760 6014
@@ -3764,6 +6018,8 @@ bail:
3764 if (handle) 6018 if (handle)
3765 ocfs2_commit_trans(osb, handle); 6019 ocfs2_commit_trans(osb, handle);
3766 6020
6021 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6022
3767 ocfs2_free_path(path); 6023 ocfs2_free_path(path);
3768 6024
3769 /* This will drop the ext_alloc cluster lock for us */ 6025 /* This will drop the ext_alloc cluster lock for us */
@@ -3774,23 +6030,18 @@ bail:
3774} 6030}
3775 6031
3776/* 6032/*
3777 * Expects the inode to already be locked. This will figure out which 6033 * Expects the inode to already be locked.
3778 * inodes need to be locked and will put them on the returned truncate
3779 * context.
3780 */ 6034 */
3781int ocfs2_prepare_truncate(struct ocfs2_super *osb, 6035int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3782 struct inode *inode, 6036 struct inode *inode,
3783 struct buffer_head *fe_bh, 6037 struct buffer_head *fe_bh,
3784 struct ocfs2_truncate_context **tc) 6038 struct ocfs2_truncate_context **tc)
3785{ 6039{
3786 int status, metadata_delete, i; 6040 int status;
3787 unsigned int new_i_clusters; 6041 unsigned int new_i_clusters;
3788 struct ocfs2_dinode *fe; 6042 struct ocfs2_dinode *fe;
3789 struct ocfs2_extent_block *eb; 6043 struct ocfs2_extent_block *eb;
3790 struct ocfs2_extent_list *el;
3791 struct buffer_head *last_eb_bh = NULL; 6044 struct buffer_head *last_eb_bh = NULL;
3792 struct inode *ext_alloc_inode = NULL;
3793 struct buffer_head *ext_alloc_bh = NULL;
3794 6045
3795 mlog_entry_void(); 6046 mlog_entry_void();
3796 6047
@@ -3810,12 +6061,9 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3810 mlog_errno(status); 6061 mlog_errno(status);
3811 goto bail; 6062 goto bail;
3812 } 6063 }
6064 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
3813 6065
3814 metadata_delete = 0;
3815 if (fe->id2.i_list.l_tree_depth) { 6066 if (fe->id2.i_list.l_tree_depth) {
3816 /* If we have a tree, then the truncate may result in
3817 * metadata deletes. Figure this out from the
3818 * rightmost leaf block.*/
3819 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 6067 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
3820 &last_eb_bh, OCFS2_BH_CACHED, inode); 6068 &last_eb_bh, OCFS2_BH_CACHED, inode);
3821 if (status < 0) { 6069 if (status < 0) {
@@ -3830,43 +6078,10 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3830 status = -EIO; 6078 status = -EIO;
3831 goto bail; 6079 goto bail;
3832 } 6080 }
3833 el = &(eb->h_list);
3834
3835 i = 0;
3836 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3837 i = 1;
3838 /*
3839 * XXX: Should we check that next_free_rec contains
3840 * the extent?
3841 */
3842 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
3843 metadata_delete = 1;
3844 } 6081 }
3845 6082
3846 (*tc)->tc_last_eb_bh = last_eb_bh; 6083 (*tc)->tc_last_eb_bh = last_eb_bh;
3847 6084
3848 if (metadata_delete) {
3849 mlog(0, "Will have to delete metadata for this trunc. "
3850 "locking allocator.\n");
3851 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
3852 if (!ext_alloc_inode) {
3853 status = -ENOMEM;
3854 mlog_errno(status);
3855 goto bail;
3856 }
3857
3858 mutex_lock(&ext_alloc_inode->i_mutex);
3859 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
3860
3861 status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
3862 if (status < 0) {
3863 mlog_errno(status);
3864 goto bail;
3865 }
3866 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
3867 (*tc)->tc_ext_alloc_locked = 1;
3868 }
3869
3870 status = 0; 6085 status = 0;
3871bail: 6086bail:
3872 if (status < 0) { 6087 if (status < 0) {
@@ -3880,16 +6095,13 @@ bail:
3880 6095
3881static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 6096static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
3882{ 6097{
3883 if (tc->tc_ext_alloc_inode) { 6098 /*
3884 if (tc->tc_ext_alloc_locked) 6099 * The caller is responsible for completing deallocation
3885 ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); 6100 * before freeing the context.
3886 6101 */
3887 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex); 6102 if (tc->tc_dealloc.c_first_suballocator != NULL)
3888 iput(tc->tc_ext_alloc_inode); 6103 mlog(ML_NOTICE,
3889 } 6104 "Truncate completion has non-empty dealloc context\n");
3890
3891 if (tc->tc_ext_alloc_bh)
3892 brelse(tc->tc_ext_alloc_bh);
3893 6105
3894 if (tc->tc_last_eb_bh) 6106 if (tc->tc_last_eb_bh)
3895 brelse(tc->tc_last_eb_bh); 6107 brelse(tc->tc_last_eb_bh);
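Taken together, the alloc.c hunks move all suballocator locking out of truncate setup and into ocfs2_run_deallocs(). A sketch of the resulting lifecycle (error handling elided; assumes the inode is already locked, as the comment above ocfs2_prepare_truncate() requires):

	struct ocfs2_truncate_context *tc = NULL;

	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
	if (status < 0)
		goto bail;

	/* ... zero the new tail range, update i_size ... */

	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
	/* ocfs2_commit_truncate() runs ocfs2_run_deallocs() on
	 * tc->tc_dealloc and frees the context itself. */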
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fbcb5934a081..990df48ae8d3 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -34,7 +34,17 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
34 u32 cpos, 34 u32 cpos,
35 u64 start_blk, 35 u64 start_blk,
36 u32 new_clusters, 36 u32 new_clusters,
37 u8 flags,
37 struct ocfs2_alloc_context *meta_ac); 38 struct ocfs2_alloc_context *meta_ac);
39struct ocfs2_cached_dealloc_ctxt;
40int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
41 handle_t *handle, u32 cpos, u32 len, u32 phys,
42 struct ocfs2_alloc_context *meta_ac,
43 struct ocfs2_cached_dealloc_ctxt *dealloc);
44int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
45 u32 cpos, u32 len, handle_t *handle,
46 struct ocfs2_alloc_context *meta_ac,
47 struct ocfs2_cached_dealloc_ctxt *dealloc);
38int ocfs2_num_free_extents(struct ocfs2_super *osb, 48int ocfs2_num_free_extents(struct ocfs2_super *osb,
39 struct inode *inode, 49 struct inode *inode,
40 struct ocfs2_dinode *fe); 50 struct ocfs2_dinode *fe);
@@ -62,17 +72,41 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
62 struct ocfs2_dinode **tl_copy); 72 struct ocfs2_dinode **tl_copy);
63int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, 73int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
64 struct ocfs2_dinode *tl_copy); 74 struct ocfs2_dinode *tl_copy);
75int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb);
76int ocfs2_truncate_log_append(struct ocfs2_super *osb,
77 handle_t *handle,
78 u64 start_blk,
79 unsigned int num_clusters);
80int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
81
82/*
 83 * Process-local structure which describes the block unlinks done
84 * during an operation. This is populated via
85 * ocfs2_cache_block_dealloc().
86 *
87 * ocfs2_run_deallocs() should be called after the potentially
88 * de-allocating routines. No journal handles should be open, and most
89 * locks should have been dropped.
90 */
91struct ocfs2_cached_dealloc_ctxt {
92 struct ocfs2_per_slot_free_list *c_first_suballocator;
93};
94static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
95{
96 c->c_first_suballocator = NULL;
97}
98int ocfs2_run_deallocs(struct ocfs2_super *osb,
99 struct ocfs2_cached_dealloc_ctxt *ctxt);
65 100
66struct ocfs2_truncate_context { 101struct ocfs2_truncate_context {
67 struct inode *tc_ext_alloc_inode; 102 struct ocfs2_cached_dealloc_ctxt tc_dealloc;
68 struct buffer_head *tc_ext_alloc_bh;
69 int tc_ext_alloc_locked; /* is it cluster locked? */ 103 int tc_ext_alloc_locked; /* is it cluster locked? */
70 /* these get destroyed once it's passed to ocfs2_commit_truncate. */ 104 /* these get destroyed once it's passed to ocfs2_commit_truncate. */
71 struct buffer_head *tc_last_eb_bh; 105 struct buffer_head *tc_last_eb_bh;
72}; 106};
73 107
74int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 108int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
75 u64 new_i_size); 109 u64 range_start, u64 range_end);
76int ocfs2_prepare_truncate(struct ocfs2_super *osb, 110int ocfs2_prepare_truncate(struct ocfs2_super *osb,
77 struct inode *inode, 111 struct inode *inode,
78 struct buffer_head *fe_bh, 112 struct buffer_head *fe_bh,
@@ -84,6 +118,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
84 118
85int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 119int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
86 u32 cpos, struct buffer_head **leaf_bh); 120 u32 cpos, struct buffer_head **leaf_bh);
121int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
87 122
88/* 123/*
89 * Helper function to look at the # of clusters in an extent record. 124 * Helper function to look at the # of clusters in an extent record.
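The two declarations added above are exercised later in this patch; for reference, the call into ocfs2_mark_extent_written() as it appears in ocfs2_write_cluster() in aops.c:

	ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
					wc->w_handle, cpos, 1, phys,
					meta_ac, &wc->w_dealloc);
	if (ret < 0)
		mlog_errno(ret);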
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a480b09c79b9..84bf6e79de23 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
684 bh = bh->b_this_page, block_start += bsize) { 684 bh = bh->b_this_page, block_start += bsize) {
685 block_end = block_start + bsize; 685 block_end = block_start + bsize;
686 686
687 clear_buffer_new(bh);
688
687 /* 689 /*
688 * Ignore blocks outside of our i/o range - 690 * Ignore blocks outside of our i/o range -
689 * they may belong to unallocated clusters. 691 * they may belong to unallocated clusters.
@@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
698 * For an allocating write with cluster size >= page 700 * For an allocating write with cluster size >= page
699 * size, we always write the entire page. 701 * size, we always write the entire page.
700 */ 702 */
701 703 if (new)
702 if (buffer_new(bh)) 704 set_buffer_new(bh);
703 clear_buffer_new(bh);
704 705
705 if (!buffer_mapped(bh)) { 706 if (!buffer_mapped(bh)) {
706 map_bh(bh, inode->i_sb, *p_blkno); 707 map_bh(bh, inode->i_sb, *p_blkno);
@@ -711,7 +712,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
711 if (!buffer_uptodate(bh)) 712 if (!buffer_uptodate(bh))
712 set_buffer_uptodate(bh); 713 set_buffer_uptodate(bh);
713 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && 714 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
714 (block_start < from || block_end > to)) { 715 !buffer_new(bh) &&
716 (block_start < from || block_end > to)) {
715 ll_rw_block(READ, 1, &bh); 717 ll_rw_block(READ, 1, &bh);
716 *wait_bh++=bh; 718 *wait_bh++=bh;
717 } 719 }
@@ -738,18 +740,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
738 bh = head; 740 bh = head;
739 block_start = 0; 741 block_start = 0;
740 do { 742 do {
741 void *kaddr;
742
743 block_end = block_start + bsize; 743 block_end = block_start + bsize;
744 if (block_end <= from) 744 if (block_end <= from)
745 goto next_bh; 745 goto next_bh;
746 if (block_start >= to) 746 if (block_start >= to)
747 break; 747 break;
748 748
749 kaddr = kmap_atomic(page, KM_USER0); 749 zero_user_page(page, block_start, bh->b_size, KM_USER0);
750 memset(kaddr+block_start, 0, bh->b_size);
751 flush_dcache_page(page);
752 kunmap_atomic(kaddr, KM_USER0);
753 set_buffer_uptodate(bh); 750 set_buffer_uptodate(bh);
754 mark_buffer_dirty(bh); 751 mark_buffer_dirty(bh);
755 752
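The hunk above is a pure refactor: the open-coded map/zero/flush sequence collapses into the zero_user_page() helper, which performs the equivalent steps:

	/* before */
	kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr + block_start, 0, bh->b_size);
	flush_dcache_page(page);
	kunmap_atomic(kaddr, KM_USER0);

	/* after */
	zero_user_page(page, block_start, bh->b_size, KM_USER0);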
@@ -761,217 +758,240 @@ next_bh:
761 return ret; 758 return ret;
762} 759}
763 760
761#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
762#define OCFS2_MAX_CTXT_PAGES 1
763#else
764#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
765#endif
766
767#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
768
764/* 769/*
765 * This will copy user data from the buffer page in the splice 770 * Describe the state of a single cluster to be written to.
766 * context.
767 *
768 * For now, we ignore SPLICE_F_MOVE as that would require some extra
769 * communication out all the way to ocfs2_write().
770 */ 771 */
771int ocfs2_map_and_write_splice_data(struct inode *inode, 772struct ocfs2_write_cluster_desc {
772 struct ocfs2_write_ctxt *wc, u64 *p_blkno, 773 u32 c_cpos;
773 unsigned int *ret_from, unsigned int *ret_to) 774 u32 c_phys;
775 /*
776 * Give this a unique field because c_phys eventually gets
777 * filled.
778 */
779 unsigned c_new;
780 unsigned c_unwritten;
781};
782
783static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
774{ 784{
775 int ret; 785 return d->c_new || d->c_unwritten;
776 unsigned int to, from, cluster_start, cluster_end; 786}
777 char *src, *dst;
778 struct ocfs2_splice_write_priv *sp = wc->w_private;
779 struct pipe_buffer *buf = sp->s_buf;
780 unsigned long bytes, src_from;
781 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
782 787
783 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 788struct ocfs2_write_ctxt {
784 &cluster_end); 789 /* Logical cluster position / len of write */
790 u32 w_cpos;
791 u32 w_clen;
785 792
786 from = sp->s_offset; 793 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
787 src_from = sp->s_buf_offset;
788 bytes = wc->w_count;
789 794
790 if (wc->w_large_pages) { 795 /*
791 /* 796 * This is true if page_size > cluster_size.
792 * For cluster size < page size, we have to 797 *
793 * calculate pos within the cluster and obey 798 * It triggers a set of special cases during write which might
794 * the rightmost boundary. 799 * have to deal with allocating writes to partial pages.
795 */ 800 */
796 bytes = min(bytes, (unsigned long)(osb->s_clustersize 801 unsigned int w_large_pages;
797 - (wc->w_pos & (osb->s_clustersize - 1)))); 802
798 } 803 /*
799 to = from + bytes; 804 * Pages involved in this write.
805 *
806 * w_target_page is the page being written to by the user.
807 *
808 * w_pages is an array of pages which always contains
809 * w_target_page, and in the case of an allocating write with
810 * page_size < cluster size, it will contain zero'd and mapped
811 * pages adjacent to w_target_page which need to be written
 812 * out so that future reads from that region will get
 813 * zeros.
814 */
815 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
816 unsigned int w_num_pages;
817 struct page *w_target_page;
800 818
801 BUG_ON(from > PAGE_CACHE_SIZE); 819 /*
802 BUG_ON(to > PAGE_CACHE_SIZE); 820 * ocfs2_write_end() uses this to know what the real range to
803 BUG_ON(from < cluster_start); 821 * write in the target should be.
804 BUG_ON(to > cluster_end); 822 */
823 unsigned int w_target_from;
824 unsigned int w_target_to;
805 825
806 if (wc->w_this_page_new) 826 /*
807 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 827 * We could use journal_current_handle() but this is cleaner,
808 cluster_start, cluster_end, 1); 828 * IMHO -Mark
809 else 829 */
810 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 830 handle_t *w_handle;
811 from, to, 0); 831
812 if (ret) { 832 struct buffer_head *w_di_bh;
813 mlog_errno(ret); 833
814 goto out; 834 struct ocfs2_cached_dealloc_ctxt w_dealloc;
835};
836
837static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
838{
839 int i;
840
841 for(i = 0; i < wc->w_num_pages; i++) {
842 if (wc->w_pages[i] == NULL)
843 continue;
844
845 unlock_page(wc->w_pages[i]);
846 mark_page_accessed(wc->w_pages[i]);
847 page_cache_release(wc->w_pages[i]);
815 } 848 }
816 849
817 src = buf->ops->map(sp->s_pipe, buf, 1); 850 brelse(wc->w_di_bh);
818 dst = kmap_atomic(wc->w_this_page, KM_USER1); 851 kfree(wc);
819 memcpy(dst + from, src + src_from, bytes); 852}
820 kunmap_atomic(wc->w_this_page, KM_USER1); 853
821 buf->ops->unmap(sp->s_pipe, buf, src); 854static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
855 struct ocfs2_super *osb, loff_t pos,
856 unsigned len, struct buffer_head *di_bh)
857{
858 struct ocfs2_write_ctxt *wc;
859
860 wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
861 if (!wc)
862 return -ENOMEM;
822 863
823 wc->w_finished_copy = 1; 864 wc->w_cpos = pos >> osb->s_clustersize_bits;
865 wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
866 get_bh(di_bh);
867 wc->w_di_bh = di_bh;
824 868
825 *ret_from = from; 869 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
826 *ret_to = to; 870 wc->w_large_pages = 1;
827out: 871 else
872 wc->w_large_pages = 0;
873
874 ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
875
876 *wcp = wc;
828 877
829 return bytes ? (unsigned int)bytes : ret; 878 return 0;
830} 879}
831 880
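A sketch of the allocate/release pairing for the new write context (both helpers are defined above; note that ocfs2_free_write_ctxt() also unlocks and releases every page in w_pages and brelse's w_di_bh):

	struct ocfs2_write_ctxt *wc;

	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
	if (ret)
		return ret;

	/* ... fill w_desc, grab pages, copy the user data ... */

	ocfs2_free_write_ctxt(wc);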
832/* 881/*
833 * This will copy user data from the iovec in the buffered write 882 * If a page has any new buffers, zero them out here, and mark them uptodate
834 * context. 883 * and dirty so they'll be written out (in order to prevent uninitialised
884 * block data from leaking). And clear the new bit.
835 */ 885 */
836int ocfs2_map_and_write_user_data(struct inode *inode, 886static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
837 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
838 unsigned int *ret_from, unsigned int *ret_to)
839{ 887{
840 int ret; 888 unsigned int block_start, block_end;
841 unsigned int to, from, cluster_start, cluster_end; 889 struct buffer_head *head, *bh;
842 unsigned long bytes, src_from;
843 char *dst;
844 struct ocfs2_buffered_write_priv *bp = wc->w_private;
845 const struct iovec *cur_iov = bp->b_cur_iov;
846 char __user *buf;
847 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
848 890
849 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 891 BUG_ON(!PageLocked(page));
850 &cluster_end); 892 if (!page_has_buffers(page))
893 return;
851 894
852 buf = cur_iov->iov_base + bp->b_cur_off; 895 bh = head = page_buffers(page);
853 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; 896 block_start = 0;
897 do {
898 block_end = block_start + bh->b_size;
854 899
855 from = wc->w_pos & (PAGE_CACHE_SIZE - 1); 900 if (buffer_new(bh)) {
901 if (block_end > from && block_start < to) {
902 if (!PageUptodate(page)) {
903 unsigned start, end;
856 904
857 /* 905 start = max(from, block_start);
858 * This is a lot of comparisons, but it reads quite 906 end = min(to, block_end);
859 * easily, which is important here.
860 */
861 /* Stay within the src page */
862 bytes = PAGE_SIZE - src_from;
863 /* Stay within the vector */
864 bytes = min(bytes,
865 (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
866 /* Stay within count */
867 bytes = min(bytes, (unsigned long)wc->w_count);
868 /*
869 * For clustersize > page size, just stay within
870 * target page, otherwise we have to calculate pos
871 * within the cluster and obey the rightmost
872 * boundary.
873 */
874 if (wc->w_large_pages) {
875 /*
876 * For cluster size < page size, we have to
877 * calculate pos within the cluster and obey
878 * the rightmost boundary.
879 */
880 bytes = min(bytes, (unsigned long)(osb->s_clustersize
881 - (wc->w_pos & (osb->s_clustersize - 1))));
882 } else {
883 /*
884 * cluster size > page size is the most common
885 * case - we just stay within the target page
886 * boundary.
887 */
888 bytes = min(bytes, PAGE_CACHE_SIZE - from);
889 }
890 907
891 to = from + bytes; 908 zero_user_page(page, start, end - start, KM_USER0);
909 set_buffer_uptodate(bh);
910 }
892 911
893 BUG_ON(from > PAGE_CACHE_SIZE); 912 clear_buffer_new(bh);
894 BUG_ON(to > PAGE_CACHE_SIZE); 913 mark_buffer_dirty(bh);
895 BUG_ON(from < cluster_start); 914 }
896 BUG_ON(to > cluster_end); 915 }
897 916
898 if (wc->w_this_page_new) 917 block_start = block_end;
899 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 918 bh = bh->b_this_page;
900 cluster_start, cluster_end, 1); 919 } while (bh != head);
901 else 920}
902 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
903 from, to, 0);
904 if (ret) {
905 mlog_errno(ret);
906 goto out;
907 }
908 921
909 dst = kmap(wc->w_this_page); 922/*
 910 memcpy(dst + from, bp->b_src_buf + src_from, bytes); 923 * Only called when we have a failure during an allocating write, to
 911 kunmap(wc->w_this_page); 924 * write zeros to the newly allocated region.
925 */
926static void ocfs2_write_failure(struct inode *inode,
927 struct ocfs2_write_ctxt *wc,
928 loff_t user_pos, unsigned user_len)
929{
930 int i;
931 unsigned from, to;
932 struct page *tmppage;
912 933
913 /* 934 ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
914 * XXX: This is slow, but simple. The caller of
915 * ocfs2_buffered_write_cluster() is responsible for
916 * passing through the iovecs, so it's difficult to
917 * predict what our next step is in here after our
918 * initial write. A future version should be pushing
919 * that iovec manipulation further down.
920 *
921 * By setting this, we indicate that a copy from user
922 * data was done, and subsequent calls for this
923 * cluster will skip copying more data.
924 */
925 wc->w_finished_copy = 1;
926 935
927 *ret_from = from; 936 if (wc->w_large_pages) {
928 *ret_to = to; 937 from = wc->w_target_from;
929out: 938 to = wc->w_target_to;
939 } else {
940 from = 0;
941 to = PAGE_CACHE_SIZE;
942 }
943
944 for(i = 0; i < wc->w_num_pages; i++) {
945 tmppage = wc->w_pages[i];
930 946
931 return bytes ? (unsigned int)bytes : ret; 947 if (ocfs2_should_order_data(inode))
948 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
949 from, to, NULL,
950 ocfs2_journal_dirty_data);
951
952 block_commit_write(tmppage, from, to);
953 }
932} 954}
933 955
934/* 956static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
935 * Map, fill and write a page to disk. 957 struct ocfs2_write_ctxt *wc,
936 * 958 struct page *page, u32 cpos,
937 * The work of copying data is done via callback. Newly allocated 959 loff_t user_pos, unsigned user_len,
938 * pages which don't take user data will be zero'd (set 'new' to 960 int new)
939 * indicate an allocating write)
940 *
941 * Returns a negative error code or the number of bytes copied into
942 * the page.
943 */
944static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
945 u64 *p_blkno, struct page *page,
946 struct ocfs2_write_ctxt *wc, int new)
947{ 961{
948 int ret, copied = 0; 962 int ret;
949 unsigned int from = 0, to = 0; 963 unsigned int map_from = 0, map_to = 0;
950 unsigned int cluster_start, cluster_end; 964 unsigned int cluster_start, cluster_end;
951 unsigned int zero_from = 0, zero_to = 0; 965 unsigned int user_data_from = 0, user_data_to = 0;
952 966
953 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, 967 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
954 &cluster_start, &cluster_end); 968 &cluster_start, &cluster_end);
955 969
956 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index 970 if (page == wc->w_target_page) {
957 && !wc->w_finished_copy) { 971 map_from = user_pos & (PAGE_CACHE_SIZE - 1);
958 972 map_to = map_from + user_len;
959 wc->w_this_page = page; 973
960 wc->w_this_page_new = new; 974 if (new)
961 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); 975 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
962 if (ret < 0) { 976 cluster_start, cluster_end,
977 new);
978 else
979 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
980 map_from, map_to, new);
981 if (ret) {
963 mlog_errno(ret); 982 mlog_errno(ret);
964 goto out; 983 goto out;
965 } 984 }
966 985
967 copied = ret; 986 user_data_from = map_from;
968 987 user_data_to = map_to;
969 zero_from = from;
970 zero_to = to;
971 if (new) { 988 if (new) {
972 from = cluster_start; 989 map_from = cluster_start;
973 to = cluster_end; 990 map_to = cluster_end;
974 } 991 }
992
993 wc->w_target_from = map_from;
994 wc->w_target_to = map_to;
975 } else { 995 } else {
976 /* 996 /*
977 * If we haven't allocated the new page yet, we 997 * If we haven't allocated the new page yet, we
@@ -980,11 +1000,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
980 */ 1000 */
981 BUG_ON(!new); 1001 BUG_ON(!new);
982 1002
983 from = cluster_start; 1003 map_from = cluster_start;
984 to = cluster_end; 1004 map_to = cluster_end;
985 1005
986 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 1006 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
987 cluster_start, cluster_end, 1); 1007 cluster_start, cluster_end, new);
988 if (ret) { 1008 if (ret) {
989 mlog_errno(ret); 1009 mlog_errno(ret);
990 goto out; 1010 goto out;
@@ -1003,108 +1023,113 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
1003 */ 1023 */
1004 if (new && !PageUptodate(page)) 1024 if (new && !PageUptodate(page))
1005 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), 1025 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1006 wc->w_cpos, zero_from, zero_to); 1026 cpos, user_data_from, user_data_to);
1007 1027
1008 flush_dcache_page(page); 1028 flush_dcache_page(page);
1009 1029
1010 if (ocfs2_should_order_data(inode)) {
1011 ret = walk_page_buffers(handle,
1012 page_buffers(page),
1013 from, to, NULL,
1014 ocfs2_journal_dirty_data);
1015 if (ret < 0)
1016 mlog_errno(ret);
1017 }
1018
1019 /*
1020 * We don't use generic_commit_write() because we need to
1021 * handle our own i_size update.
1022 */
1023 ret = block_commit_write(page, from, to);
1024 if (ret)
1025 mlog_errno(ret);
1026out: 1030out:
1027 1031 return ret;
1028 return copied ? copied : ret;
1029} 1032}
1030 1033
1031/* 1034/*
 1032 * Do the actual write of some data into an inode. Optionally allocate 1035 * This function will only grab one cluster's worth of pages.
1033 * in order to fulfill the write.
1034 *
1035 * cpos is the logical cluster offset within the file to write at
1036 *
1037 * 'phys' is the physical mapping of that offset. a 'phys' value of
1038 * zero indicates that allocation is required. In this case, data_ac
1039 * and meta_ac should be valid (meta_ac can be null if metadata
1040 * allocation isn't required).
1041 */ 1036 */
1042static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, 1037static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1043 struct buffer_head *di_bh, 1038 struct ocfs2_write_ctxt *wc,
1044 struct ocfs2_alloc_context *data_ac, 1039 u32 cpos, loff_t user_pos, int new,
1045 struct ocfs2_alloc_context *meta_ac, 1040 struct page *mmap_page)
1046 struct ocfs2_write_ctxt *wc)
1047{ 1041{
1048 int ret, i, numpages = 1, new; 1042 int ret = 0, i;
1049 unsigned int copied = 0; 1043 unsigned long start, target_index, index;
1050 u32 tmp_pos;
1051 u64 v_blkno, p_blkno;
1052 struct address_space *mapping = file->f_mapping;
1053 struct inode *inode = mapping->host; 1044 struct inode *inode = mapping->host;
1054 unsigned long index, start;
1055 struct page **cpages;
1056 1045
1057 new = phys == 0 ? 1 : 0; 1046 target_index = user_pos >> PAGE_CACHE_SHIFT;
1058 1047
1059 /* 1048 /*
1060 * Figure out how many pages we'll be manipulating here. For 1049 * Figure out how many pages we'll be manipulating here. For
 1061 * non-allocating write, we just change the one 1050 * non-allocating write, we just change the one
 1062 * page. Otherwise, we'll need a whole cluster's worth. 1051 * page. Otherwise, we'll need a whole cluster's worth.
1063 */ 1052 */
1064 if (new)
1065 numpages = ocfs2_pages_per_cluster(inode->i_sb);
1066
1067 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
1068 if (!cpages) {
1069 ret = -ENOMEM;
1070 mlog_errno(ret);
1071 return ret;
1072 }
1073
1074 /*
1075 * Fill our page array first. That way we've grabbed enough so
1076 * that we can zero and flush if we error after adding the
1077 * extent.
1078 */
1079 if (new) { 1053 if (new) {
1080 start = ocfs2_align_clusters_to_page_index(inode->i_sb, 1054 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
1081 wc->w_cpos); 1055 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1082 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
1083 } else { 1056 } else {
1084 start = wc->w_pos >> PAGE_CACHE_SHIFT; 1057 wc->w_num_pages = 1;
1085 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; 1058 start = target_index;
1086 } 1059 }
1087 1060
1088 for(i = 0; i < numpages; i++) { 1061 for(i = 0; i < wc->w_num_pages; i++) {
1089 index = start + i; 1062 index = start + i;
1090 1063
1091 cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); 1064 if (index == target_index && mmap_page) {
1092 if (!cpages[i]) { 1065 /*
1093 ret = -ENOMEM; 1066 * ocfs2_pagemkwrite() is a little different
1094 mlog_errno(ret); 1067 * and wants us to directly use the page
1095 goto out; 1068 * passed in.
1069 */
1070 lock_page(mmap_page);
1071
1072 if (mmap_page->mapping != mapping) {
1073 unlock_page(mmap_page);
1074 /*
1075 * Sanity check - the locking in
1076 * ocfs2_pagemkwrite() should ensure
1077 * that this code doesn't trigger.
1078 */
1079 ret = -EINVAL;
1080 mlog_errno(ret);
1081 goto out;
1082 }
1083
1084 page_cache_get(mmap_page);
1085 wc->w_pages[i] = mmap_page;
1086 } else {
1087 wc->w_pages[i] = find_or_create_page(mapping, index,
1088 GFP_NOFS);
1089 if (!wc->w_pages[i]) {
1090 ret = -ENOMEM;
1091 mlog_errno(ret);
1092 goto out;
1093 }
1096 } 1094 }
1095
1096 if (index == target_index)
1097 wc->w_target_page = wc->w_pages[i];
1097 } 1098 }
1099out:
1100 return ret;
1101}
1102
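A worked example of the page math above, assuming 16K clusters and 4K pages, for an allocating write at user_pos == 20000:

	/*
	 *   target_index = 20000 >> 12      = 4
	 *   cpos         = 20000 >> 14      = 1
	 *   start        = (1 << 14) >> 12  = 4
	 *   w_num_pages  = 16384 / 4096     = 4    (pages 4..7)
	 *
	 * The target page is the first page grabbed; the next three
	 * complete the cluster so it can be zeroed and written out.
	 */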
1103/*
 1104 * Prepare a single cluster for writing into the file.
1105 */
1106static int ocfs2_write_cluster(struct address_space *mapping,
1107 u32 phys, unsigned int unwritten,
1108 struct ocfs2_alloc_context *data_ac,
1109 struct ocfs2_alloc_context *meta_ac,
1110 struct ocfs2_write_ctxt *wc, u32 cpos,
1111 loff_t user_pos, unsigned user_len)
1112{
1113 int ret, i, new, should_zero = 0;
1114 u64 v_blkno, p_blkno;
1115 struct inode *inode = mapping->host;
1116
1117 new = phys == 0 ? 1 : 0;
1118 if (new || unwritten)
1119 should_zero = 1;
1098 1120
1099 if (new) { 1121 if (new) {
1122 u32 tmp_pos;
1123
1100 /* 1124 /*
1101 * This is safe to call with the page locks - it won't take 1125 * This is safe to call with the page locks - it won't take
1102 * any additional semaphores or cluster locks. 1126 * any additional semaphores or cluster locks.
1103 */ 1127 */
1104 tmp_pos = wc->w_cpos; 1128 tmp_pos = cpos;
1105 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, 1129 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1106 &tmp_pos, 1, di_bh, handle, 1130 &tmp_pos, 1, 0, wc->w_di_bh,
1107 data_ac, meta_ac, NULL); 1131 wc->w_handle, data_ac,
1132 meta_ac, NULL);
1108 /* 1133 /*
1109 * This shouldn't happen because we must have already 1134 * This shouldn't happen because we must have already
1110 * calculated the correct meta data allocation required. The 1135 * calculated the correct meta data allocation required. The
@@ -1121,159 +1146,433 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
1121 mlog_errno(ret); 1146 mlog_errno(ret);
1122 goto out; 1147 goto out;
1123 } 1148 }
1149 } else if (unwritten) {
1150 ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
1151 wc->w_handle, cpos, 1, phys,
1152 meta_ac, &wc->w_dealloc);
1153 if (ret < 0) {
1154 mlog_errno(ret);
1155 goto out;
1156 }
1124 } 1157 }
1125 1158
1159 if (should_zero)
1160 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1161 else
1162 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1163
1164 /*
1165 * The only reason this should fail is due to an inability to
1166 * find the extent added.
1167 */
1126 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1168 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1127 NULL); 1169 NULL);
1128 if (ret < 0) { 1170 if (ret < 0) {
 1129 1171 ocfs2_error(inode->i_sb, "Corrupt extent map for inode %llu, "
1130 /* 1172 "at logical block %llu",
1131 * XXX: Should we go readonly here? 1173 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1132 */ 1174 (unsigned long long)v_blkno);
1133
1134 mlog_errno(ret);
1135 goto out; 1175 goto out;
1136 } 1176 }
1137 1177
1138 BUG_ON(p_blkno == 0); 1178 BUG_ON(p_blkno == 0);
1139 1179
1140 for(i = 0; i < numpages; i++) { 1180 for(i = 0; i < wc->w_num_pages; i++) {
1141 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], 1181 int tmpret;
1142 wc, new); 1182
1143 if (ret < 0) { 1183 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1144 mlog_errno(ret); 1184 wc->w_pages[i], cpos,
1145 goto out; 1185 user_pos, user_len,
1186 should_zero);
1187 if (tmpret) {
1188 mlog_errno(tmpret);
1189 if (ret == 0)
 1190 ret = tmpret;
1146 } 1191 }
1147
1148 copied += ret;
1149 } 1192 }
1150 1193
1194 /*
1195 * We only have cleanup to do in case of allocating write.
1196 */
1197 if (ret && new)
1198 ocfs2_write_failure(inode, wc, user_pos, user_len);
1199
1151out: 1200out:
1152 for(i = 0; i < numpages; i++) { 1201
1153 unlock_page(cpages[i]); 1202 return ret;
1154 mark_page_accessed(cpages[i]); 1203}
1155 page_cache_release(cpages[i]); 1204
1205static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1206 struct ocfs2_alloc_context *data_ac,
1207 struct ocfs2_alloc_context *meta_ac,
1208 struct ocfs2_write_ctxt *wc,
1209 loff_t pos, unsigned len)
1210{
1211 int ret, i;
1212 struct ocfs2_write_cluster_desc *desc;
1213
1214 for (i = 0; i < wc->w_clen; i++) {
1215 desc = &wc->w_desc[i];
1216
1217 ret = ocfs2_write_cluster(mapping, desc->c_phys,
1218 desc->c_unwritten, data_ac, meta_ac,
1219 wc, desc->c_cpos, pos, len);
1220 if (ret) {
1221 mlog_errno(ret);
1222 goto out;
1223 }
1156 } 1224 }
1157 kfree(cpages);
1158 1225
1159 return copied ? copied : ret; 1226 ret = 0;
1227out:
1228 return ret;
1160} 1229}
1161 1230
1162static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, 1231/*
1163 struct ocfs2_super *osb, loff_t pos, 1232 * ocfs2_write_end() wants to know which parts of the target page it
1164 size_t count, ocfs2_page_writer *cb, 1233 * should complete the write on. It's easiest to compute them ahead of
1165 void *cb_priv) 1234 * time when a more complete view of the write is available.
1235 */
1236static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1237 struct ocfs2_write_ctxt *wc,
1238 loff_t pos, unsigned len, int alloc)
1166{ 1239{
1167 wc->w_count = count; 1240 struct ocfs2_write_cluster_desc *desc;
1168 wc->w_pos = pos;
1169 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
1170 wc->w_finished_copy = 0;
1171 1241
1172 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 1242 wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
1173 wc->w_large_pages = 1; 1243 wc->w_target_to = wc->w_target_from + len;
1174 else
1175 wc->w_large_pages = 0;
1176 1244
1177 wc->w_write_data_page = cb; 1245 if (alloc == 0)
1178 wc->w_private = cb_priv; 1246 return;
1247
1248 /*
1249 * Allocating write - we may have different boundaries based
1250 * on page size and cluster size.
1251 *
1252 * NOTE: We can no longer compute one value from the other as
 1253 * the actual write length and user-provided length may be
1254 * different.
1255 */
1256
1257 if (wc->w_large_pages) {
1258 /*
1259 * We only care about the 1st and last cluster within
1260 * our range and whether they should be zero'd or not. Either
1261 * value may be extended out to the start/end of a
1262 * newly allocated cluster.
1263 */
1264 desc = &wc->w_desc[0];
1265 if (ocfs2_should_zero_cluster(desc))
1266 ocfs2_figure_cluster_boundaries(osb,
1267 desc->c_cpos,
1268 &wc->w_target_from,
1269 NULL);
1270
1271 desc = &wc->w_desc[wc->w_clen - 1];
1272 if (ocfs2_should_zero_cluster(desc))
1273 ocfs2_figure_cluster_boundaries(osb,
1274 desc->c_cpos,
1275 NULL,
1276 &wc->w_target_to);
1277 } else {
1278 wc->w_target_from = 0;
1279 wc->w_target_to = PAGE_CACHE_SIZE;
1280 }
1179} 1281}
1180 1282
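A worked example of the boundary extension, assuming 8K pages and 4K clusters (so w_large_pages == 1): an allocating write of len == 1000 at pos == 5000 starts with w_target_from == 5000 and w_target_to == 6000. The write lands in cluster 1, which spans bytes [4096, 8192) of page 0, so both boundaries get pushed out to cover the newly allocated cluster:

	/*
	 *   w_target_from: 5000 -> 4096   (cluster start within the page)
	 *   w_target_to:   6000 -> 8192   (cluster end within the page)
	 */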
1181/* 1283/*
1182 * Write a cluster to an inode. The cluster may not be allocated yet, 1284 * Populate each single-cluster write descriptor in the write context
1183 * in which case it will be. This only exists for buffered writes - 1285 * with information about the i/o to be done.
1184 * O_DIRECT takes a more "traditional" path through the kernel.
1185 *
1186 * The caller is responsible for incrementing pos, written counts, etc
1187 * 1286 *
1188 * For file systems that don't support sparse files, pre-allocation 1287 * Returns the number of clusters that will have to be allocated, as
1189 * and page zeroing up until cpos should be done prior to this 1288 * well as a worst case estimate of the number of extent records that
1190 * function call. 1289 * would have to be created during a write to an unwritten region.
1191 *
1192 * Callers should be holding i_sem, and the rw cluster lock.
1193 *
1194 * Returns the number of user bytes written, or less than zero for
1195 * error.
1196 */ 1290 */
1197ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 1291static int ocfs2_populate_write_desc(struct inode *inode,
1198 size_t count, ocfs2_page_writer *actor, 1292 struct ocfs2_write_ctxt *wc,
1199 void *priv) 1293 unsigned int *clusters_to_alloc,
1294 unsigned int *extents_to_split)
1295{
1296 int ret;
1297 struct ocfs2_write_cluster_desc *desc;
1298 unsigned int num_clusters = 0;
1299 unsigned int ext_flags = 0;
1300 u32 phys = 0;
1301 int i;
1302
1303 *clusters_to_alloc = 0;
1304 *extents_to_split = 0;
1305
1306 for (i = 0; i < wc->w_clen; i++) {
1307 desc = &wc->w_desc[i];
1308 desc->c_cpos = wc->w_cpos + i;
1309
1310 if (num_clusters == 0) {
1311 /*
1312 * Need to look up the next extent record.
1313 */
1314 ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
1315 &num_clusters, &ext_flags);
1316 if (ret) {
1317 mlog_errno(ret);
1318 goto out;
1319 }
1320
1321 /*
1322 * Assume worst case - that we're writing in
1323 * the middle of the extent.
1324 *
1325 * We can assume that the write proceeds from
1326 * left to right, in which case the extent
1327 * insert code is smart enough to coalesce the
1328 * next splits into the previous records created.
1329 */
1330 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1331 *extents_to_split = *extents_to_split + 2;
1332 } else if (phys) {
1333 /*
1334 * Only increment phys if it doesn't describe
1335 * a hole.
1336 */
1337 phys++;
1338 }
1339
1340 desc->c_phys = phys;
1341 if (phys == 0) {
1342 desc->c_new = 1;
1343 *clusters_to_alloc = *clusters_to_alloc + 1;
1344 }
1345 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1346 desc->c_unwritten = 1;
1347
1348 num_clusters--;
1349 }
1350
1351 ret = 0;
1352out:
1353 return ret;
1354}
1355
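An example walk through the descriptor loop for a hypothetical two-cluster write, where cluster 0 of the range is a hole and cluster 1 sits in an unwritten extent:

	/*
	 *   i == 0: phys == 0           -> c_new = 1,
	 *                                  clusters_to_alloc = 1
	 *   i == 1: OCFS2_EXT_UNWRITTEN -> c_unwritten = 1,
	 *                                  extents_to_split = 2
	 *
	 * The split estimate is the worst case of writing into the
	 * middle of the unwritten extent, per the comment above.
	 */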
1356int ocfs2_write_begin_nolock(struct address_space *mapping,
1357 loff_t pos, unsigned len, unsigned flags,
1358 struct page **pagep, void **fsdata,
1359 struct buffer_head *di_bh, struct page *mmap_page)
1200{ 1360{
1201 int ret, credits = OCFS2_INODE_UPDATE_CREDITS; 1361 int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1202 ssize_t written = 0; 1362 unsigned int clusters_to_alloc, extents_to_split;
1203 u32 phys; 1363 struct ocfs2_write_ctxt *wc;
1204 struct inode *inode = file->f_mapping->host; 1364 struct inode *inode = mapping->host;
1205 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1206 struct buffer_head *di_bh = NULL;
1207 struct ocfs2_dinode *di; 1366 struct ocfs2_dinode *di;
1208 struct ocfs2_alloc_context *data_ac = NULL; 1367 struct ocfs2_alloc_context *data_ac = NULL;
1209 struct ocfs2_alloc_context *meta_ac = NULL; 1368 struct ocfs2_alloc_context *meta_ac = NULL;
1210 handle_t *handle; 1369 handle_t *handle;
1211 struct ocfs2_write_ctxt wc;
1212
1213 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
1214 1370
1215 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1371 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1216 if (ret) { 1372 if (ret) {
1217 mlog_errno(ret); 1373 mlog_errno(ret);
1218 goto out; 1374 return ret;
1219 } 1375 }
1220 di = (struct ocfs2_dinode *)di_bh->b_data;
1221
1222 /*
1223 * Take alloc sem here to prevent concurrent lookups. That way
1224 * the mapping, zeroing and tree manipulation within
1225 * ocfs2_write() will be safe against ->readpage(). This
1226 * should also serve to lock out allocation from a shared
1227 * writeable region.
1228 */
1229 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1230 1376
1231 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); 1377 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1378 &extents_to_split);
1232 if (ret) { 1379 if (ret) {
1233 mlog_errno(ret); 1380 mlog_errno(ret);
1234 goto out_meta; 1381 goto out;
1235 } 1382 }
1236 1383
1237 /* phys == 0 means that allocation is required. */ 1384 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1238 if (phys == 0) { 1385
1239 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); 1386 /*
1387 * We set w_target_from, w_target_to here so that
1388 * ocfs2_write_end() knows which range in the target page to
1389 * write out. An allocation requires that we write the entire
1390 * cluster range.
1391 */
1392 if (clusters_to_alloc || extents_to_split) {
1393 /*
1394 * XXX: We are stretching the limits of
1395 * ocfs2_lock_allocators(). It greatly over-estimates
1396 * the work to be done.
1397 */
1398 ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
1399 extents_to_split, &data_ac, &meta_ac);
1240 if (ret) { 1400 if (ret) {
1241 mlog_errno(ret); 1401 mlog_errno(ret);
1242 goto out_meta; 1402 goto out;
1243 } 1403 }
1244 1404
1245 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); 1405 credits = ocfs2_calc_extend_credits(inode->i_sb, di,
1246 } 1406 clusters_to_alloc);
1247 1407
1248 ret = ocfs2_data_lock(inode, 1);
1249 if (ret) {
1250 mlog_errno(ret);
1251 goto out_meta;
1252 } 1408 }
1253 1409
1410 ocfs2_set_target_boundaries(osb, wc, pos, len,
1411 clusters_to_alloc + extents_to_split);
1412
1254 handle = ocfs2_start_trans(osb, credits); 1413 handle = ocfs2_start_trans(osb, credits);
1255 if (IS_ERR(handle)) { 1414 if (IS_ERR(handle)) {
1256 ret = PTR_ERR(handle); 1415 ret = PTR_ERR(handle);
1257 mlog_errno(ret); 1416 mlog_errno(ret);
1258 goto out_data; 1417 goto out;
1259 } 1418 }
1260 1419
1261 written = ocfs2_write(file, phys, handle, di_bh, data_ac, 1420 wc->w_handle = handle;
1262 meta_ac, &wc); 1421
1263 if (written < 0) { 1422 /*
1264 ret = written; 1423 * We don't want this to fail in ocfs2_write_end(), so do it
1424 * here.
1425 */
1426 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
1427 OCFS2_JOURNAL_ACCESS_WRITE);
1428 if (ret) {
1265 mlog_errno(ret); 1429 mlog_errno(ret);
1266 goto out_commit; 1430 goto out_commit;
1267 } 1431 }
1268 1432
1269 ret = ocfs2_journal_access(handle, inode, di_bh, 1433 /*
1270 OCFS2_JOURNAL_ACCESS_WRITE); 1434 * Fill our page array first. That way we've grabbed enough so
1435 * that we can zero and flush if we error after adding the
1436 * extent.
1437 */
1438 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
1439 clusters_to_alloc + extents_to_split,
1440 mmap_page);
1271 if (ret) { 1441 if (ret) {
1272 mlog_errno(ret); 1442 mlog_errno(ret);
1273 goto out_commit; 1443 goto out_commit;
1274 } 1444 }
1275 1445
1276 pos += written; 1446 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1447 len);
1448 if (ret) {
1449 mlog_errno(ret);
1450 goto out_commit;
1451 }
1452
1453 if (data_ac)
1454 ocfs2_free_alloc_context(data_ac);
1455 if (meta_ac)
1456 ocfs2_free_alloc_context(meta_ac);
1457
1458 *pagep = wc->w_target_page;
1459 *fsdata = wc;
1460 return 0;
1461out_commit:
1462 ocfs2_commit_trans(osb, handle);
1463
1464out:
1465 ocfs2_free_write_ctxt(wc);
1466
1467 if (data_ac)
1468 ocfs2_free_alloc_context(data_ac);
1469 if (meta_ac)
1470 ocfs2_free_alloc_context(meta_ac);
1471 return ret;
1472}
1473
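A hedged sketch of how the _nolock pair composes; this mirrors ocfs2_write_begin()/ocfs2_write_end() below, and the mmap fault path is expected to pass its own page as mmap_page:

	ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, &page,
				       &fsdata, di_bh, NULL);
	if (ret)
		goto out;

	/* ... copy 'copied' bytes of user data into 'page' ... */

	copied = ocfs2_write_end_nolock(mapping, pos, len, copied,
					page, fsdata);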
1474int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1475 loff_t pos, unsigned len, unsigned flags,
1476 struct page **pagep, void **fsdata)
1477{
1478 int ret;
1479 struct buffer_head *di_bh = NULL;
1480 struct inode *inode = mapping->host;
1481
1482 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1483 if (ret) {
1484 mlog_errno(ret);
1485 return ret;
1486 }
1487
1488 /*
1489 * Take alloc sem here to prevent concurrent lookups. That way
1490 * the mapping, zeroing and tree manipulation within
1491 * ocfs2_write() will be safe against ->readpage(). This
1492 * should also serve to lock out allocation from a shared
1493 * writeable region.
1494 */
1495 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1496
1497 ret = ocfs2_data_lock(inode, 1);
1498 if (ret) {
1499 mlog_errno(ret);
1500 goto out_fail;
1501 }
1502
1503 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1504 fsdata, di_bh, NULL);
1505 if (ret) {
1506 mlog_errno(ret);
1507 goto out_fail_data;
1508 }
1509
1510 brelse(di_bh);
1511
1512 return 0;
1513
1514out_fail_data:
1515 ocfs2_data_unlock(inode, 1);
1516out_fail:
1517 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1518
1519 brelse(di_bh);
1520 ocfs2_meta_unlock(inode, 1);
1521
1522 return ret;
1523}
1524
1525int ocfs2_write_end_nolock(struct address_space *mapping,
1526 loff_t pos, unsigned len, unsigned copied,
1527 struct page *page, void *fsdata)
1528{
1529 int i;
1530 unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
1531 struct inode *inode = mapping->host;
1532 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1533 struct ocfs2_write_ctxt *wc = fsdata;
1534 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1535 handle_t *handle = wc->w_handle;
1536 struct page *tmppage;
1537
1538 if (unlikely(copied < len)) {
1539 if (!PageUptodate(wc->w_target_page))
1540 copied = 0;
1541
1542 ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
1543 start+len);
1544 }
1545 flush_dcache_page(wc->w_target_page);
1546
1547 for(i = 0; i < wc->w_num_pages; i++) {
1548 tmppage = wc->w_pages[i];
1549
1550 if (tmppage == wc->w_target_page) {
1551 from = wc->w_target_from;
1552 to = wc->w_target_to;
1553
1554 BUG_ON(from > PAGE_CACHE_SIZE ||
1555 to > PAGE_CACHE_SIZE ||
1556 to < from);
1557 } else {
1558 /*
1559 * Pages adjacent to the target (if any) imply
1560 * a hole-filling write in which case we want
1561 * to flush their entire range.
1562 */
1563 from = 0;
1564 to = PAGE_CACHE_SIZE;
1565 }
1566
1567 if (ocfs2_should_order_data(inode))
1568 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
1569 from, to, NULL,
1570 ocfs2_journal_dirty_data);
1571
1572 block_commit_write(tmppage, from, to);
1573 }
1574
1575 pos += copied;
1277 if (pos > inode->i_size) { 1576 if (pos > inode->i_size) {
1278 i_size_write(inode, pos); 1577 i_size_write(inode, pos);
1279 mark_inode_dirty(inode); 1578 mark_inode_dirty(inode);
@@ -1283,29 +1582,31 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
1283 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1582 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1284 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 1583 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1285 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1584 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1585 ocfs2_journal_dirty(handle, wc->w_di_bh);
1286 1586
1287 ret = ocfs2_journal_dirty(handle, di_bh);
1288 if (ret)
1289 mlog_errno(ret);
1290
1291out_commit:
1292 ocfs2_commit_trans(osb, handle); 1587 ocfs2_commit_trans(osb, handle);
1293 1588
1294out_data: 1589 ocfs2_run_deallocs(osb, &wc->w_dealloc);
1295 ocfs2_data_unlock(inode, 1); 1590
1591 ocfs2_free_write_ctxt(wc);
1592
1593 return copied;
1594}
1595
1596int ocfs2_write_end(struct file *file, struct address_space *mapping,
1597 loff_t pos, unsigned len, unsigned copied,
1598 struct page *page, void *fsdata)
1599{
1600 int ret;
1601 struct inode *inode = mapping->host;
1296 1602
1297out_meta: 1603 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1604
1605 ocfs2_data_unlock(inode, 1);
1298 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1606 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1299 ocfs2_meta_unlock(inode, 1); 1607 ocfs2_meta_unlock(inode, 1);
1300 1608
1301out: 1609 return ret;
1302 brelse(di_bh);
1303 if (data_ac)
1304 ocfs2_free_alloc_context(data_ac);
1305 if (meta_ac)
1306 ocfs2_free_alloc_context(meta_ac);
1307
1308 return written ? written : ret;
1309} 1610}
1310 1611
1311const struct address_space_operations ocfs2_aops = { 1612const struct address_space_operations ocfs2_aops = {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 45821d479b5a..389579bd64e3 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -42,57 +42,22 @@ int walk_page_buffers( handle_t *handle,
42 int (*fn)( handle_t *handle, 42 int (*fn)( handle_t *handle,
43 struct buffer_head *bh)); 43 struct buffer_head *bh));
44 44
45struct ocfs2_write_ctxt; 45int ocfs2_write_begin(struct file *file, struct address_space *mapping,
46typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, 46 loff_t pos, unsigned len, unsigned flags,
47 u64 *, unsigned int *, unsigned int *); 47 struct page **pagep, void **fsdata);
48 48
49ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 49int ocfs2_write_end(struct file *file, struct address_space *mapping,
50 size_t count, ocfs2_page_writer *actor, 50 loff_t pos, unsigned len, unsigned copied,
51 void *priv); 51 struct page *page, void *fsdata);
52 52
53struct ocfs2_write_ctxt { 53int ocfs2_write_end_nolock(struct address_space *mapping,
54 size_t w_count; 54 loff_t pos, unsigned len, unsigned copied,
55 loff_t w_pos; 55 struct page *page, void *fsdata);
56 u32 w_cpos;
57 unsigned int w_finished_copy;
58 56
59 /* This is true if page_size > cluster_size */ 57int ocfs2_write_begin_nolock(struct address_space *mapping,
60 unsigned int w_large_pages; 58 loff_t pos, unsigned len, unsigned flags,
61 59 struct page **pagep, void **fsdata,
62 /* Filler callback and private data */ 60 struct buffer_head *di_bh, struct page *mmap_page);
63 ocfs2_page_writer *w_write_data_page;
64 void *w_private;
65
66 /* Only valid for the filler callback */
67 struct page *w_this_page;
68 unsigned int w_this_page_new;
69};
70
71struct ocfs2_buffered_write_priv {
72 char *b_src_buf;
73 const struct iovec *b_cur_iov; /* Current iovec */
74 size_t b_cur_off; /* Offset in the
75 * current iovec */
76};
77int ocfs2_map_and_write_user_data(struct inode *inode,
78 struct ocfs2_write_ctxt *wc,
79 u64 *p_blkno,
80 unsigned int *ret_from,
81 unsigned int *ret_to);
82
83struct ocfs2_splice_write_priv {
84 struct splice_desc *s_sd;
85 struct pipe_buffer *s_buf;
86 struct pipe_inode_info *s_pipe;
87 /* Neither offset value is ever larger than one page */
88 unsigned int s_offset;
89 unsigned int s_buf_offset;
90};
91int ocfs2_map_and_write_splice_data(struct inode *inode,
92 struct ocfs2_write_ctxt *wc,
93 u64 *p_blkno,
94 unsigned int *ret_from,
95 unsigned int *ret_to);
96 61
97/* all ocfs2_dio_end_io()'s fault */ 62/* all ocfs2_dio_end_io()'s fault */
98#define ocfs2_iocb_is_rw_locked(iocb) \ 63#define ocfs2_iocb_is_rw_locked(iocb) \
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 979113479c66..2bd7f788cf34 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1335,6 +1335,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1335 ret = wait_event_interruptible(o2hb_steady_queue, 1335 ret = wait_event_interruptible(o2hb_steady_queue,
1336 atomic_read(&reg->hr_steady_iterations) == 0); 1336 atomic_read(&reg->hr_steady_iterations) == 0);
1337 if (ret) { 1337 if (ret) {
1338 /* We got interrupted (hello ptrace!). Clean up */
1338 spin_lock(&o2hb_live_lock); 1339 spin_lock(&o2hb_live_lock);
1339 hb_task = reg->hr_task; 1340 hb_task = reg->hr_task;
1340 reg->hr_task = NULL; 1341 reg->hr_task = NULL;
@@ -1345,7 +1346,16 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1345 goto out; 1346 goto out;
1346 } 1347 }
1347 1348
1348 ret = count; 1349 /* Ok, we were woken. Make sure it wasn't by drop_item() */
1350 spin_lock(&o2hb_live_lock);
1351 hb_task = reg->hr_task;
1352 spin_unlock(&o2hb_live_lock);
1353
1354 if (hb_task)
1355 ret = count;
1356 else
1357 ret = -EIO;
1358
1349out: 1359out:
1350 if (filp) 1360 if (filp)
1351 fput(filp); 1361 fput(filp);
@@ -1523,6 +1533,15 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1523 if (hb_task) 1533 if (hb_task)
1524 kthread_stop(hb_task); 1534 kthread_stop(hb_task);
1525 1535
1536 /*
1537 * If we're racing a dev_write(), we need to wake them. They will
 1538 * check reg->hr_task.
1539 */
1540 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1541 atomic_set(&reg->hr_steady_iterations, 0);
1542 wake_up(&o2hb_steady_queue);
1543 }
1544
1526 config_item_put(item); 1545 config_item_put(item);
1527} 1546}
1528 1547
@@ -1665,7 +1684,67 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
1665} 1684}
1666EXPORT_SYMBOL_GPL(o2hb_setup_callback); 1685EXPORT_SYMBOL_GPL(o2hb_setup_callback);
1667 1686
1668int o2hb_register_callback(struct o2hb_callback_func *hc) 1687static struct o2hb_region *o2hb_find_region(const char *region_uuid)
1688{
1689 struct o2hb_region *p, *reg = NULL;
1690
1691 assert_spin_locked(&o2hb_live_lock);
1692
1693 list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
1694 if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
1695 reg = p;
1696 break;
1697 }
1698 }
1699
1700 return reg;
1701}
1702
1703static int o2hb_region_get(const char *region_uuid)
1704{
1705 int ret = 0;
1706 struct o2hb_region *reg;
1707
1708 spin_lock(&o2hb_live_lock);
1709
1710 reg = o2hb_find_region(region_uuid);
1711 if (!reg)
1712 ret = -ENOENT;
1713 spin_unlock(&o2hb_live_lock);
1714
1715 if (ret)
1716 goto out;
1717
1718 ret = o2nm_depend_this_node();
1719 if (ret)
1720 goto out;
1721
1722 ret = o2nm_depend_item(&reg->hr_item);
1723 if (ret)
1724 o2nm_undepend_this_node();
1725
1726out:
1727 return ret;
1728}
1729
1730static void o2hb_region_put(const char *region_uuid)
1731{
1732 struct o2hb_region *reg;
1733
1734 spin_lock(&o2hb_live_lock);
1735
1736 reg = o2hb_find_region(region_uuid);
1737
1738 spin_unlock(&o2hb_live_lock);
1739
1740 if (reg) {
1741 o2nm_undepend_item(&reg->hr_item);
1742 o2nm_undepend_this_node();
1743 }
1744}
1745
1746int o2hb_register_callback(const char *region_uuid,
1747 struct o2hb_callback_func *hc)
1669{ 1748{
1670 struct o2hb_callback_func *tmp; 1749 struct o2hb_callback_func *tmp;
1671 struct list_head *iter; 1750 struct list_head *iter;
@@ -1681,6 +1760,12 @@ int o2hb_register_callback(struct o2hb_callback_func *hc)
1681 goto out; 1760 goto out;
1682 } 1761 }
1683 1762
1763 if (region_uuid) {
1764 ret = o2hb_region_get(region_uuid);
1765 if (ret)
1766 goto out;
1767 }
1768
1684 down_write(&o2hb_callback_sem); 1769 down_write(&o2hb_callback_sem);
1685 1770
1686 list_for_each(iter, &hbcall->list) { 1771 list_for_each(iter, &hbcall->list) {
@@ -1702,16 +1787,21 @@ out:
1702} 1787}
1703EXPORT_SYMBOL_GPL(o2hb_register_callback); 1788EXPORT_SYMBOL_GPL(o2hb_register_callback);
1704 1789
1705void o2hb_unregister_callback(struct o2hb_callback_func *hc) 1790void o2hb_unregister_callback(const char *region_uuid,
1791 struct o2hb_callback_func *hc)
1706{ 1792{
1707 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); 1793 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1708 1794
1709 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", 1795 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
1710 __builtin_return_address(0), hc); 1796 __builtin_return_address(0), hc);
1711 1797
1798 /* XXX Can this happen _with_ a region reference? */
1712 if (list_empty(&hc->hc_item)) 1799 if (list_empty(&hc->hc_item))
1713 return; 1800 return;
1714 1801
1802 if (region_uuid)
1803 o2hb_region_put(region_uuid);
1804
1715 down_write(&o2hb_callback_sem); 1805 down_write(&o2hb_callback_sem);
1716 1806
1717 list_del_init(&hc->hc_item); 1807 list_del_init(&hc->hc_item);
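
Taken together, the first three hunks close a race between a process sleeping in o2hb_region_dev_write() and a concurrent o2hb_heartbeat_group_drop_item(): drop_item() now forces the wait condition true and wakes the queue, and the woken writer re-reads reg->hr_task under o2hb_live_lock to tell "region went steady" (return count) from "region torn down" (return -EIO). Condensed from the hunks above:

	/* writer side, after wait_event_interruptible() returns 0 */
	spin_lock(&o2hb_live_lock);
	hb_task = reg->hr_task;		/* NULL if drop_item() ran */
	spin_unlock(&o2hb_live_lock);
	ret = hb_task ? count : -EIO;

	/* teardown side, after stopping the heartbeat thread */
	if (atomic_read(&reg->hr_steady_iterations) != 0) {
		atomic_set(&reg->hr_steady_iterations, 0); /* satisfy the wait */
		wake_up(&o2hb_steady_queue);
	}
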
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index cc6d40b39771..35397dd5ecdb 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -69,8 +69,10 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
 			 o2hb_cb_func *func,
 			 void *data,
 			 int priority);
-int o2hb_register_callback(struct o2hb_callback_func *hc);
-void o2hb_unregister_callback(struct o2hb_callback_func *hc);
+int o2hb_register_callback(const char *region_uuid,
+			   struct o2hb_callback_func *hc);
+void o2hb_unregister_callback(const char *region_uuid,
+			      struct o2hb_callback_func *hc);
 void o2hb_fill_node_map(unsigned long *map,
 			unsigned bytes);
 void o2hb_init(void);
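
Callers that care about one heartbeat region now pass its uuid so registration also pins the region (via o2hb_region_get() above); global consumers pass NULL, as the o2net and dlm updates below do. A hedged usage sketch, where my_uuid and my_down_cb are placeholder names:

	static struct o2hb_callback_func my_cb;

	o2hb_setup_callback(&my_cb, O2HB_NODE_DOWN_CB, my_down_cb, NULL, 0);
	ret = o2hb_register_callback(my_uuid, &my_cb);	/* takes a region ref */
	if (!ret) {
		/* ... */
		o2hb_unregister_callback(my_uuid, &my_cb); /* drops the ref */
	}
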
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 9f5ad0f01ce0..af2070da308b 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -900,6 +900,46 @@ static struct o2nm_cluster_group o2nm_cluster_group = {
 	},
 };
 
+int o2nm_depend_item(struct config_item *item)
+{
+	return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
+}
+
+void o2nm_undepend_item(struct config_item *item)
+{
+	configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
+}
+
+int o2nm_depend_this_node(void)
+{
+	int ret = 0;
+	struct o2nm_node *local_node;
+
+	local_node = o2nm_get_node_by_num(o2nm_this_node());
+	if (!local_node) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = o2nm_depend_item(&local_node->nd_item);
+	o2nm_node_put(local_node);
+
+out:
+	return ret;
+}
+
+void o2nm_undepend_this_node(void)
+{
+	struct o2nm_node *local_node;
+
+	local_node = o2nm_get_node_by_num(o2nm_this_node());
+	BUG_ON(!local_node);
+
+	o2nm_undepend_item(&local_node->nd_item);
+	o2nm_node_put(local_node);
+}
+
+
 static void __exit exit_o2nm(void)
 {
 	if (ocfs2_table_header)
@@ -934,7 +974,7 @@ static int __init init_o2nm(void)
 		goto out_sysctl;
 
 	config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
-	init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem);
+	mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
 	ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
 	if (ret) {
 		printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
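
The new helpers wrap configfs_depend_item(), which pins a configfs item so userspace cannot rmdir it until the matching undepend call. o2hb_region_get() above shows the intended nesting, condensed here with its unwind on failure:

	ret = o2nm_depend_this_node();		/* pin our own node item */
	if (!ret) {
		ret = o2nm_depend_item(&reg->hr_item);	/* then the region */
		if (ret)
			o2nm_undepend_this_node();	/* unwind */
	}
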
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index 070522138ae2..7c860361b8dd 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -77,4 +77,9 @@ struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
 void o2nm_node_get(struct o2nm_node *node);
 void o2nm_node_put(struct o2nm_node *node);
 
+int o2nm_depend_item(struct config_item *item);
+void o2nm_undepend_item(struct config_item *item);
+int o2nm_depend_this_node(void);
+void o2nm_undepend_this_node(void);
+
 #endif /* O2CLUSTER_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0b229a9c7952..f0bdfd944c44 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -261,14 +261,12 @@ out:
 
 static void o2net_complete_nodes_nsw(struct o2net_node *nn)
 {
-	struct list_head *iter, *tmp;
+	struct o2net_status_wait *nsw, *tmp;
 	unsigned int num_kills = 0;
-	struct o2net_status_wait *nsw;
 
 	assert_spin_locked(&nn->nn_lock);
 
-	list_for_each_safe(iter, tmp, &nn->nn_status_list) {
-		nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
+	list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) {
 		o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
 		num_kills++;
 	}
@@ -764,13 +762,10 @@ EXPORT_SYMBOL_GPL(o2net_register_handler);
 
 void o2net_unregister_handler_list(struct list_head *list)
 {
-	struct list_head *pos, *n;
-	struct o2net_msg_handler *nmh;
+	struct o2net_msg_handler *nmh, *n;
 
 	write_lock(&o2net_handler_lock);
-	list_for_each_safe(pos, n, list) {
-		nmh = list_entry(pos, struct o2net_msg_handler,
-				 nh_unregister_item);
+	list_for_each_entry_safe(nmh, n, list, nh_unregister_item) {
 		mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
 		     nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
 		rb_erase(&nmh->nh_node, &o2net_handler_tree);
@@ -1638,8 +1633,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 
 void o2net_unregister_hb_callbacks(void)
 {
-	o2hb_unregister_callback(&o2net_hb_up);
-	o2hb_unregister_callback(&o2net_hb_down);
+	o2hb_unregister_callback(NULL, &o2net_hb_up);
+	o2hb_unregister_callback(NULL, &o2net_hb_down);
 }
 
 int o2net_register_hb_callbacks(void)
@@ -1651,9 +1646,9 @@ int o2net_register_hb_callbacks(void)
 	o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
 			    o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
 
-	ret = o2hb_register_callback(&o2net_hb_up);
+	ret = o2hb_register_callback(NULL, &o2net_hb_up);
 	if (ret == 0)
-		ret = o2hb_register_callback(&o2net_hb_down);
+		ret = o2hb_register_callback(NULL, &o2net_hb_down);
 
 	if (ret)
 		o2net_unregister_hb_callbacks();
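
These hunks, like the dlm conversions below, replace open-coded list_for_each()/list_entry() pairs with the typed list_for_each_entry() iterators. The _safe variants stash the next entry before the loop body runs, which is what makes deleting the current node legal; a minimal sketch of the idiom (the kfree() stands in for whatever teardown the real loops do):

	struct o2net_msg_handler *nmh, *tmp;

	list_for_each_entry_safe(nmh, tmp, list, nh_unregister_item) {
		list_del_init(&nmh->nh_unregister_item); /* safe: tmp is cached */
		kfree(nmh);
	}
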
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c441ef1f2bad..0d5fdde959c8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -368,7 +368,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
 	u32 offset = OCFS2_I(dir)->ip_clusters;
 
 	status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
-					    1, parent_fe_bh, handle,
+					    1, 0, parent_fe_bh, handle,
 					    data_ac, meta_ac, NULL);
 	BUG_ON(status == -EAGAIN);
 	if (status < 0) {
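
The extra 0 threads through the new mark_unwritten parameter added to ocfs2_do_extend_allocation() in the file.c hunks below; directory data is always written immediately, so directories never request unwritten extents:

	/* new signature, per the file.c hunk below */
	status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
					    1, 0 /* mark_unwritten */,
					    parent_fe_bh, handle,
					    data_ac, meta_ac, NULL);
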
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d836b98dd99a..6954565b8ccb 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1128,8 +1128,8 @@ bail:
 
 static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
 {
-	o2hb_unregister_callback(&dlm->dlm_hb_up);
-	o2hb_unregister_callback(&dlm->dlm_hb_down);
+	o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
+	o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
 	o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
 }
 
@@ -1141,13 +1141,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
 
 	o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
 			    dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
-	status = o2hb_register_callback(&dlm->dlm_hb_down);
+	status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
 	if (status)
 		goto bail;
 
 	o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
 			    dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
-	status = o2hb_register_callback(&dlm->dlm_hb_up);
+	status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
 	if (status)
 		goto bail;
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 6edffca99d98..65b2b9b92688 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -192,25 +192,20 @@ static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
 static void dlm_dump_mles(struct dlm_ctxt *dlm)
 {
 	struct dlm_master_list_entry *mle;
-	struct list_head *iter;
 
 	mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
 	spin_lock(&dlm->master_lock);
-	list_for_each(iter, &dlm->master_list) {
-		mle = list_entry(iter, struct dlm_master_list_entry, list);
+	list_for_each_entry(mle, &dlm->master_list, list)
 		dlm_print_one_mle(mle);
-	}
 	spin_unlock(&dlm->master_lock);
 }
 
 int dlm_dump_all_mles(const char __user *data, unsigned int len)
 {
-	struct list_head *iter;
 	struct dlm_ctxt *dlm;
 
 	spin_lock(&dlm_domain_lock);
-	list_for_each(iter, &dlm_domains) {
-		dlm = list_entry (iter, struct dlm_ctxt, list);
+	list_for_each_entry(dlm, &dlm_domains, list) {
 		mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
 		dlm_dump_mles(dlm);
 	}
@@ -454,12 +449,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
 			char *name, unsigned int namelen)
 {
 	struct dlm_master_list_entry *tmpmle;
-	struct list_head *iter;
 
 	assert_spin_locked(&dlm->master_lock);
 
-	list_for_each(iter, &dlm->master_list) {
-		tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
+	list_for_each_entry(tmpmle, &dlm->master_list, list) {
 		if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
 			continue;
 		dlm_get_mle(tmpmle);
@@ -472,13 +465,10 @@ void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
 {
 	struct dlm_master_list_entry *mle;
-	struct list_head *iter;
 
 	assert_spin_locked(&dlm->spinlock);
 
-	list_for_each(iter, &dlm->mle_hb_events) {
-		mle = list_entry(iter, struct dlm_master_list_entry,
-				 hb_events);
+	list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
 		if (node_up)
 			dlm_mle_node_up(dlm, mle, NULL, idx);
 		else
@@ -2434,7 +2424,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
 	int ret;
 	int i;
 	int count = 0;
-	struct list_head *queue, *iter;
+	struct list_head *queue;
 	struct dlm_lock *lock;
 
 	assert_spin_locked(&res->spinlock);
@@ -2453,8 +2443,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
 	ret = 0;
 	queue = &res->granted;
 	for (i = 0; i < 3; i++) {
-		list_for_each(iter, queue) {
-			lock = list_entry(iter, struct dlm_lock, list);
+		list_for_each_entry(lock, queue, list) {
 			++count;
 			if (lock->ml.node == dlm->node_num) {
 				mlog(0, "found a lock owned by this node still "
@@ -2923,18 +2912,16 @@ again:
 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
 				      struct dlm_lock_resource *res)
 {
-	struct list_head *iter, *iter2;
 	struct list_head *queue = &res->granted;
 	int i, bit;
-	struct dlm_lock *lock;
+	struct dlm_lock *lock, *next;
 
 	assert_spin_locked(&res->spinlock);
 
 	BUG_ON(res->owner == dlm->node_num);
 
 	for (i=0; i<3; i++) {
-		list_for_each_safe(iter, iter2, queue) {
-			lock = list_entry (iter, struct dlm_lock, list);
+		list_for_each_entry_safe(lock, next, queue, list) {
 			if (lock->ml.node != dlm->node_num) {
 				mlog(0, "putting lock for node %u\n",
 				     lock->ml.node);
@@ -2976,7 +2963,6 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
 {
 	int i;
 	struct list_head *queue = &res->granted;
-	struct list_head *iter;
 	struct dlm_lock *lock;
 	int nodenum;
 
@@ -2984,10 +2970,9 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
 
 	spin_lock(&res->spinlock);
 	for (i=0; i<3; i++) {
-		list_for_each(iter, queue) {
+		list_for_each_entry(lock, queue, list) {
 			/* up to the caller to make sure this node
 			 * is alive */
-			lock = list_entry (iter, struct dlm_lock, list);
 			if (lock->ml.node != dlm->node_num) {
 				spin_unlock(&res->spinlock);
 				return lock->ml.node;
@@ -3234,8 +3219,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 
 void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
 {
-	struct list_head *iter, *iter2;
-	struct dlm_master_list_entry *mle;
+	struct dlm_master_list_entry *mle, *next;
 	struct dlm_lock_resource *res;
 	unsigned int hash;
 
@@ -3245,9 +3229,7 @@ top:
 
 	/* clean the master list */
 	spin_lock(&dlm->master_lock);
-	list_for_each_safe(iter, iter2, &dlm->master_list) {
-		mle = list_entry(iter, struct dlm_master_list_entry, list);
-
+	list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
 		BUG_ON(mle->type != DLM_MLE_BLOCK &&
 		       mle->type != DLM_MLE_MASTER &&
 		       mle->type != DLM_MLE_MIGRATION);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 671c4ed58ee2..a2c33160bfd6 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -158,8 +158,7 @@ void dlm_dispatch_work(struct work_struct *work)
 	struct dlm_ctxt *dlm =
 		container_of(work, struct dlm_ctxt, dispatched_work);
 	LIST_HEAD(tmp_list);
-	struct list_head *iter, *iter2;
-	struct dlm_work_item *item;
+	struct dlm_work_item *item, *next;
 	dlm_workfunc_t *workfunc;
 	int tot=0;
 
@@ -167,13 +166,12 @@ void dlm_dispatch_work(struct work_struct *work)
 	list_splice_init(&dlm->work_list, &tmp_list);
 	spin_unlock(&dlm->work_lock);
 
-	list_for_each_safe(iter, iter2, &tmp_list) {
+	list_for_each_entry(item, &tmp_list, list) {
 		tot++;
 	}
 	mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
 
-	list_for_each_safe(iter, iter2, &tmp_list) {
-		item = list_entry(iter, struct dlm_work_item, list);
+	list_for_each_entry_safe(item, next, &tmp_list, list) {
 		workfunc = item->func;
 		list_del_init(&item->list);
 
@@ -549,7 +547,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 {
 	int status = 0;
 	struct dlm_reco_node_data *ndata;
-	struct list_head *iter;
 	int all_nodes_done;
 	int destroy = 0;
 	int pass = 0;
@@ -567,8 +564,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 
 	/* safe to access the node data list without a lock, since this
 	 * process is the only one to change the list */
-	list_for_each(iter, &dlm->reco.node_data) {
-		ndata = list_entry (iter, struct dlm_reco_node_data, list);
+	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
 		BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
 		ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
 
@@ -655,9 +651,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 	 * done, or if anyone died */
 	all_nodes_done = 1;
 	spin_lock(&dlm_reco_state_lock);
-	list_for_each(iter, &dlm->reco.node_data) {
-		ndata = list_entry (iter, struct dlm_reco_node_data, list);
-
+	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
 		mlog(0, "checking recovery state of node %u\n",
 		     ndata->node_num);
 		switch (ndata->state) {
@@ -774,16 +768,14 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
 
 static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
 {
-	struct list_head *iter, *iter2;
-	struct dlm_reco_node_data *ndata;
+	struct dlm_reco_node_data *ndata, *next;
 	LIST_HEAD(tmplist);
 
 	spin_lock(&dlm_reco_state_lock);
 	list_splice_init(&dlm->reco.node_data, &tmplist);
 	spin_unlock(&dlm_reco_state_lock);
 
-	list_for_each_safe(iter, iter2, &tmplist) {
-		ndata = list_entry (iter, struct dlm_reco_node_data, list);
+	list_for_each_entry_safe(ndata, next, &tmplist, list) {
 		list_del_init(&ndata->list);
 		kfree(ndata);
 	}
@@ -876,7 +868,6 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	struct dlm_lock_resource *res;
 	struct dlm_ctxt *dlm;
 	LIST_HEAD(resources);
-	struct list_head *iter;
 	int ret;
 	u8 dead_node, reco_master;
 	int skip_all_done = 0;
@@ -920,8 +911,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 
 	/* any errors returned will be due to the new_master dying,
 	 * the dlm_reco_thread should detect this */
-	list_for_each(iter, &resources) {
-		res = list_entry (iter, struct dlm_lock_resource, recovering);
+	list_for_each_entry(res, &resources, recovering) {
 		ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
 					   DLM_MRES_RECOVERY);
 		if (ret < 0) {
@@ -983,7 +973,6 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
 {
 	struct dlm_ctxt *dlm = data;
 	struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
-	struct list_head *iter;
 	struct dlm_reco_node_data *ndata = NULL;
 	int ret = -EINVAL;
 
@@ -1000,8 +989,7 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
 	     dlm->reco.dead_node, done->node_idx, dlm->node_num);
 
 	spin_lock(&dlm_reco_state_lock);
-	list_for_each(iter, &dlm->reco.node_data) {
-		ndata = list_entry (iter, struct dlm_reco_node_data, list);
+	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
 		if (ndata->node_num != done->node_idx)
 			continue;
 
@@ -1049,13 +1037,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
 					struct list_head *list,
 					u8 dead_node)
 {
-	struct dlm_lock_resource *res;
-	struct list_head *iter, *iter2;
+	struct dlm_lock_resource *res, *next;
 	struct dlm_lock *lock;
 
 	spin_lock(&dlm->spinlock);
-	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
-		res = list_entry (iter, struct dlm_lock_resource, recovering);
+	list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
 		/* always prune any $RECOVERY entries for dead nodes,
 		 * otherwise hangs can occur during later recovery */
 		if (dlm_is_recovery_lock(res->lockname.name,
@@ -1169,7 +1155,7 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
 					u8 flags, u8 master)
 {
 	/* mres here is one full page */
-	memset(mres, 0, PAGE_SIZE);
+	clear_page(mres);
 	mres->lockname_len = namelen;
 	memcpy(mres->lockname, lockname, namelen);
 	mres->num_locks = 0;
@@ -1252,7 +1238,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 			 struct dlm_migratable_lockres *mres,
 			 u8 send_to, u8 flags)
 {
-	struct list_head *queue, *iter;
+	struct list_head *queue;
 	int total_locks, i;
 	u64 mig_cookie = 0;
 	struct dlm_lock *lock;
@@ -1278,9 +1264,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 	total_locks = 0;
 	for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
 		queue = dlm_list_idx_to_ptr(res, i);
-		list_for_each(iter, queue) {
-			lock = list_entry (iter, struct dlm_lock, list);
-
+		list_for_each_entry(lock, queue, list) {
 			/* add another lock. */
 			total_locks++;
 			if (!dlm_add_lock_to_array(lock, mres, i))
@@ -1717,7 +1701,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 	struct dlm_lockstatus *lksb = NULL;
 	int ret = 0;
 	int i, j, bad;
-	struct list_head *iter;
 	struct dlm_lock *lock = NULL;
 	u8 from = O2NM_MAX_NODES;
 	unsigned int added = 0;
@@ -1755,8 +1738,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 			spin_lock(&res->spinlock);
 			for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
 				tmpq = dlm_list_idx_to_ptr(res, j);
-				list_for_each(iter, tmpq) {
-					lock = list_entry (iter, struct dlm_lock, list);
+				list_for_each_entry(lock, tmpq, list) {
 					if (lock->ml.cookie != ml->cookie)
 						lock = NULL;
 					else
@@ -1930,8 +1912,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
 				       struct dlm_lock_resource *res)
 {
 	int i;
-	struct list_head *queue, *iter, *iter2;
-	struct dlm_lock *lock;
+	struct list_head *queue;
+	struct dlm_lock *lock, *next;
 
 	res->state |= DLM_LOCK_RES_RECOVERING;
 	if (!list_empty(&res->recovering)) {
@@ -1947,8 +1929,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
 	/* find any pending locks and put them back on proper list */
 	for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
 		queue = dlm_list_idx_to_ptr(res, i);
-		list_for_each_safe(iter, iter2, queue) {
-			lock = list_entry (iter, struct dlm_lock, list);
+		list_for_each_entry_safe(lock, next, queue, list) {
 			dlm_lock_get(lock);
 			if (lock->convert_pending) {
 				/* move converting lock back to granted */
@@ -2013,18 +1994,15 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 					      u8 dead_node, u8 new_master)
 {
 	int i;
-	struct list_head *iter, *iter2;
 	struct hlist_node *hash_iter;
 	struct hlist_head *bucket;
-
-	struct dlm_lock_resource *res;
+	struct dlm_lock_resource *res, *next;
 
 	mlog_entry_void();
 
 	assert_spin_locked(&dlm->spinlock);
 
-	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
-		res = list_entry (iter, struct dlm_lock_resource, recovering);
+	list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
 		if (res->owner == dead_node) {
 			list_del_init(&res->recovering);
 			spin_lock(&res->spinlock);
@@ -2099,7 +2077,7 @@ static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
 static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
 			       struct dlm_lock_resource *res, u8 dead_node)
 {
-	struct list_head *iter, *queue;
+	struct list_head *queue;
 	struct dlm_lock *lock;
 	int blank_lvb = 0, local = 0;
 	int i;
@@ -2121,8 +2099,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
 
 	for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
 		queue = dlm_list_idx_to_ptr(res, i);
-		list_for_each(iter, queue) {
-			lock = list_entry (iter, struct dlm_lock, list);
+		list_for_each_entry(lock, queue, list) {
 			if (lock->ml.node == search_node) {
 				if (dlm_lvb_needs_invalidation(lock, local)) {
 					/* zero the lksb lvb and lockres lvb */
@@ -2143,8 +2120,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
 static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 				struct dlm_lock_resource *res, u8 dead_node)
 {
-	struct list_head *iter, *tmpiter;
-	struct dlm_lock *lock;
+	struct dlm_lock *lock, *next;
 	unsigned int freed = 0;
 
 	/* this node is the lockres master:
@@ -2155,24 +2131,21 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 	assert_spin_locked(&res->spinlock);
 
 	/* TODO: check pending_asts, pending_basts here */
-	list_for_each_safe(iter, tmpiter, &res->granted) {
-		lock = list_entry (iter, struct dlm_lock, list);
+	list_for_each_entry_safe(lock, next, &res->granted, list) {
 		if (lock->ml.node == dead_node) {
 			list_del_init(&lock->list);
 			dlm_lock_put(lock);
 			freed++;
 		}
 	}
-	list_for_each_safe(iter, tmpiter, &res->converting) {
-		lock = list_entry (iter, struct dlm_lock, list);
+	list_for_each_entry_safe(lock, next, &res->converting, list) {
 		if (lock->ml.node == dead_node) {
 			list_del_init(&lock->list);
 			dlm_lock_put(lock);
 			freed++;
 		}
 	}
-	list_for_each_safe(iter, tmpiter, &res->blocked) {
-		lock = list_entry (iter, struct dlm_lock, list);
+	list_for_each_entry_safe(lock, next, &res->blocked, list) {
 		if (lock->ml.node == dead_node) {
 			list_del_init(&lock->list);
 			dlm_lock_put(lock);
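
One change here is not an iterator conversion: dlm_init_migratable_lockres() now uses clear_page() instead of memset(). The two are interchangeable only because mres really is one full, page-aligned page (the in-line comment says as much); clear_page() may take an arch-optimized page-clearing path but offers no partial or unaligned variant:

	/* equivalent only for a page-sized, page-aligned buffer */
	memset(mres, 0, PAGE_SIZE);	/* generic byte-wise clear */
	clear_page(mres);		/* arch-optimized, assumes alignment */
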
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index d1bd305ef0d7..f71250ed166f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -600,15 +600,13 @@ static inline int ocfs2_highest_compat_lock_level(int level)
 static void lockres_set_flags(struct ocfs2_lock_res *lockres,
 			      unsigned long newflags)
 {
-	struct list_head *pos, *tmp;
-	struct ocfs2_mask_waiter *mw;
+	struct ocfs2_mask_waiter *mw, *tmp;
 
 	assert_spin_locked(&lockres->l_lock);
 
 	lockres->l_flags = newflags;
 
-	list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
-		mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
+	list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) {
 		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
 			continue;
 
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index f226b2207628..ff257628af16 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -32,6 +32,11 @@ static inline void le32_add_cpu(__le32 *var, u32 val)
 	*var = cpu_to_le32(le32_to_cpu(*var) + val);
 }
 
+static inline void le64_add_cpu(__le64 *var, u64 val)
+{
+	*var = cpu_to_le64(le64_to_cpu(*var) + val);
+}
+
 static inline void le32_and_cpu(__le32 *var, u32 val)
 {
 	*var = cpu_to_le32(le32_to_cpu(*var) & val);
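
le64_add_cpu() mirrors the existing le32 helper exactly: decode from little-endian, add in CPU byte order, re-encode. Typical use is bumping a 64-bit on-disk field in place; i_size here is just an illustrative __le64 field:

	__le64 *field = &di->i_size;	/* any on-disk __le64 field */

	le64_add_cpu(field, bytes_added);	/* stays little-endian in memory */
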
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
index 5b77ee7866ef..e08bed9e45a0 100644
--- a/fs/ocfs2/export.h
+++ b/fs/ocfs2/export.h
@@ -26,6 +26,8 @@
 #ifndef OCFS2_EXPORT_H
 #define OCFS2_EXPORT_H
 
+#include <linux/exportfs.h>
+
 extern struct export_operations ocfs2_export_ops;
 
 #endif /* OCFS2_EXPORT_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index ba2b2ab1c6e4..03c1d365c78b 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -109,17 +109,14 @@ static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
  */
 void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
 {
-	struct list_head *p, *n;
-	struct ocfs2_extent_map_item *emi;
+	struct ocfs2_extent_map_item *emi, *n;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_extent_map *em = &oi->ip_extent_map;
 	LIST_HEAD(tmp_list);
 	unsigned int range;
 
 	spin_lock(&oi->ip_lock);
-	list_for_each_safe(p, n, &em->em_list) {
-		emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
-
+	list_for_each_entry_safe(emi, n, &em->em_list, ei_list) {
 		if (emi->ei_cpos >= cpos) {
 			/* Full truncate of this record. */
 			list_move(&emi->ei_list, &tmp_list);
@@ -136,8 +133,7 @@ void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
 	}
 	spin_unlock(&oi->ip_lock);
 
-	list_for_each_safe(p, n, &tmp_list) {
-		emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
+	list_for_each_entry_safe(emi, n, &tmp_list, ei_list) {
 		list_del(&emi->ei_list);
 		kfree(emi);
 	}
@@ -377,37 +373,6 @@ out:
 	return ret;
 }
 
-/*
- * Return the index of the extent record which contains cluster #v_cluster.
- * -1 is returned if it was not found.
- *
- * Should work fine on interior and exterior nodes.
- */
-static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
-				    u32 v_cluster)
-{
-	int ret = -1;
-	int i;
-	struct ocfs2_extent_rec *rec;
-	u32 rec_end, rec_start, clusters;
-
-	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
-		rec = &el->l_recs[i];
-
-		rec_start = le32_to_cpu(rec->e_cpos);
-		clusters = ocfs2_rec_clusters(el, rec);
-
-		rec_end = rec_start + clusters;
-
-		if (v_cluster >= rec_start && v_cluster < rec_end) {
-			ret = i;
-			break;
-		}
-	}
-
-	return ret;
-}
-
 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 		       u32 *p_cluster, u32 *num_clusters,
 		       unsigned int *extent_flags)
411int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 376int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
412 u32 *p_cluster, u32 *num_clusters, 377 u32 *p_cluster, u32 *num_clusters,
413 unsigned int *extent_flags) 378 unsigned int *extent_flags)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4979b6675717..f04c7aa834cb 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -263,6 +263,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	int status;
 	handle_t *handle;
 	struct ocfs2_dinode *di;
+	u64 cluster_bytes;
 
 	mlog_entry_void();
 
@@ -286,7 +287,9 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	/*
 	 * Do this before setting i_size.
 	 */
-	status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
+	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
+					       cluster_bytes);
 	if (status) {
 		mlog_errno(status);
 		goto out_commit;
@@ -326,9 +329,6 @@ static int ocfs2_truncate_file(struct inode *inode,
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     (unsigned long long)new_i_size);
 
-	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
-	truncate_inode_pages(inode->i_mapping, new_i_size);
-
 	fe = (struct ocfs2_dinode *) di_bh->b_data;
 	if (!OCFS2_IS_VALID_DINODE(fe)) {
 		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
@@ -363,16 +363,23 @@ static int ocfs2_truncate_file(struct inode *inode,
 	if (new_i_size == le64_to_cpu(fe->i_size))
 		goto bail;
 
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 	/* This forces other nodes to sync and drop their pages. Do
 	 * this even if we have a truncate without allocation change -
 	 * ocfs2 cluster sizes can be much greater than page size, so
 	 * we have to truncate them anyway.  */
 	status = ocfs2_data_lock(inode, 1);
 	if (status < 0) {
+		up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 		mlog_errno(status);
 		goto bail;
 	}
 
+	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+	truncate_inode_pages(inode->i_mapping, new_i_size);
+
 	/* alright, we're going to need to do a full blown alloc size
 	 * change. Orphan the inode so that recovery can complete the
 	 * truncate if necessary. This does the task of marking
@@ -399,6 +406,8 @@ static int ocfs2_truncate_file(struct inode *inode,
 bail_unlock_data:
 	ocfs2_data_unlock(inode, 1);
 
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 bail:
 
 	mlog_exit(status);
@@ -419,6 +428,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct inode *inode,
 			       u32 *logical_offset,
 			       u32 clusters_to_add,
+			       int mark_unwritten,
 			       struct buffer_head *fe_bh,
 			       handle_t *handle,
 			       struct ocfs2_alloc_context *data_ac,
@@ -431,9 +441,13 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
 	u32 bit_off, num_bits;
 	u64 block;
+	u8 flags = 0;
 
 	BUG_ON(!clusters_to_add);
 
+	if (mark_unwritten)
+		flags = OCFS2_EXT_UNWRITTEN;
+
 	free_extents = ocfs2_num_free_extents(osb, inode, fe);
 	if (free_extents < 0) {
 		status = free_extents;
@@ -483,7 +497,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
 				     *logical_offset, block, num_bits,
-				     meta_ac);
+				     flags, meta_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -516,25 +530,31 @@ leave:
  * For a given allocation, determine which allocators will need to be
  * accessed, and lock them, reserving the appropriate number of bits.
  *
- * Called from ocfs2_extend_allocation() for file systems which don't
- * support holes, and from ocfs2_write() for file systems which
- * understand sparse inodes.
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
  */
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
-			  u32 clusters_to_add,
+			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac)
 {
-	int ret, num_free_extents;
+	int ret = 0, num_free_extents;
+	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	*meta_ac = NULL;
-	*data_ac = NULL;
+	if (data_ac)
+		*data_ac = NULL;
+
+	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
 
 	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
-	     "clusters_to_add = %u\n",
+	     "clusters_to_add = %u, extents_to_split = %u\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
-	     le32_to_cpu(di->i_clusters), clusters_to_add);
+	     le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
 
 	num_free_extents = ocfs2_num_free_extents(osb, inode, di);
 	if (num_free_extents < 0) {
@@ -552,9 +572,12 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 	 *
 	 * Most of the time we'll only be seeing this 1 cluster at a time
 	 * anyway.
+	 *
+	 * Always lock for any unwritten extents - we might want to
+	 * add blocks during a split.
 	 */
 	if (!num_free_extents ||
-	    (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
 		ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
 		if (ret < 0) {
 			if (ret != -ENOSPC)
@@ -563,6 +586,9 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 		}
 	}
 
+	if (clusters_to_add == 0)
+		goto out;
+
 	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
@@ -585,14 +611,13 @@ out:
 	return ret;
 }
 
-static int ocfs2_extend_allocation(struct inode *inode,
-				   u32 clusters_to_add)
+static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+				     u32 clusters_to_add, int mark_unwritten)
 {
 	int status = 0;
 	int restart_func = 0;
-	int drop_alloc_sem = 0;
 	int credits;
-	u32 prev_clusters, logical_start;
+	u32 prev_clusters;
 	struct buffer_head *bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	handle_t *handle = NULL;
@@ -607,7 +632,7 @@ static int ocfs2_extend_allocation(struct inode *inode,
 	 * This function only exists for file systems which don't
 	 * support holes.
 	 */
-	BUG_ON(ocfs2_sparse_alloc(osb));
+	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 
 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
 				  OCFS2_BH_CACHED, inode);
@@ -623,19 +648,10 @@ static int ocfs2_extend_allocation(struct inode *inode,
 		goto leave;
 	}
 
-	logical_start = OCFS2_I(inode)->ip_clusters;
-
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	/* blocks peope in read/write from reading our allocation
-	 * until we're done changing it. We depend on i_mutex to block
-	 * other extend/truncate calls while we're here. Ordering wrt
-	 * start_trans is important here -- always do it before! */
-	down_write(&OCFS2_I(inode)->ip_alloc_sem);
-	drop_alloc_sem = 1;
-
-	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
+	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
 				       &meta_ac);
 	if (status) {
 		mlog_errno(status);
@@ -668,6 +684,7 @@ restarted_transaction:
 					   inode,
 					   &logical_start,
 					   clusters_to_add,
+					   mark_unwritten,
 					   bh,
 					   handle,
 					   data_ac,
@@ -720,10 +737,6 @@ restarted_transaction:
 	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
 
 leave:
-	if (drop_alloc_sem) {
-		up_write(&OCFS2_I(inode)->ip_alloc_sem);
-		drop_alloc_sem = 0;
-	}
 	if (handle) {
 		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
@@ -749,6 +762,25 @@ leave:
 	return status;
 }
 
+static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+				   u32 clusters_to_add, int mark_unwritten)
+{
+	int ret;
+
+	/*
+	 * The alloc sem blocks peope in read/write from reading our
+	 * allocation until we're done changing it. We depend on
+	 * i_mutex to block other extend/truncate calls while we're
+	 * here.
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add,
+					mark_unwritten);
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	return ret;
+}
+
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
  * worry about recursive locking in ->prepare_write() and
@@ -890,7 +922,9 @@ static int ocfs2_extend_file(struct inode *inode,
 	}
 
 	if (clusters_to_add) {
-		ret = ocfs2_extend_allocation(inode, clusters_to_add);
+		ret = ocfs2_extend_allocation(inode,
+					      OCFS2_I(inode)->ip_clusters,
+					      clusters_to_add, 0);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out_unlock;
@@ -995,6 +1029,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		goto bail_unlock;
 	}
 
+	/*
+	 * This will intentionally not wind up calling vmtruncate(),
+	 * since all the work for a size change has been done above.
+	 * Otherwise, we could get into problems with truncate as
+	 * ip_alloc_sem is used there to protect against i_size
+	 * changes.
+	 */
 	status = inode_setattr(inode, attr);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1070,17 +1111,16 @@ out:
 	return ret;
 }
 
-static int ocfs2_write_remove_suid(struct inode *inode)
+static int __ocfs2_write_remove_suid(struct inode *inode,
+				     struct buffer_head *bh)
 {
 	int ret;
-	struct buffer_head *bh = NULL;
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	handle_t *handle;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_dinode *di;
 
 	mlog_entry("(Inode %llu, mode 0%o)\n",
-		   (unsigned long long)oi->ip_blkno, inode->i_mode);
+		   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 	if (handle == NULL) {
@@ -1089,17 +1129,11 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 		goto out;
 	}
 
-	ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_trans;
-	}
-
 	ret = ocfs2_journal_access(handle, inode, bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
-		goto out_bh;
+		goto out_trans;
 	}
 
 	inode->i_mode &= ~S_ISUID;
@@ -1112,8 +1146,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 	ret = ocfs2_journal_dirty(handle, bh);
 	if (ret < 0)
 		mlog_errno(ret);
-out_bh:
-	brelse(bh);
+
out_trans:
 	ocfs2_commit_trans(osb, handle);
 out:
@@ -1159,6 +1192,460 @@ out:
 	return ret;
 }
 
+static int ocfs2_write_remove_suid(struct inode *inode)
+{
+	int ret;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+			       oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = __ocfs2_write_remove_suid(inode, bh);
+out:
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * Allocate enough extents to cover the region starting at byte offset
+ * start for len bytes. Existing extents are skipped, any extents
+ * added are marked as "unwritten".
+ */
+static int ocfs2_allocate_unwritten_extents(struct inode *inode,
+					    u64 start, u64 len)
+{
+	int ret;
+	u32 cpos, phys_cpos, clusters, alloc_size;
+
+	/*
+	 * We consider both start and len to be inclusive.
+	 */
+	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
+	clusters -= cpos;
+
+	while (clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
+					 &alloc_size, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * Hole or existing extent len can be arbitrary, so
+		 * cap it to our own allocation request.
+		 */
+		if (alloc_size > clusters)
+			alloc_size = clusters;
+
+		if (phys_cpos) {
+			/*
+			 * We already have an allocation at this
+			 * region so we can safely skip it.
+			 */
+			goto next;
+		}
+
+		ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
+		if (ret) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+next:
+		cpos += alloc_size;
+		clusters -= alloc_size;
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static int __ocfs2_remove_inode_range(struct inode *inode,
+				      struct buffer_head *di_bh,
+				      u32 cpos, u32 phys_cpos, u32 len,
+				      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (handle == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
+				  dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	OCFS2_I(inode)->ip_clusters -= len;
+	di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+/*
+ * Truncate a byte range, avoiding pages within partial clusters. This
+ * preserves those pages for the zeroing code to write to.
+ */
+static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
+					 u64 byte_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	loff_t start, end;
+	struct address_space *mapping = inode->i_mapping;
+
+	start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
+	end = byte_start + byte_len;
+	end = end & ~(osb->s_clustersize - 1);
+
+	if (start < end) {
+		unmap_mapping_range(mapping, start, end - start, 0);
+		truncate_inode_pages_range(mapping, start, end - 1);
+	}
+}
+
+static int ocfs2_zero_partial_clusters(struct inode *inode,
+				       u64 start, u64 len)
+{
+	int ret = 0;
+	u64 tmpend, end = start + len;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned int csize = osb->s_clustersize;
+	handle_t *handle;
1375
1376 /*
1377 * The "start" and "end" values are NOT necessarily part of
1378 * the range whose allocation is being deleted. Rather, this
1379 * is what the user passed in with the request. We must zero
1380 * partial clusters here. There's no need to worry about
1381 * physical allocation - the zeroing code knows to skip holes.
1382 */
1383 mlog(0, "byte start: %llu, end: %llu\n",
1384 (unsigned long long)start, (unsigned long long)end);
1385
1386 /*
1387 * If both edges are on a cluster boundary then there's no
1388 * zeroing required as the region is part of the allocation to
1389 * be truncated.
1390 */
1391 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1392 goto out;
1393
1394 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1395 if (handle == NULL) {
1396 ret = -ENOMEM;
1397 mlog_errno(ret);
1398 goto out;
1399 }
1400
1401 /*
1402 * We want to get the byte offset of the end of the 1st cluster.
1403 */
1404 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1405 if (tmpend > end)
1406 tmpend = end;
1407
1408 mlog(0, "1st range: start: %llu, tmpend: %llu\n",
1409 (unsigned long long)start, (unsigned long long)tmpend);
1410
1411 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1412 if (ret)
1413 mlog_errno(ret);
1414
1415 if (tmpend < end) {
1416 /*
1417 * This may make start and end equal, but the zeroing
1418 * code will skip any work in that case so there's no
1419 * need to catch it up here.
1420 */
1421 start = end & ~(osb->s_clustersize - 1);
1422
1423 mlog(0, "2nd range: start: %llu, end: %llu\n",
1424 (unsigned long long)start, (unsigned long long)end);
1425
1426 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1427 if (ret)
1428 mlog_errno(ret);
1429 }
1430
1431 ocfs2_commit_trans(osb, handle);
1432out:
1433 return ret;
1434}
1435
1436static int ocfs2_remove_inode_range(struct inode *inode,
1437 struct buffer_head *di_bh, u64 byte_start,
1438 u64 byte_len)
1439{
1440 int ret = 0;
1441 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
1442 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1443 struct ocfs2_cached_dealloc_ctxt dealloc;
1444
1445 ocfs2_init_dealloc_ctxt(&dealloc);
1446
1447 if (byte_len == 0)
1448 return 0;
1449
1450 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1451 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
1452 if (trunc_len >= trunc_start)
1453 trunc_len -= trunc_start;
1454 else
1455 trunc_len = 0;
1456
1457 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
1458 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1459 (unsigned long long)byte_start,
1460 (unsigned long long)byte_len, trunc_start, trunc_len);
1461
1462 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1463 if (ret) {
1464 mlog_errno(ret);
1465 goto out;
1466 }
1467
1468 cpos = trunc_start;
1469 while (trunc_len) {
1470 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1471 &alloc_size, NULL);
1472 if (ret) {
1473 mlog_errno(ret);
1474 goto out;
1475 }
1476
1477 if (alloc_size > trunc_len)
1478 alloc_size = trunc_len;
1479
1480 /* Only do work for non-holes */
1481 if (phys_cpos != 0) {
1482 ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
1483 phys_cpos, alloc_size,
1484 &dealloc);
1485 if (ret) {
1486 mlog_errno(ret);
1487 goto out;
1488 }
1489 }
1490
1491 cpos += alloc_size;
1492 trunc_len -= alloc_size;
1493 }
1494
1495 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1496
1497out:
1498 ocfs2_schedule_truncate_log_flush(osb, 1);
1499 ocfs2_run_deallocs(osb, &dealloc);
1500
1501 return ret;
1502}
1503
1504/*
1505 * Parts of this function taken from xfs_change_file_space()
1506 */
1507int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1508 struct ocfs2_space_resv *sr)
1509{
1510 int ret;
1511 s64 llen;
1512 struct inode *inode = file->f_path.dentry->d_inode;
1513 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1514 struct buffer_head *di_bh = NULL;
1515 handle_t *handle;
1516 unsigned long long max_off = ocfs2_max_file_offset(inode->i_sb->s_blocksize_bits);
1517
1518 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1519 !ocfs2_writes_unwritten_extents(osb))
1520 return -ENOTTY;
1521 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1522 !ocfs2_sparse_alloc(osb))
1523 return -ENOTTY;
1524
1525 if (!S_ISREG(inode->i_mode))
1526 return -EINVAL;
1527
1528 if (!(file->f_mode & FMODE_WRITE))
1529 return -EBADF;
1530
1531 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1532 return -EROFS;
1533
1534 mutex_lock(&inode->i_mutex);
1535
1536 /*
1537 * This prevents concurrent writes on other nodes
1538 */
1539 ret = ocfs2_rw_lock(inode, 1);
1540 if (ret) {
1541 mlog_errno(ret);
1542 goto out;
1543 }
1544
1545 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1546 if (ret) {
1547 mlog_errno(ret);
1548 goto out_rw_unlock;
1549 }
1550
1551 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1552 ret = -EPERM;
1553 goto out_meta_unlock;
1554 }
1555
1556 switch (sr->l_whence) {
1557 case 0: /*SEEK_SET*/
1558 break;
1559 case 1: /*SEEK_CUR*/
1560 sr->l_start += file->f_pos;
1561 break;
1562 case 2: /*SEEK_END*/
1563 sr->l_start += i_size_read(inode);
1564 break;
1565 default:
1566 ret = -EINVAL;
1567 goto out_meta_unlock;
1568 }
1569 sr->l_whence = 0;
1570
1571 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1572
1573 if (sr->l_start < 0
1574 || sr->l_start > max_off
1575 || (sr->l_start + llen) < 0
1576 || (sr->l_start + llen) > max_off) {
1577 ret = -EINVAL;
1578 goto out_meta_unlock;
1579 }
1580
1581 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1582 if (sr->l_len <= 0) {
1583 ret = -EINVAL;
1584 goto out_meta_unlock;
1585 }
1586 }
1587
1588 if (should_remove_suid(file->f_path.dentry)) {
1589 ret = __ocfs2_write_remove_suid(inode, di_bh);
1590 if (ret) {
1591 mlog_errno(ret);
1592 goto out_meta_unlock;
1593 }
1594 }
1595
1596 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1597 switch (cmd) {
1598 case OCFS2_IOC_RESVSP:
1599 case OCFS2_IOC_RESVSP64:
1600 /*
1601 * This takes unsigned offsets, but the signed ones we
1602 * pass have been checked against overflow above.
1603 */
1604 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
1605 sr->l_len);
1606 break;
1607 case OCFS2_IOC_UNRESVSP:
1608 case OCFS2_IOC_UNRESVSP64:
1609 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
1610 sr->l_len);
1611 break;
1612 default:
1613 ret = -EINVAL;
1614 }
1615 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1616 if (ret) {
1617 mlog_errno(ret);
1618 goto out_meta_unlock;
1619 }
1620
1621 /*
1622 * We update c/mtime for these changes
1623 */
1624 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1625 if (IS_ERR(handle)) {
1626 ret = PTR_ERR(handle);
1627 mlog_errno(ret);
1628 goto out_meta_unlock;
1629 }
1630
1631 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1632 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
1633 if (ret < 0)
1634 mlog_errno(ret);
1635
1636 ocfs2_commit_trans(osb, handle);
1637
1638out_meta_unlock:
1639 brelse(di_bh);
1640 ocfs2_meta_unlock(inode, 1);
1641out_rw_unlock:
1642 ocfs2_rw_unlock(inode, 1);
1643
1644 mutex_unlock(&inode->i_mutex);
1645out:
1646 return ret;
1647}
1648
1162static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1649static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1163 loff_t *ppos, 1650 loff_t *ppos,
1164 size_t count, 1651 size_t count,
@@ -1329,15 +1816,16 @@ ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1329 *basep = base; 1816 *basep = base;
1330} 1817}
1331 1818
1332static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, 1819static struct page * ocfs2_get_write_source(char **ret_src_buf,
1333 const struct iovec *cur_iov, 1820 const struct iovec *cur_iov,
1334 size_t iov_offset) 1821 size_t iov_offset)
1335{ 1822{
1336 int ret; 1823 int ret;
1337 char *buf; 1824 char *buf = cur_iov->iov_base + iov_offset;
1338 struct page *src_page = NULL; 1825 struct page *src_page = NULL;
1826 unsigned long off;
1339 1827
1340 buf = cur_iov->iov_base + iov_offset; 1828 off = (unsigned long)(buf) & ~PAGE_CACHE_MASK;
1341 1829
1342 if (!segment_eq(get_fs(), KERNEL_DS)) { 1830 if (!segment_eq(get_fs(), KERNEL_DS)) {
1343 /* 1831 /*
@@ -1349,18 +1837,17 @@ static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp
1349 (unsigned long)buf & PAGE_CACHE_MASK, 1, 1837 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1350 0, 0, &src_page, NULL); 1838 0, 0, &src_page, NULL);
1351 if (ret == 1) 1839 if (ret == 1)
1352 bp->b_src_buf = kmap(src_page); 1840 *ret_src_buf = kmap(src_page) + off;
1353 else 1841 else
1354 src_page = ERR_PTR(-EFAULT); 1842 src_page = ERR_PTR(-EFAULT);
1355 } else { 1843 } else {
1356 bp->b_src_buf = buf; 1844 *ret_src_buf = buf;
1357 } 1845 }
1358 1846
1359 return src_page; 1847 return src_page;
1360} 1848}
1361 1849
1362static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, 1850static void ocfs2_put_write_source(struct page *page)
1363 struct page *page)
1364{ 1851{
1365 if (page) { 1852 if (page) {
1366 kunmap(page); 1853 kunmap(page);
@@ -1376,10 +1863,12 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1376{ 1863{
1377 int ret = 0; 1864 int ret = 0;
1378 ssize_t copied, total = 0; 1865 ssize_t copied, total = 0;
1379 size_t iov_offset = 0; 1866 size_t iov_offset = 0, bytes;
1867 loff_t pos;
1380 const struct iovec *cur_iov = iov; 1868 const struct iovec *cur_iov = iov;
1381 struct ocfs2_buffered_write_priv bp; 1869 struct page *user_page, *page;
1382 struct page *page; 1870 char *buf, *dst;
1871 void *fsdata;
1383 1872
1384 /* 1873 /*
1385 * handle partial DIO write. Adjust cur_iov if needed. 1874 * handle partial DIO write. Adjust cur_iov if needed.
@@ -1387,21 +1876,38 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1387 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); 1876 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1388 1877
1389 do { 1878 do {
1390 bp.b_cur_off = iov_offset; 1879 pos = *ppos;
1391 bp.b_cur_iov = cur_iov;
1392 1880
1393 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); 1881 user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset);
1394 if (IS_ERR(page)) { 1882 if (IS_ERR(user_page)) {
1395 ret = PTR_ERR(page); 1883 ret = PTR_ERR(user_page);
1396 goto out; 1884 goto out;
1397 } 1885 }
1398 1886
1399 copied = ocfs2_buffered_write_cluster(file, *ppos, count, 1887 /* Stay within our page boundaries */
1400 ocfs2_map_and_write_user_data, 1888 bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)),
1401 &bp); 1889 (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK)));
1890 /* Stay within the vector boundary */
1891 bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset);
1892 /* Stay within count */
1893 bytes = min(bytes, count);
1894
1895 page = NULL;
1896 ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0,
1897 &page, &fsdata);
1898 if (ret) {
1899 mlog_errno(ret);
1900 goto out;
1901 }
1402 1902
1403 ocfs2_put_write_source(&bp, page); 1903 dst = kmap_atomic(page, KM_USER0);
1904 memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes);
1905 kunmap_atomic(dst, KM_USER0);
1906 flush_dcache_page(page);
1907 ocfs2_put_write_source(user_page);
1404 1908
1909 copied = ocfs2_write_end(file, file->f_mapping, pos, bytes,
1910 bytes, page, fsdata);
1405 if (copied < 0) { 1911 if (copied < 0) {
1406 mlog_errno(copied); 1912 mlog_errno(copied);
1407 ret = copied; 1913 ret = copied;
@@ -1409,7 +1915,7 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1409 } 1915 }
1410 1916
1411 total += copied; 1917 total += copied;
1412 *ppos = *ppos + copied; 1918 *ppos = pos + copied;
1413 count -= copied; 1919 count -= copied;
1414 1920
1415 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); 1921 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
@@ -1579,52 +2085,46 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1579 struct pipe_buffer *buf, 2085 struct pipe_buffer *buf,
1580 struct splice_desc *sd) 2086 struct splice_desc *sd)
1581{ 2087{
1582 int ret, count, total = 0; 2088 int ret, count;
1583 ssize_t copied = 0; 2089 ssize_t copied = 0;
1584 struct ocfs2_splice_write_priv sp; 2090 struct file *file = sd->u.file;
2091 unsigned int offset;
2092 struct page *page = NULL;
2093 void *fsdata;
2094 char *src, *dst;
1585 2095
1586 ret = buf->ops->confirm(pipe, buf); 2096 ret = buf->ops->confirm(pipe, buf);
1587 if (ret) 2097 if (ret)
1588 goto out; 2098 goto out;
1589 2099
1590 sp.s_sd = sd; 2100 offset = sd->pos & ~PAGE_CACHE_MASK;
1591 sp.s_buf = buf;
1592 sp.s_pipe = pipe;
1593 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1594 sp.s_buf_offset = buf->offset;
1595
1596 count = sd->len; 2101 count = sd->len;
1597 if (count + sp.s_offset > PAGE_CACHE_SIZE) 2102 if (count + offset > PAGE_CACHE_SIZE)
1598 count = PAGE_CACHE_SIZE - sp.s_offset; 2103 count = PAGE_CACHE_SIZE - offset;
1599 2104
1600 do { 2105 ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0,
1601 /* 2106 &page, &fsdata);
1602 * splice wants us to copy up to one page at a 2107 if (ret) {
1603 * time. For pagesize > cluster size, this means we 2108 mlog_errno(ret);
1604 * might enter ocfs2_buffered_write_cluster() more 2109 goto out;
1605 * than once, so keep track of our progress here. 2110 }
1606 */
1607 copied = ocfs2_buffered_write_cluster(sd->u.file,
1608 (loff_t)sd->pos + total,
1609 count,
1610 ocfs2_map_and_write_splice_data,
1611 &sp);
1612 if (copied < 0) {
1613 mlog_errno(copied);
1614 ret = copied;
1615 goto out;
1616 }
1617 2111
1618 count -= copied; 2112 src = buf->ops->map(pipe, buf, 1);
1619 sp.s_offset += copied; 2113 dst = kmap_atomic(page, KM_USER1);
1620 sp.s_buf_offset += copied; 2114 memcpy(dst + offset, src + buf->offset, count);
1621 total += copied; 2115 kunmap_atomic(page, KM_USER1);
1622 } while (count); 2116 buf->ops->unmap(pipe, buf, src);
1623 2117
1624 ret = 0; 2118 copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count,
2119 page, fsdata);
2120 if (copied < 0) {
2121 mlog_errno(copied);
2122 ret = copied;
2123 goto out;
2124 }
1625out: 2125out:
1626 2126
1627 return total ? total : ret; 2127 return copied ? copied : ret;
1628} 2128}
1629 2129
1630static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2130static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
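A note on the fs/ocfs2/file.c changes above. The UNRESVSP hole-punch path splits the work in two: ocfs2_zero_partial_clusters() zeroes the partial clusters at either edge of the request, then ocfs2_remove_inode_range() deallocates only the whole clusters in between. Worked example, assuming a 64KB cluster size: punching [10KB, 200KB) zeroes [10KB, 64KB) and [192KB, 200KB), and frees clusters 1 and 2, i.e. [64KB, 192KB).

The rewritten buffered-write loop also deserves a second look: it clamps each copy three ways before calling ocfs2_write_begin(). A minimal user-space rendering of that computation (the PAGE_SIZE constant and min_sz() helper are ours, standing in for PAGE_CACHE_SIZE and the kernel's min()/min_t()):

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL	/* stand-in for PAGE_CACHE_SIZE */

static size_t min_sz(size_t a, size_t b)
{
	return a < b ? a : b;
}

/* Never cross the destination page in the file, never cross the page
 * backing the user buffer, and never exceed the current iovec or the
 * remaining count. */
static size_t write_chunk_bytes(uint64_t pos, const void *buf,
				size_t iov_remaining, size_t count)
{
	size_t bytes = min_sz(PAGE_SIZE - (pos & (PAGE_SIZE - 1)),
			      PAGE_SIZE - ((uintptr_t)buf & (PAGE_SIZE - 1)));

	bytes = min_sz(bytes, iov_remaining);
	return min_sz(bytes, count);
}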
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index a4dd1fa1822b..36fe27f268ee 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,15 +39,16 @@ enum ocfs2_alloc_restarted {
39}; 39};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode, 41 struct inode *inode,
42 u32 *cluster_start, 42 u32 *logical_offset,
43 u32 clusters_to_add, 43 u32 clusters_to_add,
44 int mark_unwritten,
44 struct buffer_head *fe_bh, 45 struct buffer_head *fe_bh,
45 handle_t *handle, 46 handle_t *handle,
46 struct ocfs2_alloc_context *data_ac, 47 struct ocfs2_alloc_context *data_ac,
47 struct ocfs2_alloc_context *meta_ac, 48 struct ocfs2_alloc_context *meta_ac,
48 enum ocfs2_alloc_restarted *reason); 49 enum ocfs2_alloc_restarted *reason_ret);
49int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 50int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
50 u32 clusters_to_add, 51 u32 clusters_to_add, u32 extents_to_split,
51 struct ocfs2_alloc_context **data_ac, 52 struct ocfs2_alloc_context **data_ac,
52 struct ocfs2_alloc_context **meta_ac); 53 struct ocfs2_alloc_context **meta_ac);
53int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 54int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
@@ -61,4 +62,7 @@ int ocfs2_should_update_atime(struct inode *inode,
61int ocfs2_update_inode_atime(struct inode *inode, 62int ocfs2_update_inode_atime(struct inode *inode,
62 struct buffer_head *bh); 63 struct buffer_head *bh);
63 64
65int ocfs2_change_file_space(struct file *file, unsigned int cmd,
66 struct ocfs2_space_resv *sr);
67
64#endif /* OCFS2_FILE_H */ 68#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index b25ef63781ba..352eb4a13f98 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -157,16 +157,16 @@ int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
157 if (ocfs2_mount_local(osb)) 157 if (ocfs2_mount_local(osb))
158 return 0; 158 return 0;
159 159
160 status = o2hb_register_callback(&osb->osb_hb_down); 160 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
161 if (status < 0) { 161 if (status < 0) {
162 mlog_errno(status); 162 mlog_errno(status);
163 goto bail; 163 goto bail;
164 } 164 }
165 165
166 status = o2hb_register_callback(&osb->osb_hb_up); 166 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
167 if (status < 0) { 167 if (status < 0) {
168 mlog_errno(status); 168 mlog_errno(status);
169 o2hb_unregister_callback(&osb->osb_hb_down); 169 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
170 } 170 }
171 171
172bail: 172bail:
@@ -178,8 +178,8 @@ void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
178 if (ocfs2_mount_local(osb)) 178 if (ocfs2_mount_local(osb))
179 return; 179 return;
180 180
181 o2hb_unregister_callback(&osb->osb_hb_down); 181 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
182 o2hb_unregister_callback(&osb->osb_hb_up); 182 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
183} 183}
184 184
185void ocfs2_stop_heartbeat(struct ocfs2_super *osb) 185void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
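The heartbeat change is mechanical but worth spelling out: o2hb_register_callback() and o2hb_unregister_callback() now take the mount's uuid_str, scoping each callback to that heartbeat region rather than the global heartbeat, which is why the error path and ocfs2_clear_hb_callbacks() must name the region when unregistering too.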
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index f3ad21ad9aed..bd68c3f2afbe 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -14,6 +14,7 @@
14#include "ocfs2.h" 14#include "ocfs2.h"
15#include "alloc.h" 15#include "alloc.h"
16#include "dlmglue.h" 16#include "dlmglue.h"
17#include "file.h"
17#include "inode.h" 18#include "inode.h"
18#include "journal.h" 19#include "journal.h"
19 20
@@ -115,6 +116,7 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
115{ 116{
116 unsigned int flags; 117 unsigned int flags;
117 int status; 118 int status;
119 struct ocfs2_space_resv sr;
118 120
119 switch (cmd) { 121 switch (cmd) {
120 case OCFS2_IOC_GETFLAGS: 122 case OCFS2_IOC_GETFLAGS:
@@ -130,6 +132,14 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
130 132
131 return ocfs2_set_inode_attr(inode, flags, 133 return ocfs2_set_inode_attr(inode, flags,
132 OCFS2_FL_MODIFIABLE); 134 OCFS2_FL_MODIFIABLE);
135 case OCFS2_IOC_RESVSP:
136 case OCFS2_IOC_RESVSP64:
137 case OCFS2_IOC_UNRESVSP:
138 case OCFS2_IOC_UNRESVSP64:
139 if (copy_from_user(&sr, (int __user *) arg, sizeof(sr)))
140 return -EFAULT;
141
142 return ocfs2_change_file_space(filp, cmd, &sr);
133 default: 143 default:
134 return -ENOTTY; 144 return -ENOTTY;
135 } 145 }
@@ -148,6 +158,11 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
148 case OCFS2_IOC32_SETFLAGS: 158 case OCFS2_IOC32_SETFLAGS:
149 cmd = OCFS2_IOC_SETFLAGS; 159 cmd = OCFS2_IOC_SETFLAGS;
150 break; 160 break;
161 case OCFS2_IOC_RESVSP:
162 case OCFS2_IOC_RESVSP64:
163 case OCFS2_IOC_UNRESVSP:
164 case OCFS2_IOC_UNRESVSP64:
165 break;
151 default: 166 default:
152 return -ENOIOCTLCMD; 167 return -ENOIOCTLCMD;
153 } 168 }
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index dc1188081720..dbfb20bb27ea 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -722,8 +722,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
722 container_of(work, struct ocfs2_journal, j_recovery_work); 722 container_of(work, struct ocfs2_journal, j_recovery_work);
723 struct ocfs2_super *osb = journal->j_osb; 723 struct ocfs2_super *osb = journal->j_osb;
724 struct ocfs2_dinode *la_dinode, *tl_dinode; 724 struct ocfs2_dinode *la_dinode, *tl_dinode;
725 struct ocfs2_la_recovery_item *item; 725 struct ocfs2_la_recovery_item *item, *n;
726 struct list_head *p, *n;
727 LIST_HEAD(tmp_la_list); 726 LIST_HEAD(tmp_la_list);
728 727
729 mlog_entry_void(); 728 mlog_entry_void();
@@ -734,8 +733,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
734 list_splice_init(&journal->j_la_cleanups, &tmp_la_list); 733 list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
735 spin_unlock(&journal->j_lock); 734 spin_unlock(&journal->j_lock);
736 735
737 list_for_each_safe(p, n, &tmp_la_list) { 736 list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
738 item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
739 list_del_init(&item->lri_list); 737 list_del_init(&item->lri_list);
740 738
741 mlog(0, "Complete recovery for slot %d\n", item->lri_slot); 739 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
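The journal.c hunk is a straight conversion from list_for_each_safe() plus a manual list_entry() to list_for_each_entry_safe(). A minimal sketch of the idiom (the struct body and the kfree() are illustrative):

#include <linux/list.h>
#include <linux/slab.h>

struct la_item {
	struct list_head lri_list;
	int lri_slot;
};

/* The _safe variant caches the next entry in 'n', so the current
 * entry may be unlinked and freed while the walk continues. */
static void drain_list(struct list_head *head)
{
	struct la_item *item, *n;

	list_for_each_entry_safe(item, n, head, lri_list) {
		list_del_init(&item->lri_list);
		kfree(item);
	}
}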
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3db5de4506da..ce60aab013aa 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -289,6 +289,8 @@ int ocfs2_journal_dirty_data(handle_t *handle,
289#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ 289#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
290 + OCFS2_TRUNCATE_LOG_UPDATE) 290 + OCFS2_TRUNCATE_LOG_UPDATE)
291 291
292#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
293
292/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 294/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
293 * bitmap block for the new bit) */ 295 * bitmap block for the new bit) */
294#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 296#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
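The new OCFS2_REMOVE_EXTENT_CREDITS value follows from what __ocfs2_remove_inode_range() dirties in a single transaction: the truncate log when the freed extent is queued (OCFS2_TRUNCATE_LOG_UPDATE) plus the inode's dinode block for the i_clusters update (OCFS2_INODE_UPDATE_CREDITS).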
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af01158b39f5..d79aa12137d2 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -37,11 +37,29 @@
37 37
38#include "ocfs2.h" 38#include "ocfs2.h"
39 39
40#include "aops.h"
40#include "dlmglue.h" 41#include "dlmglue.h"
41#include "file.h" 42#include "file.h"
42#include "inode.h" 43#include "inode.h"
43#include "mmap.h" 44#include "mmap.h"
44 45
46static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
47{
48 /* The best way to deal with signals in the vm path is
49 * to block them upfront, rather than allowing the
50 * locking paths to return -ERESTARTSYS. */
51 sigfillset(blocked);
52
53 /* We should technically never get a bad return value
54 * from sigprocmask */
55 return sigprocmask(SIG_BLOCK, blocked, oldset);
56}
57
58static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
59{
60 return sigprocmask(SIG_SETMASK, oldset, NULL);
61}
62
45static struct page *ocfs2_nopage(struct vm_area_struct * area, 63static struct page *ocfs2_nopage(struct vm_area_struct * area,
46 unsigned long address, 64 unsigned long address,
47 int *type) 65 int *type)
@@ -53,14 +71,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
53 mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, 71 mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
54 type); 72 type);
55 73
56 /* The best way to deal with signals in this path is 74 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
57 * to block them upfront, rather than allowing the
58 * locking paths to return -ERESTARTSYS. */
59 sigfillset(&blocked);
60
61 /* We should technically never get a bad ret return
62 * from sigprocmask */
63 ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
64 if (ret < 0) { 75 if (ret < 0) {
65 mlog_errno(ret); 76 mlog_errno(ret);
66 goto out; 77 goto out;
@@ -68,7 +79,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
68 79
69 page = filemap_nopage(area, address, type); 80 page = filemap_nopage(area, address, type);
70 81
71 ret = sigprocmask(SIG_SETMASK, &oldset, NULL); 82 ret = ocfs2_vm_op_unblock_sigs(&oldset);
72 if (ret < 0) 83 if (ret < 0)
73 mlog_errno(ret); 84 mlog_errno(ret);
74out: 85out:
@@ -76,28 +87,136 @@ out:
76 return page; 87 return page;
77} 88}
78 89
79static struct vm_operations_struct ocfs2_file_vm_ops = { 90static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
80 .nopage = ocfs2_nopage, 91 struct page *page)
81}; 92{
93 int ret;
94 struct address_space *mapping = inode->i_mapping;
95 loff_t pos = page->index << PAGE_CACHE_SHIFT;
96 unsigned int len = PAGE_CACHE_SIZE;
97 pgoff_t last_index;
98 struct page *locked_page = NULL;
99 void *fsdata;
100 loff_t size = i_size_read(inode);
82 101
83int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) 102 /*
103 * Another node might have truncated while we were waiting on
104 * cluster locks.
105 */
106 last_index = size >> PAGE_CACHE_SHIFT;
107 if (page->index > last_index) {
108 ret = -EINVAL;
109 goto out;
110 }
111
112 /*
113 * The i_size check above doesn't catch the case where nodes
114 * truncated and then re-extended the file. We'll re-check the
115 * page mapping after taking the page lock inside of
116 * ocfs2_write_begin_nolock().
117 */
118 if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
119 ret = -EINVAL;
120 goto out;
121 }
122
123 /*
124 * Call ocfs2_write_begin() and ocfs2_write_end() to take
125 * advantage of the allocation code there. We pass a write
126 * length of the whole page (chopped to i_size) to make sure
127 * the whole thing is allocated.
128 *
129 * Since we know the page is up to date, we don't have to
130 * worry about ocfs2_write_begin() skipping some buffer reads
131 * because the "write" would invalidate their data.
132 */
133 if (page->index == last_index)
134 len = size & ~PAGE_CACHE_MASK;
135
136 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
137 &fsdata, di_bh, page);
138 if (ret) {
139 if (ret != -ENOSPC)
140 mlog_errno(ret);
141 goto out;
142 }
143
144 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
145 fsdata);
146 if (ret < 0) {
147 mlog_errno(ret);
148 goto out;
149 }
150 BUG_ON(ret != len);
151 ret = 0;
152out:
153 return ret;
154}
155
156static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
84{ 157{
85 int ret = 0, lock_level = 0; 158 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); 159 struct buffer_head *di_bh = NULL;
160 sigset_t blocked, oldset;
161 int ret, ret2;
162
163 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
164 if (ret < 0) {
165 mlog_errno(ret);
166 return ret;
167 }
168
169 /*
170 * The cluster locks taken will block a truncate from another
171 * node. Taking the data lock will also ensure that we don't
172 * attempt page truncation as part of a downconvert.
173 */
174 ret = ocfs2_meta_lock(inode, &di_bh, 1);
175 if (ret < 0) {
176 mlog_errno(ret);
177 goto out;
178 }
87 179
88 /* 180 /*
89 * Only support shared writeable mmap for local mounts which 181 * The alloc sem should be enough to serialize with
90 * don't know about holes. 182 * ocfs2_truncate_file() changing i_size as well as any thread
183 * modifying the inode btree.
91 */ 184 */
92 if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && 185 down_write(&OCFS2_I(inode)->ip_alloc_sem);
93 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && 186
94 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { 187 ret = ocfs2_data_lock(inode, 1);
95 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); 188 if (ret < 0) {
96 /* This is -EINVAL because generic_file_readonly_mmap 189 mlog_errno(ret);
97 * returns it in a similar situation. */ 190 goto out_meta_unlock;
98 return -EINVAL;
99 } 191 }
100 192
193 ret = __ocfs2_page_mkwrite(inode, di_bh, page);
194
195 ocfs2_data_unlock(inode, 1);
196
197out_meta_unlock:
198 up_write(&OCFS2_I(inode)->ip_alloc_sem);
199
200 brelse(di_bh);
201 ocfs2_meta_unlock(inode, 1);
202
203out:
204 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
205 if (ret2 < 0)
206 mlog_errno(ret2);
207
208 return ret;
209}
210
211static struct vm_operations_struct ocfs2_file_vm_ops = {
212 .nopage = ocfs2_nopage,
213 .page_mkwrite = ocfs2_page_mkwrite,
214};
215
216int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
217{
218 int ret = 0, lock_level = 0;
219
101 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, 220 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
102 file->f_vfsmnt, &lock_level); 221 file->f_vfsmnt, &lock_level);
103 if (ret < 0) { 222 if (ret < 0) {
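->page_mkwrite is driven entirely by write faults on shared mappings, which is also why ocfs2_mmap() can drop its old rejection of shared writable mappings: allocation and cluster locking now happen at fault time. A user-space sketch of what reaches ocfs2_page_mkwrite() (the file path is illustrative):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ocfs2/file", O_RDWR);
	char *p;

	if (fd < 0)
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	p[0] = 'x';		/* first store faults -> ocfs2_page_mkwrite() */
	msync(p, 4096, MS_SYNC);
	munmap(p, 4096);
	return close(fd);
}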
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 36289e6295ce..d430fdab16e9 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1674,7 +1674,7 @@ static int ocfs2_symlink(struct inode *dir,
1674 u32 offset = 0; 1674 u32 offset = 0;
1675 1675
1676 inode->i_op = &ocfs2_symlink_inode_operations; 1676 inode->i_op = &ocfs2_symlink_inode_operations;
1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
1678 new_fe_bh, 1678 new_fe_bh,
1679 handle, data_ac, NULL, 1679 handle, data_ac, NULL,
1680 NULL); 1680 NULL);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a860633e833f..5cc90a40b3c5 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -219,6 +219,7 @@ struct ocfs2_super
219 u16 max_slots; 219 u16 max_slots;
220 s16 node_num; 220 s16 node_num;
221 s16 slot_num; 221 s16 slot_num;
222 s16 preferred_slot;
222 int s_sectsize_bits; 223 int s_sectsize_bits;
223 int s_clustersize; 224 int s_clustersize;
224 int s_clustersize_bits; 225 int s_clustersize_bits;
@@ -305,6 +306,19 @@ static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
305 return 0; 306 return 0;
306} 307}
307 308
309static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
310{
311 /*
312 * Support for sparse files is a pre-requisite
313 */
314 if (!ocfs2_sparse_alloc(osb))
315 return 0;
316
317 if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
318 return 1;
319 return 0;
320}
321
308/* set / clear functions because cluster events can make these happen 322/* set / clear functions because cluster events can make these happen
309 * in parallel so we want the transitions to be atomic. this also 323 * in parallel so we want the transitions to be atomic. this also
310 * means that any future flags osb_flags must be protected by spinlock 324 * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f0d9eb08547a..82f8a75b207e 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,7 @@
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) 90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
91#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 91#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
92 92
93/* 93/*
94 * Heartbeat-only devices are missing journals and other files. The 94 * Heartbeat-only devices are missing journals and other files. The
@@ -116,6 +116,11 @@
116 */ 116 */
117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001
118 118
119/*
120 * Unwritten extents support.
121 */
122#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
123
119/* The byte offset of the first backup block will be 1G. 124/* The byte offset of the first backup block will be 1G.
120 * The following will be 4G, 16G, 64G, 256G and 1T. 125 * The following will be 4G, 16G, 64G, 256G and 1T.
121 */ 126 */
@@ -170,6 +175,32 @@
170#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) 175#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
171 176
172/* 177/*
178 * Space reservation / allocation / free ioctls and argument structure
179 * are designed to be compatible with XFS.
180 *
181 * ALLOCSP* and FREESP* are not and will never be supported, but are
182 * included here for completeness.
183 */
184struct ocfs2_space_resv {
185 __s16 l_type;
186 __s16 l_whence;
187 __s64 l_start;
188 __s64 l_len; /* len == 0 means until end of file */
189 __s32 l_sysid;
190 __u32 l_pid;
191 __s32 l_pad[4]; /* reserve area */
192};
193
194#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
195#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
196#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
197#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
198#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
199#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
200#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
201#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
202
203/*
173 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 204 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
174 */ 205 */
175#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ 206#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
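A user-space sketch of the two supported ioctl pairs, using the struct ocfs2_space_resv layout defined above (the file path and the header's install location are assumptions):

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <ocfs2/ocfs2_fs.h>	/* assumed location of the OCFS2_IOC_* defines */

int main(void)
{
	struct ocfs2_space_resv sr;
	int fd = open("/mnt/ocfs2/file", O_RDWR);

	if (fd < 0)
		return 1;

	memset(&sr, 0, sizeof(sr));
	sr.l_whence = 0;		/* SEEK_SET */
	sr.l_start = 0;
	sr.l_len = 1024 * 1024;		/* reserve 1MB as unwritten extents */
	if (ioctl(fd, OCFS2_IOC_RESVSP64, &sr) < 0)
		return 1;

	sr.l_start = 512 * 1024;	/* punch 64KB out of the middle */
	sr.l_len = 64 * 1024;
	if (ioctl(fd, OCFS2_IOC_UNRESVSP64, &sr) < 0)
		return 1;

	return close(fd);
}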
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d8b79067dc14..af4882b62cfa 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -121,17 +121,25 @@ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
121 return ret; 121 return ret;
122} 122}
123 123
124static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si) 124static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred)
125{ 125{
126 int i; 126 int i;
127 s16 ret = OCFS2_INVALID_SLOT; 127 s16 ret = OCFS2_INVALID_SLOT;
128 128
129 if (preferred >= 0 && preferred < si->si_num_slots) {
130 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) {
131 ret = preferred;
132 goto out;
133 }
134 }
135
129 for(i = 0; i < si->si_num_slots; i++) { 136 for(i = 0; i < si->si_num_slots; i++) {
130 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { 137 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
131 ret = (s16) i; 138 ret = (s16) i;
132 break; 139 break;
133 } 140 }
134 } 141 }
142out:
135 return ret; 143 return ret;
136} 144}
137 145
@@ -248,7 +256,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
248 if (slot == OCFS2_INVALID_SLOT) { 256 if (slot == OCFS2_INVALID_SLOT) {
249 /* if no slot yet, then just take 1st available 257 /* if no slot yet, then just take 1st available
250 * one. */ 258 * one. */
251 slot = __ocfs2_find_empty_slot(si); 259 slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
252 if (slot == OCFS2_INVALID_SLOT) { 260 if (slot == OCFS2_INVALID_SLOT) {
253 spin_unlock(&si->si_lock); 261 spin_unlock(&si->si_lock);
254 mlog(ML_ERROR, "no free slots available!\n"); 262 mlog(ML_ERROR, "no free slots available!\n");
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index e3437626d183..d9c5c9fcb30f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -98,14 +98,6 @@ static int ocfs2_relink_block_group(handle_t *handle,
98 u16 chain); 98 u16 chain);
99static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 99static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
100 u32 wanted); 100 u32 wanted);
101static int ocfs2_free_suballoc_bits(handle_t *handle,
102 struct inode *alloc_inode,
103 struct buffer_head *alloc_bh,
104 unsigned int start_bit,
105 u64 bg_blkno,
106 unsigned int count);
107static inline u64 ocfs2_which_suballoc_group(u64 block,
108 unsigned int bit);
109static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
110 u64 bg_blkno, 102 u64 bg_blkno,
111 u16 bg_bit_off); 103 u16 bg_bit_off);
@@ -496,13 +488,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
496 488
497 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); 489 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
498 (*ac)->ac_which = OCFS2_AC_USE_META; 490 (*ac)->ac_which = OCFS2_AC_USE_META;
499
500#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
501 slot = 0;
502#else
503 slot = osb->slot_num; 491 slot = osb->slot_num;
504#endif
505
506 (*ac)->ac_group_search = ocfs2_block_group_search; 492 (*ac)->ac_group_search = ocfs2_block_group_search;
507 493
508 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 494 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
@@ -1626,12 +1612,12 @@ bail:
1626/* 1612/*
1627 * expects the suballoc inode to already be locked. 1613 * expects the suballoc inode to already be locked.
1628 */ 1614 */
1629static int ocfs2_free_suballoc_bits(handle_t *handle, 1615int ocfs2_free_suballoc_bits(handle_t *handle,
1630 struct inode *alloc_inode, 1616 struct inode *alloc_inode,
1631 struct buffer_head *alloc_bh, 1617 struct buffer_head *alloc_bh,
1632 unsigned int start_bit, 1618 unsigned int start_bit,
1633 u64 bg_blkno, 1619 u64 bg_blkno,
1634 unsigned int count) 1620 unsigned int count)
1635{ 1621{
1636 int status = 0; 1622 int status = 0;
1637 u32 tmp_used; 1623 u32 tmp_used;
@@ -1703,13 +1689,6 @@ bail:
1703 return status; 1689 return status;
1704} 1690}
1705 1691
1706static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1707{
1708 u64 group = block - (u64) bit;
1709
1710 return group;
1711}
1712
1713int ocfs2_free_dinode(handle_t *handle, 1692int ocfs2_free_dinode(handle_t *handle,
1714 struct inode *inode_alloc_inode, 1693 struct inode *inode_alloc_inode,
1715 struct buffer_head *inode_alloc_bh, 1694 struct buffer_head *inode_alloc_bh,
@@ -1723,19 +1702,6 @@ int ocfs2_free_dinode(handle_t *handle,
1723 inode_alloc_bh, bit, bg_blkno, 1); 1702 inode_alloc_bh, bit, bg_blkno, 1);
1724} 1703}
1725 1704
1726int ocfs2_free_extent_block(handle_t *handle,
1727 struct inode *eb_alloc_inode,
1728 struct buffer_head *eb_alloc_bh,
1729 struct ocfs2_extent_block *eb)
1730{
1731 u64 blk = le64_to_cpu(eb->h_blkno);
1732 u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1733 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1734
1735 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1736 bit, bg_blkno, 1);
1737}
1738
1739int ocfs2_free_clusters(handle_t *handle, 1705int ocfs2_free_clusters(handle_t *handle,
1740 struct inode *bitmap_inode, 1706 struct inode *bitmap_inode,
1741 struct buffer_head *bitmap_bh, 1707 struct buffer_head *bitmap_bh,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 1a3c94cb9250..f212dc01a84b 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,20 +86,29 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
86 u32 *cluster_start, 86 u32 *cluster_start,
87 u32 *num_clusters); 87 u32 *num_clusters);
88 88
89int ocfs2_free_suballoc_bits(handle_t *handle,
90 struct inode *alloc_inode,
91 struct buffer_head *alloc_bh,
92 unsigned int start_bit,
93 u64 bg_blkno,
94 unsigned int count);
89int ocfs2_free_dinode(handle_t *handle, 95int ocfs2_free_dinode(handle_t *handle,
90 struct inode *inode_alloc_inode, 96 struct inode *inode_alloc_inode,
91 struct buffer_head *inode_alloc_bh, 97 struct buffer_head *inode_alloc_bh,
92 struct ocfs2_dinode *di); 98 struct ocfs2_dinode *di);
93int ocfs2_free_extent_block(handle_t *handle,
94 struct inode *eb_alloc_inode,
95 struct buffer_head *eb_alloc_bh,
96 struct ocfs2_extent_block *eb);
97int ocfs2_free_clusters(handle_t *handle, 99int ocfs2_free_clusters(handle_t *handle,
98 struct inode *bitmap_inode, 100 struct inode *bitmap_inode,
99 struct buffer_head *bitmap_bh, 101 struct buffer_head *bitmap_bh,
100 u64 start_blk, 102 u64 start_blk,
101 unsigned int num_clusters); 103 unsigned int num_clusters);
102 104
105static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
106{
107 u64 group = block - (u64) bit;
108
109 return group;
110}
111
103static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, 112static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
104 u64 bg_blkno) 113 u64 bg_blkno)
105{ 114{
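Moving ocfs2_which_suballoc_group() into the header works because group descriptors sit at the base of each allocation group: a block at block number N allocated from suballoc bit B belongs to the group whose descriptor is at N - B. For example, an extent block at block 5000 with h_suballoc_bit 120 maps to the group descriptor at block 4880.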
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 86b559c7dce9..3a5a1ed09ac9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -82,7 +82,8 @@ MODULE_AUTHOR("Oracle");
82MODULE_LICENSE("GPL"); 82MODULE_LICENSE("GPL");
83 83
84static int ocfs2_parse_options(struct super_block *sb, char *options, 84static int ocfs2_parse_options(struct super_block *sb, char *options,
85 unsigned long *mount_opt, int is_remount); 85 unsigned long *mount_opt, s16 *slot,
86 int is_remount);
86static void ocfs2_put_super(struct super_block *sb); 87static void ocfs2_put_super(struct super_block *sb);
87static int ocfs2_mount_volume(struct super_block *sb); 88static int ocfs2_mount_volume(struct super_block *sb);
88static int ocfs2_remount(struct super_block *sb, int *flags, char *data); 89static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -114,8 +115,6 @@ static void ocfs2_write_super(struct super_block *sb);
114static struct inode *ocfs2_alloc_inode(struct super_block *sb); 115static struct inode *ocfs2_alloc_inode(struct super_block *sb);
115static void ocfs2_destroy_inode(struct inode *inode); 116static void ocfs2_destroy_inode(struct inode *inode);
116 117
117static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
118
119static const struct super_operations ocfs2_sops = { 118static const struct super_operations ocfs2_sops = {
120 .statfs = ocfs2_statfs, 119 .statfs = ocfs2_statfs,
121 .alloc_inode = ocfs2_alloc_inode, 120 .alloc_inode = ocfs2_alloc_inode,
@@ -140,6 +139,7 @@ enum {
140 Opt_data_ordered, 139 Opt_data_ordered,
141 Opt_data_writeback, 140 Opt_data_writeback,
142 Opt_atime_quantum, 141 Opt_atime_quantum,
142 Opt_slot,
143 Opt_err, 143 Opt_err,
144}; 144};
145 145
@@ -154,6 +154,7 @@ static match_table_t tokens = {
154 {Opt_data_ordered, "data=ordered"}, 154 {Opt_data_ordered, "data=ordered"},
155 {Opt_data_writeback, "data=writeback"}, 155 {Opt_data_writeback, "data=writeback"},
156 {Opt_atime_quantum, "atime_quantum=%u"}, 156 {Opt_atime_quantum, "atime_quantum=%u"},
157 {Opt_slot, "preferred_slot=%u"},
157 {Opt_err, NULL} 158 {Opt_err, NULL}
158}; 159};
159 160
@@ -318,7 +319,7 @@ static void ocfs2_destroy_inode(struct inode *inode)
318/* From xfs_super.c:xfs_max_file_offset 319/* From xfs_super.c:xfs_max_file_offset
319 * Copyright (c) 2000-2004 Silicon Graphics, Inc. 320 * Copyright (c) 2000-2004 Silicon Graphics, Inc.
320 */ 321 */
321static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) 322unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
322{ 323{
323 unsigned int pagefactor = 1; 324 unsigned int pagefactor = 1;
324 unsigned int bitshift = BITS_PER_LONG - 1; 325 unsigned int bitshift = BITS_PER_LONG - 1;
@@ -355,9 +356,10 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
355 int incompat_features; 356 int incompat_features;
356 int ret = 0; 357 int ret = 0;
357 unsigned long parsed_options; 358 unsigned long parsed_options;
359 s16 slot;
358 struct ocfs2_super *osb = OCFS2_SB(sb); 360 struct ocfs2_super *osb = OCFS2_SB(sb);
359 361
360 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { 362 if (!ocfs2_parse_options(sb, data, &parsed_options, &slot, 1)) {
361 ret = -EINVAL; 363 ret = -EINVAL;
362 goto out; 364 goto out;
363 } 365 }
@@ -534,6 +536,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
534 struct dentry *root; 536 struct dentry *root;
535 int status, sector_size; 537 int status, sector_size;
536 unsigned long parsed_opt; 538 unsigned long parsed_opt;
539 s16 slot;
537 struct inode *inode = NULL; 540 struct inode *inode = NULL;
538 struct ocfs2_super *osb = NULL; 541 struct ocfs2_super *osb = NULL;
539 struct buffer_head *bh = NULL; 542 struct buffer_head *bh = NULL;
@@ -541,7 +544,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
541 544
542 mlog_entry("%p, %p, %i", sb, data, silent); 545 mlog_entry("%p, %p, %i", sb, data, silent);
543 546
544 if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { 547 if (!ocfs2_parse_options(sb, data, &parsed_opt, &slot, 0)) {
545 status = -EINVAL; 548 status = -EINVAL;
546 goto read_super_error; 549 goto read_super_error;
547 } 550 }
@@ -571,6 +574,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
571 brelse(bh); 574 brelse(bh);
572 bh = NULL; 575 bh = NULL;
573 osb->s_mount_opt = parsed_opt; 576 osb->s_mount_opt = parsed_opt;
577 osb->preferred_slot = slot;
574 578
575 sb->s_magic = OCFS2_SUPER_MAGIC; 579 sb->s_magic = OCFS2_SUPER_MAGIC;
576 580
@@ -713,6 +717,7 @@ static struct file_system_type ocfs2_fs_type = {
713static int ocfs2_parse_options(struct super_block *sb, 717static int ocfs2_parse_options(struct super_block *sb,
714 char *options, 718 char *options,
715 unsigned long *mount_opt, 719 unsigned long *mount_opt,
720 s16 *slot,
716 int is_remount) 721 int is_remount)
717{ 722{
718 int status; 723 int status;
@@ -722,6 +727,7 @@ static int ocfs2_parse_options(struct super_block *sb,
722 options ? options : "(none)"); 727 options ? options : "(none)");
723 728
724 *mount_opt = 0; 729 *mount_opt = 0;
730 *slot = OCFS2_INVALID_SLOT;
725 731
726 if (!options) { 732 if (!options) {
727 status = 1; 733 status = 1;
@@ -782,6 +788,15 @@ static int ocfs2_parse_options(struct super_block *sb,
782 else 788 else
783 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 789 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
784 break; 790 break;
791 case Opt_slot:
792 option = 0;
793 if (match_int(&args[0], &option)) {
794 status = 0;
795 goto bail;
796 }
797 if (option)
798 *slot = (s16)option;
799 break;
785 default: 800 default:
786 mlog(ML_ERROR, 801 mlog(ML_ERROR,
787 "Unrecognized mount option \"%s\" " 802 "Unrecognized mount option \"%s\" "
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..3b9cb3d0b008 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,6 @@ void __ocfs2_abort(struct super_block *sb,
45 45
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 47
48unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
49
48#endif /* OCFS2_SUPER_H */ 50#endif /* OCFS2_SUPER_H */
diff --git a/fs/open.c b/fs/open.c
index 0d515d161974..be6a457f4226 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -855,7 +855,7 @@ EXPORT_SYMBOL(dentry_open);
855/* 855/*
856 * Find an empty file descriptor entry, and mark it busy. 856 * Find an empty file descriptor entry, and mark it busy.
857 */ 857 */
858int get_unused_fd(void) 858int get_unused_fd_flags(int flags)
859{ 859{
860 struct files_struct * files = current->files; 860 struct files_struct * files = current->files;
861 int fd, error; 861 int fd, error;
@@ -891,7 +891,10 @@ repeat:
891 } 891 }
892 892
893 FD_SET(fd, fdt->open_fds); 893 FD_SET(fd, fdt->open_fds);
894 FD_CLR(fd, fdt->close_on_exec); 894 if (flags & O_CLOEXEC)
895 FD_SET(fd, fdt->close_on_exec);
896 else
897 FD_CLR(fd, fdt->close_on_exec);
895 files->next_fd = fd + 1; 898 files->next_fd = fd + 1;
896#if 1 899#if 1
897 /* Sanity check */ 900 /* Sanity check */
@@ -907,6 +910,11 @@ out:
907 return error; 910 return error;
908} 911}
909 912
913int get_unused_fd(void)
914{
915 return get_unused_fd_flags(0);
916}
917
910EXPORT_SYMBOL(get_unused_fd); 918EXPORT_SYMBOL(get_unused_fd);
911 919
912static void __put_unused_fd(struct files_struct *files, unsigned int fd) 920static void __put_unused_fd(struct files_struct *files, unsigned int fd)
@@ -959,7 +967,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
959 int fd = PTR_ERR(tmp); 967 int fd = PTR_ERR(tmp);
960 968
961 if (!IS_ERR(tmp)) { 969 if (!IS_ERR(tmp)) {
962 fd = get_unused_fd(); 970 fd = get_unused_fd_flags(flags);
963 if (fd >= 0) { 971 if (fd >= 0) {
964 struct file *f = do_filp_open(dfd, tmp, flags, mode); 972 struct file *f = do_filp_open(dfd, tmp, flags, mode);
965 if (IS_ERR(f)) { 973 if (IS_ERR(f)) {
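The point of taking the flags in get_unused_fd_flags() is atomicity: the descriptor never exists without FD_CLOEXEC set, so a concurrent fork()+exec() in another thread cannot leak it. From user space (file path illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/etc/hostname", O_RDONLY | O_CLOEXEC);

	if (fd < 0)
		return 1;
	/* No separate fcntl(F_SETFD) step, hence no race window. */
	printf("close-on-exec set: %d\n",
	       (fcntl(fd, F_GETFD) & FD_CLOEXEC) != 0);
	return close(fd);
}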
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index e3491328596b..3d3e16631472 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -25,6 +25,8 @@
25#define PARTITION_RISCIX_SCSI 2 25#define PARTITION_RISCIX_SCSI 2
26#define PARTITION_LINUX 9 26#define PARTITION_LINUX 9
27 27
28#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
29 defined(CONFIG_ACORN_PARTITION_ADFS)
28static struct adfs_discrecord * 30static struct adfs_discrecord *
29adfs_partition(struct parsed_partitions *state, char *name, char *data, 31adfs_partition(struct parsed_partitions *state, char *name, char *data,
30 unsigned long first_sector, int slot) 32 unsigned long first_sector, int slot)
@@ -48,6 +50,7 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data,
48 put_partition(state, slot, first_sector, nr_sects); 50 put_partition(state, slot, first_sector, nr_sects);
49 return dr; 51 return dr;
50} 52}
53#endif
51 54
52#ifdef CONFIG_ACORN_PARTITION_RISCIX 55#ifdef CONFIG_ACORN_PARTITION_RISCIX
53 56
@@ -65,6 +68,8 @@ struct riscix_record {
65 struct riscix_part part[8]; 68 struct riscix_part part[8];
66}; 69};
67 70
71#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
72 defined(CONFIG_ACORN_PARTITION_ADFS)
68static int 73static int
69riscix_partition(struct parsed_partitions *state, struct block_device *bdev, 74riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
70 unsigned long first_sect, int slot, unsigned long nr_sects) 75 unsigned long first_sect, int slot, unsigned long nr_sects)
@@ -105,6 +110,7 @@ riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
105 return slot; 110 return slot;
106} 111}
107#endif 112#endif
113#endif
108 114
109#define LINUX_NATIVE_MAGIC 0xdeafa1de 115#define LINUX_NATIVE_MAGIC 0xdeafa1de
110#define LINUX_SWAP_MAGIC 0xdeafab1e 116#define LINUX_SWAP_MAGIC 0xdeafab1e
@@ -115,6 +121,8 @@ struct linux_part {
115 __le32 nr_sects; 121 __le32 nr_sects;
116}; 122};
117 123
124#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
125 defined(CONFIG_ACORN_PARTITION_ADFS)
118static int 126static int
119linux_partition(struct parsed_partitions *state, struct block_device *bdev, 127linux_partition(struct parsed_partitions *state, struct block_device *bdev,
120 unsigned long first_sect, int slot, unsigned long nr_sects) 128 unsigned long first_sect, int slot, unsigned long nr_sects)
@@ -146,6 +154,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
146 put_dev_sector(sect); 154 put_dev_sector(sect);
147 return slot; 155 return slot;
148} 156}
157#endif
149 158
150#ifdef CONFIG_ACORN_PARTITION_CUMANA 159#ifdef CONFIG_ACORN_PARTITION_CUMANA
151int 160int
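The new #if defined(CONFIG_ACORN_PARTITION_CUMANA) || defined(CONFIG_ACORN_PARTITION_ADFS) guards compile adfs_partition(), riscix_partition() and linux_partition() only when a partition parser that actually calls them is configured, silencing defined-but-unused warnings in the other configurations.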
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 99873a2b4cbc..e7dd1d4e3473 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -677,15 +677,24 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
677 * Return: -1 Error, the calculated offset exceeded the size of the buffer 677 * Return: -1 Error, the calculated offset exceeded the size of the buffer
678 * n OK, a range-checked offset into buffer 678 * n OK, a range-checked offset into buffer
679 */ 679 */
680static int ldm_relative (const u8 *buffer, int buflen, int base, int offset) 680static int ldm_relative(const u8 *buffer, int buflen, int base, int offset)
681{ 681{
682 682
683 base += offset; 683 base += offset;
684 if ((!buffer) || (offset < 0) || (base > buflen)) 684 if (!buffer || offset < 0 || base > buflen) {
685 if (!buffer)
686 ldm_error("!buffer");
687 if (offset < 0)
688 ldm_error("offset (%d) < 0", offset);
689 if (base > buflen)
690 ldm_error("base (%d) > buflen (%d)", base, buflen);
685 return -1; 691 return -1;
686 if ((base + buffer[base]) >= buflen) 692 }
693 if (base + buffer[base] >= buflen) {
694 ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base,
695 buffer[base], buflen);
687 return -1; 696 return -1;
688 697 }
689 return buffer[base] + offset + 1; 698 return buffer[base] + offset + 1;
690} 699}
691 700
@@ -1054,60 +1063,98 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb)
1054 * Return: 'true' @vb contains a Volume VBLK 1063 * Return: 'true' @vb contains a Volume VBLK
1055 * 'false' @vb contents are not defined 1064 * 'false' @vb contents are not defined
1056 */ 1065 */
1057static bool ldm_parse_vol5 (const u8 *buffer, int buflen, struct vblk *vb) 1066static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb)
1058{ 1067{
1059 int r_objid, r_name, r_vtype, r_child, r_size, r_id1, r_id2, r_size2; 1068 int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size;
1060 int r_drive, len; 1069 int r_id1, r_id2, r_size2, r_drive, len;
1061 struct vblk_volu *volu; 1070 struct vblk_volu *volu;
1062 1071
1063 BUG_ON (!buffer || !vb); 1072 BUG_ON(!buffer || !vb);
1064 1073 r_objid = ldm_relative(buffer, buflen, 0x18, 0);
1065 r_objid = ldm_relative (buffer, buflen, 0x18, 0); 1074 if (r_objid < 0) {
1066 r_name = ldm_relative (buffer, buflen, 0x18, r_objid); 1075 ldm_error("r_objid %d < 0", r_objid);
1067 r_vtype = ldm_relative (buffer, buflen, 0x18, r_name); 1076 return false;
1068 r_child = ldm_relative (buffer, buflen, 0x2E, r_vtype); 1077 }
1069 r_size = ldm_relative (buffer, buflen, 0x3E, r_child); 1078 r_name = ldm_relative(buffer, buflen, 0x18, r_objid);
1070 1079 if (r_name < 0) {
1071 if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) 1080 ldm_error("r_name %d < 0", r_name);
1072 r_id1 = ldm_relative (buffer, buflen, 0x53, r_size); 1081 return false;
1073 else 1082 }
1083 r_vtype = ldm_relative(buffer, buflen, 0x18, r_name);
1084 if (r_vtype < 0) {
1085 ldm_error("r_vtype %d < 0", r_vtype);
1086 return false;
1087 }
1088 r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype);
1089 if (r_disable_drive_letter < 0) {
1090 ldm_error("r_disable_drive_letter %d < 0",
1091 r_disable_drive_letter);
1092 return false;
1093 }
1094 r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter);
1095 if (r_child < 0) {
1096 ldm_error("r_child %d < 0", r_child);
1097 return false;
1098 }
1099 r_size = ldm_relative(buffer, buflen, 0x3D, r_child);
1100 if (r_size < 0) {
1101 ldm_error("r_size %d < 0", r_size);
1102 return false;
1103 }
1104 if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) {
1105 r_id1 = ldm_relative(buffer, buflen, 0x52, r_size);
1106 if (r_id1 < 0) {
1107 ldm_error("r_id1 %d < 0", r_id1);
1108 return false;
1109 }
1110 } else
1074 r_id1 = r_size; 1111 r_id1 = r_size;
1075 1112 if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) {
1076 if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) 1113 r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1);
1077 r_id2 = ldm_relative (buffer, buflen, 0x53, r_id1); 1114 if (r_id2 < 0) {
1078 else 1115 ldm_error("r_id2 %d < 0", r_id2);
1116 return false;
1117 }
1118 } else
1079 r_id2 = r_id1; 1119 r_id2 = r_id1;
1080 1120 if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) {
1081 if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) 1121 r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2);
1082 r_size2 = ldm_relative (buffer, buflen, 0x53, r_id2); 1122 if (r_size2 < 0) {
1083 else 1123 ldm_error("r_size2 %d < 0", r_size2);
1124 return false;
1125 }
1126 } else
1084 r_size2 = r_id2; 1127 r_size2 = r_id2;
1085 1128 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
1086 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) 1129 r_drive = ldm_relative(buffer, buflen, 0x52, r_size2);
1087 r_drive = ldm_relative (buffer, buflen, 0x53, r_size2); 1130 if (r_drive < 0) {
1088 else 1131 ldm_error("r_drive %d < 0", r_drive);
1132 return false;
1133 }
1134 } else
1089 r_drive = r_size2; 1135 r_drive = r_size2;
1090
1091 len = r_drive; 1136 len = r_drive;
1092 if (len < 0) 1137 if (len < 0) {
1138 ldm_error("len %d < 0", len);
1093 return false; 1139 return false;
1094 1140 }
1095 len += VBLK_SIZE_VOL5; 1141 len += VBLK_SIZE_VOL5;
1096 if (len != BE32 (buffer + 0x14)) 1142 if (len > BE32(buffer + 0x14)) {
1143 ldm_error("len %d > BE32(buffer + 0x14) %d", len,
1144 BE32(buffer + 0x14));
1097 return false; 1145 return false;
1098 1146 }
1099 volu = &vb->vblk.volu; 1147 volu = &vb->vblk.volu;
1100 1148 ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type,
1101 ldm_get_vstr (buffer + 0x18 + r_name, volu->volume_type, 1149 sizeof(volu->volume_type));
1102 sizeof (volu->volume_type)); 1150 memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter,
1103 memcpy (volu->volume_state, buffer + 0x19 + r_vtype, 1151 sizeof(volu->volume_state));
1104 sizeof (volu->volume_state)); 1152 volu->size = ldm_get_vnum(buffer + 0x3D + r_child);
1105 volu->size = ldm_get_vnum (buffer + 0x3E + r_child); 1153 volu->partition_type = buffer[0x41 + r_size];
1106 volu->partition_type = buffer[0x42 + r_size]; 1154 memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid));
1107 memcpy (volu->guid, buffer + 0x43 + r_size, sizeof (volu->guid));
1108 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { 1155 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
1109 ldm_get_vstr (buffer + 0x53 + r_size, volu->drive_hint, 1156 ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint,
1110 sizeof (volu->drive_hint)); 1157 sizeof(volu->drive_hint));
1111 } 1158 }
1112 return true; 1159 return true;
1113} 1160}
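The VBLK flag bits in buffer[0x12] gate optional fields, which is why each r_* above either advances through ldm_relative() or carries the previous offset forward unchanged. The recurring shape, condensed into a schematic fragment (cur is a stand-in name):

/* Optional field: consume it only when its flag is set; otherwise the
 * running offset passes straight through to the next field. */
if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) {
	cur = ldm_relative(buffer, buflen, 0x52, cur);
	if (cur < 0)
		return false;	/* range check failed: reject the VBLK */
}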
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index d2e6a3046939..80f63b5fdd9f 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -68,7 +68,7 @@ struct parsed_partitions;
68#define VBLK_SIZE_DSK3 12 68#define VBLK_SIZE_DSK3 12
69#define VBLK_SIZE_DSK4 45 69#define VBLK_SIZE_DSK4 45
70#define VBLK_SIZE_PRT3 28 70#define VBLK_SIZE_PRT3 28
71#define VBLK_SIZE_VOL5 59 71#define VBLK_SIZE_VOL5 58
72 72
73/* component types */ 73/* component types */
74#define COMP_STRIPE 0x01 /* Stripe-set */ 74#define COMP_STRIPE 0x01 /* Stripe-set */
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 98e78e2f18d6..965625a0977d 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -62,6 +62,8 @@
62#include <linux/mman.h> 62#include <linux/mman.h>
63#include <linux/proc_fs.h> 63#include <linux/proc_fs.h>
64#include <linux/ioport.h> 64#include <linux/ioport.h>
65#include <linux/uaccess.h>
66#include <linux/io.h>
65#include <linux/mm.h> 67#include <linux/mm.h>
66#include <linux/hugetlb.h> 68#include <linux/hugetlb.h>
67#include <linux/pagemap.h> 69#include <linux/pagemap.h>
@@ -76,9 +78,7 @@
76#include <linux/rcupdate.h> 78#include <linux/rcupdate.h>
77#include <linux/delayacct.h> 79#include <linux/delayacct.h>
78 80
79#include <asm/uaccess.h>
80#include <asm/pgtable.h> 81#include <asm/pgtable.h>
81#include <asm/io.h>
82#include <asm/processor.h> 82#include <asm/processor.h>
83#include "internal.h" 83#include "internal.h"
84 84
@@ -87,10 +87,10 @@
87do { memcpy(buffer, string, strlen(string)); \ 87do { memcpy(buffer, string, strlen(string)); \
88 buffer += strlen(string); } while (0) 88 buffer += strlen(string); } while (0)
89 89
90static inline char * task_name(struct task_struct *p, char * buf) 90static inline char *task_name(struct task_struct *p, char *buf)
91{ 91{
92 int i; 92 int i;
93 char * name; 93 char *name;
94 char tcomm[sizeof(p->comm)]; 94 char tcomm[sizeof(p->comm)];
95 95
96 get_task_comm(tcomm, p); 96 get_task_comm(tcomm, p);
@@ -138,7 +138,7 @@ static const char *task_state_array[] = {
138 "X (dead)" /* 32 */ 138 "X (dead)" /* 32 */
139}; 139};
140 140
141static inline const char * get_task_state(struct task_struct *tsk) 141static inline const char *get_task_state(struct task_struct *tsk)
142{ 142{
143 unsigned int state = (tsk->state & (TASK_RUNNING | 143 unsigned int state = (tsk->state & (TASK_RUNNING |
144 TASK_INTERRUPTIBLE | 144 TASK_INTERRUPTIBLE |
@@ -156,7 +156,7 @@ static inline const char * get_task_state(struct task_struct *tsk)
156 return *p; 156 return *p;
157} 157}
158 158
159static inline char * task_state(struct task_struct *p, char *buffer) 159static inline char *task_state(struct task_struct *p, char *buffer)
160{ 160{
161 struct group_info *group_info; 161 struct group_info *group_info;
162 int g; 162 int g;
@@ -172,8 +172,8 @@ static inline char * task_state(struct task_struct *p, char *buffer)
172 "Uid:\t%d\t%d\t%d\t%d\n" 172 "Uid:\t%d\t%d\t%d\t%d\n"
173 "Gid:\t%d\t%d\t%d\t%d\n", 173 "Gid:\t%d\t%d\t%d\t%d\n",
174 get_task_state(p), 174 get_task_state(p),
175 p->tgid, p->pid, 175 p->tgid, p->pid,
176 pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, 176 pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
177 pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, 177 pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
178 p->uid, p->euid, p->suid, p->fsuid, 178 p->uid, p->euid, p->suid, p->fsuid,
179 p->gid, p->egid, p->sgid, p->fsgid); 179 p->gid, p->egid, p->sgid, p->fsgid);
@@ -191,15 +191,15 @@ static inline char * task_state(struct task_struct *p, char *buffer)
191 get_group_info(group_info); 191 get_group_info(group_info);
192 task_unlock(p); 192 task_unlock(p);
193 193
194 for (g = 0; g < min(group_info->ngroups,NGROUPS_SMALL); g++) 194 for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++)
195 buffer += sprintf(buffer, "%d ", GROUP_AT(group_info,g)); 195 buffer += sprintf(buffer, "%d ", GROUP_AT(group_info, g));
196 put_group_info(group_info); 196 put_group_info(group_info);
197 197
198 buffer += sprintf(buffer, "\n"); 198 buffer += sprintf(buffer, "\n");
199 return buffer; 199 return buffer;
200} 200}
201 201
202static char * render_sigset_t(const char *header, sigset_t *set, char *buffer) 202static char *render_sigset_t(const char *header, sigset_t *set, char *buffer)
203{ 203{
204 int i, len; 204 int i, len;
205 205
@@ -239,7 +239,7 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
239 } 239 }
240} 240}
241 241
242static inline char * task_sig(struct task_struct *p, char *buffer) 242static inline char *task_sig(struct task_struct *p, char *buffer)
243{ 243{
244 unsigned long flags; 244 unsigned long flags;
245 sigset_t pending, shpending, blocked, ignored, caught; 245 sigset_t pending, shpending, blocked, ignored, caught;
@@ -289,14 +289,23 @@ static inline char *task_cap(struct task_struct *p, char *buffer)
289 cap_t(p->cap_effective)); 289 cap_t(p->cap_effective));
290} 290}
291 291
292int proc_pid_status(struct task_struct *task, char * buffer) 292static inline char *task_context_switch_counts(struct task_struct *p,
293 char *buffer)
293{ 294{
294 char * orig = buffer; 295 return buffer + sprintf(buffer, "voluntary_ctxt_switches:\t%lu\n"
296 "nonvoluntary_ctxt_switches:\t%lu\n",
297 p->nvcsw,
298 p->nivcsw);
299}
300
301int proc_pid_status(struct task_struct *task, char *buffer)
302{
303 char *orig = buffer;
295 struct mm_struct *mm = get_task_mm(task); 304 struct mm_struct *mm = get_task_mm(task);
296 305
297 buffer = task_name(task, buffer); 306 buffer = task_name(task, buffer);
298 buffer = task_state(task, buffer); 307 buffer = task_state(task, buffer);
299 308
300 if (mm) { 309 if (mm) {
301 buffer = task_mem(mm, buffer); 310 buffer = task_mem(mm, buffer);
302 mmput(mm); 311 mmput(mm);
@@ -307,6 +316,7 @@ int proc_pid_status(struct task_struct *task, char * buffer)
307#if defined(CONFIG_S390) 316#if defined(CONFIG_S390)
308 buffer = task_show_regs(task, buffer); 317 buffer = task_show_regs(task, buffer);
309#endif 318#endif
319 buffer = task_context_switch_counts(task, buffer);
310 return buffer - orig; 320 return buffer - orig;
311} 321}
312 322
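The two new counters surface as lines in /proc/<pid>/status. A quick userspace check (illustrative sketch; reads the calling process's own file):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	/* Matches both voluntary_ctxt_switches and nonvoluntary_ctxt_switches. */
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "_ctxt_switches"))
			fputs(line, stdout);
	fclose(f);
	return 0;
}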
@@ -332,7 +342,7 @@ static clock_t task_utime(struct task_struct *p)
332 342
333static clock_t task_stime(struct task_struct *p) 343static clock_t task_stime(struct task_struct *p)
334{ 344{
335 clock_t stime = cputime_to_clock_t(p->stime); 345 clock_t stime;
336 346
337 /* 347 /*
338 * Use CFS's precise accounting. (we subtract utime from 348 * Use CFS's precise accounting. (we subtract utime from
@@ -344,8 +354,7 @@ static clock_t task_stime(struct task_struct *p)
344 return stime; 354 return stime;
345} 355}
346 356
347 357static int do_task_stat(struct task_struct *task, char *buffer, int whole)
348static int do_task_stat(struct task_struct *task, char * buffer, int whole)
349{ 358{
350 unsigned long vsize, eip, esp, wchan = ~0UL; 359 unsigned long vsize, eip, esp, wchan = ~0UL;
351 long priority, nice; 360 long priority, nice;
@@ -353,7 +362,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
353 sigset_t sigign, sigcatch; 362 sigset_t sigign, sigcatch;
354 char state; 363 char state;
355 int res; 364 int res;
356 pid_t ppid = 0, pgid = -1, sid = -1; 365 pid_t ppid = 0, pgid = -1, sid = -1;
357 int num_threads = 0; 366 int num_threads = 0;
358 struct mm_struct *mm; 367 struct mm_struct *mm;
359 unsigned long long start_time; 368 unsigned long long start_time;
@@ -424,7 +433,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
424 } 433 }
425 rcu_read_unlock(); 434 rcu_read_unlock();
426 435
427 if (!whole || num_threads<2) 436 if (!whole || num_threads < 2)
428 wchan = get_wchan(task); 437 wchan = get_wchan(task);
429 if (!whole) { 438 if (!whole) {
430 min_flt = task->min_flt; 439 min_flt = task->min_flt;
@@ -440,12 +449,13 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
440 449
441 /* Temporary variable needed for gcc-2.96 */ 450 /* Temporary variable needed for gcc-2.96 */
442 /* convert timespec -> nsec*/ 451 /* convert timespec -> nsec*/
443 start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC 452 start_time =
444 + task->start_time.tv_nsec; 453 (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
454 + task->real_start_time.tv_nsec;
445 /* convert nsec -> ticks */ 455 /* convert nsec -> ticks */
446 start_time = nsec_to_clock_t(start_time); 456 start_time = nsec_to_clock_t(start_time);
447 457
448 res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %u %lu \ 458 res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \
449%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ 459%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
450%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n", 460%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n",
451 task->pid, 461 task->pid,
@@ -471,7 +481,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
471 start_time, 481 start_time,
472 vsize, 482 vsize,
473 mm ? get_mm_rss(mm) : 0, 483 mm ? get_mm_rss(mm) : 0,
474 rsslim, 484 rsslim,
475 mm ? mm->start_code : 0, 485 mm ? mm->start_code : 0,
476 mm ? mm->end_code : 0, 486 mm ? mm->end_code : 0,
477 mm ? mm->start_stack : 0, 487 mm ? mm->start_stack : 0,
@@ -493,17 +503,17 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
493 task->rt_priority, 503 task->rt_priority,
494 task->policy, 504 task->policy,
495 (unsigned long long)delayacct_blkio_ticks(task)); 505 (unsigned long long)delayacct_blkio_ticks(task));
496 if(mm) 506 if (mm)
497 mmput(mm); 507 mmput(mm);
498 return res; 508 return res;
499} 509}
500 510
501int proc_tid_stat(struct task_struct *task, char * buffer) 511int proc_tid_stat(struct task_struct *task, char *buffer)
502{ 512{
503 return do_task_stat(task, buffer, 0); 513 return do_task_stat(task, buffer, 0);
504} 514}
505 515
506int proc_tgid_stat(struct task_struct *task, char * buffer) 516int proc_tgid_stat(struct task_struct *task, char *buffer)
507{ 517{
508 return do_task_stat(task, buffer, 1); 518 return do_task_stat(task, buffer, 1);
509} 519}
@@ -512,12 +522,12 @@ int proc_pid_statm(struct task_struct *task, char *buffer)
512{ 522{
513 int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; 523 int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0;
514 struct mm_struct *mm = get_task_mm(task); 524 struct mm_struct *mm = get_task_mm(task);
515 525
516 if (mm) { 526 if (mm) {
517 size = task_statm(mm, &shared, &text, &data, &resident); 527 size = task_statm(mm, &shared, &text, &data, &resident);
518 mmput(mm); 528 mmput(mm);
519 } 529 }
520 530
521 return sprintf(buffer,"%d %d %d %d %d %d %d\n", 531 return sprintf(buffer, "%d %d %d %d %d %d %d\n",
522 size, resident, shared, text, lib, data, 0); 532 size, resident, shared, text, lib, data, 0);
523} 533}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 46ea5d56e1bb..42cb4f5613b6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -67,7 +67,6 @@
67#include <linux/mount.h> 67#include <linux/mount.h>
68#include <linux/security.h> 68#include <linux/security.h>
69#include <linux/ptrace.h> 69#include <linux/ptrace.h>
70#include <linux/seccomp.h>
71#include <linux/cpuset.h> 70#include <linux/cpuset.h>
72#include <linux/audit.h> 71#include <linux/audit.h>
73#include <linux/poll.h> 72#include <linux/poll.h>
@@ -204,12 +203,17 @@ static int proc_pid_environ(struct task_struct *task, char * buffer)
204 int res = 0; 203 int res = 0;
205 struct mm_struct *mm = get_task_mm(task); 204 struct mm_struct *mm = get_task_mm(task);
206 if (mm) { 205 if (mm) {
207 unsigned int len = mm->env_end - mm->env_start; 206 unsigned int len;
207
208 res = -ESRCH;
209 if (!ptrace_may_attach(task))
210 goto out;
211
212 len = mm->env_end - mm->env_start;
208 if (len > PAGE_SIZE) 213 if (len > PAGE_SIZE)
209 len = PAGE_SIZE; 214 len = PAGE_SIZE;
210 res = access_process_vm(task, mm->env_start, buffer, len, 0); 215 res = access_process_vm(task, mm->env_start, buffer, len, 0);
211 if (!ptrace_may_attach(task)) 216out:
212 res = -ESRCH;
213 mmput(mm); 217 mmput(mm);
214 } 218 }
215 return res; 219 return res;
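The point of the reordering: the old code ran access_process_vm() first and only then consulted ptrace_may_attach(), so the target's environment was copied before the permission decision. The fixed flow is check-then-copy, condensed:

res = -ESRCH;
if (!ptrace_may_attach(task))
	goto out;			/* refused: nothing was read */
len = mm->env_end - mm->env_start;
if (len > PAGE_SIZE)
	len = PAGE_SIZE;
res = access_process_vm(task, mm->env_start, buffer, len, 0);
out:
mmput(mm);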
@@ -279,7 +283,7 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
279static int proc_pid_wchan(struct task_struct *task, char *buffer) 283static int proc_pid_wchan(struct task_struct *task, char *buffer)
280{ 284{
281 unsigned long wchan; 285 unsigned long wchan;
282 char symname[KSYM_NAME_LEN+1]; 286 char symname[KSYM_NAME_LEN];
283 287
284 wchan = get_wchan(task); 288 wchan = get_wchan(task);
285 289
@@ -812,71 +816,6 @@ static const struct file_operations proc_loginuid_operations = {
812}; 816};
813#endif 817#endif
814 818
815#ifdef CONFIG_SECCOMP
816static ssize_t seccomp_read(struct file *file, char __user *buf,
817 size_t count, loff_t *ppos)
818{
819 struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
820 char __buf[20];
821 size_t len;
822
823 if (!tsk)
824 return -ESRCH;
825 /* no need to print the trailing zero, so use only len */
826 len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
827 put_task_struct(tsk);
828
829 return simple_read_from_buffer(buf, count, ppos, __buf, len);
830}
831
832static ssize_t seccomp_write(struct file *file, const char __user *buf,
833 size_t count, loff_t *ppos)
834{
835 struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
836 char __buf[20], *end;
837 unsigned int seccomp_mode;
838 ssize_t result;
839
840 result = -ESRCH;
841 if (!tsk)
842 goto out_no_task;
843
844 /* can set it only once to be even more secure */
845 result = -EPERM;
846 if (unlikely(tsk->seccomp.mode))
847 goto out;
848
849 result = -EFAULT;
850 memset(__buf, 0, sizeof(__buf));
851 count = min(count, sizeof(__buf) - 1);
852 if (copy_from_user(__buf, buf, count))
853 goto out;
854
855 seccomp_mode = simple_strtoul(__buf, &end, 0);
856 if (*end == '\n')
857 end++;
858 result = -EINVAL;
859 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
860 tsk->seccomp.mode = seccomp_mode;
861 set_tsk_thread_flag(tsk, TIF_SECCOMP);
862 } else
863 goto out;
864 result = -EIO;
865 if (unlikely(!(end - __buf)))
866 goto out;
867 result = end - __buf;
868out:
869 put_task_struct(tsk);
870out_no_task:
871 return result;
872}
873
874static const struct file_operations proc_seccomp_operations = {
875 .read = seccomp_read,
876 .write = seccomp_write,
877};
878#endif /* CONFIG_SECCOMP */
879
880#ifdef CONFIG_FAULT_INJECTION 819#ifdef CONFIG_FAULT_INJECTION
881static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, 820static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
882 size_t count, loff_t *ppos) 821 size_t count, loff_t *ppos)
@@ -2037,9 +1976,6 @@ static const struct pid_entry tgid_base_stuff[] = {
2037 REG("numa_maps", S_IRUGO, numa_maps), 1976 REG("numa_maps", S_IRUGO, numa_maps),
2038#endif 1977#endif
2039 REG("mem", S_IRUSR|S_IWUSR, mem), 1978 REG("mem", S_IRUSR|S_IWUSR, mem),
2040#ifdef CONFIG_SECCOMP
2041 REG("seccomp", S_IRUSR|S_IWUSR, seccomp),
2042#endif
2043 LNK("cwd", cwd), 1979 LNK("cwd", cwd),
2044 LNK("root", root), 1980 LNK("root", root),
2045 LNK("exe", exe), 1981 LNK("exe", exe),
@@ -2324,9 +2260,6 @@ static const struct pid_entry tid_base_stuff[] = {
2324 REG("numa_maps", S_IRUGO, numa_maps), 2260 REG("numa_maps", S_IRUGO, numa_maps),
2325#endif 2261#endif
2326 REG("mem", S_IRUSR|S_IWUSR, mem), 2262 REG("mem", S_IRUSR|S_IWUSR, mem),
2327#ifdef CONFIG_SECCOMP
2328 REG("seccomp", S_IRUSR|S_IWUSR, seccomp),
2329#endif
2330 LNK("cwd", cwd), 2263 LNK("cwd", cwd),
2331 LNK("root", root), 2264 LNK("root", root),
2332 LNK("exe", exe), 2265 LNK("exe", exe),
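The removed /proc/<pid>/seccomp file was superseded by prctl(PR_SET_SECCOMP), added around the same time; a minimal user of the replacement interface (userspace sketch; the constant is defined defensively in case this era's headers do not export it):

#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_SET_SECCOMP
#define PR_SET_SECCOMP	22	/* from <linux/prctl.h> of this era */
#endif

int main(void)
{
	/* Mode 1 ("strict"): afterwards only read, write, _exit and
	 * sigreturn are permitted; any other syscall is fatal. */
	if (prctl(PR_SET_SECCOMP, 1) < 0)
		return 1;
	write(1, "sandboxed\n", 10);
	_exit(0);
}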
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8a40e15f5ecb..b5e7155d30d8 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -20,6 +20,7 @@
20#include <linux/namei.h> 20#include <linux/namei.h>
21#include <linux/bitops.h> 21#include <linux/bitops.h>
22#include <linux/spinlock.h> 22#include <linux/spinlock.h>
23#include <linux/completion.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24 25
25#include "internal.h" 26#include "internal.h"
@@ -529,12 +530,6 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
529 return -EAGAIN; 530 return -EAGAIN;
530 dp->low_ino = i; 531 dp->low_ino = i;
531 532
532 spin_lock(&proc_subdir_lock);
533 dp->next = dir->subdir;
534 dp->parent = dir;
535 dir->subdir = dp;
536 spin_unlock(&proc_subdir_lock);
537
538 if (S_ISDIR(dp->mode)) { 533 if (S_ISDIR(dp->mode)) {
539 if (dp->proc_iops == NULL) { 534 if (dp->proc_iops == NULL) {
540 dp->proc_fops = &proc_dir_operations; 535 dp->proc_fops = &proc_dir_operations;
@@ -550,6 +545,13 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
550 if (dp->proc_iops == NULL) 545 if (dp->proc_iops == NULL)
551 dp->proc_iops = &proc_file_inode_operations; 546 dp->proc_iops = &proc_file_inode_operations;
552 } 547 }
548
549 spin_lock(&proc_subdir_lock);
550 dp->next = dir->subdir;
551 dp->parent = dir;
552 dir->subdir = dp;
553 spin_unlock(&proc_subdir_lock);
554
553 return 0; 555 return 0;
554} 556}
555 557
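Deferring the dir->subdir linking until after ->proc_fops/->proc_iops are filled in follows the publish-last rule: an entry must not become reachable from the shared tree while half-initialized. Schematically (a fragment of the flow above):

/* 1. initialize the entry completely ... */
dp->proc_fops = &proc_dir_operations;
dp->proc_iops = &proc_dir_inode_operations;

/* 2. ... then publish it under the lock, where lookups can find it */
spin_lock(&proc_subdir_lock);
dp->next = dir->subdir;
dp->parent = dir;
dir->subdir = dp;
spin_unlock(&proc_subdir_lock);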
@@ -613,6 +615,9 @@ static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
613 ent->namelen = len; 615 ent->namelen = len;
614 ent->mode = mode; 616 ent->mode = mode;
615 ent->nlink = nlink; 617 ent->nlink = nlink;
618 ent->pde_users = 0;
619 spin_lock_init(&ent->pde_unload_lock);
620 ent->pde_unload_completion = NULL;
616 out: 621 out:
617 return ent; 622 return ent;
618} 623}
@@ -649,9 +654,6 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
649 654
650 ent = proc_create(&parent, name, S_IFDIR | mode, 2); 655 ent = proc_create(&parent, name, S_IFDIR | mode, 2);
651 if (ent) { 656 if (ent) {
652 ent->proc_fops = &proc_dir_operations;
653 ent->proc_iops = &proc_dir_inode_operations;
654
655 if (proc_register(parent, ent) < 0) { 657 if (proc_register(parent, ent) < 0) {
656 kfree(ent); 658 kfree(ent);
657 ent = NULL; 659 ent = NULL;
@@ -686,10 +688,6 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
686 688
687 ent = proc_create(&parent,name,mode,nlink); 689 ent = proc_create(&parent,name,mode,nlink);
688 if (ent) { 690 if (ent) {
689 if (S_ISDIR(mode)) {
690 ent->proc_fops = &proc_dir_operations;
691 ent->proc_iops = &proc_dir_inode_operations;
692 }
693 if (proc_register(parent, ent) < 0) { 691 if (proc_register(parent, ent) < 0) {
694 kfree(ent); 692 kfree(ent);
695 ent = NULL; 693 ent = NULL;
@@ -734,9 +732,35 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
734 de = *p; 732 de = *p;
735 *p = de->next; 733 *p = de->next;
736 de->next = NULL; 734 de->next = NULL;
735
736 spin_lock(&de->pde_unload_lock);
737 /*
738 * Stop accepting new callers into module. If you're
739 * dynamically allocating ->proc_fops, save a pointer somewhere.
740 */
741 de->proc_fops = NULL;
742 /* Wait until all existing callers into module are done. */
743 if (de->pde_users > 0) {
744 DECLARE_COMPLETION_ONSTACK(c);
745
746 if (!de->pde_unload_completion)
747 de->pde_unload_completion = &c;
748
749 spin_unlock(&de->pde_unload_lock);
750 spin_unlock(&proc_subdir_lock);
751
752 wait_for_completion(de->pde_unload_completion);
753
754 spin_lock(&proc_subdir_lock);
755 goto continue_removing;
756 }
757 spin_unlock(&de->pde_unload_lock);
758
759continue_removing:
737 if (S_ISDIR(de->mode)) 760 if (S_ISDIR(de->mode))
738 parent->nlink--; 761 parent->nlink--;
739 proc_kill_inodes(de); 762 if (!S_ISREG(de->mode))
763 proc_kill_inodes(de);
740 de->nlink = 0; 764 de->nlink = 0;
741 WARN_ON(de->subdir); 765 WARN_ON(de->subdir);
742 if (!atomic_read(&de->count)) 766 if (!atomic_read(&de->count))
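This removal path pairs with the pde_users accounting introduced in fs/proc/inode.c below: each proc_reg_* wrapper pins the entry under pde_unload_lock before calling into the module, and remove_proc_entry() clears ->proc_fops and then sleeps on the completion until the count drains. The two-sided protocol, schematically (method is a stand-in name):

/* user side -- see the proc_reg_* wrappers that follow */
spin_lock(&pde->pde_unload_lock);
if (!pde->proc_fops) {			/* removal in progress: refuse */
	spin_unlock(&pde->pde_unload_lock);
	return -EIO;
}
pde->pde_users++;			/* keep the module's fops alive */
method = pde->proc_fops->read;		/* snapshot under the lock */
spin_unlock(&pde->pde_unload_lock);

/* ... call method() with no locks held ... */

pde_users_dec(pde);			/* last user wakes the remover */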
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d5ce65c68d7b..dd28e86ab422 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -10,6 +10,7 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/stat.h> 12#include <linux/stat.h>
13#include <linux/completion.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include <linux/limits.h> 15#include <linux/limits.h>
15#include <linux/init.h> 16#include <linux/init.h>
@@ -140,6 +141,251 @@ static const struct super_operations proc_sops = {
140 .remount_fs = proc_remount, 141 .remount_fs = proc_remount,
141}; 142};
142 143
144static void pde_users_dec(struct proc_dir_entry *pde)
145{
146 spin_lock(&pde->pde_unload_lock);
147 pde->pde_users--;
148 if (pde->pde_unload_completion && pde->pde_users == 0)
149 complete(pde->pde_unload_completion);
150 spin_unlock(&pde->pde_unload_lock);
151}
152
153static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
154{
155 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
156 loff_t rv = -EINVAL;
157 loff_t (*llseek)(struct file *, loff_t, int);
158
159 spin_lock(&pde->pde_unload_lock);
160 /*
161 * remove_proc_entry() is going to delete PDE (as part of module
162 * cleanup sequence). No new callers into module allowed.
163 */
164 if (!pde->proc_fops) {
165 spin_unlock(&pde->pde_unload_lock);
166 return rv;
167 }
168 /*
169 * Bump refcount so that remove_proc_entry will wait for ->llseek to
170 * complete.
171 */
172 pde->pde_users++;
173 /*
174 * Save function pointer under lock, to protect against ->proc_fops
175 * NULL'ifying right after ->pde_unload_lock is dropped.
176 */
177 llseek = pde->proc_fops->llseek;
178 spin_unlock(&pde->pde_unload_lock);
179
180 if (!llseek)
181 llseek = default_llseek;
182 rv = llseek(file, offset, whence);
183
184 pde_users_dec(pde);
185 return rv;
186}
187
188static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
189{
190 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
191 ssize_t rv = -EIO;
192 ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
193
194 spin_lock(&pde->pde_unload_lock);
195 if (!pde->proc_fops) {
196 spin_unlock(&pde->pde_unload_lock);
197 return rv;
198 }
199 pde->pde_users++;
200 read = pde->proc_fops->read;
201 spin_unlock(&pde->pde_unload_lock);
202
203 if (read)
204 rv = read(file, buf, count, ppos);
205
206 pde_users_dec(pde);
207 return rv;
208}
209
210static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
211{
212 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
213 ssize_t rv = -EIO;
214 ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
215
216 spin_lock(&pde->pde_unload_lock);
217 if (!pde->proc_fops) {
218 spin_unlock(&pde->pde_unload_lock);
219 return rv;
220 }
221 pde->pde_users++;
222 write = pde->proc_fops->write;
223 spin_unlock(&pde->pde_unload_lock);
224
225 if (write)
226 rv = write(file, buf, count, ppos);
227
228 pde_users_dec(pde);
229 return rv;
230}
231
232static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)
233{
234 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
235 unsigned int rv = 0;
236 unsigned int (*poll)(struct file *, struct poll_table_struct *);
237
238 spin_lock(&pde->pde_unload_lock);
239 if (!pde->proc_fops) {
240 spin_unlock(&pde->pde_unload_lock);
241 return rv;
242 }
243 pde->pde_users++;
244 poll = pde->proc_fops->poll;
245 spin_unlock(&pde->pde_unload_lock);
246
247 if (poll)
248 rv = poll(file, pts);
249
250 pde_users_dec(pde);
251 return rv;
252}
253
254static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
255{
256 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
257 long rv = -ENOTTY;
258 long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
259 int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long);
260
261 spin_lock(&pde->pde_unload_lock);
262 if (!pde->proc_fops) {
263 spin_unlock(&pde->pde_unload_lock);
264 return rv;
265 }
266 pde->pde_users++;
267 unlocked_ioctl = pde->proc_fops->unlocked_ioctl;
268 ioctl = pde->proc_fops->ioctl;
269 spin_unlock(&pde->pde_unload_lock);
270
271 if (unlocked_ioctl) {
272 rv = unlocked_ioctl(file, cmd, arg);
273 if (rv == -ENOIOCTLCMD)
274 rv = -EINVAL;
275 } else if (ioctl) {
276 lock_kernel();
277 rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
278 unlock_kernel();
279 }
280
281 pde_users_dec(pde);
282 return rv;
283}
284
285#ifdef CONFIG_COMPAT
286static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
287{
288 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
289 long rv = -ENOTTY;
290 long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
291
292 spin_lock(&pde->pde_unload_lock);
293 if (!pde->proc_fops) {
294 spin_unlock(&pde->pde_unload_lock);
295 return rv;
296 }
297 pde->pde_users++;
298 compat_ioctl = pde->proc_fops->compat_ioctl;
299 spin_unlock(&pde->pde_unload_lock);
300
301 if (compat_ioctl)
302 rv = compat_ioctl(file, cmd, arg);
303
304 pde_users_dec(pde);
305 return rv;
306}
307#endif
308
309static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
310{
311 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
312 int rv = -EIO;
313 int (*mmap)(struct file *, struct vm_area_struct *);
314
315 spin_lock(&pde->pde_unload_lock);
316 if (!pde->proc_fops) {
317 spin_unlock(&pde->pde_unload_lock);
318 return rv;
319 }
320 pde->pde_users++;
321 mmap = pde->proc_fops->mmap;
322 spin_unlock(&pde->pde_unload_lock);
323
324 if (mmap)
325 rv = mmap(file, vma);
326
327 pde_users_dec(pde);
328 return rv;
329}
330
331static int proc_reg_open(struct inode *inode, struct file *file)
332{
333 struct proc_dir_entry *pde = PDE(inode);
334 int rv = 0;
335 int (*open)(struct inode *, struct file *);
336
337 spin_lock(&pde->pde_unload_lock);
338 if (!pde->proc_fops) {
339 spin_unlock(&pde->pde_unload_lock);
340 return rv;
341 }
342 pde->pde_users++;
343 open = pde->proc_fops->open;
344 spin_unlock(&pde->pde_unload_lock);
345
346 if (open)
347 rv = open(inode, file);
348
349 pde_users_dec(pde);
350 return rv;
351}
352
353static int proc_reg_release(struct inode *inode, struct file *file)
354{
355 struct proc_dir_entry *pde = PDE(inode);
356 int rv = 0;
357 int (*release)(struct inode *, struct file *);
358
359 spin_lock(&pde->pde_unload_lock);
360 if (!pde->proc_fops) {
361 spin_unlock(&pde->pde_unload_lock);
362 return rv;
363 }
364 pde->pde_users++;
365 release = pde->proc_fops->release;
366 spin_unlock(&pde->pde_unload_lock);
367
368 if (release)
369 rv = release(inode, file);
370
371 pde_users_dec(pde);
372 return rv;
373}
374
375static const struct file_operations proc_reg_file_ops = {
376 .llseek = proc_reg_llseek,
377 .read = proc_reg_read,
378 .write = proc_reg_write,
379 .poll = proc_reg_poll,
380 .unlocked_ioctl = proc_reg_unlocked_ioctl,
381#ifdef CONFIG_COMPAT
382 .compat_ioctl = proc_reg_compat_ioctl,
383#endif
384 .mmap = proc_reg_mmap,
385 .open = proc_reg_open,
386 .release = proc_reg_release,
387};
388
143struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, 389struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
144 struct proc_dir_entry *de) 390 struct proc_dir_entry *de)
145{ 391{
@@ -166,8 +412,12 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
166 inode->i_nlink = de->nlink; 412 inode->i_nlink = de->nlink;
167 if (de->proc_iops) 413 if (de->proc_iops)
168 inode->i_op = de->proc_iops; 414 inode->i_op = de->proc_iops;
169 if (de->proc_fops) 415 if (de->proc_fops) {
170 inode->i_fop = de->proc_fops; 416 if (S_ISREG(inode->i_mode))
417 inode->i_fop = &proc_reg_file_ops;
418 else
419 inode->i_fop = de->proc_fops;
420 }
171 } 421 }
172 422
173 return inode; 423 return inode;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5fd49e47f83a..d24b8d46059a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -105,6 +105,7 @@ static int uptime_read_proc(char *page, char **start, off_t off,
105 cputime_t idletime = cputime_add(init_task.utime, init_task.stime); 105 cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
106 106
107 do_posix_clock_monotonic_gettime(&uptime); 107 do_posix_clock_monotonic_gettime(&uptime);
108 monotonic_to_bootbased(&uptime);
108 cputime_to_timespec(idletime, &idle); 109 cputime_to_timespec(idletime, &idle);
109 len = sprintf(page,"%lu.%02lu %lu.%02lu\n", 110 len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
110 (unsigned long) uptime.tv_sec, 111 (unsigned long) uptime.tv_sec,
@@ -443,12 +444,12 @@ static int show_stat(struct seq_file *p, void *v)
443 unsigned long jif; 444 unsigned long jif;
444 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; 445 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
445 u64 sum = 0; 446 u64 sum = 0;
447 struct timespec boottime;
446 448
447 user = nice = system = idle = iowait = 449 user = nice = system = idle = iowait =
448 irq = softirq = steal = cputime64_zero; 450 irq = softirq = steal = cputime64_zero;
449 jif = - wall_to_monotonic.tv_sec; 451 getboottime(&boottime);
450 if (wall_to_monotonic.tv_nsec) 452 jif = boottime.tv_sec;
451 --jif;
452 453
453 for_each_possible_cpu(i) { 454 for_each_possible_cpu(i) {
454 int j; 455 int j;
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index b3a473b0a191..22846225acfa 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -69,7 +69,7 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p,
69 69
70static int show_tty_driver(struct seq_file *m, void *v) 70static int show_tty_driver(struct seq_file *m, void *v)
71{ 71{
72 struct tty_driver *p = v; 72 struct tty_driver *p = list_entry(v, struct tty_driver, tty_drivers);
73 dev_t from = MKDEV(p->major, p->minor_start); 73 dev_t from = MKDEV(p->major, p->minor_start);
74 dev_t to = from + p->num; 74 dev_t to = from + p->num;
75 75
@@ -106,22 +106,13 @@ static int show_tty_driver(struct seq_file *m, void *v)
106/* iterator */ 106/* iterator */
107static void *t_start(struct seq_file *m, loff_t *pos) 107static void *t_start(struct seq_file *m, loff_t *pos)
108{ 108{
109 struct list_head *p;
110 loff_t l = *pos;
111
112 mutex_lock(&tty_mutex); 109 mutex_lock(&tty_mutex);
113 list_for_each(p, &tty_drivers) 110 return seq_list_start(&tty_drivers, *pos);
114 if (!l--)
115 return list_entry(p, struct tty_driver, tty_drivers);
116 return NULL;
117} 111}
118 112
119static void *t_next(struct seq_file *m, void *v, loff_t *pos) 113static void *t_next(struct seq_file *m, void *v, loff_t *pos)
120{ 114{
121 struct list_head *p = ((struct tty_driver *)v)->tty_drivers.next; 115 return seq_list_next(v, &tty_drivers, pos);
122 (*pos)++;
123 return p==&tty_drivers ? NULL :
124 list_entry(p, struct tty_driver, tty_drivers);
125} 116}
126 117
127static void t_stop(struct seq_file *m, void *v) 118static void t_stop(struct seq_file *m, void *v)
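seq_list_start()/seq_list_next() fold the open-coded list walk into library helpers; any list-backed seq_file iterator reduces to this shape (sketch with a hypothetical my_list/my_lock):

static LIST_HEAD(my_list);
static DEFINE_MUTEX(my_lock);

static void *my_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&my_lock);
	return seq_list_start(&my_list, *pos);	/* NULL once *pos is past the end */
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next(v, &my_list, pos);	/* advances *pos */
}

static void my_stop(struct seq_file *m, void *v)
{
	mutex_unlock(&my_lock);
}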
diff --git a/fs/quota.c b/fs/quota.c
index 9f237d6182c9..e6577ac15a6c 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -10,12 +10,14 @@
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <asm/current.h> 11#include <asm/current.h>
12#include <asm/uaccess.h> 12#include <asm/uaccess.h>
13#include <linux/compat.h>
13#include <linux/kernel.h> 14#include <linux/kernel.h>
14#include <linux/security.h> 15#include <linux/security.h>
15#include <linux/syscalls.h> 16#include <linux/syscalls.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
17#include <linux/capability.h> 18#include <linux/capability.h>
18#include <linux/quotaops.h> 19#include <linux/quotaops.h>
20#include <linux/types.h>
19 21
20/* Check validity of generic quotactl commands */ 22/* Check validity of generic quotactl commands */
21static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) 23static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id)
@@ -384,3 +386,119 @@ asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t
384 386
385 return ret; 387 return ret;
386} 388}
389
390#if defined(CONFIG_X86_64) || defined(CONFIG_IA64)
391/*
392 * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
393 * and is necessary due to alignment problems.
394 */
395struct compat_if_dqblk {
396 compat_u64 dqb_bhardlimit;
397 compat_u64 dqb_bsoftlimit;
398 compat_u64 dqb_curspace;
399 compat_u64 dqb_ihardlimit;
400 compat_u64 dqb_isoftlimit;
401 compat_u64 dqb_curinodes;
402 compat_u64 dqb_btime;
403 compat_u64 dqb_itime;
404 compat_uint_t dqb_valid;
405};
406
407/* XFS structures */
408struct compat_fs_qfilestat {
409 compat_u64 dqb_bhardlimit;
410 compat_u64 qfs_nblks;
411 compat_uint_t qfs_nextents;
412};
413
414struct compat_fs_quota_stat {
415 __s8 qs_version;
416 __u16 qs_flags;
417 __s8 qs_pad;
418 struct compat_fs_qfilestat qs_uquota;
419 struct compat_fs_qfilestat qs_gquota;
420 compat_uint_t qs_incoredqs;
421 compat_int_t qs_btimelimit;
422 compat_int_t qs_itimelimit;
423 compat_int_t qs_rtbtimelimit;
424 __u16 qs_bwarnlimit;
425 __u16 qs_iwarnlimit;
426};
427
428asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
429 qid_t id, void __user *addr)
430{
431 unsigned int cmds;
432 struct if_dqblk __user *dqblk;
433 struct compat_if_dqblk __user *compat_dqblk;
434 struct fs_quota_stat __user *fsqstat;
435 struct compat_fs_quota_stat __user *compat_fsqstat;
436 compat_uint_t data;
437 u16 xdata;
438 long ret;
439
440 cmds = cmd >> SUBCMDSHIFT;
441
442 switch (cmds) {
443 case Q_GETQUOTA:
444 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
445 compat_dqblk = addr;
446 ret = sys_quotactl(cmd, special, id, dqblk);
447 if (ret)
448 break;
449 if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
450 get_user(data, &dqblk->dqb_valid) ||
451 put_user(data, &compat_dqblk->dqb_valid))
452 ret = -EFAULT;
453 break;
454 case Q_SETQUOTA:
455 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
456 compat_dqblk = addr;
457 ret = -EFAULT;
458 if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
459 get_user(data, &compat_dqblk->dqb_valid) ||
460 put_user(data, &dqblk->dqb_valid))
461 break;
462 ret = sys_quotactl(cmd, special, id, dqblk);
463 break;
464 case Q_XGETQSTAT:
465 fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
466 compat_fsqstat = addr;
467 ret = sys_quotactl(cmd, special, id, fsqstat);
468 if (ret)
469 break;
470 ret = -EFAULT;
471 /* Copying qs_version, qs_flags, qs_pad */
472 if (copy_in_user(compat_fsqstat, fsqstat,
473 offsetof(struct compat_fs_quota_stat, qs_uquota)))
474 break;
475 /* Copying qs_uquota */
476 if (copy_in_user(&compat_fsqstat->qs_uquota,
477 &fsqstat->qs_uquota,
478 sizeof(compat_fsqstat->qs_uquota)) ||
479 get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
480 put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
481 break;
482 /* Copying qs_gquota */
483 if (copy_in_user(&compat_fsqstat->qs_gquota,
484 &fsqstat->qs_gquota,
485 sizeof(compat_fsqstat->qs_gquota)) ||
486 get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
487 put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
488 break;
489 /* Copying the rest */
490 if (copy_in_user(&compat_fsqstat->qs_incoredqs,
491 &fsqstat->qs_incoredqs,
492 sizeof(struct compat_fs_quota_stat) -
493 offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
494 get_user(xdata, &fsqstat->qs_iwarnlimit) ||
495 put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
496 break;
497 ret = 0;
498 break;
499 default:
500 ret = sys_quotactl(cmd, special, id, addr);
501 }
502 return ret;
503}
504#endif
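All three translated commands use the standard compat bounce pattern: allocate a native-layout struct in user-addressable space with compat_alloc_user_space(), convert field by field with copy_in_user()/get_user()/put_user(), and delegate to the native syscall. The Q_GETQUOTA arm, condensed:

dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
ret = sys_quotactl(cmd, special, id, dqblk);	/* fill native layout */
if (!ret &&
    (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
     get_user(data, &dqblk->dqb_valid) ||	/* dqb_valid sits past the */
     put_user(data, &compat_dqblk->dqb_valid)))	/* differently-packed tail */
	ret = -EFAULT;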
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d40d22b347b7..ef2b46d099ff 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -60,6 +60,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
60 inode->i_blocks = 0; 60 inode->i_blocks = 0;
61 inode->i_mapping->a_ops = &ramfs_aops; 61 inode->i_mapping->a_ops = &ramfs_aops;
62 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 62 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
63 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
63 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 64 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
64 switch (mode & S_IFMT) { 65 switch (mode & S_IFMT) {
65 default: 66 default:
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 30eebfb1b2d8..2070aeee2a52 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1305,7 +1305,6 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
1305 if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 && 1305 if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
1306 *ppos + count > MAX_NON_LFS) { 1306 *ppos + count > MAX_NON_LFS) {
1307 if (*ppos >= MAX_NON_LFS) { 1307 if (*ppos >= MAX_NON_LFS) {
1308 send_sig(SIGXFSZ, current, 0);
1309 return -EFBIG; 1308 return -EFBIG;
1310 } 1309 }
1311 if (count > MAX_NON_LFS - (unsigned long)*ppos) 1310 if (count > MAX_NON_LFS - (unsigned long)*ppos)
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 1272d11399fb..ddde489f1cb2 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -7,6 +7,7 @@
7#include <linux/reiserfs_fs.h> 7#include <linux/reiserfs_fs.h>
8#include <linux/reiserfs_acl.h> 8#include <linux/reiserfs_acl.h>
9#include <linux/reiserfs_xattr.h> 9#include <linux/reiserfs_xattr.h>
10#include <linux/exportfs.h>
10#include <linux/smp_lock.h> 11#include <linux/smp_lock.h>
11#include <linux/pagemap.h> 12#include <linux/pagemap.h>
12#include <linux/highmem.h> 13#include <linux/highmem.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b4ac9119200e..5a93cfe1a032 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -21,6 +21,7 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/exportfs.h>
24#include <linux/vfs.h> 25#include <linux/vfs.h>
25#include <linux/mnt_namespace.h> 26#include <linux/mnt_namespace.h>
26#include <linux/mount.h> 27#include <linux/mount.h>
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 49194a4e6b91..bbb19be260ce 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -177,21 +177,23 @@ EXPORT_SYMBOL(seq_read);
177 177
178static int traverse(struct seq_file *m, loff_t offset) 178static int traverse(struct seq_file *m, loff_t offset)
179{ 179{
180 loff_t pos = 0; 180 loff_t pos = 0, index;
181 int error = 0; 181 int error = 0;
182 void *p; 182 void *p;
183 183
184 m->version = 0; 184 m->version = 0;
185 m->index = 0; 185 index = 0;
186 m->count = m->from = 0; 186 m->count = m->from = 0;
187 if (!offset) 187 if (!offset) {
188 m->index = index;
188 return 0; 189 return 0;
190 }
189 if (!m->buf) { 191 if (!m->buf) {
190 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); 192 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
191 if (!m->buf) 193 if (!m->buf)
192 return -ENOMEM; 194 return -ENOMEM;
193 } 195 }
194 p = m->op->start(m, &m->index); 196 p = m->op->start(m, &index);
195 while (p) { 197 while (p) {
196 error = PTR_ERR(p); 198 error = PTR_ERR(p);
197 if (IS_ERR(p)) 199 if (IS_ERR(p))
@@ -204,15 +206,17 @@ static int traverse(struct seq_file *m, loff_t offset)
204 if (pos + m->count > offset) { 206 if (pos + m->count > offset) {
205 m->from = offset - pos; 207 m->from = offset - pos;
206 m->count -= m->from; 208 m->count -= m->from;
209 m->index = index;
207 break; 210 break;
208 } 211 }
209 pos += m->count; 212 pos += m->count;
210 m->count = 0; 213 m->count = 0;
211 if (pos == offset) { 214 if (pos == offset) {
212 m->index++; 215 index++;
216 m->index = index;
213 break; 217 break;
214 } 218 }
215 p = m->op->next(m, p, &m->index); 219 p = m->op->next(m, p, &index);
216 } 220 }
217 m->op->stop(m, p); 221 m->op->stop(m, p);
218 return error; 222 return error;
@@ -260,8 +264,8 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
260 } 264 }
261 } 265 }
262 } 266 }
263 mutex_unlock(&m->lock);
264 file->f_version = m->version; 267 file->f_version = m->version;
268 mutex_unlock(&m->lock);
265 return retval; 269 return retval;
266} 270}
267EXPORT_SYMBOL(seq_lseek); 271EXPORT_SYMBOL(seq_lseek);
diff --git a/fs/splice.c b/fs/splice.c
index 6c9828651e6f..53fc2082a468 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1061,8 +1061,9 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1061 1061
1062 while (len) { 1062 while (len) {
1063 size_t read_len; 1063 size_t read_len;
1064 loff_t pos = sd->pos;
1064 1065
1065 ret = do_splice_to(in, &sd->pos, pipe, len, flags); 1066 ret = do_splice_to(in, &pos, pipe, len, flags);
1066 if (unlikely(ret <= 0)) 1067 if (unlikely(ret <= 0))
1067 goto out_release; 1068 goto out_release;
1068 1069
@@ -1080,6 +1081,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1080 1081
1081 bytes += ret; 1082 bytes += ret;
1082 len -= ret; 1083 len -= ret;
1084 sd->pos = pos;
1083 1085
1084 if (ret < read_len) 1086 if (ret < read_len)
1085 goto out_release; 1087 goto out_release;
diff --git a/fs/super.c b/fs/super.c
index 5260d620c555..fc8ebedc6bed 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -884,6 +884,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
884 error = type->get_sb(type, flags, name, data, mnt); 884 error = type->get_sb(type, flags, name, data, mnt);
885 if (error < 0) 885 if (error < 0)
886 goto out_free_secdata; 886 goto out_free_secdata;
887 BUG_ON(!mnt->mnt_sb);
887 888
888 error = security_sb_kern_mount(mnt->mnt_sb, secdata); 889 error = security_sb_kern_mount(mnt->mnt_sb, secdata);
889 if (error) 890 if (error)
diff --git a/fs/udf/crc.c b/fs/udf/crc.c
index 1b82a4adc2f7..ef2bfaa19d75 100644
--- a/fs/udf/crc.c
+++ b/fs/udf/crc.c
@@ -106,8 +106,8 @@ int main(void)
106{ 106{
107 unsigned short x; 107 unsigned short x;
108 108
109 x = udf_crc16(bytes, sizeof bytes); 109 x = udf_crc(bytes, sizeof bytes);
110 printf("udf_crc16: calculated = %4.4x, correct = %4.4x\n", x, 0x3299U); 110 printf("udf_crc: calculated = %4.4x, correct = %4.4x\n", x, 0x3299U);
111 111
112 return 0; 112 return 0;
113} 113}
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 8206983f2ebf..10f3188738af 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -50,7 +50,7 @@ void udf_free_inode(struct inode * inode)
50 else 50 else
51 UDF_SB_LVIDIU(sb)->numFiles = 51 UDF_SB_LVIDIU(sb)->numFiles =
52 cpu_to_le32(le32_to_cpu(UDF_SB_LVIDIU(sb)->numFiles) - 1); 52 cpu_to_le32(le32_to_cpu(UDF_SB_LVIDIU(sb)->numFiles) - 1);
53 53
54 mark_buffer_dirty(sbi->s_lvidbh); 54 mark_buffer_dirty(sbi->s_lvidbh);
55 } 55 }
56 mutex_unlock(&sbi->s_alloc_mutex); 56 mutex_unlock(&sbi->s_alloc_mutex);
@@ -136,6 +136,13 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err)
136 UDF_I_EFE(inode) = 0; 136 UDF_I_EFE(inode) = 0;
137 UDF_I_DATA(inode) = kzalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL); 137 UDF_I_DATA(inode) = kzalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL);
138 } 138 }
139 if (!UDF_I_DATA(inode))
140 {
141 iput(inode);
142 *err = -ENOMEM;
143 mutex_unlock(&sbi->s_alloc_mutex);
144 return NULL;
145 }
139 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) 146 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
140 UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_IN_ICB; 147 UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_IN_ICB;
141 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 148 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index bf7de0bdbab3..5b82e489af78 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -49,6 +49,7 @@ MODULE_LICENSE("GPL");
49static mode_t udf_convert_permissions(struct fileEntry *); 49static mode_t udf_convert_permissions(struct fileEntry *);
50static int udf_update_inode(struct inode *, int); 50static int udf_update_inode(struct inode *, int);
51static void udf_fill_inode(struct inode *, struct buffer_head *); 51static void udf_fill_inode(struct inode *, struct buffer_head *);
52static int udf_alloc_i_data(struct inode *inode, size_t size);
52static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, 53static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
53 long *, int *); 54 long *, int *);
54static int8_t udf_insert_aext(struct inode *, struct extent_position, 55static int8_t udf_insert_aext(struct inode *, struct extent_position,
@@ -734,7 +735,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset, int newbl
734 (*c) ++; 735 (*c) ++;
735 (*endnum) ++; 736 (*endnum) ++;
736 } 737 }
737 738
738 laarr[curr].extLocation.logicalBlockNum = newblocknum; 739 laarr[curr].extLocation.logicalBlockNum = newblocknum;
739 if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) 740 if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))
740 laarr[curr].extLocation.partitionReferenceNum = 741 laarr[curr].extLocation.partitionReferenceNum =
@@ -836,7 +837,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
836 { 837 {
837 numalloc -= elen; 838 numalloc -= elen;
838 if (*endnum > (i+1)) 839 if (*endnum > (i+1))
839 memmove(&laarr[i], &laarr[i+1], 840 memmove(&laarr[i], &laarr[i+1],
840 sizeof(long_ad) * (*endnum - (i+1))); 841 sizeof(long_ad) * (*endnum - (i+1)));
841 i --; 842 i --;
842 (*endnum) --; 843 (*endnum) --;
@@ -1024,7 +1025,7 @@ void udf_truncate(struct inode * inode)
1024 { 1025 {
1025 block_truncate_page(inode->i_mapping, inode->i_size, udf_get_block); 1026 block_truncate_page(inode->i_mapping, inode->i_size, udf_get_block);
1026 udf_truncate_extents(inode); 1027 udf_truncate_extents(inode);
1027 } 1028 }
1028 1029
1029 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1030 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
1030 if (IS_SYNC(inode)) 1031 if (IS_SYNC(inode))
@@ -1087,10 +1088,10 @@ __udf_read_inode(struct inode *inode)
1087 { 1088 {
1088 kernel_lb_addr loc; 1089 kernel_lb_addr loc;
1089 ie = (struct indirectEntry *)ibh->b_data; 1090 ie = (struct indirectEntry *)ibh->b_data;
1090 1091
1091 loc = lelb_to_cpu(ie->indirectICB.extLocation); 1092 loc = lelb_to_cpu(ie->indirectICB.extLocation);
1092 1093
1093 if (ie->indirectICB.extLength && 1094 if (ie->indirectICB.extLength &&
1094 (nbh = udf_read_ptagged(inode->i_sb, loc, 0, &ident))) 1095 (nbh = udf_read_ptagged(inode->i_sb, loc, 0, &ident)))
1095 { 1096 {
1096 if (ident == TAG_IDENT_FE || 1097 if (ident == TAG_IDENT_FE ||
@@ -1156,14 +1157,22 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1156 { 1157 {
1157 UDF_I_EFE(inode) = 1; 1158 UDF_I_EFE(inode) = 1;
1158 UDF_I_USE(inode) = 0; 1159 UDF_I_USE(inode) = 0;
1159 UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry), GFP_KERNEL); 1160 if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)))
1161 {
1162 make_bad_inode(inode);
1163 return;
1164 }
1160 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct extendedFileEntry), inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)); 1165 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct extendedFileEntry), inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry));
1161 } 1166 }
1162 else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_FE) 1167 else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_FE)
1163 { 1168 {
1164 UDF_I_EFE(inode) = 0; 1169 UDF_I_EFE(inode) = 0;
1165 UDF_I_USE(inode) = 0; 1170 UDF_I_USE(inode) = 0;
1166 UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL); 1171 if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - sizeof(struct fileEntry)))
1172 {
1173 make_bad_inode(inode);
1174 return;
1175 }
1167 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct fileEntry), inode->i_sb->s_blocksize - sizeof(struct fileEntry)); 1176 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct fileEntry), inode->i_sb->s_blocksize - sizeof(struct fileEntry));
1168 } 1177 }
1169 else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_USE) 1178 else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_USE)
@@ -1173,7 +1182,11 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1173 UDF_I_LENALLOC(inode) = 1182 UDF_I_LENALLOC(inode) =
1174 le32_to_cpu( 1183 le32_to_cpu(
1175 ((struct unallocSpaceEntry *)bh->b_data)->lengthAllocDescs); 1184 ((struct unallocSpaceEntry *)bh->b_data)->lengthAllocDescs);
1176 UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry), GFP_KERNEL); 1185 if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)))
1186 {
1187 make_bad_inode(inode);
1188 return;
1189 }
1177 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct unallocSpaceEntry), inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); 1190 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct unallocSpaceEntry), inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry));
1178 return; 1191 return;
1179 } 1192 }
@@ -1191,7 +1204,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1191 inode->i_nlink = le16_to_cpu(fe->fileLinkCount); 1204 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1192 if (!inode->i_nlink) 1205 if (!inode->i_nlink)
1193 inode->i_nlink = 1; 1206 inode->i_nlink = 1;
1194 1207
1195 inode->i_size = le64_to_cpu(fe->informationLength); 1208 inode->i_size = le64_to_cpu(fe->informationLength);
1196 UDF_I_LENEXTENTS(inode) = inode->i_size; 1209 UDF_I_LENEXTENTS(inode) = inode->i_size;
1197 1210
@@ -1243,7 +1256,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1243 } 1256 }
1244 else 1257 else
1245 { 1258 {
1246 inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << 1259 inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
1247 (inode->i_sb->s_blocksize_bits - 9); 1260 (inode->i_sb->s_blocksize_bits - 9);
1248 1261
1249 if ( udf_stamp_to_time(&convtime, &convtime_usec, 1262 if ( udf_stamp_to_time(&convtime, &convtime_usec,
@@ -1374,6 +1387,20 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1374 } 1387 }
1375} 1388}
1376 1389
1390static int udf_alloc_i_data(struct inode *inode, size_t size)
1391{
1392 UDF_I_DATA(inode) = kmalloc(size, GFP_KERNEL);
1393
1394 if (!UDF_I_DATA(inode))
1395 {
1396 printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) no free memory\n",
1397 inode->i_ino);
1398 return -ENOMEM;
1399 }
1400
1401 return 0;
1402}
1403
1377static mode_t 1404static mode_t
1378udf_convert_permissions(struct fileEntry *fe) 1405udf_convert_permissions(struct fileEntry *fe)
1379{ 1406{
@@ -2072,7 +2099,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
2072 mark_buffer_dirty_inode(oepos.bh, inode); 2099 mark_buffer_dirty_inode(oepos.bh, inode);
2073 } 2100 }
2074 } 2101 }
2075 2102
2076 brelse(epos.bh); 2103 brelse(epos.bh);
2077 brelse(oepos.bh); 2104 brelse(oepos.bh);
2078 return (elen >> 30); 2105 return (elen >> 30);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 22ff6ed55ce9..2b3011689e89 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -87,6 +87,7 @@
87#include <linux/smp_lock.h> 87#include <linux/smp_lock.h>
88#include <linux/buffer_head.h> 88#include <linux/buffer_head.h>
89#include <linux/vfs.h> 89#include <linux/vfs.h>
90#include <linux/log2.h>
90 91
91#include "swab.h" 92#include "swab.h"
92#include "util.h" 93#include "util.h"
@@ -854,7 +855,7 @@ magic_found:
854 uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); 855 uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask);
855 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); 856 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
856 857
857 if (uspi->s_fsize & (uspi->s_fsize - 1)) { 858 if (!is_power_of_2(uspi->s_fsize)) {
858 printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n", 859 printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n",
859 uspi->s_fsize); 860 uspi->s_fsize);
860 goto failed; 861 goto failed;
@@ -869,7 +870,7 @@ magic_found:
869 uspi->s_fsize); 870 uspi->s_fsize);
870 goto failed; 871 goto failed;
871 } 872 }
872 if (uspi->s_bsize & (uspi->s_bsize - 1)) { 873 if (!is_power_of_2(uspi->s_bsize)) {
873 printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n", 874 printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n",
874 uspi->s_bsize); 875 uspi->s_bsize);
875 goto failed; 876 goto failed;
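is_power_of_2() from the newly included <linux/log2.h> is the named form of the familiar bit trick, with the zero case handled explicitly; it is roughly:

/* Approximate shape of the helper in include/linux/log2.h. */
static inline __attribute__((const)) bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}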
diff --git a/fs/utimes.c b/fs/utimes.c
index b3c88952465f..83a7e69e706c 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -106,7 +106,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
 	if (IS_IMMUTABLE(inode))
 		goto dput_and_out;
 
-	if (current->fsuid != inode->i_uid) {
+	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) {
 		if (f) {
 			if (!(f->f_mode & FMODE_WRITE))
 				goto dput_and_out;
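
The utimes change lets a caller with CAP_FOWNER (typically root) set explicit timestamps on files it does not own; before, only the literal owner skipped this block, and every other caller, capable or not, fell through to the write-access checks. Condensed, the logic after the change reads roughly like this (a sketch; the real function also covers the pathname-based branch and the exact error codes):

	if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) {
		/* neither owner nor privileged */
		if (f) {
			/* fd-based utimes: the descriptor must be writable */
			if (!(f->f_mode & FMODE_WRITE))
				goto dput_and_out;
		}
	}
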
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 2df63622354e..b0f0e58866de 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -35,10 +35,13 @@
 #include <linux/freezer.h>
 
 static kmem_zone_t *xfs_buf_zone;
-static struct shrinker *xfs_buf_shake;
 STATIC int xfsbufd(void *);
 STATIC int xfsbufd_wakeup(int, gfp_t);
 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
+static struct shrinker xfs_buf_shake = {
+	.shrink = xfsbufd_wakeup,
+	.seeks = DEFAULT_SEEKS,
+};
 
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
@@ -1832,14 +1835,9 @@ xfs_buf_init(void)
 	if (!xfsdatad_workqueue)
 		goto out_destroy_xfslogd_workqueue;
 
-	xfs_buf_shake = set_shrinker(DEFAULT_SEEKS, xfsbufd_wakeup);
-	if (!xfs_buf_shake)
-		goto out_destroy_xfsdatad_workqueue;
-
+	register_shrinker(&xfs_buf_shake);
 	return 0;
 
- out_destroy_xfsdatad_workqueue:
-	destroy_workqueue(xfsdatad_workqueue);
 out_destroy_xfslogd_workqueue:
 	destroy_workqueue(xfslogd_workqueue);
 out_free_buf_zone:
@@ -1854,7 +1852,7 @@ xfs_buf_init(void)
 void
 xfs_buf_terminate(void)
 {
-	remove_shrinker(xfs_buf_shake);
+	unregister_shrinker(&xfs_buf_shake);
 	destroy_workqueue(xfsdatad_workqueue);
 	destroy_workqueue(xfslogd_workqueue);
 	kmem_zone_destroy(xfs_buf_zone);
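
Both xfs_buf.c hunks are part of the tree-wide shrinker API conversion in this series: set_shrinker() allocated the struct shrinker internally and could therefore fail, forcing callers to carry an unwind path, while register_shrinker() takes a caller-embedded, statically initialized structure and cannot fail, which is why the out_destroy_xfsdatad_workqueue label disappears above. The two patterns side by side, as a sketch (my_shrink stands in for any `int (*)(int nr_to_scan, gfp_t gfp_mask)` callback):

	/* Old API: registration allocates, so it may fail. */
	static struct shrinker *shaker;
	shaker = set_shrinker(DEFAULT_SEEKS, my_shrink);
	if (!shaker)
		goto out_err;
	/* ... */
	remove_shrinker(shaker);

	/* New API: the caller owns the structure; registration cannot fail. */
	static struct shrinker my_shrinker = {
		.shrink	= my_shrink,
		.seeks	= DEFAULT_SEEKS,
	};
	register_shrinker(&my_shrinker);
	/* ... */
	unregister_shrinker(&my_shrinker);
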
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 06894cf00b12..4528f9a3f304 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -562,6 +562,7 @@ xfssyncd(
 	bhv_vfs_sync_work_t	*work, *n;
 	LIST_HEAD		(tmp);
 
+	set_freezable();
 	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
 	for (;;) {
 		timeleft = schedule_timeout_interruptible(timeleft);
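
set_freezable() is needed because, as of the freezer rework in this same series, kernel threads start non-freezable by default; a daemon that should be parked across suspend must opt in explicitly and keep calling try_to_freeze() in its loop, as xfssyncd already does further down. A minimal freezable-loop sketch with a hypothetical thread function:

	#include <linux/freezer.h>
	#include <linux/kthread.h>

	static int my_daemon(void *arg)
	{
		set_freezable();	/* opt in: kthreads default to non-freezable */
		while (!kthread_should_stop()) {
			schedule_timeout_interruptible(HZ);
			try_to_freeze();	/* blocks here while the system suspends */
			/* ... periodic work ... */
		}
		return 0;
	}
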
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 33dd1ca13245..201cc3273c84 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -18,6 +18,8 @@
 #ifndef __XFS_SUPER_H__
 #define __XFS_SUPER_H__
 
+#include <linux/exportfs.h>
+
 #ifdef CONFIG_XFS_DMAPI
 # define vfs_insertdmapi(vfs)	vfs_insertops(vfsp, &xfs_dmops)
 # define vfs_initdmapi()	dmapi_init()
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 7def4c699343..2d274b23ade5 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -62,7 +62,6 @@ uint ndquot;
 
 kmem_zone_t	*qm_dqzone;
 kmem_zone_t	*qm_dqtrxzone;
-static struct shrinker *xfs_qm_shaker;
 
 static cred_t	xfs_zerocr;
 
@@ -78,6 +77,11 @@ STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
78STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 77STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
79STATIC int xfs_qm_shake(int, gfp_t); 78STATIC int xfs_qm_shake(int, gfp_t);
80 79
80static struct shrinker xfs_qm_shaker = {
81 .shrink = xfs_qm_shake,
82 .seeks = DEFAULT_SEEKS,
83};
84
81#ifdef DEBUG 85#ifdef DEBUG
82extern mutex_t qcheck_lock; 86extern mutex_t qcheck_lock;
83#endif 87#endif
@@ -149,7 +153,7 @@ xfs_Gqm_init(void)
 	} else
 		xqm->qm_dqzone = qm_dqzone;
 
-	xfs_qm_shaker = set_shrinker(DEFAULT_SEEKS, xfs_qm_shake);
+	register_shrinker(&xfs_qm_shaker);
 
 	/*
 	 * The t_dqinfo portion of transactions.
@@ -181,7 +185,7 @@ xfs_qm_destroy(
 
 	ASSERT(xqm != NULL);
 	ASSERT(xqm->qm_nrefs == 0);
-	remove_shrinker(xfs_qm_shaker);
+	unregister_shrinker(&xfs_qm_shaker);
 	hsize = xqm->qm_dqhashmask + 1;
 	for (i = 0; i < hsize; i++) {
 		xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));