aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorChris Metcalf <cmetcalf@tilera.com>2010-08-13 19:59:15 -0400
committerChris Metcalf <cmetcalf@tilera.com>2010-08-13 19:59:15 -0400
commit7d72e6fa56c4100b9669efe0044f77ed9eb785a1 (patch)
tree5e90bf4969809a1ab20b97432b85be20ccfaa1f4 /fs
parentba00376b0b13f234d839541a7b36a5bf5c2a4036 (diff)
parent2be1f3a73dd02e38e181cf5abacb3d45a6a2d6b8 (diff)
Merge branch 'master' into for-linus
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c15
-rw-r--r--fs/afs/cell.c56
-rw-r--r--fs/afs/dir.c47
-rw-r--r--fs/afs/inode.c86
-rw-r--r--fs/afs/internal.h23
-rw-r--r--fs/afs/mntpt.c78
-rw-r--r--fs/afs/proc.c2
-rw-r--r--fs/afs/rxrpc.c1
-rw-r--r--fs/afs/super.c20
-rw-r--r--fs/autofs4/root.c3
-rw-r--r--fs/block_dev.c10
-rw-r--r--fs/cachefiles/daemon.c32
-rw-r--r--fs/cachefiles/internal.h13
-rw-r--r--fs/ceph/Makefile2
-rw-r--r--fs/ceph/addr.c16
-rw-r--r--fs/ceph/armor.c6
-rw-r--r--fs/ceph/auth.c6
-rw-r--r--fs/ceph/auth_x.c6
-rw-r--r--fs/ceph/buffer.c16
-rw-r--r--fs/ceph/caps.c303
-rw-r--r--fs/ceph/ceph_frag.h4
-rw-r--r--fs/ceph/ceph_fs.c50
-rw-r--r--fs/ceph/ceph_fs.h87
-rw-r--r--fs/ceph/ceph_hash.h4
-rw-r--r--fs/ceph/ceph_strings.c3
-rw-r--r--fs/ceph/crush/crush.h4
-rw-r--r--fs/ceph/crush/hash.h4
-rw-r--r--fs/ceph/crush/mapper.h4
-rw-r--r--fs/ceph/crypto.c27
-rw-r--r--fs/ceph/crypto.h4
-rw-r--r--fs/ceph/debugfs.c21
-rw-r--r--fs/ceph/decode.h6
-rw-r--r--fs/ceph/dir.c10
-rw-r--r--fs/ceph/file.c32
-rw-r--r--fs/ceph/inode.c5
-rw-r--r--fs/ceph/ioctl.c24
-rw-r--r--fs/ceph/ioctl.h2
-rw-r--r--fs/ceph/locks.c256
-rw-r--r--fs/ceph/mds_client.c235
-rw-r--r--fs/ceph/mds_client.h30
-rw-r--r--fs/ceph/mdsmap.c6
-rw-r--r--fs/ceph/mdsmap.h8
-rw-r--r--fs/ceph/messenger.c23
-rw-r--r--fs/ceph/mon_client.c170
-rw-r--r--fs/ceph/mon_client.h5
-rw-r--r--fs/ceph/msgr.h4
-rw-r--r--fs/ceph/osd_client.c9
-rw-r--r--fs/ceph/osdmap.c37
-rw-r--r--fs/ceph/rados.h13
-rw-r--r--fs/ceph/super.c88
-rw-r--r--fs/ceph/super.h40
-rw-r--r--fs/ceph/xattr.c2
-rw-r--r--fs/cifs/README10
-rw-r--r--fs/dcache.c188
-rw-r--r--fs/exofs/file.c29
-rw-r--r--fs/exofs/inode.c9
-rw-r--r--fs/exofs/ios.c44
-rw-r--r--fs/exofs/super.c1
-rw-r--r--fs/fcntl.c15
-rw-r--r--fs/file.c57
-rw-r--r--fs/file_table.c28
-rw-r--r--fs/fs-writeback.c81
-rw-r--r--fs/fs_struct.c7
-rw-r--r--fs/fscache/internal.h14
-rw-r--r--fs/isofs/inode.c7
-rw-r--r--fs/namei.c15
-rw-r--r--fs/namespace.c6
-rw-r--r--fs/nfs/Kconfig17
-rw-r--r--fs/nfs/callback.c11
-rw-r--r--fs/nfs/dns_resolve.c24
-rw-r--r--fs/nfs/dns_resolve.h12
-rw-r--r--fs/notify/fanotify/fanotify.c8
-rw-r--r--fs/notify/fanotify/fanotify_user.c6
-rw-r--r--fs/notify/fsnotify.c12
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c12
-rw-r--r--fs/notify/notification.c33
-rw-r--r--fs/ocfs2/acl.c33
-rw-r--r--fs/ocfs2/cluster/tcp.c17
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c9
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c22
-rw-r--r--fs/ocfs2/dlm/dlmthread.c114
-rw-r--r--fs/ocfs2/refcounttree.c20
-rw-r--r--fs/open.c4
-rw-r--r--fs/partitions/acorn.c35
-rw-r--r--fs/partitions/amiga.c20
-rw-r--r--fs/partitions/atari.c12
-rw-r--r--fs/partitions/check.c22
-rw-r--r--fs/partitions/check.h6
-rw-r--r--fs/partitions/efi.c2
-rw-r--r--fs/partitions/ibm.c17
-rw-r--r--fs/partitions/karma.c2
-rw-r--r--fs/partitions/ldm.c4
-rw-r--r--fs/partitions/mac.c4
-rw-r--r--fs/partitions/msdos.c67
-rw-r--r--fs/partitions/osf.c2
-rw-r--r--fs/partitions/sgi.c2
-rw-r--r--fs/partitions/sun.c2
-rw-r--r--fs/partitions/sysv68.c9
-rw-r--r--fs/partitions/ultrix.c2
-rw-r--r--fs/proc/Makefile2
-rw-r--r--fs/proc/base.c24
-rw-r--r--fs/reiserfs/journal.c1
-rw-r--r--fs/signalfd.c2
-rw-r--r--fs/squashfs/Kconfig25
-rw-r--r--fs/squashfs/Makefile4
-rw-r--r--fs/squashfs/decompressor.c6
-rw-r--r--fs/squashfs/lzo_wrapper.c136
-rw-r--r--fs/squashfs/squashfs.h3
-rw-r--r--fs/squashfs/squashfs_fs.h20
-rw-r--r--fs/squashfs/xattr.c4
-rw-r--r--fs/squashfs/xattr.h2
-rw-r--r--fs/sysv/super.c73
112 files changed, 2320 insertions, 1013 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index d97c34a24f7a..c7c23eab9440 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1263,10 +1263,19 @@ static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
1263 return PTR_ERR(fid); 1263 return PTR_ERR(fid);
1264 1264
1265 retval = p9_client_setattr(fid, &p9attr); 1265 retval = p9_client_setattr(fid, &p9attr);
1266 if (retval >= 0) 1266 if (retval < 0)
1267 retval = inode_setattr(dentry->d_inode, iattr); 1267 return retval;
1268 1268
1269 return retval; 1269 if ((iattr->ia_valid & ATTR_SIZE) &&
1270 iattr->ia_size != i_size_read(dentry->d_inode)) {
1271 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
1272 if (retval)
1273 return retval;
1274 }
1275
1276 setattr_copy(dentry->d_inode, iattr);
1277 mark_inode_dirty(dentry->d_inode);
1278 return 0;
1270} 1279}
1271 1280
1272/** 1281/**
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index ffea35c63879..0d5eeadf6121 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -31,21 +31,20 @@ static struct afs_cell *afs_cell_root;
31 * allocate a cell record and fill in its name, VL server address list and 31 * allocate a cell record and fill in its name, VL server address list and
32 * allocate an anonymous key 32 * allocate an anonymous key
33 */ 33 */
34static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) 34static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen,
35 char *vllist)
35{ 36{
36 struct afs_cell *cell; 37 struct afs_cell *cell;
37 struct key *key; 38 struct key *key;
38 size_t namelen;
39 char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; 39 char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
40 char *dvllist = NULL, *_vllist = NULL; 40 char *dvllist = NULL, *_vllist = NULL;
41 char delimiter = ':'; 41 char delimiter = ':';
42 int ret; 42 int ret;
43 43
44 _enter("%s,%s", name, vllist); 44 _enter("%*.*s,%s", namelen, namelen, name ?: "", vllist);
45 45
46 BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ 46 BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
47 47
48 namelen = strlen(name);
49 if (namelen > AFS_MAXCELLNAME) { 48 if (namelen > AFS_MAXCELLNAME) {
50 _leave(" = -ENAMETOOLONG"); 49 _leave(" = -ENAMETOOLONG");
51 return ERR_PTR(-ENAMETOOLONG); 50 return ERR_PTR(-ENAMETOOLONG);
@@ -73,6 +72,10 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
73 if (!vllist || strlen(vllist) < 7) { 72 if (!vllist || strlen(vllist) < 7) {
74 ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL); 73 ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL);
75 if (ret < 0) { 74 if (ret < 0) {
75 if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY)
76 /* translate these errors into something
77 * userspace might understand */
78 ret = -EDESTADDRREQ;
76 _leave(" = %d", ret); 79 _leave(" = %d", ret);
77 return ERR_PTR(ret); 80 return ERR_PTR(ret);
78 } 81 }
@@ -138,26 +141,29 @@ error:
138} 141}
139 142
140/* 143/*
141 * create a cell record 144 * afs_cell_create() - create a cell record
142 * - "name" is the name of the cell 145 * @name: is the name of the cell.
143 * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format 146 * @namesz: is the strlen of the cell name.
147 * @vllist: is a colon separated list of IP addresses in "a.b.c.d" format.
148 * @retref: is T to return the cell reference when the cell exists.
144 */ 149 */
145struct afs_cell *afs_cell_create(const char *name, char *vllist) 150struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
151 char *vllist, bool retref)
146{ 152{
147 struct afs_cell *cell; 153 struct afs_cell *cell;
148 int ret; 154 int ret;
149 155
150 _enter("%s,%s", name, vllist); 156 _enter("%*.*s,%s", namesz, namesz, name ?: "", vllist);
151 157
152 down_write(&afs_cells_sem); 158 down_write(&afs_cells_sem);
153 read_lock(&afs_cells_lock); 159 read_lock(&afs_cells_lock);
154 list_for_each_entry(cell, &afs_cells, link) { 160 list_for_each_entry(cell, &afs_cells, link) {
155 if (strcasecmp(cell->name, name) == 0) 161 if (strncasecmp(cell->name, name, namesz) == 0)
156 goto duplicate_name; 162 goto duplicate_name;
157 } 163 }
158 read_unlock(&afs_cells_lock); 164 read_unlock(&afs_cells_lock);
159 165
160 cell = afs_cell_alloc(name, vllist); 166 cell = afs_cell_alloc(name, namesz, vllist);
161 if (IS_ERR(cell)) { 167 if (IS_ERR(cell)) {
162 _leave(" = %ld", PTR_ERR(cell)); 168 _leave(" = %ld", PTR_ERR(cell));
163 up_write(&afs_cells_sem); 169 up_write(&afs_cells_sem);
@@ -197,8 +203,18 @@ error:
197 return ERR_PTR(ret); 203 return ERR_PTR(ret);
198 204
199duplicate_name: 205duplicate_name:
206 if (retref && !IS_ERR(cell))
207 afs_get_cell(cell);
208
200 read_unlock(&afs_cells_lock); 209 read_unlock(&afs_cells_lock);
201 up_write(&afs_cells_sem); 210 up_write(&afs_cells_sem);
211
212 if (retref) {
213 _leave(" = %p", cell);
214 return cell;
215 }
216
217 _leave(" = -EEXIST");
202 return ERR_PTR(-EEXIST); 218 return ERR_PTR(-EEXIST);
203} 219}
204 220
@@ -229,7 +245,7 @@ int afs_cell_init(char *rootcell)
229 *cp++ = 0; 245 *cp++ = 0;
230 246
231 /* allocate a cell record for the root cell */ 247 /* allocate a cell record for the root cell */
232 new_root = afs_cell_create(rootcell, cp); 248 new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false);
233 if (IS_ERR(new_root)) { 249 if (IS_ERR(new_root)) {
234 _leave(" = %ld", PTR_ERR(new_root)); 250 _leave(" = %ld", PTR_ERR(new_root));
235 return PTR_ERR(new_root); 251 return PTR_ERR(new_root);
@@ -249,11 +265,12 @@ int afs_cell_init(char *rootcell)
249/* 265/*
250 * lookup a cell record 266 * lookup a cell record
251 */ 267 */
252struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz) 268struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
269 bool dns_cell)
253{ 270{
254 struct afs_cell *cell; 271 struct afs_cell *cell;
255 272
256 _enter("\"%*.*s\",", namesz, namesz, name ? name : ""); 273 _enter("\"%*.*s\",", namesz, namesz, name ?: "");
257 274
258 down_read(&afs_cells_sem); 275 down_read(&afs_cells_sem);
259 read_lock(&afs_cells_lock); 276 read_lock(&afs_cells_lock);
@@ -267,6 +284,8 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
267 } 284 }
268 } 285 }
269 cell = ERR_PTR(-ENOENT); 286 cell = ERR_PTR(-ENOENT);
287 if (dns_cell)
288 goto create_cell;
270 found: 289 found:
271 ; 290 ;
272 } else { 291 } else {
@@ -289,6 +308,15 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
289 up_read(&afs_cells_sem); 308 up_read(&afs_cells_sem);
290 _leave(" = %p", cell); 309 _leave(" = %p", cell);
291 return cell; 310 return cell;
311
312create_cell:
313 read_unlock(&afs_cells_lock);
314 up_read(&afs_cells_sem);
315
316 cell = afs_cell_create(name, namesz, NULL, true);
317
318 _leave(" = %p", cell);
319 return cell;
292} 320}
293 321
294#if 0 322#if 0
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index b42d5cc1d6d2..0d38c09bd55e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -477,6 +477,40 @@ static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
477} 477}
478 478
479/* 479/*
480 * Try to auto mount the mountpoint with pseudo directory, if the autocell
481 * operation is set.
482 */
483static struct inode *afs_try_auto_mntpt(
484 int ret, struct dentry *dentry, struct inode *dir, struct key *key,
485 struct afs_fid *fid)
486{
487 const char *devname = dentry->d_name.name;
488 struct afs_vnode *vnode = AFS_FS_I(dir);
489 struct inode *inode;
490
491 _enter("%d, %p{%s}, {%x:%u}, %p",
492 ret, dentry, devname, vnode->fid.vid, vnode->fid.vnode, key);
493
494 if (ret != -ENOENT ||
495 !test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
496 goto out;
497
498 inode = afs_iget_autocell(dir, devname, strlen(devname), key);
499 if (IS_ERR(inode)) {
500 ret = PTR_ERR(inode);
501 goto out;
502 }
503
504 *fid = AFS_FS_I(inode)->fid;
505 _leave("= %p", inode);
506 return inode;
507
508out:
509 _leave("= %d", ret);
510 return ERR_PTR(ret);
511}
512
513/*
480 * look up an entry in a directory 514 * look up an entry in a directory
481 */ 515 */
482static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, 516static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
@@ -520,6 +554,13 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
520 554
521 ret = afs_do_lookup(dir, dentry, &fid, key); 555 ret = afs_do_lookup(dir, dentry, &fid, key);
522 if (ret < 0) { 556 if (ret < 0) {
557 inode = afs_try_auto_mntpt(ret, dentry, dir, key, &fid);
558 if (!IS_ERR(inode)) {
559 key_put(key);
560 goto success;
561 }
562
563 ret = PTR_ERR(inode);
523 key_put(key); 564 key_put(key);
524 if (ret == -ENOENT) { 565 if (ret == -ENOENT) {
525 d_add(dentry, NULL); 566 d_add(dentry, NULL);
@@ -539,6 +580,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
539 return ERR_CAST(inode); 580 return ERR_CAST(inode);
540 } 581 }
541 582
583success:
542 dentry->d_op = &afs_fs_dentry_operations; 584 dentry->d_op = &afs_fs_dentry_operations;
543 585
544 d_add(dentry, inode); 586 d_add(dentry, inode);
@@ -696,8 +738,9 @@ static int afs_d_delete(struct dentry *dentry)
696 goto zap; 738 goto zap;
697 739
698 if (dentry->d_inode && 740 if (dentry->d_inode &&
699 test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags)) 741 (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags) ||
700 goto zap; 742 test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags)))
743 goto zap;
701 744
702 _leave(" = 0 [keep]"); 745 _leave(" = 0 [keep]");
703 return 0; 746 return 0;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 320ffef11574..0747339011c3 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -19,6 +19,8 @@
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/pagemap.h> 20#include <linux/pagemap.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/mount.h>
23#include <linux/namei.h>
22#include "internal.h" 24#include "internal.h"
23 25
24struct afs_iget_data { 26struct afs_iget_data {
@@ -102,6 +104,16 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
102} 104}
103 105
104/* 106/*
107 * iget5() comparator for inode created by autocell operations
108 *
109 * These pseudo inodes don't match anything.
110 */
111static int afs_iget5_autocell_test(struct inode *inode, void *opaque)
112{
113 return 0;
114}
115
116/*
105 * iget5() inode initialiser 117 * iget5() inode initialiser
106 */ 118 */
107static int afs_iget5_set(struct inode *inode, void *opaque) 119static int afs_iget5_set(struct inode *inode, void *opaque)
@@ -118,6 +130,67 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
118} 130}
119 131
120/* 132/*
133 * inode retrieval for autocell
134 */
135struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
136 int namesz, struct key *key)
137{
138 struct afs_iget_data data;
139 struct afs_super_info *as;
140 struct afs_vnode *vnode;
141 struct super_block *sb;
142 struct inode *inode;
143 static atomic_t afs_autocell_ino;
144
145 _enter("{%x:%u},%*.*s,",
146 AFS_FS_I(dir)->fid.vid, AFS_FS_I(dir)->fid.vnode,
147 namesz, namesz, dev_name ?: "");
148
149 sb = dir->i_sb;
150 as = sb->s_fs_info;
151 data.volume = as->volume;
152 data.fid.vid = as->volume->vid;
153 data.fid.unique = 0;
154 data.fid.vnode = 0;
155
156 inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino),
157 afs_iget5_autocell_test, afs_iget5_set,
158 &data);
159 if (!inode) {
160 _leave(" = -ENOMEM");
161 return ERR_PTR(-ENOMEM);
162 }
163
164 _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }",
165 inode, inode->i_ino, data.fid.vid, data.fid.vnode,
166 data.fid.unique);
167
168 vnode = AFS_FS_I(inode);
169
170 /* there shouldn't be an existing inode */
171 BUG_ON(!(inode->i_state & I_NEW));
172
173 inode->i_size = 0;
174 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
175 inode->i_op = &afs_autocell_inode_operations;
176 inode->i_nlink = 2;
177 inode->i_uid = 0;
178 inode->i_gid = 0;
179 inode->i_ctime.tv_sec = get_seconds();
180 inode->i_ctime.tv_nsec = 0;
181 inode->i_atime = inode->i_mtime = inode->i_ctime;
182 inode->i_blocks = 0;
183 inode->i_version = 0;
184 inode->i_generation = 0;
185
186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
187 inode->i_flags |= S_NOATIME;
188 unlock_new_inode(inode);
189 _leave(" = %p", inode);
190 return inode;
191}
192
193/*
121 * inode retrieval 194 * inode retrieval
122 */ 195 */
123struct inode *afs_iget(struct super_block *sb, struct key *key, 196struct inode *afs_iget(struct super_block *sb, struct key *key,
@@ -314,6 +387,19 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
314} 387}
315 388
316/* 389/*
390 * discard an AFS inode
391 */
392int afs_drop_inode(struct inode *inode)
393{
394 _enter("");
395
396 if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
397 return generic_delete_inode(inode);
398 else
399 return generic_drop_inode(inode);
400}
401
402/*
317 * clear an AFS inode 403 * clear an AFS inode
318 */ 404 */
319void afs_evict_inode(struct inode *inode) 405void afs_evict_inode(struct inode *inode)
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 8679089ce9a1..cca8eef736fc 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -42,6 +42,7 @@ typedef enum {
42struct afs_mount_params { 42struct afs_mount_params {
43 bool rwpath; /* T if the parent should be considered R/W */ 43 bool rwpath; /* T if the parent should be considered R/W */
44 bool force; /* T to force cell type */ 44 bool force; /* T to force cell type */
45 bool autocell; /* T if set auto mount operation */
45 afs_voltype_t type; /* type of volume requested */ 46 afs_voltype_t type; /* type of volume requested */
46 int volnamesz; /* size of volume name */ 47 int volnamesz; /* size of volume name */
47 const char *volname; /* name of volume to mount */ 48 const char *volname; /* name of volume to mount */
@@ -358,6 +359,8 @@ struct afs_vnode {
358#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */ 359#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */
359#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */ 360#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */
360#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */ 361#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */
362#define AFS_VNODE_AUTOCELL 10 /* set if Vnode is an auto mount point */
363#define AFS_VNODE_PSEUDODIR 11 /* set if Vnode is a pseudo directory */
361 364
362 long acl_order; /* ACL check count (callback break count) */ 365 long acl_order; /* ACL check count (callback break count) */
363 366
@@ -468,8 +471,8 @@ extern struct list_head afs_proc_cells;
468 471
469#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0) 472#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
470extern int afs_cell_init(char *); 473extern int afs_cell_init(char *);
471extern struct afs_cell *afs_cell_create(const char *, char *); 474extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool);
472extern struct afs_cell *afs_cell_lookup(const char *, unsigned); 475extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool);
473extern struct afs_cell *afs_grab_cell(struct afs_cell *); 476extern struct afs_cell *afs_grab_cell(struct afs_cell *);
474extern void afs_put_cell(struct afs_cell *); 477extern void afs_put_cell(struct afs_cell *);
475extern void afs_cell_purge(void); 478extern void afs_cell_purge(void);
@@ -558,6 +561,8 @@ extern int afs_fs_release_lock(struct afs_server *, struct key *,
558/* 561/*
559 * inode.c 562 * inode.c
560 */ 563 */
564extern struct inode *afs_iget_autocell(struct inode *, const char *, int,
565 struct key *);
561extern struct inode *afs_iget(struct super_block *, struct key *, 566extern struct inode *afs_iget(struct super_block *, struct key *,
562 struct afs_fid *, struct afs_file_status *, 567 struct afs_fid *, struct afs_file_status *,
563 struct afs_callback *); 568 struct afs_callback *);
@@ -566,6 +571,7 @@ extern int afs_validate(struct afs_vnode *, struct key *);
566extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 571extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
567extern int afs_setattr(struct dentry *, struct iattr *); 572extern int afs_setattr(struct dentry *, struct iattr *);
568extern void afs_evict_inode(struct inode *); 573extern void afs_evict_inode(struct inode *);
574extern int afs_drop_inode(struct inode *);
569 575
570/* 576/*
571 * main.c 577 * main.c
@@ -581,6 +587,7 @@ extern int afs_abort_to_error(u32);
581 * mntpt.c 587 * mntpt.c
582 */ 588 */
583extern const struct inode_operations afs_mntpt_inode_operations; 589extern const struct inode_operations afs_mntpt_inode_operations;
590extern const struct inode_operations afs_autocell_inode_operations;
584extern const struct file_operations afs_mntpt_file_operations; 591extern const struct file_operations afs_mntpt_file_operations;
585 592
586extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); 593extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
@@ -752,12 +759,6 @@ extern unsigned afs_debug;
752#define dbgprintk(FMT,...) \ 759#define dbgprintk(FMT,...) \
753 printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) 760 printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
754 761
755/* make sure we maintain the format strings, even when debugging is disabled */
756static inline __attribute__((format(printf,1,2)))
757void _dbprintk(const char *fmt, ...)
758{
759}
760
761#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) 762#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
762#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) 763#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
763#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) 764#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__)
@@ -792,9 +793,9 @@ do { \
792} while (0) 793} while (0)
793 794
794#else 795#else
795#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) 796#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
796#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) 797#define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
797#define _debug(FMT,...) _dbprintk(" "FMT ,##__VA_ARGS__) 798#define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__)
798#endif 799#endif
799 800
800/* 801/*
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index a9e23039ea34..6d552686c498 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -38,6 +38,11 @@ const struct inode_operations afs_mntpt_inode_operations = {
38 .getattr = afs_getattr, 38 .getattr = afs_getattr,
39}; 39};
40 40
41const struct inode_operations afs_autocell_inode_operations = {
42 .follow_link = afs_mntpt_follow_link,
43 .getattr = afs_getattr,
44};
45
41static LIST_HEAD(afs_vfsmounts); 46static LIST_HEAD(afs_vfsmounts);
42static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out); 47static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
43 48
@@ -136,20 +141,16 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
136{ 141{
137 struct afs_super_info *super; 142 struct afs_super_info *super;
138 struct vfsmount *mnt; 143 struct vfsmount *mnt;
144 struct afs_vnode *vnode;
139 struct page *page; 145 struct page *page;
140 size_t size; 146 char *devname, *options;
141 char *buf, *devname, *options; 147 bool rwpath = false;
142 int ret; 148 int ret;
143 149
144 _enter("{%s}", mntpt->d_name.name); 150 _enter("{%s}", mntpt->d_name.name);
145 151
146 BUG_ON(!mntpt->d_inode); 152 BUG_ON(!mntpt->d_inode);
147 153
148 ret = -EINVAL;
149 size = mntpt->d_inode->i_size;
150 if (size > PAGE_SIZE - 1)
151 goto error_no_devname;
152
153 ret = -ENOMEM; 154 ret = -ENOMEM;
154 devname = (char *) get_zeroed_page(GFP_KERNEL); 155 devname = (char *) get_zeroed_page(GFP_KERNEL);
155 if (!devname) 156 if (!devname)
@@ -159,28 +160,59 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
159 if (!options) 160 if (!options)
160 goto error_no_options; 161 goto error_no_options;
161 162
162 /* read the contents of the AFS special symlink */ 163 vnode = AFS_FS_I(mntpt->d_inode);
163 page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); 164 if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
164 if (IS_ERR(page)) { 165 /* if the directory is a pseudo directory, use the d_name */
165 ret = PTR_ERR(page); 166 static const char afs_root_cell[] = ":root.cell.";
166 goto error_no_page; 167 unsigned size = mntpt->d_name.len;
168
169 ret = -ENOENT;
170 if (size < 2 || size > AFS_MAXCELLNAME)
171 goto error_no_page;
172
173 if (mntpt->d_name.name[0] == '.') {
174 devname[0] = '#';
175 memcpy(devname + 1, mntpt->d_name.name, size - 1);
176 memcpy(devname + size, afs_root_cell,
177 sizeof(afs_root_cell));
178 rwpath = true;
179 } else {
180 devname[0] = '%';
181 memcpy(devname + 1, mntpt->d_name.name, size);
182 memcpy(devname + size + 1, afs_root_cell,
183 sizeof(afs_root_cell));
184 }
185 } else {
186 /* read the contents of the AFS special symlink */
187 loff_t size = i_size_read(mntpt->d_inode);
188 char *buf;
189
190 ret = -EINVAL;
191 if (size > PAGE_SIZE - 1)
192 goto error_no_page;
193
194 page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
195 if (IS_ERR(page)) {
196 ret = PTR_ERR(page);
197 goto error_no_page;
198 }
199
200 ret = -EIO;
201 if (PageError(page))
202 goto error;
203
204 buf = kmap_atomic(page, KM_USER0);
205 memcpy(devname, buf, size);
206 kunmap_atomic(buf, KM_USER0);
207 page_cache_release(page);
208 page = NULL;
167 } 209 }
168 210
169 ret = -EIO;
170 if (PageError(page))
171 goto error;
172
173 buf = kmap_atomic(page, KM_USER0);
174 memcpy(devname, buf, size);
175 kunmap_atomic(buf, KM_USER0);
176 page_cache_release(page);
177 page = NULL;
178
179 /* work out what options we want */ 211 /* work out what options we want */
180 super = AFS_FS_S(mntpt->d_sb); 212 super = AFS_FS_S(mntpt->d_sb);
181 memcpy(options, "cell=", 5); 213 memcpy(options, "cell=", 5);
182 strcpy(options + 5, super->volume->cell->name); 214 strcpy(options + 5, super->volume->cell->name);
183 if (super->volume->type == AFSVL_RWVOL) 215 if (super->volume->type == AFSVL_RWVOL || rwpath)
184 strcat(options, ",rwpath"); 216 strcat(options, ",rwpath");
185 217
186 /* try and do the mount */ 218 /* try and do the mount */
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 852739d262a9..096b23f821a1 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -294,7 +294,7 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
294 if (strcmp(kbuf, "add") == 0) { 294 if (strcmp(kbuf, "add") == 0) {
295 struct afs_cell *cell; 295 struct afs_cell *cell;
296 296
297 cell = afs_cell_create(name, args); 297 cell = afs_cell_create(name, strlen(name), args, false);
298 if (IS_ERR(cell)) { 298 if (IS_ERR(cell)) {
299 ret = PTR_ERR(cell); 299 ret = PTR_ERR(cell);
300 goto done; 300 goto done;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 67cf810e0fd6..654d8fdbf01f 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -100,6 +100,7 @@ int afs_open_socket(void)
100 ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); 100 ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
101 if (ret < 0) { 101 if (ret < 0) {
102 sock_release(socket); 102 sock_release(socket);
103 destroy_workqueue(afs_async_calls);
103 _leave(" = %d [bind]", ret); 104 _leave(" = %d [bind]", ret);
104 return ret; 105 return ret;
105 } 106 }
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 9cf80f02da16..77e1e5a61154 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -16,6 +16,7 @@
16 16
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/mount.h>
19#include <linux/init.h> 20#include <linux/init.h>
20#include <linux/slab.h> 21#include <linux/slab.h>
21#include <linux/smp_lock.h> 22#include <linux/smp_lock.h>
@@ -48,6 +49,7 @@ struct file_system_type afs_fs_type = {
48static const struct super_operations afs_super_ops = { 49static const struct super_operations afs_super_ops = {
49 .statfs = afs_statfs, 50 .statfs = afs_statfs,
50 .alloc_inode = afs_alloc_inode, 51 .alloc_inode = afs_alloc_inode,
52 .drop_inode = afs_drop_inode,
51 .destroy_inode = afs_destroy_inode, 53 .destroy_inode = afs_destroy_inode,
52 .evict_inode = afs_evict_inode, 54 .evict_inode = afs_evict_inode,
53 .put_super = afs_put_super, 55 .put_super = afs_put_super,
@@ -62,12 +64,14 @@ enum {
62 afs_opt_cell, 64 afs_opt_cell,
63 afs_opt_rwpath, 65 afs_opt_rwpath,
64 afs_opt_vol, 66 afs_opt_vol,
67 afs_opt_autocell,
65}; 68};
66 69
67static const match_table_t afs_options_list = { 70static const match_table_t afs_options_list = {
68 { afs_opt_cell, "cell=%s" }, 71 { afs_opt_cell, "cell=%s" },
69 { afs_opt_rwpath, "rwpath" }, 72 { afs_opt_rwpath, "rwpath" },
70 { afs_opt_vol, "vol=%s" }, 73 { afs_opt_vol, "vol=%s" },
74 { afs_opt_autocell, "autocell" },
71 { afs_no_opt, NULL }, 75 { afs_no_opt, NULL },
72}; 76};
73 77
@@ -151,7 +155,8 @@ static int afs_parse_options(struct afs_mount_params *params,
151 switch (token) { 155 switch (token) {
152 case afs_opt_cell: 156 case afs_opt_cell:
153 cell = afs_cell_lookup(args[0].from, 157 cell = afs_cell_lookup(args[0].from,
154 args[0].to - args[0].from); 158 args[0].to - args[0].from,
159 false);
155 if (IS_ERR(cell)) 160 if (IS_ERR(cell))
156 return PTR_ERR(cell); 161 return PTR_ERR(cell);
157 afs_put_cell(params->cell); 162 afs_put_cell(params->cell);
@@ -166,6 +171,10 @@ static int afs_parse_options(struct afs_mount_params *params,
166 *devname = args[0].from; 171 *devname = args[0].from;
167 break; 172 break;
168 173
174 case afs_opt_autocell:
175 params->autocell = 1;
176 break;
177
169 default: 178 default:
170 printk(KERN_ERR "kAFS:" 179 printk(KERN_ERR "kAFS:"
171 " Unknown or invalid mount option: '%s'\n", p); 180 " Unknown or invalid mount option: '%s'\n", p);
@@ -252,10 +261,10 @@ static int afs_parse_device_name(struct afs_mount_params *params,
252 261
253 /* lookup the cell record */ 262 /* lookup the cell record */
254 if (cellname || !params->cell) { 263 if (cellname || !params->cell) {
255 cell = afs_cell_lookup(cellname, cellnamesz); 264 cell = afs_cell_lookup(cellname, cellnamesz, true);
256 if (IS_ERR(cell)) { 265 if (IS_ERR(cell)) {
257 printk(KERN_ERR "kAFS: unable to lookup cell '%s'\n", 266 printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
258 cellname ?: ""); 267 cellnamesz, cellnamesz, cellname ?: "");
259 return PTR_ERR(cell); 268 return PTR_ERR(cell);
260 } 269 }
261 afs_put_cell(params->cell); 270 afs_put_cell(params->cell);
@@ -321,6 +330,9 @@ static int afs_fill_super(struct super_block *sb, void *data)
321 if (IS_ERR(inode)) 330 if (IS_ERR(inode))
322 goto error_inode; 331 goto error_inode;
323 332
333 if (params->autocell)
334 set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
335
324 ret = -ENOMEM; 336 ret = -ENOMEM;
325 root = d_alloc_root(inode); 337 root = d_alloc_root(inode);
326 if (!root) 338 if (!root)
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 48e056e70fd6..cb1bd38dc08c 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -204,8 +204,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
204 } 204 }
205 205
206 /* Initialize expiry counter after successful mount */ 206 /* Initialize expiry counter after successful mount */
207 if (ino) 207 ino->last_used = jiffies;
208 ino->last_used = jiffies;
209 208
210 spin_lock(&sbi->fs_lock); 209 spin_lock(&sbi->fs_lock);
211 ino->flags &= ~AUTOFS_INF_PENDING; 210 ino->flags &= ~AUTOFS_INF_PENDING;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 66411463b734..50e8c8582faa 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1340,10 +1340,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1340 /* 1340 /*
1341 * hooks: /n/, see "layering violations". 1341 * hooks: /n/, see "layering violations".
1342 */ 1342 */
1343 ret = devcgroup_inode_permission(bdev->bd_inode, perm); 1343 if (!for_part) {
1344 if (ret != 0) { 1344 ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1345 bdput(bdev); 1345 if (ret != 0) {
1346 return ret; 1346 bdput(bdev);
1347 return ret;
1348 }
1347 } 1349 }
1348 1350
1349 restart: 1351 restart:
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 24eb0d37241a..727caedcdd92 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -552,8 +552,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
552 */ 552 */
553static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) 553static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
554{ 554{
555 struct fs_struct *fs; 555 struct path path;
556 struct dentry *dir;
557 const struct cred *saved_cred; 556 const struct cred *saved_cred;
558 int ret; 557 int ret;
559 558
@@ -573,24 +572,21 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
573 } 572 }
574 573
575 /* extract the directory dentry from the cwd */ 574 /* extract the directory dentry from the cwd */
576 fs = current->fs; 575 get_fs_pwd(current->fs, &path);
577 read_lock(&fs->lock);
578 dir = dget(fs->pwd.dentry);
579 read_unlock(&fs->lock);
580 576
581 if (!S_ISDIR(dir->d_inode->i_mode)) 577 if (!S_ISDIR(path.dentry->d_inode->i_mode))
582 goto notdir; 578 goto notdir;
583 579
584 cachefiles_begin_secure(cache, &saved_cred); 580 cachefiles_begin_secure(cache, &saved_cred);
585 ret = cachefiles_cull(cache, dir, args); 581 ret = cachefiles_cull(cache, path.dentry, args);
586 cachefiles_end_secure(cache, saved_cred); 582 cachefiles_end_secure(cache, saved_cred);
587 583
588 dput(dir); 584 path_put(&path);
589 _leave(" = %d", ret); 585 _leave(" = %d", ret);
590 return ret; 586 return ret;
591 587
592notdir: 588notdir:
593 dput(dir); 589 path_put(&path);
594 kerror("cull command requires dirfd to be a directory"); 590 kerror("cull command requires dirfd to be a directory");
595 return -ENOTDIR; 591 return -ENOTDIR;
596 592
@@ -628,8 +624,7 @@ inval:
628 */ 624 */
629static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) 625static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
630{ 626{
631 struct fs_struct *fs; 627 struct path path;
632 struct dentry *dir;
633 const struct cred *saved_cred; 628 const struct cred *saved_cred;
634 int ret; 629 int ret;
635 630
@@ -649,24 +644,21 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
649 } 644 }
650 645
651 /* extract the directory dentry from the cwd */ 646 /* extract the directory dentry from the cwd */
652 fs = current->fs; 647 get_fs_pwd(current->fs, &path);
653 read_lock(&fs->lock);
654 dir = dget(fs->pwd.dentry);
655 read_unlock(&fs->lock);
656 648
657 if (!S_ISDIR(dir->d_inode->i_mode)) 649 if (!S_ISDIR(path.dentry->d_inode->i_mode))
658 goto notdir; 650 goto notdir;
659 651
660 cachefiles_begin_secure(cache, &saved_cred); 652 cachefiles_begin_secure(cache, &saved_cred);
661 ret = cachefiles_check_in_use(cache, dir, args); 653 ret = cachefiles_check_in_use(cache, path.dentry, args);
662 cachefiles_end_secure(cache, saved_cred); 654 cachefiles_end_secure(cache, saved_cred);
663 655
664 dput(dir); 656 path_put(&path);
665 //_leave(" = %d", ret); 657 //_leave(" = %d", ret);
666 return ret; 658 return ret;
667 659
668notdir: 660notdir:
669 dput(dir); 661 path_put(&path);
670 kerror("inuse command requires dirfd to be a directory"); 662 kerror("inuse command requires dirfd to be a directory");
671 return -ENOTDIR; 663 return -ENOTDIR;
672 664
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index a8cd821226da..bd6bc1bde2d7 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -267,13 +267,6 @@ do { \
267#define dbgprintk(FMT, ...) \ 267#define dbgprintk(FMT, ...) \
268 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) 268 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
269 269
270/* make sure we maintain the format strings, even when debugging is disabled */
271static inline void _dbprintk(const char *fmt, ...)
272 __attribute__((format(printf, 1, 2)));
273static inline void _dbprintk(const char *fmt, ...)
274{
275}
276
277#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 270#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
278#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 271#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
279#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) 272#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
@@ -304,9 +297,9 @@ do { \
304} while (0) 297} while (0)
305 298
306#else 299#else
307#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 300#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
308#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 301#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
309#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) 302#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
310#endif 303#endif
311 304
312#if 1 /* defined(__KDEBUGALL) */ 305#if 1 /* defined(__KDEBUGALL) */
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 6a660e610be8..278e1172600d 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,7 +6,7 @@ ifneq ($(KERNELRELEASE),)
6 6
7obj-$(CONFIG_CEPH_FS) += ceph.o 7obj-$(CONFIG_CEPH_FS) += ceph.o
8 8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \ 9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \ 11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \ 12 mds_client.o mdsmap.o \
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d9c60b84949a..5598a0d02295 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -309,7 +309,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
309 zero_user_segment(page, s, PAGE_CACHE_SIZE); 309 zero_user_segment(page, s, PAGE_CACHE_SIZE);
310 } 310 }
311 311
312 if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { 312 if (add_to_page_cache_lru(page, mapping, page->index,
313 GFP_NOFS)) {
313 page_cache_release(page); 314 page_cache_release(page);
314 dout("readpages %p add_to_page_cache failed %p\n", 315 dout("readpages %p add_to_page_cache failed %p\n",
315 inode, page); 316 inode, page);
@@ -552,7 +553,7 @@ static void writepages_finish(struct ceph_osd_request *req,
552 * page truncation thread, possibly losing some data that 553 * page truncation thread, possibly losing some data that
553 * raced its way in 554 * raced its way in
554 */ 555 */
555 if ((issued & CEPH_CAP_FILE_CACHE) == 0) 556 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
556 generic_error_remove_page(inode->i_mapping, page); 557 generic_error_remove_page(inode->i_mapping, page);
557 558
558 unlock_page(page); 559 unlock_page(page);
@@ -797,9 +798,12 @@ get_more_pages:
797 dout("%p will write page %p idx %lu\n", 798 dout("%p will write page %p idx %lu\n",
798 inode, page, page->index); 799 inode, page, page->index);
799 800
800 writeback_stat = atomic_long_inc_return(&client->writeback_count); 801 writeback_stat =
801 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) { 802 atomic_long_inc_return(&client->writeback_count);
802 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); 803 if (writeback_stat > CONGESTION_ON_THRESH(
804 client->mount_args->congestion_kb)) {
805 set_bdi_congested(&client->backing_dev_info,
806 BLK_RW_ASYNC);
803 } 807 }
804 808
805 set_page_writeback(page); 809 set_page_writeback(page);
@@ -1036,7 +1040,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
1036 *pagep = page; 1040 *pagep = page;
1037 1041
1038 dout("write_begin file %p inode %p page %p %d~%d\n", file, 1042 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1039 inode, page, (int)pos, (int)len); 1043 inode, page, (int)pos, (int)len);
1040 1044
1041 r = ceph_update_writeable_page(file, pos, len, page); 1045 r = ceph_update_writeable_page(file, pos, len, page);
1042 } while (r == -EAGAIN); 1046 } while (r == -EAGAIN);
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
index 67b2c030924b..eb2a666b0be7 100644
--- a/fs/ceph/armor.c
+++ b/fs/ceph/armor.c
@@ -1,11 +1,15 @@
1 1
2#include <linux/errno.h> 2#include <linux/errno.h>
3 3
4int ceph_armor(char *dst, const char *src, const char *end);
5int ceph_unarmor(char *dst, const char *src, const char *end);
6
4/* 7/*
5 * base64 encode/decode. 8 * base64 encode/decode.
6 */ 9 */
7 10
8const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 11static const char *pem_key =
12 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9 13
10static int encode_bits(int c) 14static int encode_bits(int c)
11{ 15{
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 89490beaf537..6d2e30600627 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -20,7 +20,7 @@ static u32 supported_protocols[] = {
20 CEPH_AUTH_CEPHX 20 CEPH_AUTH_CEPHX
21}; 21};
22 22
23int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) 23static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{ 24{
25 switch (protocol) { 25 switch (protocol) {
26 case CEPH_AUTH_NONE: 26 case CEPH_AUTH_NONE:
@@ -133,8 +133,8 @@ bad:
133 return -ERANGE; 133 return -ERANGE;
134} 134}
135 135
136int ceph_build_auth_request(struct ceph_auth_client *ac, 136static int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len) 137 void *msg_buf, size_t msg_len)
138{ 138{
139 struct ceph_mon_request_header *monhdr = msg_buf; 139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1; 140 void *p = monhdr + 1;
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 6d44053ecff1..582e0b2caf8a 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -87,8 +87,8 @@ static int ceph_x_decrypt(struct ceph_crypto_key *secret,
87/* 87/*
88 * get existing (or insert new) ticket handler 88 * get existing (or insert new) ticket handler
89 */ 89 */
90struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac, 90static struct ceph_x_ticket_handler *
91 int service) 91get_ticket_handler(struct ceph_auth_client *ac, int service)
92{ 92{
93 struct ceph_x_ticket_handler *th; 93 struct ceph_x_ticket_handler *th;
94 struct ceph_x_info *xi = ac->private; 94 struct ceph_x_info *xi = ac->private;
@@ -429,7 +429,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
429 auth->struct_v = 1; 429 auth->struct_v = 1;
430 auth->key = 0; 430 auth->key = 0;
431 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++) 431 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
432 auth->key ^= *u; 432 auth->key ^= *(__le64 *)u;
433 dout(" server_challenge %llx client_challenge %llx key %llx\n", 433 dout(" server_challenge %llx client_challenge %llx key %llx\n",
434 xi->server_challenge, le64_to_cpu(auth->client_challenge), 434 xi->server_challenge, le64_to_cpu(auth->client_challenge),
435 le64_to_cpu(auth->key)); 435 le64_to_cpu(auth->key));
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
index c67535d70aa6..cd39f17021de 100644
--- a/fs/ceph/buffer.c
+++ b/fs/ceph/buffer.c
@@ -47,22 +47,6 @@ void ceph_buffer_release(struct kref *kref)
47 kfree(b); 47 kfree(b);
48} 48}
49 49
50int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
51{
52 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
53 if (b->vec.iov_base) {
54 b->is_vmalloc = false;
55 } else {
56 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
57 b->is_vmalloc = true;
58 }
59 if (!b->vec.iov_base)
60 return -ENOMEM;
61 b->alloc_len = len;
62 b->vec.iov_len = len;
63 return 0;
64}
65
66int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) 50int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
67{ 51{
68 size_t len; 52 size_t len;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b81be9a56487..7bf182b03973 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -113,58 +113,41 @@ const char *ceph_cap_string(int caps)
113 return cap_str[i]; 113 return cap_str[i];
114} 114}
115 115
116/* 116void ceph_caps_init(struct ceph_mds_client *mdsc)
117 * Cap reservations
118 *
119 * Maintain a global pool of preallocated struct ceph_caps, referenced
120 * by struct ceph_caps_reservations. This ensures that we preallocate
121 * memory needed to successfully process an MDS response. (If an MDS
122 * sends us cap information and we fail to process it, we will have
123 * problems due to the client and MDS being out of sync.)
124 *
125 * Reservations are 'owned' by a ceph_cap_reservation context.
126 */
127static spinlock_t caps_list_lock;
128static struct list_head caps_list; /* unused (reserved or unreserved) */
129static int caps_total_count; /* total caps allocated */
130static int caps_use_count; /* in use */
131static int caps_reserve_count; /* unused, reserved */
132static int caps_avail_count; /* unused, unreserved */
133static int caps_min_count; /* keep at least this many (unreserved) */
134
135void __init ceph_caps_init(void)
136{ 117{
137 INIT_LIST_HEAD(&caps_list); 118 INIT_LIST_HEAD(&mdsc->caps_list);
138 spin_lock_init(&caps_list_lock); 119 spin_lock_init(&mdsc->caps_list_lock);
139} 120}
140 121
141void ceph_caps_finalize(void) 122void ceph_caps_finalize(struct ceph_mds_client *mdsc)
142{ 123{
143 struct ceph_cap *cap; 124 struct ceph_cap *cap;
144 125
145 spin_lock(&caps_list_lock); 126 spin_lock(&mdsc->caps_list_lock);
146 while (!list_empty(&caps_list)) { 127 while (!list_empty(&mdsc->caps_list)) {
147 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); 128 cap = list_first_entry(&mdsc->caps_list,
129 struct ceph_cap, caps_item);
148 list_del(&cap->caps_item); 130 list_del(&cap->caps_item);
149 kmem_cache_free(ceph_cap_cachep, cap); 131 kmem_cache_free(ceph_cap_cachep, cap);
150 } 132 }
151 caps_total_count = 0; 133 mdsc->caps_total_count = 0;
152 caps_avail_count = 0; 134 mdsc->caps_avail_count = 0;
153 caps_use_count = 0; 135 mdsc->caps_use_count = 0;
154 caps_reserve_count = 0; 136 mdsc->caps_reserve_count = 0;
155 caps_min_count = 0; 137 mdsc->caps_min_count = 0;
156 spin_unlock(&caps_list_lock); 138 spin_unlock(&mdsc->caps_list_lock);
157} 139}
158 140
159void ceph_adjust_min_caps(int delta) 141void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
160{ 142{
161 spin_lock(&caps_list_lock); 143 spin_lock(&mdsc->caps_list_lock);
162 caps_min_count += delta; 144 mdsc->caps_min_count += delta;
163 BUG_ON(caps_min_count < 0); 145 BUG_ON(mdsc->caps_min_count < 0);
164 spin_unlock(&caps_list_lock); 146 spin_unlock(&mdsc->caps_list_lock);
165} 147}
166 148
167int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) 149int ceph_reserve_caps(struct ceph_mds_client *mdsc,
150 struct ceph_cap_reservation *ctx, int need)
168{ 151{
169 int i; 152 int i;
170 struct ceph_cap *cap; 153 struct ceph_cap *cap;
@@ -176,16 +159,17 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
176 dout("reserve caps ctx=%p need=%d\n", ctx, need); 159 dout("reserve caps ctx=%p need=%d\n", ctx, need);
177 160
178 /* first reserve any caps that are already allocated */ 161 /* first reserve any caps that are already allocated */
179 spin_lock(&caps_list_lock); 162 spin_lock(&mdsc->caps_list_lock);
180 if (caps_avail_count >= need) 163 if (mdsc->caps_avail_count >= need)
181 have = need; 164 have = need;
182 else 165 else
183 have = caps_avail_count; 166 have = mdsc->caps_avail_count;
184 caps_avail_count -= have; 167 mdsc->caps_avail_count -= have;
185 caps_reserve_count += have; 168 mdsc->caps_reserve_count += have;
186 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 169 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
187 caps_avail_count); 170 mdsc->caps_reserve_count +
188 spin_unlock(&caps_list_lock); 171 mdsc->caps_avail_count);
172 spin_unlock(&mdsc->caps_list_lock);
189 173
190 for (i = have; i < need; i++) { 174 for (i = have; i < need; i++) {
191 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 175 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
@@ -198,19 +182,20 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
198 } 182 }
199 BUG_ON(have + alloc != need); 183 BUG_ON(have + alloc != need);
200 184
201 spin_lock(&caps_list_lock); 185 spin_lock(&mdsc->caps_list_lock);
202 caps_total_count += alloc; 186 mdsc->caps_total_count += alloc;
203 caps_reserve_count += alloc; 187 mdsc->caps_reserve_count += alloc;
204 list_splice(&newcaps, &caps_list); 188 list_splice(&newcaps, &mdsc->caps_list);
205 189
206 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 190 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
207 caps_avail_count); 191 mdsc->caps_reserve_count +
208 spin_unlock(&caps_list_lock); 192 mdsc->caps_avail_count);
193 spin_unlock(&mdsc->caps_list_lock);
209 194
210 ctx->count = need; 195 ctx->count = need;
211 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", 196 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
212 ctx, caps_total_count, caps_use_count, caps_reserve_count, 197 ctx, mdsc->caps_total_count, mdsc->caps_use_count,
213 caps_avail_count); 198 mdsc->caps_reserve_count, mdsc->caps_avail_count);
214 return 0; 199 return 0;
215 200
216out_alloc_count: 201out_alloc_count:
@@ -220,26 +205,29 @@ out_alloc_count:
220 return ret; 205 return ret;
221} 206}
222 207
223int ceph_unreserve_caps(struct ceph_cap_reservation *ctx) 208int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
209 struct ceph_cap_reservation *ctx)
224{ 210{
225 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); 211 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
226 if (ctx->count) { 212 if (ctx->count) {
227 spin_lock(&caps_list_lock); 213 spin_lock(&mdsc->caps_list_lock);
228 BUG_ON(caps_reserve_count < ctx->count); 214 BUG_ON(mdsc->caps_reserve_count < ctx->count);
229 caps_reserve_count -= ctx->count; 215 mdsc->caps_reserve_count -= ctx->count;
230 caps_avail_count += ctx->count; 216 mdsc->caps_avail_count += ctx->count;
231 ctx->count = 0; 217 ctx->count = 0;
232 dout("unreserve caps %d = %d used + %d resv + %d avail\n", 218 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
233 caps_total_count, caps_use_count, caps_reserve_count, 219 mdsc->caps_total_count, mdsc->caps_use_count,
234 caps_avail_count); 220 mdsc->caps_reserve_count, mdsc->caps_avail_count);
235 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 221 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
236 caps_avail_count); 222 mdsc->caps_reserve_count +
237 spin_unlock(&caps_list_lock); 223 mdsc->caps_avail_count);
224 spin_unlock(&mdsc->caps_list_lock);
238 } 225 }
239 return 0; 226 return 0;
240} 227}
241 228
242static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) 229static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
230 struct ceph_cap_reservation *ctx)
243{ 231{
244 struct ceph_cap *cap = NULL; 232 struct ceph_cap *cap = NULL;
245 233
@@ -247,71 +235,74 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
247 if (!ctx) { 235 if (!ctx) {
248 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 236 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
249 if (cap) { 237 if (cap) {
250 caps_use_count++; 238 mdsc->caps_use_count++;
251 caps_total_count++; 239 mdsc->caps_total_count++;
252 } 240 }
253 return cap; 241 return cap;
254 } 242 }
255 243
256 spin_lock(&caps_list_lock); 244 spin_lock(&mdsc->caps_list_lock);
257 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", 245 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
258 ctx, ctx->count, caps_total_count, caps_use_count, 246 ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
259 caps_reserve_count, caps_avail_count); 247 mdsc->caps_reserve_count, mdsc->caps_avail_count);
260 BUG_ON(!ctx->count); 248 BUG_ON(!ctx->count);
261 BUG_ON(ctx->count > caps_reserve_count); 249 BUG_ON(ctx->count > mdsc->caps_reserve_count);
262 BUG_ON(list_empty(&caps_list)); 250 BUG_ON(list_empty(&mdsc->caps_list));
263 251
264 ctx->count--; 252 ctx->count--;
265 caps_reserve_count--; 253 mdsc->caps_reserve_count--;
266 caps_use_count++; 254 mdsc->caps_use_count++;
267 255
268 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); 256 cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
269 list_del(&cap->caps_item); 257 list_del(&cap->caps_item);
270 258
271 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 259 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
272 caps_avail_count); 260 mdsc->caps_reserve_count + mdsc->caps_avail_count);
273 spin_unlock(&caps_list_lock); 261 spin_unlock(&mdsc->caps_list_lock);
274 return cap; 262 return cap;
275} 263}
276 264
277void ceph_put_cap(struct ceph_cap *cap) 265void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
278{ 266{
279 spin_lock(&caps_list_lock); 267 spin_lock(&mdsc->caps_list_lock);
280 dout("put_cap %p %d = %d used + %d resv + %d avail\n", 268 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
281 cap, caps_total_count, caps_use_count, 269 cap, mdsc->caps_total_count, mdsc->caps_use_count,
282 caps_reserve_count, caps_avail_count); 270 mdsc->caps_reserve_count, mdsc->caps_avail_count);
283 caps_use_count--; 271 mdsc->caps_use_count--;
284 /* 272 /*
285 * Keep some preallocated caps around (ceph_min_count), to 273 * Keep some preallocated caps around (ceph_min_count), to
286 * avoid lots of free/alloc churn. 274 * avoid lots of free/alloc churn.
287 */ 275 */
288 if (caps_avail_count >= caps_reserve_count + caps_min_count) { 276 if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
289 caps_total_count--; 277 mdsc->caps_min_count) {
278 mdsc->caps_total_count--;
290 kmem_cache_free(ceph_cap_cachep, cap); 279 kmem_cache_free(ceph_cap_cachep, cap);
291 } else { 280 } else {
292 caps_avail_count++; 281 mdsc->caps_avail_count++;
293 list_add(&cap->caps_item, &caps_list); 282 list_add(&cap->caps_item, &mdsc->caps_list);
294 } 283 }
295 284
296 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 285 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
297 caps_avail_count); 286 mdsc->caps_reserve_count + mdsc->caps_avail_count);
298 spin_unlock(&caps_list_lock); 287 spin_unlock(&mdsc->caps_list_lock);
299} 288}
300 289
301void ceph_reservation_status(struct ceph_client *client, 290void ceph_reservation_status(struct ceph_client *client,
302 int *total, int *avail, int *used, int *reserved, 291 int *total, int *avail, int *used, int *reserved,
303 int *min) 292 int *min)
304{ 293{
294 struct ceph_mds_client *mdsc = &client->mdsc;
295
305 if (total) 296 if (total)
306 *total = caps_total_count; 297 *total = mdsc->caps_total_count;
307 if (avail) 298 if (avail)
308 *avail = caps_avail_count; 299 *avail = mdsc->caps_avail_count;
309 if (used) 300 if (used)
310 *used = caps_use_count; 301 *used = mdsc->caps_use_count;
311 if (reserved) 302 if (reserved)
312 *reserved = caps_reserve_count; 303 *reserved = mdsc->caps_reserve_count;
313 if (min) 304 if (min)
314 *min = caps_min_count; 305 *min = mdsc->caps_min_count;
315} 306}
316 307
317/* 308/*
@@ -336,22 +327,29 @@ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
336 return NULL; 327 return NULL;
337} 328}
338 329
330struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
331{
332 struct ceph_cap *cap;
333
334 spin_lock(&ci->vfs_inode.i_lock);
335 cap = __get_cap_for_mds(ci, mds);
336 spin_unlock(&ci->vfs_inode.i_lock);
337 return cap;
338}
339
339/* 340/*
340 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else 341 * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
341 * -1.
342 */ 342 */
343static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq) 343static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
344{ 344{
345 struct ceph_cap *cap; 345 struct ceph_cap *cap;
346 int mds = -1; 346 int mds = -1;
347 struct rb_node *p; 347 struct rb_node *p;
348 348
349 /* prefer mds with WR|WRBUFFER|EXCL caps */ 349 /* prefer mds with WR|BUFFER|EXCL caps */
350 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 350 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
351 cap = rb_entry(p, struct ceph_cap, ci_node); 351 cap = rb_entry(p, struct ceph_cap, ci_node);
352 mds = cap->mds; 352 mds = cap->mds;
353 if (mseq)
354 *mseq = cap->mseq;
355 if (cap->issued & (CEPH_CAP_FILE_WR | 353 if (cap->issued & (CEPH_CAP_FILE_WR |
356 CEPH_CAP_FILE_BUFFER | 354 CEPH_CAP_FILE_BUFFER |
357 CEPH_CAP_FILE_EXCL)) 355 CEPH_CAP_FILE_EXCL))
@@ -364,7 +362,7 @@ int ceph_get_cap_mds(struct inode *inode)
364{ 362{
365 int mds; 363 int mds;
366 spin_lock(&inode->i_lock); 364 spin_lock(&inode->i_lock);
367 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL); 365 mds = __ceph_get_cap_mds(ceph_inode(inode));
368 spin_unlock(&inode->i_lock); 366 spin_unlock(&inode->i_lock);
369 return mds; 367 return mds;
370} 368}
@@ -483,8 +481,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
483 * Each time we receive FILE_CACHE anew, we increment 481 * Each time we receive FILE_CACHE anew, we increment
484 * i_rdcache_gen. 482 * i_rdcache_gen.
485 */ 483 */
486 if ((issued & CEPH_CAP_FILE_CACHE) && 484 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
487 (had & CEPH_CAP_FILE_CACHE) == 0) 485 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
488 ci->i_rdcache_gen++; 486 ci->i_rdcache_gen++;
489 487
490 /* 488 /*
@@ -543,7 +541,7 @@ retry:
543 new_cap = NULL; 541 new_cap = NULL;
544 } else { 542 } else {
545 spin_unlock(&inode->i_lock); 543 spin_unlock(&inode->i_lock);
546 new_cap = get_cap(caps_reservation); 544 new_cap = get_cap(mdsc, caps_reservation);
547 if (new_cap == NULL) 545 if (new_cap == NULL)
548 return -ENOMEM; 546 return -ENOMEM;
549 goto retry; 547 goto retry;
@@ -588,6 +586,7 @@ retry:
588 } else { 586 } else {
589 pr_err("ceph_add_cap: couldn't find snap realm %llx\n", 587 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
590 realmino); 588 realmino);
589 WARN_ON(!realm);
591 } 590 }
592 } 591 }
593 592
@@ -831,7 +830,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
831{ 830{
832 int want = 0; 831 int want = 0;
833 int mode; 832 int mode;
834 for (mode = 0; mode < 4; mode++) 833 for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
835 if (ci->i_nr_by_mode[mode]) 834 if (ci->i_nr_by_mode[mode])
836 want |= ceph_caps_for_mode(mode); 835 want |= ceph_caps_for_mode(mode);
837 return want; 836 return want;
@@ -901,7 +900,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
901 ci->i_auth_cap = NULL; 900 ci->i_auth_cap = NULL;
902 901
903 if (removed) 902 if (removed)
904 ceph_put_cap(cap); 903 ceph_put_cap(mdsc, cap);
905 904
906 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { 905 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
907 struct ceph_snap_realm *realm = ci->i_snap_realm; 906 struct ceph_snap_realm *realm = ci->i_snap_realm;
@@ -1197,6 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1197 */ 1196 */
1198void __ceph_flush_snaps(struct ceph_inode_info *ci, 1197void __ceph_flush_snaps(struct ceph_inode_info *ci,
1199 struct ceph_mds_session **psession) 1198 struct ceph_mds_session **psession)
1199 __releases(ci->vfs_inode->i_lock)
1200 __acquires(ci->vfs_inode->i_lock)
1200{ 1201{
1201 struct inode *inode = &ci->vfs_inode; 1202 struct inode *inode = &ci->vfs_inode;
1202 int mds; 1203 int mds;
@@ -1232,7 +1233,13 @@ retry:
1232 BUG_ON(capsnap->dirty == 0); 1233 BUG_ON(capsnap->dirty == 0);
1233 1234
1234 /* pick mds, take s_mutex */ 1235 /* pick mds, take s_mutex */
1235 mds = __ceph_get_cap_mds(ci, &mseq); 1236 if (ci->i_auth_cap == NULL) {
1237 dout("no auth cap (migrating?), doing nothing\n");
1238 goto out;
1239 }
1240 mds = ci->i_auth_cap->session->s_mds;
1241 mseq = ci->i_auth_cap->mseq;
1242
1236 if (session && session->s_mds != mds) { 1243 if (session && session->s_mds != mds) {
1237 dout("oops, wrong session %p mutex\n", session); 1244 dout("oops, wrong session %p mutex\n", session);
1238 mutex_unlock(&session->s_mutex); 1245 mutex_unlock(&session->s_mutex);
@@ -1251,8 +1258,8 @@ retry:
1251 } 1258 }
1252 /* 1259 /*
1253 * if session == NULL, we raced against a cap 1260 * if session == NULL, we raced against a cap
1254 * deletion. retry, and we'll get a better 1261 * deletion or migration. retry, and we'll
1255 * @mds value next time. 1262 * get a better @mds value next time.
1256 */ 1263 */
1257 spin_lock(&inode->i_lock); 1264 spin_lock(&inode->i_lock);
1258 goto retry; 1265 goto retry;
@@ -1290,6 +1297,7 @@ retry:
1290 list_del_init(&ci->i_snap_flush_item); 1297 list_del_init(&ci->i_snap_flush_item);
1291 spin_unlock(&mdsc->snap_flush_lock); 1298 spin_unlock(&mdsc->snap_flush_lock);
1292 1299
1300out:
1293 if (psession) 1301 if (psession)
1294 *psession = session; 1302 *psession = session;
1295 else if (session) { 1303 else if (session) {
@@ -1435,7 +1443,6 @@ static int try_nonblocking_invalidate(struct inode *inode)
1435 */ 1443 */
1436void ceph_check_caps(struct ceph_inode_info *ci, int flags, 1444void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1437 struct ceph_mds_session *session) 1445 struct ceph_mds_session *session)
1438 __releases(session->s_mutex)
1439{ 1446{
1440 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); 1447 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1441 struct ceph_mds_client *mdsc = &client->mdsc; 1448 struct ceph_mds_client *mdsc = &client->mdsc;
@@ -1510,11 +1517,13 @@ retry_locked:
1510 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ 1517 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1511 ci->i_rdcache_gen && /* may have cached pages */ 1518 ci->i_rdcache_gen && /* may have cached pages */
1512 (file_wanted == 0 || /* no open files */ 1519 (file_wanted == 0 || /* no open files */
1513 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */ 1520 (revoking & (CEPH_CAP_FILE_CACHE|
1521 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
1514 !tried_invalidate) { 1522 !tried_invalidate) {
1515 dout("check_caps trying to invalidate on %p\n", inode); 1523 dout("check_caps trying to invalidate on %p\n", inode);
1516 if (try_nonblocking_invalidate(inode) < 0) { 1524 if (try_nonblocking_invalidate(inode) < 0) {
1517 if (revoking & CEPH_CAP_FILE_CACHE) { 1525 if (revoking & (CEPH_CAP_FILE_CACHE|
1526 CEPH_CAP_FILE_LAZYIO)) {
1518 dout("check_caps queuing invalidate\n"); 1527 dout("check_caps queuing invalidate\n");
1519 queue_invalidate = 1; 1528 queue_invalidate = 1;
1520 ci->i_rdcache_revoking = ci->i_rdcache_gen; 1529 ci->i_rdcache_revoking = ci->i_rdcache_gen;
@@ -2250,8 +2259,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2250 struct ceph_mds_session *session, 2259 struct ceph_mds_session *session,
2251 struct ceph_cap *cap, 2260 struct ceph_cap *cap,
2252 struct ceph_buffer *xattr_buf) 2261 struct ceph_buffer *xattr_buf)
2253 __releases(inode->i_lock) 2262 __releases(inode->i_lock)
2254 __releases(session->s_mutex)
2255{ 2263{
2256 struct ceph_inode_info *ci = ceph_inode(inode); 2264 struct ceph_inode_info *ci = ceph_inode(inode);
2257 int mds = session->s_mds; 2265 int mds = session->s_mds;
@@ -2278,6 +2286,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2278 * will invalidate _after_ writeback.) 2286 * will invalidate _after_ writeback.)
2279 */ 2287 */
2280 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && 2288 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2289 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2281 !ci->i_wrbuffer_ref) { 2290 !ci->i_wrbuffer_ref) {
2282 if (try_nonblocking_invalidate(inode) == 0) { 2291 if (try_nonblocking_invalidate(inode) == 0) {
2283 revoked_rdcache = 1; 2292 revoked_rdcache = 1;
@@ -2369,15 +2378,22 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2369 2378
2370 /* revocation, grant, or no-op? */ 2379 /* revocation, grant, or no-op? */
2371 if (cap->issued & ~newcaps) { 2380 if (cap->issued & ~newcaps) {
2372 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued), 2381 int revoking = cap->issued & ~newcaps;
2373 ceph_cap_string(newcaps)); 2382
2374 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER) 2383 dout("revocation: %s -> %s (revoking %s)\n",
2375 writeback = 1; /* will delay ack */ 2384 ceph_cap_string(cap->issued),
2376 else if (dirty & ~newcaps) 2385 ceph_cap_string(newcaps),
2377 check_caps = 1; /* initiate writeback in check_caps */ 2386 ceph_cap_string(revoking));
2378 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 || 2387 if (revoking & used & CEPH_CAP_FILE_BUFFER)
2379 revoked_rdcache) 2388 writeback = 1; /* initiate writeback; will delay ack */
2380 check_caps = 2; /* send revoke ack in check_caps */ 2389 else if (revoking == CEPH_CAP_FILE_CACHE &&
2390 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2391 queue_invalidate)
2392 ; /* do nothing yet, invalidation will be queued */
2393 else if (cap == ci->i_auth_cap)
2394 check_caps = 1; /* check auth cap only */
2395 else
2396 check_caps = 2; /* check all caps */
2381 cap->issued = newcaps; 2397 cap->issued = newcaps;
2382 cap->implemented |= newcaps; 2398 cap->implemented |= newcaps;
2383 } else if (cap->issued == newcaps) { 2399 } else if (cap->issued == newcaps) {
@@ -2568,7 +2584,8 @@ static void handle_cap_trunc(struct inode *inode,
2568 * caller holds s_mutex 2584 * caller holds s_mutex
2569 */ 2585 */
2570static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, 2586static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2571 struct ceph_mds_session *session) 2587 struct ceph_mds_session *session,
2588 int *open_target_sessions)
2572{ 2589{
2573 struct ceph_inode_info *ci = ceph_inode(inode); 2590 struct ceph_inode_info *ci = ceph_inode(inode);
2574 int mds = session->s_mds; 2591 int mds = session->s_mds;
@@ -2600,6 +2617,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2600 ci->i_cap_exporting_mds = mds; 2617 ci->i_cap_exporting_mds = mds;
2601 ci->i_cap_exporting_mseq = mseq; 2618 ci->i_cap_exporting_mseq = mseq;
2602 ci->i_cap_exporting_issued = cap->issued; 2619 ci->i_cap_exporting_issued = cap->issued;
2620
2621 /*
2622 * make sure we have open sessions with all possible
2623 * export targets, so that we get the matching IMPORT
2624 */
2625 *open_target_sessions = 1;
2603 } 2626 }
2604 __ceph_remove_cap(cap); 2627 __ceph_remove_cap(cap);
2605 } 2628 }
@@ -2675,6 +2698,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2675 u64 size, max_size; 2698 u64 size, max_size;
2676 u64 tid; 2699 u64 tid;
2677 void *snaptrace; 2700 void *snaptrace;
2701 size_t snaptrace_len;
2702 void *flock;
2703 u32 flock_len;
2704 int open_target_sessions = 0;
2678 2705
2679 dout("handle_caps from mds%d\n", mds); 2706 dout("handle_caps from mds%d\n", mds);
2680 2707
@@ -2683,7 +2710,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2683 if (msg->front.iov_len < sizeof(*h)) 2710 if (msg->front.iov_len < sizeof(*h))
2684 goto bad; 2711 goto bad;
2685 h = msg->front.iov_base; 2712 h = msg->front.iov_base;
2686 snaptrace = h + 1;
2687 op = le32_to_cpu(h->op); 2713 op = le32_to_cpu(h->op);
2688 vino.ino = le64_to_cpu(h->ino); 2714 vino.ino = le64_to_cpu(h->ino);
2689 vino.snap = CEPH_NOSNAP; 2715 vino.snap = CEPH_NOSNAP;
@@ -2693,6 +2719,21 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2693 size = le64_to_cpu(h->size); 2719 size = le64_to_cpu(h->size);
2694 max_size = le64_to_cpu(h->max_size); 2720 max_size = le64_to_cpu(h->max_size);
2695 2721
2722 snaptrace = h + 1;
2723 snaptrace_len = le32_to_cpu(h->snap_trace_len);
2724
2725 if (le16_to_cpu(msg->hdr.version) >= 2) {
2726 void *p, *end;
2727
2728 p = snaptrace + snaptrace_len;
2729 end = msg->front.iov_base + msg->front.iov_len;
2730 ceph_decode_32_safe(&p, end, flock_len, bad);
2731 flock = p;
2732 } else {
2733 flock = NULL;
2734 flock_len = 0;
2735 }
2736
2696 mutex_lock(&session->s_mutex); 2737 mutex_lock(&session->s_mutex);
2697 session->s_seq++; 2738 session->s_seq++;
2698 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, 2739 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
@@ -2714,7 +2755,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2714 * along for the mds (who clearly thinks we still have this 2755 * along for the mds (who clearly thinks we still have this
2715 * cap). 2756 * cap).
2716 */ 2757 */
2717 ceph_add_cap_releases(mdsc, session, -1); 2758 ceph_add_cap_releases(mdsc, session);
2718 ceph_send_cap_releases(mdsc, session); 2759 ceph_send_cap_releases(mdsc, session);
2719 goto done; 2760 goto done;
2720 } 2761 }
@@ -2726,12 +2767,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2726 goto done; 2767 goto done;
2727 2768
2728 case CEPH_CAP_OP_EXPORT: 2769 case CEPH_CAP_OP_EXPORT:
2729 handle_cap_export(inode, h, session); 2770 handle_cap_export(inode, h, session, &open_target_sessions);
2730 goto done; 2771 goto done;
2731 2772
2732 case CEPH_CAP_OP_IMPORT: 2773 case CEPH_CAP_OP_IMPORT:
2733 handle_cap_import(mdsc, inode, h, session, 2774 handle_cap_import(mdsc, inode, h, session,
2734 snaptrace, le32_to_cpu(h->snap_trace_len)); 2775 snaptrace, snaptrace_len);
2735 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, 2776 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2736 session); 2777 session);
2737 goto done_unlocked; 2778 goto done_unlocked;
@@ -2773,6 +2814,8 @@ done:
2773done_unlocked: 2814done_unlocked:
2774 if (inode) 2815 if (inode)
2775 iput(inode); 2816 iput(inode);
2817 if (open_target_sessions)
2818 ceph_mdsc_open_export_target_sessions(mdsc, session);
2776 return; 2819 return;
2777 2820
2778bad: 2821bad:
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
index 793f50cb7c22..5babb8e95352 100644
--- a/fs/ceph/ceph_frag.h
+++ b/fs/ceph/ceph_frag.h
@@ -1,5 +1,5 @@
1#ifndef _FS_CEPH_FRAG_H 1#ifndef FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H 2#define FS_CEPH_FRAG_H
3 3
4/* 4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space, 5 * "Frags" are a way to describe a subset of a 32-bit number space,
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
index 79d76bc4303f..3ac6cc7c1156 100644
--- a/fs/ceph/ceph_fs.c
+++ b/fs/ceph/ceph_fs.c
@@ -29,46 +29,44 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
29 29
30int ceph_flags_to_mode(int flags) 30int ceph_flags_to_mode(int flags)
31{ 31{
32 int mode;
33
32#ifdef O_DIRECTORY /* fixme */ 34#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY) 35 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN; 36 return CEPH_FILE_MODE_PIN;
35#endif 37#endif
38 if ((flags & O_APPEND) == O_APPEND)
39 flags |= O_WRONLY;
40
41 if ((flags & O_ACCMODE) == O_RDWR)
42 mode = CEPH_FILE_MODE_RDWR;
43 else if ((flags & O_ACCMODE) == O_WRONLY)
44 mode = CEPH_FILE_MODE_WR;
45 else
46 mode = CEPH_FILE_MODE_RD;
47
36#ifdef O_LAZY 48#ifdef O_LAZY
37 if (flags & O_LAZY) 49 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY; 50 mode |= CEPH_FILE_MODE_LAZY;
39#endif 51#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42 52
43 flags &= O_ACCMODE; 53 return mode;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49} 54}
50 55
51int ceph_caps_for_mode(int mode) 56int ceph_caps_for_mode(int mode)
52{ 57{
53 switch (mode) { 58 int caps = CEPH_CAP_PIN;
54 case CEPH_FILE_MODE_PIN: 59
55 return CEPH_CAP_PIN; 60 if (mode & CEPH_FILE_MODE_RD)
56 case CEPH_FILE_MODE_RD: 61 caps |= CEPH_CAP_FILE_SHARED |
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; 62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR: 63 if (mode & CEPH_FILE_MODE_WR)
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | 64 caps |= CEPH_CAP_FILE_EXCL |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | 65 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | 66 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; 67 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 } 68 if (mode & CEPH_FILE_MODE_LAZY)
73 return 0; 69 caps |= CEPH_CAP_FILE_LAZYIO;
70
71 return caps;
74} 72}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 2fa992eaf7da..d5619ac86711 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -9,27 +9,13 @@
9 * LGPL2 9 * LGPL2
10 */ 10 */
11 11
12#ifndef _FS_CEPH_CEPH_FS_H 12#ifndef CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H 13#define CEPH_FS_H
14 14
15#include "msgr.h" 15#include "msgr.h"
16#include "rados.h" 16#include "rados.h"
17 17
18/* 18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 20
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific messages types or high-level 19 * subprotocol versions. when specific messages types or high-level
34 * protocols change, bump the affected components. we keep rev 20 * protocols change, bump the affected components. we keep rev
35 * internal cluster protocols separately from the public, 21 * internal cluster protocols separately from the public,
@@ -53,18 +39,10 @@
53/* 39/*
54 * feature bits 40 * feature bits
55 */ 41 */
56#define CEPH_FEATURE_UID 1 42#define CEPH_FEATURE_UID (1<<0)
57#define CEPH_FEATURE_NOSRCADDR 2 43#define CEPH_FEATURE_NOSRCADDR (1<<1)
58#define CEPH_FEATURE_FLOCK 4 44#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
59 45#define CEPH_FEATURE_FLOCK (1<<3)
60#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
61#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID
62#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
63#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID
64#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
65#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID
66#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
67#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
68 46
69 47
70/* 48/*
@@ -96,6 +74,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
96#define CEPH_CRYPTO_NONE 0x0 74#define CEPH_CRYPTO_NONE 0x0
97#define CEPH_CRYPTO_AES 0x1 75#define CEPH_CRYPTO_AES 0x1
98 76
77#define CEPH_AES_IV "cephsageyudagreg"
78
99/* security/authentication protocols */ 79/* security/authentication protocols */
100#define CEPH_AUTH_UNKNOWN 0x0 80#define CEPH_AUTH_UNKNOWN 0x0
101#define CEPH_AUTH_NONE 0x1 81#define CEPH_AUTH_NONE 0x1
@@ -275,6 +255,7 @@ extern const char *ceph_mds_state_name(int s);
275#define CEPH_LOCK_IDFT 512 /* dir frag tree */ 255#define CEPH_LOCK_IDFT 512 /* dir frag tree */
276#define CEPH_LOCK_INEST 1024 /* mds internal */ 256#define CEPH_LOCK_INEST 1024 /* mds internal */
277#define CEPH_LOCK_IXATTR 2048 257#define CEPH_LOCK_IXATTR 2048
258#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
278#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ 259#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
279 260
280/* client_session ops */ 261/* client_session ops */
@@ -316,6 +297,8 @@ enum {
316 CEPH_MDS_OP_RMXATTR = 0x01106, 297 CEPH_MDS_OP_RMXATTR = 0x01106,
317 CEPH_MDS_OP_SETLAYOUT = 0x01107, 298 CEPH_MDS_OP_SETLAYOUT = 0x01107,
318 CEPH_MDS_OP_SETATTR = 0x01108, 299 CEPH_MDS_OP_SETATTR = 0x01108,
300 CEPH_MDS_OP_SETFILELOCK= 0x01109,
301 CEPH_MDS_OP_GETFILELOCK= 0x00110,
319 302
320 CEPH_MDS_OP_MKNOD = 0x01201, 303 CEPH_MDS_OP_MKNOD = 0x01201,
321 CEPH_MDS_OP_LINK = 0x01202, 304 CEPH_MDS_OP_LINK = 0x01202,
@@ -386,6 +369,15 @@ union ceph_mds_request_args {
386 struct { 369 struct {
387 struct ceph_file_layout layout; 370 struct ceph_file_layout layout;
388 } __attribute__ ((packed)) setlayout; 371 } __attribute__ ((packed)) setlayout;
372 struct {
373 __u8 rule; /* currently fcntl or flock */
374 __u8 type; /* shared, exclusive, remove*/
375 __le64 pid; /* process id requesting the lock */
376 __le64 pid_namespace;
377 __le64 start; /* initial location to lock */
378 __le64 length; /* num bytes to lock from start */
379 __u8 wait; /* will caller wait for lock to become available? */
380 } __attribute__ ((packed)) filelock_change;
389} __attribute__ ((packed)); 381} __attribute__ ((packed));
390 382
391#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ 383#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
@@ -480,6 +472,23 @@ struct ceph_mds_reply_dirfrag {
480 __le32 dist[]; 472 __le32 dist[];
481} __attribute__ ((packed)); 473} __attribute__ ((packed));
482 474
475#define CEPH_LOCK_FCNTL 1
476#define CEPH_LOCK_FLOCK 2
477
478#define CEPH_LOCK_SHARED 1
479#define CEPH_LOCK_EXCL 2
480#define CEPH_LOCK_UNLOCK 4
481
482struct ceph_filelock {
483 __le64 start;/* file offset to start lock at */
484 __le64 length; /* num bytes to lock; 0 for all following start */
485 __le64 client; /* which client holds the lock */
486 __le64 pid; /* process id holding the lock on the client */
487 __le64 pid_namespace;
488 __u8 type; /* shared lock, exclusive lock, or unlock */
489} __attribute__ ((packed));
490
491
483/* file access modes */ 492/* file access modes */
484#define CEPH_FILE_MODE_PIN 0 493#define CEPH_FILE_MODE_PIN 0
485#define CEPH_FILE_MODE_RD 1 494#define CEPH_FILE_MODE_RD 1
@@ -508,9 +517,10 @@ int ceph_flags_to_mode(int flags);
508#define CEPH_CAP_SAUTH 2 517#define CEPH_CAP_SAUTH 2
509#define CEPH_CAP_SLINK 4 518#define CEPH_CAP_SLINK 4
510#define CEPH_CAP_SXATTR 6 519#define CEPH_CAP_SXATTR 6
511#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */ 520#define CEPH_CAP_SFILE 8
521#define CEPH_CAP_SFLOCK 20
512 522
513#define CEPH_CAP_BITS 16 523#define CEPH_CAP_BITS 22
514 524
515/* composed values */ 525/* composed values */
516#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) 526#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
@@ -528,6 +538,9 @@ int ceph_flags_to_mode(int flags);
528#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE) 538#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
529#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE) 539#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
530#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE) 540#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
541#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK)
542#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK)
543
531 544
532/* cap masks (for getattr) */ 545/* cap masks (for getattr) */
533#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN 546#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
@@ -563,7 +576,8 @@ int ceph_flags_to_mode(int flags);
563 CEPH_CAP_FILE_EXCL) 576 CEPH_CAP_FILE_EXCL)
564#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) 577#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
565#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ 578#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
566 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN) 579 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
580 CEPH_CAP_PIN)
567 581
568#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ 582#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
569 CEPH_LOCK_IXATTR) 583 CEPH_LOCK_IXATTR)
@@ -653,12 +667,21 @@ struct ceph_mds_cap_reconnect {
653 __le64 cap_id; 667 __le64 cap_id;
654 __le32 wanted; 668 __le32 wanted;
655 __le32 issued; 669 __le32 issued;
670 __le64 snaprealm;
671 __le64 pathbase; /* base ino for our path to this ino */
672 __le32 flock_len; /* size of flock state blob, if any */
673} __attribute__ ((packed));
674/* followed by flock blob */
675
676struct ceph_mds_cap_reconnect_v1 {
677 __le64 cap_id;
678 __le32 wanted;
679 __le32 issued;
656 __le64 size; 680 __le64 size;
657 struct ceph_timespec mtime, atime; 681 struct ceph_timespec mtime, atime;
658 __le64 snaprealm; 682 __le64 snaprealm;
659 __le64 pathbase; /* base ino for our path to this ino */ 683 __le64 pathbase; /* base ino for our path to this ino */
660} __attribute__ ((packed)); 684} __attribute__ ((packed));
661/* followed by encoded string */
662 685
663struct ceph_mds_snaprealm_reconnect { 686struct ceph_mds_snaprealm_reconnect {
664 __le64 ino; /* snap realm base */ 687 __le64 ino; /* snap realm base */
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
index 5ac470c433c9..d099c3f90236 100644
--- a/fs/ceph/ceph_hash.h
+++ b/fs/ceph/ceph_hash.h
@@ -1,5 +1,5 @@
1#ifndef _FS_CEPH_HASH_H 1#ifndef FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H 2#define FS_CEPH_HASH_H
3 3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */ 4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */ 5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 7503aee828ce..c6179d3a26a2 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -28,6 +28,7 @@ const char *ceph_osd_op_name(int op)
28 case CEPH_OSD_OP_TRUNCATE: return "truncate"; 28 case CEPH_OSD_OP_TRUNCATE: return "truncate";
29 case CEPH_OSD_OP_ZERO: return "zero"; 29 case CEPH_OSD_OP_ZERO: return "zero";
30 case CEPH_OSD_OP_WRITEFULL: return "writefull"; 30 case CEPH_OSD_OP_WRITEFULL: return "writefull";
31 case CEPH_OSD_OP_ROLLBACK: return "rollback";
31 32
32 case CEPH_OSD_OP_APPEND: return "append"; 33 case CEPH_OSD_OP_APPEND: return "append";
33 case CEPH_OSD_OP_STARTSYNC: return "startsync"; 34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
@@ -129,6 +130,8 @@ const char *ceph_mds_op_name(int op)
129 case CEPH_MDS_OP_LSSNAP: return "lssnap"; 130 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap"; 131 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap"; 132 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
133 case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
134 case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
132 } 135 }
133 return "???"; 136 return "???";
134} 137}
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
index dcd7e7523700..97e435b191f4 100644
--- a/fs/ceph/crush/crush.h
+++ b/fs/ceph/crush/crush.h
@@ -1,5 +1,5 @@
1#ifndef _CRUSH_CRUSH_H 1#ifndef CEPH_CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H 2#define CEPH_CRUSH_CRUSH_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5 5
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
index ff48e110e4bb..91e884230d5d 100644
--- a/fs/ceph/crush/hash.h
+++ b/fs/ceph/crush/hash.h
@@ -1,5 +1,5 @@
1#ifndef _CRUSH_HASH_H 1#ifndef CEPH_CRUSH_HASH_H
2#define _CRUSH_HASH_H 2#define CEPH_CRUSH_HASH_H
3 3
4#define CRUSH_HASH_RJENKINS1 0 4#define CRUSH_HASH_RJENKINS1 0
5 5
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
index 98e90046fd9f..c46b99c18bb0 100644
--- a/fs/ceph/crush/mapper.h
+++ b/fs/ceph/crush/mapper.h
@@ -1,5 +1,5 @@
1#ifndef _CRUSH_MAPPER_H 1#ifndef CEPH_CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H 2#define CEPH_CRUSH_MAPPER_H
3 3
4/* 4/*
5 * CRUSH functions for find rules and then mapping an input to an 5 * CRUSH functions for find rules and then mapping an input to an
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index f704b3b62424..a3e627f63293 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -75,10 +75,11 @@ static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); 75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76} 76}
77 77
78const u8 *aes_iv = "cephsageyudagreg"; 78static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
79 79
80int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len, 80static int ceph_aes_encrypt(const void *key, int key_len,
81 const void *src, size_t src_len) 81 void *dst, size_t *dst_len,
82 const void *src, size_t src_len)
82{ 83{
83 struct scatterlist sg_in[2], sg_out[1]; 84 struct scatterlist sg_in[2], sg_out[1];
84 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 85 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -126,9 +127,10 @@ int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
126 return 0; 127 return 0;
127} 128}
128 129
129int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len, 130static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
130 const void *src1, size_t src1_len, 131 size_t *dst_len,
131 const void *src2, size_t src2_len) 132 const void *src1, size_t src1_len,
133 const void *src2, size_t src2_len)
132{ 134{
133 struct scatterlist sg_in[3], sg_out[1]; 135 struct scatterlist sg_in[3], sg_out[1];
134 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 136 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -179,8 +181,9 @@ int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
179 return 0; 181 return 0;
180} 182}
181 183
182int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len, 184static int ceph_aes_decrypt(const void *key, int key_len,
183 const void *src, size_t src_len) 185 void *dst, size_t *dst_len,
186 const void *src, size_t src_len)
184{ 187{
185 struct scatterlist sg_in[1], sg_out[2]; 188 struct scatterlist sg_in[1], sg_out[2];
186 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 189 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -238,10 +241,10 @@ int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
238 return 0; 241 return 0;
239} 242}
240 243
241int ceph_aes_decrypt2(const void *key, int key_len, 244static int ceph_aes_decrypt2(const void *key, int key_len,
242 void *dst1, size_t *dst1_len, 245 void *dst1, size_t *dst1_len,
243 void *dst2, size_t *dst2_len, 246 void *dst2, size_t *dst2_len,
244 const void *src, size_t src_len) 247 const void *src, size_t src_len)
245{ 248{
246 struct scatterlist sg_in[1], sg_out[3]; 249 struct scatterlist sg_in[1], sg_out[3];
247 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 250 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
index 40b502e6bd89..bdf38607323c 100644
--- a/fs/ceph/crypto.h
+++ b/fs/ceph/crypto.h
@@ -42,7 +42,7 @@ extern int ceph_encrypt2(struct ceph_crypto_key *secret,
42 const void *src2, size_t src2_len); 42 const void *src2, size_t src2_len);
43 43
44/* armor.c */ 44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end); 45extern int ceph_armor(char *dst, const char *src, const char *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end); 46extern int ceph_unarmor(char *dst, const char *src, const char *end);
47 47
48#endif 48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f2f5332ddbba..360c4f22718d 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -291,7 +291,7 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
291 return 0; 291 return 0;
292} 292}
293 293
294#define DEFINE_SHOW_FUNC(name) \ 294#define DEFINE_SHOW_FUNC(name) \
295static int name##_open(struct inode *inode, struct file *file) \ 295static int name##_open(struct inode *inode, struct file *file) \
296{ \ 296{ \
297 struct seq_file *sf; \ 297 struct seq_file *sf; \
@@ -361,8 +361,8 @@ int ceph_debugfs_client_init(struct ceph_client *client)
361 int ret = 0; 361 int ret = 0;
362 char name[80]; 362 char name[80];
363 363
364 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld", 364 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
365 PR_FSID(&client->fsid), client->monc.auth->global_id); 365 client->monc.auth->global_id);
366 366
367 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); 367 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
368 if (!client->debugfs_dir) 368 if (!client->debugfs_dir)
@@ -432,11 +432,12 @@ int ceph_debugfs_client_init(struct ceph_client *client)
432 if (!client->debugfs_caps) 432 if (!client->debugfs_caps)
433 goto out; 433 goto out;
434 434
435 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", 435 client->debugfs_congestion_kb =
436 0600, 436 debugfs_create_file("writeback_congestion_kb",
437 client->debugfs_dir, 437 0600,
438 client, 438 client->debugfs_dir,
439 &congestion_kb_fops); 439 client,
440 &congestion_kb_fops);
440 if (!client->debugfs_congestion_kb) 441 if (!client->debugfs_congestion_kb)
441 goto out; 442 goto out;
442 443
@@ -466,7 +467,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
466 debugfs_remove(client->debugfs_dir); 467 debugfs_remove(client->debugfs_dir);
467} 468}
468 469
469#else // CONFIG_DEBUG_FS 470#else /* CONFIG_DEBUG_FS */
470 471
471int __init ceph_debugfs_init(void) 472int __init ceph_debugfs_init(void)
472{ 473{
@@ -486,4 +487,4 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
486{ 487{
487} 488}
488 489
489#endif // CONFIG_DEBUG_FS 490#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
index 65b3e022eaf5..3d25415afe63 100644
--- a/fs/ceph/decode.h
+++ b/fs/ceph/decode.h
@@ -99,11 +99,13 @@ static inline void ceph_encode_timespec(struct ceph_timespec *tv,
99 */ 99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a) 100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{ 101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family); 102 __be16 ss_family = htons(a->in_addr.ss_family);
103 a->in_addr.ss_family = *(__u16 *)&ss_family;
103} 104}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a) 105static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{ 106{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family); 107 __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
108 a->in_addr.ss_family = ntohs(ss_family);
107 WARN_ON(a->in_addr.ss_family == 512); 109 WARN_ON(a->in_addr.ss_family == 512);
108} 110}
109 111
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f94ed3c7f6a5..67bbb41d5526 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -27,7 +27,7 @@
27 27
28const struct inode_operations ceph_dir_iops; 28const struct inode_operations ceph_dir_iops;
29const struct file_operations ceph_dir_fops; 29const struct file_operations ceph_dir_fops;
30struct dentry_operations ceph_dentry_ops; 30const struct dentry_operations ceph_dentry_ops;
31 31
32/* 32/*
33 * Initialize ceph dentry state. 33 * Initialize ceph dentry state.
@@ -94,6 +94,8 @@ static unsigned fpos_off(loff_t p)
94 */ 94 */
95static int __dcache_readdir(struct file *filp, 95static int __dcache_readdir(struct file *filp,
96 void *dirent, filldir_t filldir) 96 void *dirent, filldir_t filldir)
97 __releases(inode->i_lock)
98 __acquires(inode->i_lock)
97{ 99{
98 struct inode *inode = filp->f_dentry->d_inode; 100 struct inode *inode = filp->f_dentry->d_inode;
99 struct ceph_file_info *fi = filp->private_data; 101 struct ceph_file_info *fi = filp->private_data;
@@ -1239,16 +1241,16 @@ const struct inode_operations ceph_dir_iops = {
1239 .create = ceph_create, 1241 .create = ceph_create,
1240}; 1242};
1241 1243
1242struct dentry_operations ceph_dentry_ops = { 1244const struct dentry_operations ceph_dentry_ops = {
1243 .d_revalidate = ceph_d_revalidate, 1245 .d_revalidate = ceph_d_revalidate,
1244 .d_release = ceph_dentry_release, 1246 .d_release = ceph_dentry_release,
1245}; 1247};
1246 1248
1247struct dentry_operations ceph_snapdir_dentry_ops = { 1249const struct dentry_operations ceph_snapdir_dentry_ops = {
1248 .d_revalidate = ceph_snapdir_d_revalidate, 1250 .d_revalidate = ceph_snapdir_d_revalidate,
1249 .d_release = ceph_dentry_release, 1251 .d_release = ceph_dentry_release,
1250}; 1252};
1251 1253
1252struct dentry_operations ceph_snap_dentry_ops = { 1254const struct dentry_operations ceph_snap_dentry_ops = {
1253 .d_release = ceph_dentry_release, 1255 .d_release = ceph_dentry_release,
1254}; 1256};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7c08698fad3e..8c044a4f0457 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -317,7 +317,7 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
317/* 317/*
318 * allocate a vector new pages 318 * allocate a vector new pages
319 */ 319 */
320struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) 320static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{ 321{
322 struct page **pages; 322 struct page **pages;
323 int i; 323 int i;
@@ -665,7 +665,7 @@ more:
665 * throw out any page cache pages in this range. this 665 * throw out any page cache pages in this range. this
666 * may block. 666 * may block.
667 */ 667 */
668 truncate_inode_pages_range(inode->i_mapping, pos, 668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1)); 669 (pos+len) | (PAGE_CACHE_SIZE-1));
670 } else { 670 } else {
671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
@@ -740,28 +740,32 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
740 unsigned long nr_segs, loff_t pos) 740 unsigned long nr_segs, loff_t pos)
741{ 741{
742 struct file *filp = iocb->ki_filp; 742 struct file *filp = iocb->ki_filp;
743 struct ceph_file_info *fi = filp->private_data;
743 loff_t *ppos = &iocb->ki_pos; 744 loff_t *ppos = &iocb->ki_pos;
744 size_t len = iov->iov_len; 745 size_t len = iov->iov_len;
745 struct inode *inode = filp->f_dentry->d_inode; 746 struct inode *inode = filp->f_dentry->d_inode;
746 struct ceph_inode_info *ci = ceph_inode(inode); 747 struct ceph_inode_info *ci = ceph_inode(inode);
747 void *base = iov->iov_base; 748 void __user *base = iov->iov_base;
748 ssize_t ret; 749 ssize_t ret;
749 int got = 0; 750 int want, got = 0;
750 int checkeof = 0, read = 0; 751 int checkeof = 0, read = 0;
751 752
752 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 753 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
753 inode, ceph_vinop(inode), pos, (unsigned)len, inode); 754 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
754again: 755again:
755 __ceph_do_pending_vmtruncate(inode); 756 __ceph_do_pending_vmtruncate(inode);
756 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, 757 if (fi->fmode & CEPH_FILE_MODE_LAZY)
757 &got, -1); 758 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
759 else
760 want = CEPH_CAP_FILE_CACHE;
761 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
758 if (ret < 0) 762 if (ret < 0)
759 goto out; 763 goto out;
760 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 764 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
761 inode, ceph_vinop(inode), pos, (unsigned)len, 765 inode, ceph_vinop(inode), pos, (unsigned)len,
762 ceph_cap_string(got)); 766 ceph_cap_string(got));
763 767
764 if ((got & CEPH_CAP_FILE_CACHE) == 0 || 768 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
765 (iocb->ki_filp->f_flags & O_DIRECT) || 769 (iocb->ki_filp->f_flags & O_DIRECT) ||
766 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) 770 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
767 /* hmm, this isn't really async... */ 771 /* hmm, this isn't really async... */
@@ -807,11 +811,12 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
807 unsigned long nr_segs, loff_t pos) 811 unsigned long nr_segs, loff_t pos)
808{ 812{
809 struct file *file = iocb->ki_filp; 813 struct file *file = iocb->ki_filp;
814 struct ceph_file_info *fi = file->private_data;
810 struct inode *inode = file->f_dentry->d_inode; 815 struct inode *inode = file->f_dentry->d_inode;
811 struct ceph_inode_info *ci = ceph_inode(inode); 816 struct ceph_inode_info *ci = ceph_inode(inode);
812 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 817 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
813 loff_t endoff = pos + iov->iov_len; 818 loff_t endoff = pos + iov->iov_len;
814 int got = 0; 819 int want, got = 0;
815 int ret, err; 820 int ret, err;
816 821
817 if (ceph_snap(inode) != CEPH_NOSNAP) 822 if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -824,8 +829,11 @@ retry_snap:
824 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", 829 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
825 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 830 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
826 inode->i_size); 831 inode->i_size);
827 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, 832 if (fi->fmode & CEPH_FILE_MODE_LAZY)
828 &got, endoff); 833 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
834 else
835 want = CEPH_CAP_FILE_BUFFER;
836 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
829 if (ret < 0) 837 if (ret < 0)
830 goto out; 838 goto out;
831 839
@@ -833,7 +841,7 @@ retry_snap:
833 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 841 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
834 ceph_cap_string(got)); 842 ceph_cap_string(got));
835 843
836 if ((got & CEPH_CAP_FILE_BUFFER) == 0 || 844 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
837 (iocb->ki_filp->f_flags & O_DIRECT) || 845 (iocb->ki_filp->f_flags & O_DIRECT) ||
838 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { 846 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
839 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, 847 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
@@ -930,6 +938,8 @@ const struct file_operations ceph_file_fops = {
930 .aio_write = ceph_aio_write, 938 .aio_write = ceph_aio_write,
931 .mmap = ceph_mmap, 939 .mmap = ceph_mmap,
932 .fsync = ceph_fsync, 940 .fsync = ceph_fsync,
941 .lock = ceph_lock,
942 .flock = ceph_flock,
933 .splice_read = generic_file_splice_read, 943 .splice_read = generic_file_splice_read,
934 .splice_write = generic_file_splice_write, 944 .splice_write = generic_file_splice_write,
935 .unlocked_ioctl = ceph_ioctl, 945 .unlocked_ioctl = ceph_ioctl,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 389f9dbd9949..5d893d31e399 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -442,8 +442,9 @@ int ceph_fill_file_size(struct inode *inode, int issued,
442 * the file is either opened or mmaped 442 * the file is either opened or mmaped
443 */ 443 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| 444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| 445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) || 446 CEPH_CAP_FILE_EXCL|
447 CEPH_CAP_FILE_LAZYIO)) ||
447 mapping_mapped(inode->i_mapping) || 448 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) { 449 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++; 450 ci->i_truncate_pending++;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index d085f07756b4..76e307d2aba1 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -143,6 +143,27 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
143 return 0; 143 return 0;
144} 144}
145 145
146static long ceph_ioctl_lazyio(struct file *file)
147{
148 struct ceph_file_info *fi = file->private_data;
149 struct inode *inode = file->f_dentry->d_inode;
150 struct ceph_inode_info *ci = ceph_inode(inode);
151
152 if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
153 spin_lock(&inode->i_lock);
154 ci->i_nr_by_mode[fi->fmode]--;
155 fi->fmode |= CEPH_FILE_MODE_LAZY;
156 ci->i_nr_by_mode[fi->fmode]++;
157 spin_unlock(&inode->i_lock);
158 dout("ioctl_layzio: file %p marked lazy\n", file);
159
160 ceph_check_caps(ci, 0, NULL);
161 } else {
162 dout("ioctl_layzio: file %p already lazy\n", file);
163 }
164 return 0;
165}
166
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 167long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{ 168{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); 169 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
@@ -155,6 +176,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
155 176
156 case CEPH_IOC_GET_DATALOC: 177 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg); 178 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
179
180 case CEPH_IOC_LAZYIO:
181 return ceph_ioctl_lazyio(file);
158 } 182 }
159 return -ENOTTY; 183 return -ENOTTY;
160} 184}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 25e4f1a9d059..88451a3b6857 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -37,4 +37,6 @@ struct ceph_ioctl_dataloc {
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ 37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc) 38 struct ceph_ioctl_dataloc)
39 39
40#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
41
40#endif 42#endif
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
new file mode 100644
index 000000000000..ae85af06454f
--- /dev/null
+++ b/fs/ceph/locks.c
@@ -0,0 +1,256 @@
1#include "ceph_debug.h"
2
3#include <linux/file.h>
4#include <linux/namei.h>
5
6#include "super.h"
7#include "mds_client.h"
8#include "pagelist.h"
9
10/**
11 * Implement fcntl and flock locking functions.
12 */
13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
14 u64 pid, u64 pid_ns,
15 int cmd, u64 start, u64 length, u8 wait)
16{
17 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc =
19 &ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req;
21 int err;
22
23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
24 if (IS_ERR(req))
25 return PTR_ERR(req);
26 req->r_inode = igrab(inode);
27
28 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
29 "length: %llu, wait: %d, type`: %d", (int)lock_type,
30 (int)operation, pid, start, length, wait, cmd);
31
32 req->r_args.filelock_change.rule = lock_type;
33 req->r_args.filelock_change.type = cmd;
34 req->r_args.filelock_change.pid = cpu_to_le64(pid);
35 /* This should be adjusted, but I'm not sure if
36 namespaces actually get id numbers*/
37 req->r_args.filelock_change.pid_namespace =
38 cpu_to_le64((u64)pid_ns);
39 req->r_args.filelock_change.start = cpu_to_le64(start);
40 req->r_args.filelock_change.length = cpu_to_le64(length);
41 req->r_args.filelock_change.wait = wait;
42
43 err = ceph_mdsc_do_request(mdsc, inode, req);
44 ceph_mdsc_put_request(req);
45 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
46 "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type,
47 (int)operation, pid, start, length, wait, cmd, err);
48 return err;
49}
50
51/**
52 * Attempt to set an fcntl lock.
53 * For now, this just goes away to the server. Later it may be more awesome.
54 */
55int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
56{
57 u64 length;
58 u8 lock_cmd;
59 int err;
60 u8 wait = 0;
61 u16 op = CEPH_MDS_OP_SETFILELOCK;
62
63 fl->fl_nspid = get_pid(task_tgid(current));
64 dout("ceph_lock, fl_pid:%d", fl->fl_pid);
65
66 /* set wait bit as appropriate, then make command as Ceph expects it*/
67 if (F_SETLKW == cmd)
68 wait = 1;
69 if (F_GETLK == cmd)
70 op = CEPH_MDS_OP_GETFILELOCK;
71
72 if (F_RDLCK == fl->fl_type)
73 lock_cmd = CEPH_LOCK_SHARED;
74 else if (F_WRLCK == fl->fl_type)
75 lock_cmd = CEPH_LOCK_EXCL;
76 else
77 lock_cmd = CEPH_LOCK_UNLOCK;
78
79 if (LLONG_MAX == fl->fl_end)
80 length = 0;
81 else
82 length = fl->fl_end - fl->fl_start + 1;
83
84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
85 (u64)fl->fl_pid, (u64)fl->fl_nspid,
86 lock_cmd, fl->fl_start,
87 length, wait);
88 if (!err) {
89 dout("mds locked, locking locally");
90 err = posix_lock_file(file, fl, NULL);
91 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
92 /* undo! This should only happen if the kernel detects
93 * local deadlock. */
94 ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
95 (u64)fl->fl_pid, (u64)fl->fl_nspid,
96 CEPH_LOCK_UNLOCK, fl->fl_start,
97 length, 0);
98 dout("got %d on posix_lock_file, undid lock", err);
99 }
100 } else {
101 dout("mds returned error code %d", err);
102 }
103 return err;
104}
105
106int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
107{
108 u64 length;
109 u8 lock_cmd;
110 int err;
111 u8 wait = 1;
112
113 fl->fl_nspid = get_pid(task_tgid(current));
114 dout("ceph_flock, fl_pid:%d", fl->fl_pid);
115
116 /* set wait bit, then clear it out of cmd*/
117 if (cmd & LOCK_NB)
118 wait = 0;
119 cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN);
120 /* set command sequence that Ceph wants to see:
121 shared lock, exclusive lock, or unlock */
122 if (LOCK_SH == cmd)
123 lock_cmd = CEPH_LOCK_SHARED;
124 else if (LOCK_EX == cmd)
125 lock_cmd = CEPH_LOCK_EXCL;
126 else
127 lock_cmd = CEPH_LOCK_UNLOCK;
128 /* mds requires start and length rather than start and end */
129 if (LLONG_MAX == fl->fl_end)
130 length = 0;
131 else
132 length = fl->fl_end - fl->fl_start + 1;
133
134 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
135 file, (u64)fl->fl_pid, (u64)fl->fl_nspid,
136 lock_cmd, fl->fl_start,
137 length, wait);
138 if (!err) {
139 err = flock_lock_file_wait(file, fl);
140 if (err) {
141 ceph_lock_message(CEPH_LOCK_FLOCK,
142 CEPH_MDS_OP_SETFILELOCK,
143 file, (u64)fl->fl_pid,
144 (u64)fl->fl_nspid,
145 CEPH_LOCK_UNLOCK, fl->fl_start,
146 length, 0);
147 dout("got %d on flock_lock_file_wait, undid lock", err);
148 }
149 } else {
150 dout("mds error code %d", err);
151 }
152 return err;
153}
154
155/**
156 * Must be called with BKL already held. Fills in the passed
157 * counter variables, so you can prepare pagelist metadata before calling
158 * ceph_encode_locks.
159 */
160void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
161{
162 struct file_lock *lock;
163
164 *fcntl_count = 0;
165 *flock_count = 0;
166
167 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
168 if (lock->fl_flags & FL_POSIX)
169 ++(*fcntl_count);
170 else if (lock->fl_flags & FL_FLOCK)
171 ++(*flock_count);
172 }
173 dout("counted %d flock locks and %d fcntl locks",
174 *flock_count, *fcntl_count);
175}
176
177/**
178 * Encode the flock and fcntl locks for the given inode into the pagelist.
179 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
180 * sequential flock locks.
181 * Must be called with BLK already held, and the lock numbers should have
182 * been gathered under the same lock holding window.
183 */
184int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
185 int num_fcntl_locks, int num_flock_locks)
186{
187 struct file_lock *lock;
188 struct ceph_filelock cephlock;
189 int err = 0;
190
191 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
192 num_fcntl_locks);
193 err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32));
194 if (err)
195 goto fail;
196 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
197 if (lock->fl_flags & FL_POSIX) {
198 err = lock_to_ceph_filelock(lock, &cephlock);
199 if (err)
200 goto fail;
201 err = ceph_pagelist_append(pagelist, &cephlock,
202 sizeof(struct ceph_filelock));
203 }
204 if (err)
205 goto fail;
206 }
207
208 err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32));
209 if (err)
210 goto fail;
211 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
212 if (lock->fl_flags & FL_FLOCK) {
213 err = lock_to_ceph_filelock(lock, &cephlock);
214 if (err)
215 goto fail;
216 err = ceph_pagelist_append(pagelist, &cephlock,
217 sizeof(struct ceph_filelock));
218 }
219 if (err)
220 goto fail;
221 }
222fail:
223 return err;
224}
225
226/*
227 * Given a pointer to a lock, convert it to a ceph filelock
228 */
229int lock_to_ceph_filelock(struct file_lock *lock,
230 struct ceph_filelock *cephlock)
231{
232 int err = 0;
233
234 cephlock->start = cpu_to_le64(lock->fl_start);
235 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
236 cephlock->client = cpu_to_le64(0);
237 cephlock->pid = cpu_to_le64(lock->fl_pid);
238 cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid);
239
240 switch (lock->fl_type) {
241 case F_RDLCK:
242 cephlock->type = CEPH_LOCK_SHARED;
243 break;
244 case F_WRLCK:
245 cephlock->type = CEPH_LOCK_EXCL;
246 break;
247 case F_UNLCK:
248 cephlock->type = CEPH_LOCK_UNLOCK;
249 break;
250 default:
251 dout("Have unknown lock type %d", lock->fl_type);
252 err = -EINVAL;
253 }
254
255 return err;
256}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index dd440bd438a9..a75ddbf9fe37 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3,6 +3,7 @@
3#include <linux/wait.h> 3#include <linux/wait.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/sched.h> 5#include <linux/sched.h>
6#include <linux/smp_lock.h>
6 7
7#include "mds_client.h" 8#include "mds_client.h"
8#include "mon_client.h" 9#include "mon_client.h"
@@ -37,6 +38,11 @@
37 * are no longer valid. 38 * are no longer valid.
38 */ 39 */
39 40
41struct ceph_reconnect_state {
42 struct ceph_pagelist *pagelist;
43 bool flock;
44};
45
40static void __wake_requests(struct ceph_mds_client *mdsc, 46static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head); 47 struct list_head *head);
42 48
@@ -449,7 +455,7 @@ void ceph_mdsc_release_request(struct kref *kref)
449 kfree(req->r_path1); 455 kfree(req->r_path1);
450 kfree(req->r_path2); 456 kfree(req->r_path2);
451 put_request_session(req); 457 put_request_session(req);
452 ceph_unreserve_caps(&req->r_caps_reservation); 458 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
453 kfree(req); 459 kfree(req);
454} 460}
455 461
@@ -512,7 +518,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
512{ 518{
513 req->r_tid = ++mdsc->last_tid; 519 req->r_tid = ++mdsc->last_tid;
514 if (req->r_num_caps) 520 if (req->r_num_caps)
515 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps); 521 ceph_reserve_caps(mdsc, &req->r_caps_reservation,
522 req->r_num_caps);
516 dout("__register_request %p tid %lld\n", req, req->r_tid); 523 dout("__register_request %p tid %lld\n", req, req->r_tid);
517 ceph_mdsc_get_request(req); 524 ceph_mdsc_get_request(req);
518 __insert_request(mdsc, req); 525 __insert_request(mdsc, req);
@@ -704,6 +711,51 @@ static int __open_session(struct ceph_mds_client *mdsc,
704} 711}
705 712
706/* 713/*
714 * open sessions for any export targets for the given mds
715 *
716 * called under mdsc->mutex
717 */
718static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
719 struct ceph_mds_session *session)
720{
721 struct ceph_mds_info *mi;
722 struct ceph_mds_session *ts;
723 int i, mds = session->s_mds;
724 int target;
725
726 if (mds >= mdsc->mdsmap->m_max_mds)
727 return;
728 mi = &mdsc->mdsmap->m_info[mds];
729 dout("open_export_target_sessions for mds%d (%d targets)\n",
730 session->s_mds, mi->num_export_targets);
731
732 for (i = 0; i < mi->num_export_targets; i++) {
733 target = mi->export_targets[i];
734 ts = __ceph_lookup_mds_session(mdsc, target);
735 if (!ts) {
736 ts = register_session(mdsc, target);
737 if (IS_ERR(ts))
738 return;
739 }
740 if (session->s_state == CEPH_MDS_SESSION_NEW ||
741 session->s_state == CEPH_MDS_SESSION_CLOSING)
742 __open_session(mdsc, session);
743 else
744 dout(" mds%d target mds%d %p is %s\n", session->s_mds,
745 i, ts, session_state_name(ts->s_state));
746 ceph_put_mds_session(ts);
747 }
748}
749
750void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
751 struct ceph_mds_session *session)
752{
753 mutex_lock(&mdsc->mutex);
754 __open_export_target_sessions(mdsc, session);
755 mutex_unlock(&mdsc->mutex);
756}
757
758/*
707 * session caps 759 * session caps
708 */ 760 */
709 761
@@ -764,7 +816,7 @@ static int iterate_session_caps(struct ceph_mds_session *session,
764 last_inode = NULL; 816 last_inode = NULL;
765 } 817 }
766 if (old_cap) { 818 if (old_cap) {
767 ceph_put_cap(old_cap); 819 ceph_put_cap(session->s_mdsc, old_cap);
768 old_cap = NULL; 820 old_cap = NULL;
769 } 821 }
770 822
@@ -793,7 +845,7 @@ out:
793 if (last_inode) 845 if (last_inode)
794 iput(last_inode); 846 iput(last_inode);
795 if (old_cap) 847 if (old_cap)
796 ceph_put_cap(old_cap); 848 ceph_put_cap(session->s_mdsc, old_cap);
797 849
798 return ret; 850 return ret;
799} 851}
@@ -1067,15 +1119,16 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1067 * Called under s_mutex. 1119 * Called under s_mutex.
1068 */ 1120 */
1069int ceph_add_cap_releases(struct ceph_mds_client *mdsc, 1121int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1070 struct ceph_mds_session *session, 1122 struct ceph_mds_session *session)
1071 int extra)
1072{ 1123{
1073 struct ceph_msg *msg; 1124 struct ceph_msg *msg, *partial = NULL;
1074 struct ceph_mds_cap_release *head; 1125 struct ceph_mds_cap_release *head;
1075 int err = -ENOMEM; 1126 int err = -ENOMEM;
1127 int extra = mdsc->client->mount_args->cap_release_safety;
1128 int num;
1076 1129
1077 if (extra < 0) 1130 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
1078 extra = mdsc->client->mount_args->cap_release_safety; 1131 extra);
1079 1132
1080 spin_lock(&session->s_cap_lock); 1133 spin_lock(&session->s_cap_lock);
1081 1134
@@ -1084,9 +1137,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1084 struct ceph_msg, 1137 struct ceph_msg,
1085 list_head); 1138 list_head);
1086 head = msg->front.iov_base; 1139 head = msg->front.iov_base;
1087 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); 1140 num = le32_to_cpu(head->num);
1141 if (num) {
1142 dout(" partial %p with (%d/%d)\n", msg, num,
1143 (int)CEPH_CAPS_PER_RELEASE);
1144 extra += CEPH_CAPS_PER_RELEASE - num;
1145 partial = msg;
1146 }
1088 } 1147 }
1089
1090 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1148 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1091 spin_unlock(&session->s_cap_lock); 1149 spin_unlock(&session->s_cap_lock);
1092 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1150 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
@@ -1103,19 +1161,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1103 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; 1161 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1104 } 1162 }
1105 1163
1106 if (!list_empty(&session->s_cap_releases)) { 1164 if (partial) {
1107 msg = list_first_entry(&session->s_cap_releases, 1165 head = partial->front.iov_base;
1108 struct ceph_msg, 1166 num = le32_to_cpu(head->num);
1109 list_head); 1167 dout(" queueing partial %p with %d/%d\n", partial, num,
1110 head = msg->front.iov_base; 1168 (int)CEPH_CAPS_PER_RELEASE);
1111 if (head->num) { 1169 list_move_tail(&partial->list_head,
1112 dout(" queueing non-full %p (%d)\n", msg, 1170 &session->s_cap_releases_done);
1113 le32_to_cpu(head->num)); 1171 session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
1114 list_move_tail(&msg->list_head,
1115 &session->s_cap_releases_done);
1116 session->s_num_cap_releases -=
1117 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1118 }
1119 } 1172 }
1120 err = 0; 1173 err = 0;
1121 spin_unlock(&session->s_cap_lock); 1174 spin_unlock(&session->s_cap_lock);
@@ -1250,6 +1303,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1250 return ERR_PTR(-ENOMEM); 1303 return ERR_PTR(-ENOMEM);
1251 1304
1252 mutex_init(&req->r_fill_mutex); 1305 mutex_init(&req->r_fill_mutex);
1306 req->r_mdsc = mdsc;
1253 req->r_started = jiffies; 1307 req->r_started = jiffies;
1254 req->r_resend_mds = -1; 1308 req->r_resend_mds = -1;
1255 INIT_LIST_HEAD(&req->r_unsafe_dir_item); 1309 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1580,6 +1634,15 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1580 1634
1581 req->r_mds = mds; 1635 req->r_mds = mds;
1582 req->r_attempts++; 1636 req->r_attempts++;
1637 if (req->r_inode) {
1638 struct ceph_cap *cap =
1639 ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
1640
1641 if (cap)
1642 req->r_sent_on_mseq = cap->mseq;
1643 else
1644 req->r_sent_on_mseq = -1;
1645 }
1583 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, 1646 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1584 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); 1647 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1585 1648
@@ -1914,21 +1977,40 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1914 result = le32_to_cpu(head->result); 1977 result = le32_to_cpu(head->result);
1915 1978
1916 /* 1979 /*
1917 * Tolerate 2 consecutive ESTALEs from the same mds. 1980 * Handle an ESTALE
1918 * FIXME: we should be looking at the cap migrate_seq. 1981 * if we're not talking to the authority, send to them
1982 * if the authority has changed while we weren't looking,
1983 * send to new authority
1984 * Otherwise we just have to return an ESTALE
1919 */ 1985 */
1920 if (result == -ESTALE) { 1986 if (result == -ESTALE) {
1921 req->r_direct_mode = USE_AUTH_MDS; 1987 dout("got ESTALE on request %llu", req->r_tid);
1922 req->r_num_stale++; 1988 if (!req->r_inode) {
1923 if (req->r_num_stale <= 2) { 1989 /* do nothing; not an authority problem */
1990 } else if (req->r_direct_mode != USE_AUTH_MDS) {
1991 dout("not using auth, setting for that now");
1992 req->r_direct_mode = USE_AUTH_MDS;
1924 __do_request(mdsc, req); 1993 __do_request(mdsc, req);
1925 mutex_unlock(&mdsc->mutex); 1994 mutex_unlock(&mdsc->mutex);
1926 goto out; 1995 goto out;
1996 } else {
1997 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
1998 struct ceph_cap *cap =
1999 ceph_get_cap_for_mds(ci, req->r_mds);;
2000
2001 dout("already using auth");
2002 if ((!cap || cap != ci->i_auth_cap) ||
2003 (cap->mseq != req->r_sent_on_mseq)) {
2004 dout("but cap changed, so resending");
2005 __do_request(mdsc, req);
2006 mutex_unlock(&mdsc->mutex);
2007 goto out;
2008 }
1927 } 2009 }
1928 } else { 2010 dout("have to return ESTALE on request %llu", req->r_tid);
1929 req->r_num_stale = 0;
1930 } 2011 }
1931 2012
2013
1932 if (head->safe) { 2014 if (head->safe) {
1933 req->r_got_safe = true; 2015 req->r_got_safe = true;
1934 __unregister_request(mdsc, req); 2016 __unregister_request(mdsc, req);
@@ -1985,7 +2067,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1985 if (err == 0) { 2067 if (err == 0) {
1986 if (result == 0 && rinfo->dir_nr) 2068 if (result == 0 && rinfo->dir_nr)
1987 ceph_readdir_prepopulate(req, req->r_session); 2069 ceph_readdir_prepopulate(req, req->r_session);
1988 ceph_unreserve_caps(&req->r_caps_reservation); 2070 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
1989 } 2071 }
1990 mutex_unlock(&req->r_fill_mutex); 2072 mutex_unlock(&req->r_fill_mutex);
1991 2073
@@ -2005,7 +2087,7 @@ out_err:
2005 } 2087 }
2006 mutex_unlock(&mdsc->mutex); 2088 mutex_unlock(&mdsc->mutex);
2007 2089
2008 ceph_add_cap_releases(mdsc, req->r_session, -1); 2090 ceph_add_cap_releases(mdsc, req->r_session);
2009 mutex_unlock(&session->s_mutex); 2091 mutex_unlock(&session->s_mutex);
2010 2092
2011 /* kick calling process */ 2093 /* kick calling process */
@@ -2193,9 +2275,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2193static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, 2275static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2194 void *arg) 2276 void *arg)
2195{ 2277{
2196 struct ceph_mds_cap_reconnect rec; 2278 union {
2279 struct ceph_mds_cap_reconnect v2;
2280 struct ceph_mds_cap_reconnect_v1 v1;
2281 } rec;
2282 size_t reclen;
2197 struct ceph_inode_info *ci; 2283 struct ceph_inode_info *ci;
2198 struct ceph_pagelist *pagelist = arg; 2284 struct ceph_reconnect_state *recon_state = arg;
2285 struct ceph_pagelist *pagelist = recon_state->pagelist;
2199 char *path; 2286 char *path;
2200 int pathlen, err; 2287 int pathlen, err;
2201 u64 pathbase; 2288 u64 pathbase;
@@ -2228,17 +2315,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2228 spin_lock(&inode->i_lock); 2315 spin_lock(&inode->i_lock);
2229 cap->seq = 0; /* reset cap seq */ 2316 cap->seq = 0; /* reset cap seq */
2230 cap->issue_seq = 0; /* and issue_seq */ 2317 cap->issue_seq = 0; /* and issue_seq */
2231 rec.cap_id = cpu_to_le64(cap->cap_id); 2318
2232 rec.pathbase = cpu_to_le64(pathbase); 2319 if (recon_state->flock) {
2233 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); 2320 rec.v2.cap_id = cpu_to_le64(cap->cap_id);
2234 rec.issued = cpu_to_le32(cap->issued); 2321 rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2235 rec.size = cpu_to_le64(inode->i_size); 2322 rec.v2.issued = cpu_to_le32(cap->issued);
2236 ceph_encode_timespec(&rec.mtime, &inode->i_mtime); 2323 rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2237 ceph_encode_timespec(&rec.atime, &inode->i_atime); 2324 rec.v2.pathbase = cpu_to_le64(pathbase);
2238 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); 2325 rec.v2.flock_len = 0;
2326 reclen = sizeof(rec.v2);
2327 } else {
2328 rec.v1.cap_id = cpu_to_le64(cap->cap_id);
2329 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2330 rec.v1.issued = cpu_to_le32(cap->issued);
2331 rec.v1.size = cpu_to_le64(inode->i_size);
2332 ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
2333 ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
2334 rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2335 rec.v1.pathbase = cpu_to_le64(pathbase);
2336 reclen = sizeof(rec.v1);
2337 }
2239 spin_unlock(&inode->i_lock); 2338 spin_unlock(&inode->i_lock);
2240 2339
2241 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec)); 2340 if (recon_state->flock) {
2341 int num_fcntl_locks, num_flock_locks;
2342
2343 lock_kernel();
2344 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
2345 rec.v2.flock_len = (2*sizeof(u32) +
2346 (num_fcntl_locks+num_flock_locks) *
2347 sizeof(struct ceph_filelock));
2348
2349 err = ceph_pagelist_append(pagelist, &rec, reclen);
2350 if (!err)
2351 err = ceph_encode_locks(inode, pagelist,
2352 num_fcntl_locks,
2353 num_flock_locks);
2354 unlock_kernel();
2355 }
2242 2356
2243out: 2357out:
2244 kfree(path); 2358 kfree(path);
@@ -2267,6 +2381,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2267 int mds = session->s_mds; 2381 int mds = session->s_mds;
2268 int err = -ENOMEM; 2382 int err = -ENOMEM;
2269 struct ceph_pagelist *pagelist; 2383 struct ceph_pagelist *pagelist;
2384 struct ceph_reconnect_state recon_state;
2270 2385
2271 pr_info("mds%d reconnect start\n", mds); 2386 pr_info("mds%d reconnect start\n", mds);
2272 2387
@@ -2301,7 +2416,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2301 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2416 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2302 if (err) 2417 if (err)
2303 goto fail; 2418 goto fail;
2304 err = iterate_session_caps(session, encode_caps_cb, pagelist); 2419
2420 recon_state.pagelist = pagelist;
2421 recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
2422 err = iterate_session_caps(session, encode_caps_cb, &recon_state);
2305 if (err < 0) 2423 if (err < 0)
2306 goto fail; 2424 goto fail;
2307 2425
@@ -2326,6 +2444,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2326 } 2444 }
2327 2445
2328 reply->pagelist = pagelist; 2446 reply->pagelist = pagelist;
2447 if (recon_state.flock)
2448 reply->hdr.version = cpu_to_le16(2);
2329 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2449 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2330 reply->nr_pages = calc_pages_for(0, pagelist->length); 2450 reply->nr_pages = calc_pages_for(0, pagelist->length);
2331 ceph_con_send(&session->s_con, reply); 2451 ceph_con_send(&session->s_con, reply);
@@ -2376,9 +2496,11 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2376 oldstate = ceph_mdsmap_get_state(oldmap, i); 2496 oldstate = ceph_mdsmap_get_state(oldmap, i);
2377 newstate = ceph_mdsmap_get_state(newmap, i); 2497 newstate = ceph_mdsmap_get_state(newmap, i);
2378 2498
2379 dout("check_new_map mds%d state %s -> %s (session %s)\n", 2499 dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
2380 i, ceph_mds_state_name(oldstate), 2500 i, ceph_mds_state_name(oldstate),
2501 ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
2381 ceph_mds_state_name(newstate), 2502 ceph_mds_state_name(newstate),
2503 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
2382 session_state_name(s->s_state)); 2504 session_state_name(s->s_state));
2383 2505
2384 if (memcmp(ceph_mdsmap_get_addr(oldmap, i), 2506 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
@@ -2428,6 +2550,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2428 wake_up_session_caps(s, 1); 2550 wake_up_session_caps(s, 1);
2429 } 2551 }
2430 } 2552 }
2553
2554 for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
2555 s = mdsc->sessions[i];
2556 if (!s)
2557 continue;
2558 if (!ceph_mdsmap_is_laggy(newmap, i))
2559 continue;
2560 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2561 s->s_state == CEPH_MDS_SESSION_HUNG ||
2562 s->s_state == CEPH_MDS_SESSION_CLOSING) {
2563 dout(" connecting to export targets of laggy mds%d\n",
2564 i);
2565 __open_export_target_sessions(mdsc, s);
2566 }
2567 }
2431} 2568}
2432 2569
2433 2570
@@ -2715,7 +2852,7 @@ static void delayed_work(struct work_struct *work)
2715 send_renew_caps(mdsc, s); 2852 send_renew_caps(mdsc, s);
2716 else 2853 else
2717 ceph_con_keepalive(&s->s_con); 2854 ceph_con_keepalive(&s->s_con);
2718 ceph_add_cap_releases(mdsc, s, -1); 2855 ceph_add_cap_releases(mdsc, s);
2719 if (s->s_state == CEPH_MDS_SESSION_OPEN || 2856 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2720 s->s_state == CEPH_MDS_SESSION_HUNG) 2857 s->s_state == CEPH_MDS_SESSION_HUNG)
2721 ceph_send_cap_releases(mdsc, s); 2858 ceph_send_cap_releases(mdsc, s);
@@ -2764,6 +2901,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2764 spin_lock_init(&mdsc->dentry_lru_lock); 2901 spin_lock_init(&mdsc->dentry_lru_lock);
2765 INIT_LIST_HEAD(&mdsc->dentry_lru); 2902 INIT_LIST_HEAD(&mdsc->dentry_lru);
2766 2903
2904 ceph_caps_init(mdsc);
2905 ceph_adjust_min_caps(mdsc, client->min_caps);
2906
2767 return 0; 2907 return 0;
2768} 2908}
2769 2909
@@ -2959,6 +3099,7 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2959 if (mdsc->mdsmap) 3099 if (mdsc->mdsmap)
2960 ceph_mdsmap_destroy(mdsc->mdsmap); 3100 ceph_mdsmap_destroy(mdsc->mdsmap);
2961 kfree(mdsc->sessions); 3101 kfree(mdsc->sessions);
3102 ceph_caps_finalize(mdsc);
2962} 3103}
2963 3104
2964 3105
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 952410c60d09..ab7e89f5e344 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -151,6 +151,7 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
151struct ceph_mds_request { 151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */ 152 u64 r_tid; /* transaction id */
153 struct rb_node r_node; 153 struct rb_node r_node;
154 struct ceph_mds_client *r_mdsc;
154 155
155 int r_op; /* mds op code */ 156 int r_op; /* mds op code */
156 int r_mds; 157 int r_mds;
@@ -207,8 +208,8 @@ struct ceph_mds_request {
207 208
208 int r_attempts; /* resend attempts */ 209 int r_attempts; /* resend attempts */
209 int r_num_fwd; /* number of forward attempts */ 210 int r_num_fwd; /* number of forward attempts */
210 int r_num_stale;
211 int r_resend_mds; /* mds to resend to next, if any*/ 211 int r_resend_mds; /* mds to resend to next, if any*/
212 u32 r_sent_on_mseq; /* cap mseq request was sent at*/
212 213
213 struct kref r_kref; 214 struct kref r_kref;
214 struct list_head r_wait; 215 struct list_head r_wait;
@@ -267,6 +268,27 @@ struct ceph_mds_client {
267 spinlock_t cap_dirty_lock; /* protects above items */ 268 spinlock_t cap_dirty_lock; /* protects above items */
268 wait_queue_head_t cap_flushing_wq; 269 wait_queue_head_t cap_flushing_wq;
269 270
271 /*
272 * Cap reservations
273 *
274 * Maintain a global pool of preallocated struct ceph_caps, referenced
275 * by struct ceph_cap_reservation. This ensures that we preallocate
276 * memory needed to successfully process an MDS response. (If an MDS
277 * sends us cap information and we fail to process it, we will have
278 * problems due to the client and MDS being out of sync.)
279 *
280 * Reservations are 'owned' by a ceph_cap_reservation context.
281 */
282 spinlock_t caps_list_lock;
283 struct list_head caps_list; /* unused (reserved or
284 unreserved) */
285 int caps_total_count; /* total caps allocated */
286 int caps_use_count; /* in use */
287 int caps_reserve_count; /* unused, reserved */
288 int caps_avail_count; /* unused, unreserved */
289 int caps_min_count; /* keep at least this many
290 (unreserved) */
291
270#ifdef CONFIG_DEBUG_FS 292#ifdef CONFIG_DEBUG_FS
271 struct dentry *debugfs_file; 293 struct dentry *debugfs_file;
272#endif 294#endif
@@ -324,8 +346,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
324} 346}
325 347
326extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, 348extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
327 struct ceph_mds_session *session, 349 struct ceph_mds_session *session);
328 int extra);
329extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, 350extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
330 struct ceph_mds_session *session); 351 struct ceph_mds_session *session);
331 352
@@ -343,4 +364,7 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
343extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, 364extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
344 struct ceph_msg *msg); 365 struct ceph_msg *msg);
345 366
367extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
368 struct ceph_mds_session *session);
369
346#endif 370#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index c4c498e6dfef..040be6d1150b 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -85,6 +85,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
85 struct ceph_entity_addr addr; 85 struct ceph_entity_addr addr;
86 u32 num_export_targets; 86 u32 num_export_targets;
87 void *pexport_targets = NULL; 87 void *pexport_targets = NULL;
88 struct ceph_timespec laggy_since;
88 89
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); 90 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p); 91 global_id = ceph_decode_64(p);
@@ -103,7 +104,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
103 state_seq = ceph_decode_64(p); 104 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr)); 105 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr); 106 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec); 107 ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
107 *p += sizeof(u32); 108 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad); 109 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen; 110 *p += namelen;
@@ -122,6 +123,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
122 m->m_info[mds].global_id = global_id; 123 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state; 124 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr; 125 m->m_info[mds].addr = addr;
126 m->m_info[mds].laggy =
127 (laggy_since.tv_sec != 0 ||
128 laggy_since.tv_nsec != 0);
125 m->m_info[mds].num_export_targets = num_export_targets; 129 m->m_info[mds].num_export_targets = num_export_targets;
126 if (num_export_targets) { 130 if (num_export_targets) {
127 m->m_info[mds].export_targets = 131 m->m_info[mds].export_targets =
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
index eacc131aa5cb..4c5cb0880bba 100644
--- a/fs/ceph/mdsmap.h
+++ b/fs/ceph/mdsmap.h
@@ -13,6 +13,7 @@ struct ceph_mds_info {
13 struct ceph_entity_addr addr; 13 struct ceph_entity_addr addr;
14 s32 state; 14 s32 state;
15 int num_export_targets; 15 int num_export_targets;
16 bool laggy;
16 u32 *export_targets; 17 u32 *export_targets;
17}; 18};
18 19
@@ -47,6 +48,13 @@ static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
47 return m->m_info[w].state; 48 return m->m_info[w].state;
48} 49}
49 50
51static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
52{
53 if (w >= 0 && w < m->m_max_mds)
54 return m->m_info[w].laggy;
55 return false;
56}
57
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); 58extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); 59extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); 60extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 15167b2daa55..2502d76fcec1 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -108,7 +108,7 @@ void ceph_msgr_exit(void)
108 destroy_workqueue(ceph_msgr_wq); 108 destroy_workqueue(ceph_msgr_wq);
109} 109}
110 110
111void ceph_msgr_flush() 111void ceph_msgr_flush(void)
112{ 112{
113 flush_workqueue(ceph_msgr_wq); 113 flush_workqueue(ceph_msgr_wq);
114} 114}
@@ -647,7 +647,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
648 con->connect_seq, global_seq, proto); 648 con->connect_seq, global_seq, proto);
649 649
650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT); 650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
653 con->out_connect.global_seq = cpu_to_le32(global_seq); 653 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1081,11 +1081,11 @@ static int process_banner(struct ceph_connection *con)
1081 sizeof(con->peer_addr)) != 0 && 1081 sizeof(con->peer_addr)) != 0 &&
1082 !(addr_is_blank(&con->actual_peer_addr.in_addr) && 1082 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1083 con->actual_peer_addr.nonce == con->peer_addr.nonce)) { 1083 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1084 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n", 1084 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
1085 pr_addr(&con->peer_addr.in_addr), 1085 pr_addr(&con->peer_addr.in_addr),
1086 le64_to_cpu(con->peer_addr.nonce), 1086 (int)le32_to_cpu(con->peer_addr.nonce),
1087 pr_addr(&con->actual_peer_addr.in_addr), 1087 pr_addr(&con->actual_peer_addr.in_addr),
1088 le64_to_cpu(con->actual_peer_addr.nonce)); 1088 (int)le32_to_cpu(con->actual_peer_addr.nonce));
1089 con->error_msg = "wrong peer at address"; 1089 con->error_msg = "wrong peer at address";
1090 return -1; 1090 return -1;
1091 } 1091 }
@@ -1123,8 +1123,8 @@ static void fail_protocol(struct ceph_connection *con)
1123 1123
1124static int process_connect(struct ceph_connection *con) 1124static int process_connect(struct ceph_connection *con)
1125{ 1125{
1126 u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT; 1126 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1127 u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT; 1127 u64 req_feat = CEPH_FEATURE_REQUIRED;
1128 u64 server_feat = le64_to_cpu(con->in_reply.features); 1128 u64 server_feat = le64_to_cpu(con->in_reply.features);
1129 1129
1130 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); 1130 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1302,8 +1302,8 @@ static void process_ack(struct ceph_connection *con)
1302 1302
1303 1303
1304static int read_partial_message_section(struct ceph_connection *con, 1304static int read_partial_message_section(struct ceph_connection *con,
1305 struct kvec *section, unsigned int sec_len, 1305 struct kvec *section,
1306 u32 *crc) 1306 unsigned int sec_len, u32 *crc)
1307{ 1307{
1308 int left; 1308 int left;
1309 int ret; 1309 int ret;
@@ -1434,7 +1434,8 @@ static int read_partial_message(struct ceph_connection *con)
1434 1434
1435 /* middle */ 1435 /* middle */
1436 if (m->middle) { 1436 if (m->middle) {
1437 ret = read_partial_message_section(con, &m->middle->vec, middle_len, 1437 ret = read_partial_message_section(con, &m->middle->vec,
1438 middle_len,
1438 &con->in_middle_crc); 1439 &con->in_middle_crc);
1439 if (ret <= 0) 1440 if (ret <= 0)
1440 return ret; 1441 return ret;
@@ -1920,7 +1921,7 @@ out:
1920 /* 1921 /*
1921 * in case we faulted due to authentication, invalidate our 1922 * in case we faulted due to authentication, invalidate our
1922 * current tickets so that we can get new ones. 1923 * current tickets so that we can get new ones.
1923 */ 1924 */
1924 if (con->auth_retry && con->ops->invalidate_authorizer) { 1925 if (con->auth_retry && con->ops->invalidate_authorizer) {
1925 dout("calling invalidate_authorizer()\n"); 1926 dout("calling invalidate_authorizer()\n");
1926 con->ops->invalidate_authorizer(con); 1927 con->ops->invalidate_authorizer(con);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 54fe01c50706..b2a5a3e4a671 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -349,7 +349,7 @@ out:
349} 349}
350 350
351/* 351/*
352 * statfs 352 * generic requests (e.g., statfs, poolop)
353 */ 353 */
354static struct ceph_mon_generic_request *__lookup_generic_req( 354static struct ceph_mon_generic_request *__lookup_generic_req(
355 struct ceph_mon_client *monc, u64 tid) 355 struct ceph_mon_client *monc, u64 tid)
@@ -442,6 +442,35 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
442 return m; 442 return m;
443} 443}
444 444
445static int do_generic_request(struct ceph_mon_client *monc,
446 struct ceph_mon_generic_request *req)
447{
448 int err;
449
450 /* register request */
451 mutex_lock(&monc->mutex);
452 req->tid = ++monc->last_tid;
453 req->request->hdr.tid = cpu_to_le64(req->tid);
454 __insert_generic_request(monc, req);
455 monc->num_generic_requests++;
456 ceph_con_send(monc->con, ceph_msg_get(req->request));
457 mutex_unlock(&monc->mutex);
458
459 err = wait_for_completion_interruptible(&req->completion);
460
461 mutex_lock(&monc->mutex);
462 rb_erase(&req->node, &monc->generic_request_tree);
463 monc->num_generic_requests--;
464 mutex_unlock(&monc->mutex);
465
466 if (!err)
467 err = req->result;
468 return err;
469}
470
471/*
472 * statfs
473 */
445static void handle_statfs_reply(struct ceph_mon_client *monc, 474static void handle_statfs_reply(struct ceph_mon_client *monc,
446 struct ceph_msg *msg) 475 struct ceph_msg *msg)
447{ 476{
@@ -468,7 +497,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
468 return; 497 return;
469 498
470bad: 499bad:
471 pr_err("corrupt generic reply, no tid\n"); 500 pr_err("corrupt generic reply, tid %llu\n", tid);
472 ceph_msg_dump(msg); 501 ceph_msg_dump(msg);
473} 502}
474 503
@@ -487,6 +516,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
487 516
488 kref_init(&req->kref); 517 kref_init(&req->kref);
489 req->buf = buf; 518 req->buf = buf;
519 req->buf_len = sizeof(*buf);
490 init_completion(&req->completion); 520 init_completion(&req->completion);
491 521
492 err = -ENOMEM; 522 err = -ENOMEM;
@@ -504,33 +534,134 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
504 h->monhdr.session_mon_tid = 0; 534 h->monhdr.session_mon_tid = 0;
505 h->fsid = monc->monmap->fsid; 535 h->fsid = monc->monmap->fsid;
506 536
507 /* register request */ 537 err = do_generic_request(monc, req);
508 mutex_lock(&monc->mutex);
509 req->tid = ++monc->last_tid;
510 req->request->hdr.tid = cpu_to_le64(req->tid);
511 __insert_generic_request(monc, req);
512 monc->num_generic_requests++;
513 mutex_unlock(&monc->mutex);
514 538
515 /* send request and wait */ 539out:
516 ceph_con_send(monc->con, ceph_msg_get(req->request)); 540 kref_put(&req->kref, release_generic_request);
517 err = wait_for_completion_interruptible(&req->completion); 541 return err;
542}
543
544/*
545 * pool ops
546 */
547static int get_poolop_reply_buf(const char *src, size_t src_len,
548 char *dst, size_t dst_len)
549{
550 u32 buf_len;
551
552 if (src_len != sizeof(u32) + dst_len)
553 return -EINVAL;
554
555 buf_len = le32_to_cpu(*(u32 *)src);
556 if (buf_len != dst_len)
557 return -EINVAL;
558
559 memcpy(dst, src + sizeof(u32), dst_len);
560 return 0;
561}
562
563static void handle_poolop_reply(struct ceph_mon_client *monc,
564 struct ceph_msg *msg)
565{
566 struct ceph_mon_generic_request *req;
567 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
568 u64 tid = le64_to_cpu(msg->hdr.tid);
569
570 if (msg->front.iov_len < sizeof(*reply))
571 goto bad;
572 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
518 573
519 mutex_lock(&monc->mutex); 574 mutex_lock(&monc->mutex);
520 rb_erase(&req->node, &monc->generic_request_tree); 575 req = __lookup_generic_req(monc, tid);
521 monc->num_generic_requests--; 576 if (req) {
577 if (req->buf_len &&
578 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
579 msg->front.iov_len - sizeof(*reply),
580 req->buf, req->buf_len) < 0) {
581 mutex_unlock(&monc->mutex);
582 goto bad;
583 }
584 req->result = le32_to_cpu(reply->reply_code);
585 get_generic_request(req);
586 }
522 mutex_unlock(&monc->mutex); 587 mutex_unlock(&monc->mutex);
588 if (req) {
589 complete(&req->completion);
590 put_generic_request(req);
591 }
592 return;
523 593
524 if (!err) 594bad:
525 err = req->result; 595 pr_err("corrupt generic reply, tid %llu\n", tid);
596 ceph_msg_dump(msg);
597}
598
599/*
600 * Do a synchronous pool op.
601 */
602int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
603 u32 pool, u64 snapid,
604 char *buf, int len)
605{
606 struct ceph_mon_generic_request *req;
607 struct ceph_mon_poolop *h;
608 int err;
609
610 req = kzalloc(sizeof(*req), GFP_NOFS);
611 if (!req)
612 return -ENOMEM;
613
614 kref_init(&req->kref);
615 req->buf = buf;
616 req->buf_len = len;
617 init_completion(&req->completion);
618
619 err = -ENOMEM;
620 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
621 if (!req->request)
622 goto out;
623 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
624 if (!req->reply)
625 goto out;
626
627 /* fill out request */
628 req->request->hdr.version = cpu_to_le16(2);
629 h = req->request->front.iov_base;
630 h->monhdr.have_version = 0;
631 h->monhdr.session_mon = cpu_to_le16(-1);
632 h->monhdr.session_mon_tid = 0;
633 h->fsid = monc->monmap->fsid;
634 h->pool = cpu_to_le32(pool);
635 h->op = cpu_to_le32(op);
636 h->auid = 0;
637 h->snapid = cpu_to_le64(snapid);
638 h->name_len = 0;
639
640 err = do_generic_request(monc, req);
526 641
527out: 642out:
528 kref_put(&req->kref, release_generic_request); 643 kref_put(&req->kref, release_generic_request);
529 return err; 644 return err;
530} 645}
531 646
647int ceph_monc_create_snapid(struct ceph_mon_client *monc,
648 u32 pool, u64 *snapid)
649{
650 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
651 pool, 0, (char *)snapid, sizeof(*snapid));
652
653}
654
655int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
656 u32 pool, u64 snapid)
657{
658 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
659 pool, snapid, 0, 0);
660
661}
662
532/* 663/*
533 * Resend pending statfs requests. 664 * Resend pending generic requests.
534 */ 665 */
535static void __resend_generic_request(struct ceph_mon_client *monc) 666static void __resend_generic_request(struct ceph_mon_client *monc)
536{ 667{
@@ -783,6 +914,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
783 handle_statfs_reply(monc, msg); 914 handle_statfs_reply(monc, msg);
784 break; 915 break;
785 916
917 case CEPH_MSG_POOLOP_REPLY:
918 handle_poolop_reply(monc, msg);
919 break;
920
786 case CEPH_MSG_MON_MAP: 921 case CEPH_MSG_MON_MAP:
787 ceph_monc_handle_map(monc, msg); 922 ceph_monc_handle_map(monc, msg);
788 break; 923 break;
@@ -820,6 +955,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
820 case CEPH_MSG_MON_SUBSCRIBE_ACK: 955 case CEPH_MSG_MON_SUBSCRIBE_ACK:
821 m = ceph_msg_get(monc->m_subscribe_ack); 956 m = ceph_msg_get(monc->m_subscribe_ack);
822 break; 957 break;
958 case CEPH_MSG_POOLOP_REPLY:
823 case CEPH_MSG_STATFS_REPLY: 959 case CEPH_MSG_STATFS_REPLY:
824 return get_generic_reply(con, hdr, skip); 960 return get_generic_reply(con, hdr, skip);
825 case CEPH_MSG_AUTH_REPLY: 961 case CEPH_MSG_AUTH_REPLY:
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index 174d794321d0..8e396f2c0963 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -50,6 +50,7 @@ struct ceph_mon_generic_request {
50 struct rb_node node; 50 struct rb_node node;
51 int result; 51 int result;
52 void *buf; 52 void *buf;
53 int buf_len;
53 struct completion completion; 54 struct completion completion;
54 struct ceph_msg *request; /* original request */ 55 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */ 56 struct ceph_msg *reply; /* and reply */
@@ -111,6 +112,10 @@ extern int ceph_monc_open_session(struct ceph_mon_client *monc);
111 112
112extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); 113extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
113 114
115extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
116 u32 pool, u64 *snapid);
114 117
118extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
119 u32 pool, u64 snapid);
115 120
116#endif 121#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 892a0298dfdf..680d3d648cac 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -1,5 +1,5 @@
1#ifndef __MSGR_H 1#ifndef CEPH_MSGR_H
2#define __MSGR_H 2#define CEPH_MSGR_H
3 3
4/* 4/*
5 * Data types for message passing layer used by Ceph. 5 * Data types for message passing layer used by Ceph.
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index e38522347898..bed6391e52c7 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1276,8 +1276,6 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1276 1276
1277 /* it may be a short read due to an object boundary */ 1277 /* it may be a short read due to an object boundary */
1278 req->r_pages = pages; 1278 req->r_pages = pages;
1279 num_pages = calc_pages_for(off, *plen);
1280 req->r_num_pages = num_pages;
1281 1279
1282 dout("readpages final extent is %llu~%llu (%d pages)\n", 1280 dout("readpages final extent is %llu~%llu (%d pages)\n",
1283 off, *plen, req->r_num_pages); 1281 off, *plen, req->r_num_pages);
@@ -1319,7 +1317,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1319 1317
1320 /* it may be a short write due to an object boundary */ 1318 /* it may be a short write due to an object boundary */
1321 req->r_pages = pages; 1319 req->r_pages = pages;
1322 req->r_num_pages = calc_pages_for(off, len);
1323 dout("writepages %llu~%llu (%d pages)\n", off, len, 1320 dout("writepages %llu~%llu (%d pages)\n", off, len,
1324 req->r_num_pages); 1321 req->r_num_pages);
1325 1322
@@ -1476,8 +1473,8 @@ static void put_osd_con(struct ceph_connection *con)
1476 * authentication 1473 * authentication
1477 */ 1474 */
1478static int get_authorizer(struct ceph_connection *con, 1475static int get_authorizer(struct ceph_connection *con,
1479 void **buf, int *len, int *proto, 1476 void **buf, int *len, int *proto,
1480 void **reply_buf, int *reply_len, int force_new) 1477 void **reply_buf, int *reply_len, int force_new)
1481{ 1478{
1482 struct ceph_osd *o = con->private; 1479 struct ceph_osd *o = con->private;
1483 struct ceph_osd_client *osdc = o->o_osdc; 1480 struct ceph_osd_client *osdc = o->o_osdc;
@@ -1497,7 +1494,7 @@ static int get_authorizer(struct ceph_connection *con,
1497 &o->o_authorizer_reply_buf, 1494 &o->o_authorizer_reply_buf,
1498 &o->o_authorizer_reply_buf_len); 1495 &o->o_authorizer_reply_buf_len);
1499 if (ret) 1496 if (ret)
1500 return ret; 1497 return ret;
1501 } 1498 }
1502 1499
1503 *proto = ac->protocol; 1500 *proto = ac->protocol;
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 416d46adbf87..e31f118f1392 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -424,12 +424,30 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
424 kfree(pi); 424 kfree(pi);
425} 425}
426 426
427void __decode_pool(void **p, struct ceph_pg_pool_info *pi) 427static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
428{ 428{
429 unsigned n, m;
430
429 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 431 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
430 calc_pg_masks(pi); 432 calc_pg_masks(pi);
431 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); 433
434 /* num_snaps * snap_info_t */
435 n = le32_to_cpu(pi->v.num_snaps);
436 while (n--) {
437 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
438 sizeof(struct ceph_timespec), bad);
439 *p += sizeof(u64) + /* key */
440 1 + sizeof(u64) + /* u8, snapid */
441 sizeof(struct ceph_timespec);
442 m = ceph_decode_32(p); /* snap name */
443 *p += m;
444 }
445
432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; 446 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
447 return 0;
448
449bad:
450 return -EINVAL;
433} 451}
434 452
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 453static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
@@ -571,7 +589,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
571 kfree(pi); 589 kfree(pi);
572 goto bad; 590 goto bad;
573 } 591 }
574 __decode_pool(p, pi); 592 err = __decode_pool(p, end, pi);
593 if (err < 0)
594 goto bad;
575 __insert_pg_pool(&map->pg_pools, pi); 595 __insert_pg_pool(&map->pg_pools, pi);
576 } 596 }
577 597
@@ -760,7 +780,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
760 pi->id = pool; 780 pi->id = pool;
761 __insert_pg_pool(&map->pg_pools, pi); 781 __insert_pg_pool(&map->pg_pools, pi);
762 } 782 }
763 __decode_pool(p, pi); 783 err = __decode_pool(p, end, pi);
784 if (err < 0)
785 goto bad;
764 } 786 }
765 if (version >= 5 && __decode_pool_names(p, end, map) < 0) 787 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
766 goto bad; 788 goto bad;
@@ -833,7 +855,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
833 node)->pgid, pgid) <= 0) { 855 node)->pgid, pgid) <= 0) {
834 struct ceph_pg_mapping *cur = 856 struct ceph_pg_mapping *cur =
835 rb_entry(rbp, struct ceph_pg_mapping, node); 857 rb_entry(rbp, struct ceph_pg_mapping, node);
836 858
837 rbp = rb_next(rbp); 859 rbp = rb_next(rbp);
838 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); 860 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
839 rb_erase(&cur->node, &map->pg_temp); 861 rb_erase(&cur->node, &map->pg_temp);
@@ -1026,8 +1048,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1026 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, 1048 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1027 pool->v.type, pool->v.size); 1049 pool->v.type, pool->v.size);
1028 if (ruleno < 0) { 1050 if (ruleno < 0) {
1029 pr_err("no crush rule pool %d type %d size %d\n", 1051 pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
1030 poolid, pool->v.type, pool->v.size); 1052 poolid, pool->v.crush_ruleset, pool->v.type,
1053 pool->v.size);
1031 return NULL; 1054 return NULL;
1032 } 1055 }
1033 1056
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 8fcc023056c7..6d5247f2e81b 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -1,5 +1,5 @@
1#ifndef __RADOS_H 1#ifndef CEPH_RADOS_H
2#define __RADOS_H 2#define CEPH_RADOS_H
3 3
4/* 4/*
5 * Data types for the Ceph distributed object storage layer RADOS 5 * Data types for the Ceph distributed object storage layer RADOS
@@ -203,6 +203,7 @@ enum {
203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, 203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
204 204
205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, 205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
206 CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
206 207
207 /** attrs **/ 208 /** attrs **/
208 /* read */ 209 /* read */
@@ -272,6 +273,10 @@ static inline int ceph_osd_op_mode_modify(int op)
272 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; 273 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
273} 274}
274 275
276/*
277 * Note that the following tmap definitions are also defined in ceph's
278 * librados.h; any modification here needs to be mirrored there.
279 */
275#define CEPH_OSD_TMAP_HDR 'h' 280#define CEPH_OSD_TMAP_HDR 'h'
276#define CEPH_OSD_TMAP_SET 's' 281#define CEPH_OSD_TMAP_SET 's'
277#define CEPH_OSD_TMAP_RM 'r' 282#define CEPH_OSD_TMAP_RM 'r'
@@ -297,6 +302,7 @@ enum {
297 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ 302 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
298 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ 303 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
299 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ 304 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
305 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
300}; 306};
301 307
302enum { 308enum {
@@ -350,6 +356,9 @@ struct ceph_osd_op {
350 struct { 356 struct {
351 __le64 cookie, count; 357 __le64 cookie, count;
352 } __attribute__ ((packed)) pgls; 358 } __attribute__ ((packed)) pgls;
359 struct {
360 __le64 snapid;
361 } __attribute__ ((packed)) snap;
353 }; 362 };
354 __le32 payload_len; 363 __le32 payload_len;
355} __attribute__ ((packed)); 364} __attribute__ ((packed));
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index fa87f51e38e1..9922628532b2 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -2,6 +2,7 @@
2#include "ceph_debug.h" 2#include "ceph_debug.h"
3 3
4#include <linux/backing-dev.h> 4#include <linux/backing-dev.h>
5#include <linux/ctype.h>
5#include <linux/fs.h> 6#include <linux/fs.h>
6#include <linux/inet.h> 7#include <linux/inet.h>
7#include <linux/in6.h> 8#include <linux/in6.h>
@@ -101,12 +102,21 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
101} 102}
102 103
103 104
104static int ceph_syncfs(struct super_block *sb, int wait) 105static int ceph_sync_fs(struct super_block *sb, int wait)
105{ 106{
106 dout("sync_fs %d\n", wait); 107 struct ceph_client *client = ceph_sb_to_client(sb);
108
109 if (!wait) {
110 dout("sync_fs (non-blocking)\n");
111 ceph_flush_dirty_caps(&client->mdsc);
112 dout("sync_fs (non-blocking) done\n");
113 return 0;
114 }
115
116 dout("sync_fs (blocking)\n");
107 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); 117 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
108 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); 118 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
109 dout("sync_fs %d done\n", wait); 119 dout("sync_fs (blocking) done\n");
110 return 0; 120 return 0;
111} 121}
112 122
@@ -150,9 +160,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
150 struct ceph_mount_args *args = client->mount_args; 160 struct ceph_mount_args *args = client->mount_args;
151 161
152 if (args->flags & CEPH_OPT_FSID) 162 if (args->flags & CEPH_OPT_FSID)
153 seq_printf(m, ",fsidmajor=%llu,fsidminor%llu", 163 seq_printf(m, ",fsid=%pU", &args->fsid);
154 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
155 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
156 if (args->flags & CEPH_OPT_NOSHARE) 164 if (args->flags & CEPH_OPT_NOSHARE)
157 seq_puts(m, ",noshare"); 165 seq_puts(m, ",noshare");
158 if (args->flags & CEPH_OPT_DIRSTAT) 166 if (args->flags & CEPH_OPT_DIRSTAT)
@@ -279,7 +287,7 @@ static const struct super_operations ceph_super_ops = {
279 .alloc_inode = ceph_alloc_inode, 287 .alloc_inode = ceph_alloc_inode,
280 .destroy_inode = ceph_destroy_inode, 288 .destroy_inode = ceph_destroy_inode,
281 .write_inode = ceph_write_inode, 289 .write_inode = ceph_write_inode,
282 .sync_fs = ceph_syncfs, 290 .sync_fs = ceph_sync_fs,
283 .put_super = ceph_put_super, 291 .put_super = ceph_put_super,
284 .show_options = ceph_show_options, 292 .show_options = ceph_show_options,
285 .statfs = ceph_statfs, 293 .statfs = ceph_statfs,
@@ -322,9 +330,6 @@ const char *ceph_msg_type_name(int type)
322 * mount options 330 * mount options
323 */ 331 */
324enum { 332enum {
325 Opt_fsidmajor,
326 Opt_fsidminor,
327 Opt_monport,
328 Opt_wsize, 333 Opt_wsize,
329 Opt_rsize, 334 Opt_rsize,
330 Opt_osdtimeout, 335 Opt_osdtimeout,
@@ -339,6 +344,7 @@ enum {
339 Opt_congestion_kb, 344 Opt_congestion_kb,
340 Opt_last_int, 345 Opt_last_int,
341 /* int args above */ 346 /* int args above */
347 Opt_fsid,
342 Opt_snapdirname, 348 Opt_snapdirname,
343 Opt_name, 349 Opt_name,
344 Opt_secret, 350 Opt_secret,
@@ -355,9 +361,6 @@ enum {
355}; 361};
356 362
357static match_table_t arg_tokens = { 363static match_table_t arg_tokens = {
358 {Opt_fsidmajor, "fsidmajor=%ld"},
359 {Opt_fsidminor, "fsidminor=%ld"},
360 {Opt_monport, "monport=%d"},
361 {Opt_wsize, "wsize=%d"}, 364 {Opt_wsize, "wsize=%d"},
362 {Opt_rsize, "rsize=%d"}, 365 {Opt_rsize, "rsize=%d"},
363 {Opt_osdtimeout, "osdtimeout=%d"}, 366 {Opt_osdtimeout, "osdtimeout=%d"},
@@ -371,6 +374,7 @@ static match_table_t arg_tokens = {
371 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, 374 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
372 {Opt_congestion_kb, "write_congestion_kb=%d"}, 375 {Opt_congestion_kb, "write_congestion_kb=%d"},
373 /* int args above */ 376 /* int args above */
377 {Opt_fsid, "fsid=%s"},
374 {Opt_snapdirname, "snapdirname=%s"}, 378 {Opt_snapdirname, "snapdirname=%s"},
375 {Opt_name, "name=%s"}, 379 {Opt_name, "name=%s"},
376 {Opt_secret, "secret=%s"}, 380 {Opt_secret, "secret=%s"},
@@ -386,6 +390,36 @@ static match_table_t arg_tokens = {
386 {-1, NULL} 390 {-1, NULL}
387}; 391};
388 392
393static int parse_fsid(const char *str, struct ceph_fsid *fsid)
394{
395 int i = 0;
396 char tmp[3];
397 int err = -EINVAL;
398 int d;
399
400 dout("parse_fsid '%s'\n", str);
401 tmp[2] = 0;
402 while (*str && i < 16) {
403 if (ispunct(*str)) {
404 str++;
405 continue;
406 }
407 if (!isxdigit(str[0]) || !isxdigit(str[1]))
408 break;
409 tmp[0] = str[0];
410 tmp[1] = str[1];
411 if (sscanf(tmp, "%x", &d) < 1)
412 break;
413 fsid->fsid[i] = d & 0xff;
414 i++;
415 str += 2;
416 }
417
418 if (i == 16)
419 err = 0;
420 dout("parse_fsid ret %d got fsid %pU", err, fsid);
421 return err;
422}
389 423
390static struct ceph_mount_args *parse_mount_args(int flags, char *options, 424static struct ceph_mount_args *parse_mount_args(int flags, char *options,
391 const char *dev_name, 425 const char *dev_name,
@@ -469,12 +503,6 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
469 dout("got token %d\n", token); 503 dout("got token %d\n", token);
470 } 504 }
471 switch (token) { 505 switch (token) {
472 case Opt_fsidmajor:
473 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
474 break;
475 case Opt_fsidminor:
476 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
477 break;
478 case Opt_ip: 506 case Opt_ip:
479 err = ceph_parse_ips(argstr[0].from, 507 err = ceph_parse_ips(argstr[0].from,
480 argstr[0].to, 508 argstr[0].to,
@@ -485,6 +513,11 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
485 args->flags |= CEPH_OPT_MYIP; 513 args->flags |= CEPH_OPT_MYIP;
486 break; 514 break;
487 515
516 case Opt_fsid:
517 err = parse_fsid(argstr[0].from, &args->fsid);
518 if (err == 0)
519 args->flags |= CEPH_OPT_FSID;
520 break;
488 case Opt_snapdirname: 521 case Opt_snapdirname:
489 kfree(args->snapdir_name); 522 kfree(args->snapdir_name);
490 args->snapdir_name = kstrndup(argstr[0].from, 523 args->snapdir_name = kstrndup(argstr[0].from,
@@ -515,6 +548,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
515 case Opt_osdkeepalivetimeout: 548 case Opt_osdkeepalivetimeout:
516 args->osd_keepalive_timeout = intval; 549 args->osd_keepalive_timeout = intval;
517 break; 550 break;
551 case Opt_osd_idle_ttl:
552 args->osd_idle_ttl = intval;
553 break;
518 case Opt_mount_timeout: 554 case Opt_mount_timeout:
519 args->mount_timeout = intval; 555 args->mount_timeout = intval;
520 break; 556 break;
@@ -630,7 +666,6 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
630 666
631 /* caps */ 667 /* caps */
632 client->min_caps = args->max_readdir; 668 client->min_caps = args->max_readdir;
633 ceph_adjust_min_caps(client->min_caps);
634 669
635 /* subsystems */ 670 /* subsystems */
636 err = ceph_monc_init(&client->monc, client); 671 err = ceph_monc_init(&client->monc, client);
@@ -680,8 +715,6 @@ static void ceph_destroy_client(struct ceph_client *client)
680 715
681 ceph_monc_stop(&client->monc); 716 ceph_monc_stop(&client->monc);
682 717
683 ceph_adjust_min_caps(-client->min_caps);
684
685 ceph_debugfs_client_cleanup(client); 718 ceph_debugfs_client_cleanup(client);
686 destroy_workqueue(client->wb_wq); 719 destroy_workqueue(client->wb_wq);
687 destroy_workqueue(client->pg_inv_wq); 720 destroy_workqueue(client->pg_inv_wq);
@@ -706,13 +739,13 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
706{ 739{
707 if (client->have_fsid) { 740 if (client->have_fsid) {
708 if (ceph_fsid_compare(&client->fsid, fsid)) { 741 if (ceph_fsid_compare(&client->fsid, fsid)) {
709 pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT, 742 pr_err("bad fsid, had %pU got %pU",
710 PR_FSID(&client->fsid), PR_FSID(fsid)); 743 &client->fsid, fsid);
711 return -1; 744 return -1;
712 } 745 }
713 } else { 746 } else {
714 pr_info("client%lld fsid " FSID_FORMAT "\n", 747 pr_info("client%lld fsid %pU\n", client->monc.auth->global_id,
715 client->monc.auth->global_id, PR_FSID(fsid)); 748 fsid);
716 memcpy(&client->fsid, fsid, sizeof(*fsid)); 749 memcpy(&client->fsid, fsid, sizeof(*fsid));
717 ceph_debugfs_client_init(client); 750 ceph_debugfs_client_init(client);
718 client->have_fsid = true; 751 client->have_fsid = true;
@@ -1043,8 +1076,6 @@ static int __init init_ceph(void)
1043 if (ret) 1076 if (ret)
1044 goto out_msgr; 1077 goto out_msgr;
1045 1078
1046 ceph_caps_init();
1047
1048 ret = register_filesystem(&ceph_fs_type); 1079 ret = register_filesystem(&ceph_fs_type);
1049 if (ret) 1080 if (ret)
1050 goto out_icache; 1081 goto out_icache;
@@ -1069,7 +1100,6 @@ static void __exit exit_ceph(void)
1069{ 1100{
1070 dout("exit_ceph\n"); 1101 dout("exit_ceph\n");
1071 unregister_filesystem(&ceph_fs_type); 1102 unregister_filesystem(&ceph_fs_type);
1072 ceph_caps_finalize();
1073 destroy_caches(); 1103 destroy_caches();
1074 ceph_msgr_exit(); 1104 ceph_msgr_exit();
1075 ceph_debugfs_cleanup(); 1105 ceph_debugfs_cleanup();
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 10a4a406e887..2482d696f0de 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -31,6 +31,12 @@
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32 32
33/* 33/*
34 * Supported features
35 */
36#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
37#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR
38
39/*
34 * mount options 40 * mount options
35 */ 41 */
36#define CEPH_OPT_FSID (1<<0) 42#define CEPH_OPT_FSID (1<<0)
@@ -560,11 +566,13 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
560/* what the mds thinks we want */ 566/* what the mds thinks we want */
561extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); 567extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
562 568
563extern void ceph_caps_init(void); 569extern void ceph_caps_init(struct ceph_mds_client *mdsc);
564extern void ceph_caps_finalize(void); 570extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
565extern void ceph_adjust_min_caps(int delta); 571extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
566extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need); 572extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
567extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx); 573 struct ceph_cap_reservation *ctx, int need);
574extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
575 struct ceph_cap_reservation *ctx);
568extern void ceph_reservation_status(struct ceph_client *client, 576extern void ceph_reservation_status(struct ceph_client *client,
569 int *total, int *avail, int *used, 577 int *total, int *avail, int *used,
570 int *reserved, int *min); 578 int *reserved, int *min);
@@ -738,13 +746,6 @@ extern struct kmem_cache *ceph_file_cachep;
738extern const char *ceph_msg_type_name(int type); 746extern const char *ceph_msg_type_name(int type);
739extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); 747extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
740 748
741#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
742 "%02x%02x%02x%02x%02x%02x"
743#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
744 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
745 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
746 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
747
748/* inode.c */ 749/* inode.c */
749extern const struct inode_operations ceph_file_iops; 750extern const struct inode_operations ceph_file_iops;
750 751
@@ -806,13 +807,16 @@ static inline void ceph_remove_cap(struct ceph_cap *cap)
806 __ceph_remove_cap(cap); 807 __ceph_remove_cap(cap);
807 spin_unlock(&inode->i_lock); 808 spin_unlock(&inode->i_lock);
808} 809}
809extern void ceph_put_cap(struct ceph_cap *cap); 810extern void ceph_put_cap(struct ceph_mds_client *mdsc,
811 struct ceph_cap *cap);
810 812
811extern void ceph_queue_caps_release(struct inode *inode); 813extern void ceph_queue_caps_release(struct inode *inode);
812extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); 814extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
813extern int ceph_fsync(struct file *file, int datasync); 815extern int ceph_fsync(struct file *file, int datasync);
814extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 816extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
815 struct ceph_mds_session *session); 817 struct ceph_mds_session *session);
818extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
819 int mds);
816extern int ceph_get_cap_mds(struct inode *inode); 820extern int ceph_get_cap_mds(struct inode *inode);
817extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); 821extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
818extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); 822extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
@@ -857,7 +861,7 @@ extern void ceph_release_page_vector(struct page **pages, int num_pages);
857/* dir.c */ 861/* dir.c */
858extern const struct file_operations ceph_dir_fops; 862extern const struct file_operations ceph_dir_fops;
859extern const struct inode_operations ceph_dir_iops; 863extern const struct inode_operations ceph_dir_iops;
860extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 864extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
861 ceph_snapdir_dentry_ops; 865 ceph_snapdir_dentry_ops;
862 866
863extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); 867extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
@@ -888,6 +892,14 @@ extern void ceph_debugfs_cleanup(void);
888extern int ceph_debugfs_client_init(struct ceph_client *client); 892extern int ceph_debugfs_client_init(struct ceph_client *client);
889extern void ceph_debugfs_client_cleanup(struct ceph_client *client); 893extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
890 894
895/* locks.c */
896extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
897extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
898extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
899extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
900 int p_locks, int f_locks);
901extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
902
891static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) 903static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
892{ 904{
893 if (dentry && dentry->d_parent) 905 if (dentry && dentry->d_parent)
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 68aeebc69681..097a2654c00f 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -337,6 +337,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
337} 337}
338 338
339static int __build_xattrs(struct inode *inode) 339static int __build_xattrs(struct inode *inode)
340 __releases(inode->i_lock)
341 __acquires(inode->i_lock)
340{ 342{
341 u32 namelen; 343 u32 namelen;
342 u32 numattr = 0; 344 u32 numattr = 0;
diff --git a/fs/cifs/README b/fs/cifs/README
index a7081eeeb85d..7099a526f775 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -301,6 +301,16 @@ A partial list of the supported mount options follows:
301 gid Set the default gid for inodes (similar to above). 301 gid Set the default gid for inodes (similar to above).
302 file_mode If CIFS Unix extensions are not supported by the server 302 file_mode If CIFS Unix extensions are not supported by the server
303 this overrides the default mode for file inodes. 303 this overrides the default mode for file inodes.
304 fsc Enable local disk caching using FS-Cache (off by default). This
305 option could be useful to improve performance on a slow link,
306 heavily loaded server and/or network where reading from the
307 disk is faster than reading from the server (over the network).
308 This could also impact scalability positively as the
309 number of calls to the server are reduced. However, local
310 caching is not suitable for all workloads for e.g. read-once
311 type workloads. So, you need to consider carefully your
312 workload/scenario before using this option. Currently, local
313 disk caching is functional for CIFS files opened as read-only.
304 dir_mode If CIFS Unix extensions are not supported by the server 314 dir_mode If CIFS Unix extensions are not supported by the server
305 this overrides the default mode for directory inodes. 315 this overrides the default mode for directory inodes.
306 port attempt to contact the server on this tcp port, before 316 port attempt to contact the server on this tcp port, before
diff --git a/fs/dcache.c b/fs/dcache.c
index 9f2c13417969..166d35d56868 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1905,48 +1905,30 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
1905} 1905}
1906 1906
1907/** 1907/**
1908 * __d_path - return the path of a dentry 1908 * Prepend path string to a buffer
1909 *
1909 * @path: the dentry/vfsmount to report 1910 * @path: the dentry/vfsmount to report
1910 * @root: root vfsmnt/dentry (may be modified by this function) 1911 * @root: root vfsmnt/dentry (may be modified by this function)
1911 * @buffer: buffer to return value in 1912 * @buffer: pointer to the end of the buffer
1912 * @buflen: buffer length 1913 * @buflen: pointer to buffer length
1913 * 1914 *
1914 * Convert a dentry into an ASCII path name. If the entry has been deleted 1915 * Caller holds the dcache_lock.
1915 * the string " (deleted)" is appended. Note that this is ambiguous.
1916 *
1917 * Returns a pointer into the buffer or an error code if the
1918 * path was too long.
1919 *
1920 * "buflen" should be positive. Caller holds the dcache_lock.
1921 * 1916 *
1922 * If path is not reachable from the supplied root, then the value of 1917 * If path is not reachable from the supplied root, then the value of
1923 * root is changed (without modifying refcounts). 1918 * root is changed (without modifying refcounts).
1924 */ 1919 */
1925char *__d_path(const struct path *path, struct path *root, 1920static int prepend_path(const struct path *path, struct path *root,
1926 char *buffer, int buflen) 1921 char **buffer, int *buflen)
1927{ 1922{
1928 struct dentry *dentry = path->dentry; 1923 struct dentry *dentry = path->dentry;
1929 struct vfsmount *vfsmnt = path->mnt; 1924 struct vfsmount *vfsmnt = path->mnt;
1930 char *end = buffer + buflen; 1925 bool slash = false;
1931 char *retval; 1926 int error = 0;
1932 1927
1933 spin_lock(&vfsmount_lock); 1928 spin_lock(&vfsmount_lock);
1934 prepend(&end, &buflen, "\0", 1); 1929 while (dentry != root->dentry || vfsmnt != root->mnt) {
1935 if (d_unlinked(dentry) &&
1936 (prepend(&end, &buflen, " (deleted)", 10) != 0))
1937 goto Elong;
1938
1939 if (buflen < 1)
1940 goto Elong;
1941 /* Get '/' right */
1942 retval = end-1;
1943 *retval = '/';
1944
1945 for (;;) {
1946 struct dentry * parent; 1930 struct dentry * parent;
1947 1931
1948 if (dentry == root->dentry && vfsmnt == root->mnt)
1949 break;
1950 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { 1932 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
1951 /* Global root? */ 1933 /* Global root? */
1952 if (vfsmnt->mnt_parent == vfsmnt) { 1934 if (vfsmnt->mnt_parent == vfsmnt) {
@@ -1958,28 +1940,88 @@ char *__d_path(const struct path *path, struct path *root,
1958 } 1940 }
1959 parent = dentry->d_parent; 1941 parent = dentry->d_parent;
1960 prefetch(parent); 1942 prefetch(parent);
1961 if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || 1943 error = prepend_name(buffer, buflen, &dentry->d_name);
1962 (prepend(&end, &buflen, "/", 1) != 0)) 1944 if (!error)
1963 goto Elong; 1945 error = prepend(buffer, buflen, "/", 1);
1964 retval = end; 1946 if (error)
1947 break;
1948
1949 slash = true;
1965 dentry = parent; 1950 dentry = parent;
1966 } 1951 }
1967 1952
1968out: 1953out:
1954 if (!error && !slash)
1955 error = prepend(buffer, buflen, "/", 1);
1956
1969 spin_unlock(&vfsmount_lock); 1957 spin_unlock(&vfsmount_lock);
1970 return retval; 1958 return error;
1971 1959
1972global_root: 1960global_root:
1973 retval += 1; /* hit the slash */ 1961 /*
1974 if (prepend_name(&retval, &buflen, &dentry->d_name) != 0) 1962 * Filesystems needing to implement special "root names"
1975 goto Elong; 1963 * should do so with ->d_dname()
1964 */
1965 if (IS_ROOT(dentry) &&
1966 (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) {
1967 WARN(1, "Root dentry has weird name <%.*s>\n",
1968 (int) dentry->d_name.len, dentry->d_name.name);
1969 }
1976 root->mnt = vfsmnt; 1970 root->mnt = vfsmnt;
1977 root->dentry = dentry; 1971 root->dentry = dentry;
1978 goto out; 1972 goto out;
1973}
1979 1974
1980Elong: 1975/**
1981 retval = ERR_PTR(-ENAMETOOLONG); 1976 * __d_path - return the path of a dentry
1982 goto out; 1977 * @path: the dentry/vfsmount to report
1978 * @root: root vfsmnt/dentry (may be modified by this function)
1979 * @buffer: buffer to return value in
1980 * @buflen: buffer length
1981 *
1982 * Convert a dentry into an ASCII path name.
1983 *
1984 * Returns a pointer into the buffer or an error code if the
1985 * path was too long.
1986 *
1987 * "buflen" should be positive. Caller holds the dcache_lock.
1988 *
1989 * If path is not reachable from the supplied root, then the value of
1990 * root is changed (without modifying refcounts).
1991 */
1992char *__d_path(const struct path *path, struct path *root,
1993 char *buf, int buflen)
1994{
1995 char *res = buf + buflen;
1996 int error;
1997
1998 prepend(&res, &buflen, "\0", 1);
1999 error = prepend_path(path, root, &res, &buflen);
2000 if (error)
2001 return ERR_PTR(error);
2002
2003 return res;
2004}
2005
2006/*
2007 * same as __d_path but appends "(deleted)" for unlinked files.
2008 */
2009static int path_with_deleted(const struct path *path, struct path *root,
2010 char **buf, int *buflen)
2011{
2012 prepend(buf, buflen, "\0", 1);
2013 if (d_unlinked(path->dentry)) {
2014 int error = prepend(buf, buflen, " (deleted)", 10);
2015 if (error)
2016 return error;
2017 }
2018
2019 return prepend_path(path, root, buf, buflen);
2020}
2021
2022static int prepend_unreachable(char **buffer, int *buflen)
2023{
2024 return prepend(buffer, buflen, "(unreachable)", 13);
1983} 2025}
1984 2026
1985/** 2027/**
@@ -2000,9 +2042,10 @@ Elong:
2000 */ 2042 */
2001char *d_path(const struct path *path, char *buf, int buflen) 2043char *d_path(const struct path *path, char *buf, int buflen)
2002{ 2044{
2003 char *res; 2045 char *res = buf + buflen;
2004 struct path root; 2046 struct path root;
2005 struct path tmp; 2047 struct path tmp;
2048 int error;
2006 2049
2007 /* 2050 /*
2008 * We have various synthetic filesystems that never get mounted. On 2051 * We have various synthetic filesystems that never get mounted. On
@@ -2014,19 +2057,51 @@ char *d_path(const struct path *path, char *buf, int buflen)
2014 if (path->dentry->d_op && path->dentry->d_op->d_dname) 2057 if (path->dentry->d_op && path->dentry->d_op->d_dname)
2015 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 2058 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2016 2059
2017 read_lock(&current->fs->lock); 2060 get_fs_root(current->fs, &root);
2018 root = current->fs->root;
2019 path_get(&root);
2020 read_unlock(&current->fs->lock);
2021 spin_lock(&dcache_lock); 2061 spin_lock(&dcache_lock);
2022 tmp = root; 2062 tmp = root;
2023 res = __d_path(path, &tmp, buf, buflen); 2063 error = path_with_deleted(path, &tmp, &res, &buflen);
2064 if (error)
2065 res = ERR_PTR(error);
2024 spin_unlock(&dcache_lock); 2066 spin_unlock(&dcache_lock);
2025 path_put(&root); 2067 path_put(&root);
2026 return res; 2068 return res;
2027} 2069}
2028EXPORT_SYMBOL(d_path); 2070EXPORT_SYMBOL(d_path);
2029 2071
2072/**
2073 * d_path_with_unreachable - return the path of a dentry
2074 * @path: path to report
2075 * @buf: buffer to return value in
2076 * @buflen: buffer length
2077 *
2078 * The difference from d_path() is that this prepends "(unreachable)"
2079 * to paths which are unreachable from the current process' root.
2080 */
2081char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2082{
2083 char *res = buf + buflen;
2084 struct path root;
2085 struct path tmp;
2086 int error;
2087
2088 if (path->dentry->d_op && path->dentry->d_op->d_dname)
2089 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2090
2091 get_fs_root(current->fs, &root);
2092 spin_lock(&dcache_lock);
2093 tmp = root;
2094 error = path_with_deleted(path, &tmp, &res, &buflen);
2095 if (!error && !path_equal(&tmp, &root))
2096 error = prepend_unreachable(&res, &buflen);
2097 spin_unlock(&dcache_lock);
2098 path_put(&root);
2099 if (error)
2100 res = ERR_PTR(error);
2101
2102 return res;
2103}
2104
2030/* 2105/*
2031 * Helper function for dentry_operations.d_dname() members 2106 * Helper function for dentry_operations.d_dname() members
2032 */ 2107 */
@@ -2129,27 +2204,30 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2129 if (!page) 2204 if (!page)
2130 return -ENOMEM; 2205 return -ENOMEM;
2131 2206
2132 read_lock(&current->fs->lock); 2207 get_fs_root_and_pwd(current->fs, &root, &pwd);
2133 pwd = current->fs->pwd;
2134 path_get(&pwd);
2135 root = current->fs->root;
2136 path_get(&root);
2137 read_unlock(&current->fs->lock);
2138 2208
2139 error = -ENOENT; 2209 error = -ENOENT;
2140 spin_lock(&dcache_lock); 2210 spin_lock(&dcache_lock);
2141 if (!d_unlinked(pwd.dentry)) { 2211 if (!d_unlinked(pwd.dentry)) {
2142 unsigned long len; 2212 unsigned long len;
2143 struct path tmp = root; 2213 struct path tmp = root;
2144 char * cwd; 2214 char *cwd = page + PAGE_SIZE;
2215 int buflen = PAGE_SIZE;
2145 2216
2146 cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE); 2217 prepend(&cwd, &buflen, "\0", 1);
2218 error = prepend_path(&pwd, &tmp, &cwd, &buflen);
2147 spin_unlock(&dcache_lock); 2219 spin_unlock(&dcache_lock);
2148 2220
2149 error = PTR_ERR(cwd); 2221 if (error)
2150 if (IS_ERR(cwd))
2151 goto out; 2222 goto out;
2152 2223
2224 /* Unreachable from current root */
2225 if (!path_equal(&tmp, &root)) {
2226 error = prepend_unreachable(&cwd, &buflen);
2227 if (error)
2228 goto out;
2229 }
2230
2153 error = -ERANGE; 2231 error = -ERANGE;
2154 len = PAGE_SIZE + page - cwd; 2232 len = PAGE_SIZE + page - cwd;
2155 if (len <= size) { 2233 if (len <= size) {
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index f9bfe2b501d5..68cb23e3bb98 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -30,9 +30,6 @@
30 * along with exofs; if not, write to the Free Software 30 * along with exofs; if not, write to the Free Software
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33
34#include <linux/buffer_head.h>
35
36#include "exofs.h" 33#include "exofs.h"
37 34
38static int exofs_release_file(struct inode *inode, struct file *filp) 35static int exofs_release_file(struct inode *inode, struct file *filp)
@@ -40,19 +37,27 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
40 return 0; 37 return 0;
41} 38}
42 39
40/* exofs_file_fsync - flush the inode to disk
41 *
42 * Note, in exofs all metadata is written as part of inode, regardless.
43 * The writeout is synchronous
44 */
43static int exofs_file_fsync(struct file *filp, int datasync) 45static int exofs_file_fsync(struct file *filp, int datasync)
44{ 46{
45 int ret; 47 int ret;
46 struct address_space *mapping = filp->f_mapping; 48 struct inode *inode = filp->f_mapping->host;
47 struct inode *inode = mapping->host; 49 struct writeback_control wbc = {
50 .sync_mode = WB_SYNC_ALL,
51 .nr_to_write = 0, /* metadata-only; caller takes care of data */
52 };
48 struct super_block *sb; 53 struct super_block *sb;
49 54
50 ret = filemap_write_and_wait(mapping); 55 if (!(inode->i_state & I_DIRTY))
51 if (ret) 56 return 0;
52 return ret; 57 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
58 return 0;
53 59
54 /* sync the inode attributes */ 60 ret = sync_inode(inode, &wbc);
55 ret = write_inode_now(inode, 1);
56 61
57 /* This is a good place to write the sb */ 62 /* This is a good place to write the sb */
58 /* TODO: Sechedule an sb-sync on create */ 63 /* TODO: Sechedule an sb-sync on create */
@@ -65,9 +70,9 @@ static int exofs_file_fsync(struct file *filp, int datasync)
65 70
66static int exofs_flush(struct file *file, fl_owner_t id) 71static int exofs_flush(struct file *file, fl_owner_t id)
67{ 72{
68 exofs_file_fsync(file, 1); 73 int ret = vfs_fsync(file, 0);
69 /* TODO: Flush the OSD target */ 74 /* TODO: Flush the OSD target */
70 return 0; 75 return ret;
71} 76}
72 77
73const struct file_operations exofs_file_operations = { 78const struct file_operations exofs_file_operations = {
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 088cb476b68a..eb7368ebd8cd 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -32,9 +32,6 @@
32 */ 32 */
33 33
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/writeback.h>
36#include <linux/buffer_head.h>
37#include <scsi/scsi_device.h>
38 35
39#include "exofs.h" 36#include "exofs.h"
40 37
@@ -773,15 +770,13 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
773{ 770{
774 EXOFS_DBGMSG("page 0x%lx\n", page->index); 771 EXOFS_DBGMSG("page 0x%lx\n", page->index);
775 WARN_ON(1); 772 WARN_ON(1);
776 return try_to_free_buffers(page); 773 return 0;
777} 774}
778 775
779static void exofs_invalidatepage(struct page *page, unsigned long offset) 776static void exofs_invalidatepage(struct page *page, unsigned long offset)
780{ 777{
781 EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page)); 778 EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
782 WARN_ON(1); 779 WARN_ON(1);
783
784 block_invalidatepage(page, offset);
785} 780}
786 781
787const struct address_space_operations exofs_aops = { 782const struct address_space_operations exofs_aops = {
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index e2732203fa93..6550bf70e41d 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -305,8 +305,6 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
305struct _striping_info { 305struct _striping_info {
306 u64 obj_offset; 306 u64 obj_offset;
307 u64 group_length; 307 u64 group_length;
308 u64 total_group_length;
309 u64 Major;
310 unsigned dev; 308 unsigned dev;
311 unsigned unit_off; 309 unsigned unit_off;
312}; 310};
@@ -343,8 +341,6 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
343 (M * group_depth * stripe_unit); 341 (M * group_depth * stripe_unit);
344 342
345 si->group_length = T - H; 343 si->group_length = T - H;
346 si->total_group_length = T;
347 si->Major = M;
348} 344}
349 345
350static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, 346static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
@@ -392,20 +388,19 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
392} 388}
393 389
394static int _prepare_one_group(struct exofs_io_state *ios, u64 length, 390static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
395 struct _striping_info *si, unsigned first_comp) 391 struct _striping_info *si)
396{ 392{
397 unsigned stripe_unit = ios->layout->stripe_unit; 393 unsigned stripe_unit = ios->layout->stripe_unit;
398 unsigned mirrors_p1 = ios->layout->mirrors_p1; 394 unsigned mirrors_p1 = ios->layout->mirrors_p1;
399 unsigned devs_in_group = ios->layout->group_width * mirrors_p1; 395 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
400 unsigned dev = si->dev; 396 unsigned dev = si->dev;
401 unsigned first_dev = dev - (dev % devs_in_group); 397 unsigned first_dev = dev - (dev % devs_in_group);
402 unsigned comp = first_comp + (dev - first_dev);
403 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; 398 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
404 unsigned cur_pg = ios->pages_consumed; 399 unsigned cur_pg = ios->pages_consumed;
405 int ret = 0; 400 int ret = 0;
406 401
407 while (length) { 402 while (length) {
408 struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; 403 struct exofs_per_dev_state *per_dev = &ios->per_dev[dev];
409 unsigned cur_len, page_off = 0; 404 unsigned cur_len, page_off = 0;
410 405
411 if (!per_dev->length) { 406 if (!per_dev->length) {
@@ -424,11 +419,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
424 cur_len = stripe_unit; 419 cur_len = stripe_unit;
425 } 420 }
426 421
427 if (max_comp < comp) 422 if (max_comp < dev)
428 max_comp = comp; 423 max_comp = dev;
429
430 dev += mirrors_p1;
431 dev = (dev % devs_in_group) + first_dev;
432 } else { 424 } else {
433 cur_len = stripe_unit; 425 cur_len = stripe_unit;
434 } 426 }
@@ -440,8 +432,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
440 if (unlikely(ret)) 432 if (unlikely(ret))
441 goto out; 433 goto out;
442 434
443 comp += mirrors_p1; 435 dev += mirrors_p1;
444 comp = (comp % devs_in_group) + first_comp; 436 dev = (dev % devs_in_group) + first_dev;
445 437
446 length -= cur_len; 438 length -= cur_len;
447 } 439 }
@@ -454,18 +446,15 @@ out:
454static int _prepare_for_striping(struct exofs_io_state *ios) 446static int _prepare_for_striping(struct exofs_io_state *ios)
455{ 447{
456 u64 length = ios->length; 448 u64 length = ios->length;
449 u64 offset = ios->offset;
457 struct _striping_info si; 450 struct _striping_info si;
458 unsigned devs_in_group = ios->layout->group_width *
459 ios->layout->mirrors_p1;
460 unsigned first_comp = 0;
461 int ret = 0; 451 int ret = 0;
462 452
463 _calc_stripe_info(ios, ios->offset, &si);
464
465 if (!ios->pages) { 453 if (!ios->pages) {
466 if (ios->kern_buff) { 454 if (ios->kern_buff) {
467 struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; 455 struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
468 456
457 _calc_stripe_info(ios, ios->offset, &si);
469 per_dev->offset = si.obj_offset; 458 per_dev->offset = si.obj_offset;
470 per_dev->dev = si.dev; 459 per_dev->dev = si.dev;
471 460
@@ -479,26 +468,17 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
479 } 468 }
480 469
481 while (length) { 470 while (length) {
471 _calc_stripe_info(ios, offset, &si);
472
482 if (length < si.group_length) 473 if (length < si.group_length)
483 si.group_length = length; 474 si.group_length = length;
484 475
485 ret = _prepare_one_group(ios, si.group_length, &si, first_comp); 476 ret = _prepare_one_group(ios, si.group_length, &si);
486 if (unlikely(ret)) 477 if (unlikely(ret))
487 goto out; 478 goto out;
488 479
480 offset += si.group_length;
489 length -= si.group_length; 481 length -= si.group_length;
490
491 si.group_length = si.total_group_length;
492 si.unit_off = 0;
493 ++si.Major;
494 si.obj_offset = si.Major * ios->layout->stripe_unit *
495 ios->layout->group_depth;
496
497 si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
498 si.dev %= ios->layout->s_numdevs;
499
500 first_comp += devs_in_group;
501 first_comp %= ios->layout->s_numdevs;
502 } 482 }
503 483
504out: 484out:
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 32cfd61def5f..047e92fa3af8 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -31,7 +31,6 @@
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33 33
34#include <linux/smp_lock.h>
35#include <linux/string.h> 34#include <linux/string.h>
36#include <linux/parser.h> 35#include <linux/parser.h>
37#include <linux/vfs.h> 36#include <linux/vfs.h>
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9d175d623aab..6769fd0f35b8 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -767,11 +767,22 @@ void kill_fasync(struct fasync_struct **fp, int sig, int band)
767} 767}
768EXPORT_SYMBOL(kill_fasync); 768EXPORT_SYMBOL(kill_fasync);
769 769
770static int __init fasync_init(void) 770static int __init fcntl_init(void)
771{ 771{
772 /* please add new bits here to ensure allocation uniqueness */
773 BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
774 O_RDONLY | O_WRONLY | O_RDWR |
775 O_CREAT | O_EXCL | O_NOCTTY |
776 O_TRUNC | O_APPEND | O_NONBLOCK |
777 __O_SYNC | O_DSYNC | FASYNC |
778 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
779 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
780 FMODE_EXEC
781 ));
782
772 fasync_cache = kmem_cache_create("fasync_cache", 783 fasync_cache = kmem_cache_create("fasync_cache",
773 sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); 784 sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
774 return 0; 785 return 0;
775} 786}
776 787
777module_init(fasync_init) 788module_init(fcntl_init)
diff --git a/fs/file.c b/fs/file.c
index cccaead962c2..0be344755c02 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -39,28 +39,27 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */
39 */ 39 */
40static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); 40static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
41 41
42static inline void * alloc_fdmem(unsigned int size) 42static inline void *alloc_fdmem(unsigned int size)
43{ 43{
44 if (size <= PAGE_SIZE) 44 void *data;
45 return kmalloc(size, GFP_KERNEL); 45
46 else 46 data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
47 return vmalloc(size); 47 if (data != NULL)
48 return data;
49
50 return vmalloc(size);
48} 51}
49 52
50static inline void free_fdarr(struct fdtable *fdt) 53static void free_fdmem(void *ptr)
51{ 54{
52 if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) 55 is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
53 kfree(fdt->fd);
54 else
55 vfree(fdt->fd);
56} 56}
57 57
58static inline void free_fdset(struct fdtable *fdt) 58static void __free_fdtable(struct fdtable *fdt)
59{ 59{
60 if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2)) 60 free_fdmem(fdt->fd);
61 kfree(fdt->open_fds); 61 free_fdmem(fdt->open_fds);
62 else 62 kfree(fdt);
63 vfree(fdt->open_fds);
64} 63}
65 64
66static void free_fdtable_work(struct work_struct *work) 65static void free_fdtable_work(struct work_struct *work)
@@ -75,9 +74,8 @@ static void free_fdtable_work(struct work_struct *work)
75 spin_unlock_bh(&f->lock); 74 spin_unlock_bh(&f->lock);
76 while(fdt) { 75 while(fdt) {
77 struct fdtable *next = fdt->next; 76 struct fdtable *next = fdt->next;
78 vfree(fdt->fd); 77
79 free_fdset(fdt); 78 __free_fdtable(fdt);
80 kfree(fdt);
81 fdt = next; 79 fdt = next;
82 } 80 }
83} 81}
@@ -98,7 +96,7 @@ void free_fdtable_rcu(struct rcu_head *rcu)
98 container_of(fdt, struct files_struct, fdtab)); 96 container_of(fdt, struct files_struct, fdtab));
99 return; 97 return;
100 } 98 }
101 if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) { 99 if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
102 kfree(fdt->fd); 100 kfree(fdt->fd);
103 kfree(fdt->open_fds); 101 kfree(fdt->open_fds);
104 kfree(fdt); 102 kfree(fdt);
@@ -183,7 +181,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
183 return fdt; 181 return fdt;
184 182
185out_arr: 183out_arr:
186 free_fdarr(fdt); 184 free_fdmem(fdt->fd);
187out_fdt: 185out_fdt:
188 kfree(fdt); 186 kfree(fdt);
189out: 187out:
@@ -213,9 +211,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
213 * caller and alloc_fdtable(). Cheaper to catch it here... 211 * caller and alloc_fdtable(). Cheaper to catch it here...
214 */ 212 */
215 if (unlikely(new_fdt->max_fds <= nr)) { 213 if (unlikely(new_fdt->max_fds <= nr)) {
216 free_fdarr(new_fdt); 214 __free_fdtable(new_fdt);
217 free_fdset(new_fdt);
218 kfree(new_fdt);
219 return -EMFILE; 215 return -EMFILE;
220 } 216 }
221 /* 217 /*
@@ -231,9 +227,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
231 free_fdtable(cur_fdt); 227 free_fdtable(cur_fdt);
232 } else { 228 } else {
233 /* Somebody else expanded, so undo our attempt */ 229 /* Somebody else expanded, so undo our attempt */
234 free_fdarr(new_fdt); 230 __free_fdtable(new_fdt);
235 free_fdset(new_fdt);
236 kfree(new_fdt);
237 } 231 }
238 return 1; 232 return 1;
239} 233}
@@ -323,11 +317,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
323 while (unlikely(open_files > new_fdt->max_fds)) { 317 while (unlikely(open_files > new_fdt->max_fds)) {
324 spin_unlock(&oldf->file_lock); 318 spin_unlock(&oldf->file_lock);
325 319
326 if (new_fdt != &newf->fdtab) { 320 if (new_fdt != &newf->fdtab)
327 free_fdarr(new_fdt); 321 __free_fdtable(new_fdt);
328 free_fdset(new_fdt);
329 kfree(new_fdt);
330 }
331 322
332 new_fdt = alloc_fdtable(open_files - 1); 323 new_fdt = alloc_fdtable(open_files - 1);
333 if (!new_fdt) { 324 if (!new_fdt) {
@@ -337,9 +328,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
337 328
338 /* beyond sysctl_nr_open; nothing to do */ 329 /* beyond sysctl_nr_open; nothing to do */
339 if (unlikely(new_fdt->max_fds < open_files)) { 330 if (unlikely(new_fdt->max_fds < open_files)) {
340 free_fdarr(new_fdt); 331 __free_fdtable(new_fdt);
341 free_fdset(new_fdt);
342 kfree(new_fdt);
343 *errorp = -EMFILE; 332 *errorp = -EMFILE;
344 goto out_release; 333 goto out_release;
345 } 334 }
diff --git a/fs/file_table.c b/fs/file_table.c
index b8a0bb63cbd7..edecd36fed9b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -230,15 +230,6 @@ static void __fput(struct file *file)
230 might_sleep(); 230 might_sleep();
231 231
232 fsnotify_close(file); 232 fsnotify_close(file);
233
234 /*
235 * fsnotify_create_event may have taken one or more references on this
236 * file. If it did so it left one reference for us to drop to make sure
237 * its calls to fput could not prematurely destroy the file.
238 */
239 if (atomic_long_read(&file->f_count))
240 return fput(file);
241
242 /* 233 /*
243 * The function eventpoll_release() should be the first called 234 * The function eventpoll_release() should be the first called
244 * in the file cleanup chain. 235 * in the file cleanup chain.
@@ -298,11 +289,20 @@ struct file *fget(unsigned int fd)
298EXPORT_SYMBOL(fget); 289EXPORT_SYMBOL(fget);
299 290
300/* 291/*
301 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 292 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
302 * You can use this only if it is guranteed that the current task already 293 *
303 * holds a refcnt to that file. That check has to be done at fget() only 294 * You can use this instead of fget if you satisfy all of the following
304 * and a flag is returned to be passed to the corresponding fput_light(). 295 * conditions:
305 * There must not be a cloning between an fget_light/fput_light pair. 296 * 1) You must call fput_light before exiting the syscall and returning control
297 * to userspace (i.e. you cannot remember the returned struct file * after
298 * returning to userspace).
299 * 2) You must not call filp_close on the returned struct file * in between
300 * calls to fget_light and fput_light.
301 * 3) You must not clone the current task in between the calls to fget_light
302 * and fput_light.
303 *
304 * The fput_needed flag returned by fget_light should be passed to the
305 * corresponding fput_light.
306 */ 306 */
307struct file *fget_light(unsigned int fd, int *fput_needed) 307struct file *fget_light(unsigned int fd, int *fput_needed)
308{ 308{
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2f76c4a081a2..7d9d06ba184b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -68,7 +68,7 @@ int nr_pdflush_threads;
68 */ 68 */
69int writeback_in_progress(struct backing_dev_info *bdi) 69int writeback_in_progress(struct backing_dev_info *bdi)
70{ 70{
71 return !list_empty(&bdi->work_list); 71 return test_bit(BDI_writeback_running, &bdi->state);
72} 72}
73 73
74static void bdi_queue_work(struct backing_dev_info *bdi, 74static void bdi_queue_work(struct backing_dev_info *bdi,
@@ -249,10 +249,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
249 249
250/* 250/*
251 * Queue all expired dirty inodes for io, eldest first. 251 * Queue all expired dirty inodes for io, eldest first.
252 * Before
253 * newly dirtied b_dirty b_io b_more_io
254 * =============> gf edc BA
255 * After
256 * newly dirtied b_dirty b_io b_more_io
257 * =============> g fBAedc
258 * |
259 * +--> dequeue for IO
252 */ 260 */
253static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 261static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
254{ 262{
255 list_splice_init(&wb->b_more_io, wb->b_io.prev); 263 list_splice_init(&wb->b_more_io, &wb->b_io);
256 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 264 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
257} 265}
258 266
@@ -363,62 +371,35 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
363 spin_lock(&inode_lock); 371 spin_lock(&inode_lock);
364 inode->i_state &= ~I_SYNC; 372 inode->i_state &= ~I_SYNC;
365 if (!(inode->i_state & I_FREEING)) { 373 if (!(inode->i_state & I_FREEING)) {
366 if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { 374 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
367 /*
368 * More pages get dirtied by a fast dirtier.
369 */
370 goto select_queue;
371 } else if (inode->i_state & I_DIRTY) {
372 /*
373 * At least XFS will redirty the inode during the
374 * writeback (delalloc) and on io completion (isize).
375 */
376 redirty_tail(inode);
377 } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
378 /* 375 /*
379 * We didn't write back all the pages. nfs_writepages() 376 * We didn't write back all the pages. nfs_writepages()
380 * sometimes bales out without doing anything. Redirty 377 * sometimes bales out without doing anything.
381 * the inode; Move it from b_io onto b_more_io/b_dirty.
382 */ 378 */
383 /* 379 inode->i_state |= I_DIRTY_PAGES;
384 * akpm: if the caller was the kupdate function we put 380 if (wbc->nr_to_write <= 0) {
385 * this inode at the head of b_dirty so it gets first
386 * consideration. Otherwise, move it to the tail, for
387 * the reasons described there. I'm not really sure
388 * how much sense this makes. Presumably I had a good
389 * reasons for doing it this way, and I'd rather not
390 * muck with it at present.
391 */
392 if (wbc->for_kupdate) {
393 /* 381 /*
394 * For the kupdate function we move the inode 382 * slice used up: queue for next turn
395 * to b_more_io so it will get more writeout as
396 * soon as the queue becomes uncongested.
397 */ 383 */
398 inode->i_state |= I_DIRTY_PAGES; 384 requeue_io(inode);
399select_queue:
400 if (wbc->nr_to_write <= 0) {
401 /*
402 * slice used up: queue for next turn
403 */
404 requeue_io(inode);
405 } else {
406 /*
407 * somehow blocked: retry later
408 */
409 redirty_tail(inode);
410 }
411 } else { 385 } else {
412 /* 386 /*
413 * Otherwise fully redirty the inode so that 387 * Writeback blocked by something other than
414 * other inodes on this superblock will get some 388 * congestion. Delay the inode for some time to
415 * writeout. Otherwise heavy writing to one 389 * avoid spinning on the CPU (100% iowait)
416 * file would indefinitely suspend writeout of 390 * retrying writeback of the dirty page/inode
417 * all the other files. 391 * that cannot be performed immediately.
418 */ 392 */
419 inode->i_state |= I_DIRTY_PAGES;
420 redirty_tail(inode); 393 redirty_tail(inode);
421 } 394 }
395 } else if (inode->i_state & I_DIRTY) {
396 /*
397 * Filesystems can dirty the inode during writeback
398 * operations, such as delayed allocation during
399 * submission or metadata updates after data IO
400 * completion.
401 */
402 redirty_tail(inode);
422 } else if (atomic_read(&inode->i_count)) { 403 } else if (atomic_read(&inode->i_count)) {
423 /* 404 /*
424 * The inode is clean, inuse 405 * The inode is clean, inuse
@@ -590,7 +571,7 @@ static inline bool over_bground_thresh(void)
590{ 571{
591 unsigned long background_thresh, dirty_thresh; 572 unsigned long background_thresh, dirty_thresh;
592 573
593 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 574 global_dirty_limits(&background_thresh, &dirty_thresh);
594 575
595 return (global_page_state(NR_FILE_DIRTY) + 576 return (global_page_state(NR_FILE_DIRTY) +
596 global_page_state(NR_UNSTABLE_NFS) >= background_thresh); 577 global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
@@ -759,6 +740,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
759 struct wb_writeback_work *work; 740 struct wb_writeback_work *work;
760 long wrote = 0; 741 long wrote = 0;
761 742
743 set_bit(BDI_writeback_running, &wb->bdi->state);
762 while ((work = get_next_work_item(bdi)) != NULL) { 744 while ((work = get_next_work_item(bdi)) != NULL) {
763 /* 745 /*
764 * Override sync mode, in case we must wait for completion 746 * Override sync mode, in case we must wait for completion
@@ -785,6 +767,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
785 * Check for periodic writeback, kupdated() style 767 * Check for periodic writeback, kupdated() style
786 */ 768 */
787 wrote += wb_check_old_data_flush(wb); 769 wrote += wb_check_old_data_flush(wb);
770 clear_bit(BDI_writeback_running, &wb->bdi->state);
788 771
789 return wrote; 772 return wrote;
790} 773}
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index eee059052db5..1ee40eb9a2c0 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -106,12 +106,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
106 fs->in_exec = 0; 106 fs->in_exec = 0;
107 rwlock_init(&fs->lock); 107 rwlock_init(&fs->lock);
108 fs->umask = old->umask; 108 fs->umask = old->umask;
109 read_lock(&old->lock); 109 get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
110 fs->root = old->root;
111 path_get(&old->root);
112 fs->pwd = old->pwd;
113 path_get(&old->pwd);
114 read_unlock(&old->lock);
115 } 110 }
116 return fs; 111 return fs;
117} 112}
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 6a026441c5a6..f6aad48d38a8 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -321,17 +321,11 @@ void fscache_put_context(struct fscache_cookie *cookie, void *context)
321#define dbgprintk(FMT, ...) \ 321#define dbgprintk(FMT, ...) \
322 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) 322 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
323 323
324/* make sure we maintain the format strings, even when debugging is disabled */
325static inline __attribute__((format(printf, 1, 2)))
326void _dbprintk(const char *fmt, ...)
327{
328}
329
330#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 324#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
331#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 325#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
332#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) 326#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
333 327
334#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) 328#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
335 329
336#ifdef __KDEBUG 330#ifdef __KDEBUG
337#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) 331#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
@@ -358,9 +352,9 @@ do { \
358} while (0) 352} while (0)
359 353
360#else 354#else
361#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 355#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
362#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 356#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
363#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) 357#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
364#endif 358#endif
365 359
366/* 360/*
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 6b4dcd4f2943..5a44811b5027 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -722,7 +722,12 @@ root_found:
722 } 722 }
723 723
724 s->s_magic = ISOFS_SUPER_MAGIC; 724 s->s_magic = ISOFS_SUPER_MAGIC;
725 s->s_maxbytes = 0xffffffff; /* We can handle files up to 4 GB */ 725
726 /*
727 * With multi-extent files, file size is only limited by the maximum
728 * size of a file system, which is 8 TB.
729 */
730 s->s_maxbytes = 0x80000000000LL;
726 731
727 /* 732 /*
728 * The CDROM is read-only, has no nodes (devices) on it, and since 733 * The CDROM is read-only, has no nodes (devices) on it, and since
diff --git a/fs/namei.c b/fs/namei.c
index 13ff4abdbdca..17ea76bf2fbe 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -483,13 +483,8 @@ ok:
483 483
484static __always_inline void set_root(struct nameidata *nd) 484static __always_inline void set_root(struct nameidata *nd)
485{ 485{
486 if (!nd->root.mnt) { 486 if (!nd->root.mnt)
487 struct fs_struct *fs = current->fs; 487 get_fs_root(current->fs, &nd->root);
488 read_lock(&fs->lock);
489 nd->root = fs->root;
490 path_get(&nd->root);
491 read_unlock(&fs->lock);
492 }
493} 488}
494 489
495static int link_path_walk(const char *, struct nameidata *); 490static int link_path_walk(const char *, struct nameidata *);
@@ -1015,11 +1010,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
1015 nd->path = nd->root; 1010 nd->path = nd->root;
1016 path_get(&nd->root); 1011 path_get(&nd->root);
1017 } else if (dfd == AT_FDCWD) { 1012 } else if (dfd == AT_FDCWD) {
1018 struct fs_struct *fs = current->fs; 1013 get_fs_pwd(current->fs, &nd->path);
1019 read_lock(&fs->lock);
1020 nd->path = fs->pwd;
1021 path_get(&fs->pwd);
1022 read_unlock(&fs->lock);
1023 } else { 1014 } else {
1024 struct dentry *dentry; 1015 struct dentry *dentry;
1025 1016
diff --git a/fs/namespace.c b/fs/namespace.c
index 66c4f7e781cb..2e10cb19c5b0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -788,7 +788,6 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
788 { MNT_NOATIME, ",noatime" }, 788 { MNT_NOATIME, ",noatime" },
789 { MNT_NODIRATIME, ",nodiratime" }, 789 { MNT_NODIRATIME, ",nodiratime" },
790 { MNT_RELATIME, ",relatime" }, 790 { MNT_RELATIME, ",relatime" },
791 { MNT_STRICTATIME, ",strictatime" },
792 { 0, NULL } 791 { 0, NULL }
793 }; 792 };
794 const struct proc_fs_info *fs_infop; 793 const struct proc_fs_info *fs_infop;
@@ -2213,10 +2212,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2213 goto out1; 2212 goto out1;
2214 } 2213 }
2215 2214
2216 read_lock(&current->fs->lock); 2215 get_fs_root(current->fs, &root);
2217 root = current->fs->root;
2218 path_get(&current->fs->root);
2219 read_unlock(&current->fs->lock);
2220 down_write(&namespace_sem); 2216 down_write(&namespace_sem);
2221 mutex_lock(&old.dentry->d_inode->i_mutex); 2217 mutex_lock(&old.dentry->d_inode->i_mutex);
2222 error = -EINVAL; 2218 error = -EINVAL;
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index cc1bb33b59b8..26a510a7be09 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -100,3 +100,20 @@ config NFS_FSCACHE
100 help 100 help
101 Say Y here if you want NFS data to be cached locally on disc through 101 Say Y here if you want NFS data to be cached locally on disc through
102 the general filesystem cache manager 102 the general filesystem cache manager
103
104config NFS_USE_LEGACY_DNS
105 bool "Use the legacy NFS DNS resolver"
106 depends on NFS_V4
107 help
108 The kernel now provides a method for translating a host name into an
109 IP address. Select Y here if you would rather use your own DNS
110 resolver script.
111
112 If unsure, say N
113
114config NFS_USE_KERNEL_DNS
115 bool
116 depends on NFS_V4 && !NFS_USE_LEGACY_DNS
117 select DNS_RESOLVER
118 select KEYS
119 default y
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 36dfdae95123..e17b49e2eabd 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -45,7 +45,7 @@ unsigned short nfs_callback_tcpport;
45unsigned short nfs_callback_tcpport6; 45unsigned short nfs_callback_tcpport6;
46#define NFS_CALLBACK_MAXPORTNR (65535U) 46#define NFS_CALLBACK_MAXPORTNR (65535U)
47 47
48static int param_set_portnr(const char *val, struct kernel_param *kp) 48static int param_set_portnr(const char *val, const struct kernel_param *kp)
49{ 49{
50 unsigned long num; 50 unsigned long num;
51 int ret; 51 int ret;
@@ -58,11 +58,10 @@ static int param_set_portnr(const char *val, struct kernel_param *kp)
58 *((unsigned int *)kp->arg) = num; 58 *((unsigned int *)kp->arg) = num;
59 return 0; 59 return 0;
60} 60}
61 61static struct kernel_param_ops param_ops_portnr = {
62static int param_get_portnr(char *buffer, struct kernel_param *kp) 62 .set = param_set_portnr,
63{ 63 .get = param_get_uint,
64 return param_get_uint(buffer, kp); 64};
65}
66#define param_check_portnr(name, p) __param_check(name, p, unsigned int); 65#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
67 66
68module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); 67module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 76fd235d0024..dba50a5625db 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -6,6 +6,29 @@
6 * Resolves DNS hostnames into valid ip addresses 6 * Resolves DNS hostnames into valid ip addresses
7 */ 7 */
8 8
9#ifdef CONFIG_NFS_USE_KERNEL_DNS
10
11#include <linux/sunrpc/clnt.h>
12#include <linux/dns_resolver.h>
13
14ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
15 struct sockaddr *sa, size_t salen)
16{
17 ssize_t ret;
18 char *ip_addr = NULL;
19 int ip_len;
20
21 ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
22 if (ip_len > 0)
23 ret = rpc_pton(ip_addr, ip_len, sa, salen);
24 else
25 ret = -ESRCH;
26 kfree(ip_addr);
27 return ret;
28}
29
30#else
31
9#include <linux/hash.h> 32#include <linux/hash.h>
10#include <linux/string.h> 33#include <linux/string.h>
11#include <linux/kmod.h> 34#include <linux/kmod.h>
@@ -346,3 +369,4 @@ void nfs_dns_resolver_destroy(void)
346 nfs_cache_unregister(&nfs_dns_resolve); 369 nfs_cache_unregister(&nfs_dns_resolve);
347} 370}
348 371
372#endif
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
index a3f0938babf7..199bb5543a91 100644
--- a/fs/nfs/dns_resolve.h
+++ b/fs/nfs/dns_resolve.h
@@ -6,8 +6,20 @@
6 6
7#define NFS_DNS_HOSTNAME_MAXLEN (128) 7#define NFS_DNS_HOSTNAME_MAXLEN (128)
8 8
9
10#ifdef CONFIG_NFS_USE_KERNEL_DNS
11static inline int nfs_dns_resolver_init(void)
12{
13 return 0;
14}
15
16static inline void nfs_dns_resolver_destroy(void)
17{}
18#else
9extern int nfs_dns_resolver_init(void); 19extern int nfs_dns_resolver_init(void);
10extern void nfs_dns_resolver_destroy(void); 20extern void nfs_dns_resolver_destroy(void);
21#endif
22
11extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, 23extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
12 struct sockaddr *sa, size_t salen); 24 struct sockaddr *sa, size_t salen);
13 25
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index eb8f73c9c131..756566fe8449 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -17,9 +17,9 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
17 old->data_type == new->data_type && 17 old->data_type == new->data_type &&
18 old->tgid == new->tgid) { 18 old->tgid == new->tgid) {
19 switch (old->data_type) { 19 switch (old->data_type) {
20 case (FSNOTIFY_EVENT_FILE): 20 case (FSNOTIFY_EVENT_PATH):
21 if ((old->file->f_path.mnt == new->file->f_path.mnt) && 21 if ((old->path.mnt == new->path.mnt) &&
22 (old->file->f_path.dentry == new->file->f_path.dentry)) 22 (old->path.dentry == new->path.dentry))
23 return true; 23 return true;
24 case (FSNOTIFY_EVENT_NONE): 24 case (FSNOTIFY_EVENT_NONE):
25 return true; 25 return true;
@@ -174,7 +174,7 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
174 return false; 174 return false;
175 175
176 /* if we don't have enough info to send an event to userspace say no */ 176 /* if we don't have enough info to send an event to userspace say no */
177 if (data_type != FSNOTIFY_EVENT_FILE) 177 if (data_type != FSNOTIFY_EVENT_PATH)
178 return false; 178 return false;
179 179
180 if (inode_mark && vfsmnt_mark) { 180 if (inode_mark && vfsmnt_mark) {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 25a3b4dfcf61..032b837fcd11 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -65,7 +65,7 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
65 if (client_fd < 0) 65 if (client_fd < 0)
66 return client_fd; 66 return client_fd;
67 67
68 if (event->data_type != FSNOTIFY_EVENT_FILE) { 68 if (event->data_type != FSNOTIFY_EVENT_PATH) {
69 WARN_ON(1); 69 WARN_ON(1);
70 put_unused_fd(client_fd); 70 put_unused_fd(client_fd);
71 return -EINVAL; 71 return -EINVAL;
@@ -75,8 +75,8 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
75 * we need a new file handle for the userspace program so it can read even if it was 75 * we need a new file handle for the userspace program so it can read even if it was
76 * originally opened O_WRONLY. 76 * originally opened O_WRONLY.
77 */ 77 */
78 dentry = dget(event->file->f_path.dentry); 78 dentry = dget(event->path.dentry);
79 mnt = mntget(event->file->f_path.mnt); 79 mnt = mntget(event->path.mnt);
80 /* it's possible this event was an overflow event. in that case dentry and mnt 80 /* it's possible this event was an overflow event. in that case dentry and mnt
81 * are NULL; That's fine, just don't call dentry open */ 81 * are NULL; That's fine, just don't call dentry open */
82 if (dentry && mnt) 82 if (dentry && mnt)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4d2a82c1ceb1..3970392b2722 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -84,7 +84,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
84} 84}
85 85
86/* Notify this dentry's parent about a child's events. */ 86/* Notify this dentry's parent about a child's events. */
87void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask) 87void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
88{ 88{
89 struct dentry *parent; 89 struct dentry *parent;
90 struct inode *p_inode; 90 struct inode *p_inode;
@@ -92,7 +92,7 @@ void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
92 bool should_update_children = false; 92 bool should_update_children = false;
93 93
94 if (!dentry) 94 if (!dentry)
95 dentry = file->f_path.dentry; 95 dentry = path->dentry;
96 96
97 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) 97 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
98 return; 98 return;
@@ -124,8 +124,8 @@ void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
124 * specifies these are events which came from a child. */ 124 * specifies these are events which came from a child. */
125 mask |= FS_EVENT_ON_CHILD; 125 mask |= FS_EVENT_ON_CHILD;
126 126
127 if (file) 127 if (path)
128 fsnotify(p_inode, mask, file, FSNOTIFY_EVENT_FILE, 128 fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
129 dentry->d_name.name, 0); 129 dentry->d_name.name, 0);
130 else 130 else
131 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, 131 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
@@ -217,8 +217,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
217 /* global tests shouldn't care about events on child only the specific event */ 217 /* global tests shouldn't care about events on child only the specific event */
218 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); 218 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
219 219
220 if (data_is == FSNOTIFY_EVENT_FILE) 220 if (data_is == FSNOTIFY_EVENT_PATH)
221 mnt = ((struct file *)data)->f_path.mnt; 221 mnt = ((struct path *)data)->mnt;
222 else 222 else
223 mnt = NULL; 223 mnt = NULL;
224 224
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 5e73eeb2c697..a91b69a6a291 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -52,9 +52,9 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
52 !strcmp(old->file_name, new->file_name)) 52 !strcmp(old->file_name, new->file_name))
53 return true; 53 return true;
54 break; 54 break;
55 case (FSNOTIFY_EVENT_FILE): 55 case (FSNOTIFY_EVENT_PATH):
56 if ((old->file->f_path.mnt == new->file->f_path.mnt) && 56 if ((old->path.mnt == new->path.mnt) &&
57 (old->file->f_path.dentry == new->file->f_path.dentry)) 57 (old->path.dentry == new->path.dentry))
58 return true; 58 return true;
59 break; 59 break;
60 case (FSNOTIFY_EVENT_NONE): 60 case (FSNOTIFY_EVENT_NONE):
@@ -147,10 +147,10 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
147 __u32 mask, void *data, int data_type) 147 __u32 mask, void *data, int data_type)
148{ 148{
149 if ((inode_mark->mask & FS_EXCL_UNLINK) && 149 if ((inode_mark->mask & FS_EXCL_UNLINK) &&
150 (data_type == FSNOTIFY_EVENT_FILE)) { 150 (data_type == FSNOTIFY_EVENT_PATH)) {
151 struct file *file = data; 151 struct path *path = data;
152 152
153 if (d_unlinked(file->f_path.dentry)) 153 if (d_unlinked(path->dentry))
154 return false; 154 return false;
155 } 155 }
156 156
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index d6c435adc7a2..f39260f8f865 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -31,7 +31,6 @@
31 * allocated and used. 31 * allocated and used.
32 */ 32 */
33 33
34#include <linux/file.h>
35#include <linux/fs.h> 34#include <linux/fs.h>
36#include <linux/init.h> 35#include <linux/init.h>
37#include <linux/kernel.h> 36#include <linux/kernel.h>
@@ -90,8 +89,8 @@ void fsnotify_put_event(struct fsnotify_event *event)
90 if (atomic_dec_and_test(&event->refcnt)) { 89 if (atomic_dec_and_test(&event->refcnt)) {
91 pr_debug("%s: event=%p\n", __func__, event); 90 pr_debug("%s: event=%p\n", __func__, event);
92 91
93 if (event->data_type == FSNOTIFY_EVENT_FILE) 92 if (event->data_type == FSNOTIFY_EVENT_PATH)
94 fput(event->file); 93 path_put(&event->path);
95 94
96 BUG_ON(!list_empty(&event->private_data_list)); 95 BUG_ON(!list_empty(&event->private_data_list));
97 96
@@ -376,8 +375,8 @@ struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
376 } 375 }
377 } 376 }
378 event->tgid = get_pid(old_event->tgid); 377 event->tgid = get_pid(old_event->tgid);
379 if (event->data_type == FSNOTIFY_EVENT_FILE) 378 if (event->data_type == FSNOTIFY_EVENT_PATH)
380 get_file(event->file); 379 path_get(&event->path);
381 380
382 return event; 381 return event;
383} 382}
@@ -424,22 +423,11 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
424 event->data_type = data_type; 423 event->data_type = data_type;
425 424
426 switch (data_type) { 425 switch (data_type) {
427 case FSNOTIFY_EVENT_FILE: { 426 case FSNOTIFY_EVENT_PATH: {
428 event->file = data; 427 struct path *path = data;
429 /* 428 event->path.dentry = path->dentry;
430 * if this file is about to disappear hold an extra reference 429 event->path.mnt = path->mnt;
431 * until we return to __fput so we don't have to worry about 430 path_get(&event->path);
432 * future get/put destroying the file under us or generating
433 * additional events. Notice that we change f_mode without
434 * holding f_lock. This is safe since this is the only possible
435 * reference to this object in the kernel (it was about to be
436 * freed, remember?)
437 */
438 if (!atomic_long_read(&event->file->f_count)) {
439 event->file->f_mode |= FMODE_NONOTIFY;
440 get_file(event->file);
441 }
442 get_file(event->file);
443 break; 431 break;
444 } 432 }
445 case FSNOTIFY_EVENT_INODE: 433 case FSNOTIFY_EVENT_INODE:
@@ -447,7 +435,8 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
447 break; 435 break;
448 case FSNOTIFY_EVENT_NONE: 436 case FSNOTIFY_EVENT_NONE:
449 event->inode = NULL; 437 event->inode = NULL;
450 event->file = NULL; 438 event->path.dentry = NULL;
439 event->path.mnt = NULL;
451 break; 440 break;
452 default: 441 default:
453 BUG(); 442 BUG();
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index da702294d7e7..a76e0aa5cd3f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -290,12 +290,30 @@ static int ocfs2_set_acl(handle_t *handle,
290 290
291int ocfs2_check_acl(struct inode *inode, int mask) 291int ocfs2_check_acl(struct inode *inode, int mask)
292{ 292{
293 struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); 293 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
294 struct buffer_head *di_bh = NULL;
295 struct posix_acl *acl;
296 int ret = -EAGAIN;
294 297
295 if (IS_ERR(acl)) 298 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
299 return ret;
300
301 ret = ocfs2_read_inode_block(inode, &di_bh);
302 if (ret < 0) {
303 mlog_errno(ret);
304 return ret;
305 }
306
307 acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh);
308
309 brelse(di_bh);
310
311 if (IS_ERR(acl)) {
312 mlog_errno(PTR_ERR(acl));
296 return PTR_ERR(acl); 313 return PTR_ERR(acl);
314 }
297 if (acl) { 315 if (acl) {
298 int ret = posix_acl_permission(inode, acl, mask); 316 ret = posix_acl_permission(inode, acl, mask);
299 posix_acl_release(acl); 317 posix_acl_release(acl);
300 return ret; 318 return ret;
301 } 319 }
@@ -344,7 +362,7 @@ int ocfs2_init_acl(handle_t *handle,
344{ 362{
345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 363 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
346 struct posix_acl *acl = NULL; 364 struct posix_acl *acl = NULL;
347 int ret = 0; 365 int ret = 0, ret2;
348 mode_t mode; 366 mode_t mode;
349 367
350 if (!S_ISLNK(inode->i_mode)) { 368 if (!S_ISLNK(inode->i_mode)) {
@@ -381,7 +399,12 @@ int ocfs2_init_acl(handle_t *handle,
381 mode = inode->i_mode; 399 mode = inode->i_mode;
382 ret = posix_acl_create_masq(clone, &mode); 400 ret = posix_acl_create_masq(clone, &mode);
383 if (ret >= 0) { 401 if (ret >= 0) {
384 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); 402 ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
403 if (ret2) {
404 mlog_errno(ret2);
405 ret = ret2;
406 goto cleanup;
407 }
385 if (ret > 0) { 408 if (ret > 0) {
386 ret = ocfs2_set_acl(handle, inode, 409 ret = ocfs2_set_acl(handle, inode,
387 di_bh, ACL_TYPE_ACCESS, 410 di_bh, ACL_TYPE_ACCESS,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa75ca3f78da..1361997cf205 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1759,6 +1759,7 @@ static int o2net_accept_one(struct socket *sock)
1759 struct sockaddr_in sin; 1759 struct sockaddr_in sin;
1760 struct socket *new_sock = NULL; 1760 struct socket *new_sock = NULL;
1761 struct o2nm_node *node = NULL; 1761 struct o2nm_node *node = NULL;
1762 struct o2nm_node *local_node = NULL;
1762 struct o2net_sock_container *sc = NULL; 1763 struct o2net_sock_container *sc = NULL;
1763 struct o2net_node *nn; 1764 struct o2net_node *nn;
1764 1765
@@ -1796,11 +1797,15 @@ static int o2net_accept_one(struct socket *sock)
1796 goto out; 1797 goto out;
1797 } 1798 }
1798 1799
1799 if (o2nm_this_node() > node->nd_num) { 1800 if (o2nm_this_node() >= node->nd_num) {
1800 mlog(ML_NOTICE, "unexpected connect attempted from a lower " 1801 local_node = o2nm_get_node_by_num(o2nm_this_node());
1801 "numbered node '%s' at " "%pI4:%d with num %u\n", 1802 mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' ("
1802 node->nd_name, &sin.sin_addr.s_addr, 1803 "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n",
1803 ntohs(sin.sin_port), node->nd_num); 1804 local_node->nd_name, local_node->nd_num,
1805 &(local_node->nd_ipv4_address),
1806 ntohs(local_node->nd_ipv4_port),
1807 node->nd_name, node->nd_num, &sin.sin_addr.s_addr,
1808 ntohs(sin.sin_port));
1804 ret = -EINVAL; 1809 ret = -EINVAL;
1805 goto out; 1810 goto out;
1806 } 1811 }
@@ -1857,6 +1862,8 @@ out:
1857 sock_release(new_sock); 1862 sock_release(new_sock);
1858 if (node) 1863 if (node)
1859 o2nm_node_put(node); 1864 o2nm_node_put(node);
1865 if (local_node)
1866 o2nm_node_put(local_node);
1860 if (sc) 1867 if (sc)
1861 sc_put(sc); 1868 sc_put(sc);
1862 return ret; 1869 return ret;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 94b97fc6a88e..ffb4c68dafa4 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -511,8 +511,6 @@ static void dlm_lockres_release(struct kref *kref)
511 511
512 atomic_dec(&dlm->res_cur_count); 512 atomic_dec(&dlm->res_cur_count);
513 513
514 dlm_put(dlm);
515
516 if (!hlist_unhashed(&res->hash_node) || 514 if (!hlist_unhashed(&res->hash_node) ||
517 !list_empty(&res->granted) || 515 !list_empty(&res->granted) ||
518 !list_empty(&res->converting) || 516 !list_empty(&res->converting) ||
@@ -585,8 +583,6 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
585 res->migration_pending = 0; 583 res->migration_pending = 0;
586 res->inflight_locks = 0; 584 res->inflight_locks = 0;
587 585
588 /* put in dlm_lockres_release */
589 dlm_grab(dlm);
590 res->dlm = dlm; 586 res->dlm = dlm;
591 587
592 kref_init(&res->refs); 588 kref_init(&res->refs);
@@ -3050,8 +3046,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3050 /* check for pre-existing lock */ 3046 /* check for pre-existing lock */
3051 spin_lock(&dlm->spinlock); 3047 spin_lock(&dlm->spinlock);
3052 res = __dlm_lookup_lockres(dlm, name, namelen, hash); 3048 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
3053 spin_lock(&dlm->master_lock);
3054
3055 if (res) { 3049 if (res) {
3056 spin_lock(&res->spinlock); 3050 spin_lock(&res->spinlock);
3057 if (res->state & DLM_LOCK_RES_RECOVERING) { 3051 if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -3069,14 +3063,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3069 spin_unlock(&res->spinlock); 3063 spin_unlock(&res->spinlock);
3070 } 3064 }
3071 3065
3066 spin_lock(&dlm->master_lock);
3072 /* ignore status. only nonzero status would BUG. */ 3067 /* ignore status. only nonzero status would BUG. */
3073 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, 3068 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
3074 name, namelen, 3069 name, namelen,
3075 migrate->new_master, 3070 migrate->new_master,
3076 migrate->master); 3071 migrate->master);
3077 3072
3078unlock:
3079 spin_unlock(&dlm->master_lock); 3073 spin_unlock(&dlm->master_lock);
3074unlock:
3080 spin_unlock(&dlm->spinlock); 3075 spin_unlock(&dlm->spinlock);
3081 3076
3082 if (oldmle) { 3077 if (oldmle) {
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9dfaac73b36d..aaaffbcbe916 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1997,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1997 struct list_head *queue; 1997 struct list_head *queue;
1998 struct dlm_lock *lock, *next; 1998 struct dlm_lock *lock, *next;
1999 1999
2000 assert_spin_locked(&dlm->spinlock);
2001 assert_spin_locked(&res->spinlock);
2000 res->state |= DLM_LOCK_RES_RECOVERING; 2002 res->state |= DLM_LOCK_RES_RECOVERING;
2001 if (!list_empty(&res->recovering)) { 2003 if (!list_empty(&res->recovering)) {
2002 mlog(0, 2004 mlog(0,
@@ -2326,19 +2328,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2326 /* zero the lvb if necessary */ 2328 /* zero the lvb if necessary */
2327 dlm_revalidate_lvb(dlm, res, dead_node); 2329 dlm_revalidate_lvb(dlm, res, dead_node);
2328 if (res->owner == dead_node) { 2330 if (res->owner == dead_node) {
2329 if (res->state & DLM_LOCK_RES_DROPPING_REF) 2331 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2330 mlog(0, "%s:%.*s: owned by " 2332 mlog(ML_NOTICE, "Ignore %.*s for "
2331 "dead node %u, this node was " 2333 "recovery as it is being freed\n",
2332 "dropping its ref when it died. " 2334 res->lockname.len,
2333 "continue, dropping the flag.\n", 2335 res->lockname.name);
2334 dlm->name, res->lockname.len, 2336 } else
2335 res->lockname.name, dead_node); 2337 dlm_move_lockres_to_recovery_list(dlm,
2336 2338 res);
2337 /* the wake_up for this will happen when the
2338 * RECOVERING flag is dropped later */
2339 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
2340 2339
2341 dlm_move_lockres_to_recovery_list(dlm, res);
2342 } else if (res->owner == dlm->node_num) { 2340 } else if (res->owner == dlm->node_num) {
2343 dlm_free_dead_locks(dlm, res, dead_node); 2341 dlm_free_dead_locks(dlm, res, dead_node);
2344 __dlm_lockres_calc_usage(dlm, res); 2342 __dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d4f73ca68fe5..2211acf33d9b 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -92,19 +92,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
92 * truly ready to be freed. */ 92 * truly ready to be freed. */
93int __dlm_lockres_unused(struct dlm_lock_resource *res) 93int __dlm_lockres_unused(struct dlm_lock_resource *res)
94{ 94{
95 if (!__dlm_lockres_has_locks(res) && 95 int bit;
96 (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) { 96
97 /* try not to scan the bitmap unless the first two 97 if (__dlm_lockres_has_locks(res))
98 * conditions are already true */ 98 return 0;
99 int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 99
100 if (bit >= O2NM_MAX_NODES) { 100 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
101 /* since the bit for dlm->node_num is not 101 return 0;
102 * set, inflight_locks better be zero */ 102
103 BUG_ON(res->inflight_locks != 0); 103 if (res->state & DLM_LOCK_RES_RECOVERING)
104 return 1; 104 return 0;
105 } 105
106 } 106 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
107 return 0; 107 if (bit < O2NM_MAX_NODES)
108 return 0;
109
110 /*
111 * since the bit for dlm->node_num is not set, inflight_locks better
112 * be zero
113 */
114 BUG_ON(res->inflight_locks != 0);
115 return 1;
108} 116}
109 117
110 118
@@ -152,45 +160,25 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
152 spin_unlock(&dlm->spinlock); 160 spin_unlock(&dlm->spinlock);
153} 161}
154 162
155static int dlm_purge_lockres(struct dlm_ctxt *dlm, 163static void dlm_purge_lockres(struct dlm_ctxt *dlm,
156 struct dlm_lock_resource *res) 164 struct dlm_lock_resource *res)
157{ 165{
158 int master; 166 int master;
159 int ret = 0; 167 int ret = 0;
160 168
161 spin_lock(&res->spinlock); 169 assert_spin_locked(&dlm->spinlock);
162 if (!__dlm_lockres_unused(res)) { 170 assert_spin_locked(&res->spinlock);
163 mlog(0, "%s:%.*s: tried to purge but not unused\n",
164 dlm->name, res->lockname.len, res->lockname.name);
165 __dlm_print_one_lock_resource(res);
166 spin_unlock(&res->spinlock);
167 BUG();
168 }
169
170 if (res->state & DLM_LOCK_RES_MIGRATING) {
171 mlog(0, "%s:%.*s: Delay dropref as this lockres is "
172 "being remastered\n", dlm->name, res->lockname.len,
173 res->lockname.name);
174 /* Re-add the lockres to the end of the purge list */
175 if (!list_empty(&res->purge)) {
176 list_del_init(&res->purge);
177 list_add_tail(&res->purge, &dlm->purge_list);
178 }
179 spin_unlock(&res->spinlock);
180 return 0;
181 }
182 171
183 master = (res->owner == dlm->node_num); 172 master = (res->owner == dlm->node_num);
184 173
185 if (!master)
186 res->state |= DLM_LOCK_RES_DROPPING_REF;
187 spin_unlock(&res->spinlock);
188 174
189 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, 175 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
190 res->lockname.name, master); 176 res->lockname.name, master);
191 177
192 if (!master) { 178 if (!master) {
179 res->state |= DLM_LOCK_RES_DROPPING_REF;
193 /* drop spinlock... retake below */ 180 /* drop spinlock... retake below */
181 spin_unlock(&res->spinlock);
194 spin_unlock(&dlm->spinlock); 182 spin_unlock(&dlm->spinlock);
195 183
196 spin_lock(&res->spinlock); 184 spin_lock(&res->spinlock);
@@ -208,31 +196,35 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
208 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", 196 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
209 dlm->name, res->lockname.len, res->lockname.name, ret); 197 dlm->name, res->lockname.len, res->lockname.name, ret);
210 spin_lock(&dlm->spinlock); 198 spin_lock(&dlm->spinlock);
199 spin_lock(&res->spinlock);
211 } 200 }
212 201
213 spin_lock(&res->spinlock);
214 if (!list_empty(&res->purge)) { 202 if (!list_empty(&res->purge)) {
215 mlog(0, "removing lockres %.*s:%p from purgelist, " 203 mlog(0, "removing lockres %.*s:%p from purgelist, "
216 "master = %d\n", res->lockname.len, res->lockname.name, 204 "master = %d\n", res->lockname.len, res->lockname.name,
217 res, master); 205 res, master);
218 list_del_init(&res->purge); 206 list_del_init(&res->purge);
219 spin_unlock(&res->spinlock);
220 dlm_lockres_put(res); 207 dlm_lockres_put(res);
221 dlm->purge_count--; 208 dlm->purge_count--;
222 } else 209 }
223 spin_unlock(&res->spinlock); 210
211 if (!__dlm_lockres_unused(res)) {
212 mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
213 dlm->name, res->lockname.len, res->lockname.name);
214 __dlm_print_one_lock_resource(res);
215 BUG();
216 }
224 217
225 __dlm_unhash_lockres(res); 218 __dlm_unhash_lockres(res);
226 219
227 /* lockres is not in the hash now. drop the flag and wake up 220 /* lockres is not in the hash now. drop the flag and wake up
228 * any processes waiting in dlm_get_lock_resource. */ 221 * any processes waiting in dlm_get_lock_resource. */
229 if (!master) { 222 if (!master) {
230 spin_lock(&res->spinlock);
231 res->state &= ~DLM_LOCK_RES_DROPPING_REF; 223 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
232 spin_unlock(&res->spinlock); 224 spin_unlock(&res->spinlock);
233 wake_up(&res->wq); 225 wake_up(&res->wq);
234 } 226 } else
235 return 0; 227 spin_unlock(&res->spinlock);
236} 228}
237 229
238static void dlm_run_purge_list(struct dlm_ctxt *dlm, 230static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -251,17 +243,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
251 lockres = list_entry(dlm->purge_list.next, 243 lockres = list_entry(dlm->purge_list.next,
252 struct dlm_lock_resource, purge); 244 struct dlm_lock_resource, purge);
253 245
254 /* Status of the lockres *might* change so double
255 * check. If the lockres is unused, holding the dlm
256 * spinlock will prevent people from getting and more
257 * refs on it -- there's no need to keep the lockres
258 * spinlock. */
259 spin_lock(&lockres->spinlock); 246 spin_lock(&lockres->spinlock);
260 unused = __dlm_lockres_unused(lockres);
261 spin_unlock(&lockres->spinlock);
262
263 if (!unused)
264 continue;
265 247
266 purge_jiffies = lockres->last_used + 248 purge_jiffies = lockres->last_used +
267 msecs_to_jiffies(DLM_PURGE_INTERVAL_MS); 249 msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -273,15 +255,29 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
273 * in tail order, we can stop at the first 255 * in tail order, we can stop at the first
274 * unpurgable resource -- anyone added after 256 * unpurgable resource -- anyone added after
275 * him will have a greater last_used value */ 257 * him will have a greater last_used value */
258 spin_unlock(&lockres->spinlock);
276 break; 259 break;
277 } 260 }
278 261
262 /* Status of the lockres *might* change so double
263 * check. If the lockres is unused, holding the dlm
264 * spinlock will prevent people from getting and more
265 * refs on it. */
266 unused = __dlm_lockres_unused(lockres);
267 if (!unused ||
268 (lockres->state & DLM_LOCK_RES_MIGRATING)) {
269 mlog(0, "lockres %s:%.*s: is in use or "
270 "being remastered, used %d, state %d\n",
271 dlm->name, lockres->lockname.len,
272 lockres->lockname.name, !unused, lockres->state);
273 list_move_tail(&dlm->purge_list, &lockres->purge);
274 spin_unlock(&lockres->spinlock);
275 continue;
276 }
277
279 dlm_lockres_get(lockres); 278 dlm_lockres_get(lockres);
280 279
281 /* This may drop and reacquire the dlm spinlock if it 280 dlm_purge_lockres(dlm, lockres);
282 * has to do migration. */
283 if (dlm_purge_lockres(dlm, lockres))
284 BUG();
285 281
286 dlm_lockres_put(lockres); 282 dlm_lockres_put(lockres);
287 283
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3ac5aa733e9c..73a11ccfd4c2 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2436,16 +2436,26 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2436 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + 2436 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2437 le32_to_cpu(rec.r_clusters)) - cpos; 2437 le32_to_cpu(rec.r_clusters)) - cpos;
2438 /* 2438 /*
2439 * If the refcount rec already exist, cool. We just need
2440 * to check whether there is a split. Otherwise we just need
2441 * to increase the refcount.
2442 * If we will insert one, increases recs_add.
2443 *
2444 * We record all the records which will be inserted to the 2439 * We record all the records which will be inserted to the
2445 * same refcount block, so that we can tell exactly whether 2440 * same refcount block, so that we can tell exactly whether
2446 * we need a new refcount block or not. 2441 * we need a new refcount block or not.
2442 *
2443 * If we will insert a new one, this is easy and only happens
2444 * during adding refcounted flag to the extent, so we don't
2445 * have a chance of spliting. We just need one record.
2446 *
2447 * If the refcount rec already exists, that would be a little
2448 * complicated. we may have to:
2449 * 1) split at the beginning if the start pos isn't aligned.
2450 * we need 1 more record in this case.
2451 * 2) split int the end if the end pos isn't aligned.
2452 * we need 1 more record in this case.
2453 * 3) split in the middle because of file system fragmentation.
2454 * we need 2 more records in this case(we can't detect this
2455 * beforehand, so always think of the worst case).
2447 */ 2456 */
2448 if (rec.r_refcount) { 2457 if (rec.r_refcount) {
2458 recs_add += 2;
2449 /* Check whether we need a split at the beginning. */ 2459 /* Check whether we need a split at the beginning. */
2450 if (cpos == start_cpos && 2460 if (cpos == start_cpos &&
2451 cpos != le64_to_cpu(rec.r_cpos)) 2461 cpos != le64_to_cpu(rec.r_cpos))
diff --git a/fs/open.c b/fs/open.c
index b715d06fbe36..630715f9f73d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1031,7 +1031,9 @@ EXPORT_SYMBOL(generic_file_open);
1031 1031
1032/* 1032/*
1033 * This is used by subsystems that don't want seekable 1033 * This is used by subsystems that don't want seekable
1034 * file descriptors 1034 * file descriptors. The function is not supposed to ever fail, the only
1035 * reason it returns an 'int' and not 'void' is so that it can be plugged
1036 * directly into file_operations structure.
1035 */ 1037 */
1036int nonseekable_open(struct inode *inode, struct file *filp) 1038int nonseekable_open(struct inode *inode, struct file *filp)
1037{ 1039{
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index 6921e7890be6..fbeb697374d5 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -45,8 +45,11 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data,
45 nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) | 45 nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) |
46 (le32_to_cpu(dr->disc_size) >> 9); 46 (le32_to_cpu(dr->disc_size) >> 9);
47 47
48 if (name) 48 if (name) {
49 printk(" [%s]", name); 49 strlcat(state->pp_buf, " [", PAGE_SIZE);
50 strlcat(state->pp_buf, name, PAGE_SIZE);
51 strlcat(state->pp_buf, "]", PAGE_SIZE);
52 }
50 put_partition(state, slot, first_sector, nr_sects); 53 put_partition(state, slot, first_sector, nr_sects);
51 return dr; 54 return dr;
52} 55}
@@ -81,14 +84,14 @@ static int riscix_partition(struct parsed_partitions *state,
81 if (!rr) 84 if (!rr)
82 return -1; 85 return -1;
83 86
84 printk(" [RISCiX]"); 87 strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
85 88
86 89
87 if (rr->magic == RISCIX_MAGIC) { 90 if (rr->magic == RISCIX_MAGIC) {
88 unsigned long size = nr_sects > 2 ? 2 : nr_sects; 91 unsigned long size = nr_sects > 2 ? 2 : nr_sects;
89 int part; 92 int part;
90 93
91 printk(" <"); 94 strlcat(state->pp_buf, " <", PAGE_SIZE);
92 95
93 put_partition(state, slot++, first_sect, size); 96 put_partition(state, slot++, first_sect, size);
94 for (part = 0; part < 8; part++) { 97 for (part = 0; part < 8; part++) {
@@ -97,11 +100,13 @@ static int riscix_partition(struct parsed_partitions *state,
97 put_partition(state, slot++, 100 put_partition(state, slot++,
98 le32_to_cpu(rr->part[part].start), 101 le32_to_cpu(rr->part[part].start),
99 le32_to_cpu(rr->part[part].length)); 102 le32_to_cpu(rr->part[part].length));
100 printk("(%s)", rr->part[part].name); 103 strlcat(state->pp_buf, "(", PAGE_SIZE);
104 strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
105 strlcat(state->pp_buf, ")", PAGE_SIZE);
101 } 106 }
102 } 107 }
103 108
104 printk(" >\n"); 109 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
105 } else { 110 } else {
106 put_partition(state, slot++, first_sect, nr_sects); 111 put_partition(state, slot++, first_sect, nr_sects);
107 } 112 }
@@ -131,7 +136,7 @@ static int linux_partition(struct parsed_partitions *state,
131 struct linux_part *linuxp; 136 struct linux_part *linuxp;
132 unsigned long size = nr_sects > 2 ? 2 : nr_sects; 137 unsigned long size = nr_sects > 2 ? 2 : nr_sects;
133 138
134 printk(" [Linux]"); 139 strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
135 140
136 put_partition(state, slot++, first_sect, size); 141 put_partition(state, slot++, first_sect, size);
137 142
@@ -139,7 +144,7 @@ static int linux_partition(struct parsed_partitions *state,
139 if (!linuxp) 144 if (!linuxp)
140 return -1; 145 return -1;
141 146
142 printk(" <"); 147 strlcat(state->pp_buf, " <", PAGE_SIZE);
143 while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) || 148 while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
144 linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) { 149 linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
145 if (slot == state->limit) 150 if (slot == state->limit)
@@ -149,7 +154,7 @@ static int linux_partition(struct parsed_partitions *state,
149 le32_to_cpu(linuxp->nr_sects)); 154 le32_to_cpu(linuxp->nr_sects));
150 linuxp ++; 155 linuxp ++;
151 } 156 }
152 printk(" >"); 157 strlcat(state->pp_buf, " >", PAGE_SIZE);
153 158
154 put_dev_sector(sect); 159 put_dev_sector(sect);
155 return slot; 160 return slot;
@@ -294,7 +299,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state)
294 break; 299 break;
295 } 300 }
296 } 301 }
297 printk("\n"); 302 strlcat(state->pp_buf, "\n", PAGE_SIZE);
298 return 1; 303 return 1;
299} 304}
300#endif 305#endif
@@ -367,7 +372,7 @@ int adfspart_check_ICS(struct parsed_partitions *state)
367 return 0; 372 return 0;
368 } 373 }
369 374
370 printk(" [ICS]"); 375 strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
371 376
372 for (slot = 1, p = (const struct ics_part *)data; p->size; p++) { 377 for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
373 u32 start = le32_to_cpu(p->start); 378 u32 start = le32_to_cpu(p->start);
@@ -401,7 +406,7 @@ int adfspart_check_ICS(struct parsed_partitions *state)
401 } 406 }
402 407
403 put_dev_sector(sect); 408 put_dev_sector(sect);
404 printk("\n"); 409 strlcat(state->pp_buf, "\n", PAGE_SIZE);
405 return 1; 410 return 1;
406} 411}
407#endif 412#endif
@@ -461,7 +466,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state)
461 return 0; 466 return 0;
462 } 467 }
463 468
464 printk(" [POWERTEC]"); 469 strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
465 470
466 for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) { 471 for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
467 u32 start = le32_to_cpu(p->start); 472 u32 start = le32_to_cpu(p->start);
@@ -472,7 +477,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state)
472 } 477 }
473 478
474 put_dev_sector(sect); 479 put_dev_sector(sect);
475 printk("\n"); 480 strlcat(state->pp_buf, "\n", PAGE_SIZE);
476 return 1; 481 return 1;
477} 482}
478#endif 483#endif
@@ -543,7 +548,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state)
543 548
544 size = get_capacity(state->bdev->bd_disk); 549 size = get_capacity(state->bdev->bd_disk);
545 put_partition(state, slot++, start, size - start); 550 put_partition(state, slot++, start, size - start);
546 printk("\n"); 551 strlcat(state->pp_buf, "\n", PAGE_SIZE);
547 } 552 }
548 553
549 return i ? 1 : 0; 554 return i ? 1 : 0;
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index ba443d4229f8..70cbf44a1560 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -69,7 +69,13 @@ int amiga_partition(struct parsed_partitions *state)
69 /* blksize is blocks per 512 byte standard block */ 69 /* blksize is blocks per 512 byte standard block */
70 blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; 70 blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
71 71
72 printk(" RDSK (%d)", blksize * 512); /* Be more informative */ 72 {
73 char tmp[7 + 10 + 1 + 1];
74
75 /* Be more informative */
76 snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
77 strlcat(state->pp_buf, tmp, PAGE_SIZE);
78 }
73 blk = be32_to_cpu(rdb->rdb_PartitionList); 79 blk = be32_to_cpu(rdb->rdb_PartitionList);
74 put_dev_sector(sect); 80 put_dev_sector(sect);
75 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { 81 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
@@ -106,23 +112,27 @@ int amiga_partition(struct parsed_partitions *state)
106 { 112 {
107 /* Be even more informative to aid mounting */ 113 /* Be even more informative to aid mounting */
108 char dostype[4]; 114 char dostype[4];
115 char tmp[42];
116
109 __be32 *dt = (__be32 *)dostype; 117 __be32 *dt = (__be32 *)dostype;
110 *dt = pb->pb_Environment[16]; 118 *dt = pb->pb_Environment[16];
111 if (dostype[3] < ' ') 119 if (dostype[3] < ' ')
112 printk(" (%c%c%c^%c)", 120 snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
113 dostype[0], dostype[1], 121 dostype[0], dostype[1],
114 dostype[2], dostype[3] + '@' ); 122 dostype[2], dostype[3] + '@' );
115 else 123 else
116 printk(" (%c%c%c%c)", 124 snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
117 dostype[0], dostype[1], 125 dostype[0], dostype[1],
118 dostype[2], dostype[3]); 126 dostype[2], dostype[3]);
119 printk("(res %d spb %d)", 127 strlcat(state->pp_buf, tmp, PAGE_SIZE);
128 snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
120 be32_to_cpu(pb->pb_Environment[6]), 129 be32_to_cpu(pb->pb_Environment[6]),
121 be32_to_cpu(pb->pb_Environment[4])); 130 be32_to_cpu(pb->pb_Environment[4]));
131 strlcat(state->pp_buf, tmp, PAGE_SIZE);
122 } 132 }
123 res = 1; 133 res = 1;
124 } 134 }
125 printk("\n"); 135 strlcat(state->pp_buf, "\n", PAGE_SIZE);
126 136
127rdb_done: 137rdb_done:
128 return res; 138 return res;
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 4439ff1b6cec..9875b05e80a2 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -62,7 +62,7 @@ int atari_partition(struct parsed_partitions *state)
62 } 62 }
63 63
64 pi = &rs->part[0]; 64 pi = &rs->part[0];
65 printk (" AHDI"); 65 strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
66 for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { 66 for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
67 struct rootsector *xrs; 67 struct rootsector *xrs;
68 Sector sect2; 68 Sector sect2;
@@ -81,7 +81,7 @@ int atari_partition(struct parsed_partitions *state)
81#ifdef ICD_PARTS 81#ifdef ICD_PARTS
82 part_fmt = 1; 82 part_fmt = 1;
83#endif 83#endif
84 printk(" XGM<"); 84 strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
85 partsect = extensect = be32_to_cpu(pi->st); 85 partsect = extensect = be32_to_cpu(pi->st);
86 while (1) { 86 while (1) {
87 xrs = read_part_sector(state, partsect, &sect2); 87 xrs = read_part_sector(state, partsect, &sect2);
@@ -120,14 +120,14 @@ int atari_partition(struct parsed_partitions *state)
120 break; 120 break;
121 } 121 }
122 } 122 }
123 printk(" >"); 123 strlcat(state->pp_buf, " >", PAGE_SIZE);
124 } 124 }
125#ifdef ICD_PARTS 125#ifdef ICD_PARTS
126 if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ 126 if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
127 pi = &rs->icdpart[0]; 127 pi = &rs->icdpart[0];
128 /* sanity check: no ICD format if first partition invalid */ 128 /* sanity check: no ICD format if first partition invalid */
129 if (OK_id(pi->id)) { 129 if (OK_id(pi->id)) {
130 printk(" ICD<"); 130 strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
131 for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { 131 for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
132 /* accept only GEM,BGM,RAW,LNX,SWP partitions */ 132 /* accept only GEM,BGM,RAW,LNX,SWP partitions */
133 if (!((pi->flg & 1) && OK_id(pi->id))) 133 if (!((pi->flg & 1) && OK_id(pi->id)))
@@ -137,13 +137,13 @@ int atari_partition(struct parsed_partitions *state)
137 be32_to_cpu(pi->st), 137 be32_to_cpu(pi->st),
138 be32_to_cpu(pi->siz)); 138 be32_to_cpu(pi->siz));
139 } 139 }
140 printk(" >"); 140 strlcat(state->pp_buf, " >", PAGE_SIZE);
141 } 141 }
142 } 142 }
143#endif 143#endif
144 put_dev_sector(sect); 144 put_dev_sector(sect);
145 145
146 printk ("\n"); 146 strlcat(state->pp_buf, "\n", PAGE_SIZE);
147 147
148 return 1; 148 return 1;
149} 149}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 72c52656dc2e..79fbf3f390f0 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -164,10 +164,16 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
164 state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL); 164 state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
165 if (!state) 165 if (!state)
166 return NULL; 166 return NULL;
167 state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
168 if (!state->pp_buf) {
169 kfree(state);
170 return NULL;
171 }
172 state->pp_buf[0] = '\0';
167 173
168 state->bdev = bdev; 174 state->bdev = bdev;
169 disk_name(hd, 0, state->name); 175 disk_name(hd, 0, state->name);
170 printk(KERN_INFO " %s:", state->name); 176 snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
171 if (isdigit(state->name[strlen(state->name)-1])) 177 if (isdigit(state->name[strlen(state->name)-1]))
172 sprintf(state->name, "p"); 178 sprintf(state->name, "p");
173 179
@@ -185,17 +191,25 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
185 } 191 }
186 192
187 } 193 }
188 if (res > 0) 194 if (res > 0) {
195 printk(KERN_INFO "%s", state->pp_buf);
196
197 free_page((unsigned long)state->pp_buf);
189 return state; 198 return state;
199 }
190 if (state->access_beyond_eod) 200 if (state->access_beyond_eod)
191 err = -ENOSPC; 201 err = -ENOSPC;
192 if (err) 202 if (err)
193 /* The partition is unrecognized. So report I/O errors if there were any */ 203 /* The partition is unrecognized. So report I/O errors if there were any */
194 res = err; 204 res = err;
195 if (!res) 205 if (!res)
196 printk(" unknown partition table\n"); 206 strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE);
197 else if (warn_no_part) 207 else if (warn_no_part)
198 printk(" unable to read partition table\n"); 208 strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE);
209
210 printk(KERN_INFO "%s", state->pp_buf);
211
212 free_page((unsigned long)state->pp_buf);
199 kfree(state); 213 kfree(state);
200 return ERR_PTR(res); 214 return ERR_PTR(res);
201} 215}
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 52f8bd399396..8e4e103ba216 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -16,6 +16,7 @@ struct parsed_partitions {
16 int next; 16 int next;
17 int limit; 17 int limit;
18 bool access_beyond_eod; 18 bool access_beyond_eod;
19 char *pp_buf;
19}; 20};
20 21
21static inline void *read_part_sector(struct parsed_partitions *state, 22static inline void *read_part_sector(struct parsed_partitions *state,
@@ -32,9 +33,12 @@ static inline void
32put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) 33put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
33{ 34{
34 if (n < p->limit) { 35 if (n < p->limit) {
36 char tmp[1 + BDEVNAME_SIZE + 10 + 1];
37
35 p->parts[n].from = from; 38 p->parts[n].from = from;
36 p->parts[n].size = size; 39 p->parts[n].size = size;
37 printk(" %s%d", p->name, n); 40 snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
41 strlcat(p->pp_buf, tmp, PAGE_SIZE);
38 } 42 }
39} 43}
40 44
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 9efb2cfe2410..dbb44d4bb8a7 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -630,6 +630,6 @@ int efi_partition(struct parsed_partitions *state)
630 } 630 }
631 kfree(ptes); 631 kfree(ptes);
632 kfree(gpt); 632 kfree(gpt);
633 printk("\n"); 633 strlcat(state->pp_buf, "\n", PAGE_SIZE);
634 return 1; 634 return 1;
635} 635}
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc8497643fd0..d513a07f44bb 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -75,6 +75,7 @@ int ibm_partition(struct parsed_partitions *state)
75 unsigned char *data; 75 unsigned char *data;
76 Sector sect; 76 Sector sect;
77 sector_t labelsect; 77 sector_t labelsect;
78 char tmp[64];
78 79
79 res = 0; 80 res = 0;
80 blocksize = bdev_logical_block_size(bdev); 81 blocksize = bdev_logical_block_size(bdev);
@@ -144,13 +145,15 @@ int ibm_partition(struct parsed_partitions *state)
144 */ 145 */
145 blocksize = label->cms.block_size; 146 blocksize = label->cms.block_size;
146 if (label->cms.disk_offset != 0) { 147 if (label->cms.disk_offset != 0) {
147 printk("CMS1/%8s(MDSK):", name); 148 snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
149 strlcat(state->pp_buf, tmp, PAGE_SIZE);
148 /* disk is reserved minidisk */ 150 /* disk is reserved minidisk */
149 offset = label->cms.disk_offset; 151 offset = label->cms.disk_offset;
150 size = (label->cms.block_count - 1) 152 size = (label->cms.block_count - 1)
151 * (blocksize >> 9); 153 * (blocksize >> 9);
152 } else { 154 } else {
153 printk("CMS1/%8s:", name); 155 snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
156 strlcat(state->pp_buf, tmp, PAGE_SIZE);
154 offset = (info->label_block + 1); 157 offset = (info->label_block + 1);
155 size = label->cms.block_count 158 size = label->cms.block_count
156 * (blocksize >> 9); 159 * (blocksize >> 9);
@@ -159,7 +162,8 @@ int ibm_partition(struct parsed_partitions *state)
159 size-offset*(blocksize >> 9)); 162 size-offset*(blocksize >> 9));
160 } else { 163 } else {
161 if (strncmp(type, "LNX1", 4) == 0) { 164 if (strncmp(type, "LNX1", 4) == 0) {
162 printk("LNX1/%8s:", name); 165 snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
166 strlcat(state->pp_buf, tmp, PAGE_SIZE);
163 if (label->lnx.ldl_version == 0xf2) { 167 if (label->lnx.ldl_version == 0xf2) {
164 fmt_size = label->lnx.formatted_blocks 168 fmt_size = label->lnx.formatted_blocks
165 * (blocksize >> 9); 169 * (blocksize >> 9);
@@ -178,7 +182,7 @@ int ibm_partition(struct parsed_partitions *state)
178 offset = (info->label_block + 1); 182 offset = (info->label_block + 1);
179 } else { 183 } else {
180 /* unlabeled disk */ 184 /* unlabeled disk */
181 printk("(nonl)"); 185 strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
182 size = i_size >> 9; 186 size = i_size >> 9;
183 offset = (info->label_block + 1); 187 offset = (info->label_block + 1);
184 } 188 }
@@ -197,7 +201,8 @@ int ibm_partition(struct parsed_partitions *state)
197 * if not, something is wrong, skipping partition detection 201 * if not, something is wrong, skipping partition detection
198 */ 202 */
199 if (strncmp(type, "VOL1", 4) == 0) { 203 if (strncmp(type, "VOL1", 4) == 0) {
200 printk("VOL1/%8s:", name); 204 snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
205 strlcat(state->pp_buf, tmp, PAGE_SIZE);
201 /* 206 /*
202 * get block number and read then go through format1 207 * get block number and read then go through format1
203 * labels 208 * labels
@@ -253,7 +258,7 @@ int ibm_partition(struct parsed_partitions *state)
253 258
254 } 259 }
255 260
256 printk("\n"); 261 strlcat(state->pp_buf, "\n", PAGE_SIZE);
257 goto out_freeall; 262 goto out_freeall;
258 263
259 264
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 1cc928bb762f..0ea19312706b 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -50,7 +50,7 @@ int karma_partition(struct parsed_partitions *state)
50 } 50 }
51 slot++; 51 slot++;
52 } 52 }
53 printk("\n"); 53 strlcat(state->pp_buf, "\n", PAGE_SIZE);
54 put_dev_sector(sect); 54 put_dev_sector(sect);
55 return 1; 55 return 1;
56} 56}
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 648c9d8f3357..5bf8a04b5d9b 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -643,7 +643,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
643 return false; 643 return false;
644 } 644 }
645 645
646 printk (" [LDM]"); 646 strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE);
647 647
648 /* Create the data partitions */ 648 /* Create the data partitions */
649 list_for_each (item, &ldb->v_part) { 649 list_for_each (item, &ldb->v_part) {
@@ -658,7 +658,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
658 part_num++; 658 part_num++;
659 } 659 }
660 660
661 printk ("\n"); 661 strlcat(pp->pp_buf, "\n", PAGE_SIZE);
662 return true; 662 return true;
663} 663}
664 664
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index 74465ff7c263..68d6a216ee79 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -59,7 +59,7 @@ int mac_partition(struct parsed_partitions *state)
59 put_dev_sector(sect); 59 put_dev_sector(sect);
60 return 0; /* not a MacOS disk */ 60 return 0; /* not a MacOS disk */
61 } 61 }
62 printk(" [mac]"); 62 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
63 blocks_in_map = be32_to_cpu(part->map_count); 63 blocks_in_map = be32_to_cpu(part->map_count);
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 64 for (blk = 1; blk <= blocks_in_map; ++blk) {
65 int pos = blk * secsize; 65 int pos = blk * secsize;
@@ -128,6 +128,6 @@ int mac_partition(struct parsed_partitions *state)
128#endif 128#endif
129 129
130 put_dev_sector(sect); 130 put_dev_sector(sect);
131 printk("\n"); 131 strlcat(state->pp_buf, "\n", PAGE_SIZE);
132 return 1; 132 return 1;
133} 133}
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 15bfb7b1e044..5f79a6677c69 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -213,10 +213,18 @@ static void parse_solaris_x86(struct parsed_partitions *state,
213 put_dev_sector(sect); 213 put_dev_sector(sect);
214 return; 214 return;
215 } 215 }
216 printk(" %s%d: <solaris:", state->name, origin); 216 {
217 char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
218
219 snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
220 strlcat(state->pp_buf, tmp, PAGE_SIZE);
221 }
217 if (le32_to_cpu(v->v_version) != 1) { 222 if (le32_to_cpu(v->v_version) != 1) {
218 printk(" cannot handle version %d vtoc>\n", 223 char tmp[64];
219 le32_to_cpu(v->v_version)); 224
225 snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n",
226 le32_to_cpu(v->v_version));
227 strlcat(state->pp_buf, tmp, PAGE_SIZE);
220 put_dev_sector(sect); 228 put_dev_sector(sect);
221 return; 229 return;
222 } 230 }
@@ -224,9 +232,12 @@ static void parse_solaris_x86(struct parsed_partitions *state,
224 max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; 232 max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
225 for (i=0; i<max_nparts && state->next<state->limit; i++) { 233 for (i=0; i<max_nparts && state->next<state->limit; i++) {
226 struct solaris_x86_slice *s = &v->v_slice[i]; 234 struct solaris_x86_slice *s = &v->v_slice[i];
235 char tmp[3 + 10 + 1 + 1];
236
227 if (s->s_size == 0) 237 if (s->s_size == 0)
228 continue; 238 continue;
229 printk(" [s%d]", i); 239 snprintf(tmp, sizeof(tmp), " [s%d]", i);
240 strlcat(state->pp_buf, tmp, PAGE_SIZE);
230 /* solaris partitions are relative to current MS-DOS 241 /* solaris partitions are relative to current MS-DOS
231 * one; must add the offset of the current partition */ 242 * one; must add the offset of the current partition */
232 put_partition(state, state->next++, 243 put_partition(state, state->next++,
@@ -234,7 +245,7 @@ static void parse_solaris_x86(struct parsed_partitions *state,
234 le32_to_cpu(s->s_size)); 245 le32_to_cpu(s->s_size));
235 } 246 }
236 put_dev_sector(sect); 247 put_dev_sector(sect);
237 printk(" >\n"); 248 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
238#endif 249#endif
239} 250}
240 251
@@ -250,6 +261,7 @@ static void parse_bsd(struct parsed_partitions *state,
250 Sector sect; 261 Sector sect;
251 struct bsd_disklabel *l; 262 struct bsd_disklabel *l;
252 struct bsd_partition *p; 263 struct bsd_partition *p;
264 char tmp[64];
253 265
254 l = read_part_sector(state, offset + 1, &sect); 266 l = read_part_sector(state, offset + 1, &sect);
255 if (!l) 267 if (!l)
@@ -258,7 +270,9 @@ static void parse_bsd(struct parsed_partitions *state,
258 put_dev_sector(sect); 270 put_dev_sector(sect);
259 return; 271 return;
260 } 272 }
261 printk(" %s%d: <%s:", state->name, origin, flavour); 273
274 snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
275 strlcat(state->pp_buf, tmp, PAGE_SIZE);
262 276
263 if (le16_to_cpu(l->d_npartitions) < max_partitions) 277 if (le16_to_cpu(l->d_npartitions) < max_partitions)
264 max_partitions = le16_to_cpu(l->d_npartitions); 278 max_partitions = le16_to_cpu(l->d_npartitions);
@@ -275,16 +289,18 @@ static void parse_bsd(struct parsed_partitions *state,
275 /* full parent partition, we have it already */ 289 /* full parent partition, we have it already */
276 continue; 290 continue;
277 if (offset > bsd_start || offset+size < bsd_start+bsd_size) { 291 if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
278 printk("bad subpartition - ignored\n"); 292 strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
279 continue; 293 continue;
280 } 294 }
281 put_partition(state, state->next++, bsd_start, bsd_size); 295 put_partition(state, state->next++, bsd_start, bsd_size);
282 } 296 }
283 put_dev_sector(sect); 297 put_dev_sector(sect);
284 if (le16_to_cpu(l->d_npartitions) > max_partitions) 298 if (le16_to_cpu(l->d_npartitions) > max_partitions) {
285 printk(" (ignored %d more)", 299 snprintf(tmp, sizeof(tmp), " (ignored %d more)",
286 le16_to_cpu(l->d_npartitions) - max_partitions); 300 le16_to_cpu(l->d_npartitions) - max_partitions);
287 printk(" >\n"); 301 strlcat(state->pp_buf, tmp, PAGE_SIZE);
302 }
303 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
288} 304}
289#endif 305#endif
290 306
@@ -333,7 +349,12 @@ static void parse_unixware(struct parsed_partitions *state,
333 put_dev_sector(sect); 349 put_dev_sector(sect);
334 return; 350 return;
335 } 351 }
336 printk(" %s%d: <unixware:", state->name, origin); 352 {
353 char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
354
355 snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
356 strlcat(state->pp_buf, tmp, PAGE_SIZE);
357 }
337 p = &l->vtoc.v_slice[1]; 358 p = &l->vtoc.v_slice[1];
338 /* I omit the 0th slice as it is the same as whole disk. */ 359 /* I omit the 0th slice as it is the same as whole disk. */
339 while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { 360 while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
@@ -347,7 +368,7 @@ static void parse_unixware(struct parsed_partitions *state,
347 p++; 368 p++;
348 } 369 }
349 put_dev_sector(sect); 370 put_dev_sector(sect);
350 printk(" >\n"); 371 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
351#endif 372#endif
352} 373}
353 374
@@ -376,8 +397,10 @@ static void parse_minix(struct parsed_partitions *state,
376 * the normal boot sector. */ 397 * the normal boot sector. */
377 if (msdos_magic_present (data + 510) && 398 if (msdos_magic_present (data + 510) &&
378 SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ 399 SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
400 char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
379 401
380 printk(" %s%d: <minix:", state->name, origin); 402 snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
403 strlcat(state->pp_buf, tmp, PAGE_SIZE);
381 for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { 404 for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
382 if (state->next == state->limit) 405 if (state->next == state->limit)
383 break; 406 break;
@@ -386,7 +409,7 @@ static void parse_minix(struct parsed_partitions *state,
386 put_partition(state, state->next++, 409 put_partition(state, state->next++,
387 start_sect(p), nr_sects(p)); 410 start_sect(p), nr_sects(p));
388 } 411 }
389 printk(" >\n"); 412 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
390 } 413 }
391 put_dev_sector(sect); 414 put_dev_sector(sect);
392#endif /* CONFIG_MINIX_SUBPARTITION */ 415#endif /* CONFIG_MINIX_SUBPARTITION */
@@ -425,7 +448,7 @@ int msdos_partition(struct parsed_partitions *state)
425 448
426 if (aix_magic_present(state, data)) { 449 if (aix_magic_present(state, data)) {
427 put_dev_sector(sect); 450 put_dev_sector(sect);
428 printk( " [AIX]"); 451 strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
429 return 0; 452 return 0;
430 } 453 }
431 454
@@ -446,7 +469,7 @@ int msdos_partition(struct parsed_partitions *state)
446 fb = (struct fat_boot_sector *) data; 469 fb = (struct fat_boot_sector *) data;
447 if (slot == 1 && fb->reserved && fb->fats 470 if (slot == 1 && fb->reserved && fb->fats
448 && fat_valid_media(fb->media)) { 471 && fat_valid_media(fb->media)) {
449 printk("\n"); 472 strlcat(state->pp_buf, "\n", PAGE_SIZE);
450 put_dev_sector(sect); 473 put_dev_sector(sect);
451 return 1; 474 return 1;
452 } else { 475 } else {
@@ -491,21 +514,21 @@ int msdos_partition(struct parsed_partitions *state)
491 n = min(size, max(sector_size, n)); 514 n = min(size, max(sector_size, n));
492 put_partition(state, slot, start, n); 515 put_partition(state, slot, start, n);
493 516
494 printk(" <"); 517 strlcat(state->pp_buf, " <", PAGE_SIZE);
495 parse_extended(state, start, size); 518 parse_extended(state, start, size);
496 printk(" >"); 519 strlcat(state->pp_buf, " >", PAGE_SIZE);
497 continue; 520 continue;
498 } 521 }
499 put_partition(state, slot, start, size); 522 put_partition(state, slot, start, size);
500 if (SYS_IND(p) == LINUX_RAID_PARTITION) 523 if (SYS_IND(p) == LINUX_RAID_PARTITION)
501 state->parts[slot].flags = ADDPART_FLAG_RAID; 524 state->parts[slot].flags = ADDPART_FLAG_RAID;
502 if (SYS_IND(p) == DM6_PARTITION) 525 if (SYS_IND(p) == DM6_PARTITION)
503 printk("[DM]"); 526 strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
504 if (SYS_IND(p) == EZD_PARTITION) 527 if (SYS_IND(p) == EZD_PARTITION)
505 printk("[EZD]"); 528 strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
506 } 529 }
507 530
508 printk("\n"); 531 strlcat(state->pp_buf, "\n", PAGE_SIZE);
509 532
510 /* second pass - output for each on a separate line */ 533 /* second pass - output for each on a separate line */
511 p = (struct partition *) (0x1be + data); 534 p = (struct partition *) (0x1be + data);
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index fc22b85d436a..48cec7cbca17 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -72,7 +72,7 @@ int osf_partition(struct parsed_partitions *state)
72 le32_to_cpu(partition->p_size)); 72 le32_to_cpu(partition->p_size));
73 slot++; 73 slot++;
74 } 74 }
75 printk("\n"); 75 strlcat(state->pp_buf, "\n", PAGE_SIZE);
76 put_dev_sector(sect); 76 put_dev_sector(sect);
77 return 1; 77 return 1;
78} 78}
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index 43b1df9aa16c..ea8a86dceaf4 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -76,7 +76,7 @@ int sgi_partition(struct parsed_partitions *state)
76 } 76 }
77 slot++; 77 slot++;
78 } 78 }
79 printk("\n"); 79 strlcat(state->pp_buf, "\n", PAGE_SIZE);
80 put_dev_sector(sect); 80 put_dev_sector(sect);
81 return 1; 81 return 1;
82} 82}
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index a32660e25f7f..b5b6fcfb3d36 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -116,7 +116,7 @@ int sun_partition(struct parsed_partitions *state)
116 } 116 }
117 slot++; 117 slot++;
118 } 118 }
119 printk("\n"); 119 strlcat(state->pp_buf, "\n", PAGE_SIZE);
120 put_dev_sector(sect); 120 put_dev_sector(sect);
121 return 1; 121 return 1;
122} 122}
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 9030c864428e..9627ccffc1c4 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -54,6 +54,7 @@ int sysv68_partition(struct parsed_partitions *state)
54 unsigned char *data; 54 unsigned char *data;
55 struct dkblk0 *b; 55 struct dkblk0 *b;
56 struct slice *slice; 56 struct slice *slice;
57 char tmp[64];
57 58
58 data = read_part_sector(state, 0, &sect); 59 data = read_part_sector(state, 0, &sect);
59 if (!data) 60 if (!data)
@@ -73,7 +74,8 @@ int sysv68_partition(struct parsed_partitions *state)
73 return -1; 74 return -1;
74 75
75 slices -= 1; /* last slice is the whole disk */ 76 slices -= 1; /* last slice is the whole disk */
76 printk("sysV68: %s(s%u)", state->name, slices); 77 snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
78 strlcat(state->pp_buf, tmp, PAGE_SIZE);
77 slice = (struct slice *)data; 79 slice = (struct slice *)data;
78 for (i = 0; i < slices; i++, slice++) { 80 for (i = 0; i < slices; i++, slice++) {
79 if (slot == state->limit) 81 if (slot == state->limit)
@@ -82,11 +84,12 @@ int sysv68_partition(struct parsed_partitions *state)
82 put_partition(state, slot, 84 put_partition(state, slot,
83 be32_to_cpu(slice->blkoff), 85 be32_to_cpu(slice->blkoff),
84 be32_to_cpu(slice->nblocks)); 86 be32_to_cpu(slice->nblocks));
85 printk("(s%u)", i); 87 snprintf(tmp, sizeof(tmp), "(s%u)", i);
88 strlcat(state->pp_buf, tmp, PAGE_SIZE);
86 } 89 }
87 slot++; 90 slot++;
88 } 91 }
89 printk("\n"); 92 strlcat(state->pp_buf, "\n", PAGE_SIZE);
90 put_dev_sector(sect); 93 put_dev_sector(sect);
91 return 1; 94 return 1;
92} 95}
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index db9eef260364..8dbaf9f77a99 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -39,7 +39,7 @@ int ultrix_partition(struct parsed_partitions *state)
39 label->pt_part[i].pi_blkoff, 39 label->pt_part[i].pi_blkoff,
40 label->pt_part[i].pi_nblocks); 40 label->pt_part[i].pi_nblocks);
41 put_dev_sector(sect); 41 put_dev_sector(sect);
42 printk ("\n"); 42 strlcat(state->pp_buf, "\n", PAGE_SIZE);
43 return 1; 43 return 1;
44 } else { 44 } else {
45 put_dev_sector(sect); 45 put_dev_sector(sect);
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 11a7b5c68153..2758e2afc518 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the Linux proc filesystem routines. 2# Makefile for the Linux proc filesystem routines.
3# 3#
4 4
5obj-$(CONFIG_PROC_FS) += proc.o 5obj-y += proc.o
6 6
7proc-y := nommu.o task_nommu.o 7proc-y := nommu.o task_nommu.o
8proc-$(CONFIG_MMU) := mmu.o task_mmu.o 8proc-$(CONFIG_MMU) := mmu.o task_mmu.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c806dfb24e08..a1c43e7c8a7b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -149,18 +149,13 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
149 return count; 149 return count;
150} 150}
151 151
152static int get_fs_path(struct task_struct *task, struct path *path, bool root) 152static int get_task_root(struct task_struct *task, struct path *root)
153{ 153{
154 struct fs_struct *fs;
155 int result = -ENOENT; 154 int result = -ENOENT;
156 155
157 task_lock(task); 156 task_lock(task);
158 fs = task->fs; 157 if (task->fs) {
159 if (fs) { 158 get_fs_root(task->fs, root);
160 read_lock(&fs->lock);
161 *path = root ? fs->root : fs->pwd;
162 path_get(path);
163 read_unlock(&fs->lock);
164 result = 0; 159 result = 0;
165 } 160 }
166 task_unlock(task); 161 task_unlock(task);
@@ -173,7 +168,12 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
173 int result = -ENOENT; 168 int result = -ENOENT;
174 169
175 if (task) { 170 if (task) {
176 result = get_fs_path(task, path, 0); 171 task_lock(task);
172 if (task->fs) {
173 get_fs_pwd(task->fs, path);
174 result = 0;
175 }
176 task_unlock(task);
177 put_task_struct(task); 177 put_task_struct(task);
178 } 178 }
179 return result; 179 return result;
@@ -185,7 +185,7 @@ static int proc_root_link(struct inode *inode, struct path *path)
185 int result = -ENOENT; 185 int result = -ENOENT;
186 186
187 if (task) { 187 if (task) {
188 result = get_fs_path(task, path, 1); 188 result = get_task_root(task, path);
189 put_task_struct(task); 189 put_task_struct(task);
190 } 190 }
191 return result; 191 return result;
@@ -597,7 +597,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
597 get_mnt_ns(ns); 597 get_mnt_ns(ns);
598 } 598 }
599 rcu_read_unlock(); 599 rcu_read_unlock();
600 if (ns && get_fs_path(task, &root, 1) == 0) 600 if (ns && get_task_root(task, &root) == 0)
601 ret = 0; 601 ret = 0;
602 put_task_struct(task); 602 put_task_struct(task);
603 } 603 }
@@ -1526,7 +1526,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1526 if (!tmp) 1526 if (!tmp)
1527 return -ENOMEM; 1527 return -ENOMEM;
1528 1528
1529 pathname = d_path(path, tmp, PAGE_SIZE); 1529 pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE);
1530 len = PTR_ERR(pathname); 1530 len = PTR_ERR(pathname);
1531 if (IS_ERR(pathname)) 1531 if (IS_ERR(pathname))
1532 goto out; 1532 goto out;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 19fbc810e8e7..1ec952b1f036 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -983,7 +983,6 @@ static int flush_older_commits(struct super_block *s,
983 983
984static int reiserfs_async_progress_wait(struct super_block *s) 984static int reiserfs_async_progress_wait(struct super_block *s)
985{ 985{
986 DEFINE_WAIT(wait);
987 struct reiserfs_journal *j = SB_JOURNAL(s); 986 struct reiserfs_journal *j = SB_JOURNAL(s);
988 987
989 if (atomic_read(&j->j_async_throttle)) { 988 if (atomic_read(&j->j_async_throttle)) {
diff --git a/fs/signalfd.c b/fs/signalfd.c
index f329849ce3c0..1c5a6add779d 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -88,6 +88,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
88 err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid); 88 err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
89 err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun); 89 err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
90 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); 90 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
91 err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
91 break; 92 break;
92 case __SI_POLL: 93 case __SI_POLL:
93 err |= __put_user(kinfo->si_band, &uinfo->ssi_band); 94 err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
@@ -111,6 +112,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
111 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); 112 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
112 err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); 113 err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
113 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); 114 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
115 err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
114 break; 116 break;
115 default: 117 default:
116 /* 118 /*
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index cc6ce8a84c21..e5f63da64d04 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -5,13 +5,13 @@ config SQUASHFS
5 help 5 help
6 Saying Y here includes support for SquashFS 4.0 (a Compressed 6 Saying Y here includes support for SquashFS 4.0 (a Compressed
7 Read-Only File System). Squashfs is a highly compressed read-only 7 Read-Only File System). Squashfs is a highly compressed read-only
8 filesystem for Linux. It uses zlib compression to compress both 8 filesystem for Linux. It uses zlib/lzo compression to compress both
9 files, inodes and directories. Inodes in the system are very small 9 files, inodes and directories. Inodes in the system are very small
10 and all blocks are packed to minimise data overhead. Block sizes 10 and all blocks are packed to minimise data overhead. Block sizes
11 greater than 4K are supported up to a maximum of 1 Mbytes (default 11 greater than 4K are supported up to a maximum of 1 Mbytes (default
12 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files 12 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files
13 (larger than 4GB), full uid/gid information, hard links and 13 (larger than 4GB), full uid/gid information, hard links and
14 timestamps. 14 timestamps.
15 15
16 Squashfs is intended for general read-only filesystem use, for 16 Squashfs is intended for general read-only filesystem use, for
17 archival use (i.e. in cases where a .tar.gz file may be used), and in 17 archival use (i.e. in cases where a .tar.gz file may be used), and in
@@ -26,7 +26,7 @@ config SQUASHFS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config SQUASHFS_XATTRS 29config SQUASHFS_XATTR
30 bool "Squashfs XATTR support" 30 bool "Squashfs XATTR support"
31 depends on SQUASHFS 31 depends on SQUASHFS
32 default n 32 default n
@@ -37,9 +37,24 @@ config SQUASHFS_XATTRS
37 37
38 If unsure, say N. 38 If unsure, say N.
39 39
40config SQUASHFS_EMBEDDED 40config SQUASHFS_LZO
41 bool "Include support for LZO compressed file systems"
42 depends on SQUASHFS
43 default n
44 select LZO_DECOMPRESS
45 help
46 Saying Y here includes support for reading Squashfs file systems
47 compressed with LZO compression. LZO compression is mainly
48 aimed at embedded systems with slower CPUs where the overheads
49 of zlib are too high.
41 50
42 bool "Additional option for memory-constrained systems" 51 LZO is not the standard compression used in Squashfs and so most
52 file systems will be readable without selecting this option.
53
54 If unsure, say N.
55
56config SQUASHFS_EMBEDDED
57 bool "Additional option for memory-constrained systems"
43 depends on SQUASHFS 58 depends on SQUASHFS
44 default n 59 default n
45 help 60 help
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 2cee3e9fa452..7672bac8d328 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,5 +5,5 @@
5obj-$(CONFIG_SQUASHFS) += squashfs.o 5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o 6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
8squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o 8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
9 9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 157478da6ac9..24af9ce9722f 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -40,9 +40,11 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
40 NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 40 NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
41}; 41};
42 42
43#ifndef CONFIG_SQUASHFS_LZO
43static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { 44static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
44 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 45 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
45}; 46};
47#endif
46 48
47static const struct squashfs_decompressor squashfs_unknown_comp_ops = { 49static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
48 NULL, NULL, NULL, 0, "unknown", 0 50 NULL, NULL, NULL, 0, "unknown", 0
@@ -51,7 +53,11 @@ static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
51static const struct squashfs_decompressor *decompressor[] = { 53static const struct squashfs_decompressor *decompressor[] = {
52 &squashfs_zlib_comp_ops, 54 &squashfs_zlib_comp_ops,
53 &squashfs_lzma_unsupported_comp_ops, 55 &squashfs_lzma_unsupported_comp_ops,
56#ifdef CONFIG_SQUASHFS_LZO
57 &squashfs_lzo_comp_ops,
58#else
54 &squashfs_lzo_unsupported_comp_ops, 59 &squashfs_lzo_unsupported_comp_ops,
60#endif
55 &squashfs_unknown_comp_ops 61 &squashfs_unknown_comp_ops
56}; 62};
57 63
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
new file mode 100644
index 000000000000..5d87789bf1c1
--- /dev/null
+++ b/fs/squashfs/lzo_wrapper.c
@@ -0,0 +1,136 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010 LG Electronics
5 * Chan Jeong <chan.jeong@lge.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * lzo_wrapper.c
22 */
23
24#include <linux/mutex.h>
25#include <linux/buffer_head.h>
26#include <linux/slab.h>
27#include <linux/vmalloc.h>
28#include <linux/lzo.h>
29
30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h"
34#include "decompressor.h"
35
36struct squashfs_lzo {
37 void *input;
38 void *output;
39};
40
41static void *lzo_init(struct squashfs_sb_info *msblk)
42{
43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
44
45 struct squashfs_lzo *stream = kzalloc(sizeof(*stream), GFP_KERNEL);
46 if (stream == NULL)
47 goto failed;
48 stream->input = vmalloc(block_size);
49 if (stream->input == NULL)
50 goto failed;
51 stream->output = vmalloc(block_size);
52 if (stream->output == NULL)
53 goto failed2;
54
55 return stream;
56
57failed2:
58 vfree(stream->input);
59failed:
60 ERROR("Failed to allocate lzo workspace\n");
61 kfree(stream);
62 return NULL;
63}
64
65
66static void lzo_free(void *strm)
67{
68 struct squashfs_lzo *stream = strm;
69
70 if (stream) {
71 vfree(stream->input);
72 vfree(stream->output);
73 }
74 kfree(stream);
75}
76
77
78static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer,
79 struct buffer_head **bh, int b, int offset, int length, int srclength,
80 int pages)
81{
82 struct squashfs_lzo *stream = msblk->stream;
83 void *buff = stream->input;
84 int avail, i, bytes = length, res;
85 size_t out_len = srclength;
86
87 mutex_lock(&msblk->read_data_mutex);
88
89 for (i = 0; i < b; i++) {
90 wait_on_buffer(bh[i]);
91 if (!buffer_uptodate(bh[i]))
92 goto block_release;
93
94 avail = min(bytes, msblk->devblksize - offset);
95 memcpy(buff, bh[i]->b_data + offset, avail);
96 buff += avail;
97 bytes -= avail;
98 offset = 0;
99 put_bh(bh[i]);
100 }
101
102 res = lzo1x_decompress_safe(stream->input, (size_t)length,
103 stream->output, &out_len);
104 if (res != LZO_E_OK)
105 goto failed;
106
107 res = bytes = (int)out_len;
108 for (i = 0, buff = stream->output; bytes && i < pages; i++) {
109 avail = min_t(int, bytes, PAGE_CACHE_SIZE);
110 memcpy(buffer[i], buff, avail);
111 buff += avail;
112 bytes -= avail;
113 }
114
115 mutex_unlock(&msblk->read_data_mutex);
116 return res;
117
118block_release:
119 for (; i < b; i++)
120 put_bh(bh[i]);
121
122failed:
123 mutex_unlock(&msblk->read_data_mutex);
124
125 ERROR("lzo decompression failed, data probably corrupt\n");
126 return -EIO;
127}
128
129const struct squashfs_decompressor squashfs_lzo_comp_ops = {
130 .init = lzo_init,
131 .free = lzo_free,
132 .decompress = lzo_uncompress,
133 .id = LZO_COMPRESSION,
134 .name = "lzo",
135 .supported = 1
136};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 733a17c42945..5d45569d5f72 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -104,3 +104,6 @@ extern const struct xattr_handler *squashfs_xattr_handlers[];
104 104
105/* zlib_wrapper.c */ 105/* zlib_wrapper.c */
106extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 106extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
107
108/* lzo_wrapper.c */
109extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 8eabb808b78d..c5137fc9ab11 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -274,7 +274,7 @@ struct squashfs_base_inode {
274 __le16 uid; 274 __le16 uid;
275 __le16 guid; 275 __le16 guid;
276 __le32 mtime; 276 __le32 mtime;
277 __le32 inode_number; 277 __le32 inode_number;
278}; 278};
279 279
280struct squashfs_ipc_inode { 280struct squashfs_ipc_inode {
@@ -283,7 +283,7 @@ struct squashfs_ipc_inode {
283 __le16 uid; 283 __le16 uid;
284 __le16 guid; 284 __le16 guid;
285 __le32 mtime; 285 __le32 mtime;
286 __le32 inode_number; 286 __le32 inode_number;
287 __le32 nlink; 287 __le32 nlink;
288}; 288};
289 289
@@ -293,7 +293,7 @@ struct squashfs_lipc_inode {
293 __le16 uid; 293 __le16 uid;
294 __le16 guid; 294 __le16 guid;
295 __le32 mtime; 295 __le32 mtime;
296 __le32 inode_number; 296 __le32 inode_number;
297 __le32 nlink; 297 __le32 nlink;
298 __le32 xattr; 298 __le32 xattr;
299}; 299};
@@ -304,7 +304,7 @@ struct squashfs_dev_inode {
304 __le16 uid; 304 __le16 uid;
305 __le16 guid; 305 __le16 guid;
306 __le32 mtime; 306 __le32 mtime;
307 __le32 inode_number; 307 __le32 inode_number;
308 __le32 nlink; 308 __le32 nlink;
309 __le32 rdev; 309 __le32 rdev;
310}; 310};
@@ -315,7 +315,7 @@ struct squashfs_ldev_inode {
315 __le16 uid; 315 __le16 uid;
316 __le16 guid; 316 __le16 guid;
317 __le32 mtime; 317 __le32 mtime;
318 __le32 inode_number; 318 __le32 inode_number;
319 __le32 nlink; 319 __le32 nlink;
320 __le32 rdev; 320 __le32 rdev;
321 __le32 xattr; 321 __le32 xattr;
@@ -327,7 +327,7 @@ struct squashfs_symlink_inode {
327 __le16 uid; 327 __le16 uid;
328 __le16 guid; 328 __le16 guid;
329 __le32 mtime; 329 __le32 mtime;
330 __le32 inode_number; 330 __le32 inode_number;
331 __le32 nlink; 331 __le32 nlink;
332 __le32 symlink_size; 332 __le32 symlink_size;
333 char symlink[0]; 333 char symlink[0];
@@ -339,7 +339,7 @@ struct squashfs_reg_inode {
339 __le16 uid; 339 __le16 uid;
340 __le16 guid; 340 __le16 guid;
341 __le32 mtime; 341 __le32 mtime;
342 __le32 inode_number; 342 __le32 inode_number;
343 __le32 start_block; 343 __le32 start_block;
344 __le32 fragment; 344 __le32 fragment;
345 __le32 offset; 345 __le32 offset;
@@ -353,7 +353,7 @@ struct squashfs_lreg_inode {
353 __le16 uid; 353 __le16 uid;
354 __le16 guid; 354 __le16 guid;
355 __le32 mtime; 355 __le32 mtime;
356 __le32 inode_number; 356 __le32 inode_number;
357 __le64 start_block; 357 __le64 start_block;
358 __le64 file_size; 358 __le64 file_size;
359 __le64 sparse; 359 __le64 sparse;
@@ -370,7 +370,7 @@ struct squashfs_dir_inode {
370 __le16 uid; 370 __le16 uid;
371 __le16 guid; 371 __le16 guid;
372 __le32 mtime; 372 __le32 mtime;
373 __le32 inode_number; 373 __le32 inode_number;
374 __le32 start_block; 374 __le32 start_block;
375 __le32 nlink; 375 __le32 nlink;
376 __le16 file_size; 376 __le16 file_size;
@@ -384,7 +384,7 @@ struct squashfs_ldir_inode {
384 __le16 uid; 384 __le16 uid;
385 __le16 guid; 385 __le16 guid;
386 __le32 mtime; 386 __le32 mtime;
387 __le32 inode_number; 387 __le32 inode_number;
388 __le32 nlink; 388 __le32 nlink;
389 __le32 file_size; 389 __le32 file_size;
390 __le32 start_block; 390 __le32 start_block;
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index c7655e8b31cd..652b8541f9c6 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -18,7 +18,7 @@
18 * along with this program; if not, write to the Free Software 18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 * 20 *
21 * xattr_id.c 21 * xattr.c
22 */ 22 */
23 23
24#include <linux/init.h> 24#include <linux/init.h>
@@ -295,7 +295,7 @@ static const struct xattr_handler squashfs_xattr_security_handler = {
295 .get = squashfs_security_get 295 .get = squashfs_security_get
296}; 296};
297 297
298static inline const struct xattr_handler *squashfs_xattr_handler(int type) 298static const struct xattr_handler *squashfs_xattr_handler(int type)
299{ 299{
300 if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL)) 300 if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
301 /* ignore unrecognised type */ 301 /* ignore unrecognised type */
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index 9da071ae181c..49fe0d719fbf 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -21,7 +21,7 @@
21 * xattr.h 21 * xattr.h
22 */ 22 */
23 23
24#ifdef CONFIG_SQUASHFS_XATTRS 24#ifdef CONFIG_SQUASHFS_XATTR
25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, 25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
26 u64 *, int *); 26 u64 *, int *);
27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, 27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 0e44a6253352..a0b0cda6927e 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -434,12 +434,46 @@ Ebadsize:
434 goto failed; 434 goto failed;
435} 435}
436 436
437static int v7_fill_super(struct super_block *sb, void *data, int silent) 437static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh)
438{ 438{
439 struct sysv_sb_info *sbi;
440 struct buffer_head *bh, *bh2 = NULL;
441 struct v7_super_block *v7sb; 439 struct v7_super_block *v7sb;
442 struct sysv_inode *v7i; 440 struct sysv_inode *v7i;
441 struct buffer_head *bh2;
442 struct sysv_sb_info *sbi;
443
444 sbi = sb->s_fs_info;
445
446 /* plausibility check on superblock */
447 v7sb = (struct v7_super_block *) bh->b_data;
448 if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
449 fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
450 fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE)
451 return 0;
452
453 /* plausibility check on root inode: it is a directory,
454 with a nonzero size that is a multiple of 16 */
455 bh2 = sb_bread(sb, 2);
456 if (bh2 == NULL)
457 return 0;
458
459 v7i = (struct sysv_inode *)(bh2->b_data + 64);
460 if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
461 (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
462 (fs32_to_cpu(sbi, v7i->i_size) & 017) ||
463 (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES *
464 sizeof(struct sysv_dir_entry))) {
465 brelse(bh2);
466 return 0;
467 }
468
469 brelse(bh2);
470 return 1;
471}
472
473static int v7_fill_super(struct super_block *sb, void *data, int silent)
474{
475 struct sysv_sb_info *sbi;
476 struct buffer_head *bh;
443 477
444 if (440 != sizeof (struct v7_super_block)) 478 if (440 != sizeof (struct v7_super_block))
445 panic("V7 FS: bad super-block size"); 479 panic("V7 FS: bad super-block size");
@@ -453,7 +487,6 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
453 sbi->s_sb = sb; 487 sbi->s_sb = sb;
454 sbi->s_block_base = 0; 488 sbi->s_block_base = 0;
455 sbi->s_type = FSTYPE_V7; 489 sbi->s_type = FSTYPE_V7;
456 sbi->s_bytesex = BYTESEX_PDP;
457 sb->s_fs_info = sbi; 490 sb->s_fs_info = sbi;
458 491
459 sb_set_blocksize(sb, 512); 492 sb_set_blocksize(sb, 512);
@@ -465,32 +498,27 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
465 goto failed; 498 goto failed;
466 } 499 }
467 500
468 /* plausibility check on superblock */ 501 /* Try PDP-11 UNIX */
469 v7sb = (struct v7_super_block *) bh->b_data; 502 sbi->s_bytesex = BYTESEX_PDP;
470 if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE || 503 if (v7_sanity_check(sb, bh))
471 fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD || 504 goto detected;
472 fs32_to_cpu(sbi, v7sb->s_time) == 0)
473 goto failed;
474 505
475 /* plausibility check on root inode: it is a directory, 506 /* Try PC/IX, v7/x86 */
476 with a nonzero size that is a multiple of 16 */ 507 sbi->s_bytesex = BYTESEX_LE;
477 if ((bh2 = sb_bread(sb, 2)) == NULL) 508 if (v7_sanity_check(sb, bh))
478 goto failed; 509 goto detected;
479 v7i = (struct sysv_inode *)(bh2->b_data + 64);
480 if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
481 (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
482 (fs32_to_cpu(sbi, v7i->i_size) & 017) != 0)
483 goto failed;
484 brelse(bh2);
485 bh2 = NULL;
486 510
511 goto failed;
512
513detected:
487 sbi->s_bh1 = bh; 514 sbi->s_bh1 = bh;
488 sbi->s_bh2 = bh; 515 sbi->s_bh2 = bh;
489 if (complete_read_super(sb, silent, 1)) 516 if (complete_read_super(sb, silent, 1))
490 return 0; 517 return 0;
491 518
492failed: 519failed:
493 brelse(bh2); 520 printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n",
521 sb->s_id);
494 brelse(bh); 522 brelse(bh);
495 kfree(sbi); 523 kfree(sbi);
496 return -EINVAL; 524 return -EINVAL;
@@ -559,4 +587,5 @@ static void __exit exit_sysv_fs(void)
559 587
560module_init(init_sysv_fs) 588module_init(init_sysv_fs)
561module_exit(exit_sysv_fs) 589module_exit(exit_sysv_fs)
590MODULE_ALIAS("v7");
562MODULE_LICENSE("GPL"); 591MODULE_LICENSE("GPL");