Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/mux.c | 2
-rw-r--r--  fs/9p/vfs_inode.c | 12
-rw-r--r--  fs/9p/vfs_super.c | 7
-rw-r--r--  fs/Kconfig | 25
-rw-r--r--  fs/affs/super.c | 12
-rw-r--r--  fs/afs/cell.c | 3
-rw-r--r--  fs/afs/kafsasyncd.c | 9
-rw-r--r--  fs/afs/mntpt.c | 2
-rw-r--r--  fs/afs/server.c | 6
-rw-r--r--  fs/afs/super.c | 2
-rw-r--r--  fs/afs/super.h | 2
-rw-r--r--  fs/afs/vlocation.c | 6
-rw-r--r--  fs/afs/vnode.c | 3
-rw-r--r--  fs/aio.c | 2
-rw-r--r--  fs/autofs4/expire.c | 9
-rw-r--r--  fs/binfmt_flat.c | 2
-rw-r--r--  fs/binfmt_misc.c | 3
-rw-r--r--  fs/buffer.c | 3
-rw-r--r--  fs/cifs/cifsfs.c | 6
-rw-r--r--  fs/coda/psdev.c | 2
-rw-r--r--  fs/coda/upcall.c | 2
-rw-r--r--  fs/compat.c | 16
-rw-r--r--  fs/compat_ioctl.c | 34
-rw-r--r--  fs/configfs/dir.c | 6
-rw-r--r--  fs/configfs/mount.c | 2
-rw-r--r--  fs/dcache.c | 7
-rw-r--r--  fs/debugfs/inode.c | 2
-rw-r--r--  fs/dquot.c | 4
-rw-r--r--  fs/eventpoll.c | 17
-rw-r--r--  fs/exec.c | 147
-rw-r--r--  fs/ext2/Makefile | 2
-rw-r--r--  fs/ext2/balloc.c | 22
-rw-r--r--  fs/ext2/bitmap.c | 32
-rw-r--r--  fs/ext2/dir.c | 3
-rw-r--r--  fs/ext2/fsync.c | 2
-rw-r--r--  fs/ext2/ialloc.c | 3
-rw-r--r--  fs/ext2/super.c | 3
-rw-r--r--  fs/ext3/balloc.c | 242
-rw-r--r--  fs/ext3/ialloc.c | 10
-rw-r--r--  fs/ext3/inode.c | 57
-rw-r--r--  fs/ext3/ioctl.c | 2
-rw-r--r--  fs/ext3/namei.c | 4
-rw-r--r--  fs/ext3/resize.c | 81
-rw-r--r--  fs/ext3/super.c | 52
-rw-r--r--  fs/ext3/xattr.c | 27
-rw-r--r--  fs/freevxfs/vxfs.h | 4
-rw-r--r--  fs/freevxfs/vxfs_fshead.c | 12
-rw-r--r--  fs/fuse/Makefile | 2
-rw-r--r--  fs/fuse/control.c | 218
-rw-r--r--  fs/fuse/dev.c | 418
-rw-r--r--  fs/fuse/dir.c | 56
-rw-r--r--  fs/fuse/file.c | 206
-rw-r--r--  fs/fuse/fuse_i.h | 135
-rw-r--r--  fs/fuse/inode.c | 183
-rw-r--r--  fs/jbd/journal.c | 3
-rw-r--r--  fs/jbd/recovery.c | 1
-rw-r--r--  fs/jffs2/erase.c | 15
-rw-r--r--  fs/jffs2/nodemgmt.c | 3
-rw-r--r--  fs/jffs2/summary.c | 2
-rw-r--r--  fs/jffs2/wbuf.c | 3
-rw-r--r--  fs/jfs/jfs_extent.c | 8
-rw-r--r--  fs/libfs.c | 14
-rw-r--r--  fs/lockd/clntlock.c | 39
-rw-r--r--  fs/lockd/clntproc.c | 14
-rw-r--r--  fs/lockd/host.c | 9
-rw-r--r--  fs/namei.c | 6
-rw-r--r--  fs/namespace.c | 134
-rw-r--r--  fs/nfs/Makefile | 8
-rw-r--r--  fs/nfs/callback.c | 2
-rw-r--r--  fs/nfs/callback_xdr.c | 2
-rw-r--r--  fs/nfs/dir.c | 18
-rw-r--r--  fs/nfs/direct.c | 4
-rw-r--r--  fs/nfs/file.c | 30
-rw-r--r--  fs/nfs/idmap.c | 1
-rw-r--r--  fs/nfs/inode.c | 1313
-rw-r--r--  fs/nfs/internal.h | 186
-rw-r--r--  fs/nfs/namespace.c | 229
-rw-r--r--  fs/nfs/nfs2xdr.c | 6
-rw-r--r--  fs/nfs/nfs3acl.c | 11
-rw-r--r--  fs/nfs/nfs3proc.c | 5
-rw-r--r--  fs/nfs/nfs3xdr.c | 6
-rw-r--r--  fs/nfs/nfs4_fs.h | 4
-rw-r--r--  fs/nfs/nfs4namespace.c | 201
-rw-r--r--  fs/nfs/nfs4proc.c | 111
-rw-r--r--  fs/nfs/nfs4xdr.c | 218
-rw-r--r--  fs/nfs/pagelist.c | 49
-rw-r--r--  fs/nfs/proc.c | 5
-rw-r--r--  fs/nfs/read.c | 120
-rw-r--r--  fs/nfs/super.c | 1537
-rw-r--r--  fs/nfs/symlink.c | 13
-rw-r--r--  fs/nfs/sysctl.c | 10
-rw-r--r--  fs/nfs/write.c | 47
-rw-r--r--  fs/nfsd/nfs4state.c | 5
-rw-r--r--  fs/nfsd/nfscache.c | 3
-rw-r--r--  fs/ntfs/file.c | 26
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 15
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 63
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 33
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.h | 30
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 103
-rw-r--r--  fs/ocfs2/dlm/dlmfs.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 73
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 448
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 593
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 74
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 13
-rw-r--r--  fs/ocfs2/dlm/userdlm.c | 2
-rw-r--r--  fs/ocfs2/dlmglue.c | 2
-rw-r--r--  fs/ocfs2/journal.c | 5
-rw-r--r--  fs/ocfs2/vote.c | 8
-rw-r--r--  fs/open.c | 2
-rw-r--r--  fs/openpromfs/inode.c | 1154
-rw-r--r--  fs/pnode.c | 9
-rw-r--r--  fs/proc/base.c | 1086
-rw-r--r--  fs/proc/inode.c | 11
-rw-r--r--  fs/proc/internal.h | 22
-rw-r--r--  fs/proc/task_mmu.c | 140
-rw-r--r--  fs/proc/task_nommu.c | 21
-rw-r--r--  fs/reiserfs/file.c | 8
-rw-r--r--  fs/reiserfs/journal.c | 6
-rw-r--r--  fs/select.c | 7
-rw-r--r--  fs/smbfs/request.c | 6
-rw-r--r--  fs/smbfs/smbiod.c | 29
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/sysfs/dir.c | 10
-rw-r--r--  fs/ufs/balloc.c | 448
-rw-r--r--  fs/ufs/cylinder.c | 49
-rw-r--r--  fs/ufs/dir.c | 1000
-rw-r--r--  fs/ufs/file.c | 21
-rw-r--r--  fs/ufs/ialloc.c | 63
-rw-r--r--  fs/ufs/inode.c | 370
-rw-r--r--  fs/ufs/namei.c | 84
-rw-r--r--  fs/ufs/super.c | 429
-rw-r--r--  fs/ufs/truncate.c | 104
-rw-r--r--  fs/ufs/util.c | 48
-rw-r--r--  fs/ufs/util.h | 107
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 3
-rw-r--r--  fs/xfs/xfs_mount.c | 18
141 files changed, 7771 insertions(+), 5516 deletions(-)
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index f4407eb276c7..12e1baa4508d 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -712,7 +712,7 @@ static void v9fs_read_work(void *a)
  * v9fs_send_request - send 9P request
  * The function can sleep until the request is scheduled for sending.
  * The function can be interrupted. Return from the function is not
- * a guarantee that the request is sent succesfully. Can return errors
+ * a guarantee that the request is sent successfully. Can return errors
  * that can be retrieved by PTR_ERR macros.
  *
  * @m: mux data
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 2cb87ba4b1c1..5c6bdf82146c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -530,9 +530,6 @@ error:
         if (vfid)
                 v9fs_fid_destroy(vfid);
 
-        if (inode)
-                iput(inode);
-
         return err;
 }
 
@@ -1054,6 +1051,9 @@ static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
         int ret;
         char *link = __getname();
 
+        if (unlikely(!link))
+                return -ENOMEM;
+
         if (buflen > PATH_MAX)
                 buflen = PATH_MAX;
 
@@ -1171,9 +1171,6 @@ error:
         if (vfid)
                 v9fs_fid_destroy(vfid);
 
-        if (inode)
-                iput(inode);
-
         return err;
 
 }
@@ -1227,6 +1224,9 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
         }
 
         name = __getname();
+        if (unlikely(!name))
+                return -ENOMEM;
+
         sprintf(name, "%d\n", oldfid->fid);
         retval = v9fs_vfs_mkspecial(dir, dentry, V9FS_DMLINK, name);
         __putname(name);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 872943004e59..8b15bb22caca 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -256,11 +256,12 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
 }
 
 static void
-v9fs_umount_begin(struct super_block *sb)
+v9fs_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-        struct v9fs_session_info *v9ses = sb->s_fs_info;
+        struct v9fs_session_info *v9ses = vfsmnt->mnt_sb->s_fs_info;
 
-        v9fs_session_cancel(v9ses);
+        if (flags & MNT_FORCE)
+                v9fs_session_cancel(v9ses);
 }
 
 static struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index 467f7ae5f092..00aa3d5c5a83 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -776,7 +776,8 @@ endmenu
 menu "Pseudo filesystems"
 
 config PROC_FS
-        bool "/proc file system support"
+        bool "/proc file system support" if EMBEDDED
+        default y
         help
           This is a virtual file system providing information about the status
           of the system. "Virtual" means that it doesn't take up any space on
@@ -1370,11 +1371,19 @@ config UFS_FS
 
 config UFS_FS_WRITE
         bool "UFS file system write support (DANGEROUS)"
-        depends on UFS_FS && EXPERIMENTAL && BROKEN
+        depends on UFS_FS && EXPERIMENTAL
         help
           Say Y here if you want to try writing to UFS partitions. This is
           experimental, so you should back up your UFS partitions beforehand.
 
+config UFS_DEBUG
+        bool "UFS debugging"
+        depends on UFS_FS
+        help
+          If you are experiencing any problems with the UFS filesystem, say
+          Y here. This will result in _many_ additional debugging messages to be
+          written to the system log.
+
 endmenu
 
 menu "Network File Systems"
@@ -1481,7 +1490,12 @@ config NFSD
         select LOCKD
         select SUNRPC
         select EXPORTFS
-        select NFS_ACL_SUPPORT if NFSD_V3_ACL || NFSD_V2_ACL
+        select NFSD_V2_ACL if NFSD_V3_ACL
+        select NFS_ACL_SUPPORT if NFSD_V2_ACL
+        select NFSD_TCP if NFSD_V4
+        select CRYPTO_MD5 if NFSD_V4
+        select CRYPTO if NFSD_V4
+        select FS_POSIX_ACL if NFSD_V4
         help
           If you want your Linux box to act as an NFS *server*, so that other
           computers on your local network which support NFS can access certain
@@ -1519,7 +1533,6 @@ config NFSD_V3
 config NFSD_V3_ACL
         bool "Provide server support for the NFSv3 ACL protocol extension"
         depends on NFSD_V3
-        select NFSD_V2_ACL
         help
           Implement the NFSv3 ACL protocol extension for manipulating POSIX
           Access Control Lists on exported file systems. NFS clients should
@@ -1529,10 +1542,6 @@ config NFSD_V3_ACL
 config NFSD_V4
         bool "Provide NFSv4 server support (EXPERIMENTAL)"
         depends on NFSD_V3 && EXPERIMENTAL
-        select NFSD_TCP
-        select CRYPTO_MD5
-        select CRYPTO
-        select FS_POSIX_ACL
         help
           If you would like to include the NFSv4 server as well as the NFSv2
           and NFSv3 servers, say Y here. This feature is experimental, and
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 8765cba35bb9..5200f4938df0 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -271,6 +271,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
         int reserved;
         unsigned long mount_flags;
         int tmp_flags; /* fix remount prototype... */
+        u8 sig[4];
 
         pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options");
 
@@ -370,8 +371,9 @@ got_root:
                 printk(KERN_ERR "AFFS: Cannot read boot block\n");
                 goto out_error;
         }
-        chksum = be32_to_cpu(*(__be32 *)boot_bh->b_data);
+        memcpy(sig, boot_bh->b_data, 4);
         brelse(boot_bh);
+        chksum = be32_to_cpu(*(__be32 *)sig);
 
         /* Dircache filesystems are compatible with non-dircache ones
          * when reading. As long as they aren't supported, writing is
@@ -420,11 +422,11 @@ got_root:
         }
 
         if (mount_flags & SF_VERBOSE) {
-                chksum = cpu_to_be32(chksum);
-                printk(KERN_NOTICE "AFFS: Mounting volume \"%*s\": Type=%.3s\\%c, Blocksize=%d\n",
-                        AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0],
+                u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
+                printk(KERN_NOTICE "AFFS: Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
+                        len > 31 ? 31 : len,
                         AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1,
-                        (char *)&chksum,((char *)&chksum)[3] + '0',blocksize);
+                        sig, sig[3] + '0', blocksize);
         }
 
         sb->s_flags |= MS_NODEV | MS_NOSUID;
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 009a9ae88d61..bfc1fd22d5b1 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -413,8 +413,7 @@ int afs_server_find_by_peer(const struct rxrpc_peer *peer,
 
         /* we found it in the graveyard - resurrect it */
  found_dead_server:
-        list_del(&server->link);
-        list_add_tail(&server->link, &cell->sv_list);
+        list_move_tail(&server->link, &cell->sv_list);
         afs_get_server(server);
         afs_kafstimod_del_timer(&server->timeout);
         spin_unlock(&cell->sv_gylock);
diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c
index 7ac07d0d47b9..f09a794f248e 100644
--- a/fs/afs/kafsasyncd.c
+++ b/fs/afs/kafsasyncd.c
@@ -136,8 +136,7 @@ static int kafsasyncd(void *arg)
                 if (!list_empty(&kafsasyncd_async_attnq)) {
                         op = list_entry(kafsasyncd_async_attnq.next,
                                         struct afs_async_op, link);
-                        list_del(&op->link);
-                        list_add_tail(&op->link,
+                        list_move_tail(&op->link,
                                       &kafsasyncd_async_busyq);
                 }
 
@@ -204,8 +203,7 @@ void afs_kafsasyncd_begin_op(struct afs_async_op *op)
         init_waitqueue_entry(&op->waiter, kafsasyncd_task);
         add_wait_queue(&op->call->waitq, &op->waiter);
 
-        list_del(&op->link);
-        list_add_tail(&op->link, &kafsasyncd_async_busyq);
+        list_move_tail(&op->link, &kafsasyncd_async_busyq);
 
         spin_unlock(&kafsasyncd_async_lock);
 
@@ -223,8 +221,7 @@ void afs_kafsasyncd_attend_op(struct afs_async_op *op)
 
         spin_lock(&kafsasyncd_async_lock);
 
-        list_del(&op->link);
-        list_add_tail(&op->link, &kafsasyncd_async_attnq);
+        list_move_tail(&op->link, &kafsasyncd_async_attnq);
 
         spin_unlock(&kafsasyncd_async_lock);
 
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index b5cf9e1205ad..99785a79d043 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -203,7 +203,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 
         /* try and do the mount */
         kdebug("--- attempting mount %s -o %s ---", devname, options);
-        mnt = do_kern_mount("afs", 0, devname, options);
+        mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
         kdebug("--- mount result %p ---", mnt);
 
         free_page((unsigned long) devname);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 62b093aa41c6..22afaae1a4ce 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -123,8 +123,7 @@ int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr,
  resurrect_server:
         _debug("resurrecting server");
 
-        list_del(&zombie->link);
-        list_add_tail(&zombie->link, &cell->sv_list);
+        list_move_tail(&zombie->link, &cell->sv_list);
         afs_get_server(zombie);
         afs_kafstimod_del_timer(&zombie->timeout);
         spin_unlock(&cell->sv_gylock);
@@ -168,8 +167,7 @@ void afs_put_server(struct afs_server *server)
         }
 
         spin_lock(&cell->sv_gylock);
-        list_del(&server->link);
-        list_add_tail(&server->link, &cell->sv_graveyard);
+        list_move_tail(&server->link, &cell->sv_graveyard);
 
         /* time out in 10 secs */
         afs_kafstimod_add_timer(&server->timeout, 10 * HZ);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 82468df0ba54..67d1f5c819ec 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -48,7 +48,7 @@ static void afs_put_super(struct super_block *sb);
 
 static void afs_destroy_inode(struct inode *inode);
 
-static struct file_system_type afs_fs_type = {
+struct file_system_type afs_fs_type = {
         .owner = THIS_MODULE,
         .name = "afs",
         .get_sb = afs_get_sb,
diff --git a/fs/afs/super.h b/fs/afs/super.h
index ac11362f4e95..32de8cc6fae8 100644
--- a/fs/afs/super.h
+++ b/fs/afs/super.h
@@ -38,6 +38,8 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
         return sb->s_fs_info;
 }
 
+extern struct file_system_type afs_fs_type;
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_AFS_SUPER_H */
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index eced20618ecc..331f730a1fb3 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -326,8 +326,7 @@ int afs_vlocation_lookup(struct afs_cell *cell,
         /* found in the graveyard - resurrect */
         _debug("found in graveyard");
         atomic_inc(&vlocation->usage);
-        list_del(&vlocation->link);
-        list_add_tail(&vlocation->link, &cell->vl_list);
+        list_move_tail(&vlocation->link, &cell->vl_list);
         spin_unlock(&cell->vl_gylock);
 
         afs_kafstimod_del_timer(&vlocation->timeout);
@@ -478,8 +477,7 @@ static void __afs_put_vlocation(struct afs_vlocation *vlocation)
         }
 
         /* move to graveyard queue */
-        list_del(&vlocation->link);
-        list_add_tail(&vlocation->link,&cell->vl_graveyard);
+        list_move_tail(&vlocation->link,&cell->vl_graveyard);
 
         /* remove from pending timeout queue (refcounted if actually being
          * updated) */
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 9867fef3261d..cf62da5d7825 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -104,8 +104,7 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
                            vnode->cb_expiry * HZ);
 
         spin_lock(&afs_cb_hash_lock);
-        list_del(&vnode->cb_hash_link);
-        list_add_tail(&vnode->cb_hash_link,
+        list_move_tail(&vnode->cb_hash_link,
                       &afs_cb_hash(server, &vnode->fid));
         spin_unlock(&afs_cb_hash_lock);
 
diff --git a/fs/aio.c b/fs/aio.c
index 8c34a62df7d7..950630187acc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -641,7 +641,7 @@ static inline int __queue_kicked_iocb(struct kiocb *iocb)
  * invoked both for initial i/o submission and
  * subsequent retries via the aio_kick_handler.
  * Expects to be invoked with iocb->ki_ctx->lock
- * already held. The lock is released and reaquired
+ * already held. The lock is released and reacquired
  * as needed during processing.
  *
  * Calls the iocb retry method (already setup for the
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index b8ce02607d66..8dbd44f10e9d 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -174,6 +174,12 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
                 struct autofs_info *ino = autofs4_dentry_ino(p);
                 unsigned int ino_count = atomic_read(&ino->count);
 
+                /*
+                 * Clean stale dentries below that have not been
+                 * invalidated after a mount fail during lookup
+                 */
+                d_invalidate(p);
+
                 /* allow for dget above and top is already dgot */
                 if (p == top)
                         ino_count += 2;
@@ -370,8 +376,7 @@ next:
                 DPRINTK("returning %p %.*s",
                         expired, (int)expired->d_name.len, expired->d_name.name);
                 spin_lock(&dcache_lock);
-                list_del(&expired->d_parent->d_subdirs);
-                list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
+                list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
                 spin_unlock(&dcache_lock);
                 return expired;
         }
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b1c902e319c1..c94d52eafd1b 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -510,7 +510,7 @@ static int load_flat_file(struct linux_binprm * bprm,
         }
 
         /* OK, This is the point of no return */
-        set_personality(PER_LINUX);
+        set_personality(PER_LINUX_32BIT);
         }
 
         /*
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 07a4996cca3f..34ebbc191e46 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -55,6 +55,7 @@ typedef struct {
 } Node;
 
 static DEFINE_RWLOCK(entries_lock);
+static struct file_system_type bm_fs_type;
 static struct vfsmount *bm_mnt;
 static int entry_count;
 
@@ -637,7 +638,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
         if (!inode)
                 goto out2;
 
-        err = simple_pin_fs("binfmt_misc", &bm_mnt, &entry_count);
+        err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
         if (err) {
                 iput(inode);
                 inode = NULL;
diff --git a/fs/buffer.c b/fs/buffer.c
index 373bb6292bdc..f23bb647db47 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -564,7 +564,7 @@ still_busy:
  * Completion handler for block_write_full_page() - pages which are unlocked
  * during I/O, and which have PageWriteback cleared upon I/O completion.
  */
-void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
         char b[BDEVNAME_SIZE];
         unsigned long flags;
@@ -3166,7 +3166,6 @@ EXPORT_SYMBOL(block_sync_page);
 EXPORT_SYMBOL(block_truncate_page);
 EXPORT_SYMBOL(block_write_full_page);
 EXPORT_SYMBOL(cont_prepare_write);
-EXPORT_SYMBOL(end_buffer_async_write);
 EXPORT_SYMBOL(end_buffer_read_sync);
 EXPORT_SYMBOL(end_buffer_write_sync);
 EXPORT_SYMBOL(file_fsync);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f2e285457bee..c28ede599946 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -403,12 +403,14 @@ static struct quotactl_ops cifs_quotactl_ops = {
 #endif
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-static void cifs_umount_begin(struct super_block * sblock)
+static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 {
         struct cifs_sb_info *cifs_sb;
         struct cifsTconInfo * tcon;
 
-        cifs_sb = CIFS_SB(sblock);
+        if (!(flags & MNT_FORCE))
+                return;
+        cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
         if(cifs_sb == NULL)
                 return;
 
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 6c6771db36da..7caee8d8ea3b 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -259,7 +259,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
         /* If request was not a signal, enqueue and don't free */
         if (!(req->uc_flags & REQ_ASYNC)) {
                 req->uc_flags |= REQ_READ;
-                list_add(&(req->uc_chain), vcp->vc_processing.prev);
+                list_add_tail(&(req->uc_chain), &vcp->vc_processing);
                 goto out;
         }
 
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index b040eba13a7d..a5b5e631ba61 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -725,7 +725,7 @@ static int coda_upcall(struct coda_sb_info *sbi,
         ((union inputArgs *)buffer)->ih.unique = req->uc_unique;
 
         /* Append msg to pending queue and poke Venus. */
-        list_add(&(req->uc_chain), vcommp->vc_pending.prev);
+        list_add_tail(&(req->uc_chain), &vcommp->vc_pending);
 
         wake_up_interruptible(&vcommp->vc_waitq);
         /* We can be interrupted while we wait for Venus to process
diff --git a/fs/compat.c b/fs/compat.c
index 7e7e5bc4f3cf..e31e9cf96647 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -55,6 +55,20 @@
 
 extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
 
+int compat_log = 1;
+
+int compat_printk(const char *fmt, ...)
+{
+        va_list ap;
+        int ret;
+        if (!compat_log)
+                return 0;
+        va_start(ap, fmt);
+        ret = vprintk(fmt, ap);
+        va_end(ap);
+        return ret;
+}
+
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
@@ -359,7 +373,7 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
         sprintf(buf,"'%c'", (cmd>>24) & 0x3f);
         if (!isprint(buf[1]))
                 sprintf(buf, "%02x", buf[1]);
-        printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
+        compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
                         "cmd(%08x){%s} arg(%08x) on %s\n",
                         current->comm, current->pid,
                         (int)fd, (unsigned int)cmd, buf,
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d2c38875ab29..d8ecfedef189 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -80,6 +80,7 @@
 #include <net/bluetooth/rfcomm.h>
 
 #include <linux/capi.h>
+#include <linux/gigaset_dev.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_ioctl.h>
@@ -205,38 +206,6 @@ static int do_ext3_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
         return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
 }
 
-struct compat_dmx_event {
-        dmx_event_t event;
-        compat_time_t timeStamp;
-        union
-        {
-                dmx_scrambling_status_t scrambling;
-        } u;
-};
-
-static int do_dmx_get_event(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct dmx_event kevent;
-        mm_segment_t old_fs = get_fs();
-        int err;
-
-        set_fs(KERNEL_DS);
-        err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
-        set_fs(old_fs);
-
-        if (!err) {
-                struct compat_dmx_event __user *up = compat_ptr(arg);
-
-                err = put_user(kevent.event, &up->event);
-                err |= put_user(kevent.timeStamp, &up->timeStamp);
-                err |= put_user(kevent.u.scrambling, &up->u.scrambling);
-                if (err)
-                        err = -EFAULT;
-        }
-
-        return err;
-}
-
 struct compat_video_event {
         int32_t type;
         compat_time_t timestamp;
@@ -2964,7 +2933,6 @@ HANDLE_IOCTL(NCP_IOC_SETPRIVATEDATA_32, do_ncp_setprivatedata)
 #endif
 
 /* dvb */
-HANDLE_IOCTL(DMX_GET_EVENT, do_dmx_get_event)
 HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event)
 HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture)
 HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5f952187fc53..207f8006fd6c 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1009,8 +1009,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
                         /* fallthrough */
                 default:
                         if (filp->f_pos == 2) {
-                                list_del(q);
-                                list_add(q, &parent_sd->s_children);
+                                list_move(q, &parent_sd->s_children);
                         }
                         for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
                                 struct configfs_dirent *next;
@@ -1033,8 +1032,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
                                          dt_type(next)) < 0)
                                         return 0;
 
-                                list_del(q);
-                                list_add(q, p);
+                                list_move(q, p);
                                 p = q;
                                 filp->f_pos++;
                         }
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 94dab7bdd851..3e5fe843e1df 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -118,7 +118,7 @@ static struct file_system_type configfs_fs_type = {
 
 int configfs_pin_fs(void)
 {
-        return simple_pin_fs("configfs", &configfs_mount,
+        return simple_pin_fs(&configfs_fs_type, &configfs_mount,
                              &configfs_mnt_count);
 }
 
diff --git a/fs/dcache.c b/fs/dcache.c
index 313b54b2b8f2..48b44a714b35 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -406,7 +406,7 @@ static void prune_dcache(int count, struct super_block *sb)
                 cond_resched_lock(&dcache_lock);
 
                 tmp = dentry_unused.prev;
-                if (unlikely(sb)) {
+                if (sb) {
                         /* Try to find a dentry for this sb, but don't try
                          * too hard, if they aren't near the tail they will
                          * be moved down again soon
@@ -522,8 +522,7 @@ void shrink_dcache_sb(struct super_block * sb)
                 dentry = list_entry(tmp, struct dentry, d_lru);
                 if (dentry->d_sb != sb)
                         continue;
-                list_del(tmp);
-                list_add(tmp, &dentry_unused);
+                list_move(tmp, &dentry_unused);
         }
 
         /*
@@ -638,7 +637,7 @@ resume:
                  * of the unused list for prune_dcache
                  */
                 if (!atomic_read(&dentry->d_count)) {
-                        list_add(&dentry->d_lru, dentry_unused.prev);
+                        list_add_tail(&dentry->d_lru, &dentry_unused);
                         dentry_stat.nr_unused++;
                         found++;
                 }
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 440128ebef3b..6fa1e04f8415 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -199,7 +199,7 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
 
         pr_debug("debugfs: creating file '%s'\n",name);
 
-        error = simple_pin_fs("debugfs", &debugfs_mount, &debugfs_mount_count);
+        error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count);
         if (error)
                 goto exit;
 
diff --git a/fs/dquot.c b/fs/dquot.c
index 81d87a413c68..0122a279106a 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -250,7 +250,7 @@ static inline struct dquot *find_dquot(unsigned int hashent, struct super_block
 /* Add a dquot to the tail of the free list */
 static inline void put_dquot_last(struct dquot *dquot)
 {
-        list_add(&dquot->dq_free, free_dquots.prev);
+        list_add_tail(&dquot->dq_free, &free_dquots);
         dqstats.free_dquots++;
 }
 
@@ -266,7 +266,7 @@ static inline void put_inuse(struct dquot *dquot)
 {
         /* We add to the back of inuse list so we don't have to restart
          * when traversing this list and we block */
-        list_add(&dquot->dq_inuse, inuse_list.prev);
+        list_add_tail(&dquot->dq_inuse, &inuse_list);
         dqstats.allocated_dquots++;
 }
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 08e7e6a555ca..9c677bbd0b08 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
 /*
  * fs/eventpoll.c ( Efficent event polling implementation )
- * Copyright (C) 2001,...,2003 Davide Libenzi
+ * Copyright (C) 2001,...,2006 Davide Libenzi
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -1004,7 +1004,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
                 /* Notify waiting tasks that events are available */
                 if (waitqueue_active(&ep->wq))
-                        wake_up(&ep->wq);
+                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
                 if (waitqueue_active(&ep->poll_wait))
                         pwake++;
         }
@@ -1083,7 +1083,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 
                 /* Notify waiting tasks that events are available */
                 if (waitqueue_active(&ep->wq))
-                        wake_up(&ep->wq);
+                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+                                         TASK_INTERRUPTIBLE);
                 if (waitqueue_active(&ep->poll_wait))
                         pwake++;
         }
@@ -1260,7 +1261,8 @@ is_linked:
                  * wait list.
                  */
                 if (waitqueue_active(&ep->wq))
-                        wake_up(&ep->wq);
+                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+                                         TASK_INTERRUPTIBLE);
                 if (waitqueue_active(&ep->poll_wait))
                         pwake++;
 
@@ -1444,7 +1446,8 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
                  * wait list.
                  */
                 if (waitqueue_active(&ep->wq))
-                        wake_up(&ep->wq);
+                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+                                         TASK_INTERRUPTIBLE);
                 if (waitqueue_active(&ep->poll_wait))
                         pwake++;
         }
@@ -1516,7 +1519,7 @@ retry:
                  * ep_poll_callback() when events will become available.
                  */
                 init_waitqueue_entry(&wait, current);
-                add_wait_queue(&ep->wq, &wait);
+                __add_wait_queue(&ep->wq, &wait);
 
                 for (;;) {
                         /*
@@ -1536,7 +1539,7 @@ retry:
                         jtimeout = schedule_timeout(jtimeout);
                         write_lock_irqsave(&ep->lock, flags);
                 }
-                remove_wait_queue(&ep->wq, &wait);
+                __remove_wait_queue(&ep->wq, &wait);
 
                 set_current_state(TASK_RUNNING);
         }
diff --git a/fs/exec.c b/fs/exec.c
index 0b88bf646143..c8494f513eaf 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -666,8 +666,6 @@ static int de_thread(struct task_struct *tsk)
          * and to assume its PID:
          */
         if (!thread_group_leader(current)) {
-                struct dentry *proc_dentry1, *proc_dentry2;
-
                 /*
                  * Wait for the thread group leader to be a zombie.
                  * It should already be zombie at this point, most
@@ -689,10 +687,6 @@ static int de_thread(struct task_struct *tsk)
                  */
                 current->start_time = leader->start_time;
 
-                spin_lock(&leader->proc_lock);
-                spin_lock(&current->proc_lock);
-                proc_dentry1 = proc_pid_unhash(current);
-                proc_dentry2 = proc_pid_unhash(leader);
                 write_lock_irq(&tasklist_lock);
 
                 BUG_ON(leader->tgid != current->tgid);
@@ -713,7 +707,7 @@ static int de_thread(struct task_struct *tsk)
                 attach_pid(current, PIDTYPE_PID, current->pid);
                 attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
                 attach_pid(current, PIDTYPE_SID, current->signal->session);
-                list_add_tail_rcu(&current->tasks, &init_task.tasks);
+                list_replace_rcu(&leader->tasks, &current->tasks);
 
                 current->group_leader = current;
                 leader->group_leader = current;
@@ -721,7 +715,6 @@ static int de_thread(struct task_struct *tsk)
                 /* Reduce leader to a thread */
                 detach_pid(leader, PIDTYPE_PGID);
                 detach_pid(leader, PIDTYPE_SID);
-                list_del_init(&leader->tasks);
 
                 current->exit_signal = SIGCHLD;
 
@@ -729,10 +722,6 @@ static int de_thread(struct task_struct *tsk)
                 leader->exit_state = EXIT_DEAD;
 
                 write_unlock_irq(&tasklist_lock);
-                spin_unlock(&leader->proc_lock);
-                spin_unlock(&current->proc_lock);
-                proc_pid_flush(proc_dentry1);
-                proc_pid_flush(proc_dentry2);
         }
 
         /*
@@ -1379,67 +1368,102 @@ static void format_corename(char *corename, const char *pattern, long signr)
         *out_ptr = 0;
 }
 
-static void zap_threads (struct mm_struct *mm)
+static void zap_process(struct task_struct *start)
 {
-        struct task_struct *g, *p;
-        struct task_struct *tsk = current;
-        struct completion *vfork_done = tsk->vfork_done;
-        int traced = 0;
+        struct task_struct *t;
 
-        /*
-         * Make sure nobody is waiting for us to release the VM,
-         * otherwise we can deadlock when we wait on each other
-         */
-        if (vfork_done) {
-                tsk->vfork_done = NULL;
-                complete(vfork_done);
-        }
+        start->signal->flags = SIGNAL_GROUP_EXIT;
+        start->signal->group_stop_count = 0;
 
-        read_lock(&tasklist_lock);
-        do_each_thread(g,p)
-                if (mm == p->mm && p != tsk) {
-                        force_sig_specific(SIGKILL, p);
-                        mm->core_waiters++;
-                        if (unlikely(p->ptrace) &&
-                            unlikely(p->parent->mm == mm))
-                                traced = 1;
+        t = start;
+        do {
+                if (t != current && t->mm) {
+                        t->mm->core_waiters++;
+                        sigaddset(&t->pending.signal, SIGKILL);
+                        signal_wake_up(t, 1);
                 }
-        while_each_thread(g,p);
+        } while ((t = next_thread(t)) != start);
+}
 
-        read_unlock(&tasklist_lock);
+static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+                                int exit_code)
+{
+        struct task_struct *g, *p;
+        unsigned long flags;
+        int err = -EAGAIN;
+
+        spin_lock_irq(&tsk->sighand->siglock);
+        if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+                tsk->signal->group_exit_code = exit_code;
+                zap_process(tsk);
+                err = 0;
+        }
+        spin_unlock_irq(&tsk->sighand->siglock);
+        if (err)
+                return err;
 
-        if (unlikely(traced)) {
-                /*
-                 * We are zapping a thread and the thread it ptraces.
-                 * If the tracee went into a ptrace stop for exit tracing,
-                 * we could deadlock since the tracer is waiting for this
-                 * coredump to finish. Detach them so they can both die.
-                 */
-                write_lock_irq(&tasklist_lock);
-                do_each_thread(g,p) {
-                        if (mm == p->mm && p != tsk &&
-                            p->ptrace && p->parent->mm == mm) {
-                                __ptrace_detach(p, 0);
+        if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
+                goto done;
+
+        rcu_read_lock();
+        for_each_process(g) {
+                if (g == tsk->group_leader)
+                        continue;
+
+                p = g;
+                do {
+                        if (p->mm) {
+                                if (p->mm == mm) {
+                                        /*
+                                         * p->sighand can't disappear, but
+                                         * may be changed by de_thread()
+                                         */
+                                        lock_task_sighand(p, &flags);
+                                        zap_process(p);
+                                        unlock_task_sighand(p, &flags);
+                                }
+                                break;
                         }
-                } while_each_thread(g,p);
-                write_unlock_irq(&tasklist_lock);
+                } while ((p = next_thread(p)) != g);
         }
+        rcu_read_unlock();
+done:
+        return mm->core_waiters;
 }
 
-static void coredump_wait(struct mm_struct *mm)
+static int coredump_wait(int exit_code)
 {
-        DECLARE_COMPLETION(startup_done);
+        struct task_struct *tsk = current;
+        struct mm_struct *mm = tsk->mm;
+        struct completion startup_done;
+        struct completion *vfork_done;
         int core_waiters;
 
+        init_completion(&mm->core_done);
+        init_completion(&startup_done);
         mm->core_startup_done = &startup_done;
 
-        zap_threads(mm);
-        core_waiters = mm->core_waiters;
+        core_waiters = zap_threads(tsk, mm, exit_code);
         up_write(&mm->mmap_sem);
 
+        if (unlikely(core_waiters < 0))
+                goto fail;
+
+        /*
+         * Make sure nobody is waiting for us to release the VM,
+         * otherwise we can deadlock when we wait on each other
+         */
+        vfork_done = tsk->vfork_done;
+        if (vfork_done) {
+                tsk->vfork_done = NULL;
+                complete(vfork_done);
+        }
+
         if (core_waiters)
                 wait_for_completion(&startup_done);
+fail:
         BUG_ON(mm->core_waiters);
+        return core_waiters;
 }
 
 int do_coredump(long signr, int exit_code, struct pt_regs * regs)
@@ -1473,22 +1497,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
         }
         mm->dumpable = 0;
 
-        retval = -EAGAIN;
-        spin_lock_irq(&current->sighand->siglock);
-        if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) {
-                current->signal->flags = SIGNAL_GROUP_EXIT;
-                current->signal->group_exit_code = exit_code;
-                current->signal->group_stop_count = 0;
-                retval = 0;
-        }
-        spin_unlock_irq(&current->sighand->siglock);
-        if (retval) {
-                up_write(&mm->mmap_sem);
+        retval = coredump_wait(exit_code);
+        if (retval < 0)
                 goto fail;
-        }
-
-        init_completion(&mm->core_done);
-        coredump_wait(mm);
 
         /*
          * Clear any false indication of pending signals that might
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index c5d02da73bc3..e0b2b43c1fdb 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_EXT2_FS) += ext2.o
 
-ext2-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \
           ioctl.o namei.o super.o symlink.o
 
 ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 2c00953d4b0b..433a213a8bd9 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -521,6 +521,26 @@ io_error:
         goto out_release;
 }
 
+#ifdef EXT2FS_DEBUG
+
+static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
+
+unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars)
+{
+        unsigned int i;
+        unsigned long sum = 0;
+
+        if (!map)
+                return (0);
+        for (i = 0; i < numchars; i++)
+                sum += nibblemap[map->b_data[i] & 0xf] +
+                        nibblemap[(map->b_data[i] >> 4) & 0xf];
+        return (sum);
+}
+
+#endif /* EXT2FS_DEBUG */
+
+/* Superblock must be locked */
 unsigned long ext2_count_free_blocks (struct super_block * sb)
 {
         struct ext2_group_desc * desc;
@@ -530,7 +550,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
         unsigned long bitmap_count, x;
         struct ext2_super_block *es;
 
-        lock_super (sb);
         es = EXT2_SB(sb)->s_es;
         desc_count = 0;
         bitmap_count = 0;
@@ -554,7 +573,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
         printk("ext2_count_free_blocks: stored = %lu, computed = %lu, %lu\n",
                 (long)le32_to_cpu(es->s_free_blocks_count),
                 desc_count, bitmap_count);
-        unlock_super (sb);
         return bitmap_count;
 #else
         for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
diff --git a/fs/ext2/bitmap.c b/fs/ext2/bitmap.c
deleted file mode 100644
index e9983a0dd396..000000000000
--- a/fs/ext2/bitmap.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * linux/fs/ext2/bitmap.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- */
-
-#ifdef EXT2FS_DEBUG
-
-#include <linux/buffer_head.h>
-
-#include "ext2.h"
-
-static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
-
-unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars)
-{
-        unsigned int i;
-        unsigned long sum = 0;
-
-        if (!map)
-                return (0);
-        for (i = 0; i < numchars; i++)
-                sum += nibblemap[map->b_data[i] & 0xf] +
-                        nibblemap[(map->b_data[i] >> 4) & 0xf];
-        return (sum);
-}
-
-#endif /* EXT2FS_DEBUG */
-
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 3c1c9aaaca6b..92ea8265d7d5 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -399,8 +399,7 @@ ino_t ext2_inode_by_name(struct inode * dir, struct dentry *dentry)
         de = ext2_find_entry (dir, dentry, &page);
         if (de) {
                 res = le32_to_cpu(de->inode);
-                kunmap(page);
-                page_cache_release(page);
+                ext2_put_page(page);
         }
         return res;
 }
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
index c9c2e5ffa48e..7806b9e8155b 100644
--- a/fs/ext2/fsync.c
+++ b/fs/ext2/fsync.c
@@ -24,7 +24,7 @@
 
 #include "ext2.h"
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h> /* for fsync_inode_buffers() */
+#include <linux/buffer_head.h> /* for sync_mapping_buffers() */
 
 
 /*
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index e52765219e16..308c252568c6 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -638,6 +638,7 @@ fail:
         return ERR_PTR(err);
 }
 
+/* Superblock must be locked */
 unsigned long ext2_count_free_inodes (struct super_block * sb)
 {
         struct ext2_group_desc *desc;
@@ -649,7 +650,6 @@ unsigned long ext2_count_free_inodes (struct super_block * sb)
         unsigned long bitmap_count = 0;
         struct buffer_head *bitmap_bh = NULL;
 
-        lock_super (sb);
         es = EXT2_SB(sb)->s_es;
         for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
                 unsigned x;
@@ -672,7 +672,6 @@ unsigned long ext2_count_free_inodes (struct super_block * sb)
         printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
                 percpu_counter_read(&EXT2_SB(sb)->s_freeinodes_counter),
                 desc_count, bitmap_count);
-        unlock_super(sb);
         return desc_count;
 #else
         for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ee4ba759581e..d4233b2e6436 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -854,7 +854,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
         }
         if (!ext2_check_descriptors (sb)) {
                 printk ("EXT2-fs: group descriptors corrupted!\n");
-                db_count = i;
                 goto failed_mount2;
         }
         sbi->s_gdb_count = db_count;
@@ -1046,6 +1045,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
         unsigned long overhead;
         int i;
 
+        lock_super(sb);
         if (test_opt (sb, MINIX_DF))
                 overhead = 0;
         else {
@@ -1086,6 +1086,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
         buf->f_files = le32_to_cpu(sbi->s_es->s_inodes_count);
         buf->f_ffree = ext2_count_free_inodes (sb);
         buf->f_namelen = EXT2_NAME_LEN;
+        unlock_super(sb);
         return 0;
 }
 
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 77927d6938f6..96172e89ddc3 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -163,20 +163,19 @@ restart:
 #endif
 
 static int
-goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
+goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
                         unsigned int group, struct super_block * sb)
 {
-        unsigned long group_first_block, group_last_block;
+        ext3_fsblk_t group_first_block, group_last_block;
 
-        group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-                                group * EXT3_BLOCKS_PER_GROUP(sb);
+        group_first_block = ext3_group_first_block_no(sb, group);
         group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
         if ((rsv->_rsv_start > group_last_block) ||
             (rsv->_rsv_end < group_first_block))
                 return 0;
-        if ((goal >= 0) && ((goal + group_first_block < rsv->_rsv_start)
-                || (goal + group_first_block > rsv->_rsv_end)))
+        if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
+                || (grp_goal + group_first_block > rsv->_rsv_end)))
                 return 0;
         return 1;
 }
@@ -187,7 +186,7 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
  * Returns NULL if there are no windows or if all windows start after the goal.
  */
 static struct ext3_reserve_window_node *
-search_reserve_window(struct rb_root *root, unsigned long goal)
+search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
 {
         struct rb_node *n = root->rb_node;
         struct ext3_reserve_window_node *rsv;
@@ -223,7 +222,7 @@ void ext3_rsv_window_add(struct super_block *sb,
 {
         struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
         struct rb_node *node = &rsv->rsv_node;
-        unsigned int start = rsv->rsv_start;
+        ext3_fsblk_t start = rsv->rsv_start;
 
         struct rb_node ** p = &root->rb_node;
         struct rb_node * parent = NULL;
@@ -310,20 +309,20 @@ void ext3_discard_reservation(struct inode *inode)
 
 /* Free given blocks, update quota and i_blocks field */
 void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
-                         unsigned long block, unsigned long count,
-                         int *pdquot_freed_blocks)
+                         ext3_fsblk_t block, unsigned long count,
+                         unsigned long *pdquot_freed_blocks)
 {
         struct buffer_head *bitmap_bh = NULL;
         struct buffer_head *gd_bh;
         unsigned long block_group;
-        unsigned long bit;
+        ext3_grpblk_t bit;
         unsigned long i;
         unsigned long overflow;
         struct ext3_group_desc * desc;
         struct ext3_super_block * es;
         struct ext3_sb_info *sbi;
         int err = 0, ret;
-        unsigned group_freed;
+        ext3_grpblk_t group_freed;
 
         *pdquot_freed_blocks = 0;
         sbi = EXT3_SB(sb);
@@ -333,7 +332,7 @@ void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
333 block + count > le32_to_cpu(es->s_blocks_count)) { 332 block + count > le32_to_cpu(es->s_blocks_count)) {
334 ext3_error (sb, "ext3_free_blocks", 333 ext3_error (sb, "ext3_free_blocks",
335 "Freeing blocks not in datazone - " 334 "Freeing blocks not in datazone - "
336 "block = %lu, count = %lu", block, count); 335 "block = "E3FSBLK", count = %lu", block, count);
337 goto error_return; 336 goto error_return;
338 } 337 }
339 338
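
The E3FSBLK token spliced into the format strings above and below is a printk format macro from the same series; roughly:

/*
 * Assumed from the same series: a printk format for ext3_fsblk_t.
 * Adjacent string literals are concatenated by the preprocessor, so
 * "block = "E3FSBLK", count = %lu" expands to
 * "block = %lu, count = %lu"; widening ext3_fsblk_t later means
 * changing one macro instead of every format string.
 */
#define E3FSBLK "%lu"
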
@@ -369,7 +368,7 @@ do_more:
 		      sbi->s_itb_per_group))
 		ext3_error (sb, "ext3_free_blocks",
 			    "Freeing blocks in system zones - "
-			    "Block = %lu, count = %lu",
+			    "Block = "E3FSBLK", count = %lu",
 			    block, count);
 
 	/*
@@ -453,7 +452,8 @@ do_more:
 				bit + i, bitmap_bh->b_data)) {
 			jbd_unlock_bh_state(bitmap_bh);
 			ext3_error(sb, __FUNCTION__,
-				"bit already cleared for block %lu", block + i);
+				"bit already cleared for block "E3FSBLK,
+				 block + i);
 			jbd_lock_bh_state(bitmap_bh);
 			BUFFER_TRACE(bitmap_bh, "bit already cleared");
 		} else {
@@ -493,10 +493,10 @@ error_return:
 
 /* Free given blocks, update quota and i_blocks field */
 void ext3_free_blocks(handle_t *handle, struct inode *inode,
-			unsigned long block, unsigned long count)
+			ext3_fsblk_t block, unsigned long count)
 {
 	struct super_block * sb;
-	int dquot_freed_blocks;
+	unsigned long dquot_freed_blocks;
 
 	sb = inode->i_sb;
 	if (!sb) {
@@ -525,7 +525,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
  * data-writes at some point, and disable it for metadata allocations or
  * sync-data inodes.
  */
-static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
 {
 	int ret;
 	struct journal_head *jh = bh2jh(bh);
@@ -542,11 +542,11 @@ static int ext3_test_allocatable(int nr, struct buffer_head *bh)
 	return ret;
 }
 
-static int
-bitmap_search_next_usable_block(int start, struct buffer_head *bh,
-			int maxblocks)
+static ext3_grpblk_t
+bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+			ext3_grpblk_t maxblocks)
 {
-	int next;
+	ext3_grpblk_t next;
 	struct journal_head *jh = bh2jh(bh);
 
 	/*
@@ -576,10 +576,11 @@ bitmap_search_next_usable_block(int start, struct buffer_head *bh,
  * the initial goal; then for a free byte somewhere in the bitmap; then
  * for any free bit in the bitmap.
  */
-static int
-find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
+static ext3_grpblk_t
+find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+			ext3_grpblk_t maxblocks)
 {
-	int here, next;
+	ext3_grpblk_t here, next;
 	char *p, *r;
 
 	if (start > 0) {
@@ -591,7 +592,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
 		 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
 		 * next 64-bit boundary is simple..
 		 */
-		int end_goal = (start + 63) & ~63;
+		ext3_grpblk_t end_goal = (start + 63) & ~63;
 		if (end_goal > maxblocks)
 			end_goal = maxblocks;
 		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
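
The "(start + 63) & ~63" above rounds the goal up to the next 64-bit bitmap word, so the zero-bit search that follows scans whole words; a standalone demo of the idiom:

#include <stdio.h>

/* Round n up to the next multiple of 64, as the hunk above does for
 * the bitmap goal; works because 64 is a power of two. */
static int align64(int n)
{
	return (n + 63) & ~63;
}

int main(void)
{
	int samples[] = { 0, 1, 63, 64, 65 };	/* -> 0, 64, 64, 64, 128 */
	for (int i = 0; i < 5; i++)
		printf("%d -> %d\n", samples[i], align64(samples[i]));
	return 0;
}
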
@@ -628,7 +629,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
  * zero (failure).
  */
 static inline int
-claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
+claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
 {
 	struct journal_head *jh = bh2jh(bh);
 	int ret;
@@ -651,19 +652,18 @@ claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
  * new bitmap. In that case we must release write access to the old one via
  * ext3_journal_release_buffer(), else we'll run out of credits.
  */
-static int
+static ext3_grpblk_t
 ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
-			struct buffer_head *bitmap_bh, int goal,
+			struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
 			unsigned long *count, struct ext3_reserve_window *my_rsv)
 {
-	int group_first_block, start, end;
+	ext3_fsblk_t group_first_block;
+	ext3_grpblk_t start, end;
 	unsigned long num = 0;
 
 	/* we do allocation within the reservation window if we have a window */
 	if (my_rsv) {
-		group_first_block =
-			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-			group * EXT3_BLOCKS_PER_GROUP(sb);
+		group_first_block = ext3_group_first_block_no(sb, group);
 		if (my_rsv->_rsv_start >= group_first_block)
 			start = my_rsv->_rsv_start - group_first_block;
 		else
@@ -673,13 +673,13 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 		if (end > EXT3_BLOCKS_PER_GROUP(sb))
 			/* reservation window crosses group boundary */
 			end = EXT3_BLOCKS_PER_GROUP(sb);
-		if ((start <= goal) && (goal < end))
-			start = goal;
+		if ((start <= grp_goal) && (grp_goal < end))
+			start = grp_goal;
 		else
-			goal = -1;
+			grp_goal = -1;
 	} else {
-		if (goal > 0)
-			start = goal;
+		if (grp_goal > 0)
+			start = grp_goal;
 		else
 			start = 0;
 		end = EXT3_BLOCKS_PER_GROUP(sb);
@@ -688,43 +688,43 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 	BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
 
 repeat:
-	if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) {
-		goal = find_next_usable_block(start, bitmap_bh, end);
-		if (goal < 0)
+	if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
+		grp_goal = find_next_usable_block(start, bitmap_bh, end);
+		if (grp_goal < 0)
 			goto fail_access;
 		if (!my_rsv) {
 			int i;
 
-			for (i = 0; i < 7 && goal > start &&
-					ext3_test_allocatable(goal - 1,
+			for (i = 0; i < 7 && grp_goal > start &&
+					ext3_test_allocatable(grp_goal - 1,
 						bitmap_bh);
-			     i++, goal--)
+			     i++, grp_goal--)
 				;
 		}
 	}
-	start = goal;
+	start = grp_goal;
 
-	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
 		/*
 		 * The block was allocated by another thread, or it was
 		 * allocated and then freed by another thread
 		 */
 		start++;
-		goal++;
+		grp_goal++;
 		if (start >= end)
 			goto fail_access;
 		goto repeat;
 	}
 	num++;
-	goal++;
-	while (num < *count && goal < end
-	       && ext3_test_allocatable(goal, bitmap_bh)
-	       && claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+	grp_goal++;
+	while (num < *count && grp_goal < end
+	       && ext3_test_allocatable(grp_goal, bitmap_bh)
+	       && claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
 		num++;
-		goal++;
+		grp_goal++;
 	}
 	*count = num;
-	return goal - num;
+	return grp_goal - num;
 fail_access:
 	*count = num;
 	return -1;
@@ -766,12 +766,13 @@ fail_access:
 static int find_next_reservable_window(
 				struct ext3_reserve_window_node *search_head,
 				struct ext3_reserve_window_node *my_rsv,
-				struct super_block * sb, int start_block,
-				int last_block)
+				struct super_block * sb,
+				ext3_fsblk_t start_block,
+				ext3_fsblk_t last_block)
 {
 	struct rb_node *next;
 	struct ext3_reserve_window_node *rsv, *prev;
-	int cur;
+	ext3_fsblk_t cur;
 	int size = my_rsv->rsv_goal_size;
 
 	/* TODO: make the start of the reservation window byte-aligned */
@@ -873,10 +874,10 @@ static int find_next_reservable_window(
 *
 *	@rsv: the reservation
 *
-*	@goal: The goal (group-relative). It is where the search for a
+*	@grp_goal: The goal (group-relative). It is where the search for a
 *		free reservable space should start from.
-*		if we have a goal(goal >0 ), then start from there,
-*		no goal(goal = -1), we start from the first block
+*		if we have a grp_goal(grp_goal >0 ), then start from there,
+*		no grp_goal(grp_goal = -1), we start from the first block
 *		of the group.
 *
 *	@sb: the super block
@@ -885,25 +886,24 @@ static int find_next_reservable_window(
 *
 */
 static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
-		int goal, struct super_block *sb,
+		ext3_grpblk_t grp_goal, struct super_block *sb,
 		unsigned int group, struct buffer_head *bitmap_bh)
 {
 	struct ext3_reserve_window_node *search_head;
-	int group_first_block, group_end_block, start_block;
-	int first_free_block;
+	ext3_fsblk_t group_first_block, group_end_block, start_block;
+	ext3_grpblk_t first_free_block;
 	struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
 	unsigned long size;
 	int ret;
 	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
 
-	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-			group * EXT3_BLOCKS_PER_GROUP(sb);
+	group_first_block = ext3_group_first_block_no(sb, group);
 	group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
-	if (goal < 0)
+	if (grp_goal < 0)
 		start_block = group_first_block;
 	else
-		start_block = goal + group_first_block;
+		start_block = grp_goal + group_first_block;
 
 	size = my_rsv->rsv_goal_size;
 
@@ -1057,14 +1057,15 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
 *	sorted double linked list should be fast.
 *
 */
-static int
+static ext3_grpblk_t
 ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			unsigned int group, struct buffer_head *bitmap_bh,
-			int goal, struct ext3_reserve_window_node * my_rsv,
+			ext3_grpblk_t grp_goal,
+			struct ext3_reserve_window_node * my_rsv,
 			unsigned long *count, int *errp)
 {
-	unsigned long group_first_block;
-	int ret = 0;
+	ext3_fsblk_t group_first_block;
+	ext3_grpblk_t ret = 0;
 	int fatal;
 	unsigned long num = *count;
 
@@ -1090,17 +1091,16 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 */
 	if (my_rsv == NULL ) {
 		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
-						goal, count, NULL);
+						grp_goal, count, NULL);
 		goto out;
 	}
 	/*
-	 * goal is a group relative block number (if there is a goal)
-	 * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb)
+	 * grp_goal is a group relative block number (if there is a goal)
+	 * 0 < grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
	 * first block is a filesystem wide block number
	 * first block is the block number of the first block in this group
	 */
-	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-			group * EXT3_BLOCKS_PER_GROUP(sb);
+	group_first_block = ext3_group_first_block_no(sb, group);
 
 	/*
	 * Basically we will allocate a new block from inode's reservation
@@ -1119,24 +1119,24 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 */
 	while (1) {
 		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-			!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) {
+			!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb)) {
 			if (my_rsv->rsv_goal_size < *count)
 				my_rsv->rsv_goal_size = *count;
-			ret = alloc_new_reservation(my_rsv, goal, sb,
+			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
 							group, bitmap_bh);
 			if (ret < 0)
 				break;			/* failed */
 
-			if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb))
-				goal = -1;
-		} else if (goal > 0 && (my_rsv->rsv_end-goal+1) < *count)
+			if (!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb))
+				grp_goal = -1;
+		} else if (grp_goal > 0 && (my_rsv->rsv_end-grp_goal+1) < *count)
 			try_to_extend_reservation(my_rsv, sb,
-					*count-my_rsv->rsv_end + goal - 1);
+					*count-my_rsv->rsv_end + grp_goal - 1);
 
 		if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
 		    || (my_rsv->rsv_end < group_first_block))
 			BUG();
-		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
+		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, grp_goal,
 					   &num, &my_rsv->rsv_window);
 		if (ret >= 0) {
 			my_rsv->rsv_alloc_hit += num;
@@ -1164,7 +1164,7 @@ out:
 
 static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
 {
-	int free_blocks, root_blocks;
+	ext3_fsblk_t free_blocks, root_blocks;
 
 	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
 	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
@@ -1200,19 +1200,20 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries)
 * bitmap, and then for any free bit if that fails.
 * This function also updates quota and i_blocks field.
 */
-int ext3_new_blocks(handle_t *handle, struct inode *inode,
-			unsigned long goal, unsigned long *count, int *errp)
+ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, unsigned long *count, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gdp_bh;
 	int group_no;
 	int goal_group;
-	int ret_block;
+	ext3_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
+	ext3_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block */
+	ext3_fsblk_t ret_block;		/* filesystem-wide allocated block */
 	int bgi;			/* blockgroup iteration index */
-	int target_block;
 	int fatal = 0, err;
 	int performed_allocation = 0;
-	int free_blocks;
+	ext3_grpblk_t free_blocks;	/* number of free blocks in a group */
 	struct super_block *sb;
 	struct ext3_group_desc *gdp;
 	struct ext3_super_block *es;
@@ -1285,16 +1286,17 @@ retry:
 		my_rsv = NULL;
 
 	if (free_blocks > 0) {
-		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
+		grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
 				EXT3_BLOCKS_PER_GROUP(sb));
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-				bitmap_bh, ret_block, my_rsv, &num, &fatal);
+		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+				group_no, bitmap_bh, grp_target_blk,
+				my_rsv, &num, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0)
+		if (grp_alloc_blk >= 0)
 			goto allocated;
 	}
 
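
The modulo above is one half of mapping a filesystem-wide goal onto a (group, group-relative block) pair; a standalone sketch of both halves, with illustrative geometry rather than values read from a real superblock:

#include <stdio.h>

typedef unsigned long ext3_fsblk_t;
typedef int ext3_grpblk_t;

int main(void)
{
	/* Illustrative: 32768 blocks per group, first data block 1
	 * (typical for 1KB blocks; 4KB filesystems start at 0). */
	const unsigned long blocks_per_group = 32768;
	const unsigned long first_data_block = 1;
	ext3_fsblk_t goal = 100000;	/* filesystem-wide goal block */

	unsigned long group_no = (goal - first_data_block) / blocks_per_group;
	ext3_grpblk_t grp_target_blk =
		(goal - first_data_block) % blocks_per_group;

	/* prints: group 3, group-relative block 1695 */
	printf("group %lu, group-relative block %d\n",
	       group_no, grp_target_blk);
	return 0;
}
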
@@ -1327,11 +1329,15 @@ retry:
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-				bitmap_bh, -1, my_rsv, &num, &fatal);
+		/*
+		 * try to allocate block(s) from this group, without a goal(-1).
+		 */
+		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+				group_no, bitmap_bh, -1, my_rsv,
+				&num, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0)
+		if (grp_alloc_blk >= 0)
 			goto allocated;
 	}
 	/*
@@ -1360,18 +1366,18 @@ allocated:
 	if (fatal)
 		goto out;
 
-	target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
-			+ le32_to_cpu(es->s_first_data_block);
+	ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
 
-	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), target_block, num) ||
-	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), target_block, num) ||
-	    in_range(target_block, le32_to_cpu(gdp->bg_inode_table),
+	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
+	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
+	    in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
 		     EXT3_SB(sb)->s_itb_per_group) ||
-	    in_range(target_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
+	    in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
 		     EXT3_SB(sb)->s_itb_per_group))
 		ext3_error(sb, "ext3_new_block",
 			    "Allocating block in system zone - "
-			    "blocks from %u, length %lu", target_block, num);
+			    "blocks from "E3FSBLK", length %lu",
+			     ret_block, num);
 
 	performed_allocation = 1;
 
@@ -1380,7 +1386,7 @@ allocated:
 		struct buffer_head *debug_bh;
 
 		/* Record bitmap buffer state in the newly allocated block */
-		debug_bh = sb_find_get_block(sb, target_block);
+		debug_bh = sb_find_get_block(sb, ret_block);
 		if (debug_bh) {
 			BUFFER_TRACE(debug_bh, "state when allocated");
 			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
@@ -1393,24 +1399,21 @@ allocated:
 		int i;
 
 		for (i = 0; i < num; i++) {
-			if (ext3_test_bit(ret_block,
+			if (ext3_test_bit(grp_alloc_blk+i,
 					bh2jh(bitmap_bh)->b_committed_data)) {
 				printk("%s: block was unexpectedly set in "
 					"b_committed_data\n", __FUNCTION__);
 			}
 		}
 	}
-	ext3_debug("found bit %d\n", ret_block);
+	ext3_debug("found bit %d\n", grp_alloc_blk);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
 	jbd_unlock_bh_state(bitmap_bh);
 #endif
 
-	/* ret_block was blockgroup-relative. Now it becomes fs-relative */
-	ret_block = target_block;
-
 	if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
 		ext3_error(sb, "ext3_new_block",
-			    "block(%d) >= blocks count(%d) - "
+			    "block("E3FSBLK") >= blocks count(%d) - "
 			    "block_group = %d, es == %p ", ret_block,
 			    le32_to_cpu(es->s_blocks_count), group_no, es);
 		goto out;
@@ -1421,7 +1424,7 @@ allocated:
 	 * list of some description. We don't know in advance whether
 	 * the caller wants to use it as metadata or data.
 	 */
-	ext3_debug("allocating block %d. Goal hits %d of %d.\n",
+	ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
 			ret_block, goal_hits, goal_attempts);
 
 	spin_lock(sb_bgl_lock(sbi, group_no));
@@ -1461,23 +1464,24 @@ out:
 	return 0;
 }
 
-int ext3_new_block(handle_t *handle, struct inode *inode,
-			unsigned long goal, int *errp)
+ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, int *errp)
 {
 	unsigned long count = 1;
 
 	return ext3_new_blocks(handle, inode, goal, &count, errp);
 }
 
-unsigned long ext3_count_free_blocks(struct super_block *sb)
+ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
 {
-	unsigned long desc_count;
+	ext3_fsblk_t desc_count;
 	struct ext3_group_desc *gdp;
 	int i;
 	unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
 #ifdef EXT3FS_DEBUG
 	struct ext3_super_block *es;
-	unsigned long bitmap_count, x;
+	ext3_fsblk_t bitmap_count;
+	unsigned long x;
 	struct buffer_head *bitmap_bh = NULL;
 
 	es = EXT3_SB(sb)->s_es;
@@ -1502,8 +1506,10 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-	printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n",
-		le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
+	printk("ext3_count_free_blocks: stored = "E3FSBLK
+		", computed = "E3FSBLK", "E3FSBLK"\n",
+		le32_to_cpu(es->s_free_blocks_count),
+		desc_count, bitmap_count);
 	return bitmap_count;
 #else
 	desc_count = 0;
@@ -1520,7 +1526,7 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 }
 
 static inline int
-block_in_use(unsigned long block, struct super_block *sb, unsigned char *map)
+block_in_use(ext3_fsblk_t block, struct super_block *sb, unsigned char *map)
 {
 	return ext3_test_bit ((block -
 		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dc826464f313..36546ed36a14 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -262,9 +262,11 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
 	int freei, avefreei;
-	int freeb, avefreeb;
-	int blocks_per_dir, ndirs;
-	int max_debt, max_dirs, min_blocks, min_inodes;
+	ext3_fsblk_t freeb, avefreeb;
+	ext3_fsblk_t blocks_per_dir;
+	int ndirs;
+	int max_debt, max_dirs, min_inodes;
+	ext3_grpblk_t min_blocks;
 	int group = -1, i;
 	struct ext3_group_desc *desc;
 	struct buffer_head *bh;
@@ -307,7 +309,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 	min_inodes = avefreei - inodes_per_group / 4;
 	min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
 
-	max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST);
+	max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST);
 	if (max_debt * INODE_COST > inodes_per_group)
 		max_debt = inodes_per_group / INODE_COST;
 	if (max_debt > 255)
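
The new cast on BLOCK_COST is needed because the kernel's max() rejects mismatched argument types at compile time; roughly (a sketch of the classic type-checked form, not copied from this tree):

/* Comparing &_x and &_y makes the compiler warn when x and y have
 * different types, so an int constant next to an ext3_fsblk_t now
 * needs an explicit cast, as in the hunk above. */
#define max(x, y) ({			\
	typeof(x) _x = (x);		\
	typeof(y) _y = (y);		\
	(void) (&_x == &_y);		\
	_x > _y ? _x : _y; })
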
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2edd7eec88fd..0321e1b9034a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -62,7 +62,7 @@ static int ext3_inode_is_fast_symlink(struct inode *inode)
 * still needs to be revoked.
 */
 int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
-			struct buffer_head *bh, int blocknr)
+			struct buffer_head *bh, ext3_fsblk_t blocknr)
 {
 	int err;
 
@@ -407,13 +407,13 @@ no_block:
 *
 *	Caller must make sure that @ind is valid and will stay that way.
 */
-static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
+static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
 {
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
 	__le32 *p;
-	unsigned long bg_start;
-	unsigned long colour;
+	ext3_fsblk_t bg_start;
+	ext3_grpblk_t colour;
 
 	/* Try to find previous block */
 	for (p = ind->p - 1; p >= start; p--) {
@@ -429,8 +429,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
 	 * It is going to be referred to from the inode itself? OK, just put it
 	 * into the same cylinder group then.
 	 */
-	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
-		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
+	bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
 	colour = (current->pid % 16) *
 			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
 	return bg_start + colour;
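
The colour term spreads concurrent writers across a block group: each of 16 pid classes begins its search one sixteenth of a group further in, so unrelated processes tend not to contend for the same free run. A standalone sketch with illustrative numbers:

#include <stdio.h>

int main(void)
{
	const unsigned long blocks_per_group = 32768;	/* illustrative */
	unsigned long bg_start = 98305;	/* first block of some group */

	for (int pid = 100; pid < 104; pid++) {
		unsigned long colour =
			(pid % 16) * (blocks_per_group / 16);
		printf("pid %d starts searching at block %lu\n",
		       pid, bg_start + colour);
	}
	return 0;
}
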
@@ -448,7 +447,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
 *	stores it in *@goal and returns zero.
 */
 
-static unsigned long ext3_find_goal(struct inode *inode, long block,
+static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
 		Indirect chain[4], Indirect *partial)
 {
 	struct ext3_block_alloc_info *block_i;
@@ -516,13 +515,13 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
 *	direct blocks
 */
 static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
-			unsigned long goal, int indirect_blks, int blks,
-			unsigned long long new_blocks[4], int *err)
+			ext3_fsblk_t goal, int indirect_blks, int blks,
+			ext3_fsblk_t new_blocks[4], int *err)
 {
 	int target, i;
 	unsigned long count = 0;
 	int index = 0;
-	unsigned long current_block = 0;
+	ext3_fsblk_t current_block = 0;
 	int ret = 0;
 
 	/*
@@ -592,7 +591,7 @@ failed_out:
 *	as described above and return 0.
 */
 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
-			int indirect_blks, int *blks, unsigned long goal,
+			int indirect_blks, int *blks, ext3_fsblk_t goal,
 			int *offsets, Indirect *branch)
 {
 	int blocksize = inode->i_sb->s_blocksize;
@@ -600,8 +599,8 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
 	int err = 0;
 	struct buffer_head *bh;
 	int num;
-	unsigned long long new_blocks[4];
-	unsigned long long current_block;
+	ext3_fsblk_t new_blocks[4];
+	ext3_fsblk_t current_block;
 
 	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
 				*blks, new_blocks, &err);
@@ -688,7 +687,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 	int i;
 	int err = 0;
 	struct ext3_block_alloc_info *block_i;
-	unsigned long current_block;
+	ext3_fsblk_t current_block;
 
 	block_i = EXT3_I(inode)->i_block_alloc_info;
 	/*
@@ -795,13 +794,13 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	int offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
-	unsigned long goal;
+	ext3_fsblk_t goal;
 	int indirect_blks;
 	int blocks_to_boundary = 0;
 	int depth;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	int count = 0;
-	unsigned long first_block = 0;
+	ext3_fsblk_t first_block = 0;
 
 
 	J_ASSERT(handle != NULL || create == 0);
@@ -819,7 +818,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 		count++;
 		/*map more blocks*/
 		while (count < maxblocks && count <= blocks_to_boundary) {
-			unsigned long blk;
+			ext3_fsblk_t blk;
 
 			if (!verify_chain(chain, partial)) {
 				/*
@@ -1759,7 +1758,7 @@ void ext3_set_aops(struct inode *inode)
 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 		struct address_space *mapping, loff_t from)
 {
-	unsigned long index = from >> PAGE_CACHE_SHIFT;
+	ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	unsigned blocksize, iblock, length, pos;
 	struct inode *inode = mapping->host;
@@ -1960,7 +1959,7 @@ no_top:
 * than `count' because there can be holes in there.
 */
 static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
-		struct buffer_head *bh, unsigned long block_to_free,
+		struct buffer_head *bh, ext3_fsblk_t block_to_free,
 		unsigned long count, __le32 *first, __le32 *last)
 {
 	__le32 *p;
@@ -2022,12 +2021,12 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
 			   struct buffer_head *this_bh,
 			   __le32 *first, __le32 *last)
 {
-	unsigned long block_to_free = 0;    /* Starting block # of a run */
+	ext3_fsblk_t block_to_free = 0;    /* Starting block # of a run */
 	unsigned long count = 0;	    /* Number of blocks in the run */
 	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
					       corresponding to
					       block_to_free */
-	unsigned long nr;		    /* Current block # */
+	ext3_fsblk_t nr;		    /* Current block # */
 	__le32 *p;			    /* Pointer into inode/ind
					       for current block */
 	int err;
@@ -2089,7 +2088,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 			       struct buffer_head *parent_bh,
 			       __le32 *first, __le32 *last, int depth)
 {
-	unsigned long nr;
+	ext3_fsblk_t nr;
 	__le32 *p;
 
 	if (is_handle_aborted(handle))
@@ -2113,7 +2112,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 			 */
 			if (!bh) {
 				ext3_error(inode->i_sb, "ext3_free_branches",
-					   "Read failure, inode=%ld, block=%ld",
+					   "Read failure, inode=%ld, block="E3FSBLK,
 					   inode->i_ino, nr);
 				continue;
 			}
@@ -2394,11 +2393,12 @@ out_stop:
 	ext3_journal_stop(handle);
 }
 
-static unsigned long ext3_get_inode_block(struct super_block *sb,
+static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
 		unsigned long ino, struct ext3_iloc *iloc)
 {
 	unsigned long desc, group_desc, block_group;
-	unsigned long offset, block;
+	unsigned long offset;
+	ext3_fsblk_t block;
 	struct buffer_head *bh;
 	struct ext3_group_desc * gdp;
 
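
For orientation, ext3_get_inode_block (body unchanged here) turns an inode number into a disk block in three steps: pick the group, find the byte offset of the inode in that group's table, then convert the offset to a block. A standalone sketch of the arithmetic with illustrative geometry:

#include <stdio.h>

typedef unsigned long ext3_fsblk_t;

int main(void)
{
	/* Illustrative: 4KB blocks, 128-byte inodes, 16384 inodes per
	 * group, this group's inode table starting at block 540. */
	const unsigned long inodes_per_group = 16384;
	const unsigned long inode_size = 128;
	const unsigned long block_size = 4096;
	const ext3_fsblk_t inode_table = 540;
	unsigned long ino = 20000;	/* inodes are numbered from 1 */

	unsigned long block_group = (ino - 1) / inodes_per_group;
	unsigned long offset = ((ino - 1) % inodes_per_group) * inode_size;
	ext3_fsblk_t block = inode_table + offset / block_size;

	/* prints: inode 20000: group 1, block 652, offset 3968 in block */
	printf("inode %lu: group %lu, block %lu, offset %lu in block\n",
	       ino, block_group, block, offset % block_size);
	return 0;
}
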
@@ -2448,7 +2448,7 @@ static unsigned long ext3_get_inode_block(struct super_block *sb,
 static int __ext3_get_inode_loc(struct inode *inode,
 				struct ext3_iloc *iloc, int in_mem)
 {
-	unsigned long block;
+	ext3_fsblk_t block;
 	struct buffer_head *bh;
 
 	block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
@@ -2459,7 +2459,8 @@ static int __ext3_get_inode_loc(struct inode *inode,
 	if (!bh) {
 		ext3_error (inode->i_sb, "ext3_get_inode_loc",
 				"unable to read inode block - "
-				"inode=%lu, block=%lu", inode->i_ino, block);
+				"inode=%lu, block="E3FSBLK,
+				inode->i_ino, block);
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
@@ -2540,7 +2541,7 @@ make_io:
 		if (!buffer_uptodate(bh)) {
 			ext3_error(inode->i_sb, "ext3_get_inode_loc",
 				   "unable to read inode block - "
-				   "inode=%lu, block=%lu",
+				   "inode=%lu, block="E3FSBLK,
 				   inode->i_ino, block);
 			brelse(bh);
 			return -EIO;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 8c22aa9a7fbb..3a6b012d120c 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -204,7 +204,7 @@ flags_err:
 		return 0;
 	}
 	case EXT3_IOC_GROUP_EXTEND: {
-		unsigned long n_blocks_count;
+		ext3_fsblk_t n_blocks_count;
 		struct super_block *sb = inode->i_sb;
 		int err;
 
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b8f5cd1e540d..d9176dba3698 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1379,7 +1379,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
 	int dx_fallback=0;
 #endif
 	unsigned blocksize;
-	unsigned nlen, rlen;
 	u32 block, blocks;
 
 	sb = dir->i_sb;
@@ -1417,8 +1416,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
 		return retval;
 	de = (struct ext3_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
-	de->rec_len = cpu_to_le16(rlen = blocksize);
-	nlen = 0;
+	de->rec_len = cpu_to_le16(blocksize);
 	return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
 
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 34b39e9a1e5a..dfd811895d8f 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -28,16 +28,16 @@ static int verify_group_input(struct super_block *sb,
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
-	unsigned start = le32_to_cpu(es->s_blocks_count);
-	unsigned end = start + input->blocks_count;
+	ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count);
+	ext3_fsblk_t end = start + input->blocks_count;
 	unsigned group = input->group;
-	unsigned itend = input->inode_table + sbi->s_itb_per_group;
+	ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
 	unsigned overhead = ext3_bg_has_super(sb, group) ?
 		(1 + ext3_bg_num_gdb(sb, group) +
 		 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
-	unsigned metaend = start + overhead;
+	ext3_fsblk_t metaend = start + overhead;
 	struct buffer_head *bh = NULL;
-	int free_blocks_count;
+	ext3_grpblk_t free_blocks_count;
 	int err = -EINVAL;
 
 	input->free_blocks_count = free_blocks_count =
@@ -64,7 +64,8 @@ static int verify_group_input(struct super_block *sb,
 		ext3_warning(sb, __FUNCTION__, "Bad blocks count %u",
 			     input->blocks_count);
 	else if (!(bh = sb_bread(sb, end - 1)))
-		ext3_warning(sb, __FUNCTION__, "Cannot read last block (%u)",
+		ext3_warning(sb, __FUNCTION__,
+			     "Cannot read last block ("E3FSBLK")",
 			     end - 1);
 	else if (outside(input->block_bitmap, start, end))
 		ext3_warning(sb, __FUNCTION__,
@@ -77,7 +78,7 @@ static int verify_group_input(struct super_block *sb,
 	else if (outside(input->inode_table, start, end) ||
 		 outside(itend - 1, start, end))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode table not in group (blocks %u-%u)",
+			     "Inode table not in group (blocks %u-"E3FSBLK")",
 			     input->inode_table, itend - 1);
 	else if (input->inode_bitmap == input->block_bitmap)
 		ext3_warning(sb, __FUNCTION__,
@@ -85,24 +86,27 @@ static int verify_group_input(struct super_block *sb,
 			     input->block_bitmap);
 	else if (inside(input->block_bitmap, input->inode_table, itend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Block bitmap (%u) in inode table (%u-%u)",
+			     "Block bitmap (%u) in inode table (%u-"E3FSBLK")",
 			     input->block_bitmap, input->inode_table, itend-1);
 	else if (inside(input->inode_bitmap, input->inode_table, itend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode bitmap (%u) in inode table (%u-%u)",
+			     "Inode bitmap (%u) in inode table (%u-"E3FSBLK")",
 			     input->inode_bitmap, input->inode_table, itend-1);
 	else if (inside(input->block_bitmap, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Block bitmap (%u) in GDT table (%u-%u)",
+			     "Block bitmap (%u) in GDT table"
+			     " ("E3FSBLK"-"E3FSBLK")",
 			     input->block_bitmap, start, metaend - 1);
 	else if (inside(input->inode_bitmap, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode bitmap (%u) in GDT table (%u-%u)",
+			     "Inode bitmap (%u) in GDT table"
+			     " ("E3FSBLK"-"E3FSBLK")",
 			     input->inode_bitmap, start, metaend - 1);
 	else if (inside(input->inode_table, start, metaend) ||
 		 inside(itend - 1, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode table (%u-%u) overlaps GDT table (%u-%u)",
+			     "Inode table (%u-"E3FSBLK") overlaps"
+			     " GDT table ("E3FSBLK"-"E3FSBLK")",
 			     input->inode_table, itend - 1, start, metaend - 1);
 	else
 		err = 0;
@@ -112,7 +116,7 @@ static int verify_group_input(struct super_block *sb,
 }
 
 static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
-				  unsigned long blk)
+				  ext3_fsblk_t blk)
 {
 	struct buffer_head *bh;
 	int err;
@@ -163,15 +167,14 @@ static int setup_new_group_blocks(struct super_block *sb,
 			struct ext3_new_group_data *input)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long start = input->group * sbi->s_blocks_per_group +
-		le32_to_cpu(sbi->s_es->s_first_data_block);
+	ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group);
 	int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
 		le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
 	unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
 	struct buffer_head *bh;
 	handle_t *handle;
-	unsigned long block;
-	int bit;
+	ext3_fsblk_t block;
+	ext3_grpblk_t bit;
 	int i;
 	int err = 0, err2;
 
@@ -328,7 +331,7 @@ static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
 static int verify_reserved_gdb(struct super_block *sb,
 			       struct buffer_head *primary)
 {
-	const unsigned long blk = primary->b_blocknr;
+	const ext3_fsblk_t blk = primary->b_blocknr;
 	const unsigned long end = EXT3_SB(sb)->s_groups_count;
 	unsigned three = 1;
 	unsigned five = 5;
@@ -340,7 +343,8 @@ static int verify_reserved_gdb(struct super_block *sb,
 	while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
 		if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
 			ext3_warning(sb, __FUNCTION__,
-				     "reserved GDT %ld missing grp %d (%ld)",
+				     "reserved GDT "E3FSBLK
+				     " missing grp %d ("E3FSBLK")",
 				     blk, grp,
 				     grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
 			return -EINVAL;
@@ -372,7 +376,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	struct super_block *sb = inode->i_sb;
 	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
 	unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
-	unsigned long gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
+	ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
 	struct buffer_head **o_group_desc, **n_group_desc;
 	struct buffer_head *dind;
 	int gdbackups;
@@ -417,7 +421,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	data = (__u32 *)dind->b_data;
 	if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
 		ext3_warning(sb, __FUNCTION__,
-			     "new group %u GDT block %lu not reserved",
+			     "new group %u GDT block "E3FSBLK" not reserved",
 			     input->group, gdblock);
 		err = -EINVAL;
 		goto exit_dind;
@@ -515,7 +519,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	struct buffer_head **primary;
 	struct buffer_head *dind;
 	struct ext3_iloc iloc;
-	unsigned long blk;
+	ext3_fsblk_t blk;
 	__u32 *data, *end;
 	int gdbackups = 0;
 	int res, i;
@@ -540,7 +544,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	for (res = 0; res < reserved_gdb; res++, blk++) {
 		if (le32_to_cpu(*data) != blk) {
 			ext3_warning(sb, __FUNCTION__,
-				     "reserved block %lu not at offset %ld",
+				     "reserved block "E3FSBLK
+				     " not at offset %ld",
 				     blk, (long)(data - (__u32 *)dind->b_data));
 			err = -EINVAL;
 			goto exit_bh;
@@ -902,15 +907,16 @@ exit_put:
 * GDT blocks are reserved to grow to the desired size.
 */
 int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
-		      unsigned long n_blocks_count)
+		      ext3_fsblk_t n_blocks_count)
 {
-	unsigned long o_blocks_count;
+	ext3_fsblk_t o_blocks_count;
 	unsigned long o_groups_count;
-	unsigned long last;
-	int add;
+	ext3_grpblk_t last;
+	ext3_grpblk_t add;
 	struct buffer_head * bh;
 	handle_t *handle;
-	int err, freed_blocks;
+	int err;
+	unsigned long freed_blocks;
 
 	/* We don't need to worry about locking wrt other resizers just
 	 * yet: we're going to revalidate es->s_blocks_count after
@@ -919,12 +925,22 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	o_groups_count = EXT3_SB(sb)->s_groups_count;
 
 	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT3-fs: extending last group from %lu to %lu blocks\n",
+		printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" to "E3FSBLK" blocks\n",
 		       o_blocks_count, n_blocks_count);
 
 	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
 		return 0;
 
+	if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
+			" too large to resize to %lu blocks safely\n",
+			sb->s_id, n_blocks_count);
+		if (sizeof(sector_t) < 8)
+			ext3_warning(sb, __FUNCTION__,
+				     "CONFIG_LBD not enabled\n");
+		return -EINVAL;
+	}
+
 	if (n_blocks_count < o_blocks_count) {
 		ext3_warning(sb, __FUNCTION__,
 			     "can't shrink FS - resize aborted");
@@ -948,7 +964,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 
 	if (o_blocks_count + add < n_blocks_count)
 		ext3_warning(sb, __FUNCTION__,
-			     "will only finish group (%lu blocks, %u new)",
+			     "will only finish group ("E3FSBLK
+			     " blocks, %u new)",
 			     o_blocks_count + add, add);
 
 	/* See if the device is actually as big as what was requested */
@@ -991,10 +1008,10 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
 	unlock_super(sb);
-	ext3_debug("freeing blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
 	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-	ext3_debug("freed blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
 	if ((err = ext3_journal_stop(handle)))
 		goto exit_put;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a60cc6ec130f..b7483360a2db 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -630,7 +630,7 @@ enum {
630 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 630 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
631 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, 631 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
632 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 632 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
633 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, 633 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
634 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 634 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
635 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 635 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
636 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 636 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
@@ -666,6 +666,7 @@ static match_table_t tokens = {
666 {Opt_noreservation, "noreservation"}, 666 {Opt_noreservation, "noreservation"},
667 {Opt_noload, "noload"}, 667 {Opt_noload, "noload"},
668 {Opt_nobh, "nobh"}, 668 {Opt_nobh, "nobh"},
669 {Opt_bh, "bh"},
669 {Opt_commit, "commit=%u"}, 670 {Opt_commit, "commit=%u"},
670 {Opt_journal_update, "journal=update"}, 671 {Opt_journal_update, "journal=update"},
671 {Opt_journal_inum, "journal=%u"}, 672 {Opt_journal_inum, "journal=%u"},
@@ -689,14 +690,15 @@ static match_table_t tokens = {
689 {Opt_resize, "resize"}, 690 {Opt_resize, "resize"},
690}; 691};
691 692
692static unsigned long get_sb_block(void **data) 693static ext3_fsblk_t get_sb_block(void **data)
693{ 694{
694 unsigned long sb_block; 695 ext3_fsblk_t sb_block;
695 char *options = (char *) *data; 696 char *options = (char *) *data;
696 697
697 if (!options || strncmp(options, "sb=", 3) != 0) 698 if (!options || strncmp(options, "sb=", 3) != 0)
698 return 1; /* Default location */ 699 return 1; /* Default location */
699 options += 3; 700 options += 3;
701 /* todo: use simple_strtoll with >32bit ext3 */
700 sb_block = simple_strtoul(options, &options, 0); 702 sb_block = simple_strtoul(options, &options, 0);
701 if (*options && *options != ',') { 703 if (*options && *options != ',') {
702 printk("EXT3-fs: Invalid sb specification: %s\n", 704 printk("EXT3-fs: Invalid sb specification: %s\n",
@@ -711,7 +713,7 @@ static unsigned long get_sb_block(void **data)
711 713
712static int parse_options (char *options, struct super_block *sb, 714static int parse_options (char *options, struct super_block *sb,
713 unsigned long *inum, unsigned long *journal_devnum, 715 unsigned long *inum, unsigned long *journal_devnum,
714 unsigned long *n_blocks_count, int is_remount) 716 ext3_fsblk_t *n_blocks_count, int is_remount)
715{ 717{
716 struct ext3_sb_info *sbi = EXT3_SB(sb); 718 struct ext3_sb_info *sbi = EXT3_SB(sb);
717 char * p; 719 char * p;
@@ -1013,6 +1015,9 @@ clear_qf_name:
1013 case Opt_nobh: 1015 case Opt_nobh:
1014 set_opt(sbi->s_mount_opt, NOBH); 1016 set_opt(sbi->s_mount_opt, NOBH);
1015 break; 1017 break;
1018 case Opt_bh:
1019 clear_opt(sbi->s_mount_opt, NOBH);
1020 break;
1016 default: 1021 default:
1017 printk (KERN_ERR 1022 printk (KERN_ERR
1018 "EXT3-fs: Unrecognized mount option \"%s\" " 1023 "EXT3-fs: Unrecognized mount option \"%s\" "
@@ -1128,7 +1133,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1128static int ext3_check_descriptors (struct super_block * sb) 1133static int ext3_check_descriptors (struct super_block * sb)
1129{ 1134{
1130 struct ext3_sb_info *sbi = EXT3_SB(sb); 1135 struct ext3_sb_info *sbi = EXT3_SB(sb);
1131 unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); 1136 ext3_fsblk_t block = le32_to_cpu(sbi->s_es->s_first_data_block);
1132 struct ext3_group_desc * gdp = NULL; 1137 struct ext3_group_desc * gdp = NULL;
1133 int desc_block = 0; 1138 int desc_block = 0;
1134 int i; 1139 int i;
@@ -1315,15 +1320,14 @@ static loff_t ext3_max_size(int bits)
1315 return res; 1320 return res;
1316} 1321}
1317 1322
1318static unsigned long descriptor_loc(struct super_block *sb, 1323static ext3_fsblk_t descriptor_loc(struct super_block *sb,
1319 unsigned long logic_sb_block, 1324 ext3_fsblk_t logic_sb_block,
1320 int nr) 1325 int nr)
1321{ 1326{
1322 struct ext3_sb_info *sbi = EXT3_SB(sb); 1327 struct ext3_sb_info *sbi = EXT3_SB(sb);
1323 unsigned long bg, first_data_block, first_meta_bg; 1328 unsigned long bg, first_meta_bg;
1324 int has_super = 0; 1329 int has_super = 0;
1325 1330
1326 first_data_block = le32_to_cpu(sbi->s_es->s_first_data_block);
1327 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); 1331 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
1328 1332
1329 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) || 1333 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
@@ -1332,7 +1336,7 @@ static unsigned long descriptor_loc(struct super_block *sb,
1332 bg = sbi->s_desc_per_block * nr; 1336 bg = sbi->s_desc_per_block * nr;
1333 if (ext3_bg_has_super(sb, bg)) 1337 if (ext3_bg_has_super(sb, bg))
1334 has_super = 1; 1338 has_super = 1;
1335 return (first_data_block + has_super + (bg * sbi->s_blocks_per_group)); 1339 return (has_super + ext3_group_first_block_no(sb, bg));
1336} 1340}
1337 1341
1338 1342
@@ -1341,9 +1345,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1341 struct buffer_head * bh; 1345 struct buffer_head * bh;
1342 struct ext3_super_block *es = NULL; 1346 struct ext3_super_block *es = NULL;
1343 struct ext3_sb_info *sbi; 1347 struct ext3_sb_info *sbi;
1344 unsigned long block; 1348 ext3_fsblk_t block;
1345 unsigned long sb_block = get_sb_block(&data); 1349 ext3_fsblk_t sb_block = get_sb_block(&data);
1346 unsigned long logic_sb_block; 1350 ext3_fsblk_t logic_sb_block;
1347 unsigned long offset = 0; 1351 unsigned long offset = 0;
1348 unsigned long journal_inum = 0; 1352 unsigned long journal_inum = 0;
1349 unsigned long journal_devnum = 0; 1353 unsigned long journal_devnum = 0;
@@ -1565,6 +1569,16 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1565 goto failed_mount; 1569 goto failed_mount;
1566 } 1570 }
1567 1571
1572 if (le32_to_cpu(es->s_blocks_count) >
1573 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1574 printk(KERN_ERR "EXT3-fs: filesystem on %s:"
1575 " too large to mount safely\n", sb->s_id);
1576 if (sizeof(sector_t) < 8)
1577 printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
1578 "enabled\n");
1579 goto failed_mount;
1580 }
1581
1568 if (EXT3_BLOCKS_PER_GROUP(sb) == 0) 1582 if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
1569 goto cantfind_ext3; 1583 goto cantfind_ext3;
1570 sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - 1584 sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
@@ -1593,7 +1607,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1593 } 1607 }
1594 } 1608 }
1595 if (!ext3_check_descriptors (sb)) { 1609 if (!ext3_check_descriptors (sb)) {
1596 printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n"); 1610 printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n");
1597 goto failed_mount2; 1611 goto failed_mount2;
1598 } 1612 }
1599 sbi->s_gdb_count = db_count; 1613 sbi->s_gdb_count = db_count;
@@ -1830,10 +1844,10 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
1830{ 1844{
1831 struct buffer_head * bh; 1845 struct buffer_head * bh;
1832 journal_t *journal; 1846 journal_t *journal;
1833 int start; 1847 ext3_fsblk_t start;
1834 int len; 1848 ext3_fsblk_t len;
1835 int hblock, blocksize; 1849 int hblock, blocksize;
1836 unsigned long sb_block; 1850 ext3_fsblk_t sb_block;
1837 unsigned long offset; 1851 unsigned long offset;
1838 struct ext3_super_block * es; 1852 struct ext3_super_block * es;
1839 struct block_device *bdev; 1853 struct block_device *bdev;
@@ -2206,7 +2220,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2206{ 2220{
2207 struct ext3_super_block * es; 2221 struct ext3_super_block * es;
2208 struct ext3_sb_info *sbi = EXT3_SB(sb); 2222 struct ext3_sb_info *sbi = EXT3_SB(sb);
2209 unsigned long n_blocks_count = 0; 2223 ext3_fsblk_t n_blocks_count = 0;
2210 unsigned long old_sb_flags; 2224 unsigned long old_sb_flags;
2211 struct ext3_mount_options old_opts; 2225 struct ext3_mount_options old_opts;
2212 int err; 2226 int err;
@@ -2326,7 +2340,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2326 struct super_block *sb = dentry->d_sb; 2340 struct super_block *sb = dentry->d_sb;
2327 struct ext3_sb_info *sbi = EXT3_SB(sb); 2341 struct ext3_sb_info *sbi = EXT3_SB(sb);
2328 struct ext3_super_block *es = sbi->s_es; 2342 struct ext3_super_block *es = sbi->s_es;
2329 unsigned long overhead; 2343 ext3_fsblk_t overhead;
2330 int i; 2344 int i;
2331 2345
2332 if (test_opt (sb, MINIX_DF)) 2346 if (test_opt (sb, MINIX_DF))
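
Most of the super.c churn is mechanical: block numbers move from unsigned long to ext3_fsblk_t so the same code stays correct as the type widens. The one spot the typedef alone cannot fix is flagged by the new todo in get_sb_block(): simple_strtoul() parses into an unsigned long, so on a 32-bit kernel an sb= value above 2^32-1 wraps before it ever reaches the wider type. A user-space sketch of that failure mode, with the uint32_t cast modelling a 32-bit unsigned long (this is ordinary libc code, not the kernel helpers):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

int main(void)
{
	const char *opt = "8589934592";	/* 2^33: needs more than 32 bits */
	/* the cast models simple_strtoul() accumulating into a 32-bit
	 * unsigned long: the value wraps modulo 2^32, here to 0 */
	uint64_t wrapped = (uint32_t)strtoull(opt, NULL, 0);
	uint64_t full = strtoull(opt, NULL, 0);

	printf("32-bit parse: %llu  64-bit parse: %llu\n",
	       (unsigned long long)wrapped, (unsigned long long)full);
	return 0;
}
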
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e8d60bf6b7df..a44a0562203a 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -225,7 +225,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
225 error = -ENODATA; 225 error = -ENODATA;
226 if (!EXT3_I(inode)->i_file_acl) 226 if (!EXT3_I(inode)->i_file_acl)
227 goto cleanup; 227 goto cleanup;
228 ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); 228 ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
229 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); 229 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
230 if (!bh) 230 if (!bh)
231 goto cleanup; 231 goto cleanup;
@@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
233 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 233 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
234 if (ext3_xattr_check_block(bh)) { 234 if (ext3_xattr_check_block(bh)) {
235bad_block: ext3_error(inode->i_sb, __FUNCTION__, 235bad_block: ext3_error(inode->i_sb, __FUNCTION__,
236 "inode %ld: bad block %d", inode->i_ino, 236 "inode %ld: bad block "E3FSBLK, inode->i_ino,
237 EXT3_I(inode)->i_file_acl); 237 EXT3_I(inode)->i_file_acl);
238 error = -EIO; 238 error = -EIO;
239 goto cleanup; 239 goto cleanup;
@@ -366,7 +366,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
366 error = 0; 366 error = 0;
367 if (!EXT3_I(inode)->i_file_acl) 367 if (!EXT3_I(inode)->i_file_acl)
368 goto cleanup; 368 goto cleanup;
369 ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); 369 ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
370 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); 370 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
371 error = -EIO; 371 error = -EIO;
372 if (!bh) 372 if (!bh)
@@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
375 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 375 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
376 if (ext3_xattr_check_block(bh)) { 376 if (ext3_xattr_check_block(bh)) {
377 ext3_error(inode->i_sb, __FUNCTION__, 377 ext3_error(inode->i_sb, __FUNCTION__,
378 "inode %ld: bad block %d", inode->i_ino, 378 "inode %ld: bad block "E3FSBLK, inode->i_ino,
379 EXT3_I(inode)->i_file_acl); 379 EXT3_I(inode)->i_file_acl);
380 error = -EIO; 380 error = -EIO;
381 goto cleanup; 381 goto cleanup;
@@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
647 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 647 le32_to_cpu(BHDR(bs->bh)->h_refcount));
648 if (ext3_xattr_check_block(bs->bh)) { 648 if (ext3_xattr_check_block(bs->bh)) {
649 ext3_error(sb, __FUNCTION__, 649 ext3_error(sb, __FUNCTION__,
650 "inode %ld: bad block %d", inode->i_ino, 650 "inode %ld: bad block "E3FSBLK, inode->i_ino,
651 EXT3_I(inode)->i_file_acl); 651 EXT3_I(inode)->i_file_acl);
652 error = -EIO; 652 error = -EIO;
653 goto cleanup; 653 goto cleanup;
@@ -792,11 +792,12 @@ inserted:
792 get_bh(new_bh); 792 get_bh(new_bh);
793 } else { 793 } else {
794 /* We need to allocate a new block */ 794 /* We need to allocate a new block */
795 int goal = le32_to_cpu( 795 ext3_fsblk_t goal = le32_to_cpu(
796 EXT3_SB(sb)->s_es->s_first_data_block) + 796 EXT3_SB(sb)->s_es->s_first_data_block) +
797 EXT3_I(inode)->i_block_group * 797 (ext3_fsblk_t)EXT3_I(inode)->i_block_group *
798 EXT3_BLOCKS_PER_GROUP(sb); 798 EXT3_BLOCKS_PER_GROUP(sb);
799 int block = ext3_new_block(handle, inode, goal, &error); 799 ext3_fsblk_t block = ext3_new_block(handle, inode,
800 goal, &error);
800 if (error) 801 if (error)
801 goto cleanup; 802 goto cleanup;
802 ea_idebug(inode, "creating block %d", block); 803 ea_idebug(inode, "creating block %d", block);
@@ -847,7 +848,7 @@ cleanup_dquot:
847 848
848bad_block: 849bad_block:
849 ext3_error(inode->i_sb, __FUNCTION__, 850 ext3_error(inode->i_sb, __FUNCTION__,
850 "inode %ld: bad block %d", inode->i_ino, 851 "inode %ld: bad block "E3FSBLK, inode->i_ino,
851 EXT3_I(inode)->i_file_acl); 852 EXT3_I(inode)->i_file_acl);
852 goto cleanup; 853 goto cleanup;
853 854
@@ -1076,14 +1077,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
1076 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); 1077 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
1077 if (!bh) { 1078 if (!bh) {
1078 ext3_error(inode->i_sb, __FUNCTION__, 1079 ext3_error(inode->i_sb, __FUNCTION__,
1079 "inode %ld: block %d read error", inode->i_ino, 1080 "inode %ld: block "E3FSBLK" read error", inode->i_ino,
1080 EXT3_I(inode)->i_file_acl); 1081 EXT3_I(inode)->i_file_acl);
1081 goto cleanup; 1082 goto cleanup;
1082 } 1083 }
1083 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || 1084 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
1084 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1085 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1085 ext3_error(inode->i_sb, __FUNCTION__, 1086 ext3_error(inode->i_sb, __FUNCTION__,
1086 "inode %ld: bad block %d", inode->i_ino, 1087 "inode %ld: bad block "E3FSBLK, inode->i_ino,
1087 EXT3_I(inode)->i_file_acl); 1088 EXT3_I(inode)->i_file_acl);
1088 goto cleanup; 1089 goto cleanup;
1089 } 1090 }
@@ -1210,11 +1211,11 @@ again:
1210 bh = sb_bread(inode->i_sb, ce->e_block); 1211 bh = sb_bread(inode->i_sb, ce->e_block);
1211 if (!bh) { 1212 if (!bh) {
1212 ext3_error(inode->i_sb, __FUNCTION__, 1213 ext3_error(inode->i_sb, __FUNCTION__,
1213 "inode %ld: block %ld read error", 1214 "inode %ld: block %lu read error",
1214 inode->i_ino, (unsigned long) ce->e_block); 1215 inode->i_ino, (unsigned long) ce->e_block);
1215 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1216 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1216 EXT3_XATTR_REFCOUNT_MAX) { 1217 EXT3_XATTR_REFCOUNT_MAX) {
1217 ea_idebug(inode, "block %ld refcount %d>=%d", 1218 ea_idebug(inode, "block %lu refcount %d>=%d",
1218 (unsigned long) ce->e_block, 1219 (unsigned long) ce->e_block,
1219 le32_to_cpu(BHDR(bh)->h_refcount), 1220 le32_to_cpu(BHDR(bh)->h_refcount),
1220 EXT3_XATTR_REFCOUNT_MAX); 1221 EXT3_XATTR_REFCOUNT_MAX);
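
The xattr change is more than printf polish: the allocation goal and the newly allocated block were previously held in int, so on a filesystem with more than 2^31 blocks any block number in the upper half no longer fits, and the (ext3_fsblk_t) cast ensures the group multiply is carried out in the wide type rather than narrowed first. A self-contained illustration follows; the values are made up, and the out-of-range conversion to int is implementation-defined in C (shown here as it behaves on common two's-complement targets):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t block_group = 70000;		/* plausible on a huge fs */
	uint64_t blocks_per_group = 32768;	/* 4 KiB block size */

	/* 70000 * 32768 = 2293760000 > INT_MAX: does not fit in int */
	int goal_as_int = (int)(block_group * blocks_per_group);
	uint64_t goal_wide = (uint64_t)block_group * blocks_per_group;

	printf("int goal: %d  wide goal: %llu\n",
	       goal_as_int, (unsigned long long)goal_wide);
	return 0;
}
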
diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h
index 583bd78086d8..d35979a58743 100644
--- a/fs/freevxfs/vxfs.h
+++ b/fs/freevxfs/vxfs.h
@@ -159,11 +159,11 @@ struct vxfs_sb {
159 * In core superblock filesystem private data for VxFS. 159 * In core superblock filesystem private data for VxFS.
160 */ 160 */
161struct vxfs_sb_info { 161struct vxfs_sb_info {
162 struct vxfs_sb *vsi_raw; /* raw (on disk) supeblock */ 162 struct vxfs_sb *vsi_raw; /* raw (on disk) superblock */
163 struct buffer_head *vsi_bp; /* buffer for raw superblock*/ 163 struct buffer_head *vsi_bp; /* buffer for raw superblock*/
164 struct inode *vsi_fship; /* fileset header inode */ 164 struct inode *vsi_fship; /* fileset header inode */
165 struct inode *vsi_ilist; /* inode list inode */ 165 struct inode *vsi_ilist; /* inode list inode */
166 struct inode *vsi_stilist; /* structual inode list inode */ 166 struct inode *vsi_stilist; /* structural inode list inode */
167 u_long vsi_iext; /* initial inode list */ 167 u_long vsi_iext; /* initial inode list */
168 ino_t vsi_fshino; /* fileset header inode */ 168 ino_t vsi_fshino; /* fileset header inode */
169 daddr_t vsi_oltext; /* OLT extent */ 169 daddr_t vsi_oltext; /* OLT extent */
diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c
index 6dee109aeea4..78948b4b1894 100644
--- a/fs/freevxfs/vxfs_fshead.c
+++ b/fs/freevxfs/vxfs_fshead.c
@@ -112,7 +112,7 @@ vxfs_read_fshead(struct super_block *sbp)
112 112
113 vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino); 113 vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino);
114 if (!vip) { 114 if (!vip) {
115 printk(KERN_ERR "vxfs: unabled to read fsh inode\n"); 115 printk(KERN_ERR "vxfs: unable to read fsh inode\n");
116 return -EINVAL; 116 return -EINVAL;
117 } 117 }
118 if (!VXFS_ISFSH(vip)) { 118 if (!VXFS_ISFSH(vip)) {
@@ -129,13 +129,13 @@ vxfs_read_fshead(struct super_block *sbp)
129 129
130 infp->vsi_fship = vxfs_get_fake_inode(sbp, vip); 130 infp->vsi_fship = vxfs_get_fake_inode(sbp, vip);
131 if (!infp->vsi_fship) { 131 if (!infp->vsi_fship) {
132 printk(KERN_ERR "vxfs: unabled to get fsh inode\n"); 132 printk(KERN_ERR "vxfs: unable to get fsh inode\n");
133 goto out_free_fship; 133 goto out_free_fship;
134 } 134 }
135 135
136 sfp = vxfs_getfsh(infp->vsi_fship, 0); 136 sfp = vxfs_getfsh(infp->vsi_fship, 0);
137 if (!sfp) { 137 if (!sfp) {
138 printk(KERN_ERR "vxfs: unabled to get structural fsh\n"); 138 printk(KERN_ERR "vxfs: unable to get structural fsh\n");
139 goto out_iput_fship; 139 goto out_iput_fship;
140 } 140 }
141 141
@@ -145,7 +145,7 @@ vxfs_read_fshead(struct super_block *sbp)
145 145
146 pfp = vxfs_getfsh(infp->vsi_fship, 1); 146 pfp = vxfs_getfsh(infp->vsi_fship, 1);
147 if (!pfp) { 147 if (!pfp) {
148 printk(KERN_ERR "vxfs: unabled to get primary fsh\n"); 148 printk(KERN_ERR "vxfs: unable to get primary fsh\n");
149 goto out_free_sfp; 149 goto out_free_sfp;
150 } 150 }
151 151
@@ -159,7 +159,7 @@ vxfs_read_fshead(struct super_block *sbp)
159 159
160 infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip); 160 infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip);
161 if (!infp->vsi_stilist) { 161 if (!infp->vsi_stilist) {
162 printk(KERN_ERR "vxfs: unabled to get structual list inode\n"); 162 printk(KERN_ERR "vxfs: unable to get structural list inode\n");
163 kfree(tip); 163 kfree(tip);
164 goto out_free_pfp; 164 goto out_free_pfp;
165 } 165 }
@@ -174,7 +174,7 @@ vxfs_read_fshead(struct super_block *sbp)
174 goto out_iput_stilist; 174 goto out_iput_stilist;
175 infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip); 175 infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip);
176 if (!infp->vsi_ilist) { 176 if (!infp->vsi_ilist) {
177 printk(KERN_ERR "vxfs: unabled to get inode list inode\n"); 177 printk(KERN_ERR "vxfs: unable to get inode list inode\n");
178 kfree(tip); 178 kfree(tip);
179 goto out_iput_stilist; 179 goto out_iput_stilist;
180 } 180 }
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index c3e1f760cac9..72437065f6ad 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_FUSE_FS) += fuse.o 5obj-$(CONFIG_FUSE_FS) += fuse.o
6 6
7fuse-objs := dev.o dir.o file.o inode.o 7fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
new file mode 100644
index 000000000000..a3bce3a77253
--- /dev/null
+++ b/fs/fuse/control.c
@@ -0,0 +1,218 @@
1/*
2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
4
5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING.
7*/
8
9#include "fuse_i.h"
10
11#include <linux/init.h>
12#include <linux/module.h>
13
14#define FUSE_CTL_SUPER_MAGIC 0x65735543
15
16/*
17 * This is non-NULL when the single instance of the control filesystem
18 * exists. Protected by fuse_mutex
19 */
20static struct super_block *fuse_control_sb;
21
22static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
23{
24 struct fuse_conn *fc;
25 mutex_lock(&fuse_mutex);
26 fc = file->f_dentry->d_inode->u.generic_ip;
27 if (fc)
28 fc = fuse_conn_get(fc);
29 mutex_unlock(&fuse_mutex);
30 return fc;
31}
32
33static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf,
34 size_t count, loff_t *ppos)
35{
36 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
37 if (fc) {
38 fuse_abort_conn(fc);
39 fuse_conn_put(fc);
40 }
41 return count;
42}
43
44static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
45 size_t len, loff_t *ppos)
46{
47 char tmp[32];
48 size_t size;
49
50 if (!*ppos) {
51 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
52 if (!fc)
53 return 0;
54
55 file->private_data = (void *)(long)atomic_read(&fc->num_waiting);
56 fuse_conn_put(fc);
57 }
58 size = sprintf(tmp, "%ld\n", (long)file->private_data);
59 return simple_read_from_buffer(buf, len, ppos, tmp, size);
60}
61
62static const struct file_operations fuse_ctl_abort_ops = {
63 .open = nonseekable_open,
64 .write = fuse_conn_abort_write,
65};
66
67static const struct file_operations fuse_ctl_waiting_ops = {
68 .open = nonseekable_open,
69 .read = fuse_conn_waiting_read,
70};
71
72static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
73 struct fuse_conn *fc,
74 const char *name,
75 int mode, int nlink,
76 struct inode_operations *iop,
77 const struct file_operations *fop)
78{
79 struct dentry *dentry;
80 struct inode *inode;
81
82 BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES);
83 dentry = d_alloc_name(parent, name);
84 if (!dentry)
85 return NULL;
86
87 fc->ctl_dentry[fc->ctl_ndents++] = dentry;
88 inode = new_inode(fuse_control_sb);
89 if (!inode)
90 return NULL;
91
92 inode->i_mode = mode;
93 inode->i_uid = fc->user_id;
94 inode->i_gid = fc->group_id;
95 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
96 /* setting ->i_op to NULL is not allowed */
97 if (iop)
98 inode->i_op = iop;
99 inode->i_fop = fop;
100 inode->i_nlink = nlink;
101 inode->u.generic_ip = fc;
102 d_add(dentry, inode);
103 return dentry;
104}
105
106/*
107 * Add a connection to the control filesystem (if it exists). Caller
108 * must host fuse_mutex
109 */
110int fuse_ctl_add_conn(struct fuse_conn *fc)
111{
112 struct dentry *parent;
113 char name[32];
114
115 if (!fuse_control_sb)
116 return 0;
117
118 parent = fuse_control_sb->s_root;
119 parent->d_inode->i_nlink++;
120 sprintf(name, "%llu", (unsigned long long) fc->id);
121 parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
122 &simple_dir_inode_operations,
123 &simple_dir_operations);
124 if (!parent)
125 goto err;
126
127 if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
128 NULL, &fuse_ctl_waiting_ops) ||
129 !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
130 NULL, &fuse_ctl_abort_ops))
131 goto err;
132
133 return 0;
134
135 err:
136 fuse_ctl_remove_conn(fc);
137 return -ENOMEM;
138}
139
140/*
141 * Remove a connection from the control filesystem (if it exists).
142 * Caller must host fuse_mutex
143 */
144void fuse_ctl_remove_conn(struct fuse_conn *fc)
145{
146 int i;
147
148 if (!fuse_control_sb)
149 return;
150
151 for (i = fc->ctl_ndents - 1; i >= 0; i--) {
152 struct dentry *dentry = fc->ctl_dentry[i];
153 dentry->d_inode->u.generic_ip = NULL;
154 d_drop(dentry);
155 dput(dentry);
156 }
157 fuse_control_sb->s_root->d_inode->i_nlink--;
158}
159
160static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
161{
162 struct tree_descr empty_descr = {""};
163 struct fuse_conn *fc;
164 int err;
165
166 err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr);
167 if (err)
168 return err;
169
170 mutex_lock(&fuse_mutex);
171 BUG_ON(fuse_control_sb);
172 fuse_control_sb = sb;
173 list_for_each_entry(fc, &fuse_conn_list, entry) {
174 err = fuse_ctl_add_conn(fc);
175 if (err) {
176 fuse_control_sb = NULL;
177 mutex_unlock(&fuse_mutex);
178 return err;
179 }
180 }
181 mutex_unlock(&fuse_mutex);
182
183 return 0;
184}
185
186static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags,
187 const char *dev_name, void *raw_data,
188 struct vfsmount *mnt)
189{
190 return get_sb_single(fs_type, flags, raw_data,
191 fuse_ctl_fill_super, mnt);
192}
193
194static void fuse_ctl_kill_sb(struct super_block *sb)
195{
196 mutex_lock(&fuse_mutex);
197 fuse_control_sb = NULL;
198 mutex_unlock(&fuse_mutex);
199
200 kill_litter_super(sb);
201}
202
203static struct file_system_type fuse_ctl_fs_type = {
204 .owner = THIS_MODULE,
205 .name = "fusectl",
206 .get_sb = fuse_ctl_get_sb,
207 .kill_sb = fuse_ctl_kill_sb,
208};
209
210int __init fuse_ctl_init(void)
211{
212 return register_filesystem(&fuse_ctl_fs_type);
213}
214
215void fuse_ctl_cleanup(void)
216{
217 unregister_filesystem(&fuse_ctl_fs_type);
218}
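
control.c implements a tiny single-instance filesystem ("fusectl") exposing one directory per connection, named by fc->id, each holding a read-only "waiting" file (requests blocked on the daemon) and a write-only "abort" file (any write tears the connection down, the escape hatch mentioned in fuse_get_req_nofail() below). A hedged user-space sketch of driving it; /sys/fs/fuse/connections is the conventional mount point rather than something this patch mandates, and the connection id 42 is made up:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	ssize_t n;
	/* assumes: mount -t fusectl none /sys/fs/fuse/connections */
	int fd = open("/sys/fs/fuse/connections/42/waiting", O_RDONLY);

	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("waiting requests: %s", buf);
		}
		close(fd);
	}

	/* any write to "abort" ends up in fuse_abort_conn() */
	fd = open("/sys/fs/fuse/connections/42/abort", O_WRONLY);
	if (fd >= 0) {
		write(fd, "1", 1);
		close(fd);
	}
	return 0;
}
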
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 104a62dadb94..1e2006caf158 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -34,6 +34,7 @@ static void fuse_request_init(struct fuse_req *req)
34{ 34{
35 memset(req, 0, sizeof(*req)); 35 memset(req, 0, sizeof(*req));
36 INIT_LIST_HEAD(&req->list); 36 INIT_LIST_HEAD(&req->list);
37 INIT_LIST_HEAD(&req->intr_entry);
37 init_waitqueue_head(&req->waitq); 38 init_waitqueue_head(&req->waitq);
38 atomic_set(&req->count, 1); 39 atomic_set(&req->count, 1);
39} 40}
@@ -64,18 +65,6 @@ static void restore_sigs(sigset_t *oldset)
64 sigprocmask(SIG_SETMASK, oldset, NULL); 65 sigprocmask(SIG_SETMASK, oldset, NULL);
65} 66}
66 67
67/*
68 * Reset request, so that it can be reused
69 *
70 * The caller must be _very_ careful to make sure, that it is holding
71 * the only reference to req
72 */
73void fuse_reset_request(struct fuse_req *req)
74{
75 BUG_ON(atomic_read(&req->count) != 1);
76 fuse_request_init(req);
77}
78
79static void __fuse_get_request(struct fuse_req *req) 68static void __fuse_get_request(struct fuse_req *req)
80{ 69{
81 atomic_inc(&req->count); 70 atomic_inc(&req->count);
@@ -88,6 +77,13 @@ static void __fuse_put_request(struct fuse_req *req)
88 atomic_dec(&req->count); 77 atomic_dec(&req->count);
89} 78}
90 79
80static void fuse_req_init_context(struct fuse_req *req)
81{
82 req->in.h.uid = current->fsuid;
83 req->in.h.gid = current->fsgid;
84 req->in.h.pid = current->pid;
85}
86
91struct fuse_req *fuse_get_req(struct fuse_conn *fc) 87struct fuse_req *fuse_get_req(struct fuse_conn *fc)
92{ 88{
93 struct fuse_req *req; 89 struct fuse_req *req;
@@ -103,14 +99,16 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
103 if (intr) 99 if (intr)
104 goto out; 100 goto out;
105 101
102 err = -ENOTCONN;
103 if (!fc->connected)
104 goto out;
105
106 req = fuse_request_alloc(); 106 req = fuse_request_alloc();
107 err = -ENOMEM; 107 err = -ENOMEM;
108 if (!req) 108 if (!req)
109 goto out; 109 goto out;
110 110
111 req->in.h.uid = current->fsuid; 111 fuse_req_init_context(req);
112 req->in.h.gid = current->fsgid;
113 req->in.h.pid = current->pid;
114 req->waiting = 1; 112 req->waiting = 1;
115 return req; 113 return req;
116 114
@@ -119,142 +117,183 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
119 return ERR_PTR(err); 117 return ERR_PTR(err);
120} 118}
121 119
122void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) 120/*
121 * Return request in fuse_file->reserved_req. However that may
122 * currently be in use. If that is the case, wait for it to become
123 * available.
124 */
125static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
126 struct file *file)
123{ 127{
124 if (atomic_dec_and_test(&req->count)) { 128 struct fuse_req *req = NULL;
125 if (req->waiting) 129 struct fuse_file *ff = file->private_data;
126 atomic_dec(&fc->num_waiting); 130
127 fuse_request_free(req); 131 do {
128 } 132 wait_event(fc->blocked_waitq, ff->reserved_req);
133 spin_lock(&fc->lock);
134 if (ff->reserved_req) {
135 req = ff->reserved_req;
136 ff->reserved_req = NULL;
137 get_file(file);
138 req->stolen_file = file;
139 }
140 spin_unlock(&fc->lock);
141 } while (!req);
142
143 return req;
129} 144}
130 145
131/* 146/*
132 * Called with sbput_sem held for read (request_end) or write 147 * Put stolen request back into fuse_file->reserved_req
133 * (fuse_put_super). By the time fuse_put_super() is finished, all
134 * inodes belonging to background requests must be released, so the
135 * iputs have to be done within the locked region.
136 */ 148 */
137void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) 149static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
138{ 150{
139 iput(req->inode); 151 struct file *file = req->stolen_file;
140 iput(req->inode2); 152 struct fuse_file *ff = file->private_data;
153
141 spin_lock(&fc->lock); 154 spin_lock(&fc->lock);
142 list_del(&req->bg_entry); 155 fuse_request_init(req);
143 if (fc->num_background == FUSE_MAX_BACKGROUND) { 156 BUG_ON(ff->reserved_req);
144 fc->blocked = 0; 157 ff->reserved_req = req;
145 wake_up_all(&fc->blocked_waitq); 158 wake_up(&fc->blocked_waitq);
146 }
147 fc->num_background--;
148 spin_unlock(&fc->lock); 159 spin_unlock(&fc->lock);
160 fput(file);
149} 161}
150 162
151/* 163/*
152 * This function is called when a request is finished. Either a reply 164 * Gets a request for a file operation, always succeeds
153 * has arrived or it was interrupted (and not yet sent) or some error
154 * occurred during communication with userspace, or the device file
155 * was closed. In case of a background request the reference to the
156 * stored objects are released. The requester thread is woken up (if
157 * still waiting), the 'end' callback is called if given, else the
158 * reference to the request is released
159 * 165 *
160 * Releasing extra reference for foreground requests must be done 166 * This is used for sending the FLUSH request, which must get to
161 * within the same locked region as setting state to finished. This 167 * userspace, due to POSIX locks which may need to be unlocked.
162 * is because fuse_reset_request() may be called after request is
163 * finished and it must be the sole possessor. If request is
164 * interrupted and put in the background, it will return with an error
165 * and hence never be reset and reused.
166 * 168 *
167 * Called with fc->lock, unlocks it 169 * If allocation fails due to OOM, use the reserved request in
170 * fuse_file.
171 *
172 * This is very unlikely to deadlock accidentally, since the
173 * filesystem should not have it's own file open. If deadlock is
174 * intentional, it can still be broken by "aborting" the filesystem.
168 */ 175 */
169static void request_end(struct fuse_conn *fc, struct fuse_req *req) 176struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file)
170{ 177{
171 list_del(&req->list); 178 struct fuse_req *req;
172 req->state = FUSE_REQ_FINISHED;
173 if (!req->background) {
174 spin_unlock(&fc->lock);
175 wake_up(&req->waitq);
176 fuse_put_request(fc, req);
177 } else {
178 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
179 req->end = NULL;
180 spin_unlock(&fc->lock);
181 down_read(&fc->sbput_sem);
182 if (fc->mounted)
183 fuse_release_background(fc, req);
184 up_read(&fc->sbput_sem);
185 179
186 /* fput must go outside sbput_sem, otherwise it can deadlock */ 180 atomic_inc(&fc->num_waiting);
187 if (req->file) 181 wait_event(fc->blocked_waitq, !fc->blocked);
188 fput(req->file); 182 req = fuse_request_alloc();
183 if (!req)
184 req = get_reserved_req(fc, file);
189 185
190 if (end) 186 fuse_req_init_context(req);
191 end(fc, req); 187 req->waiting = 1;
188 return req;
189}
190
191void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
192{
193 if (atomic_dec_and_test(&req->count)) {
194 if (req->waiting)
195 atomic_dec(&fc->num_waiting);
196
197 if (req->stolen_file)
198 put_reserved_req(fc, req);
192 else 199 else
193 fuse_put_request(fc, req); 200 fuse_request_free(req);
194 } 201 }
195} 202}
196 203
197/* 204/*
198 * Unfortunately request interruption not just solves the deadlock 205 * This function is called when a request is finished. Either a reply
199 * problem, it causes problems too. These stem from the fact, that an 206 * has arrived or it was aborted (and not yet sent) or some error
200 * interrupted request is continued to be processed in userspace, 207 * occurred during communication with userspace, or the device file
201 * while all the locks and object references (inode and file) held 208 * was closed. The requester thread is woken up (if still waiting),
202 * during the operation are released. 209 * the 'end' callback is called if given, else the reference to the
203 * 210 * request is released
204 * To release the locks is exactly why there's a need to interrupt the
205 * request, so there's not a lot that can be done about this, except
206 * introduce additional locking in userspace.
207 *
208 * More important is to keep inode and file references until userspace
209 * has replied, otherwise FORGET and RELEASE could be sent while the
210 * inode/file is still used by the filesystem.
211 *
212 * For this reason the concept of "background" request is introduced.
213 * An interrupted request is backgrounded if it has been already sent
214 * to userspace. Backgrounding involves getting an extra reference to
215 * inode(s) or file used in the request, and adding the request to
216 * fc->background list. When a reply is received for a background
217 * request, the object references are released, and the request is
218 * removed from the list. If the filesystem is unmounted while there
219 * are still background requests, the list is walked and references
220 * are released as if a reply was received.
221 * 211 *
222 * There's one more use for a background request. The RELEASE message is 212 * Called with fc->lock, unlocks it
223 * always sent as background, since it doesn't return an error or
224 * data.
225 */ 213 */
226static void background_request(struct fuse_conn *fc, struct fuse_req *req) 214static void request_end(struct fuse_conn *fc, struct fuse_req *req)
227{ 215{
228 req->background = 1; 216 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
229 list_add(&req->bg_entry, &fc->background); 217 req->end = NULL;
230 fc->num_background++; 218 list_del(&req->list);
231 if (fc->num_background == FUSE_MAX_BACKGROUND) 219 list_del(&req->intr_entry);
232 fc->blocked = 1; 220 req->state = FUSE_REQ_FINISHED;
233 if (req->inode) 221 if (req->background) {
234 req->inode = igrab(req->inode); 222 if (fc->num_background == FUSE_MAX_BACKGROUND) {
235 if (req->inode2) 223 fc->blocked = 0;
236 req->inode2 = igrab(req->inode2); 224 wake_up_all(&fc->blocked_waitq);
225 }
226 fc->num_background--;
227 }
228 spin_unlock(&fc->lock);
229 dput(req->dentry);
230 mntput(req->vfsmount);
237 if (req->file) 231 if (req->file)
238 get_file(req->file); 232 fput(req->file);
233 wake_up(&req->waitq);
234 if (end)
235 end(fc, req);
236 else
237 fuse_put_request(fc, req);
239} 238}
240 239
241/* Called with fc->lock held. Releases, and then reacquires it. */ 240static void wait_answer_interruptible(struct fuse_conn *fc,
242static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) 241 struct fuse_req *req)
243{ 242{
244 sigset_t oldset; 243 if (signal_pending(current))
244 return;
245 245
246 spin_unlock(&fc->lock); 246 spin_unlock(&fc->lock);
247 block_sigs(&oldset);
248 wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); 247 wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
249 restore_sigs(&oldset);
250 spin_lock(&fc->lock); 248 spin_lock(&fc->lock);
251 if (req->state == FUSE_REQ_FINISHED && !req->interrupted) 249}
252 return; 250
251static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
252{
253 list_add_tail(&req->intr_entry, &fc->interrupts);
254 wake_up(&fc->waitq);
255 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
256}
257
258/* Called with fc->lock held. Releases, and then reacquires it. */
259static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
260{
261 if (!fc->no_interrupt) {
262 /* Any signal may interrupt this */
263 wait_answer_interruptible(fc, req);
264
265 if (req->aborted)
266 goto aborted;
267 if (req->state == FUSE_REQ_FINISHED)
268 return;
253 269
254 if (!req->interrupted) {
255 req->out.h.error = -EINTR;
256 req->interrupted = 1; 270 req->interrupted = 1;
271 if (req->state == FUSE_REQ_SENT)
272 queue_interrupt(fc, req);
273 }
274
275 if (req->force) {
276 spin_unlock(&fc->lock);
277 wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
278 spin_lock(&fc->lock);
279 } else {
280 sigset_t oldset;
281
282 /* Only fatal signals may interrupt this */
283 block_sigs(&oldset);
284 wait_answer_interruptible(fc, req);
285 restore_sigs(&oldset);
257 } 286 }
287
288 if (req->aborted)
289 goto aborted;
290 if (req->state == FUSE_REQ_FINISHED)
291 return;
292
293 req->out.h.error = -EINTR;
294 req->aborted = 1;
295
296 aborted:
258 if (req->locked) { 297 if (req->locked) {
259 /* This is uninterruptible sleep, because data is 298 /* This is uninterruptible sleep, because data is
260 being copied to/from the buffers of req. During 299 being copied to/from the buffers of req. During
@@ -268,8 +307,11 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
268 if (req->state == FUSE_REQ_PENDING) { 307 if (req->state == FUSE_REQ_PENDING) {
269 list_del(&req->list); 308 list_del(&req->list);
270 __fuse_put_request(req); 309 __fuse_put_request(req);
271 } else if (req->state == FUSE_REQ_SENT) 310 } else if (req->state == FUSE_REQ_SENT) {
272 background_request(fc, req); 311 spin_unlock(&fc->lock);
312 wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
313 spin_lock(&fc->lock);
314 }
273} 315}
274 316
275static unsigned len_args(unsigned numargs, struct fuse_arg *args) 317static unsigned len_args(unsigned numargs, struct fuse_arg *args)
@@ -283,13 +325,19 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
283 return nbytes; 325 return nbytes;
284} 326}
285 327
328static u64 fuse_get_unique(struct fuse_conn *fc)
329 {
330 fc->reqctr++;
331 /* zero is special */
332 if (fc->reqctr == 0)
333 fc->reqctr = 1;
334
335 return fc->reqctr;
336}
337
286static void queue_request(struct fuse_conn *fc, struct fuse_req *req) 338static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
287{ 339{
288 fc->reqctr++; 340 req->in.h.unique = fuse_get_unique(fc);
289 /* zero is special */
290 if (fc->reqctr == 0)
291 fc->reqctr = 1;
292 req->in.h.unique = fc->reqctr;
293 req->in.h.len = sizeof(struct fuse_in_header) + 341 req->in.h.len = sizeof(struct fuse_in_header) +
294 len_args(req->in.numargs, (struct fuse_arg *) req->in.args); 342 len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
295 list_add_tail(&req->list, &fc->pending); 343 list_add_tail(&req->list, &fc->pending);
@@ -302,9 +350,6 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
302 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 350 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
303} 351}
304 352
305/*
306 * This can only be interrupted by a SIGKILL
307 */
308void request_send(struct fuse_conn *fc, struct fuse_req *req) 353void request_send(struct fuse_conn *fc, struct fuse_req *req)
309{ 354{
310 req->isreply = 1; 355 req->isreply = 1;
@@ -327,8 +372,12 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
327static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) 372static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
328{ 373{
329 spin_lock(&fc->lock); 374 spin_lock(&fc->lock);
330 background_request(fc, req);
331 if (fc->connected) { 375 if (fc->connected) {
376 req->background = 1;
377 fc->num_background++;
378 if (fc->num_background == FUSE_MAX_BACKGROUND)
379 fc->blocked = 1;
380
332 queue_request(fc, req); 381 queue_request(fc, req);
333 spin_unlock(&fc->lock); 382 spin_unlock(&fc->lock);
334 } else { 383 } else {
@@ -352,14 +401,14 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
352/* 401/*
353 * Lock the request. Up to the next unlock_request() there mustn't be 402 * Lock the request. Up to the next unlock_request() there mustn't be
354 * anything that could cause a page-fault. If the request was already 403 * anything that could cause a page-fault. If the request was already
355 * interrupted bail out. 404 * aborted bail out.
356 */ 405 */
357static int lock_request(struct fuse_conn *fc, struct fuse_req *req) 406static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
358{ 407{
359 int err = 0; 408 int err = 0;
360 if (req) { 409 if (req) {
361 spin_lock(&fc->lock); 410 spin_lock(&fc->lock);
362 if (req->interrupted) 411 if (req->aborted)
363 err = -ENOENT; 412 err = -ENOENT;
364 else 413 else
365 req->locked = 1; 414 req->locked = 1;
@@ -369,7 +418,7 @@ static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
369} 418}
370 419
371/* 420/*
372 * Unlock request. If it was interrupted during being locked, the 421 * Unlock request. If it was aborted during being locked, the
373 * requester thread is currently waiting for it to be unlocked, so 422 * requester thread is currently waiting for it to be unlocked, so
374 * wake it up. 423 * wake it up.
375 */ 424 */
@@ -378,7 +427,7 @@ static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
378 if (req) { 427 if (req) {
379 spin_lock(&fc->lock); 428 spin_lock(&fc->lock);
380 req->locked = 0; 429 req->locked = 0;
381 if (req->interrupted) 430 if (req->aborted)
382 wake_up(&req->waitq); 431 wake_up(&req->waitq);
383 spin_unlock(&fc->lock); 432 spin_unlock(&fc->lock);
384 } 433 }
@@ -557,13 +606,18 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
557 return err; 606 return err;
558} 607}
559 608
609static int request_pending(struct fuse_conn *fc)
610{
611 return !list_empty(&fc->pending) || !list_empty(&fc->interrupts);
612}
613
560/* Wait until a request is available on the pending list */ 614/* Wait until a request is available on the pending list */
561static void request_wait(struct fuse_conn *fc) 615static void request_wait(struct fuse_conn *fc)
562{ 616{
563 DECLARE_WAITQUEUE(wait, current); 617 DECLARE_WAITQUEUE(wait, current);
564 618
565 add_wait_queue_exclusive(&fc->waitq, &wait); 619 add_wait_queue_exclusive(&fc->waitq, &wait);
566 while (fc->connected && list_empty(&fc->pending)) { 620 while (fc->connected && !request_pending(fc)) {
567 set_current_state(TASK_INTERRUPTIBLE); 621 set_current_state(TASK_INTERRUPTIBLE);
568 if (signal_pending(current)) 622 if (signal_pending(current))
569 break; 623 break;
@@ -577,11 +631,50 @@ static void request_wait(struct fuse_conn *fc)
577} 631}
578 632
579/* 633/*
634 * Transfer an interrupt request to userspace
635 *
636 * Unlike other requests this is assembled on demand, without a need
637 * to allocate a separate fuse_req structure.
638 *
639 * Called with fc->lock held, releases it
640 */
641static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
642 const struct iovec *iov, unsigned long nr_segs)
643{
644 struct fuse_copy_state cs;
645 struct fuse_in_header ih;
646 struct fuse_interrupt_in arg;
647 unsigned reqsize = sizeof(ih) + sizeof(arg);
648 int err;
649
650 list_del_init(&req->intr_entry);
651 req->intr_unique = fuse_get_unique(fc);
652 memset(&ih, 0, sizeof(ih));
653 memset(&arg, 0, sizeof(arg));
654 ih.len = reqsize;
655 ih.opcode = FUSE_INTERRUPT;
656 ih.unique = req->intr_unique;
657 arg.unique = req->in.h.unique;
658
659 spin_unlock(&fc->lock);
660 if (iov_length(iov, nr_segs) < reqsize)
661 return -EINVAL;
662
663 fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs);
664 err = fuse_copy_one(&cs, &ih, sizeof(ih));
665 if (!err)
666 err = fuse_copy_one(&cs, &arg, sizeof(arg));
667 fuse_copy_finish(&cs);
668
669 return err ? err : reqsize;
670}
671
672/*
580 * Read a single request into the userspace filesystem's buffer. This 673 * Read a single request into the userspace filesystem's buffer. This
581 * function waits until a request is available, then removes it from 674 * function waits until a request is available, then removes it from
582 * the pending list and copies request data to userspace buffer. If 675 * the pending list and copies request data to userspace buffer. If
583 * no reply is needed (FORGET) or request has been interrupted or 676 * no reply is needed (FORGET) or request has been aborted or there
584 * there was an error during the copying then it's finished by calling 677 * was an error during the copying then it's finished by calling
585 * request_end(). Otherwise add it to the processing list, and set 678 * request_end(). Otherwise add it to the processing list, and set
586 * the 'sent' flag. 679 * the 'sent' flag.
587 */ 680 */
@@ -601,7 +694,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
601 spin_lock(&fc->lock); 694 spin_lock(&fc->lock);
602 err = -EAGAIN; 695 err = -EAGAIN;
603 if ((file->f_flags & O_NONBLOCK) && fc->connected && 696 if ((file->f_flags & O_NONBLOCK) && fc->connected &&
604 list_empty(&fc->pending)) 697 !request_pending(fc))
605 goto err_unlock; 698 goto err_unlock;
606 699
607 request_wait(fc); 700 request_wait(fc);
@@ -609,9 +702,15 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
609 if (!fc->connected) 702 if (!fc->connected)
610 goto err_unlock; 703 goto err_unlock;
611 err = -ERESTARTSYS; 704 err = -ERESTARTSYS;
612 if (list_empty(&fc->pending)) 705 if (!request_pending(fc))
613 goto err_unlock; 706 goto err_unlock;
614 707
708 if (!list_empty(&fc->interrupts)) {
709 req = list_entry(fc->interrupts.next, struct fuse_req,
710 intr_entry);
711 return fuse_read_interrupt(fc, req, iov, nr_segs);
712 }
713
615 req = list_entry(fc->pending.next, struct fuse_req, list); 714 req = list_entry(fc->pending.next, struct fuse_req, list);
616 req->state = FUSE_REQ_READING; 715 req->state = FUSE_REQ_READING;
617 list_move(&req->list, &fc->io); 716 list_move(&req->list, &fc->io);
@@ -636,10 +735,10 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
636 fuse_copy_finish(&cs); 735 fuse_copy_finish(&cs);
637 spin_lock(&fc->lock); 736 spin_lock(&fc->lock);
638 req->locked = 0; 737 req->locked = 0;
639 if (!err && req->interrupted) 738 if (!err && req->aborted)
640 err = -ENOENT; 739 err = -ENOENT;
641 if (err) { 740 if (err) {
642 if (!req->interrupted) 741 if (!req->aborted)
643 req->out.h.error = -EIO; 742 req->out.h.error = -EIO;
644 request_end(fc, req); 743 request_end(fc, req);
645 return err; 744 return err;
@@ -649,6 +748,8 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
649 else { 748 else {
650 req->state = FUSE_REQ_SENT; 749 req->state = FUSE_REQ_SENT;
651 list_move_tail(&req->list, &fc->processing); 750 list_move_tail(&req->list, &fc->processing);
751 if (req->interrupted)
752 queue_interrupt(fc, req);
652 spin_unlock(&fc->lock); 753 spin_unlock(&fc->lock);
653 } 754 }
654 return reqsize; 755 return reqsize;
@@ -675,7 +776,7 @@ static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
675 list_for_each(entry, &fc->processing) { 776 list_for_each(entry, &fc->processing) {
676 struct fuse_req *req; 777 struct fuse_req *req;
677 req = list_entry(entry, struct fuse_req, list); 778 req = list_entry(entry, struct fuse_req, list);
678 if (req->in.h.unique == unique) 779 if (req->in.h.unique == unique || req->intr_unique == unique)
679 return req; 780 return req;
680 } 781 }
681 return NULL; 782 return NULL;
@@ -741,17 +842,33 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
741 goto err_unlock; 842 goto err_unlock;
742 843
743 req = request_find(fc, oh.unique); 844 req = request_find(fc, oh.unique);
744 err = -EINVAL;
745 if (!req) 845 if (!req)
746 goto err_unlock; 846 goto err_unlock;
747 847
748 if (req->interrupted) { 848 if (req->aborted) {
749 spin_unlock(&fc->lock); 849 spin_unlock(&fc->lock);
750 fuse_copy_finish(&cs); 850 fuse_copy_finish(&cs);
751 spin_lock(&fc->lock); 851 spin_lock(&fc->lock);
752 request_end(fc, req); 852 request_end(fc, req);
753 return -ENOENT; 853 return -ENOENT;
754 } 854 }
855 /* Is it an interrupt reply? */
856 if (req->intr_unique == oh.unique) {
857 err = -EINVAL;
858 if (nbytes != sizeof(struct fuse_out_header))
859 goto err_unlock;
860
861 if (oh.error == -ENOSYS)
862 fc->no_interrupt = 1;
863 else if (oh.error == -EAGAIN)
864 queue_interrupt(fc, req);
865
866 spin_unlock(&fc->lock);
867 fuse_copy_finish(&cs);
868 return nbytes;
869 }
870
871 req->state = FUSE_REQ_WRITING;
755 list_move(&req->list, &fc->io); 872 list_move(&req->list, &fc->io);
756 req->out.h = oh; 873 req->out.h = oh;
757 req->locked = 1; 874 req->locked = 1;
@@ -764,9 +881,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
764 spin_lock(&fc->lock); 881 spin_lock(&fc->lock);
765 req->locked = 0; 882 req->locked = 0;
766 if (!err) { 883 if (!err) {
767 if (req->interrupted) 884 if (req->aborted)
768 err = -ENOENT; 885 err = -ENOENT;
769 } else if (!req->interrupted) 886 } else if (!req->aborted)
770 req->out.h.error = -EIO; 887 req->out.h.error = -EIO;
771 request_end(fc, req); 888 request_end(fc, req);
772 889
@@ -800,7 +917,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
800 spin_lock(&fc->lock); 917 spin_lock(&fc->lock);
801 if (!fc->connected) 918 if (!fc->connected)
802 mask = POLLERR; 919 mask = POLLERR;
803 else if (!list_empty(&fc->pending)) 920 else if (request_pending(fc))
804 mask |= POLLIN | POLLRDNORM; 921 mask |= POLLIN | POLLRDNORM;
805 spin_unlock(&fc->lock); 922 spin_unlock(&fc->lock);
806 923
@@ -826,7 +943,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
826/* 943/*
827 * Abort requests under I/O 944 * Abort requests under I/O
828 * 945 *
829 * The requests are set to interrupted and finished, and the request 946 * The requests are set to aborted and finished, and the request
830 * waiter is woken up. This will make request_wait_answer() wait 947 * waiter is woken up. This will make request_wait_answer() wait
831 * until the request is unlocked and then return. 948 * until the request is unlocked and then return.
832 * 949 *
@@ -841,7 +958,7 @@ static void end_io_requests(struct fuse_conn *fc)
841 list_entry(fc->io.next, struct fuse_req, list); 958 list_entry(fc->io.next, struct fuse_req, list);
842 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; 959 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
843 960
844 req->interrupted = 1; 961 req->aborted = 1;
845 req->out.h.error = -ECONNABORTED; 962 req->out.h.error = -ECONNABORTED;
846 req->state = FUSE_REQ_FINISHED; 963 req->state = FUSE_REQ_FINISHED;
847 list_del_init(&req->list); 964 list_del_init(&req->list);
@@ -874,19 +991,20 @@ static void end_io_requests(struct fuse_conn *fc)
874 * onto the pending list is prevented by req->connected being false. 991 * onto the pending list is prevented by req->connected being false.
875 * 992 *
876 * Progression of requests under I/O to the processing list is 993 * Progression of requests under I/O to the processing list is
877 * prevented by the req->interrupted flag being true for these 994 * prevented by the req->aborted flag being true for these requests.
878 * requests. For this reason requests on the io list must be aborted 995 * For this reason requests on the io list must be aborted first.
879 * first.
880 */ 996 */
881void fuse_abort_conn(struct fuse_conn *fc) 997void fuse_abort_conn(struct fuse_conn *fc)
882{ 998{
883 spin_lock(&fc->lock); 999 spin_lock(&fc->lock);
884 if (fc->connected) { 1000 if (fc->connected) {
885 fc->connected = 0; 1001 fc->connected = 0;
1002 fc->blocked = 0;
886 end_io_requests(fc); 1003 end_io_requests(fc);
887 end_requests(fc, &fc->pending); 1004 end_requests(fc, &fc->pending);
888 end_requests(fc, &fc->processing); 1005 end_requests(fc, &fc->processing);
889 wake_up_all(&fc->waitq); 1006 wake_up_all(&fc->waitq);
1007 wake_up_all(&fc->blocked_waitq);
890 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 1008 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
891 } 1009 }
892 spin_unlock(&fc->lock); 1010 spin_unlock(&fc->lock);
@@ -902,7 +1020,7 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
902 end_requests(fc, &fc->processing); 1020 end_requests(fc, &fc->processing);
903 spin_unlock(&fc->lock); 1021 spin_unlock(&fc->lock);
904 fasync_helper(-1, file, 0, &fc->fasync); 1022 fasync_helper(-1, file, 0, &fc->fasync);
905 kobject_put(&fc->kobj); 1023 fuse_conn_put(fc);
906 } 1024 }
907 1025
908 return 0; 1026 return 0;
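
The dev.c rework replaces the old "background on interrupt" scheme with an explicit FUSE_INTERRUPT message: a signalled request stays queued, and the kernel sends a small interrupt request carrying the original's unique id (assembled on demand in fuse_read_interrupt() above). The daemon may answer the interrupt with -ENOSYS (interrupts unsupported, never sent again), with -EAGAIN (original not found yet, so the kernel requeues the interrupt), or simply abort the operation and reply -EINTR to the original request. A compilable toy sketch of the userspace side; the request table and reply path are stand-ins, and only struct fuse_interrupt_in mirrors <linux/fuse.h>:

#include <errno.h>
#include <stdio.h>
#include <stdint.h>

struct fuse_interrupt_in { uint64_t unique; };	/* as in <linux/fuse.h> */

static uint64_t in_flight = 7;	/* toy table: one pending request */

static int find_request(uint64_t unique)
{
	return unique == in_flight;
}

static void reply_error(uint64_t unique, int error)
{
	printf("reply: unique=%llu error=%d\n",
	       (unsigned long long)unique, error);
}

static void handle_interrupt(uint64_t intr_unique,
			     const struct fuse_interrupt_in *arg)
{
	if (!find_request(arg->unique)) {
		/* Original not seen yet; the kernel requeues on -EAGAIN
		 * (the oh.error == -EAGAIN branch in fuse_dev_writev). */
		reply_error(intr_unique, -EAGAIN);
		return;
	}
	/* Abort the operation and answer the *original* request;
	 * a handled interrupt itself gets no further reply. */
	reply_error(arg->unique, -EINTR);
}

int main(void)
{
	struct fuse_interrupt_in arg = { .unique = 7 };

	handle_interrupt(100, &arg);	/* 100: kernel-chosen intr_unique */
	return 0;
}
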
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8d7546e832e8..72a74cde6de8 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -79,7 +79,6 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
79{ 79{
80 req->in.h.opcode = FUSE_LOOKUP; 80 req->in.h.opcode = FUSE_LOOKUP;
81 req->in.h.nodeid = get_node_id(dir); 81 req->in.h.nodeid = get_node_id(dir);
82 req->inode = dir;
83 req->in.numargs = 1; 82 req->in.numargs = 1;
84 req->in.args[0].size = entry->d_name.len + 1; 83 req->in.args[0].size = entry->d_name.len + 1;
85 req->in.args[0].value = entry->d_name.name; 84 req->in.args[0].value = entry->d_name.name;
@@ -225,6 +224,20 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
225} 224}
226 225
227/* 226/*
227 * Synchronous release for the case when something goes wrong in CREATE_OPEN
228 */
229static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
230 u64 nodeid, int flags)
231{
232 struct fuse_req *req;
233
234 req = fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
235 req->force = 1;
236 request_send(fc, req);
237 fuse_put_request(fc, req);
238}
239
240/*
228 * Atomic create+open operation 241 * Atomic create+open operation
229 * 242 *
230 * If the filesystem doesn't support this, then fall back to separate 243 * If the filesystem doesn't support this, then fall back to separate
@@ -237,6 +250,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
237 struct inode *inode; 250 struct inode *inode;
238 struct fuse_conn *fc = get_fuse_conn(dir); 251 struct fuse_conn *fc = get_fuse_conn(dir);
239 struct fuse_req *req; 252 struct fuse_req *req;
253 struct fuse_req *forget_req;
240 struct fuse_open_in inarg; 254 struct fuse_open_in inarg;
241 struct fuse_open_out outopen; 255 struct fuse_open_out outopen;
242 struct fuse_entry_out outentry; 256 struct fuse_entry_out outentry;
@@ -247,9 +261,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
247 if (fc->no_create) 261 if (fc->no_create)
248 return -ENOSYS; 262 return -ENOSYS;
249 263
264 forget_req = fuse_get_req(fc);
265 if (IS_ERR(forget_req))
266 return PTR_ERR(forget_req);
267
250 req = fuse_get_req(fc); 268 req = fuse_get_req(fc);
269 err = PTR_ERR(req);
251 if (IS_ERR(req)) 270 if (IS_ERR(req))
252 return PTR_ERR(req); 271 goto out_put_forget_req;
253 272
254 err = -ENOMEM; 273 err = -ENOMEM;
255 ff = fuse_file_alloc(); 274 ff = fuse_file_alloc();
@@ -262,7 +281,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
262 inarg.mode = mode; 281 inarg.mode = mode;
263 req->in.h.opcode = FUSE_CREATE; 282 req->in.h.opcode = FUSE_CREATE;
264 req->in.h.nodeid = get_node_id(dir); 283 req->in.h.nodeid = get_node_id(dir);
265 req->inode = dir;
266 req->in.numargs = 2; 284 req->in.numargs = 2;
267 req->in.args[0].size = sizeof(inarg); 285 req->in.args[0].size = sizeof(inarg);
268 req->in.args[0].value = &inarg; 286 req->in.args[0].value = &inarg;
@@ -285,25 +303,23 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
285 if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) 303 if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid))
286 goto out_free_ff; 304 goto out_free_ff;
287 305
306 fuse_put_request(fc, req);
288 inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, 307 inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
289 &outentry.attr); 308 &outentry.attr);
290 err = -ENOMEM;
291 if (!inode) { 309 if (!inode) {
292 flags &= ~(O_CREAT | O_EXCL | O_TRUNC); 310 flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
293 ff->fh = outopen.fh; 311 ff->fh = outopen.fh;
294 /* Special release, with inode = NULL, this will 312 fuse_sync_release(fc, ff, outentry.nodeid, flags);
295 trigger a 'forget' request when the release is 313 fuse_send_forget(fc, forget_req, outentry.nodeid, 1);
296 complete */ 314 return -ENOMEM;
297 fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0);
298 goto out_put_request;
299 } 315 }
300 fuse_put_request(fc, req); 316 fuse_put_request(fc, forget_req);
301 d_instantiate(entry, inode); 317 d_instantiate(entry, inode);
302 fuse_change_timeout(entry, &outentry); 318 fuse_change_timeout(entry, &outentry);
303 file = lookup_instantiate_filp(nd, entry, generic_file_open); 319 file = lookup_instantiate_filp(nd, entry, generic_file_open);
304 if (IS_ERR(file)) { 320 if (IS_ERR(file)) {
305 ff->fh = outopen.fh; 321 ff->fh = outopen.fh;
306 fuse_send_release(fc, ff, outentry.nodeid, inode, flags, 0); 322 fuse_sync_release(fc, ff, outentry.nodeid, flags);
307 return PTR_ERR(file); 323 return PTR_ERR(file);
308 } 324 }
309 fuse_finish_open(inode, file, ff, &outopen); 325 fuse_finish_open(inode, file, ff, &outopen);
@@ -313,6 +329,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
313 fuse_file_free(ff); 329 fuse_file_free(ff);
314 out_put_request: 330 out_put_request:
315 fuse_put_request(fc, req); 331 fuse_put_request(fc, req);
332 out_put_forget_req:
333 fuse_put_request(fc, forget_req);
316 return err; 334 return err;
317} 335}
318 336
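The reworked fuse_create_open() above reserves a second request (forget_req) before anything is sent: if fuse_iget() fails after the server has already created and opened the file, a FORGET must still go out, and at that point an allocation failure would be unrecoverable. A minimal userspace sketch of this reserve-the-cleanup-resource-first pattern (names and the malloc stand-in are illustrative, not kernel code):

#include <stdlib.h>

struct req { const char *what; };

static struct req *get_req(const char *what)
{
        struct req *r = malloc(sizeof(*r));
        if (r)
                r->what = what;
        return r;
}

int create_open(void)
{
        /* Reserve the cleanup request before the operation that may
           need it: once the server has created the node, sending
           FORGET must not be able to fail. */
        struct req *forget_req = get_req("FORGET");
        if (!forget_req)
                return -1;

        struct req *req = get_req("CREATE");
        if (!req) {
                free(forget_req);               /* out_put_forget_req */
                return -1;
        }

        /* ... send CREATE; on a late failure forget_req is ready,
           on success it is simply put back ... */
        free(req);
        free(forget_req);
        return 0;
}

int main(void)
{
        return create_open() ? 1 : 0;
}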
@@ -328,7 +346,6 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
328 int err; 346 int err;
329 347
330 req->in.h.nodeid = get_node_id(dir); 348 req->in.h.nodeid = get_node_id(dir);
331 req->inode = dir;
332 req->out.numargs = 1; 349 req->out.numargs = 1;
333 req->out.args[0].size = sizeof(outarg); 350 req->out.args[0].size = sizeof(outarg);
334 req->out.args[0].value = &outarg; 351 req->out.args[0].value = &outarg;
@@ -448,7 +465,6 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
448 465
449 req->in.h.opcode = FUSE_UNLINK; 466 req->in.h.opcode = FUSE_UNLINK;
450 req->in.h.nodeid = get_node_id(dir); 467 req->in.h.nodeid = get_node_id(dir);
451 req->inode = dir;
452 req->in.numargs = 1; 468 req->in.numargs = 1;
453 req->in.args[0].size = entry->d_name.len + 1; 469 req->in.args[0].size = entry->d_name.len + 1;
454 req->in.args[0].value = entry->d_name.name; 470 req->in.args[0].value = entry->d_name.name;
@@ -480,7 +496,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
480 496
481 req->in.h.opcode = FUSE_RMDIR; 497 req->in.h.opcode = FUSE_RMDIR;
482 req->in.h.nodeid = get_node_id(dir); 498 req->in.h.nodeid = get_node_id(dir);
483 req->inode = dir;
484 req->in.numargs = 1; 499 req->in.numargs = 1;
485 req->in.args[0].size = entry->d_name.len + 1; 500 req->in.args[0].size = entry->d_name.len + 1;
486 req->in.args[0].value = entry->d_name.name; 501 req->in.args[0].value = entry->d_name.name;
@@ -510,8 +525,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
510 inarg.newdir = get_node_id(newdir); 525 inarg.newdir = get_node_id(newdir);
511 req->in.h.opcode = FUSE_RENAME; 526 req->in.h.opcode = FUSE_RENAME;
512 req->in.h.nodeid = get_node_id(olddir); 527 req->in.h.nodeid = get_node_id(olddir);
513 req->inode = olddir;
514 req->inode2 = newdir;
515 req->in.numargs = 3; 528 req->in.numargs = 3;
516 req->in.args[0].size = sizeof(inarg); 529 req->in.args[0].size = sizeof(inarg);
517 req->in.args[0].value = &inarg; 530 req->in.args[0].value = &inarg;
@@ -558,7 +571,6 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
558 memset(&inarg, 0, sizeof(inarg)); 571 memset(&inarg, 0, sizeof(inarg));
559 inarg.oldnodeid = get_node_id(inode); 572 inarg.oldnodeid = get_node_id(inode);
560 req->in.h.opcode = FUSE_LINK; 573 req->in.h.opcode = FUSE_LINK;
561 req->inode2 = inode;
562 req->in.numargs = 2; 574 req->in.numargs = 2;
563 req->in.args[0].size = sizeof(inarg); 575 req->in.args[0].size = sizeof(inarg);
564 req->in.args[0].value = &inarg; 576 req->in.args[0].value = &inarg;
@@ -587,7 +599,6 @@ int fuse_do_getattr(struct inode *inode)
587 599
588 req->in.h.opcode = FUSE_GETATTR; 600 req->in.h.opcode = FUSE_GETATTR;
589 req->in.h.nodeid = get_node_id(inode); 601 req->in.h.nodeid = get_node_id(inode);
590 req->inode = inode;
591 req->out.numargs = 1; 602 req->out.numargs = 1;
592 req->out.args[0].size = sizeof(arg); 603 req->out.args[0].size = sizeof(arg);
593 req->out.args[0].value = &arg; 604 req->out.args[0].value = &arg;
@@ -679,7 +690,6 @@ static int fuse_access(struct inode *inode, int mask)
679 inarg.mask = mask; 690 inarg.mask = mask;
680 req->in.h.opcode = FUSE_ACCESS; 691 req->in.h.opcode = FUSE_ACCESS;
681 req->in.h.nodeid = get_node_id(inode); 692 req->in.h.nodeid = get_node_id(inode);
682 req->inode = inode;
683 req->in.numargs = 1; 693 req->in.numargs = 1;
684 req->in.args[0].size = sizeof(inarg); 694 req->in.args[0].size = sizeof(inarg);
685 req->in.args[0].value = &inarg; 695 req->in.args[0].value = &inarg;
@@ -820,7 +830,6 @@ static char *read_link(struct dentry *dentry)
820 } 830 }
821 req->in.h.opcode = FUSE_READLINK; 831 req->in.h.opcode = FUSE_READLINK;
822 req->in.h.nodeid = get_node_id(inode); 832 req->in.h.nodeid = get_node_id(inode);
823 req->inode = inode;
824 req->out.argvar = 1; 833 req->out.argvar = 1;
825 req->out.numargs = 1; 834 req->out.numargs = 1;
826 req->out.args[0].size = PAGE_SIZE - 1; 835 req->out.args[0].size = PAGE_SIZE - 1;
@@ -939,7 +948,6 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
939 iattr_to_fattr(attr, &inarg); 948 iattr_to_fattr(attr, &inarg);
940 req->in.h.opcode = FUSE_SETATTR; 949 req->in.h.opcode = FUSE_SETATTR;
941 req->in.h.nodeid = get_node_id(inode); 950 req->in.h.nodeid = get_node_id(inode);
942 req->inode = inode;
943 req->in.numargs = 1; 951 req->in.numargs = 1;
944 req->in.args[0].size = sizeof(inarg); 952 req->in.args[0].size = sizeof(inarg);
945 req->in.args[0].value = &inarg; 953 req->in.args[0].value = &inarg;
@@ -1002,7 +1010,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
1002 inarg.flags = flags; 1010 inarg.flags = flags;
1003 req->in.h.opcode = FUSE_SETXATTR; 1011 req->in.h.opcode = FUSE_SETXATTR;
1004 req->in.h.nodeid = get_node_id(inode); 1012 req->in.h.nodeid = get_node_id(inode);
1005 req->inode = inode;
1006 req->in.numargs = 3; 1013 req->in.numargs = 3;
1007 req->in.args[0].size = sizeof(inarg); 1014 req->in.args[0].size = sizeof(inarg);
1008 req->in.args[0].value = &inarg; 1015 req->in.args[0].value = &inarg;
@@ -1041,7 +1048,6 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
1041 inarg.size = size; 1048 inarg.size = size;
1042 req->in.h.opcode = FUSE_GETXATTR; 1049 req->in.h.opcode = FUSE_GETXATTR;
1043 req->in.h.nodeid = get_node_id(inode); 1050 req->in.h.nodeid = get_node_id(inode);
1044 req->inode = inode;
1045 req->in.numargs = 2; 1051 req->in.numargs = 2;
1046 req->in.args[0].size = sizeof(inarg); 1052 req->in.args[0].size = sizeof(inarg);
1047 req->in.args[0].value = &inarg; 1053 req->in.args[0].value = &inarg;
@@ -1091,7 +1097,6 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
1091 inarg.size = size; 1097 inarg.size = size;
1092 req->in.h.opcode = FUSE_LISTXATTR; 1098 req->in.h.opcode = FUSE_LISTXATTR;
1093 req->in.h.nodeid = get_node_id(inode); 1099 req->in.h.nodeid = get_node_id(inode);
1094 req->inode = inode;
1095 req->in.numargs = 1; 1100 req->in.numargs = 1;
1096 req->in.args[0].size = sizeof(inarg); 1101 req->in.args[0].size = sizeof(inarg);
1097 req->in.args[0].value = &inarg; 1102 req->in.args[0].value = &inarg;
@@ -1135,7 +1140,6 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
1135 1140
1136 req->in.h.opcode = FUSE_REMOVEXATTR; 1141 req->in.h.opcode = FUSE_REMOVEXATTR;
1137 req->in.h.nodeid = get_node_id(inode); 1142 req->in.h.nodeid = get_node_id(inode);
1138 req->inode = inode;
1139 req->in.numargs = 1; 1143 req->in.numargs = 1;
1140 req->in.args[0].size = strlen(name) + 1; 1144 req->in.args[0].size = strlen(name) + 1;
1141 req->in.args[0].value = name; 1145 req->in.args[0].value = name;
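All of the dir.c hunks above delete the req->inode (and req->inode2) assignments. A synchronous request is fully consumed before its caller drops its own references, so these pointers only mattered for backgrounded requests, and those now take proper counted references instead (see the fs/fuse/file.c release path below). A compilable sketch of the difference, with illustrative struct names:

struct inode;
struct vfsmount;
struct dentry;

/* Before: bare pointers, valid only as long as someone else
   happened to hold the objects (illustrative field subset). */
struct fuse_req_old {
        struct inode *inode;
        struct inode *inode2;
};

/* After: a backgrounded request pins what it needs itself, via
   mntget()/dget(), and drops the references when it completes. */
struct fuse_req_new {
        struct vfsmount *vfsmount;
        struct dentry *dentry;
};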
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 087f3b734f40..28aa81eae2cc 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -30,7 +30,6 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
30 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); 30 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
31 req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; 31 req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
32 req->in.h.nodeid = get_node_id(inode); 32 req->in.h.nodeid = get_node_id(inode);
33 req->inode = inode;
34 req->in.numargs = 1; 33 req->in.numargs = 1;
35 req->in.args[0].size = sizeof(inarg); 34 req->in.args[0].size = sizeof(inarg);
36 req->in.args[0].value = &inarg; 35 req->in.args[0].value = &inarg;
@@ -49,8 +48,8 @@ struct fuse_file *fuse_file_alloc(void)
49 struct fuse_file *ff; 48 struct fuse_file *ff;
50 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); 49 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
51 if (ff) { 50 if (ff) {
52 ff->release_req = fuse_request_alloc(); 51 ff->reserved_req = fuse_request_alloc();
53 if (!ff->release_req) { 52 if (!ff->reserved_req) {
54 kfree(ff); 53 kfree(ff);
55 ff = NULL; 54 ff = NULL;
56 } 55 }
@@ -60,7 +59,7 @@ struct fuse_file *fuse_file_alloc(void)
60 59
61void fuse_file_free(struct fuse_file *ff) 60void fuse_file_free(struct fuse_file *ff)
62{ 61{
63 fuse_request_free(ff->release_req); 62 fuse_request_free(ff->reserved_req);
64 kfree(ff); 63 kfree(ff);
65} 64}
66 65
@@ -113,37 +112,22 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
113 return err; 112 return err;
114} 113}
115 114
116/* Special case for failed iget in CREATE */ 115struct fuse_req *fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags,
117static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 116 int opcode)
118{ 117{
119 /* If called from end_io_requests(), req has more than one 118 struct fuse_req *req = ff->reserved_req;
120 reference and fuse_reset_request() cannot work */
121 if (fc->connected) {
122 u64 nodeid = req->in.h.nodeid;
123 fuse_reset_request(req);
124 fuse_send_forget(fc, req, nodeid, 1);
125 } else
126 fuse_put_request(fc, req);
127}
128
129void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
130 u64 nodeid, struct inode *inode, int flags, int isdir)
131{
132 struct fuse_req * req = ff->release_req;
133 struct fuse_release_in *inarg = &req->misc.release_in; 119 struct fuse_release_in *inarg = &req->misc.release_in;
134 120
135 inarg->fh = ff->fh; 121 inarg->fh = ff->fh;
136 inarg->flags = flags; 122 inarg->flags = flags;
137 req->in.h.opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; 123 req->in.h.opcode = opcode;
138 req->in.h.nodeid = nodeid; 124 req->in.h.nodeid = nodeid;
139 req->inode = inode;
140 req->in.numargs = 1; 125 req->in.numargs = 1;
141 req->in.args[0].size = sizeof(struct fuse_release_in); 126 req->in.args[0].size = sizeof(struct fuse_release_in);
142 req->in.args[0].value = inarg; 127 req->in.args[0].value = inarg;
143 request_send_background(fc, req);
144 if (!inode)
145 req->end = fuse_release_end;
146 kfree(ff); 128 kfree(ff);
129
130 return req;
147} 131}
148 132
149int fuse_release_common(struct inode *inode, struct file *file, int isdir) 133int fuse_release_common(struct inode *inode, struct file *file, int isdir)
@@ -151,8 +135,15 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir)
151 struct fuse_file *ff = file->private_data; 135 struct fuse_file *ff = file->private_data;
152 if (ff) { 136 if (ff) {
153 struct fuse_conn *fc = get_fuse_conn(inode); 137 struct fuse_conn *fc = get_fuse_conn(inode);
154 u64 nodeid = get_node_id(inode); 138 struct fuse_req *req;
155 fuse_send_release(fc, ff, nodeid, inode, file->f_flags, isdir); 139
140 req = fuse_release_fill(ff, get_node_id(inode), file->f_flags,
141 isdir ? FUSE_RELEASEDIR : FUSE_RELEASE);
142
143 /* Hold vfsmount and dentry until release is finished */
144 req->vfsmount = mntget(file->f_vfsmnt);
145 req->dentry = dget(file->f_dentry);
146 request_send_background(fc, req);
156 } 147 }
157 148
158 /* Return value is ignored by VFS */ 149 /* Return value is ignored by VFS */
@@ -169,6 +160,28 @@ static int fuse_release(struct inode *inode, struct file *file)
169 return fuse_release_common(inode, file, 0); 160 return fuse_release_common(inode, file, 0);
170} 161}
171 162
163/*
164 * Scramble the ID space with XTEA, so that the value of the files_struct
165 * pointer is not exposed to userspace.
166 */
167static u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
168{
169 u32 *k = fc->scramble_key;
170 u64 v = (unsigned long) id;
171 u32 v0 = v;
172 u32 v1 = v >> 32;
173 u32 sum = 0;
174 int i;
175
176 for (i = 0; i < 32; i++) {
177 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
178 sum += 0x9E3779B9;
179 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
180 }
181
182 return (u64) v0 + ((u64) v1 << 32);
183}
184
172static int fuse_flush(struct file *file, fl_owner_t id) 185static int fuse_flush(struct file *file, fl_owner_t id)
173{ 186{
174 struct inode *inode = file->f_dentry->d_inode; 187 struct inode *inode = file->f_dentry->d_inode;
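fuse_lock_owner_id() above is a straight 32-round XTEA encipherment of the 64-bit owner value under the per-connection scramble_key (filled with get_random_bytes() in inode.c below), so the kernel pointer used as fl_owner never reaches userspace in the clear while equal owners still map to equal ids. The same rounds, extracted into a standalone program with made-up key and input values:

#include <stdint.h>
#include <stdio.h>

static uint64_t lock_owner_id(const uint32_t k[4], uint64_t v)
{
        uint32_t v0 = (uint32_t)v;
        uint32_t v1 = (uint32_t)(v >> 32);
        uint32_t sum = 0;
        int i;

        /* 32 XTEA rounds, exactly as in the hunk above. */
        for (i = 0; i < 32; i++) {
                v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
                sum += 0x9E3779B9;
                v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum >> 11 & 3]);
        }
        return (uint64_t)v0 + ((uint64_t)v1 << 32);
}

int main(void)
{
        const uint32_t key[4] = { 0x12345678, 0x9abcdef0,
                                  0x0f1e2d3c, 0x4b5a6978 };
        printf("%016llx\n",
               (unsigned long long)lock_owner_id(key, 0xdeadbeefcafef00dULL));
        return 0;
}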
@@ -184,19 +197,16 @@ static int fuse_flush(struct file *file, fl_owner_t id)
184 if (fc->no_flush) 197 if (fc->no_flush)
185 return 0; 198 return 0;
186 199
187 req = fuse_get_req(fc); 200 req = fuse_get_req_nofail(fc, file);
188 if (IS_ERR(req))
189 return PTR_ERR(req);
190
191 memset(&inarg, 0, sizeof(inarg)); 201 memset(&inarg, 0, sizeof(inarg));
192 inarg.fh = ff->fh; 202 inarg.fh = ff->fh;
203 inarg.lock_owner = fuse_lock_owner_id(fc, id);
193 req->in.h.opcode = FUSE_FLUSH; 204 req->in.h.opcode = FUSE_FLUSH;
194 req->in.h.nodeid = get_node_id(inode); 205 req->in.h.nodeid = get_node_id(inode);
195 req->inode = inode;
196 req->file = file;
197 req->in.numargs = 1; 206 req->in.numargs = 1;
198 req->in.args[0].size = sizeof(inarg); 207 req->in.args[0].size = sizeof(inarg);
199 req->in.args[0].value = &inarg; 208 req->in.args[0].value = &inarg;
209 req->force = 1;
200 request_send(fc, req); 210 request_send(fc, req);
201 err = req->out.h.error; 211 err = req->out.h.error;
202 fuse_put_request(fc, req); 212 fuse_put_request(fc, req);
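fuse_flush() can no longer fail with -ENOMEM or -EINTR: fuse_get_req_nofail() presumably falls back to the request reserved at open time in fuse_file_alloc() (reserved_req, recorded via the new stolen_file field) when normal allocation is impossible, and req->force = 1 asks for the request to be processed even if the caller has a signal pending. A rough userspace sketch of the fallback idea, with all names illustrative:

#include <stdlib.h>

struct req { int from_reserve; };
struct file_priv { struct req reserved; };      /* like fuse_file */

static struct req *get_req_nofail(struct file_priv *ff)
{
        struct req *r = malloc(sizeof(*r));
        if (r) {
                r->from_reserve = 0;
                return r;
        }
        r = &ff->reserved;      /* reserved at open time: cannot fail */
        r->from_reserve = 1;
        return r;
}

static void put_req(struct req *r)
{
        if (!r->from_reserve)
                free(r);        /* the reserved one is reused, not freed */
}

int main(void)
{
        struct file_priv ff;
        put_req(get_req_nofail(&ff));
        return 0;
}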
@@ -232,8 +242,6 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
232 inarg.fsync_flags = datasync ? 1 : 0; 242 inarg.fsync_flags = datasync ? 1 : 0;
233 req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; 243 req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
234 req->in.h.nodeid = get_node_id(inode); 244 req->in.h.nodeid = get_node_id(inode);
235 req->inode = inode;
236 req->file = file;
237 req->in.numargs = 1; 245 req->in.numargs = 1;
238 req->in.args[0].size = sizeof(inarg); 246 req->in.args[0].size = sizeof(inarg);
239 req->in.args[0].value = &inarg; 247 req->in.args[0].value = &inarg;
@@ -266,8 +274,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
266 inarg->size = count; 274 inarg->size = count;
267 req->in.h.opcode = opcode; 275 req->in.h.opcode = opcode;
268 req->in.h.nodeid = get_node_id(inode); 276 req->in.h.nodeid = get_node_id(inode);
269 req->inode = inode;
270 req->file = file;
271 req->in.numargs = 1; 277 req->in.numargs = 1;
272 req->in.args[0].size = sizeof(struct fuse_read_in); 278 req->in.args[0].size = sizeof(struct fuse_read_in);
273 req->in.args[0].value = inarg; 279 req->in.args[0].value = inarg;
@@ -342,6 +348,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
342 req->out.page_zeroing = 1; 348 req->out.page_zeroing = 1;
343 fuse_read_fill(req, file, inode, pos, count, FUSE_READ); 349 fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
344 if (fc->async_read) { 350 if (fc->async_read) {
351 get_file(file);
352 req->file = file;
345 req->end = fuse_readpages_end; 353 req->end = fuse_readpages_end;
346 request_send_background(fc, req); 354 request_send_background(fc, req);
347 } else { 355 } else {
@@ -420,8 +428,6 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
420 inarg.size = count; 428 inarg.size = count;
421 req->in.h.opcode = FUSE_WRITE; 429 req->in.h.opcode = FUSE_WRITE;
422 req->in.h.nodeid = get_node_id(inode); 430 req->in.h.nodeid = get_node_id(inode);
423 req->inode = inode;
424 req->file = file;
425 req->in.argpages = 1; 431 req->in.argpages = 1;
426 req->in.numargs = 2; 432 req->in.numargs = 2;
427 req->in.args[0].size = sizeof(struct fuse_write_in); 433 req->in.args[0].size = sizeof(struct fuse_write_in);
@@ -619,6 +625,126 @@ static int fuse_set_page_dirty(struct page *page)
619 return 0; 625 return 0;
620} 626}
621 627
628static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
629 struct file_lock *fl)
630{
631 switch (ffl->type) {
632 case F_UNLCK:
633 break;
634
635 case F_RDLCK:
636 case F_WRLCK:
637 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
638 ffl->end < ffl->start)
639 return -EIO;
640
641 fl->fl_start = ffl->start;
642 fl->fl_end = ffl->end;
643 fl->fl_pid = ffl->pid;
644 break;
645
646 default:
647 return -EIO;
648 }
649 fl->fl_type = ffl->type;
650 return 0;
651}
652
653static void fuse_lk_fill(struct fuse_req *req, struct file *file,
654 const struct file_lock *fl, int opcode, pid_t pid)
655{
656 struct inode *inode = file->f_dentry->d_inode;
657 struct fuse_conn *fc = get_fuse_conn(inode);
658 struct fuse_file *ff = file->private_data;
659 struct fuse_lk_in *arg = &req->misc.lk_in;
660
661 arg->fh = ff->fh;
662 arg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
663 arg->lk.start = fl->fl_start;
664 arg->lk.end = fl->fl_end;
665 arg->lk.type = fl->fl_type;
666 arg->lk.pid = pid;
667 req->in.h.opcode = opcode;
668 req->in.h.nodeid = get_node_id(inode);
669 req->in.numargs = 1;
670 req->in.args[0].size = sizeof(*arg);
671 req->in.args[0].value = arg;
672}
673
674static int fuse_getlk(struct file *file, struct file_lock *fl)
675{
676 struct inode *inode = file->f_dentry->d_inode;
677 struct fuse_conn *fc = get_fuse_conn(inode);
678 struct fuse_req *req;
679 struct fuse_lk_out outarg;
680 int err;
681
682 req = fuse_get_req(fc);
683 if (IS_ERR(req))
684 return PTR_ERR(req);
685
686 fuse_lk_fill(req, file, fl, FUSE_GETLK, 0);
687 req->out.numargs = 1;
688 req->out.args[0].size = sizeof(outarg);
689 req->out.args[0].value = &outarg;
690 request_send(fc, req);
691 err = req->out.h.error;
692 fuse_put_request(fc, req);
693 if (!err)
694 err = convert_fuse_file_lock(&outarg.lk, fl);
695
696 return err;
697}
698
699static int fuse_setlk(struct file *file, struct file_lock *fl)
700{
701 struct inode *inode = file->f_dentry->d_inode;
702 struct fuse_conn *fc = get_fuse_conn(inode);
703 struct fuse_req *req;
704 int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
705 pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
706 int err;
707
708 /* Unlock on close is handled by the flush method */
709 if (fl->fl_flags & FL_CLOSE)
710 return 0;
711
712 req = fuse_get_req(fc);
713 if (IS_ERR(req))
714 return PTR_ERR(req);
715
716 fuse_lk_fill(req, file, fl, opcode, pid);
717 request_send(fc, req);
718 err = req->out.h.error;
719 /* locking is restartable */
720 if (err == -EINTR)
721 err = -ERESTARTSYS;
722 fuse_put_request(fc, req);
723 return err;
724}
725
726static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
727{
728 struct inode *inode = file->f_dentry->d_inode;
729 struct fuse_conn *fc = get_fuse_conn(inode);
730 int err;
731
732 if (cmd == F_GETLK) {
733 if (fc->no_lock) {
734 if (!posix_test_lock(file, fl, fl))
735 fl->fl_type = F_UNLCK;
736 err = 0;
737 } else
738 err = fuse_getlk(file, fl);
739 } else {
740 if (fc->no_lock)
741 err = posix_lock_file_wait(file, fl);
742 else
743 err = fuse_setlk(file, fl);
744 }
745 return err;
746}
747
622static const struct file_operations fuse_file_operations = { 748static const struct file_operations fuse_file_operations = {
623 .llseek = generic_file_llseek, 749 .llseek = generic_file_llseek,
624 .read = generic_file_read, 750 .read = generic_file_read,
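With fuse_file_lock() wired into both file_operations tables below, fcntl() byte-range locks on a FUSE file reach the server as FUSE_GETLK/FUSE_SETLK/FUSE_SETLKW; when the server opted out (fc->no_lock), fuse_file_lock() above falls back to ordinary local POSIX locking. A small userspace exercise (the mount point path is only an example):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/fuse/file", O_RDWR);
        if (fd < 0) { perror("open"); return 1; }

        struct flock fl = {
                .l_type = F_WRLCK, .l_whence = SEEK_SET,
                .l_start = 0, .l_len = 0,       /* whole file */
        };
        if (fcntl(fd, F_SETLK, &fl) == -1)      /* -> FUSE_SETLK */
                perror("F_SETLK");

        fl.l_type = F_WRLCK;
        if (fcntl(fd, F_GETLK, &fl) == 0)       /* -> FUSE_GETLK */
                printf("conflict: %s (pid %d)\n",
                       fl.l_type == F_UNLCK ? "none" : "yes",
                       (int)fl.l_pid);

        close(fd);
        return 0;
}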
@@ -628,6 +754,7 @@ static const struct file_operations fuse_file_operations = {
628 .flush = fuse_flush, 754 .flush = fuse_flush,
629 .release = fuse_release, 755 .release = fuse_release,
630 .fsync = fuse_fsync, 756 .fsync = fuse_fsync,
757 .lock = fuse_file_lock,
631 .sendfile = generic_file_sendfile, 758 .sendfile = generic_file_sendfile,
632}; 759};
633 760
@@ -639,6 +766,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
639 .flush = fuse_flush, 766 .flush = fuse_flush,
640 .release = fuse_release, 767 .release = fuse_release,
641 .fsync = fuse_fsync, 768 .fsync = fuse_fsync,
769 .lock = fuse_file_lock,
642 /* no mmap and sendfile */ 770 /* no mmap and sendfile */
643}; 771};
644 772
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0474202cb5dc..0dbf96621841 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -8,12 +8,13 @@
8 8
9#include <linux/fuse.h> 9#include <linux/fuse.h>
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mount.h>
11#include <linux/wait.h> 12#include <linux/wait.h>
12#include <linux/list.h> 13#include <linux/list.h>
13#include <linux/spinlock.h> 14#include <linux/spinlock.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
16#include <asm/semaphore.h> 17#include <linux/mutex.h>
17 18
18/** Max number of pages that can be used in a single read request */ 19/** Max number of pages that can be used in a single read request */
19#define FUSE_MAX_PAGES_PER_REQ 32 20#define FUSE_MAX_PAGES_PER_REQ 32
@@ -24,6 +25,9 @@
24/** It could be as large as PATH_MAX, but would that have any uses? */ 25/** It could be as large as PATH_MAX, but would that have any uses? */
25#define FUSE_NAME_MAX 1024 26#define FUSE_NAME_MAX 1024
26 27
28/** Number of dentries for each connection in the control filesystem */
29#define FUSE_CTL_NUM_DENTRIES 3
30
27/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem 31/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
28 module will check permissions based on the file mode. Otherwise no 32 module will check permissions based on the file mode. Otherwise no
29 permission checking is done in the kernel */ 33 permission checking is done in the kernel */
@@ -33,6 +37,11 @@
33 doing the mount will be allowed to access the filesystem */ 37 doing the mount will be allowed to access the filesystem */
34#define FUSE_ALLOW_OTHER (1 << 1) 38#define FUSE_ALLOW_OTHER (1 << 1)
35 39
40/** List of active connections */
41extern struct list_head fuse_conn_list;
42
43/** Global mutex protecting fuse_conn_list and the control filesystem */
44extern struct mutex fuse_mutex;
36 45
37/** FUSE inode */ 46/** FUSE inode */
38struct fuse_inode { 47struct fuse_inode {
@@ -56,7 +65,7 @@ struct fuse_inode {
56/** FUSE specific file data */ 65/** FUSE specific file data */
57struct fuse_file { 66struct fuse_file {
58 /** Request reserved for flush and release */ 67 /** Request reserved for flush and release */
59 struct fuse_req *release_req; 68 struct fuse_req *reserved_req;
60 69
61 /** File handle used by userspace */ 70 /** File handle used by userspace */
62 u64 fh; 71 u64 fh;
@@ -122,6 +131,7 @@ enum fuse_req_state {
122 FUSE_REQ_PENDING, 131 FUSE_REQ_PENDING,
123 FUSE_REQ_READING, 132 FUSE_REQ_READING,
124 FUSE_REQ_SENT, 133 FUSE_REQ_SENT,
134 FUSE_REQ_WRITING,
125 FUSE_REQ_FINISHED 135 FUSE_REQ_FINISHED
126}; 136};
127 137
@@ -135,12 +145,15 @@ struct fuse_req {
135 fuse_conn */ 145 fuse_conn */
136 struct list_head list; 146 struct list_head list;
137 147
138 /** Entry on the background list */ 148 /** Entry on the interrupts list */
139 struct list_head bg_entry; 149 struct list_head intr_entry;
140 150
141 /** refcount */ 151 /** refcount */
142 atomic_t count; 152 atomic_t count;
143 153
154 /** Unique ID for the interrupt request */
155 u64 intr_unique;
156
144 /* 157 /*
145 * The following bitfields are either set once before the 158 * The following bitfields are either set once before the
146 * request is queued or setting/clearing them is protected by 159 * request is queued or setting/clearing them is protected by
@@ -150,12 +163,18 @@ struct fuse_req {
150 /** True if the request has reply */ 163 /** True if the request has reply */
151 unsigned isreply:1; 164 unsigned isreply:1;
152 165
153 /** The request was interrupted */ 166 /** Force sending of the request even if interrupted */
154 unsigned interrupted:1; 167 unsigned force:1;
168
169 /** The request was aborted */
170 unsigned aborted:1;
155 171
156 /** Request is sent in the background */ 172 /** Request is sent in the background */
157 unsigned background:1; 173 unsigned background:1;
158 174
175 /** The request has been interrupted */
176 unsigned interrupted:1;
177
159 /** Data is being copied to/from the request */ 178 /** Data is being copied to/from the request */
160 unsigned locked:1; 179 unsigned locked:1;
161 180
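The new intr_entry/intr_unique fields and the force/aborted/interrupted split support the FUSE_INTERRUPT protocol: when a signal arrives after a request has already been handed to userspace, the kernel queues it on the interrupts list added to fuse_conn further down and (in dev.c code outside this section) emits a FUSE_INTERRUPT message naming the original request, instead of silently backgrounding it. The wire body is just the unique id; as added to <linux/fuse.h> in this series (shown here with stdint types):

#include <stdint.h>

/* Body of a FUSE_INTERRUPT request as seen by a userspace server;
   it names the in-flight request that should be finished early. */
struct fuse_interrupt_in {
        uint64_t unique;        /* unique id of the interrupted request */
};

A server that does not implement it replies -ENOSYS once, which sets the no_interrupt flag added to fuse_conn below and stops further interrupt traffic on that connection.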
@@ -181,6 +200,7 @@ struct fuse_req {
181 struct fuse_init_in init_in; 200 struct fuse_init_in init_in;
182 struct fuse_init_out init_out; 201 struct fuse_init_out init_out;
183 struct fuse_read_in read_in; 202 struct fuse_read_in read_in;
203 struct fuse_lk_in lk_in;
184 } misc; 204 } misc;
185 205
186 /** page vector */ 206 /** page vector */
@@ -192,17 +212,20 @@ struct fuse_req {
192 /** offset of data on first page */ 212 /** offset of data on first page */
193 unsigned page_offset; 213 unsigned page_offset;
194 214
195 /** Inode used in the request */
196 struct inode *inode;
197
198 /** Second inode used in the request (or NULL) */
199 struct inode *inode2;
200
201 /** File used in the request (or NULL) */ 215 /** File used in the request (or NULL) */
202 struct file *file; 216 struct file *file;
203 217
218 /** vfsmount used in release */
219 struct vfsmount *vfsmount;
220
221 /** dentry used in release */
222 struct dentry *dentry;
223
204 /** Request completion callback */ 224 /** Request completion callback */
205 void (*end)(struct fuse_conn *, struct fuse_req *); 225 void (*end)(struct fuse_conn *, struct fuse_req *);
226
227 /** Request is stolen from fuse_file->reserved_req */
228 struct file *stolen_file;
206}; 229};
207 230
208/** 231/**
@@ -216,6 +239,9 @@ struct fuse_conn {
216 /** Lock protecting accesses to members of this structure */ 239 /** Lock protecting accesses to members of this structure */
217 spinlock_t lock; 240 spinlock_t lock;
218 241
242 /** Refcount */
243 atomic_t count;
244
219 /** The user id for this mount */ 245 /** The user id for this mount */
220 uid_t user_id; 246 uid_t user_id;
221 247
@@ -243,13 +269,12 @@ struct fuse_conn {
243 /** The list of requests under I/O */ 269 /** The list of requests under I/O */
244 struct list_head io; 270 struct list_head io;
245 271
246 /** Requests put in the background (RELEASE or any other
247 interrupted request) */
248 struct list_head background;
249
250 /** Number of requests currently in the background */ 272 /** Number of requests currently in the background */
251 unsigned num_background; 273 unsigned num_background;
252 274
275 /** Pending interrupts */
276 struct list_head interrupts;
277
253 /** Flag indicating if connection is blocked. This will be 278 /** Flag indicating if connection is blocked. This will be
254 the case before the INIT reply is received, and if there 279 the case before the INIT reply is received, and if there
255 are too many outstanding background requests */ 280 are too many outstanding background requests */
@@ -258,15 +283,9 @@ struct fuse_conn {
258 /** waitq for blocked connection */ 283 /** waitq for blocked connection */
259 wait_queue_head_t blocked_waitq; 284 wait_queue_head_t blocked_waitq;
260 285
261 /** RW semaphore for exclusion with fuse_put_super() */
262 struct rw_semaphore sbput_sem;
263
264 /** The next unique request id */ 286 /** The next unique request id */
265 u64 reqctr; 287 u64 reqctr;
266 288
267 /** Mount is active */
268 unsigned mounted;
269
270 /** Connection established, cleared on umount, connection 289 /** Connection established, cleared on umount, connection
271 abort and device release */ 290 abort and device release */
272 unsigned connected; 291 unsigned connected;
@@ -305,12 +324,18 @@ struct fuse_conn {
305 /** Is removexattr not implemented by fs? */ 324 /** Is removexattr not implemented by fs? */
306 unsigned no_removexattr : 1; 325 unsigned no_removexattr : 1;
307 326
327 /** Are file locking primitives not implemented by fs? */
328 unsigned no_lock : 1;
329
308 /** Is access not implemented by fs? */ 330 /** Is access not implemented by fs? */
309 unsigned no_access : 1; 331 unsigned no_access : 1;
310 332
311 /** Is create not implemented by fs? */ 333 /** Is create not implemented by fs? */
312 unsigned no_create : 1; 334 unsigned no_create : 1;
313 335
336 /** Is interrupt not implemented by fs? */
337 unsigned no_interrupt : 1;
338
314 /** The number of requests waiting for completion */ 339 /** The number of requests waiting for completion */
315 atomic_t num_waiting; 340 atomic_t num_waiting;
316 341
@@ -320,11 +345,23 @@ struct fuse_conn {
320 /** Backing dev info */ 345 /** Backing dev info */
321 struct backing_dev_info bdi; 346 struct backing_dev_info bdi;
322 347
323 /** kobject */ 348 /** Entry on the fuse_conn_list */
324 struct kobject kobj; 349 struct list_head entry;
350
351 /** Unique ID */
352 u64 id;
353
354 /** Dentries in the control filesystem */
355 struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
356
357 /** number of dentries used in the above array */
358 int ctl_ndents;
325 359
326 /** O_ASYNC requests */ 360 /** O_ASYNC requests */
327 struct fasync_struct *fasync; 361 struct fasync_struct *fasync;
362
363 /** Key for lock owner ID scrambling */
364 u32 scramble_key[4];
328}; 365};
329 366
330static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) 367static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -337,11 +374,6 @@ static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
337 return get_fuse_conn_super(inode->i_sb); 374 return get_fuse_conn_super(inode->i_sb);
338} 375}
339 376
340static inline struct fuse_conn *get_fuse_conn_kobj(struct kobject *obj)
341{
342 return container_of(obj, struct fuse_conn, kobj);
343}
344
345static inline struct fuse_inode *get_fuse_inode(struct inode *inode) 377static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
346{ 378{
347 return container_of(inode, struct fuse_inode, inode); 379 return container_of(inode, struct fuse_inode, inode);
@@ -383,12 +415,9 @@ void fuse_file_free(struct fuse_file *ff);
383void fuse_finish_open(struct inode *inode, struct file *file, 415void fuse_finish_open(struct inode *inode, struct file *file,
384 struct fuse_file *ff, struct fuse_open_out *outarg); 416 struct fuse_file *ff, struct fuse_open_out *outarg);
385 417
386/** 418/** Fill a RELEASE request and free the fuse_file */
387 * Send a RELEASE request 419struct fuse_req *fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags,
388 */ 420 int opcode);
389void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
390 u64 nodeid, struct inode *inode, int flags, int isdir);
391
392/** 421/**
393 * Send RELEASE or RELEASEDIR request 422 * Send RELEASE or RELEASEDIR request
394 */ 423 */
@@ -435,6 +464,9 @@ int fuse_dev_init(void);
435 */ 464 */
436void fuse_dev_cleanup(void); 465void fuse_dev_cleanup(void);
437 466
467int fuse_ctl_init(void);
468void fuse_ctl_cleanup(void);
469
438/** 470/**
439 * Allocate a request 471 * Allocate a request
440 */ 472 */
@@ -446,14 +478,14 @@ struct fuse_req *fuse_request_alloc(void);
446void fuse_request_free(struct fuse_req *req); 478void fuse_request_free(struct fuse_req *req);
447 479
448/** 480/**
449 * Reinitialize a request, the preallocated flag is left unmodified 481 * Get a request, may fail with -ENOMEM
450 */ 482 */
451void fuse_reset_request(struct fuse_req *req); 483struct fuse_req *fuse_get_req(struct fuse_conn *fc);
452 484
453/** 485/**
454 * Reserve a preallocated request 486 * Gets a request for a file operation, always succeeds
455 */ 487 */
456struct fuse_req *fuse_get_req(struct fuse_conn *fc); 488struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file);
457 489
458/** 490/**
459 * Decrement reference count of a request. If count goes to zero free 491 * Decrement reference count of a request. If count goes to zero free
@@ -476,11 +508,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
476 */ 508 */
477void request_send_background(struct fuse_conn *fc, struct fuse_req *req); 509void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
478 510
479/**
480 * Release inodes and file associated with background request
481 */
482void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req);
483
484/* Abort all requests */ 511/* Abort all requests */
485void fuse_abort_conn(struct fuse_conn *fc); 512void fuse_abort_conn(struct fuse_conn *fc);
486 513
@@ -493,3 +520,23 @@ int fuse_do_getattr(struct inode *inode);
493 * Invalidate inode attributes 520 * Invalidate inode attributes
494 */ 521 */
495void fuse_invalidate_attr(struct inode *inode); 522void fuse_invalidate_attr(struct inode *inode);
523
524/**
525 * Acquire reference to fuse_conn
526 */
527struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
528
529/**
530 * Release reference to fuse_conn
531 */
532void fuse_conn_put(struct fuse_conn *fc);
533
534/**
535 * Add connection to control filesystem
536 */
537int fuse_ctl_add_conn(struct fuse_conn *fc);
538
539/**
540 * Remove connection from control filesystem
541 */
542void fuse_ctl_remove_conn(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a13c0f529058..dcaaabd3b9c4 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -11,25 +11,20 @@
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/mount.h>
15#include <linux/seq_file.h> 14#include <linux/seq_file.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/module.h> 16#include <linux/module.h>
18#include <linux/parser.h> 17#include <linux/parser.h>
19#include <linux/statfs.h> 18#include <linux/statfs.h>
19#include <linux/random.h>
20 20
21MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); 21MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
22MODULE_DESCRIPTION("Filesystem in Userspace"); 22MODULE_DESCRIPTION("Filesystem in Userspace");
23MODULE_LICENSE("GPL"); 23MODULE_LICENSE("GPL");
24 24
25static kmem_cache_t *fuse_inode_cachep; 25static kmem_cache_t *fuse_inode_cachep;
26static struct subsystem connections_subsys; 26struct list_head fuse_conn_list;
27 27DEFINE_MUTEX(fuse_mutex);
28struct fuse_conn_attr {
29 struct attribute attr;
30 ssize_t (*show)(struct fuse_conn *, char *);
31 ssize_t (*store)(struct fuse_conn *, const char *, size_t);
32};
33 28
34#define FUSE_SUPER_MAGIC 0x65735546 29#define FUSE_SUPER_MAGIC 0x65735546
35 30
@@ -104,6 +99,14 @@ static void fuse_clear_inode(struct inode *inode)
104 } 99 }
105} 100}
106 101
102static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
103{
104 if (*flags & MS_MANDLOCK)
105 return -EINVAL;
106
107 return 0;
108}
109
107void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr) 110void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
108{ 111{
109 if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size) 112 if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size)
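The new fuse_remount_fs() exists only to reject MS_MANDLOCK on remount; fuse_fill_super() below gains the same check at mount time. Mandatory locking cannot be honoured when the authoritative lock state lives in an unprivileged userspace server, so the option is refused outright. For example (illustrative mount point; the call fails with EINVAL):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Remounting a fuse mount with mandatory locking is refused. */
        if (mount(NULL, "/mnt/fuse", NULL, MS_REMOUNT | MS_MANDLOCK, NULL))
                perror("mount");        /* expected: EINVAL */
        return 0;
}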
@@ -195,31 +198,29 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
195 return inode; 198 return inode;
196} 199}
197 200
198static void fuse_umount_begin(struct super_block *sb) 201static void fuse_umount_begin(struct vfsmount *vfsmnt, int flags)
199{ 202{
200 fuse_abort_conn(get_fuse_conn_super(sb)); 203 if (flags & MNT_FORCE)
204 fuse_abort_conn(get_fuse_conn_super(vfsmnt->mnt_sb));
201} 205}
202 206
203static void fuse_put_super(struct super_block *sb) 207static void fuse_put_super(struct super_block *sb)
204{ 208{
205 struct fuse_conn *fc = get_fuse_conn_super(sb); 209 struct fuse_conn *fc = get_fuse_conn_super(sb);
206 210
207 down_write(&fc->sbput_sem);
208 while (!list_empty(&fc->background))
209 fuse_release_background(fc,
210 list_entry(fc->background.next,
211 struct fuse_req, bg_entry));
212
213 spin_lock(&fc->lock); 211 spin_lock(&fc->lock);
214 fc->mounted = 0;
215 fc->connected = 0; 212 fc->connected = 0;
213 fc->blocked = 0;
216 spin_unlock(&fc->lock); 214 spin_unlock(&fc->lock);
217 up_write(&fc->sbput_sem);
218 /* Flush all readers on this fs */ 215 /* Flush all readers on this fs */
219 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 216 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
220 wake_up_all(&fc->waitq); 217 wake_up_all(&fc->waitq);
221 kobject_del(&fc->kobj); 218 wake_up_all(&fc->blocked_waitq);
222 kobject_put(&fc->kobj); 219 mutex_lock(&fuse_mutex);
220 list_del(&fc->entry);
221 fuse_ctl_remove_conn(fc);
222 mutex_unlock(&fuse_mutex);
223 fuse_conn_put(fc);
223} 224}
224 225
225static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) 226static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
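fuse_umount_begin() now receives the vfsmount and the unmount flags (a VFS interface change of this period) and aborts the connection only for a forced unmount, while fuse_put_super() drops the kobject dance in favour of fuse_mutex plus the new connection refcount. From userspace the abort path is simply (example mount point):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* MNT_FORCE reaches fuse_umount_begin() and aborts the
           connection even if the server is unresponsive. */
        if (umount2("/mnt/fuse", MNT_FORCE))
                perror("umount2");
        return 0;
}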
@@ -369,11 +370,6 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
369 return 0; 370 return 0;
370} 371}
371 372
372static void fuse_conn_release(struct kobject *kobj)
373{
374 kfree(get_fuse_conn_kobj(kobj));
375}
376
377static struct fuse_conn *new_conn(void) 373static struct fuse_conn *new_conn(void)
378{ 374{
379 struct fuse_conn *fc; 375 struct fuse_conn *fc;
@@ -381,24 +377,35 @@ static struct fuse_conn *new_conn(void)
381 fc = kzalloc(sizeof(*fc), GFP_KERNEL); 377 fc = kzalloc(sizeof(*fc), GFP_KERNEL);
382 if (fc) { 378 if (fc) {
383 spin_lock_init(&fc->lock); 379 spin_lock_init(&fc->lock);
380 atomic_set(&fc->count, 1);
384 init_waitqueue_head(&fc->waitq); 381 init_waitqueue_head(&fc->waitq);
385 init_waitqueue_head(&fc->blocked_waitq); 382 init_waitqueue_head(&fc->blocked_waitq);
386 INIT_LIST_HEAD(&fc->pending); 383 INIT_LIST_HEAD(&fc->pending);
387 INIT_LIST_HEAD(&fc->processing); 384 INIT_LIST_HEAD(&fc->processing);
388 INIT_LIST_HEAD(&fc->io); 385 INIT_LIST_HEAD(&fc->io);
389 INIT_LIST_HEAD(&fc->background); 386 INIT_LIST_HEAD(&fc->interrupts);
390 init_rwsem(&fc->sbput_sem);
391 kobj_set_kset_s(fc, connections_subsys);
392 kobject_init(&fc->kobj);
393 atomic_set(&fc->num_waiting, 0); 387 atomic_set(&fc->num_waiting, 0);
394 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 388 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
395 fc->bdi.unplug_io_fn = default_unplug_io_fn; 389 fc->bdi.unplug_io_fn = default_unplug_io_fn;
396 fc->reqctr = 0; 390 fc->reqctr = 0;
397 fc->blocked = 1; 391 fc->blocked = 1;
392 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
398 } 393 }
399 return fc; 394 return fc;
400} 395}
401 396
397void fuse_conn_put(struct fuse_conn *fc)
398{
399 if (atomic_dec_and_test(&fc->count))
400 kfree(fc);
401}
402
403struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
404{
405 atomic_inc(&fc->count);
406 return fc;
407}
408
402static struct inode *get_root_inode(struct super_block *sb, unsigned mode) 409static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
403{ 410{
404 struct fuse_attr attr; 411 struct fuse_attr attr;
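fuse_conn_get()/fuse_conn_put() replace the kobject as the connection's lifetime mechanism: the mount, the opened /dev/fuse file and the control filesystem can each hold a reference, and the last put frees the structure. The same pattern in compilable form, with C11 atomics standing in for the kernel's atomic_t:

#include <stdatomic.h>
#include <stdlib.h>

struct conn { atomic_int count; };

static struct conn *conn_get(struct conn *c)
{
        atomic_fetch_add(&c->count, 1);
        return c;
}

static void conn_put(struct conn *c)
{
        if (atomic_fetch_sub(&c->count, 1) == 1)
                free(c);        /* last reference dropped */
}

int main(void)
{
        struct conn *c = malloc(sizeof(*c));
        atomic_init(&c->count, 1);      /* creator holds one ref */
        conn_get(c);                    /* e.g. file->private_data */
        conn_put(c);                    /* fuse_put_super() */
        conn_put(c);                    /* device release */
        return 0;
}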
@@ -414,6 +421,7 @@ static struct super_operations fuse_super_operations = {
414 .destroy_inode = fuse_destroy_inode, 421 .destroy_inode = fuse_destroy_inode,
415 .read_inode = fuse_read_inode, 422 .read_inode = fuse_read_inode,
416 .clear_inode = fuse_clear_inode, 423 .clear_inode = fuse_clear_inode,
424 .remount_fs = fuse_remount_fs,
417 .put_super = fuse_put_super, 425 .put_super = fuse_put_super,
418 .umount_begin = fuse_umount_begin, 426 .umount_begin = fuse_umount_begin,
419 .statfs = fuse_statfs, 427 .statfs = fuse_statfs,
@@ -433,8 +441,12 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
433 ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; 441 ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
434 if (arg->flags & FUSE_ASYNC_READ) 442 if (arg->flags & FUSE_ASYNC_READ)
435 fc->async_read = 1; 443 fc->async_read = 1;
436 } else 444 if (!(arg->flags & FUSE_POSIX_LOCKS))
445 fc->no_lock = 1;
446 } else {
437 ra_pages = fc->max_read / PAGE_CACHE_SIZE; 447 ra_pages = fc->max_read / PAGE_CACHE_SIZE;
448 fc->no_lock = 1;
449 }
438 450
439 fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); 451 fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
440 fc->minor = arg->minor; 452 fc->minor = arg->minor;
@@ -452,7 +464,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
452 arg->major = FUSE_KERNEL_VERSION; 464 arg->major = FUSE_KERNEL_VERSION;
453 arg->minor = FUSE_KERNEL_MINOR_VERSION; 465 arg->minor = FUSE_KERNEL_MINOR_VERSION;
454 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; 466 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
455 arg->flags |= FUSE_ASYNC_READ; 467 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS;
456 req->in.h.opcode = FUSE_INIT; 468 req->in.h.opcode = FUSE_INIT;
457 req->in.numargs = 1; 469 req->in.numargs = 1;
458 req->in.args[0].size = sizeof(*arg); 470 req->in.args[0].size = sizeof(*arg);
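Lock support is negotiated through FUSE_INIT: the kernel now advertises FUSE_POSIX_LOCKS alongside FUSE_ASYNC_READ, and a reply that omits the bit, or that comes from a server too old to send flags at all, sets fc->no_lock so fcntl locks are handled locally. A minimal sketch of the check, with the flag values as defined in <linux/fuse.h>:

#define FUSE_ASYNC_READ  (1 << 0)
#define FUSE_POSIX_LOCKS (1 << 1)

struct conn { unsigned no_lock : 1; };

static void process_init_flags(struct conn *fc, unsigned flags)
{
        if (!(flags & FUSE_POSIX_LOCKS))
                fc->no_lock = 1;        /* fall back to local locking */
}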
@@ -468,10 +480,9 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
468 request_send_background(fc, req); 480 request_send_background(fc, req);
469} 481}
470 482
471static unsigned long long conn_id(void) 483static u64 conn_id(void)
472{ 484{
473 /* BKL is held for ->get_sb() */ 485 static u64 ctr = 1;
474 static unsigned long long ctr = 1;
475 return ctr++; 486 return ctr++;
476} 487}
477 488
@@ -485,6 +496,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
485 struct fuse_req *init_req; 496 struct fuse_req *init_req;
486 int err; 497 int err;
487 498
499 if (sb->s_flags & MS_MANDLOCK)
500 return -EINVAL;
501
488 if (!parse_fuse_opt((char *) data, &d)) 502 if (!parse_fuse_opt((char *) data, &d))
489 return -EINVAL; 503 return -EINVAL;
490 504
@@ -528,25 +542,21 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
528 if (!init_req) 542 if (!init_req)
529 goto err_put_root; 543 goto err_put_root;
530 544
531 err = kobject_set_name(&fc->kobj, "%llu", conn_id()); 545 mutex_lock(&fuse_mutex);
532 if (err)
533 goto err_free_req;
534
535 err = kobject_add(&fc->kobj);
536 if (err)
537 goto err_free_req;
538
539 /* Setting file->private_data can't race with other mount()
540 instances, since BKL is held for ->get_sb() */
541 err = -EINVAL; 546 err = -EINVAL;
542 if (file->private_data) 547 if (file->private_data)
543 goto err_kobject_del; 548 goto err_unlock;
544 549
550 fc->id = conn_id();
551 err = fuse_ctl_add_conn(fc);
552 if (err)
553 goto err_unlock;
554
555 list_add_tail(&fc->entry, &fuse_conn_list);
545 sb->s_root = root_dentry; 556 sb->s_root = root_dentry;
546 fc->mounted = 1;
547 fc->connected = 1; 557 fc->connected = 1;
548 kobject_get(&fc->kobj); 558 file->private_data = fuse_conn_get(fc);
549 file->private_data = fc; 559 mutex_unlock(&fuse_mutex);
550 /* 560 /*
551 * atomic_dec_and_test() in fput() provides the necessary 561 * atomic_dec_and_test() in fput() provides the necessary
552 * memory barrier for file->private_data to be visible on all 562 * memory barrier for file->private_data to be visible on all
@@ -558,15 +568,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
558 568
559 return 0; 569 return 0;
560 570
561 err_kobject_del: 571 err_unlock:
562 kobject_del(&fc->kobj); 572 mutex_unlock(&fuse_mutex);
563 err_free_req:
564 fuse_request_free(init_req); 573 fuse_request_free(init_req);
565 err_put_root: 574 err_put_root:
566 dput(root_dentry); 575 dput(root_dentry);
567 err: 576 err:
568 fput(file); 577 fput(file);
569 kobject_put(&fc->kobj); 578 fuse_conn_put(fc);
570 return err; 579 return err;
571} 580}
572 581
@@ -584,68 +593,8 @@ static struct file_system_type fuse_fs_type = {
584 .kill_sb = kill_anon_super, 593 .kill_sb = kill_anon_super,
585}; 594};
586 595
587static ssize_t fuse_conn_waiting_show(struct fuse_conn *fc, char *page)
588{
589 return sprintf(page, "%i\n", atomic_read(&fc->num_waiting));
590}
591
592static ssize_t fuse_conn_abort_store(struct fuse_conn *fc, const char *page,
593 size_t count)
594{
595 fuse_abort_conn(fc);
596 return count;
597}
598
599static struct fuse_conn_attr fuse_conn_waiting =
600 __ATTR(waiting, 0400, fuse_conn_waiting_show, NULL);
601static struct fuse_conn_attr fuse_conn_abort =
602 __ATTR(abort, 0600, NULL, fuse_conn_abort_store);
603
604static struct attribute *fuse_conn_attrs[] = {
605 &fuse_conn_waiting.attr,
606 &fuse_conn_abort.attr,
607 NULL,
608};
609
610static ssize_t fuse_conn_attr_show(struct kobject *kobj,
611 struct attribute *attr,
612 char *page)
613{
614 struct fuse_conn_attr *fca =
615 container_of(attr, struct fuse_conn_attr, attr);
616
617 if (fca->show)
618 return fca->show(get_fuse_conn_kobj(kobj), page);
619 else
620 return -EACCES;
621}
622
623static ssize_t fuse_conn_attr_store(struct kobject *kobj,
624 struct attribute *attr,
625 const char *page, size_t count)
626{
627 struct fuse_conn_attr *fca =
628 container_of(attr, struct fuse_conn_attr, attr);
629
630 if (fca->store)
631 return fca->store(get_fuse_conn_kobj(kobj), page, count);
632 else
633 return -EACCES;
634}
635
636static struct sysfs_ops fuse_conn_sysfs_ops = {
637 .show = &fuse_conn_attr_show,
638 .store = &fuse_conn_attr_store,
639};
640
641static struct kobj_type ktype_fuse_conn = {
642 .release = fuse_conn_release,
643 .sysfs_ops = &fuse_conn_sysfs_ops,
644 .default_attrs = fuse_conn_attrs,
645};
646
647static decl_subsys(fuse, NULL, NULL); 596static decl_subsys(fuse, NULL, NULL);
648static decl_subsys(connections, &ktype_fuse_conn, NULL); 597static decl_subsys(connections, NULL, NULL);
649 598
650static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep, 599static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep,
651 unsigned long flags) 600 unsigned long flags)
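The per-connection sysfs attributes (waiting, abort) are deleted here; per the diffstat they reappear as files in the new control filesystem implemented in fs/fuse/control.c, created per connection by fuse_ctl_add_conn() (up to FUSE_CTL_NUM_DENTRIES entries each). Assuming the conventional fusectl mount point, the abort knob can be driven like this (the path and connection id are examples; ids come from conn_id() above):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/sys/fs/fuse/connections/1/abort", O_WRONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, "1", 1) != 1)     /* any write triggers the abort */
                perror("write");
        close(fd);
        return 0;
}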
@@ -719,6 +668,7 @@ static int __init fuse_init(void)
719 printk("fuse init (API version %i.%i)\n", 668 printk("fuse init (API version %i.%i)\n",
720 FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); 669 FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
721 670
671 INIT_LIST_HEAD(&fuse_conn_list);
722 res = fuse_fs_init(); 672 res = fuse_fs_init();
723 if (res) 673 if (res)
724 goto err; 674 goto err;
@@ -731,8 +681,14 @@ static int __init fuse_init(void)
731 if (res) 681 if (res)
732 goto err_dev_cleanup; 682 goto err_dev_cleanup;
733 683
684 res = fuse_ctl_init();
685 if (res)
686 goto err_sysfs_cleanup;
687
734 return 0; 688 return 0;
735 689
690 err_sysfs_cleanup:
691 fuse_sysfs_cleanup();
736 err_dev_cleanup: 692 err_dev_cleanup:
737 fuse_dev_cleanup(); 693 fuse_dev_cleanup();
738 err_fs_cleanup: 694 err_fs_cleanup:
@@ -745,6 +701,7 @@ static void __exit fuse_exit(void)
745{ 701{
746 printk(KERN_DEBUG "fuse exit\n"); 702 printk(KERN_DEBUG "fuse exit\n");
747 703
704 fuse_ctl_cleanup();
748 fuse_sysfs_cleanup(); 705 fuse_sysfs_cleanup();
749 fuse_fs_cleanup(); 706 fuse_fs_cleanup();
750 fuse_dev_cleanup(); 707 fuse_dev_cleanup();
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 7f96b5cb6781..8c9b28dff119 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -34,6 +34,7 @@
34#include <linux/suspend.h> 34#include <linux/suspend.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/kthread.h> 36#include <linux/kthread.h>
37#include <linux/poison.h>
37#include <linux/proc_fs.h> 38#include <linux/proc_fs.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
@@ -1675,7 +1676,7 @@ static void journal_free_journal_head(struct journal_head *jh)
1675{ 1676{
1676#ifdef CONFIG_JBD_DEBUG 1677#ifdef CONFIG_JBD_DEBUG
1677 atomic_dec(&nr_journal_heads); 1678 atomic_dec(&nr_journal_heads);
1678 memset(jh, 0x5b, sizeof(*jh)); 1679 memset(jh, JBD_POISON_FREE, sizeof(*jh));
1679#endif 1680#endif
1680 kmem_cache_free(journal_head_cache, jh); 1681 kmem_cache_free(journal_head_cache, jh);
1681} 1682}
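Replacing the bare 0x5b with JBD_POISON_FREE changes nothing at runtime; the new <linux/poison.h> collects such magic fill bytes so a value seen in a crash dump can be traced back to its owner. The definition is simply:

/* From <linux/poison.h>; the byte value matches the old literal. */
#define JBD_POISON_FREE 0x5b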
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 80d7f53fd0a7..de5bafb4e853 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -531,6 +531,7 @@ static int do_one_pass(journal_t *journal,
531 default: 531 default:
532 jbd_debug(3, "Unrecognised magic %d, end of scan.\n", 532 jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
533 blocktype); 533 blocktype);
534 brelse(bh);
534 goto done; 535 goto done;
535 } 536 }
536 } 537 }
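The one-line recovery.c fix plugs a buffer_head leak: the scan loop takes a reference for every block it reads, and the "unrecognised magic" exit jumped to done without dropping it. A self-contained model of the reference discipline (made-up types; the kernel's brelse() decrements bh->b_count and may free the buffer):

#include <stdio.h>

struct buffer_head { int b_count; };

static void brelse(struct buffer_head *bh)
{
        bh->b_count--;          /* models dropping the reference */
}

int main(void)
{
        struct buffer_head bh = { .b_count = 1 };  /* ref from reading */
        int blocktype = -1;                        /* unrecognised magic */

        switch (blocktype) {
        case 1:                 /* recognised: handled, then released */
                brelse(&bh);
                break;
        default:
                brelse(&bh);    /* the fix: drop the ref before bailing */
                goto done;
        }
done:
        printf("b_count = %d (0 means no leak)\n", bh.b_count);
        return bh.b_count != 0;
}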
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 1862e8bc101d..b8886f048eaa 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -53,8 +53,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
53 if (!instr) { 53 if (!instr) {
54 printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); 54 printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
55 spin_lock(&c->erase_completion_lock); 55 spin_lock(&c->erase_completion_lock);
56 list_del(&jeb->list); 56 list_move(&jeb->list, &c->erase_pending_list);
57 list_add(&jeb->list, &c->erase_pending_list);
58 c->erasing_size -= c->sector_size; 57 c->erasing_size -= c->sector_size;
59 c->dirty_size += c->sector_size; 58 c->dirty_size += c->sector_size;
60 jeb->dirty_size = c->sector_size; 59 jeb->dirty_size = c->sector_size;
@@ -86,8 +85,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
86 /* Erase failed immediately. Refile it on the list */ 85 /* Erase failed immediately. Refile it on the list */
87 D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret)); 86 D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret));
88 spin_lock(&c->erase_completion_lock); 87 spin_lock(&c->erase_completion_lock);
89 list_del(&jeb->list); 88 list_move(&jeb->list, &c->erase_pending_list);
90 list_add(&jeb->list, &c->erase_pending_list);
91 c->erasing_size -= c->sector_size; 89 c->erasing_size -= c->sector_size;
92 c->dirty_size += c->sector_size; 90 c->dirty_size += c->sector_size;
93 jeb->dirty_size = c->sector_size; 91 jeb->dirty_size = c->sector_size;
@@ -161,8 +159,7 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
161{ 159{
162 D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset)); 160 D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset));
163 spin_lock(&c->erase_completion_lock); 161 spin_lock(&c->erase_completion_lock);
164 list_del(&jeb->list); 162 list_move_tail(&jeb->list, &c->erase_complete_list);
165 list_add_tail(&jeb->list, &c->erase_complete_list);
166 spin_unlock(&c->erase_completion_lock); 163 spin_unlock(&c->erase_completion_lock);
167 /* Ensure that kupdated calls us again to mark them clean */ 164 /* Ensure that kupdated calls us again to mark them clean */
168 jffs2_erase_pending_trigger(c); 165 jffs2_erase_pending_trigger(c);
@@ -178,8 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
178 if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { 175 if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
179 /* We'd like to give this block another try. */ 176 /* We'd like to give this block another try. */
180 spin_lock(&c->erase_completion_lock); 177 spin_lock(&c->erase_completion_lock);
181 list_del(&jeb->list); 178 list_move(&jeb->list, &c->erase_pending_list);
182 list_add(&jeb->list, &c->erase_pending_list);
183 c->erasing_size -= c->sector_size; 179 c->erasing_size -= c->sector_size;
184 c->dirty_size += c->sector_size; 180 c->dirty_size += c->sector_size;
185 jeb->dirty_size = c->sector_size; 181 jeb->dirty_size = c->sector_size;
@@ -191,8 +187,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
191 spin_lock(&c->erase_completion_lock); 187 spin_lock(&c->erase_completion_lock);
192 c->erasing_size -= c->sector_size; 188 c->erasing_size -= c->sector_size;
193 c->bad_size += c->sector_size; 189 c->bad_size += c->sector_size;
194 list_del(&jeb->list); 190 list_move(&jeb->list, &c->bad_list);
195 list_add(&jeb->list, &c->bad_list);
196 c->nr_erasing_blocks--; 191 c->nr_erasing_blocks--;
197 spin_unlock(&c->erase_completion_lock); 192 spin_unlock(&c->erase_completion_lock);
198 wake_up(&c->erase_wait); 193 wake_up(&c->erase_wait);
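The jffs2 hunks in this file (and the nodemgmt.c, wbuf.c and libfs.c ones below) are a mechanical conversion from the list_del()+list_add()/list_add_tail() pairs to the list_move()/list_move_tail() helpers, which splice a node from one list onto another in a single call. Their list.h implementation is essentially this (reproduced standalone with a small self-test):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void __list_del(struct list_head *prev, struct list_head *next)
{
        next->prev = prev;
        prev->next = next;
}

static void list_add(struct list_head *entry, struct list_head *head)
{
        entry->prev = head;
        entry->next = head->next;
        head->next->prev = entry;
        head->next = entry;
}

/* Unlink the node from wherever it is and re-add it on another list:
   one call instead of the list_del()+list_add() pair. */
static void list_move(struct list_head *entry, struct list_head *head)
{
        __list_del(entry->prev, entry->next);
        list_add(entry, head);
}

int main(void)
{
        struct list_head a = { &a, &a }, b = { &b, &b }, item;

        list_add(&item, &a);
        list_move(&item, &b);
        printf("moved: %s\n",
               (b.next == &item && a.next == &a) ? "yes" : "no");
        return 0;
}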
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 8bedfd2ff689..ac0c350ed7d7 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -211,8 +211,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
211 struct jffs2_eraseblock *ejeb; 211 struct jffs2_eraseblock *ejeb;
212 212
213 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); 213 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
214 list_del(&ejeb->list); 214 list_move_tail(&ejeb->list, &c->erase_pending_list);
215 list_add_tail(&ejeb->list, &c->erase_pending_list);
216 c->nr_erasing_blocks++; 215 c->nr_erasing_blocks++;
217 jffs2_erase_pending_trigger(c); 216 jffs2_erase_pending_trigger(c);
218 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", 217 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 0b02fc79e4d1..be1acc3dad97 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -43,7 +43,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c)
43 return -ENOMEM; 43 return -ENOMEM;
44 } 44 }
45 45
46 dbg_summary("returned succesfully\n"); 46 dbg_summary("returned successfully\n");
47 47
48 return 0; 48 return 0;
49} 49}
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index a7f153f79ecb..b9b700730dfe 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -495,8 +495,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
495 /* Fix up the original jeb now it's on the bad_list */ 495 /* Fix up the original jeb now it's on the bad_list */
496 if (first_raw == jeb->first_node) { 496 if (first_raw == jeb->first_node) {
497 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); 497 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
498 list_del(&jeb->list); 498 list_move(&jeb->list, &c->erase_pending_list);
499 list_add(&jeb->list, &c->erase_pending_list);
500 c->nr_erasing_blocks++; 499 c->nr_erasing_blocks++;
501 jffs2_erase_pending_trigger(c); 500 jffs2_erase_pending_trigger(c);
502 } 501 }
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 5549378358bf..4d52593a5fc6 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -126,7 +126,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
126 126
127 /* allocate the disk blocks for the extent. initially, extBalloc() 127 /* allocate the disk blocks for the extent. initially, extBalloc()
128 * will try to allocate disk blocks for the requested size (xlen). 128 * will try to allocate disk blocks for the requested size (xlen).
129 * if this fails (xlen contigious free blocks not avaliable), it'll 129 * if this fails (xlen contiguous free blocks not available), it'll
130 * try to allocate a smaller number of blocks (producing a smaller 130 * try to allocate a smaller number of blocks (producing a smaller
131 * extent), with this smaller number of blocks consisting of the 131 * extent), with this smaller number of blocks consisting of the
132 * requested number of blocks rounded down to the next smaller 132 * requested number of blocks rounded down to the next smaller
@@ -493,7 +493,7 @@ int extFill(struct inode *ip, xad_t * xp)
493 * 493 *
494 * initially, we will try to allocate disk blocks for the 494 * initially, we will try to allocate disk blocks for the
495 * requested size (nblocks). if this fails (nblocks 495 * requested size (nblocks). if this fails (nblocks
496 * contigious free blocks not avaliable), we'll try to allocate 496 * contiguous free blocks not available), we'll try to allocate
497 * a smaller number of blocks (producing a smaller extent), with 497 * a smaller number of blocks (producing a smaller extent), with
498 * this smaller number of blocks consisting of the requested 498 * this smaller number of blocks consisting of the requested
499 * number of blocks rounded down to the next smaller power of 2 499 * number of blocks rounded down to the next smaller power of 2
@@ -529,7 +529,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
529 529
530 /* get the number of blocks to initially attempt to allocate. 530 /* get the number of blocks to initially attempt to allocate.
531 * we'll first try the number of blocks requested unless this 531 * we'll first try the number of blocks requested unless this
532 * number is greater than the maximum number of contigious free 532 * number is greater than the maximum number of contiguous free
533 * blocks in the map. in that case, we'll start off with the 533 * blocks in the map. in that case, we'll start off with the
534 * maximum free. 534 * maximum free.
535 */ 535 */
@@ -586,7 +586,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
586 * in place. if this fails, we'll try to move the extent 586 * in place. if this fails, we'll try to move the extent
587 * to a new set of blocks. if moving the extent, we initially 587 * to a new set of blocks. if moving the extent, we initially
588 * will try to allocate disk blocks for the requested size 588 * will try to allocate disk blocks for the requested size
589 * (nnew). if this fails (nnew contigious free blocks not 589 * (nnew). if this fails (new contiguous free blocks not
590 * avaliable), we'll try to allocate a smaller number of 590 * avaliable), we'll try to allocate a smaller number of
591 * blocks (producing a smaller extent), with this smaller 591 * blocks (producing a smaller extent), with this smaller
592 * number of blocks consisting of the requested number of 592 * number of blocks consisting of the requested number of
diff --git a/fs/libfs.c b/fs/libfs.c
index 1b1156381787..ac02ea602c3d 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -149,10 +149,9 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
149 /* fallthrough */ 149 /* fallthrough */
150 default: 150 default:
151 spin_lock(&dcache_lock); 151 spin_lock(&dcache_lock);
152 if (filp->f_pos == 2) { 152 if (filp->f_pos == 2)
153 list_del(q); 153 list_move(q, &dentry->d_subdirs);
154 list_add(q, &dentry->d_subdirs); 154
155 }
156 for (p=q->next; p != &dentry->d_subdirs; p=p->next) { 155 for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
157 struct dentry *next; 156 struct dentry *next;
158 next = list_entry(p, struct dentry, d_u.d_child); 157 next = list_entry(p, struct dentry, d_u.d_child);
@@ -164,8 +163,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
164 return 0; 163 return 0;
165 spin_lock(&dcache_lock); 164 spin_lock(&dcache_lock);
166 /* next is still alive */ 165 /* next is still alive */
167 list_del(q); 166 list_move(q, p);
168 list_add(q, p);
169 p = q; 167 p = q;
170 filp->f_pos++; 168 filp->f_pos++;
171 } 169 }
@@ -424,13 +422,13 @@ out:
424 422
425static DEFINE_SPINLOCK(pin_fs_lock); 423static DEFINE_SPINLOCK(pin_fs_lock);
426 424
427int simple_pin_fs(char *name, struct vfsmount **mount, int *count) 425int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
428{ 426{
429 struct vfsmount *mnt = NULL; 427 struct vfsmount *mnt = NULL;
430 spin_lock(&pin_fs_lock); 428 spin_lock(&pin_fs_lock);
431 if (unlikely(!*mount)) { 429 if (unlikely(!*mount)) {
432 spin_unlock(&pin_fs_lock); 430 spin_unlock(&pin_fs_lock);
433 mnt = do_kern_mount(name, 0, name, NULL); 431 mnt = vfs_kern_mount(type, 0, type->name, NULL);
434 if (IS_ERR(mnt)) 432 if (IS_ERR(mnt))
435 return PTR_ERR(mnt); 433 return PTR_ERR(mnt);
436 spin_lock(&pin_fs_lock); 434 spin_lock(&pin_fs_lock);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index bce744468708..52774feab93f 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -147,11 +147,10 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
147 * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number, 147 * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number,
148 * that we mark locks for reclaiming, and that we bump the pseudo NSM state. 148 * that we mark locks for reclaiming, and that we bump the pseudo NSM state.
149 */ 149 */
150static inline 150static void nlmclnt_prepare_reclaim(struct nlm_host *host)
151void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
152{ 151{
152 down_write(&host->h_rwsem);
153 host->h_monitored = 0; 153 host->h_monitored = 0;
154 host->h_nsmstate = newstate;
155 host->h_state++; 154 host->h_state++;
156 host->h_nextrebind = 0; 155 host->h_nextrebind = 0;
157 nlm_rebind_host(host); 156 nlm_rebind_host(host);
@@ -164,6 +163,13 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
164 dprintk("NLM: reclaiming locks for host %s", host->h_name); 163 dprintk("NLM: reclaiming locks for host %s", host->h_name);
165} 164}
166 165
166static void nlmclnt_finish_reclaim(struct nlm_host *host)
167{
168 host->h_reclaiming = 0;
169 up_write(&host->h_rwsem);
170 dprintk("NLM: done reclaiming locks for host %s", host->h_name);
171}
172
167/* 173/*
168 * Reclaim all locks on server host. We do this by spawning a separate 174 * Reclaim all locks on server host. We do this by spawning a separate
169 * reclaimer thread. 175 * reclaimer thread.
@@ -171,12 +177,10 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
171void 177void
172nlmclnt_recovery(struct nlm_host *host, u32 newstate) 178nlmclnt_recovery(struct nlm_host *host, u32 newstate)
173{ 179{
174 if (host->h_reclaiming++) { 180 if (host->h_nsmstate == newstate)
175 if (host->h_nsmstate == newstate) 181 return;
176 return; 182 host->h_nsmstate = newstate;
177 nlmclnt_prepare_reclaim(host, newstate); 183 if (!host->h_reclaiming++) {
178 } else {
179 nlmclnt_prepare_reclaim(host, newstate);
180 nlm_get_host(host); 184 nlm_get_host(host);
181 __module_get(THIS_MODULE); 185 __module_get(THIS_MODULE);
182 if (kernel_thread(reclaimer, host, CLONE_KERNEL) < 0) 186 if (kernel_thread(reclaimer, host, CLONE_KERNEL) < 0)
@@ -190,6 +194,7 @@ reclaimer(void *ptr)
190 struct nlm_host *host = (struct nlm_host *) ptr; 194 struct nlm_host *host = (struct nlm_host *) ptr;
191 struct nlm_wait *block; 195 struct nlm_wait *block;
192 struct file_lock *fl, *next; 196 struct file_lock *fl, *next;
197 u32 nsmstate;
193 198
194 daemonize("%s-reclaim", host->h_name); 199 daemonize("%s-reclaim", host->h_name);
195 allow_signal(SIGKILL); 200 allow_signal(SIGKILL);
@@ -199,19 +204,25 @@ reclaimer(void *ptr)
199 lock_kernel(); 204 lock_kernel();
200 lockd_up(); 205 lockd_up();
201 206
207 nlmclnt_prepare_reclaim(host);
202 /* First, reclaim all locks that have been marked. */ 208 /* First, reclaim all locks that have been marked. */
203restart: 209restart:
210 nsmstate = host->h_nsmstate;
204 list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { 211 list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
205 list_del_init(&fl->fl_u.nfs_fl.list); 212 list_del_init(&fl->fl_u.nfs_fl.list);
206 213
207 if (signalled()) 214 if (signalled())
208 continue; 215 continue;
209 if (nlmclnt_reclaim(host, fl) == 0) 216 if (nlmclnt_reclaim(host, fl) != 0)
210 list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted); 217 continue;
211 goto restart; 218 list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
219 if (host->h_nsmstate != nsmstate) {
220 /* Argh! The server rebooted again! */
221 list_splice_init(&host->h_granted, &host->h_reclaim);
222 goto restart;
223 }
212 } 224 }
213 225 nlmclnt_finish_reclaim(host);
214 host->h_reclaiming = 0;
215 226
216 /* Now, wake up all processes that sleep on a blocked lock */ 227 /* Now, wake up all processes that sleep on a blocked lock */
217 list_for_each_entry(block, &nlm_blocked, b_list) { 228 list_for_each_entry(block, &nlm_blocked, b_list) {
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index f96e38155b5c..4db62098d3f4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -508,7 +508,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
508 } 508 }
509 509
510 block = nlmclnt_prepare_block(host, fl); 510 block = nlmclnt_prepare_block(host, fl);
511again:
511 for(;;) { 512 for(;;) {
513 /* Reboot protection */
514 fl->fl_u.nfs_fl.state = host->h_state;
512 status = nlmclnt_call(req, NLMPROC_LOCK); 515 status = nlmclnt_call(req, NLMPROC_LOCK);
513 if (status < 0) 516 if (status < 0)
514 goto out_unblock; 517 goto out_unblock;
@@ -531,10 +534,16 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
531 } 534 }
532 535
533 if (resp->status == NLM_LCK_GRANTED) { 536 if (resp->status == NLM_LCK_GRANTED) {
534 fl->fl_u.nfs_fl.state = host->h_state; 537 down_read(&host->h_rwsem);
538 /* Check whether or not the server has rebooted */
539 if (fl->fl_u.nfs_fl.state != host->h_state) {
540 up_read(&host->h_rwsem);
541 goto again;
542 }
535 fl->fl_flags |= FL_SLEEP; 543 fl->fl_flags |= FL_SLEEP;
536 /* Ensure the resulting lock will get added to granted list */ 544 /* Ensure the resulting lock will get added to granted list */
537 do_vfs_lock(fl); 545 do_vfs_lock(fl);
546 up_read(&host->h_rwsem);
538 } 547 }
539 status = nlm_stat_to_errno(resp->status); 548 status = nlm_stat_to_errno(resp->status);
540out_unblock: 549out_unblock:
@@ -596,6 +605,7 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl)
596static int 605static int
597nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) 606nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
598{ 607{
608 struct nlm_host *host = req->a_host;
599 struct nlm_res *resp = &req->a_res; 609 struct nlm_res *resp = &req->a_res;
600 int status; 610 int status;
601 611
@@ -604,7 +614,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
604 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either 614 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
605 * case, we want to unlock. 615 * case, we want to unlock.
606 */ 616 */
617 down_read(&host->h_rwsem);
607 do_vfs_lock(fl); 618 do_vfs_lock(fl);
619 up_read(&host->h_rwsem);
608 620
609 if (req->a_flags & RPC_TASK_ASYNC) 621 if (req->a_flags & RPC_TASK_ASYNC)
610 return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); 622 return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 729ac427d359..38b0e8a1aec0 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -112,11 +112,12 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
112 host->h_version = version; 112 host->h_version = version;
113 host->h_proto = proto; 113 host->h_proto = proto;
114 host->h_rpcclnt = NULL; 114 host->h_rpcclnt = NULL;
115 init_MUTEX(&host->h_sema); 115 mutex_init(&host->h_mutex);
116 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 116 host->h_nextrebind = jiffies + NLM_HOST_REBIND;
117 host->h_expires = jiffies + NLM_HOST_EXPIRE; 117 host->h_expires = jiffies + NLM_HOST_EXPIRE;
118 atomic_set(&host->h_count, 1); 118 atomic_set(&host->h_count, 1);
119 init_waitqueue_head(&host->h_gracewait); 119 init_waitqueue_head(&host->h_gracewait);
120 init_rwsem(&host->h_rwsem);
120 host->h_state = 0; /* pseudo NSM state */ 121 host->h_state = 0; /* pseudo NSM state */
121 host->h_nsmstate = 0; /* real NSM state */ 122 host->h_nsmstate = 0; /* real NSM state */
122 host->h_server = server; 123 host->h_server = server;
@@ -172,7 +173,7 @@ nlm_bind_host(struct nlm_host *host)
172 (unsigned)ntohl(host->h_addr.sin_addr.s_addr)); 173 (unsigned)ntohl(host->h_addr.sin_addr.s_addr));
173 174
174 /* Lock host handle */ 175 /* Lock host handle */
175 down(&host->h_sema); 176 mutex_lock(&host->h_mutex);
176 177
177 /* If we've already created an RPC client, check whether 178 /* If we've already created an RPC client, check whether
178 * RPC rebind is required 179 * RPC rebind is required
@@ -204,12 +205,12 @@ nlm_bind_host(struct nlm_host *host)
204 host->h_rpcclnt = clnt; 205 host->h_rpcclnt = clnt;
205 } 206 }
206 207
207 up(&host->h_sema); 208 mutex_unlock(&host->h_mutex);
208 return clnt; 209 return clnt;
209 210
210forgetit: 211forgetit:
211 printk("lockd: couldn't create RPC handle for %s\n", host->h_name); 212 printk("lockd: couldn't create RPC handle for %s\n", host->h_name);
212 up(&host->h_sema); 213 mutex_unlock(&host->h_mutex);
213 return NULL; 214 return NULL;
214} 215}
215 216
diff --git a/fs/namei.c b/fs/namei.c
index bb4a3e40e432..c784e8bb57a3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2243,14 +2243,16 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
2243 int error; 2243 int error;
2244 char * to; 2244 char * to;
2245 2245
2246 if (flags != 0) 2246 if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
2247 return -EINVAL; 2247 return -EINVAL;
2248 2248
2249 to = getname(newname); 2249 to = getname(newname);
2250 if (IS_ERR(to)) 2250 if (IS_ERR(to))
2251 return PTR_ERR(to); 2251 return PTR_ERR(to);
2252 2252
2253 error = __user_walk_fd(olddfd, oldname, 0, &old_nd); 2253 error = __user_walk_fd(olddfd, oldname,
2254 flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
2255 &old_nd);
2254 if (error) 2256 if (error)
2255 goto exit; 2257 goto exit;
2256 error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd); 2258 error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
diff --git a/fs/namespace.c b/fs/namespace.c
index c13072a5f1ee..b3ed212ea416 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -526,10 +526,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
526{ 526{
527 struct vfsmount *p; 527 struct vfsmount *p;
528 528
529 for (p = mnt; p; p = next_mnt(p, mnt)) { 529 for (p = mnt; p; p = next_mnt(p, mnt))
530 list_del(&p->mnt_hash); 530 list_move(&p->mnt_hash, kill);
531 list_add(&p->mnt_hash, kill);
532 }
533 531
534 if (propagate) 532 if (propagate)
535 propagate_umount(kill); 533 propagate_umount(kill);
@@ -585,8 +583,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
585 */ 583 */
586 584
587 lock_kernel(); 585 lock_kernel();
588 if ((flags & MNT_FORCE) && sb->s_op->umount_begin) 586 if (sb->s_op->umount_begin)
589 sb->s_op->umount_begin(sb); 587 sb->s_op->umount_begin(mnt, flags);
590 unlock_kernel(); 588 unlock_kernel();
591 589
592 /* 590 /*
@@ -1172,13 +1170,46 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
1172} 1170}
1173 1171
1174/* 1172/*
1173 * go through the vfsmounts we've just consigned to the graveyard to
1174 * - check that they're still dead
1175 * - delete the vfsmount from the appropriate namespace under lock
1176 * - dispose of the corpse
1177 */
1178static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts)
1179{
1180 struct namespace *namespace;
1181 struct vfsmount *mnt;
1182
1183 while (!list_empty(graveyard)) {
1184 LIST_HEAD(umounts);
1185 mnt = list_entry(graveyard->next, struct vfsmount, mnt_expire);
1186 list_del_init(&mnt->mnt_expire);
1187
1188 /* don't do anything if the namespace is dead - all the
1189 * vfsmounts from it are going away anyway */
1190 namespace = mnt->mnt_namespace;
1191 if (!namespace || !namespace->root)
1192 continue;
1193 get_namespace(namespace);
1194
1195 spin_unlock(&vfsmount_lock);
1196 down_write(&namespace_sem);
1197 expire_mount(mnt, mounts, &umounts);
1198 up_write(&namespace_sem);
1199 release_mounts(&umounts);
1200 mntput(mnt);
1201 put_namespace(namespace);
1202 spin_lock(&vfsmount_lock);
1203 }
1204}
1205
1206/*
1175 * process a list of expirable mountpoints with the intent of discarding any 1207 * process a list of expirable mountpoints with the intent of discarding any
1176 * mountpoints that aren't in use and haven't been touched since last we came 1208 * mountpoints that aren't in use and haven't been touched since last we came
1177 * here 1209 * here
1178 */ 1210 */
1179void mark_mounts_for_expiry(struct list_head *mounts) 1211void mark_mounts_for_expiry(struct list_head *mounts)
1180{ 1212{
1181 struct namespace *namespace;
1182 struct vfsmount *mnt, *next; 1213 struct vfsmount *mnt, *next;
1183 LIST_HEAD(graveyard); 1214 LIST_HEAD(graveyard);
1184 1215
@@ -1202,38 +1233,79 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1202 list_move(&mnt->mnt_expire, &graveyard); 1233 list_move(&mnt->mnt_expire, &graveyard);
1203 } 1234 }
1204 1235
1205 /* 1236 expire_mount_list(&graveyard, mounts);
1206 * go through the vfsmounts we've just consigned to the graveyard to
1207 * - check that they're still dead
1208 * - delete the vfsmount from the appropriate namespace under lock
1209 * - dispose of the corpse
1210 */
1211 while (!list_empty(&graveyard)) {
1212 LIST_HEAD(umounts);
1213 mnt = list_entry(graveyard.next, struct vfsmount, mnt_expire);
1214 list_del_init(&mnt->mnt_expire);
1215 1237
1216 /* don't do anything if the namespace is dead - all the 1238 spin_unlock(&vfsmount_lock);
1217 * vfsmounts from it are going away anyway */ 1239}
1218 namespace = mnt->mnt_namespace; 1240
1219 if (!namespace || !namespace->root) 1241EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
1242
1243/*
1244 * Ripoff of 'select_parent()'
1245 *
1246 * search the list of submounts for a given mountpoint, and move any
1247 * shrinkable submounts to the 'graveyard' list.
1248 */
1249static int select_submounts(struct vfsmount *parent, struct list_head *graveyard)
1250{
1251 struct vfsmount *this_parent = parent;
1252 struct list_head *next;
1253 int found = 0;
1254
1255repeat:
1256 next = this_parent->mnt_mounts.next;
1257resume:
1258 while (next != &this_parent->mnt_mounts) {
1259 struct list_head *tmp = next;
1260 struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child);
1261
1262 next = tmp->next;
1263 if (!(mnt->mnt_flags & MNT_SHRINKABLE))
1220 continue; 1264 continue;
1221 get_namespace(namespace); 1265 /*
1266 * Descend a level if the d_mounts list is non-empty.
1267 */
1268 if (!list_empty(&mnt->mnt_mounts)) {
1269 this_parent = mnt;
1270 goto repeat;
1271 }
1222 1272
1223 spin_unlock(&vfsmount_lock); 1273 if (!propagate_mount_busy(mnt, 1)) {
1224 down_write(&namespace_sem); 1274 mntget(mnt);
1225 expire_mount(mnt, mounts, &umounts); 1275 list_move_tail(&mnt->mnt_expire, graveyard);
1226 up_write(&namespace_sem); 1276 found++;
1227 release_mounts(&umounts); 1277 }
1228 mntput(mnt); 1278 }
1229 put_namespace(namespace); 1279 /*
1230 spin_lock(&vfsmount_lock); 1280 * All done at this level ... ascend and resume the search
1281 */
1282 if (this_parent != parent) {
1283 next = this_parent->mnt_child.next;
1284 this_parent = this_parent->mnt_parent;
1285 goto resume;
1231 } 1286 }
1287 return found;
1288}
1289
1290/*
1291 * process a list of expirable mountpoints with the intent of discarding any
1292 * submounts of a specific parent mountpoint
1293 */
1294void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts)
1295{
1296 LIST_HEAD(graveyard);
1297 int found;
1298
1299 spin_lock(&vfsmount_lock);
1300
1301 /* extract submounts of 'mountpoint' from the expiration list */
1302 while ((found = select_submounts(mountpoint, &graveyard)) != 0)
1303 expire_mount_list(&graveyard, mounts);
1232 1304
1233 spin_unlock(&vfsmount_lock); 1305 spin_unlock(&vfsmount_lock);
1234} 1306}
1235 1307
1236EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); 1308EXPORT_SYMBOL_GPL(shrink_submounts);
1237 1309
1238/* 1310/*
1239 * Some copy_from_user() implementations do not return the exact number of 1311 * Some copy_from_user() implementations do not return the exact number of
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ec61fd56a1a9..0b572a0c1967 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -4,14 +4,16 @@
4 4
5obj-$(CONFIG_NFS_FS) += nfs.o 5obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7nfs-y := dir.o file.o inode.o nfs2xdr.o pagelist.o \ 7nfs-y := dir.o file.o inode.o super.o nfs2xdr.o pagelist.o \
8 proc.o read.o symlink.o unlink.o write.o 8 proc.o read.o symlink.o unlink.o write.o \
9 namespace.o
9nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o 10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o
10nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o 11nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
11nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o 12nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
12nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ 13nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
13 delegation.o idmap.o \ 14 delegation.o idmap.o \
14 callback.o callback_xdr.o callback_proc.o 15 callback.o callback_xdr.o callback_proc.o \
16 nfs4namespace.o
15nfs-$(CONFIG_NFS_DIRECTIO) += direct.o 17nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
16nfs-$(CONFIG_SYSCTL) += sysctl.o 18nfs-$(CONFIG_SYSCTL) += sysctl.o
17nfs-objs := $(nfs-y) 19nfs-objs := $(nfs-y)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 90c95adc8c1b..d53f8c6a9ecb 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -182,8 +182,6 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
182/* 182/*
183 * Define NFS4 callback program 183 * Define NFS4 callback program
184 */ 184 */
185extern struct svc_version nfs4_callback_version1;
186
187static struct svc_version *nfs4_callback_version[] = { 185static struct svc_version *nfs4_callback_version[] = {
188 [1] = &nfs4_callback_version1, 186 [1] = &nfs4_callback_version1,
189}; 187};
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 05c38cf40b69..c92991328d9a 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -202,7 +202,7 @@ static unsigned decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xd
202 status = decode_fh(xdr, &args->fh); 202 status = decode_fh(xdr, &args->fh);
203out: 203out:
204 dprintk("%s: exit with status = %d\n", __FUNCTION__, status); 204 dprintk("%s: exit with status = %d\n", __FUNCTION__, status);
205 return 0; 205 return status;
206} 206}
207 207
208static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 208static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cae74dd4c7f5..3ddda6f7ecc2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -528,7 +528,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
528 528
529 lock_kernel(); 529 lock_kernel();
530 530
531 res = nfs_revalidate_inode(NFS_SERVER(inode), inode); 531 res = nfs_revalidate_mapping(inode, filp->f_mapping);
532 if (res < 0) { 532 if (res < 0) {
533 unlock_kernel(); 533 unlock_kernel();
534 return res; 534 return res;
@@ -868,6 +868,17 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
868 return (nd->intent.open.flags & O_EXCL) != 0; 868 return (nd->intent.open.flags & O_EXCL) != 0;
869} 869}
870 870
871static inline int nfs_reval_fsid(struct inode *dir,
872 struct nfs_fh *fh, struct nfs_fattr *fattr)
873{
874 struct nfs_server *server = NFS_SERVER(dir);
875
876 if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
877 /* Revalidate fsid on root dir */
878 return __nfs_revalidate_inode(server, dir->i_sb->s_root->d_inode);
879 return 0;
880}
881
871static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 882static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
872{ 883{
873 struct dentry *res; 884 struct dentry *res;
@@ -900,6 +911,11 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
900 res = ERR_PTR(error); 911 res = ERR_PTR(error);
901 goto out_unlock; 912 goto out_unlock;
902 } 913 }
914 error = nfs_reval_fsid(dir, &fhandle, &fattr);
915 if (error < 0) {
916 res = ERR_PTR(error);
917 goto out_unlock;
918 }
903 inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); 919 inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
904 res = (struct dentry *)inode; 920 res = (struct dentry *)inode;
905 if (IS_ERR(res)) 921 if (IS_ERR(res))
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 3c72b0c07283..8ca9707be6c9 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -892,7 +892,7 @@ out:
892 * nfs_init_directcache - create a slab cache for nfs_direct_req structures 892 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
893 * 893 *
894 */ 894 */
895int nfs_init_directcache(void) 895int __init nfs_init_directcache(void)
896{ 896{
897 nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", 897 nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
898 sizeof(struct nfs_direct_req), 898 sizeof(struct nfs_direct_req),
@@ -906,7 +906,7 @@ int nfs_init_directcache(void)
906} 906}
907 907
908/** 908/**
909 * nfs_init_directcache - destroy the slab cache for nfs_direct_req structures 909 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
910 * 910 *
911 */ 911 */
912void nfs_destroy_directcache(void) 912void nfs_destroy_directcache(void)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index fa05c027ea11..add289138836 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -127,23 +127,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
127} 127}
128 128
129/** 129/**
130 * nfs_revalidate_file - Revalidate the page cache & related metadata
131 * @inode - pointer to inode struct
132 * @file - pointer to file
133 */
134static int nfs_revalidate_file(struct inode *inode, struct file *filp)
135{
136 struct nfs_inode *nfsi = NFS_I(inode);
137 int retval = 0;
138
139 if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR))
140 || nfs_attribute_timeout(inode))
141 retval = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
142 nfs_revalidate_mapping(inode, filp->f_mapping);
143 return 0;
144}
145
146/**
147 * nfs_revalidate_size - Revalidate the file size 130 * nfs_revalidate_size - Revalidate the file size
148 * @inode - pointer to inode struct 131 * @inode - pointer to inode struct
149 * @file - pointer to struct file 132 * @file - pointer to struct file
@@ -228,7 +211,7 @@ nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos)
228 dentry->d_parent->d_name.name, dentry->d_name.name, 211 dentry->d_parent->d_name.name, dentry->d_name.name,
229 (unsigned long) count, (unsigned long) pos); 212 (unsigned long) count, (unsigned long) pos);
230 213
231 result = nfs_revalidate_file(inode, iocb->ki_filp); 214 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
232 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count); 215 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count);
233 if (!result) 216 if (!result)
234 result = generic_file_aio_read(iocb, buf, count, pos); 217 result = generic_file_aio_read(iocb, buf, count, pos);
@@ -247,7 +230,7 @@ nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count,
247 dentry->d_parent->d_name.name, dentry->d_name.name, 230 dentry->d_parent->d_name.name, dentry->d_name.name,
248 (unsigned long) count, (unsigned long long) *ppos); 231 (unsigned long) count, (unsigned long long) *ppos);
249 232
250 res = nfs_revalidate_file(inode, filp); 233 res = nfs_revalidate_mapping(inode, filp->f_mapping);
251 if (!res) 234 if (!res)
252 res = generic_file_sendfile(filp, ppos, count, actor, target); 235 res = generic_file_sendfile(filp, ppos, count, actor, target);
253 return res; 236 return res;
@@ -263,7 +246,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
263 dfprintk(VFS, "nfs: mmap(%s/%s)\n", 246 dfprintk(VFS, "nfs: mmap(%s/%s)\n",
264 dentry->d_parent->d_name.name, dentry->d_name.name); 247 dentry->d_parent->d_name.name, dentry->d_name.name);
265 248
266 status = nfs_revalidate_file(inode, file); 249 status = nfs_revalidate_mapping(inode, file->f_mapping);
267 if (!status) 250 if (!status)
268 status = generic_file_mmap(file, vma); 251 status = generic_file_mmap(file, vma);
269 return status; 252 return status;
@@ -320,7 +303,11 @@ static int nfs_commit_write(struct file *file, struct page *page, unsigned offse
320 303
321static void nfs_invalidate_page(struct page *page, unsigned long offset) 304static void nfs_invalidate_page(struct page *page, unsigned long offset)
322{ 305{
323 /* FIXME: we really should cancel any unstarted writes on this page */ 306 struct inode *inode = page->mapping->host;
307
308 /* Cancel any unstarted writes on this page */
309 if (offset == 0)
310 nfs_sync_inode_wait(inode, page->index, 1, FLUSH_INVALIDATE);
324} 311}
325 312
326static int nfs_release_page(struct page *page, gfp_t gfp) 313static int nfs_release_page(struct page *page, gfp_t gfp)
@@ -373,7 +360,6 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t
373 if (result) 360 if (result)
374 goto out; 361 goto out;
375 } 362 }
376 nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
377 363
378 result = count; 364 result = count;
379 if (!count) 365 if (!count)
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 3fab5b0cfc5a..b81e7ed3c902 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -47,7 +47,6 @@
47#include <linux/workqueue.h> 47#include <linux/workqueue.h>
48#include <linux/sunrpc/rpc_pipe_fs.h> 48#include <linux/sunrpc/rpc_pipe_fs.h>
49 49
50#include <linux/nfs_fs_sb.h>
51#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
52 51
53#include <linux/nfs_idmap.h> 52#include <linux/nfs_idmap.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 937fbfc381bb..c5b916605fb0 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,8 @@
36#include <linux/mount.h> 36#include <linux/mount.h>
37#include <linux/nfs_idmap.h> 37#include <linux/nfs_idmap.h>
38#include <linux/vfs.h> 38#include <linux/vfs.h>
39#include <linux/inet.h>
40#include <linux/nfs_xdr.h>
39 41
40#include <asm/system.h> 42#include <asm/system.h>
41#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -44,89 +46,17 @@
44#include "callback.h" 46#include "callback.h"
45#include "delegation.h" 47#include "delegation.h"
46#include "iostat.h" 48#include "iostat.h"
49#include "internal.h"
47 50
48#define NFSDBG_FACILITY NFSDBG_VFS 51#define NFSDBG_FACILITY NFSDBG_VFS
49#define NFS_PARANOIA 1 52#define NFS_PARANOIA 1
50 53
51/* Maximum number of readahead requests
52 * FIXME: this should really be a sysctl so that users may tune it to suit
53 * their needs. People that do NFS over a slow network, might for
54 * instance want to reduce it to something closer to 1 for improved
55 * interactive response.
56 */
57#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
58
59static void nfs_invalidate_inode(struct inode *); 54static void nfs_invalidate_inode(struct inode *);
60static int nfs_update_inode(struct inode *, struct nfs_fattr *); 55static int nfs_update_inode(struct inode *, struct nfs_fattr *);
61 56
62static struct inode *nfs_alloc_inode(struct super_block *sb);
63static void nfs_destroy_inode(struct inode *);
64static int nfs_write_inode(struct inode *,int);
65static void nfs_delete_inode(struct inode *);
66static void nfs_clear_inode(struct inode *);
67static void nfs_umount_begin(struct super_block *);
68static int nfs_statfs(struct dentry *, struct kstatfs *);
69static int nfs_show_options(struct seq_file *, struct vfsmount *);
70static int nfs_show_stats(struct seq_file *, struct vfsmount *);
71static void nfs_zap_acl_cache(struct inode *); 57static void nfs_zap_acl_cache(struct inode *);
72 58
73static struct rpc_program nfs_program; 59static kmem_cache_t * nfs_inode_cachep;
74
75static struct super_operations nfs_sops = {
76 .alloc_inode = nfs_alloc_inode,
77 .destroy_inode = nfs_destroy_inode,
78 .write_inode = nfs_write_inode,
79 .delete_inode = nfs_delete_inode,
80 .statfs = nfs_statfs,
81 .clear_inode = nfs_clear_inode,
82 .umount_begin = nfs_umount_begin,
83 .show_options = nfs_show_options,
84 .show_stats = nfs_show_stats,
85};
86
87/*
88 * RPC cruft for NFS
89 */
90static struct rpc_stat nfs_rpcstat = {
91 .program = &nfs_program
92};
93static struct rpc_version * nfs_version[] = {
94 NULL,
95 NULL,
96 &nfs_version2,
97#if defined(CONFIG_NFS_V3)
98 &nfs_version3,
99#elif defined(CONFIG_NFS_V4)
100 NULL,
101#endif
102#if defined(CONFIG_NFS_V4)
103 &nfs_version4,
104#endif
105};
106
107static struct rpc_program nfs_program = {
108 .name = "nfs",
109 .number = NFS_PROGRAM,
110 .nrvers = ARRAY_SIZE(nfs_version),
111 .version = nfs_version,
112 .stats = &nfs_rpcstat,
113 .pipe_dir_name = "/nfs",
114};
115
116#ifdef CONFIG_NFS_V3_ACL
117static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
118static struct rpc_version * nfsacl_version[] = {
119 [3] = &nfsacl_version3,
120};
121
122struct rpc_program nfsacl_program = {
123 .name = "nfsacl",
124 .number = NFS_ACL_PROGRAM,
125 .nrvers = ARRAY_SIZE(nfsacl_version),
126 .version = nfsacl_version,
127 .stats = &nfsacl_rpcstat,
128};
129#endif /* CONFIG_NFS_V3_ACL */
130 60
131static inline unsigned long 61static inline unsigned long
132nfs_fattr_to_ino_t(struct nfs_fattr *fattr) 62nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
@@ -134,8 +64,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
134 return nfs_fileid_to_ino_t(fattr->fileid); 64 return nfs_fileid_to_ino_t(fattr->fileid);
135} 65}
136 66
137static int 67int nfs_write_inode(struct inode *inode, int sync)
138nfs_write_inode(struct inode *inode, int sync)
139{ 68{
140 int flags = sync ? FLUSH_SYNC : 0; 69 int flags = sync ? FLUSH_SYNC : 0;
141 int ret; 70 int ret;
@@ -146,31 +75,15 @@ nfs_write_inode(struct inode *inode, int sync)
146 return 0; 75 return 0;
147} 76}
148 77
149static void 78void nfs_clear_inode(struct inode *inode)
150nfs_delete_inode(struct inode * inode)
151{ 79{
152 dprintk("NFS: delete_inode(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); 80 struct nfs_inode *nfsi = NFS_I(inode);
153 81 struct rpc_cred *cred;
154 truncate_inode_pages(&inode->i_data, 0);
155 82
156 nfs_wb_all(inode);
157 /* 83 /*
158 * The following should never happen... 84 * The following should never happen...
159 */ 85 */
160 if (nfs_have_writebacks(inode)) { 86 BUG_ON(nfs_have_writebacks(inode));
161 printk(KERN_ERR "nfs_delete_inode: inode %ld has pending RPC requests\n", inode->i_ino);
162 }
163
164 clear_inode(inode);
165}
166
167static void
168nfs_clear_inode(struct inode *inode)
169{
170 struct nfs_inode *nfsi = NFS_I(inode);
171 struct rpc_cred *cred;
172
173 nfs_wb_all(inode);
174 BUG_ON (!list_empty(&nfsi->open_files)); 87 BUG_ON (!list_empty(&nfsi->open_files));
175 nfs_zap_acl_cache(inode); 88 nfs_zap_acl_cache(inode);
176 cred = nfsi->cache_access.cred; 89 cred = nfsi->cache_access.cred;
@@ -179,555 +92,6 @@ nfs_clear_inode(struct inode *inode)
179 BUG_ON(atomic_read(&nfsi->data_updates) != 0); 92 BUG_ON(atomic_read(&nfsi->data_updates) != 0);
180} 93}
181 94
182void
183nfs_umount_begin(struct super_block *sb)
184{
185 struct rpc_clnt *rpc = NFS_SB(sb)->client;
186
187 /* -EIO all pending I/O */
188 if (!IS_ERR(rpc))
189 rpc_killall_tasks(rpc);
190 rpc = NFS_SB(sb)->client_acl;
191 if (!IS_ERR(rpc))
192 rpc_killall_tasks(rpc);
193}
194
195
196static inline unsigned long
197nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
198{
199 /* make sure blocksize is a power of two */
200 if ((bsize & (bsize - 1)) || nrbitsp) {
201 unsigned char nrbits;
202
203 for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--)
204 ;
205 bsize = 1 << nrbits;
206 if (nrbitsp)
207 *nrbitsp = nrbits;
208 }
209
210 return bsize;
211}
212
213/*
214 * Calculate the number of 512byte blocks used.
215 */
216static inline unsigned long
217nfs_calc_block_size(u64 tsize)
218{
219 loff_t used = (tsize + 511) >> 9;
220 return (used > ULONG_MAX) ? ULONG_MAX : used;
221}
222
223/*
224 * Compute and set NFS server blocksize
225 */
226static inline unsigned long
227nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
228{
229 if (bsize < NFS_MIN_FILE_IO_SIZE)
230 bsize = NFS_DEF_FILE_IO_SIZE;
231 else if (bsize >= NFS_MAX_FILE_IO_SIZE)
232 bsize = NFS_MAX_FILE_IO_SIZE;
233
234 return nfs_block_bits(bsize, nrbitsp);
235}
236
237/*
238 * Obtain the root inode of the file system.
239 */
240static struct inode *
241nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo)
242{
243 struct nfs_server *server = NFS_SB(sb);
244 int error;
245
246 error = server->rpc_ops->getroot(server, rootfh, fsinfo);
247 if (error < 0) {
248 dprintk("nfs_get_root: getattr error = %d\n", -error);
249 return ERR_PTR(error);
250 }
251
252 return nfs_fhget(sb, rootfh, fsinfo->fattr);
253}
254
255/*
256 * Do NFS version-independent mount processing, and sanity checking
257 */
258static int
259nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
260{
261 struct nfs_server *server;
262 struct inode *root_inode;
263 struct nfs_fattr fattr;
264 struct nfs_fsinfo fsinfo = {
265 .fattr = &fattr,
266 };
267 struct nfs_pathconf pathinfo = {
268 .fattr = &fattr,
269 };
270 int no_root_error = 0;
271 unsigned long max_rpc_payload;
272
273 /* We probably want something more informative here */
274 snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
275
276 server = NFS_SB(sb);
277
278 sb->s_magic = NFS_SUPER_MAGIC;
279
280 server->io_stats = nfs_alloc_iostats();
281 if (server->io_stats == NULL)
282 return -ENOMEM;
283
284 root_inode = nfs_get_root(sb, &server->fh, &fsinfo);
285 /* Did getting the root inode fail? */
286 if (IS_ERR(root_inode)) {
287 no_root_error = PTR_ERR(root_inode);
288 goto out_no_root;
289 }
290 sb->s_root = d_alloc_root(root_inode);
291 if (!sb->s_root) {
292 no_root_error = -ENOMEM;
293 goto out_no_root;
294 }
295 sb->s_root->d_op = server->rpc_ops->dentry_ops;
296
297 /* mount time stamp, in seconds */
298 server->mount_time = jiffies;
299
300 /* Get some general file system info */
301 if (server->namelen == 0 &&
302 server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
303 server->namelen = pathinfo.max_namelen;
304 /* Work out a lot of parameters */
305 if (server->rsize == 0)
306 server->rsize = nfs_block_size(fsinfo.rtpref, NULL);
307 if (server->wsize == 0)
308 server->wsize = nfs_block_size(fsinfo.wtpref, NULL);
309
310 if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax)
311 server->rsize = nfs_block_size(fsinfo.rtmax, NULL);
312 if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax)
313 server->wsize = nfs_block_size(fsinfo.wtmax, NULL);
314
315 max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
316 if (server->rsize > max_rpc_payload)
317 server->rsize = max_rpc_payload;
318 if (server->rsize > NFS_MAX_FILE_IO_SIZE)
319 server->rsize = NFS_MAX_FILE_IO_SIZE;
320 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
321
322 if (server->wsize > max_rpc_payload)
323 server->wsize = max_rpc_payload;
324 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
325 server->wsize = NFS_MAX_FILE_IO_SIZE;
326 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
327
328 if (sb->s_blocksize == 0)
329 sb->s_blocksize = nfs_block_bits(server->wsize,
330 &sb->s_blocksize_bits);
331 server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL);
332
333 server->dtsize = nfs_block_size(fsinfo.dtpref, NULL);
334 if (server->dtsize > PAGE_CACHE_SIZE)
335 server->dtsize = PAGE_CACHE_SIZE;
336 if (server->dtsize > server->rsize)
337 server->dtsize = server->rsize;
338
339 if (server->flags & NFS_MOUNT_NOAC) {
340 server->acregmin = server->acregmax = 0;
341 server->acdirmin = server->acdirmax = 0;
342 sb->s_flags |= MS_SYNCHRONOUS;
343 }
344 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
345
346 sb->s_maxbytes = fsinfo.maxfilesize;
347 if (sb->s_maxbytes > MAX_LFS_FILESIZE)
348 sb->s_maxbytes = MAX_LFS_FILESIZE;
349
350 server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
351 server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
352
353 /* We're airborne Set socket buffersize */
354 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
355 return 0;
356 /* Yargs. It didn't work out. */
357out_no_root:
358 dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error);
359 if (!IS_ERR(root_inode))
360 iput(root_inode);
361 return no_root_error;
362}
363
364static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans)
365{
366 to->to_initval = timeo * HZ / 10;
367 to->to_retries = retrans;
368 if (!to->to_retries)
369 to->to_retries = 2;
370
371 switch (proto) {
372 case IPPROTO_TCP:
373 if (!to->to_initval)
374 to->to_initval = 60 * HZ;
375 if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
376 to->to_initval = NFS_MAX_TCP_TIMEOUT;
377 to->to_increment = to->to_initval;
378 to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
379 to->to_exponential = 0;
380 break;
381 case IPPROTO_UDP:
382 default:
383 if (!to->to_initval)
384 to->to_initval = 11 * HZ / 10;
385 if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
386 to->to_initval = NFS_MAX_UDP_TIMEOUT;
387 to->to_maxval = NFS_MAX_UDP_TIMEOUT;
388 to->to_exponential = 1;
389 break;
390 }
391}
392
393/*
394 * Create an RPC client handle.
395 */
396static struct rpc_clnt *
397nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
398{
399 struct rpc_timeout timeparms;
400 struct rpc_xprt *xprt = NULL;
401 struct rpc_clnt *clnt = NULL;
402 int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
403
404 nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
405
406 server->retrans_timeo = timeparms.to_initval;
407 server->retrans_count = timeparms.to_retries;
408
409 /* create transport and client */
410 xprt = xprt_create_proto(proto, &server->addr, &timeparms);
411 if (IS_ERR(xprt)) {
412 dprintk("%s: cannot create RPC transport. Error = %ld\n",
413 __FUNCTION__, PTR_ERR(xprt));
414 return (struct rpc_clnt *)xprt;
415 }
416 clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
417 server->rpc_ops->version, data->pseudoflavor);
418 if (IS_ERR(clnt)) {
419 dprintk("%s: cannot create RPC client. Error = %ld\n",
420 __FUNCTION__, PTR_ERR(xprt));
421 goto out_fail;
422 }
423
424 clnt->cl_intr = 1;
425 clnt->cl_softrtry = 1;
426
427 return clnt;
428
429out_fail:
430 return clnt;
431}
432
433/*
434 * The way this works is that the mount process passes a structure
435 * in the data argument which contains the server's IP address
436 * and the root file handle obtained from the server's mount
437 * daemon. We stash these away in the private superblock fields.
438 */
439static int
440nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
441{
442 struct nfs_server *server;
443 rpc_authflavor_t authflavor;
444
445 server = NFS_SB(sb);
446 sb->s_blocksize_bits = 0;
447 sb->s_blocksize = 0;
448 if (data->bsize)
449 sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
450 if (data->rsize)
451 server->rsize = nfs_block_size(data->rsize, NULL);
452 if (data->wsize)
453 server->wsize = nfs_block_size(data->wsize, NULL);
454 server->flags = data->flags & NFS_MOUNT_FLAGMASK;
455
456 server->acregmin = data->acregmin*HZ;
457 server->acregmax = data->acregmax*HZ;
458 server->acdirmin = data->acdirmin*HZ;
459 server->acdirmax = data->acdirmax*HZ;
460
461 /* Start lockd here, before we might error out */
462 if (!(server->flags & NFS_MOUNT_NONLM))
463 lockd_up();
464
465 server->namelen = data->namlen;
466 server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL);
467 if (!server->hostname)
468 return -ENOMEM;
469 strcpy(server->hostname, data->hostname);
470
471 /* Check NFS protocol revision and initialize RPC op vector
472 * and file handle pool. */
473#ifdef CONFIG_NFS_V3
474 if (server->flags & NFS_MOUNT_VER3) {
475 server->rpc_ops = &nfs_v3_clientops;
476 server->caps |= NFS_CAP_READDIRPLUS;
477 } else {
478 server->rpc_ops = &nfs_v2_clientops;
479 }
480#else
481 server->rpc_ops = &nfs_v2_clientops;
482#endif
483
484 /* Fill in pseudoflavor for mount version < 5 */
485 if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
486 data->pseudoflavor = RPC_AUTH_UNIX;
487 authflavor = data->pseudoflavor; /* save for sb_init() */
488 /* XXX maybe we want to add a server->pseudoflavor field */
489
490 /* Create RPC client handles */
491 server->client = nfs_create_client(server, data);
492 if (IS_ERR(server->client))
493 return PTR_ERR(server->client);
494 /* RFC 2623, sec 2.3.2 */
495 if (authflavor != RPC_AUTH_UNIX) {
496 struct rpc_auth *auth;
497
498 server->client_sys = rpc_clone_client(server->client);
499 if (IS_ERR(server->client_sys))
500 return PTR_ERR(server->client_sys);
501 auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys);
502 if (IS_ERR(auth))
503 return PTR_ERR(auth);
504 } else {
505 atomic_inc(&server->client->cl_count);
506 server->client_sys = server->client;
507 }
508 if (server->flags & NFS_MOUNT_VER3) {
509#ifdef CONFIG_NFS_V3_ACL
510 if (!(server->flags & NFS_MOUNT_NOACL)) {
511 server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
512 /* No errors! Assume that Sun nfsacls are supported */
513 if (!IS_ERR(server->client_acl))
514 server->caps |= NFS_CAP_ACLS;
515 }
516#else
517 server->flags &= ~NFS_MOUNT_NOACL;
518#endif /* CONFIG_NFS_V3_ACL */
519 /*
520 * The VFS shouldn't apply the umask to mode bits. We will
521 * do so ourselves when necessary.
522 */
523 sb->s_flags |= MS_POSIXACL;
524 if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
525 server->namelen = NFS3_MAXNAMLEN;
526 sb->s_time_gran = 1;
527 } else {
528 if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
529 server->namelen = NFS2_MAXNAMLEN;
530 }
531
532 sb->s_op = &nfs_sops;
533 return nfs_sb_init(sb, authflavor);
534}
535
536static int
537nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
538{
539 struct super_block *sb = dentry->d_sb;
540 struct nfs_server *server = NFS_SB(sb);
541 unsigned char blockbits;
542 unsigned long blockres;
543 struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode);
544 struct nfs_fattr fattr;
545 struct nfs_fsstat res = {
546 .fattr = &fattr,
547 };
548 int error;
549
550 lock_kernel();
551
552 error = server->rpc_ops->statfs(server, rootfh, &res);
553 buf->f_type = NFS_SUPER_MAGIC;
554 if (error < 0)
555 goto out_err;
556
557 /*
558 * Current versions of glibc do not correctly handle the
559 * case where f_frsize != f_bsize. Eventually we want to
560 * report the value of wtmult in this field.
561 */
562 buf->f_frsize = sb->s_blocksize;
563
564 /*
565 * On most *nix systems, f_blocks, f_bfree, and f_bavail
566 * are reported in units of f_frsize. Linux hasn't had
567 * an f_frsize field in its statfs struct until recently,
568 * thus historically Linux's sys_statfs reports these
569 * fields in units of f_bsize.
570 */
571 buf->f_bsize = sb->s_blocksize;
572 blockbits = sb->s_blocksize_bits;
573 blockres = (1 << blockbits) - 1;
574 buf->f_blocks = (res.tbytes + blockres) >> blockbits;
575 buf->f_bfree = (res.fbytes + blockres) >> blockbits;
576 buf->f_bavail = (res.abytes + blockres) >> blockbits;
577
578 buf->f_files = res.tfiles;
579 buf->f_ffree = res.afiles;
580
581 buf->f_namelen = server->namelen;
582 out:
583 unlock_kernel();
584 return 0;
585
586 out_err:
587 dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
588 buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1;
589 goto out;
590
591}
592
593static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
594{
595 static struct proc_nfs_info {
596 int flag;
597 char *str;
598 char *nostr;
599 } nfs_info[] = {
600 { NFS_MOUNT_SOFT, ",soft", ",hard" },
601 { NFS_MOUNT_INTR, ",intr", "" },
602 { NFS_MOUNT_NOCTO, ",nocto", "" },
603 { NFS_MOUNT_NOAC, ",noac", "" },
604 { NFS_MOUNT_NONLM, ",nolock", "" },
605 { NFS_MOUNT_NOACL, ",noacl", "" },
606 { 0, NULL, NULL }
607 };
608 struct proc_nfs_info *nfs_infop;
609 char buf[12];
610 char *proto;
611
612 seq_printf(m, ",vers=%d", nfss->rpc_ops->version);
613 seq_printf(m, ",rsize=%d", nfss->rsize);
614 seq_printf(m, ",wsize=%d", nfss->wsize);
615 if (nfss->acregmin != 3*HZ || showdefaults)
616 seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ);
617 if (nfss->acregmax != 60*HZ || showdefaults)
618 seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ);
619 if (nfss->acdirmin != 30*HZ || showdefaults)
620 seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ);
621 if (nfss->acdirmax != 60*HZ || showdefaults)
622 seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ);
623 for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
624 if (nfss->flags & nfs_infop->flag)
625 seq_puts(m, nfs_infop->str);
626 else
627 seq_puts(m, nfs_infop->nostr);
628 }
629 switch (nfss->client->cl_xprt->prot) {
630 case IPPROTO_TCP:
631 proto = "tcp";
632 break;
633 case IPPROTO_UDP:
634 proto = "udp";
635 break;
636 default:
637 snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
638 proto = buf;
639 }
640 seq_printf(m, ",proto=%s", proto);
641 seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);
642 seq_printf(m, ",retrans=%u", nfss->retrans_count);
643}
644
645static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
646{
647 struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
648
649 nfs_show_mount_options(m, nfss, 0);
650
651 seq_puts(m, ",addr=");
652 seq_escape(m, nfss->hostname, " \t\n\\");
653
654 return 0;
655}
656
657static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
658{
659 int i, cpu;
660 struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
661 struct rpc_auth *auth = nfss->client->cl_auth;
662 struct nfs_iostats totals = { };
663
664 seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
665
666 /*
667 * Display all mount option settings
668 */
669 seq_printf(m, "\n\topts:\t");
670 seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
671 seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
672 seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : "");
673 seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
674 nfs_show_mount_options(m, nfss, 1);
675
676 seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
677
678 seq_printf(m, "\n\tcaps:\t");
679 seq_printf(m, "caps=0x%x", nfss->caps);
680 seq_printf(m, ",wtmult=%d", nfss->wtmult);
681 seq_printf(m, ",dtsize=%d", nfss->dtsize);
682 seq_printf(m, ",bsize=%d", nfss->bsize);
683 seq_printf(m, ",namelen=%d", nfss->namelen);
684
685#ifdef CONFIG_NFS_V4
686 if (nfss->rpc_ops->version == 4) {
687 seq_printf(m, "\n\tnfsv4:\t");
688 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
689 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
690 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
691 }
692#endif
693
694 /*
695 * Display security flavor in effect for this mount
696 */
697 seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor);
698 if (auth->au_flavor)
699 seq_printf(m, ",pseudoflavor=%d", auth->au_flavor);
700
701 /*
702 * Display superblock I/O counters
703 */
704 for_each_possible_cpu(cpu) {
705 struct nfs_iostats *stats;
706
707 preempt_disable();
708 stats = per_cpu_ptr(nfss->io_stats, cpu);
709
710 for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
711 totals.events[i] += stats->events[i];
712 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
713 totals.bytes[i] += stats->bytes[i];
714
715 preempt_enable();
716 }
717
718 seq_printf(m, "\n\tevents:\t");
719 for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
720 seq_printf(m, "%lu ", totals.events[i]);
721 seq_printf(m, "\n\tbytes:\t");
722 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
723 seq_printf(m, "%Lu ", totals.bytes[i]);
724 seq_printf(m, "\n");
725
726 rpc_print_iostats(m, nfss->client);
727
728 return 0;
729}
730
731/** 95/**
732 * nfs_sync_mapping - helper to flush all mmapped dirty data to disk 96 * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
733 */ 97 */
@@ -890,6 +254,14 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
890 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) 254 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
891 && fattr->size <= NFS_LIMIT_READDIRPLUS) 255 && fattr->size <= NFS_LIMIT_READDIRPLUS)
892 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode)); 256 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
257 /* Deal with crossing mountpoints */
258 if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
259 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
260 inode->i_op = &nfs_referral_inode_operations;
261 else
262 inode->i_op = &nfs_mountpoint_inode_operations;
263 inode->i_fop = NULL;
264 }
893 } else if (S_ISLNK(inode->i_mode)) 265 } else if (S_ISLNK(inode->i_mode))
894 inode->i_op = &nfs_symlink_inode_operations; 266 inode->i_op = &nfs_symlink_inode_operations;
895 else 267 else
@@ -1208,6 +580,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1208 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", 580 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
1209 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 581 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
1210 582
583 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
1211 lock_kernel(); 584 lock_kernel();
1212 if (!inode || is_bad_inode(inode)) 585 if (!inode || is_bad_inode(inode))
1213 goto out_nowait; 586 goto out_nowait;
@@ -1221,7 +594,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1221 status = -ESTALE; 594 status = -ESTALE;
1222 /* Do we trust the cached ESTALE? */ 595 /* Do we trust the cached ESTALE? */
1223 if (NFS_ATTRTIMEO(inode) != 0) { 596 if (NFS_ATTRTIMEO(inode) != 0) {
1224 if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) { 597 if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) {
1225 /* no */ 598 /* no */
1226 } else 599 } else
1227 goto out; 600 goto out;
@@ -1252,8 +625,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1252 } 625 }
1253 spin_unlock(&inode->i_lock); 626 spin_unlock(&inode->i_lock);
1254 627
1255 nfs_revalidate_mapping(inode, inode->i_mapping);
1256
1257 if (nfsi->cache_validity & NFS_INO_INVALID_ACL) 628 if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
1258 nfs_zap_acl_cache(inode); 629 nfs_zap_acl_cache(inode);
1259 630
@@ -1287,8 +658,7 @@ int nfs_attribute_timeout(struct inode *inode)
1287 */ 658 */
1288int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 659int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1289{ 660{
1290 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 661 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
1291 if (!(NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
1292 && !nfs_attribute_timeout(inode)) 662 && !nfs_attribute_timeout(inode))
1293 return NFS_STALE(inode) ? -ESTALE : 0; 663 return NFS_STALE(inode) ? -ESTALE : 0;
1294 return __nfs_revalidate_inode(server, inode); 664 return __nfs_revalidate_inode(server, inode);
@@ -1299,9 +669,16 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1299 * @inode - pointer to host inode 669 * @inode - pointer to host inode
1300 * @mapping - pointer to mapping 670 * @mapping - pointer to mapping
1301 */ 671 */
1302void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) 672int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
1303{ 673{
1304 struct nfs_inode *nfsi = NFS_I(inode); 674 struct nfs_inode *nfsi = NFS_I(inode);
675 int ret = 0;
676
677 if (NFS_STALE(inode))
678 ret = -ESTALE;
679 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
680 || nfs_attribute_timeout(inode))
681 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
1305 682
1306 if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { 683 if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
1307 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 684 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
@@ -1322,6 +699,7 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
1322 inode->i_sb->s_id, 699 inode->i_sb->s_id,
1323 (long long)NFS_FILEID(inode)); 700 (long long)NFS_FILEID(inode));
1324 } 701 }
702 return ret;
1325} 703}
1326 704
1327/** 705/**
@@ -1361,12 +739,6 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1361{ 739{
1362 struct nfs_inode *nfsi = NFS_I(inode); 740 struct nfs_inode *nfsi = NFS_I(inode);
1363 741
1364 if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0
1365 && nfsi->change_attr == fattr->pre_change_attr) {
1366 nfsi->change_attr = fattr->change_attr;
1367 nfsi->cache_change_attribute = jiffies;
1368 }
1369
1370 /* If we have atomic WCC data, we may update some attributes */ 742 /* If we have atomic WCC data, we may update some attributes */
1371 if ((fattr->valid & NFS_ATTR_WCC) != 0) { 743 if ((fattr->valid & NFS_ATTR_WCC) != 0) {
1372 if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { 744 if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
@@ -1400,9 +772,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
1400 int data_unstable; 772 int data_unstable;
1401 773
1402 774
1403 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
1404 return 0;
1405
1406 /* Has the inode gone and changed behind our back? */ 775 /* Has the inode gone and changed behind our back? */
1407 if (nfsi->fileid != fattr->fileid 776 if (nfsi->fileid != fattr->fileid
1408 || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { 777 || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
@@ -1415,20 +784,13 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
1415 /* Do atomic weak cache consistency updates */ 784 /* Do atomic weak cache consistency updates */
1416 nfs_wcc_update_inode(inode, fattr); 785 nfs_wcc_update_inode(inode, fattr);
1417 786
1418 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0) { 787 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
1419 if (nfsi->change_attr == fattr->change_attr) 788 nfsi->change_attr != fattr->change_attr)
1420 goto out; 789 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
1421 nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
1422 if (!data_unstable)
1423 nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
1424 }
1425 790
1426 /* Verify a few of the more important attributes */ 791 /* Verify a few of the more important attributes */
1427 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { 792 if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
1428 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 793 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
1429 if (!data_unstable)
1430 nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
1431 }
1432 794
1433 cur_size = i_size_read(inode); 795 cur_size = i_size_read(inode);
1434 new_isize = nfs_size_to_loff_t(fattr->size); 796 new_isize = nfs_size_to_loff_t(fattr->size);
@@ -1445,7 +807,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
1445 if (inode->i_nlink != fattr->nlink) 807 if (inode->i_nlink != fattr->nlink)
1446 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 808 nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
1447 809
1448out:
1449 if (!timespec_equal(&inode->i_atime, &fattr->atime)) 810 if (!timespec_equal(&inode->i_atime, &fattr->atime))
1450 nfsi->cache_validity |= NFS_INO_INVALID_ATIME; 811 nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
1451 812
@@ -1471,7 +832,6 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
1471 if ((fattr->valid & NFS_ATTR_FATTR) == 0) 832 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
1472 return 0; 833 return 0;
1473 spin_lock(&inode->i_lock); 834 spin_lock(&inode->i_lock);
1474 nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
1475 if (time_after(fattr->time_start, nfsi->last_updated)) 835 if (time_after(fattr->time_start, nfsi->last_updated))
1476 status = nfs_update_inode(inode, fattr); 836 status = nfs_update_inode(inode, fattr);
1477 else 837 else
@@ -1496,7 +856,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1496 856
1497 spin_lock(&inode->i_lock); 857 spin_lock(&inode->i_lock);
1498 if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) { 858 if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) {
1499 nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; 859 nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
1500 goto out; 860 goto out;
1501 } 861 }
1502 status = nfs_update_inode(inode, fattr); 862 status = nfs_update_inode(inode, fattr);
@@ -1519,6 +879,7 @@ out:
1519 */ 879 */
1520static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) 880static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1521{ 881{
882 struct nfs_server *server;
1522 struct nfs_inode *nfsi = NFS_I(inode); 883 struct nfs_inode *nfsi = NFS_I(inode);
1523 loff_t cur_isize, new_isize; 884 loff_t cur_isize, new_isize;
1524 unsigned int invalid = 0; 885 unsigned int invalid = 0;
@@ -1528,9 +889,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1528 __FUNCTION__, inode->i_sb->s_id, inode->i_ino, 889 __FUNCTION__, inode->i_sb->s_id, inode->i_ino,
1529 atomic_read(&inode->i_count), fattr->valid); 890 atomic_read(&inode->i_count), fattr->valid);
1530 891
1531 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
1532 return 0;
1533
1534 if (nfsi->fileid != fattr->fileid) 892 if (nfsi->fileid != fattr->fileid)
1535 goto out_fileid; 893 goto out_fileid;
1536 894
@@ -1540,6 +898,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1540 if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) 898 if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
1541 goto out_changed; 899 goto out_changed;
1542 900
901 server = NFS_SERVER(inode);
902 /* Update the fsid if and only if this is the root directory */
903 if (inode == inode->i_sb->s_root->d_inode
904 && !nfs_fsid_equal(&server->fsid, &fattr->fsid))
905 server->fsid = fattr->fsid;
906
1543 /* 907 /*
1544 * Update the read time so we don't revalidate too often. 908 * Update the read time so we don't revalidate too often.
1545 */ 909 */
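nfs_fsid_equal() and the new fattr->fsid field used in the hunk above are not defined in this diff; based on the major/minor usage in the nfs2xdr.c and nfs3xdr.c changes below, the unified fsid type is presumably along these lines (a sketch, not the authoritative header):

struct nfs_fsid {
	uint64_t	major;
	uint64_t	minor;
};

static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid *b)
{
	return a->major == b->major && a->minor == b->minor;
}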
@@ -1549,7 +913,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1549 /* Are we racing with known updates of the metadata on the server? */ 913 /* Are we racing with known updates of the metadata on the server? */
1550 data_stable = nfs_verify_change_attribute(inode, fattr->time_start); 914 data_stable = nfs_verify_change_attribute(inode, fattr->time_start);
1551 if (data_stable) 915 if (data_stable)
1552 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); 916 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATIME);
1553 917
1554 /* Do atomic weak cache consistency updates */ 918 /* Do atomic weak cache consistency updates */
1555 nfs_wcc_update_inode(inode, fattr); 919 nfs_wcc_update_inode(inode, fattr);
@@ -1613,15 +977,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1613 inode->i_blksize = fattr->du.nfs2.blocksize; 977 inode->i_blksize = fattr->du.nfs2.blocksize;
1614 } 978 }
1615 979
1616 if ((fattr->valid & NFS_ATTR_FATTR_V4)) { 980 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
1617 if (nfsi->change_attr != fattr->change_attr) { 981 nfsi->change_attr != fattr->change_attr) {
1618 dprintk("NFS: change_attr change on server for file %s/%ld\n", 982 dprintk("NFS: change_attr change on server for file %s/%ld\n",
1619 inode->i_sb->s_id, inode->i_ino); 983 inode->i_sb->s_id, inode->i_ino);
1620 nfsi->change_attr = fattr->change_attr; 984 nfsi->change_attr = fattr->change_attr;
1621 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 985 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1622 nfsi->cache_change_attribute = jiffies; 986 nfsi->cache_change_attribute = jiffies;
1623 } else
1624 invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA);
1625 } 987 }
1626 988
1627 /* Update attrtimeo value if we're out of the unstable period */ 989 /* Update attrtimeo value if we're out of the unstable period */
@@ -1669,202 +1031,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1669 goto out_err; 1031 goto out_err;
1670} 1032}
1671 1033
1672/*
1673 * File system information
1674 */
1675
1676static int nfs_set_super(struct super_block *s, void *data)
1677{
1678 s->s_fs_info = data;
1679 return set_anon_super(s, data);
1680}
1681
1682static int nfs_compare_super(struct super_block *sb, void *data)
1683{
1684 struct nfs_server *server = data;
1685 struct nfs_server *old = NFS_SB(sb);
1686
1687 if (old->addr.sin_addr.s_addr != server->addr.sin_addr.s_addr)
1688 return 0;
1689 if (old->addr.sin_port != server->addr.sin_port)
1690 return 0;
1691 return !nfs_compare_fh(&old->fh, &server->fh);
1692}
1693
1694static int nfs_get_sb(struct file_system_type *fs_type,
1695 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
1696{
1697 int error;
1698 struct nfs_server *server = NULL;
1699 struct super_block *s;
1700 struct nfs_fh *root;
1701 struct nfs_mount_data *data = raw_data;
1702
1703 error = -EINVAL;
1704 if (data == NULL) {
1705 dprintk("%s: missing data argument\n", __FUNCTION__);
1706 goto out_err_noserver;
1707 }
1708 if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
1709 dprintk("%s: bad mount version\n", __FUNCTION__);
1710 goto out_err_noserver;
1711 }
1712 switch (data->version) {
1713 case 1:
1714 data->namlen = 0;
1715 case 2:
1716 data->bsize = 0;
1717 case 3:
1718 if (data->flags & NFS_MOUNT_VER3) {
1719 dprintk("%s: mount structure version %d does not support NFSv3\n",
1720 __FUNCTION__,
1721 data->version);
1722 goto out_err_noserver;
1723 }
1724 data->root.size = NFS2_FHSIZE;
1725 memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
1726 case 4:
1727 if (data->flags & NFS_MOUNT_SECFLAVOUR) {
1728 dprintk("%s: mount structure version %d does not support strong security\n",
1729 __FUNCTION__,
1730 data->version);
1731 goto out_err_noserver;
1732 }
1733 case 5:
1734 memset(data->context, 0, sizeof(data->context));
1735 }
1736#ifndef CONFIG_NFS_V3
1737 /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
1738 error = -EPROTONOSUPPORT;
1739 if (data->flags & NFS_MOUNT_VER3) {
1740 dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
1741 goto out_err_noserver;
1742 }
1743#endif /* CONFIG_NFS_V3 */
1744
1745 error = -ENOMEM;
1746 server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
1747 if (!server)
1748 goto out_err_noserver;
1749 /* Zero out the NFS state stuff */
1750 init_nfsv4_state(server);
1751 server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
1752
1753 root = &server->fh;
1754 if (data->flags & NFS_MOUNT_VER3)
1755 root->size = data->root.size;
1756 else
1757 root->size = NFS2_FHSIZE;
1758 error = -EINVAL;
1759 if (root->size > sizeof(root->data)) {
1760 dprintk("%s: invalid root filehandle\n", __FUNCTION__);
1761 goto out_err;
1762 }
1763 memcpy(root->data, data->root.data, root->size);
1764
1765 /* We now require that the mount process passes the remote address */
1766 memcpy(&server->addr, &data->addr, sizeof(server->addr));
1767 if (server->addr.sin_addr.s_addr == INADDR_ANY) {
1768 dprintk("%s: mount program didn't pass remote address!\n",
1769 __FUNCTION__);
1770 goto out_err;
1771 }
1772
1773 /* Fire up rpciod if not yet running */
1774 error = rpciod_up();
1775 if (error < 0) {
1776 dprintk("%s: couldn't start rpciod! Error = %d\n",
1777 __FUNCTION__, error);
1778 goto out_err;
1779 }
1780
1781 s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
1782 if (IS_ERR(s)) {
1783 error = PTR_ERR(s);
1784 goto out_err_rpciod;
1785 }
1786
1787 if (s->s_root)
1788 goto out_rpciod_down;
1789
1790 s->s_flags = flags;
1791
1792 error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1793 if (error) {
1794 up_write(&s->s_umount);
1795 deactivate_super(s);
1796 return error;
1797 }
1798 s->s_flags |= MS_ACTIVE;
1799 return simple_set_mnt(mnt, s);
1800
1801out_rpciod_down:
1802 rpciod_down();
1803 kfree(server);
1804 return simple_set_mnt(mnt, s);
1805
1806out_err_rpciod:
1807 rpciod_down();
1808out_err:
1809 kfree(server);
1810out_err_noserver:
1811 return error;
1812}
1813
1814static void nfs_kill_super(struct super_block *s)
1815{
1816 struct nfs_server *server = NFS_SB(s);
1817
1818 kill_anon_super(s);
1819
1820 if (!IS_ERR(server->client))
1821 rpc_shutdown_client(server->client);
1822 if (!IS_ERR(server->client_sys))
1823 rpc_shutdown_client(server->client_sys);
1824 if (!IS_ERR(server->client_acl))
1825 rpc_shutdown_client(server->client_acl);
1826
1827 if (!(server->flags & NFS_MOUNT_NONLM))
1828 lockd_down(); /* release rpc.lockd */
1829
1830 rpciod_down(); /* release rpciod */
1831
1832 nfs_free_iostats(server->io_stats);
1833 kfree(server->hostname);
1834 kfree(server);
1835}
1836
1837static struct file_system_type nfs_fs_type = {
1838 .owner = THIS_MODULE,
1839 .name = "nfs",
1840 .get_sb = nfs_get_sb,
1841 .kill_sb = nfs_kill_super,
1842 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
1843};
1844 1034
1845#ifdef CONFIG_NFS_V4 1035#ifdef CONFIG_NFS_V4
1846 1036
1847static void nfs4_clear_inode(struct inode *);
1848
1849
1850static struct super_operations nfs4_sops = {
1851 .alloc_inode = nfs_alloc_inode,
1852 .destroy_inode = nfs_destroy_inode,
1853 .write_inode = nfs_write_inode,
1854 .delete_inode = nfs_delete_inode,
1855 .statfs = nfs_statfs,
1856 .clear_inode = nfs4_clear_inode,
1857 .umount_begin = nfs_umount_begin,
1858 .show_options = nfs_show_options,
1859 .show_stats = nfs_show_stats,
1860};
1861
1862/* 1037/*
1863 * Clean out any remaining NFSv4 state that might be left over due 1038 * Clean out any remaining NFSv4 state that might be left over due
1864 * to open() calls that passed nfs_atomic_lookup, but failed to call 1039 * to open() calls that passed nfs_atomic_lookup, but failed to call
1865 * nfs_open(). 1040 * nfs_open().
1866 */ 1041 */
1867static void nfs4_clear_inode(struct inode *inode) 1042void nfs4_clear_inode(struct inode *inode)
1868{ 1043{
1869 struct nfs_inode *nfsi = NFS_I(inode); 1044 struct nfs_inode *nfsi = NFS_I(inode);
1870 1045
@@ -1888,365 +1063,9 @@ static void nfs4_clear_inode(struct inode *inode)
1888 nfs4_close_state(state, state->state); 1063 nfs4_close_state(state, state->state);
1889 } 1064 }
1890} 1065}
1891
1892
1893static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
1894{
1895 struct nfs_server *server;
1896 struct nfs4_client *clp = NULL;
1897 struct rpc_xprt *xprt = NULL;
1898 struct rpc_clnt *clnt = NULL;
1899 struct rpc_timeout timeparms;
1900 rpc_authflavor_t authflavour;
1901 int err = -EIO;
1902
1903 sb->s_blocksize_bits = 0;
1904 sb->s_blocksize = 0;
1905 server = NFS_SB(sb);
1906 if (data->rsize != 0)
1907 server->rsize = nfs_block_size(data->rsize, NULL);
1908 if (data->wsize != 0)
1909 server->wsize = nfs_block_size(data->wsize, NULL);
1910 server->flags = data->flags & NFS_MOUNT_FLAGMASK;
1911 server->caps = NFS_CAP_ATOMIC_OPEN;
1912
1913 server->acregmin = data->acregmin*HZ;
1914 server->acregmax = data->acregmax*HZ;
1915 server->acdirmin = data->acdirmin*HZ;
1916 server->acdirmax = data->acdirmax*HZ;
1917
1918 server->rpc_ops = &nfs_v4_clientops;
1919
1920 nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
1921
1922 server->retrans_timeo = timeparms.to_initval;
1923 server->retrans_count = timeparms.to_retries;
1924
1925 clp = nfs4_get_client(&server->addr.sin_addr);
1926 if (!clp) {
1927 dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
1928 return -EIO;
1929 }
1930
1931 /* Now create transport and client */
1932 authflavour = RPC_AUTH_UNIX;
1933 if (data->auth_flavourlen != 0) {
1934 if (data->auth_flavourlen != 1) {
1935 dprintk("%s: Invalid number of RPC auth flavours %d.\n",
1936 __FUNCTION__, data->auth_flavourlen);
1937 err = -EINVAL;
1938 goto out_fail;
1939 }
1940 if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) {
1941 err = -EFAULT;
1942 goto out_fail;
1943 }
1944 }
1945
1946 down_write(&clp->cl_sem);
1947 if (IS_ERR(clp->cl_rpcclient)) {
1948 xprt = xprt_create_proto(data->proto, &server->addr, &timeparms);
1949 if (IS_ERR(xprt)) {
1950 up_write(&clp->cl_sem);
1951 err = PTR_ERR(xprt);
1952 dprintk("%s: cannot create RPC transport. Error = %d\n",
1953 __FUNCTION__, err);
1954 goto out_fail;
1955 }
1956 clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
1957 server->rpc_ops->version, authflavour);
1958 if (IS_ERR(clnt)) {
1959 up_write(&clp->cl_sem);
1960 err = PTR_ERR(clnt);
1961 dprintk("%s: cannot create RPC client. Error = %d\n",
1962 __FUNCTION__, err);
1963 goto out_fail;
1964 }
1965 clnt->cl_intr = 1;
1966 clnt->cl_softrtry = 1;
1967 clp->cl_rpcclient = clnt;
1968 memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
1969 nfs_idmap_new(clp);
1970 }
1971 list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
1972 clnt = rpc_clone_client(clp->cl_rpcclient);
1973 if (!IS_ERR(clnt))
1974 server->nfs4_state = clp;
1975 up_write(&clp->cl_sem);
1976 clp = NULL;
1977
1978 if (IS_ERR(clnt)) {
1979 err = PTR_ERR(clnt);
1980 dprintk("%s: cannot create RPC client. Error = %d\n",
1981 __FUNCTION__, err);
1982 return err;
1983 }
1984
1985 server->client = clnt;
1986
1987 if (server->nfs4_state->cl_idmap == NULL) {
1988 dprintk("%s: failed to create idmapper.\n", __FUNCTION__);
1989 return -ENOMEM;
1990 }
1991
1992 if (clnt->cl_auth->au_flavor != authflavour) {
1993 struct rpc_auth *auth;
1994
1995 auth = rpcauth_create(authflavour, clnt);
1996 if (IS_ERR(auth)) {
1997 dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
1998 return PTR_ERR(auth);
1999 }
2000 }
2001
2002 sb->s_time_gran = 1;
2003
2004 sb->s_op = &nfs4_sops;
2005 err = nfs_sb_init(sb, authflavour);
2006 if (err == 0)
2007 return 0;
2008out_fail:
2009 if (clp)
2010 nfs4_put_client(clp);
2011 return err;
2012}
2013
2014static int nfs4_compare_super(struct super_block *sb, void *data)
2015{
2016 struct nfs_server *server = data;
2017 struct nfs_server *old = NFS_SB(sb);
2018
2019 if (strcmp(server->hostname, old->hostname) != 0)
2020 return 0;
2021 if (strcmp(server->mnt_path, old->mnt_path) != 0)
2022 return 0;
2023 return 1;
2024}
2025
2026static void *
2027nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
2028{
2029 void *p = NULL;
2030
2031 if (!src->len)
2032 return ERR_PTR(-EINVAL);
2033 if (src->len < maxlen)
2034 maxlen = src->len;
2035 if (dst == NULL) {
2036 p = dst = kmalloc(maxlen + 1, GFP_KERNEL);
2037 if (p == NULL)
2038 return ERR_PTR(-ENOMEM);
2039 }
2040 if (copy_from_user(dst, src->data, maxlen)) {
2041 kfree(p);
2042 return ERR_PTR(-EFAULT);
2043 }
2044 dst[maxlen] = '\0';
2045 return dst;
2046}
2047
2048static int nfs4_get_sb(struct file_system_type *fs_type,
2049 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
2050{
2051 int error;
2052 struct nfs_server *server;
2053 struct super_block *s;
2054 struct nfs4_mount_data *data = raw_data;
2055 void *p;
2056
2057 if (data == NULL) {
2058 dprintk("%s: missing data argument\n", __FUNCTION__);
2059 return -EINVAL;
2060 }
2061 if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) {
2062 dprintk("%s: bad mount version\n", __FUNCTION__);
2063 return -EINVAL;
2064 }
2065
2066 server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
2067 if (!server)
2068 return -ENOMEM;
2069 /* Zero out the NFS state stuff */
2070 init_nfsv4_state(server);
2071 server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
2072
2073 p = nfs_copy_user_string(NULL, &data->hostname, 256);
2074 if (IS_ERR(p))
2075 goto out_err;
2076 server->hostname = p;
2077
2078 p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
2079 if (IS_ERR(p))
2080 goto out_err;
2081 server->mnt_path = p;
2082
2083 p = nfs_copy_user_string(server->ip_addr, &data->client_addr,
2084 sizeof(server->ip_addr) - 1);
2085 if (IS_ERR(p))
2086 goto out_err;
2087
2088 /* We now require that the mount process passes the remote address */
2089 if (data->host_addrlen != sizeof(server->addr)) {
2090 error = -EINVAL;
2091 goto out_free;
2092 }
2093 if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
2094 error = -EFAULT;
2095 goto out_free;
2096 }
2097 if (server->addr.sin_family != AF_INET ||
2098 server->addr.sin_addr.s_addr == INADDR_ANY) {
2099 dprintk("%s: mount program didn't pass remote IP address!\n",
2100 __FUNCTION__);
2101 error = -EINVAL;
2102 goto out_free;
2103 }
2104
2105 /* Fire up rpciod if not yet running */
2106 error = rpciod_up();
2107 if (error < 0) {
2108 dprintk("%s: couldn't start rpciod! Error = %d\n",
2109 __FUNCTION__, error);
2110 goto out_free;
2111 }
2112
2113 s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
2114 if (IS_ERR(s)) {
2115 error = PTR_ERR(s);
2116 goto out_free;
2117 }
2118
2119 if (s->s_root) {
2120 kfree(server->mnt_path);
2121 kfree(server->hostname);
2122 kfree(server);
2123 return simple_set_mnt(mnt, s);
2124 }
2125
2126 s->s_flags = flags;
2127
2128 error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
2129 if (error) {
2130 up_write(&s->s_umount);
2131 deactivate_super(s);
2132 return error;
2133 }
2134 s->s_flags |= MS_ACTIVE;
2135 return simple_set_mnt(mnt, s);
2136out_err:
2137 error = PTR_ERR(p);
2138out_free:
2139 kfree(server->mnt_path);
2140 kfree(server->hostname);
2141 kfree(server);
2142 return error;
2143}
2144
2145static void nfs4_kill_super(struct super_block *sb)
2146{
2147 struct nfs_server *server = NFS_SB(sb);
2148
2149 nfs_return_all_delegations(sb);
2150 kill_anon_super(sb);
2151
2152 nfs4_renewd_prepare_shutdown(server);
2153
2154 if (server->client != NULL && !IS_ERR(server->client))
2155 rpc_shutdown_client(server->client);
2156
2157 destroy_nfsv4_state(server);
2158
2159 rpciod_down();
2160
2161 nfs_free_iostats(server->io_stats);
2162 kfree(server->hostname);
2163 kfree(server);
2164}
2165
2166static struct file_system_type nfs4_fs_type = {
2167 .owner = THIS_MODULE,
2168 .name = "nfs4",
2169 .get_sb = nfs4_get_sb,
2170 .kill_sb = nfs4_kill_super,
2171 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
2172};
2173
2174static const int nfs_set_port_min = 0;
2175static const int nfs_set_port_max = 65535;
2176static int param_set_port(const char *val, struct kernel_param *kp)
2177{
2178 char *endp;
2179 int num = simple_strtol(val, &endp, 0);
2180 if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
2181 return -EINVAL;
2182 *((int *)kp->arg) = num;
2183 return 0;
2184}
2185
2186module_param_call(callback_tcpport, param_set_port, param_get_int,
2187 &nfs_callback_set_tcpport, 0644);
2188
2189static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
2190{
2191 char *endp;
2192 int num = simple_strtol(val, &endp, 0);
2193 int jif = num * HZ;
2194 if (endp == val || *endp || num < 0 || jif < num)
2195 return -EINVAL;
2196 *((int *)kp->arg) = jif;
2197 return 0;
2198}
2199
2200module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
2201 &nfs_idmap_cache_timeout, 0644);
2202
2203#define nfs4_init_once(nfsi) \
2204 do { \
2205 INIT_LIST_HEAD(&(nfsi)->open_states); \
2206 nfsi->delegation = NULL; \
2207 nfsi->delegation_state = 0; \
2208 init_rwsem(&nfsi->rwsem); \
2209 } while(0)
2210
2211static inline int register_nfs4fs(void)
2212{
2213 int ret;
2214
2215 ret = nfs_register_sysctl();
2216 if (ret != 0)
2217 return ret;
2218 ret = register_filesystem(&nfs4_fs_type);
2219 if (ret != 0)
2220 nfs_unregister_sysctl();
2221 return ret;
2222}
2223
2224static inline void unregister_nfs4fs(void)
2225{
2226 unregister_filesystem(&nfs4_fs_type);
2227 nfs_unregister_sysctl();
2228}
2229#else
2230#define nfs4_init_once(nfsi) \
2231 do { } while (0)
2232#define register_nfs4fs() (0)
2233#define unregister_nfs4fs()
2234#endif 1066#endif
2235 1067
2236extern int nfs_init_nfspagecache(void); 1068struct inode *nfs_alloc_inode(struct super_block *sb)
2237extern void nfs_destroy_nfspagecache(void);
2238extern int nfs_init_readpagecache(void);
2239extern void nfs_destroy_readpagecache(void);
2240extern int nfs_init_writepagecache(void);
2241extern void nfs_destroy_writepagecache(void);
2242#ifdef CONFIG_NFS_DIRECTIO
2243extern int nfs_init_directcache(void);
2244extern void nfs_destroy_directcache(void);
2245#endif
2246
2247static kmem_cache_t * nfs_inode_cachep;
2248
2249static struct inode *nfs_alloc_inode(struct super_block *sb)
2250{ 1069{
2251 struct nfs_inode *nfsi; 1070 struct nfs_inode *nfsi;
2252 nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL); 1071 nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL);
@@ -2265,11 +1084,21 @@ static struct inode *nfs_alloc_inode(struct super_block *sb)
2265 return &nfsi->vfs_inode; 1084 return &nfsi->vfs_inode;
2266} 1085}
2267 1086
2268static void nfs_destroy_inode(struct inode *inode) 1087void nfs_destroy_inode(struct inode *inode)
2269{ 1088{
2270 kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); 1089 kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
2271} 1090}
2272 1091
1092static inline void nfs4_init_once(struct nfs_inode *nfsi)
1093{
1094#ifdef CONFIG_NFS_V4
1095 INIT_LIST_HEAD(&nfsi->open_states);
1096 nfsi->delegation = NULL;
1097 nfsi->delegation_state = 0;
1098 init_rwsem(&nfsi->rwsem);
1099#endif
1100}
1101
2273static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) 1102static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
2274{ 1103{
2275 struct nfs_inode *nfsi = (struct nfs_inode *) foo; 1104 struct nfs_inode *nfsi = (struct nfs_inode *) foo;
@@ -2290,7 +1119,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
2290 } 1119 }
2291} 1120}
2292 1121
2293static int nfs_init_inodecache(void) 1122static int __init nfs_init_inodecache(void)
2294{ 1123{
2295 nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", 1124 nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
2296 sizeof(struct nfs_inode), 1125 sizeof(struct nfs_inode),
@@ -2332,29 +1161,22 @@ static int __init init_nfs_fs(void)
2332 if (err) 1161 if (err)
2333 goto out1; 1162 goto out1;
2334 1163
2335#ifdef CONFIG_NFS_DIRECTIO
2336 err = nfs_init_directcache(); 1164 err = nfs_init_directcache();
2337 if (err) 1165 if (err)
2338 goto out0; 1166 goto out0;
2339#endif
2340 1167
2341#ifdef CONFIG_PROC_FS 1168#ifdef CONFIG_PROC_FS
2342 rpc_proc_register(&nfs_rpcstat); 1169 rpc_proc_register(&nfs_rpcstat);
2343#endif 1170#endif
2344 err = register_filesystem(&nfs_fs_type); 1171 if ((err = register_nfs_fs()) != 0)
2345 if (err)
2346 goto out;
2347 if ((err = register_nfs4fs()) != 0)
2348 goto out; 1172 goto out;
2349 return 0; 1173 return 0;
2350out: 1174out:
2351#ifdef CONFIG_PROC_FS 1175#ifdef CONFIG_PROC_FS
2352 rpc_proc_unregister("nfs"); 1176 rpc_proc_unregister("nfs");
2353#endif 1177#endif
2354#ifdef CONFIG_NFS_DIRECTIO
2355 nfs_destroy_directcache(); 1178 nfs_destroy_directcache();
2356out0: 1179out0:
2357#endif
2358 nfs_destroy_writepagecache(); 1180 nfs_destroy_writepagecache();
2359out1: 1181out1:
2360 nfs_destroy_readpagecache(); 1182 nfs_destroy_readpagecache();
@@ -2368,9 +1190,7 @@ out4:
2368 1190
2369static void __exit exit_nfs_fs(void) 1191static void __exit exit_nfs_fs(void)
2370{ 1192{
2371#ifdef CONFIG_NFS_DIRECTIO
2372 nfs_destroy_directcache(); 1193 nfs_destroy_directcache();
2373#endif
2374 nfs_destroy_writepagecache(); 1194 nfs_destroy_writepagecache();
2375 nfs_destroy_readpagecache(); 1195 nfs_destroy_readpagecache();
2376 nfs_destroy_inodecache(); 1196 nfs_destroy_inodecache();
@@ -2378,8 +1198,7 @@ static void __exit exit_nfs_fs(void)
2378#ifdef CONFIG_PROC_FS 1198#ifdef CONFIG_PROC_FS
2379 rpc_proc_unregister("nfs"); 1199 rpc_proc_unregister("nfs");
2380#endif 1200#endif
2381 unregister_filesystem(&nfs_fs_type); 1201 unregister_nfs_fs();
2382 unregister_nfs4fs();
2383} 1202}
2384 1203
2385/* Not quite true; I just maintain it */ 1204/* Not quite true; I just maintain it */
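The filesystem registration removed from init_nfs_fs()/exit_nfs_fs() above is replaced by register_nfs_fs()/unregister_nfs_fs(), which this patch only declares (see internal.h below); their bodies live in the new fs/nfs/super.c, which is not shown here. Assuming they simply fold in the old register_nfs4fs() logic, the registration helper plausibly looks like this sketch:

int __init register_nfs_fs(void)
{
	int ret;

	ret = register_filesystem(&nfs_fs_type);
	if (ret < 0)
		goto error_0;
#ifdef CONFIG_NFS_V4
	ret = nfs_register_sysctl();
	if (ret < 0)
		goto error_1;
	ret = register_filesystem(&nfs4_fs_type);
	if (ret < 0)
		goto error_2;
#endif
	return 0;

#ifdef CONFIG_NFS_V4
error_2:
	nfs_unregister_sysctl();
error_1:
	unregister_filesystem(&nfs_fs_type);
#endif
error_0:
	return ret;
}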
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
new file mode 100644
index 000000000000..4fe51c1292bb
--- /dev/null
+++ b/fs/nfs/internal.h
@@ -0,0 +1,186 @@
1/*
2 * NFS internal definitions
3 */
4
5#include <linux/mount.h>
6
7struct nfs_clone_mount {
8 const struct super_block *sb;
9 const struct dentry *dentry;
10 struct nfs_fh *fh;
11 struct nfs_fattr *fattr;
12 char *hostname;
13 char *mnt_path;
14 struct sockaddr_in *addr;
15 rpc_authflavor_t authflavor;
16};
17
18/* namespace-nfs4.c */
19#ifdef CONFIG_NFS_V4
20extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry);
21#else
22static inline
23struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
24{
25 return ERR_PTR(-ENOENT);
26}
27#endif
28
29/* callback_xdr.c */
30extern struct svc_version nfs4_callback_version1;
31
32/* pagelist.c */
33extern int __init nfs_init_nfspagecache(void);
34extern void nfs_destroy_nfspagecache(void);
35extern int __init nfs_init_readpagecache(void);
36extern void nfs_destroy_readpagecache(void);
37extern int __init nfs_init_writepagecache(void);
38extern void nfs_destroy_writepagecache(void);
39
40#ifdef CONFIG_NFS_DIRECTIO
41extern int __init nfs_init_directcache(void);
42extern void nfs_destroy_directcache(void);
43#else
44#define nfs_init_directcache() (0)
45#define nfs_destroy_directcache() do {} while(0)
46#endif
47
48/* nfs2xdr.c */
49extern struct rpc_procinfo nfs_procedures[];
50extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
51
52/* nfs3xdr.c */
53extern struct rpc_procinfo nfs3_procedures[];
54extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
55
56/* nfs4xdr.c */
57extern int nfs_stat_to_errno(int);
58extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
59
60/* nfs4proc.c */
61#ifdef CONFIG_NFS_V4
62extern struct rpc_procinfo nfs4_procedures[];
63
64extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
65 struct nfs4_fs_locations *fs_locations,
66 struct page *page);
67#endif
68
69/* inode.c */
70extern struct inode *nfs_alloc_inode(struct super_block *sb);
71extern void nfs_destroy_inode(struct inode *);
72extern int nfs_write_inode(struct inode *,int);
73extern void nfs_clear_inode(struct inode *);
74#ifdef CONFIG_NFS_V4
75extern void nfs4_clear_inode(struct inode *);
76#endif
77
78/* super.c */
79extern struct file_system_type nfs_referral_nfs4_fs_type;
80extern struct file_system_type clone_nfs_fs_type;
81#ifdef CONFIG_NFS_V4
82extern struct file_system_type clone_nfs4_fs_type;
83#endif
84#ifdef CONFIG_PROC_FS
85extern struct rpc_stat nfs_rpcstat;
86#endif
87extern int __init register_nfs_fs(void);
88extern void __exit unregister_nfs_fs(void);
89
90/* namespace.c */
91extern char *nfs_path(const char *base, const struct dentry *dentry,
92 char *buffer, ssize_t buflen);
93
94/*
95 * Determine the mount path as a string
96 */
97static inline char *
98nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
99{
100#ifdef CONFIG_NFS_V4
101 return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen);
102#else
103 return NULL;
104#endif
105}
106
107/*
108 * Determine the device name as a string
109 */
110static inline char *nfs_devname(const struct vfsmount *mnt_parent,
111 const struct dentry *dentry,
112 char *buffer, ssize_t buflen)
113{
114 return nfs_path(mnt_parent->mnt_devname, dentry, buffer, buflen);
115}
116
117/*
118 * Determine the actual block size (and log2 thereof)
119 */
120static inline
121unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
122{
123 /* make sure blocksize is a power of two */
124 if ((bsize & (bsize - 1)) || nrbitsp) {
125 unsigned char nrbits;
126
127 for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--)
128 ;
129 bsize = 1 << nrbits;
130 if (nrbitsp)
131 *nrbitsp = nrbits;
132 }
133
134 return bsize;
135}
136
137/*
138 * Calculate the number of 512-byte blocks used.
139 */
140static inline unsigned long nfs_calc_block_size(u64 tsize)
141{
142 loff_t used = (tsize + 511) >> 9;
143 return (used > ULONG_MAX) ? ULONG_MAX : used;
144}
145
146/*
147 * Compute the NFS server block size to use for I/O requests
148 */
149static inline
150unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
151{
152 if (bsize < NFS_MIN_FILE_IO_SIZE)
153 bsize = NFS_DEF_FILE_IO_SIZE;
154 else if (bsize >= NFS_MAX_FILE_IO_SIZE)
155 bsize = NFS_MAX_FILE_IO_SIZE;
156
157 return nfs_block_bits(bsize, nrbitsp);
158}
159
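nfs_block_bits() above silently rounds any size that is not a power of two down to its highest set bit, and nfs_block_size() clamps the request into the supported range first. A small userspace harness (an illustrative replica, not kernel code) makes the rounding concrete:

#include <assert.h>
#include <stdio.h>

/* userspace replica of nfs_block_bits() above */
static unsigned long block_bits(unsigned long bsize, unsigned char *nrbitsp)
{
	if ((bsize & (bsize - 1)) || nrbitsp) {
		unsigned char nrbits;

		for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--)
			;
		bsize = 1UL << nrbits;
		if (nrbitsp)
			*nrbitsp = nrbits;
	}
	return bsize;
}

int main(void)
{
	unsigned char bits;

	assert(block_bits(4096, &bits) == 4096 && bits == 12);
	/* 6000 is not a power of two: rounded down to 4096 */
	assert(block_bits(6000, &bits) == 4096 && bits == 12);
	printf("ok\n");
	return 0;
}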
160/*
161 * Set the maximum file size for a superblock, clamped to MAX_LFS_FILESIZE
162 */
163static inline
164void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
165{
166 sb->s_maxbytes = (loff_t)maxfilesize;
167 if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
168 sb->s_maxbytes = MAX_LFS_FILESIZE;
169}
170
171/*
172 * Check if the string represents a "valid" IPv4 address
173 */
174static inline int valid_ipaddr4(const char *buf)
175{
176 int rc, count, in[4];
177
178 rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
179 if (rc != 4)
180 return -EINVAL;
181 for (count = 0; count < 4; count++) {
182 if (in[count] > 255)
183 return -EINVAL;
184 }
185 return 0;
186}
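valid_ipaddr4() above is a coarse shape check, not a full dotted-quad parser: it verifies four %d fields with an upper bound of 255, but does not reject negative octets or trailing text. A quick userspace harness (illustration only) shows the behaviour:

#include <errno.h>
#include <stdio.h>

/* userspace replica of valid_ipaddr4() above */
static int valid_ipaddr4(const char *buf)
{
	int rc, count, in[4];

	rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
	if (rc != 4)
		return -EINVAL;
	for (count = 0; count < 4; count++) {
		if (in[count] > 255)
			return -EINVAL;
	}
	return 0;
}

int main(void)
{
	printf("%d\n", valid_ipaddr4("10.0.0.1"));	/* 0: accepted */
	printf("%d\n", valid_ipaddr4("256.0.0.1"));	/* -EINVAL: octet > 255 */
	printf("%d\n", valid_ipaddr4("not-an-ip"));	/* -EINVAL: wrong shape */
	return 0;
}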
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
new file mode 100644
index 000000000000..19b98ca468eb
--- /dev/null
+++ b/fs/nfs/namespace.c
@@ -0,0 +1,229 @@
1/*
2 * linux/fs/nfs/namespace.c
3 *
4 * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
5 *
6 * NFS namespace
7 */
8
9#include <linux/config.h>
10
11#include <linux/dcache.h>
12#include <linux/mount.h>
13#include <linux/namei.h>
14#include <linux/nfs_fs.h>
15#include <linux/string.h>
16#include <linux/sunrpc/clnt.h>
17#include <linux/vfs.h>
18#include "internal.h"
19
20#define NFSDBG_FACILITY NFSDBG_VFS
21
22static void nfs_expire_automounts(void *list);
23
24LIST_HEAD(nfs_automount_list);
25static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list);
26int nfs_mountpoint_expiry_timeout = 500 * HZ;
27
28/*
29 * nfs_path - reconstruct the path given an arbitrary dentry
30 * @base - arbitrary string to prepend to the path
31 * @dentry - pointer to dentry
32 * @buffer - result buffer
33 * @buflen - length of buffer
34 *
35 * Helper function for constructing the path from the
36 * root dentry to an arbitrary hashed dentry.
37 *
38 * This is mainly for use in figuring out the path on the
39 * server side when automounting on top of an existing partition.
40 */
41char *nfs_path(const char *base, const struct dentry *dentry,
42 char *buffer, ssize_t buflen)
43{
44 char *end = buffer+buflen;
45 int namelen;
46
47 *--end = '\0';
48 buflen--;
49 spin_lock(&dcache_lock);
50 while (!IS_ROOT(dentry)) {
51 namelen = dentry->d_name.len;
52 buflen -= namelen + 1;
53 if (buflen < 0)
54 goto Elong;
55 end -= namelen;
56 memcpy(end, dentry->d_name.name, namelen);
57 *--end = '/';
58 dentry = dentry->d_parent;
59 }
60 spin_unlock(&dcache_lock);
61 namelen = strlen(base);
62 /* Strip off excess slashes in base string */
63 while (namelen > 0 && base[namelen - 1] == '/')
64 namelen--;
65 buflen -= namelen;
66 if (buflen < 0)
67 goto Elong;
68 end -= namelen;
69 memcpy(end, base, namelen);
70 return end;
71Elong:
72 return ERR_PTR(-ENAMETOOLONG);
73}
74
75/*
76 * nfs_follow_mountpoint - handle crossing a mountpoint on the server
77 * @dentry - dentry of mountpoint
78 * @nd - nameidata info
79 *
80 * When we encounter a mountpoint on the server, we want to set up
81 * a mountpoint on the client too, to prevent inode numbers from
82 * colliding, and to allow "df" to work properly.
83 * On NFSv4, we also want to allow for the fact that different
84 * filesystems may be migrated to different servers in a failover
85 * situation, and that different filesystems may want to use
86 * different security flavours.
87 */
88static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
89{
90 struct vfsmount *mnt;
91 struct nfs_server *server = NFS_SERVER(dentry->d_inode);
92 struct dentry *parent;
93 struct nfs_fh fh;
94 struct nfs_fattr fattr;
95 int err;
96
97 BUG_ON(IS_ROOT(dentry));
98 dprintk("%s: enter\n", __FUNCTION__);
99 dput(nd->dentry);
100 nd->dentry = dget(dentry);
101 if (d_mountpoint(nd->dentry))
102 goto out_follow;
103 /* Look it up again */
104 parent = dget_parent(nd->dentry);
105 err = server->rpc_ops->lookup(parent->d_inode, &nd->dentry->d_name, &fh, &fattr);
106 dput(parent);
107 if (err != 0)
108 goto out_err;
109
110 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL)
111 mnt = nfs_do_refmount(nd->mnt, nd->dentry);
112 else
113 mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr);
114 err = PTR_ERR(mnt);
115 if (IS_ERR(mnt))
116 goto out_err;
117
118 mntget(mnt);
119 err = do_add_mount(mnt, nd, nd->mnt->mnt_flags|MNT_SHRINKABLE, &nfs_automount_list);
120 if (err < 0) {
121 mntput(mnt);
122 if (err == -EBUSY)
123 goto out_follow;
124 goto out_err;
125 }
126 mntput(nd->mnt);
127 dput(nd->dentry);
128 nd->mnt = mnt;
129 nd->dentry = dget(mnt->mnt_root);
130 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
131out:
132 dprintk("%s: done, returned %d\n", __FUNCTION__, err);
133 return ERR_PTR(err);
134out_err:
135 path_release(nd);
136 goto out;
137out_follow:
138 while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
139 ;
140 err = 0;
141 goto out;
142}
143
144struct inode_operations nfs_mountpoint_inode_operations = {
145 .follow_link = nfs_follow_mountpoint,
146 .getattr = nfs_getattr,
147};
148
149struct inode_operations nfs_referral_inode_operations = {
150 .follow_link = nfs_follow_mountpoint,
151};
152
153static void nfs_expire_automounts(void *data)
154{
155 struct list_head *list = (struct list_head *)data;
156
157 mark_mounts_for_expiry(list);
158 if (!list_empty(list))
159 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
160}
161
162void nfs_release_automount_timer(void)
163{
164 if (list_empty(&nfs_automount_list)) {
165 cancel_delayed_work(&nfs_automount_task);
166 flush_scheduled_work();
167 }
168}
169
170/*
171 * Clone a mountpoint of the appropriate type
172 */
173static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname,
174 struct nfs_clone_mount *mountdata)
175{
176#ifdef CONFIG_NFS_V4
177 struct vfsmount *mnt = NULL;
178 switch (server->rpc_ops->version) {
179 case 2:
180 case 3:
181 mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
182 break;
183 case 4:
184 mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, mountdata);
185 }
186 return mnt;
187#else
188 return vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
189#endif
190}
191
192/**
193 * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
194 * @mnt_parent - mountpoint of parent directory
 195 * @dentry - dentry of the mountpoint being crossed
196 * @fh - filehandle for new root dentry
197 * @fattr - attributes for new root inode
198 *
199 */
200struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
201 const struct dentry *dentry, struct nfs_fh *fh,
202 struct nfs_fattr *fattr)
203{
204 struct nfs_clone_mount mountdata = {
205 .sb = mnt_parent->mnt_sb,
206 .dentry = dentry,
207 .fh = fh,
208 .fattr = fattr,
209 };
210 struct vfsmount *mnt = ERR_PTR(-ENOMEM);
211 char *page = (char *) __get_free_page(GFP_USER);
212 char *devname;
213
214 dprintk("%s: submounting on %s/%s\n", __FUNCTION__,
215 dentry->d_parent->d_name.name,
216 dentry->d_name.name);
217 if (page == NULL)
218 goto out;
219 devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
220 mnt = (struct vfsmount *)devname;
221 if (IS_ERR(devname))
222 goto free_page;
223 mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata);
224free_page:
225 free_page((unsigned long)page);
226out:
227 dprintk("%s: done\n", __FUNCTION__);
228 return mnt;
229}
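One subtlety of nfs_path() above is that the string is assembled backwards from the tail of the buffer, so the returned pointer aliases the middle of the caller's page; it is the page, not the returned pointer, that must eventually be freed. nfs_do_submount() gets this right; a stripped-down sketch of the same idiom (the helper itself is hypothetical):

static int example_print_devname(const struct vfsmount *mnt_parent,
				 const struct dentry *dentry)
{
	char *page = (char *) __get_free_page(GFP_USER);
	char *devname;
	int err = 0;

	if (page == NULL)
		return -ENOMEM;
	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
	if (IS_ERR(devname))
		err = PTR_ERR(devname);	/* typically -ENAMETOOLONG */
	else
		printk(KERN_DEBUG "NFS: devname is %s\n", devname);
	/* free the original page; devname points into its tail */
	free_page((unsigned long) page);
	return err;
}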
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index f0015fa876e1..67391eef6b93 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -23,12 +23,11 @@
23#include <linux/nfs.h> 23#include <linux/nfs.h>
24#include <linux/nfs2.h> 24#include <linux/nfs2.h>
25#include <linux/nfs_fs.h> 25#include <linux/nfs_fs.h>
26#include "internal.h"
26 27
27#define NFSDBG_FACILITY NFSDBG_XDR 28#define NFSDBG_FACILITY NFSDBG_XDR
28/* #define NFS_PARANOIA 1 */ 29/* #define NFS_PARANOIA 1 */
29 30
30extern int nfs_stat_to_errno(int stat);
31
32/* Mapping from NFS error code to "errno" error code. */ 31/* Mapping from NFS error code to "errno" error code. */
33#define errno_NFSERR_IO EIO 32#define errno_NFSERR_IO EIO
34 33
@@ -131,7 +130,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
131 fattr->du.nfs2.blocksize = ntohl(*p++); 130 fattr->du.nfs2.blocksize = ntohl(*p++);
132 rdev = ntohl(*p++); 131 rdev = ntohl(*p++);
133 fattr->du.nfs2.blocks = ntohl(*p++); 132 fattr->du.nfs2.blocks = ntohl(*p++);
134 fattr->fsid_u.nfs3 = ntohl(*p++); 133 fattr->fsid.major = ntohl(*p++);
134 fattr->fsid.minor = 0;
135 fattr->fileid = ntohl(*p++); 135 fattr->fileid = ntohl(*p++);
136 p = xdr_decode_time(p, &fattr->atime); 136 p = xdr_decode_time(p, &fattr->atime);
137 p = xdr_decode_time(p, &fattr->mtime); 137 p = xdr_decode_time(p, &fattr->mtime);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 33287879bd23..7322da4d2055 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -172,8 +172,10 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
172 inode->i_ino, acl, dfacl); 172 inode->i_ino, acl, dfacl);
173 spin_lock(&inode->i_lock); 173 spin_lock(&inode->i_lock);
174 __nfs3_forget_cached_acls(NFS_I(inode)); 174 __nfs3_forget_cached_acls(NFS_I(inode));
175 nfsi->acl_access = posix_acl_dup(acl); 175 if (!IS_ERR(acl))
176 nfsi->acl_default = posix_acl_dup(dfacl); 176 nfsi->acl_access = posix_acl_dup(acl);
177 if (!IS_ERR(dfacl))
178 nfsi->acl_default = posix_acl_dup(dfacl);
177 spin_unlock(&inode->i_lock); 179 spin_unlock(&inode->i_lock);
178} 180}
179 181
@@ -254,7 +256,9 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
254 res.acl_access = NULL; 256 res.acl_access = NULL;
255 } 257 }
256 } 258 }
257 nfs3_cache_acls(inode, res.acl_access, res.acl_default); 259 nfs3_cache_acls(inode,
260 (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL),
261 (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL));
258 262
259 switch(type) { 263 switch(type) {
260 case ACL_TYPE_ACCESS: 264 case ACL_TYPE_ACCESS:
@@ -329,6 +333,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
329 switch (status) { 333 switch (status) {
330 case 0: 334 case 0:
331 status = nfs_refresh_inode(inode, &fattr); 335 status = nfs_refresh_inode(inode, &fattr);
336 nfs3_cache_acls(inode, acl, dfacl);
332 break; 337 break;
333 case -EPFNOSUPPORT: 338 case -EPFNOSUPPORT:
334 case -EPROTONOSUPPORT: 339 case -EPROTONOSUPPORT:
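The nfs3acl.c fix above uses ERR_PTR(-EINVAL) as an in-band marker meaning "this ACL was not returned by the server, leave the cached slot untouched", and nfs3_cache_acls() now filters on IS_ERR() before caching. The sentinel pattern in isolation, using the names from the hunks above:

/* ERR_PTR(-EINVAL) acts as an "unknown, do not cache" marker */
acl = (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL);
if (!IS_ERR(acl))
	nfsi->acl_access = posix_acl_dup(acl);	/* takes a reference */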
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index cf186f0d2b3b..7143b1f82cea 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -20,11 +20,10 @@
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21 21
22#include "iostat.h" 22#include "iostat.h"
23#include "internal.h"
23 24
24#define NFSDBG_FACILITY NFSDBG_PROC 25#define NFSDBG_FACILITY NFSDBG_PROC
25 26
26extern struct rpc_procinfo nfs3_procedures[];
27
28/* A wrapper to handle the EJUKEBOX error message */ 27/* A wrapper to handle the EJUKEBOX error message */
29static int 28static int
30nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) 29nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
@@ -809,8 +808,6 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
809 return status; 808 return status;
810} 809}
811 810
812extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
813
814static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) 811static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
815{ 812{
816 if (nfs3_async_handle_jukebox(task, data->inode)) 813 if (nfs3_async_handle_jukebox(task, data->inode))
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index ec233619687e..0250269e9753 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -22,14 +22,13 @@
22#include <linux/nfs3.h> 22#include <linux/nfs3.h>
23#include <linux/nfs_fs.h> 23#include <linux/nfs_fs.h>
24#include <linux/nfsacl.h> 24#include <linux/nfsacl.h>
25#include "internal.h"
25 26
26#define NFSDBG_FACILITY NFSDBG_XDR 27#define NFSDBG_FACILITY NFSDBG_XDR
27 28
28/* Mapping from NFS error code to "errno" error code. */ 29/* Mapping from NFS error code to "errno" error code. */
29#define errno_NFSERR_IO EIO 30#define errno_NFSERR_IO EIO
30 31
31extern int nfs_stat_to_errno(int);
32
33/* 32/*
34 * Declare the space requirements for NFS arguments and replies as 33 * Declare the space requirements for NFS arguments and replies as
35 * number of 32bit-words 34 * number of 32bit-words
@@ -166,7 +165,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
166 if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor) 165 if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
167 fattr->rdev = 0; 166 fattr->rdev = 0;
168 167
169 p = xdr_decode_hyper(p, &fattr->fsid_u.nfs3); 168 p = xdr_decode_hyper(p, &fattr->fsid.major);
169 fattr->fsid.minor = 0;
170 p = xdr_decode_hyper(p, &fattr->fileid); 170 p = xdr_decode_hyper(p, &fattr->fileid);
171 p = xdr_decode_time3(p, &fattr->atime); 171 p = xdr_decode_time3(p, &fattr->atime);
172 p = xdr_decode_time3(p, &fattr->mtime); 172 p = xdr_decode_time3(p, &fattr->mtime);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 0f5e4e7cddec..9a102860df37 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -217,6 +217,9 @@ extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *);
217extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state); 217extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state);
218extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 218extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
219extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); 219extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
220extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
221extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
222 struct nfs4_fs_locations *fs_locations, struct page *page);
220 223
221extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; 224extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
222extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops; 225extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
@@ -225,6 +228,7 @@ extern const u32 nfs4_fattr_bitmap[2];
225extern const u32 nfs4_statfs_bitmap[2]; 228extern const u32 nfs4_statfs_bitmap[2];
226extern const u32 nfs4_pathconf_bitmap[2]; 229extern const u32 nfs4_pathconf_bitmap[2];
227extern const u32 nfs4_fsinfo_bitmap[2]; 230extern const u32 nfs4_fsinfo_bitmap[2];
231extern const u32 nfs4_fs_locations_bitmap[2];
228 232
229/* nfs4renewd.c */ 233/* nfs4renewd.c */
230extern void nfs4_schedule_state_renewal(struct nfs4_client *); 234extern void nfs4_schedule_state_renewal(struct nfs4_client *);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
new file mode 100644
index 000000000000..ea38d27b74e6
--- /dev/null
+++ b/fs/nfs/nfs4namespace.c
@@ -0,0 +1,201 @@
1/*
2 * linux/fs/nfs/nfs4namespace.c
3 *
4 * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
5 *
6 * NFSv4 namespace
7 */
8
9#include <linux/config.h>
10
11#include <linux/dcache.h>
12#include <linux/mount.h>
13#include <linux/namei.h>
14#include <linux/nfs_fs.h>
15#include <linux/string.h>
16#include <linux/sunrpc/clnt.h>
17#include <linux/vfs.h>
18#include <linux/inet.h>
19#include "internal.h"
20
21#define NFSDBG_FACILITY NFSDBG_VFS
22
23/*
 24 * Convert the NFSv4 pathname components into a '/'-separated path string
25 */
26static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname,
27 char *buffer, ssize_t buflen)
28{
29 char *end = buffer + buflen;
30 int n;
31
32 *--end = '\0';
33 buflen--;
34
35 n = pathname->ncomponents;
36 while (--n >= 0) {
37 struct nfs4_string *component = &pathname->components[n];
38 buflen -= component->len + 1;
39 if (buflen < 0)
40 goto Elong;
41 end -= component->len;
42 memcpy(end, component->data, component->len);
43 *--end = '/';
44 }
45 return end;
46Elong:
47 return ERR_PTR(-ENAMETOOLONG);
48}
49
50
51/**
 52 * nfs_follow_referral - set up a mountpoint upon hitting an NFS4ERR_MOVED referral
 53 * @mnt_parent - mountpoint of parent directory
 54 * @dentry - dentry of the referral
 55 * @locations - fs_locations data returned by the server; each location
 56 *              carries the rootpath on the new server and the list of
 57 *              servers (hostname and address) holding the filesystem
 58 *
59 *
60 */
61static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
62 const struct dentry *dentry,
63 struct nfs4_fs_locations *locations)
64{
65 struct vfsmount *mnt = ERR_PTR(-ENOENT);
66 struct nfs_clone_mount mountdata = {
67 .sb = mnt_parent->mnt_sb,
68 .dentry = dentry,
69 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
70 };
71 char *page, *page2;
72 char *path, *fs_path;
73 char *devname;
74 int loc, s;
75
76 if (locations == NULL || locations->nlocations <= 0)
77 goto out;
78
79 dprintk("%s: referral at %s/%s\n", __FUNCTION__,
80 dentry->d_parent->d_name.name, dentry->d_name.name);
81
82 /* Ensure fs path is a prefix of current dentry path */
83 page = (char *) __get_free_page(GFP_USER);
84 if (page == NULL)
85 goto out;
86 page2 = (char *) __get_free_page(GFP_USER);
87 if (page2 == NULL)
88 goto out;
89
90 path = nfs4_path(dentry, page, PAGE_SIZE);
91 if (IS_ERR(path))
92 goto out_free;
93
94 fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE);
95 if (IS_ERR(fs_path))
96 goto out_free;
97
98 if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
99 dprintk("%s: path %s does not begin with fsroot %s\n", __FUNCTION__, path, fs_path);
100 goto out_free;
101 }
102
103 devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
104 if (IS_ERR(devname)) {
105 mnt = (struct vfsmount *)devname;
106 goto out_free;
107 }
108
109 loc = 0;
110 while (loc < locations->nlocations && IS_ERR(mnt)) {
111 struct nfs4_fs_location *location = &locations->locations[loc];
112 char *mnt_path;
113
114 if (location == NULL || location->nservers <= 0 ||
115 location->rootpath.ncomponents == 0) {
116 loc++;
117 continue;
118 }
119
120 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
121 if (IS_ERR(mnt_path)) {
122 loc++;
123 continue;
124 }
125 mountdata.mnt_path = mnt_path;
126
127 s = 0;
128 while (s < location->nservers) {
129 struct sockaddr_in addr = {};
130
131 if (location->servers[s].len <= 0 ||
132 valid_ipaddr4(location->servers[s].data) < 0) {
133 s++;
134 continue;
135 }
136
137 mountdata.hostname = location->servers[s].data;
138 addr.sin_addr.s_addr = in_aton(mountdata.hostname);
139 addr.sin_family = AF_INET;
140 addr.sin_port = htons(NFS_PORT);
141 mountdata.addr = &addr;
142
143 mnt = vfs_kern_mount(&nfs_referral_nfs4_fs_type, 0, devname, &mountdata);
144 if (!IS_ERR(mnt)) {
145 break;
146 }
147 s++;
148 }
149 loc++;
150 }
151
152out_free:
153 free_page((unsigned long)page);
154 free_page((unsigned long)page2);
155out:
156 dprintk("%s: done\n", __FUNCTION__);
157 return mnt;
158}
159
160/*
 161 * nfs_do_refmount - handle crossing a referral on the server
 162 * @mnt_parent - mountpoint of the parent directory
 163 * @dentry - dentry of the referral
164 *
165 */
166struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
167{
168 struct vfsmount *mnt = ERR_PTR(-ENOENT);
169 struct dentry *parent;
170 struct nfs4_fs_locations *fs_locations = NULL;
171 struct page *page;
172 int err;
173
174 /* BUG_ON(IS_ROOT(dentry)); */
175 dprintk("%s: enter\n", __FUNCTION__);
176
177 page = alloc_page(GFP_KERNEL);
178 if (page == NULL)
179 goto out;
180
181 fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
182 if (fs_locations == NULL)
183 goto out_free;
184
185 /* Get locations */
186 parent = dget_parent(dentry);
187 dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name);
188 err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page);
189 dput(parent);
190 if (err != 0 || fs_locations->nlocations <= 0 ||
191 fs_locations->fs_path.ncomponents <= 0)
192 goto out_free;
193
194 mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations);
195out_free:
196 __free_page(page);
197 kfree(fs_locations);
198out:
199 dprintk("%s: done\n", __FUNCTION__);
200 return mnt;
201}
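nfs4_pathname_string() above walks the component array backwards, prefixing '/' before each component, so {"export", "home"} yields "/export/home". A userspace replica demonstrating this (types simplified; the kernel version takes struct nfs4_pathname and returns ERR_PTR(-ENAMETOOLONG) on overflow):

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

struct component { const char *data; int len; };

static char *pathname_string(const struct component *comp, int ncomponents,
			     char *buffer, ssize_t buflen)
{
	char *end = buffer + buflen;
	int n = ncomponents;

	*--end = '\0';
	buflen--;
	while (--n >= 0) {
		buflen -= comp[n].len + 1;
		if (buflen < 0)
			return NULL;	/* kernel code: ERR_PTR(-ENAMETOOLONG) */
		end -= comp[n].len;
		memcpy(end, comp[n].data, comp[n].len);
		*--end = '/';
	}
	return end;
}

int main(void)
{
	const struct component path[] = { { "export", 6 }, { "home", 4 } };
	char buf[64];

	assert(strcmp(pathname_string(path, 2, buf, sizeof(buf)), "/export/home") == 0);
	printf("ok\n");
	return 0;
}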
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d86c0db7b1e8..b4916b092194 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -65,8 +65,6 @@ static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *)
65static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); 65static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
66static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); 66static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
67static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp); 67static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp);
68extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
69extern struct rpc_procinfo nfs4_procedures[];
70 68
71/* Prevent leaks of NFSv4 errors into userland */ 69/* Prevent leaks of NFSv4 errors into userland */
72int nfs4_map_errors(int err) 70int nfs4_map_errors(int err)
@@ -121,6 +119,25 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
121 0 119 0
122}; 120};
123 121
122const u32 nfs4_fs_locations_bitmap[2] = {
123 FATTR4_WORD0_TYPE
124 | FATTR4_WORD0_CHANGE
125 | FATTR4_WORD0_SIZE
126 | FATTR4_WORD0_FSID
127 | FATTR4_WORD0_FILEID
128 | FATTR4_WORD0_FS_LOCATIONS,
129 FATTR4_WORD1_MODE
130 | FATTR4_WORD1_NUMLINKS
131 | FATTR4_WORD1_OWNER
132 | FATTR4_WORD1_OWNER_GROUP
133 | FATTR4_WORD1_RAWDEV
134 | FATTR4_WORD1_SPACE_USED
135 | FATTR4_WORD1_TIME_ACCESS
136 | FATTR4_WORD1_TIME_METADATA
137 | FATTR4_WORD1_TIME_MODIFY
138 | FATTR4_WORD1_MOUNTED_ON_FILEID
139};
140
124static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry, 141static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
125 struct nfs4_readdir_arg *readdir) 142 struct nfs4_readdir_arg *readdir)
126{ 143{
@@ -185,15 +202,15 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
185 spin_unlock(&clp->cl_lock); 202 spin_unlock(&clp->cl_lock);
186} 203}
187 204
188static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinfo) 205static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
189{ 206{
190 struct nfs_inode *nfsi = NFS_I(inode); 207 struct nfs_inode *nfsi = NFS_I(dir);
191 208
192 spin_lock(&inode->i_lock); 209 spin_lock(&dir->i_lock);
193 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 210 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
194 if (cinfo->before == nfsi->change_attr && cinfo->atomic) 211 if (cinfo->before == nfsi->change_attr && cinfo->atomic)
195 nfsi->change_attr = cinfo->after; 212 nfsi->change_attr = cinfo->after;
196 spin_unlock(&inode->i_lock); 213 spin_unlock(&dir->i_lock);
197} 214}
198 215
199struct nfs4_opendata { 216struct nfs4_opendata {
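update_changeattr() above consumes the NFSv4 change_info4 triple that directory-modifying operations return. For reference, the client-side structure is presumably along these lines (cf. RFC 3530; the definition itself is not part of this diff):

struct nfs4_change_info {
	u32	atomic;		/* before/after captured atomically with the op */
	u64	before;		/* directory change attribute before the operation */
	u64	after;		/* directory change attribute after the operation */
};

Only when 'atomic' is set and 'before' matches the cached change_attr may the client step its cache straight to 'after'; otherwise the NFS_INO_INVALID_* flags set above force a revalidation.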
@@ -1331,7 +1348,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
1331 return status; 1348 return status;
1332} 1349}
1333 1350
1334static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) 1351int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
1335{ 1352{
1336 struct nfs4_exception exception = { }; 1353 struct nfs4_exception exception = { };
1337 int err; 1354 int err;
@@ -1443,6 +1460,50 @@ out:
1443 return nfs4_map_errors(status); 1460 return nfs4_map_errors(status);
1444} 1461}
1445 1462
1463/*
1464 * Get locations and (maybe) other attributes of a referral.
1465 * Note that we'll actually follow the referral later when
1466 * we detect fsid mismatch in inode revalidation
1467 */
1468static int nfs4_get_referral(struct inode *dir, struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle)
1469{
1470 int status = -ENOMEM;
1471 struct page *page = NULL;
1472 struct nfs4_fs_locations *locations = NULL;
1473 struct dentry dentry = {};
1474
1475 page = alloc_page(GFP_KERNEL);
1476 if (page == NULL)
1477 goto out;
1478 locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
1479 if (locations == NULL)
1480 goto out;
1481
1482 dentry.d_name.name = name->name;
1483 dentry.d_name.len = name->len;
1484 status = nfs4_proc_fs_locations(dir, &dentry, locations, page);
1485 if (status != 0)
1486 goto out;
1487 /* Make sure server returned a different fsid for the referral */
1488 if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
1489 dprintk("%s: server did not return a different fsid for a referral at %s\n", __FUNCTION__, name->name);
1490 status = -EIO;
1491 goto out;
1492 }
1493
1494 memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr));
1495 fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL;
1496 if (!fattr->mode)
1497 fattr->mode = S_IFDIR;
1498 memset(fhandle, 0, sizeof(struct nfs_fh));
1499out:
1500 if (page)
1501 __free_page(page);
1502 if (locations)
1503 kfree(locations);
1504 return status;
1505}
1506
1446static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 1507static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
1447{ 1508{
1448 struct nfs4_getattr_arg args = { 1509 struct nfs4_getattr_arg args = {
@@ -1547,6 +1608,8 @@ static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
1547 1608
1548 dprintk("NFS call lookup %s\n", name->name); 1609 dprintk("NFS call lookup %s\n", name->name);
1549 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 1610 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
1611 if (status == -NFS4ERR_MOVED)
1612 status = nfs4_get_referral(dir, name, fattr, fhandle);
1550 dprintk("NFS reply lookup: %d\n", status); 1613 dprintk("NFS reply lookup: %d\n", status);
1551 return status; 1614 return status;
1552} 1615}
@@ -2008,7 +2071,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
2008 if (!status) { 2071 if (!status) {
2009 update_changeattr(dir, &res.cinfo); 2072 update_changeattr(dir, &res.cinfo);
2010 nfs_post_op_update_inode(dir, res.dir_attr); 2073 nfs_post_op_update_inode(dir, res.dir_attr);
2011 nfs_refresh_inode(inode, res.fattr); 2074 nfs_post_op_update_inode(inode, res.fattr);
2012 } 2075 }
2013 2076
2014 return status; 2077 return status;
@@ -3570,6 +3633,36 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
3570 return len; 3633 return len;
3571} 3634}
3572 3635
3636int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
3637 struct nfs4_fs_locations *fs_locations, struct page *page)
3638{
3639 struct nfs_server *server = NFS_SERVER(dir);
3640 u32 bitmask[2] = {
3641 [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
3642 [1] = FATTR4_WORD1_MOUNTED_ON_FILEID,
3643 };
3644 struct nfs4_fs_locations_arg args = {
3645 .dir_fh = NFS_FH(dir),
3646 .name = &dentry->d_name,
3647 .page = page,
3648 .bitmask = bitmask,
3649 };
3650 struct rpc_message msg = {
3651 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
3652 .rpc_argp = &args,
3653 .rpc_resp = fs_locations,
3654 };
3655 int status;
3656
3657 dprintk("%s: start\n", __FUNCTION__);
3658 fs_locations->fattr.valid = 0;
3659 fs_locations->server = server;
3660 fs_locations->nlocations = 0;
3661 status = rpc_call_sync(server->client, &msg, 0);
3662 dprintk("%s: returned status = %d\n", __FUNCTION__, status);
3663 return status;
3664}
3665
3573struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = { 3666struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
3574 .recover_open = nfs4_open_reclaim, 3667 .recover_open = nfs4_open_reclaim,
3575 .recover_lock = nfs4_lock_reclaim, 3668 .recover_lock = nfs4_lock_reclaim,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7c5d70efe720..1750d996f49f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -411,6 +411,15 @@ static int nfs_stat_to_errno(int);
411#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ 411#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \
412 decode_putfh_maxsz + \ 412 decode_putfh_maxsz + \
413 op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) 413 op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
414#define NFS4_enc_fs_locations_sz \
415 (compound_encode_hdr_maxsz + \
416 encode_putfh_maxsz + \
417 encode_getattr_maxsz)
418#define NFS4_dec_fs_locations_sz \
419 (compound_decode_hdr_maxsz + \
420 decode_putfh_maxsz + \
421 op_decode_hdr_maxsz + \
422 nfs4_fattr_bitmap_maxsz)
414 423
415static struct { 424static struct {
416 unsigned int mode; 425 unsigned int mode;
@@ -722,6 +731,13 @@ static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask)
722 bitmask[1] & nfs4_fsinfo_bitmap[1]); 731 bitmask[1] & nfs4_fsinfo_bitmap[1]);
723} 732}
724 733
734static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask)
735{
736 return encode_getattr_two(xdr,
737 bitmask[0] & nfs4_fs_locations_bitmap[0],
738 bitmask[1] & nfs4_fs_locations_bitmap[1]);
739}
740
725static int encode_getfh(struct xdr_stream *xdr) 741static int encode_getfh(struct xdr_stream *xdr)
726{ 742{
727 uint32_t *p; 743 uint32_t *p;
@@ -2003,6 +2019,38 @@ out:
2003} 2019}
2004 2020
2005/* 2021/*
2022 * Encode FS_LOCATIONS request
2023 */
2024static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations_arg *args)
2025{
2026 struct xdr_stream xdr;
2027 struct compound_hdr hdr = {
2028 .nops = 3,
2029 };
2030 struct rpc_auth *auth = req->rq_task->tk_auth;
2031 int replen;
2032 int status;
2033
2034 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2035 encode_compound_hdr(&xdr, &hdr);
2036 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
2037 goto out;
2038 if ((status = encode_lookup(&xdr, args->name)) != 0)
2039 goto out;
2040 if ((status = encode_fs_locations(&xdr, args->bitmask)) != 0)
2041 goto out;
2042 /* set up reply
2043 * toplevel_status + OP_PUTFH + status
2044 * + OP_LOOKUP + status + OP_GETATTR + status = 7
2045 */
2046 replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
2047 xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
2048 0, PAGE_SIZE);
2049out:
2050 return status;
2051}
2052
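The reply-length calculation above is worth unpacking. The fixed-size part of the reply that precedes the paged GETATTR data is seven 32-bit XDR words, exactly as the comment counts them; adding the RPC reply header and the auth verifier slack and converting words to bytes gives:

    /* all quantities in 32-bit words until the final shift */
    replen = (RPC_REPHDRSIZE        /* RPC reply header */
              + auth->au_rslack     /* verifier slack for this flavour */
              + 1                   /* toplevel compound status */
              + 2                   /* OP_PUTFH  opcode + status */
              + 2                   /* OP_LOOKUP opcode + status */
              + 2)                  /* OP_GETATTR opcode + status */
             << 2;                  /* words -> bytes */

xdr_inline_pages() then splices args->page into the receive buffer at that byte offset, so the attribute data lands in the page rather than in the inline buffer.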
2053/*
2006 * START OF "GENERIC" DECODE ROUTINES. 2054 * START OF "GENERIC" DECODE ROUTINES.
2007 * These may look a little ugly since they are imported from a "generic" 2055 * These may look a little ugly since they are imported from a "generic"
2008 * set of XDR encode/decode routines which are intended to be shared by 2056 * set of XDR encode/decode routines which are intended to be shared by
@@ -2036,7 +2084,7 @@ out:
2036 } \ 2084 } \
2037} while (0) 2085} while (0)
2038 2086
2039static int decode_opaque_inline(struct xdr_stream *xdr, uint32_t *len, char **string) 2087static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
2040{ 2088{
2041 uint32_t *p; 2089 uint32_t *p;
2042 2090
@@ -2087,7 +2135,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
2087static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp) 2135static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp)
2088{ 2136{
2089 uint32_t *p; 2137 uint32_t *p;
2090 uint32_t strlen; 2138 unsigned int strlen;
2091 char *str; 2139 char *str;
2092 2140
2093 READ_BUF(12); 2141 READ_BUF(12);
@@ -2217,7 +2265,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
2217 return 0; 2265 return 0;
2218} 2266}
2219 2267
2220static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fsid *fsid) 2268static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
2221{ 2269{
2222 uint32_t *p; 2270 uint32_t *p;
2223 2271
@@ -2285,6 +2333,22 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2285 return 0; 2333 return 0;
2286} 2334}
2287 2335
2336static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2337{
2338 uint32_t *p;
2339
2340 *fileid = 0;
2341 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
2342 return -EIO;
2343 if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
2344 READ_BUF(8);
2345 READ64(*fileid);
2346 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
2347 }
2348 dprintk("%s: fileid=%Lu\n", __FUNCTION__, (unsigned long long)*fileid);
2349 return 0;
2350}
2351
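The (flag - 1U) test in this decoder follows the convention used throughout this file: GETATTR attributes appear on the wire in ascending bit order, and every decode_attr_*() helper clears its bit from the bitmap once it has consumed the attribute. If any lower-order bit is still set when this decoder runs, an earlier attribute was never decoded and the stream position is unknowable, hence -EIO. Mounted_on_fileid is attribute 55, i.e. bit 55 - 32 = 23 of word 1, so concretely:

    u32 bit   = 1U << 23;           /* FATTR4_WORD1_MOUNTED_ON_FILEID */
    u32 lower = bit - 1U;           /* 0x007fffff: every lower-order bit */

    /* example: some lower attribute (bit 4 of word 1) was requested but
     * its decoder never ran: */
    u32 bitmap1 = bit | (1U << 4);
    /* bitmap1 & lower == (1U << 4) != 0  =>  this decoder returns -EIO */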
2288static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2352static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
2289{ 2353{
2290 uint32_t *p; 2354 uint32_t *p;
@@ -2336,6 +2400,116 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
2336 return status; 2400 return status;
2337} 2401}
2338 2402
2403static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
2404{
2405 int n;
2406 uint32_t *p;
2407 int status = 0;
2408
2409 READ_BUF(4);
2410 READ32(n);
2411 if (n < 0)
2412 goto out_eio;
2413 if (n == 0)
2414 goto root_path;
2415 dprintk("path ");
2416 path->ncomponents = 0;
2417 while (path->ncomponents < n) {
2418 struct nfs4_string *component = &path->components[path->ncomponents];
2419 status = decode_opaque_inline(xdr, &component->len, &component->data);
2420 if (unlikely(status != 0))
2421 goto out_eio;
2422 if (path->ncomponents != n)
2423 dprintk("/");
2424 dprintk("%s", component->data);
2425 if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS)
2426 path->ncomponents++;
2427 else {
2428 dprintk("cannot parse %d components in path\n", n);
2429 goto out_eio;
2430 }
2431 }
2432out:
2433 dprintk("\n");
2434 return status;
2435root_path:
2436/* a root pathname is sent as a pathname4 with zero components */
2437 path->ncomponents = 1;
2438 path->components[0].len = 0;
2439 path->components[0].data = NULL;
2440 dprintk("path /\n");
2441 goto out;
2442out_eio:
2443 dprintk(" status %d", status);
2444 status = -EIO;
2445 goto out;
2446}
2447
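Per RFC 3530 a pathname4 is an array of component4 strings, and a filesystem root is sent as an array with zero components; the decoder normalizes that to a single empty component so callers always see ncomponents >= 1. Conceptually:

    /* wire layout for "/export/home":  n = 2, opaque "export", opaque "home"
     * wire layout for "/":             n = 0  (no components at all)
     *
     * what the decoder hands back for "/" (illustrative values): */
    struct nfs4_pathname root_example = {
            .ncomponents = 1,
            .components  = { [0] = { .len = 0, .data = NULL } },
    };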
2448static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
2449{
2450 int n;
2451 uint32_t *p;
2452 int status = -EIO;
2453
2454 if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS - 1U)))
2455 goto out;
2456 status = 0;
2457 if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
2458 goto out;
2459 dprintk("%s: fsroot ", __FUNCTION__);
2460 status = decode_pathname(xdr, &res->fs_path);
2461 if (unlikely(status != 0))
2462 goto out;
2463 READ_BUF(4);
2464 READ32(n);
2465 if (n <= 0)
2466 goto out_eio;
2467 res->nlocations = 0;
2468 while (res->nlocations < n) {
2469 int m;
2470 struct nfs4_fs_location *loc = &res->locations[res->nlocations];
2471
2472 READ_BUF(4);
2473 READ32(m);
2474 if (m <= 0)
2475 goto out_eio;
2476
2477 loc->nservers = 0;
2478 dprintk("%s: servers ", __FUNCTION__);
2479 while (loc->nservers < m) {
2480 struct nfs4_string *server = &loc->servers[loc->nservers];
2481 status = decode_opaque_inline(xdr, &server->len, &server->data);
2482 if (unlikely(status != 0))
2483 goto out_eio;
2484 dprintk("%s ", server->data);
2485 if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS)
2486 loc->nservers++;
2487 else {
2488 int i;
2489 dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations);
2490 for (i = loc->nservers; i < m; i++) {
2491 unsigned int len;
2492 char *data;
2493 status = decode_opaque_inline(xdr, &len, &data);
2494 if (unlikely(status != 0))
2495 goto out_eio;
2496 }
2497 }
2498 }
2499 status = decode_pathname(xdr, &loc->rootpath);
2500 if (unlikely(status != 0))
2501 goto out_eio;
2502 if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
2503 res->nlocations++;
2504 }
2505out:
2506 dprintk("%s: fs_locations done, error = %d\n", __FUNCTION__, status);
2507 return status;
2508out_eio:
2509 status = -EIO;
2510 goto out;
2511}
2512
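This decoder walks the fs_locations4 attribute from RFC 3530: the root pathname of the filesystem on the present server, then an array of alternative locations, each pairing a list of server hostnames with the filesystem's path on those servers. Entries beyond the compile-time caps (NFS4_FS_LOCATION_MAXSERVERS servers per location, NFS4_FS_LOCATIONS_MAXENTRIES locations) are still parsed, to keep the stream in sync, but are dropped. Schematically:

    /* fs_locations4 (XDR, conceptual):
     *   pathname4  fs_root;                 -> res->fs_path
     *   uint32     nlocations;              must be > 0
     *   nlocations times:
     *       uint32     nservers;            must be > 0
     *       nservers times:
     *           opaque server<>;            hostname/address string
     *       pathname4  rootpath;            path on those servers
     */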
2339static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2513static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
2340{ 2514{
2341 uint32_t *p; 2515 uint32_t *p;
@@ -2841,6 +3015,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
2841 bitmap[2] = {0}, 3015 bitmap[2] = {0},
2842 type; 3016 type;
2843 int status, fmode = 0; 3017 int status, fmode = 0;
3018 uint64_t fileid;
2844 3019
2845 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 3020 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
2846 goto xdr_error; 3021 goto xdr_error;
@@ -2863,10 +3038,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
2863 goto xdr_error; 3038 goto xdr_error;
2864 if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) 3039 if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0)
2865 goto xdr_error; 3040 goto xdr_error;
2866 if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid_u.nfs4)) != 0) 3041 if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0)
2867 goto xdr_error; 3042 goto xdr_error;
2868 if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) 3043 if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
2869 goto xdr_error; 3044 goto xdr_error;
3045 if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
3046 struct nfs4_fs_locations,
3047 fattr))) != 0)
3048 goto xdr_error;
2870 if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) 3049 if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
2871 goto xdr_error; 3050 goto xdr_error;
2872 fattr->mode |= fmode; 3051 fattr->mode |= fmode;
@@ -2886,6 +3065,10 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
2886 goto xdr_error; 3065 goto xdr_error;
2887 if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) 3066 if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0)
2888 goto xdr_error; 3067 goto xdr_error;
3068 if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0)
3069 goto xdr_error;
3070 if (fattr->fileid == 0 && fileid != 0)
3071 fattr->fileid = fileid;
2889 if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) 3072 if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
2890 fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; 3073 fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4;
2891xdr_error: 3074xdr_error:
@@ -3350,8 +3533,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
3350 attrlen, recvd); 3533 attrlen, recvd);
3351 return -EINVAL; 3534 return -EINVAL;
3352 } 3535 }
3353 if (attrlen <= *acl_len) 3536 xdr_read_pages(xdr, attrlen);
3354 xdr_read_pages(xdr, attrlen);
3355 *acl_len = attrlen; 3537 *acl_len = attrlen;
3356 } else 3538 } else
3357 status = -EOPNOTSUPP; 3539 status = -EOPNOTSUPP;
@@ -4211,6 +4393,29 @@ out:
4211 return status; 4393 return status;
4212} 4394}
4213 4395
4396/*
4397 * FS_LOCATIONS request
4398 */
4399static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations *res)
4400{
4401 struct xdr_stream xdr;
4402 struct compound_hdr hdr;
4403 int status;
4404
4405 xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
4406 status = decode_compound_hdr(&xdr, &hdr);
4407 if (status != 0)
4408 goto out;
4409 if ((status = decode_putfh(&xdr)) != 0)
4410 goto out;
4411 if ((status = decode_lookup(&xdr)) != 0)
4412 goto out;
4413 xdr_enter_page(&xdr, PAGE_SIZE);
4414 status = decode_getfattr(&xdr, &res->fattr, res->server);
4415out:
4416 return status;
4417}
4418
4214uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus) 4419uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus)
4215{ 4420{
4216 uint32_t bitmap[2] = {0}; 4421 uint32_t bitmap[2] = {0};
@@ -4382,6 +4587,7 @@ struct rpc_procinfo nfs4_procedures[] = {
4382 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), 4587 PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn),
4383 PROC(GETACL, enc_getacl, dec_getacl), 4588 PROC(GETACL, enc_getacl, dec_getacl),
4384 PROC(SETACL, enc_setacl, dec_setacl), 4589 PROC(SETACL, enc_setacl, dec_setacl),
4590 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
4385}; 4591};
4386 4592
4387struct rpc_version nfs_version4 = { 4593struct rpc_version nfs_version4 = {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 106aca388ebc..d89f6fb3b3a3 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -325,6 +325,7 @@ out:
325 325
326/** 326/**
327 * nfs_scan_list - Scan a list for matching requests 327 * nfs_scan_list - Scan a list for matching requests
328 * @nfsi: NFS inode
328 * @head: One of the NFS inode request lists 329 * @head: One of the NFS inode request lists
329 * @dst: Destination list 330 * @dst: Destination list
330 * @idx_start: lower bound of page->index to scan 331 * @idx_start: lower bound of page->index to scan
@@ -336,14 +337,15 @@ out:
336 * The requests are *not* checked to ensure that they form a contiguous set. 337 * The requests are *not* checked to ensure that they form a contiguous set.
337 * You must be holding the inode's req_lock when calling this function 338 * You must be holding the inode's req_lock when calling this function
338 */ 339 */
339int 340int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
340nfs_scan_list(struct list_head *head, struct list_head *dst, 341 struct list_head *dst, unsigned long idx_start,
341 unsigned long idx_start, unsigned int npages) 342 unsigned int npages)
342{ 343{
343 struct list_head *pos, *tmp; 344 struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
344 struct nfs_page *req; 345 struct nfs_page *req;
345 unsigned long idx_end; 346 unsigned long idx_end;
346 int res; 347 int found, i;
348 int res;
347 349
348 res = 0; 350 res = 0;
349 if (npages == 0) 351 if (npages == 0)
@@ -351,25 +353,32 @@ nfs_scan_list(struct list_head *head, struct list_head *dst,
351 else 353 else
352 idx_end = idx_start + npages - 1; 354 idx_end = idx_start + npages - 1;
353 355
354 list_for_each_safe(pos, tmp, head) { 356 for (;;) {
355 357 found = radix_tree_gang_lookup(&nfsi->nfs_page_tree,
356 req = nfs_list_entry(pos); 358 (void **)&pgvec[0], idx_start,
357 359 NFS_SCAN_MAXENTRIES);
358 if (req->wb_index < idx_start) 360 if (found <= 0)
359 continue;
360 if (req->wb_index > idx_end)
361 break; 361 break;
362 for (i = 0; i < found; i++) {
363 req = pgvec[i];
364 if (req->wb_index > idx_end)
365 goto out;
366 idx_start = req->wb_index + 1;
367 if (req->wb_list_head != head)
368 continue;
369 if (nfs_set_page_writeback_locked(req)) {
370 nfs_list_remove_request(req);
371 nfs_list_add_request(req, dst);
372 res++;
373 }
374 }
362 375
363 if (!nfs_set_page_writeback_locked(req))
364 continue;
365 nfs_list_remove_request(req);
366 nfs_list_add_request(req, dst);
367 res++;
368 } 376 }
377out:
369 return res; 378 return res;
370} 379}
371 380
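The rewrite replaces a linear walk of the request list with batched lookups in the per-inode radix tree, keyed by page index. The general pattern, which nfs_scan_list() instantiates with NFS_SCAN_MAXENTRIES-sized batches, looks like this (a sketch; key_of() stands in for req->wb_index, and the body of the inner loop is whatever filtering the caller wants):

    void *vec[16];
    unsigned long index = idx_start;
    int i, found;

    for (;;) {
            found = radix_tree_gang_lookup(&tree, vec, index,
                                           ARRAY_SIZE(vec));
            if (found <= 0)
                    break;                  /* tree exhausted */
            for (i = 0; i < found; i++) {
                    if (key_of(vec[i]) > idx_end)
                            goto done;      /* past the scan window */
                    index = key_of(vec[i]) + 1;   /* resume point */
                    /* ... filter and collect vec[i] ... */
            }
    }
    done: ;

Because radix_tree_gang_lookup() returns entries with keys >= index in ascending order, advancing index past the last key seen guarantees forward progress without holding references between batches.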
372int nfs_init_nfspagecache(void) 381int __init nfs_init_nfspagecache(void)
373{ 382{
374 nfs_page_cachep = kmem_cache_create("nfs_page", 383 nfs_page_cachep = kmem_cache_create("nfs_page",
375 sizeof(struct nfs_page), 384 sizeof(struct nfs_page),
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 9dd85cac2df0..b3899ea3229e 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -44,11 +44,10 @@
44#include <linux/nfs_page.h> 44#include <linux/nfs_page.h>
45#include <linux/lockd/bind.h> 45#include <linux/lockd/bind.h>
46#include <linux/smp_lock.h> 46#include <linux/smp_lock.h>
47#include "internal.h"
47 48
48#define NFSDBG_FACILITY NFSDBG_PROC 49#define NFSDBG_FACILITY NFSDBG_PROC
49 50
50extern struct rpc_procinfo nfs_procedures[];
51
52/* 51/*
53 * Bare-bones access to getattr: this is for nfs_read_super. 52 * Bare-bones access to getattr: this is for nfs_read_super.
54 */ 53 */
@@ -611,8 +610,6 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
611 return 0; 610 return 0;
612} 611}
613 612
614extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
615
616static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) 613static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
617{ 614{
618 if (task->tk_status >= 0) { 615 if (task->tk_status >= 0) {
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 624ca7146b6b..32cf3773af0c 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -51,14 +51,11 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
51 if (p) { 51 if (p) {
52 memset(p, 0, sizeof(*p)); 52 memset(p, 0, sizeof(*p));
53 INIT_LIST_HEAD(&p->pages); 53 INIT_LIST_HEAD(&p->pages);
54 if (pagecount < NFS_PAGEVEC_SIZE) 54 if (pagecount <= ARRAY_SIZE(p->page_array))
55 p->pagevec = &p->page_array[0]; 55 p->pagevec = p->page_array;
56 else { 56 else {
57 size_t size = ++pagecount * sizeof(struct page *); 57 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
58 p->pagevec = kmalloc(size, GFP_NOFS); 58 if (!p->pagevec) {
59 if (p->pagevec) {
60 memset(p->pagevec, 0, size);
61 } else {
62 mempool_free(p, nfs_rdata_mempool); 59 mempool_free(p, nfs_rdata_mempool);
63 p = NULL; 60 p = NULL;
64 } 61 }
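Two separate improvements in this hunk: the small-request test becomes inclusive (pagecount <= ARRAY_SIZE(p->page_array)), so a request that exactly fills the embedded array no longer takes the allocation path, and the open-coded kmalloc + memset, which also over-allocated one extra pointer via ++pagecount, becomes a kcalloc() of exactly pagecount zeroed pointers. kcalloc(n, size, flags) behaves roughly like the sketch below, plus a check that n * size does not overflow:

    /* conceptual equivalent of kcalloc(), minus the overflow check */
    static void *kcalloc_sketch(size_t n, size_t size, gfp_t flags)
    {
            void *p = kmalloc(n * size, flags);
            if (p)
                    memset(p, 0, n * size);
            return p;
    }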
@@ -104,6 +101,28 @@ int nfs_return_empty_page(struct page *page)
104 return 0; 101 return 0;
105} 102}
106 103
104static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
105{
106 unsigned int remainder = data->args.count - data->res.count;
107 unsigned int base = data->args.pgbase + data->res.count;
108 unsigned int pglen;
109 struct page **pages;
110
111 if (data->res.eof == 0 || remainder == 0)
112 return;
113 /*
114 * Note: "remainder" can never be negative, since we check for
115 * this in the XDR code.
116 */
117 pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
118 base &= ~PAGE_CACHE_MASK;
119 pglen = PAGE_CACHE_SIZE - base;
120 if (pglen < remainder)
121 memclear_highpage_flush(*pages, base, pglen);
122 else
123 memclear_highpage_flush(*pages, base, remainder);
124}
125
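A worked example makes the arithmetic concrete (4096-byte pages assumed). Say the READ asked for args.count = 8192 at args.pgbase = 0 and the server returned res.count = 5000 with eof set:

    /*  remainder = 8192 - 5000          = 3192  (unreturned bytes)
     *  base      = 0 + 5000             = 5000  (first uninitialised byte)
     *  page      = args.pages[5000 >> 12]       (page index 1)
     *  base     &= ~PAGE_CACHE_MASK     = 904   (offset within that page)
     *  pglen     = 4096 - 904           = 3192
     *  pglen < remainder is false, so memclear_highpage_flush() zeroes
     *  bytes 904..4095 of that page - the tail the server never filled.
     */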
107/* 126/*
108 * Read a page synchronously. 127 * Read a page synchronously.
109 */ 128 */
@@ -177,11 +196,9 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
177 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; 196 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
178 spin_unlock(&inode->i_lock); 197 spin_unlock(&inode->i_lock);
179 198
180 if (count) 199 nfs_readpage_truncate_uninitialised_page(rdata);
181 memclear_highpage_flush(page, rdata->args.pgbase, count); 200 if (rdata->res.eof || rdata->res.count == rdata->args.count)
182 SetPageUptodate(page); 201 SetPageUptodate(page);
183 if (PageError(page))
184 ClearPageError(page);
185 result = 0; 202 result = 0;
186 203
187io_error: 204io_error:
@@ -436,20 +453,12 @@ static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata)
436 struct nfs_page *req = data->req; 453 struct nfs_page *req = data->req;
437 struct page *page = req->wb_page; 454 struct page *page = req->wb_page;
438 455
456 if (likely(task->tk_status >= 0))
457 nfs_readpage_truncate_uninitialised_page(data);
458 else
459 SetPageError(page);
439 if (nfs_readpage_result(task, data) != 0) 460 if (nfs_readpage_result(task, data) != 0)
440 return; 461 return;
441 if (task->tk_status >= 0) {
442 unsigned int request = data->args.count;
443 unsigned int result = data->res.count;
444
445 if (result < request) {
446 memclear_highpage_flush(page,
447 data->args.pgbase + result,
448 request - result);
449 }
450 } else
451 SetPageError(page);
452
453 if (atomic_dec_and_test(&req->wb_complete)) { 462 if (atomic_dec_and_test(&req->wb_complete)) {
454 if (!PageError(page)) 463 if (!PageError(page))
455 SetPageUptodate(page); 464 SetPageUptodate(page);
@@ -462,6 +471,40 @@ static const struct rpc_call_ops nfs_read_partial_ops = {
462 .rpc_release = nfs_readdata_release, 471 .rpc_release = nfs_readdata_release,
463}; 472};
464 473
474static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data)
475{
476 unsigned int count = data->res.count;
477 unsigned int base = data->args.pgbase;
478 struct page **pages;
479
480 if (unlikely(count == 0))
481 return;
482 pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
483 base &= ~PAGE_CACHE_MASK;
484 count += base;
485 for (; count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
486 SetPageUptodate(*pages);
487 /*
488 * Was this an eof or a short read? If the latter, don't mark the page
489 * as uptodate yet.
490 */
491 if (count > 0 && (data->res.eof || data->args.count == data->res.count))
492 SetPageUptodate(*pages);
493}
494
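Only completely-filled pages are marked uptodate by the loop; the trailing, partially-filled page is marked only when the read legitimately ended there (eof, or the server returned every byte asked for), because by then nfs_readpage_truncate_uninitialised_page() has zeroed its tail. Continuing the example above:

    /*  res.count = 5000, pgbase = 0, eof = 1:
     *  count = 5000 -> first pass marks page 0 uptodate, count = 904
     *  count = 904 < PAGE_CACHE_SIZE -> loop exits, pages points at page 1
     *  count > 0 && eof -> page 1 is marked uptodate too; its last 3192
     *  bytes were zeroed by the truncate helper.
     */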
495static void nfs_readpage_set_pages_error(struct nfs_read_data *data)
496{
497 unsigned int count = data->args.count;
498 unsigned int base = data->args.pgbase;
499 struct page **pages;
500
501 pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
502 base &= ~PAGE_CACHE_MASK;
503 count += base;
504 for (; count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
505 SetPageError(*pages);
506}
507
465/* 508/*
466 * This is the callback from RPC telling us whether a reply was 509 * This is the callback from RPC telling us whether a reply was
467 * received or some error occurred (timeout or socket shutdown). 510 * received or some error occurred (timeout or socket shutdown).
@@ -469,27 +512,24 @@ static const struct rpc_call_ops nfs_read_partial_ops = {
469static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) 512static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
470{ 513{
471 struct nfs_read_data *data = calldata; 514 struct nfs_read_data *data = calldata;
472 unsigned int count = data->res.count;
473 515
516 /*
517 * Note: nfs_readpage_result may change the values of
518 * data->args. In the multi-page case, we therefore need
519 * to ensure that we call the next nfs_readpage_set_page_uptodate()
520 * first in the multi-page case.
521 */
522 if (likely(task->tk_status >= 0)) {
523 nfs_readpage_truncate_uninitialised_page(data);
524 nfs_readpage_set_pages_uptodate(data);
525 } else
526 nfs_readpage_set_pages_error(data);
474 if (nfs_readpage_result(task, data) != 0) 527 if (nfs_readpage_result(task, data) != 0)
475 return; 528 return;
476 while (!list_empty(&data->pages)) { 529 while (!list_empty(&data->pages)) {
477 struct nfs_page *req = nfs_list_entry(data->pages.next); 530 struct nfs_page *req = nfs_list_entry(data->pages.next);
478 struct page *page = req->wb_page;
479 nfs_list_remove_request(req);
480 531
481 if (task->tk_status >= 0) { 532 nfs_list_remove_request(req);
482 if (count < PAGE_CACHE_SIZE) {
483 if (count < req->wb_bytes)
484 memclear_highpage_flush(page,
485 req->wb_pgbase + count,
486 req->wb_bytes - count);
487 count = 0;
488 } else
489 count -= PAGE_CACHE_SIZE;
490 SetPageUptodate(page);
491 } else
492 SetPageError(page);
493 nfs_readpage_release(req); 533 nfs_readpage_release(req);
494 } 534 }
495} 535}
@@ -654,7 +694,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
654 return ret; 694 return ret;
655} 695}
656 696
657int nfs_init_readpagecache(void) 697int __init nfs_init_readpagecache(void)
658{ 698{
659 nfs_rdata_cachep = kmem_cache_create("nfs_read_data", 699 nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
660 sizeof(struct nfs_read_data), 700 sizeof(struct nfs_read_data),
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
new file mode 100644
index 000000000000..e8a9bee74d9d
--- /dev/null
+++ b/fs/nfs/super.c
@@ -0,0 +1,1537 @@
1/*
2 * linux/fs/nfs/super.c
3 *
4 * Copyright (C) 1992 Rick Sladkey
5 *
6 * nfs superblock handling functions
7 *
8 * Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some
9 * experimental NFS changes. Modularisation taken straight from SYS5 fs.
10 *
11 * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
12 * J.S.Peatfield@damtp.cam.ac.uk
13 *
14 * Split from inode.c by David Howells <dhowells@redhat.com>
15 *
16 */
17
18#include <linux/config.h>
19#include <linux/module.h>
20#include <linux/init.h>
21
22#include <linux/time.h>
23#include <linux/kernel.h>
24#include <linux/mm.h>
25#include <linux/string.h>
26#include <linux/stat.h>
27#include <linux/errno.h>
28#include <linux/unistd.h>
29#include <linux/sunrpc/clnt.h>
30#include <linux/sunrpc/stats.h>
31#include <linux/sunrpc/metrics.h>
32#include <linux/nfs_fs.h>
33#include <linux/nfs_mount.h>
34#include <linux/nfs4_mount.h>
35#include <linux/lockd/bind.h>
36#include <linux/smp_lock.h>
37#include <linux/seq_file.h>
38#include <linux/mount.h>
39#include <linux/nfs_idmap.h>
40#include <linux/vfs.h>
41#include <linux/inet.h>
42#include <linux/nfs_xdr.h>
43
44#include <asm/system.h>
45#include <asm/uaccess.h>
46
47#include "nfs4_fs.h"
48#include "callback.h"
49#include "delegation.h"
50#include "iostat.h"
51#include "internal.h"
52
53#define NFSDBG_FACILITY NFSDBG_VFS
54
55/* Maximum number of readahead requests
56 * FIXME: this should really be a sysctl so that users may tune it to suit
57 * their needs. People doing NFS over a slow network might, for
58 * instance, want to reduce it to something closer to 1 for improved
59 * interactive response.
60 */
61#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
62
63/*
64 * RPC cruft for NFS
65 */
66static struct rpc_version * nfs_version[] = {
67 NULL,
68 NULL,
69 &nfs_version2,
70#if defined(CONFIG_NFS_V3)
71 &nfs_version3,
72#elif defined(CONFIG_NFS_V4)
73 NULL,
74#endif
75#if defined(CONFIG_NFS_V4)
76 &nfs_version4,
77#endif
78};
79
80static struct rpc_program nfs_program = {
81 .name = "nfs",
82 .number = NFS_PROGRAM,
83 .nrvers = ARRAY_SIZE(nfs_version),
84 .version = nfs_version,
85 .stats = &nfs_rpcstat,
86 .pipe_dir_name = "/nfs",
87};
88
89struct rpc_stat nfs_rpcstat = {
90 .program = &nfs_program
91};
92
93
94#ifdef CONFIG_NFS_V3_ACL
95static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
96static struct rpc_version * nfsacl_version[] = {
97 [3] = &nfsacl_version3,
98};
99
100struct rpc_program nfsacl_program = {
101 .name = "nfsacl",
102 .number = NFS_ACL_PROGRAM,
103 .nrvers = ARRAY_SIZE(nfsacl_version),
104 .version = nfsacl_version,
105 .stats = &nfsacl_rpcstat,
106};
107#endif /* CONFIG_NFS_V3_ACL */
108
109static void nfs_umount_begin(struct vfsmount *, int);
110static int nfs_statfs(struct dentry *, struct kstatfs *);
111static int nfs_show_options(struct seq_file *, struct vfsmount *);
112static int nfs_show_stats(struct seq_file *, struct vfsmount *);
113static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
114static int nfs_clone_nfs_sb(struct file_system_type *fs_type,
115 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
116static void nfs_kill_super(struct super_block *);
117
118static struct file_system_type nfs_fs_type = {
119 .owner = THIS_MODULE,
120 .name = "nfs",
121 .get_sb = nfs_get_sb,
122 .kill_sb = nfs_kill_super,
123 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
124};
125
126struct file_system_type clone_nfs_fs_type = {
127 .owner = THIS_MODULE,
128 .name = "nfs",
129 .get_sb = nfs_clone_nfs_sb,
130 .kill_sb = nfs_kill_super,
131 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
132};
133
134static struct super_operations nfs_sops = {
135 .alloc_inode = nfs_alloc_inode,
136 .destroy_inode = nfs_destroy_inode,
137 .write_inode = nfs_write_inode,
138 .statfs = nfs_statfs,
139 .clear_inode = nfs_clear_inode,
140 .umount_begin = nfs_umount_begin,
141 .show_options = nfs_show_options,
142 .show_stats = nfs_show_stats,
143};
144
145#ifdef CONFIG_NFS_V4
146static int nfs4_get_sb(struct file_system_type *fs_type,
147 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
148static int nfs_clone_nfs4_sb(struct file_system_type *fs_type,
149 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
150static int nfs_referral_nfs4_sb(struct file_system_type *fs_type,
151 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
152static void nfs4_kill_super(struct super_block *sb);
153
154static struct file_system_type nfs4_fs_type = {
155 .owner = THIS_MODULE,
156 .name = "nfs4",
157 .get_sb = nfs4_get_sb,
158 .kill_sb = nfs4_kill_super,
159 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
160};
161
162struct file_system_type clone_nfs4_fs_type = {
163 .owner = THIS_MODULE,
164 .name = "nfs4",
165 .get_sb = nfs_clone_nfs4_sb,
166 .kill_sb = nfs4_kill_super,
167 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
168};
169
170struct file_system_type nfs_referral_nfs4_fs_type = {
171 .owner = THIS_MODULE,
172 .name = "nfs4",
173 .get_sb = nfs_referral_nfs4_sb,
174 .kill_sb = nfs4_kill_super,
175 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
176};
177
178static struct super_operations nfs4_sops = {
179 .alloc_inode = nfs_alloc_inode,
180 .destroy_inode = nfs_destroy_inode,
181 .write_inode = nfs_write_inode,
182 .statfs = nfs_statfs,
183 .clear_inode = nfs4_clear_inode,
184 .umount_begin = nfs_umount_begin,
185 .show_options = nfs_show_options,
186 .show_stats = nfs_show_stats,
187};
188#endif
189
190#ifdef CONFIG_NFS_V4
191static const int nfs_set_port_min = 0;
192static const int nfs_set_port_max = 65535;
193
194static int param_set_port(const char *val, struct kernel_param *kp)
195{
196 char *endp;
197 int num = simple_strtol(val, &endp, 0);
198 if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
199 return -EINVAL;
200 *((int *)kp->arg) = num;
201 return 0;
202}
203
204module_param_call(callback_tcpport, param_set_port, param_get_int,
205 &nfs_callback_set_tcpport, 0644);
206#endif
207
208#ifdef CONFIG_NFS_V4
209static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
210{
211 char *endp;
212 int num = simple_strtol(val, &endp, 0);
213 int jif = num * HZ;
214 if (endp == val || *endp || num < 0 || jif < num)
215 return -EINVAL;
216 *((int *)kp->arg) = jif;
217 return 0;
218}
219
220module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
221 &nfs_idmap_cache_timeout, 0644);
222#endif
223
224/*
225 * Register the NFS filesystems
226 */
227int __init register_nfs_fs(void)
228{
229 int ret;
230
231 ret = register_filesystem(&nfs_fs_type);
232 if (ret < 0)
233 goto error_0;
234
235#ifdef CONFIG_NFS_V4
236 ret = nfs_register_sysctl();
237 if (ret < 0)
238 goto error_1;
239 ret = register_filesystem(&nfs4_fs_type);
240 if (ret < 0)
241 goto error_2;
242#endif
243 return 0;
244
245#ifdef CONFIG_NFS_V4
246error_2:
247 nfs_unregister_sysctl();
248error_1:
249 unregister_filesystem(&nfs_fs_type);
250#endif
251error_0:
252 return ret;
253}
254
255/*
256 * Unregister the NFS filesystems
257 */
258void __exit unregister_nfs_fs(void)
259{
260#ifdef CONFIG_NFS_V4
261 unregister_filesystem(&nfs4_fs_type);
262 nfs_unregister_sysctl();
263#endif
264 unregister_filesystem(&nfs_fs_type);
265}
266
267/*
268 * Deliver file system statistics to userspace
269 */
270static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
271{
272 struct super_block *sb = dentry->d_sb;
273 struct nfs_server *server = NFS_SB(sb);
274 unsigned char blockbits;
275 unsigned long blockres;
276 struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode);
277 struct nfs_fattr fattr;
278 struct nfs_fsstat res = {
279 .fattr = &fattr,
280 };
281 int error;
282
283 lock_kernel();
284
285 error = server->rpc_ops->statfs(server, rootfh, &res);
286 buf->f_type = NFS_SUPER_MAGIC;
287 if (error < 0)
288 goto out_err;
289
290 /*
291 * Current versions of glibc do not correctly handle the
292 * case where f_frsize != f_bsize. Eventually we want to
293 * report the value of wtmult in this field.
294 */
295 buf->f_frsize = sb->s_blocksize;
296
297 /*
298 * On most *nix systems, f_blocks, f_bfree, and f_bavail
299 * are reported in units of f_frsize. Linux hasn't had
300 * an f_frsize field in its statfs struct until recently,
301 * thus historically Linux's sys_statfs reports these
302 * fields in units of f_bsize.
303 */
304 buf->f_bsize = sb->s_blocksize;
305 blockbits = sb->s_blocksize_bits;
306 blockres = (1 << blockbits) - 1;
307 buf->f_blocks = (res.tbytes + blockres) >> blockbits;
308 buf->f_bfree = (res.fbytes + blockres) >> blockbits;
309 buf->f_bavail = (res.abytes + blockres) >> blockbits;
310
311 buf->f_files = res.tfiles;
312 buf->f_ffree = res.afiles;
313
314 buf->f_namelen = server->namelen;
315 out:
316 unlock_kernel();
317 return 0;
318
319 out_err:
320 dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
321 buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1;
322 goto out;
323
324}
325
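The block conversion rounds byte counts up so that any nonzero space is never reported as zero blocks. With a 4096-byte s_blocksize:

    /*  blockbits = 12, blockres = (1 << 12) - 1 = 4095
     *  res.tbytes = 10,000,000
     *  f_blocks   = (10,000,000 + 4095) >> 12 = 2442
     *  (10,000,000 / 4096 = 2441.4..., rounded up rather than truncated)
     */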
326static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
327{
328 static struct {
329 rpc_authflavor_t flavour;
330 const char *str;
331 } sec_flavours[] = {
332 { RPC_AUTH_NULL, "null" },
333 { RPC_AUTH_UNIX, "sys" },
334 { RPC_AUTH_GSS_KRB5, "krb5" },
335 { RPC_AUTH_GSS_KRB5I, "krb5i" },
336 { RPC_AUTH_GSS_KRB5P, "krb5p" },
337 { RPC_AUTH_GSS_LKEY, "lkey" },
338 { RPC_AUTH_GSS_LKEYI, "lkeyi" },
339 { RPC_AUTH_GSS_LKEYP, "lkeyp" },
340 { RPC_AUTH_GSS_SPKM, "spkm" },
341 { RPC_AUTH_GSS_SPKMI, "spkmi" },
342 { RPC_AUTH_GSS_SPKMP, "spkmp" },
343 { -1, "unknown" }
344 };
345 int i;
346
347 for (i = 0; sec_flavours[i].flavour != -1; i++) {
348 if (sec_flavours[i].flavour == flavour)
349 break;
350 }
351 return sec_flavours[i].str;
352}
353
354/*
355 * Describe the mount options in force on this server representation
356 */
357static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
358{
359 static struct proc_nfs_info {
360 int flag;
361 char *str;
362 char *nostr;
363 } nfs_info[] = {
364 { NFS_MOUNT_SOFT, ",soft", ",hard" },
365 { NFS_MOUNT_INTR, ",intr", "" },
366 { NFS_MOUNT_NOCTO, ",nocto", "" },
367 { NFS_MOUNT_NOAC, ",noac", "" },
368 { NFS_MOUNT_NONLM, ",nolock", "" },
369 { NFS_MOUNT_NOACL, ",noacl", "" },
370 { 0, NULL, NULL }
371 };
372 struct proc_nfs_info *nfs_infop;
373 char buf[12];
374 char *proto;
375
376 seq_printf(m, ",vers=%d", nfss->rpc_ops->version);
377 seq_printf(m, ",rsize=%d", nfss->rsize);
378 seq_printf(m, ",wsize=%d", nfss->wsize);
379 if (nfss->acregmin != 3*HZ || showdefaults)
380 seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ);
381 if (nfss->acregmax != 60*HZ || showdefaults)
382 seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ);
383 if (nfss->acdirmin != 30*HZ || showdefaults)
384 seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ);
385 if (nfss->acdirmax != 60*HZ || showdefaults)
386 seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ);
387 for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
388 if (nfss->flags & nfs_infop->flag)
389 seq_puts(m, nfs_infop->str);
390 else
391 seq_puts(m, nfs_infop->nostr);
392 }
393 switch (nfss->client->cl_xprt->prot) {
394 case IPPROTO_TCP:
395 proto = "tcp";
396 break;
397 case IPPROTO_UDP:
398 proto = "udp";
399 break;
400 default:
401 snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
402 proto = buf;
403 }
404 seq_printf(m, ",proto=%s", proto);
405 seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);
406 seq_printf(m, ",retrans=%u", nfss->retrans_count);
407 seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
408}
409
410/*
411 * Describe the mount options on this VFS mountpoint
412 */
413static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
414{
415 struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
416
417 nfs_show_mount_options(m, nfss, 0);
418
419 seq_puts(m, ",addr=");
420 seq_escape(m, nfss->hostname, " \t\n\\");
421
422 return 0;
423}
424
425/*
426 * Present statistical information for this VFS mountpoint
427 */
428static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
429{
430 int i, cpu;
431 struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
432 struct rpc_auth *auth = nfss->client->cl_auth;
433 struct nfs_iostats totals = { };
434
435 seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
436
437 /*
438 * Display all mount option settings
439 */
440 seq_printf(m, "\n\topts:\t");
441 seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
442 seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
443 seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : "");
444 seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
445 nfs_show_mount_options(m, nfss, 1);
446
447 seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
448
449 seq_printf(m, "\n\tcaps:\t");
450 seq_printf(m, "caps=0x%x", nfss->caps);
451 seq_printf(m, ",wtmult=%d", nfss->wtmult);
452 seq_printf(m, ",dtsize=%d", nfss->dtsize);
453 seq_printf(m, ",bsize=%d", nfss->bsize);
454 seq_printf(m, ",namelen=%d", nfss->namelen);
455
456#ifdef CONFIG_NFS_V4
457 if (nfss->rpc_ops->version == 4) {
458 seq_printf(m, "\n\tnfsv4:\t");
459 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
460 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
461 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
462 }
463#endif
464
465 /*
466 * Display security flavor in effect for this mount
467 */
468 seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor);
469 if (auth->au_flavor)
470 seq_printf(m, ",pseudoflavor=%d", auth->au_flavor);
471
472 /*
473 * Display superblock I/O counters
474 */
475 for_each_possible_cpu(cpu) {
476 struct nfs_iostats *stats;
477
478 preempt_disable();
479 stats = per_cpu_ptr(nfss->io_stats, cpu);
480
481 for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
482 totals.events[i] += stats->events[i];
483 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
484 totals.bytes[i] += stats->bytes[i];
485
486 preempt_enable();
487 }
488
489 seq_printf(m, "\n\tevents:\t");
490 for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
491 seq_printf(m, "%lu ", totals.events[i]);
492 seq_printf(m, "\n\tbytes:\t");
493 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
494 seq_printf(m, "%Lu ", totals.bytes[i]);
495 seq_printf(m, "\n");
496
497 rpc_print_iostats(m, nfss->client);
498
499 return 0;
500}
501
502/*
503 * Begin unmount by attempting to remove all automounted mountpoints we added
504 * in response to traversals
505 */
506static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
507{
508 struct nfs_server *server;
509 struct rpc_clnt *rpc;
510
511 shrink_submounts(vfsmnt, &nfs_automount_list);
512 if (!(flags & MNT_FORCE))
513 return;
514 /* -EIO all pending I/O */
515 server = NFS_SB(vfsmnt->mnt_sb);
516 rpc = server->client;
517 if (!IS_ERR(rpc))
518 rpc_killall_tasks(rpc);
519 rpc = server->client_acl;
520 if (!IS_ERR(rpc))
521 rpc_killall_tasks(rpc);
522}
523
524/*
525 * Obtain the root inode of the file system.
526 */
527static struct inode *
528nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo)
529{
530 struct nfs_server *server = NFS_SB(sb);
531 int error;
532
533 error = server->rpc_ops->getroot(server, rootfh, fsinfo);
534 if (error < 0) {
535 dprintk("nfs_get_root: getattr error = %d\n", -error);
536 return ERR_PTR(error);
537 }
538
539 server->fsid = fsinfo->fattr->fsid;
540 return nfs_fhget(sb, rootfh, fsinfo->fattr);
541}
542
543/*
544 * Do NFS version-independent mount processing, and sanity checking
545 */
546static int
547nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
548{
549 struct nfs_server *server;
550 struct inode *root_inode;
551 struct nfs_fattr fattr;
552 struct nfs_fsinfo fsinfo = {
553 .fattr = &fattr,
554 };
555 struct nfs_pathconf pathinfo = {
556 .fattr = &fattr,
557 };
558 int no_root_error = 0;
559 unsigned long max_rpc_payload;
560
561 /* We probably want something more informative here */
562 snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
563
564 server = NFS_SB(sb);
565
566 sb->s_magic = NFS_SUPER_MAGIC;
567
568 server->io_stats = nfs_alloc_iostats();
569 if (server->io_stats == NULL)
570 return -ENOMEM;
571
572 root_inode = nfs_get_root(sb, &server->fh, &fsinfo);
573 /* Did getting the root inode fail? */
574 if (IS_ERR(root_inode)) {
575 no_root_error = PTR_ERR(root_inode);
576 goto out_no_root;
577 }
578 sb->s_root = d_alloc_root(root_inode);
579 if (!sb->s_root) {
580 no_root_error = -ENOMEM;
581 goto out_no_root;
582 }
583 sb->s_root->d_op = server->rpc_ops->dentry_ops;
584
585 /* mount time stamp, in seconds */
586 server->mount_time = jiffies;
587
588 /* Get some general file system info */
589 if (server->namelen == 0 &&
590 server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
591 server->namelen = pathinfo.max_namelen;
592 /* Work out a lot of parameters */
593 if (server->rsize == 0)
594 server->rsize = nfs_block_size(fsinfo.rtpref, NULL);
595 if (server->wsize == 0)
596 server->wsize = nfs_block_size(fsinfo.wtpref, NULL);
597
598 if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax)
599 server->rsize = nfs_block_size(fsinfo.rtmax, NULL);
600 if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax)
601 server->wsize = nfs_block_size(fsinfo.wtmax, NULL);
602
603 max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
604 if (server->rsize > max_rpc_payload)
605 server->rsize = max_rpc_payload;
606 if (server->rsize > NFS_MAX_FILE_IO_SIZE)
607 server->rsize = NFS_MAX_FILE_IO_SIZE;
608 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
609
610 if (server->wsize > max_rpc_payload)
611 server->wsize = max_rpc_payload;
612 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
613 server->wsize = NFS_MAX_FILE_IO_SIZE;
614 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
615
616 if (sb->s_blocksize == 0)
617 sb->s_blocksize = nfs_block_bits(server->wsize,
618 &sb->s_blocksize_bits);
619 server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL);
620
621 server->dtsize = nfs_block_size(fsinfo.dtpref, NULL);
622 if (server->dtsize > PAGE_CACHE_SIZE)
623 server->dtsize = PAGE_CACHE_SIZE;
624 if (server->dtsize > server->rsize)
625 server->dtsize = server->rsize;
626
627 if (server->flags & NFS_MOUNT_NOAC) {
628 server->acregmin = server->acregmax = 0;
629 server->acdirmin = server->acdirmax = 0;
630 sb->s_flags |= MS_SYNCHRONOUS;
631 }
632 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
633
634 nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
635
636 server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
637 server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
638
639 /* We're airborne. Set socket buffer size. */
640 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
641 return 0;
642 /* Yargs. It didn't work out. */
643out_no_root:
644 dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error);
645 if (!IS_ERR(root_inode))
646 iput(root_inode);
647 return no_root_error;
648}
649
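The rsize/wsize setup above is a chain of clamps: start from the user's mount option or the server's preferred transfer size, then cap by the server's hard maximum, the RPC transport payload, and the client's NFS_MAX_FILE_IO_SIZE. An illustrative walk-through for rsize (invented but plausible numbers, 4096-byte pages):

    /*  user rsize        = 0        -> take fsinfo.rtpref     = 32768
     *  fsinfo.rtmax      = 65536    -> no clamp (32768 <= 65536)
     *  max_rpc_payload   = 32768    -> no clamp
     *  NFS_MAX_FILE_IO_SIZE         -> no clamp
     *  rpages = (32768 + 4096 - 1) >> 12 = 8 pages per read unit
     */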
650/*
651 * Initialise the timeout values for a connection
652 */
653static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans)
654{
655 to->to_initval = timeo * HZ / 10;
656 to->to_retries = retrans;
657 if (!to->to_retries)
658 to->to_retries = 2;
659
660 switch (proto) {
661 case IPPROTO_TCP:
662 if (!to->to_initval)
663 to->to_initval = 60 * HZ;
664 if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
665 to->to_initval = NFS_MAX_TCP_TIMEOUT;
666 to->to_increment = to->to_initval;
667 to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
668 to->to_exponential = 0;
669 break;
670 case IPPROTO_UDP:
671 default:
672 if (!to->to_initval)
673 to->to_initval = 11 * HZ / 10;
674 if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
675 to->to_initval = NFS_MAX_UDP_TIMEOUT;
676 to->to_maxval = NFS_MAX_UDP_TIMEOUT;
677 to->to_exponential = 1;
678 break;
679 }
680}
681
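The timeo mount option arrives in tenths of a second, which explains the * HZ / 10 scaling. For TCP with timeo=600, retrans=2 and HZ=1000:

    /*  to_initval   = 600 * 1000 / 10      = 60000 jiffies (60 s)
     *  to_retries   = 2
     *  to_increment = to_initval           = 60000 (linear backoff)
     *  to_maxval    = 60000 + 60000 * 2    = 180000 jiffies (180 s)
     *  to_exponential = 0
     *  UDP instead defaults to 1.1 s and backs off exponentially, capped
     *  at NFS_MAX_UDP_TIMEOUT.
     */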
682/*
683 * Create an RPC client handle.
684 */
685static struct rpc_clnt *
686nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
687{
688 struct rpc_timeout timeparms;
689 struct rpc_xprt *xprt = NULL;
690 struct rpc_clnt *clnt = NULL;
691 int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
692
693 nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
694
695 server->retrans_timeo = timeparms.to_initval;
696 server->retrans_count = timeparms.to_retries;
697
698 /* create transport and client */
699 xprt = xprt_create_proto(proto, &server->addr, &timeparms);
700 if (IS_ERR(xprt)) {
701 dprintk("%s: cannot create RPC transport. Error = %ld\n",
702 __FUNCTION__, PTR_ERR(xprt));
703 return (struct rpc_clnt *)xprt;
704 }
705 clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
706 server->rpc_ops->version, data->pseudoflavor);
707 if (IS_ERR(clnt)) {
708 dprintk("%s: cannot create RPC client. Error = %ld\n",
709 __FUNCTION__, PTR_ERR(xprt));
710 goto out_fail;
711 }
712
713 clnt->cl_intr = 1;
714 clnt->cl_softrtry = 1;
715
716 return clnt;
717
718out_fail:
719 return clnt;
720}
721
722/*
723 * Clone a server record
724 */
725static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_clone_mount *data)
726{
727 struct nfs_server *server = NFS_SB(sb);
728 struct nfs_server *parent = NFS_SB(data->sb);
729 struct inode *root_inode;
730 struct nfs_fsinfo fsinfo;
731 void *err = ERR_PTR(-ENOMEM);
732
733 sb->s_op = data->sb->s_op;
734 sb->s_blocksize = data->sb->s_blocksize;
735 sb->s_blocksize_bits = data->sb->s_blocksize_bits;
736 sb->s_maxbytes = data->sb->s_maxbytes;
737
738 server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
739 server->io_stats = nfs_alloc_iostats();
740 if (server->io_stats == NULL)
741 goto out;
742
743 server->client = rpc_clone_client(parent->client);
744 if (IS_ERR((err = server->client)))
745 goto out;
746
747 if (!IS_ERR(parent->client_sys)) {
748 server->client_sys = rpc_clone_client(parent->client_sys);
749 if (IS_ERR((err = server->client_sys)))
750 goto out;
751 }
752 if (!IS_ERR(parent->client_acl)) {
753 server->client_acl = rpc_clone_client(parent->client_acl);
754 if (IS_ERR((err = server->client_acl)))
755 goto out;
756 }
757 root_inode = nfs_fhget(sb, data->fh, data->fattr);
758 if (!root_inode)
759 goto out;
760 sb->s_root = d_alloc_root(root_inode);
761 if (!sb->s_root)
762 goto out_put_root;
763 fsinfo.fattr = data->fattr;
764 if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
765 nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
766 sb->s_root->d_op = server->rpc_ops->dentry_ops;
767 sb->s_flags |= MS_ACTIVE;
768 return server;
769out_put_root:
770 iput(root_inode);
771out:
772 return err;
773}
774
775/*
776 * Copy an existing superblock and attach revised data
777 */
778static int nfs_clone_generic_sb(struct nfs_clone_mount *data,
779 struct super_block *(*fill_sb)(struct nfs_server *, struct nfs_clone_mount *),
780 struct nfs_server *(*fill_server)(struct super_block *, struct nfs_clone_mount *),
781 struct vfsmount *mnt)
782{
783 struct nfs_server *server;
784 struct nfs_server *parent = NFS_SB(data->sb);
785 struct super_block *sb = ERR_PTR(-EINVAL);
786 char *hostname;
787 int error = -ENOMEM;
788 int len;
789
790 server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
791 if (server == NULL)
792 goto out_err;
793 memcpy(server, parent, sizeof(*server));
794 hostname = (data->hostname != NULL) ? data->hostname : parent->hostname;
795 len = strlen(hostname) + 1;
796 server->hostname = kmalloc(len, GFP_KERNEL);
797 if (server->hostname == NULL)
798 goto free_server;
799 memcpy(server->hostname, hostname, len);
800 error = rpciod_up();
801 if (error != 0)
802 goto free_hostname;
803
804 sb = fill_sb(server, data);
805 if (IS_ERR(sb)) {
806 error = PTR_ERR(sb);
807 goto kill_rpciod;
808 }
809
810 if (sb->s_root)
811 goto out_rpciod_down;
812
813 server = fill_server(sb, data);
814 if (IS_ERR(server)) {
815 error = PTR_ERR(server);
816 goto out_deactivate;
817 }
818 return simple_set_mnt(mnt, sb);
819out_deactivate:
820 up_write(&sb->s_umount);
821 deactivate_super(sb);
822 return error;
823out_rpciod_down:
824 rpciod_down();
825 kfree(server->hostname);
826 kfree(server);
827 return simple_set_mnt(mnt, sb);
828kill_rpciod:
829 rpciod_down();
830free_hostname:
831 kfree(server->hostname);
832free_server:
833 kfree(server);
834out_err:
835 return error;
836}
837
838/*
839 * Set up an NFS2/3 superblock
840 *
841 * The way this works is that the mount process passes a structure
842 * in the data argument which contains the server's IP address
843 * and the root file handle obtained from the server's mount
844 * daemon. We stash these away in the private superblock fields.
845 */
846static int
847nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
848{
849 struct nfs_server *server;
850 rpc_authflavor_t authflavor;
851
852 server = NFS_SB(sb);
853 sb->s_blocksize_bits = 0;
854 sb->s_blocksize = 0;
855 if (data->bsize)
856 sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
857 if (data->rsize)
858 server->rsize = nfs_block_size(data->rsize, NULL);
859 if (data->wsize)
860 server->wsize = nfs_block_size(data->wsize, NULL);
861 server->flags = data->flags & NFS_MOUNT_FLAGMASK;
862
863 server->acregmin = data->acregmin*HZ;
864 server->acregmax = data->acregmax*HZ;
865 server->acdirmin = data->acdirmin*HZ;
866 server->acdirmax = data->acdirmax*HZ;
867
868 /* Start lockd here, before we might error out */
869 if (!(server->flags & NFS_MOUNT_NONLM))
870 lockd_up();
871
872 server->namelen = data->namlen;
873 server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL);
874 if (!server->hostname)
875 return -ENOMEM;
876 strcpy(server->hostname, data->hostname);
877
878 /* Check NFS protocol revision and initialize RPC op vector
879 * and file handle pool. */
880#ifdef CONFIG_NFS_V3
881 if (server->flags & NFS_MOUNT_VER3) {
882 server->rpc_ops = &nfs_v3_clientops;
883 server->caps |= NFS_CAP_READDIRPLUS;
884 } else {
885 server->rpc_ops = &nfs_v2_clientops;
886 }
887#else
888 server->rpc_ops = &nfs_v2_clientops;
889#endif
890
891 /* Fill in pseudoflavor for mount version < 5 */
892 if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
893 data->pseudoflavor = RPC_AUTH_UNIX;
894 authflavor = data->pseudoflavor; /* save for sb_init() */
895 /* XXX maybe we want to add a server->pseudoflavor field */
896
897 /* Create RPC client handles */
898 server->client = nfs_create_client(server, data);
899 if (IS_ERR(server->client))
900 return PTR_ERR(server->client);
901 /* RFC 2623, sec 2.3.2 */
902 if (authflavor != RPC_AUTH_UNIX) {
903 struct rpc_auth *auth;
904
905 server->client_sys = rpc_clone_client(server->client);
906 if (IS_ERR(server->client_sys))
907 return PTR_ERR(server->client_sys);
908 auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys);
909 if (IS_ERR(auth))
910 return PTR_ERR(auth);
911 } else {
912 atomic_inc(&server->client->cl_count);
913 server->client_sys = server->client;
914 }
915 if (server->flags & NFS_MOUNT_VER3) {
916#ifdef CONFIG_NFS_V3_ACL
917 if (!(server->flags & NFS_MOUNT_NOACL)) {
918 server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
919 /* No errors! Assume that Sun nfsacls are supported */
920 if (!IS_ERR(server->client_acl))
921 server->caps |= NFS_CAP_ACLS;
922 }
923#else
924 server->flags &= ~NFS_MOUNT_NOACL;
925#endif /* CONFIG_NFS_V3_ACL */
926 /*
927 * The VFS shouldn't apply the umask to mode bits. We will
928 * do so ourselves when necessary.
929 */
930 sb->s_flags |= MS_POSIXACL;
931 if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
932 server->namelen = NFS3_MAXNAMLEN;
933 sb->s_time_gran = 1;
934 } else {
935 if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
936 server->namelen = NFS2_MAXNAMLEN;
937 }
938
939 sb->s_op = &nfs_sops;
940 return nfs_sb_init(sb, authflavor);
941}
942
943static int nfs_set_super(struct super_block *s, void *data)
944{
945 s->s_fs_info = data;
946 return set_anon_super(s, data);
947}
948
949static int nfs_compare_super(struct super_block *sb, void *data)
950{
951 struct nfs_server *server = data;
952 struct nfs_server *old = NFS_SB(sb);
953
954 if (old->addr.sin_addr.s_addr != server->addr.sin_addr.s_addr)
955 return 0;
956 if (old->addr.sin_port != server->addr.sin_port)
957 return 0;
958 return !nfs_compare_fh(&old->fh, &server->fh);
959}
960
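sget() uses this pair of callbacks to share one superblock per (server address, port, root file handle) triple: nfs_compare_super() decides whether an existing superblock matches, nfs_set_super() seeds a new one with the candidate nfs_server. The consequence, visible in nfs_get_sb() below, is that a second mount of the same export reuses the live superblock and throws the tentative server record away:

    s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
    if (s->s_root) {
            /* matched an existing mount: discard our duplicate state */
            rpciod_down();
            kfree(server);
            return simple_set_mnt(mnt, s);
    }
    /* otherwise this is a fresh superblock: fill and activate it */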
961static int nfs_get_sb(struct file_system_type *fs_type,
962 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
963{
964 int error;
965 struct nfs_server *server = NULL;
966 struct super_block *s;
967 struct nfs_fh *root;
968 struct nfs_mount_data *data = raw_data;
969
970 error = -EINVAL;
971 if (data == NULL) {
972 dprintk("%s: missing data argument\n", __FUNCTION__);
973 goto out_err_noserver;
974 }
975 if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
976 dprintk("%s: bad mount version\n", __FUNCTION__);
977 goto out_err_noserver;
978 }
979 switch (data->version) {
980 case 1:
981 data->namlen = 0;
982 case 2:
983 data->bsize = 0;
984 case 3:
985 if (data->flags & NFS_MOUNT_VER3) {
986 dprintk("%s: mount structure version %d does not support NFSv3\n",
987 __FUNCTION__,
988 data->version);
989 goto out_err_noserver;
990 }
991 data->root.size = NFS2_FHSIZE;
992 memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
993 case 4:
994 if (data->flags & NFS_MOUNT_SECFLAVOUR) {
995 dprintk("%s: mount structure version %d does not support strong security\n",
996 __FUNCTION__,
997 data->version);
998 goto out_err_noserver;
999 }
1000 case 5:
1001 memset(data->context, 0, sizeof(data->context));
1002 }
1003#ifndef CONFIG_NFS_V3
1004 /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
1005 error = -EPROTONOSUPPORT;
1006 if (data->flags & NFS_MOUNT_VER3) {
1007 dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
1008 goto out_err_noserver;
1009 }
1010#endif /* CONFIG_NFS_V3 */
1011
1012 error = -ENOMEM;
1013 server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
1014 if (!server)
1015 goto out_err_noserver;
1016 /* Zero out the NFS state stuff */
1017 init_nfsv4_state(server);
1018 server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
1019
1020 root = &server->fh;
1021 if (data->flags & NFS_MOUNT_VER3)
1022 root->size = data->root.size;
1023 else
1024 root->size = NFS2_FHSIZE;
1025 error = -EINVAL;
1026 if (root->size > sizeof(root->data)) {
1027 dprintk("%s: invalid root filehandle\n", __FUNCTION__);
1028 goto out_err;
1029 }
1030 memcpy(root->data, data->root.data, root->size);
1031
1032 /* We now require that the mount process passes the remote address */
1033 memcpy(&server->addr, &data->addr, sizeof(server->addr));
1034 if (server->addr.sin_addr.s_addr == INADDR_ANY) {
1035 dprintk("%s: mount program didn't pass remote address!\n",
1036 __FUNCTION__);
1037 goto out_err;
1038 }
1039
1040 /* Fire up rpciod if not yet running */
1041 error = rpciod_up();
1042 if (error < 0) {
1043 dprintk("%s: couldn't start rpciod! Error = %d\n",
1044 __FUNCTION__, error);
1045 goto out_err;
1046 }
1047
1048 s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
1049 if (IS_ERR(s)) {
1050 error = PTR_ERR(s);
1051 goto out_err_rpciod;
1052 }
1053
1054 if (s->s_root)
1055 goto out_rpciod_down;
1056
1057 s->s_flags = flags;
1058
1059 error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1060 if (error) {
1061 up_write(&s->s_umount);
1062 deactivate_super(s);
1063 return error;
1064 }
1065 s->s_flags |= MS_ACTIVE;
1066 return simple_set_mnt(mnt, s);
1067
1068out_rpciod_down:
1069 rpciod_down();
1070 kfree(server);
1071 return simple_set_mnt(mnt, s);
1072
1073out_err_rpciod:
1074 rpciod_down();
1075out_err:
1076 kfree(server);
1077out_err_noserver:
1078 return error;
1079}
1080
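The switch in nfs_get_sb() above relies on deliberate fall-through: each case initializes the fields its mount-data version lacked, then falls into the next, so an old structure is upgraded step by step until it looks like a current one. The same logic with the fall-throughs annotated (the NFS_MOUNT_VER3 and NFS_MOUNT_SECFLAVOUR rejection checks omitted for brevity):

	switch (data->version) {
	case 1:
		data->namlen = 0;			/* v1 had no namlen */
		/* fall through */
	case 2:
		data->bsize = 0;			/* v2 had no bsize */
		/* fall through */
	case 3:						/* v3 used a fixed-size root fh */
		data->root.size = NFS2_FHSIZE;
		memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
		/* fall through */
	case 4:						/* v4 added auth flavours */
		/* fall through */
	case 5:
		memset(data->context, 0, sizeof(data->context));
	}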
1081static void nfs_kill_super(struct super_block *s)
1082{
1083 struct nfs_server *server = NFS_SB(s);
1084
1085 kill_anon_super(s);
1086
1087 if (!IS_ERR(server->client))
1088 rpc_shutdown_client(server->client);
1089 if (!IS_ERR(server->client_sys))
1090 rpc_shutdown_client(server->client_sys);
1091 if (!IS_ERR(server->client_acl))
1092 rpc_shutdown_client(server->client_acl);
1093
1094 if (!(server->flags & NFS_MOUNT_NONLM))
1095 lockd_down(); /* release rpc.lockd */
1096
1097 rpciod_down(); /* release rpciod */
1098
1099 nfs_free_iostats(server->io_stats);
1100 kfree(server->hostname);
1101 kfree(server);
1102 nfs_release_automount_timer();
1103}
1104
1105static struct super_block *nfs_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
1106{
1107 struct super_block *sb;
1108
1109 server->fsid = data->fattr->fsid;
1110 nfs_copy_fh(&server->fh, data->fh);
1111 sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
1112 if (!IS_ERR(sb) && sb->s_root == NULL && !(server->flags & NFS_MOUNT_NONLM))
1113 lockd_up();
1114 return sb;
1115}
1116
1117static int nfs_clone_nfs_sb(struct file_system_type *fs_type,
1118 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
1119{
1120 struct nfs_clone_mount *data = raw_data;
1121 return nfs_clone_generic_sb(data, nfs_clone_sb, nfs_clone_server, mnt);
1122}
1123
1124#ifdef CONFIG_NFS_V4
1125static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
1126 struct rpc_timeout *timeparms, int proto, rpc_authflavor_t flavor)
1127{
1128 struct nfs4_client *clp;
1129 struct rpc_xprt *xprt = NULL;
1130 struct rpc_clnt *clnt = NULL;
1131 int err = -EIO;
1132
1133 clp = nfs4_get_client(&server->addr.sin_addr);
1134 if (!clp) {
1135 dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
1136 return ERR_PTR(err);
1137 }
1138
1139 /* Now create transport and client */
1140 down_write(&clp->cl_sem);
1141 if (IS_ERR(clp->cl_rpcclient)) {
1142 xprt = xprt_create_proto(proto, &server->addr, timeparms);
1143 if (IS_ERR(xprt)) {
1144 up_write(&clp->cl_sem);
1145 err = PTR_ERR(xprt);
1146 dprintk("%s: cannot create RPC transport. Error = %d\n",
1147 __FUNCTION__, err);
1148 goto out_fail;
1149 }
1150 /* Bind to a reserved port! */
1151 xprt->resvport = 1;
1152 clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
1153 server->rpc_ops->version, flavor);
1154 if (IS_ERR(clnt)) {
1155 up_write(&clp->cl_sem);
1156 err = PTR_ERR(clnt);
1157 dprintk("%s: cannot create RPC client. Error = %d\n",
1158 __FUNCTION__, err);
1159 goto out_fail;
1160 }
1161 clnt->cl_intr = 1;
1162 clnt->cl_softrtry = 1;
1163 clp->cl_rpcclient = clnt;
1164 memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
1165 nfs_idmap_new(clp);
1166 }
1167 list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
1168 clnt = rpc_clone_client(clp->cl_rpcclient);
1169 if (!IS_ERR(clnt))
1170 server->nfs4_state = clp;
1171 up_write(&clp->cl_sem);
1172 clp = NULL;
1173
1174 if (IS_ERR(clnt)) {
1175 dprintk("%s: cannot create RPC client. Error = %d\n",
 1176 __FUNCTION__, (int)PTR_ERR(clnt));
1177 return clnt;
1178 }
1179
1180 if (server->nfs4_state->cl_idmap == NULL) {
1181 dprintk("%s: failed to create idmapper.\n", __FUNCTION__);
1182 return ERR_PTR(-ENOMEM);
1183 }
1184
1185 if (clnt->cl_auth->au_flavor != flavor) {
1186 struct rpc_auth *auth;
1187
1188 auth = rpcauth_create(flavor, clnt);
1189 if (IS_ERR(auth)) {
1190 dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
1191 return (struct rpc_clnt *)auth;
1192 }
1193 }
1194 return clnt;
1195
1196 out_fail:
1197 if (clp)
1198 nfs4_put_client(clp);
1199 return ERR_PTR(err);
1200}
1201
1202/*
1203 * Set up an NFS4 superblock
1204 */
1205static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
1206{
1207 struct nfs_server *server;
1208 struct rpc_timeout timeparms;
1209 rpc_authflavor_t authflavour;
1210 int err = -EIO;
1211
1212 sb->s_blocksize_bits = 0;
1213 sb->s_blocksize = 0;
1214 server = NFS_SB(sb);
1215 if (data->rsize != 0)
1216 server->rsize = nfs_block_size(data->rsize, NULL);
1217 if (data->wsize != 0)
1218 server->wsize = nfs_block_size(data->wsize, NULL);
1219 server->flags = data->flags & NFS_MOUNT_FLAGMASK;
1220 server->caps = NFS_CAP_ATOMIC_OPEN;
1221
1222 server->acregmin = data->acregmin*HZ;
1223 server->acregmax = data->acregmax*HZ;
1224 server->acdirmin = data->acdirmin*HZ;
1225 server->acdirmax = data->acdirmax*HZ;
1226
1227 server->rpc_ops = &nfs_v4_clientops;
1228
1229 nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
1230
1231 server->retrans_timeo = timeparms.to_initval;
1232 server->retrans_count = timeparms.to_retries;
1233
1234 /* Now create transport and client */
1235 authflavour = RPC_AUTH_UNIX;
1236 if (data->auth_flavourlen != 0) {
1237 if (data->auth_flavourlen != 1) {
1238 dprintk("%s: Invalid number of RPC auth flavours %d.\n",
1239 __FUNCTION__, data->auth_flavourlen);
1240 err = -EINVAL;
1241 goto out_fail;
1242 }
1243 if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) {
1244 err = -EFAULT;
1245 goto out_fail;
1246 }
1247 }
1248
1249 server->client = nfs4_create_client(server, &timeparms, data->proto, authflavour);
1250 if (IS_ERR(server->client)) {
1251 err = PTR_ERR(server->client);
1252 dprintk("%s: cannot create RPC client. Error = %d\n",
1253 __FUNCTION__, err);
1254 goto out_fail;
1255 }
1256
1257 sb->s_time_gran = 1;
1258
1259 sb->s_op = &nfs4_sops;
1260 err = nfs_sb_init(sb, authflavour);
1261
1262 out_fail:
1263 return err;
1264}
1265
1266static int nfs4_compare_super(struct super_block *sb, void *data)
1267{
1268 struct nfs_server *server = data;
1269 struct nfs_server *old = NFS_SB(sb);
1270
1271 if (strcmp(server->hostname, old->hostname) != 0)
1272 return 0;
1273 if (strcmp(server->mnt_path, old->mnt_path) != 0)
1274 return 0;
1275 return 1;
1276}
1277
1278static void *
1279nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
1280{
1281 void *p = NULL;
1282
1283 if (!src->len)
1284 return ERR_PTR(-EINVAL);
1285 if (src->len < maxlen)
1286 maxlen = src->len;
1287 if (dst == NULL) {
1288 p = dst = kmalloc(maxlen + 1, GFP_KERNEL);
1289 if (p == NULL)
1290 return ERR_PTR(-ENOMEM);
1291 }
1292 if (copy_from_user(dst, src->data, maxlen)) {
1293 kfree(p);
1294 return ERR_PTR(-EFAULT);
1295 }
1296 dst[maxlen] = '\0';
1297 return dst;
1298}
1299
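nfs_copy_user_string() works in two modes for the callers below: with dst == NULL it kmallocs a buffer of maxlen + 1 bytes; with a non-NULL dst it fills a caller-owned buffer that must have room for maxlen + 1 bytes. Either way the result is NUL-terminated and failures come back as ERR_PTR() values. Usage sketch:

	/* allocating mode - the caller kfree()s the result */
	char *host = nfs_copy_user_string(NULL, &data->hostname, 256);
	if (IS_ERR(host))
		return PTR_ERR(host);

	/* in-place mode - note the "- 1" so the terminating NUL still fits */
	p = nfs_copy_user_string(server->ip_addr, &data->client_addr,
				 sizeof(server->ip_addr) - 1);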
1300static int nfs4_get_sb(struct file_system_type *fs_type,
1301 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
1302{
1303 int error;
1304 struct nfs_server *server;
1305 struct super_block *s;
1306 struct nfs4_mount_data *data = raw_data;
1307 void *p;
1308
1309 if (data == NULL) {
1310 dprintk("%s: missing data argument\n", __FUNCTION__);
1311 return -EINVAL;
1312 }
1313 if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) {
1314 dprintk("%s: bad mount version\n", __FUNCTION__);
1315 return -EINVAL;
1316 }
1317
1318 server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
1319 if (!server)
1320 return -ENOMEM;
1321 /* Zero out the NFS state stuff */
1322 init_nfsv4_state(server);
1323 server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
1324
1325 p = nfs_copy_user_string(NULL, &data->hostname, 256);
1326 if (IS_ERR(p))
1327 goto out_err;
1328 server->hostname = p;
1329
1330 p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
1331 if (IS_ERR(p))
1332 goto out_err;
1333 server->mnt_path = p;
1334
1335 p = nfs_copy_user_string(server->ip_addr, &data->client_addr,
1336 sizeof(server->ip_addr) - 1);
1337 if (IS_ERR(p))
1338 goto out_err;
1339
1340 /* We now require that the mount process passes the remote address */
1341 if (data->host_addrlen != sizeof(server->addr)) {
1342 error = -EINVAL;
1343 goto out_free;
1344 }
1345 if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
1346 error = -EFAULT;
1347 goto out_free;
1348 }
1349 if (server->addr.sin_family != AF_INET ||
1350 server->addr.sin_addr.s_addr == INADDR_ANY) {
1351 dprintk("%s: mount program didn't pass remote IP address!\n",
1352 __FUNCTION__);
1353 error = -EINVAL;
1354 goto out_free;
1355 }
1356
1357 /* Fire up rpciod if not yet running */
1358 error = rpciod_up();
1359 if (error < 0) {
1360 dprintk("%s: couldn't start rpciod! Error = %d\n",
1361 __FUNCTION__, error);
1362 goto out_free;
1363 }
1364
1365 s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
1366
1367 if (IS_ERR(s)) {
1368 error = PTR_ERR(s);
1369 goto out_free;
1370 }
1371
1372 if (s->s_root) {
1373 kfree(server->mnt_path);
1374 kfree(server->hostname);
1375 kfree(server);
1376 return simple_set_mnt(mnt, s);
1377 }
1378
1379 s->s_flags = flags;
1380
1381 error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
1382 if (error) {
1383 up_write(&s->s_umount);
1384 deactivate_super(s);
1385 return error;
1386 }
1387 s->s_flags |= MS_ACTIVE;
1388 return simple_set_mnt(mnt, s);
1389out_err:
1390 error = PTR_ERR(p);
1391out_free:
1392 kfree(server->mnt_path);
1393 kfree(server->hostname);
1394 kfree(server);
1395 return error;
1396}
1397
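One asymmetry against nfs_get_sb() is worth noting: once rpciod_up() has succeeded in nfs4_get_sb(), neither the sget() failure path nor the "s->s_root already set" path calls rpciod_down(), so both appear to leak an rpciod reference. A balanced unwind would mirror the v2/v3 function (sketch; out_err_rpciod is a hypothetical label, not in the patch):

	s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
	if (IS_ERR(s)) {
		error = PTR_ERR(s);
		goto out_err_rpciod;
	}
	/* ... */
out_err_rpciod:
	rpciod_down();
	goto out_free;		/* then reuse the existing cleanup */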
1398static void nfs4_kill_super(struct super_block *sb)
1399{
1400 struct nfs_server *server = NFS_SB(sb);
1401
1402 nfs_return_all_delegations(sb);
1403 kill_anon_super(sb);
1404
1405 nfs4_renewd_prepare_shutdown(server);
1406
1407 if (server->client != NULL && !IS_ERR(server->client))
1408 rpc_shutdown_client(server->client);
1409
1410 destroy_nfsv4_state(server);
1411
1412 rpciod_down();
1413
1414 nfs_free_iostats(server->io_stats);
1415 kfree(server->hostname);
1416 kfree(server);
1417 nfs_release_automount_timer();
1418}
1419
1420/*
1421 * Constructs the SERVER-side path
1422 */
1423static inline char *nfs4_dup_path(const struct dentry *dentry)
1424{
1425 char *page = (char *) __get_free_page(GFP_USER);
1426 char *path;
1427
1428 path = nfs4_path(dentry, page, PAGE_SIZE);
1429 if (!IS_ERR(path)) {
1430 int len = PAGE_SIZE + page - path;
1431 char *tmp = path;
1432
1433 path = kmalloc(len, GFP_KERNEL);
1434 if (path)
1435 memcpy(path, tmp, len);
1436 else
1437 path = ERR_PTR(-ENOMEM);
1438 }
1439 free_page((unsigned long)page);
1440 return path;
1441}
1442
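The length arithmetic in nfs4_dup_path() assumes that nfs4_path() builds the string backwards from the end of the scratch page, so the result occupies exactly the final PAGE_SIZE + page - path bytes, NUL included. Illustrative layout:

	/*
	 *  page                        path              page + PAGE_SIZE
	 *   |--------- unused ----------|"/export/home\0"|
	 *
	 *  len = PAGE_SIZE + page - path == strlen(path) + 1
	 *
	 * Note that __get_free_page() can return NULL here; as written, the
	 * helper relies on nfs4_path() tolerating a NULL buffer.
	 */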
1443static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
1444{
1445 const struct dentry *dentry = data->dentry;
1446 struct nfs4_client *clp = server->nfs4_state;
1447 struct super_block *sb;
1448
1449 server->fsid = data->fattr->fsid;
1450 nfs_copy_fh(&server->fh, data->fh);
1451 server->mnt_path = nfs4_dup_path(dentry);
1452 if (IS_ERR(server->mnt_path)) {
1453 sb = (struct super_block *)server->mnt_path;
1454 goto err;
1455 }
1456 sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
1457 if (IS_ERR(sb) || sb->s_root)
1458 goto free_path;
1459 nfs4_server_capabilities(server, &server->fh);
1460
1461 down_write(&clp->cl_sem);
1462 atomic_inc(&clp->cl_count);
1463 list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
1464 up_write(&clp->cl_sem);
1465 return sb;
1466free_path:
1467 kfree(server->mnt_path);
1468err:
1469 server->mnt_path = NULL;
1470 return sb;
1471}
1472
1473static int nfs_clone_nfs4_sb(struct file_system_type *fs_type,
1474 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
1475{
1476 struct nfs_clone_mount *data = raw_data;
1477 return nfs_clone_generic_sb(data, nfs4_clone_sb, nfs_clone_server, mnt);
1478}
1479
1480static struct super_block *nfs4_referral_sb(struct nfs_server *server, struct nfs_clone_mount *data)
1481{
1482 struct super_block *sb = ERR_PTR(-ENOMEM);
1483 int len;
1484
1485 len = strlen(data->mnt_path) + 1;
1486 server->mnt_path = kmalloc(len, GFP_KERNEL);
1487 if (server->mnt_path == NULL)
1488 goto err;
1489 memcpy(server->mnt_path, data->mnt_path, len);
1490 memcpy(&server->addr, data->addr, sizeof(struct sockaddr_in));
1491
1492 sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
1493 if (IS_ERR(sb) || sb->s_root)
1494 goto free_path;
1495 return sb;
1496free_path:
1497 kfree(server->mnt_path);
1498err:
1499 server->mnt_path = NULL;
1500 return sb;
1501}
1502
1503static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data)
1504{
1505 struct nfs_server *server = NFS_SB(sb);
1506 struct rpc_timeout timeparms;
1507 int proto, timeo, retrans;
1508 void *err;
1509
1510 proto = IPPROTO_TCP;
1511 /* Since we are following a referral and there may be alternatives,
1512 set the timeouts and retries to low values */
1513 timeo = 2;
1514 retrans = 1;
1515 nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
1516
1517 server->client = nfs4_create_client(server, &timeparms, proto, data->authflavor);
1518 if (IS_ERR((err = server->client)))
1519 goto out_err;
1520
1521 sb->s_time_gran = 1;
1522 sb->s_op = &nfs4_sops;
1523 err = ERR_PTR(nfs_sb_init(sb, data->authflavor));
1524 if (!IS_ERR(err))
1525 return server;
1526out_err:
1527 return (struct nfs_server *)err;
1528}
1529
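The deliberately small values above assume the usual RPC convention that timeo is expressed in tenths of a second, as with the timeo= mount option:

	/* timeo = 2, retrans = 1  =>  ~0.2s initial timeout plus one retry,
	 * so an unresponsive referral target is abandoned in well under a
	 * second, leaving time to try the other locations returned in the
	 * fs_locations attribute. */
	nfs_init_timeout_values(&timeparms, IPPROTO_TCP, 2, 1);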
1530static int nfs_referral_nfs4_sb(struct file_system_type *fs_type,
1531 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
1532{
1533 struct nfs_clone_mount *data = raw_data;
1534 return nfs_clone_generic_sb(data, nfs4_referral_sb, nfs4_referral_server, mnt);
1535}
1536
1537#endif
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 18dc95b0b646..600bbe630abd 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -52,7 +52,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
52{ 52{
53 struct inode *inode = dentry->d_inode; 53 struct inode *inode = dentry->d_inode;
54 struct page *page; 54 struct page *page;
55 void *err = ERR_PTR(nfs_revalidate_inode(NFS_SERVER(inode), inode)); 55 void *err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
56 if (err) 56 if (err)
57 goto read_failed; 57 goto read_failed;
58 page = read_cache_page(&inode->i_data, 0, 58 page = read_cache_page(&inode->i_data, 0,
@@ -75,22 +75,13 @@ read_failed:
75 return NULL; 75 return NULL;
76} 76}
77 77
78static void nfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
79{
80 if (cookie) {
81 struct page *page = cookie;
82 kunmap(page);
83 page_cache_release(page);
84 }
85}
86
87/* 78/*
88 * symlinks can't do much... 79 * symlinks can't do much...
89 */ 80 */
90struct inode_operations nfs_symlink_inode_operations = { 81struct inode_operations nfs_symlink_inode_operations = {
91 .readlink = generic_readlink, 82 .readlink = generic_readlink,
92 .follow_link = nfs_follow_link, 83 .follow_link = nfs_follow_link,
93 .put_link = nfs_put_link, 84 .put_link = page_put_link,
94 .getattr = nfs_getattr, 85 .getattr = nfs_getattr,
95 .setattr = nfs_setattr, 86 .setattr = nfs_setattr,
96}; 87};
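The dropped nfs_put_link() duplicated the generic page helper line for line; for reference, page_put_link() in fs/namei.c of this era is essentially:

void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *page = cookie;

	if (page) {
		kunmap(page);
		page_cache_release(page);
	}
}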
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 4c486eb867ca..db61e51bb154 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/nfs4.h> 13#include <linux/nfs4.h>
14#include <linux/nfs_idmap.h> 14#include <linux/nfs_idmap.h>
15#include <linux/nfs_fs.h>
15 16
16#include "callback.h" 17#include "callback.h"
17 18
@@ -46,6 +47,15 @@ static ctl_table nfs_cb_sysctls[] = {
46 .strategy = &sysctl_jiffies, 47 .strategy = &sysctl_jiffies,
47 }, 48 },
48#endif 49#endif
50 {
51 .ctl_name = CTL_UNNUMBERED,
52 .procname = "nfs_mountpoint_timeout",
53 .data = &nfs_mountpoint_expiry_timeout,
54 .maxlen = sizeof(nfs_mountpoint_expiry_timeout),
55 .mode = 0644,
56 .proc_handler = &proc_dointvec_jiffies,
57 .strategy = &sysctl_jiffies,
58 },
49 { .ctl_name = 0 } 59 { .ctl_name = 0 }
50}; 60};
51 61
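proc_dointvec_jiffies converts between seconds in userspace and jiffies in the kernel, so the new knob is read and written in seconds while nfs_mountpoint_expiry_timeout stays in jiffies internally. Semantics sketch (the /proc path is inferred from this table being registered under fs.nfs):

	/* echo 300 > /proc/sys/fs/nfs/nfs_mountpoint_timeout  results in: */
	nfs_mountpoint_expiry_timeout = 300 * HZ;	/* i.e. five minutes */
	/* and reading the file performs the inverse division by HZ */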
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4cfada2cc09f..8fccb9cb173b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -98,11 +98,10 @@ struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
98 if (p) { 98 if (p) {
99 memset(p, 0, sizeof(*p)); 99 memset(p, 0, sizeof(*p));
100 INIT_LIST_HEAD(&p->pages); 100 INIT_LIST_HEAD(&p->pages);
101 if (pagecount < NFS_PAGEVEC_SIZE) 101 if (pagecount <= ARRAY_SIZE(p->page_array))
102 p->pagevec = &p->page_array[0]; 102 p->pagevec = p->page_array;
103 else { 103 else {
104 size_t size = ++pagecount * sizeof(struct page *); 104 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
105 p->pagevec = kzalloc(size, GFP_NOFS);
106 if (!p->pagevec) { 105 if (!p->pagevec) {
107 mempool_free(p, nfs_commit_mempool); 106 mempool_free(p, nfs_commit_mempool);
108 p = NULL; 107 p = NULL;
@@ -126,14 +125,11 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
126 if (p) { 125 if (p) {
127 memset(p, 0, sizeof(*p)); 126 memset(p, 0, sizeof(*p));
128 INIT_LIST_HEAD(&p->pages); 127 INIT_LIST_HEAD(&p->pages);
129 if (pagecount < NFS_PAGEVEC_SIZE) 128 if (pagecount <= ARRAY_SIZE(p->page_array))
130 p->pagevec = &p->page_array[0]; 129 p->pagevec = p->page_array;
131 else { 130 else {
132 size_t size = ++pagecount * sizeof(struct page *); 131 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
133 p->pagevec = kmalloc(size, GFP_NOFS); 132 if (!p->pagevec) {
134 if (p->pagevec) {
135 memset(p->pagevec, 0, size);
136 } else {
137 mempool_free(p, nfs_wdata_mempool); 133 mempool_free(p, nfs_wdata_mempool);
138 p = NULL; 134 p = NULL;
139 } 135 }
@@ -583,6 +579,17 @@ static int nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, un
583 return ret; 579 return ret;
584} 580}
585 581
582static void nfs_cancel_requests(struct list_head *head)
583{
584 struct nfs_page *req;
 585 while (!list_empty(head)) {
586 req = nfs_list_entry(head->next);
587 nfs_list_remove_request(req);
588 nfs_inode_remove_request(req);
589 nfs_clear_page_writeback(req);
590 }
591}
592
586/* 593/*
587 * nfs_scan_dirty - Scan an inode for dirty requests 594 * nfs_scan_dirty - Scan an inode for dirty requests
588 * @inode: NFS inode to scan 595 * @inode: NFS inode to scan
@@ -627,7 +634,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_st
627 int res = 0; 634 int res = 0;
628 635
629 if (nfsi->ncommit != 0) { 636 if (nfsi->ncommit != 0) {
630 res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages); 637 res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages);
631 nfsi->ncommit -= res; 638 nfsi->ncommit -= res;
632 if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit)) 639 if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
633 printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); 640 printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
@@ -1495,15 +1502,25 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
1495 pages = nfs_scan_dirty(inode, &head, idx_start, npages); 1502 pages = nfs_scan_dirty(inode, &head, idx_start, npages);
1496 if (pages != 0) { 1503 if (pages != 0) {
1497 spin_unlock(&nfsi->req_lock); 1504 spin_unlock(&nfsi->req_lock);
1498 ret = nfs_flush_list(inode, &head, pages, how); 1505 if (how & FLUSH_INVALIDATE)
1506 nfs_cancel_requests(&head);
1507 else
1508 ret = nfs_flush_list(inode, &head, pages, how);
1499 spin_lock(&nfsi->req_lock); 1509 spin_lock(&nfsi->req_lock);
1500 continue; 1510 continue;
1501 } 1511 }
1502 if (nocommit) 1512 if (nocommit)
1503 break; 1513 break;
1504 pages = nfs_scan_commit(inode, &head, 0, 0); 1514 pages = nfs_scan_commit(inode, &head, idx_start, npages);
1505 if (pages == 0) 1515 if (pages == 0)
1506 break; 1516 break;
1517 if (how & FLUSH_INVALIDATE) {
1518 spin_unlock(&nfsi->req_lock);
1519 nfs_cancel_requests(&head);
1520 spin_lock(&nfsi->req_lock);
1521 continue;
1522 }
1523 pages += nfs_scan_commit(inode, &head, 0, 0);
1507 spin_unlock(&nfsi->req_lock); 1524 spin_unlock(&nfsi->req_lock);
1508 ret = nfs_commit_list(inode, &head, how); 1525 ret = nfs_commit_list(inode, &head, how);
1509 spin_lock(&nfsi->req_lock); 1526 spin_lock(&nfsi->req_lock);
@@ -1512,7 +1529,7 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
1512 return ret; 1529 return ret;
1513} 1530}
1514 1531
1515int nfs_init_writepagecache(void) 1532int __init nfs_init_writepagecache(void)
1516{ 1533{
1517 nfs_wdata_cachep = kmem_cache_create("nfs_write_data", 1534 nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
1518 sizeof(struct nfs_write_data), 1535 sizeof(struct nfs_write_data),
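Both write.c allocators now share one pattern: small requests use the page array embedded in the request structure, larger ones fall back to kcalloc(), which zeroes the vector and, unlike the removed open-coded ++pagecount * sizeof(...) arithmetic, returns NULL instead of wrapping if the multiplication would overflow. The pattern in isolation (field names as in the era's struct nfs_write_data):

	if (pagecount <= ARRAY_SIZE(p->page_array))
		p->pagevec = p->page_array;	/* common case: no extra alloc */
	else {
		p->pagevec = kcalloc(pagecount, sizeof(struct page *),
				     GFP_NOFS);	/* zeroed, overflow-checked */
		if (!p->pagevec) {
			mempool_free(p, nfs_wdata_mempool);
			p = NULL;
		}
	}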
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 96c7578cbe1e..7c7d01672d35 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -123,7 +123,7 @@ static void release_stateid(struct nfs4_stateid *stp, int flags);
123 */ 123 */
124 124
125/* recall_lock protects the del_recall_lru */ 125/* recall_lock protects the del_recall_lru */
126static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED; 126static DEFINE_SPINLOCK(recall_lock);
127static struct list_head del_recall_lru; 127static struct list_head del_recall_lru;
128 128
129static void 129static void
@@ -529,8 +529,7 @@ move_to_confirmed(struct nfs4_client *clp)
529 529
530 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); 530 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
531 list_del_init(&clp->cl_strhash); 531 list_del_init(&clp->cl_strhash);
532 list_del_init(&clp->cl_idhash); 532 list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
533 list_add(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
534 strhashval = clientstr_hashval(clp->cl_recdir); 533 strhashval = clientstr_hashval(clp->cl_recdir);
535 list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); 534 list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
536 renew_client(clp); 535 renew_client(clp);
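list_move() and list_move_tail() are exact replacements for the open-coded del + add pairs removed throughout this patch; the list_del_init() was redundant because the entry is immediately re-linked. From <linux/list.h>:

static inline void list_move(struct list_head *list, struct list_head *head)
{
	__list_del(list->prev, list->next);
	list_add(list, head);
}

static inline void list_move_tail(struct list_head *list,
				  struct list_head *head)
{
	__list_del(list->prev, list->next);
	list_add_tail(list, head);
}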
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index d852ebb538e3..fdf7cf3dfadc 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -103,8 +103,7 @@ nfsd_cache_shutdown(void)
103static void 103static void
104lru_put_end(struct svc_cacherep *rp) 104lru_put_end(struct svc_cacherep *rp)
105{ 105{
106 list_del(&rp->c_lru); 106 list_move_tail(&rp->c_lru, &lru_head);
107 list_add_tail(&rp->c_lru, &lru_head);
108} 107}
109 108
110/* 109/*
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 88292f9e4b9b..2e42c2dcae12 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1358,7 +1358,7 @@ err_out:
1358 goto out; 1358 goto out;
1359} 1359}
1360 1360
1361static size_t __ntfs_copy_from_user_iovec(char *vaddr, 1361static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
1362 const struct iovec *iov, size_t iov_ofs, size_t bytes) 1362 const struct iovec *iov, size_t iov_ofs, size_t bytes)
1363{ 1363{
1364 size_t total = 0; 1364 size_t total = 0;
@@ -1376,10 +1376,6 @@ static size_t __ntfs_copy_from_user_iovec(char *vaddr,
1376 bytes -= len; 1376 bytes -= len;
1377 vaddr += len; 1377 vaddr += len;
1378 if (unlikely(left)) { 1378 if (unlikely(left)) {
1379 /*
1380 * Zero the rest of the target like __copy_from_user().
1381 */
1382 memset(vaddr, 0, bytes);
1383 total -= left; 1379 total -= left;
1384 break; 1380 break;
1385 } 1381 }
@@ -1420,11 +1416,13 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
1420 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s 1416 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1421 * single-segment behaviour. 1417 * single-segment behaviour.
1422 * 1418 *
1423 * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and 1419 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both
1424 * when not atomic. This is ok because __ntfs_copy_from_user_iovec() calls 1420 * when atomic and when not atomic. This is ok because
1425 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In 1421 * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic()
1426 * fact, the only difference between __copy_from_user_inatomic() and 1422 * and it is ok to call this when non-atomic.
1427 * __copy_from_user() is that the latter calls might_sleep(). And on many 1423 * In fact, the only difference between __copy_from_user_inatomic() and
1424 * __copy_from_user() is that the latter calls might_sleep() and the former
1425 * should not zero the tail of the buffer on error. And on many
1428 * architectures __copy_from_user_inatomic() is just defined to 1426 * architectures __copy_from_user_inatomic() is just defined to
1429 * __copy_from_user() so it makes no difference at all on those architectures. 1427 * __copy_from_user() so it makes no difference at all on those architectures.
1430 */ 1428 */
@@ -1441,14 +1439,18 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1441 if (len > bytes) 1439 if (len > bytes)
1442 len = bytes; 1440 len = bytes;
1443 kaddr = kmap_atomic(*pages, KM_USER0); 1441 kaddr = kmap_atomic(*pages, KM_USER0);
1444 copied = __ntfs_copy_from_user_iovec(kaddr + ofs, 1442 copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
1445 *iov, *iov_ofs, len); 1443 *iov, *iov_ofs, len);
1446 kunmap_atomic(kaddr, KM_USER0); 1444 kunmap_atomic(kaddr, KM_USER0);
1447 if (unlikely(copied != len)) { 1445 if (unlikely(copied != len)) {
1448 /* Do it the slow way. */ 1446 /* Do it the slow way. */
1449 kaddr = kmap(*pages); 1447 kaddr = kmap(*pages);
1450 copied = __ntfs_copy_from_user_iovec(kaddr + ofs, 1448 copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
1451 *iov, *iov_ofs, len); 1449 *iov, *iov_ofs, len);
1450 /*
1451 * Zero the rest of the target like __copy_from_user().
1452 */
1453 memset(kaddr + ofs + copied, 0, len - copied);
1452 kunmap(*pages); 1454 kunmap(*pages);
1453 if (unlikely(copied != len)) 1455 if (unlikely(copied != len))
1454 goto err_out; 1456 goto err_out;
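The behavioural point behind this hunk: __copy_from_user() zero-fills the destination tail when the copy faults, while __copy_from_user_inatomic() no longer does. The atomic fast path may therefore leave a partially filled buffer, and the kmap() slow path now compensates by hand:

	/* emulate __copy_from_user()'s zero-the-tail-on-fault behaviour
	 * after the non-atomic retry: */
	memset(kaddr + ofs + copied, 0, len - copied);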
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 21f38accd039..1d26cfcd9f84 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -54,7 +54,7 @@ static DECLARE_RWSEM(o2hb_callback_sem);
54 * multiple hb threads are watching multiple regions. A node is live 54 * multiple hb threads are watching multiple regions. A node is live
55 * whenever any of the threads sees activity from the node in its region. 55 * whenever any of the threads sees activity from the node in its region.
56 */ 56 */
57static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED; 57static DEFINE_SPINLOCK(o2hb_live_lock);
58static struct list_head o2hb_live_slots[O2NM_MAX_NODES]; 58static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
59static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 59static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
60static LIST_HEAD(o2hb_node_events); 60static LIST_HEAD(o2hb_node_events);
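The SPIN_LOCK_UNLOCKED conversions scattered through this patch are groundwork for the lock validator: a shared static initializer cannot give each lock its own class, whereas the DEFINE_SPINLOCK()/DEFINE_RWLOCK() macros can. In a plain build the expansion is the same thing it replaces:

	/* static DEFINE_SPINLOCK(o2hb_live_lock);  expands, roughly, to: */
	static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
	/* but the macro form lets debug variants attach per-lock state
	 * without touching every declaration site again */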
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0f60cc0d3985..1591eb37a723 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -108,7 +108,7 @@
108 ##args); \ 108 ##args); \
109} while (0) 109} while (0)
110 110
111static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED; 111static DEFINE_RWLOCK(o2net_handler_lock);
112static struct rb_root o2net_handler_tree = RB_ROOT; 112static struct rb_root o2net_handler_tree = RB_ROOT;
113 113
114static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; 114static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 355593dd8ef8..42775e2bbe2c 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -197,12 +197,14 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
197 lock->ml.node == dlm->node_num ? "master" : 197 lock->ml.node == dlm->node_num ? "master" :
198 "remote"); 198 "remote");
199 memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); 199 memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
200 } else if (lksb->flags & DLM_LKSB_PUT_LVB) {
201 mlog(0, "setting lvb from lockres for %s node\n",
202 lock->ml.node == dlm->node_num ? "master" :
203 "remote");
204 memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
205 } 200 }
201 /* Do nothing for lvb put requests - they should be done in
202 * place when the lock is downconverted - otherwise we risk
203 * racing gets and puts which could result in old lvb data
204 * being propagated. We leave the put flag set and clear it
205 * here. In the future we might want to clear it at the time
206 * the put is actually done.
207 */
206 spin_unlock(&res->spinlock); 208 spin_unlock(&res->spinlock);
207 } 209 }
208 210
@@ -381,8 +383,7 @@ do_ast:
381 ret = DLM_NORMAL; 383 ret = DLM_NORMAL;
382 if (past->type == DLM_AST) { 384 if (past->type == DLM_AST) {
383 /* do not alter lock refcount. switching lists. */ 385 /* do not alter lock refcount. switching lists. */
384 list_del_init(&lock->list); 386 list_move_tail(&lock->list, &res->granted);
385 list_add_tail(&lock->list, &res->granted);
386 mlog(0, "ast: adding to granted list... type=%d, " 387 mlog(0, "ast: adding to granted list... type=%d, "
387 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); 388 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
388 if (lock->ml.convert_type != LKM_IVMODE) { 389 if (lock->ml.convert_type != LKM_IVMODE) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 88cc43df18f1..9bdc9cf65991 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,17 @@
37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes 37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
38#define DLM_THREAD_MS 200 // flush at least every 200 ms 38#define DLM_THREAD_MS 200 // flush at least every 200 ms
39 39
40#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head)) 40#define DLM_HASH_SIZE_DEFAULT (1 << 14)
41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
42# define DLM_HASH_PAGES 1
43#else
44# define DLM_HASH_PAGES (DLM_HASH_SIZE_DEFAULT / PAGE_SIZE)
45#endif
46#define DLM_BUCKETS_PER_PAGE (PAGE_SIZE / sizeof(struct hlist_head))
47#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
48
49/* Intended to make it easier for us to switch out hash functions */
50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
41 51
42enum dlm_ast_type { 52enum dlm_ast_type {
43 DLM_AST = 0, 53 DLM_AST = 0,
@@ -61,7 +71,8 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
61 return 0; 71 return 0;
62} 72}
63 73
64#define DLM_RECO_STATE_ACTIVE 0x0001 74#define DLM_RECO_STATE_ACTIVE 0x0001
75#define DLM_RECO_STATE_FINALIZE 0x0002
65 76
66struct dlm_recovery_ctxt 77struct dlm_recovery_ctxt
67{ 78{
@@ -85,7 +96,7 @@ enum dlm_ctxt_state {
85struct dlm_ctxt 96struct dlm_ctxt
86{ 97{
87 struct list_head list; 98 struct list_head list;
88 struct hlist_head *lockres_hash; 99 struct hlist_head **lockres_hash;
89 struct list_head dirty_list; 100 struct list_head dirty_list;
90 struct list_head purge_list; 101 struct list_head purge_list;
91 struct list_head pending_asts; 102 struct list_head pending_asts;
@@ -120,6 +131,7 @@ struct dlm_ctxt
120 struct o2hb_callback_func dlm_hb_down; 131 struct o2hb_callback_func dlm_hb_down;
121 struct task_struct *dlm_thread_task; 132 struct task_struct *dlm_thread_task;
122 struct task_struct *dlm_reco_thread_task; 133 struct task_struct *dlm_reco_thread_task;
134 struct workqueue_struct *dlm_worker;
123 wait_queue_head_t dlm_thread_wq; 135 wait_queue_head_t dlm_thread_wq;
124 wait_queue_head_t dlm_reco_thread_wq; 136 wait_queue_head_t dlm_reco_thread_wq;
125 wait_queue_head_t ast_wq; 137 wait_queue_head_t ast_wq;
@@ -132,6 +144,11 @@ struct dlm_ctxt
132 struct list_head dlm_eviction_callbacks; 144 struct list_head dlm_eviction_callbacks;
133}; 145};
134 146
147static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i)
148{
149 return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
150}
151
135/* these keventd work queue items are for less-frequently 152/* these keventd work queue items are for less-frequently
136 * called functions that cannot be directly called from the 153 * called functions that cannot be directly called from the
137 * net message handlers for some reason, usually because 154 * net message handlers for some reason, usually because
@@ -216,20 +233,29 @@ struct dlm_lock_resource
216 /* WARNING: Please see the comment in dlm_init_lockres before 233 /* WARNING: Please see the comment in dlm_init_lockres before
217 * adding fields here. */ 234 * adding fields here. */
218 struct hlist_node hash_node; 235 struct hlist_node hash_node;
236 struct qstr lockname;
219 struct kref refs; 237 struct kref refs;
220 238
221 /* please keep these next 3 in this order 239 /*
222 * some funcs want to iterate over all lists */ 240 * Please keep granted, converting, and blocked in this order,
241 * as some funcs want to iterate over all lists.
242 *
243 * All four lists are protected by the hash's reference.
244 */
223 struct list_head granted; 245 struct list_head granted;
224 struct list_head converting; 246 struct list_head converting;
225 struct list_head blocked; 247 struct list_head blocked;
248 struct list_head purge;
226 249
250 /*
251 * These two lists require you to hold an additional reference
252 * while they are on the list.
253 */
227 struct list_head dirty; 254 struct list_head dirty;
228 struct list_head recovering; // dlm_recovery_ctxt.resources list 255 struct list_head recovering; // dlm_recovery_ctxt.resources list
229 256
230 /* unused lock resources have their last_used stamped and are 257 /* unused lock resources have their last_used stamped and are
231 * put on a list for the dlm thread to run. */ 258 * put on a list for the dlm thread to run. */
232 struct list_head purge;
233 unsigned long last_used; 259 unsigned long last_used;
234 260
235 unsigned migration_pending:1; 261 unsigned migration_pending:1;
@@ -238,7 +264,6 @@ struct dlm_lock_resource
238 wait_queue_head_t wq; 264 wait_queue_head_t wq;
239 u8 owner; //node which owns the lock resource, or unknown 265 u8 owner; //node which owns the lock resource, or unknown
240 u16 state; 266 u16 state;
241 struct qstr lockname;
242 char lvb[DLM_LVB_LEN]; 267 char lvb[DLM_LVB_LEN];
243}; 268};
244 269
@@ -300,6 +325,15 @@ enum dlm_lockres_list {
300 DLM_BLOCKED_LIST 325 DLM_BLOCKED_LIST
301}; 326};
302 327
328static inline int dlm_lvb_is_empty(char *lvb)
329{
330 int i;
331 for (i=0; i<DLM_LVB_LEN; i++)
332 if (lvb[i])
333 return 0;
334 return 1;
335}
336
303static inline struct list_head * 337static inline struct list_head *
304dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) 338dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
305{ 339{
@@ -609,7 +643,8 @@ struct dlm_finalize_reco
609{ 643{
610 u8 node_idx; 644 u8 node_idx;
611 u8 dead_node; 645 u8 dead_node;
612 __be16 pad1; 646 u8 flags;
647 u8 pad1;
613 __be32 pad2; 648 __be32 pad2;
614}; 649};
615 650
@@ -676,6 +711,7 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
676void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); 711void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
677int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); 712int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
678int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); 713int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
714int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
679 715
680void dlm_put(struct dlm_ctxt *dlm); 716void dlm_put(struct dlm_ctxt *dlm);
681struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); 717struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -687,14 +723,20 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
687 struct dlm_lock_resource *res); 723 struct dlm_lock_resource *res);
688void dlm_purge_lockres(struct dlm_ctxt *dlm, 724void dlm_purge_lockres(struct dlm_ctxt *dlm,
689 struct dlm_lock_resource *lockres); 725 struct dlm_lock_resource *lockres);
690void dlm_lockres_get(struct dlm_lock_resource *res); 726static inline void dlm_lockres_get(struct dlm_lock_resource *res)
727{
728 /* This is called on every lookup, so it might be worth
729 * inlining. */
730 kref_get(&res->refs);
731}
691void dlm_lockres_put(struct dlm_lock_resource *res); 732void dlm_lockres_put(struct dlm_lock_resource *res);
692void __dlm_unhash_lockres(struct dlm_lock_resource *res); 733void __dlm_unhash_lockres(struct dlm_lock_resource *res);
693void __dlm_insert_lockres(struct dlm_ctxt *dlm, 734void __dlm_insert_lockres(struct dlm_ctxt *dlm,
694 struct dlm_lock_resource *res); 735 struct dlm_lock_resource *res);
695struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, 736struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
696 const char *name, 737 const char *name,
697 unsigned int len); 738 unsigned int len,
739 unsigned int hash);
698struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, 740struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
699 const char *name, 741 const char *name,
700 unsigned int len); 742 unsigned int len);
@@ -819,6 +861,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm,
819 u8 dead_node); 861 u8 dead_node);
820int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); 862int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
821 863
864int __dlm_lockres_unused(struct dlm_lock_resource *res);
822 865
823static inline const char * dlm_lock_mode_name(int mode) 866static inline const char * dlm_lock_mode_name(int mode)
824{ 867{
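Worked numbers for the new paged hash, assuming 4KB pages and 8-byte pointers (both vary by architecture):

	/* DLM_BUCKETS_PER_PAGE = 4096 / 8      = 512
	 * DLM_HASH_PAGES       = 16384 / 4096  = 4
	 * DLM_HASH_BUCKETS     = 4 * 512       = 2048
	 *
	 * dlm_lockres_hash(dlm, i) then resolves to
	 *	dlm->lockres_hash[(i / 512) % 4] + (i % 512)
	 *
	 * so the 16KB table is built from four independent order-0 pages
	 * instead of one fragile higher-order allocation. */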
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 8285228d9e37..c764dc8e40a2 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -214,6 +214,9 @@ grant:
214 if (lock->ml.node == dlm->node_num) 214 if (lock->ml.node == dlm->node_num)
215 mlog(0, "doing in-place convert for nonlocal lock\n"); 215 mlog(0, "doing in-place convert for nonlocal lock\n");
216 lock->ml.type = type; 216 lock->ml.type = type;
217 if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
218 memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
219
217 status = DLM_NORMAL; 220 status = DLM_NORMAL;
218 *call_ast = 1; 221 *call_ast = 1;
219 goto unlock_exit; 222 goto unlock_exit;
@@ -231,8 +234,7 @@ switch_queues:
231 234
232 lock->ml.convert_type = type; 235 lock->ml.convert_type = type;
233 /* do not alter lock refcount. switching lists. */ 236 /* do not alter lock refcount. switching lists. */
234 list_del_init(&lock->list); 237 list_move_tail(&lock->list, &res->converting);
235 list_add_tail(&lock->list, &res->converting);
236 238
237unlock_exit: 239unlock_exit:
238 spin_unlock(&lock->spinlock); 240 spin_unlock(&lock->spinlock);
@@ -248,8 +250,7 @@ void dlm_revert_pending_convert(struct dlm_lock_resource *res,
248 struct dlm_lock *lock) 250 struct dlm_lock *lock)
249{ 251{
250 /* do not alter lock refcount. switching lists. */ 252 /* do not alter lock refcount. switching lists. */
251 list_del_init(&lock->list); 253 list_move_tail(&lock->list, &res->granted);
252 list_add_tail(&lock->list, &res->granted);
253 lock->ml.convert_type = LKM_IVMODE; 254 lock->ml.convert_type = LKM_IVMODE;
254 lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); 255 lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
255} 256}
@@ -294,8 +295,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
294 res->state |= DLM_LOCK_RES_IN_PROGRESS; 295 res->state |= DLM_LOCK_RES_IN_PROGRESS;
295 /* move lock to local convert queue */ 296 /* move lock to local convert queue */
296 /* do not alter lock refcount. switching lists. */ 297 /* do not alter lock refcount. switching lists. */
297 list_del_init(&lock->list); 298 list_move_tail(&lock->list, &res->converting);
298 list_add_tail(&lock->list, &res->converting);
299 lock->convert_pending = 1; 299 lock->convert_pending = 1;
300 lock->ml.convert_type = type; 300 lock->ml.convert_type = type;
301 301
@@ -464,6 +464,12 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
464 } 464 }
465 465
466 spin_lock(&res->spinlock); 466 spin_lock(&res->spinlock);
467 status = __dlm_lockres_state_to_status(res);
468 if (status != DLM_NORMAL) {
469 spin_unlock(&res->spinlock);
470 dlm_error(status);
471 goto leave;
472 }
467 list_for_each(iter, &res->granted) { 473 list_for_each(iter, &res->granted) {
468 lock = list_entry(iter, struct dlm_lock, list); 474 lock = list_entry(iter, struct dlm_lock, list);
469 if (lock->ml.cookie == cnv->cookie && 475 if (lock->ml.cookie == cnv->cookie &&
@@ -473,6 +479,21 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
473 } 479 }
474 lock = NULL; 480 lock = NULL;
475 } 481 }
482 if (!lock) {
483 __dlm_print_one_lock_resource(res);
484 list_for_each(iter, &res->granted) {
485 lock = list_entry(iter, struct dlm_lock, list);
486 if (lock->ml.node == cnv->node_idx) {
487 mlog(ML_ERROR, "There is something here "
488 "for node %u, lock->ml.cookie=%llu, "
489 "cnv->cookie=%llu\n", cnv->node_idx,
490 (unsigned long long)lock->ml.cookie,
491 (unsigned long long)cnv->cookie);
492 break;
493 }
494 }
495 lock = NULL;
496 }
476 spin_unlock(&res->spinlock); 497 spin_unlock(&res->spinlock);
477 if (!lock) { 498 if (!lock) {
478 status = DLM_IVLOCKID; 499 status = DLM_IVLOCKID;
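Taken together with the dlmast.c hunk earlier, this moves the only write of lksb LVB data into the lockres to the in-place downconvert, performed under res->spinlock; dlm_update_lvb() no longer mirrors puts, so a racing get cannot re-propagate stale LVB contents. The invariant in two lines:

	/* only here, at downconvert time, does lksb lvb reach the lockres */
	if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
		memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);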
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index c7eae5d3324e..3f6c8d88f7af 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -37,10 +37,8 @@
37 37
38#include "dlmapi.h" 38#include "dlmapi.h"
39#include "dlmcommon.h" 39#include "dlmcommon.h"
40#include "dlmdebug.h"
41 40
42#include "dlmdomain.h" 41#include "dlmdomain.h"
43#include "dlmdebug.h"
44 42
45#define MLOG_MASK_PREFIX ML_DLM 43#define MLOG_MASK_PREFIX ML_DLM
46#include "cluster/masklog.h" 44#include "cluster/masklog.h"
@@ -120,6 +118,7 @@ void dlm_print_one_lock(struct dlm_lock *lockid)
120} 118}
121EXPORT_SYMBOL_GPL(dlm_print_one_lock); 119EXPORT_SYMBOL_GPL(dlm_print_one_lock);
122 120
121#if 0
123void dlm_dump_lock_resources(struct dlm_ctxt *dlm) 122void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
124{ 123{
125 struct dlm_lock_resource *res; 124 struct dlm_lock_resource *res;
@@ -136,12 +135,13 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
136 135
137 spin_lock(&dlm->spinlock); 136 spin_lock(&dlm->spinlock);
138 for (i=0; i<DLM_HASH_BUCKETS; i++) { 137 for (i=0; i<DLM_HASH_BUCKETS; i++) {
139 bucket = &(dlm->lockres_hash[i]); 138 bucket = dlm_lockres_hash(dlm, i);
140 hlist_for_each_entry(res, iter, bucket, hash_node) 139 hlist_for_each_entry(res, iter, bucket, hash_node)
141 dlm_print_one_lock_resource(res); 140 dlm_print_one_lock_resource(res);
142 } 141 }
143 spin_unlock(&dlm->spinlock); 142 spin_unlock(&dlm->spinlock);
144} 143}
144#endif /* 0 */
145 145
146static const char *dlm_errnames[] = { 146static const char *dlm_errnames[] = {
147 [DLM_NORMAL] = "DLM_NORMAL", 147 [DLM_NORMAL] = "DLM_NORMAL",
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
deleted file mode 100644
index 6858510c3ccd..000000000000
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ /dev/null
@@ -1,30 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdebug.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 021110-1307, USA.
22 *
23 */
24
25#ifndef DLMDEBUG_H
26#define DLMDEBUG_H
27
28void dlm_dump_lock_resources(struct dlm_ctxt *dlm);
29
30#endif
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8f3a9e3106fd..b8c23f7ba67e 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -41,7 +41,6 @@
41#include "dlmapi.h" 41#include "dlmapi.h"
42#include "dlmcommon.h" 42#include "dlmcommon.h"
43 43
44#include "dlmdebug.h"
45#include "dlmdomain.h" 44#include "dlmdomain.h"
46 45
47#include "dlmver.h" 46#include "dlmver.h"
@@ -49,6 +48,33 @@
49#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) 48#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
50#include "cluster/masklog.h" 49#include "cluster/masklog.h"
51 50
51static void dlm_free_pagevec(void **vec, int pages)
52{
53 while (pages--)
54 free_page((unsigned long)vec[pages]);
55 kfree(vec);
56}
57
58static void **dlm_alloc_pagevec(int pages)
59{
60 void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL);
61 int i;
62
63 if (!vec)
64 return NULL;
65
66 for (i = 0; i < pages; i++)
67 if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL)))
68 goto out_free;
69
70 mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n",
 71 pages, (unsigned long)DLM_HASH_PAGES, (unsigned long)DLM_BUCKETS_PER_PAGE);
72 return vec;
73out_free:
74 dlm_free_pagevec(vec, i);
75 return NULL;
76}
77
52/* 78/*
53 * 79 *
54 * spinlock lock ordering: if multiple locks are needed, obey this ordering: 80 * spinlock lock ordering: if multiple locks are needed, obey this ordering:
@@ -62,7 +88,7 @@
62 * 88 *
63 */ 89 */
64 90
65spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED; 91DEFINE_SPINLOCK(dlm_domain_lock);
66LIST_HEAD(dlm_domains); 92LIST_HEAD(dlm_domains);
67static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); 93static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
68 94
@@ -90,8 +116,7 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
90 assert_spin_locked(&dlm->spinlock); 116 assert_spin_locked(&dlm->spinlock);
91 117
92 q = &res->lockname; 118 q = &res->lockname;
93 q->hash = full_name_hash(q->name, q->len); 119 bucket = dlm_lockres_hash(dlm, q->hash);
94 bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]);
95 120
96 /* get a reference for our hashtable */ 121 /* get a reference for our hashtable */
97 dlm_lockres_get(res); 122 dlm_lockres_get(res);
@@ -100,34 +125,32 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
100} 125}
101 126
102struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, 127struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
103 const char *name, 128 const char *name,
104 unsigned int len) 129 unsigned int len,
130 unsigned int hash)
105{ 131{
106 unsigned int hash;
107 struct hlist_node *iter;
108 struct dlm_lock_resource *tmpres=NULL;
109 struct hlist_head *bucket; 132 struct hlist_head *bucket;
133 struct hlist_node *list;
110 134
111 mlog_entry("%.*s\n", len, name); 135 mlog_entry("%.*s\n", len, name);
112 136
113 assert_spin_locked(&dlm->spinlock); 137 assert_spin_locked(&dlm->spinlock);
114 138
115 hash = full_name_hash(name, len); 139 bucket = dlm_lockres_hash(dlm, hash);
116
117 bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]);
118
119 /* check for pre-existing lock */
120 hlist_for_each(iter, bucket) {
121 tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node);
122 if (tmpres->lockname.len == len &&
123 memcmp(tmpres->lockname.name, name, len) == 0) {
124 dlm_lockres_get(tmpres);
125 break;
126 }
127 140
128 tmpres = NULL; 141 hlist_for_each(list, bucket) {
142 struct dlm_lock_resource *res = hlist_entry(list,
143 struct dlm_lock_resource, hash_node);
144 if (res->lockname.name[0] != name[0])
145 continue;
146 if (unlikely(res->lockname.len != len))
147 continue;
148 if (memcmp(res->lockname.name + 1, name + 1, len - 1))
149 continue;
150 dlm_lockres_get(res);
151 return res;
129 } 152 }
130 return tmpres; 153 return NULL;
131} 154}
132 155
133struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, 156struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
@@ -135,9 +158,10 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
135 unsigned int len) 158 unsigned int len)
136{ 159{
137 struct dlm_lock_resource *res; 160 struct dlm_lock_resource *res;
161 unsigned int hash = dlm_lockid_hash(name, len);
138 162
139 spin_lock(&dlm->spinlock); 163 spin_lock(&dlm->spinlock);
140 res = __dlm_lookup_lockres(dlm, name, len); 164 res = __dlm_lookup_lockres(dlm, name, len, hash);
141 spin_unlock(&dlm->spinlock); 165 spin_unlock(&dlm->spinlock);
142 return res; 166 return res;
143} 167}
@@ -194,7 +218,7 @@ static int dlm_wait_on_domain_helper(const char *domain)
194static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) 218static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
195{ 219{
196 if (dlm->lockres_hash) 220 if (dlm->lockres_hash)
197 free_page((unsigned long) dlm->lockres_hash); 221 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
198 222
199 if (dlm->name) 223 if (dlm->name)
200 kfree(dlm->name); 224 kfree(dlm->name);
@@ -278,11 +302,21 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
278 return ret; 302 return ret;
279} 303}
280 304
305static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
306{
307 if (dlm->dlm_worker) {
308 flush_workqueue(dlm->dlm_worker);
309 destroy_workqueue(dlm->dlm_worker);
310 dlm->dlm_worker = NULL;
311 }
312}
313
281static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) 314static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
282{ 315{
283 dlm_unregister_domain_handlers(dlm); 316 dlm_unregister_domain_handlers(dlm);
284 dlm_complete_thread(dlm); 317 dlm_complete_thread(dlm);
285 dlm_complete_recovery_thread(dlm); 318 dlm_complete_recovery_thread(dlm);
319 dlm_destroy_dlm_worker(dlm);
286 320
287 /* We've left the domain. Now we can take ourselves out of the 321 /* We've left the domain. Now we can take ourselves out of the
288 * list and allow the kref stuff to help us free the 322 * list and allow the kref stuff to help us free the
@@ -304,8 +338,8 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
304restart: 338restart:
305 spin_lock(&dlm->spinlock); 339 spin_lock(&dlm->spinlock);
306 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 340 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
307 while (!hlist_empty(&dlm->lockres_hash[i])) { 341 while (!hlist_empty(dlm_lockres_hash(dlm, i))) {
308 res = hlist_entry(dlm->lockres_hash[i].first, 342 res = hlist_entry(dlm_lockres_hash(dlm, i)->first,
309 struct dlm_lock_resource, hash_node); 343 struct dlm_lock_resource, hash_node);
310 /* need reference when manually grabbing lockres */ 344 /* need reference when manually grabbing lockres */
311 dlm_lockres_get(res); 345 dlm_lockres_get(res);
@@ -1126,6 +1160,13 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
1126 goto bail; 1160 goto bail;
1127 } 1161 }
1128 1162
1163 dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
1164 if (!dlm->dlm_worker) {
1165 status = -ENOMEM;
1166 mlog_errno(status);
1167 goto bail;
1168 }
1169
1129 do { 1170 do {
1130 unsigned int backoff; 1171 unsigned int backoff;
1131 status = dlm_try_to_join_domain(dlm); 1172 status = dlm_try_to_join_domain(dlm);
@@ -1166,6 +1207,7 @@ bail:
1166 dlm_unregister_domain_handlers(dlm); 1207 dlm_unregister_domain_handlers(dlm);
1167 dlm_complete_thread(dlm); 1208 dlm_complete_thread(dlm);
1168 dlm_complete_recovery_thread(dlm); 1209 dlm_complete_recovery_thread(dlm);
1210 dlm_destroy_dlm_worker(dlm);
1169 } 1211 }
1170 1212
1171 return status; 1213 return status;
@@ -1191,7 +1233,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1191 goto leave; 1233 goto leave;
1192 } 1234 }
1193 1235
1194 dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL); 1236 dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
1195 if (!dlm->lockres_hash) { 1237 if (!dlm->lockres_hash) {
1196 mlog_errno(-ENOMEM); 1238 mlog_errno(-ENOMEM);
1197 kfree(dlm->name); 1239 kfree(dlm->name);
@@ -1200,8 +1242,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1200 goto leave; 1242 goto leave;
1201 } 1243 }
1202 1244
1203 for (i=0; i<DLM_HASH_BUCKETS; i++) 1245 for (i = 0; i < DLM_HASH_BUCKETS; i++)
1204 INIT_HLIST_HEAD(&dlm->lockres_hash[i]); 1246 INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
1205 1247
1206 strcpy(dlm->name, domain); 1248 strcpy(dlm->name, domain);
1207 dlm->key = key; 1249 dlm->key = key;
@@ -1231,6 +1273,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1231 1273
1232 dlm->dlm_thread_task = NULL; 1274 dlm->dlm_thread_task = NULL;
1233 dlm->dlm_reco_thread_task = NULL; 1275 dlm->dlm_reco_thread_task = NULL;
1276 dlm->dlm_worker = NULL;
1234 init_waitqueue_head(&dlm->dlm_thread_wq); 1277 init_waitqueue_head(&dlm->dlm_thread_wq);
1235 init_waitqueue_head(&dlm->dlm_reco_thread_wq); 1278 init_waitqueue_head(&dlm->dlm_reco_thread_wq);
1236 init_waitqueue_head(&dlm->reco.event); 1279 init_waitqueue_head(&dlm->reco.event);
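Besides the pagevec change, note the rewritten __dlm_lookup_lockres(): it now takes a precomputed hash and, within a bucket, compares name[0] before anything else. That first byte is a cheap discriminator because ocfs2 lockids begin with a single lock-type character (an assumption based on the lockid format), so most same-bucket collisions differ immediately. The comparison ladder:

	if (res->lockname.name[0] != name[0])	/* type byte differs */
		continue;
	if (res->lockname.len != len)
		continue;
	if (memcmp(res->lockname.name + 1, name + 1, len - 1))
		continue;
	dlm_lockres_get(res);			/* found: take a reference */
	return res;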
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 7273d9fa6bab..033ad1701232 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -116,7 +116,7 @@ static int dlmfs_file_open(struct inode *inode,
116 * doesn't make sense for LVB writes. */ 116 * doesn't make sense for LVB writes. */
117 file->f_flags &= ~O_APPEND; 117 file->f_flags &= ~O_APPEND;
118 118
119 fp = kmalloc(sizeof(*fp), GFP_KERNEL); 119 fp = kmalloc(sizeof(*fp), GFP_NOFS);
120 if (!fp) { 120 if (!fp) {
121 status = -ENOMEM; 121 status = -ENOMEM;
122 goto bail; 122 goto bail;
@@ -196,7 +196,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
196 else 196 else
197 readlen = count - *ppos; 197 readlen = count - *ppos;
198 198
199 lvb_buf = kmalloc(readlen, GFP_KERNEL); 199 lvb_buf = kmalloc(readlen, GFP_NOFS);
200 if (!lvb_buf) 200 if (!lvb_buf)
201 return -ENOMEM; 201 return -ENOMEM;
202 202
@@ -240,7 +240,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
240 else 240 else
241 writelen = count - *ppos; 241 writelen = count - *ppos;
242 242
243 lvb_buf = kmalloc(writelen, GFP_KERNEL); 243 lvb_buf = kmalloc(writelen, GFP_NOFS);
244 if (!lvb_buf) 244 if (!lvb_buf)
245 return -ENOMEM; 245 return -ENOMEM;
246 246
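The GFP_KERNEL to GFP_NOFS switches here follow the standard deadlock-avoidance rule: a GFP_KERNEL allocation may recurse into filesystem writeback to reclaim memory, and writeback through ocfs2 can block on the very DLM locks these paths are servicing. GFP_NOFS clears __GFP_FS and forbids that re-entry:

	/* inside dlm/dlmfs paths, allocate without filesystem reclaim */
	lvb_buf = kmalloc(writelen, GFP_NOFS);
	if (!lvb_buf)
		return -ENOMEM;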
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 6fea28318d6d..5ca57ec650c7 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -53,7 +53,7 @@
53#define MLOG_MASK_PREFIX ML_DLM 53#define MLOG_MASK_PREFIX ML_DLM
54#include "cluster/masklog.h" 54#include "cluster/masklog.h"
55 55
56static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED; 56static DEFINE_SPINLOCK(dlm_cookie_lock);
57static u64 dlm_next_cookie = 1; 57static u64 dlm_next_cookie = 1;
58 58
59static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, 59static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
@@ -201,6 +201,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
201 struct dlm_lock *lock, int flags) 201 struct dlm_lock *lock, int flags)
202{ 202{
203 enum dlm_status status = DLM_DENIED; 203 enum dlm_status status = DLM_DENIED;
204 int lockres_changed = 1;
204 205
205 mlog_entry("type=%d\n", lock->ml.type); 206 mlog_entry("type=%d\n", lock->ml.type);
206 mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, 207 mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
@@ -226,8 +227,25 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
226 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 227 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
227 lock->lock_pending = 0; 228 lock->lock_pending = 0;
228 if (status != DLM_NORMAL) { 229 if (status != DLM_NORMAL) {
229 if (status != DLM_NOTQUEUED) 230 if (status == DLM_RECOVERING &&
231 dlm_is_recovery_lock(res->lockname.name,
232 res->lockname.len)) {
233 /* recovery lock was mastered by dead node.
234 * we need to have calc_usage shoot down this
235 * lockres and completely remaster it. */
236 mlog(0, "%s: recovery lock was owned by "
237 "dead node %u, remaster it now.\n",
238 dlm->name, res->owner);
239 } else if (status != DLM_NOTQUEUED) {
240 /*
241 * DO NOT call calc_usage, as this would unhash
242 * the remote lockres before we ever get to use
243 * it. treat as if we never made any change to
244 * the lockres.
245 */
246 lockres_changed = 0;
230 dlm_error(status); 247 dlm_error(status);
248 }
231 dlm_revert_pending_lock(res, lock); 249 dlm_revert_pending_lock(res, lock);
232 dlm_lock_put(lock); 250 dlm_lock_put(lock);
233 } else if (dlm_is_recovery_lock(res->lockname.name, 251 } else if (dlm_is_recovery_lock(res->lockname.name,
@@ -239,12 +257,12 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
239 mlog(0, "%s: $RECOVERY lock for this node (%u) is " 257 mlog(0, "%s: $RECOVERY lock for this node (%u) is "
240 "mastered by %u; got lock, manually granting (no ast)\n", 258 "mastered by %u; got lock, manually granting (no ast)\n",
241 dlm->name, dlm->node_num, res->owner); 259 dlm->name, dlm->node_num, res->owner);
242 list_del_init(&lock->list); 260 list_move_tail(&lock->list, &res->granted);
243 list_add_tail(&lock->list, &res->granted);
244 } 261 }
245 spin_unlock(&res->spinlock); 262 spin_unlock(&res->spinlock);
246 263
247 dlm_lockres_calc_usage(dlm, res); 264 if (lockres_changed)
265 dlm_lockres_calc_usage(dlm, res);
248 266
249 wake_up(&res->wq); 267 wake_up(&res->wq);
250 return status; 268 return status;
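
Two details in this hunk are easy to miss: list_move_tail() folds the old list_del_init()/list_add_tail() pair into one primitive, and the new lockres_changed flag keeps dlm_lockres_calc_usage() from unhashing a remote lockres that the caller still needs. The list primitive, in a self-contained sketch:

    #include <linux/list.h>

    struct demo_lock {
        struct list_head list;          /* linkage on a per-resource queue */
    };

    /* granting a queued lock: list_move_tail() is the one-step
     * equivalent of list_del_init() followed by list_add_tail(). */
    static void demo_grant(struct demo_lock *lk, struct list_head *granted)
    {
        list_move_tail(&lk->list, granted);
    }
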
@@ -281,6 +299,14 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
281 if (tmpret >= 0) { 299 if (tmpret >= 0) {
282 // successfully sent and received 300 // successfully sent and received
283 ret = status; // this is already a dlm_status 301 ret = status; // this is already a dlm_status
302 if (ret == DLM_REJECTED) {
303 mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres "
304 "no longer owned by %u. that node is coming back "
305 "up currently.\n", dlm->name, create.namelen,
306 create.name, res->owner);
307 dlm_print_one_lock_resource(res);
308 BUG();
309 }
284 } else { 310 } else {
285 mlog_errno(tmpret); 311 mlog_errno(tmpret);
286 if (dlm_is_host_down(tmpret)) { 312 if (dlm_is_host_down(tmpret)) {
@@ -382,13 +408,13 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
382 struct dlm_lock *lock; 408 struct dlm_lock *lock;
383 int kernel_allocated = 0; 409 int kernel_allocated = 0;
384 410
385 lock = kcalloc(1, sizeof(*lock), GFP_KERNEL); 411 lock = kcalloc(1, sizeof(*lock), GFP_NOFS);
386 if (!lock) 412 if (!lock)
387 return NULL; 413 return NULL;
388 414
389 if (!lksb) { 415 if (!lksb) {
390 /* zero memory only if kernel-allocated */ 416 /* zero memory only if kernel-allocated */
391 lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL); 417 lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS);
392 if (!lksb) { 418 if (!lksb) {
393 kfree(lock); 419 kfree(lock);
394 return NULL; 420 return NULL;
@@ -429,11 +455,16 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
429 if (!dlm_grab(dlm)) 455 if (!dlm_grab(dlm))
430 return DLM_REJECTED; 456 return DLM_REJECTED;
431 457
432 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
433 "Domain %s not fully joined!\n", dlm->name);
434
435 name = create->name; 458 name = create->name;
436 namelen = create->namelen; 459 namelen = create->namelen;
460 status = DLM_REJECTED;
461 if (!dlm_domain_fully_joined(dlm)) {
462 mlog(ML_ERROR, "Domain %s not fully joined, but node %u is "
463 "sending a create_lock message for lock %.*s!\n",
464 dlm->name, create->node_idx, namelen, name);
465 dlm_error(status);
466 goto leave;
467 }
437 468
438 status = DLM_IVBUFLEN; 469 status = DLM_IVBUFLEN;
439 if (namelen > DLM_LOCKID_NAME_MAX) { 470 if (namelen > DLM_LOCKID_NAME_MAX) {
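
The handler previously punished the receiving node with mlog_bug_on_msg() when a create_lock message arrived before the domain was fully joined; the rewrite logs the event and returns DLM_REJECTED so only the premature sender is affected. The general shape, sketched with hypothetical stubs:

    #include <linux/errno.h>

    struct demo_ctx;
    extern int demo_ctx_ready(struct demo_ctx *c);  /* hypothetical */
    extern int demo_do_work(struct demo_ctx *c);    /* hypothetical */

    /* validate preconditions first and fail the request back to the
     * sender, instead of BUG()ing the node that merely received a
     * premature message. */
    static int demo_handler(struct demo_ctx *c)
    {
        if (!demo_ctx_ready(c))
            return -EAGAIN;             /* reject; sender can retry */
        return demo_do_work(c);
    }
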
@@ -669,18 +700,22 @@ retry_lock:
669 msleep(100); 700 msleep(100);
670 /* no waiting for dlm_reco_thread */ 701 /* no waiting for dlm_reco_thread */
671 if (recovery) { 702 if (recovery) {
672 if (status == DLM_RECOVERING) { 703 if (status != DLM_RECOVERING)
673 mlog(0, "%s: got RECOVERING " 704 goto retry_lock;
674 "for $REOCVERY lock, master " 705
675 "was %u\n", dlm->name, 706 mlog(0, "%s: got RECOVERING "
676 res->owner); 707 "for $RECOVERY lock, master "
677 dlm_wait_for_node_death(dlm, res->owner, 708 "was %u\n", dlm->name,
678 DLM_NODE_DEATH_WAIT_MAX); 709 res->owner);
679 } 710 /* wait to see the node go down, then
711 * drop down and allow the lockres to
712 * get cleaned up. need to remaster. */
713 dlm_wait_for_node_death(dlm, res->owner,
714 DLM_NODE_DEATH_WAIT_MAX);
680 } else { 715 } else {
681 dlm_wait_for_recovery(dlm); 716 dlm_wait_for_recovery(dlm);
717 goto retry_lock;
682 } 718 }
683 goto retry_lock;
684 } 719 }
685 720
686 if (status != DLM_NORMAL) { 721 if (status != DLM_NORMAL) {
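
The retry logic is inverted here: the old code only waited on DLM_RECOVERING for the $RECOVERY lock and then retried unconditionally, while the new code retries within each branch and lets the $RECOVERY case fall through after the dead master is confirmed gone, so the lockres can be cleaned up and remastered. A control-flow sketch with placeholder functions:

    #include <linux/errno.h>

    extern int try_acquire(void);               /* hypothetical stubs */
    extern int lock_is_recovery_lock(void);
    extern int master_is_recovering(void);
    extern void wait_for_node_death(void);
    extern void wait_for_recovery(void);

    static int acquire_with_retry(void)
    {
        int status;
    retry:
        status = try_acquire();
        if (status == -EAGAIN) {
            if (!lock_is_recovery_lock()) {
                wait_for_recovery();
                goto retry;             /* ordinary lock: always retry */
            }
            if (!master_is_recovering())
                goto retry;
            wait_for_node_death();      /* then fall through: remaster */
        }
        return status;
    }
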
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 940be4c13b1f..1b8346dd0572 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -47,7 +47,6 @@
47 47
48#include "dlmapi.h" 48#include "dlmapi.h"
49#include "dlmcommon.h" 49#include "dlmcommon.h"
50#include "dlmdebug.h"
51#include "dlmdomain.h" 50#include "dlmdomain.h"
52 51
53#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) 52#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
@@ -74,6 +73,7 @@ struct dlm_master_list_entry
74 wait_queue_head_t wq; 73 wait_queue_head_t wq;
75 atomic_t woken; 74 atomic_t woken;
76 struct kref mle_refs; 75 struct kref mle_refs;
76 int inuse;
77 unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 77 unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
78 unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 78 unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
79 unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 79 unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -127,18 +127,30 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
127 return 1; 127 return 1;
128} 128}
129 129
130#if 0 130#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m)
131/* Code here is included but defined out as it aids debugging */ 131static void _dlm_print_nodemap(unsigned long *map, const char *mapname)
132{
133 int i;
134 printk("%s=[ ", mapname);
135 for (i=0; i<O2NM_MAX_NODES; i++)
136 if (test_bit(i, map))
137 printk("%d ", i);
138 printk("]");
139}
132 140
133void dlm_print_one_mle(struct dlm_master_list_entry *mle) 141static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
134{ 142{
135 int i = 0, refs; 143 int refs;
136 char *type; 144 char *type;
137 char attached; 145 char attached;
138 u8 master; 146 u8 master;
139 unsigned int namelen; 147 unsigned int namelen;
140 const char *name; 148 const char *name;
141 struct kref *k; 149 struct kref *k;
150 unsigned long *maybe = mle->maybe_map,
151 *vote = mle->vote_map,
152 *resp = mle->response_map,
153 *node = mle->node_map;
142 154
143 k = &mle->mle_refs; 155 k = &mle->mle_refs;
144 if (mle->type == DLM_MLE_BLOCK) 156 if (mle->type == DLM_MLE_BLOCK)
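
dlm_print_nodemap() leans on the preprocessor's stringizing operator: passing #m alongside m lets one helper label every map it prints. A runnable demonstration of the same trick:

    #include <stdio.h>

    /* the # operator turns the macro argument into a string literal,
     * so the dump labels itself with the variable's name. */
    #define print_named(v) print_named_impl((v), #v)

    static void print_named_impl(int v, const char *name)
    {
        printf("%s=%d\n", name, v);
    }

    int main(void)
    {
        int vote_map = 42;

        print_named(vote_map);          /* prints: vote_map=42 */
        return 0;
    }
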
@@ -159,18 +171,29 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
159 name = mle->u.res->lockname.name; 171 name = mle->u.res->lockname.name;
160 } 172 }
161 173
162 mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n", 174 mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ",
163 i, type, refs, master, mle->new_master, attached, 175 namelen, name, type, refs, master, mle->new_master, attached,
164 namelen, namelen, name); 176 mle->inuse);
177 dlm_print_nodemap(maybe);
178 printk(", ");
179 dlm_print_nodemap(vote);
180 printk(", ");
181 dlm_print_nodemap(resp);
182 printk(", ");
183 dlm_print_nodemap(node);
184 printk(", ");
185 printk("\n");
165} 186}
166 187
188#if 0
189/* Code here is included but defined out as it aids debugging */
190
167static void dlm_dump_mles(struct dlm_ctxt *dlm) 191static void dlm_dump_mles(struct dlm_ctxt *dlm)
168{ 192{
169 struct dlm_master_list_entry *mle; 193 struct dlm_master_list_entry *mle;
170 struct list_head *iter; 194 struct list_head *iter;
171 195
172 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); 196 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
173 mlog(ML_NOTICE, " ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n");
174 spin_lock(&dlm->master_lock); 197 spin_lock(&dlm->master_lock);
175 list_for_each(iter, &dlm->master_list) { 198 list_for_each(iter, &dlm->master_list) {
176 mle = list_entry(iter, struct dlm_master_list_entry, list); 199 mle = list_entry(iter, struct dlm_master_list_entry, list);
@@ -314,6 +337,31 @@ static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
314 spin_unlock(&dlm->spinlock); 337 spin_unlock(&dlm->spinlock);
315} 338}
316 339
340static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle)
341{
342 struct dlm_ctxt *dlm;
343 dlm = mle->dlm;
344
345 assert_spin_locked(&dlm->spinlock);
346 assert_spin_locked(&dlm->master_lock);
347 mle->inuse++;
348 kref_get(&mle->mle_refs);
349}
350
351static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle)
352{
353 struct dlm_ctxt *dlm;
354 dlm = mle->dlm;
355
356 spin_lock(&dlm->spinlock);
357 spin_lock(&dlm->master_lock);
358 mle->inuse--;
359 __dlm_put_mle(mle);
360 spin_unlock(&dlm->master_lock);
361 spin_unlock(&dlm->spinlock);
362
363}
364
317/* remove from list and free */ 365/* remove from list and free */
318static void __dlm_put_mle(struct dlm_master_list_entry *mle) 366static void __dlm_put_mle(struct dlm_master_list_entry *mle)
319{ 367{
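
dlm_get_mle_inuse()/dlm_put_mle_inuse() layer an inuse counter on top of the plain kref, recording that some code path is actively using the mle rather than merely holding a reference; the put side retakes both spinlocks because __dlm_put_mle() asserts them. The pattern, with illustrative names:

    #include <linux/kref.h>
    #include <linux/spinlock.h>

    struct tracked {
        struct kref refs;
        int inuse;                      /* long-lived active uses */
        spinlock_t *guard;
    };

    extern void tracked_release(struct kref *k);    /* frees the object */

    static void tracked_get_inuse(struct tracked *t)
    {
        assert_spin_locked(t->guard);   /* caller already holds the lock */
        t->inuse++;
        kref_get(&t->refs);
    }

    static void tracked_put_inuse(struct tracked *t)
    {
        spin_lock(t->guard);            /* the __put helper asserts this */
        t->inuse--;
        kref_put(&t->refs, tracked_release);
        spin_unlock(t->guard);
    }

The inuse count lets later sanity checks compute the minimum refcount the object may legally have.
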
@@ -322,9 +370,14 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle)
322 370
323 assert_spin_locked(&dlm->spinlock); 371 assert_spin_locked(&dlm->spinlock);
324 assert_spin_locked(&dlm->master_lock); 372 assert_spin_locked(&dlm->master_lock);
325 BUG_ON(!atomic_read(&mle->mle_refs.refcount)); 373 if (!atomic_read(&mle->mle_refs.refcount)) {
326 374 /* this may or may not crash, but who cares.
327 kref_put(&mle->mle_refs, dlm_mle_release); 375 * it's a BUG. */
376 mlog(ML_ERROR, "bad mle: %p\n", mle);
377 dlm_print_one_mle(mle);
378 BUG();
379 } else
380 kref_put(&mle->mle_refs, dlm_mle_release);
328} 381}
329 382
330 383
@@ -367,6 +420,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
367 memset(mle->response_map, 0, sizeof(mle->response_map)); 420 memset(mle->response_map, 0, sizeof(mle->response_map));
368 mle->master = O2NM_MAX_NODES; 421 mle->master = O2NM_MAX_NODES;
369 mle->new_master = O2NM_MAX_NODES; 422 mle->new_master = O2NM_MAX_NODES;
423 mle->inuse = 0;
370 424
371 if (mle->type == DLM_MLE_MASTER) { 425 if (mle->type == DLM_MLE_MASTER) {
372 BUG_ON(!res); 426 BUG_ON(!res);
@@ -564,6 +618,28 @@ static void dlm_lockres_release(struct kref *kref)
564 mlog(0, "destroying lockres %.*s\n", res->lockname.len, 618 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
565 res->lockname.name); 619 res->lockname.name);
566 620
621 if (!hlist_unhashed(&res->hash_node) ||
622 !list_empty(&res->granted) ||
623 !list_empty(&res->converting) ||
624 !list_empty(&res->blocked) ||
625 !list_empty(&res->dirty) ||
626 !list_empty(&res->recovering) ||
627 !list_empty(&res->purge)) {
628 mlog(ML_ERROR,
629 "Going to BUG for resource %.*s."
630 " We're on a list! [%c%c%c%c%c%c%c]\n",
631 res->lockname.len, res->lockname.name,
632 !hlist_unhashed(&res->hash_node) ? 'H' : ' ',
633 !list_empty(&res->granted) ? 'G' : ' ',
634 !list_empty(&res->converting) ? 'C' : ' ',
635 !list_empty(&res->blocked) ? 'B' : ' ',
636 !list_empty(&res->dirty) ? 'D' : ' ',
637 !list_empty(&res->recovering) ? 'R' : ' ',
638 !list_empty(&res->purge) ? 'P' : ' ');
639
640 dlm_print_one_lock_resource(res);
641 }
642
567 /* By the time we're ready to blow this guy away, we shouldn't 643 /* By the time we're ready to blow this guy away, we shouldn't
568 * be on any lists. */ 644 * be on any lists. */
569 BUG_ON(!hlist_unhashed(&res->hash_node)); 645 BUG_ON(!hlist_unhashed(&res->hash_node));
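
Rather than letting the bare BUG_ON()s below fire with no context, the release path now tests every list membership up front and prints a one-character-per-list summary plus the full lockres before crashing. The idiom in miniature:

    #include <linux/bug.h>
    #include <linux/list.h>
    #include <linux/printk.h>

    struct demo_res {
        struct list_head granted, dirty;
    };

    /* log which invariant broke before the fatal assertion, so the
     * oops says which list the object was still on. */
    static void demo_release_check(struct demo_res *res)
    {
        if (!list_empty(&res->granted) || !list_empty(&res->dirty))
            pr_err("res %p still queued [%c%c]\n", res,
                   !list_empty(&res->granted) ? 'G' : ' ',
                   !list_empty(&res->dirty) ? 'D' : ' ');
        BUG_ON(!list_empty(&res->granted));
        BUG_ON(!list_empty(&res->dirty));
    }
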
@@ -579,11 +655,6 @@ static void dlm_lockres_release(struct kref *kref)
579 kfree(res); 655 kfree(res);
580} 656}
581 657
582void dlm_lockres_get(struct dlm_lock_resource *res)
583{
584 kref_get(&res->refs);
585}
586
587void dlm_lockres_put(struct dlm_lock_resource *res) 658void dlm_lockres_put(struct dlm_lock_resource *res)
588{ 659{
589 kref_put(&res->refs, dlm_lockres_release); 660 kref_put(&res->refs, dlm_lockres_release);
@@ -603,7 +674,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
603 memcpy(qname, name, namelen); 674 memcpy(qname, name, namelen);
604 675
605 res->lockname.len = namelen; 676 res->lockname.len = namelen;
606 res->lockname.hash = full_name_hash(name, namelen); 677 res->lockname.hash = dlm_lockid_hash(name, namelen);
607 678
608 init_waitqueue_head(&res->wq); 679 init_waitqueue_head(&res->wq);
609 spin_lock_init(&res->spinlock); 680 spin_lock_init(&res->spinlock);
@@ -637,11 +708,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
637{ 708{
638 struct dlm_lock_resource *res; 709 struct dlm_lock_resource *res;
639 710
640 res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); 711 res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS);
641 if (!res) 712 if (!res)
642 return NULL; 713 return NULL;
643 714
644 res->lockname.name = kmalloc(namelen, GFP_KERNEL); 715 res->lockname.name = kmalloc(namelen, GFP_NOFS);
645 if (!res->lockname.name) { 716 if (!res->lockname.name) {
646 kfree(res); 717 kfree(res);
647 return NULL; 718 return NULL;
@@ -677,19 +748,20 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
677 int blocked = 0; 748 int blocked = 0;
678 int ret, nodenum; 749 int ret, nodenum;
679 struct dlm_node_iter iter; 750 struct dlm_node_iter iter;
680 unsigned int namelen; 751 unsigned int namelen, hash;
681 int tries = 0; 752 int tries = 0;
682 int bit, wait_on_recovery = 0; 753 int bit, wait_on_recovery = 0;
683 754
684 BUG_ON(!lockid); 755 BUG_ON(!lockid);
685 756
686 namelen = strlen(lockid); 757 namelen = strlen(lockid);
758 hash = dlm_lockid_hash(lockid, namelen);
687 759
688 mlog(0, "get lockres %s (len %d)\n", lockid, namelen); 760 mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
689 761
690lookup: 762lookup:
691 spin_lock(&dlm->spinlock); 763 spin_lock(&dlm->spinlock);
692 tmpres = __dlm_lookup_lockres(dlm, lockid, namelen); 764 tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash);
693 if (tmpres) { 765 if (tmpres) {
694 spin_unlock(&dlm->spinlock); 766 spin_unlock(&dlm->spinlock);
695 mlog(0, "found in hash!\n"); 767 mlog(0, "found in hash!\n");
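
dlm_get_lock_resource() can bounce back to the lookup label many times while recovery settles, so the name is hashed once with dlm_lockid_hash() and the value handed to __dlm_lookup_lockres() on every pass; the message handlers below get the same treatment. A sketch with placeholder helpers:

    struct dom;
    struct res;
    extern unsigned int hash_name(const char *id, unsigned int len);
    extern struct res *lookup_hashed(struct dom *d, const char *id,
                                     unsigned int len, unsigned int hash);
    extern int wait_for_change(struct dom *d);

    static struct res *find_res(struct dom *d, const char *id, unsigned int len)
    {
        unsigned int hash = hash_name(id, len); /* computed once */
        struct res *r;

        do {
            r = lookup_hashed(d, id, len, hash);  /* reused per retry */
        } while (!r && wait_for_change(d));
        return r;
    }
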
@@ -704,7 +776,7 @@ lookup:
704 mlog(0, "allocating a new resource\n"); 776 mlog(0, "allocating a new resource\n");
705 /* nothing found and we need to allocate one. */ 777 /* nothing found and we need to allocate one. */
706 alloc_mle = (struct dlm_master_list_entry *) 778 alloc_mle = (struct dlm_master_list_entry *)
707 kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); 779 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
708 if (!alloc_mle) 780 if (!alloc_mle)
709 goto leave; 781 goto leave;
710 res = dlm_new_lockres(dlm, lockid, namelen); 782 res = dlm_new_lockres(dlm, lockid, namelen);
@@ -790,10 +862,11 @@ lookup:
790 * if so, the creator of the BLOCK may try to put the last 862 * if so, the creator of the BLOCK may try to put the last
791 * ref at this time in the assert master handler, so we 863 * ref at this time in the assert master handler, so we
792 * need an extra one to keep from a bad ptr deref. */ 864 * need an extra one to keep from a bad ptr deref. */
793 dlm_get_mle(mle); 865 dlm_get_mle_inuse(mle);
794 spin_unlock(&dlm->master_lock); 866 spin_unlock(&dlm->master_lock);
795 spin_unlock(&dlm->spinlock); 867 spin_unlock(&dlm->spinlock);
796 868
869redo_request:
797 while (wait_on_recovery) { 870 while (wait_on_recovery) {
798 /* any cluster changes that occurred after dropping the 871 /* any cluster changes that occurred after dropping the
799 * dlm spinlock would be detectable be a change on the mle, 872 * dlm spinlock would be detectable be a change on the mle,
@@ -812,7 +885,7 @@ lookup:
812 } 885 }
813 886
814 dlm_kick_recovery_thread(dlm); 887 dlm_kick_recovery_thread(dlm);
815 msleep(100); 888 msleep(1000);
816 dlm_wait_for_recovery(dlm); 889 dlm_wait_for_recovery(dlm);
817 890
818 spin_lock(&dlm->spinlock); 891 spin_lock(&dlm->spinlock);
@@ -825,13 +898,15 @@ lookup:
825 } else 898 } else
826 wait_on_recovery = 0; 899 wait_on_recovery = 0;
827 spin_unlock(&dlm->spinlock); 900 spin_unlock(&dlm->spinlock);
901
902 if (wait_on_recovery)
903 dlm_wait_for_node_recovery(dlm, bit, 10000);
828 } 904 }
829 905
830 /* must wait for lock to be mastered elsewhere */ 906 /* must wait for lock to be mastered elsewhere */
831 if (blocked) 907 if (blocked)
832 goto wait; 908 goto wait;
833 909
834redo_request:
835 ret = -EINVAL; 910 ret = -EINVAL;
836 dlm_node_iter_init(mle->vote_map, &iter); 911 dlm_node_iter_init(mle->vote_map, &iter);
837 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { 912 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
@@ -856,6 +931,7 @@ wait:
856 /* keep going until the response map includes all nodes */ 931 /* keep going until the response map includes all nodes */
857 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); 932 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
858 if (ret < 0) { 933 if (ret < 0) {
934 wait_on_recovery = 1;
859 mlog(0, "%s:%.*s: node map changed, redo the " 935 mlog(0, "%s:%.*s: node map changed, redo the "
860 "master request now, blocked=%d\n", 936 "master request now, blocked=%d\n",
861 dlm->name, res->lockname.len, 937 dlm->name, res->lockname.len,
@@ -866,7 +942,7 @@ wait:
866 dlm->name, res->lockname.len, 942 dlm->name, res->lockname.len,
867 res->lockname.name, blocked); 943 res->lockname.name, blocked);
868 dlm_print_one_lock_resource(res); 944 dlm_print_one_lock_resource(res);
869 /* dlm_print_one_mle(mle); */ 945 dlm_print_one_mle(mle);
870 tries = 0; 946 tries = 0;
871 } 947 }
872 goto redo_request; 948 goto redo_request;
@@ -880,7 +956,7 @@ wait:
880 dlm_mle_detach_hb_events(dlm, mle); 956 dlm_mle_detach_hb_events(dlm, mle);
881 dlm_put_mle(mle); 957 dlm_put_mle(mle);
882 /* put the extra ref */ 958 /* put the extra ref */
883 dlm_put_mle(mle); 959 dlm_put_mle_inuse(mle);
884 960
885wake_waiters: 961wake_waiters:
886 spin_lock(&res->spinlock); 962 spin_lock(&res->spinlock);
@@ -921,12 +997,14 @@ recheck:
921 spin_unlock(&res->spinlock); 997 spin_unlock(&res->spinlock);
922 /* this will cause the master to re-assert across 998 /* this will cause the master to re-assert across
923 * the whole cluster, freeing up mles */ 999 * the whole cluster, freeing up mles */
924 ret = dlm_do_master_request(mle, res->owner); 1000 if (res->owner != dlm->node_num) {
925 if (ret < 0) { 1001 ret = dlm_do_master_request(mle, res->owner);
926 /* give recovery a chance to run */ 1002 if (ret < 0) {
927 mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); 1003 /* give recovery a chance to run */
928 msleep(500); 1004 mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
929 goto recheck; 1005 msleep(500);
1006 goto recheck;
1007 }
930 } 1008 }
931 ret = 0; 1009 ret = 0;
932 goto leave; 1010 goto leave;
@@ -962,6 +1040,12 @@ recheck:
962 "rechecking now\n", dlm->name, res->lockname.len, 1040 "rechecking now\n", dlm->name, res->lockname.len,
963 res->lockname.name); 1041 res->lockname.name);
964 goto recheck; 1042 goto recheck;
1043 } else {
1044 if (!voting_done) {
1045 mlog(0, "map not changed and voting not done "
1046 "for %s:%.*s\n", dlm->name, res->lockname.len,
1047 res->lockname.name);
1048 }
965 } 1049 }
966 1050
967 if (m != O2NM_MAX_NODES) { 1051 if (m != O2NM_MAX_NODES) {
@@ -1129,18 +1213,6 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1129 set_bit(node, mle->vote_map); 1213 set_bit(node, mle->vote_map);
1130 } else { 1214 } else {
1131 mlog(ML_ERROR, "node down! %d\n", node); 1215 mlog(ML_ERROR, "node down! %d\n", node);
1132
1133 /* if the node wasn't involved in mastery skip it,
1134 * but clear it out from the maps so that it will
1135 * not affect mastery of this lockres */
1136 clear_bit(node, mle->response_map);
1137 clear_bit(node, mle->vote_map);
1138 if (!test_bit(node, mle->maybe_map))
1139 goto next;
1140
1141 /* if we're already blocked on lock mastery, and the
1142 * dead node wasn't the expected master, or there is
1143 * another node in the maybe_map, keep waiting */
1144 if (blocked) { 1216 if (blocked) {
1145 int lowest = find_next_bit(mle->maybe_map, 1217 int lowest = find_next_bit(mle->maybe_map,
1146 O2NM_MAX_NODES, 0); 1218 O2NM_MAX_NODES, 0);
@@ -1148,54 +1220,53 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1148 /* act like it was never there */ 1220 /* act like it was never there */
1149 clear_bit(node, mle->maybe_map); 1221 clear_bit(node, mle->maybe_map);
1150 1222
1151 if (node != lowest) 1223 if (node == lowest) {
1152 goto next; 1224 mlog(0, "expected master %u died"
1153 1225 " while this node was blocked "
1154 mlog(ML_ERROR, "expected master %u died while " 1226 "waiting on it!\n", node);
1155 "this node was blocked waiting on it!\n", 1227 lowest = find_next_bit(mle->maybe_map,
1156 node); 1228 O2NM_MAX_NODES,
1157 lowest = find_next_bit(mle->maybe_map, 1229 lowest+1);
1158 O2NM_MAX_NODES, 1230 if (lowest < O2NM_MAX_NODES) {
1159 lowest+1); 1231 mlog(0, "%s:%.*s:still "
1160 if (lowest < O2NM_MAX_NODES) { 1232 "blocked. waiting on %u "
1161 mlog(0, "still blocked. waiting " 1233 "now\n", dlm->name,
1162 "on %u now\n", lowest); 1234 res->lockname.len,
1163 goto next; 1235 res->lockname.name,
1236 lowest);
1237 } else {
1238 /* mle is an MLE_BLOCK, but
1239 * there is now nothing left to
1240 * block on. we need to return
1241 * all the way back out and try
1242 * again with an MLE_MASTER.
1243 * dlm_do_local_recovery_cleanup
1244 * has already run, so the mle
1245 * refcount is ok */
1246 mlog(0, "%s:%.*s: no "
1247 "longer blocking. try to "
1248 "master this here\n",
1249 dlm->name,
1250 res->lockname.len,
1251 res->lockname.name);
1252 mle->type = DLM_MLE_MASTER;
1253 mle->u.res = res;
1254 }
1164 } 1255 }
1165
1166 /* mle is an MLE_BLOCK, but there is now
1167 * nothing left to block on. we need to return
1168 * all the way back out and try again with
1169 * an MLE_MASTER. dlm_do_local_recovery_cleanup
1170 * has already run, so the mle refcount is ok */
1171 mlog(0, "no longer blocking. we can "
1172 "try to master this here\n");
1173 mle->type = DLM_MLE_MASTER;
1174 memset(mle->maybe_map, 0,
1175 sizeof(mle->maybe_map));
1176 memset(mle->response_map, 0,
1177 sizeof(mle->maybe_map));
1178 memcpy(mle->vote_map, mle->node_map,
1179 sizeof(mle->node_map));
1180 mle->u.res = res;
1181 set_bit(dlm->node_num, mle->maybe_map);
1182
1183 ret = -EAGAIN;
1184 goto next;
1185 } 1256 }
1186 1257
1187 clear_bit(node, mle->maybe_map); 1258 /* now blank out everything, as if we had never
1188 if (node > dlm->node_num) 1259 * contacted anyone */
1189 goto next; 1260 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
1190 1261 memset(mle->response_map, 0, sizeof(mle->response_map));
1191 mlog(0, "dead node in map!\n"); 1262 /* reset the vote_map to the current node_map */
1192 /* yuck. go back and re-contact all nodes 1263 memcpy(mle->vote_map, mle->node_map,
1193 * in the vote_map, removing this node. */ 1264 sizeof(mle->node_map));
1194 memset(mle->response_map, 0, 1265 /* put myself into the maybe map */
1195 sizeof(mle->response_map)); 1266 if (mle->type != DLM_MLE_BLOCK)
1267 set_bit(dlm->node_num, mle->maybe_map);
1196 } 1268 }
1197 ret = -EAGAIN; 1269 ret = -EAGAIN;
1198next:
1199 node = dlm_bitmap_diff_iter_next(&bdi, &sc); 1270 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1200 } 1271 }
1201 return ret; 1272 return ret;
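
The rewrite collapses several special-cased goto next exits into one recovery rule, sketched below with illustrative names: whenever a node dies mid-mastery, blank the per-vote state, rebuild the voter set from the live-node map, re-enter this node as a candidate unless blocked, and return -EAGAIN so the caller reruns the election.

    #include <linux/bitmap.h>
    #include <linux/bitops.h>
    #include <linux/errno.h>

    #define MAX_NODES 255

    static int restart_election(unsigned long *maybe, unsigned long *resp,
                                unsigned long *vote, const unsigned long *nodes,
                                int blocked, int me)
    {
        bitmap_zero(maybe, MAX_NODES);          /* forget all candidates */
        bitmap_zero(resp, MAX_NODES);           /* forget all responses */
        bitmap_copy(vote, nodes, MAX_NODES);    /* only live nodes vote */
        if (!blocked)
            set_bit(me, maybe);                 /* candidate again */
        return -EAGAIN;                         /* caller redoes the vote */
    }
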
@@ -1316,7 +1387,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
1316 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; 1387 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
1317 struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; 1388 struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
1318 char *name; 1389 char *name;
1319 unsigned int namelen; 1390 unsigned int namelen, hash;
1320 int found, ret; 1391 int found, ret;
1321 int set_maybe; 1392 int set_maybe;
1322 int dispatch_assert = 0; 1393 int dispatch_assert = 0;
@@ -1331,6 +1402,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
1331 1402
1332 name = request->name; 1403 name = request->name;
1333 namelen = request->namelen; 1404 namelen = request->namelen;
1405 hash = dlm_lockid_hash(name, namelen);
1334 1406
1335 if (namelen > DLM_LOCKID_NAME_MAX) { 1407 if (namelen > DLM_LOCKID_NAME_MAX) {
1336 response = DLM_IVBUFLEN; 1408 response = DLM_IVBUFLEN;
@@ -1339,7 +1411,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
1339 1411
1340way_up_top: 1412way_up_top:
1341 spin_lock(&dlm->spinlock); 1413 spin_lock(&dlm->spinlock);
1342 res = __dlm_lookup_lockres(dlm, name, namelen); 1414 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
1343 if (res) { 1415 if (res) {
1344 spin_unlock(&dlm->spinlock); 1416 spin_unlock(&dlm->spinlock);
1345 1417
@@ -1459,21 +1531,18 @@ way_up_top:
1459 spin_unlock(&dlm->spinlock); 1531 spin_unlock(&dlm->spinlock);
1460 1532
1461 mle = (struct dlm_master_list_entry *) 1533 mle = (struct dlm_master_list_entry *)
1462 kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); 1534 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1463 if (!mle) { 1535 if (!mle) {
1464 response = DLM_MASTER_RESP_ERROR; 1536 response = DLM_MASTER_RESP_ERROR;
1465 mlog_errno(-ENOMEM); 1537 mlog_errno(-ENOMEM);
1466 goto send_response; 1538 goto send_response;
1467 } 1539 }
1468 spin_lock(&dlm->spinlock);
1469 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL,
1470 name, namelen);
1471 spin_unlock(&dlm->spinlock);
1472 goto way_up_top; 1540 goto way_up_top;
1473 } 1541 }
1474 1542
1475 // mlog(0, "this is second time thru, already allocated, " 1543 // mlog(0, "this is second time thru, already allocated, "
1476 // "add the block.\n"); 1544 // "add the block.\n");
1545 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
1477 set_bit(request->node_idx, mle->maybe_map); 1546 set_bit(request->node_idx, mle->maybe_map);
1478 list_add(&mle->list, &dlm->master_list); 1547 list_add(&mle->list, &dlm->master_list);
1479 response = DLM_MASTER_RESP_NO; 1548 response = DLM_MASTER_RESP_NO;
@@ -1556,6 +1625,8 @@ again:
1556 dlm_node_iter_init(nodemap, &iter); 1625 dlm_node_iter_init(nodemap, &iter);
1557 while ((to = dlm_node_iter_next(&iter)) >= 0) { 1626 while ((to = dlm_node_iter_next(&iter)) >= 0) {
1558 int r = 0; 1627 int r = 0;
1628 struct dlm_master_list_entry *mle = NULL;
1629
1559 mlog(0, "sending assert master to %d (%.*s)\n", to, 1630 mlog(0, "sending assert master to %d (%.*s)\n", to,
1560 namelen, lockname); 1631 namelen, lockname);
1561 memset(&assert, 0, sizeof(assert)); 1632 memset(&assert, 0, sizeof(assert));
@@ -1567,20 +1638,28 @@ again:
1567 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, 1638 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1568 &assert, sizeof(assert), to, &r); 1639 &assert, sizeof(assert), to, &r);
1569 if (tmpret < 0) { 1640 if (tmpret < 0) {
1570 mlog(ML_ERROR, "assert_master returned %d!\n", tmpret); 1641 mlog(0, "assert_master returned %d!\n", tmpret);
1571 if (!dlm_is_host_down(tmpret)) { 1642 if (!dlm_is_host_down(tmpret)) {
1572 mlog(ML_ERROR, "unhandled error!\n"); 1643 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
1573 BUG(); 1644 BUG();
1574 } 1645 }
1575 /* a node died. finish out the rest of the nodes. */ 1646 /* a node died. finish out the rest of the nodes. */
1576 mlog(ML_ERROR, "link to %d went down!\n", to); 1647 mlog(0, "link to %d went down!\n", to);
1577 /* any nonzero status return will do */ 1648 /* any nonzero status return will do */
1578 ret = tmpret; 1649 ret = tmpret;
1579 } else if (r < 0) { 1650 } else if (r < 0) {
1580 /* ok, something horribly messed. kill thyself. */ 1651 /* ok, something horribly messed. kill thyself. */
1581 mlog(ML_ERROR,"during assert master of %.*s to %u, " 1652 mlog(ML_ERROR,"during assert master of %.*s to %u, "
1582 "got %d.\n", namelen, lockname, to, r); 1653 "got %d.\n", namelen, lockname, to, r);
1583 dlm_dump_lock_resources(dlm); 1654 spin_lock(&dlm->spinlock);
1655 spin_lock(&dlm->master_lock);
1656 if (dlm_find_mle(dlm, &mle, (char *)lockname,
1657 namelen)) {
1658 dlm_print_one_mle(mle);
1659 __dlm_put_mle(mle);
1660 }
1661 spin_unlock(&dlm->master_lock);
1662 spin_unlock(&dlm->spinlock);
1584 BUG(); 1663 BUG();
1585 } else if (r == EAGAIN) { 1664 } else if (r == EAGAIN) {
1586 mlog(0, "%.*s: node %u create mles on other " 1665 mlog(0, "%.*s: node %u create mles on other "
@@ -1612,7 +1691,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1612 struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; 1691 struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
1613 struct dlm_lock_resource *res = NULL; 1692 struct dlm_lock_resource *res = NULL;
1614 char *name; 1693 char *name;
1615 unsigned int namelen; 1694 unsigned int namelen, hash;
1616 u32 flags; 1695 u32 flags;
1617 int master_request = 0; 1696 int master_request = 0;
1618 int ret = 0; 1697 int ret = 0;
@@ -1622,6 +1701,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1622 1701
1623 name = assert->name; 1702 name = assert->name;
1624 namelen = assert->namelen; 1703 namelen = assert->namelen;
1704 hash = dlm_lockid_hash(name, namelen);
1625 flags = be32_to_cpu(assert->flags); 1705 flags = be32_to_cpu(assert->flags);
1626 1706
1627 if (namelen > DLM_LOCKID_NAME_MAX) { 1707 if (namelen > DLM_LOCKID_NAME_MAX) {
@@ -1646,7 +1726,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1646 if (bit >= O2NM_MAX_NODES) { 1726 if (bit >= O2NM_MAX_NODES) {
1647 /* not necessarily an error, though less likely. 1727 /* not necessarily an error, though less likely.
1648 * could be master just re-asserting. */ 1728 * could be master just re-asserting. */
1649 mlog(ML_ERROR, "no bits set in the maybe_map, but %u " 1729 mlog(0, "no bits set in the maybe_map, but %u "
1650 "is asserting! (%.*s)\n", assert->node_idx, 1730 "is asserting! (%.*s)\n", assert->node_idx,
1651 namelen, name); 1731 namelen, name);
1652 } else if (bit != assert->node_idx) { 1732 } else if (bit != assert->node_idx) {
@@ -1658,19 +1738,36 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1658 * number winning the mastery will respond 1738 * number winning the mastery will respond
1659 * YES to mastery requests, but this node 1739 * YES to mastery requests, but this node
1660 * had no way of knowing. let it pass. */ 1740 * had no way of knowing. let it pass. */
1661 mlog(ML_ERROR, "%u is the lowest node, " 1741 mlog(0, "%u is the lowest node, "
1662 "%u is asserting. (%.*s) %u must " 1742 "%u is asserting. (%.*s) %u must "
1663 "have begun after %u won.\n", bit, 1743 "have begun after %u won.\n", bit,
1664 assert->node_idx, namelen, name, bit, 1744 assert->node_idx, namelen, name, bit,
1665 assert->node_idx); 1745 assert->node_idx);
1666 } 1746 }
1667 } 1747 }
1748 if (mle->type == DLM_MLE_MIGRATION) {
1749 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1750 mlog(0, "%s:%.*s: got cleanup assert"
1751 " from %u for migration\n",
1752 dlm->name, namelen, name,
1753 assert->node_idx);
1754 } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) {
1755 mlog(0, "%s:%.*s: got unrelated assert"
1756 " from %u for migration, ignoring\n",
1757 dlm->name, namelen, name,
1758 assert->node_idx);
1759 __dlm_put_mle(mle);
1760 spin_unlock(&dlm->master_lock);
1761 spin_unlock(&dlm->spinlock);
1762 goto done;
1763 }
1764 }
1668 } 1765 }
1669 spin_unlock(&dlm->master_lock); 1766 spin_unlock(&dlm->master_lock);
1670 1767
1671 /* ok everything checks out with the MLE 1768 /* ok everything checks out with the MLE
1672 * now check to see if there is a lockres */ 1769 * now check to see if there is a lockres */
1673 res = __dlm_lookup_lockres(dlm, name, namelen); 1770 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
1674 if (res) { 1771 if (res) {
1675 spin_lock(&res->spinlock); 1772 spin_lock(&res->spinlock);
1676 if (res->state & DLM_LOCK_RES_RECOVERING) { 1773 if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -1679,7 +1776,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1679 goto kill; 1776 goto kill;
1680 } 1777 }
1681 if (!mle) { 1778 if (!mle) {
1682 if (res->owner != assert->node_idx) { 1779 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
1780 res->owner != assert->node_idx) {
1683 mlog(ML_ERROR, "assert_master from " 1781 mlog(ML_ERROR, "assert_master from "
1684 "%u, but current owner is " 1782 "%u, but current owner is "
1685 "%u! (%.*s)\n", 1783 "%u! (%.*s)\n",
@@ -1732,6 +1830,7 @@ ok:
1732 if (mle) { 1830 if (mle) {
1733 int extra_ref = 0; 1831 int extra_ref = 0;
1734 int nn = -1; 1832 int nn = -1;
1833 int rr, err = 0;
1735 1834
1736 spin_lock(&mle->spinlock); 1835 spin_lock(&mle->spinlock);
1737 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) 1836 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
@@ -1751,27 +1850,64 @@ ok:
1751 wake_up(&mle->wq); 1850 wake_up(&mle->wq);
1752 spin_unlock(&mle->spinlock); 1851 spin_unlock(&mle->spinlock);
1753 1852
1754 if (mle->type == DLM_MLE_MIGRATION && res) { 1853 if (res) {
1755 mlog(0, "finishing off migration of lockres %.*s, "
1756 "from %u to %u\n",
1757 res->lockname.len, res->lockname.name,
1758 dlm->node_num, mle->new_master);
1759 spin_lock(&res->spinlock); 1854 spin_lock(&res->spinlock);
1760 res->state &= ~DLM_LOCK_RES_MIGRATING; 1855 if (mle->type == DLM_MLE_MIGRATION) {
1761 dlm_change_lockres_owner(dlm, res, mle->new_master); 1856 mlog(0, "finishing off migration of lockres %.*s, "
1762 BUG_ON(res->state & DLM_LOCK_RES_DIRTY); 1857 "from %u to %u\n",
1858 res->lockname.len, res->lockname.name,
1859 dlm->node_num, mle->new_master);
1860 res->state &= ~DLM_LOCK_RES_MIGRATING;
1861 dlm_change_lockres_owner(dlm, res, mle->new_master);
1862 BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
1863 } else {
1864 dlm_change_lockres_owner(dlm, res, mle->master);
1865 }
1763 spin_unlock(&res->spinlock); 1866 spin_unlock(&res->spinlock);
1764 } 1867 }
1765 /* master is known, detach if not already detached */ 1868
1766 dlm_mle_detach_hb_events(dlm, mle); 1869 /* master is known, detach if not already detached.
1767 dlm_put_mle(mle); 1870 * ensures that only one assert_master call will happen
1768 1871 * on this mle. */
1872 spin_lock(&dlm->spinlock);
1873 spin_lock(&dlm->master_lock);
1874
1875 rr = atomic_read(&mle->mle_refs.refcount);
1876 if (mle->inuse > 0) {
1877 if (extra_ref && rr < 3)
1878 err = 1;
1879 else if (!extra_ref && rr < 2)
1880 err = 1;
1881 } else {
1882 if (extra_ref && rr < 2)
1883 err = 1;
1884 else if (!extra_ref && rr < 1)
1885 err = 1;
1886 }
1887 if (err) {
1888 mlog(ML_ERROR, "%s:%.*s: got assert master from %u "
1889 "that will mess up this node, refs=%d, extra=%d, "
1890 "inuse=%d\n", dlm->name, namelen, name,
1891 assert->node_idx, rr, extra_ref, mle->inuse);
1892 dlm_print_one_mle(mle);
1893 }
1894 list_del_init(&mle->list);
1895 __dlm_mle_detach_hb_events(dlm, mle);
1896 __dlm_put_mle(mle);
1769 if (extra_ref) { 1897 if (extra_ref) {
1770 /* the assert master message now balances the extra 1898 /* the assert master message now balances the extra
1771 * ref given by the master / migration request message. 1899 * ref given by the master / migration request message.
1772 * if this is the last put, it will be removed 1900 * if this is the last put, it will be removed
1773 * from the list. */ 1901 * from the list. */
1774 dlm_put_mle(mle); 1902 __dlm_put_mle(mle);
1903 }
1904 spin_unlock(&dlm->master_lock);
1905 spin_unlock(&dlm->spinlock);
1906 } else if (res) {
1907 if (res->owner != assert->node_idx) {
1908 mlog(0, "assert_master from %u, but current "
1909 "owner is %u (%.*s), no mle\n", assert->node_idx,
1910 res->owner, namelen, name);
1775 } 1911 }
1776 } 1912 }
1777 1913
@@ -1788,12 +1924,12 @@ done:
1788 1924
1789kill: 1925kill:
1790 /* kill the caller! */ 1926 /* kill the caller! */
1927 mlog(ML_ERROR, "Bad message received from another node. Dumping state "
1928 "and killing the other node now! This node is OK and can continue.\n");
1929 __dlm_print_one_lock_resource(res);
1791 spin_unlock(&res->spinlock); 1930 spin_unlock(&res->spinlock);
1792 spin_unlock(&dlm->spinlock); 1931 spin_unlock(&dlm->spinlock);
1793 dlm_lockres_put(res); 1932 dlm_lockres_put(res);
1794 mlog(ML_ERROR, "Bad message received from another node. Dumping state "
1795 "and killing the other node now! This node is OK and can continue.\n");
1796 dlm_dump_lock_resources(dlm);
1797 dlm_put(dlm); 1933 dlm_put(dlm);
1798 return -EINVAL; 1934 return -EINVAL;
1799} 1935}
@@ -1803,7 +1939,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
1803 int ignore_higher, u8 request_from, u32 flags) 1939 int ignore_higher, u8 request_from, u32 flags)
1804{ 1940{
1805 struct dlm_work_item *item; 1941 struct dlm_work_item *item;
1806 item = kcalloc(1, sizeof(*item), GFP_KERNEL); 1942 item = kcalloc(1, sizeof(*item), GFP_NOFS);
1807 if (!item) 1943 if (!item)
1808 return -ENOMEM; 1944 return -ENOMEM;
1809 1945
@@ -1825,7 +1961,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
1825 list_add_tail(&item->list, &dlm->work_list); 1961 list_add_tail(&item->list, &dlm->work_list);
1826 spin_unlock(&dlm->work_lock); 1962 spin_unlock(&dlm->work_lock);
1827 1963
1828 schedule_work(&dlm->dispatched_work); 1964 queue_work(dlm->dlm_worker, &dlm->dispatched_work);
1829 return 0; 1965 return 0;
1830} 1966}
1831 1967
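
Deferred assert-master work moves off the shared keventd queue (schedule_work) onto the domain's private dlm->dlm_worker, so slow lock and recovery work can no longer stall unrelated kernel work or be stalled by it. A sketch using the current workqueue API:

    #include <linux/errno.h>
    #include <linux/workqueue.h>

    static struct workqueue_struct *demo_wq;

    static void demo_fn(struct work_struct *w)
    {
        /* the deferred work runs here */
    }
    static DECLARE_WORK(demo_work, demo_fn);

    static int demo_init(void)
    {
        demo_wq = create_singlethread_workqueue("demo_worker");
        if (!demo_wq)
            return -ENOMEM;
        queue_work(demo_wq, &demo_work);  /* was: schedule_work(&demo_work) */
        return 0;
    }
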
@@ -1866,6 +2002,23 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
1866 } 2002 }
1867 } 2003 }
1868 2004
2005 /*
2006 * If we're migrating this lock to someone else, we are no
 2007 * longer allowed to assert our own mastery. OTOH, we need to
2008 * prevent migration from starting while we're still asserting
2009 * our dominance. The reserved ast delays migration.
2010 */
2011 spin_lock(&res->spinlock);
2012 if (res->state & DLM_LOCK_RES_MIGRATING) {
2013 mlog(0, "Someone asked us to assert mastery, but we're "
2014 "in the middle of migration. Skipping assert, "
2015 "the new master will handle that.\n");
2016 spin_unlock(&res->spinlock);
2017 goto put;
2018 } else
2019 __dlm_lockres_reserve_ast(res);
2020 spin_unlock(&res->spinlock);
2021
1869 /* this call now finishes out the nodemap 2022 /* this call now finishes out the nodemap
1870 * even if one or more nodes die */ 2023 * even if one or more nodes die */
1871 mlog(0, "worker about to master %.*s here, this=%u\n", 2024 mlog(0, "worker about to master %.*s here, this=%u\n",
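
The worker must not assert mastery for a lockres that is mid-migration, and migration must not start while an assert is in flight; reserving an AST under res->spinlock gives both directions of that exclusion, and the reservation is released once the broadcast completes. The gate, with hypothetical stubs:

    #include <linux/spinlock.h>

    #define RES_MIGRATING 0x01

    struct demo_res {
        spinlock_t lock;
        unsigned int state;
    };

    extern void reserve_ast(struct demo_res *r);    /* hypothetical */
    extern void release_ast(struct demo_res *r);
    extern void broadcast_assert(struct demo_res *r);
    extern void res_put(struct demo_res *r);

    static void demo_assert_worker(struct demo_res *r)
    {
        spin_lock(&r->lock);
        if (r->state & RES_MIGRATING) {
            spin_unlock(&r->lock);
            goto put;                   /* the new master asserts instead */
        }
        reserve_ast(r);                 /* migration must now wait */
        spin_unlock(&r->lock);

        broadcast_assert(r);
        release_ast(r);                 /* migration may proceed */
    put:
        res_put(r);
    }
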
@@ -1875,9 +2028,14 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
1875 nodemap, flags); 2028 nodemap, flags);
1876 if (ret < 0) { 2029 if (ret < 0) {
1877 /* no need to restart, we are done */ 2030 /* no need to restart, we are done */
1878 mlog_errno(ret); 2031 if (!dlm_is_host_down(ret))
2032 mlog_errno(ret);
1879 } 2033 }
1880 2034
2035 /* Ok, we've asserted ourselves. Let's let migration start. */
2036 dlm_lockres_release_ast(dlm, res);
2037
2038put:
1881 dlm_lockres_put(res); 2039 dlm_lockres_put(res);
1882 2040
1883 mlog(0, "finished with dlm_assert_master_worker\n"); 2041 mlog(0, "finished with dlm_assert_master_worker\n");
@@ -1916,6 +2074,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
1916 BUG(); 2074 BUG();
1917 /* host is down, so answer for that node would be 2075 /* host is down, so answer for that node would be
1918 * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ 2076 * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */
2077 ret = 0;
1919 } 2078 }
1920 2079
1921 if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { 2080 if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
@@ -2016,14 +2175,14 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
2016 */ 2175 */
2017 2176
2018 ret = -ENOMEM; 2177 ret = -ENOMEM;
2019 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL); 2178 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
2020 if (!mres) { 2179 if (!mres) {
2021 mlog_errno(ret); 2180 mlog_errno(ret);
2022 goto leave; 2181 goto leave;
2023 } 2182 }
2024 2183
2025 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 2184 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
2026 GFP_KERNEL); 2185 GFP_NOFS);
2027 if (!mle) { 2186 if (!mle) {
2028 mlog_errno(ret); 2187 mlog_errno(ret);
2029 goto leave; 2188 goto leave;
@@ -2117,7 +2276,7 @@ fail:
2117 * take both dlm->spinlock and dlm->master_lock */ 2276 * take both dlm->spinlock and dlm->master_lock */
2118 spin_lock(&dlm->spinlock); 2277 spin_lock(&dlm->spinlock);
2119 spin_lock(&dlm->master_lock); 2278 spin_lock(&dlm->master_lock);
2120 dlm_get_mle(mle); 2279 dlm_get_mle_inuse(mle);
2121 spin_unlock(&dlm->master_lock); 2280 spin_unlock(&dlm->master_lock);
2122 spin_unlock(&dlm->spinlock); 2281 spin_unlock(&dlm->spinlock);
2123 2282
@@ -2134,7 +2293,10 @@ fail:
2134 /* migration failed, detach and clean up mle */ 2293 /* migration failed, detach and clean up mle */
2135 dlm_mle_detach_hb_events(dlm, mle); 2294 dlm_mle_detach_hb_events(dlm, mle);
2136 dlm_put_mle(mle); 2295 dlm_put_mle(mle);
2137 dlm_put_mle(mle); 2296 dlm_put_mle_inuse(mle);
2297 spin_lock(&res->spinlock);
2298 res->state &= ~DLM_LOCK_RES_MIGRATING;
2299 spin_unlock(&res->spinlock);
2138 goto leave; 2300 goto leave;
2139 } 2301 }
2140 2302
@@ -2164,8 +2326,8 @@ fail:
2164 /* avoid hang during shutdown when migrating lockres 2326 /* avoid hang during shutdown when migrating lockres
2165 * to a node which also goes down */ 2327 * to a node which also goes down */
2166 if (dlm_is_node_dead(dlm, target)) { 2328 if (dlm_is_node_dead(dlm, target)) {
2167 mlog(0, "%s:%.*s: expected migration target %u " 2329 mlog(0, "%s:%.*s: expected migration "
2168 "is no longer up. restarting.\n", 2330 "target %u is no longer up, restarting\n",
2169 dlm->name, res->lockname.len, 2331 dlm->name, res->lockname.len,
2170 res->lockname.name, target); 2332 res->lockname.name, target);
2171 ret = -ERESTARTSYS; 2333 ret = -ERESTARTSYS;
@@ -2175,7 +2337,10 @@ fail:
2175 /* migration failed, detach and clean up mle */ 2337 /* migration failed, detach and clean up mle */
2176 dlm_mle_detach_hb_events(dlm, mle); 2338 dlm_mle_detach_hb_events(dlm, mle);
2177 dlm_put_mle(mle); 2339 dlm_put_mle(mle);
2178 dlm_put_mle(mle); 2340 dlm_put_mle_inuse(mle);
2341 spin_lock(&res->spinlock);
2342 res->state &= ~DLM_LOCK_RES_MIGRATING;
2343 spin_unlock(&res->spinlock);
2179 goto leave; 2344 goto leave;
2180 } 2345 }
2181 /* TODO: if node died: stop, clean up, return error */ 2346 /* TODO: if node died: stop, clean up, return error */
@@ -2191,7 +2356,7 @@ fail:
2191 2356
2192 /* master is known, detach if not already detached */ 2357 /* master is known, detach if not already detached */
2193 dlm_mle_detach_hb_events(dlm, mle); 2358 dlm_mle_detach_hb_events(dlm, mle);
2194 dlm_put_mle(mle); 2359 dlm_put_mle_inuse(mle);
2195 ret = 0; 2360 ret = 0;
2196 2361
2197 dlm_lockres_calc_usage(dlm, res); 2362 dlm_lockres_calc_usage(dlm, res);
@@ -2462,7 +2627,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
2462 struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; 2627 struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
2463 struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; 2628 struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
2464 const char *name; 2629 const char *name;
2465 unsigned int namelen; 2630 unsigned int namelen, hash;
2466 int ret = 0; 2631 int ret = 0;
2467 2632
2468 if (!dlm_grab(dlm)) 2633 if (!dlm_grab(dlm))
@@ -2470,10 +2635,11 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
2470 2635
2471 name = migrate->name; 2636 name = migrate->name;
2472 namelen = migrate->namelen; 2637 namelen = migrate->namelen;
2638 hash = dlm_lockid_hash(name, namelen);
2473 2639
2474 /* preallocate.. if this fails, abort */ 2640 /* preallocate.. if this fails, abort */
2475 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 2641 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
2476 GFP_KERNEL); 2642 GFP_NOFS);
2477 2643
2478 if (!mle) { 2644 if (!mle) {
2479 ret = -ENOMEM; 2645 ret = -ENOMEM;
@@ -2482,7 +2648,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
2482 2648
2483 /* check for pre-existing lock */ 2649 /* check for pre-existing lock */
2484 spin_lock(&dlm->spinlock); 2650 spin_lock(&dlm->spinlock);
2485 res = __dlm_lookup_lockres(dlm, name, namelen); 2651 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
2486 spin_lock(&dlm->master_lock); 2652 spin_lock(&dlm->master_lock);
2487 2653
2488 if (res) { 2654 if (res) {
@@ -2580,6 +2746,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
2580 /* remove it from the list so that only one 2746 /* remove it from the list so that only one
2581 * mle will be found */ 2747 * mle will be found */
2582 list_del_init(&tmp->list); 2748 list_del_init(&tmp->list);
2749 __dlm_mle_detach_hb_events(dlm, mle);
2583 } 2750 }
2584 spin_unlock(&tmp->spinlock); 2751 spin_unlock(&tmp->spinlock);
2585 } 2752 }
@@ -2601,6 +2768,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
2601 struct list_head *iter, *iter2; 2768 struct list_head *iter, *iter2;
2602 struct dlm_master_list_entry *mle; 2769 struct dlm_master_list_entry *mle;
2603 struct dlm_lock_resource *res; 2770 struct dlm_lock_resource *res;
2771 unsigned int hash;
2604 2772
2605 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); 2773 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
2606top: 2774top:
@@ -2640,7 +2808,7 @@ top:
2640 * may result in the mle being unlinked and 2808 * may result in the mle being unlinked and
2641 * freed, but there may still be a process 2809 * freed, but there may still be a process
2642 * waiting in the dlmlock path which is fine. */ 2810 * waiting in the dlmlock path which is fine. */
2643 mlog(ML_ERROR, "node %u was expected master\n", 2811 mlog(0, "node %u was expected master\n",
2644 dead_node); 2812 dead_node);
2645 atomic_set(&mle->woken, 1); 2813 atomic_set(&mle->woken, 1);
2646 spin_unlock(&mle->spinlock); 2814 spin_unlock(&mle->spinlock);
@@ -2673,19 +2841,21 @@ top:
2673 2841
2674 /* remove from the list early. NOTE: unlinking 2842 /* remove from the list early. NOTE: unlinking
2675 * list_head while in list_for_each_safe */ 2843 * list_head while in list_for_each_safe */
2844 __dlm_mle_detach_hb_events(dlm, mle);
2676 spin_lock(&mle->spinlock); 2845 spin_lock(&mle->spinlock);
2677 list_del_init(&mle->list); 2846 list_del_init(&mle->list);
2678 atomic_set(&mle->woken, 1); 2847 atomic_set(&mle->woken, 1);
2679 spin_unlock(&mle->spinlock); 2848 spin_unlock(&mle->spinlock);
2680 wake_up(&mle->wq); 2849 wake_up(&mle->wq);
2681 2850
2682 mlog(0, "node %u died during migration from " 2851 mlog(0, "%s: node %u died during migration from "
2683 "%u to %u!\n", dead_node, 2852 "%u to %u!\n", dlm->name, dead_node,
2684 mle->master, mle->new_master); 2853 mle->master, mle->new_master);
2685 /* if there is a lockres associated with this 2854 /* if there is a lockres associated with this
2686 * mle, find it and set its owner to UNKNOWN */ 2855 * mle, find it and set its owner to UNKNOWN */
2856 hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
2687 res = __dlm_lookup_lockres(dlm, mle->u.name.name, 2857 res = __dlm_lookup_lockres(dlm, mle->u.name.name,
2688 mle->u.name.len); 2858 mle->u.name.len, hash);
2689 if (res) { 2859 if (res) {
2690 /* unfortunately if we hit this rare case, our 2860 /* unfortunately if we hit this rare case, our
2691 * lock ordering is messed. we need to drop 2861 * lock ordering is messed. we need to drop
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 805cbabac051..29b2845f370d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -98,8 +98,8 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
98 98
99static u64 dlm_get_next_mig_cookie(void); 99static u64 dlm_get_next_mig_cookie(void);
100 100
101static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED; 101static DEFINE_SPINLOCK(dlm_reco_state_lock);
102static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED; 102static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
103static u64 dlm_mig_cookie = 1; 103static u64 dlm_mig_cookie = 1;
104 104
105static u64 dlm_get_next_mig_cookie(void) 105static u64 dlm_get_next_mig_cookie(void)
@@ -115,12 +115,37 @@ static u64 dlm_get_next_mig_cookie(void)
115 return c; 115 return c;
116} 116}
117 117
118static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm,
119 u8 dead_node)
120{
121 assert_spin_locked(&dlm->spinlock);
122 if (dlm->reco.dead_node != dead_node)
123 mlog(0, "%s: changing dead_node from %u to %u\n",
124 dlm->name, dlm->reco.dead_node, dead_node);
125 dlm->reco.dead_node = dead_node;
126}
127
128static inline void dlm_set_reco_master(struct dlm_ctxt *dlm,
129 u8 master)
130{
131 assert_spin_locked(&dlm->spinlock);
132 mlog(0, "%s: changing new_master from %u to %u\n",
133 dlm->name, dlm->reco.new_master, master);
134 dlm->reco.new_master = master;
135}
136
137static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm)
138{
139 assert_spin_locked(&dlm->spinlock);
140 clear_bit(dlm->reco.dead_node, dlm->recovery_map);
141 dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
142 dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
143}
144
118static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) 145static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
119{ 146{
120 spin_lock(&dlm->spinlock); 147 spin_lock(&dlm->spinlock);
121 clear_bit(dlm->reco.dead_node, dlm->recovery_map); 148 __dlm_reset_recovery(dlm);
122 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
123 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
124 spin_unlock(&dlm->spinlock); 149 spin_unlock(&dlm->spinlock);
125} 150}
126 151
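
Every write to reco.dead_node and reco.new_master now funnels through a setter that asserts dlm->spinlock and logs the transition, and __dlm_reset_recovery() is the lock-held flavor wrapped by dlm_reset_recovery(). A sketch of that locked/unlocked helper convention:

    #include <linux/printk.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct demo_dom {
        spinlock_t lock;
        u8 state;
    };

    /* __helper expects the lock held; the plain helper takes it. All
     * transitions pass one logging point instead of direct stores. */
    static inline void __demo_set_state(struct demo_dom *d, u8 v)
    {
        assert_spin_locked(&d->lock);
        if (d->state != v)
            pr_info("state %u -> %u\n", d->state, v);
        d->state = v;
    }

    static inline void demo_set_state(struct demo_dom *d, u8 v)
    {
        spin_lock(&d->lock);
        __demo_set_state(d, v);
        spin_unlock(&d->lock);
    }
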
@@ -132,12 +157,21 @@ void dlm_dispatch_work(void *data)
132 struct list_head *iter, *iter2; 157 struct list_head *iter, *iter2;
133 struct dlm_work_item *item; 158 struct dlm_work_item *item;
134 dlm_workfunc_t *workfunc; 159 dlm_workfunc_t *workfunc;
160 int tot=0;
161
162 if (!dlm_joined(dlm))
163 return;
135 164
136 spin_lock(&dlm->work_lock); 165 spin_lock(&dlm->work_lock);
137 list_splice_init(&dlm->work_list, &tmp_list); 166 list_splice_init(&dlm->work_list, &tmp_list);
138 spin_unlock(&dlm->work_lock); 167 spin_unlock(&dlm->work_lock);
139 168
140 list_for_each_safe(iter, iter2, &tmp_list) { 169 list_for_each_safe(iter, iter2, &tmp_list) {
170 tot++;
171 }
172 mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
173
174 list_for_each_safe(iter, iter2, &tmp_list) {
141 item = list_entry(iter, struct dlm_work_item, list); 175 item = list_entry(iter, struct dlm_work_item, list);
142 workfunc = item->func; 176 workfunc = item->func;
143 list_del_init(&item->list); 177 list_del_init(&item->list);
@@ -220,6 +254,52 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
220 * 254 *
221 */ 255 */
222 256
257static void dlm_print_reco_node_status(struct dlm_ctxt *dlm)
258{
259 struct dlm_reco_node_data *ndata;
260 struct dlm_lock_resource *res;
261
262 mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n",
263 dlm->name, dlm->dlm_reco_thread_task->pid,
264 dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive",
265 dlm->reco.dead_node, dlm->reco.new_master);
266
267 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
268 char *st = "unknown";
269 switch (ndata->state) {
270 case DLM_RECO_NODE_DATA_INIT:
271 st = "init";
272 break;
273 case DLM_RECO_NODE_DATA_REQUESTING:
274 st = "requesting";
275 break;
276 case DLM_RECO_NODE_DATA_DEAD:
277 st = "dead";
278 break;
279 case DLM_RECO_NODE_DATA_RECEIVING:
280 st = "receiving";
281 break;
282 case DLM_RECO_NODE_DATA_REQUESTED:
283 st = "requested";
284 break;
285 case DLM_RECO_NODE_DATA_DONE:
286 st = "done";
287 break;
288 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
289 st = "finalize-sent";
290 break;
291 default:
292 st = "bad";
293 break;
294 }
295 mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n",
296 dlm->name, ndata->node_num, st);
297 }
298 list_for_each_entry(res, &dlm->reco.resources, recovering) {
299 mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n",
300 dlm->name, res->lockname.len, res->lockname.name);
301 }
302}
223 303
224#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) 304#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
225 305
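
For what it is worth, the state-to-string switch above could also be written as a designated-initializer table; this alternative sketch assumes the DLM_RECO_NODE_DATA_* constants in dlmcommon.h form a small zero-based enum, which the switch form does not require:

    #include <linux/kernel.h>
    #include "dlmcommon.h"

    static const char * const reco_state_str[] = {
        [DLM_RECO_NODE_DATA_INIT]          = "init",
        [DLM_RECO_NODE_DATA_REQUESTING]    = "requesting",
        [DLM_RECO_NODE_DATA_DEAD]          = "dead",
        [DLM_RECO_NODE_DATA_RECEIVING]     = "receiving",
        [DLM_RECO_NODE_DATA_REQUESTED]     = "requested",
        [DLM_RECO_NODE_DATA_DONE]          = "done",
        [DLM_RECO_NODE_DATA_FINALIZE_SENT] = "finalize-sent",
    };

    static const char *reco_state_name(unsigned int st)
    {
        if (st >= ARRAY_SIZE(reco_state_str) || !reco_state_str[st])
            return "bad";
        return reco_state_str[st];
    }
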
@@ -267,11 +347,23 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
267{ 347{
268 int dead; 348 int dead;
269 spin_lock(&dlm->spinlock); 349 spin_lock(&dlm->spinlock);
270 dead = test_bit(node, dlm->domain_map); 350 dead = !test_bit(node, dlm->domain_map);
271 spin_unlock(&dlm->spinlock); 351 spin_unlock(&dlm->spinlock);
272 return dead; 352 return dead;
273} 353}
274 354
355/* returns true if node is no longer in the domain
356 * could be dead or just not joined */
357static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
358{
359 int recovered;
360 spin_lock(&dlm->spinlock);
361 recovered = !test_bit(node, dlm->recovery_map);
362 spin_unlock(&dlm->spinlock);
363 return recovered;
364}
365
366
275int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) 367int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
276{ 368{
277 if (timeout) { 369 if (timeout) {
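
Note the one-character bugfix at the top of this hunk: test_bit(node, domain_map) is nonzero while the node is still alive, so dlm_is_node_dead() used to return the opposite of its name; the negation fixes it, and dlm_is_node_recovered() applies the same (correct) shape to recovery_map. Callers pair such predicates with wait_event_timeout(), roughly:

    #include <linux/jiffies.h>
    #include <linux/wait.h>

    extern wait_queue_head_t reco_wq;   /* woken as the maps change */
    extern int node_gone(int node);     /* e.g. !test_bit(node, map) */

    static void wait_node_gone(int node, int timeout_ms)
    {
        if (timeout_ms)
            wait_event_timeout(reco_wq, node_gone(node),
                               msecs_to_jiffies(timeout_ms));
        else
            wait_event(reco_wq, node_gone(node));
    }
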
@@ -290,6 +382,24 @@ int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
290 return 0; 382 return 0;
291} 383}
292 384
385int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
386{
387 if (timeout) {
388 mlog(0, "%s: waiting %dms for notification of "
389 "recovery of node %u\n", dlm->name, timeout, node);
390 wait_event_timeout(dlm->dlm_reco_thread_wq,
391 dlm_is_node_recovered(dlm, node),
392 msecs_to_jiffies(timeout));
393 } else {
394 mlog(0, "%s: waiting indefinitely for notification "
395 "of recovery of node %u\n", dlm->name, node);
396 wait_event(dlm->dlm_reco_thread_wq,
397 dlm_is_node_recovered(dlm, node));
398 }
399 /* for now, return 0 */
400 return 0;
401}
402
293/* callers of the top-level api calls (dlmlock/dlmunlock) should 403/* callers of the top-level api calls (dlmlock/dlmunlock) should
294 * block on the dlm->reco.event when recovery is in progress. 404 * block on the dlm->reco.event when recovery is in progress.
295 * the dlm recovery thread will set this state when it begins 405 * the dlm recovery thread will set this state when it begins
@@ -308,6 +418,13 @@ static int dlm_in_recovery(struct dlm_ctxt *dlm)
308 418
309void dlm_wait_for_recovery(struct dlm_ctxt *dlm) 419void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
310{ 420{
421 if (dlm_in_recovery(dlm)) {
422 mlog(0, "%s: reco thread %d in recovery: "
423 "state=%d, master=%u, dead=%u\n",
424 dlm->name, dlm->dlm_reco_thread_task->pid,
425 dlm->reco.state, dlm->reco.new_master,
426 dlm->reco.dead_node);
427 }
311 wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); 428 wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
312} 429}
313 430
@@ -341,7 +458,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 		mlog(0, "new master %u died while recovering %u!\n",
 		     dlm->reco.new_master, dlm->reco.dead_node);
 		/* unset the new_master, leave dead_node */
-		dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+		dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
 	}
 
 	/* select a target to recover */
@@ -350,14 +467,14 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 
 		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
 		if (bit >= O2NM_MAX_NODES || bit < 0)
-			dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+			dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
 		else
-			dlm->reco.dead_node = bit;
+			dlm_set_reco_dead_node(dlm, bit);
 	} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
 		/* BUG? */
 		mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
 		     dlm->reco.dead_node);
-		dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+		dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
 	}
 
 	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
@@ -366,7 +483,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 		/* return to main thread loop and sleep. */
 		return 0;
 	}
-	mlog(0, "recovery thread found node %u in the recovery map!\n",
+	mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
+	     dlm->name, dlm->dlm_reco_thread_task->pid,
 	     dlm->reco.dead_node);
 	spin_unlock(&dlm->spinlock);
 
@@ -389,8 +507,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 		}
 		mlog(0, "another node will master this recovery session.\n");
 	}
-	mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
-	     dlm->name, dlm->reco.new_master,
+	mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
+	     dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master,
 	     dlm->node_num, dlm->reco.dead_node);
 
 	/* it is safe to start everything back up here
@@ -402,11 +520,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 	return 0;
 
 master_here:
-	mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
+	mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n",
+	     dlm->dlm_reco_thread_task->pid,
 	     dlm->name, dlm->reco.dead_node, dlm->node_num);
 
 	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
 	if (status < 0) {
+		/* we should never hit this anymore */
 		mlog(ML_ERROR, "error %d remastering locks for node %u, "
 		     "retrying.\n", status, dlm->reco.dead_node);
 		/* yield a bit to allow any final network messages
@@ -433,9 +553,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 	int destroy = 0;
 	int pass = 0;
 
-	status = dlm_init_recovery_area(dlm, dead_node);
-	if (status < 0)
-		goto leave;
+	do {
+		/* we have become recovery master.  there is no escaping
+		 * this, so just keep trying until we get it. */
+		status = dlm_init_recovery_area(dlm, dead_node);
+		if (status < 0) {
+			mlog(ML_ERROR, "%s: failed to alloc recovery area, "
+			     "retrying\n", dlm->name);
+			msleep(1000);
+		}
+	} while (status != 0);
 
 	/* safe to access the node data list without a lock, since this
 	 * process is the only one to change the list */
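
Once this node has been elected recovery master there is no caller to report an allocation failure to, so the old goto-on-error turns into a sleep-and-retry loop that cannot give up. The bare pattern, with an illustrative (hypothetical) fallible step:

	#include <linux/delay.h>

	int ret;

	do {
		ret = try_allocate();	/* hypothetical step that can fail */
		if (ret < 0)
			msleep(1000);	/* back off, then retry forever */
	} while (ret != 0);
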
@@ -452,16 +579,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 			continue;
 		}
 
-		status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
-		if (status < 0) {
-			mlog_errno(status);
-			if (dlm_is_host_down(status))
-				ndata->state = DLM_RECO_NODE_DATA_DEAD;
-			else {
-				destroy = 1;
-				goto leave;
+		do {
+			status = dlm_request_all_locks(dlm, ndata->node_num,
+						       dead_node);
+			if (status < 0) {
+				mlog_errno(status);
+				if (dlm_is_host_down(status)) {
+					/* node died, ignore it for recovery */
+					status = 0;
+					ndata->state = DLM_RECO_NODE_DATA_DEAD;
+					/* wait for the domain map to catch up
+					 * with the network state. */
+					wait_event_timeout(dlm->dlm_reco_thread_wq,
+							   dlm_is_node_dead(dlm,
+								ndata->node_num),
+							   msecs_to_jiffies(1000));
+					mlog(0, "waited 1 sec for %u, "
+					     "dead? %s\n", ndata->node_num,
+					     dlm_is_node_dead(dlm, ndata->node_num) ?
+					     "yes" : "no");
+				} else {
+					/* -ENOMEM on the other node */
+					mlog(0, "%s: node %u returned "
+					     "%d during recovery, retrying "
+					     "after a short wait\n",
+					     dlm->name, ndata->node_num,
+					     status);
+					msleep(100);
+				}
 			}
-		}
+		} while (status != 0);
 
 		switch (ndata->state) {
 		case DLM_RECO_NODE_DATA_INIT:
@@ -473,10 +620,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 				mlog(0, "node %u died after requesting "
 				     "recovery info for node %u\n",
 				     ndata->node_num, dead_node);
-				// start all over
-				destroy = 1;
-				status = -EAGAIN;
-				goto leave;
+				/* fine.  don't need this node's info.
+				 * continue without it. */
+				break;
 			case DLM_RECO_NODE_DATA_REQUESTING:
 				ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
 				mlog(0, "now receiving recovery data from "
@@ -520,35 +666,26 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 				BUG();
 				break;
 			case DLM_RECO_NODE_DATA_DEAD:
-				mlog(ML_NOTICE, "node %u died after "
+				mlog(0, "node %u died after "
 				     "requesting recovery info for "
 				     "node %u\n", ndata->node_num,
 				     dead_node);
-				spin_unlock(&dlm_reco_state_lock);
-				// start all over
-				destroy = 1;
-				status = -EAGAIN;
-				/* instead of spinning like crazy here,
-				 * wait for the domain map to catch up
-				 * with the network state. otherwise this
-				 * can be hit hundreds of times before
-				 * the node is really seen as dead. */
-				wait_event_timeout(dlm->dlm_reco_thread_wq,
-						   dlm_is_node_dead(dlm,
-							ndata->node_num),
-						   msecs_to_jiffies(1000));
-				mlog(0, "waited 1 sec for %u, "
-				     "dead? %s\n", ndata->node_num,
-				     dlm_is_node_dead(dlm, ndata->node_num) ?
-				     "yes" : "no");
-				goto leave;
+				break;
 			case DLM_RECO_NODE_DATA_RECEIVING:
 			case DLM_RECO_NODE_DATA_REQUESTED:
+				mlog(0, "%s: node %u still in state %s\n",
+				     dlm->name, ndata->node_num,
+				     ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
+				     "receiving" : "requested");
 				all_nodes_done = 0;
 				break;
 			case DLM_RECO_NODE_DATA_DONE:
+				mlog(0, "%s: node %u state is done\n",
+				     dlm->name, ndata->node_num);
 				break;
 			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+				mlog(0, "%s: node %u state is finalize\n",
+				     dlm->name, ndata->node_num);
 				break;
 			}
 		}
@@ -578,7 +715,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 			     jiffies, dlm->reco.dead_node,
 			     dlm->node_num, dlm->reco.new_master);
 			destroy = 1;
-			status = ret;
+			status = 0;
 			/* rescan everything marked dirty along the way */
 			dlm_kick_thread(dlm, NULL);
 			break;
@@ -591,7 +728,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 
 	}
 
-leave:
 	if (destroy)
 		dlm_destroy_recovery_area(dlm, dead_node);
 
@@ -617,7 +753,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
 	}
 	BUG_ON(num == dead_node);
 
-	ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
+	ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS);
 	if (!ndata) {
 		dlm_destroy_recovery_area(dlm, dead_node);
 		return -ENOMEM;
@@ -691,16 +827,25 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
 	if (!dlm_grab(dlm))
 		return -EINVAL;
 
+	if (lr->dead_node != dlm->reco.dead_node) {
+		mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local "
+		     "dead_node is %u\n", dlm->name, lr->node_idx,
+		     lr->dead_node, dlm->reco.dead_node);
+		dlm_print_reco_node_status(dlm);
+		/* this is a hack */
+		dlm_put(dlm);
+		return -ENOMEM;
+	}
 	BUG_ON(lr->dead_node != dlm->reco.dead_node);
 
-	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	item = kcalloc(1, sizeof(*item), GFP_NOFS);
 	if (!item) {
 		dlm_put(dlm);
 		return -ENOMEM;
 	}
 
 	/* this will get freed by dlm_request_all_locks_worker */
-	buf = (char *) __get_free_page(GFP_KERNEL);
+	buf = (char *) __get_free_page(GFP_NOFS);
 	if (!buf) {
 		kfree(item);
 		dlm_put(dlm);
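
The allocations in this handler (and several below) switch from GFP_KERNEL to GFP_NOFS. GFP_KERNEL allows the allocator to recurse into filesystem writeback to reclaim memory, which can deadlock when the allocating thread is itself part of the machinery that writeback may be waiting on; GFP_NOFS still sleeps but will not re-enter the filesystem. A hedged illustration (the struct is a stand-in, not a real dlm type):

	#include <linux/slab.h>

	struct work_payload *item;	/* hypothetical structure */

	/* safe in reclaim-sensitive fs/cluster paths */
	item = kcalloc(1, sizeof(*item), GFP_NOFS);
	if (!item)
		return -ENOMEM;
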
@@ -715,7 +860,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
 	spin_lock(&dlm->work_lock);
 	list_add_tail(&item->list, &dlm->work_list);
 	spin_unlock(&dlm->work_lock);
-	schedule_work(&dlm->dispatched_work);
+	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
 
 	dlm_put(dlm);
 	return 0;
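
Work dispatch also moves from schedule_work(), which shares the kernel-wide default workqueue, to queue_work() on a per-domain workqueue (dlm->dlm_worker, created elsewhere in this patch set), so dlm work can neither stall nor be stalled by unrelated work items. A sketch of the idiom with illustrative names (note that the work-item initialization macros changed signatures across kernel versions):

	#include <linux/workqueue.h>

	static struct workqueue_struct *wq;

	wq = create_singlethread_workqueue("my_worker");
	if (!wq)
		return -ENOMEM;

	queue_work(wq, &my_work);	/* instead of schedule_work(&my_work) */

	destroy_workqueue(wq);		/* on teardown */
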
@@ -730,32 +875,34 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	struct list_head *iter;
 	int ret;
 	u8 dead_node, reco_master;
+	int skip_all_done = 0;
 
 	dlm = item->dlm;
 	dead_node = item->u.ral.dead_node;
 	reco_master = item->u.ral.reco_master;
 	mres = (struct dlm_migratable_lockres *)data;
 
+	mlog(0, "%s: recovery worker started, dead=%u, master=%u\n",
+	     dlm->name, dead_node, reco_master);
+
 	if (dead_node != dlm->reco.dead_node ||
 	    reco_master != dlm->reco.new_master) {
-		/* show extra debug info if the recovery state is messed */
-		mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
-		     "request(dead=%u, master=%u)\n",
-		     dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
-		     dead_node, reco_master);
-		mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
-		     "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
-		     dlm->name, mres->lockname_len, mres->lockname, mres->master,
-		     mres->num_locks, mres->total_locks, mres->flags,
-		     dlm_get_lock_cookie_node(mres->ml[0].cookie),
-		     dlm_get_lock_cookie_seq(mres->ml[0].cookie),
-		     mres->ml[0].list, mres->ml[0].flags,
-		     mres->ml[0].type, mres->ml[0].convert_type,
-		     mres->ml[0].highest_blocked, mres->ml[0].node);
-		BUG();
+		/* worker could have been created before the recovery master
+		 * died.  if so, do not continue, but do not error. */
+		if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
+			mlog(ML_NOTICE, "%s: will not send recovery state, "
+			     "recovery master %u died, thread=(dead=%u,mas=%u)"
+			     " current=(dead=%u,mas=%u)\n", dlm->name,
+			     reco_master, dead_node, reco_master,
+			     dlm->reco.dead_node, dlm->reco.new_master);
+		} else {
+			mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
+			     "master=%u), request(dead=%u, master=%u)\n",
+			     dlm->name, dlm->reco.dead_node,
+			     dlm->reco.new_master, dead_node, reco_master);
+		}
+		goto leave;
 	}
-	BUG_ON(dead_node != dlm->reco.dead_node);
-	BUG_ON(reco_master != dlm->reco.new_master);
 
 	/* lock resources should have already been moved to the
 	 * dlm->reco.resources list.  now move items from that list
@@ -766,12 +913,20 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
 
 	/* now we can begin blasting lockreses without the dlm lock */
+
+	/* any errors returned will be due to the new_master dying,
+	 * the dlm_reco_thread should detect this */
 	list_for_each(iter, &resources) {
 		res = list_entry (iter, struct dlm_lock_resource, recovering);
 		ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
 					   DLM_MRES_RECOVERY);
-		if (ret < 0)
-			mlog_errno(ret);
+		if (ret < 0) {
+			mlog(ML_ERROR, "%s: node %u went down while sending "
+			     "recovery state for dead node %u, ret=%d\n", dlm->name,
+			     reco_master, dead_node, ret);
+			skip_all_done = 1;
+			break;
+		}
 	}
 
 	/* move the resources back to the list */
@@ -779,10 +934,15 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	list_splice_init(&resources, &dlm->reco.resources);
 	spin_unlock(&dlm->spinlock);
 
-	ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
-	if (ret < 0)
-		mlog_errno(ret);
-
+	if (!skip_all_done) {
+		ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
+		if (ret < 0) {
+			mlog(ML_ERROR, "%s: node %u went down while sending "
+			     "recovery all-done for dead node %u, ret=%d\n",
+			     dlm->name, reco_master, dead_node, ret);
+		}
+	}
+leave:
 	free_page((unsigned long)data);
 }
 
@@ -801,8 +961,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
 
 	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
 				 sizeof(done_msg), send_to, &tmpret);
-	/* negative status is ignored by the caller */
-	if (ret >= 0)
+	if (ret < 0) {
+		if (!dlm_is_host_down(ret)) {
+			mlog_errno(ret);
+			mlog(ML_ERROR, "%s: unknown error sending data-done "
+			     "to %u\n", dlm->name, send_to);
+			BUG();
+		}
+	} else
 		ret = tmpret;
 	return ret;
 }
@@ -822,7 +988,11 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
 	mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
 	     "node_idx=%u, this node=%u\n", done->dead_node,
 	     dlm->reco.dead_node, done->node_idx, dlm->node_num);
-	BUG_ON(done->dead_node != dlm->reco.dead_node);
+
+	mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node),
+			"Got DATA DONE: dead_node=%u, reco.dead_node=%u, "
+			"node_idx=%u, this node=%u\n", done->dead_node,
+			dlm->reco.dead_node, done->node_idx, dlm->node_num);
 
 	spin_lock(&dlm_reco_state_lock);
 	list_for_each(iter, &dlm->reco.node_data) {
@@ -905,13 +1075,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
 			mlog(0, "found lockres owned by dead node while "
 			     "doing recovery for node %u. sending it.\n",
 			     dead_node);
-			list_del_init(&res->recovering);
-			list_add_tail(&res->recovering, list);
+			list_move_tail(&res->recovering, list);
 		} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
 			mlog(0, "found UNKNOWN owner while doing recovery "
 			     "for node %u. sending it.\n", dead_node);
-			list_del_init(&res->recovering);
-			list_add_tail(&res->recovering, list);
+			list_move_tail(&res->recovering, list);
 		}
 	}
 	spin_unlock(&dlm->spinlock);
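
The paired list calls above collapse into list_move_tail(), which performs exactly that unlink-then-append in one step:

	/* list_move_tail(entry, head) has the same net effect as: */
	list_del_init(&res->recovering);	/* drop from current list */
	list_add_tail(&res->recovering, list);	/* append to target list */
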
@@ -1023,8 +1191,9 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
 		    ml->type == LKM_PRMODE) {
 			/* if it is already set, this had better be a PR
 			 * and it has to match */
-			if (mres->lvb[0] && (ml->type == LKM_EXMODE ||
-			    memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
+			if (!dlm_lvb_is_empty(mres->lvb) &&
+			    (ml->type == LKM_EXMODE ||
+			     memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
 				mlog(ML_ERROR, "mismatched lvbs!\n");
 				__dlm_print_one_lock_resource(lock->lockres);
 				BUG();
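
Checking only mres->lvb[0] misclassified any LVB whose first byte happened to be zero as empty; dlm_lvb_is_empty(), introduced elsewhere in this series, examines the whole value block. A plausible shape for such a helper, shown as an illustration rather than the exact in-tree definition:

	static inline int dlm_lvb_is_empty(char *lvb)
	{
		int i;

		for (i = 0; i < DLM_LVB_LEN; i++)
			if (lvb[i])
				return 0;
		return 1;
	}
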
@@ -1083,22 +1252,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 			 * we must send it immediately. */
 			ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
 						       res, total_locks);
-			if (ret < 0) {
-				// TODO
-				mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
-				     "returned %d, TODO\n", ret);
-				BUG();
-			}
+			if (ret < 0)
+				goto error;
 		}
 	}
 	/* flush any remaining locks */
 	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
-	if (ret < 0) {
-		// TODO
-		mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
-		     "TODO\n", ret);
+	if (ret < 0)
+		goto error;
+	return ret;
+
+error:
+	mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n",
+	     dlm->name, ret);
+	if (!dlm_is_host_down(ret))
 		BUG();
-	}
+	mlog(0, "%s: node %u went down while sending %s "
+	     "lockres %.*s\n", dlm->name, send_to,
+	     flags & DLM_MRES_RECOVERY ? "recovery" : "migration",
+	     res->lockname.len, res->lockname.name);
 	return ret;
 }
 
@@ -1146,8 +1318,8 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
 		mlog(0, "all done flag. all lockres data received!\n");
 
 	ret = -ENOMEM;
-	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
-	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS);
+	item = kcalloc(1, sizeof(*item), GFP_NOFS);
 	if (!buf || !item)
 		goto leave;
 
@@ -1238,7 +1410,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
 	spin_lock(&dlm->work_lock);
 	list_add_tail(&item->list, &dlm->work_list);
 	spin_unlock(&dlm->work_lock);
-	schedule_work(&dlm->dispatched_work);
+	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
 
 leave:
 	dlm_put(dlm);
@@ -1406,6 +1578,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
 	struct dlm_ctxt *dlm = data;
 	struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
 	struct dlm_lock_resource *res = NULL;
+	unsigned int hash;
 	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
 	u32 flags = DLM_ASSERT_MASTER_REQUERY;
 
@@ -1415,8 +1588,10 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
 		return master;
 	}
 
+	hash = dlm_lockid_hash(req->name, req->namelen);
+
 	spin_lock(&dlm->spinlock);
-	res = __dlm_lookup_lockres(dlm, req->name, req->namelen);
+	res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash);
 	if (res) {
 		spin_lock(&res->spinlock);
 		master = res->owner;
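
Precomputing the lockid hash and passing it into __dlm_lookup_lockres() keeps a pure computation outside the dlm spinlock and lets callers reuse the value across lookups. The general shape, using the names from this hunk:

	unsigned int hash;

	hash = dlm_lockid_hash(req->name, req->namelen);	/* no lock held */

	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash);
	spin_unlock(&dlm->spinlock);
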
@@ -1483,7 +1658,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 	struct dlm_lock *newlock = NULL;
 	struct dlm_lockstatus *lksb = NULL;
 	int ret = 0;
-	int i;
+	int i, bad;
 	struct list_head *iter;
 	struct dlm_lock *lock = NULL;
 
@@ -1529,8 +1704,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 
 			/* move the lock to its proper place */
 			/* do not alter lock refcount.  switching lists. */
-			list_del_init(&lock->list);
-			list_add_tail(&lock->list, queue);
+			list_move_tail(&lock->list, queue);
 			spin_unlock(&res->spinlock);
 
 			mlog(0, "just reordered a local lock!\n");
@@ -1553,28 +1727,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 		}
 			lksb->flags |= (ml->flags &
 					(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
 
-			if (mres->lvb[0]) {
+			if (ml->type == LKM_NLMODE)
+				goto skip_lvb;
+
+			if (!dlm_lvb_is_empty(mres->lvb)) {
 				if (lksb->flags & DLM_LKSB_PUT_LVB) {
 					/* other node was trying to update
 					 * lvb when node died.  recreate the
 					 * lksb with the updated lvb. */
 					memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
+					/* the lock resource lvb update must happen
+					 * NOW, before the spinlock is dropped.
+					 * we no longer wait for the AST to update
+					 * the lvb. */
+					memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
 				} else {
 					/* otherwise, the node is sending its
 					 * most recent valid lvb info */
 					BUG_ON(ml->type != LKM_EXMODE &&
 					       ml->type != LKM_PRMODE);
-					if (res->lvb[0] && (ml->type == LKM_EXMODE ||
-					    memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
-						mlog(ML_ERROR, "received bad lvb!\n");
-						__dlm_print_one_lock_resource(res);
-						BUG();
+					if (!dlm_lvb_is_empty(res->lvb) &&
+					    (ml->type == LKM_EXMODE ||
+					     memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
+						int i;
+						mlog(ML_ERROR, "%s:%.*s: received bad "
+						     "lvb! type=%d\n", dlm->name,
+						     res->lockname.len,
+						     res->lockname.name, ml->type);
+						printk("lockres lvb=[");
+						for (i=0; i<DLM_LVB_LEN; i++)
+							printk("%02x", res->lvb[i]);
+						printk("]\nmigrated lvb=[");
+						for (i=0; i<DLM_LVB_LEN; i++)
+							printk("%02x", mres->lvb[i]);
+						printk("]\n");
+						dlm_print_one_lock_resource(res);
+						BUG();
 					}
 					memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
 				}
 			}
-
+skip_lvb:
 
 		/* NOTE:
 		 * wrt lock queue ordering and recovery:
@@ -1592,9 +1786,33 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 		 * relative to each other, but clearly *not*
 		 * preserved relative to locks from other nodes.
 		 */
+		bad = 0;
 		spin_lock(&res->spinlock);
-		dlm_lock_get(newlock);
-		list_add_tail(&newlock->list, queue);
+		list_for_each_entry(lock, queue, list) {
+			if (lock->ml.cookie == ml->cookie) {
+				u64 c = lock->ml.cookie;
+				mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
+				     "exists on this lockres!\n", dlm->name,
+				     res->lockname.len, res->lockname.name,
+				     dlm_get_lock_cookie_node(c),
+				     dlm_get_lock_cookie_seq(c));
+
+				mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, "
+				     "node=%u, cookie=%u:%llu, queue=%d\n",
+				     ml->type, ml->convert_type, ml->node,
+				     dlm_get_lock_cookie_node(ml->cookie),
+				     dlm_get_lock_cookie_seq(ml->cookie),
+				     ml->list);
+
+				__dlm_print_one_lock_resource(res);
+				bad = 1;
+				break;
+			}
+		}
+		if (!bad) {
+			dlm_lock_get(newlock);
+			list_add_tail(&newlock->list, queue);
+		}
 		spin_unlock(&res->spinlock);
 	}
 	mlog(0, "done running all the locks\n");
@@ -1618,8 +1836,14 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
 	struct dlm_lock *lock;
 
 	res->state |= DLM_LOCK_RES_RECOVERING;
-	if (!list_empty(&res->recovering))
+	if (!list_empty(&res->recovering)) {
+		mlog(0,
+		     "Recovering res %s:%.*s, is already on recovery list!\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
 		list_del_init(&res->recovering);
+	}
+	/* We need to hold a reference while on the recovery list */
+	dlm_lockres_get(res);
 	list_add_tail(&res->recovering, &dlm->reco.resources);
 
 	/* find any pending locks and put them back on proper list */
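
dlm_lockres_get() before the list_add_tail() applies the usual rule that membership on a list owns a reference; the matching dlm_lockres_put() calls appear in the hunks below, where resources leave the recovery list. The pairing, schematically:

	/* on insert: the list itself holds a reference */
	dlm_lockres_get(res);
	list_add_tail(&res->recovering, &dlm->reco.resources);

	/* on removal: drop the list's reference */
	list_del_init(&res->recovering);
	dlm_lockres_put(res);
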
@@ -1708,9 +1932,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 			spin_lock(&res->spinlock);
 			dlm_change_lockres_owner(dlm, res, new_master);
 			res->state &= ~DLM_LOCK_RES_RECOVERING;
-			__dlm_dirty_lockres(dlm, res);
+			if (!__dlm_lockres_unused(res))
+				__dlm_dirty_lockres(dlm, res);
 			spin_unlock(&res->spinlock);
 			wake_up(&res->wq);
+			dlm_lockres_put(res);
 		}
 	}
 
@@ -1719,7 +1945,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 	 * the RECOVERING state and set the owner
 	 * if necessary */
 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-		bucket = &(dlm->lockres_hash[i]);
+		bucket = dlm_lockres_hash(dlm, i);
 		hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
 			if (res->state & DLM_LOCK_RES_RECOVERING) {
 				if (res->owner == dead_node) {
@@ -1743,11 +1969,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 					dlm->name, res->lockname.len,
 					res->lockname.name, res->owner);
 				list_del_init(&res->recovering);
+				dlm_lockres_put(res);
 			}
 			spin_lock(&res->spinlock);
 			dlm_change_lockres_owner(dlm, res, new_master);
 			res->state &= ~DLM_LOCK_RES_RECOVERING;
-			__dlm_dirty_lockres(dlm, res);
+			if (!__dlm_lockres_unused(res))
+				__dlm_dirty_lockres(dlm, res);
 			spin_unlock(&res->spinlock);
 			wake_up(&res->wq);
 		}
@@ -1884,7 +2112,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 	 * need to be fired as a result.
 	 */
 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-		bucket = &(dlm->lockres_hash[i]);
+		bucket = dlm_lockres_hash(dlm, i);
 		hlist_for_each_entry(res, iter, bucket, hash_node) {
 			/* always prune any $RECOVERY entries for dead nodes,
 			 * otherwise hangs can occur during later recovery */
@@ -1924,6 +2152,20 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
 {
 	assert_spin_locked(&dlm->spinlock);
 
+	if (dlm->reco.new_master == idx) {
+		mlog(0, "%s: recovery master %d just died\n",
+		     dlm->name, idx);
+		if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+			/* finalize1 was reached, so it is safe to clear
+			 * the new_master and dead_node.  that recovery
+			 * is complete. */
+			mlog(0, "%s: dead master %d had reached "
+			     "finalize1 state, clearing\n", dlm->name, idx);
+			dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+			__dlm_reset_recovery(dlm);
+		}
+	}
+
 	/* check to see if the node is already considered dead */
 	if (!test_bit(idx, dlm->live_nodes_map)) {
 		mlog(0, "for domain %s, node %d is already dead. "
@@ -2087,7 +2329,7 @@ again:
 
 		/* set the new_master to this node */
 		spin_lock(&dlm->spinlock);
-		dlm->reco.new_master = dlm->node_num;
+		dlm_set_reco_master(dlm, dlm->node_num);
 		spin_unlock(&dlm->spinlock);
 	}
 
@@ -2125,6 +2367,10 @@ again:
 		mlog(0, "%s: reco master %u is ready to recover %u\n",
 		     dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
 		status = -EEXIST;
+	} else if (ret == DLM_RECOVERING) {
+		mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n",
+		     dlm->name, dlm->node_num);
+		goto again;
 	} else {
 		struct dlm_lock_resource *res;
 
@@ -2156,7 +2402,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
 
 	mlog_entry("%u\n", dead_node);
 
-	mlog(0, "dead node is %u\n", dead_node);
+	mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
 
 	spin_lock(&dlm->spinlock);
 	dlm_node_iter_init(dlm->domain_map, &iter);
@@ -2214,6 +2460,14 @@ retry:
 			 * another ENOMEM */
 			msleep(100);
 			goto retry;
+		} else if (ret == EAGAIN) {
+			mlog(0, "%s: trying to start recovery of node "
+			     "%u, but node %u is waiting for last recovery "
+			     "to complete, backoff for a bit\n", dlm->name,
+			     dead_node, nodenum);
+			/* TODO Look into replacing msleep with cond_resched() */
+			msleep(100);
+			goto retry;
 		}
 	}
 
@@ -2229,8 +2483,20 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 	if (!dlm_grab(dlm))
 		return 0;
 
-	mlog(0, "node %u wants to recover node %u\n",
-	     br->node_idx, br->dead_node);
+	spin_lock(&dlm->spinlock);
+	if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+		mlog(0, "%s: node %u wants to recover node %u (%u:%u) "
+		     "but this node is in finalize state, waiting on finalize2\n",
+		     dlm->name, br->node_idx, br->dead_node,
+		     dlm->reco.dead_node, dlm->reco.new_master);
+		spin_unlock(&dlm->spinlock);
+		return EAGAIN;
+	}
+	spin_unlock(&dlm->spinlock);
+
+	mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n",
+	     dlm->name, br->node_idx, br->dead_node,
+	     dlm->reco.dead_node, dlm->reco.new_master);
 
 	dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
 
@@ -2252,8 +2518,8 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 		     "node %u changing it to %u\n", dlm->name,
 		     dlm->reco.dead_node, br->node_idx, br->dead_node);
 	}
-	dlm->reco.new_master = br->node_idx;
-	dlm->reco.dead_node = br->dead_node;
+	dlm_set_reco_master(dlm, br->node_idx);
+	dlm_set_reco_dead_node(dlm, br->dead_node);
 	if (!test_bit(br->dead_node, dlm->recovery_map)) {
 		mlog(0, "recovery master %u sees %u as dead, but this "
 		     "node has not yet.  marking %u as dead\n",
@@ -2272,10 +2538,16 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 	spin_unlock(&dlm->spinlock);
 
 	dlm_kick_recovery_thread(dlm);
+
+	mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n",
+	     dlm->name, br->node_idx, br->dead_node,
+	     dlm->reco.dead_node, dlm->reco.new_master);
+
 	dlm_put(dlm);
 	return 0;
 }
 
+#define DLM_FINALIZE_STAGE2  0x01
 static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
 {
 	int ret = 0;
@@ -2283,25 +2555,31 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
 	struct dlm_node_iter iter;
 	int nodenum;
 	int status;
+	int stage = 1;
 
-	mlog(0, "finishing recovery for node %s:%u\n",
-	     dlm->name, dlm->reco.dead_node);
+	mlog(0, "finishing recovery for node %s:%u, "
+	     "stage %d\n", dlm->name, dlm->reco.dead_node, stage);
 
 	spin_lock(&dlm->spinlock);
 	dlm_node_iter_init(dlm->domain_map, &iter);
 	spin_unlock(&dlm->spinlock);
 
+stage2:
 	memset(&fr, 0, sizeof(fr));
 	fr.node_idx = dlm->node_num;
 	fr.dead_node = dlm->reco.dead_node;
+	if (stage == 2)
+		fr.flags |= DLM_FINALIZE_STAGE2;
 
 	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
 		if (nodenum == dlm->node_num)
 			continue;
 		ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
 					 &fr, sizeof(fr), nodenum, &status);
-		if (ret >= 0) {
+		if (ret >= 0)
 			ret = status;
+		if (ret < 0) {
+			mlog_errno(ret);
 			if (dlm_is_host_down(ret)) {
 				/* this has no effect on this recovery
 				 * session, so set the status to zero to
@@ -2309,13 +2587,17 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
 				mlog(ML_ERROR, "node %u went down after this "
 				     "node finished recovery.\n", nodenum);
 				ret = 0;
+				continue;
 			}
-		}
-		if (ret < 0) {
-			mlog_errno(ret);
 			break;
 		}
 	}
+	if (stage == 1) {
+		/* reset the node_iter back to the top and send finalize2 */
+		iter.curnode = -1;
+		stage = 2;
+		goto stage2;
+	}
 
 	return ret;
 }
@@ -2324,14 +2606,19 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 {
 	struct dlm_ctxt *dlm = data;
 	struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
+	int stage = 1;
 
 	/* ok to return 0, domain has gone away */
 	if (!dlm_grab(dlm))
 		return 0;
 
-	mlog(0, "node %u finalizing recovery of node %u\n",
-	     fr->node_idx, fr->dead_node);
+	if (fr->flags & DLM_FINALIZE_STAGE2)
+		stage = 2;
 
+	mlog(0, "%s: node %u finalizing recovery stage%d of "
+	     "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
+	     fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
+
 	spin_lock(&dlm->spinlock);
 
 	if (dlm->reco.new_master != fr->node_idx) {
@@ -2347,13 +2634,41 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 		BUG();
 	}
 
-	dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
-
-	spin_unlock(&dlm->spinlock);
+	switch (stage) {
+	case 1:
+		dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
+		if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+			mlog(ML_ERROR, "%s: received finalize1 from "
+			     "new master %u for dead node %u, but "
+			     "this node has already received it!\n",
+			     dlm->name, fr->node_idx, fr->dead_node);
+			dlm_print_reco_node_status(dlm);
+			BUG();
+		}
+		dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+		spin_unlock(&dlm->spinlock);
+		break;
+	case 2:
+		if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) {
+			mlog(ML_ERROR, "%s: received finalize2 from "
+			     "new master %u for dead node %u, but "
+			     "this node did not have finalize1!\n",
+			     dlm->name, fr->node_idx, fr->dead_node);
+			dlm_print_reco_node_status(dlm);
+			BUG();
+		}
+		dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+		spin_unlock(&dlm->spinlock);
+		dlm_reset_recovery(dlm);
+		dlm_kick_recovery_thread(dlm);
+		break;
+	default:
+		BUG();
+	}
 
-	dlm_reset_recovery(dlm);
+	mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
+	     dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master);
 
-	dlm_kick_recovery_thread(dlm);
 	dlm_put(dlm);
 	return 0;
 }
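
Taken together, these hunks turn finalization into a two-stage handshake: finalize1 makes every node set DLM_RECO_STATE_FINALIZE while keeping dead_node/new_master, and finalize2 clears the flag and resets recovery state. That lets __dlm_hb_node_down() (earlier in this diff) decide, when the recovery master dies, whether the recovery it mastered had already committed. A schematic of the protocol -- pseudocode, not literal code from the patch:

	/* recovery master */
	send_to_all_nodes(FINALIZE_MSG, /* stage */ 1);	/* set FINALIZE flag */
	send_to_all_nodes(FINALIZE_MSG, /* stage */ 2);	/* clear flag, reset */

	/* any node, when the recovery master dies */
	if (reco.state & DLM_RECO_STATE_FINALIZE)
		__dlm_reset_recovery(dlm);	/* stage 1 seen: reco complete */
	/* otherwise a new master is elected and the recovery is redone */
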
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 5be9d14f12cb..0c822f3ffb05 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -39,6 +39,7 @@
 #include <linux/inet.h>
 #include <linux/timer.h>
 #include <linux/kthread.h>
+#include <linux/delay.h>
 
 
 #include "cluster/heartbeat.h"
@@ -53,6 +54,8 @@
 #include "cluster/masklog.h"
 
 static int dlm_thread(void *data);
+static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *lockres);
 
 static void dlm_flush_asts(struct dlm_ctxt *dlm);
 
@@ -80,7 +83,7 @@ repeat:
 }
 
 
-static int __dlm_lockres_unused(struct dlm_lock_resource *res)
+int __dlm_lockres_unused(struct dlm_lock_resource *res)
 {
 	if (list_empty(&res->granted) &&
 	    list_empty(&res->converting) &&
@@ -103,6 +106,20 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 	assert_spin_locked(&res->spinlock);
 
 	if (__dlm_lockres_unused(res)){
+		/* For now, just keep any resource we master */
+		if (res->owner == dlm->node_num)
+		{
+			if (!list_empty(&res->purge)) {
+				mlog(0, "we master %s:%.*s, but it is on "
+				     "the purge list.  Removing\n",
+				     dlm->name, res->lockname.len,
+				     res->lockname.name);
+				list_del_init(&res->purge);
+				dlm->purge_count--;
+			}
+			return;
+		}
+
 		if (list_empty(&res->purge)) {
 			mlog(0, "putting lockres %.*s from purge list\n",
 			     res->lockname.len, res->lockname.name);
@@ -110,10 +127,23 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 			res->last_used = jiffies;
 			list_add_tail(&res->purge, &dlm->purge_list);
 			dlm->purge_count++;
+
+			/* if this node is not the owner, there is
+			 * no way to keep track of who the owner could be.
+			 * unhash it to avoid serious problems. */
+			if (res->owner != dlm->node_num) {
+				mlog(0, "%s:%.*s: doing immediate "
+				     "purge of lockres owned by %u\n",
+				     dlm->name, res->lockname.len,
+				     res->lockname.name, res->owner);
+
+				dlm_purge_lockres_now(dlm, res);
+			}
 		}
 	} else if (!list_empty(&res->purge)) {
-		mlog(0, "removing lockres %.*s from purge list\n",
-		     res->lockname.len, res->lockname.name);
+		mlog(0, "removing lockres %.*s from purge list, "
+		     "owner=%u\n", res->lockname.len, res->lockname.name,
+		     res->owner);
 
 		list_del_init(&res->purge);
 		dlm->purge_count--;
@@ -165,6 +195,7 @@ again:
 	} else if (ret < 0) {
 		mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
 		     lockres->lockname.len, lockres->lockname.name);
+		msleep(100);
 		goto again;
 	}
 
@@ -178,6 +209,24 @@ finish:
 	__dlm_unhash_lockres(lockres);
 }
 
+/* make an unused lockres go away immediately.
+ * as soon as the dlm spinlock is dropped, this lockres
+ * will not be found. kfree still happens on last put. */
+static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *lockres)
+{
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&lockres->spinlock);
+
+	BUG_ON(!__dlm_lockres_unused(lockres));
+
+	if (!list_empty(&lockres->purge)) {
+		list_del_init(&lockres->purge);
+		dlm->purge_count--;
+	}
+	__dlm_unhash_lockres(lockres);
+}
+
 static void dlm_run_purge_list(struct dlm_ctxt *dlm,
 			       int purge_now)
 {
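
dlm_purge_lockres_now() differs from the lazy purge path only in when the resource disappears: it is unhashed immediately, under both spinlocks, so no new lookup can find it, while the final kfree still waits for the last reference to drop. The lookup/unhash/put relationship, using names from the surrounding code:

	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres(dlm, name, len, hash);	/* may find it */
	...
	__dlm_unhash_lockres(res);	/* future lookups now miss */
	spin_unlock(&dlm->spinlock);

	dlm_lockres_put(res);		/* freed only when refcount hits 0 */
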
@@ -318,8 +367,7 @@ converting:
 
 		target->ml.type = target->ml.convert_type;
 		target->ml.convert_type = LKM_IVMODE;
-		list_del_init(&target->list);
-		list_add_tail(&target->list, &res->granted);
+		list_move_tail(&target->list, &res->granted);
 
 		BUG_ON(!target->lksb);
 		target->lksb->status = DLM_NORMAL;
@@ -380,8 +428,7 @@ blocked:
 		     target->ml.type, target->ml.node);
 
 		// target->ml.type is already correct
-		list_del_init(&target->list);
-		list_add_tail(&target->list, &res->granted);
+		list_move_tail(&target->list, &res->granted);
 
 		BUG_ON(!target->lksb);
 		target->lksb->status = DLM_NORMAL;
@@ -422,6 +469,8 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 	/* don't shuffle secondary queues */
 	if ((res->owner == dlm->node_num) &&
 	    !(res->state & DLM_LOCK_RES_DIRTY)) {
+		/* ref for dirty_list */
+		dlm_lockres_get(res);
 		list_add_tail(&res->dirty, &dlm->dirty_list);
 		res->state |= DLM_LOCK_RES_DIRTY;
 	}
@@ -606,6 +655,8 @@ static int dlm_thread(void *data)
 			list_del_init(&res->dirty);
 			spin_unlock(&res->spinlock);
 			spin_unlock(&dlm->spinlock);
+			/* Drop dirty_list ref */
+			dlm_lockres_put(res);
 
 			/* lockres can be re-dirtied/re-added to the
 			 * dirty_list in this gap, but that is ok */
@@ -642,8 +693,9 @@ static int dlm_thread(void *data)
 			 * spinlock and do NOT have the dlm lock.
 			 * safe to reserve/queue asts and run the lists. */
 
-			mlog(0, "calling dlm_shuffle_lists with dlm=%p, "
-			     "res=%p\n", dlm, res);
+			mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
+			     "res=%.*s\n", dlm->name,
+			     res->lockname.len, res->lockname.name);
 
 			/* called while holding lockres lock */
 			dlm_shuffle_lists(dlm, res);
@@ -657,6 +709,8 @@ in_progress:
 			/* if the lock was in-progress, stick
 			 * it on the back of the list */
 			if (delay) {
+				/* ref for dirty_list */
+				dlm_lockres_get(res);
 				spin_lock(&res->spinlock);
 				list_add_tail(&res->dirty, &dlm->dirty_list);
 				res->state |= DLM_LOCK_RES_DIRTY;
@@ -677,7 +731,7 @@ in_progress:
 
 		/* yield and continue right away if there is more work to do */
 		if (!n) {
-			yield();
+			cond_resched();
 			continue;
 		}
 
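
Replacing yield() with cond_resched() is the standard fix for busy kernel threads: yield() unconditionally drops the task to the back of the runqueue even when nothing else wants the CPU, whereas cond_resched() reschedules only if another runnable task is actually waiting. The usual loop shape (do_some_work() is an illustrative placeholder):

	#include <linux/kthread.h>
	#include <linux/sched.h>

	while (!kthread_should_stop()) {
		do_some_work();
		cond_resched();	/* stay preemptible without a forced yield */
	}
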
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 7b1a27542674..b0c3134f4f70 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -271,8 +271,7 @@ void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
 void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
 			       struct dlm_lock *lock)
 {
-	list_del_init(&lock->list);
-	list_add_tail(&lock->list, &res->granted);
+	list_move_tail(&lock->list, &res->granted);
 	lock->ml.convert_type = LKM_IVMODE;
 }
 
@@ -319,6 +318,16 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
 
 	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
 
+	if (owner == dlm->node_num) {
+		/* ended up trying to contact ourself.  this means
+		 * that the lockres had been remote but became local
+		 * via a migration.  just retry it, now as local */
+		mlog(0, "%s:%.*s: this node became the master due to a "
+		     "migration, re-evaluate now\n", dlm->name,
+		     res->lockname.len, res->lockname.name);
+		return DLM_FORWARD;
+	}
+
 	memset(&unlock, 0, sizeof(unlock));
 	unlock.node_idx = dlm->node_num;
 	unlock.flags = cpu_to_be32(flags);
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index 74ca4e5f9765..e641b084b343 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -672,7 +672,7 @@ struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
 	u32 dlm_key;
 	char *domain;
 
-	domain = kmalloc(name->len + 1, GFP_KERNEL);
+	domain = kmalloc(name->len + 1, GFP_NOFS);
 	if (!domain) {
 		mlog_errno(-ENOMEM);
 		return ERR_PTR(-ENOMEM);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 64cd52860c87..4acd37286bdd 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -242,7 +242,7 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
 	mlog_exit_void();
 }
 
-static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
 
 static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
 				       struct ocfs2_dlm_debug *dlm_debug)
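
SPIN_LOCK_UNLOCKED is a shared static initializer that gives every such lock the same lock class, defeating lock debugging, and it was eventually removed from the kernel; DEFINE_SPINLOCK() declares and initializes the lock with a distinct class in one step:

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(my_lock);	/* preferred for static locks */

	/* for locks embedded in dynamically allocated objects: */
	spin_lock_init(&obj->lock);
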
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index eebc3cfa6be8..910a601b2e98 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -49,7 +49,7 @@
 
 #include "buffer_head_io.h"
 
-spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(trans_inc_lock);
 
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
@@ -222,8 +222,7 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
 	BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
 
 	OCFS2_I(inode)->ip_handle = handle;
-	list_del(&(OCFS2_I(inode)->ip_handle_list));
-	list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
+	list_move_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
 }
 
 static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index ee42765a8553..cf70fe2075b8 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -988,9 +988,7 @@ int ocfs2_request_mount_vote(struct ocfs2_super *osb)
 	}
 
 bail:
-	if (request)
-		kfree(request);
-
+	kfree(request);
 	return status;
 }
 
@@ -1021,9 +1019,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
 	}
 
 bail:
-	if (request)
-		kfree(request);
-
+	kfree(request);
 	return status;
 }
 
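
Both cleanups rely on kfree(NULL) being a defined no-op, so the guarding if is redundant; the same holds for vfree() in the kernel and free() in userspace. The simplified error-label idiom:

	void *request = NULL;

	if (need_buffer)
		request = kmalloc(size, GFP_KERNEL);
	...
bail:
	kfree(request);	/* safe even if the allocation never happened */
	return status;
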
diff --git a/fs/open.c b/fs/open.c
index 5fb16e5267dc..303f06d2a7b9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -322,7 +322,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 
 	error = locks_verify_truncate(inode, file, length);
 	if (!error)
-		error = do_truncate(dentry, length, 0, file);
+		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
 	fput(file);
 out:
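
Passing ATTR_MTIME|ATTR_CTIME into do_truncate() makes ftruncate() update the file times even when the length does not change, as POSIX requires. A small userspace program that observes the behavior this one-liner enforces (assumes a POSIX-conforming system):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/stat.h>

	int main(void)
	{
		struct stat st1, st2;
		int fd = open("f.tmp", O_RDWR | O_CREAT, 0600);

		if (fd < 0)
			return 1;
		fstat(fd, &st1);
		sleep(1);
		ftruncate(fd, st1.st_size);	/* same length as before */
		fstat(fd, &st2);
		printf("mtime updated: %s\n",
		       st1.st_mtime != st2.st_mtime ? "yes" : "no");
		close(fd);
		return 0;
	}
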
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 464e2bce0203..93a56bd4a2b7 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -1,5 +1,4 @@
-/* $Id: inode.c,v 1.15 2001/11/12 09:43:39 davem Exp $
- * openpromfs.c: /proc/openprom handling routines
+/* inode.c: /proc/openprom handling routines
  *
  * Copyright (C) 1996-1999 Jakub Jelinek (jakub@redhat.com)
  * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
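
The rewrite that follows drops the static node/property tables in favor of per-inode state embedded in the inode itself, recovered with the standard container_of() idiom (struct op_inode_info and OP_I() below). The idiom in isolation, with illustrative names:

	#include <linux/fs.h>
	#include <linux/kernel.h>

	struct my_inode_info {
		struct inode	vfs_inode;	/* embedded VFS inode */
		int		extra_state;	/* fs-private data */
	};

	static inline struct my_inode_info *MY_I(struct inode *inode)
	{
		/* recover the wrapper from the embedded member */
		return container_of(inode, struct my_inode_info, vfs_inode);
	}
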
@@ -12,756 +11,245 @@
 #include <linux/openprom_fs.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/seq_file.h>
 
 #include <asm/openprom.h>
 #include <asm/oplib.h>
+#include <asm/prom.h>
 #include <asm/uaccess.h>
 
-#define ALIASES_NNODES 64
+static DEFINE_MUTEX(op_mutex);
 
-typedef struct {
-	u16 parent;
-	u16 next;
-	u16 child;
-	u16 first_prop;
-	u32 node;
-} openpromfs_node;
-
-typedef struct {
-#define OPP_STRING	0x10
-#define OPP_STRINGLIST	0x20
-#define OPP_BINARY	0x40
-#define OPP_HEXSTRING	0x80
-#define OPP_DIRTY	0x01
-#define OPP_QUOTED	0x02
-#define OPP_NOTQUOTED	0x04
-#define OPP_ASCIIZ	0x08
-	u32 flag;
-	u32 alloclen;
-	u32 len;
-	char *value;
-	char name[8];
-} openprom_property;
-
-static openpromfs_node *nodes;
-static int alloced;
-static u16 last_node;
-static u16 first_prop;
-static u16 options = 0xffff;
-static u16 aliases = 0xffff;
-static int aliases_nodes;
-static char *alias_names [ALIASES_NNODES];
-
-#define OPENPROM_ROOT_INO	16
-#define OPENPROM_FIRST_INO	OPENPROM_ROOT_INO
-#define NODE(ino)	nodes[ino - OPENPROM_FIRST_INO]
-#define NODE2INO(node)	(node + OPENPROM_FIRST_INO)
-#define NODEP2INO(no)	(no + OPENPROM_FIRST_INO + last_node)
-
-static int openpromfs_create (struct inode *, struct dentry *, int, struct nameidata *);
-static int openpromfs_readdir(struct file *, void *, filldir_t);
-static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry, struct nameidata *nd);
-static int openpromfs_unlink (struct inode *, struct dentry *dentry);
+#define OPENPROM_ROOT_INO	0
+
+enum op_inode_type {
+	op_inode_node,
+	op_inode_prop,
+};
+
+union op_inode_data {
+	struct device_node *node;
+	struct property *prop;
+};
 
-static ssize_t nodenum_read(struct file *file, char __user *buf,
-			    size_t count, loff_t *ppos)
+struct op_inode_info {
+	struct inode vfs_inode;
+	enum op_inode_type type;
+	union op_inode_data u;
+};
+
+static inline struct op_inode_info *OP_I(struct inode *inode)
 {
-	struct inode *inode = file->f_dentry->d_inode;
-	char buffer[10];
-
-	if (count < 0 || !inode->u.generic_ip)
-		return -EINVAL;
-	sprintf (buffer, "%8.8x\n", (u32)(long)(inode->u.generic_ip));
-	if (file->f_pos >= 9)
-		return 0;
-	if (count > 9 - file->f_pos)
-		count = 9 - file->f_pos;
-	if (copy_to_user(buf, buffer + file->f_pos, count))
-		return -EFAULT;
-	*ppos += count;
-	return count;
+	return container_of(inode, struct op_inode_info, vfs_inode);
 }
 
-static ssize_t property_read(struct file *filp, char __user *buf,
-			     size_t count, loff_t *ppos)
+static int is_string(unsigned char *p, int len)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
-	int i, j, k;
-	u32 node;
-	char *p, *s;
-	u32 *q;
-	openprom_property *op;
-	char buffer[64];
-
-	if (!filp->private_data) {
-		node = nodes[(u16)((long)inode->u.generic_ip)].node;
-		i = ((u32)(long)inode->u.generic_ip) >> 16;
-		if ((u16)((long)inode->u.generic_ip) == aliases) {
-			if (i >= aliases_nodes)
-				p = NULL;
-			else
-				p = alias_names [i];
-		} else
-			for (p = prom_firstprop (node, buffer);
-			     i && p && *p;
-			     p = prom_nextprop (node, p, buffer), i--)
-				/* nothing */ ;
-		if (!p || !*p)
-			return -EIO;
-		i = prom_getproplen (node, p);
-		if (i < 0) {
-			if ((u16)((long)inode->u.generic_ip) == aliases)
-				i = 0;
-			else
-				return -EIO;
-		}
-		k = i;
-		if (i < 64) i = 64;
-		filp->private_data = kmalloc (sizeof (openprom_property)
-					      + (j = strlen (p)) + 2 * i,
-					      GFP_KERNEL);
-		if (!filp->private_data)
-			return -ENOMEM;
-		op = (openprom_property *)filp->private_data;
-		op->flag = 0;
-		op->alloclen = 2 * i;
-		strcpy (op->name, p);
-		op->value = (char *)(((unsigned long)(op->name + j + 4)) & ~3);
-		op->len = k;
-		if (k && prom_getproperty (node, p, op->value, i) < 0)
-			return -EIO;
-		op->value [k] = 0;
-		if (k) {
-			for (s = NULL, p = op->value; p < op->value + k; p++) {
-				if ((*p >= ' ' && *p <= '~') || *p == '\n') {
-					op->flag |= OPP_STRING;
-					s = p;
-					continue;
-				}
-				if (p > op->value && !*p && s == p - 1) {
-					if (p < op->value + k - 1)
-						op->flag |= OPP_STRINGLIST;
-					else
-						op->flag |= OPP_ASCIIZ;
-					continue;
-				}
-				if (k == 1 && !*p) {
-					op->flag |= (OPP_STRING|OPP_ASCIIZ);
-					break;
-				}
-				op->flag &= ~(OPP_STRING|OPP_STRINGLIST);
-				if (k & 3)
-					op->flag |= OPP_HEXSTRING;
-				else
-					op->flag |= OPP_BINARY;
-				break;
-			}
-			if (op->flag & OPP_STRINGLIST)
-				op->flag &= ~(OPP_STRING);
-			if (op->flag & OPP_ASCIIZ)
-				op->len--;
-		}
-	} else
-		op = (openprom_property *)filp->private_data;
-	if (!count || !(op->len || (op->flag & OPP_ASCIIZ)))
-		return 0;
-	if (*ppos >= 0xffffff || count >= 0xffffff)
-		return -EINVAL;
-	if (op->flag & OPP_STRINGLIST) {
-		for (k = 0, p = op->value; p < op->value + op->len; p++)
-			if (!*p)
-				k++;
-		i = op->len + 4 * k + 3;
-	} else if (op->flag & OPP_STRING) {
-		i = op->len + 3;
-	} else if (op->flag & OPP_BINARY) {
-		i = (op->len * 9) >> 2;
-	} else {
-		i = (op->len << 1) + 1;
-	}
-	k = *ppos;
-	if (k >= i) return 0;
-	if (count > i - k) count = i - k;
-	if (op->flag & OPP_STRING) {
-		if (!k) {
-			if (put_user('\'', buf))
-				return -EFAULT;
-			k++;
-			count--;
-		}
+	int i;
 
-		if (k + count >= i - 2)
-			j = i - 2 - k;
-		else
-			j = count;
-
-		if (j >= 0) {
-			if (copy_to_user(buf + k - *ppos,
-					 op->value + k - 1, j))
-				return -EFAULT;
-			count -= j;
-			k += j;
-		}
+	for (i = 0; i < len; i++) {
+		unsigned char val = p[i];
 
-		if (count) {
-			if (put_user('\'', &buf [k++ - *ppos]))
-				return -EFAULT;
+		if ((i && !val) ||
+		    (val >= ' ' && val <= '~'))
+			continue;
-		}
-		if (count > 1) {
-			if (put_user('\n', &buf [k++ - *ppos]))
-				return -EFAULT;
-		}
-	} else if (op->flag & OPP_STRINGLIST) {
-		char *tmp;
-
-		tmp = kmalloc (i, GFP_KERNEL);
-		if (!tmp)
-			return -ENOMEM;
-
-		s = tmp;
-		*s++ = '\'';
-		for (p = op->value; p < op->value + op->len; p++) {
-			if (!*p) {
-				strcpy(s, "' + '");
-				s += 5;
-				continue;
-			}
-			*s++ = *p;
-		}
-		strcpy(s, "'\n");
-
-		if (copy_to_user(buf, tmp + k, count))
-			return -EFAULT;
-
-		kfree(tmp);
-		k += count;
-
-	} else if (op->flag & OPP_BINARY) {
-		char buffer[10];
-		u32 *first, *last;
-		int first_off, last_cnt;
-
-		first = ((u32 *)op->value) + k / 9;
-		first_off = k % 9;
247 last = ((u32 *)op->value) + (k + count - 1) / 9;
248 last_cnt = (k + count) % 9;
249 if (!last_cnt) last_cnt = 9;
250
251 if (first == last) {
252 sprintf (buffer, "%08x.", *first);
253 if (copy_to_user(buf, buffer + first_off,
254 last_cnt - first_off))
255 return -EFAULT;
256 buf += last_cnt - first_off;
257 } else {
258 for (q = first; q <= last; q++) {
259 sprintf (buffer, "%08x.", *q);
260 if (q == first) {
261 if (copy_to_user(buf, buffer + first_off,
262 9 - first_off))
263 return -EFAULT;
264 buf += 9 - first_off;
265 } else if (q == last) {
266 if (copy_to_user(buf, buffer, last_cnt))
267 return -EFAULT;
268 buf += last_cnt;
269 } else {
270 if (copy_to_user(buf, buffer, 9))
271 return -EFAULT;
272 buf += 9;
273 }
274 }
275 }
276 56
277 if (last == (u32 *)(op->value + op->len - 4) && last_cnt == 9) { 57 return 0;
278 if (put_user('\n', (buf - 1))) 58 }
279 return -EFAULT;
280 }
281 59
282 k += count; 60 return 1;
61}
283 62
284 } else if (op->flag & OPP_HEXSTRING) { 63static int property_show(struct seq_file *f, void *v)
285 char buffer[3]; 64{
65 struct property *prop = f->private;
66 void *pval;
67 int len;
286 68
287 if ((k < i - 1) && (k & 1)) { 69 len = prop->length;
288 sprintf (buffer, "%02x", 70 pval = prop->value;
289 (unsigned char) *(op->value + (k >> 1)) & 0xff);
290 if (put_user(buffer[1], &buf[k++ - *ppos]))
291 return -EFAULT;
292 count--;
293 }
294 71
295 for (; (count > 1) && (k < i - 1); k += 2) { 72 if (is_string(pval, len)) {
296 sprintf (buffer, "%02x", 73 while (len > 0) {
297 (unsigned char) *(op->value + (k >> 1)) & 0xff); 74 int n = strlen(pval);
298 if (copy_to_user(buf + k - *ppos, buffer, 2))
299 return -EFAULT;
300 count -= 2;
301 }
302 75
303 if (count && (k < i - 1)) { 76 seq_printf(f, "%s", (char *) pval);
304 sprintf (buffer, "%02x",
305 (unsigned char) *(op->value + (k >> 1)) & 0xff);
306 if (put_user(buffer[0], &buf[k++ - *ppos]))
307 return -EFAULT;
308 count--;
309 }
310 77
311 if (count) { 78 /* Skip over the NULL byte too. */
312 if (put_user('\n', &buf [k++ - *ppos])) 79 pval += n + 1;
313 return -EFAULT; 80 len -= n + 1;
314 }
315 }
316 count = k - *ppos;
317 *ppos = k;
318 return count;
319}
320 81
321static ssize_t property_write(struct file *filp, const char __user *buf, 82 if (len > 0)
322 size_t count, loff_t *ppos) 83 seq_printf(f, " + ");
323{
324 int i, j, k;
325 char *p;
326 u32 *q;
327 void *b;
328 openprom_property *op;
329
330 if (*ppos >= 0xffffff || count >= 0xffffff)
331 return -EINVAL;
332 if (!filp->private_data) {
333 i = property_read (filp, NULL, 0, NULL);
334 if (i)
335 return i;
336 }
337 k = *ppos;
338 op = (openprom_property *)filp->private_data;
339 if (!(op->flag & OPP_STRING)) {
340 u32 *first, *last;
341 int first_off, last_cnt;
342 u32 mask, mask2;
343 char tmp [9];
344 int forcelen = 0;
345
346 j = k % 9;
347 for (i = 0; i < count; i++, j++) {
348 if (j == 9) j = 0;
349 if (!j) {
350 char ctmp;
351 if (get_user(ctmp, &buf[i]))
352 return -EFAULT;
353 if (ctmp != '.') {
354 if (ctmp != '\n') {
355 if (op->flag & OPP_BINARY)
356 return -EINVAL;
357 else
358 goto write_try_string;
359 } else {
360 count = i + 1;
361 forcelen = 1;
362 break;
363 }
364 }
365 } else {
366 char ctmp;
367 if (get_user(ctmp, &buf[i]))
368 return -EFAULT;
369 if (ctmp < '0' ||
370 (ctmp > '9' && ctmp < 'A') ||
371 (ctmp > 'F' && ctmp < 'a') ||
372 ctmp > 'f') {
373 if (op->flag & OPP_BINARY)
374 return -EINVAL;
375 else
376 goto write_try_string;
377 }
378 }
379 }
380 op->flag |= OPP_BINARY;
381 tmp [8] = 0;
382 i = ((count + k + 8) / 9) << 2;
383 if (op->alloclen <= i) {
384 b = kmalloc (sizeof (openprom_property) + 2 * i,
385 GFP_KERNEL);
386 if (!b)
387 return -ENOMEM;
388 memcpy (b, filp->private_data,
389 sizeof (openprom_property)
390 + strlen (op->name) + op->alloclen);
391 memset (((char *)b) + sizeof (openprom_property)
392 + strlen (op->name) + op->alloclen,
393 0, 2 * i - op->alloclen);
394 op = (openprom_property *)b;
395 op->alloclen = 2*i;
396 b = filp->private_data;
397 filp->private_data = (void *)op;
398 kfree (b);
399 } 84 }
400 first = ((u32 *)op->value) + (k / 9); 85 } else {
401 first_off = k % 9; 86 if (len & 3) {
402 last = (u32 *)(op->value + i); 87 while (len) {
403 last_cnt = (k + count) % 9; 88 len--;
404 if (first + 1 == last) { 89 if (len)
405 memset (tmp, '0', 8); 90 seq_printf(f, "%02x.",
406 if (copy_from_user(tmp + first_off, buf, 91 *(unsigned char *) pval);
407 (count + first_off > 8) ? 92 else
408 8 - first_off : count)) 93 seq_printf(f, "%02x",
409 return -EFAULT; 94 *(unsigned char *) pval);
410 mask = 0xffffffff; 95 pval++;
411 mask2 = 0xffffffff;
412 for (j = 0; j < first_off; j++)
413 mask >>= 1;
414 for (j = 8 - count - first_off; j > 0; j--)
415 mask2 <<= 1;
416 mask &= mask2;
417 if (mask) {
418 *first &= ~mask;
419 *first |= simple_strtoul (tmp, NULL, 16);
420 op->flag |= OPP_DIRTY;
421 } 96 }
422 } else { 97 } else {
423 op->flag |= OPP_DIRTY; 98 while (len >= 4) {
424 for (q = first; q < last; q++) { 99 len -= 4;
425 if (q == first) { 100
426 if (first_off < 8) { 101 if (len)
427 memset (tmp, '0', 8); 102 seq_printf(f, "%08x.",
428 if (copy_from_user(tmp + first_off, 103 *(unsigned int *) pval);
429 buf, 104 else
430 8 - first_off)) 105 seq_printf(f, "%08x",
431 return -EFAULT; 106 *(unsigned int *) pval);
432 mask = 0xffffffff; 107 pval += 4;
433 for (j = 0; j < first_off; j++)
434 mask >>= 1;
435 *q &= ~mask;
436 *q |= simple_strtoul (tmp,NULL,16);
437 }
438 buf += 9;
439 } else if ((q == last - 1) && last_cnt
440 && (last_cnt < 8)) {
441 memset (tmp, '0', 8);
442 if (copy_from_user(tmp, buf, last_cnt))
443 return -EFAULT;
444 mask = 0xffffffff;
445 for (j = 0; j < 8 - last_cnt; j++)
446 mask <<= 1;
447 *q &= ~mask;
448 *q |= simple_strtoul (tmp, NULL, 16);
449 buf += last_cnt;
450 } else {
451 char tchars[17]; /* XXX yuck... */
452
453 if (copy_from_user(tchars, buf, 16))
454 return -EFAULT;
455 *q = simple_strtoul (tchars, NULL, 16);
456 buf += 9;
457 }
458 }
459 }
460 if (!forcelen) {
461 if (op->len < i)
462 op->len = i;
463 } else
464 op->len = i;
465 *ppos += count;
466 }
467write_try_string:
468 if (!(op->flag & OPP_BINARY)) {
469 if (!(op->flag & (OPP_QUOTED | OPP_NOTQUOTED))) {
470 char ctmp;
471
472 /* No way, if somebody starts writing from the middle,
473 * we don't know whether he uses quotes around or not
474 */
475 if (k > 0)
476 return -EINVAL;
477 if (get_user(ctmp, buf))
478 return -EFAULT;
479 if (ctmp == '\'') {
480 op->flag |= OPP_QUOTED;
481 buf++;
482 count--;
483 (*ppos)++;
484 if (!count) {
485 op->flag |= OPP_STRING;
486 return 1;
487 }
488 } else
489 op->flag |= OPP_NOTQUOTED;
490 }
491 op->flag |= OPP_STRING;
492 if (op->alloclen <= count + *ppos) {
493 b = kmalloc (sizeof (openprom_property)
494 + 2 * (count + *ppos), GFP_KERNEL);
495 if (!b)
496 return -ENOMEM;
497 memcpy (b, filp->private_data,
498 sizeof (openprom_property)
499 + strlen (op->name) + op->alloclen);
500 memset (((char *)b) + sizeof (openprom_property)
501 + strlen (op->name) + op->alloclen,
502 0, 2*(count - *ppos) - op->alloclen);
503 op = (openprom_property *)b;
504 op->alloclen = 2*(count + *ppos);
505 b = filp->private_data;
506 filp->private_data = (void *)op;
507 kfree (b);
508 }
509 p = op->value + *ppos - ((op->flag & OPP_QUOTED) ? 1 : 0);
510 if (copy_from_user(p, buf, count))
511 return -EFAULT;
512 op->flag |= OPP_DIRTY;
513 for (i = 0; i < count; i++, p++)
514 if (*p == '\n') {
515 *p = 0;
516 break;
517 } 108 }
518 if (i < count) {
519 op->len = p - op->value;
520 *ppos += i + 1;
521 if ((p > op->value) && (op->flag & OPP_QUOTED)
522 && (*(p - 1) == '\''))
523 op->len--;
524 } else {
525 if (p - op->value > op->len)
526 op->len = p - op->value;
527 *ppos += count;
528 } 109 }
529 } 110 }
530 return *ppos - k; 111 seq_printf(f, "\n");
112
113 return 0;
531} 114}
532 115
533int property_release (struct inode *inode, struct file *filp) 116static void *property_start(struct seq_file *f, loff_t *pos)
534{ 117{
535 openprom_property *op = (openprom_property *)filp->private_data; 118 if (*pos == 0)
536 int error; 119 return pos;
537 u32 node; 120 return NULL;
538 121}
539 if (!op) 122
540 return 0; 123static void *property_next(struct seq_file *f, void *v, loff_t *pos)
541 lock_kernel(); 124{
542 node = nodes[(u16)((long)inode->u.generic_ip)].node; 125 (*pos)++;
543 if ((u16)((long)inode->u.generic_ip) == aliases) { 126 return NULL;
544 if ((op->flag & OPP_DIRTY) && (op->flag & OPP_STRING)) { 127}
545 char *p = op->name; 128
546 int i = (op->value - op->name) - strlen (op->name) - 1; 129static void property_stop(struct seq_file *f, void *v)
547 op->value [op->len] = 0; 130{
548 *(op->value - 1) = ' '; 131 /* Nothing to do */
549 if (i) { 132}
550 for (p = op->value - i - 2; p >= op->name; p--) 133
551 p[i] = *p; 134static struct seq_operations property_op = {
552 p = op->name + i; 135 .start = property_start,
553 } 136 .next = property_next,
554 memcpy (p - 8, "nvalias ", 8); 137 .stop = property_stop,
555 prom_feval (p - 8); 138 .show = property_show
556 } 139};
557 } else if (op->flag & OPP_DIRTY) { 140
558 if (op->flag & OPP_STRING) { 141static int property_open(struct inode *inode, struct file *file)
559 op->value [op->len] = 0; 142{
560 error = prom_setprop (node, op->name, 143 struct op_inode_info *oi = OP_I(inode);
561 op->value, op->len + 1); 144 int ret;
562 if (error <= 0) 145
563 printk (KERN_WARNING "openpromfs: " 146 BUG_ON(oi->type != op_inode_prop);
564 "Couldn't write property %s\n", 147
565 op->name); 148 ret = seq_open(file, &property_op);
566 } else if ((op->flag & OPP_BINARY) || !op->len) { 149 if (!ret) {
567 error = prom_setprop (node, op->name, 150 struct seq_file *m = file->private_data;
568 op->value, op->len); 151 m->private = oi->u.prop;
569 if (error <= 0)
570 printk (KERN_WARNING "openpromfs: "
571 "Couldn't write property %s\n",
572 op->name);
573 } else {
574 printk (KERN_WARNING "openpromfs: "
575 "Unknown property type of %s\n",
576 op->name);
577 }
578 } 152 }
579 unlock_kernel(); 153 return ret;
580 kfree (filp->private_data);
581 return 0;
582} 154}
583 155
584static const struct file_operations openpromfs_prop_ops = { 156static const struct file_operations openpromfs_prop_ops = {
585 .read = property_read, 157 .open = property_open,
586 .write = property_write, 158 .read = seq_read,
587 .release = property_release, 159 .llseek = seq_lseek,
160 .release = seq_release,
588}; 161};
589 162
590static const struct file_operations openpromfs_nodenum_ops = { 163static int openpromfs_readdir(struct file *, void *, filldir_t);
591 .read = nodenum_read,
592};
593 164
594static const struct file_operations openprom_operations = { 165static const struct file_operations openprom_operations = {
595 .read = generic_read_dir, 166 .read = generic_read_dir,
596 .readdir = openpromfs_readdir, 167 .readdir = openpromfs_readdir,
597}; 168};
598 169
599static struct inode_operations openprom_alias_inode_operations = { 170static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *);
600 .create = openpromfs_create,
601 .lookup = openpromfs_lookup,
602 .unlink = openpromfs_unlink,
603};
604 171
605static struct inode_operations openprom_inode_operations = { 172static struct inode_operations openprom_inode_operations = {
606 .lookup = openpromfs_lookup, 173 .lookup = openpromfs_lookup,
607}; 174};
608 175
609static int lookup_children(u16 n, const char * name, int len) 176static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
610{
611 int ret;
612 u16 node;
613 for (; n != 0xffff; n = nodes[n].next) {
614 node = nodes[n].child;
615 if (node != 0xffff) {
616 char buffer[128];
617 int i;
618 char *p;
619
620 while (node != 0xffff) {
621 if (prom_getname (nodes[node].node,
622 buffer, 128) >= 0) {
623 i = strlen (buffer);
624 if ((len == i)
625 && !strncmp (buffer, name, len))
626 return NODE2INO(node);
627 p = strchr (buffer, '@');
628 if (p && (len == p - buffer)
629 && !strncmp (buffer, name, len))
630 return NODE2INO(node);
631 }
632 node = nodes[node].next;
633 }
634 } else
635 continue;
636 ret = lookup_children (nodes[n].child, name, len);
637 if (ret) return ret;
638 }
639 return 0;
640}
641
642static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
643{ 177{
644 int ino = 0; 178 struct op_inode_info *ent_oi, *oi = OP_I(dir);
645#define OPFSL_DIR 0 179 struct device_node *dp, *child;
646#define OPFSL_PROPERTY 1 180 struct property *prop;
647#define OPFSL_NODENUM 2 181 enum op_inode_type ent_type;
648 int type = 0; 182 union op_inode_data ent_data;
649 char buffer[128];
650 char *p;
651 const char *name; 183 const char *name;
652 u32 n;
653 u16 dirnode;
654 unsigned int len;
655 int i;
656 struct inode *inode; 184 struct inode *inode;
657 char buffer2[64]; 185 unsigned int ino;
186 int len;
658 187
659 inode = NULL; 188 BUG_ON(oi->type != op_inode_node);
189
190 dp = oi->u.node;
191
660 name = dentry->d_name.name; 192 name = dentry->d_name.name;
661 len = dentry->d_name.len; 193 len = dentry->d_name.len;
662 lock_kernel(); 194
663 if (name [0] == '.' && len == 5 && !strncmp (name + 1, "node", 4)) { 195 mutex_lock(&op_mutex);
664 ino = NODEP2INO(NODE(dir->i_ino).first_prop); 196
665 type = OPFSL_NODENUM; 197 child = dp->child;
666 } 198 while (child) {
667 if (!ino) { 199 int n = strlen(child->path_component_name);
668 u16 node = NODE(dir->i_ino).child; 200
669 while (node != 0xffff) { 201 if (len == n &&
670 if (prom_getname (nodes[node].node, buffer, 128) >= 0) { 202 !strncmp(child->path_component_name, name, len)) {
671 i = strlen (buffer); 203 ent_type = op_inode_node;
672 if (len == i && !strncmp (buffer, name, len)) { 204 ent_data.node = child;
673 ino = NODE2INO(node); 205 ino = child->unique_id;
674 type = OPFSL_DIR; 206 goto found;
675 break;
676 }
677 p = strchr (buffer, '@');
678 if (p && (len == p - buffer)
679 && !strncmp (buffer, name, len)) {
680 ino = NODE2INO(node);
681 type = OPFSL_DIR;
682 break;
683 }
684 }
685 node = nodes[node].next;
686 }
687 }
688 n = NODE(dir->i_ino).node;
689 dirnode = dir->i_ino - OPENPROM_FIRST_INO;
690 if (!ino) {
691 int j = NODEP2INO(NODE(dir->i_ino).first_prop);
692 if (dirnode != aliases) {
693 for (p = prom_firstprop (n, buffer2);
694 p && *p;
695 p = prom_nextprop (n, p, buffer2)) {
696 j++;
697 if ((len == strlen (p))
698 && !strncmp (p, name, len)) {
699 ino = j;
700 type = OPFSL_PROPERTY;
701 break;
702 }
703 }
704 } else {
705 int k;
706 for (k = 0; k < aliases_nodes; k++) {
707 j++;
708 if (alias_names [k]
709 && (len == strlen (alias_names [k]))
710 && !strncmp (alias_names [k], name, len)) {
711 ino = j;
712 type = OPFSL_PROPERTY;
713 break;
714 }
715 }
716 } 207 }
208 child = child->sibling;
717 } 209 }
718 if (!ino) { 210
719 ino = lookup_children (NODE(dir->i_ino).child, name, len); 211 prop = dp->properties;
720 if (ino) 212 while (prop) {
721 type = OPFSL_DIR; 213 int n = strlen(prop->name);
722 else { 214
723 unlock_kernel(); 215 if (len == n && !strncmp(prop->name, name, len)) {
724 return ERR_PTR(-ENOENT); 216 ent_type = op_inode_prop;
217 ent_data.prop = prop;
218 ino = prop->unique_id;
219 goto found;
725 } 220 }
221
222 prop = prop->next;
726 } 223 }
727 inode = iget (dir->i_sb, ino); 224
728 unlock_kernel(); 225 mutex_unlock(&op_mutex);
226 return ERR_PTR(-ENOENT);
227
228found:
229 inode = iget(dir->i_sb, ino);
230 mutex_unlock(&op_mutex);
729 if (!inode) 231 if (!inode)
730 return ERR_PTR(-EINVAL); 232 return ERR_PTR(-EINVAL);
731 switch (type) { 233 ent_oi = OP_I(inode);
732 case OPFSL_DIR: 234 ent_oi->type = ent_type;
235 ent_oi->u = ent_data;
236
237 switch (ent_type) {
238 case op_inode_node:
733 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; 239 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
734 if (ino == OPENPROM_FIRST_INO + aliases) { 240 inode->i_op = &openprom_inode_operations;
735 inode->i_mode |= S_IWUSR;
736 inode->i_op = &openprom_alias_inode_operations;
737 } else
738 inode->i_op = &openprom_inode_operations;
739 inode->i_fop = &openprom_operations; 241 inode->i_fop = &openprom_operations;
740 inode->i_nlink = 2; 242 inode->i_nlink = 2;
741 break; 243 break;
742 case OPFSL_NODENUM: 244 case op_inode_prop:
743 inode->i_mode = S_IFREG | S_IRUGO; 245 if (!strcmp(dp->name, "options") && (len == 17) &&
744 inode->i_fop = &openpromfs_nodenum_ops; 246 !strncmp (name, "security-password", 17))
745 inode->i_nlink = 1;
746 inode->u.generic_ip = (void *)(long)(n);
747 break;
748 case OPFSL_PROPERTY:
749 if ((dirnode == options) && (len == 17)
750 && !strncmp (name, "security-password", 17))
751 inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; 247 inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
752 else { 248 else
753 inode->i_mode = S_IFREG | S_IRUGO; 249 inode->i_mode = S_IFREG | S_IRUGO;
754 if (dirnode == options || dirnode == aliases) {
755 if (len != 4 || strncmp (name, "name", 4))
756 inode->i_mode |= S_IWUSR;
757 }
758 }
759 inode->i_fop = &openpromfs_prop_ops; 250 inode->i_fop = &openpromfs_prop_ops;
760 inode->i_nlink = 1; 251 inode->i_nlink = 1;
761 if (inode->i_size < 0) 252 inode->i_size = ent_oi->u.prop->length;
762 inode->i_size = 0;
763 inode->u.generic_ip = (void *)(long)(((u16)dirnode) |
764 (((u16)(ino - NODEP2INO(NODE(dir->i_ino).first_prop) - 1)) << 16));
765 break; 253 break;
766 } 254 }
767 255
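The property_* functions above are the heart of the rewrite: instead of hand-maintained offsets into a private buffer, each property file is rendered through the seq_file API. Below is a minimal sketch of the same single-record pattern against the 2.6-era API; the demo_ names and the payload string are illustrative, not from the patch.

#include <linux/fs.h>
#include <linux/seq_file.h>

static char demo_payload[] = "hello, seq_file";

static void *demo_start(struct seq_file *f, loff_t *pos)
{
	return *pos == 0 ? pos : NULL;		/* exactly one record */
}

static void *demo_next(struct seq_file *f, void *v, loff_t *pos)
{
	(*pos)++;
	return NULL;				/* no second record */
}

static void demo_stop(struct seq_file *f, void *v)
{
	/* nothing to unlock or free */
}

static int demo_show(struct seq_file *f, void *v)
{
	seq_printf(f, "%s\n", (char *)f->private);
	return 0;
}

static struct seq_operations demo_op = {
	.start	= demo_start,
	.next	= demo_next,
	.stop	= demo_stop,
	.show	= demo_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
	int ret = seq_open(file, &demo_op);

	if (!ret) {
		struct seq_file *m = file->private_data;
		m->private = demo_payload;	/* handed to ->show */
	}
	return ret;
}

seq_read(), seq_lseek() and seq_release() then supply read(), llseek() and release() for free, which is what lets the patch delete every line of manual *ppos arithmetic in the old property_read().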
@@ -775,237 +263,89 @@ static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentr
775static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) 263static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
776{ 264{
777 struct inode *inode = filp->f_dentry->d_inode; 265 struct inode *inode = filp->f_dentry->d_inode;
266 struct op_inode_info *oi = OP_I(inode);
267 struct device_node *dp = oi->u.node;
268 struct device_node *child;
269 struct property *prop;
778 unsigned int ino; 270 unsigned int ino;
779 u32 n; 271 int i;
780 int i, j; 272
781 char buffer[128]; 273 mutex_lock(&op_mutex);
782 u16 node;
783 char *p;
784 char buffer2[64];
785
786 lock_kernel();
787 274
788 ino = inode->i_ino; 275 ino = inode->i_ino;
789 i = filp->f_pos; 276 i = filp->f_pos;
790 switch (i) { 277 switch (i) {
791 case 0: 278 case 0:
792 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) goto out; 279 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
280 goto out;
793 i++; 281 i++;
794 filp->f_pos++; 282 filp->f_pos++;
795 /* fall thru */ 283 /* fall thru */
796 case 1: 284 case 1:
797 if (filldir(dirent, "..", 2, i, 285 if (filldir(dirent, "..", 2, i,
798 (NODE(ino).parent == 0xffff) ? 286 (dp->parent == NULL ?
799 OPENPROM_ROOT_INO : NODE2INO(NODE(ino).parent), DT_DIR) < 0) 287 OPENPROM_ROOT_INO :
288 dp->parent->unique_id), DT_DIR) < 0)
800 goto out; 289 goto out;
801 i++; 290 i++;
802 filp->f_pos++; 291 filp->f_pos++;
803 /* fall thru */ 292 /* fall thru */
804 default: 293 default:
805 i -= 2; 294 i -= 2;
806 node = NODE(ino).child; 295
807 while (i && node != 0xffff) { 296 /* First, the children nodes as directories. */
808 node = nodes[node].next; 297 child = dp->child;
298 while (i && child) {
299 child = child->sibling;
809 i--; 300 i--;
810 } 301 }
811 while (node != 0xffff) { 302 while (child) {
812 if (prom_getname (nodes[node].node, buffer, 128) < 0) 303 if (filldir(dirent,
813 goto out; 304 child->path_component_name,
814 if (filldir(dirent, buffer, strlen(buffer), 305 strlen(child->path_component_name),
815 filp->f_pos, NODE2INO(node), DT_DIR) < 0) 306 filp->f_pos, child->unique_id, DT_DIR) < 0)
816 goto out; 307 goto out;
308
817 filp->f_pos++; 309 filp->f_pos++;
818 node = nodes[node].next; 310 child = child->sibling;
819 } 311 }
820 j = NODEP2INO(NODE(ino).first_prop); 312
821 if (!i) { 313 /* Next, the properties as files. */
822 if (filldir(dirent, ".node", 5, filp->f_pos, j, DT_REG) < 0) 314 prop = dp->properties;
315 while (i && prop) {
316 prop = prop->next;
317 i--;
318 }
319 while (prop) {
320 if (filldir(dirent, prop->name, strlen(prop->name),
321 filp->f_pos, prop->unique_id, DT_REG) < 0)
823 goto out; 322 goto out;
323
824 filp->f_pos++; 324 filp->f_pos++;
825 } else 325 prop = prop->next;
826 i--;
827 n = NODE(ino).node;
828 if (ino == OPENPROM_FIRST_INO + aliases) {
829 for (j++; i < aliases_nodes; i++, j++) {
830 if (alias_names [i]) {
831 if (filldir (dirent, alias_names [i],
832 strlen (alias_names [i]),
833 filp->f_pos, j, DT_REG) < 0) goto out;
834 filp->f_pos++;
835 }
836 }
837 } else {
838 for (p = prom_firstprop (n, buffer2);
839 p && *p;
840 p = prom_nextprop (n, p, buffer2)) {
841 j++;
842 if (i) i--;
843 else {
844 if (filldir(dirent, p, strlen(p),
845 filp->f_pos, j, DT_REG) < 0)
846 goto out;
847 filp->f_pos++;
848 }
849 }
850 } 326 }
851 } 327 }
852out: 328out:
853 unlock_kernel(); 329 mutex_unlock(&op_mutex);
854 return 0;
855}
856
857static int openpromfs_create (struct inode *dir, struct dentry *dentry, int mode,
858 struct nameidata *nd)
859{
860 char *p;
861 struct inode *inode;
862
863 if (!dir)
864 return -ENOENT;
865 if (dentry->d_name.len > 256)
866 return -EINVAL;
867 p = kmalloc (dentry->d_name.len + 1, GFP_KERNEL);
868 if (!p)
869 return -ENOMEM;
870 strncpy (p, dentry->d_name.name, dentry->d_name.len);
871 p [dentry->d_name.len] = 0;
872 lock_kernel();
873 if (aliases_nodes == ALIASES_NNODES) {
874 kfree(p);
875 unlock_kernel();
876 return -EIO;
877 }
878 alias_names [aliases_nodes++] = p;
879 inode = iget (dir->i_sb,
880 NODEP2INO(NODE(dir->i_ino).first_prop) + aliases_nodes);
881 if (!inode) {
882 unlock_kernel();
883 return -EINVAL;
884 }
885 inode->i_mode = S_IFREG | S_IRUGO | S_IWUSR;
886 inode->i_fop = &openpromfs_prop_ops;
887 inode->i_nlink = 1;
888 if (inode->i_size < 0) inode->i_size = 0;
889 inode->u.generic_ip = (void *)(long)(((u16)aliases) |
890 (((u16)(aliases_nodes - 1)) << 16));
891 unlock_kernel();
892 d_instantiate(dentry, inode);
893 return 0; 330 return 0;
894} 331}
895 332
896static int openpromfs_unlink (struct inode *dir, struct dentry *dentry) 333static kmem_cache_t *op_inode_cachep;
897{
898 unsigned int len;
899 char *p;
900 const char *name;
901 int i;
902
903 name = dentry->d_name.name;
904 len = dentry->d_name.len;
905 lock_kernel();
906 for (i = 0; i < aliases_nodes; i++)
907 if ((strlen (alias_names [i]) == len)
908 && !strncmp (name, alias_names[i], len)) {
909 char buffer[512];
910
911 p = alias_names [i];
912 alias_names [i] = NULL;
913 kfree (p);
914 strcpy (buffer, "nvunalias ");
915 memcpy (buffer + 10, name, len);
916 buffer [10 + len] = 0;
917 prom_feval (buffer);
918 }
919 unlock_kernel();
920 return 0;
921}
922 334
923/* {{{ init section */ 335static struct inode *openprom_alloc_inode(struct super_block *sb)
924static int __init check_space (u16 n)
925{ 336{
926 unsigned long pages; 337 struct op_inode_info *oi;
927 338
928 if ((1 << alloced) * PAGE_SIZE < (n + 2) * sizeof(openpromfs_node)) { 339 oi = kmem_cache_alloc(op_inode_cachep, SLAB_KERNEL);
929 pages = __get_free_pages (GFP_KERNEL, alloced + 1); 340 if (!oi)
930 if (!pages) 341 return NULL;
931 return -1;
932 342
933 if (nodes) { 343 return &oi->vfs_inode;
934 memcpy ((char *)pages, (char *)nodes,
935 (1 << alloced) * PAGE_SIZE);
936 free_pages ((unsigned long)nodes, alloced);
937 }
938 alloced++;
939 nodes = (openpromfs_node *)pages;
940 }
941 return 0;
942} 344}
943 345
944static u16 __init get_nodes (u16 parent, u32 node) 346static void openprom_destroy_inode(struct inode *inode)
945{ 347{
946 char *p; 348 kmem_cache_free(op_inode_cachep, OP_I(inode));
947 u16 n = last_node++, i;
948 char buffer[64];
949
950 if (check_space (n) < 0)
951 return 0xffff;
952 nodes[n].parent = parent;
953 nodes[n].node = node;
954 nodes[n].next = 0xffff;
955 nodes[n].child = 0xffff;
956 nodes[n].first_prop = first_prop++;
957 if (!parent) {
958 char buffer[8];
959 int j;
960
961 if ((j = prom_getproperty (node, "name", buffer, 8)) >= 0) {
962 buffer[j] = 0;
963 if (!strcmp (buffer, "options"))
964 options = n;
965 else if (!strcmp (buffer, "aliases"))
966 aliases = n;
967 }
968 }
969 if (n != aliases)
970 for (p = prom_firstprop (node, buffer);
971 p && p != (char *)-1 && *p;
972 p = prom_nextprop (node, p, buffer))
973 first_prop++;
974 else {
975 char *q;
976 for (p = prom_firstprop (node, buffer);
977 p && p != (char *)-1 && *p;
978 p = prom_nextprop (node, p, buffer)) {
979 if (aliases_nodes == ALIASES_NNODES)
980 break;
981 for (i = 0; i < aliases_nodes; i++)
982 if (!strcmp (p, alias_names [i]))
983 break;
984 if (i < aliases_nodes)
985 continue;
986 q = kmalloc (strlen (p) + 1, GFP_KERNEL);
987 if (!q)
988 return 0xffff;
989 strcpy (q, p);
990 alias_names [aliases_nodes++] = q;
991 }
992 first_prop += ALIASES_NNODES;
993 }
994 node = prom_getchild (node);
995 if (node) {
996 parent = get_nodes (n, node);
997 if (parent == 0xffff)
998 return 0xffff;
999 nodes[n].child = parent;
1000 while ((node = prom_getsibling (node)) != 0) {
1001 i = get_nodes (n, node);
1002 if (i == 0xffff)
1003 return 0xffff;
1004 nodes[parent].next = i;
1005 parent = i;
1006 }
1007 }
1008 return n;
1009} 349}
1010 350
1011static void openprom_read_inode(struct inode * inode) 351static void openprom_read_inode(struct inode * inode)
@@ -1025,6 +365,8 @@ static int openprom_remount(struct super_block *sb, int *flags, char *data)
1025} 365}
1026 366
1027static struct super_operations openprom_sops = { 367static struct super_operations openprom_sops = {
368 .alloc_inode = openprom_alloc_inode,
369 .destroy_inode = openprom_destroy_inode,
1028 .read_inode = openprom_read_inode, 370 .read_inode = openprom_read_inode,
1029 .statfs = simple_statfs, 371 .statfs = simple_statfs,
1030 .remount_fs = openprom_remount, 372 .remount_fs = openprom_remount,
@@ -1032,7 +374,8 @@ static struct super_operations openprom_sops = {
1032 374
1033static int openprom_fill_super(struct super_block *s, void *data, int silent) 375static int openprom_fill_super(struct super_block *s, void *data, int silent)
1034{ 376{
1035 struct inode * root_inode; 377 struct inode *root_inode;
378 struct op_inode_info *oi;
1036 379
1037 s->s_flags |= MS_NOATIME; 380 s->s_flags |= MS_NOATIME;
1038 s->s_blocksize = 1024; 381 s->s_blocksize = 1024;
@@ -1043,6 +386,11 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent)
1043 root_inode = iget(s, OPENPROM_ROOT_INO); 386 root_inode = iget(s, OPENPROM_ROOT_INO);
1044 if (!root_inode) 387 if (!root_inode)
1045 goto out_no_root; 388 goto out_no_root;
389
390 oi = OP_I(root_inode);
391 oi->type = op_inode_node;
392 oi->u.node = of_find_node_by_path("/");
393
1046 s->s_root = d_alloc_root(root_inode); 394 s->s_root = d_alloc_root(root_inode);
1047 if (!s->s_root) 395 if (!s->s_root)
1048 goto out_no_root; 396 goto out_no_root;
@@ -1067,29 +415,39 @@ static struct file_system_type openprom_fs_type = {
1067 .kill_sb = kill_anon_super, 415 .kill_sb = kill_anon_super,
1068}; 416};
1069 417
418static void op_inode_init_once(void *data, kmem_cache_t * cachep, unsigned long flags)
419{
420 struct op_inode_info *oi = (struct op_inode_info *) data;
421
422 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
423 SLAB_CTOR_CONSTRUCTOR)
424 inode_init_once(&oi->vfs_inode);
425}
426
1070static int __init init_openprom_fs(void) 427static int __init init_openprom_fs(void)
1071{ 428{
1072 nodes = (openpromfs_node *)__get_free_pages(GFP_KERNEL, 0); 429 int err;
1073 if (!nodes) { 430
1074 printk (KERN_WARNING "openpromfs: can't get free page\n"); 431 op_inode_cachep = kmem_cache_create("op_inode_cache",
1075 return -EIO; 432 sizeof(struct op_inode_info),
1076 } 433 0,
1077 if (get_nodes (0xffff, prom_root_node) == 0xffff) { 434 (SLAB_RECLAIM_ACCOUNT |
1078 printk (KERN_WARNING "openpromfs: couldn't setup tree\n"); 435 SLAB_MEM_SPREAD),
1079 return -EIO; 436 op_inode_init_once, NULL);
1080 } 437 if (!op_inode_cachep)
1081 nodes[last_node].first_prop = first_prop; 438 return -ENOMEM;
1082 return register_filesystem(&openprom_fs_type); 439
440 err = register_filesystem(&openprom_fs_type);
441 if (err)
442 kmem_cache_destroy(op_inode_cachep);
443
444 return err;
1083} 445}
1084 446
1085static void __exit exit_openprom_fs(void) 447static void __exit exit_openprom_fs(void)
1086{ 448{
1087 int i;
1088 unregister_filesystem(&openprom_fs_type); 449 unregister_filesystem(&openprom_fs_type);
1089 free_pages ((unsigned long)nodes, alloced); 450 kmem_cache_destroy(op_inode_cachep);
1090 for (i = 0; i < aliases_nodes; i++)
1091 kfree (alias_names [i]);
1092 nodes = NULL;
1093} 451}
1094 452
1095module_init(init_openprom_fs) 453module_init(init_openprom_fs)
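The openprom_alloc_inode()/openprom_destroy_inode()/op_inode_init_once() trio above is the standard slab-embedded inode idiom: the filesystem allocates a containing structure and recovers it from the VFS inode with container_of(). A minimal sketch under the 2.6-era slab flags; the demo_ names are illustrative.

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/slab.h>

struct demo_inode_info {
	struct inode	vfs_inode;	/* embedded, never pointed to */
	void		*fs_private;
};

/* created with kmem_cache_create(), as in init_openprom_fs() above */
static kmem_cache_t *demo_cachep;

static inline struct demo_inode_info *DEMO_I(struct inode *inode)
{
	return container_of(inode, struct demo_inode_info, vfs_inode);
}

static struct inode *demo_alloc_inode(struct super_block *sb)
{
	struct demo_inode_info *i = kmem_cache_alloc(demo_cachep, SLAB_KERNEL);

	return i ? &i->vfs_inode : NULL;
}

static void demo_destroy_inode(struct inode *inode)
{
	kmem_cache_free(demo_cachep, DEMO_I(inode));
}

static void demo_init_once(void *data, kmem_cache_t *cachep, unsigned long flags)
{
	/* runs once per slab object, not once per allocation */
	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
	    SLAB_CTOR_CONSTRUCTOR)
		inode_init_once(&((struct demo_inode_info *)data)->vfs_inode);
}

Because the constructor runs only when the slab takes a fresh object, only state identical across all uses (the VFS inode initialisation) belongs there; per-inode state, like op_inode_info's type and u, is filled in after iget(), exactly as openpromfs_lookup() and openprom_fill_super() do above.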
diff --git a/fs/pnode.c b/fs/pnode.c
index 37b568ed0e05..da42ee61c1df 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -53,8 +53,7 @@ static int do_make_slave(struct vfsmount *mnt)
53 if (master) { 53 if (master) {
54 list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) 54 list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
55 slave_mnt->mnt_master = master; 55 slave_mnt->mnt_master = master;
56 list_del(&mnt->mnt_slave); 56 list_move(&mnt->mnt_slave, &master->mnt_slave_list);
57 list_add(&mnt->mnt_slave, &master->mnt_slave_list);
58 list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); 57 list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
59 INIT_LIST_HEAD(&mnt->mnt_slave_list); 58 INIT_LIST_HEAD(&mnt->mnt_slave_list);
60 } else { 59 } else {
@@ -283,10 +282,8 @@ static void __propagate_umount(struct vfsmount *mnt)
283 * umount the child only if the child has no 282 * umount the child only if the child has no
284 * other children 283 * other children
285 */ 284 */
286 if (child && list_empty(&child->mnt_mounts)) { 285 if (child && list_empty(&child->mnt_mounts))
287 list_del(&child->mnt_hash); 286 list_move_tail(&child->mnt_hash, &mnt->mnt_hash);
288 list_add_tail(&child->mnt_hash, &mnt->mnt_hash);
289 }
290 } 287 }
291} 288}
292 289
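Both pnode.c hunks are behaviour-preserving cleanups: list_move() and list_move_tail() from <linux/list.h> are exact shorthands for the unlink/relink pairs they replace. Their 2.6-era definitions are essentially:

static inline void list_move(struct list_head *list, struct list_head *head)
{
	__list_del(list->prev, list->next);	/* unlink, no poisoning */
	list_add(list, head);
}

static inline void list_move_tail(struct list_head *list, struct list_head *head)
{
	__list_del(list->prev, list->next);
	list_add_tail(list, head);
}

The only difference from open-coded list_del()+list_add() is that __list_del() skips the pointer poisoning, which is unobservable here because the entry is immediately relinked.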
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6afff725a8c9..6ba7785319de 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -74,6 +74,16 @@
74#include <linux/poll.h> 74#include <linux/poll.h>
75#include "internal.h" 75#include "internal.h"
76 76
77/* NOTE:
78 * Implementing inode permission operations in /proc is almost
79 * certainly an error. Permission checks need to happen during
80 * each system call not at open time. The reason is that most of
81 * what we wish to check for permissions in /proc varies at runtime.
82 *
83 * The classic example of a problem is opening file descriptors
84 * in /proc for a task before it execs a suid executable.
85 */
86
77/* 87/*
78 * For hysterical raisins we keep the same inumbers as in the old procfs. 88 * For hysterical raisins we keep the same inumbers as in the old procfs.
79 * Feel free to change the macro below - just keep the range distinct from 89 * Feel free to change the macro below - just keep the range distinct from
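A hedged userspace sketch (not from the patch; the passwd path is merely illustrative) of the time-of-check/time-of-use hole the note describes: a descriptor opened while the target is still an ordinary process survives the target's exec of a setuid binary, so a permission check done only at open time has already been bypassed.

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	pid_t pid = fork();
	int fd;

	if (pid == 0) {
		sleep(1);			/* let the parent open first */
		execl("/usr/bin/passwd", "passwd", (char *)NULL); /* setuid */
		_exit(1);
	}
	snprintf(path, sizeof(path), "/proc/%d/mem", (int)pid);
	fd = open(path, O_RDONLY);		/* check happens only here... */
	sleep(2);				/* ...but the fd outlives the exec */
	printf("still-open fd %d now names a setuid process's memory\n", fd);
	return 0;
}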
@@ -121,6 +131,8 @@ enum pid_directory_inos {
121 PROC_TGID_ATTR_PREV, 131 PROC_TGID_ATTR_PREV,
122 PROC_TGID_ATTR_EXEC, 132 PROC_TGID_ATTR_EXEC,
123 PROC_TGID_ATTR_FSCREATE, 133 PROC_TGID_ATTR_FSCREATE,
134 PROC_TGID_ATTR_KEYCREATE,
135 PROC_TGID_ATTR_SOCKCREATE,
124#endif 136#endif
125#ifdef CONFIG_AUDITSYSCALL 137#ifdef CONFIG_AUDITSYSCALL
126 PROC_TGID_LOGINUID, 138 PROC_TGID_LOGINUID,
@@ -162,6 +174,8 @@ enum pid_directory_inos {
162 PROC_TID_ATTR_PREV, 174 PROC_TID_ATTR_PREV,
163 PROC_TID_ATTR_EXEC, 175 PROC_TID_ATTR_EXEC,
164 PROC_TID_ATTR_FSCREATE, 176 PROC_TID_ATTR_FSCREATE,
177 PROC_TID_ATTR_KEYCREATE,
178 PROC_TID_ATTR_SOCKCREATE,
165#endif 179#endif
166#ifdef CONFIG_AUDITSYSCALL 180#ifdef CONFIG_AUDITSYSCALL
167 PROC_TID_LOGINUID, 181 PROC_TID_LOGINUID,
@@ -173,6 +187,9 @@ enum pid_directory_inos {
173 PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ 187 PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */
174}; 188};
175 189
190/* Worst case buffer size needed for holding an integer. */
191#define PROC_NUMBUF 10
192
176struct pid_entry { 193struct pid_entry {
177 int type; 194 int type;
178 int len; 195 int len;
@@ -275,6 +292,8 @@ static struct pid_entry tgid_attr_stuff[] = {
275 E(PROC_TGID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), 292 E(PROC_TGID_ATTR_PREV, "prev", S_IFREG|S_IRUGO),
276 E(PROC_TGID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), 293 E(PROC_TGID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO),
277 E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), 294 E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
295 E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
296 E(PROC_TGID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO),
278 {0,0,NULL,0} 297 {0,0,NULL,0}
279}; 298};
280static struct pid_entry tid_attr_stuff[] = { 299static struct pid_entry tid_attr_stuff[] = {
@@ -282,6 +301,8 @@ static struct pid_entry tid_attr_stuff[] = {
282 E(PROC_TID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), 301 E(PROC_TID_ATTR_PREV, "prev", S_IFREG|S_IRUGO),
283 E(PROC_TID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), 302 E(PROC_TID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO),
284 E(PROC_TID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), 303 E(PROC_TID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
304 E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
305 E(PROC_TID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO),
285 {0,0,NULL,0} 306 {0,0,NULL,0}
286}; 307};
287#endif 308#endif
@@ -290,12 +311,15 @@ static struct pid_entry tid_attr_stuff[] = {
290 311
291static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) 312static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
292{ 313{
293 struct task_struct *task = proc_task(inode); 314 struct task_struct *task = get_proc_task(inode);
294 struct files_struct *files; 315 struct files_struct *files = NULL;
295 struct file *file; 316 struct file *file;
296 int fd = proc_type(inode) - PROC_TID_FD_DIR; 317 int fd = proc_fd(inode);
297 318
298 files = get_files_struct(task); 319 if (task) {
320 files = get_files_struct(task);
321 put_task_struct(task);
322 }
299 if (files) { 323 if (files) {
300 /* 324 /*
301 * We are not taking a ref to the file structure, so we must 325 * We are not taking a ref to the file structure, so we must
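This hunk sets the template applied through the rest of the file: get_proc_task() returns a counted task reference or NULL, so every handler must tolerate a vanished task and balance the lookup with put_task_struct(). Sketched minimally below, assuming the surrounding file's context; the demo_ wrapper is illustrative and oomkilladj merely stands in for any per-task field.

static int demo_read_task_field(struct inode *inode, int *out)
{
	struct task_struct *task = get_proc_task(inode);

	if (!task)
		return -ESRCH;		/* task exited; the inode is stale */
	*out = task->oomkilladj;
	put_task_struct(task);		/* always balance the reference */
	return 0;
}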
@@ -327,29 +351,33 @@ static struct fs_struct *get_fs_struct(struct task_struct *task)
327 return fs; 351 return fs;
328} 352}
329 353
330static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) 354static int get_nr_threads(struct task_struct *tsk)
331{ 355{
332 struct fs_struct *fs = get_fs_struct(proc_task(inode)); 356 /* Must be called with the rcu_read_lock held */
333 int result = -ENOENT; 357 unsigned long flags;
334 if (fs) { 358 int count = 0;
335 read_lock(&fs->lock); 359
336 *mnt = mntget(fs->pwdmnt); 360 if (lock_task_sighand(tsk, &flags)) {
337 *dentry = dget(fs->pwd); 361 count = atomic_read(&tsk->signal->count);
338 read_unlock(&fs->lock); 362 unlock_task_sighand(tsk, &flags);
339 result = 0;
340 put_fs_struct(fs);
341 } 363 }
342 return result; 364 return count;
343} 365}
344 366
345static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) 367static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
346{ 368{
347 struct fs_struct *fs = get_fs_struct(proc_task(inode)); 369 struct task_struct *task = get_proc_task(inode);
370 struct fs_struct *fs = NULL;
348 int result = -ENOENT; 371 int result = -ENOENT;
372
373 if (task) {
374 fs = get_fs_struct(task);
375 put_task_struct(task);
376 }
349 if (fs) { 377 if (fs) {
350 read_lock(&fs->lock); 378 read_lock(&fs->lock);
351 *mnt = mntget(fs->rootmnt); 379 *mnt = mntget(fs->pwdmnt);
352 *dentry = dget(fs->root); 380 *dentry = dget(fs->pwd);
353 read_unlock(&fs->lock); 381 read_unlock(&fs->lock);
354 result = 0; 382 result = 0;
355 put_fs_struct(fs); 383 put_fs_struct(fs);
@@ -357,42 +385,16 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
357 return result; 385 return result;
358} 386}
359 387
360 388static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
361/* Same as proc_root_link, but this addionally tries to get fs from other
362 * threads in the group */
363static int proc_task_root_link(struct inode *inode, struct dentry **dentry,
364 struct vfsmount **mnt)
365{ 389{
366 struct fs_struct *fs; 390 struct task_struct *task = get_proc_task(inode);
391 struct fs_struct *fs = NULL;
367 int result = -ENOENT; 392 int result = -ENOENT;
368 struct task_struct *leader = proc_task(inode);
369 393
370 task_lock(leader); 394 if (task) {
371 fs = leader->fs; 395 fs = get_fs_struct(task);
372 if (fs) { 396 put_task_struct(task);
373 atomic_inc(&fs->count);
374 task_unlock(leader);
375 } else {
376 /* Try to get fs from other threads */
377 task_unlock(leader);
378 read_lock(&tasklist_lock);
379 if (pid_alive(leader)) {
380 struct task_struct *task = leader;
381
382 while ((task = next_thread(task)) != leader) {
383 task_lock(task);
384 fs = task->fs;
385 if (fs) {
386 atomic_inc(&fs->count);
387 task_unlock(task);
388 break;
389 }
390 task_unlock(task);
391 }
392 }
393 read_unlock(&tasklist_lock);
394 } 397 }
395
396 if (fs) { 398 if (fs) {
397 read_lock(&fs->lock); 399 read_lock(&fs->lock);
398 *mnt = mntget(fs->rootmnt); 400 *mnt = mntget(fs->rootmnt);
@@ -404,7 +406,6 @@ static int proc_task_root_link(struct inode *inode, struct dentry **dentry,
404 return result; 406 return result;
405} 407}
406 408
407
408#define MAY_PTRACE(task) \ 409#define MAY_PTRACE(task) \
409 (task == current || \ 410 (task == current || \
410 (task->parent == current && \ 411 (task->parent == current && \
@@ -535,142 +536,22 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
535/************************************************************************/ 536/************************************************************************/
536 537
537/* permission checks */ 538/* permission checks */
538 539static int proc_fd_access_allowed(struct inode *inode)
539/* If the process being read is separated by chroot from the reading process,
540 * don't let the reader access the threads.
541 *
542 * note: this does dput(root) and mntput(vfsmnt) on exit.
543 */
544static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt)
545{
546 struct dentry *de, *base;
547 struct vfsmount *our_vfsmnt, *mnt;
548 int res = 0;
549
550 read_lock(&current->fs->lock);
551 our_vfsmnt = mntget(current->fs->rootmnt);
552 base = dget(current->fs->root);
553 read_unlock(&current->fs->lock);
554
555 spin_lock(&vfsmount_lock);
556 de = root;
557 mnt = vfsmnt;
558
559 while (mnt != our_vfsmnt) {
560 if (mnt == mnt->mnt_parent)
561 goto out;
562 de = mnt->mnt_mountpoint;
563 mnt = mnt->mnt_parent;
564 }
565
566 if (!is_subdir(de, base))
567 goto out;
568 spin_unlock(&vfsmount_lock);
569
570exit:
571 dput(base);
572 mntput(our_vfsmnt);
573 dput(root);
574 mntput(vfsmnt);
575 return res;
576out:
577 spin_unlock(&vfsmount_lock);
578 res = -EACCES;
579 goto exit;
580}
581
582static int proc_check_root(struct inode *inode)
583{
584 struct dentry *root;
585 struct vfsmount *vfsmnt;
586
587 if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... */
588 return -ENOENT;
589 return proc_check_chroot(root, vfsmnt);
590}
591
592static int proc_permission(struct inode *inode, int mask, struct nameidata *nd)
593{
594 if (generic_permission(inode, mask, NULL) != 0)
595 return -EACCES;
596 return proc_check_root(inode);
597}
598
599static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd)
600{
601 struct dentry *root;
602 struct vfsmount *vfsmnt;
603
604 if (generic_permission(inode, mask, NULL) != 0)
605 return -EACCES;
606
607 if (proc_task_root_link(inode, &root, &vfsmnt))
608 return -ENOENT;
609
610 return proc_check_chroot(root, vfsmnt);
611}
612
613extern struct seq_operations proc_pid_maps_op;
614static int maps_open(struct inode *inode, struct file *file)
615{
616 struct task_struct *task = proc_task(inode);
617 int ret = seq_open(file, &proc_pid_maps_op);
618 if (!ret) {
619 struct seq_file *m = file->private_data;
620 m->private = task;
621 }
622 return ret;
623}
624
625static struct file_operations proc_maps_operations = {
626 .open = maps_open,
627 .read = seq_read,
628 .llseek = seq_lseek,
629 .release = seq_release,
630};
631
632#ifdef CONFIG_NUMA
633extern struct seq_operations proc_pid_numa_maps_op;
634static int numa_maps_open(struct inode *inode, struct file *file)
635{
636 struct task_struct *task = proc_task(inode);
637 int ret = seq_open(file, &proc_pid_numa_maps_op);
638 if (!ret) {
639 struct seq_file *m = file->private_data;
640 m->private = task;
641 }
642 return ret;
643}
644
645static struct file_operations proc_numa_maps_operations = {
646 .open = numa_maps_open,
647 .read = seq_read,
648 .llseek = seq_lseek,
649 .release = seq_release,
650};
651#endif
652
653#ifdef CONFIG_MMU
654extern struct seq_operations proc_pid_smaps_op;
655static int smaps_open(struct inode *inode, struct file *file)
656{ 540{
657 struct task_struct *task = proc_task(inode); 541 struct task_struct *task;
658 int ret = seq_open(file, &proc_pid_smaps_op); 542 int allowed = 0;
659 if (!ret) { 543 /* Allow access to a task's file descriptors if it is us or we
660 struct seq_file *m = file->private_data; 544 * may use ptrace attach to the process and find out that
661 m->private = task; 545 * information.
546 */
547 task = get_proc_task(inode);
548 if (task) {
549 allowed = ptrace_may_attach(task);
550 put_task_struct(task);
662 } 551 }
663 return ret; 552 return allowed;
664} 553}
665 554
666static struct file_operations proc_smaps_operations = {
667 .open = smaps_open,
668 .read = seq_read,
669 .llseek = seq_lseek,
670 .release = seq_release,
671};
672#endif
673
674extern struct seq_operations mounts_op; 555extern struct seq_operations mounts_op;
675struct proc_mounts { 556struct proc_mounts {
676 struct seq_file m; 557 struct seq_file m;
@@ -679,16 +560,19 @@ struct proc_mounts {
679 560
680static int mounts_open(struct inode *inode, struct file *file) 561static int mounts_open(struct inode *inode, struct file *file)
681{ 562{
682 struct task_struct *task = proc_task(inode); 563 struct task_struct *task = get_proc_task(inode);
683 struct namespace *namespace; 564 struct namespace *namespace = NULL;
684 struct proc_mounts *p; 565 struct proc_mounts *p;
685 int ret = -EINVAL; 566 int ret = -EINVAL;
686 567
687 task_lock(task); 568 if (task) {
688 namespace = task->namespace; 569 task_lock(task);
689 if (namespace) 570 namespace = task->namespace;
690 get_namespace(namespace); 571 if (namespace)
691 task_unlock(task); 572 get_namespace(namespace);
573 task_unlock(task);
574 put_task_struct(task);
575 }
692 576
693 if (namespace) { 577 if (namespace) {
694 ret = -ENOMEM; 578 ret = -ENOMEM;
@@ -745,17 +629,21 @@ static struct file_operations proc_mounts_operations = {
745extern struct seq_operations mountstats_op; 629extern struct seq_operations mountstats_op;
746static int mountstats_open(struct inode *inode, struct file *file) 630static int mountstats_open(struct inode *inode, struct file *file)
747{ 631{
748 struct task_struct *task = proc_task(inode);
749 int ret = seq_open(file, &mountstats_op); 632 int ret = seq_open(file, &mountstats_op);
750 633
751 if (!ret) { 634 if (!ret) {
752 struct seq_file *m = file->private_data; 635 struct seq_file *m = file->private_data;
753 struct namespace *namespace; 636 struct namespace *namespace = NULL;
754 task_lock(task); 637 struct task_struct *task = get_proc_task(inode);
755 namespace = task->namespace; 638
756 if (namespace) 639 if (task) {
757 get_namespace(namespace); 640 task_lock(task);
758 task_unlock(task); 641 namespace = task->namespace;
642 if (namespace)
643 get_namespace(namespace);
644 task_unlock(task);
645 put_task_struct(task);
646 }
759 647
760 if (namespace) 648 if (namespace)
761 m->private = namespace; 649 m->private = namespace;
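Sketch of the ownership handoff mounts_open() and mountstats_open() now share: pin the task, take a counted namespace reference under task_lock(), drop the task pin immediately, and park the namespace in the seq_file for the matching ->release to drop. The demo_ helper is illustrative and the error handling is pared down.

static int demo_grab_namespace(struct inode *inode, struct seq_file *m)
{
	struct task_struct *task = get_proc_task(inode);
	struct namespace *ns = NULL;

	if (task) {
		task_lock(task);
		ns = task->namespace;
		if (ns)
			get_namespace(ns);	/* counted ref outlives the task pin */
		task_unlock(task);
		put_task_struct(task);
	}
	if (!ns)
		return -EINVAL;
	m->private = ns;			/* ->release must put_namespace() */
	return 0;
}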
@@ -782,18 +670,27 @@ static ssize_t proc_info_read(struct file * file, char __user * buf,
782 struct inode * inode = file->f_dentry->d_inode; 670 struct inode * inode = file->f_dentry->d_inode;
783 unsigned long page; 671 unsigned long page;
784 ssize_t length; 672 ssize_t length;
785 struct task_struct *task = proc_task(inode); 673 struct task_struct *task = get_proc_task(inode);
674
675 length = -ESRCH;
676 if (!task)
677 goto out_no_task;
786 678
787 if (count > PROC_BLOCK_SIZE) 679 if (count > PROC_BLOCK_SIZE)
788 count = PROC_BLOCK_SIZE; 680 count = PROC_BLOCK_SIZE;
681
682 length = -ENOMEM;
789 if (!(page = __get_free_page(GFP_KERNEL))) 683 if (!(page = __get_free_page(GFP_KERNEL)))
790 return -ENOMEM; 684 goto out;
791 685
792 length = PROC_I(inode)->op.proc_read(task, (char*)page); 686 length = PROC_I(inode)->op.proc_read(task, (char*)page);
793 687
794 if (length >= 0) 688 if (length >= 0)
795 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); 689 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
796 free_page(page); 690 free_page(page);
691out:
692 put_task_struct(task);
693out_no_task:
797 return length; 694 return length;
798} 695}
799 696
@@ -810,12 +707,15 @@ static int mem_open(struct inode* inode, struct file* file)
810static ssize_t mem_read(struct file * file, char __user * buf, 707static ssize_t mem_read(struct file * file, char __user * buf,
811 size_t count, loff_t *ppos) 708 size_t count, loff_t *ppos)
812{ 709{
813 struct task_struct *task = proc_task(file->f_dentry->d_inode); 710 struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
814 char *page; 711 char *page;
815 unsigned long src = *ppos; 712 unsigned long src = *ppos;
816 int ret = -ESRCH; 713 int ret = -ESRCH;
817 struct mm_struct *mm; 714 struct mm_struct *mm;
818 715
716 if (!task)
717 goto out_no_task;
718
819 if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) 719 if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
820 goto out; 720 goto out;
821 721
@@ -865,6 +765,8 @@ out_put:
865out_free: 765out_free:
866 free_page((unsigned long) page); 766 free_page((unsigned long) page);
867out: 767out:
768 put_task_struct(task);
769out_no_task:
868 return ret; 770 return ret;
869} 771}
870 772
@@ -877,15 +779,20 @@ static ssize_t mem_write(struct file * file, const char * buf,
877{ 779{
878 int copied = 0; 780 int copied = 0;
879 char *page; 781 char *page;
880 struct task_struct *task = proc_task(file->f_dentry->d_inode); 782 struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
881 unsigned long dst = *ppos; 783 unsigned long dst = *ppos;
882 784
785 copied = -ESRCH;
786 if (!task)
787 goto out_no_task;
788
883 if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) 789 if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
884 return -ESRCH; 790 goto out;
885 791
792 copied = -ENOMEM;
886 page = (char *)__get_free_page(GFP_USER); 793 page = (char *)__get_free_page(GFP_USER);
887 if (!page) 794 if (!page)
888 return -ENOMEM; 795 goto out;
889 796
890 while (count > 0) { 797 while (count > 0) {
891 int this_len, retval; 798 int this_len, retval;
@@ -908,6 +815,9 @@ static ssize_t mem_write(struct file * file, const char * buf,
908 } 815 }
909 *ppos = dst; 816 *ppos = dst;
910 free_page((unsigned long) page); 817 free_page((unsigned long) page);
818out:
819 put_task_struct(task);
820out_no_task:
911 return copied; 821 return copied;
912} 822}
913#endif 823#endif
@@ -938,13 +848,18 @@ static struct file_operations proc_mem_operations = {
938static ssize_t oom_adjust_read(struct file *file, char __user *buf, 848static ssize_t oom_adjust_read(struct file *file, char __user *buf,
939 size_t count, loff_t *ppos) 849 size_t count, loff_t *ppos)
940{ 850{
941 struct task_struct *task = proc_task(file->f_dentry->d_inode); 851 struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
942 char buffer[8]; 852 char buffer[PROC_NUMBUF];
943 size_t len; 853 size_t len;
944 int oom_adjust = task->oomkilladj; 854 int oom_adjust;
945 loff_t __ppos = *ppos; 855 loff_t __ppos = *ppos;
946 856
947 len = sprintf(buffer, "%i\n", oom_adjust); 857 if (!task)
858 return -ESRCH;
859 oom_adjust = task->oomkilladj;
860 put_task_struct(task);
861
862 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
948 if (__ppos >= len) 863 if (__ppos >= len)
949 return 0; 864 return 0;
950 if (count > len-__ppos) 865 if (count > len-__ppos)
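The read side above also shows the bounded-formatting pattern, sketched here assuming PROC_NUMBUF as defined earlier in the patch. Using scnprintf() (already used by proc_loginuid_read() below), which returns the truncated length, guarantees the copy can never run past the buffer even if the value needs more digits than the buffer holds; the *ppos arithmetic makes short and repeated reads behave.

static ssize_t demo_int_read(int value, char __user *buf,
			     size_t count, loff_t *ppos)
{
	char buffer[PROC_NUMBUF];
	size_t len = scnprintf(buffer, sizeof(buffer), "%i\n", value);

	if (*ppos >= len)
		return 0;			/* EOF on re-read */
	if (count > len - *ppos)
		count = len - *ppos;
	if (copy_to_user(buf, buffer + *ppos, count))
		return -EFAULT;
	*ppos += count;
	return count;
}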
@@ -958,15 +873,15 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
958static ssize_t oom_adjust_write(struct file *file, const char __user *buf, 873static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
959 size_t count, loff_t *ppos) 874 size_t count, loff_t *ppos)
960{ 875{
961 struct task_struct *task = proc_task(file->f_dentry->d_inode); 876 struct task_struct *task;
962 char buffer[8], *end; 877 char buffer[PROC_NUMBUF], *end;
963 int oom_adjust; 878 int oom_adjust;
964 879
965 if (!capable(CAP_SYS_RESOURCE)) 880 if (!capable(CAP_SYS_RESOURCE))
966 return -EPERM; 881 return -EPERM;
967 memset(buffer, 0, 8); 882 memset(buffer, 0, sizeof(buffer));
968 if (count > 6) 883 if (count > sizeof(buffer) - 1)
969 count = 6; 884 count = sizeof(buffer) - 1;
970 if (copy_from_user(buffer, buf, count)) 885 if (copy_from_user(buffer, buf, count))
971 return -EFAULT; 886 return -EFAULT;
972 oom_adjust = simple_strtol(buffer, &end, 0); 887 oom_adjust = simple_strtol(buffer, &end, 0);
@@ -974,7 +889,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
974 return -EINVAL; 889 return -EINVAL;
975 if (*end == '\n') 890 if (*end == '\n')
976 end++; 891 end++;
892 task = get_proc_task(file->f_dentry->d_inode);
893 if (!task)
894 return -ESRCH;
977 task->oomkilladj = oom_adjust; 895 task->oomkilladj = oom_adjust;
896 put_task_struct(task);
978 if (end - buffer == 0) 897 if (end - buffer == 0)
979 return -EIO; 898 return -EIO;
980 return end - buffer; 899 return end - buffer;
@@ -985,22 +904,21 @@ static struct file_operations proc_oom_adjust_operations = {
985 .write = oom_adjust_write, 904 .write = oom_adjust_write,
986}; 905};
987 906
988static struct inode_operations proc_mem_inode_operations = {
989 .permission = proc_permission,
990};
991
992#ifdef CONFIG_AUDITSYSCALL 907#ifdef CONFIG_AUDITSYSCALL
993#define TMPBUFLEN 21 908#define TMPBUFLEN 21
994static ssize_t proc_loginuid_read(struct file * file, char __user * buf, 909static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
995 size_t count, loff_t *ppos) 910 size_t count, loff_t *ppos)
996{ 911{
997 struct inode * inode = file->f_dentry->d_inode; 912 struct inode * inode = file->f_dentry->d_inode;
998 struct task_struct *task = proc_task(inode); 913 struct task_struct *task = get_proc_task(inode);
999 ssize_t length; 914 ssize_t length;
1000 char tmpbuf[TMPBUFLEN]; 915 char tmpbuf[TMPBUFLEN];
1001 916
917 if (!task)
918 return -ESRCH;
1002 length = scnprintf(tmpbuf, TMPBUFLEN, "%u", 919 length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1003 audit_get_loginuid(task->audit_context)); 920 audit_get_loginuid(task->audit_context));
921 put_task_struct(task);
1004 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); 922 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1005} 923}
1006 924
@@ -1010,13 +928,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1010 struct inode * inode = file->f_dentry->d_inode; 928 struct inode * inode = file->f_dentry->d_inode;
1011 char *page, *tmp; 929 char *page, *tmp;
1012 ssize_t length; 930 ssize_t length;
1013 struct task_struct *task = proc_task(inode);
1014 uid_t loginuid; 931 uid_t loginuid;
1015 932
1016 if (!capable(CAP_AUDIT_CONTROL)) 933 if (!capable(CAP_AUDIT_CONTROL))
1017 return -EPERM; 934 return -EPERM;
1018 935
1019 if (current != task) 936 if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
1020 return -EPERM; 937 return -EPERM;
1021 938
1022 if (count >= PAGE_SIZE) 939 if (count >= PAGE_SIZE)
@@ -1040,7 +957,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1040 goto out_free_page; 957 goto out_free_page;
1041 958
1042 } 959 }
1043 length = audit_set_loginuid(task, loginuid); 960 length = audit_set_loginuid(current, loginuid);
1044 if (likely(length == 0)) 961 if (likely(length == 0))
1045 length = count; 962 length = count;
1046 963
@@ -1059,13 +976,16 @@ static struct file_operations proc_loginuid_operations = {
1059static ssize_t seccomp_read(struct file *file, char __user *buf, 976static ssize_t seccomp_read(struct file *file, char __user *buf,
1060 size_t count, loff_t *ppos) 977 size_t count, loff_t *ppos)
1061{ 978{
1062 struct task_struct *tsk = proc_task(file->f_dentry->d_inode); 979 struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
1063 char __buf[20]; 980 char __buf[20];
1064 loff_t __ppos = *ppos; 981 loff_t __ppos = *ppos;
1065 size_t len; 982 size_t len;
1066 983
984 if (!tsk)
985 return -ESRCH;
1067 /* no need to print the trailing zero, so use only len */ 986 /* no need to print the trailing zero, so use only len */
1068 len = sprintf(__buf, "%u\n", tsk->seccomp.mode); 987 len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
988 put_task_struct(tsk);
1069 if (__ppos >= len) 989 if (__ppos >= len)
1070 return 0; 990 return 0;
1071 if (count > len - __ppos) 991 if (count > len - __ppos)
@@ -1079,29 +999,43 @@ static ssize_t seccomp_read(struct file *file, char __user *buf,
1079static ssize_t seccomp_write(struct file *file, const char __user *buf, 999static ssize_t seccomp_write(struct file *file, const char __user *buf,
1080 size_t count, loff_t *ppos) 1000 size_t count, loff_t *ppos)
1081{ 1001{
1082 struct task_struct *tsk = proc_task(file->f_dentry->d_inode); 1002 struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
1083 char __buf[20], *end; 1003 char __buf[20], *end;
1084 unsigned int seccomp_mode; 1004 unsigned int seccomp_mode;
1005 ssize_t result;
1006
1007 result = -ESRCH;
1008 if (!tsk)
1009 goto out_no_task;
1085 1010
1086 /* can set it only once to be even more secure */ 1011 /* can set it only once to be even more secure */
1012 result = -EPERM;
1087 if (unlikely(tsk->seccomp.mode)) 1013 if (unlikely(tsk->seccomp.mode))
1088 return -EPERM; 1014 goto out;
1089 1015
1016 result = -EFAULT;
1090 memset(__buf, 0, sizeof(__buf)); 1017 memset(__buf, 0, sizeof(__buf));
1091 count = min(count, sizeof(__buf) - 1); 1018 count = min(count, sizeof(__buf) - 1);
1092 if (copy_from_user(__buf, buf, count)) 1019 if (copy_from_user(__buf, buf, count))
1093 return -EFAULT; 1020 goto out;
1021
1094 seccomp_mode = simple_strtoul(__buf, &end, 0); 1022 seccomp_mode = simple_strtoul(__buf, &end, 0);
1095 if (*end == '\n') 1023 if (*end == '\n')
1096 end++; 1024 end++;
1025 result = -EINVAL;
1097 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { 1026 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
1098 tsk->seccomp.mode = seccomp_mode; 1027 tsk->seccomp.mode = seccomp_mode;
1099 set_tsk_thread_flag(tsk, TIF_SECCOMP); 1028 set_tsk_thread_flag(tsk, TIF_SECCOMP);
1100 } else 1029 } else
1101 return -EINVAL; 1030 goto out;
1031 result = -EIO;
1102 if (unlikely(!(end - __buf))) 1032 if (unlikely(!(end - __buf)))
1103 return -EIO; 1033 goto out;
1104 return end - __buf; 1034 result = end - __buf;
1035out:
1036 put_task_struct(tsk);
1037out_no_task:
1038 return result;
1105} 1039}
1106 1040
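The rewritten seccomp_write() above follows the kernel's single-exit convention: preload the error code, then jump to the label that unwinds exactly the state acquired so far, so the task reference taken by get_proc_task() is dropped on every path. A minimal userspace sketch of the same shape (do_write() and the acquire()/release() pair are hypothetical stand-ins, not from this patch):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-ins for get_proc_task()/put_task_struct(). */
static char *acquire(void) { return strdup("resource"); }
static void release(char *r) { free(r); }

static int do_write(const char *buf)
{
        int result;
        char *res = acquire();

        result = -ESRCH;
        if (!res)
                goto out_no_res;        /* nothing acquired yet */

        result = -EINVAL;
        if (buf[0] == '\0')
                goto out;               /* must still release 'res' */

        result = (int)strlen(buf);      /* success: bytes consumed */
out:
        release(res);
out_no_res:
        return result;
}

int main(void)
{
        /* prints "2 -22" on Linux (EINVAL == 22) */
        printf("%d %d\n", do_write("1\n"), do_write(""));
        return 0;
}
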
1107static struct file_operations proc_seccomp_operations = { 1041static struct file_operations proc_seccomp_operations = {
@@ -1118,10 +1052,8 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
1118 /* We don't need a base pointer in the /proc filesystem */ 1052 /* We don't need a base pointer in the /proc filesystem */
1119 path_release(nd); 1053 path_release(nd);
1120 1054
1121	if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) 1055	/* Are we allowed to snoop on the task's file descriptors? */
1122 goto out; 1056 if (!proc_fd_access_allowed(inode))
1123 error = proc_check_root(inode);
1124 if (error)
1125 goto out; 1057 goto out;
1126 1058
1127 error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt); 1059 error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt);
@@ -1163,12 +1095,8 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
1163 struct dentry *de; 1095 struct dentry *de;
1164 struct vfsmount *mnt = NULL; 1096 struct vfsmount *mnt = NULL;
1165 1097
1166	lock_kernel(); 1098	/* Are we allowed to snoop on the task's file descriptors? */
1167 1099 if (!proc_fd_access_allowed(inode))
1168 if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE))
1169 goto out;
1170 error = proc_check_root(inode);
1171 if (error)
1172 goto out; 1100 goto out;
1173 1101
1174 error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt); 1102 error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt);
@@ -1179,7 +1107,6 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
1179 dput(de); 1107 dput(de);
1180 mntput(mnt); 1108 mntput(mnt);
1181out: 1109out:
1182 unlock_kernel();
1183 return error; 1110 return error;
1184} 1111}
1185 1112
@@ -1188,21 +1115,20 @@ static struct inode_operations proc_pid_link_inode_operations = {
1188 .follow_link = proc_pid_follow_link 1115 .follow_link = proc_pid_follow_link
1189}; 1116};
1190 1117
1191#define NUMBUF 10
1192
1193static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) 1118static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
1194{ 1119{
1195 struct inode *inode = filp->f_dentry->d_inode; 1120 struct dentry *dentry = filp->f_dentry;
1196 struct task_struct *p = proc_task(inode); 1121 struct inode *inode = dentry->d_inode;
1122 struct task_struct *p = get_proc_task(inode);
1197 unsigned int fd, tid, ino; 1123 unsigned int fd, tid, ino;
1198 int retval; 1124 int retval;
1199 char buf[NUMBUF]; 1125 char buf[PROC_NUMBUF];
1200 struct files_struct * files; 1126 struct files_struct * files;
1201 struct fdtable *fdt; 1127 struct fdtable *fdt;
1202 1128
1203 retval = -ENOENT; 1129 retval = -ENOENT;
1204 if (!pid_alive(p)) 1130 if (!p)
1205 goto out; 1131 goto out_no_task;
1206 retval = 0; 1132 retval = 0;
1207 tid = p->pid; 1133 tid = p->pid;
1208 1134
@@ -1213,7 +1139,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
1213 goto out; 1139 goto out;
1214 filp->f_pos++; 1140 filp->f_pos++;
1215 case 1: 1141 case 1:
1216 ino = fake_ino(tid, PROC_TID_INO); 1142 ino = parent_ino(dentry);
1217 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) 1143 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
1218 goto out; 1144 goto out;
1219 filp->f_pos++; 1145 filp->f_pos++;
@@ -1232,7 +1158,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
1232 continue; 1158 continue;
1233 rcu_read_unlock(); 1159 rcu_read_unlock();
1234 1160
1235 j = NUMBUF; 1161 j = PROC_NUMBUF;
1236 i = fd; 1162 i = fd;
1237 do { 1163 do {
1238 j--; 1164 j--;
@@ -1241,7 +1167,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
1241 } while (i); 1167 } while (i);
1242 1168
1243 ino = fake_ino(tid, PROC_TID_FD_DIR + fd); 1169 ino = fake_ino(tid, PROC_TID_FD_DIR + fd);
1244 if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) { 1170 if (filldir(dirent, buf+j, PROC_NUMBUF-j, fd+2, ino, DT_LNK) < 0) {
1245 rcu_read_lock(); 1171 rcu_read_lock();
1246 break; 1172 break;
1247 } 1173 }
@@ -1251,6 +1177,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
1251 put_files_struct(files); 1177 put_files_struct(files);
1252 } 1178 }
1253out: 1179out:
1180 put_task_struct(p);
1181out_no_task:
1254 return retval; 1182 return retval;
1255} 1183}
1256 1184
@@ -1262,16 +1190,18 @@ static int proc_pident_readdir(struct file *filp,
1262 int pid; 1190 int pid;
1263 struct dentry *dentry = filp->f_dentry; 1191 struct dentry *dentry = filp->f_dentry;
1264 struct inode *inode = dentry->d_inode; 1192 struct inode *inode = dentry->d_inode;
1193 struct task_struct *task = get_proc_task(inode);
1265 struct pid_entry *p; 1194 struct pid_entry *p;
1266 ino_t ino; 1195 ino_t ino;
1267 int ret; 1196 int ret;
1268 1197
1269 ret = -ENOENT; 1198 ret = -ENOENT;
1270 if (!pid_alive(proc_task(inode))) 1199 if (!task)
1271 goto out; 1200 goto out;
1272 1201
1273 ret = 0; 1202 ret = 0;
1274 pid = proc_task(inode)->pid; 1203 pid = task->pid;
1204 put_task_struct(task);
1275 i = filp->f_pos; 1205 i = filp->f_pos;
1276 switch (i) { 1206 switch (i) {
1277 case 0: 1207 case 0:
@@ -1354,22 +1284,19 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1354 1284
1355 /* Common stuff */ 1285 /* Common stuff */
1356 ei = PROC_I(inode); 1286 ei = PROC_I(inode);
1357 ei->task = NULL;
1358 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1287 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1359 inode->i_ino = fake_ino(task->pid, ino); 1288 inode->i_ino = fake_ino(task->pid, ino);
1360 1289
1361 if (!pid_alive(task))
1362 goto out_unlock;
1363
1364 /* 1290 /*
1365 * grab the reference to task. 1291 * grab the reference to task.
1366 */ 1292 */
1367 get_task_struct(task); 1293 ei->pid = get_pid(task->pids[PIDTYPE_PID].pid);
1368 ei->task = task; 1294 if (!ei->pid)
1369 ei->type = ino; 1295 goto out_unlock;
1296
1370 inode->i_uid = 0; 1297 inode->i_uid = 0;
1371 inode->i_gid = 0; 1298 inode->i_gid = 0;
1372 if (ino == PROC_TGID_INO || ino == PROC_TID_INO || task_dumpable(task)) { 1299 if (task_dumpable(task)) {
1373 inode->i_uid = task->euid; 1300 inode->i_uid = task->euid;
1374 inode->i_gid = task->egid; 1301 inode->i_gid = task->egid;
1375 } 1302 }
@@ -1379,7 +1306,6 @@ out:
1379 return inode; 1306 return inode;
1380 1307
1381out_unlock: 1308out_unlock:
1382 ei->pde = NULL;
1383 iput(inode); 1309 iput(inode);
1384 return NULL; 1310 return NULL;
1385} 1311}
@@ -1393,13 +1319,21 @@ out_unlock:
1393 * 1319 *
1394 * Rewrite the inode's ownerships here because the owning task may have 1320 * Rewrite the inode's ownerships here because the owning task may have
1395 * performed a setuid(), etc. 1321 * performed a setuid(), etc.
1322 *
1323 * Before the /proc/pid/status file was created the only way to read
1324 * the effective uid of a process was to stat /proc/pid. Reading
1325 * /proc/pid/status is slow enough that procps and other packages
1326 * kept stat()ing /proc/pid. To keep the rules in /proc simple I have
1327 * made this apply to all per-process world-readable and executable
1328 * directories.
1396 */ 1329 */
1397static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) 1330static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1398{ 1331{
1399 struct inode *inode = dentry->d_inode; 1332 struct inode *inode = dentry->d_inode;
1400 struct task_struct *task = proc_task(inode); 1333 struct task_struct *task = get_proc_task(inode);
1401 if (pid_alive(task)) { 1334 if (task) {
1402 if (proc_type(inode) == PROC_TGID_INO || proc_type(inode) == PROC_TID_INO || task_dumpable(task)) { 1335 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1336 task_dumpable(task)) {
1403 inode->i_uid = task->euid; 1337 inode->i_uid = task->euid;
1404 inode->i_gid = task->egid; 1338 inode->i_gid = task->egid;
1405 } else { 1339 } else {
@@ -1407,59 +1341,75 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1407 inode->i_gid = 0; 1341 inode->i_gid = 0;
1408 } 1342 }
1409 security_task_to_inode(task, inode); 1343 security_task_to_inode(task, inode);
1344 put_task_struct(task);
1410 return 1; 1345 return 1;
1411 } 1346 }
1412 d_drop(dentry); 1347 d_drop(dentry);
1413 return 0; 1348 return 0;
1414} 1349}
1415 1350
1351static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1352{
1353 struct inode *inode = dentry->d_inode;
1354 struct task_struct *task;
1355 generic_fillattr(inode, stat);
1356
1357 rcu_read_lock();
1358 stat->uid = 0;
1359 stat->gid = 0;
1360 task = pid_task(proc_pid(inode), PIDTYPE_PID);
1361 if (task) {
1362 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1363 task_dumpable(task)) {
1364 stat->uid = task->euid;
1365 stat->gid = task->egid;
1366 }
1367 }
1368 rcu_read_unlock();
1369 return 0;
1370}
1371
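pid_revalidate() and pid_getattr() above apply the same ownership rule: report the task's euid/egid only for the world readable-and-executable directories (/proc/<pid> and /proc/<pid>/task/<tid>, whose mode is exactly S_IFDIR|S_IRUGO|S_IXUGO) or while the task is dumpable; everything else falls back to root. A hedged sketch of that shared rule, factored into a helper that is not part of the patch:

/* Hypothetical helper; this is the rule pid_revalidate() and
 * pid_getattr() each open-code above. */
static void pid_ownership(struct inode *inode, struct task_struct *task,
                          uid_t *uid, gid_t *gid)
{
        *uid = 0;                       /* default: owned by root */
        *gid = 0;
        if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
            task_dumpable(task)) {
                *uid = task->euid;      /* track setuid() etc. */
                *gid = task->egid;
        }
}
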
1416static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) 1372static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1417{ 1373{
1418 struct inode *inode = dentry->d_inode; 1374 struct inode *inode = dentry->d_inode;
1419 struct task_struct *task = proc_task(inode); 1375 struct task_struct *task = get_proc_task(inode);
1420 int fd = proc_type(inode) - PROC_TID_FD_DIR; 1376 int fd = proc_fd(inode);
1421 struct files_struct *files; 1377 struct files_struct *files;
1422 1378
1423 files = get_files_struct(task); 1379 if (task) {
1424 if (files) { 1380 files = get_files_struct(task);
1425 rcu_read_lock(); 1381 if (files) {
1426 if (fcheck_files(files, fd)) { 1382 rcu_read_lock();
1383 if (fcheck_files(files, fd)) {
1384 rcu_read_unlock();
1385 put_files_struct(files);
1386 if (task_dumpable(task)) {
1387 inode->i_uid = task->euid;
1388 inode->i_gid = task->egid;
1389 } else {
1390 inode->i_uid = 0;
1391 inode->i_gid = 0;
1392 }
1393 security_task_to_inode(task, inode);
1394 put_task_struct(task);
1395 return 1;
1396 }
1427 rcu_read_unlock(); 1397 rcu_read_unlock();
1428 put_files_struct(files); 1398 put_files_struct(files);
1429 if (task_dumpable(task)) {
1430 inode->i_uid = task->euid;
1431 inode->i_gid = task->egid;
1432 } else {
1433 inode->i_uid = 0;
1434 inode->i_gid = 0;
1435 }
1436 security_task_to_inode(task, inode);
1437 return 1;
1438 } 1399 }
1439 rcu_read_unlock(); 1400 put_task_struct(task);
1440 put_files_struct(files);
1441 } 1401 }
1442 d_drop(dentry); 1402 d_drop(dentry);
1443 return 0; 1403 return 0;
1444} 1404}
1445 1405
1446static void pid_base_iput(struct dentry *dentry, struct inode *inode)
1447{
1448 struct task_struct *task = proc_task(inode);
1449 spin_lock(&task->proc_lock);
1450 if (task->proc_dentry == dentry)
1451 task->proc_dentry = NULL;
1452 spin_unlock(&task->proc_lock);
1453 iput(inode);
1454}
1455
1456static int pid_delete_dentry(struct dentry * dentry) 1406static int pid_delete_dentry(struct dentry * dentry)
1457{ 1407{
1458 /* Is the task we represent dead? 1408 /* Is the task we represent dead?
1459 * If so, then don't put the dentry on the lru list, 1409 * If so, then don't put the dentry on the lru list,
1460 * kill it immediately. 1410 * kill it immediately.
1461 */ 1411 */
1462 return !pid_alive(proc_task(dentry->d_inode)); 1412 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1463} 1413}
1464 1414
1465static struct dentry_operations tid_fd_dentry_operations = 1415static struct dentry_operations tid_fd_dentry_operations =
@@ -1474,13 +1424,6 @@ static struct dentry_operations pid_dentry_operations =
1474 .d_delete = pid_delete_dentry, 1424 .d_delete = pid_delete_dentry,
1475}; 1425};
1476 1426
1477static struct dentry_operations pid_base_dentry_operations =
1478{
1479 .d_revalidate = pid_revalidate,
1480 .d_iput = pid_base_iput,
1481 .d_delete = pid_delete_dentry,
1482};
1483
1484/* Lookups */ 1427/* Lookups */
1485 1428
1486static unsigned name_to_int(struct dentry *dentry) 1429static unsigned name_to_int(struct dentry *dentry)
@@ -1508,22 +1451,24 @@ out:
1508/* SMP-safe */ 1451/* SMP-safe */
1509static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) 1452static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
1510{ 1453{
1511 struct task_struct *task = proc_task(dir); 1454 struct task_struct *task = get_proc_task(dir);
1512 unsigned fd = name_to_int(dentry); 1455 unsigned fd = name_to_int(dentry);
1456 struct dentry *result = ERR_PTR(-ENOENT);
1513 struct file * file; 1457 struct file * file;
1514 struct files_struct * files; 1458 struct files_struct * files;
1515 struct inode *inode; 1459 struct inode *inode;
1516 struct proc_inode *ei; 1460 struct proc_inode *ei;
1517 1461
1462 if (!task)
1463 goto out_no_task;
1518 if (fd == ~0U) 1464 if (fd == ~0U)
1519 goto out; 1465 goto out;
1520 if (!pid_alive(task))
1521 goto out;
1522 1466
1523 inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd); 1467 inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd);
1524 if (!inode) 1468 if (!inode)
1525 goto out; 1469 goto out;
1526 ei = PROC_I(inode); 1470 ei = PROC_I(inode);
1471 ei->fd = fd;
1527 files = get_files_struct(task); 1472 files = get_files_struct(task);
1528 if (!files) 1473 if (!files)
1529 goto out_unlock; 1474 goto out_unlock;
@@ -1548,19 +1493,25 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
1548 ei->op.proc_get_link = proc_fd_link; 1493 ei->op.proc_get_link = proc_fd_link;
1549 dentry->d_op = &tid_fd_dentry_operations; 1494 dentry->d_op = &tid_fd_dentry_operations;
1550 d_add(dentry, inode); 1495 d_add(dentry, inode);
1551 return NULL; 1496 /* Close the race of the process dying before we return the dentry */
1497 if (tid_fd_revalidate(dentry, NULL))
1498 result = NULL;
1499out:
1500 put_task_struct(task);
1501out_no_task:
1502 return result;
1552 1503
1553out_unlock2: 1504out_unlock2:
1554 spin_unlock(&files->file_lock); 1505 spin_unlock(&files->file_lock);
1555 put_files_struct(files); 1506 put_files_struct(files);
1556out_unlock: 1507out_unlock:
1557 iput(inode); 1508 iput(inode);
1558out: 1509 goto out;
1559 return ERR_PTR(-ENOENT);
1560} 1510}
1561 1511
1562static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir); 1512static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir);
1563static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd); 1513static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd);
1514static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
1564 1515
1565static struct file_operations proc_fd_operations = { 1516static struct file_operations proc_fd_operations = {
1566 .read = generic_read_dir, 1517 .read = generic_read_dir,
@@ -1577,12 +1528,11 @@ static struct file_operations proc_task_operations = {
1577 */ 1528 */
1578static struct inode_operations proc_fd_inode_operations = { 1529static struct inode_operations proc_fd_inode_operations = {
1579 .lookup = proc_lookupfd, 1530 .lookup = proc_lookupfd,
1580 .permission = proc_permission,
1581}; 1531};
1582 1532
1583static struct inode_operations proc_task_inode_operations = { 1533static struct inode_operations proc_task_inode_operations = {
1584 .lookup = proc_task_lookup, 1534 .lookup = proc_task_lookup,
1585 .permission = proc_task_permission, 1535 .getattr = proc_task_getattr,
1586}; 1536};
1587 1537
1588#ifdef CONFIG_SECURITY 1538#ifdef CONFIG_SECURITY
@@ -1592,12 +1542,17 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
1592 struct inode * inode = file->f_dentry->d_inode; 1542 struct inode * inode = file->f_dentry->d_inode;
1593 unsigned long page; 1543 unsigned long page;
1594 ssize_t length; 1544 ssize_t length;
1595 struct task_struct *task = proc_task(inode); 1545 struct task_struct *task = get_proc_task(inode);
1546
1547 length = -ESRCH;
1548 if (!task)
1549 goto out_no_task;
1596 1550
1597 if (count > PAGE_SIZE) 1551 if (count > PAGE_SIZE)
1598 count = PAGE_SIZE; 1552 count = PAGE_SIZE;
1553 length = -ENOMEM;
1599 if (!(page = __get_free_page(GFP_KERNEL))) 1554 if (!(page = __get_free_page(GFP_KERNEL)))
1600 return -ENOMEM; 1555 goto out;
1601 1556
1602 length = security_getprocattr(task, 1557 length = security_getprocattr(task,
1603 (char*)file->f_dentry->d_name.name, 1558 (char*)file->f_dentry->d_name.name,
@@ -1605,6 +1560,9 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
1605 if (length >= 0) 1560 if (length >= 0)
1606 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); 1561 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
1607 free_page(page); 1562 free_page(page);
1563out:
1564 put_task_struct(task);
1565out_no_task:
1608 return length; 1566 return length;
1609} 1567}
1610 1568
@@ -1614,26 +1572,36 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
1614 struct inode * inode = file->f_dentry->d_inode; 1572 struct inode * inode = file->f_dentry->d_inode;
1615 char *page; 1573 char *page;
1616 ssize_t length; 1574 ssize_t length;
1617 struct task_struct *task = proc_task(inode); 1575 struct task_struct *task = get_proc_task(inode);
1618 1576
1577 length = -ESRCH;
1578 if (!task)
1579 goto out_no_task;
1619 if (count > PAGE_SIZE) 1580 if (count > PAGE_SIZE)
1620 count = PAGE_SIZE; 1581 count = PAGE_SIZE;
1621 if (*ppos != 0) { 1582
1622 /* No partial writes. */ 1583 /* No partial writes. */
1623 return -EINVAL; 1584 length = -EINVAL;
1624 } 1585 if (*ppos != 0)
1586 goto out;
1587
1588 length = -ENOMEM;
1625 page = (char*)__get_free_page(GFP_USER); 1589 page = (char*)__get_free_page(GFP_USER);
1626 if (!page) 1590 if (!page)
1627 return -ENOMEM; 1591 goto out;
1592
1628 length = -EFAULT; 1593 length = -EFAULT;
1629 if (copy_from_user(page, buf, count)) 1594 if (copy_from_user(page, buf, count))
1630 goto out; 1595 goto out_free;
1631 1596
1632 length = security_setprocattr(task, 1597 length = security_setprocattr(task,
1633 (char*)file->f_dentry->d_name.name, 1598 (char*)file->f_dentry->d_name.name,
1634 (void*)page, count); 1599 (void*)page, count);
1635out: 1600out_free:
1636 free_page((unsigned long) page); 1601 free_page((unsigned long) page);
1602out:
1603 put_task_struct(task);
1604out_no_task:
1637 return length; 1605 return length;
1638} 1606}
1639 1607
@@ -1648,24 +1616,22 @@ static struct file_operations proc_tgid_attr_operations;
1648static struct inode_operations proc_tgid_attr_inode_operations; 1616static struct inode_operations proc_tgid_attr_inode_operations;
1649#endif 1617#endif
1650 1618
1651static int get_tid_list(int index, unsigned int *tids, struct inode *dir);
1652
1653/* SMP-safe */ 1619/* SMP-safe */
1654static struct dentry *proc_pident_lookup(struct inode *dir, 1620static struct dentry *proc_pident_lookup(struct inode *dir,
1655 struct dentry *dentry, 1621 struct dentry *dentry,
1656 struct pid_entry *ents) 1622 struct pid_entry *ents)
1657{ 1623{
1658 struct inode *inode; 1624 struct inode *inode;
1659 int error; 1625 struct dentry *error;
1660 struct task_struct *task = proc_task(dir); 1626 struct task_struct *task = get_proc_task(dir);
1661 struct pid_entry *p; 1627 struct pid_entry *p;
1662 struct proc_inode *ei; 1628 struct proc_inode *ei;
1663 1629
1664 error = -ENOENT; 1630 error = ERR_PTR(-ENOENT);
1665 inode = NULL; 1631 inode = NULL;
1666 1632
1667 if (!pid_alive(task)) 1633 if (!task)
1668 goto out; 1634 goto out_no_task;
1669 1635
1670 for (p = ents; p->name; p++) { 1636 for (p = ents; p->name; p++) {
1671 if (p->len != dentry->d_name.len) 1637 if (p->len != dentry->d_name.len)
@@ -1676,7 +1642,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1676 if (!p->name) 1642 if (!p->name)
1677 goto out; 1643 goto out;
1678 1644
1679 error = -EINVAL; 1645 error = ERR_PTR(-EINVAL);
1680 inode = proc_pid_make_inode(dir->i_sb, task, p->type); 1646 inode = proc_pid_make_inode(dir->i_sb, task, p->type);
1681 if (!inode) 1647 if (!inode)
1682 goto out; 1648 goto out;
@@ -1689,7 +1655,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1689 */ 1655 */
1690 switch(p->type) { 1656 switch(p->type) {
1691 case PROC_TGID_TASK: 1657 case PROC_TGID_TASK:
1692 inode->i_nlink = 2 + get_tid_list(2, NULL, dir); 1658 inode->i_nlink = 2;
1693 inode->i_op = &proc_task_inode_operations; 1659 inode->i_op = &proc_task_inode_operations;
1694 inode->i_fop = &proc_task_operations; 1660 inode->i_fop = &proc_task_operations;
1695 break; 1661 break;
@@ -1759,7 +1725,6 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1759#endif 1725#endif
1760 case PROC_TID_MEM: 1726 case PROC_TID_MEM:
1761 case PROC_TGID_MEM: 1727 case PROC_TGID_MEM:
1762 inode->i_op = &proc_mem_inode_operations;
1763 inode->i_fop = &proc_mem_operations; 1728 inode->i_fop = &proc_mem_operations;
1764 break; 1729 break;
1765#ifdef CONFIG_SECCOMP 1730#ifdef CONFIG_SECCOMP
@@ -1801,6 +1766,10 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1801 case PROC_TGID_ATTR_EXEC: 1766 case PROC_TGID_ATTR_EXEC:
1802 case PROC_TID_ATTR_FSCREATE: 1767 case PROC_TID_ATTR_FSCREATE:
1803 case PROC_TGID_ATTR_FSCREATE: 1768 case PROC_TGID_ATTR_FSCREATE:
1769 case PROC_TID_ATTR_KEYCREATE:
1770 case PROC_TGID_ATTR_KEYCREATE:
1771 case PROC_TID_ATTR_SOCKCREATE:
1772 case PROC_TGID_ATTR_SOCKCREATE:
1804 inode->i_fop = &proc_pid_attr_operations; 1773 inode->i_fop = &proc_pid_attr_operations;
1805 break; 1774 break;
1806#endif 1775#endif
@@ -1842,14 +1811,18 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1842 default: 1811 default:
1843 printk("procfs: impossible type (%d)",p->type); 1812 printk("procfs: impossible type (%d)",p->type);
1844 iput(inode); 1813 iput(inode);
1845 return ERR_PTR(-EINVAL); 1814 error = ERR_PTR(-EINVAL);
1815 goto out;
1846 } 1816 }
1847 dentry->d_op = &pid_dentry_operations; 1817 dentry->d_op = &pid_dentry_operations;
1848 d_add(dentry, inode); 1818 d_add(dentry, inode);
1849 return NULL; 1819 /* Close the race of the process dying before we return the dentry */
1850 1820 if (pid_revalidate(dentry, NULL))
1821 error = NULL;
1851out: 1822out:
1852 return ERR_PTR(error); 1823 put_task_struct(task);
1824out_no_task:
1825 return error;
1853} 1826}
1854 1827
1855static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ 1828static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
@@ -1872,10 +1845,12 @@ static struct file_operations proc_tid_base_operations = {
1872 1845
1873static struct inode_operations proc_tgid_base_inode_operations = { 1846static struct inode_operations proc_tgid_base_inode_operations = {
1874 .lookup = proc_tgid_base_lookup, 1847 .lookup = proc_tgid_base_lookup,
1848 .getattr = pid_getattr,
1875}; 1849};
1876 1850
1877static struct inode_operations proc_tid_base_inode_operations = { 1851static struct inode_operations proc_tid_base_inode_operations = {
1878 .lookup = proc_tid_base_lookup, 1852 .lookup = proc_tid_base_lookup,
1853 .getattr = pid_getattr,
1879}; 1854};
1880 1855
1881#ifdef CONFIG_SECURITY 1856#ifdef CONFIG_SECURITY
@@ -1917,10 +1892,12 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir,
1917 1892
1918static struct inode_operations proc_tgid_attr_inode_operations = { 1893static struct inode_operations proc_tgid_attr_inode_operations = {
1919 .lookup = proc_tgid_attr_lookup, 1894 .lookup = proc_tgid_attr_lookup,
1895 .getattr = pid_getattr,
1920}; 1896};
1921 1897
1922static struct inode_operations proc_tid_attr_inode_operations = { 1898static struct inode_operations proc_tid_attr_inode_operations = {
1923 .lookup = proc_tid_attr_lookup, 1899 .lookup = proc_tid_attr_lookup,
1900 .getattr = pid_getattr,
1924}; 1901};
1925#endif 1902#endif
1926 1903
@@ -1930,14 +1907,14 @@ static struct inode_operations proc_tid_attr_inode_operations = {
1930static int proc_self_readlink(struct dentry *dentry, char __user *buffer, 1907static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
1931 int buflen) 1908 int buflen)
1932{ 1909{
1933 char tmp[30]; 1910 char tmp[PROC_NUMBUF];
1934 sprintf(tmp, "%d", current->tgid); 1911 sprintf(tmp, "%d", current->tgid);
1935 return vfs_readlink(dentry,buffer,buflen,tmp); 1912 return vfs_readlink(dentry,buffer,buflen,tmp);
1936} 1913}
1937 1914
1938static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) 1915static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
1939{ 1916{
1940 char tmp[30]; 1917 char tmp[PROC_NUMBUF];
1941 sprintf(tmp, "%d", current->tgid); 1918 sprintf(tmp, "%d", current->tgid);
1942 return ERR_PTR(vfs_follow_link(nd,tmp)); 1919 return ERR_PTR(vfs_follow_link(nd,tmp));
1943} 1920}
@@ -1948,67 +1925,80 @@ static struct inode_operations proc_self_inode_operations = {
1948}; 1925};
1949 1926
1950/** 1927/**
1951 * proc_pid_unhash - Unhash /proc/@pid entry from the dcache. 1928 * proc_flush_task - Remove dcache entries for @task from the /proc dcache.
1952 * @p: task that should be flushed. 1929 *
1930 * @task: task that should be flushed.
1931 *
1932 * Looks in the dcache for
1933 * /proc/@pid
1934 * /proc/@tgid/task/@pid
1936 * if either directory is present, flushes it and all of its children
1936 * from the dcache.
1953 * 1937 *
1954 * Drops the /proc/@pid dcache entry from the hash chains. 1938 * It is safe and reasonable to cache /proc entries for a task until
1939 * that task exits. After that they just clog up the dcache with
1940 * useless entries, possibly causing useful dcache entries to be
1941 * flushed instead. This routine is provided to flush those useless
1942 * dcache entries at process exit time.
1955 * 1943 *
1956 * Dropping /proc/@pid entries and detach_pid must be synchroneous, 1944 * NOTE: This routine is just an optimization so it does not guarantee
1957 * otherwise e.g. /proc/@pid/exe might point to the wrong executable, 1945 * that no dcache entries will exist at process exit time it
1958 * if the pid value is immediately reused. This is enforced by 1946 * just makes it very unlikely that any will persist.
1959 * - caller must acquire spin_lock(p->proc_lock)
1960 * - must be called before detach_pid()
1961 * - proc_pid_lookup acquires proc_lock, and checks that
1962 * the target is not dead by looking at the attach count
1963 * of PIDTYPE_PID.
1964 */ 1947 */
1965 1948void proc_flush_task(struct task_struct *task)
1966struct dentry *proc_pid_unhash(struct task_struct *p)
1967{ 1949{
1968 struct dentry *proc_dentry; 1950 struct dentry *dentry, *leader, *dir;
1951 char buf[PROC_NUMBUF];
1952 struct qstr name;
1953
1954 name.name = buf;
1955 name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
1956 dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name);
1957 if (dentry) {
1958 shrink_dcache_parent(dentry);
1959 d_drop(dentry);
1960 dput(dentry);
1961 }
1969 1962
1970 proc_dentry = p->proc_dentry; 1963 if (thread_group_leader(task))
1971 if (proc_dentry != NULL) { 1964 goto out;
1972 1965
1973 spin_lock(&dcache_lock); 1966 name.name = buf;
1974 spin_lock(&proc_dentry->d_lock); 1967 name.len = snprintf(buf, sizeof(buf), "%d", task->tgid);
1975 if (!d_unhashed(proc_dentry)) { 1968 leader = d_hash_and_lookup(proc_mnt->mnt_root, &name);
1976 dget_locked(proc_dentry); 1969 if (!leader)
1977 __d_drop(proc_dentry); 1970 goto out;
1978 spin_unlock(&proc_dentry->d_lock);
1979 } else {
1980 spin_unlock(&proc_dentry->d_lock);
1981 proc_dentry = NULL;
1982 }
1983 spin_unlock(&dcache_lock);
1984 }
1985 return proc_dentry;
1986}
1987 1971
1988/** 1972 name.name = "task";
1989 * proc_pid_flush - recover memory used by stale /proc/@pid/x entries 1973 name.len = strlen(name.name);
1990 * @proc_dentry: directoy to prune. 1974 dir = d_hash_and_lookup(leader, &name);
1991 * 1975 if (!dir)
1992 * Shrink the /proc directory that was used by the just killed thread. 1976 goto out_put_leader;
1993 */ 1977
1994 1978 name.name = buf;
1995void proc_pid_flush(struct dentry *proc_dentry) 1979 name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
1996{ 1980 dentry = d_hash_and_lookup(dir, &name);
1997 might_sleep(); 1981 if (dentry) {
1998 if(proc_dentry != NULL) { 1982 shrink_dcache_parent(dentry);
1999 shrink_dcache_parent(proc_dentry); 1983 d_drop(dentry);
2000 dput(proc_dentry); 1984 dput(dentry);
2001 } 1985 }
1986
1987 dput(dir);
1988out_put_leader:
1989 dput(leader);
1990out:
1991 return;
2002} 1992}
2003 1993
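proc_flush_task() above never walks the whole dcache; it performs at most three name lookups (the pid, the tgid, and "task") and prunes whatever it finds. The repeated lookup-shrink-drop step, factored into a hypothetical helper for illustration (not part of the patch; d_hash_and_lookup(), shrink_dcache_parent(), d_drop() and dput() are the real VFS calls used above):

/* Sketch only: look a name up in the dcache without instantiating it,
 * evict its children, then unhash the entry itself. */
static void flush_proc_dentry(struct dentry *parent, const char *s)
{
        struct qstr name;
        struct dentry *dentry;

        name.name = s;
        name.len = strlen(s);
        dentry = d_hash_and_lookup(parent, &name);
        if (dentry) {
                shrink_dcache_parent(dentry);   /* evict children first */
                d_drop(dentry);                 /* unhash the directory */
                dput(dentry);                   /* drop lookup reference */
        }
}
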
2004/* SMP-safe */ 1994/* SMP-safe */
2005struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 1995struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2006{ 1996{
1997 struct dentry *result = ERR_PTR(-ENOENT);
2007 struct task_struct *task; 1998 struct task_struct *task;
2008 struct inode *inode; 1999 struct inode *inode;
2009 struct proc_inode *ei; 2000 struct proc_inode *ei;
2010 unsigned tgid; 2001 unsigned tgid;
2011 int died;
2012 2002
2013 if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) { 2003 if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) {
2014 inode = new_inode(dir->i_sb); 2004 inode = new_inode(dir->i_sb);
@@ -2029,21 +2019,18 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
2029 if (tgid == ~0U) 2019 if (tgid == ~0U)
2030 goto out; 2020 goto out;
2031 2021
2032 read_lock(&tasklist_lock); 2022 rcu_read_lock();
2033 task = find_task_by_pid(tgid); 2023 task = find_task_by_pid(tgid);
2034 if (task) 2024 if (task)
2035 get_task_struct(task); 2025 get_task_struct(task);
2036 read_unlock(&tasklist_lock); 2026 rcu_read_unlock();
2037 if (!task) 2027 if (!task)
2038 goto out; 2028 goto out;
2039 2029
2040 inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO); 2030 inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO);
2031 if (!inode)
2032 goto out_put_task;
2041 2033
2042
2043 if (!inode) {
2044 put_task_struct(task);
2045 goto out;
2046 }
2047 inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; 2034 inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
2048 inode->i_op = &proc_tgid_base_inode_operations; 2035 inode->i_op = &proc_tgid_base_inode_operations;
2049 inode->i_fop = &proc_tgid_base_operations; 2036 inode->i_fop = &proc_tgid_base_operations;
@@ -2054,45 +2041,40 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
2054 inode->i_nlink = 4; 2041 inode->i_nlink = 4;
2055#endif 2042#endif
2056 2043
2057 dentry->d_op = &pid_base_dentry_operations; 2044 dentry->d_op = &pid_dentry_operations;
2058 2045
2059 died = 0;
2060 d_add(dentry, inode); 2046 d_add(dentry, inode);
2061 spin_lock(&task->proc_lock); 2047 /* Close the race of the process dying before we return the dentry */
2062 task->proc_dentry = dentry; 2048 if (pid_revalidate(dentry, NULL))
2063 if (!pid_alive(task)) { 2049 result = NULL;
2064 dentry = proc_pid_unhash(task);
2065 died = 1;
2066 }
2067 spin_unlock(&task->proc_lock);
2068 2050
2051out_put_task:
2069 put_task_struct(task); 2052 put_task_struct(task);
2070 if (died) {
2071 proc_pid_flush(dentry);
2072 goto out;
2073 }
2074 return NULL;
2075out: 2053out:
2076 return ERR_PTR(-ENOENT); 2054 return result;
2077} 2055}
2078 2056
2079/* SMP-safe */ 2057/* SMP-safe */
2080static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 2058static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2081{ 2059{
2060 struct dentry *result = ERR_PTR(-ENOENT);
2082 struct task_struct *task; 2061 struct task_struct *task;
2083 struct task_struct *leader = proc_task(dir); 2062 struct task_struct *leader = get_proc_task(dir);
2084 struct inode *inode; 2063 struct inode *inode;
2085 unsigned tid; 2064 unsigned tid;
2086 2065
2066 if (!leader)
2067 goto out_no_task;
2068
2087 tid = name_to_int(dentry); 2069 tid = name_to_int(dentry);
2088 if (tid == ~0U) 2070 if (tid == ~0U)
2089 goto out; 2071 goto out;
2090 2072
2091 read_lock(&tasklist_lock); 2073 rcu_read_lock();
2092 task = find_task_by_pid(tid); 2074 task = find_task_by_pid(tid);
2093 if (task) 2075 if (task)
2094 get_task_struct(task); 2076 get_task_struct(task);
2095 read_unlock(&tasklist_lock); 2077 rcu_read_unlock();
2096 if (!task) 2078 if (!task)
2097 goto out; 2079 goto out;
2098 if (leader->tgid != task->tgid) 2080 if (leader->tgid != task->tgid)
@@ -2113,101 +2095,95 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
2113 inode->i_nlink = 3; 2095 inode->i_nlink = 3;
2114#endif 2096#endif
2115 2097
2116 dentry->d_op = &pid_base_dentry_operations; 2098 dentry->d_op = &pid_dentry_operations;
2117 2099
2118 d_add(dentry, inode); 2100 d_add(dentry, inode);
2101 /* Close the race of the process dying before we return the dentry */
2102 if (pid_revalidate(dentry, NULL))
2103 result = NULL;
2119 2104
2120 put_task_struct(task);
2121 return NULL;
2122out_drop_task: 2105out_drop_task:
2123 put_task_struct(task); 2106 put_task_struct(task);
2124out: 2107out:
2125 return ERR_PTR(-ENOENT); 2108 put_task_struct(leader);
2109out_no_task:
2110 return result;
2126} 2111}
2127 2112
2128#define PROC_NUMBUF 10
2129#define PROC_MAXPIDS 20
2130
2131/* 2113/*
2132 * Get a few tgid's to return for filldir - we need to hold the 2114 * Find the first tgid to return to user space.
2133 * tasklist lock while doing this, and we must release it before 2115 *
2134 * we actually do the filldir itself, so we use a temp buffer.. 2116 * Usually this is just whatever follows &init_task, but if the users
2117 * buffer was too small to hold the full list or there was a seek into
2118 * the middle of the directory we have more work to do.
2119 *
2120 * In the case of a short read we start with find_task_by_pid.
2121 *
2122 * In the case of a seek we start with &init_task and walk nr
2123 * threads past it.
2135 */ 2124 */
2136static int get_tgid_list(int index, unsigned long version, unsigned int *tgids) 2125static struct task_struct *first_tgid(int tgid, unsigned int nr)
2137{ 2126{
2138 struct task_struct *p; 2127 struct task_struct *pos;
2139 int nr_tgids = 0; 2128 rcu_read_lock();
2140 2129 if (tgid && nr) {
2141 index--; 2130 pos = find_task_by_pid(tgid);
2142 read_lock(&tasklist_lock); 2131 if (pos && thread_group_leader(pos))
2143 p = NULL; 2132 goto found;
2144 if (version) {
2145 p = find_task_by_pid(version);
2146 if (p && !thread_group_leader(p))
2147 p = NULL;
2148 } 2133 }
2134 /* If nr exceeds the number of processes get out quickly */
2135 pos = NULL;
2136 if (nr && nr >= nr_processes())
2137 goto done;
2149 2138
2150 if (p) 2139 /* If we haven't found our starting place yet start with
2151 index = 0; 2140 * the init_task and walk nr tasks forward.
2152 else 2141 */
2153 p = next_task(&init_task); 2142 for (pos = next_task(&init_task); nr > 0; --nr) {
2154 2143 pos = next_task(pos);
2155 for ( ; p != &init_task; p = next_task(p)) { 2144 if (pos == &init_task) {
2156 int tgid = p->pid; 2145 pos = NULL;
2157 if (!pid_alive(p)) 2146 goto done;
2158 continue; 2147 }
2159 if (--index >= 0)
2160 continue;
2161 tgids[nr_tgids] = tgid;
2162 nr_tgids++;
2163 if (nr_tgids >= PROC_MAXPIDS)
2164 break;
2165 } 2148 }
2166 read_unlock(&tasklist_lock); 2149found:
2167 return nr_tgids; 2150 get_task_struct(pos);
2151done:
2152 rcu_read_unlock();
2153 return pos;
2168} 2154}
2169 2155
2170/* 2156/*
2171 * Get a few tid's to return for filldir - we need to hold the 2157 * Find the next task in the task list.
2172 * tasklist lock while doing this, and we must release it before 2158 * Return NULL if we loop or there is any error.
2173 * we actually do the filldir itself, so we use a temp buffer.. 2159 *
2160 * The reference to the input task_struct is released.
2174 */ 2161 */
2175static int get_tid_list(int index, unsigned int *tids, struct inode *dir) 2162static struct task_struct *next_tgid(struct task_struct *start)
2176{ 2163{
2177 struct task_struct *leader_task = proc_task(dir); 2164 struct task_struct *pos;
2178 struct task_struct *task = leader_task; 2165 rcu_read_lock();
2179 int nr_tids = 0; 2166 pos = start;
2180 2167 if (pid_alive(start))
2181 index -= 2; 2168 pos = next_task(start);
2182 read_lock(&tasklist_lock); 2169 if (pid_alive(pos) && (pos != &init_task)) {
2183 /* 2170 get_task_struct(pos);
2184 * The starting point task (leader_task) might be an already 2171 goto done;
2185 * unlinked task, which cannot be used to access the task-list 2172 }
2186 * via next_thread(). 2173 pos = NULL;
2187 */ 2174done:
2188 if (pid_alive(task)) do { 2175 rcu_read_unlock();
2189 int tid = task->pid; 2176 put_task_struct(start);
2190 2177 return pos;
2191 if (--index >= 0)
2192 continue;
2193 if (tids != NULL)
2194 tids[nr_tids] = tid;
2195 nr_tids++;
2196 if (nr_tids >= PROC_MAXPIDS)
2197 break;
2198 } while ((task = next_thread(task)) != leader_task);
2199 read_unlock(&tasklist_lock);
2200 return nr_tids;
2201} 2178}
2202 2179
2203/* for the /proc/ directory itself, after non-process stuff has been done */ 2180/* for the /proc/ directory itself, after non-process stuff has been done */
2204int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 2181int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2205{ 2182{
2206 unsigned int tgid_array[PROC_MAXPIDS];
2207 char buf[PROC_NUMBUF]; 2183 char buf[PROC_NUMBUF];
2208 unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; 2184 unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
2209 unsigned int nr_tgids, i; 2185 struct task_struct *task;
2210 int next_tgid; 2186 int tgid;
2211 2187
2212 if (!nr) { 2188 if (!nr) {
2213 ino_t ino = fake_ino(0,PROC_TGID_INO); 2189 ino_t ino = fake_ino(0,PROC_TGID_INO);
@@ -2216,63 +2192,116 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2216 filp->f_pos++; 2192 filp->f_pos++;
2217 nr++; 2193 nr++;
2218 } 2194 }
2195 nr -= 1;
2219 2196
2220 /* f_version caches the tgid value that the last readdir call couldn't 2197 /* f_version caches the tgid value that the last readdir call couldn't
2221 * return. lseek aka telldir automagically resets f_version to 0. 2198 * return. lseek aka telldir automagically resets f_version to 0.
2222 */ 2199 */
2223 next_tgid = filp->f_version; 2200 tgid = filp->f_version;
2224 filp->f_version = 0; 2201 filp->f_version = 0;
2225 for (;;) { 2202 for (task = first_tgid(tgid, nr);
2226 nr_tgids = get_tgid_list(nr, next_tgid, tgid_array); 2203 task;
2227 if (!nr_tgids) { 2204 task = next_tgid(task), filp->f_pos++) {
2228 /* no more entries ! */ 2205 int len;
2206 ino_t ino;
2207 tgid = task->pid;
2208 len = snprintf(buf, sizeof(buf), "%d", tgid);
2209 ino = fake_ino(tgid, PROC_TGID_INO);
2210 if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) {
2211 /* returning this tgid failed, save it as the first
2212 * pid for the next readir call */
2213 filp->f_version = tgid;
2214 put_task_struct(task);
2229 break; 2215 break;
2230 } 2216 }
2231 next_tgid = 0; 2217 }
2218 return 0;
2219}
2232 2220
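The loop above turns filp->f_pos into a plain count and uses filp->f_version as a resume cookie: when filldir() fails because the user buffer is full, the tgid that did not fit is stashed, and the next readdir call restarts from it via find_task_by_pid() in first_tgid(); an lseek (telldir) clears f_version and forces the slower walk-nr-tasks path. A compact userspace analogue of the cookie (hypothetical and heavily simplified):

#include <stdio.h>

/* emit() stands in for filldir(): it fails when the "buffer" is full. */
static int emit(int id, int *budget)
{
        if (*budget == 0)
                return -1;
        --*budget;
        printf("%d ", id);
        return 0;
}

int main(void)
{
        int ids[] = { 10, 20, 30, 40, 50 };
        int cookie = 0, done = 0;

        while (!done) {
                int budget = 2;         /* tiny buffer: two per call */
                done = 1;
                for (int i = 0; i < 5; i++) {
                        if (cookie && ids[i] != cookie)
                                continue;       /* seek to resume point */
                        cookie = 0;
                        if (emit(ids[i], &budget) < 0) {
                                cookie = ids[i];        /* save cursor */
                                done = 0;
                                break;
                        }
                }
                printf("| ");           /* one "readdir call" done */
        }
        printf("\n");                   /* prints: 10 20 | 30 40 | 50 | */
        return 0;
}
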
2233 /* do not use the last found pid, reserve it for next_tgid */ 2221/*
2234 if (nr_tgids == PROC_MAXPIDS) { 2222 * Find the first tid of a thread group to return to user space.
2235 nr_tgids--; 2223 *
2236			next_tgid = tgid_array[nr_tgids]; 2224 * Usually this is just the thread group leader, but if the user's
2237		} 2225 * buffer was too small or there was a seek into the middle of the
2226 * directory we have more work to do.
2227 *
2228 * In the case of a short read we start with find_task_by_pid.
2229 *
2230 * In the case of a seek we start with the leader and walk nr
2231 * threads past it.
2232 */
2233static struct task_struct *first_tid(struct task_struct *leader,
2234 int tid, int nr)
2235{
2236 struct task_struct *pos;
2238 2237
2239 for (i=0;i<nr_tgids;i++) { 2238 rcu_read_lock();
2240 int tgid = tgid_array[i]; 2239 /* Attempt to start with the pid of a thread */
2241 ino_t ino = fake_ino(tgid,PROC_TGID_INO); 2240 if (tid && (nr > 0)) {
2242 unsigned long j = PROC_NUMBUF; 2241 pos = find_task_by_pid(tid);
2242 if (pos && (pos->group_leader == leader))
2243 goto found;
2244 }
2243 2245
2244			do 2246	/* If nr exceeds the number of threads there is nothing to do */
2245 buf[--j] = '0' + (tgid % 10); 2247 pos = NULL;
2246 while ((tgid /= 10) != 0); 2248 if (nr && nr >= get_nr_threads(leader))
2249 goto out;
2247 2250
2248 if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) { 2251 /* If we haven't found our starting place yet start
2249 /* returning this tgid failed, save it as the first 2252 * with the leader and walk nr threads forward.
2250 * pid for the next readir call */ 2253 */
2251 filp->f_version = tgid_array[i]; 2254 for (pos = leader; nr > 0; --nr) {
2252 goto out; 2255 pos = next_thread(pos);
2253 } 2256 if (pos == leader) {
2254 filp->f_pos++; 2257 pos = NULL;
2255 nr++; 2258 goto out;
2256 } 2259 }
2257 } 2260 }
2261found:
2262 get_task_struct(pos);
2258out: 2263out:
2259 return 0; 2264 rcu_read_unlock();
2265 return pos;
2266}
2267
2268/*
2269 * Find the next thread in the thread list.
2270 * Return NULL if there is an error or no next thread.
2271 *
2272 * The reference to the input task_struct is released.
2273 */
2274static struct task_struct *next_tid(struct task_struct *start)
2275{
2276 struct task_struct *pos = NULL;
2277 rcu_read_lock();
2278 if (pid_alive(start)) {
2279 pos = next_thread(start);
2280 if (thread_group_leader(pos))
2281 pos = NULL;
2282 else
2283 get_task_struct(pos);
2284 }
2285 rcu_read_unlock();
2286 put_task_struct(start);
2287 return pos;
2260} 2288}
2261 2289
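first_tid()/next_tid(), like first_tgid()/next_tgid() above, follow a hand-off contract: each call consumes the reference on the task passed in and returns a fresh reference on the next task, or NULL at the end. The readdir loops therefore hold exactly one task reference at a time and may sleep in filldir() without holding rcu_read_lock(). A hedged caller-side sketch (walk_threads() and must_stop() are hypothetical):

static void walk_threads(struct task_struct *leader, int tid, int nr)
{
        struct task_struct *t;

        for (t = first_tid(leader, tid, nr); t; t = next_tid(t)) {
                if (must_stop(t)) {
                        put_task_struct(t);     /* we still own this ref */
                        break;
                }
                /* otherwise the next next_tid(t) releases it for us */
        }
}
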
2262/* for the /proc/TGID/task/ directories */ 2290/* for the /proc/TGID/task/ directories */
2263static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) 2291static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
2264{ 2292{
2265 unsigned int tid_array[PROC_MAXPIDS];
2266 char buf[PROC_NUMBUF]; 2293 char buf[PROC_NUMBUF];
2267 unsigned int nr_tids, i;
2268 struct dentry *dentry = filp->f_dentry; 2294 struct dentry *dentry = filp->f_dentry;
2269 struct inode *inode = dentry->d_inode; 2295 struct inode *inode = dentry->d_inode;
2296 struct task_struct *leader = get_proc_task(inode);
2297 struct task_struct *task;
2270 int retval = -ENOENT; 2298 int retval = -ENOENT;
2271 ino_t ino; 2299 ino_t ino;
2300 int tid;
2272 unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ 2301 unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */
2273 2302
2274 if (!pid_alive(proc_task(inode))) 2303 if (!leader)
2275 goto out; 2304 goto out_no_task;
2276 retval = 0; 2305 retval = 0;
2277 2306
2278 switch (pos) { 2307 switch (pos) {
@@ -2290,24 +2319,45 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
2290 /* fall through */ 2319 /* fall through */
2291 } 2320 }
2292 2321
2293	nr_tids = get_tid_list(pos, tid_array, inode); 2322	/* f_version caches the tid value that the last readdir call couldn't
2294 inode->i_nlink = pos + nr_tids; 2323 * return. lseek aka telldir automagically resets f_version to 0.
2295 2324 */
2296 for (i = 0; i < nr_tids; i++) { 2325 tid = filp->f_version;
2297 unsigned long j = PROC_NUMBUF; 2326 filp->f_version = 0;
2298 int tid = tid_array[i]; 2327 for (task = first_tid(leader, tid, pos - 2);
2299 2328 task;
2300 ino = fake_ino(tid,PROC_TID_INO); 2329 task = next_tid(task), pos++) {
2301 2330 int len;
2302 do 2331 tid = task->pid;
2303 buf[--j] = '0' + (tid % 10); 2332 len = snprintf(buf, sizeof(buf), "%d", tid);
2304 while ((tid /= 10) != 0); 2333 ino = fake_ino(tid, PROC_TID_INO);
2305 2334		if (filldir(dirent, buf, len, pos, ino, DT_DIR) < 0) {
2306		if (filldir(dirent, buf+j, PROC_NUMBUF-j, pos, ino, DT_DIR) < 0) 2335			/* returning this tid failed, save it as the first
2336			 * pid for the next readdir call */
2337 filp->f_version = tid;
2338 put_task_struct(task);
2307 break; 2339 break;
2308 pos++; 2340 }
2309 } 2341 }
2310out: 2342out:
2311 filp->f_pos = pos; 2343 filp->f_pos = pos;
2344 put_task_struct(leader);
2345out_no_task:
2312 return retval; 2346 return retval;
2313} 2347}
2348
2349static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
2350{
2351 struct inode *inode = dentry->d_inode;
2352 struct task_struct *p = get_proc_task(inode);
2353 generic_fillattr(inode, stat);
2354
2355 if (p) {
2356 rcu_read_lock();
2357 stat->nlink += get_nr_threads(p);
2358 rcu_read_unlock();
2359 put_task_struct(p);
2360 }
2361
2362 return 0;
2363}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 722b9c463111..6dcef089e18e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de)
58static void proc_delete_inode(struct inode *inode) 58static void proc_delete_inode(struct inode *inode)
59{ 59{
60 struct proc_dir_entry *de; 60 struct proc_dir_entry *de;
61 struct task_struct *tsk;
62 61
63 truncate_inode_pages(&inode->i_data, 0); 62 truncate_inode_pages(&inode->i_data, 0);
64 63
65 /* Let go of any associated process */ 64 /* Stop tracking associated processes */
66 tsk = PROC_I(inode)->task; 65 put_pid(PROC_I(inode)->pid);
67 if (tsk)
68 put_task_struct(tsk);
69 66
70 /* Let go of any associated proc directory entry */ 67 /* Let go of any associated proc directory entry */
71 de = PROC_I(inode)->pde; 68 de = PROC_I(inode)->pde;
@@ -94,8 +91,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
94 ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL); 91 ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL);
95 if (!ei) 92 if (!ei)
96 return NULL; 93 return NULL;
97 ei->task = NULL; 94 ei->pid = NULL;
98 ei->type = 0; 95 ei->fd = 0;
99 ei->op.proc_get_link = NULL; 96 ei->op.proc_get_link = NULL;
100 ei->pde = NULL; 97 ei->pde = NULL;
101 inode = &ei->vfs_inode; 98 inode = &ei->vfs_inode;
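With this change a /proc inode pins a struct pid instead of a task_struct, so a dead task's much larger task_struct can be freed even while its /proc dentries and inodes linger; the put_pid() above is the whole teardown. A hedged sketch of the round trip, using only helpers that appear in this patch:

static void pid_round_trip(struct task_struct *task)
{
        /* at inode setup: take a pid reference, not a task reference */
        struct pid *pid = get_pid(task->pids[PIDTYPE_PID].pid);
        struct task_struct *t;

        rcu_read_lock();
        t = pid_task(pid, PIDTYPE_PID);         /* NULL once the task exits */
        if (t)
                get_task_struct(t);             /* pin across sleeping code */
        rcu_read_unlock();

        if (t) {
                /* ... operate on t ... */
                put_task_struct(t);
        }

        put_pid(pid);   /* what proc_delete_inode() now does at teardown */
}
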
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 0502f17b860d..146a434ba944 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -37,16 +37,30 @@ extern int proc_tgid_stat(struct task_struct *, char *);
37extern int proc_pid_status(struct task_struct *, char *); 37extern int proc_pid_status(struct task_struct *, char *);
38extern int proc_pid_statm(struct task_struct *, char *); 38extern int proc_pid_statm(struct task_struct *, char *);
39 39
40extern struct file_operations proc_maps_operations;
41extern struct file_operations proc_numa_maps_operations;
42extern struct file_operations proc_smaps_operations;
43
40void free_proc_entry(struct proc_dir_entry *de); 49void free_proc_entry(struct proc_dir_entry *de);
41 50
42int proc_init_inodecache(void); 51int proc_init_inodecache(void);
43 52
44static inline struct task_struct *proc_task(struct inode *inode) 53static inline struct pid *proc_pid(struct inode *inode)
54{
55 return PROC_I(inode)->pid;
56}
57
58static inline struct task_struct *get_proc_task(struct inode *inode)
45{ 59{
46 return PROC_I(inode)->task; 60 return get_pid_task(proc_pid(inode), PIDTYPE_PID);
47} 61}
48 62
49static inline int proc_type(struct inode *inode) 63static inline int proc_fd(struct inode *inode)
50{ 64{
51 return PROC_I(inode)->type; 65 return PROC_I(inode)->fd;
52} 66}
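get_proc_task() replaces the old proc_task() accessor: instead of dereferencing a cached task pointer, callers resolve the pinned struct pid to a task on demand, must handle NULL (the task has already exited), and must drop the reference when finished. The caller pattern, sketched as a hypothetical handler:

static ssize_t example_op(struct inode *inode)  /* hypothetical */
{
        struct task_struct *task = get_proc_task(inode);
        ssize_t ret = -ESRCH;

        if (!task)
                return ret;             /* task already gone */
        /* ... operate on task; may sleep ... */
        ret = 0;
        put_task_struct(task);
        return ret;
}
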
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 91b7c15ab373..0a163a4f7764 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount *
75{ 75{
76 struct vm_area_struct * vma; 76 struct vm_area_struct * vma;
77 int result = -ENOENT; 77 int result = -ENOENT;
78 struct task_struct *task = proc_task(inode); 78 struct task_struct *task = get_proc_task(inode);
79 struct mm_struct * mm = get_task_mm(task); 79 struct mm_struct * mm = NULL;
80 80
81 if (task) {
82 mm = get_task_mm(task);
83 put_task_struct(task);
84 }
81 if (!mm) 85 if (!mm)
82 goto out; 86 goto out;
83 down_read(&mm->mmap_sem); 87 down_read(&mm->mmap_sem);
@@ -118,9 +122,15 @@ struct mem_size_stats
118 unsigned long private_dirty; 122 unsigned long private_dirty;
119}; 123};
120 124
125__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
126{
127 return NULL;
128}
129
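The __attribute__((weak)) definition above supplies a default arch_vma_name() that an architecture can override simply by linking a strong definition of the same symbol; no header or Kconfig plumbing is needed. A small userspace demonstration of the mechanism (the names are hypothetical):

/* weak_demo.c; build with: cc weak_demo.c [strong.c] */
#include <stdio.h>

__attribute__((weak)) const char *impl_name(void)
{
        return "default";
}

int main(void)
{
        puts(impl_name());      /* "default" unless a strong impl_name()
                                 * is linked in, e.g. from strong.c:
                                 * const char *impl_name(void)
                                 * { return "arch specific"; } */
        return 0;
}
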
121static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) 130static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
122{ 131{
123 struct task_struct *task = m->private; 132 struct proc_maps_private *priv = m->private;
133 struct task_struct *task = priv->task;
124 struct vm_area_struct *vma = v; 134 struct vm_area_struct *vma = v;
125 struct mm_struct *mm = vma->vm_mm; 135 struct mm_struct *mm = vma->vm_mm;
126 struct file *file = vma->vm_file; 136 struct file *file = vma->vm_file;
@@ -153,22 +163,23 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
153 pad_len_spaces(m, len); 163 pad_len_spaces(m, len);
154 seq_path(m, file->f_vfsmnt, file->f_dentry, "\n"); 164 seq_path(m, file->f_vfsmnt, file->f_dentry, "\n");
155 } else { 165 } else {
156 if (mm) { 166 const char *name = arch_vma_name(vma);
157 if (vma->vm_start <= mm->start_brk && 167 if (!name) {
168 if (mm) {
169 if (vma->vm_start <= mm->start_brk &&
158 vma->vm_end >= mm->brk) { 170 vma->vm_end >= mm->brk) {
159 pad_len_spaces(m, len); 171 name = "[heap]";
160 seq_puts(m, "[heap]"); 172 } else if (vma->vm_start <= mm->start_stack &&
161 } else { 173 vma->vm_end >= mm->start_stack) {
162 if (vma->vm_start <= mm->start_stack && 174 name = "[stack]";
163 vma->vm_end >= mm->start_stack) {
164
165 pad_len_spaces(m, len);
166 seq_puts(m, "[stack]");
167 } 175 }
176 } else {
177 name = "[vdso]";
168 } 178 }
169 } else { 179 }
180 if (name) {
170 pad_len_spaces(m, len); 181 pad_len_spaces(m, len);
171 seq_puts(m, "[vdso]"); 182 seq_puts(m, name);
172 } 183 }
173 } 184 }
174 seq_putc(m, '\n'); 185 seq_putc(m, '\n');
@@ -295,12 +306,16 @@ static int show_smap(struct seq_file *m, void *v)
295 306
296static void *m_start(struct seq_file *m, loff_t *pos) 307static void *m_start(struct seq_file *m, loff_t *pos)
297{ 308{
298 struct task_struct *task = m->private; 309 struct proc_maps_private *priv = m->private;
299 unsigned long last_addr = m->version; 310 unsigned long last_addr = m->version;
300 struct mm_struct *mm; 311 struct mm_struct *mm;
301 struct vm_area_struct *vma, *tail_vma; 312 struct vm_area_struct *vma, *tail_vma = NULL;
302 loff_t l = *pos; 313 loff_t l = *pos;
303 314
315 /* Clear the per syscall fields in priv */
316 priv->task = NULL;
317 priv->tail_vma = NULL;
318
304 /* 319 /*
305 * We remember last_addr rather than next_addr to hit with 320 * We remember last_addr rather than next_addr to hit with
306 * mmap_cache most of the time. We have zero last_addr at 321 * mmap_cache most of the time. We have zero last_addr at
@@ -311,11 +326,15 @@ static void *m_start(struct seq_file *m, loff_t *pos)
311 if (last_addr == -1UL) 326 if (last_addr == -1UL)
312 return NULL; 327 return NULL;
313 328
314 mm = get_task_mm(task); 329 priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
330 if (!priv->task)
331 return NULL;
332
333 mm = get_task_mm(priv->task);
315 if (!mm) 334 if (!mm)
316 return NULL; 335 return NULL;
317 336
318 tail_vma = get_gate_vma(task); 337 priv->tail_vma = tail_vma = get_gate_vma(priv->task);
319 down_read(&mm->mmap_sem); 338 down_read(&mm->mmap_sem);
320 339
321 /* Start with last addr hint */ 340 /* Start with last addr hint */
@@ -350,11 +369,9 @@ out:
350 return tail_vma; 369 return tail_vma;
351} 370}
352 371
353static void m_stop(struct seq_file *m, void *v) 372static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
354{ 373{
355 struct task_struct *task = m->private; 374 if (vma && vma != priv->tail_vma) {
356 struct vm_area_struct *vma = v;
357 if (vma && vma != get_gate_vma(task)) {
358 struct mm_struct *mm = vma->vm_mm; 375 struct mm_struct *mm = vma->vm_mm;
359 up_read(&mm->mmap_sem); 376 up_read(&mm->mmap_sem);
360 mmput(mm); 377 mmput(mm);
@@ -363,38 +380,103 @@ static void m_stop(struct seq_file *m, void *v)
363 380
364static void *m_next(struct seq_file *m, void *v, loff_t *pos) 381static void *m_next(struct seq_file *m, void *v, loff_t *pos)
365{ 382{
366 struct task_struct *task = m->private; 383 struct proc_maps_private *priv = m->private;
367 struct vm_area_struct *vma = v; 384 struct vm_area_struct *vma = v;
368 struct vm_area_struct *tail_vma = get_gate_vma(task); 385 struct vm_area_struct *tail_vma = priv->tail_vma;
369 386
370 (*pos)++; 387 (*pos)++;
371 if (vma && (vma != tail_vma) && vma->vm_next) 388 if (vma && (vma != tail_vma) && vma->vm_next)
372 return vma->vm_next; 389 return vma->vm_next;
373 m_stop(m, v); 390 vma_stop(priv, vma);
374 return (vma != tail_vma)? tail_vma: NULL; 391 return (vma != tail_vma)? tail_vma: NULL;
375} 392}
376 393
377struct seq_operations proc_pid_maps_op = { 394static void m_stop(struct seq_file *m, void *v)
395{
396 struct proc_maps_private *priv = m->private;
397 struct vm_area_struct *vma = v;
398
399 vma_stop(priv, vma);
400 if (priv->task)
401 put_task_struct(priv->task);
402}
403
404static struct seq_operations proc_pid_maps_op = {
378 .start = m_start, 405 .start = m_start,
379 .next = m_next, 406 .next = m_next,
380 .stop = m_stop, 407 .stop = m_stop,
381 .show = show_map 408 .show = show_map
382}; 409};
383 410
384struct seq_operations proc_pid_smaps_op = { 411static struct seq_operations proc_pid_smaps_op = {
385 .start = m_start, 412 .start = m_start,
386 .next = m_next, 413 .next = m_next,
387 .stop = m_stop, 414 .stop = m_stop,
388 .show = show_smap 415 .show = show_smap
389}; 416};
390 417
418static int do_maps_open(struct inode *inode, struct file *file,
419 struct seq_operations *ops)
420{
421 struct proc_maps_private *priv;
422 int ret = -ENOMEM;
423 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
424 if (priv) {
425 priv->pid = proc_pid(inode);
426 ret = seq_open(file, ops);
427 if (!ret) {
428 struct seq_file *m = file->private_data;
429 m->private = priv;
430 } else {
431 kfree(priv);
432 }
433 }
434 return ret;
435}
436
437static int maps_open(struct inode *inode, struct file *file)
438{
439 return do_maps_open(inode, file, &proc_pid_maps_op);
440}
441
442struct file_operations proc_maps_operations = {
443 .open = maps_open,
444 .read = seq_read,
445 .llseek = seq_lseek,
446 .release = seq_release_private,
447};
448
391#ifdef CONFIG_NUMA 449#ifdef CONFIG_NUMA
392extern int show_numa_map(struct seq_file *m, void *v); 450extern int show_numa_map(struct seq_file *m, void *v);
393 451
394struct seq_operations proc_pid_numa_maps_op = { 452static struct seq_operations proc_pid_numa_maps_op = {
395 .start = m_start, 453 .start = m_start,
396 .next = m_next, 454 .next = m_next,
397 .stop = m_stop, 455 .stop = m_stop,
398 .show = show_numa_map 456 .show = show_numa_map
399}; 457};
458
459static int numa_maps_open(struct inode *inode, struct file *file)
460{
461 return do_maps_open(inode, file, &proc_pid_numa_maps_op);
462}
463
464struct file_operations proc_numa_maps_operations = {
465 .open = numa_maps_open,
466 .read = seq_read,
467 .llseek = seq_lseek,
468 .release = seq_release_private,
469};
400#endif 470#endif
471
472static int smaps_open(struct inode *inode, struct file *file)
473{
474 return do_maps_open(inode, file, &proc_pid_smaps_op);
475}
476
477struct file_operations proc_smaps_operations = {
478 .open = smaps_open,
479 .read = seq_read,
480 .llseek = seq_lseek,
481 .release = seq_release_private,
482};
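
The new do_maps_open() above follows the standard seq_file pattern for per-open state: allocate a private structure, call seq_open(), and stash the structure in m->private so m_start()/m_next()/m_stop() can reach the pid and the cached task. The matching release hook is seq_release_private(), which frees that allocation; a simplified sketch of what it does, modelled on fs/seq_file.c of this era and shown here only for orientation:

/* Sketch, modelled on fs/seq_file.c: free the kzalloc()ed private
 * data installed by do_maps_open() before normal seq_file teardown. */
int seq_release_private(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;

	kfree(seq->private);
	seq->private = NULL;
	return seq_release(inode, file);
}

Holding a struct pid rather than a task_struct in that private data means the task is re-looked-up with get_pid_task() on each traversal and dropped again in m_stop(), so a long-lived open file descriptor no longer pins the task structure itself.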
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 8f68827ed10e..af69f28277b6 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -156,9 +156,28 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos)
156{ 156{
157 return NULL; 157 return NULL;
158} 158}
159struct seq_operations proc_pid_maps_op = { 159static struct seq_operations proc_pid_maps_op = {
160 .start = m_start, 160 .start = m_start,
161 .next = m_next, 161 .next = m_next,
162 .stop = m_stop, 162 .stop = m_stop,
163 .show = show_map 163 .show = show_map
164}; 164};
165
166static int maps_open(struct inode *inode, struct file *file)
167{
168 int ret;
169 ret = seq_open(file, &proc_pid_maps_op);
170 if (!ret) {
171 struct seq_file *m = file->private_data;
172 m->private = NULL;
173 }
174 return ret;
175}
176
177struct file_operations proc_maps_operations = {
178 .open = maps_open,
179 .read = seq_read,
180 .llseek = seq_lseek,
181 .release = seq_release,
182};
183
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index cf6e1cf40351..752cea12e30f 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1560,12 +1560,6 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
1560 return res; 1560 return res;
1561} 1561}
1562 1562
1563static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf,
1564 size_t count, loff_t pos)
1565{
1566 return generic_file_aio_write(iocb, buf, count, pos);
1567}
1568
1569const struct file_operations reiserfs_file_operations = { 1563const struct file_operations reiserfs_file_operations = {
1570 .read = generic_file_read, 1564 .read = generic_file_read,
1571 .write = reiserfs_file_write, 1565 .write = reiserfs_file_write,
@@ -1575,7 +1569,7 @@ const struct file_operations reiserfs_file_operations = {
1575 .fsync = reiserfs_sync_file, 1569 .fsync = reiserfs_sync_file,
1576 .sendfile = generic_file_sendfile, 1570 .sendfile = generic_file_sendfile,
1577 .aio_read = generic_file_aio_read, 1571 .aio_read = generic_file_aio_read,
1578 .aio_write = reiserfs_aio_write, 1572 .aio_write = generic_file_aio_write,
1579 .splice_read = generic_file_splice_read, 1573 .splice_read = generic_file_splice_read,
1580 .splice_write = generic_file_splice_write, 1574 .splice_write = generic_file_splice_write,
1581}; 1575};
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 1b73529b8099..49d1a53dbef0 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -834,8 +834,7 @@ static int write_ordered_buffers(spinlock_t * lock,
834 get_bh(bh); 834 get_bh(bh);
835 if (test_set_buffer_locked(bh)) { 835 if (test_set_buffer_locked(bh)) {
836 if (!buffer_dirty(bh)) { 836 if (!buffer_dirty(bh)) {
837 list_del_init(&jh->list); 837 list_move(&jh->list, &tmp);
838 list_add(&jh->list, &tmp);
839 goto loop_next; 838 goto loop_next;
840 } 839 }
841 spin_unlock(lock); 840 spin_unlock(lock);
@@ -855,8 +854,7 @@ static int write_ordered_buffers(spinlock_t * lock,
855 ret = -EIO; 854 ret = -EIO;
856 } 855 }
857 if (buffer_dirty(bh)) { 856 if (buffer_dirty(bh)) {
858 list_del_init(&jh->list); 857 list_move(&jh->list, &tmp);
859 list_add(&jh->list, &tmp);
860 add_to_chunk(&chunk, bh, lock, write_ordered_chunk); 858 add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
861 } else { 859 } else {
862 reiserfs_free_jh(bh); 860 reiserfs_free_jh(bh);
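
This hunk, like the fs/smbfs and fs/sysfs hunks further down, replaces an open-coded list_del()/list_del_init() plus list_add() pair with list_move() (or list_move_tail() for tail insertion). The two forms are equivalent because list_add() rewrites the entry's link pointers anyway. A self-contained userspace sketch of the helpers, simplified from include/linux/list.h with debug poisoning omitted:

#include <assert.h>

struct list_head {
	struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void __list_del(struct list_head *prev, struct list_head *next)
{
	next->prev = prev;
	prev->next = next;
}

static void list_add(struct list_head *new, struct list_head *head)
{
	new->prev = head;
	new->next = head->next;
	head->next->prev = new;
	head->next = new;
}

/* unlink @entry from its current list and splice it after @head */
static void list_move(struct list_head *entry, struct list_head *head)
{
	__list_del(entry->prev, entry->next);
	list_add(entry, head);
}

int main(void)
{
	struct list_head a = LIST_HEAD_INIT(a), b = LIST_HEAD_INIT(b);
	struct list_head node;

	list_add(&node, &a);	/* node lives on list a */
	list_move(&node, &b);	/* one call instead of del + add */
	assert(a.next == &a && b.next == &node);
	return 0;
}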
diff --git a/fs/select.c b/fs/select.c
index 9c4f0f2604f1..33b72ba0f86f 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -746,9 +746,9 @@ out_fds:
746asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, 746asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
747 long timeout_msecs) 747 long timeout_msecs)
748{ 748{
749 s64 timeout_jiffies = 0; 749 s64 timeout_jiffies;
750 750
751 if (timeout_msecs) { 751 if (timeout_msecs > 0) {
752#if HZ > 1000 752#if HZ > 1000
753 /* We can only overflow if HZ > 1000 */ 753 /* We can only overflow if HZ > 1000 */
754 if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ) 754 if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
@@ -756,6 +756,9 @@ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
756 else 756 else
757#endif 757#endif
758 timeout_jiffies = msecs_to_jiffies(timeout_msecs); 758 timeout_jiffies = msecs_to_jiffies(timeout_msecs);
759 } else {
760 /* Infinite (< 0) or no (0) timeout */
761 timeout_jiffies = timeout_msecs;
759 } 762 }
760 763
761 return do_sys_poll(ufds, nfds, &timeout_jiffies); 764 return do_sys_poll(ufds, nfds, &timeout_jiffies);
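
Before this fix, any non-zero timeout_msecs, including the negative values userspace passes to request an infinite poll, fell into the conversion branch and was mangled by msecs_to_jiffies(); now only positive timeouts are converted, while 0 (return immediately) and negative values (block indefinitely) reach do_sys_poll() unchanged. A userspace model of the corrected branch; the HZ value and the simplified msecs_to_jiffies() below are stand-ins, not the kernel's exact definitions:

#include <stdio.h>
#include <stdint.h>

#define HZ 1000	/* stand-in tick rate */

/* stand-in for the kernel helper: round milliseconds up to ticks */
static int64_t msecs_to_jiffies(long msecs)
{
	return ((int64_t)msecs * HZ + 999) / 1000;
}

/* mirrors the fixed sys_poll() timeout handling */
static int64_t poll_timeout_jiffies(long timeout_msecs)
{
	int64_t timeout_jiffies;

	if (timeout_msecs > 0)
		timeout_jiffies = msecs_to_jiffies(timeout_msecs);
	else
		/* infinite (< 0) or no (0) timeout: pass through as-is */
		timeout_jiffies = timeout_msecs;
	return timeout_jiffies;
}

int main(void)
{
	printf("%lld\n", (long long)poll_timeout_jiffies(250));	/* 250 */
	printf("%lld\n", (long long)poll_timeout_jiffies(0));	/* 0 */
	printf("%lld\n", (long long)poll_timeout_jiffies(-1));	/* -1 */
	return 0;
}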
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index c71dd2760d32..c8e96195b96e 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -400,8 +400,7 @@ static int smb_request_send_req(struct smb_request *req)
400 if (!(req->rq_flags & SMB_REQ_TRANSMITTED)) 400 if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
401 goto out; 401 goto out;
402 402
403 list_del_init(&req->rq_queue); 403 list_move_tail(&req->rq_queue, &server->recvq);
404 list_add_tail(&req->rq_queue, &server->recvq);
405 result = 1; 404 result = 1;
406out: 405out:
407 return result; 406 return result;
@@ -435,8 +434,7 @@ int smb_request_send_server(struct smb_sb_info *server)
435 result = smb_request_send_req(req); 434 result = smb_request_send_req(req);
436 if (result < 0) { 435 if (result < 0) {
437 server->conn_error = result; 436 server->conn_error = result;
438 list_del_init(&req->rq_queue); 437 list_move(&req->rq_queue, &server->xmitq);
439 list_add(&req->rq_queue, &server->xmitq);
440 result = -EIO; 438 result = -EIO;
441 goto out; 439 goto out;
442 } 440 }
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 481a97a423fa..24577e2c489b 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -20,6 +20,7 @@
20#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/net.h> 22#include <linux/net.h>
23#include <linux/kthread.h>
23#include <net/ip.h> 24#include <net/ip.h>
24 25
25#include <linux/smb_fs.h> 26#include <linux/smb_fs.h>
@@ -40,7 +41,7 @@ enum smbiod_state {
40}; 41};
41 42
42static enum smbiod_state smbiod_state = SMBIOD_DEAD; 43static enum smbiod_state smbiod_state = SMBIOD_DEAD;
43static pid_t smbiod_pid; 44static struct task_struct *smbiod_thread;
44static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait); 45static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
45static LIST_HEAD(smb_servers); 46static LIST_HEAD(smb_servers);
46static DEFINE_SPINLOCK(servers_lock); 47static DEFINE_SPINLOCK(servers_lock);
@@ -67,20 +68,29 @@ void smbiod_wake_up(void)
67 */ 68 */
68static int smbiod_start(void) 69static int smbiod_start(void)
69{ 70{
70 pid_t pid; 71 struct task_struct *tsk;
72 int err = 0;
73
71 if (smbiod_state != SMBIOD_DEAD) 74 if (smbiod_state != SMBIOD_DEAD)
72 return 0; 75 return 0;
73 smbiod_state = SMBIOD_STARTING; 76 smbiod_state = SMBIOD_STARTING;
74 __module_get(THIS_MODULE); 77 __module_get(THIS_MODULE);
75 spin_unlock(&servers_lock); 78 spin_unlock(&servers_lock);
76 pid = kernel_thread(smbiod, NULL, 0); 79 tsk = kthread_run(smbiod, NULL, "smbiod");
77 if (pid < 0) 80 if (IS_ERR(tsk)) {
81 err = PTR_ERR(tsk);
78 module_put(THIS_MODULE); 82 module_put(THIS_MODULE);
83 }
79 84
80 spin_lock(&servers_lock); 85 spin_lock(&servers_lock);
81 smbiod_state = pid < 0 ? SMBIOD_DEAD : SMBIOD_RUNNING; 86 if (err < 0) {
82 smbiod_pid = pid; 87 smbiod_state = SMBIOD_DEAD;
83 return pid; 88 smbiod_thread = NULL;
89 } else {
90 smbiod_state = SMBIOD_RUNNING;
91 smbiod_thread = tsk;
92 }
93 return err;
84} 94}
85 95
86/* 96/*
@@ -183,8 +193,7 @@ int smbiod_retry(struct smb_sb_info *server)
183 if (req->rq_flags & SMB_REQ_RETRY) { 193 if (req->rq_flags & SMB_REQ_RETRY) {
184 /* must move the request to the xmitq */ 194 /* must move the request to the xmitq */
185 VERBOSE("retrying request %p on recvq\n", req); 195 VERBOSE("retrying request %p on recvq\n", req);
186 list_del(&req->rq_queue); 196 list_move(&req->rq_queue, &server->xmitq);
187 list_add(&req->rq_queue, &server->xmitq);
188 continue; 197 continue;
189 } 198 }
190#endif 199#endif
@@ -290,8 +299,6 @@ out:
290 */ 299 */
291static int smbiod(void *unused) 300static int smbiod(void *unused)
292{ 301{
293 daemonize("smbiod");
294
295 allow_signal(SIGKILL); 302 allow_signal(SIGKILL);
296 303
297 VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid); 304 VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
diff --git a/fs/super.c b/fs/super.c
index 057b5325b7ef..8a669f6f3f52 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -871,8 +871,6 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data)
871 return mnt; 871 return mnt;
872} 872}
873 873
874EXPORT_SYMBOL_GPL(do_kern_mount);
875
876struct vfsmount *kern_mount(struct file_system_type *type) 874struct vfsmount *kern_mount(struct file_system_type *type)
877{ 875{
878 return vfs_kern_mount(type, 0, type->name, NULL); 876 return vfs_kern_mount(type, 0, type->name, NULL);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 610b5bdbe75b..61c42430cba3 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -430,10 +430,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
430 i++; 430 i++;
431 /* fallthrough */ 431 /* fallthrough */
432 default: 432 default:
433 if (filp->f_pos == 2) { 433 if (filp->f_pos == 2)
434 list_del(q); 434 list_move(q, &parent_sd->s_children);
435 list_add(q, &parent_sd->s_children); 435
436 }
437 for (p=q->next; p!= &parent_sd->s_children; p=p->next) { 436 for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
438 struct sysfs_dirent *next; 437 struct sysfs_dirent *next;
439 const char * name; 438 const char * name;
@@ -455,8 +454,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
455 dt_type(next)) < 0) 454 dt_type(next)) < 0)
456 return 0; 455 return 0;
457 456
458 list_del(q); 457 list_move(q, p);
459 list_add(q, p);
460 p = q; 458 p = q;
461 filp->f_pos++; 459 filp->f_pos++;
462 } 460 }
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 3ada9dcf55b8..95b878e5c7a0 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -21,14 +21,6 @@
21#include "swab.h" 21#include "swab.h"
22#include "util.h" 22#include "util.h"
23 23
24#undef UFS_BALLOC_DEBUG
25
26#ifdef UFS_BALLOC_DEBUG
27#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x;
28#else
29#define UFSD(x)
30#endif
31
32static unsigned ufs_add_fragments (struct inode *, unsigned, unsigned, unsigned, int *); 24static unsigned ufs_add_fragments (struct inode *, unsigned, unsigned, unsigned, int *);
33static unsigned ufs_alloc_fragments (struct inode *, unsigned, unsigned, unsigned, int *); 25static unsigned ufs_alloc_fragments (struct inode *, unsigned, unsigned, unsigned, int *);
34static unsigned ufs_alloccg_block (struct inode *, struct ufs_cg_private_info *, unsigned, int *); 26static unsigned ufs_alloccg_block (struct inode *, struct ufs_cg_private_info *, unsigned, int *);
@@ -39,7 +31,8 @@ static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *,
39/* 31/*
40 * Free 'count' fragments from fragment number 'fragment' 32 * Free 'count' fragments from fragment number 'fragment'
41 */ 33 */
42void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count) { 34void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
35{
43 struct super_block * sb; 36 struct super_block * sb;
44 struct ufs_sb_private_info * uspi; 37 struct ufs_sb_private_info * uspi;
45 struct ufs_super_block_first * usb1; 38 struct ufs_super_block_first * usb1;
@@ -51,7 +44,7 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
51 uspi = UFS_SB(sb)->s_uspi; 44 uspi = UFS_SB(sb)->s_uspi;
52 usb1 = ubh_get_usb_first(uspi); 45 usb1 = ubh_get_usb_first(uspi);
53 46
54 UFSD(("ENTER, fragment %u, count %u\n", fragment, count)) 47 UFSD("ENTER, fragment %u, count %u\n", fragment, count);
55 48
56 if (ufs_fragnum(fragment) + count > uspi->s_fpg) 49 if (ufs_fragnum(fragment) + count > uspi->s_fpg)
57 ufs_error (sb, "ufs_free_fragments", "internal error"); 50 ufs_error (sb, "ufs_free_fragments", "internal error");
@@ -68,7 +61,7 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
68 ucpi = ufs_load_cylinder (sb, cgno); 61 ucpi = ufs_load_cylinder (sb, cgno);
69 if (!ucpi) 62 if (!ucpi)
70 goto failed; 63 goto failed;
71 ucg = ubh_get_ucg (UCPI_UBH); 64 ucg = ubh_get_ucg (UCPI_UBH(ucpi));
72 if (!ufs_cg_chkmagic(sb, ucg)) { 65 if (!ufs_cg_chkmagic(sb, ucg)) {
73 ufs_panic (sb, "ufs_free_fragments", "internal error, bad magic number on cg %u", cgno); 66 ufs_panic (sb, "ufs_free_fragments", "internal error, bad magic number on cg %u", cgno);
74 goto failed; 67 goto failed;
@@ -76,11 +69,11 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
76 69
77 end_bit = bit + count; 70 end_bit = bit + count;
78 bbase = ufs_blknum (bit); 71 bbase = ufs_blknum (bit);
79 blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase); 72 blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
80 ufs_fragacct (sb, blkmap, ucg->cg_frsum, -1); 73 ufs_fragacct (sb, blkmap, ucg->cg_frsum, -1);
81 for (i = bit; i < end_bit; i++) { 74 for (i = bit; i < end_bit; i++) {
82 if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, i)) 75 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, i))
83 ubh_setbit (UCPI_UBH, ucpi->c_freeoff, i); 76 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, i);
84 else 77 else
85 ufs_error (sb, "ufs_free_fragments", 78 ufs_error (sb, "ufs_free_fragments",
86 "bit already cleared for fragment %u", i); 79 "bit already cleared for fragment %u", i);
@@ -90,51 +83,52 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
90 83
91 84
92 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 85 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
93 fs32_add(sb, &usb1->fs_cstotal.cs_nffree, count); 86 uspi->cs_total.cs_nffree += count;
94 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 87 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
95 blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase); 88 blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
96 ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1); 89 ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1);
97 90
98 /* 91 /*
99 * Try to reassemble free fragments into a block 92 * Try to reassemble free fragments into a block
100 */ 93 */
101 blkno = ufs_fragstoblks (bbase); 94 blkno = ufs_fragstoblks (bbase);
102 if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) { 95 if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
103 fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb); 96 fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb);
104 fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, uspi->s_fpb); 97 uspi->cs_total.cs_nffree -= uspi->s_fpb;
105 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb); 98 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb);
106 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 99 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
107 ufs_clusteracct (sb, ucpi, blkno, 1); 100 ufs_clusteracct (sb, ucpi, blkno, 1);
108 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 101 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
109 fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1); 102 uspi->cs_total.cs_nbfree++;
110 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); 103 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
111 cylno = ufs_cbtocylno (bbase); 104 cylno = ufs_cbtocylno (bbase);
112 fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(bbase)), 1); 105 fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(bbase)), 1);
113 fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); 106 fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
114 } 107 }
115 108
116 ubh_mark_buffer_dirty (USPI_UBH); 109 ubh_mark_buffer_dirty (USPI_UBH(uspi));
117 ubh_mark_buffer_dirty (UCPI_UBH); 110 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
118 if (sb->s_flags & MS_SYNCHRONOUS) { 111 if (sb->s_flags & MS_SYNCHRONOUS) {
119 ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi); 112 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
120 ubh_wait_on_buffer (UCPI_UBH); 113 ubh_wait_on_buffer (UCPI_UBH(ucpi));
121 } 114 }
122 sb->s_dirt = 1; 115 sb->s_dirt = 1;
123 116
124 unlock_super (sb); 117 unlock_super (sb);
125 UFSD(("EXIT\n")) 118 UFSD("EXIT\n");
126 return; 119 return;
127 120
128failed: 121failed:
129 unlock_super (sb); 122 unlock_super (sb);
130 UFSD(("EXIT (FAILED)\n")) 123 UFSD("EXIT (FAILED)\n");
131 return; 124 return;
132} 125}
133 126
134/* 127/*
135 * Free 'count' fragments from fragment number 'fragment' (free whole blocks) 128 * Free 'count' fragments from fragment number 'fragment' (free whole blocks)
136 */ 129 */
137void ufs_free_blocks (struct inode * inode, unsigned fragment, unsigned count) { 130void ufs_free_blocks(struct inode *inode, unsigned fragment, unsigned count)
131{
138 struct super_block * sb; 132 struct super_block * sb;
139 struct ufs_sb_private_info * uspi; 133 struct ufs_sb_private_info * uspi;
140 struct ufs_super_block_first * usb1; 134 struct ufs_super_block_first * usb1;
@@ -146,7 +140,7 @@ void ufs_free_blocks (struct inode * inode, unsigned fragment, unsigned count) {
146 uspi = UFS_SB(sb)->s_uspi; 140 uspi = UFS_SB(sb)->s_uspi;
147 usb1 = ubh_get_usb_first(uspi); 141 usb1 = ubh_get_usb_first(uspi);
148 142
149 UFSD(("ENTER, fragment %u, count %u\n", fragment, count)) 143 UFSD("ENTER, fragment %u, count %u\n", fragment, count);
150 144
151 if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) { 145 if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) {
152 ufs_error (sb, "ufs_free_blocks", "internal error, " 146 ufs_error (sb, "ufs_free_blocks", "internal error, "
@@ -162,7 +156,7 @@ do_more:
162 bit = ufs_dtogd (fragment); 156 bit = ufs_dtogd (fragment);
163 if (cgno >= uspi->s_ncg) { 157 if (cgno >= uspi->s_ncg) {
164 ufs_panic (sb, "ufs_free_blocks", "freeing blocks are outside device"); 158 ufs_panic (sb, "ufs_free_blocks", "freeing blocks are outside device");
165 goto failed; 159 goto failed_unlock;
166 } 160 }
167 end_bit = bit + count; 161 end_bit = bit + count;
168 if (end_bit > uspi->s_fpg) { 162 if (end_bit > uspi->s_fpg) {
@@ -173,36 +167,36 @@ do_more:
173 167
174 ucpi = ufs_load_cylinder (sb, cgno); 168 ucpi = ufs_load_cylinder (sb, cgno);
175 if (!ucpi) 169 if (!ucpi)
176 goto failed; 170 goto failed_unlock;
177 ucg = ubh_get_ucg (UCPI_UBH); 171 ucg = ubh_get_ucg (UCPI_UBH(ucpi));
178 if (!ufs_cg_chkmagic(sb, ucg)) { 172 if (!ufs_cg_chkmagic(sb, ucg)) {
179 ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno); 173 ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno);
180 goto failed; 174 goto failed_unlock;
181 } 175 }
182 176
183 for (i = bit; i < end_bit; i += uspi->s_fpb) { 177 for (i = bit; i < end_bit; i += uspi->s_fpb) {
184 blkno = ufs_fragstoblks(i); 178 blkno = ufs_fragstoblks(i);
185 if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) { 179 if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
186 ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); 180 ufs_error(sb, "ufs_free_blocks", "freeing free fragment");
187 } 181 }
188 ubh_setblock(UCPI_UBH, ucpi->c_freeoff, blkno); 182 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
189 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 183 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
190 ufs_clusteracct (sb, ucpi, blkno, 1); 184 ufs_clusteracct (sb, ucpi, blkno, 1);
191 DQUOT_FREE_BLOCK(inode, uspi->s_fpb); 185 DQUOT_FREE_BLOCK(inode, uspi->s_fpb);
192 186
193 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 187 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
194 fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1); 188 uspi->cs_total.cs_nbfree++;
195 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); 189 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
196 cylno = ufs_cbtocylno(i); 190 cylno = ufs_cbtocylno(i);
197 fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(i)), 1); 191 fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(i)), 1);
198 fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); 192 fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
199 } 193 }
200 194
201 ubh_mark_buffer_dirty (USPI_UBH); 195 ubh_mark_buffer_dirty (USPI_UBH(uspi));
202 ubh_mark_buffer_dirty (UCPI_UBH); 196 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
203 if (sb->s_flags & MS_SYNCHRONOUS) { 197 if (sb->s_flags & MS_SYNCHRONOUS) {
204 ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi); 198 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
205 ubh_wait_on_buffer (UCPI_UBH); 199 ubh_wait_on_buffer (UCPI_UBH(ucpi));
206 } 200 }
207 201
208 if (overflow) { 202 if (overflow) {
@@ -213,38 +207,127 @@ do_more:
213 207
214 sb->s_dirt = 1; 208 sb->s_dirt = 1;
215 unlock_super (sb); 209 unlock_super (sb);
216 UFSD(("EXIT\n")) 210 UFSD("EXIT\n");
217 return; 211 return;
218 212
219failed: 213failed_unlock:
220 unlock_super (sb); 214 unlock_super (sb);
221 UFSD(("EXIT (FAILED)\n")) 215failed:
216 UFSD("EXIT (FAILED)\n");
222 return; 217 return;
223} 218}
224 219
220static struct page *ufs_get_locked_page(struct address_space *mapping,
221 unsigned long index)
222{
223 struct page *page;
224
225try_again:
226 page = find_lock_page(mapping, index);
227 if (!page) {
228 page = read_cache_page(mapping, index,
229 (filler_t*)mapping->a_ops->readpage,
230 NULL);
231 if (IS_ERR(page)) {
232 printk(KERN_ERR "ufs_change_blocknr: "
233 "read_cache_page error: ino %lu, index: %lu\n",
234 mapping->host->i_ino, index);
235 goto out;
236 }
225 237
238 lock_page(page);
226 239
227#define NULLIFY_FRAGMENTS \ 240 if (!PageUptodate(page) || PageError(page)) {
228 for (i = oldcount; i < newcount; i++) { \ 241 unlock_page(page);
229 bh = sb_getblk(sb, result + i); \ 242 page_cache_release(page);
230 memset (bh->b_data, 0, sb->s_blocksize); \ 243
231 set_buffer_uptodate(bh); \ 244 printk(KERN_ERR "ufs_change_blocknr: "
232 mark_buffer_dirty (bh); \ 245 "can not read page: ino %lu, index: %lu\n",
233 if (IS_SYNC(inode)) \ 246 mapping->host->i_ino, index);
234 sync_dirty_buffer(bh); \ 247
235 brelse (bh); \ 248 page = ERR_PTR(-EIO);
249 goto out;
250 }
236 } 251 }
237 252
238unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment, 253 if (unlikely(!page->mapping || !page_has_buffers(page))) {
239 unsigned goal, unsigned count, int * err ) 254 unlock_page(page);
255 page_cache_release(page);
256 goto try_again; /* we really need these buffers */
257 }
258out:
259 return page;
260}
261
262/*
263 * Modify the inode page cache so that we:
264 * have - blocks with b_blocknr equal to oldb...oldb+count-1
265 * get - blocks with b_blocknr equal to newb...newb+count-1
266 * We also assume that blocks oldb...oldb+count-1 are
267 * situated at the end of the file.
268 *
269 * We can get here from ufs_writepage or ufs_prepare_write;
270 * locked_page is an argument of those functions, so it is already locked.
271 */
272static void ufs_change_blocknr(struct inode *inode, unsigned int baseblk,
273 unsigned int count, unsigned int oldb,
274 unsigned int newb, struct page *locked_page)
275{
276 unsigned int blk_per_page = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
277 struct address_space *mapping = inode->i_mapping;
278 pgoff_t index, cur_index = locked_page->index;
279 unsigned int i, j;
280 struct page *page;
281 struct buffer_head *head, *bh;
282
283 UFSD("ENTER, ino %lu, count %u, oldb %u, newb %u\n",
284 inode->i_ino, count, oldb, newb);
285
286 BUG_ON(!PageLocked(locked_page));
287
288 for (i = 0; i < count; i += blk_per_page) {
289 index = (baseblk+i) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
290
291 if (likely(cur_index != index)) {
292 page = ufs_get_locked_page(mapping, index);
293 if (IS_ERR(page))
294 continue;
295 } else
296 page = locked_page;
297
298 j = i;
299 head = page_buffers(page);
300 bh = head;
301 do {
302 if (likely(bh->b_blocknr == j + oldb && j < count)) {
303 unmap_underlying_metadata(bh->b_bdev,
304 bh->b_blocknr);
305 bh->b_blocknr = newb + j++;
306 mark_buffer_dirty(bh);
307 }
308
309 bh = bh->b_this_page;
310 } while (bh != head);
311
312 set_page_dirty(page);
313
314 if (likely(cur_index != index)) {
315 unlock_page(page);
316 page_cache_release(page);
317 }
318 }
319 UFSD("EXIT\n");
320}
321
322unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
323 unsigned goal, unsigned count, int * err, struct page *locked_page)
240{ 324{
241 struct super_block * sb; 325 struct super_block * sb;
242 struct ufs_sb_private_info * uspi; 326 struct ufs_sb_private_info * uspi;
243 struct ufs_super_block_first * usb1; 327 struct ufs_super_block_first * usb1;
244 struct buffer_head * bh; 328 unsigned cgno, oldcount, newcount, tmp, request, result;
245 unsigned cgno, oldcount, newcount, tmp, request, i, result;
246 329
247 UFSD(("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count)) 330 UFSD("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count);
248 331
249 sb = inode->i_sb; 332 sb = inode->i_sb;
250 uspi = UFS_SB(sb)->s_uspi; 333 uspi = UFS_SB(sb)->s_uspi;
@@ -273,14 +356,14 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
273 return (unsigned)-1; 356 return (unsigned)-1;
274 } 357 }
275 if (fragment < UFS_I(inode)->i_lastfrag) { 358 if (fragment < UFS_I(inode)->i_lastfrag) {
276 UFSD(("EXIT (ALREADY ALLOCATED)\n")) 359 UFSD("EXIT (ALREADY ALLOCATED)\n");
277 unlock_super (sb); 360 unlock_super (sb);
278 return 0; 361 return 0;
279 } 362 }
280 } 363 }
281 else { 364 else {
282 if (tmp) { 365 if (tmp) {
283 UFSD(("EXIT (ALREADY ALLOCATED)\n")) 366 UFSD("EXIT (ALREADY ALLOCATED)\n");
284 unlock_super(sb); 367 unlock_super(sb);
285 return 0; 368 return 0;
286 } 369 }
@@ -289,9 +372,9 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
289 /* 372 /*
290 * There is not enough space for user on the device 373 * There is not enough space for user on the device
291 */ 374 */
292 if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(usb1, UFS_MINFREE) <= 0) { 375 if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
293 unlock_super (sb); 376 unlock_super (sb);
294 UFSD(("EXIT (FAILED)\n")) 377 UFSD("EXIT (FAILED)\n");
295 return 0; 378 return 0;
296 } 379 }
297 380
@@ -310,12 +393,10 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
310 if (result) { 393 if (result) {
311 *p = cpu_to_fs32(sb, result); 394 *p = cpu_to_fs32(sb, result);
312 *err = 0; 395 *err = 0;
313 inode->i_blocks += count << uspi->s_nspfshift;
314 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); 396 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
315 NULLIFY_FRAGMENTS
316 } 397 }
317 unlock_super(sb); 398 unlock_super(sb);
318 UFSD(("EXIT, result %u\n", result)) 399 UFSD("EXIT, result %u\n", result);
319 return result; 400 return result;
320 } 401 }
321 402
@@ -325,11 +406,9 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
325 result = ufs_add_fragments (inode, tmp, oldcount, newcount, err); 406 result = ufs_add_fragments (inode, tmp, oldcount, newcount, err);
326 if (result) { 407 if (result) {
327 *err = 0; 408 *err = 0;
328 inode->i_blocks += count << uspi->s_nspfshift;
329 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); 409 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
330 NULLIFY_FRAGMENTS
331 unlock_super(sb); 410 unlock_super(sb);
332 UFSD(("EXIT, result %u\n", result)) 411 UFSD("EXIT, result %u\n", result);
333 return result; 412 return result;
334 } 413 }
335 414
@@ -339,8 +418,8 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
339 switch (fs32_to_cpu(sb, usb1->fs_optim)) { 418 switch (fs32_to_cpu(sb, usb1->fs_optim)) {
340 case UFS_OPTSPACE: 419 case UFS_OPTSPACE:
341 request = newcount; 420 request = newcount;
342 if (uspi->s_minfree < 5 || fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) 421 if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree
343 > uspi->s_dsize * uspi->s_minfree / (2 * 100) ) 422 > uspi->s_dsize * uspi->s_minfree / (2 * 100))
344 break; 423 break;
345 usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); 424 usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
346 break; 425 break;
@@ -349,7 +428,7 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
349 428
350 case UFS_OPTTIME: 429 case UFS_OPTTIME:
351 request = uspi->s_fpb; 430 request = uspi->s_fpb;
352 if (fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) < uspi->s_dsize * 431 if (uspi->cs_total.cs_nffree < uspi->s_dsize *
353 (uspi->s_minfree - 2) / 100) 432 (uspi->s_minfree - 2) / 100)
354 break; 433 break;
355 usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); 434 usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
@@ -357,39 +436,22 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
357 } 436 }
358 result = ufs_alloc_fragments (inode, cgno, goal, request, err); 437 result = ufs_alloc_fragments (inode, cgno, goal, request, err);
359 if (result) { 438 if (result) {
360 for (i = 0; i < oldcount; i++) { 439 ufs_change_blocknr(inode, fragment - oldcount, oldcount, tmp,
361 bh = sb_bread(sb, tmp + i); 440 result, locked_page);
362 if(bh) 441
363 {
364 clear_buffer_dirty(bh);
365 bh->b_blocknr = result + i;
366 mark_buffer_dirty (bh);
367 if (IS_SYNC(inode))
368 sync_dirty_buffer(bh);
369 brelse (bh);
370 }
371 else
372 {
373 printk(KERN_ERR "ufs_new_fragments: bread fail\n");
374 unlock_super(sb);
375 return 0;
376 }
377 }
378 *p = cpu_to_fs32(sb, result); 442 *p = cpu_to_fs32(sb, result);
379 *err = 0; 443 *err = 0;
380 inode->i_blocks += count << uspi->s_nspfshift;
381 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); 444 UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
382 NULLIFY_FRAGMENTS
383 unlock_super(sb); 445 unlock_super(sb);
384 if (newcount < request) 446 if (newcount < request)
385 ufs_free_fragments (inode, result + newcount, request - newcount); 447 ufs_free_fragments (inode, result + newcount, request - newcount);
386 ufs_free_fragments (inode, tmp, oldcount); 448 ufs_free_fragments (inode, tmp, oldcount);
387 UFSD(("EXIT, result %u\n", result)) 449 UFSD("EXIT, result %u\n", result);
388 return result; 450 return result;
389 } 451 }
390 452
391 unlock_super(sb); 453 unlock_super(sb);
392 UFSD(("EXIT (FAILED)\n")) 454 UFSD("EXIT (FAILED)\n");
393 return 0; 455 return 0;
394} 456}
395 457
@@ -404,7 +466,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
404 struct ufs_cylinder_group * ucg; 466 struct ufs_cylinder_group * ucg;
405 unsigned cgno, fragno, fragoff, count, fragsize, i; 467 unsigned cgno, fragno, fragoff, count, fragsize, i;
406 468
407 UFSD(("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount)) 469 UFSD("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount);
408 470
409 sb = inode->i_sb; 471 sb = inode->i_sb;
410 uspi = UFS_SB(sb)->s_uspi; 472 uspi = UFS_SB(sb)->s_uspi;
@@ -419,7 +481,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
419 ucpi = ufs_load_cylinder (sb, cgno); 481 ucpi = ufs_load_cylinder (sb, cgno);
420 if (!ucpi) 482 if (!ucpi)
421 return 0; 483 return 0;
422 ucg = ubh_get_ucg (UCPI_UBH); 484 ucg = ubh_get_ucg (UCPI_UBH(ucpi));
423 if (!ufs_cg_chkmagic(sb, ucg)) { 485 if (!ufs_cg_chkmagic(sb, ucg)) {
424 ufs_panic (sb, "ufs_add_fragments", 486 ufs_panic (sb, "ufs_add_fragments",
425 "internal error, bad magic number on cg %u", cgno); 487 "internal error, bad magic number on cg %u", cgno);
@@ -429,14 +491,14 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
429 fragno = ufs_dtogd (fragment); 491 fragno = ufs_dtogd (fragment);
430 fragoff = ufs_fragnum (fragno); 492 fragoff = ufs_fragnum (fragno);
431 for (i = oldcount; i < newcount; i++) 493 for (i = oldcount; i < newcount; i++)
432 if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i)) 494 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
433 return 0; 495 return 0;
434 /* 496 /*
435 * Block can be extended 497 * Block can be extended
436 */ 498 */
437 ucg->cg_time = cpu_to_fs32(sb, get_seconds()); 499 ucg->cg_time = cpu_to_fs32(sb, get_seconds());
438 for (i = newcount; i < (uspi->s_fpb - fragoff); i++) 500 for (i = newcount; i < (uspi->s_fpb - fragoff); i++)
439 if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i)) 501 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
440 break; 502 break;
441 fragsize = i - oldcount; 503 fragsize = i - oldcount;
442 if (!fs32_to_cpu(sb, ucg->cg_frsum[fragsize])) 504 if (!fs32_to_cpu(sb, ucg->cg_frsum[fragsize]))
@@ -446,7 +508,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
446 if (fragsize != count) 508 if (fragsize != count)
447 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); 509 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
448 for (i = oldcount; i < newcount; i++) 510 for (i = oldcount; i < newcount; i++)
449 ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, fragno + i); 511 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
450 if(DQUOT_ALLOC_BLOCK(inode, count)) { 512 if(DQUOT_ALLOC_BLOCK(inode, count)) {
451 *err = -EDQUOT; 513 *err = -EDQUOT;
452 return 0; 514 return 0;
@@ -454,17 +516,17 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
454 516
455 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); 517 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
456 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 518 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
457 fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count); 519 uspi->cs_total.cs_nffree -= count;
458 520
459 ubh_mark_buffer_dirty (USPI_UBH); 521 ubh_mark_buffer_dirty (USPI_UBH(uspi));
460 ubh_mark_buffer_dirty (UCPI_UBH); 522 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
461 if (sb->s_flags & MS_SYNCHRONOUS) { 523 if (sb->s_flags & MS_SYNCHRONOUS) {
462 ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi); 524 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
463 ubh_wait_on_buffer (UCPI_UBH); 525 ubh_wait_on_buffer (UCPI_UBH(ucpi));
464 } 526 }
465 sb->s_dirt = 1; 527 sb->s_dirt = 1;
466 528
467 UFSD(("EXIT, fragment %u\n", fragment)) 529 UFSD("EXIT, fragment %u\n", fragment);
468 530
469 return fragment; 531 return fragment;
470} 532}
@@ -487,7 +549,7 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno,
487 struct ufs_cylinder_group * ucg; 549 struct ufs_cylinder_group * ucg;
488 unsigned oldcg, i, j, k, result, allocsize; 550 unsigned oldcg, i, j, k, result, allocsize;
489 551
490 UFSD(("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count)) 552 UFSD("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count);
491 553
492 sb = inode->i_sb; 554 sb = inode->i_sb;
493 uspi = UFS_SB(sb)->s_uspi; 555 uspi = UFS_SB(sb)->s_uspi;
@@ -521,14 +583,14 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno,
521 UFS_TEST_FREE_SPACE_CG 583 UFS_TEST_FREE_SPACE_CG
522 } 584 }
523 585
524 UFSD(("EXIT (FAILED)\n")) 586 UFSD("EXIT (FAILED)\n");
525 return 0; 587 return 0;
526 588
527cg_found: 589cg_found:
528 ucpi = ufs_load_cylinder (sb, cgno); 590 ucpi = ufs_load_cylinder (sb, cgno);
529 if (!ucpi) 591 if (!ucpi)
530 return 0; 592 return 0;
531 ucg = ubh_get_ucg (UCPI_UBH); 593 ucg = ubh_get_ucg (UCPI_UBH(ucpi));
532 if (!ufs_cg_chkmagic(sb, ucg)) 594 if (!ufs_cg_chkmagic(sb, ucg))
533 ufs_panic (sb, "ufs_alloc_fragments", 595 ufs_panic (sb, "ufs_alloc_fragments",
534 "internal error, bad magic number on cg %u", cgno); 596 "internal error, bad magic number on cg %u", cgno);
@@ -551,12 +613,12 @@ cg_found:
551 return 0; 613 return 0;
552 goal = ufs_dtogd (result); 614 goal = ufs_dtogd (result);
553 for (i = count; i < uspi->s_fpb; i++) 615 for (i = count; i < uspi->s_fpb; i++)
554 ubh_setbit (UCPI_UBH, ucpi->c_freeoff, goal + i); 616 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
555 i = uspi->s_fpb - count; 617 i = uspi->s_fpb - count;
556 DQUOT_FREE_BLOCK(inode, i); 618 DQUOT_FREE_BLOCK(inode, i);
557 619
558 fs32_add(sb, &ucg->cg_cs.cs_nffree, i); 620 fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
559 fs32_add(sb, &usb1->fs_cstotal.cs_nffree, i); 621 uspi->cs_total.cs_nffree += i;
560 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i); 622 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i);
561 fs32_add(sb, &ucg->cg_frsum[i], 1); 623 fs32_add(sb, &ucg->cg_frsum[i], 1);
562 goto succed; 624 goto succed;
@@ -570,10 +632,10 @@ cg_found:
570 return 0; 632 return 0;
571 } 633 }
572 for (i = 0; i < count; i++) 634 for (i = 0; i < count; i++)
573 ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, result + i); 635 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
574 636
575 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); 637 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
576 fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count); 638 uspi->cs_total.cs_nffree -= count;
577 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 639 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
578 fs32_sub(sb, &ucg->cg_frsum[allocsize], 1); 640 fs32_sub(sb, &ucg->cg_frsum[allocsize], 1);
579 641
@@ -581,16 +643,16 @@ cg_found:
581 fs32_add(sb, &ucg->cg_frsum[allocsize - count], 1); 643 fs32_add(sb, &ucg->cg_frsum[allocsize - count], 1);
582 644
583succed: 645succed:
584 ubh_mark_buffer_dirty (USPI_UBH); 646 ubh_mark_buffer_dirty (USPI_UBH(uspi));
585 ubh_mark_buffer_dirty (UCPI_UBH); 647 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
586 if (sb->s_flags & MS_SYNCHRONOUS) { 648 if (sb->s_flags & MS_SYNCHRONOUS) {
587 ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi); 649 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
588 ubh_wait_on_buffer (UCPI_UBH); 650 ubh_wait_on_buffer (UCPI_UBH(ucpi));
589 } 651 }
590 sb->s_dirt = 1; 652 sb->s_dirt = 1;
591 653
592 result += cgno * uspi->s_fpg; 654 result += cgno * uspi->s_fpg;
593 UFSD(("EXIT3, result %u\n", result)) 655 UFSD("EXIT3, result %u\n", result);
594 return result; 656 return result;
595} 657}
596 658
@@ -603,12 +665,12 @@ static unsigned ufs_alloccg_block (struct inode * inode,
603 struct ufs_cylinder_group * ucg; 665 struct ufs_cylinder_group * ucg;
604 unsigned result, cylno, blkno; 666 unsigned result, cylno, blkno;
605 667
606 UFSD(("ENTER, goal %u\n", goal)) 668 UFSD("ENTER, goal %u\n", goal);
607 669
608 sb = inode->i_sb; 670 sb = inode->i_sb;
609 uspi = UFS_SB(sb)->s_uspi; 671 uspi = UFS_SB(sb)->s_uspi;
610 usb1 = ubh_get_usb_first(uspi); 672 usb1 = ubh_get_usb_first(uspi);
611 ucg = ubh_get_ucg(UCPI_UBH); 673 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
612 674
613 if (goal == 0) { 675 if (goal == 0) {
614 goal = ucpi->c_rotor; 676 goal = ucpi->c_rotor;
@@ -620,7 +682,7 @@ static unsigned ufs_alloccg_block (struct inode * inode,
620 /* 682 /*
621 * If the requested block is available, use it. 683 * If the requested block is available, use it.
622 */ 684 */
623 if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, ufs_fragstoblks(goal))) { 685 if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) {
624 result = goal; 686 result = goal;
625 goto gotit; 687 goto gotit;
626 } 688 }
@@ -632,7 +694,7 @@ norot:
632 ucpi->c_rotor = result; 694 ucpi->c_rotor = result;
633gotit: 695gotit:
634 blkno = ufs_fragstoblks(result); 696 blkno = ufs_fragstoblks(result);
635 ubh_clrblock (UCPI_UBH, ucpi->c_freeoff, blkno); 697 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
636 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 698 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
637 ufs_clusteracct (sb, ucpi, blkno, -1); 699 ufs_clusteracct (sb, ucpi, blkno, -1);
638 if(DQUOT_ALLOC_BLOCK(inode, uspi->s_fpb)) { 700 if(DQUOT_ALLOC_BLOCK(inode, uspi->s_fpb)) {
@@ -641,31 +703,76 @@ gotit:
641 } 703 }
642 704
643 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); 705 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
644 fs32_sub(sb, &usb1->fs_cstotal.cs_nbfree, 1); 706 uspi->cs_total.cs_nbfree--;
645 fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1); 707 fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1);
646 cylno = ufs_cbtocylno(result); 708 cylno = ufs_cbtocylno(result);
647 fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(result)), 1); 709 fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(result)), 1);
648 fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1); 710 fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1);
649 711
650 UFSD(("EXIT, result %u\n", result)) 712 UFSD("EXIT, result %u\n", result);
651 713
652 return result; 714 return result;
653} 715}
654 716
655static unsigned ufs_bitmap_search (struct super_block * sb, 717static unsigned ubh_scanc(struct ufs_sb_private_info *uspi,
656 struct ufs_cg_private_info * ucpi, unsigned goal, unsigned count) 718 struct ufs_buffer_head *ubh,
719 unsigned begin, unsigned size,
720 unsigned char *table, unsigned char mask)
657{ 721{
658 struct ufs_sb_private_info * uspi; 722 unsigned rest, offset;
659 struct ufs_super_block_first * usb1; 723 unsigned char *cp;
660 struct ufs_cylinder_group * ucg; 724
661 unsigned start, length, location, result; 725
662 unsigned possition, fragsize, blockmap, mask; 726 offset = begin & ~uspi->s_fmask;
663 727 begin >>= uspi->s_fshift;
664 UFSD(("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count)) 728 for (;;) {
729 if ((offset + size) < uspi->s_fsize)
730 rest = size;
731 else
732 rest = uspi->s_fsize - offset;
733 size -= rest;
734 cp = ubh->bh[begin]->b_data + offset;
735 while ((table[*cp++] & mask) == 0 && --rest)
736 ;
737 if (rest || !size)
738 break;
739 begin++;
740 offset = 0;
741 }
742 return (size + rest);
743}
744
745/*
746 * Find a block of the specified size in the specified cylinder group.
747 * @sb: pointer to the super block
748 * @ucpi: pointer to cylinder group info
749 * @goal: near which block we want to find a new one
750 * @count: specified size
751 */
752static unsigned ufs_bitmap_search(struct super_block *sb,
753 struct ufs_cg_private_info *ucpi,
754 unsigned goal, unsigned count)
755{
756 /*
757 * Bit patterns for identifying fragments in the block map
758 * used as ((map & mask_arr) == want_arr)
759 */
760 static const int mask_arr[9] = {
761 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff
762 };
763 static const int want_arr[9] = {
764 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
765 };
766 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
767 struct ufs_super_block_first *usb1;
768 struct ufs_cylinder_group *ucg;
769 unsigned start, length, loc, result;
770 unsigned pos, want, blockmap, mask, end;
771
772 UFSD("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count);
665 773
666 uspi = UFS_SB(sb)->s_uspi;
667 usb1 = ubh_get_usb_first (uspi); 774 usb1 = ubh_get_usb_first (uspi);
668 ucg = ubh_get_ucg(UCPI_UBH); 775 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
669 776
670 if (goal) 777 if (goal)
671 start = ufs_dtogd(goal) >> 3; 778 start = ufs_dtogd(goal) >> 3;
@@ -673,53 +780,50 @@ static unsigned ufs_bitmap_search (struct super_block * sb,
673 start = ucpi->c_frotor >> 3; 780 start = ucpi->c_frotor >> 3;
674 781
675 length = ((uspi->s_fpg + 7) >> 3) - start; 782 length = ((uspi->s_fpg + 7) >> 3) - start;
676 location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff + start, length, 783 loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff + start, length,
677 (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other, 784 (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other,
678 1 << (count - 1 + (uspi->s_fpb & 7))); 785 1 << (count - 1 + (uspi->s_fpb & 7)));
679 if (location == 0) { 786 if (loc == 0) {
680 length = start + 1; 787 length = start + 1;
681 location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff, length, 788 loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff, length,
682 (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other, 789 (uspi->s_fpb == 8) ? ufs_fragtable_8fpb :
683 1 << (count - 1 + (uspi->s_fpb & 7))); 790 ufs_fragtable_other,
684 if (location == 0) { 791 1 << (count - 1 + (uspi->s_fpb & 7)));
685 ufs_error (sb, "ufs_bitmap_search", 792 if (loc == 0) {
686 "bitmap corrupted on cg %u, start %u, length %u, count %u, freeoff %u\n", 793 ufs_error(sb, "ufs_bitmap_search",
687 ucpi->c_cgx, start, length, count, ucpi->c_freeoff); 794 "bitmap corrupted on cg %u, start %u,"
795 " length %u, count %u, freeoff %u\n",
796 ucpi->c_cgx, start, length, count,
797 ucpi->c_freeoff);
688 return (unsigned)-1; 798 return (unsigned)-1;
689 } 799 }
690 start = 0; 800 start = 0;
691 } 801 }
692 result = (start + length - location) << 3; 802 result = (start + length - loc) << 3;
693 ucpi->c_frotor = result; 803 ucpi->c_frotor = result;
694 804
695 /* 805 /*
696 * found the byte in the map 806 * found the byte in the map
697 */ 807 */
698 blockmap = ubh_blkmap(UCPI_UBH, ucpi->c_freeoff, result); 808
699 fragsize = 0; 809 for (end = result + 8; result < end; result += uspi->s_fpb) {
700 for (possition = 0, mask = 1; possition < 8; possition++, mask <<= 1) { 810 blockmap = ubh_blkmap(UCPI_UBH(ucpi), ucpi->c_freeoff, result);
701 if (blockmap & mask) { 811 blockmap <<= 1;
702 if (!(possition & uspi->s_fpbmask)) 812 mask = mask_arr[count];
703 fragsize = 1; 813 want = want_arr[count];
704 else 814 for (pos = 0; pos <= uspi->s_fpb - count; pos++) {
705 fragsize++; 815 if ((blockmap & mask) == want) {
706 } 816 UFSD("EXIT, result %u\n", result);
707 else { 817 return result + pos;
708 if (fragsize == count) { 818 }
709 result += possition - count; 819 mask <<= 1;
710 UFSD(("EXIT, result %u\n", result)) 820 want <<= 1;
711 return result; 821 }
712 } 822 }
713 fragsize = 0; 823
714 } 824 ufs_error(sb, "ufs_bitmap_search", "block not in map on cg %u\n",
715 } 825 ucpi->c_cgx);
716 if (fragsize == count) { 826 UFSD("EXIT (FAILED)\n");
717 result += possition - count;
718 UFSD(("EXIT, result %u\n", result))
719 return result;
720 }
721 ufs_error (sb, "ufs_bitmap_search", "block not in map on cg %u\n", ucpi->c_cgx);
722 UFSD(("EXIT (FAILED)\n"))
723 return (unsigned)-1; 827 return (unsigned)-1;
724} 828}
725 829
@@ -734,9 +838,9 @@ static void ufs_clusteracct(struct super_block * sb,
734 return; 838 return;
735 839
736 if (cnt > 0) 840 if (cnt > 0)
737 ubh_setbit(UCPI_UBH, ucpi->c_clusteroff, blkno); 841 ubh_setbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
738 else 842 else
739 ubh_clrbit(UCPI_UBH, ucpi->c_clusteroff, blkno); 843 ubh_clrbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
740 844
741 /* 845 /*
742 * Find the size of the cluster going forward. 846 * Find the size of the cluster going forward.
@@ -745,7 +849,7 @@ static void ufs_clusteracct(struct super_block * sb,
745 end = start + uspi->s_contigsumsize; 849 end = start + uspi->s_contigsumsize;
746 if ( end >= ucpi->c_nclusterblks) 850 if ( end >= ucpi->c_nclusterblks)
747 end = ucpi->c_nclusterblks; 851 end = ucpi->c_nclusterblks;
748 i = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_clusteroff, end, start); 852 i = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, end, start);
749 if (i > end) 853 if (i > end)
750 i = end; 854 i = end;
751 forw = i - start; 855 forw = i - start;
@@ -757,7 +861,7 @@ static void ufs_clusteracct(struct super_block * sb,
757 end = start - uspi->s_contigsumsize; 861 end = start - uspi->s_contigsumsize;
758 if (end < 0 ) 862 if (end < 0 )
759 end = -1; 863 end = -1;
760 i = ubh_find_last_zero_bit (UCPI_UBH, ucpi->c_clusteroff, start, end); 864 i = ubh_find_last_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, start, end);
761 if ( i < end) 865 if ( i < end)
762 i = end; 866 i = end;
763 back = start - i; 867 back = start - i;
@@ -769,11 +873,11 @@ static void ufs_clusteracct(struct super_block * sb,
769 i = back + forw + 1; 873 i = back + forw + 1;
770 if (i > uspi->s_contigsumsize) 874 if (i > uspi->s_contigsumsize)
771 i = uspi->s_contigsumsize; 875 i = uspi->s_contigsumsize;
772 fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (i << 2)), cnt); 876 fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (i << 2)), cnt);
773 if (back > 0) 877 if (back > 0)
774 fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (back << 2)), cnt); 878 fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (back << 2)), cnt);
775 if (forw > 0) 879 if (forw > 0)
776 fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (forw << 2)), cnt); 880 fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (forw << 2)), cnt);
777} 881}
778 882
779 883
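
The rewritten ufs_bitmap_search() above replaces the old byte-at-a-time run counter with a sliding pattern match: mask_arr[count]/want_arr[count] describe a run of exactly count free (set) fragments bounded by used (clear) ones, and shifting the map byte left by one supplies a zero sentinel so a run starting at fragment 0 still sees a clear lower neighbour. A self-contained userspace model of that inner match, with fragments-per-block fixed at 8 for illustration:

#include <stdio.h>

/* Bit patterns for an exact run of `count` free (set) fragments
 * bounded by used (clear) ones, as in the patched ufs_bitmap_search(). */
static const int mask_arr[9] = {
	0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff
};
static const int want_arr[9] = {
	0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
};

/* Return the bit offset of a run of exactly `count` free fragments
 * in the 8-bit `blockmap`, or -1 if there is none (count must be 1..8). */
static int find_fragment_run(unsigned blockmap, unsigned count)
{
	unsigned mask = mask_arr[count];
	unsigned want = want_arr[count];
	unsigned pos;

	blockmap <<= 1;	/* zero sentinel below bit 0 */
	for (pos = 0; pos <= 8 - count; pos++) {
		if ((blockmap & mask) == want)
			return pos;
		mask <<= 1;
		want <<= 1;
	}
	return -1;
}

int main(void)
{
	/* 0x3a = 0b00111010: free fragments at bits 1, 3, 4, 5 */
	printf("%d\n", find_fragment_run(0x3a, 3));	/* 3  */
	printf("%d\n", find_fragment_run(0x3a, 1));	/* 1  */
	printf("%d\n", find_fragment_run(0x3a, 2));	/* -1 */
	return 0;
}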
diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c
index 14abb8b835f7..09c39e5e6386 100644
--- a/fs/ufs/cylinder.c
+++ b/fs/ufs/cylinder.c
@@ -20,15 +20,6 @@
20#include "swab.h" 20#include "swab.h"
21#include "util.h" 21#include "util.h"
22 22
23#undef UFS_CYLINDER_DEBUG
24
25#ifdef UFS_CYLINDER_DEBUG
26#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x;
27#else
28#define UFSD(x)
29#endif
30
31
32/* 23/*
33 * Read cylinder group into cache. The memory space for ufs_cg_private_info 24 * Read cylinder group into cache. The memory space for ufs_cg_private_info
34 * structure is already allocated during ufs_read_super. 25 * structure is already allocated during ufs_read_super.
@@ -42,19 +33,19 @@ static void ufs_read_cylinder (struct super_block * sb,
42 struct ufs_cylinder_group * ucg; 33 struct ufs_cylinder_group * ucg;
43 unsigned i, j; 34 unsigned i, j;
44 35
45 UFSD(("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr)) 36 UFSD("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr);
46 uspi = sbi->s_uspi; 37 uspi = sbi->s_uspi;
47 ucpi = sbi->s_ucpi[bitmap_nr]; 38 ucpi = sbi->s_ucpi[bitmap_nr];
48 ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data; 39 ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data;
49 40
50 UCPI_UBH->fragment = ufs_cgcmin(cgno); 41 UCPI_UBH(ucpi)->fragment = ufs_cgcmin(cgno);
51 UCPI_UBH->count = uspi->s_cgsize >> sb->s_blocksize_bits; 42 UCPI_UBH(ucpi)->count = uspi->s_cgsize >> sb->s_blocksize_bits;
52 /* 43 /*
53 * We already have the first fragment of the cylinder group block in the buffer 44 * We already have the first fragment of the cylinder group block in the buffer
54 */ 45 */
55 UCPI_UBH->bh[0] = sbi->s_ucg[cgno]; 46 UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno];
56 for (i = 1; i < UCPI_UBH->count; i++) 47 for (i = 1; i < UCPI_UBH(ucpi)->count; i++)
57 if (!(UCPI_UBH->bh[i] = sb_bread(sb, UCPI_UBH->fragment + i))) 48 if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i)))
58 goto failed; 49 goto failed;
59 sbi->s_cgno[bitmap_nr] = cgno; 50 sbi->s_cgno[bitmap_nr] = cgno;
60 51
@@ -73,7 +64,7 @@ static void ufs_read_cylinder (struct super_block * sb,
73 ucpi->c_clustersumoff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clustersumoff); 64 ucpi->c_clustersumoff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clustersumoff);
74 ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff); 65 ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff);
75 ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks); 66 ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks);
76 UFSD(("EXIT\n")) 67 UFSD("EXIT\n");
77 return; 68 return;
78 69
79failed: 70failed:
@@ -95,15 +86,15 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
95 struct ufs_cylinder_group * ucg; 86 struct ufs_cylinder_group * ucg;
96 unsigned i; 87 unsigned i;
97 88
98 UFSD(("ENTER, bitmap_nr %u\n", bitmap_nr)) 89 UFSD("ENTER, bitmap_nr %u\n", bitmap_nr);
99 90
100 uspi = sbi->s_uspi; 91 uspi = sbi->s_uspi;
101 if (sbi->s_cgno[bitmap_nr] == UFS_CGNO_EMPTY) { 92 if (sbi->s_cgno[bitmap_nr] == UFS_CGNO_EMPTY) {
102 UFSD(("EXIT\n")) 93 UFSD("EXIT\n");
103 return; 94 return;
104 } 95 }
105 ucpi = sbi->s_ucpi[bitmap_nr]; 96 ucpi = sbi->s_ucpi[bitmap_nr];
106 ucg = ubh_get_ucg(UCPI_UBH); 97 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
107 98
108 if (uspi->s_ncg > UFS_MAX_GROUP_LOADED && bitmap_nr >= sbi->s_cg_loaded) { 99 if (uspi->s_ncg > UFS_MAX_GROUP_LOADED && bitmap_nr >= sbi->s_cg_loaded) {
109 ufs_panic (sb, "ufs_put_cylinder", "internal error"); 100 ufs_panic (sb, "ufs_put_cylinder", "internal error");
@@ -116,13 +107,13 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
116 ucg->cg_rotor = cpu_to_fs32(sb, ucpi->c_rotor); 107 ucg->cg_rotor = cpu_to_fs32(sb, ucpi->c_rotor);
117 ucg->cg_frotor = cpu_to_fs32(sb, ucpi->c_frotor); 108 ucg->cg_frotor = cpu_to_fs32(sb, ucpi->c_frotor);
118 ucg->cg_irotor = cpu_to_fs32(sb, ucpi->c_irotor); 109 ucg->cg_irotor = cpu_to_fs32(sb, ucpi->c_irotor);
119 ubh_mark_buffer_dirty (UCPI_UBH); 110 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
120 for (i = 1; i < UCPI_UBH->count; i++) { 111 for (i = 1; i < UCPI_UBH(ucpi)->count; i++) {
121 brelse (UCPI_UBH->bh[i]); 112 brelse (UCPI_UBH(ucpi)->bh[i]);
122 } 113 }
123 114
124 sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY; 115 sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY;
125 UFSD(("EXIT\n")) 116 UFSD("EXIT\n");
126} 117}
127 118
128/* 119/*
@@ -139,7 +130,7 @@ struct ufs_cg_private_info * ufs_load_cylinder (
139 struct ufs_cg_private_info * ucpi; 130 struct ufs_cg_private_info * ucpi;
140 unsigned cg, i, j; 131 unsigned cg, i, j;
141 132
142 UFSD(("ENTER, cgno %u\n", cgno)) 133 UFSD("ENTER, cgno %u\n", cgno);
143 134
144 uspi = sbi->s_uspi; 135 uspi = sbi->s_uspi;
145 if (cgno >= uspi->s_ncg) { 136 if (cgno >= uspi->s_ncg) {
@@ -150,7 +141,7 @@ struct ufs_cg_private_info * ufs_load_cylinder (
150 * Cylinder group number cg is in cache and it was last used 141 * Cylinder group number cg is in cache and it was last used
151 */ 142 */
152 if (sbi->s_cgno[0] == cgno) { 143 if (sbi->s_cgno[0] == cgno) {
153 UFSD(("EXIT\n")) 144 UFSD("EXIT\n");
154 return sbi->s_ucpi[0]; 145 return sbi->s_ucpi[0];
155 } 146 }
156 /* 147 /*
@@ -160,16 +151,16 @@ struct ufs_cg_private_info * ufs_load_cylinder (
160 if (sbi->s_cgno[cgno] != UFS_CGNO_EMPTY) { 151 if (sbi->s_cgno[cgno] != UFS_CGNO_EMPTY) {
161 if (sbi->s_cgno[cgno] != cgno) { 152 if (sbi->s_cgno[cgno] != cgno) {
162 ufs_panic (sb, "ufs_load_cylinder", "internal error, wrong number of cg in cache"); 153 ufs_panic (sb, "ufs_load_cylinder", "internal error, wrong number of cg in cache");
163 UFSD(("EXIT (FAILED)\n")) 154 UFSD("EXIT (FAILED)\n");
164 return NULL; 155 return NULL;
165 } 156 }
166 else { 157 else {
167 UFSD(("EXIT\n")) 158 UFSD("EXIT\n");
168 return sbi->s_ucpi[cgno]; 159 return sbi->s_ucpi[cgno];
169 } 160 }
170 } else { 161 } else {
171 ufs_read_cylinder (sb, cgno, cgno); 162 ufs_read_cylinder (sb, cgno, cgno);
172 UFSD(("EXIT\n")) 163 UFSD("EXIT\n");
173 return sbi->s_ucpi[cgno]; 164 return sbi->s_ucpi[cgno];
174 } 165 }
175 } 166 }
@@ -204,6 +195,6 @@ struct ufs_cg_private_info * ufs_load_cylinder (
204 sbi->s_ucpi[0] = ucpi; 195 sbi->s_ucpi[0] = ucpi;
205 ufs_read_cylinder (sb, cgno, 0); 196 ufs_read_cylinder (sb, cgno, 0);
206 } 197 }
207 UFSD(("EXIT\n")) 198 UFSD("EXIT\n");
208 return sbi->s_ucpi[0]; 199 return sbi->s_ucpi[0];
209} 200}
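
The per-file UFSD debug macros deleted from balloc.c, cylinder.c, and dir.c in this series give way to a single variadic definition in a shared UFS header, which is why every call site loses its double parentheses. The consolidated macro presumably looks like the following; the header location and the UFS_DEBUG guard name are assumptions, as neither appears in this diff:

/* Assumed consolidated definition; the double-parenthesis workaround
 * is no longer needed once the macro itself is variadic. */
#ifdef UFS_DEBUG
#define UFSD(f, a...)	do {						\
	printk(KERN_DEBUG "UFSD (%s, %d): %s:",				\
	       __FILE__, __LINE__, __FUNCTION__);			\
	printk(f, ## a);						\
} while (0)
#else
#define UFSD(f, a...)	/* */
#endif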
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 1a561202d3f4..7f0a0aa63584 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -11,31 +11,20 @@
11 * 4.4BSD (FreeBSD) support added on February 1st 1998 by 11 * 4.4BSD (FreeBSD) support added on February 1st 1998 by
12 * Niels Kristian Bech Jensen <nkbj@image.dk> partially based 12 * Niels Kristian Bech Jensen <nkbj@image.dk> partially based
13 * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>. 13 * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>.
14 *
15 * Migration to the page cache in May 2006 by
16 * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
14 */ 17 */
15 18
16#include <linux/time.h> 19#include <linux/time.h>
17#include <linux/fs.h> 20#include <linux/fs.h>
18#include <linux/ufs_fs.h> 21#include <linux/ufs_fs.h>
19#include <linux/smp_lock.h> 22#include <linux/smp_lock.h>
20#include <linux/buffer_head.h>
21#include <linux/sched.h> 23#include <linux/sched.h>
22 24
23#include "swab.h" 25#include "swab.h"
24#include "util.h" 26#include "util.h"
25 27
26#undef UFS_DIR_DEBUG
27
28#ifdef UFS_DIR_DEBUG
29#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
30#else
31#define UFSD(x)
32#endif
33
34static int
35ufs_check_dir_entry (const char *, struct inode *, struct ufs_dir_entry *,
36 struct buffer_head *, unsigned long);
37
38
39/* 28/*
40 * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure. 29 * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure.
41 * 30 *
@@ -51,495 +40,541 @@ static inline int ufs_match(struct super_block *sb, int len,
51 return !memcmp(name, de->d_name, len); 40 return !memcmp(name, de->d_name, len);
52} 41}
53 42
54/* 43static int ufs_commit_chunk(struct page *page, unsigned from, unsigned to)
55 * This is blatantly stolen from ext2fs
56 */
57static int
58ufs_readdir (struct file * filp, void * dirent, filldir_t filldir)
59{ 44{
60 struct inode *inode = filp->f_dentry->d_inode; 45 struct inode *dir = page->mapping->host;
61 int error = 0; 46 int err = 0;
62 unsigned long offset, lblk; 47 dir->i_version++;
63 int i, stored; 48 page->mapping->a_ops->commit_write(NULL, page, from, to);
64 struct buffer_head * bh; 49 if (IS_DIRSYNC(dir))
65 struct ufs_dir_entry * de; 50 err = write_one_page(page, 1);
66 struct super_block * sb; 51 else
67 int de_reclen; 52 unlock_page(page);
68 unsigned flags; 53 return err;
69 u64 blk= 0L; 54}
70
71 lock_kernel();
72
73 sb = inode->i_sb;
74 flags = UFS_SB(sb)->s_flags;
75
76 UFSD(("ENTER, ino %lu f_pos %lu\n", inode->i_ino, (unsigned long) filp->f_pos))
77
78 stored = 0;
79 bh = NULL;
80 offset = filp->f_pos & (sb->s_blocksize - 1);
81
82 while (!error && !stored && filp->f_pos < inode->i_size) {
83 lblk = (filp->f_pos) >> sb->s_blocksize_bits;
84 blk = ufs_frag_map(inode, lblk);
85 if (!blk || !(bh = sb_bread(sb, blk))) {
86 /* XXX - error - skip to the next block */
87 printk("ufs_readdir: "
88 "dir inode %lu has a hole at offset %lu\n",
89 inode->i_ino, (unsigned long int)filp->f_pos);
90 filp->f_pos += sb->s_blocksize - offset;
91 continue;
92 }
93
94revalidate:
95 /* If the dir block has changed since the last call to
96 * readdir(2), then we might be pointing to an invalid
97 * dirent right now. Scan from the start of the block
98 * to make sure. */
99 if (filp->f_version != inode->i_version) {
100 for (i = 0; i < sb->s_blocksize && i < offset; ) {
101 de = (struct ufs_dir_entry *)(bh->b_data + i);
102 /* It's too expensive to do a full
103 * dirent test each time round this
104 * loop, but we do have to test at
105 * least that it is non-zero. A
106 * failure will be detected in the
107 * dirent test below. */
108 de_reclen = fs16_to_cpu(sb, de->d_reclen);
109 if (de_reclen < 1)
110 break;
111 i += de_reclen;
112 }
113 offset = i;
114 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
115 | offset;
116 filp->f_version = inode->i_version;
117 }
118 55
119 while (!error && filp->f_pos < inode->i_size 56static inline void ufs_put_page(struct page *page)
120 && offset < sb->s_blocksize) { 57{
121 de = (struct ufs_dir_entry *) (bh->b_data + offset); 58 kunmap(page);
122 /* XXX - put in a real ufs_check_dir_entry() */ 59 page_cache_release(page);
123 if ((de->d_reclen == 0) || (ufs_get_de_namlen(sb, de) == 0)) { 60}
124 filp->f_pos = (filp->f_pos &
125 (sb->s_blocksize - 1)) +
126 sb->s_blocksize;
127 brelse(bh);
128 unlock_kernel();
129 return stored;
130 }
131 if (!ufs_check_dir_entry ("ufs_readdir", inode, de,
132 bh, offset)) {
133 /* On error, skip the f_pos to the
134 next block. */
135 filp->f_pos = (filp->f_pos |
136 (sb->s_blocksize - 1)) +
137 1;
138 brelse (bh);
139 unlock_kernel();
140 return stored;
141 }
142 offset += fs16_to_cpu(sb, de->d_reclen);
143 if (de->d_ino) {
144 /* We might block in the next section
145 * if the data destination is
146 * currently swapped out. So, use a
147 * version stamp to detect whether or
148 * not the directory has been modified
149 * during the copy operation. */
150 unsigned long version = filp->f_version;
151 unsigned char d_type = DT_UNKNOWN;
152 61
153 UFSD(("filldir(%s,%u)\n", de->d_name, 62static inline unsigned long ufs_dir_pages(struct inode *inode)
154 fs32_to_cpu(sb, de->d_ino))) 63{
155 UFSD(("namlen %u\n", ufs_get_de_namlen(sb, de))) 64 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
65}
156 66
157 if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) 67ino_t ufs_inode_by_name(struct inode *dir, struct dentry *dentry)
158 d_type = de->d_u.d_44.d_type; 68{
159 error = filldir(dirent, de->d_name, 69 ino_t res = 0;
160 ufs_get_de_namlen(sb, de), filp->f_pos, 70 struct ufs_dir_entry *de;
161 fs32_to_cpu(sb, de->d_ino), d_type); 71 struct page *page;
162 if (error) 72
163 break; 73 de = ufs_find_entry(dir, dentry, &page);
164 if (version != filp->f_version) 74 if (de) {
165 goto revalidate; 75 res = fs32_to_cpu(dir->i_sb, de->d_ino);
166 stored ++; 76 ufs_put_page(page);
167 }
168 filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
169 }
170 offset = 0;
171 brelse (bh);
172 } 77 }
173 unlock_kernel(); 78 return res;
174 return 0;
175} 79}
176 80
177/*
178 * define how far ahead to read directories while searching them.
179 */
180#define NAMEI_RA_CHUNKS 2
181#define NAMEI_RA_BLOCKS 4
182#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
183#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
184 81
185/* 82/* Releases the page */
186 * ufs_find_entry() 83void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
187 * 84 struct page *page, struct inode *inode)
188 * finds an entry in the specified directory with the wanted name. It
189 * returns the cache buffer in which the entry was found, and the entry
190 * itself (as a parameter - res_bh). It does NOT read the inode of the
191 * entry - you'll have to do that yourself if you want to.
192 */
193struct ufs_dir_entry * ufs_find_entry (struct dentry *dentry,
194 struct buffer_head ** res_bh)
195{ 85{
196 struct super_block * sb; 86 unsigned from = (char *) de - (char *) page_address(page);
197 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 87 unsigned to = from + fs16_to_cpu(dir->i_sb, de->d_reclen);
198 struct buffer_head * bh_read[NAMEI_RA_SIZE]; 88 int err;
199 unsigned long offset;
200 int block, toread, i, err;
201 struct inode *dir = dentry->d_parent->d_inode;
202 const char *name = dentry->d_name.name;
203 int namelen = dentry->d_name.len;
204 89
205 UFSD(("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen)) 90 lock_page(page);
206 91 err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
207 *res_bh = NULL; 92 BUG_ON(err);
208 93 de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
209 sb = dir->i_sb; 94 ufs_set_de_type(dir->i_sb, de, inode->i_mode);
210 95 err = ufs_commit_chunk(page, from, to);
211 if (namelen > UFS_MAXNAMLEN) 96 ufs_put_page(page);
212 return NULL; 97 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
98 mark_inode_dirty(dir);
99}
213 100
214 memset (bh_use, 0, sizeof (bh_use));
215 toread = 0;
216 for (block = 0; block < NAMEI_RA_SIZE; ++block) {
217 struct buffer_head * bh;
218 101
219 if ((block << sb->s_blocksize_bits) >= dir->i_size) 102static void ufs_check_page(struct page *page)
220 break; 103{
221 bh = ufs_getfrag (dir, block, 0, &err); 104 struct inode *dir = page->mapping->host;
222 bh_use[block] = bh; 105 struct super_block *sb = dir->i_sb;
223 if (bh && !buffer_uptodate(bh)) 106 char *kaddr = page_address(page);
224 bh_read[toread++] = bh; 107 unsigned offs, rec_len;
108 unsigned limit = PAGE_CACHE_SIZE;
109 struct ufs_dir_entry *p;
110 char *error;
111
112 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
113 limit = dir->i_size & ~PAGE_CACHE_MASK;
114 if (limit & (UFS_SECTOR_SIZE - 1))
115 goto Ebadsize;
116 if (!limit)
117 goto out;
225 } 118 }
119 for (offs = 0; offs <= limit - UFS_DIR_REC_LEN(1); offs += rec_len) {
120 p = (struct ufs_dir_entry *)(kaddr + offs);
121 rec_len = fs16_to_cpu(sb, p->d_reclen);
122
123 if (rec_len < UFS_DIR_REC_LEN(1))
124 goto Eshort;
125 if (rec_len & 3)
126 goto Ealign;
127 if (rec_len < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, p)))
128 goto Enamelen;
129 if (((offs + rec_len - 1) ^ offs) & ~(UFS_SECTOR_SIZE-1))
130 goto Espan;
131 if (fs32_to_cpu(sb, p->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg *
132 UFS_SB(sb)->s_uspi->s_ncg))
133 goto Einumber;
134 }
135 if (offs != limit)
136 goto Eend;
137out:
138 SetPageChecked(page);
139 return;
140
141 /* Too bad, we had an error */
142
143Ebadsize:
144 ufs_error(sb, "ufs_check_page",
145 "size of directory #%lu is not a multiple of chunk size",
146 dir->i_ino
147 );
148 goto fail;
149Eshort:
150 error = "rec_len is smaller than minimal";
151 goto bad_entry;
152Ealign:
153 error = "unaligned directory entry";
154 goto bad_entry;
155Enamelen:
156 error = "rec_len is too small for name_len";
157 goto bad_entry;
158Espan:
159 error = "directory entry across blocks";
160 goto bad_entry;
161Einumber:
162 error = "inode out of bounds";
163bad_entry:
164 ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - "
165 "offset=%lu, rec_len=%d, name_len=%d",
166 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
167 rec_len, ufs_get_de_namlen(sb, p));
168 goto fail;
169Eend:
170 p = (struct ufs_dir_entry *)(kaddr + offs);
171 ufs_error (sb, "ufs_check_page",
172 "entry in directory #%lu spans the page boundary, "
173 "offset=%lu",
174 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs);
175fail:
176 SetPageChecked(page);
177 SetPageError(page);
178}
226 179
227 for (block = 0, offset = 0; offset < dir->i_size; block++) { 180static struct page *ufs_get_page(struct inode *dir, unsigned long n)
228 struct buffer_head * bh; 181{
229 struct ufs_dir_entry * de; 182 struct address_space *mapping = dir->i_mapping;
230 char * dlimit; 183 struct page *page = read_cache_page(mapping, n,
231 184 (filler_t*)mapping->a_ops->readpage, NULL);
232 if ((block % NAMEI_RA_BLOCKS) == 0 && toread) { 185 if (!IS_ERR(page)) {
233 ll_rw_block (READ, toread, bh_read); 186 wait_on_page_locked(page);
234 toread = 0; 187 kmap(page);
235 } 188 if (!PageUptodate(page))
236 bh = bh_use[block % NAMEI_RA_SIZE]; 189 goto fail;
237 if (!bh) { 190 if (!PageChecked(page))
238 ufs_error (sb, "ufs_find_entry", 191 ufs_check_page(page);
239 "directory #%lu contains a hole at offset %lu", 192 if (PageError(page))
240 dir->i_ino, offset); 193 goto fail;
241 offset += sb->s_blocksize;
242 continue;
243 }
244 wait_on_buffer (bh);
245 if (!buffer_uptodate(bh)) {
246 /*
247 * read error: all bets are off
248 */
249 break;
250 }
251
252 de = (struct ufs_dir_entry *) bh->b_data;
253 dlimit = bh->b_data + sb->s_blocksize;
254 while ((char *) de < dlimit && offset < dir->i_size) {
255 /* this code is executed quadratically often */
256 /* do minimal checking by hand */
257 int de_len;
258
259 if ((char *) de + namelen <= dlimit &&
260 ufs_match(sb, namelen, name, de)) {
261 /* found a match -
262 just to be sure, do a full check */
263 if (!ufs_check_dir_entry("ufs_find_entry",
264 dir, de, bh, offset))
265 goto failed;
266 for (i = 0; i < NAMEI_RA_SIZE; ++i) {
267 if (bh_use[i] != bh)
268 brelse (bh_use[i]);
269 }
270 *res_bh = bh;
271 return de;
272 }
273 /* prevent looping on a bad block */
274 de_len = fs16_to_cpu(sb, de->d_reclen);
275 if (de_len <= 0)
276 goto failed;
277 offset += de_len;
278 de = (struct ufs_dir_entry *) ((char *) de + de_len);
279 }
280
281 brelse (bh);
282 if (((block + NAMEI_RA_SIZE) << sb->s_blocksize_bits ) >=
283 dir->i_size)
284 bh = NULL;
285 else
286 bh = ufs_getfrag (dir, block + NAMEI_RA_SIZE, 0, &err);
287 bh_use[block % NAMEI_RA_SIZE] = bh;
288 if (bh && !buffer_uptodate(bh))
289 bh_read[toread++] = bh;
290 } 194 }
195 return page;
291 196
292failed: 197fail:
293 for (i = 0; i < NAMEI_RA_SIZE; ++i) brelse (bh_use[i]); 198 ufs_put_page(page);
294 UFSD(("EXIT\n")) 199 return ERR_PTR(-EIO);
295 return NULL;
296} 200}
297 201
298static int 202/*
299ufs_check_dir_entry (const char *function, struct inode *dir, 203 * Return the offset into page `page_nr' of the last valid
300 struct ufs_dir_entry *de, struct buffer_head *bh, 204 * byte in that page, plus one.
301 unsigned long offset) 205 */
206static unsigned
207ufs_last_byte(struct inode *inode, unsigned long page_nr)
302{ 208{
303 struct super_block *sb = dir->i_sb; 209 unsigned last_byte = inode->i_size;
304 const char *error_msg = NULL; 210
305 int rlen = fs16_to_cpu(sb, de->d_reclen); 211 last_byte -= page_nr << PAGE_CACHE_SHIFT;
306 212 if (last_byte > PAGE_CACHE_SIZE)
307 if (rlen < UFS_DIR_REC_LEN(1)) 213 last_byte = PAGE_CACHE_SIZE;
308 error_msg = "reclen is smaller than minimal"; 214 return last_byte;
309 else if (rlen % 4 != 0)
310 error_msg = "reclen % 4 != 0";
311 else if (rlen < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)))
312 error_msg = "reclen is too small for namlen";
313 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
314 error_msg = "directory entry across blocks";
315 else if (fs32_to_cpu(sb, de->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg *
316 UFS_SB(sb)->s_uspi->s_ncg))
317 error_msg = "inode out of bounds";
318
319 if (error_msg != NULL)
320 ufs_error (sb, function, "bad entry in directory #%lu, size %Lu: %s - "
321 "offset=%lu, inode=%lu, reclen=%d, namlen=%d",
322 dir->i_ino, dir->i_size, error_msg, offset,
323 (unsigned long)fs32_to_cpu(sb, de->d_ino),
324 rlen, ufs_get_de_namlen(sb, de));
325
326 return (error_msg == NULL ? 1 : 0);
327} 215}
328 216
329struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct buffer_head **p) 217static inline struct ufs_dir_entry *
218ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p)
330{ 219{
331 int err; 220 return (struct ufs_dir_entry *)((char *)p +
332 struct buffer_head *bh = ufs_bread (dir, 0, 0, &err); 221 fs16_to_cpu(sb, p->d_reclen));
333 struct ufs_dir_entry *res = NULL;
334
335 if (bh) {
336 res = (struct ufs_dir_entry *) bh->b_data;
337 res = (struct ufs_dir_entry *)((char *)res +
338 fs16_to_cpu(dir->i_sb, res->d_reclen));
339 }
340 *p = bh;
341 return res;
342} 222}
343ino_t ufs_inode_by_name(struct inode * dir, struct dentry *dentry) 223
224struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
344{ 225{
345 ino_t res = 0; 226 struct page *page = ufs_get_page(dir, 0);
346 struct ufs_dir_entry * de; 227 struct ufs_dir_entry *de = NULL;
347 struct buffer_head *bh;
348 228
349 de = ufs_find_entry (dentry, &bh); 229 if (!IS_ERR(page)) {
350 if (de) { 230 de = ufs_next_entry(dir->i_sb,
351 res = fs32_to_cpu(dir->i_sb, de->d_ino); 231 (struct ufs_dir_entry *)page_address(page));
352 brelse(bh); 232 *p = page;
353 } 233 }
354 return res; 234 return de;
355} 235}
356 236
357void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, 237/*
358 struct buffer_head *bh, struct inode *inode) 238 * ufs_find_entry()
239 *
240 * finds an entry in the specified directory with the wanted name. It
241 * returns the page in which the entry was found, and the entry itself
242 * (as a parameter - res_dir). Page is returned mapped and unlocked.
243 * Entry is guaranteed to be valid.
244 */
245struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
246 struct page **res_page)
359{ 247{
360 dir->i_version++; 248 struct super_block *sb = dir->i_sb;
361 de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); 249 const char *name = dentry->d_name.name;
362 mark_buffer_dirty(bh); 250 int namelen = dentry->d_name.len;
363 if (IS_DIRSYNC(dir)) 251 unsigned reclen = UFS_DIR_REC_LEN(namelen);
364 sync_dirty_buffer(bh); 252 unsigned long start, n;
365 brelse (bh); 253 unsigned long npages = ufs_dir_pages(dir);
254 struct page *page = NULL;
255 struct ufs_inode_info *ui = UFS_I(dir);
256 struct ufs_dir_entry *de;
257
258 UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen);
259
260 if (npages == 0 || namelen > UFS_MAXNAMLEN)
261 goto out;
262
263 /* OFFSET_CACHE */
264 *res_page = NULL;
265
266 start = ui->i_dir_start_lookup;
267
268 if (start >= npages)
269 start = 0;
270 n = start;
271 do {
272 char *kaddr;
273 page = ufs_get_page(dir, n);
274 if (!IS_ERR(page)) {
275 kaddr = page_address(page);
276 de = (struct ufs_dir_entry *) kaddr;
277 kaddr += ufs_last_byte(dir, n) - reclen;
278 while ((char *) de <= kaddr) {
279 if (de->d_reclen == 0) {
280 ufs_error(dir->i_sb, __FUNCTION__,
281 "zero-length directory entry");
282 ufs_put_page(page);
283 goto out;
284 }
285 if (ufs_match(sb, namelen, name, de))
286 goto found;
287 de = ufs_next_entry(sb, de);
288 }
289 ufs_put_page(page);
290 }
291 if (++n >= npages)
292 n = 0;
293 } while (n != start);
294out:
295 return NULL;
296
297found:
298 *res_page = page;
299 ui->i_dir_start_lookup = n;
300 return de;
366} 301}
367 302
368/* 303/*
369 * ufs_add_entry() 304 * Parent is locked.
370 *
371 * adds a file entry to the specified directory, using the same
372 * semantics as ufs_find_entry(). It returns NULL if it failed.
373 */ 305 */
374int ufs_add_link(struct dentry *dentry, struct inode *inode) 306int ufs_add_link(struct dentry *dentry, struct inode *inode)
375{ 307{
376 struct super_block * sb;
377 struct ufs_sb_private_info * uspi;
378 unsigned long offset;
379 unsigned fragoff;
380 unsigned short rec_len;
381 struct buffer_head * bh;
382 struct ufs_dir_entry * de, * de1;
383 struct inode *dir = dentry->d_parent->d_inode; 308 struct inode *dir = dentry->d_parent->d_inode;
384 const char *name = dentry->d_name.name; 309 const char *name = dentry->d_name.name;
385 int namelen = dentry->d_name.len; 310 int namelen = dentry->d_name.len;
311 struct super_block *sb = dir->i_sb;
312 unsigned reclen = UFS_DIR_REC_LEN(namelen);
313 unsigned short rec_len, name_len;
314 struct page *page = NULL;
315 struct ufs_dir_entry *de;
316 unsigned long npages = ufs_dir_pages(dir);
317 unsigned long n;
318 char *kaddr;
319 unsigned from, to;
386 int err; 320 int err;
387 321
388 UFSD(("ENTER, name %s, namelen %u\n", name, namelen)) 322 UFSD("ENTER, name %s, namelen %u\n", name, namelen);
389 323
390 sb = dir->i_sb; 324 /*
391 uspi = UFS_SB(sb)->s_uspi; 325 * We take care of directory expansion in the same loop.
392 326 * This code plays outside i_size, so it locks the page
393 if (!namelen) 327 * to protect that region.
394 return -EINVAL; 328 */
395 bh = ufs_bread (dir, 0, 0, &err); 329 for (n = 0; n <= npages; n++) {
396 if (!bh) 330 char *dir_end;
397 return err; 331
398 rec_len = UFS_DIR_REC_LEN(namelen); 332 page = ufs_get_page(dir, n);
399 offset = 0; 333 err = PTR_ERR(page);
400 de = (struct ufs_dir_entry *) bh->b_data; 334 if (IS_ERR(page))
401 while (1) { 335 goto out;
402 if ((char *)de >= UFS_SECTOR_SIZE + bh->b_data) { 336 lock_page(page);
403 fragoff = offset & ~uspi->s_fmask; 337 kaddr = page_address(page);
404 if (fragoff != 0 && fragoff != UFS_SECTOR_SIZE) 338 dir_end = kaddr + ufs_last_byte(dir, n);
405 ufs_error (sb, "ufs_add_entry", "internal error" 339 de = (struct ufs_dir_entry *)kaddr;
406 " fragoff %u", fragoff); 340 kaddr += PAGE_CACHE_SIZE - reclen;
407 if (!fragoff) { 341 while ((char *)de <= kaddr) {
408 brelse (bh); 342 if ((char *)de == dir_end) {
409 bh = ufs_bread (dir, offset >> sb->s_blocksize_bits, 1, &err); 343 /* We hit i_size */
410 if (!bh) 344 name_len = 0;
411 return err; 345 rec_len = UFS_SECTOR_SIZE;
412 }
413 if (dir->i_size <= offset) {
414 if (dir->i_size == 0) {
415 brelse(bh);
416 return -ENOENT;
417 }
418 de = (struct ufs_dir_entry *) (bh->b_data + fragoff);
419 de->d_ino = 0;
420 de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE); 346 de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE);
421 ufs_set_de_namlen(sb, de, 0); 347 de->d_ino = 0;
422 dir->i_size = offset + UFS_SECTOR_SIZE; 348 goto got_it;
423 mark_inode_dirty(dir);
424 } else {
425 de = (struct ufs_dir_entry *) bh->b_data;
426 } 349 }
350 if (de->d_reclen == 0) {
351 ufs_error(dir->i_sb, __FUNCTION__,
352 "zero-length directory entry");
353 err = -EIO;
354 goto out_unlock;
355 }
356 err = -EEXIST;
357 if (ufs_match(sb, namelen, name, de))
358 goto out_unlock;
359 name_len = UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de));
360 rec_len = fs16_to_cpu(sb, de->d_reclen);
361 if (!de->d_ino && rec_len >= reclen)
362 goto got_it;
363 if (rec_len >= name_len + reclen)
364 goto got_it;
365 de = (struct ufs_dir_entry *) ((char *) de + rec_len);
427 } 366 }
428 if (!ufs_check_dir_entry ("ufs_add_entry", dir, de, bh, offset)) { 367 unlock_page(page);
429 brelse (bh); 368 ufs_put_page(page);
430 return -ENOENT;
431 }
432 if (ufs_match(sb, namelen, name, de)) {
433 brelse (bh);
434 return -EEXIST;
435 }
436 if (de->d_ino == 0 && fs16_to_cpu(sb, de->d_reclen) >= rec_len)
437 break;
438
439 if (fs16_to_cpu(sb, de->d_reclen) >=
440 UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)) + rec_len)
441 break;
442 offset += fs16_to_cpu(sb, de->d_reclen);
443 de = (struct ufs_dir_entry *) ((char *) de + fs16_to_cpu(sb, de->d_reclen));
444 } 369 }
445 370 BUG();
371 return -EINVAL;
372
373got_it:
374 from = (char*)de - (char*)page_address(page);
375 to = from + rec_len;
376 err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
377 if (err)
378 goto out_unlock;
446 if (de->d_ino) { 379 if (de->d_ino) {
447 de1 = (struct ufs_dir_entry *) ((char *) de + 380 struct ufs_dir_entry *de1 =
448 UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de))); 381 (struct ufs_dir_entry *) ((char *) de + name_len);
449 de1->d_reclen = 382 de1->d_reclen = cpu_to_fs16(sb, rec_len - name_len);
450 cpu_to_fs16(sb, fs16_to_cpu(sb, de->d_reclen) - 383 de->d_reclen = cpu_to_fs16(sb, name_len);
451 UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de))); 384
452 de->d_reclen =
453 cpu_to_fs16(sb, UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
454 de = de1; 385 de = de1;
455 } 386 }
456 de->d_ino = 0; 387
457 ufs_set_de_namlen(sb, de, namelen); 388 ufs_set_de_namlen(sb, de, namelen);
458 memcpy (de->d_name, name, namelen + 1); 389 memcpy(de->d_name, name, namelen + 1);
459 de->d_ino = cpu_to_fs32(sb, inode->i_ino); 390 de->d_ino = cpu_to_fs32(sb, inode->i_ino);
460 ufs_set_de_type(sb, de, inode->i_mode); 391 ufs_set_de_type(sb, de, inode->i_mode);
461 mark_buffer_dirty(bh); 392
462 if (IS_DIRSYNC(dir)) 393 err = ufs_commit_chunk(page, from, to);
463 sync_dirty_buffer(bh);
464 brelse (bh);
465 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; 394 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
466 dir->i_version++; 395
467 mark_inode_dirty(dir); 396 mark_inode_dirty(dir);
397 /* OFFSET_CACHE */
398out_put:
399 ufs_put_page(page);
400out:
401 return err;
402out_unlock:
403 unlock_page(page);
404 goto out_put;
405}
468 406
469 UFSD(("EXIT\n")) 407static inline unsigned
408ufs_validate_entry(struct super_block *sb, char *base,
409 unsigned offset, unsigned mask)
410{
411 struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset);
412 struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask));
413 while ((char*)p < (char*)de) {
414 if (p->d_reclen == 0)
415 break;
416 p = ufs_next_entry(sb, p);
417 }
418 return (char *)p - base;
419}
420
421
422/*
423 * This is blatantly stolen from ext2fs
424 */
425static int
426ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
427{
428 loff_t pos = filp->f_pos;
429 struct inode *inode = filp->f_dentry->d_inode;
430 struct super_block *sb = inode->i_sb;
431 unsigned int offset = pos & ~PAGE_CACHE_MASK;
432 unsigned long n = pos >> PAGE_CACHE_SHIFT;
433 unsigned long npages = ufs_dir_pages(inode);
434 unsigned chunk_mask = ~(UFS_SECTOR_SIZE - 1);
435 int need_revalidate = filp->f_version != inode->i_version;
436 unsigned flags = UFS_SB(sb)->s_flags;
437
438 UFSD("BEGIN\n");
439
440 if (pos > inode->i_size - UFS_DIR_REC_LEN(1))
441 return 0;
442
443 for ( ; n < npages; n++, offset = 0) {
444 char *kaddr, *limit;
445 struct ufs_dir_entry *de;
446
447 struct page *page = ufs_get_page(inode, n);
448
449 if (IS_ERR(page)) {
450 ufs_error(sb, __FUNCTION__,
451 "bad page in #%lu",
452 inode->i_ino);
453 filp->f_pos += PAGE_CACHE_SIZE - offset;
454 return -EIO;
455 }
456 kaddr = page_address(page);
457 if (unlikely(need_revalidate)) {
458 if (offset) {
459 offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
460 filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
461 }
462 filp->f_version = inode->i_version;
463 need_revalidate = 0;
464 }
465 de = (struct ufs_dir_entry *)(kaddr+offset);
466 limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1);
467 for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) {
468 if (de->d_reclen == 0) {
469 ufs_error(sb, __FUNCTION__,
470 "zero-length directory entry");
471 ufs_put_page(page);
472 return -EIO;
473 }
474 if (de->d_ino) {
475 int over;
476 unsigned char d_type = DT_UNKNOWN;
477
478 offset = (char *)de - kaddr;
479
480 UFSD("filldir(%s,%u)\n", de->d_name,
481 fs32_to_cpu(sb, de->d_ino));
482 UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
483
484 if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
485 d_type = de->d_u.d_44.d_type;
486
487 over = filldir(dirent, de->d_name,
488 ufs_get_de_namlen(sb, de),
489 (n<<PAGE_CACHE_SHIFT) | offset,
490 fs32_to_cpu(sb, de->d_ino), d_type);
491 if (over) {
492 ufs_put_page(page);
493 return 0;
494 }
495 }
496 filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
497 }
498 ufs_put_page(page);
499 }
470 return 0; 500 return 0;
471} 501}
472 502
503
473/* 504/*
474 * ufs_delete_entry deletes a directory entry by merging it with the 505 * ufs_delete_entry deletes a directory entry by merging it with the
475 * previous entry. 506 * previous entry.
476 */ 507 */
477int ufs_delete_entry (struct inode * inode, struct ufs_dir_entry * dir, 508int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
478 struct buffer_head * bh ) 509 struct page * page)
479
480{ 510{
481 struct super_block * sb; 511 struct super_block *sb = inode->i_sb;
482 struct ufs_dir_entry * de, * pde; 512 struct address_space *mapping = page->mapping;
483 unsigned i; 513 char *kaddr = page_address(page);
484 514 unsigned from = ((char*)dir - kaddr) & ~(UFS_SECTOR_SIZE - 1);
485 UFSD(("ENTER\n")) 515 unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen);
516 struct ufs_dir_entry *pde = NULL;
517 struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from);
518 int err;
486 519
487 sb = inode->i_sb; 520 UFSD("ENTER\n");
488 i = 0; 521
489 pde = NULL; 522 UFSD("ino %u, reclen %u, namlen %u, name %s\n",
490 de = (struct ufs_dir_entry *) bh->b_data; 523 fs32_to_cpu(sb, de->d_ino),
491 524 fs16_to_cpu(sb, de->d_reclen),
492 UFSD(("ino %u, reclen %u, namlen %u, name %s\n", 525 ufs_get_de_namlen(sb, de), de->d_name);
493 fs32_to_cpu(sb, de->d_ino), 526
494 fs16_to_cpu(sb, de->d_reclen), 527 while ((char*)de < (char*)dir) {
495 ufs_get_de_namlen(sb, de), de->d_name)) 528 if (de->d_reclen == 0) {
496 529 ufs_error(inode->i_sb, __FUNCTION__,
497 while (i < bh->b_size) { 530 "zero-length directory entry");
498 if (!ufs_check_dir_entry ("ufs_delete_entry", inode, de, bh, i)) { 531 err = -EIO;
499 brelse(bh); 532 goto out;
500 return -EIO;
501 }
502 if (de == dir) {
503 if (pde)
504 fs16_add(sb, &pde->d_reclen,
505 fs16_to_cpu(sb, dir->d_reclen));
506 dir->d_ino = 0;
507 inode->i_version++;
508 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
509 mark_inode_dirty(inode);
510 mark_buffer_dirty(bh);
511 if (IS_DIRSYNC(inode))
512 sync_dirty_buffer(bh);
513 brelse(bh);
514 UFSD(("EXIT\n"))
515 return 0;
516 } 533 }
517 i += fs16_to_cpu(sb, de->d_reclen); 534 pde = de;
518 if (i == UFS_SECTOR_SIZE) pde = NULL; 535 de = ufs_next_entry(sb, de);
519 else pde = de;
520 de = (struct ufs_dir_entry *)
521 ((char *) de + fs16_to_cpu(sb, de->d_reclen));
522 if (i == UFS_SECTOR_SIZE && de->d_reclen == 0)
523 break;
524 } 536 }
525 UFSD(("EXIT\n")) 537 if (pde)
526 brelse(bh); 538 from = (char*)pde - (char*)page_address(page);
527 return -ENOENT; 539 lock_page(page);
540 err = mapping->a_ops->prepare_write(NULL, page, from, to);
541 BUG_ON(err);
542 if (pde)
543 pde->d_reclen = cpu_to_fs16(sb, to-from);
544 dir->d_ino = 0;
545 err = ufs_commit_chunk(page, from, to);
546 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
547 mark_inode_dirty(inode);
548out:
549 ufs_put_page(page);
550 UFSD("EXIT\n");
551 return err;
528} 552}
529 553
530int ufs_make_empty(struct inode * inode, struct inode *dir) 554int ufs_make_empty(struct inode * inode, struct inode *dir)
531{ 555{
532 struct super_block * sb = dir->i_sb; 556 struct super_block * sb = dir->i_sb;
533 struct buffer_head * dir_block; 557 struct address_space *mapping = inode->i_mapping;
558 struct page *page = grab_cache_page(mapping, 0);
534 struct ufs_dir_entry * de; 559 struct ufs_dir_entry * de;
560 char *base;
535 int err; 561 int err;
536 562
537 dir_block = ufs_bread (inode, 0, 1, &err); 563 if (!page)
538 if (!dir_block) 564 return -ENOMEM;
539 return err; 565 kmap(page);
566 err = mapping->a_ops->prepare_write(NULL, page, 0, UFS_SECTOR_SIZE);
567 if (err) {
568 unlock_page(page);
569 goto fail;
570 }
571
572
573 base = (char*)page_address(page);
574 memset(base, 0, PAGE_CACHE_SIZE);
575
576 de = (struct ufs_dir_entry *) base;
540 577
541 inode->i_blocks = sb->s_blocksize / UFS_SECTOR_SIZE;
542 de = (struct ufs_dir_entry *) dir_block->b_data;
543 de->d_ino = cpu_to_fs32(sb, inode->i_ino); 578 de->d_ino = cpu_to_fs32(sb, inode->i_ino);
544 ufs_set_de_type(sb, de, inode->i_mode); 579 ufs_set_de_type(sb, de, inode->i_mode);
545 ufs_set_de_namlen(sb, de, 1); 580 ufs_set_de_namlen(sb, de, 1);
@@ -552,72 +587,65 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
552 de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE - UFS_DIR_REC_LEN(1)); 587 de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE - UFS_DIR_REC_LEN(1));
553 ufs_set_de_namlen(sb, de, 2); 588 ufs_set_de_namlen(sb, de, 2);
554 strcpy (de->d_name, ".."); 589 strcpy (de->d_name, "..");
555 mark_buffer_dirty(dir_block); 590
556 brelse (dir_block); 591 err = ufs_commit_chunk(page, 0, UFS_SECTOR_SIZE);
557 mark_inode_dirty(inode); 592fail:
558 return 0; 593 kunmap(page);
594 page_cache_release(page);
595 return err;
559} 596}
560 597
561/* 598/*
562 * routine to check that the specified directory is empty (for rmdir) 599 * routine to check that the specified directory is empty (for rmdir)
563 */ 600 */
564int ufs_empty_dir (struct inode * inode) 601int ufs_empty_dir(struct inode * inode)
565{ 602{
566 struct super_block * sb; 603 struct super_block *sb = inode->i_sb;
567 unsigned long offset; 604 struct page *page = NULL;
568 struct buffer_head * bh; 605 unsigned long i, npages = ufs_dir_pages(inode);
569 struct ufs_dir_entry * de, * de1; 606
570 int err; 607 for (i = 0; i < npages; i++) {
571 608 char *kaddr;
572 sb = inode->i_sb; 609 struct ufs_dir_entry *de;
573 610 page = ufs_get_page(inode, i);
574 if (inode->i_size < UFS_DIR_REC_LEN(1) + UFS_DIR_REC_LEN(2) || 611
575 !(bh = ufs_bread (inode, 0, 0, &err))) { 612 if (IS_ERR(page))
576 ufs_warning (inode->i_sb, "empty_dir", 613 continue;
577 "bad directory (dir #%lu) - no data block", 614
578 inode->i_ino); 615 kaddr = page_address(page);
579 return 1; 616 de = (struct ufs_dir_entry *)kaddr;
580 } 617 kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1);
581 de = (struct ufs_dir_entry *) bh->b_data; 618
582 de1 = (struct ufs_dir_entry *) 619 while ((char *)de <= kaddr) {
583 ((char *)de + fs16_to_cpu(sb, de->d_reclen)); 620 if (de->d_reclen == 0) {
584 if (fs32_to_cpu(sb, de->d_ino) != inode->i_ino || de1->d_ino == 0 || 621 ufs_error(inode->i_sb, __FUNCTION__,
585 strcmp (".", de->d_name) || strcmp ("..", de1->d_name)) { 622 "zero-length directory entry: "
586 ufs_warning (inode->i_sb, "empty_dir", 623 "kaddr=%p, de=%p\n", kaddr, de);
587 "bad directory (dir #%lu) - no `.' or `..'", 624 goto not_empty;
588 inode->i_ino);
589 return 1;
590 }
591 offset = fs16_to_cpu(sb, de->d_reclen) + fs16_to_cpu(sb, de1->d_reclen);
592 de = (struct ufs_dir_entry *)
593 ((char *)de1 + fs16_to_cpu(sb, de1->d_reclen));
594 while (offset < inode->i_size ) {
595 if (!bh || (void *) de >= (void *) (bh->b_data + sb->s_blocksize)) {
596 brelse (bh);
597 bh = ufs_bread (inode, offset >> sb->s_blocksize_bits, 1, &err);
598 if (!bh) {
599 ufs_error (sb, "empty_dir",
600 "directory #%lu contains a hole at offset %lu",
601 inode->i_ino, offset);
602 offset += sb->s_blocksize;
603 continue;
604 } 625 }
605 de = (struct ufs_dir_entry *) bh->b_data; 626 if (de->d_ino) {
606 } 627 u16 namelen=ufs_get_de_namlen(sb, de);
607 if (!ufs_check_dir_entry ("empty_dir", inode, de, bh, offset)) { 628 /* check for . and .. */
608 brelse (bh); 629 if (de->d_name[0] != '.')
609 return 1; 630 goto not_empty;
610 } 631 if (namelen > 2)
611 if (de->d_ino) { 632 goto not_empty;
612 brelse (bh); 633 if (namelen < 2) {
613 return 0; 634 if (inode->i_ino !=
635 fs32_to_cpu(sb, de->d_ino))
636 goto not_empty;
637 } else if (de->d_name[1] != '.')
638 goto not_empty;
639 }
640 de = ufs_next_entry(sb, de);
614 } 641 }
615 offset += fs16_to_cpu(sb, de->d_reclen); 642 ufs_put_page(page);
616 de = (struct ufs_dir_entry *)
617 ((char *)de + fs16_to_cpu(sb, de->d_reclen));
618 } 643 }
619 brelse (bh);
620 return 1; 644 return 1;
645
646not_empty:
647 ufs_put_page(page);
648 return 0;
621} 649}
622 650
623const struct file_operations ufs_dir_operations = { 651const struct file_operations ufs_dir_operations = {
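The hunk above replaces every buffer_head-based directory walk with the page-cache idiom built from ufs_dir_pages()/ufs_get_page()/ufs_last_byte()/ufs_next_entry(). To make the shape of that idiom explicit, here is a hypothetical helper (not part of the patch) that counts the live entries of a directory using only the helpers introduced above:

	/* Illustrative only: count in-use entries in a UFS directory. */
	static unsigned long ufs_count_entries(struct inode *dir)
	{
		struct super_block *sb = dir->i_sb;
		unsigned long n, count = 0;
		unsigned long npages = ufs_dir_pages(dir);

		for (n = 0; n < npages; n++) {
			char *kaddr, *limit;
			struct ufs_dir_entry *de;
			struct page *page = ufs_get_page(dir, n);

			if (IS_ERR(page))
				continue;	/* skip unreadable pages */
			kaddr = page_address(page);
			limit = kaddr + ufs_last_byte(dir, n) - UFS_DIR_REC_LEN(1);
			for (de = (struct ufs_dir_entry *)kaddr;
			     (char *)de <= limit; de = ufs_next_entry(sb, de)) {
				if (de->d_reclen == 0)
					break;	/* corrupt page; stop here */
				if (de->d_ino)
					count++;
			}
			ufs_put_page(page);	/* kunmap + page_cache_release */
		}
		return count;
	}

Every walker in the new dir.c (ufs_find_entry, ufs_readdir, ufs_empty_dir) is this same loop with a different body; ufs_get_page() guarantees the page is kmapped and already validated by ufs_check_page() before the caller sees it.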
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 312fd3f86313..0e5001512a9d 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -25,6 +25,26 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/ufs_fs.h> 27#include <linux/ufs_fs.h>
28#include <linux/buffer_head.h> /* for sync_mapping_buffers() */
29
30static int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
31{
32 struct inode *inode = dentry->d_inode;
33 int err;
34 int ret;
35
36 ret = sync_mapping_buffers(inode->i_mapping);
37 if (!(inode->i_state & I_DIRTY))
38 return ret;
39 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
40 return ret;
41
42 err = ufs_sync_inode(inode);
43 if (ret == 0)
44 ret = err;
45 return ret;
46}
47
28 48
29/* 49/*
30 * We have mostly NULL's here: the current defaults are ok for 50 * We have mostly NULL's here: the current defaults are ok for
@@ -37,6 +57,7 @@ const struct file_operations ufs_file_operations = {
37 .write = generic_file_write, 57 .write = generic_file_write,
38 .mmap = generic_file_mmap, 58 .mmap = generic_file_mmap,
39 .open = generic_file_open, 59 .open = generic_file_open,
60 .fsync = ufs_sync_file,
40 .sendfile = generic_file_sendfile, 61 .sendfile = generic_file_sendfile,
41}; 62};
42 63
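The new ufs_sync_file() gives UFS a real .fsync, following the usual pattern of this era: sync_mapping_buffers() flushes the file's dirty data buffers, and the inode itself is written only when its dirty state requires it. The decision reduces to a predicate like the following (hypothetical helper for illustration; the patch inlines these tests):

	static int ufs_inode_needs_sync(struct inode *inode, int datasync)
	{
		if (!(inode->i_state & I_DIRTY))
			return 0;	/* inode not dirty at all */
		if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
			return 0;	/* fdatasync(): no data-critical inode state dirty */
		return 1;
	}

so fdatasync() can skip the inode write when only I_DIRTY_SYNC-class state (timestamps, for example) is pending.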
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index c7a47ed4f430..9501dcd3b213 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -34,14 +34,6 @@
34#include "swab.h" 34#include "swab.h"
35#include "util.h" 35#include "util.h"
36 36
37#undef UFS_IALLOC_DEBUG
38
39#ifdef UFS_IALLOC_DEBUG
40#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
41#else
42#define UFSD(x)
43#endif
44
45/* 37/*
46 * NOTE! When we get the inode, we're the only people 38 * NOTE! When we get the inode, we're the only people
47 * that have access to it, and as such there are no 39 * that have access to it, and as such there are no
@@ -68,7 +60,7 @@ void ufs_free_inode (struct inode * inode)
68 int is_directory; 60 int is_directory;
69 unsigned ino, cg, bit; 61 unsigned ino, cg, bit;
70 62
71 UFSD(("ENTER, ino %lu\n", inode->i_ino)) 63 UFSD("ENTER, ino %lu\n", inode->i_ino);
72 64
73 sb = inode->i_sb; 65 sb = inode->i_sb;
74 uspi = UFS_SB(sb)->s_uspi; 66 uspi = UFS_SB(sb)->s_uspi;
@@ -91,7 +83,7 @@ void ufs_free_inode (struct inode * inode)
91 unlock_super (sb); 83 unlock_super (sb);
92 return; 84 return;
93 } 85 }
94 ucg = ubh_get_ucg(UCPI_UBH); 86 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
95 if (!ufs_cg_chkmagic(sb, ucg)) 87 if (!ufs_cg_chkmagic(sb, ucg))
96 ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number"); 88 ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number");
97 89
@@ -104,33 +96,33 @@ void ufs_free_inode (struct inode * inode)
104 96
105 clear_inode (inode); 97 clear_inode (inode);
106 98
107 if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit)) 99 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
108 ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino); 100 ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino);
109 else { 101 else {
110 ubh_clrbit (UCPI_UBH, ucpi->c_iusedoff, bit); 102 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
111 if (ino < ucpi->c_irotor) 103 if (ino < ucpi->c_irotor)
112 ucpi->c_irotor = ino; 104 ucpi->c_irotor = ino;
113 fs32_add(sb, &ucg->cg_cs.cs_nifree, 1); 105 fs32_add(sb, &ucg->cg_cs.cs_nifree, 1);
114 fs32_add(sb, &usb1->fs_cstotal.cs_nifree, 1); 106 uspi->cs_total.cs_nifree++;
115 fs32_add(sb, &UFS_SB(sb)->fs_cs(cg).cs_nifree, 1); 107 fs32_add(sb, &UFS_SB(sb)->fs_cs(cg).cs_nifree, 1);
116 108
117 if (is_directory) { 109 if (is_directory) {
118 fs32_sub(sb, &ucg->cg_cs.cs_ndir, 1); 110 fs32_sub(sb, &ucg->cg_cs.cs_ndir, 1);
119 fs32_sub(sb, &usb1->fs_cstotal.cs_ndir, 1); 111 uspi->cs_total.cs_ndir--;
120 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cg).cs_ndir, 1); 112 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cg).cs_ndir, 1);
121 } 113 }
122 } 114 }
123 115
124 ubh_mark_buffer_dirty (USPI_UBH); 116 ubh_mark_buffer_dirty (USPI_UBH(uspi));
125 ubh_mark_buffer_dirty (UCPI_UBH); 117 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
126 if (sb->s_flags & MS_SYNCHRONOUS) { 118 if (sb->s_flags & MS_SYNCHRONOUS) {
127 ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi); 119 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
128 ubh_wait_on_buffer (UCPI_UBH); 120 ubh_wait_on_buffer (UCPI_UBH(ucpi));
129 } 121 }
130 122
131 sb->s_dirt = 1; 123 sb->s_dirt = 1;
132 unlock_super (sb); 124 unlock_super (sb);
133 UFSD(("EXIT\n")) 125 UFSD("EXIT\n");
134} 126}
135 127
136/* 128/*
@@ -155,7 +147,7 @@ struct inode * ufs_new_inode(struct inode * dir, int mode)
155 unsigned cg, bit, i, j, start; 147 unsigned cg, bit, i, j, start;
156 struct ufs_inode_info *ufsi; 148 struct ufs_inode_info *ufsi;
157 149
158 UFSD(("ENTER\n")) 150 UFSD("ENTER\n");
159 151
160 /* Cannot create files in a deleted directory */ 152 /* Cannot create files in a deleted directory */
161 if (!dir || !dir->i_nlink) 153 if (!dir || !dir->i_nlink)
@@ -213,43 +205,43 @@ cg_found:
213 ucpi = ufs_load_cylinder (sb, cg); 205 ucpi = ufs_load_cylinder (sb, cg);
214 if (!ucpi) 206 if (!ucpi)
215 goto failed; 207 goto failed;
216 ucg = ubh_get_ucg(UCPI_UBH); 208 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
217 if (!ufs_cg_chkmagic(sb, ucg)) 209 if (!ufs_cg_chkmagic(sb, ucg))
218 ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number"); 210 ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number");
219 211
220 start = ucpi->c_irotor; 212 start = ucpi->c_irotor;
221 bit = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_iusedoff, uspi->s_ipg, start); 213 bit = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, uspi->s_ipg, start);
222 if (!(bit < uspi->s_ipg)) { 214 if (!(bit < uspi->s_ipg)) {
223 bit = ubh_find_first_zero_bit (UCPI_UBH, ucpi->c_iusedoff, start); 215 bit = ubh_find_first_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, start);
224 if (!(bit < start)) { 216 if (!(bit < start)) {
225 ufs_error (sb, "ufs_new_inode", 217 ufs_error (sb, "ufs_new_inode",
226 "cylinder group %u corrupted - error in inode bitmap\n", cg); 218 "cylinder group %u corrupted - error in inode bitmap\n", cg);
227 goto failed; 219 goto failed;
228 } 220 }
229 } 221 }
230 UFSD(("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg)) 222 UFSD("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg);
231 if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit)) 223 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
232 ubh_setbit (UCPI_UBH, ucpi->c_iusedoff, bit); 224 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
233 else { 225 else {
234 ufs_panic (sb, "ufs_new_inode", "internal error"); 226 ufs_panic (sb, "ufs_new_inode", "internal error");
235 goto failed; 227 goto failed;
236 } 228 }
237 229
238 fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1); 230 fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1);
239 fs32_sub(sb, &usb1->fs_cstotal.cs_nifree, 1); 231 uspi->cs_total.cs_nifree--;
240 fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1); 232 fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1);
241 233
242 if (S_ISDIR(mode)) { 234 if (S_ISDIR(mode)) {
243 fs32_add(sb, &ucg->cg_cs.cs_ndir, 1); 235 fs32_add(sb, &ucg->cg_cs.cs_ndir, 1);
244 fs32_add(sb, &usb1->fs_cstotal.cs_ndir, 1); 236 uspi->cs_total.cs_ndir++;
245 fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1); 237 fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1);
246 } 238 }
247 239
248 ubh_mark_buffer_dirty (USPI_UBH); 240 ubh_mark_buffer_dirty (USPI_UBH(uspi));
249 ubh_mark_buffer_dirty (UCPI_UBH); 241 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
250 if (sb->s_flags & MS_SYNCHRONOUS) { 242 if (sb->s_flags & MS_SYNCHRONOUS) {
251 ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi); 243 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
252 ubh_wait_on_buffer (UCPI_UBH); 244 ubh_wait_on_buffer (UCPI_UBH(ucpi));
253 } 245 }
254 sb->s_dirt = 1; 246 sb->s_dirt = 1;
255 247
@@ -272,6 +264,7 @@ cg_found:
272 ufsi->i_shadow = 0; 264 ufsi->i_shadow = 0;
273 ufsi->i_osync = 0; 265 ufsi->i_osync = 0;
274 ufsi->i_oeftflag = 0; 266 ufsi->i_oeftflag = 0;
267 ufsi->i_dir_start_lookup = 0;
275 memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1)); 268 memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1));
276 269
277 insert_inode_hash(inode); 270 insert_inode_hash(inode);
@@ -287,14 +280,14 @@ cg_found:
287 return ERR_PTR(-EDQUOT); 280 return ERR_PTR(-EDQUOT);
288 } 281 }
289 282
290 UFSD(("allocating inode %lu\n", inode->i_ino)) 283 UFSD("allocating inode %lu\n", inode->i_ino);
291 UFSD(("EXIT\n")) 284 UFSD("EXIT\n");
292 return inode; 285 return inode;
293 286
294failed: 287failed:
295 unlock_super (sb); 288 unlock_super (sb);
296 make_bad_inode(inode); 289 make_bad_inode(inode);
297 iput (inode); 290 iput (inode);
298 UFSD(("EXIT (FAILED)\n")) 291 UFSD("EXIT (FAILED)\n");
299 return ERR_PTR(-ENOSPC); 292 return ERR_PTR(-ENOSPC);
300} 293}
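For reference, the allocation in ufs_new_inode() above is a classic rotor search over the cylinder-group inode bitmap: scan forward from c_irotor, and wrap around once if the tail of the bitmap is full. Distilled into a hypothetical helper (the ubh_* accessors and fields are exactly those used in the hunk):

	static unsigned ufs_find_free_inode_bit(struct ufs_cg_private_info *ucpi,
						struct ufs_sb_private_info *uspi)
	{
		unsigned start = ucpi->c_irotor;
		unsigned bit = ubh_find_next_zero_bit(UCPI_UBH(ucpi),
						      ucpi->c_iusedoff,
						      uspi->s_ipg, start);
		if (bit >= uspi->s_ipg) {
			/* wrap: search the range [0, start) from the beginning */
			bit = ubh_find_first_zero_bit(UCPI_UBH(ucpi),
						      ucpi->c_iusedoff, start);
			if (bit >= start)
				return uspi->s_ipg;	/* no free inode in this cg */
		}
		return bit;
	}

Note that ufs_free_inode() above pulls the rotor back whenever a lower-numbered inode is freed, so the next search starts near known-free territory and stays O(1) in the common case.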
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 3c3f62ce2ad9..259bd196099d 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -41,14 +41,7 @@
41#include "swab.h" 41#include "swab.h"
42#include "util.h" 42#include "util.h"
43 43
44#undef UFS_INODE_DEBUG 44static u64 ufs_frag_map(struct inode *inode, sector_t frag);
45#undef UFS_INODE_DEBUG_MORE
46
47#ifdef UFS_INODE_DEBUG
48#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
49#else
50#define UFSD(x)
51#endif
52 45
53static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) 46static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
54{ 47{
@@ -61,7 +54,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
61 int n = 0; 54 int n = 0;
62 55
63 56
64 UFSD(("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks)); 57 UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks);
65 if (i_block < 0) { 58 if (i_block < 0) {
66 ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0"); 59 ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0");
67 } else if (i_block < direct_blocks) { 60 } else if (i_block < direct_blocks) {
@@ -89,7 +82,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
89 * the begining of the filesystem. 82 * the begining of the filesystem.
90 */ 83 */
91 84
92u64 ufs_frag_map(struct inode *inode, sector_t frag) 85static u64 ufs_frag_map(struct inode *inode, sector_t frag)
93{ 86{
94 struct ufs_inode_info *ufsi = UFS_I(inode); 87 struct ufs_inode_info *ufsi = UFS_I(inode);
95 struct super_block *sb = inode->i_sb; 88 struct super_block *sb = inode->i_sb;
@@ -104,8 +97,8 @@ u64 ufs_frag_map(struct inode *inode, sector_t frag)
104 unsigned flags = UFS_SB(sb)->s_flags; 97 unsigned flags = UFS_SB(sb)->s_flags;
105 u64 temp = 0L; 98 u64 temp = 0L;
106 99
107 UFSD((": frag = %llu depth = %d\n", (unsigned long long)frag, depth)); 100 UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth);
108 UFSD((": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask)); 101 UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask);
109 102
110 if (depth == 0) 103 if (depth == 0)
111 return 0; 104 return 0;
@@ -161,26 +154,64 @@ out:
161 return ret; 154 return ret;
162} 155}
163 156
164static struct buffer_head * ufs_inode_getfrag (struct inode *inode, 157static void ufs_clear_frag(struct inode *inode, struct buffer_head *bh)
165 unsigned int fragment, unsigned int new_fragment, 158{
166 unsigned int required, int *err, int metadata, long *phys, int *new) 159 lock_buffer(bh);
160 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
161 set_buffer_uptodate(bh);
162 mark_buffer_dirty(bh);
163 unlock_buffer(bh);
164 if (IS_SYNC(inode))
165 sync_dirty_buffer(bh);
166}
167
168static struct buffer_head *
169ufs_clear_frags(struct inode *inode, sector_t beg,
170 unsigned int n)
171{
172 struct buffer_head *res, *bh;
173 sector_t end = beg + n;
174
175 res = sb_getblk(inode->i_sb, beg);
176 ufs_clear_frag(inode, res);
177 for (++beg; beg < end; ++beg) {
178 bh = sb_getblk(inode->i_sb, beg);
179 ufs_clear_frag(inode, bh);
180 brelse(bh);
181 }
182 return res;
183}
184
185/**
186 * ufs_inode_getfrag() - allocate new fragment(s)
187 * @inode - pointer to inode
188 * @fragment - number of the `fragment' which holds the pointer
189 * to the newly allocated fragment(s)
190 * @new_fragment - number of the newly allocated fragment(s)
191 * @required - how many fragments we require
192 * @err - set if something goes wrong
193 * @phys - pointer to where we save the physical number of newly allocated fragments;
194 * NULL if we are not allocating data (indirect blocks, for example).
195 * @new - set if we allocate a new block
196 * @locked_page - for ufs_new_fragments()
197 */
198static struct buffer_head *
199ufs_inode_getfrag(struct inode *inode, unsigned int fragment,
200 sector_t new_fragment, unsigned int required, int *err,
201 long *phys, int *new, struct page *locked_page)
167{ 202{
168 struct ufs_inode_info *ufsi = UFS_I(inode); 203 struct ufs_inode_info *ufsi = UFS_I(inode);
169 struct super_block * sb; 204 struct super_block *sb = inode->i_sb;
170 struct ufs_sb_private_info * uspi; 205 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
171 struct buffer_head * result; 206 struct buffer_head * result;
172 unsigned block, blockoff, lastfrag, lastblock, lastblockoff; 207 unsigned block, blockoff, lastfrag, lastblock, lastblockoff;
173 unsigned tmp, goal; 208 unsigned tmp, goal;
174 __fs32 * p, * p2; 209 __fs32 * p, * p2;
175 unsigned flags = 0;
176 210
177 UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u, required %u\n", 211 UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, required %u, "
178 inode->i_ino, fragment, new_fragment, required)) 212 "metadata %d\n", inode->i_ino, fragment,
213 (unsigned long long)new_fragment, required, !phys);
179 214
180 sb = inode->i_sb;
181 uspi = UFS_SB(sb)->s_uspi;
182
183 flags = UFS_SB(sb)->s_flags;
184 /* TODO : to be done for write support 215 /* TODO : to be done for write support
185 if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 216 if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
186 goto ufs2; 217 goto ufs2;
@@ -195,16 +226,16 @@ repeat:
195 tmp = fs32_to_cpu(sb, *p); 226 tmp = fs32_to_cpu(sb, *p);
196 lastfrag = ufsi->i_lastfrag; 227 lastfrag = ufsi->i_lastfrag;
197 if (tmp && fragment < lastfrag) { 228 if (tmp && fragment < lastfrag) {
198 if (metadata) { 229 if (!phys) {
199 result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); 230 result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
200 if (tmp == fs32_to_cpu(sb, *p)) { 231 if (tmp == fs32_to_cpu(sb, *p)) {
201 UFSD(("EXIT, result %u\n", tmp + blockoff)) 232 UFSD("EXIT, result %u\n", tmp + blockoff);
202 return result; 233 return result;
203 } 234 }
204 brelse (result); 235 brelse (result);
205 goto repeat; 236 goto repeat;
206 } else { 237 } else {
207 *phys = tmp; 238 *phys = tmp + blockoff;
208 return NULL; 239 return NULL;
209 } 240 }
210 } 241 }
@@ -221,7 +252,8 @@ repeat:
221 if (lastblockoff) { 252 if (lastblockoff) {
222 p2 = ufsi->i_u1.i_data + lastblock; 253 p2 = ufsi->i_u1.i_data + lastblock;
223 tmp = ufs_new_fragments (inode, p2, lastfrag, 254 tmp = ufs_new_fragments (inode, p2, lastfrag,
224 fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff, err); 255 fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff,
256 err, locked_page);
225 if (!tmp) { 257 if (!tmp) {
226 if (lastfrag != ufsi->i_lastfrag) 258 if (lastfrag != ufsi->i_lastfrag)
227 goto repeat; 259 goto repeat;
@@ -233,14 +265,16 @@ repeat:
233 } 265 }
234 goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb; 266 goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb;
235 tmp = ufs_new_fragments (inode, p, fragment - blockoff, 267 tmp = ufs_new_fragments (inode, p, fragment - blockoff,
236 goal, required + blockoff, err); 268 goal, required + blockoff,
269 err, locked_page);
237 } 270 }
238 /* 271 /*
239 * We will extend last allocated block 272 * We will extend last allocated block
240 */ 273 */
241 else if (lastblock == block) { 274 else if (lastblock == block) {
242 tmp = ufs_new_fragments (inode, p, fragment - (blockoff - lastblockoff), 275 tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff),
243 fs32_to_cpu(sb, *p), required + (blockoff - lastblockoff), err); 276 fs32_to_cpu(sb, *p), required + (blockoff - lastblockoff),
277 err, locked_page);
244 } 278 }
245 /* 279 /*
246 * We will allocate new block before last allocated block 280 * We will allocate new block before last allocated block
@@ -248,8 +282,8 @@ repeat:
248 else /* (lastblock > block) */ { 282 else /* (lastblock > block) */ {
249 if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1]))) 283 if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1])))
250 goal = tmp + uspi->s_fpb; 284 goal = tmp + uspi->s_fpb;
251 tmp = ufs_new_fragments (inode, p, fragment - blockoff, 285 tmp = ufs_new_fragments(inode, p, fragment - blockoff,
252 goal, uspi->s_fpb, err); 286 goal, uspi->s_fpb, err, locked_page);
253 } 287 }
254 if (!tmp) { 288 if (!tmp) {
255 if ((!blockoff && *p) || 289 if ((!blockoff && *p) ||
@@ -259,14 +293,10 @@ repeat:
259 return NULL; 293 return NULL;
260 } 294 }
261 295
262 /* The nullification of framgents done in ufs/balloc.c is 296 if (!phys) {
263 * something I don't have the stomache to move into here right 297 result = ufs_clear_frags(inode, tmp + blockoff, required);
264 * now. -DaveM
265 */
266 if (metadata) {
267 result = sb_getblk(inode->i_sb, tmp + blockoff);
268 } else { 298 } else {
269 *phys = tmp; 299 *phys = tmp + blockoff;
270 result = NULL; 300 result = NULL;
271 *err = 0; 301 *err = 0;
272 *new = 1; 302 *new = 1;
@@ -276,7 +306,7 @@ repeat:
276 if (IS_SYNC(inode)) 306 if (IS_SYNC(inode))
277 ufs_sync_inode (inode); 307 ufs_sync_inode (inode);
278 mark_inode_dirty(inode); 308 mark_inode_dirty(inode);
279 UFSD(("EXIT, result %u\n", tmp + blockoff)) 309 UFSD("EXIT, result %u\n", tmp + blockoff);
280 return result; 310 return result;
281 311
282 /* This part : To be implemented .... 312 /* This part : To be implemented ....
@@ -295,22 +325,35 @@ repeat2:
295 */ 325 */
296} 326}
297 327
298static struct buffer_head * ufs_block_getfrag (struct inode *inode, 328/**
299 struct buffer_head *bh, unsigned int fragment, unsigned int new_fragment, 329 * ufs_inode_getblock() - allocate new block
300 unsigned int blocksize, int * err, int metadata, long *phys, int *new) 330 * @inode - pointer to inode
331 * @bh - pointer to the block which holds the "pointer" to the newly allocated block
332 * @fragment - number of the `fragment' which holds the pointer
333 * to the newly allocated block
334 * @new_fragment - number of the newly allocated fragment
335 * (the block will hold this fragment plus the following uspi->s_fpb-1 fragments)
336 * @err - see ufs_inode_getfrag()
337 * @phys - see ufs_inode_getfrag()
338 * @new - see ufs_inode_getfrag()
339 * @locked_page - see ufs_inode_getfrag()
340 */
341static struct buffer_head *
342ufs_inode_getblock(struct inode *inode, struct buffer_head *bh,
343 unsigned int fragment, sector_t new_fragment, int *err,
344 long *phys, int *new, struct page *locked_page)
301{ 345{
302 struct super_block * sb; 346 struct super_block *sb = inode->i_sb;
303 struct ufs_sb_private_info * uspi; 347 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
304 struct buffer_head * result; 348 struct buffer_head * result;
305 unsigned tmp, goal, block, blockoff; 349 unsigned tmp, goal, block, blockoff;
306 __fs32 * p; 350 __fs32 * p;
307 351
308 sb = inode->i_sb;
309 uspi = UFS_SB(sb)->s_uspi;
310 block = ufs_fragstoblks (fragment); 352 block = ufs_fragstoblks (fragment);
311 blockoff = ufs_fragnum (fragment); 353 blockoff = ufs_fragnum (fragment);
312 354
313 UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u\n", inode->i_ino, fragment, new_fragment)) 355 UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, metadata %d\n",
356 inode->i_ino, fragment, (unsigned long long)new_fragment, !phys);
314 357
315 result = NULL; 358 result = NULL;
316 if (!bh) 359 if (!bh)
@@ -326,14 +369,14 @@ static struct buffer_head * ufs_block_getfrag (struct inode *inode,
326repeat: 369repeat:
327 tmp = fs32_to_cpu(sb, *p); 370 tmp = fs32_to_cpu(sb, *p);
328 if (tmp) { 371 if (tmp) {
329 if (metadata) { 372 if (!phys) {
330 result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); 373 result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
331 if (tmp == fs32_to_cpu(sb, *p)) 374 if (tmp == fs32_to_cpu(sb, *p))
332 goto out; 375 goto out;
333 brelse (result); 376 brelse (result);
334 goto repeat; 377 goto repeat;
335 } else { 378 } else {
336 *phys = tmp; 379 *phys = tmp + blockoff;
337 goto out; 380 goto out;
338 } 381 }
339 } 382 }
@@ -342,21 +385,19 @@ repeat:
342 goal = tmp + uspi->s_fpb; 385 goal = tmp + uspi->s_fpb;
343 else 386 else
344 goal = bh->b_blocknr + uspi->s_fpb; 387 goal = bh->b_blocknr + uspi->s_fpb;
345 tmp = ufs_new_fragments (inode, p, ufs_blknum(new_fragment), goal, uspi->s_fpb, err); 388 tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal,
389 uspi->s_fpb, err, locked_page);
346 if (!tmp) { 390 if (!tmp) {
347 if (fs32_to_cpu(sb, *p)) 391 if (fs32_to_cpu(sb, *p))
348 goto repeat; 392 goto repeat;
349 goto out; 393 goto out;
350 } 394 }
351 395
352 /* The nullification of framgents done in ufs/balloc.c is 396
353 * something I don't have the stomache to move into here right 397 if (!phys) {
354 * now. -DaveM 398 result = ufs_clear_frags(inode, tmp + blockoff, uspi->s_fpb);
355 */
356 if (metadata) {
357 result = sb_getblk(sb, tmp + blockoff);
358 } else { 399 } else {
359 *phys = tmp; 400 *phys = tmp + blockoff;
360 *new = 1; 401 *new = 1;
361 } 402 }
362 403
@@ -365,18 +406,19 @@ repeat:
365 sync_dirty_buffer(bh); 406 sync_dirty_buffer(bh);
366 inode->i_ctime = CURRENT_TIME_SEC; 407 inode->i_ctime = CURRENT_TIME_SEC;
367 mark_inode_dirty(inode); 408 mark_inode_dirty(inode);
368 UFSD(("result %u\n", tmp + blockoff)); 409 UFSD("result %u\n", tmp + blockoff);
369out: 410out:
370 brelse (bh); 411 brelse (bh);
371 UFSD(("EXIT\n")); 412 UFSD("EXIT\n");
372 return result; 413 return result;
373} 414}
374 415
375/* 416/**
376 * This function gets the block which contains the fragment. 417 * ufs_getfrag_block() - `get_block_t' function, interface between UFS and
418 * readpage, writepage and so on
377 */ 419 */
378 420
379int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) 421int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
380{ 422{
381 struct super_block * sb = inode->i_sb; 423 struct super_block * sb = inode->i_sb;
382 struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; 424 struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi;
@@ -387,7 +429,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
387 429
388 if (!create) { 430 if (!create) {
389 phys64 = ufs_frag_map(inode, fragment); 431 phys64 = ufs_frag_map(inode, fragment);
390 UFSD(("phys64 = %llu \n",phys64)); 432 UFSD("phys64 = %llu \n",phys64);
391 if (phys64) 433 if (phys64)
392 map_bh(bh_result, sb, phys64); 434 map_bh(bh_result, sb, phys64);
393 return 0; 435 return 0;
@@ -402,7 +444,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
402 444
403 lock_kernel(); 445 lock_kernel();
404 446
405 UFSD(("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment)) 447 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
406 if (fragment < 0) 448 if (fragment < 0)
407 goto abort_negative; 449 goto abort_negative;
408 if (fragment > 450 if (fragment >
@@ -418,15 +460,15 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
418 * it much more readable: 460 * it much more readable:
419 */ 461 */
420#define GET_INODE_DATABLOCK(x) \ 462#define GET_INODE_DATABLOCK(x) \
421 ufs_inode_getfrag(inode, x, fragment, 1, &err, 0, &phys, &new) 463 ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new, bh_result->b_page)
422#define GET_INODE_PTR(x) \ 464#define GET_INODE_PTR(x) \
423 ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, 1, NULL, NULL) 465 ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page)
424#define GET_INDIRECT_DATABLOCK(x) \ 466#define GET_INDIRECT_DATABLOCK(x) \
425 ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \ 467 ufs_inode_getblock(inode, bh, x, fragment, \
426 &err, 0, &phys, &new); 468 &err, &phys, &new, bh_result->b_page);
427#define GET_INDIRECT_PTR(x) \ 469#define GET_INDIRECT_PTR(x) \
428 ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \ 470 ufs_inode_getblock(inode, bh, x, fragment, \
429 &err, 1, NULL, NULL); 471 &err, NULL, NULL, bh_result->b_page);
430 472
431 if (ptr < UFS_NDIR_FRAGMENT) { 473 if (ptr < UFS_NDIR_FRAGMENT) {
432 bh = GET_INODE_DATABLOCK(ptr); 474 bh = GET_INODE_DATABLOCK(ptr);
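The macros above encode a calling convention worth spelling out (this note and sketch are editorial, not part of the patch): a NULL `phys` asks the helper for the buffer_head of a metadata block, while a non-NULL `phys` asks only for the physical fragment number of a data block, with `*new` flagging a fresh allocation. A minimal sketch, assuming only the signatures shown in the macros:

	/* Editorial sketch; names and signatures taken from the macros above. */
	struct buffer_head *bh;
	unsigned phys = 0;
	int err = 0, new = 0;

	/* metadata: NULL phys, get the indirect block's buffer_head back */
	bh = ufs_inode_getfrag(inode, ptr, fragment, uspi->s_fpb,
			       &err, NULL, NULL, bh_result->b_page);

	/* data: non-NULL phys, only the fragment number and the "new" flag */
	ufs_inode_getfrag(inode, ptr, fragment, 1,
			  &err, &phys, &new, bh_result->b_page);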
@@ -474,8 +516,9 @@ abort_too_big:
474 goto abort; 516 goto abort;
475} 517}
476 518
477struct buffer_head *ufs_getfrag(struct inode *inode, unsigned int fragment, 519static struct buffer_head *ufs_getfrag(struct inode *inode,
478 int create, int *err) 520 unsigned int fragment,
521 int create, int *err)
479{ 522{
480 struct buffer_head dummy; 523 struct buffer_head dummy;
481 int error; 524 int error;
@@ -502,7 +545,7 @@ struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment,
502{ 545{
503 struct buffer_head * bh; 546 struct buffer_head * bh;
504 547
505 UFSD(("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment)) 548 UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment);
506 bh = ufs_getfrag (inode, fragment, create, err); 549 bh = ufs_getfrag (inode, fragment, create, err);
507 if (!bh || buffer_uptodate(bh)) 550 if (!bh || buffer_uptodate(bh))
508 return bh; 551 return bh;
@@ -540,39 +583,34 @@ struct address_space_operations ufs_aops = {
540 .bmap = ufs_bmap 583 .bmap = ufs_bmap
541}; 584};
542 585
543void ufs_read_inode (struct inode * inode) 586static void ufs_set_inode_ops(struct inode *inode)
587{
588 if (S_ISREG(inode->i_mode)) {
589 inode->i_op = &ufs_file_inode_operations;
590 inode->i_fop = &ufs_file_operations;
591 inode->i_mapping->a_ops = &ufs_aops;
592 } else if (S_ISDIR(inode->i_mode)) {
593 inode->i_op = &ufs_dir_inode_operations;
594 inode->i_fop = &ufs_dir_operations;
595 inode->i_mapping->a_ops = &ufs_aops;
596 } else if (S_ISLNK(inode->i_mode)) {
597 if (!inode->i_blocks)
598 inode->i_op = &ufs_fast_symlink_inode_operations;
599 else {
600 inode->i_op = &page_symlink_inode_operations;
601 inode->i_mapping->a_ops = &ufs_aops;
602 }
603 } else
604 init_special_inode(inode, inode->i_mode,
605 ufs_get_inode_dev(inode->i_sb, UFS_I(inode)));
606}
607
608static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
544{ 609{
545 struct ufs_inode_info *ufsi = UFS_I(inode); 610 struct ufs_inode_info *ufsi = UFS_I(inode);
546 struct super_block * sb; 611 struct super_block *sb = inode->i_sb;
547 struct ufs_sb_private_info * uspi;
548 struct ufs_inode * ufs_inode;
549 struct ufs2_inode *ufs2_inode;
550 struct buffer_head * bh;
551 mode_t mode; 612 mode_t mode;
552 unsigned i; 613 unsigned i;
553 unsigned flags;
554
555 UFSD(("ENTER, ino %lu\n", inode->i_ino))
556
557 sb = inode->i_sb;
558 uspi = UFS_SB(sb)->s_uspi;
559 flags = UFS_SB(sb)->s_flags;
560
561 if (inode->i_ino < UFS_ROOTINO ||
562 inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) {
563 ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino);
564 goto bad_inode;
565 }
566
567 bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
568 if (!bh) {
569 ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino);
570 goto bad_inode;
571 }
572 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
573 goto ufs2_inode;
574
575 ufs_inode = (struct ufs_inode *) (bh->b_data + sizeof(struct ufs_inode) * ufs_inotofsbo(inode->i_ino));
576 614
577 /* 615 /*
578 * Copy data to the in-core inode. 616 * Copy data to the in-core inode.
@@ -596,56 +634,29 @@ void ufs_read_inode (struct inode * inode)
596 inode->i_atime.tv_nsec = 0; 634 inode->i_atime.tv_nsec = 0;
597 inode->i_ctime.tv_nsec = 0; 635 inode->i_ctime.tv_nsec = 0;
598 inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks); 636 inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks);
599 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat) */
600 inode->i_version++;
601 ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags); 637 ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags);
602 ufsi->i_gen = fs32_to_cpu(sb, ufs_inode->ui_gen); 638 ufsi->i_gen = fs32_to_cpu(sb, ufs_inode->ui_gen);
603 ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); 639 ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
604 ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); 640 ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
605 ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; 641
606 642
607 if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { 643 if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
608 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) 644 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
609 ufsi->i_u1.i_data[i] = ufs_inode->ui_u2.ui_addr.ui_db[i]; 645 ufsi->i_u1.i_data[i] = ufs_inode->ui_u2.ui_addr.ui_db[i];
610 } 646 } else {
611 else {
612 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) 647 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
613 ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i]; 648 ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
614 } 649 }
615 ufsi->i_osync = 0; 650}
616
617 if (S_ISREG(inode->i_mode)) {
618 inode->i_op = &ufs_file_inode_operations;
619 inode->i_fop = &ufs_file_operations;
620 inode->i_mapping->a_ops = &ufs_aops;
621 } else if (S_ISDIR(inode->i_mode)) {
622 inode->i_op = &ufs_dir_inode_operations;
623 inode->i_fop = &ufs_dir_operations;
624 } else if (S_ISLNK(inode->i_mode)) {
625 if (!inode->i_blocks)
626 inode->i_op = &ufs_fast_symlink_inode_operations;
627 else {
628 inode->i_op = &page_symlink_inode_operations;
629 inode->i_mapping->a_ops = &ufs_aops;
630 }
631 } else
632 init_special_inode(inode, inode->i_mode,
633 ufs_get_inode_dev(sb, ufsi));
634
635 brelse (bh);
636
637 UFSD(("EXIT\n"))
638 return;
639
640bad_inode:
641 make_bad_inode(inode);
642 return;
643
644ufs2_inode :
645 UFSD(("Reading ufs2 inode, ino %lu\n", inode->i_ino))
646 651
647 ufs2_inode = (struct ufs2_inode *)(bh->b_data + sizeof(struct ufs2_inode) * ufs_inotofsbo(inode->i_ino)); 652static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
653{
654 struct ufs_inode_info *ufsi = UFS_I(inode);
655 struct super_block *sb = inode->i_sb;
656 mode_t mode;
657 unsigned i;
648 658
659 UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
649 /* 660 /*
650 * Copy data to the in-core inode. 661 * Copy data to the in-core inode.
651 */ 662 */
@@ -668,50 +679,75 @@ ufs2_inode :
668 inode->i_atime.tv_nsec = 0; 679 inode->i_atime.tv_nsec = 0;
669 inode->i_ctime.tv_nsec = 0; 680 inode->i_ctime.tv_nsec = 0;
670 inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks); 681 inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
671 inode->i_blksize = PAGE_SIZE; /*This is the optimal IO size(for stat)*/
672
673 inode->i_version++;
674 ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags); 682 ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags);
675 ufsi->i_gen = fs32_to_cpu(sb, ufs2_inode->ui_gen); 683 ufsi->i_gen = fs32_to_cpu(sb, ufs2_inode->ui_gen);
676 /* 684 /*
677 ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); 685 ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
678 ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); 686 ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
679 */ 687 */
680 ufsi->i_lastfrag= (inode->i_size + uspi->s_fsize- 1) >> uspi->s_fshift;
681 688
682 if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { 689 if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
683 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) 690 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
684 ufsi->i_u1.u2_i_data[i] = 691 ufsi->i_u1.u2_i_data[i] =
685 ufs2_inode->ui_u2.ui_addr.ui_db[i]; 692 ufs2_inode->ui_u2.ui_addr.ui_db[i];
686 } 693 } else {
687 else {
688 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) 694 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
689 ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i]; 695 ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i];
690 } 696 }
697}
698
699void ufs_read_inode(struct inode * inode)
700{
701 struct ufs_inode_info *ufsi = UFS_I(inode);
702 struct super_block * sb;
703 struct ufs_sb_private_info * uspi;
704 struct buffer_head * bh;
705
706 UFSD("ENTER, ino %lu\n", inode->i_ino);
707
708 sb = inode->i_sb;
709 uspi = UFS_SB(sb)->s_uspi;
710
711 if (inode->i_ino < UFS_ROOTINO ||
712 inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) {
713 ufs_warning(sb, "ufs_read_inode", "bad inode number (%lu)\n",
714 inode->i_ino);
715 goto bad_inode;
716 }
717
718 bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
719 if (!bh) {
720 ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n",
721 inode->i_ino);
722 goto bad_inode;
723 }
724 if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
725 struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data;
726
727 ufs2_read_inode(inode,
728 ufs2_inode + ufs_inotofsbo(inode->i_ino));
729 } else {
730 struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data;
731
732 ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino));
733 }
734
735 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat) */
736 inode->i_version++;
737 ufsi->i_lastfrag =
738 (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
739 ufsi->i_dir_start_lookup = 0;
691 ufsi->i_osync = 0; 740 ufsi->i_osync = 0;
692 741
693 if (S_ISREG(inode->i_mode)) { 742 ufs_set_inode_ops(inode);
694 inode->i_op = &ufs_file_inode_operations;
695 inode->i_fop = &ufs_file_operations;
696 inode->i_mapping->a_ops = &ufs_aops;
697 } else if (S_ISDIR(inode->i_mode)) {
698 inode->i_op = &ufs_dir_inode_operations;
699 inode->i_fop = &ufs_dir_operations;
700 } else if (S_ISLNK(inode->i_mode)) {
701 if (!inode->i_blocks)
702 inode->i_op = &ufs_fast_symlink_inode_operations;
703 else {
704 inode->i_op = &page_symlink_inode_operations;
705 inode->i_mapping->a_ops = &ufs_aops;
706 }
707 } else /* TODO : here ...*/
708 init_special_inode(inode, inode->i_mode,
709 ufs_get_inode_dev(sb, ufsi));
710 743
711 brelse(bh); 744 brelse(bh);
712 745
713 UFSD(("EXIT\n")) 746 UFSD("EXIT\n");
714 return; 747 return;
748
749bad_inode:
750 make_bad_inode(inode);
715} 751}
716 752
717static int ufs_update_inode(struct inode * inode, int do_sync) 753static int ufs_update_inode(struct inode * inode, int do_sync)
@@ -724,7 +760,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
724 unsigned i; 760 unsigned i;
725 unsigned flags; 761 unsigned flags;
726 762
727 UFSD(("ENTER, ino %lu\n", inode->i_ino)) 763 UFSD("ENTER, ino %lu\n", inode->i_ino);
728 764
729 sb = inode->i_sb; 765 sb = inode->i_sb;
730 uspi = UFS_SB(sb)->s_uspi; 766 uspi = UFS_SB(sb)->s_uspi;
@@ -785,7 +821,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
785 sync_dirty_buffer(bh); 821 sync_dirty_buffer(bh);
786 brelse (bh); 822 brelse (bh);
787 823
788 UFSD(("EXIT\n")) 824 UFSD("EXIT\n");
789 return 0; 825 return 0;
790} 826}
791 827
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 8d5f98a01c74..abd5f23a426d 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -1,6 +1,9 @@
1/* 1/*
2 * linux/fs/ufs/namei.c 2 * linux/fs/ufs/namei.c
3 * 3 *
4 * Migration to usage of "page cache" in May 2006 by
5 * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
6 *
4 * Copyright (C) 1998 7 * Copyright (C) 1998
5 * Daniel Pirkl <daniel.pirkl@email.cz> 8 * Daniel Pirkl <daniel.pirkl@email.cz>
6 * Charles University, Faculty of Mathematics and Physics 9 * Charles University, Faculty of Mathematics and Physics
@@ -28,21 +31,9 @@
28#include <linux/fs.h> 31#include <linux/fs.h>
29#include <linux/ufs_fs.h> 32#include <linux/ufs_fs.h>
30#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
31#include <linux/buffer_head.h>
32#include "swab.h" /* will go away - see comment in mknod() */ 34#include "swab.h" /* will go away - see comment in mknod() */
33#include "util.h" 35#include "util.h"
34 36
35/*
36#undef UFS_NAMEI_DEBUG
37*/
38#define UFS_NAMEI_DEBUG
39
40#ifdef UFS_NAMEI_DEBUG
41#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
42#else
43#define UFSD(x)
44#endif
45
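The per-file UFSD((...)) macro removed here (and in super.c and truncate.c below) is replaced throughout the patch by a single variadic UFSD(...) call. Its definition lands in "util.h", which this diff does not show, so the following is a reconstruction under that assumption, not quoted from the patch:

	/* Assumed shape of the shared macro in util.h (not shown in this diff): */
	#ifdef CONFIG_UFS_DEBUG
	#define UFSD(f, a...) do { \
		printk("UFSD (%s, %d): %s:", __FILE__, __LINE__, __FUNCTION__); \
		printk(f, ## a); \
	} while (0)
	#else
	#define UFSD(f, a...) /**/
	#endif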
46static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) 37static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
47{ 38{
48 int err = ufs_add_link(dentry, inode); 39 int err = ufs_add_link(dentry, inode);
@@ -88,8 +79,13 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
88static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, 79static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
89 struct nameidata *nd) 80 struct nameidata *nd)
90{ 81{
91 struct inode * inode = ufs_new_inode(dir, mode); 82 struct inode *inode;
92 int err = PTR_ERR(inode); 83 int err;
84
85 UFSD("BEGIN\n");
86 inode = ufs_new_inode(dir, mode);
87 err = PTR_ERR(inode);
88
93 if (!IS_ERR(inode)) { 89 if (!IS_ERR(inode)) {
94 inode->i_op = &ufs_file_inode_operations; 90 inode->i_op = &ufs_file_inode_operations;
95 inode->i_fop = &ufs_file_operations; 91 inode->i_fop = &ufs_file_operations;
@@ -99,6 +95,7 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
99 err = ufs_add_nondir(dentry, inode); 95 err = ufs_add_nondir(dentry, inode);
100 unlock_kernel(); 96 unlock_kernel();
101 } 97 }
98 UFSD("END: err=%d\n", err);
102 return err; 99 return err;
103} 100}
104 101
@@ -205,6 +202,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
205 202
206 inode->i_op = &ufs_dir_inode_operations; 203 inode->i_op = &ufs_dir_inode_operations;
207 inode->i_fop = &ufs_dir_operations; 204 inode->i_fop = &ufs_dir_operations;
205 inode->i_mapping->a_ops = &ufs_aops;
208 206
209 inode_inc_link_count(inode); 207 inode_inc_link_count(inode);
210 208
@@ -231,19 +229,18 @@ out_dir:
231 goto out; 229 goto out;
232} 230}
233 231
234static int ufs_unlink(struct inode * dir, struct dentry *dentry) 232static int ufs_unlink(struct inode *dir, struct dentry *dentry)
235{ 233{
236 struct inode * inode = dentry->d_inode; 234 struct inode * inode = dentry->d_inode;
237 struct buffer_head * bh; 235 struct ufs_dir_entry *de;
238 struct ufs_dir_entry * de; 236 struct page *page;
239 int err = -ENOENT; 237 int err = -ENOENT;
240 238
241 lock_kernel(); 239 de = ufs_find_entry(dir, dentry, &page);
242 de = ufs_find_entry (dentry, &bh);
243 if (!de) 240 if (!de)
244 goto out; 241 goto out;
245 242
246 err = ufs_delete_entry (dir, de, bh); 243 err = ufs_delete_entry(dir, de, page);
247 if (err) 244 if (err)
248 goto out; 245 goto out;
249 246
@@ -251,7 +248,6 @@ static int ufs_unlink(struct inode * dir, struct dentry *dentry)
251 inode_dec_link_count(inode); 248 inode_dec_link_count(inode);
252 err = 0; 249 err = 0;
253out: 250out:
254 unlock_kernel();
255 return err; 251 return err;
256} 252}
257 253
@@ -273,42 +269,42 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
273 return err; 269 return err;
274} 270}
275 271
276static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry, 272static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
277 struct inode * new_dir, struct dentry * new_dentry ) 273 struct inode *new_dir, struct dentry *new_dentry)
278{ 274{
279 struct inode *old_inode = old_dentry->d_inode; 275 struct inode *old_inode = old_dentry->d_inode;
280 struct inode *new_inode = new_dentry->d_inode; 276 struct inode *new_inode = new_dentry->d_inode;
281 struct buffer_head *dir_bh = NULL; 277 struct page *dir_page = NULL;
282 struct ufs_dir_entry *dir_de = NULL; 278 struct ufs_dir_entry * dir_de = NULL;
283 struct buffer_head *old_bh; 279 struct page *old_page;
284 struct ufs_dir_entry *old_de; 280 struct ufs_dir_entry *old_de;
285 int err = -ENOENT; 281 int err = -ENOENT;
286 282
287 lock_kernel(); 283 old_de = ufs_find_entry(old_dir, old_dentry, &old_page);
288 old_de = ufs_find_entry (old_dentry, &old_bh);
289 if (!old_de) 284 if (!old_de)
290 goto out; 285 goto out;
291 286
292 if (S_ISDIR(old_inode->i_mode)) { 287 if (S_ISDIR(old_inode->i_mode)) {
293 err = -EIO; 288 err = -EIO;
294 dir_de = ufs_dotdot(old_inode, &dir_bh); 289 dir_de = ufs_dotdot(old_inode, &dir_page);
295 if (!dir_de) 290 if (!dir_de)
296 goto out_old; 291 goto out_old;
297 } 292 }
298 293
299 if (new_inode) { 294 if (new_inode) {
300 struct buffer_head *new_bh; 295 struct page *new_page;
301 struct ufs_dir_entry *new_de; 296 struct ufs_dir_entry *new_de;
302 297
303 err = -ENOTEMPTY; 298 err = -ENOTEMPTY;
304 if (dir_de && !ufs_empty_dir (new_inode)) 299 if (dir_de && !ufs_empty_dir(new_inode))
305 goto out_dir; 300 goto out_dir;
301
306 err = -ENOENT; 302 err = -ENOENT;
307 new_de = ufs_find_entry (new_dentry, &new_bh); 303 new_de = ufs_find_entry(new_dir, new_dentry, &new_page);
308 if (!new_de) 304 if (!new_de)
309 goto out_dir; 305 goto out_dir;
310 inode_inc_link_count(old_inode); 306 inode_inc_link_count(old_inode);
311 ufs_set_link(new_dir, new_de, new_bh, old_inode); 307 ufs_set_link(new_dir, new_de, new_page, old_inode);
312 new_inode->i_ctime = CURRENT_TIME_SEC; 308 new_inode->i_ctime = CURRENT_TIME_SEC;
313 if (dir_de) 309 if (dir_de)
314 new_inode->i_nlink--; 310 new_inode->i_nlink--;
@@ -329,24 +325,32 @@ static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry,
329 inode_inc_link_count(new_dir); 325 inode_inc_link_count(new_dir);
330 } 326 }
331 327
332 ufs_delete_entry (old_dir, old_de, old_bh); 328 /*
329 * Like most other Unix systems, set the ctime for inodes on a
330 * rename.
331 * inode_dec_link_count() will mark the inode dirty.
332 */
333 old_inode->i_ctime = CURRENT_TIME_SEC;
333 334
335 ufs_delete_entry(old_dir, old_de, old_page);
334 inode_dec_link_count(old_inode); 336 inode_dec_link_count(old_inode);
335 337
336 if (dir_de) { 338 if (dir_de) {
337 ufs_set_link(old_inode, dir_de, dir_bh, new_dir); 339 ufs_set_link(old_inode, dir_de, dir_page, new_dir);
338 inode_dec_link_count(old_dir); 340 inode_dec_link_count(old_dir);
339 } 341 }
340 unlock_kernel();
341 return 0; 342 return 0;
342 343
344
343out_dir: 345out_dir:
344 if (dir_de) 346 if (dir_de) {
345 brelse(dir_bh); 347 kunmap(dir_page);
348 page_cache_release(dir_page);
349 }
346out_old: 350out_old:
347 brelse (old_bh); 351 kunmap(old_page);
352 page_cache_release(old_page);
348out: 353out:
349 unlock_kernel();
350 return err; 354 return err;
351} 355}
352 356
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index fe5ab2aa2899..74ef5e9bedff 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -90,95 +90,84 @@
90#include "swab.h" 90#include "swab.h"
91#include "util.h" 91#include "util.h"
92 92
93#undef UFS_SUPER_DEBUG 93#ifdef CONFIG_UFS_DEBUG
94#undef UFS_SUPER_DEBUG_MORE
95
96
97#undef UFS_SUPER_DEBUG_MORE
98#ifdef UFS_SUPER_DEBUG
99#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
100#else
101#define UFSD(x)
102#endif
103
104#ifdef UFS_SUPER_DEBUG_MORE
105/* 94/*
106 * Print contents of ufs_super_block, useful for debugging 95 * Print contents of ufs_super_block, useful for debugging
107 */ 96 */
108void ufs_print_super_stuff(struct super_block *sb, 97static void ufs_print_super_stuff(struct super_block *sb, unsigned flags,
109 struct ufs_super_block_first * usb1, 98 struct ufs_super_block_first *usb1,
110 struct ufs_super_block_second * usb2, 99 struct ufs_super_block_second *usb2,
111 struct ufs_super_block_third * usb3) 100 struct ufs_super_block_third *usb3)
112{ 101{
113 printk("ufs_print_super_stuff\n"); 102 printk("ufs_print_super_stuff\n");
114 printk("size of usb: %u\n", sizeof(struct ufs_super_block)); 103 printk(" magic: 0x%x\n", fs32_to_cpu(sb, usb3->fs_magic));
115 printk(" magic: 0x%x\n", fs32_to_cpu(sb, usb3->fs_magic)); 104 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
116 printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); 105 printk(" fs_size: %llu\n", (unsigned long long)
117 printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); 106 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size));
118 printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); 107 printk(" fs_dsize: %llu\n", (unsigned long long)
119 printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); 108 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize));
120 printk(" cgoffset: %u\n", fs32_to_cpu(sb, usb1->fs_cgoffset)); 109 printk(" bsize: %u\n",
121 printk(" ~cgmask: 0x%x\n", ~fs32_to_cpu(sb, usb1->fs_cgmask)); 110 fs32_to_cpu(sb, usb1->fs_bsize));
122 printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); 111 printk(" fsize: %u\n",
123 printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); 112 fs32_to_cpu(sb, usb1->fs_fsize));
124 printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); 113 printk(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname);
125 printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); 114 printk(" fs_sblockloc: %llu\n", (unsigned long long)
126 printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); 115 fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc));
127 printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); 116 printk(" cs_ndir(No of dirs): %llu\n", (unsigned long long)
128 printk(" fragshift: %u\n", fs32_to_cpu(sb, usb1->fs_fragshift)); 117 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir));
129 printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); 118 printk(" cs_nbfree(No of free blocks): %llu\n",
130 printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); 119 (unsigned long long)
131 printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); 120 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree));
132 printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); 121 } else {
133 printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); 122 printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
134 printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); 123 printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
135 printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); 124 printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
136 printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); 125 printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
137 printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); 126 printk(" cgoffset: %u\n",
138 printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); 127 fs32_to_cpu(sb, usb1->fs_cgoffset));
139 printk(" fstodb: %u\n", fs32_to_cpu(sb, usb1->fs_fsbtodb)); 128 printk(" ~cgmask: 0x%x\n",
140 printk(" contigsumsize: %d\n", fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize)); 129 ~fs32_to_cpu(sb, usb1->fs_cgmask));
141 printk(" postblformat: %u\n", fs32_to_cpu(sb, usb3->fs_postblformat)); 130 printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size));
142 printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); 131 printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
143 printk(" ndir %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); 132 printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
144 printk(" nifree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); 133 printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
145 printk(" nbfree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); 134 printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
146 printk(" nffree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); 135 printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag));
147 printk("\n"); 136 printk(" fragshift: %u\n",
148} 137 fs32_to_cpu(sb, usb1->fs_fragshift));
149 138 printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
150/* 139 printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
151 * Print contents of ufs2 ufs_super_block, useful for debugging 140 printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
152 */ 141 printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc));
153void ufs2_print_super_stuff( 142 printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
154 struct super_block *sb, 143 printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
155 struct ufs_super_block *usb) 144 printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
156{ 145 printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
157 printk("ufs_print_super_stuff\n"); 146 printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
158 printk("size of usb: %u\n", sizeof(struct ufs_super_block)); 147 printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
159 printk(" magic: 0x%x\n", fs32_to_cpu(sb, usb->fs_magic)); 148 printk(" fstodb: %u\n",
160 printk(" fs_size: %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size)); 149 fs32_to_cpu(sb, usb1->fs_fsbtodb));
161 printk(" fs_dsize: %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize)); 150 printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
162 printk(" bsize: %u\n", fs32_to_cpu(usb, usb->fs_bsize)); 151 printk(" ndir %u\n",
163 printk(" fsize: %u\n", fs32_to_cpu(usb, usb->fs_fsize)); 152 fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
164 printk(" fs_volname: %s\n", usb->fs_u11.fs_u2.fs_volname); 153 printk(" nifree %u\n",
165 printk(" fs_fsmnt: %s\n", usb->fs_u11.fs_u2.fs_fsmnt); 154 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
166 printk(" fs_sblockloc: %u\n",fs64_to_cpu(sb, 155 printk(" nbfree %u\n",
167 usb->fs_u11.fs_u2.fs_sblockloc)); 156 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
168 printk(" cs_ndir(No of dirs): %u\n",fs64_to_cpu(sb, 157 printk(" nffree %u\n",
169 usb->fs_u11.fs_u2.fs_cstotal.cs_ndir)); 158 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
170 printk(" cs_nbfree(No of free blocks): %u\n",fs64_to_cpu(sb, 159 }
171 usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree));
172 printk("\n"); 160 printk("\n");
173} 161}
174 162
175/* 163/*
176 * Print contents of ufs_cylinder_group, useful for debugging 164 * Print contents of ufs_cylinder_group, useful for debugging
177 */ 165 */
178void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group *cg) 166static void ufs_print_cylinder_stuff(struct super_block *sb,
167 struct ufs_cylinder_group *cg)
179{ 168{
180 printk("\nufs_print_cylinder_stuff\n"); 169 printk("\nufs_print_cylinder_stuff\n");
181 printk("size of ucg: %u\n", sizeof(struct ufs_cylinder_group)); 170 printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group));
182 printk(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); 171 printk(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic));
183 printk(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); 172 printk(" time: %u\n", fs32_to_cpu(sb, cg->cg_time));
184 printk(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); 173 printk(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx));
@@ -202,12 +191,18 @@ void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group
202 printk(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); 191 printk(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff));
203 printk(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); 192 printk(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff));
204 printk(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); 193 printk(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff));
205 printk(" clustersumoff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); 194 printk(" clustersumoff %u\n",
206 printk(" clusteroff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); 195 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
207 printk(" nclusterblks %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); 196 printk(" clusteroff %u\n",
197 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
198 printk(" nclusterblks %u\n",
199 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
208 printk("\n"); 200 printk("\n");
209} 201}
210#endif /* UFS_SUPER_DEBUG_MORE */ 202#else
203# define ufs_print_super_stuff(sb, flags, usb1, usb2, usb3) /**/
204# define ufs_print_cylinder_stuff(sb, cg) /**/
205#endif /* CONFIG_UFS_DEBUG */
211 206
212static struct super_operations ufs_super_ops; 207static struct super_operations ufs_super_ops;
213 208
@@ -225,7 +220,7 @@ void ufs_error (struct super_block * sb, const char * function,
225 220
226 if (!(sb->s_flags & MS_RDONLY)) { 221 if (!(sb->s_flags & MS_RDONLY)) {
227 usb1->fs_clean = UFS_FSBAD; 222 usb1->fs_clean = UFS_FSBAD;
228 ubh_mark_buffer_dirty(USPI_UBH); 223 ubh_mark_buffer_dirty(USPI_UBH(uspi));
229 sb->s_dirt = 1; 224 sb->s_dirt = 1;
230 sb->s_flags |= MS_RDONLY; 225 sb->s_flags |= MS_RDONLY;
231 } 226 }
@@ -257,7 +252,7 @@ void ufs_panic (struct super_block * sb, const char * function,
257 252
258 if (!(sb->s_flags & MS_RDONLY)) { 253 if (!(sb->s_flags & MS_RDONLY)) {
259 usb1->fs_clean = UFS_FSBAD; 254 usb1->fs_clean = UFS_FSBAD;
260 ubh_mark_buffer_dirty(USPI_UBH); 255 ubh_mark_buffer_dirty(USPI_UBH(uspi));
261 sb->s_dirt = 1; 256 sb->s_dirt = 1;
262 } 257 }
263 va_start (args, fmt); 258 va_start (args, fmt);
@@ -309,7 +304,7 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
309{ 304{
310 char * p; 305 char * p;
311 306
312 UFSD(("ENTER\n")) 307 UFSD("ENTER\n");
313 308
314 if (!options) 309 if (!options)
315 return 1; 310 return 1;
@@ -386,27 +381,57 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
386} 381}
387 382
388/* 383/*
384 * Different types of UFS hold fs_cstotal in different
385 * places, and use different data structures for it.
386 * To make things simpler we just copy fs_cstotal to ufs_sb_private_info
387 */
388static void ufs_setup_cstotal(struct super_block *sb)
389{
390 struct ufs_sb_info *sbi = UFS_SB(sb);
391 struct ufs_sb_private_info *uspi = sbi->s_uspi;
392 struct ufs_super_block_first *usb1;
393 struct ufs_super_block_second *usb2;
394 struct ufs_super_block_third *usb3;
395 unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
396
397 UFSD("ENTER, mtype=%u\n", mtype);
398 usb1 = ubh_get_usb_first(uspi);
399 usb2 = ubh_get_usb_second(uspi);
400 usb3 = ubh_get_usb_third(uspi);
401
402 if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
403 (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
404 mtype == UFS_MOUNT_UFSTYPE_UFS2) {
405 /* the statistics live in a different place than usual */
406 uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir);
407 uspi->cs_total.cs_nbfree = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree);
408 uspi->cs_total.cs_nifree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree);
409 uspi->cs_total.cs_nffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree);
410 } else {
411 uspi->cs_total.cs_ndir = fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir);
412 uspi->cs_total.cs_nbfree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree);
413 uspi->cs_total.cs_nifree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree);
414 uspi->cs_total.cs_nffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree);
415 }
416 UFSD("EXIT\n");
417}
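With fs_cstotal mirrored into uspi->cs_total at mount time, the allocators (balloc.c and ialloc.c, which are outside this diff) can presumably maintain the in-core copy and let ufs_put_cstotal() below fold it back into whichever on-disk layout applies. An illustrative fragment, offered only as an assumption about those callers:

	/* Illustrative assumption; the allocator changes are not in this diff. */
	uspi->cs_total.cs_nbfree--;	/* one block allocated */
	sb->s_dirt = 1;			/* ufs_write_super() -> ufs_put_cstotal() */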
418
419/*
389 * Read on-disk structures associated with cylinder groups 420 * Read on-disk structures associated with cylinder groups
390 */ 421 */
391static int ufs_read_cylinder_structures (struct super_block *sb) 422static int ufs_read_cylinder_structures(struct super_block *sb)
392{ 423{
393 struct ufs_sb_info * sbi = UFS_SB(sb); 424 struct ufs_sb_info *sbi = UFS_SB(sb);
394 struct ufs_sb_private_info * uspi; 425 struct ufs_sb_private_info *uspi = sbi->s_uspi;
395 struct ufs_super_block *usb; 426 unsigned flags = sbi->s_flags;
396 struct ufs_buffer_head * ubh; 427 struct ufs_buffer_head * ubh;
397 unsigned char * base, * space; 428 unsigned char * base, * space;
398 unsigned size, blks, i; 429 unsigned size, blks, i;
399 unsigned flags = 0; 430 struct ufs_super_block_third *usb3;
400
401 UFSD(("ENTER\n"))
402
403 uspi = sbi->s_uspi;
404 431
405 usb = (struct ufs_super_block *) 432 UFSD("ENTER\n");
406 ((struct ufs_buffer_head *)uspi)->bh[0]->b_data;
407 433
408 flags = UFS_SB(sb)->s_flags; 434 usb3 = ubh_get_usb_third(uspi);
409
410 /* 435 /*
411 * Read cs structures from (usually) first data block 436 * Read cs structures from (usually) first data block
412 * on the device. 437 * on the device.
@@ -424,7 +449,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
424 449
425 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 450 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
426 ubh = ubh_bread(sb, 451 ubh = ubh_bread(sb,
427 fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_csaddr) + i, size); 452 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr) + i, size);
428 else 453 else
429 ubh = ubh_bread(sb, uspi->s_csaddr + i, size); 454 ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
430 455
@@ -451,14 +476,13 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
451 sbi->s_cgno[i] = UFS_CGNO_EMPTY; 476 sbi->s_cgno[i] = UFS_CGNO_EMPTY;
452 } 477 }
453 for (i = 0; i < uspi->s_ncg; i++) { 478 for (i = 0; i < uspi->s_ncg; i++) {
454 UFSD(("read cg %u\n", i)) 479 UFSD("read cg %u\n", i);
455 if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i)))) 480 if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i))))
456 goto failed; 481 goto failed;
457 if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data)) 482 if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data))
458 goto failed; 483 goto failed;
459#ifdef UFS_SUPER_DEBUG_MORE 484
460 ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); 485 ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
461#endif
462 } 486 }
463 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { 487 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
464 if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL))) 488 if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL)))
@@ -466,7 +490,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
466 sbi->s_cgno[i] = UFS_CGNO_EMPTY; 490 sbi->s_cgno[i] = UFS_CGNO_EMPTY;
467 } 491 }
468 sbi->s_cg_loaded = 0; 492 sbi->s_cg_loaded = 0;
469 UFSD(("EXIT\n")) 493 UFSD("EXIT\n");
470 return 1; 494 return 1;
471 495
472failed: 496failed:
@@ -479,26 +503,69 @@ failed:
479 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) 503 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++)
480 kfree (sbi->s_ucpi[i]); 504 kfree (sbi->s_ucpi[i]);
481 } 505 }
482 UFSD(("EXIT (FAILED)\n")) 506 UFSD("EXIT (FAILED)\n");
483 return 0; 507 return 0;
484} 508}
485 509
486/* 510/*
487 * Put on-disk structures associated with cylinder groups and 511 * Sync our internal copy of fs_cstotal with disk
488 * write them back to disk
489 */ 512 */
490static void ufs_put_cylinder_structures (struct super_block *sb) 513static void ufs_put_cstotal(struct super_block *sb)
491{ 514{
492 struct ufs_sb_info * sbi = UFS_SB(sb); 515 unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
493 struct ufs_sb_private_info * uspi; 516 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
517 struct ufs_super_block_first *usb1;
518 struct ufs_super_block_second *usb2;
519 struct ufs_super_block_third *usb3;
520
521 UFSD("ENTER\n");
522 usb1 = ubh_get_usb_first(uspi);
523 usb2 = ubh_get_usb_second(uspi);
524 usb3 = ubh_get_usb_third(uspi);
525
526 if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
527 (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
528 mtype == UFS_MOUNT_UFSTYPE_UFS2) {
529 /* the statistics live in a different place than usual */
530 usb2->fs_un.fs_u2.cs_ndir =
531 cpu_to_fs64(sb, uspi->cs_total.cs_ndir);
532 usb2->fs_un.fs_u2.cs_nbfree =
533 cpu_to_fs64(sb, uspi->cs_total.cs_nbfree);
534 usb3->fs_un1.fs_u2.cs_nifree =
535 cpu_to_fs64(sb, uspi->cs_total.cs_nifree);
536 usb3->fs_un1.fs_u2.cs_nffree =
537 cpu_to_fs64(sb, uspi->cs_total.cs_nffree);
538 } else {
539 usb1->fs_cstotal.cs_ndir =
540 cpu_to_fs32(sb, uspi->cs_total.cs_ndir);
541 usb1->fs_cstotal.cs_nbfree =
542 cpu_to_fs32(sb, uspi->cs_total.cs_nbfree);
543 usb1->fs_cstotal.cs_nifree =
544 cpu_to_fs32(sb, uspi->cs_total.cs_nifree);
545 usb1->fs_cstotal.cs_nffree =
546 cpu_to_fs32(sb, uspi->cs_total.cs_nffree);
547 }
548 ubh_mark_buffer_dirty(USPI_UBH(uspi));
549 UFSD("EXIT\n");
550}
551
552/**
553 * ufs_put_super_internal() - put on-disk internal structures
554 * @sb: pointer to super_block structure
555 * Put on-disk structures associated with cylinder groups
556 * and write them back to disk, also updating cs_total on disk
557 */
558static void ufs_put_super_internal(struct super_block *sb)
559{
560 struct ufs_sb_info *sbi = UFS_SB(sb);
561 struct ufs_sb_private_info *uspi = sbi->s_uspi;
494 struct ufs_buffer_head * ubh; 562 struct ufs_buffer_head * ubh;
495 unsigned char * base, * space; 563 unsigned char * base, * space;
496 unsigned blks, size, i; 564 unsigned blks, size, i;
497
498 UFSD(("ENTER\n"))
499
500 uspi = sbi->s_uspi;
501 565
566
567 UFSD("ENTER\n");
568 ufs_put_cstotal(sb);
502 size = uspi->s_cssize; 569 size = uspi->s_cssize;
503 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 570 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
504 base = space = (char*) sbi->s_csp; 571 base = space = (char*) sbi->s_csp;
@@ -523,7 +590,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb)
523 brelse (sbi->s_ucg[i]); 590 brelse (sbi->s_ucg[i]);
524 kfree (sbi->s_ucg); 591 kfree (sbi->s_ucg);
525 kfree (base); 592 kfree (base);
526 UFSD(("EXIT\n")) 593 UFSD("EXIT\n");
527} 594}
528 595
529static int ufs_fill_super(struct super_block *sb, void *data, int silent) 596static int ufs_fill_super(struct super_block *sb, void *data, int silent)
@@ -533,7 +600,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
533 struct ufs_super_block_first * usb1; 600 struct ufs_super_block_first * usb1;
534 struct ufs_super_block_second * usb2; 601 struct ufs_super_block_second * usb2;
535 struct ufs_super_block_third * usb3; 602 struct ufs_super_block_third * usb3;
536 struct ufs_super_block *usb;
537 struct ufs_buffer_head * ubh; 603 struct ufs_buffer_head * ubh;
538 struct inode *inode; 604 struct inode *inode;
539 unsigned block_size, super_block_size; 605 unsigned block_size, super_block_size;
@@ -544,7 +610,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
544 ubh = NULL; 610 ubh = NULL;
545 flags = 0; 611 flags = 0;
546 612
547 UFSD(("ENTER\n")) 613 UFSD("ENTER\n");
548 614
549 sbi = kmalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); 615 sbi = kmalloc(sizeof(struct ufs_sb_info), GFP_KERNEL);
550 if (!sbi) 616 if (!sbi)
@@ -552,7 +618,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
552 sb->s_fs_info = sbi; 618 sb->s_fs_info = sbi;
553 memset(sbi, 0, sizeof(struct ufs_sb_info)); 619 memset(sbi, 0, sizeof(struct ufs_sb_info));
554 620
555 UFSD(("flag %u\n", (int)(sb->s_flags & MS_RDONLY))) 621 UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
556 622
557#ifndef CONFIG_UFS_FS_WRITE 623#ifndef CONFIG_UFS_FS_WRITE
558 if (!(sb->s_flags & MS_RDONLY)) { 624 if (!(sb->s_flags & MS_RDONLY)) {
@@ -593,7 +659,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
593 the rules */ 659 the rules */
594 switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { 660 switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) {
595 case UFS_MOUNT_UFSTYPE_44BSD: 661 case UFS_MOUNT_UFSTYPE_44BSD:
596 UFSD(("ufstype=44bsd\n")) 662 UFSD("ufstype=44bsd\n");
597 uspi->s_fsize = block_size = 512; 663 uspi->s_fsize = block_size = 512;
598 uspi->s_fmask = ~(512 - 1); 664 uspi->s_fmask = ~(512 - 1);
599 uspi->s_fshift = 9; 665 uspi->s_fshift = 9;
@@ -602,7 +668,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
602 flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; 668 flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
603 break; 669 break;
604 case UFS_MOUNT_UFSTYPE_UFS2: 670 case UFS_MOUNT_UFSTYPE_UFS2:
605 UFSD(("ufstype=ufs2\n")); 671 UFSD("ufstype=ufs2\n");
606 super_block_offset=SBLOCK_UFS2; 672 super_block_offset=SBLOCK_UFS2;
607 uspi->s_fsize = block_size = 512; 673 uspi->s_fsize = block_size = 512;
608 uspi->s_fmask = ~(512 - 1); 674 uspi->s_fmask = ~(512 - 1);
@@ -617,7 +683,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
617 break; 683 break;
618 684
619 case UFS_MOUNT_UFSTYPE_SUN: 685 case UFS_MOUNT_UFSTYPE_SUN:
620 UFSD(("ufstype=sun\n")) 686 UFSD("ufstype=sun\n");
621 uspi->s_fsize = block_size = 1024; 687 uspi->s_fsize = block_size = 1024;
622 uspi->s_fmask = ~(1024 - 1); 688 uspi->s_fmask = ~(1024 - 1);
623 uspi->s_fshift = 10; 689 uspi->s_fshift = 10;
@@ -628,7 +694,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
628 break; 694 break;
629 695
630 case UFS_MOUNT_UFSTYPE_SUNx86: 696 case UFS_MOUNT_UFSTYPE_SUNx86:
631 UFSD(("ufstype=sunx86\n")) 697 UFSD("ufstype=sunx86\n");
632 uspi->s_fsize = block_size = 1024; 698 uspi->s_fsize = block_size = 1024;
633 uspi->s_fmask = ~(1024 - 1); 699 uspi->s_fmask = ~(1024 - 1);
634 uspi->s_fshift = 10; 700 uspi->s_fshift = 10;
@@ -639,7 +705,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
639 break; 705 break;
640 706
641 case UFS_MOUNT_UFSTYPE_OLD: 707 case UFS_MOUNT_UFSTYPE_OLD:
642 UFSD(("ufstype=old\n")) 708 UFSD("ufstype=old\n");
643 uspi->s_fsize = block_size = 1024; 709 uspi->s_fsize = block_size = 1024;
644 uspi->s_fmask = ~(1024 - 1); 710 uspi->s_fmask = ~(1024 - 1);
645 uspi->s_fshift = 10; 711 uspi->s_fshift = 10;
@@ -654,7 +720,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
654 break; 720 break;
655 721
656 case UFS_MOUNT_UFSTYPE_NEXTSTEP: 722 case UFS_MOUNT_UFSTYPE_NEXTSTEP:
657 UFSD(("ufstype=nextstep\n")) 723 UFSD("ufstype=nextstep\n");
658 uspi->s_fsize = block_size = 1024; 724 uspi->s_fsize = block_size = 1024;
659 uspi->s_fmask = ~(1024 - 1); 725 uspi->s_fmask = ~(1024 - 1);
660 uspi->s_fshift = 10; 726 uspi->s_fshift = 10;
@@ -669,7 +735,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
669 break; 735 break;
670 736
671 case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD: 737 case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD:
672 UFSD(("ufstype=nextstep-cd\n")) 738 UFSD("ufstype=nextstep-cd\n");
673 uspi->s_fsize = block_size = 2048; 739 uspi->s_fsize = block_size = 2048;
674 uspi->s_fmask = ~(2048 - 1); 740 uspi->s_fmask = ~(2048 - 1);
675 uspi->s_fshift = 11; 741 uspi->s_fshift = 11;
@@ -684,7 +750,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
684 break; 750 break;
685 751
686 case UFS_MOUNT_UFSTYPE_OPENSTEP: 752 case UFS_MOUNT_UFSTYPE_OPENSTEP:
687 UFSD(("ufstype=openstep\n")) 753 UFSD("ufstype=openstep\n");
688 uspi->s_fsize = block_size = 1024; 754 uspi->s_fsize = block_size = 1024;
689 uspi->s_fmask = ~(1024 - 1); 755 uspi->s_fmask = ~(1024 - 1);
690 uspi->s_fshift = 10; 756 uspi->s_fshift = 10;
@@ -699,7 +765,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
699 break; 765 break;
700 766
701 case UFS_MOUNT_UFSTYPE_HP: 767 case UFS_MOUNT_UFSTYPE_HP:
702 UFSD(("ufstype=hp\n")) 768 UFSD("ufstype=hp\n");
703 uspi->s_fsize = block_size = 1024; 769 uspi->s_fsize = block_size = 1024;
704 uspi->s_fmask = ~(1024 - 1); 770 uspi->s_fmask = ~(1024 - 1);
705 uspi->s_fshift = 10; 771 uspi->s_fshift = 10;
@@ -737,8 +803,6 @@ again:
737 usb1 = ubh_get_usb_first(uspi); 803 usb1 = ubh_get_usb_first(uspi);
738 usb2 = ubh_get_usb_second(uspi); 804 usb2 = ubh_get_usb_second(uspi);
739 usb3 = ubh_get_usb_third(uspi); 805 usb3 = ubh_get_usb_third(uspi);
740 usb = (struct ufs_super_block *)
741 ((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
742 806
743 /* 807 /*
744 * Check ufs magic number 808 * Check ufs magic number
@@ -820,16 +884,12 @@ magic_found:
820 ubh = NULL; 884 ubh = NULL;
821 block_size = uspi->s_fsize; 885 block_size = uspi->s_fsize;
822 super_block_size = uspi->s_sbsize; 886 super_block_size = uspi->s_sbsize;
823 UFSD(("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size)) 887 UFSD("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size);
824 goto again; 888 goto again;
825 } 889 }
826 890
827#ifdef UFS_SUPER_DEBUG_MORE 891
828 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 892 ufs_print_super_stuff(sb, flags, usb1, usb2, usb3);
829 ufs2_print_super_stuff(sb,usb);
830 else
831 ufs_print_super_stuff(sb, usb1, usb2, usb3);
832#endif
833 893
834 /* 894 /*
835 * Check, if file system was correctly unmounted. 895 * Check, if file system was correctly unmounted.
@@ -842,13 +902,13 @@ magic_found:
842 (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) { 902 (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) {
843 switch(usb1->fs_clean) { 903 switch(usb1->fs_clean) {
844 case UFS_FSCLEAN: 904 case UFS_FSCLEAN:
845 UFSD(("fs is clean\n")) 905 UFSD("fs is clean\n");
846 break; 906 break;
847 case UFS_FSSTABLE: 907 case UFS_FSSTABLE:
848 UFSD(("fs is stable\n")) 908 UFSD("fs is stable\n");
849 break; 909 break;
850 case UFS_FSOSF1: 910 case UFS_FSOSF1:
851 UFSD(("fs is DEC OSF/1\n")) 911 UFSD("fs is DEC OSF/1\n");
852 break; 912 break;
853 case UFS_FSACTIVE: 913 case UFS_FSACTIVE:
854 printk("ufs_read_super: fs is active\n"); 914 printk("ufs_read_super: fs is active\n");
@@ -863,8 +923,7 @@ magic_found:
863 sb->s_flags |= MS_RDONLY; 923 sb->s_flags |= MS_RDONLY;
864 break; 924 break;
865 } 925 }
866 } 926 } else {
867 else {
868 printk("ufs_read_super: fs needs fsck\n"); 927 printk("ufs_read_super: fs needs fsck\n");
869 sb->s_flags |= MS_RDONLY; 928 sb->s_flags |= MS_RDONLY;
870 } 929 }
@@ -884,10 +943,9 @@ magic_found:
884 uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask); 943 uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask);
885 944
886 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { 945 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
887 uspi->s_u2_size = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size); 946 uspi->s_u2_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size);
888 uspi->s_u2_dsize = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize); 947 uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
889 } 948 } else {
890 else {
891 uspi->s_size = fs32_to_cpu(sb, usb1->fs_size); 949 uspi->s_size = fs32_to_cpu(sb, usb1->fs_size);
892 uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize); 950 uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize);
893 } 951 }
@@ -901,8 +959,8 @@ magic_found:
901 uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); 959 uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask);
902 uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift); 960 uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift);
903 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); 961 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
904 UFSD(("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift, 962 UFSD("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift,
905 uspi->s_fshift)); 963 uspi->s_fshift);
906 uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift); 964 uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift);
907 uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb); 965 uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb);
908 /* s_sbsize already set */ 966 /* s_sbsize already set */
@@ -922,8 +980,8 @@ magic_found:
922 uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc); 980 uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc);
923 uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg); 981 uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg);
924 uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg); 982 uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg);
925 uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_cpc); 983 uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_un.fs_u1.fs_cpc);
926 uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize); 984 uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_contigsumsize);
927 uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3); 985 uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3);
928 uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3); 986 uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3);
929 uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat); 987 uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat);
@@ -935,12 +993,11 @@ magic_found:
935 * Compute other frequently used values 993
936 */ 994 */
937 uspi->s_fpbmask = uspi->s_fpb - 1; 995 uspi->s_fpbmask = uspi->s_fpb - 1;
938 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { 996 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
939 uspi->s_apbshift = uspi->s_bshift - 3; 997 uspi->s_apbshift = uspi->s_bshift - 3;
940 } 998 else
941 else {
942 uspi->s_apbshift = uspi->s_bshift - 2; 999 uspi->s_apbshift = uspi->s_bshift - 2;
943 } 1000
944 uspi->s_2apbshift = uspi->s_apbshift * 2; 1001 uspi->s_2apbshift = uspi->s_apbshift * 2;
945 uspi->s_3apbshift = uspi->s_apbshift * 3; 1002 uspi->s_3apbshift = uspi->s_apbshift * 3;
946 uspi->s_apb = 1 << uspi->s_apbshift; 1003 uspi->s_apb = 1 << uspi->s_apbshift;
@@ -956,7 +1013,7 @@ magic_found:
956 if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == 1013 if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) ==
957 UFS_MOUNT_UFSTYPE_44BSD) 1014 UFS_MOUNT_UFSTYPE_44BSD)
958 uspi->s_maxsymlinklen = 1015 uspi->s_maxsymlinklen =
959 fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_maxsymlinklen); 1016 fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen);
960 1017
961 sbi->s_flags = flags; 1018 sbi->s_flags = flags;
962 1019
@@ -967,7 +1024,7 @@ magic_found:
967 if (!sb->s_root) 1024 if (!sb->s_root)
968 goto dalloc_failed; 1025 goto dalloc_failed;
969 1026
970 1027 ufs_setup_cstotal(sb);
971 /* 1028 /*
972 * Read cylinder group structures 1029 * Read cylinder group structures
973 */ 1030 */
@@ -975,7 +1032,7 @@ magic_found:
975 if (!ufs_read_cylinder_structures(sb)) 1032 if (!ufs_read_cylinder_structures(sb))
976 goto failed; 1033 goto failed;
977 1034
978 UFSD(("EXIT\n")) 1035 UFSD("EXIT\n");
979 return 0; 1036 return 0;
980 1037
981dalloc_failed: 1038dalloc_failed:
@@ -986,15 +1043,16 @@ failed:
986 kfree (uspi); 1043 kfree (uspi);
987 kfree(sbi); 1044 kfree(sbi);
988 sb->s_fs_info = NULL; 1045 sb->s_fs_info = NULL;
989 UFSD(("EXIT (FAILED)\n")) 1046 UFSD("EXIT (FAILED)\n");
990 return -EINVAL; 1047 return -EINVAL;
991 1048
992failed_nomem: 1049failed_nomem:
993 UFSD(("EXIT (NOMEM)\n")) 1050 UFSD("EXIT (NOMEM)\n");
994 return -ENOMEM; 1051 return -ENOMEM;
995} 1052}
996 1053
997static void ufs_write_super (struct super_block *sb) { 1054static void ufs_write_super(struct super_block *sb)
1055{
998 struct ufs_sb_private_info * uspi; 1056 struct ufs_sb_private_info * uspi;
999 struct ufs_super_block_first * usb1; 1057 struct ufs_super_block_first * usb1;
1000 struct ufs_super_block_third * usb3; 1058 struct ufs_super_block_third * usb3;
@@ -1002,7 +1060,7 @@ static void ufs_write_super (struct super_block *sb) {
1002 1060
1003 lock_kernel(); 1061 lock_kernel();
1004 1062
1005 UFSD(("ENTER\n")) 1063 UFSD("ENTER\n");
1006 flags = UFS_SB(sb)->s_flags; 1064 flags = UFS_SB(sb)->s_flags;
1007 uspi = UFS_SB(sb)->s_uspi; 1065 uspi = UFS_SB(sb)->s_uspi;
1008 usb1 = ubh_get_usb_first(uspi); 1066 usb1 = ubh_get_usb_first(uspi);
@@ -1014,26 +1072,27 @@ static void ufs_write_super (struct super_block *sb) {
1014 || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 1072 || (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
1015 ufs_set_fs_state(sb, usb1, usb3, 1073 ufs_set_fs_state(sb, usb1, usb3,
1016 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); 1074 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
1017 ubh_mark_buffer_dirty (USPI_UBH); 1075 ufs_put_cstotal(sb);
1018 } 1076 }
1019 sb->s_dirt = 0; 1077 sb->s_dirt = 0;
1020 UFSD(("EXIT\n")) 1078 UFSD("EXIT\n");
1021 unlock_kernel(); 1079 unlock_kernel();
1022} 1080}
1023 1081
1024static void ufs_put_super (struct super_block *sb) 1082static void ufs_put_super(struct super_block *sb)
1025{ 1083{
1026 struct ufs_sb_info * sbi = UFS_SB(sb); 1084 struct ufs_sb_info * sbi = UFS_SB(sb);
1027 1085
1028 UFSD(("ENTER\n")) 1086 UFSD("ENTER\n");
1029 1087
1030 if (!(sb->s_flags & MS_RDONLY)) 1088 if (!(sb->s_flags & MS_RDONLY))
1031 ufs_put_cylinder_structures (sb); 1089 ufs_put_super_internal(sb);
1032 1090
1033 ubh_brelse_uspi (sbi->s_uspi); 1091 ubh_brelse_uspi (sbi->s_uspi);
1034 kfree (sbi->s_uspi); 1092 kfree (sbi->s_uspi);
1035 kfree (sbi); 1093 kfree (sbi);
1036 sb->s_fs_info = NULL; 1094 sb->s_fs_info = NULL;
1095 UFSD("EXIT\n");
1037 return; 1096 return;
1038} 1097}
1039 1098
@@ -1062,8 +1121,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1062 return -EINVAL; 1121 return -EINVAL;
1063 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { 1122 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
1064 new_mount_opt |= ufstype; 1123 new_mount_opt |= ufstype;
1065 } 1124 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1066 else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1067 printk("ufstype can't be changed during remount\n"); 1125 printk("ufstype can't be changed during remount\n");
1068 return -EINVAL; 1126 return -EINVAL;
1069 } 1127 }
@@ -1077,20 +1135,19 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1077 * fs was mounted as rw, remounting ro 1135
1078 */ 1136 */
1079 if (*mount_flags & MS_RDONLY) { 1137 if (*mount_flags & MS_RDONLY) {
1080 ufs_put_cylinder_structures(sb); 1138 ufs_put_super_internal(sb);
1081 usb1->fs_time = cpu_to_fs32(sb, get_seconds()); 1139 usb1->fs_time = cpu_to_fs32(sb, get_seconds());
1082 if ((flags & UFS_ST_MASK) == UFS_ST_SUN 1140 if ((flags & UFS_ST_MASK) == UFS_ST_SUN
1083 || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 1141 || (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
1084 ufs_set_fs_state(sb, usb1, usb3, 1142 ufs_set_fs_state(sb, usb1, usb3,
1085 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); 1143 UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
1086 ubh_mark_buffer_dirty (USPI_UBH); 1144 ubh_mark_buffer_dirty (USPI_UBH(uspi));
1087 sb->s_dirt = 0; 1145 sb->s_dirt = 0;
1088 sb->s_flags |= MS_RDONLY; 1146 sb->s_flags |= MS_RDONLY;
1089 } 1147 } else {
1090 /* 1148 /*
1091 * fs was mounted as ro, remounting rw 1149 * fs was mounted as ro, remounting rw
1092 */ 1150 */
1093 else {
1094#ifndef CONFIG_UFS_FS_WRITE 1151#ifndef CONFIG_UFS_FS_WRITE
1095 printk("ufs was compiled with read-only support, " 1152 printk("ufs was compiled with read-only support, "
1096 "can't be mounted as read-write\n"); 1153 "can't be mounted as read-write\n");
@@ -1102,7 +1159,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1102 printk("this ufstype is read-only supported\n"); 1159 printk("this ufstype is read-only supported\n");
1103 return -EINVAL; 1160 return -EINVAL;
1104 } 1161 }
1105 if (!ufs_read_cylinder_structures (sb)) { 1162 if (!ufs_read_cylinder_structures(sb)) {
1106 printk("failed during remounting\n"); 1163 printk("failed during remounting\n");
1107 return -EPERM; 1164 return -EPERM;
1108 } 1165 }
@@ -1113,37 +1170,31 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1113 return 0; 1170 return 0;
1114} 1171}
1115 1172
1116static int ufs_statfs (struct dentry *dentry, struct kstatfs *buf) 1173static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1117{ 1174{
1118 struct super_block *sb = dentry->d_sb; 1175 struct super_block *sb = dentry->d_sb;
1119 struct ufs_sb_private_info * uspi; 1176 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
1120 struct ufs_super_block_first * usb1; 1177 unsigned flags = UFS_SB(sb)->s_flags;
1121 struct ufs_super_block * usb; 1178 struct ufs_super_block_first *usb1;
1122 unsigned flags = 0; 1179 struct ufs_super_block_second *usb2;
1180 struct ufs_super_block_third *usb3;
1123 1181
1124 lock_kernel(); 1182 lock_kernel();
1125 1183
1126 uspi = UFS_SB(sb)->s_uspi; 1184 usb1 = ubh_get_usb_first(uspi);
1127 usb1 = ubh_get_usb_first (uspi); 1185 usb2 = ubh_get_usb_second(uspi);
1128 usb = (struct ufs_super_block *) 1186 usb3 = ubh_get_usb_third(uspi);
1129 ((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
1130 1187
1131 flags = UFS_SB(sb)->s_flags;
1132 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { 1188 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
1133 buf->f_type = UFS2_MAGIC; 1189 buf->f_type = UFS2_MAGIC;
1134 buf->f_blocks = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize); 1190 buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
1135 buf->f_bfree = ufs_blkstofrags(fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree)) + 1191 } else {
1136 fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nffree);
1137 buf->f_ffree = fs64_to_cpu(sb,
1138 usb->fs_u11.fs_u2.fs_cstotal.cs_nifree);
1139 }
1140 else {
1141 buf->f_type = UFS_MAGIC; 1192 buf->f_type = UFS_MAGIC;
1142 buf->f_blocks = uspi->s_dsize; 1193 buf->f_blocks = uspi->s_dsize;
1143 buf->f_bfree = ufs_blkstofrags(fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)) +
1144 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree);
1145 buf->f_ffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree);
1146 } 1194 }
1195 buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
1196 uspi->cs_total.cs_nffree;
1197 buf->f_ffree = uspi->cs_total.cs_nifree;
1147 buf->f_bsize = sb->s_blocksize; 1198 buf->f_bsize = sb->s_blocksize;
1148 buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree)) 1199 buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree))
1149 ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; 1200 ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
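The f_bavail expression implements the classic BSD minfree reserve: unprivileged statfs callers only see the space above the reserved percentage. The arithmetic in isolation, as a standalone sketch with hypothetical numbers:

	#include <stdio.h>

	/* Blocks visible to ordinary users once the minfree reserve is held back. */
	static long bavail(long blocks, long bfree, int minfree_pct)
	{
		long reserved = (blocks / 100) * minfree_pct;

		return bfree > reserved ? bfree - reserved : 0;
	}

	int main(void)
	{
		/* hypothetical fs: 1000 blocks total, 80 free, 5% reserve */
		printf("%ld\n", bavail(1000, 80, 5));   /* prints 30 */
		return 0;
	}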
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 02e86291ef8a..3c3b301f8701 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -49,14 +49,6 @@
49#include "swab.h" 49#include "swab.h"
50#include "util.h" 50#include "util.h"
51 51
52#undef UFS_TRUNCATE_DEBUG
53
54#ifdef UFS_TRUNCATE_DEBUG
55#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
56#else
57#define UFSD(x)
58#endif
59
60/* 52/*
61 * Secure deletion currently doesn't work. It interacts very badly 53 * Secure deletion currently doesn't work. It interacts very badly
62 * with buffers shared with memory mappings, and for that reason 54 * with buffers shared with memory mappings, and for that reason
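The per-file UFSD/UFS_TRUNCATE_DEBUG boilerplate deleted here (and again in util.c below) is presumably replaced by a single variadic definition in a shared ufs header; bare-format call sites such as UFSD("ENTER\n") only work against something of this shape. A sketch, with the CONFIG_UFS_DEBUG switch being an assumption:

	/* One shared definition instead of a copy per file (sketch). */
	#ifdef CONFIG_UFS_DEBUG
	#define UFSD(f, a...)	do {					\
		printk(KERN_DEBUG "UFSD (%s, %d): %s: ",		\
		       __FILE__, __LINE__, __FUNCTION__);		\
		printk(f, ## a);					\
	} while (0)
	#else
	#define UFSD(f, a...)	do { } while (0)
	#endif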
@@ -82,7 +74,7 @@ static int ufs_trunc_direct (struct inode * inode)
82 unsigned i, tmp; 74 unsigned i, tmp;
83 int retry; 75 int retry;
84 76
85 UFSD(("ENTER\n")) 77 UFSD("ENTER\n");
86 78
87 sb = inode->i_sb; 79 sb = inode->i_sb;
88 uspi = UFS_SB(sb)->s_uspi; 80 uspi = UFS_SB(sb)->s_uspi;
@@ -105,7 +97,7 @@ static int ufs_trunc_direct (struct inode * inode)
105 block2 = ufs_fragstoblks (frag3); 97 block2 = ufs_fragstoblks (frag3);
106 } 98 }
107 99
108 UFSD(("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4)) 100 UFSD("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4);
109 101
110 if (frag1 >= frag2) 102 if (frag1 >= frag2)
111 goto next1; 103 goto next1;
@@ -120,9 +112,8 @@ static int ufs_trunc_direct (struct inode * inode)
120 frag1 = ufs_fragnum (frag1); 112 frag1 = ufs_fragnum (frag1);
121 frag2 = ufs_fragnum (frag2); 113 frag2 = ufs_fragnum (frag2);
122 114
123 inode->i_blocks -= (frag2-frag1) << uspi->s_nspfshift;
124 mark_inode_dirty(inode);
125 ufs_free_fragments (inode, tmp + frag1, frag2 - frag1); 115 ufs_free_fragments (inode, tmp + frag1, frag2 - frag1);
116 mark_inode_dirty(inode);
126 frag_to_free = tmp + frag1; 117 frag_to_free = tmp + frag1;
127 118
128next1: 119next1:
@@ -136,8 +127,7 @@ next1:
136 continue; 127 continue;
137 128
138 *p = 0; 129 *p = 0;
139 inode->i_blocks -= uspi->s_nspb; 130
140 mark_inode_dirty(inode);
141 if (free_count == 0) { 131 if (free_count == 0) {
142 frag_to_free = tmp; 132 frag_to_free = tmp;
143 free_count = uspi->s_fpb; 133 free_count = uspi->s_fpb;
@@ -148,6 +138,7 @@ next1:
148 frag_to_free = tmp; 138 frag_to_free = tmp;
149 free_count = uspi->s_fpb; 139 free_count = uspi->s_fpb;
150 } 140 }
141 mark_inode_dirty(inode);
151 } 142 }
152 143
153 if (free_count > 0) 144 if (free_count > 0)
@@ -166,12 +157,12 @@ next1:
166 frag4 = ufs_fragnum (frag4); 157 frag4 = ufs_fragnum (frag4);
167 158
168 *p = 0; 159 *p = 0;
169 inode->i_blocks -= frag4 << uspi->s_nspfshift; 160
170 mark_inode_dirty(inode);
171 ufs_free_fragments (inode, tmp, frag4); 161 ufs_free_fragments (inode, tmp, frag4);
162 mark_inode_dirty(inode);
172 next3: 163 next3:
173 164
174 UFSD(("EXIT\n")) 165 UFSD("EXIT\n");
175 return retry; 166 return retry;
176} 167}
177 168
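Note the pattern of the deletions in this hunk: every manual `inode->i_blocks -= ...` adjustment disappears and only mark_inode_dirty() survives, which implies the accounting moved into the free routines themselves. A hypothetical sketch of that division of labour (the real ufs_free_fragments also does the bitmap and summary work):

	/* Hypothetical: the free path owns i_blocks, callers just dirty the inode. */
	void example_free_fragments(struct inode *inode, unsigned frag, unsigned count)
	{
		struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;

		/* ... return the fragments to the cylinder-group bitmap ... */

		inode->i_blocks -= count << uspi->s_nspfshift;  /* sectors freed */
	}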
@@ -186,7 +177,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
186 unsigned frag_to_free, free_count; 177 unsigned frag_to_free, free_count;
187 int retry; 178 int retry;
188 179
189 UFSD(("ENTER\n")) 180 UFSD("ENTER\n");
190 181
191 sb = inode->i_sb; 182 sb = inode->i_sb;
192 uspi = UFS_SB(sb)->s_uspi; 183 uspi = UFS_SB(sb)->s_uspi;
@@ -227,7 +218,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
227 frag_to_free = tmp; 218 frag_to_free = tmp;
228 free_count = uspi->s_fpb; 219 free_count = uspi->s_fpb;
229 } 220 }
230 inode->i_blocks -= uspi->s_nspb; 221
231 mark_inode_dirty(inode); 222 mark_inode_dirty(inode);
232 } 223 }
233 224
@@ -238,26 +229,21 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
238 if (*ubh_get_addr32(ind_ubh,i)) 229 if (*ubh_get_addr32(ind_ubh,i))
239 break; 230 break;
240 if (i >= uspi->s_apb) { 231 if (i >= uspi->s_apb) {
241 if (ubh_max_bcount(ind_ubh) != 1) { 232 tmp = fs32_to_cpu(sb, *p);
242 retry = 1; 233 *p = 0;
243 } 234
244 else { 235 ufs_free_blocks (inode, tmp, uspi->s_fpb);
245 tmp = fs32_to_cpu(sb, *p); 236 mark_inode_dirty(inode);
246 *p = 0; 237 ubh_bforget(ind_ubh);
247 inode->i_blocks -= uspi->s_nspb; 238 ind_ubh = NULL;
248 mark_inode_dirty(inode);
249 ufs_free_blocks (inode, tmp, uspi->s_fpb);
250 ubh_bforget(ind_ubh);
251 ind_ubh = NULL;
252 }
253 } 239 }
254 if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) { 240 if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) {
255 ubh_ll_rw_block (SWRITE, 1, &ind_ubh); 241 ubh_ll_rw_block(SWRITE, ind_ubh);
256 ubh_wait_on_buffer (ind_ubh); 242 ubh_wait_on_buffer (ind_ubh);
257 } 243 }
258 ubh_brelse (ind_ubh); 244 ubh_brelse (ind_ubh);
259 245
260 UFSD(("EXIT\n")) 246 UFSD("EXIT\n");
261 247
262 return retry; 248 return retry;
263} 249}
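With ubh_max_bcount() gone (deleted from util.c further down), a fully empty indirect block is no longer deferred with retry = 1 when other buffer references exist; it is unhooked and dropped in one pass. The new shape, annotated:

	/* All uspi->s_apb slots scanned and found zero: free the block itself. */
	if (i >= uspi->s_apb) {
		tmp = fs32_to_cpu(sb, *p);
		*p = 0;                         /* unhook from the parent block */
		ufs_free_blocks(inode, tmp, uspi->s_fpb);
		mark_inode_dirty(inode);
		ubh_bforget(ind_ubh);           /* discard without writeback */
		ind_ubh = NULL;
	}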
@@ -271,7 +257,7 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
271 __fs32 * dind; 257 __fs32 * dind;
272 int retry = 0; 258 int retry = 0;
273 259
274 UFSD(("ENTER\n")) 260 UFSD("ENTER\n");
275 261
276 sb = inode->i_sb; 262 sb = inode->i_sb;
277 uspi = UFS_SB(sb)->s_uspi; 263 uspi = UFS_SB(sb)->s_uspi;
@@ -306,25 +292,21 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
306 if (*ubh_get_addr32 (dind_bh, i)) 292 if (*ubh_get_addr32 (dind_bh, i))
307 break; 293 break;
308 if (i >= uspi->s_apb) { 294 if (i >= uspi->s_apb) {
309 if (ubh_max_bcount(dind_bh) != 1) 295 tmp = fs32_to_cpu(sb, *p);
310 retry = 1; 296 *p = 0;
311 else { 297
312 tmp = fs32_to_cpu(sb, *p); 298 ufs_free_blocks(inode, tmp, uspi->s_fpb);
313 *p = 0; 299 mark_inode_dirty(inode);
314 inode->i_blocks -= uspi->s_nspb; 300 ubh_bforget(dind_bh);
315 mark_inode_dirty(inode); 301 dind_bh = NULL;
316 ufs_free_blocks (inode, tmp, uspi->s_fpb);
317 ubh_bforget(dind_bh);
318 dind_bh = NULL;
319 }
320 } 302 }
321 if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) { 303 if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) {
322 ubh_ll_rw_block (SWRITE, 1, &dind_bh); 304 ubh_ll_rw_block(SWRITE, dind_bh);
323 ubh_wait_on_buffer (dind_bh); 305 ubh_wait_on_buffer (dind_bh);
324 } 306 }
325 ubh_brelse (dind_bh); 307 ubh_brelse (dind_bh);
326 308
327 UFSD(("EXIT\n")) 309 UFSD("EXIT\n");
328 310
329 return retry; 311 return retry;
330} 312}
@@ -339,7 +321,7 @@ static int ufs_trunc_tindirect (struct inode * inode)
339 __fs32 * tind, * p; 321 __fs32 * tind, * p;
340 int retry; 322 int retry;
341 323
342 UFSD(("ENTER\n")) 324 UFSD("ENTER\n");
343 325
344 sb = inode->i_sb; 326 sb = inode->i_sb;
345 uspi = UFS_SB(sb)->s_uspi; 327 uspi = UFS_SB(sb)->s_uspi;
@@ -370,25 +352,21 @@ static int ufs_trunc_tindirect (struct inode * inode)
370 if (*ubh_get_addr32 (tind_bh, i)) 352 if (*ubh_get_addr32 (tind_bh, i))
371 break; 353 break;
372 if (i >= uspi->s_apb) { 354 if (i >= uspi->s_apb) {
373 if (ubh_max_bcount(tind_bh) != 1) 355 tmp = fs32_to_cpu(sb, *p);
374 retry = 1; 356 *p = 0;
375 else { 357
376 tmp = fs32_to_cpu(sb, *p); 358 ufs_free_blocks(inode, tmp, uspi->s_fpb);
377 *p = 0; 359 mark_inode_dirty(inode);
378 inode->i_blocks -= uspi->s_nspb; 360 ubh_bforget(tind_bh);
379 mark_inode_dirty(inode); 361 tind_bh = NULL;
380 ufs_free_blocks (inode, tmp, uspi->s_fpb);
381 ubh_bforget(tind_bh);
382 tind_bh = NULL;
383 }
384 } 362 }
385 if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) { 363 if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) {
386 ubh_ll_rw_block (SWRITE, 1, &tind_bh); 364 ubh_ll_rw_block(SWRITE, tind_bh);
387 ubh_wait_on_buffer (tind_bh); 365 ubh_wait_on_buffer (tind_bh);
388 } 366 }
389 ubh_brelse (tind_bh); 367 ubh_brelse (tind_bh);
390 368
391 UFSD(("EXIT\n")) 369 UFSD("EXIT\n");
392 return retry; 370 return retry;
393} 371}
394 372
@@ -399,7 +377,7 @@ void ufs_truncate (struct inode * inode)
399 struct ufs_sb_private_info * uspi; 377 struct ufs_sb_private_info * uspi;
400 int retry; 378 int retry;
401 379
402 UFSD(("ENTER\n")) 380 UFSD("ENTER\n");
403 sb = inode->i_sb; 381 sb = inode->i_sb;
404 uspi = UFS_SB(sb)->s_uspi; 382 uspi = UFS_SB(sb)->s_uspi;
405 383
@@ -430,5 +408,5 @@ void ufs_truncate (struct inode * inode)
430 ufsi->i_lastfrag = DIRECT_FRAGMENT; 408 ufsi->i_lastfrag = DIRECT_FRAGMENT;
431 unlock_kernel(); 409 unlock_kernel();
432 mark_inode_dirty(inode); 410 mark_inode_dirty(inode);
433 UFSD(("EXIT\n")) 411 UFSD("EXIT\n");
434} 412}
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 59acc8f073ac..a2f13f45708b 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -14,15 +14,6 @@
14#include "swab.h" 14#include "swab.h"
15#include "util.h" 15#include "util.h"
16 16
17#undef UFS_UTILS_DEBUG
18
19#ifdef UFS_UTILS_DEBUG
20#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
21#else
22#define UFSD(x)
23#endif
24
25
26struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi, 17struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
27 struct super_block *sb, u64 fragment, u64 size) 18 struct super_block *sb, u64 fragment, u64 size)
28{ 19{
@@ -63,17 +54,17 @@ struct ufs_buffer_head * ubh_bread_uspi (struct ufs_sb_private_info * uspi,
63 count = size >> uspi->s_fshift; 54 count = size >> uspi->s_fshift;
64 if (count <= 0 || count > UFS_MAXFRAG) 55 if (count <= 0 || count > UFS_MAXFRAG)
65 return NULL; 56 return NULL;
66 USPI_UBH->fragment = fragment; 57 USPI_UBH(uspi)->fragment = fragment;
67 USPI_UBH->count = count; 58 USPI_UBH(uspi)->count = count;
68 for (i = 0; i < count; i++) 59 for (i = 0; i < count; i++)
69 if (!(USPI_UBH->bh[i] = sb_bread(sb, fragment + i))) 60 if (!(USPI_UBH(uspi)->bh[i] = sb_bread(sb, fragment + i)))
70 goto failed; 61 goto failed;
71 for (; i < UFS_MAXFRAG; i++) 62 for (; i < UFS_MAXFRAG; i++)
72 USPI_UBH->bh[i] = NULL; 63 USPI_UBH(uspi)->bh[i] = NULL;
73 return USPI_UBH; 64 return USPI_UBH(uspi);
74failed: 65failed:
75 for (j = 0; j < i; j++) 66 for (j = 0; j < i; j++)
76 brelse (USPI_UBH->bh[j]); 67 brelse (USPI_UBH(uspi)->bh[j]);
77 return NULL; 68 return NULL;
78} 69}
79 70
@@ -90,11 +81,11 @@ void ubh_brelse (struct ufs_buffer_head * ubh)
90void ubh_brelse_uspi (struct ufs_sb_private_info * uspi) 81void ubh_brelse_uspi (struct ufs_sb_private_info * uspi)
91{ 82{
92 unsigned i; 83 unsigned i;
93 if (!USPI_UBH) 84 if (!USPI_UBH(uspi))
94 return; 85 return;
95 for ( i = 0; i < USPI_UBH->count; i++ ) { 86 for ( i = 0; i < USPI_UBH(uspi)->count; i++ ) {
96 brelse (USPI_UBH->bh[i]); 87 brelse (USPI_UBH(uspi)->bh[i]);
97 USPI_UBH->bh[i] = NULL; 88 USPI_UBH(uspi)->bh[i] = NULL;
98 } 89 }
99} 90}
100 91
@@ -121,13 +112,12 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
121 } 112 }
122} 113}
123 114
124void ubh_ll_rw_block (int rw, unsigned nr, struct ufs_buffer_head * ubh[]) 115void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh)
125{ 116{
126 unsigned i;
127 if (!ubh) 117 if (!ubh)
128 return; 118 return;
129 for ( i = 0; i < nr; i++ ) 119
130 ll_rw_block (rw, ubh[i]->count, ubh[i]->bh); 120 ll_rw_block(rw, ubh->count, ubh->bh);
131} 121}
132 122
133void ubh_wait_on_buffer (struct ufs_buffer_head * ubh) 123void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
@@ -139,18 +129,6 @@ void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
139 wait_on_buffer (ubh->bh[i]); 129 wait_on_buffer (ubh->bh[i]);
140} 130}
141 131
142unsigned ubh_max_bcount (struct ufs_buffer_head * ubh)
143{
144 unsigned i;
145 unsigned max = 0;
146 if (!ubh)
147 return 0;
148 for ( i = 0; i < ubh->count; i++ )
149 if ( atomic_read(&ubh->bh[i]->b_count) > max )
150 max = atomic_read(&ubh->bh[i]->b_count);
151 return max;
152}
153
154void ubh_bforget (struct ufs_buffer_head * ubh) 132void ubh_bforget (struct ufs_buffer_head * ubh)
155{ 133{
156 unsigned i; 134 unsigned i;
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 48d6d9bcc157..406981fff5e7 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -17,10 +17,16 @@
17#define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len)) 17#define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len))
18 18
19/* 19/*
20 * macros used for retyping 20 * functions used for retyping
21 */ 21 */
22#define UCPI_UBH ((struct ufs_buffer_head *)ucpi) 22static inline struct ufs_buffer_head *UCPI_UBH(struct ufs_cg_private_info *cpi)
23#define USPI_UBH ((struct ufs_buffer_head *)uspi) 23{
24 return &cpi->c_ubh;
25}
26static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi)
27{
28 return &spi->s_ubh;
29}
24 30
25 31
26 32
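The old cast macros compiled for any pointer in scope and silently depended on the buffer head being the first member of each private struct; the inline replacements name the member, so the compiler checks the argument type and the field is free to move. Side by side:

	/* Before: blind cast, no type check, layout-dependent. */
	#define USPI_UBH ((struct ufs_buffer_head *)uspi)

	/* After: type-checked and layout-independent. */
	static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi)
	{
		return &spi->s_ubh;
	}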
@@ -33,12 +39,12 @@ ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
33{ 39{
34 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { 40 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
35 case UFS_ST_SUN: 41 case UFS_ST_SUN:
36 return fs32_to_cpu(sb, usb3->fs_u2.fs_sun.fs_state); 42 return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state);
37 case UFS_ST_SUNx86: 43 case UFS_ST_SUNx86:
38 return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state); 44 return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state);
39 case UFS_ST_44BSD: 45 case UFS_ST_44BSD:
40 default: 46 default:
41 return fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_state); 47 return fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_state);
42 } 48 }
43} 49}
44 50
@@ -48,13 +54,13 @@ ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
48{ 54{
49 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { 55 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
50 case UFS_ST_SUN: 56 case UFS_ST_SUN:
51 usb3->fs_u2.fs_sun.fs_state = cpu_to_fs32(sb, value); 57 usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value);
52 break; 58 break;
53 case UFS_ST_SUNx86: 59 case UFS_ST_SUNx86:
54 usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value); 60 usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value);
55 break; 61 break;
56 case UFS_ST_44BSD: 62 case UFS_ST_44BSD:
57 usb3->fs_u2.fs_44.fs_state = cpu_to_fs32(sb, value); 63 usb3->fs_un2.fs_44.fs_state = cpu_to_fs32(sb, value);
58 break; 64 break;
59 } 65 }
60} 66}
@@ -64,7 +70,7 @@ ufs_get_fs_npsect(struct super_block *sb, struct ufs_super_block_first *usb1,
64 struct ufs_super_block_third *usb3) 70 struct ufs_super_block_third *usb3)
65{ 71{
66 if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) 72 if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
67 return fs32_to_cpu(sb, usb3->fs_u2.fs_sunx86.fs_npsect); 73 return fs32_to_cpu(sb, usb3->fs_un2.fs_sunx86.fs_npsect);
68 else 74 else
69 return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect); 75 return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect);
70} 76}
@@ -76,16 +82,16 @@ ufs_get_fs_qbmask(struct super_block *sb, struct ufs_super_block_third *usb3)
76 82
77 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { 83 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
78 case UFS_ST_SUN: 84 case UFS_ST_SUN:
79 ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qbmask[0]; 85 ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qbmask[0];
80 ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qbmask[1]; 86 ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qbmask[1];
81 break; 87 break;
82 case UFS_ST_SUNx86: 88 case UFS_ST_SUNx86:
83 ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qbmask[0]; 89 ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qbmask[0];
84 ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qbmask[1]; 90 ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qbmask[1];
85 break; 91 break;
86 case UFS_ST_44BSD: 92 case UFS_ST_44BSD:
87 ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qbmask[0]; 93 ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qbmask[0];
88 ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qbmask[1]; 94 ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qbmask[1];
89 break; 95 break;
90 } 96 }
91 97
@@ -99,16 +105,16 @@ ufs_get_fs_qfmask(struct super_block *sb, struct ufs_super_block_third *usb3)
99 105
100 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { 106 switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
101 case UFS_ST_SUN: 107 case UFS_ST_SUN:
102 ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qfmask[0]; 108 ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qfmask[0];
103 ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qfmask[1]; 109 ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qfmask[1];
104 break; 110 break;
105 case UFS_ST_SUNx86: 111 case UFS_ST_SUNx86:
106 ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qfmask[0]; 112 ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qfmask[0];
107 ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qfmask[1]; 113 ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qfmask[1];
108 break; 114 break;
109 case UFS_ST_44BSD: 115 case UFS_ST_44BSD:
110 ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qfmask[0]; 116 ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qfmask[0];
111 ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qfmask[1]; 117 ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qfmask[1];
112 break; 118 break;
113 } 119 }
114 120
@@ -236,9 +242,8 @@ extern void ubh_brelse (struct ufs_buffer_head *);
236extern void ubh_brelse_uspi (struct ufs_sb_private_info *); 242extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
237extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); 243extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
238extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int); 244extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
239extern void ubh_ll_rw_block (int, unsigned, struct ufs_buffer_head **); 245extern void ubh_ll_rw_block(int, struct ufs_buffer_head *);
240extern void ubh_wait_on_buffer (struct ufs_buffer_head *); 246extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
241extern unsigned ubh_max_bcount (struct ufs_buffer_head *);
242extern void ubh_bforget (struct ufs_buffer_head *); 247extern void ubh_bforget (struct ufs_buffer_head *);
243extern int ubh_buffer_dirty (struct ufs_buffer_head *); 248extern int ubh_buffer_dirty (struct ufs_buffer_head *);
244#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size) 249#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
@@ -297,40 +302,26 @@ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi,
297#define ubh_blkmap(ubh,begin,bit) \ 302#define ubh_blkmap(ubh,begin,bit) \
298 ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb))) 303 ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb)))
299 304
300
301/*
302 * Macros for access to superblock array structures
303 */
304#define ubh_postbl(ubh,cylno,i) \
305 ((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \
306 ? (*(__s16*)(ubh_get_addr(ubh, \
307 (unsigned)(&((struct ufs_super_block *)0)->fs_opostbl) \
308 + (((cylno) * 16 + (i)) << 1) ) )) \
309 : (*(__s16*)(ubh_get_addr(ubh, \
310 uspi->s_postbloff + (((cylno) * uspi->s_nrpos + (i)) << 1) ))))
311
312#define ubh_rotbl(ubh,i) \
313 ((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \
314 ? (*(__u8*)(ubh_get_addr(ubh, \
315 (unsigned)(&((struct ufs_super_block *)0)->fs_space) + (i)))) \
316 : (*(__u8*)(ubh_get_addr(ubh, uspi->s_rotbloff + (i)))))
317
318/* 305/*
319 * Determine the number of available frags given a 306 * Determine the number of available frags given a
320 * percentage to hold in reserve. 307 * percentage to hold in reserve.
321 */ 308 */
322#define ufs_freespace(usb, percentreserved) \ 309static inline u64
323 (ufs_blkstofrags(fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nbfree)) + \ 310ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved)
324 fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nffree) - (uspi->s_dsize * (percentreserved) / 100)) 311{
312 return ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
313 uspi->cs_total.cs_nffree -
314 (uspi->s_dsize * (percentreserved) / 100);
315}
325 316
326/* 317/*
327 * Macros to access cylinder group array structures 318 * Macros to access cylinder group array structures
328 */ 319 */
329#define ubh_cg_blktot(ucpi,cylno) \ 320#define ubh_cg_blktot(ucpi,cylno) \
330 (*((__fs32*)ubh_get_addr(UCPI_UBH, (ucpi)->c_btotoff + ((cylno) << 2)))) 321 (*((__fs32*)ubh_get_addr(UCPI_UBH(ucpi), (ucpi)->c_btotoff + ((cylno) << 2))))
331 322
332#define ubh_cg_blks(ucpi,cylno,rpos) \ 323#define ubh_cg_blks(ucpi,cylno,rpos) \
333 (*((__fs16*)ubh_get_addr(UCPI_UBH, \ 324 (*((__fs16*)ubh_get_addr(UCPI_UBH(ucpi), \
334 (ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 )))) 325 (ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 ))))
335 326
336/* 327/*
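ufs_freespace() now reads the in-core cs_total summary rather than re-swabbing on-disk fields, and takes uspi explicitly instead of capturing sb and usb from the caller's scope. A hypothetical allocator-side use (the capable() escape hatch for privileged tasks is an assumption, not part of this patch):

	/* Refuse an unprivileged allocation once inside the minfree reserve. */
	if (ufs_freespace(uspi, uspi->s_minfree) <= 0 &&
	    !capable(CAP_SYS_RESOURCE))
		return 0;       /* no fragments for ordinary users */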
@@ -508,29 +499,3 @@ static inline void ufs_fragacct (struct super_block * sb, unsigned blockmap,
508 if (fragsize > 0 && fragsize < uspi->s_fpb) 499 if (fragsize > 0 && fragsize < uspi->s_fpb)
509 fs32_add(sb, &fraglist[fragsize], cnt); 500 fs32_add(sb, &fraglist[fragsize], cnt);
510} 501}
511
512#define ubh_scanc(ubh,begin,size,table,mask) _ubh_scanc_(uspi,ubh,begin,size,table,mask)
513static inline unsigned _ubh_scanc_(struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh,
514 unsigned begin, unsigned size, unsigned char * table, unsigned char mask)
515{
516 unsigned rest, offset;
517 unsigned char * cp;
518
519
520 offset = begin & ~uspi->s_fmask;
521 begin >>= uspi->s_fshift;
522 for (;;) {
523 if ((offset + size) < uspi->s_fsize)
524 rest = size;
525 else
526 rest = uspi->s_fsize - offset;
527 size -= rest;
528 cp = ubh->bh[begin]->b_data + offset;
529 while ((table[*cp++] & mask) == 0 && --rest);
530 if (rest || !size)
531 break;
532 begin++;
533 offset = 0;
534 }
535 return (size + rest);
536}
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 70662371bb11..3d4f6dff2113 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -299,7 +299,8 @@ xfs_file_open(
299 299
300STATIC int 300STATIC int
301xfs_file_close( 301xfs_file_close(
302 struct file *filp) 302 struct file *filp,
303 fl_owner_t id)
303{ 304{
304 return -bhv_vop_close(vn_from_inode(filp->f_dentry->d_inode), 0, 305 return -bhv_vop_close(vn_from_inode(filp->f_dentry->d_inode), 0,
305 file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL); 306 file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL);
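This hunk tracks a VFS interface change: the flush file operation (which xfs wires to xfs_file_close) gained an fl_owner_t argument identifying the POSIX lock owner performing the close. Every flush implementation must grow the parameter even when, as here, it does not use it; the minimal shape is:

	/* Post-change ->flush signature; the owner id may simply be ignored. */
	static int example_flush(struct file *filp, fl_owner_t id)
	{
		return 0;       /* no per-owner state to flush in this sketch */
	}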
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 10dbf203c62f..ed7579beb6b0 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1721,15 +1721,14 @@ xfs_mount_log_sbunit(
1721 * is present to prevent thrashing). 1721 * is present to prevent thrashing).
1722 */ 1722 */
1723 1723
1724#ifdef CONFIG_HOTPLUG_CPU
1724/* 1725/*
1725 * hot-plug CPU notifier support. 1726 * hot-plug CPU notifier support.
1726 * 1727 *
1727 * We cannot use the hotcpu_register() function because it does 1728 * We need a notifier per filesystem as we need to be able to identify
1728 * not allow notifier instances. We need a notifier per filesystem 1729 * the filesystem to balance the counters out. This is achieved by
1729 * as we need to be able to identify the filesystem to balance 1730 * having a notifier block embedded in the xfs_mount_t and doing pointer
1730 * the counters out. This is achieved by having a notifier block 1731 * magic to get the mount pointer from the notifier block address.
1731 * embedded in the xfs_mount_t and doing pointer magic to get the
1732 * mount pointer from the notifier block address.
1733 */ 1732 */
1734STATIC int 1733STATIC int
1735xfs_icsb_cpu_notify( 1734xfs_icsb_cpu_notify(
@@ -1779,6 +1778,7 @@ xfs_icsb_cpu_notify(
1779 1778
1780 return NOTIFY_OK; 1779 return NOTIFY_OK;
1781} 1780}
1781#endif /* CONFIG_HOTPLUG_CPU */
1782 1782
1783int 1783int
1784xfs_icsb_init_counters( 1784xfs_icsb_init_counters(
@@ -1791,9 +1791,11 @@ xfs_icsb_init_counters(
1791 if (mp->m_sb_cnts == NULL) 1791 if (mp->m_sb_cnts == NULL)
1792 return -ENOMEM; 1792 return -ENOMEM;
1793 1793
1794#ifdef CONFIG_HOTPLUG_CPU
1794 mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; 1795 mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
1795 mp->m_icsb_notifier.priority = 0; 1796 mp->m_icsb_notifier.priority = 0;
1796 register_cpu_notifier(&mp->m_icsb_notifier); 1797 register_hotcpu_notifier(&mp->m_icsb_notifier);
1798#endif /* CONFIG_HOTPLUG_CPU */
1797 1799
1798 for_each_online_cpu(i) { 1800 for_each_online_cpu(i) {
1799 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); 1801 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
@@ -1812,7 +1814,7 @@ xfs_icsb_destroy_counters(
1812 xfs_mount_t *mp) 1814 xfs_mount_t *mp)
1813{ 1815{
1814 if (mp->m_sb_cnts) { 1816 if (mp->m_sb_cnts) {
1815 unregister_cpu_notifier(&mp->m_icsb_notifier); 1817 unregister_hotcpu_notifier(&mp->m_icsb_notifier);
1816 free_percpu(mp->m_sb_cnts); 1818 free_percpu(mp->m_sb_cnts);
1817 } 1819 }
1818} 1820}
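The "pointer magic" the rewritten comment refers to is the standard container_of idiom: embed a notifier_block in the per-mount structure and recover the mount from the block's address inside the callback. Sketched with hypothetical names (the hotcpu wrappers become no-ops when CONFIG_HOTPLUG_CPU is off, which is why the new #ifdefs can bracket the callback wholesale):

	#include <linux/cpu.h>
	#include <linux/notifier.h>
	#include <linux/kernel.h>

	struct example_mount {
		/* ... per-cpu superblock counters ... */
		struct notifier_block m_cpu_notifier;   /* one per mount */
	};

	static int example_cpu_notify(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
	{
		/* Recover the owning mount from the embedded block. */
		struct example_mount *mp =
			container_of(nfb, struct example_mount, m_cpu_notifier);

		switch (action) {
		case CPU_ONLINE:
			/* fold the new CPU (hcpu) into mp's counters */
			break;
		case CPU_DEAD:
			/* drain the dead CPU's counters into mp's totals */
			break;
		}
		return NOTIFY_OK;
	}

	static void example_register(struct example_mount *mp)
	{
		mp->m_cpu_notifier.notifier_call = example_cpu_notify;
		mp->m_cpu_notifier.priority = 0;
		register_hotcpu_notifier(&mp->m_cpu_notifier);
	}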