Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_inode.c | 2
-rw-r--r--  fs/Kconfig | 25
-rw-r--r--  fs/Kconfig.binfmt | 2
-rw-r--r--  fs/adfs/super.c | 3
-rw-r--r--  fs/affs/super.c | 3
-rw-r--r--  fs/afs/Makefile | 7
-rw-r--r--  fs/afs/afs.h | 146
-rw-r--r--  fs/afs/afs_cm.h | 32
-rw-r--r--  fs/afs/afs_fs.h | 48
-rw-r--r--  fs/afs/afs_vl.h (renamed from fs/afs/vlclient.h) | 49
-rw-r--r--  fs/afs/cache.c | 256
-rw-r--r--  fs/afs/cache.h | 12
-rw-r--r--  fs/afs/callback.c | 509
-rw-r--r--  fs/afs/cell.c | 471
-rw-r--r--  fs/afs/cell.h | 78
-rw-r--r--  fs/afs/cmservice.c | 927
-rw-r--r--  fs/afs/cmservice.h | 29
-rw-r--r--  fs/afs/dir.c | 855
-rw-r--r--  fs/afs/errors.h | 34
-rw-r--r--  fs/afs/file.c | 124
-rw-r--r--  fs/afs/fsclient.c | 1529
-rw-r--r--  fs/afs/fsclient.h | 54
-rw-r--r--  fs/afs/inode.c | 248
-rw-r--r--  fs/afs/internal.h | 754
-rw-r--r--  fs/afs/kafsasyncd.c | 255
-rw-r--r--  fs/afs/kafsasyncd.h | 52
-rw-r--r--  fs/afs/kafstimod.c | 205
-rw-r--r--  fs/afs/kafstimod.h | 49
-rw-r--r--  fs/afs/main.c | 262
-rw-r--r--  fs/afs/misc.c | 38
-rw-r--r--  fs/afs/mntpt.c | 150
-rw-r--r--  fs/afs/mount.h | 23
-rw-r--r--  fs/afs/netdevices.c | 68
-rw-r--r--  fs/afs/proc.c | 230
-rw-r--r--  fs/afs/rxrpc.c | 782
-rw-r--r--  fs/afs/security.c | 356
-rw-r--r--  fs/afs/server.c | 647
-rw-r--r--  fs/afs/server.h | 102
-rw-r--r--  fs/afs/super.c | 395
-rw-r--r--  fs/afs/super.h | 45
-rw-r--r--  fs/afs/transport.h | 21
-rw-r--r--  fs/afs/types.h | 125
-rw-r--r--  fs/afs/vlclient.c | 737
-rw-r--r--  fs/afs/vlocation.c | 1224
-rw-r--r--  fs/afs/vnode.c | 731
-rw-r--r--  fs/afs/vnode.h | 94
-rw-r--r--  fs/afs/volume.c | 290
-rw-r--r--  fs/afs/volume.h | 140
-rw-r--r--  fs/aio.c | 6
-rw-r--r--  fs/befs/linuxvfs.c | 3
-rw-r--r--  fs/bfs/inode.c | 3
-rw-r--r--  fs/bio.c | 44
-rw-r--r--  fs/block_dev.c | 12
-rw-r--r--  fs/buffer.c | 24
-rw-r--r--  fs/cifs/CHANGES | 22
-rw-r--r--  fs/cifs/README | 43
-rw-r--r--  fs/cifs/TODO | 69
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 14
-rw-r--r--  fs/cifs/cifs_unicode.c | 4
-rw-r--r--  fs/cifs/cifsfs.c | 82
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsglob.h | 2
-rw-r--r--  fs/cifs/cifspdu.h | 32
-rw-r--r--  fs/cifs/cifsproto.h | 7
-rw-r--r--  fs/cifs/cifssmb.c | 130
-rw-r--r--  fs/cifs/connect.c | 140
-rw-r--r--  fs/cifs/dir.c | 2
-rw-r--r--  fs/cifs/file.c | 129
-rw-r--r--  fs/cifs/inode.c | 282
-rw-r--r--  fs/cifs/netmisc.c | 24
-rw-r--r--  fs/cifs/readdir.c | 161
-rw-r--r--  fs/coda/inode.c | 3
-rw-r--r--  fs/compat.c | 5
-rw-r--r--  fs/compat_ioctl.c | 37
-rw-r--r--  fs/configfs/mount.c | 2
-rw-r--r--  fs/cramfs/inode.c | 3
-rw-r--r--  fs/dcache.c | 8
-rw-r--r--  fs/debugfs/file.c | 42
-rw-r--r--  fs/debugfs/inode.c | 2
-rw-r--r--  fs/dlm/Kconfig | 31
-rw-r--r--  fs/dlm/Makefile | 6
-rw-r--r--  fs/dlm/ast.c | 1
-rw-r--r--  fs/dlm/config.c | 10
-rw-r--r--  fs/dlm/config.h | 3
-rw-r--r--  fs/dlm/dlm_internal.h | 11
-rw-r--r--  fs/dlm/lock.c | 955
-rw-r--r--  fs/dlm/lock.h | 2
-rw-r--r--  fs/dlm/lockspace.c | 6
-rw-r--r--  fs/dlm/lowcomms-sctp.c | 1210
-rw-r--r--  fs/dlm/lowcomms.c (renamed from fs/dlm/lowcomms-tcp.c) | 788
-rw-r--r--  fs/dlm/user.c | 163
-rw-r--r--  fs/dquot.c | 4
-rw-r--r--  fs/ecryptfs/main.c | 15
-rw-r--r--  fs/ecryptfs/mmap.c | 11
-rw-r--r--  fs/ecryptfs/netlink.c | 6
-rw-r--r--  fs/efs/super.c | 3
-rw-r--r--  fs/exec.c | 18
-rw-r--r--  fs/ext2/dir.c | 3
-rw-r--r--  fs/ext2/super.c | 3
-rw-r--r--  fs/ext3/super.c | 7
-rw-r--r--  fs/ext4/super.c | 7
-rw-r--r--  fs/fat/cache.c | 3
-rw-r--r--  fs/fat/inode.c | 3
-rw-r--r--  fs/freevxfs/vxfs_subr.c | 3
-rw-r--r--  fs/fuse/inode.c | 7
-rw-r--r--  fs/gfs2/dir.c | 38
-rw-r--r--  fs/gfs2/glock.c | 619
-rw-r--r--  fs/gfs2/glock.h | 8
-rw-r--r--  fs/gfs2/incore.h | 14
-rw-r--r--  fs/gfs2/locking/dlm/lock.c | 14
-rw-r--r--  fs/gfs2/locking/dlm/lock_dlm.h | 3
-rw-r--r--  fs/gfs2/locking/dlm/sysfs.c | 2
-rw-r--r--  fs/gfs2/lops.c | 20
-rw-r--r--  fs/gfs2/main.c | 10
-rw-r--r--  fs/gfs2/mount.c | 239
-rw-r--r--  fs/gfs2/ops_address.c | 21
-rw-r--r--  fs/gfs2/ops_fstype.c | 4
-rw-r--r--  fs/gfs2/ops_super.c | 28
-rw-r--r--  fs/gfs2/rgrp.c | 12
-rw-r--r--  fs/gfs2/sys.c | 2
-rw-r--r--  fs/hfs/super.c | 2
-rw-r--r--  fs/hfsplus/super.c | 2
-rw-r--r--  fs/hpfs/super.c | 3
-rw-r--r--  fs/hugetlbfs/inode.c | 20
-rw-r--r--  fs/inode.c | 3
-rw-r--r--  fs/isofs/inode.c | 3
-rw-r--r--  fs/jffs2/LICENCE | 7
-rw-r--r--  fs/jffs2/Makefile | 1
-rw-r--r--  fs/jffs2/README.Locking | 1
-rw-r--r--  fs/jffs2/TODO | 3
-rw-r--r--  fs/jffs2/acl.c | 3
-rw-r--r--  fs/jffs2/acl.h | 3
-rw-r--r--  fs/jffs2/background.c | 4
-rw-r--r--  fs/jffs2/build.c | 4
-rw-r--r--  fs/jffs2/compr.c | 144
-rw-r--r--  fs/jffs2/compr.h | 17
-rw-r--r--  fs/jffs2/compr_rtime.c | 3
-rw-r--r--  fs/jffs2/compr_rubin.c | 81
-rw-r--r--  fs/jffs2/compr_rubin.h | 21
-rw-r--r--  fs/jffs2/compr_zlib.c | 4
-rw-r--r--  fs/jffs2/comprtest.c | 307
-rw-r--r--  fs/jffs2/debug.c | 5
-rw-r--r--  fs/jffs2/debug.h | 5
-rw-r--r--  fs/jffs2/dir.c | 4
-rw-r--r--  fs/jffs2/erase.c | 6
-rw-r--r--  fs/jffs2/file.c | 4
-rw-r--r--  fs/jffs2/fs.c | 16
-rw-r--r--  fs/jffs2/gc.c | 7
-rw-r--r--  fs/jffs2/ioctl.c | 4
-rw-r--r--  fs/jffs2/jffs2_fs_i.h | 11
-rw-r--r--  fs/jffs2/jffs2_fs_sb.h | 11
-rw-r--r--  fs/jffs2/malloc.c | 4
-rw-r--r--  fs/jffs2/nodelist.c | 482
-rw-r--r--  fs/jffs2/nodelist.h | 40
-rw-r--r--  fs/jffs2/nodemgmt.c | 9
-rw-r--r--  fs/jffs2/os-linux.h | 10
-rw-r--r--  fs/jffs2/pushpull.h | 72
-rw-r--r--  fs/jffs2/read.c | 4
-rw-r--r--  fs/jffs2/readinode.c | 851
-rw-r--r--  fs/jffs2/scan.c | 62
-rw-r--r--  fs/jffs2/security.c | 3
-rw-r--r--  fs/jffs2/summary.c | 12
-rw-r--r--  fs/jffs2/summary.h | 10
-rw-r--r--  fs/jffs2/super.c | 9
-rw-r--r--  fs/jffs2/symlink.c | 5
-rw-r--r--  fs/jffs2/wbuf.c | 39
-rw-r--r--  fs/jffs2/write.c | 7
-rw-r--r--  fs/jffs2/writev.c | 4
-rw-r--r--  fs/jffs2/xattr.c | 3
-rw-r--r--  fs/jffs2/xattr.h | 3
-rw-r--r--  fs/jffs2/xattr_trusted.c | 3
-rw-r--r--  fs/jffs2/xattr_user.c | 3
-rw-r--r--  fs/jfs/jfs_metapage.c | 3
-rw-r--r--  fs/jfs/super.c | 3
-rw-r--r--  fs/lockd/mon.c | 10
-rw-r--r--  fs/lockd/xdr.c | 20
-rw-r--r--  fs/lockd/xdr4.c | 24
-rw-r--r--  fs/locks.c | 3
-rw-r--r--  fs/minix/dir.c | 1
-rw-r--r--  fs/minix/inode.c | 3
-rw-r--r--  fs/namei.c | 84
-rw-r--r--  fs/ncpfs/inode.c | 3
-rw-r--r--  fs/nfs/client.c | 3
-rw-r--r--  fs/nfs/dir.c | 25
-rw-r--r--  fs/nfs/direct.c | 5
-rw-r--r--  fs/nfs/inode.c | 3
-rw-r--r--  fs/nfs/internal.h | 12
-rw-r--r--  fs/nfs/mount_clnt.c | 7
-rw-r--r--  fs/nfs/nfs2xdr.c | 7
-rw-r--r--  fs/nfs/nfs3xdr.c | 13
-rw-r--r--  fs/nfs/nfs4proc.c | 3
-rw-r--r--  fs/nfs/nfs4xdr.c | 7
-rw-r--r--  fs/nfs/nfsroot.c | 2
-rw-r--r--  fs/nfs/pagelist.c | 242
-rw-r--r--  fs/nfs/read.c | 92
-rw-r--r--  fs/nfs/super.c | 10
-rw-r--r--  fs/nfs/symlink.c | 6
-rw-r--r--  fs/nfs/write.c | 421
-rw-r--r--  fs/nfsd/nfs4callback.c | 7
-rw-r--r--  fs/ntfs/aops.h | 3
-rw-r--r--  fs/ntfs/attrib.c | 18
-rw-r--r--  fs/ntfs/file.c | 3
-rw-r--r--  fs/ntfs/super.c | 33
-rw-r--r--  fs/ocfs2/alloc.c | 3043
-rw-r--r--  fs/ocfs2/alloc.h | 27
-rw-r--r--  fs/ocfs2/aops.c | 1014
-rw-r--r--  fs/ocfs2/aops.h | 77
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 4
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 2
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 5
-rw-r--r--  fs/ocfs2/cluster/sys.c | 7
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 10
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 5
-rw-r--r--  fs/ocfs2/dir.c | 22
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 12
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 5
-rw-r--r--  fs/ocfs2/dlm/dlmfs.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 2
-rw-r--r--  fs/ocfs2/dlmglue.c | 197
-rw-r--r--  fs/ocfs2/dlmglue.h | 10
-rw-r--r--  fs/ocfs2/export.c | 6
-rw-r--r--  fs/ocfs2/extent_map.c | 1233
-rw-r--r--  fs/ocfs2/extent_map.h | 39
-rw-r--r--  fs/ocfs2/file.c | 654
-rw-r--r--  fs/ocfs2/file.h | 10
-rw-r--r--  fs/ocfs2/inode.c | 228
-rw-r--r--  fs/ocfs2/inode.h | 24
-rw-r--r--  fs/ocfs2/ioctl.c | 24
-rw-r--r--  fs/ocfs2/ioctl.h | 1
-rw-r--r--  fs/ocfs2/journal.c | 31
-rw-r--r--  fs/ocfs2/journal.h | 2
-rw-r--r--  fs/ocfs2/mmap.c | 7
-rw-r--r--  fs/ocfs2/namei.c | 28
-rw-r--r--  fs/ocfs2/ocfs2.h | 67
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 33
-rw-r--r--  fs/ocfs2/ocfs2_lockid.h | 5
-rw-r--r--  fs/ocfs2/slot_map.c | 2
-rw-r--r--  fs/ocfs2/suballoc.c | 13
-rw-r--r--  fs/ocfs2/super.c | 12
-rw-r--r--  fs/ocfs2/symlink.c | 7
-rw-r--r--  fs/ocfs2/vote.c | 289
-rw-r--r--  fs/ocfs2/vote.h | 3
-rw-r--r--  fs/openpromfs/inode.c | 3
-rw-r--r--  fs/partitions/acorn.c | 2
-rw-r--r--  fs/partitions/check.c | 9
-rw-r--r--  fs/proc/base.c | 36
-rw-r--r--  fs/proc/inode.c | 3
-rw-r--r--  fs/proc/proc_devtree.c | 2
-rw-r--r--  fs/proc/proc_misc.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 134
-rw-r--r--  fs/proc/vmcore.c | 2
-rw-r--r--  fs/qnx4/inode.c | 3
-rw-r--r--  fs/reiserfs/super.c | 3
-rw-r--r--  fs/reiserfs/xattr.c | 96
-rw-r--r--  fs/romfs/inode.c | 3
-rw-r--r--  fs/smbfs/inode.c | 3
-rw-r--r--  fs/super.c | 12
-rw-r--r--  fs/sync.c | 8
-rw-r--r--  fs/sysfs/file.c | 25
-rw-r--r--  fs/sysfs/group.c | 6
-rw-r--r--  fs/sysv/dir.c | 10
-rw-r--r--  fs/sysv/inode.c | 3
-rw-r--r--  fs/udf/super.c | 4
-rw-r--r--  fs/ufs/dir.c | 6
-rw-r--r--  fs/ufs/inode.c | 29
-rw-r--r--  fs/ufs/super.c | 3
-rw-r--r--  fs/ufs/util.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 3
270 files changed, 16638 insertions, 13239 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 124a085d1f2e..b01b0a457932 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -415,7 +415,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 	file_inode = file->d_inode;
 	sb = file_inode->i_sb;
 	v9ses = v9fs_inode2v9ses(file_inode);
-	v9fid = v9fs_fid_lookup(file);
+	v9fid = v9fs_fid_clone(file);
 	if(IS_ERR(v9fid))
 		return PTR_ERR(v9fid);
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 3c4886b849f5..8ea7b04c661f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1734,6 +1734,18 @@ config SUNRPC
 config SUNRPC_GSS
 	tristate
 
+config SUNRPC_BIND34
+	bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
+	depends on SUNRPC && EXPERIMENTAL
+	help
+	  Provides kernel support for querying rpcbind servers via versions 3
+	  and 4 of the rpcbind protocol.  The kernel automatically falls back
+	  to version 2 if a remote rpcbind service does not support versions
+	  3 or 4.
+
+	  If unsure, say N to get traditional behavior (version 2 rpcbind
+	  requests only).
+
 config RPCSEC_GSS_KRB5
 	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL
@@ -2019,7 +2031,7 @@ config CODA_FS_OLD_API
 config AFS_FS
 	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
 	depends on INET && EXPERIMENTAL
-	select RXRPC
+	select AF_RXRPC
 	help
 	  If you say Y here, you will get an experimental Andrew File System
 	  driver. It currently only supports unsecured read-only AFS access.
@@ -2028,8 +2040,15 @@ config AFS_FS
 
 	  If unsure, say N.
 
-config RXRPC
-	tristate
+config AFS_DEBUG
+	bool "AFS dynamic debugging"
+	depends on AFS_FS
+	help
+	  Say Y here to make runtime controllable debugging messages appear.
+
+	  See <file:Documentation/filesystems/afs.txt> for more information.
+
+	  If unsure, say N.
 
 config 9P_FS
 	tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index f3d3d81eb7e9..74c64409ddbc 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -26,7 +26,7 @@ config BINFMT_ELF
 config BINFMT_ELF_FDPIC
 	bool "Kernel support for FDPIC ELF binaries"
 	default y
-	depends on FRV
+	depends on (FRV || BLACKFIN)
 	help
 	  ELF FDPIC binaries are based on ELF, but allow the individual load
 	  segments of a binary to be located in memory independently of each
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 2e5f2c8371ee..30c296508497 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -232,8 +232,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c3986a1911b0..beff7d21e6e2 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -87,8 +87,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		init_MUTEX(&ei->i_link_lock);
 		init_MUTEX(&ei->i_ext_lock);
 		inode_init_once(&ei->vfs_inode);
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 4029c9da4b86..cf83e5d63512 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -2,8 +2,6 @@
 # Makefile for Red Hat Linux AFS client.
 #
 
-#CFLAGS += -finstrument-functions
-
 kafs-objs := \
 	callback.o \
 	cell.o \
@@ -12,14 +10,15 @@ kafs-objs := \
 	file.o \
 	fsclient.o \
 	inode.o \
-	kafsasyncd.o \
-	kafstimod.o \
 	main.o \
 	misc.o \
 	mntpt.o \
 	proc.o \
+	rxrpc.o \
+	security.o \
 	server.o \
 	super.o \
+	netdevices.o \
 	vlclient.o \
 	vlocation.o \
 	vnode.o \
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
new file mode 100644
index 000000000000..52d0752265b8
--- /dev/null
+++ b/fs/afs/afs.h
@@ -0,0 +1,146 @@
+/* AFS common types
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_H
+#define AFS_H
+
+#include <linux/in.h>
+
+#define AFS_MAXCELLNAME 64	/* maximum length of a cell name */
+#define AFS_MAXVOLNAME 64	/* maximum length of a volume name */
+
+typedef unsigned afs_volid_t;
+typedef unsigned afs_vnodeid_t;
+typedef unsigned long long afs_dataversion_t;
+
+typedef enum {
+	AFSVL_RWVOL,		/* read/write volume */
+	AFSVL_ROVOL,		/* read-only volume */
+	AFSVL_BACKVOL,		/* backup volume */
+} __attribute__((packed)) afs_voltype_t;
+
+typedef enum {
+	AFS_FTYPE_INVALID = 0,
+	AFS_FTYPE_FILE = 1,
+	AFS_FTYPE_DIR = 2,
+	AFS_FTYPE_SYMLINK = 3,
+} afs_file_type_t;
+
+/*
+ * AFS file identifier
+ */
+struct afs_fid {
+	afs_volid_t vid;	/* volume ID */
+	afs_vnodeid_t vnode;	/* file index within volume */
+	unsigned unique;	/* unique ID number (file index version) */
+};
+
+/*
+ * AFS callback notification
+ */
+typedef enum {
+	AFSCM_CB_UNTYPED = 0,	/* no type set on CB break */
+	AFSCM_CB_EXCLUSIVE = 1,	/* CB exclusive to CM [not implemented] */
+	AFSCM_CB_SHARED = 2,	/* CB shared by other CM's */
+	AFSCM_CB_DROPPED = 3,	/* CB promise cancelled by file server */
+} afs_callback_type_t;
+
+struct afs_callback {
+	struct afs_fid fid;		/* file identifier */
+	unsigned version;		/* callback version */
+	unsigned expiry;		/* time at which expires */
+	afs_callback_type_t type;	/* type of callback */
+};
+
+#define AFSCBMAX 50	/* maximum callbacks transferred per bulk op */
+
+/*
+ * AFS volume information
+ */
+struct afs_volume_info {
+	afs_volid_t vid;		/* volume ID */
+	afs_voltype_t type;		/* type of this volume */
+	afs_volid_t type_vids[5];	/* volume ID's for possible types for this vol */
+
+	/* list of fileservers serving this volume */
+	size_t nservers;		/* number of entries used in servers[] */
+	struct {
+		struct in_addr addr;	/* fileserver address */
+	} servers[8];
+};
+
+/*
+ * AFS security ACE access mask
+ */
+typedef u32 afs_access_t;
+#define AFS_ACE_READ		0x00000001U /* - permission to read a file/dir */
+#define AFS_ACE_WRITE		0x00000002U /* - permission to write/chmod a file */
+#define AFS_ACE_INSERT		0x00000004U /* - permission to create dirent in a dir */
+#define AFS_ACE_LOOKUP		0x00000008U /* - permission to lookup a file/dir in a dir */
+#define AFS_ACE_DELETE		0x00000010U /* - permission to delete a dirent from a dir */
+#define AFS_ACE_LOCK		0x00000020U /* - permission to lock a file */
+#define AFS_ACE_ADMINISTER	0x00000040U /* - permission to change ACL */
+#define AFS_ACE_USER_A		0x01000000U /* - 'A' user-defined permission */
+#define AFS_ACE_USER_B		0x02000000U /* - 'B' user-defined permission */
+#define AFS_ACE_USER_C		0x04000000U /* - 'C' user-defined permission */
+#define AFS_ACE_USER_D		0x08000000U /* - 'D' user-defined permission */
+#define AFS_ACE_USER_E		0x10000000U /* - 'E' user-defined permission */
+#define AFS_ACE_USER_F		0x20000000U /* - 'F' user-defined permission */
+#define AFS_ACE_USER_G		0x40000000U /* - 'G' user-defined permission */
+#define AFS_ACE_USER_H		0x80000000U /* - 'H' user-defined permission */
+
+/*
+ * AFS file status information
+ */
+struct afs_file_status {
+	unsigned if_version;	/* interface version */
+#define AFS_FSTATUS_VERSION 1
+
+	afs_file_type_t type;	/* file type */
+	unsigned nlink;		/* link count */
+	u64 size;		/* file size */
+	afs_dataversion_t data_version;	/* current data version */
+	u32 author;		/* author ID */
+	u32 owner;		/* owner ID */
+	u32 group;		/* group ID */
+	afs_access_t caller_access;	/* access rights for authenticated caller */
+	afs_access_t anon_access;	/* access rights for unauthenticated caller */
+	umode_t mode;		/* UNIX mode */
+	struct afs_fid parent;	/* parent dir ID for non-dirs only */
+	time_t mtime_client;	/* last time client changed data */
+	time_t mtime_server;	/* last time server changed data */
+};
+
+/*
+ * AFS file status change request
+ */
+struct afs_store_status {
+	u32 mask;		/* which bits of the struct are set */
+	u32 mtime_client;	/* last time client changed data */
+	u32 owner;		/* owner ID */
+	u32 group;		/* group ID */
+	umode_t mode;		/* UNIX mode */
+};
+
+#define AFS_SET_MTIME		0x01 /* set the mtime */
+#define AFS_SET_OWNER		0x02 /* set the owner ID */
+#define AFS_SET_GROUP		0x04 /* set the group ID (unsupported?) */
+#define AFS_SET_MODE		0x08 /* set the UNIX mode */
+#define AFS_SET_SEG_SIZE	0x10 /* set the segment size (unsupported) */
+
+/*
+ * AFS volume synchronisation information
+ */
+struct afs_volsync {
+	time_t creation;	/* volume creation time */
+};
+
+#endif /* AFS_H */
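
A note on the types above: struct afs_fid is the key the reworked code indexes vnodes by, and the rbtree walk added in fs/afs/callback.c later in this diff compares vid, then vnode, then unique. That ordering can be factored as a standalone comparator; a minimal sketch assuming only the struct definition above (afs_fid_cmp is a hypothetical helper, not something this patch adds):

/* Sketch: total ordering over struct afs_fid, equivalent to the
 * vid -> vnode -> unique comparison chain in afs_break_one_callback(). */
static int afs_fid_cmp(const struct afs_fid *a, const struct afs_fid *b)
{
	if (a->vid != b->vid)
		return a->vid < b->vid ? -1 : 1;
	if (a->vnode != b->vnode)
		return a->vnode < b->vnode ? -1 : 1;
	if (a->unique != b->unique)
		return a->unique < b->unique ? -1 : 1;
	return 0;
}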
diff --git a/fs/afs/afs_cm.h b/fs/afs/afs_cm.h
new file mode 100644
index 000000000000..7b4d4fab4c80
--- /dev/null
+++ b/fs/afs/afs_cm.h
@@ -0,0 +1,32 @@
+/* AFS Cache Manager definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_CM_H
+#define AFS_CM_H
+
+#define AFS_CM_PORT	7001 /* AFS file server port */
+#define CM_SERVICE	1    /* AFS File Service ID */
+
+enum AFS_CM_Operations {
+	CBCallBack		= 204,	/* break callback promises */
+	CBInitCallBackState	= 205,	/* initialise callback state */
+	CBProbe			= 206,	/* probe client */
+	CBGetLock		= 207,	/* get contents of CM lock table */
+	CBGetCE			= 208,	/* get cache file description */
+	CBGetXStatsVersion	= 209,	/* get version of extended statistics */
+	CBGetXStats		= 210,	/* get contents of extended statistics data */
+	CBInitCallBackState3	= 213,	/* initialise callback state, version 3 */
+	CBGetCapabilities	= 65538, /* get client capabilities */
+};
+
+#define AFS_CAP_ERROR_TRANSLATION	0x1
+
+#endif /* AFS_CM_H */
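
The operation IDs above are what the cache-manager service demultiplexes incoming calls on; the rewritten fs/afs/cmservice.c in this patch does the real routing. A hedged sketch of that dispatch shape only (the afs_handle_*() names are invented stubs for illustration, and errno.h stands in for kernel error codes):

#include <errno.h>

/* Hypothetical stubs, standing in for the real handlers in cmservice.c. */
static int afs_handle_callback(void) { return 0; }
static int afs_handle_init_cb_state(void) { return 0; }
static int afs_handle_probe(void) { return 0; }

/* Sketch: route a cache-manager call by its operation ID. */
static int afs_cm_dispatch(unsigned operation_id)
{
	switch (operation_id) {
	case CBCallBack:		return afs_handle_callback();
	case CBInitCallBackState:
	case CBInitCallBackState3:	return afs_handle_init_cb_state();
	case CBProbe:			return afs_handle_probe();
	default:			return -EOPNOTSUPP; /* not handled */
	}
}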
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
new file mode 100644
index 000000000000..89e0d1650a72
--- /dev/null
+++ b/fs/afs/afs_fs.h
@@ -0,0 +1,48 @@
+/* AFS File Service definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_FS_H
+#define AFS_FS_H
+
+#define AFS_FS_PORT	7000 /* AFS file server port */
+#define FS_SERVICE	1    /* AFS File Service ID */
+
+enum AFS_FS_Operations {
+	FSFETCHDATA		= 130,	/* AFS Fetch file data */
+	FSFETCHSTATUS		= 132,	/* AFS Fetch file status */
+	FSREMOVEFILE		= 136,	/* AFS Remove a file */
+	FSCREATEFILE		= 137,	/* AFS Create a file */
+	FSRENAME		= 138,	/* AFS Rename or move a file or directory */
+	FSSYMLINK		= 139,	/* AFS Create a symbolic link */
+	FSLINK			= 140,	/* AFS Create a hard link */
+	FSMAKEDIR		= 141,	/* AFS Create a directory */
+	FSREMOVEDIR		= 142,	/* AFS Remove a directory */
+	FSGIVEUPCALLBACKS	= 147,	/* AFS Discard callback promises */
+	FSGETVOLUMEINFO		= 148,	/* AFS Get root volume information */
+	FSGETROOTVOLUME		= 151,	/* AFS Get root volume name */
+	FSLOOKUP		= 161,	/* AFS lookup file in directory */
+};
+
+enum AFS_FS_Errors {
+	VSALVAGE	= 101,	/* volume needs salvaging */
+	VNOVNODE	= 102,	/* no such file/dir (vnode) */
+	VNOVOL		= 103,	/* no such volume or volume unavailable */
+	VVOLEXISTS	= 104,	/* volume name already exists */
+	VNOSERVICE	= 105,	/* volume not currently in service */
+	VOFFLINE	= 106,	/* volume is currently offline (more info available [VVL-spec]) */
+	VONLINE		= 107,	/* volume is already online */
+	VDISKFULL	= 108,	/* disk partition is full */
+	VOVERQUOTA	= 109,	/* volume's maximum quota exceeded */
+	VBUSY		= 110,	/* volume is temporarily unavailable */
+	VMOVED		= 111,	/* volume moved to new server - ask this FS where */
+};
+
+#endif /* AFS_FS_H */
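
VSALVAGE through VMOVED are abort codes sent back by the fileserver; before they reach the VFS they have to be folded down to errnos (the diffstat shows fs/afs/misc.c changing, which is where that translation lives). The mapping below is illustrative only, a guess at the flavour rather than a copy of the patch's table:

#include <errno.h>

/* Sketch: reduce a few AFS_FS_Errors abort codes to plausible errnos.
 * The authoritative table is in fs/afs/misc.c, not reproduced here. */
static int afs_abort_to_error_sketch(int abort_code)
{
	switch (abort_code) {
	case VNOVNODE:   return -ENOENT;	/* vnode has gone */
	case VNOVOL:     return -ENOMEDIUM;	/* volume unavailable */
	case VDISKFULL:  return -ENOSPC;	/* partition full */
	case VOVERQUOTA: return -EDQUOT;	/* quota exceeded */
	case VBUSY:      return -EBUSY;		/* temporarily unavailable */
	default:         return -EREMOTEIO;	/* unrecognised abort code */
	}
}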
diff --git a/fs/afs/vlclient.h b/fs/afs/afs_vl.h
index e3d601179c46..8bbefe009ed4 100644
--- a/fs/afs/vlclient.h
+++ b/fs/afs/afs_vl.h
@@ -1,6 +1,6 @@
-/* vlclient.h: Volume Location Service client interface
+/* AFS Volume Location Service client interface
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,10 +9,19 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef _LINUX_AFS_VLCLIENT_H
-#define _LINUX_AFS_VLCLIENT_H
+#ifndef AFS_VL_H
+#define AFS_VL_H
 
-#include "types.h"
+#include "afs.h"
+
+#define AFS_VL_PORT	7003 /* volume location service port */
+#define VL_SERVICE	52   /* RxRPC service ID for the Volume Location service */
+
+enum AFSVL_Operations {
+	VLGETENTRYBYID		= 503,	/* AFS Get Cache Entry By ID operation ID */
+	VLGETENTRYBYNAME	= 504,	/* AFS Get Cache Entry By Name operation ID */
+	VLPROBE			= 514,	/* AFS Probe Volume Location Service operation ID */
+};
 
 enum AFSVL_Errors {
 	AFSVL_IDEXIST = 363520,		/* Volume Id entry exists in vl database */
@@ -40,14 +49,16 @@ enum AFSVL_Errors {
 	AFSVL_BADVOLOPER = 363542,	/* Bad volume operation code */
 	AFSVL_BADRELLOCKTYPE = 363543,	/* Bad release lock type */
 	AFSVL_RERELEASE = 363544,	/* Status report: last release was aborted */
 	AFSVL_BADSERVERFLAG = 363545,	/* Invalid replication site server flag */
 	AFSVL_PERM = 363546,		/* No permission access */
 	AFSVL_NOMEM = 363547,		/* malloc/realloc failed to alloc enough memory */
 };
 
-/* maps to "struct vldbentry" in vvl-spec.pdf */
+/*
+ * maps to "struct vldbentry" in vvl-spec.pdf
+ */
 struct afs_vldbentry {
-	char name[65];		/* name of volume (including NUL char) */
+	char name[65];		/* name of volume (with NUL char) */
 	afs_voltype_t type;	/* volume type */
 	unsigned num_servers;	/* num servers that hold instances of this vol */
 	unsigned clone_id;	/* cloning ID */
@@ -68,26 +79,6 @@ struct afs_vldbentry {
 #define AFS_VLSF_RWVOL   0x0004	/* this server holds a R/W instance of the volume */
 #define AFS_VLSF_BACKVOL 0x0008	/* this server holds a backup instance of the volume */
 	} servers[8];
-
 };
 
-/* look up a volume location database entry by name */
-extern int afs_rxvl_get_entry_by_name(struct afs_server *server,
-				      const char *volname,
-				      unsigned volnamesz,
-				      struct afs_cache_vlocation *entry);
-
-/* look up a volume location database entry by ID */
-extern int afs_rxvl_get_entry_by_id(struct afs_server *server,
-				    afs_volid_t volid,
-				    afs_voltype_t voltype,
-				    struct afs_cache_vlocation *entry);
-
-extern int afs_rxvl_get_entry_by_id_async(struct afs_async_op *op,
-					  afs_volid_t volid,
-					  afs_voltype_t voltype);
-
-extern int afs_rxvl_get_entry_by_id_async2(struct afs_async_op *op,
-					   struct afs_cache_vlocation *entry);
-
-#endif /* _LINUX_AFS_VLCLIENT_H */
+#endif /* AFS_VL_H */
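
Each servers[] slot in struct afs_vldbentry carries AFS_VLSF_* site flags saying which volume instances that fileserver holds. Assuming the per-server flag word is named flags (the member itself falls outside the hunk shown above, so that name is a guess), picking the first read/write site would look like:

/* Sketch: find the first server advertising a R/W instance.  Assumes a
 * 'flags' member in the servers[] element, which this hunk doesn't show. */
static int first_rw_server(const struct afs_vldbentry *vldb)
{
	unsigned i;

	for (i = 0; i < vldb->num_servers; i++)
		if (vldb->servers[i].flags & AFS_VLSF_RWVOL)
			return (int) i;
	return -1; /* no R/W site registered */
}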
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
new file mode 100644
index 000000000000..de0d7de69edc
--- /dev/null
+++ b/fs/afs/cache.c
@@ -0,0 +1,256 @@
+/* AFS caching stuff
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_cell_cache_match(void *target,
+						const void *entry);
+static void afs_cell_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_cache_cell_index_def = {
+	.name		= "cell_ix",
+	.data_size	= sizeof(struct afs_cache_cell),
+	.keys[0]	= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
+	.match		= afs_cell_cache_match,
+	.update		= afs_cell_cache_update,
+};
+#endif
+
+/*
+ * match a cell record obtained from the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_cell_cache_match(void *target,
+						const void *entry)
+{
+	const struct afs_cache_cell *ccell = entry;
+	struct afs_cell *cell = target;
+
+	_enter("{%s},{%s}", ccell->name, cell->name);
+
+	if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
+		_leave(" = SUCCESS");
+		return CACHEFS_MATCH_SUCCESS;
+	}
+
+	_leave(" = FAILED");
+	return CACHEFS_MATCH_FAILED;
+}
+#endif
+
+/*
+ * update a cell record in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_cell_cache_update(void *source, void *entry)
+{
+	struct afs_cache_cell *ccell = entry;
+	struct afs_cell *cell = source;
+
+	_enter("%p,%p", source, entry);
+
+	strncpy(ccell->name, cell->name, sizeof(ccell->name));
+
+	memcpy(ccell->vl_servers,
+	       cell->vl_addrs,
+	       min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
+
+}
+#endif
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vlocation_cache_match(void *target,
+						     const void *entry);
+static void afs_vlocation_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_vlocation_cache_index_def = {
+	.name		= "vldb",
+	.data_size	= sizeof(struct afs_cache_vlocation),
+	.keys[0]	= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
+	.match		= afs_vlocation_cache_match,
+	.update		= afs_vlocation_cache_update,
+};
+#endif
+
+/*
+ * match a VLDB record stored in the cache
+ * - may also load target from entry
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vlocation_cache_match(void *target,
+						     const void *entry)
+{
+	const struct afs_cache_vlocation *vldb = entry;
+	struct afs_vlocation *vlocation = target;
+
+	_enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+
+	if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
+	    ) {
+		if (!vlocation->valid ||
+		    vlocation->vldb.rtime == vldb->rtime
+		    ) {
+			vlocation->vldb = *vldb;
+			vlocation->valid = 1;
+			_leave(" = SUCCESS [c->m]");
+			return CACHEFS_MATCH_SUCCESS;
+		} else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
+			/* delete if VIDs for this name differ */
+			if (memcmp(&vlocation->vldb.vid,
+				   &vldb->vid,
+				   sizeof(vldb->vid)) != 0) {
+				_leave(" = DELETE");
+				return CACHEFS_MATCH_SUCCESS_DELETE;
+			}
+
+			_leave(" = UPDATE");
+			return CACHEFS_MATCH_SUCCESS_UPDATE;
+		} else {
+			_leave(" = SUCCESS");
+			return CACHEFS_MATCH_SUCCESS;
+		}
+	}
+
+	_leave(" = FAILED");
+	return CACHEFS_MATCH_FAILED;
+}
+#endif
+
+/*
+ * update a VLDB record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_vlocation_cache_update(void *source, void *entry)
+{
+	struct afs_cache_vlocation *vldb = entry;
+	struct afs_vlocation *vlocation = source;
+
+	_enter("");
+
+	*vldb = vlocation->vldb;
+}
+#endif
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_volume_cache_match(void *target,
+						  const void *entry);
+static void afs_volume_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_volume_cache_index_def = {
+	.name		= "volume",
+	.data_size	= sizeof(struct afs_cache_vhash),
+	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
+	.keys[1]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
+	.match		= afs_volume_cache_match,
+	.update		= afs_volume_cache_update,
+};
+#endif
+
+/*
+ * match a volume hash record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_volume_cache_match(void *target,
+						  const void *entry)
+{
+	const struct afs_cache_vhash *vhash = entry;
+	struct afs_volume *volume = target;
+
+	_enter("{%u},{%u}", volume->type, vhash->vtype);
+
+	if (volume->type == vhash->vtype) {
+		_leave(" = SUCCESS");
+		return CACHEFS_MATCH_SUCCESS;
+	}
+
+	_leave(" = FAILED");
+	return CACHEFS_MATCH_FAILED;
+}
+#endif
+
+/*
+ * update a volume hash record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_volume_cache_update(void *source, void *entry)
+{
+	struct afs_cache_vhash *vhash = entry;
+	struct afs_volume *volume = source;
+
+	_enter("");
+
+	vhash->vtype = volume->type;
+}
+#endif
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vnode_cache_match(void *target,
+						 const void *entry);
+static void afs_vnode_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_vnode_cache_index_def = {
+	.name		= "vnode",
+	.data_size	= sizeof(struct afs_cache_vnode),
+	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 4 },
+	.match		= afs_vnode_cache_match,
+	.update		= afs_vnode_cache_update,
+};
+#endif
+
+/*
+ * match a vnode record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vnode_cache_match(void *target,
+						 const void *entry)
+{
+	const struct afs_cache_vnode *cvnode = entry;
+	struct afs_vnode *vnode = target;
+
+	_enter("{%x,%x,%Lx},{%x,%x,%Lx}",
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       vnode->status.version,
+	       cvnode->vnode_id,
+	       cvnode->vnode_unique,
+	       cvnode->data_version);
+
+	if (vnode->fid.vnode != cvnode->vnode_id) {
+		_leave(" = FAILED");
+		return CACHEFS_MATCH_FAILED;
+	}
+
+	if (vnode->fid.unique != cvnode->vnode_unique ||
+	    vnode->status.version != cvnode->data_version) {
+		_leave(" = DELETE");
+		return CACHEFS_MATCH_SUCCESS_DELETE;
+	}
+
+	_leave(" = SUCCESS");
+	return CACHEFS_MATCH_SUCCESS;
+}
+#endif
+
+/*
+ * update a vnode record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_vnode_cache_update(void *source, void *entry)
+{
+	struct afs_cache_vnode *cvnode = entry;
+	struct afs_vnode *vnode = source;
+
+	_enter("");
+
+	cvnode->vnode_id = vnode->fid.vnode;
+	cvnode->vnode_unique = vnode->fid.unique;
+	cvnode->data_version = vnode->status.version;
+}
+#endif
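
afs_vnode_cache_match() above encodes a small decision table: a different vnode ID is a plain miss, a reused uniquifier or changed data version means the cached entry is stale and must be discarded, and only an exact triple match is usable. The same logic restated as a standalone predicate (a sketch for clarity, not code from the patch):

/* Sketch: the verdict logic of afs_vnode_cache_match(), restated. */
enum cache_verdict { CACHE_MISS, CACHE_HIT, CACHE_HIT_STALE };

static enum cache_verdict vnode_cache_verdict(unsigned id, unsigned uniq,
					      unsigned long long version,
					      unsigned c_id, unsigned c_uniq,
					      unsigned long long c_version)
{
	if (id != c_id)
		return CACHE_MISS;	/* different file entirely */
	if (uniq != c_uniq || version != c_version)
		return CACHE_HIT_STALE;	/* fid slot reused or data changed */
	return CACHE_HIT;		/* cached data still valid */
}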
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
index 9eb7722b34d5..36a3642cf90e 100644
--- a/fs/afs/cache.h
+++ b/fs/afs/cache.h
@@ -1,4 +1,4 @@
-/* cache.h: AFS local cache management interface
+/* AFS local cache management interface
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -9,8 +9,8 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef _LINUX_AFS_CACHE_H
-#define _LINUX_AFS_CACHE_H
+#ifndef AFS_CACHE_H
+#define AFS_CACHE_H
 
 #undef AFS_CACHING_SUPPORT
 
@@ -20,8 +20,4 @@
 #endif
 #include "types.h"
 
-#ifdef __KERNEL__
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_AFS_CACHE_H */
+#endif /* AFS_CACHE_H */
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 9cb206e9d4be..9bdbf36a9aa9 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
+ * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
  *
  * This software may be freely redistributed under the terms of the
  * GNU General Public License.
@@ -16,85 +16,187 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include "server.h"
-#include "vnode.h"
+#include <linux/circ_buf.h>
 #include "internal.h"
-#include "cmservice.h"
 
-/*****************************************************************************/
+unsigned afs_vnode_update_timeout = 10;
+
+#define afs_breakring_space(server) \
+	CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail, \
+		   ARRAY_SIZE((server)->cb_break))
+
+//static void afs_callback_updater(struct work_struct *);
+
+static struct workqueue_struct *afs_callback_update_worker;
+
 /*
  * allow the fileserver to request callback state (re-)initialisation
  */
-int SRXAFSCM_InitCallBackState(struct afs_server *server)
+void afs_init_callback_state(struct afs_server *server)
 {
-	struct list_head callbacks;
+	struct afs_vnode *vnode;
 
-	_enter("%p", server);
+	_enter("{%p}", server);
 
-	INIT_LIST_HEAD(&callbacks);
-
-	/* transfer the callback list from the server to a temp holding area */
 	spin_lock(&server->cb_lock);
 
-	list_add(&callbacks, &server->cb_promises);
-	list_del_init(&server->cb_promises);
+	/* kill all the promises on record from this server */
+	while (!RB_EMPTY_ROOT(&server->cb_promises)) {
+		vnode = rb_entry(server->cb_promises.rb_node,
+				 struct afs_vnode, cb_promise);
+		_debug("UNPROMISE { vid=%x vn=%u uq=%u}",
+		       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+		rb_erase(&vnode->cb_promise, &server->cb_promises);
+		vnode->cb_promised = false;
+	}
 
-	/* munch our way through the list, grabbing the inode, dropping all the
-	 * locks and regetting them in the right order
-	 */
-	while (!list_empty(&callbacks)) {
-		struct afs_vnode *vnode;
-		struct inode *inode;
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
 
-		vnode = list_entry(callbacks.next, struct afs_vnode, cb_link);
-		list_del_init(&vnode->cb_link);
+/*
+ * handle the data invalidation side of a callback being broken
+ */
+void afs_broken_callback_work(struct work_struct *work)
+{
+	struct afs_vnode *vnode =
+		container_of(work, struct afs_vnode, cb_broken_work);
 
-		/* try and grab the inode - may fail */
-		inode = igrab(AFS_VNODE_TO_I(vnode));
-		if (inode) {
-			int release = 0;
+	_enter("");
 
-			spin_unlock(&server->cb_lock);
-			spin_lock(&vnode->lock);
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+		return;
 
-			if (vnode->cb_server == server) {
-				vnode->cb_server = NULL;
-				afs_kafstimod_del_timer(&vnode->cb_timeout);
-				spin_lock(&afs_cb_hash_lock);
-				list_del_init(&vnode->cb_hash_link);
-				spin_unlock(&afs_cb_hash_lock);
-				release = 1;
-			}
+	/* we're only interested in dealing with a broken callback on *this*
+	 * vnode and only if no-one else has dealt with it yet */
+	if (!mutex_trylock(&vnode->validate_lock))
+		return; /* someone else is dealing with it */
 
-			spin_unlock(&vnode->lock);
+	if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+		if (S_ISDIR(vnode->vfs_inode.i_mode))
+			afs_clear_permits(vnode);
 
-			iput(inode);
-			afs_put_server(server);
+		if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0)
+			goto out;
 
-			spin_lock(&server->cb_lock);
+		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+			goto out;
+
+		/* if the vnode's data version number changed then its contents
+		 * are different */
+		if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+			_debug("zap data {%x:%u}",
+			       vnode->fid.vid, vnode->fid.vnode);
+			invalidate_remote_inode(&vnode->vfs_inode);
 		}
 	}
 
-	spin_unlock(&server->cb_lock);
+out:
+	mutex_unlock(&vnode->validate_lock);
 
-	_leave(" = 0");
-	return 0;
-} /* end SRXAFSCM_InitCallBackState() */
+	/* avoid the potential race whereby the mutex_trylock() in this
+	 * function happens again between the clear_bit() and the
+	 * mutex_unlock() */
+	if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+		_debug("requeue");
+		queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+	}
+	_leave("");
+}
+
+/*
+ * actually break a callback
+ */
+static void afs_break_callback(struct afs_server *server,
+			       struct afs_vnode *vnode)
+{
+	_enter("");
+
+	set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+
+	if (vnode->cb_promised) {
+		spin_lock(&vnode->lock);
+
+		_debug("break callback");
+
+		spin_lock(&server->cb_lock);
+		if (vnode->cb_promised) {
+			rb_erase(&vnode->cb_promise, &server->cb_promises);
+			vnode->cb_promised = false;
+		}
+		spin_unlock(&server->cb_lock);
+
+		queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+		spin_unlock(&vnode->lock);
+	}
+}
+
+/*
+ * allow the fileserver to explicitly break one callback
+ * - happens when
+ *   - the backing file is changed
+ *   - a lock is released
+ */
+static void afs_break_one_callback(struct afs_server *server,
+				   struct afs_fid *fid)
+{
+	struct afs_vnode *vnode;
+	struct rb_node *p;
+
+	_debug("find");
+	spin_lock(&server->fs_lock);
+	p = server->fs_vnodes.rb_node;
+	while (p) {
+		vnode = rb_entry(p, struct afs_vnode, server_rb);
+		if (fid->vid < vnode->fid.vid)
+			p = p->rb_left;
+		else if (fid->vid > vnode->fid.vid)
+			p = p->rb_right;
+		else if (fid->vnode < vnode->fid.vnode)
+			p = p->rb_left;
+		else if (fid->vnode > vnode->fid.vnode)
+			p = p->rb_right;
+		else if (fid->unique < vnode->fid.unique)
+			p = p->rb_left;
+		else if (fid->unique > vnode->fid.unique)
+			p = p->rb_right;
+		else
+			goto found;
+	}
+
+	/* not found so we just ignore it (it may have moved to another
+	 * server) */
+not_available:
+	_debug("not avail");
+	spin_unlock(&server->fs_lock);
+	_leave("");
+	return;
+
+found:
+	_debug("found");
+	ASSERTCMP(server, ==, vnode->server);
+
+	if (!igrab(AFS_VNODE_TO_I(vnode)))
+		goto not_available;
+	spin_unlock(&server->fs_lock);
+
+	afs_break_callback(server, vnode);
+	iput(&vnode->vfs_inode);
+	_leave("");
+}
 
-/*****************************************************************************/
 /*
  * allow the fileserver to break callback promises
  */
-int SRXAFSCM_CallBack(struct afs_server *server, size_t count,
-		      struct afs_callback callbacks[])
+void afs_break_callbacks(struct afs_server *server, size_t count,
+			 struct afs_callback callbacks[])
 {
-	_enter("%p,%u,", server, count);
+	_enter("%p,%zu,", server, count);
 
-	for (; count > 0; callbacks++, count--) {
-		struct afs_vnode *vnode = NULL;
-		struct inode *inode = NULL;
-		int valid = 0;
+	ASSERT(server != NULL);
+	ASSERTCMP(count, <=, AFSCBMAX);
 
+	for (; count > 0; callbacks++, count--) {
 		_debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }",
 		       callbacks->fid.vid,
 		       callbacks->fid.vnode,
@@ -103,67 +205,270 @@ int SRXAFSCM_CallBack(struct afs_server *server, size_t count,
 		       callbacks->expiry,
 		       callbacks->type
 		       );
+		afs_break_one_callback(server, &callbacks->fid);
+	}
 
-		/* find the inode for this fid */
-		spin_lock(&afs_cb_hash_lock);
+	_leave("");
+	return;
+}
 
-		list_for_each_entry(vnode,
-				    &afs_cb_hash(server, &callbacks->fid),
-				    cb_hash_link) {
-			if (memcmp(&vnode->fid, &callbacks->fid,
-				   sizeof(struct afs_fid)) != 0)
-				continue;
+/*
+ * record the callback for breaking
+ * - the caller must hold server->cb_lock
+ */
+static void afs_do_give_up_callback(struct afs_server *server,
+				    struct afs_vnode *vnode)
+{
+	struct afs_callback *cb;
 
-			/* right vnode, but is it same server? */
-			if (vnode->cb_server != server)
-				break; /* no */
+	_enter("%p,%p", server, vnode);
 
-			/* try and nail the inode down */
-			inode = igrab(AFS_VNODE_TO_I(vnode));
-			break;
+	cb = &server->cb_break[server->cb_break_head];
+	cb->fid = vnode->fid;
+	cb->version = vnode->cb_version;
+	cb->expiry = vnode->cb_expiry;
+	cb->type = vnode->cb_type;
+	smp_wmb();
+	server->cb_break_head =
+		(server->cb_break_head + 1) &
+		(ARRAY_SIZE(server->cb_break) - 1);
+
+	/* defer the breaking of callbacks to try and collect as many as
+	 * possible to ship in one operation */
+	switch (atomic_inc_return(&server->cb_break_n)) {
+	case 1 ... AFSCBMAX - 1:
+		queue_delayed_work(afs_callback_update_worker,
+				   &server->cb_break_work, HZ * 2);
+		break;
+	case AFSCBMAX:
+		afs_flush_callback_breaks(server);
+		break;
+	default:
+		break;
+	}
+
+	ASSERT(server->cb_promises.rb_node != NULL);
+	rb_erase(&vnode->cb_promise, &server->cb_promises);
+	vnode->cb_promised = false;
+	_leave("");
+}
+
+/*
+ * discard the callback on a deleted item
+ */
+void afs_discard_callback_on_delete(struct afs_vnode *vnode)
+{
+	struct afs_server *server = vnode->server;
+
+	_enter("%d", vnode->cb_promised);
+
+	if (!vnode->cb_promised) {
+		_leave(" [not promised]");
+		return;
+	}
+
+	ASSERT(server != NULL);
+
+	spin_lock(&server->cb_lock);
+	if (vnode->cb_promised) {
+		ASSERT(server->cb_promises.rb_node != NULL);
+		rb_erase(&vnode->cb_promise, &server->cb_promises);
+		vnode->cb_promised = false;
+	}
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
+
+/*
+ * give up the callback registered for a vnode on the file server when the
+ * inode is being cleared
+ */
+void afs_give_up_callback(struct afs_vnode *vnode)
+{
+	struct afs_server *server = vnode->server;
+
+	DECLARE_WAITQUEUE(myself, current);
+
+	_enter("%d", vnode->cb_promised);
+
+	_debug("GIVE UP INODE %p", &vnode->vfs_inode);
+
+	if (!vnode->cb_promised) {
+		_leave(" [not promised]");
+		return;
+	}
+
+	ASSERT(server != NULL);
+
+	spin_lock(&server->cb_lock);
+	if (vnode->cb_promised && afs_breakring_space(server) == 0) {
+		add_wait_queue(&server->cb_break_waitq, &myself);
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!vnode->cb_promised ||
+			    afs_breakring_space(server) != 0)
+				break;
+			spin_unlock(&server->cb_lock);
+			schedule();
+			spin_lock(&server->cb_lock);
 		}
+		remove_wait_queue(&server->cb_break_waitq, &myself);
+		__set_current_state(TASK_RUNNING);
+	}
+
+	/* of course, it's always possible for the server to break this vnode's
+	 * callback first... */
+	if (vnode->cb_promised)
+		afs_do_give_up_callback(server, vnode);
+
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
+
+/*
+ * dispatch a deferred give up callbacks operation
+ */
+void afs_dispatch_give_up_callbacks(struct work_struct *work)
+{
+	struct afs_server *server =
+		container_of(work, struct afs_server, cb_break_work.work);
+
+	_enter("");
+
+	/* tell the fileserver to discard the callback promises it has
+	 * - in the event of ENOMEM or some other error, we just forget that we
+	 *   had callbacks entirely, and the server will call us later to break
+	 *   them
+	 */
+	afs_fs_give_up_callbacks(server, &afs_async_call);
+}
+
+/*
+ * flush the outstanding callback breaks on a server
+ */
+void afs_flush_callback_breaks(struct afs_server *server)
+{
+	cancel_delayed_work(&server->cb_break_work);
+	queue_delayed_work(afs_callback_update_worker,
+			   &server->cb_break_work, 0);
+}
 
-		spin_unlock(&afs_cb_hash_lock);
-
-		if (inode) {
-			/* we've found the record for this vnode */
-			spin_lock(&vnode->lock);
-			if (vnode->cb_server == server) {
-				/* the callback _is_ on the calling server */
-				vnode->cb_server = NULL;
-				valid = 1;
-
-				afs_kafstimod_del_timer(&vnode->cb_timeout);
-				vnode->flags |= AFS_VNODE_CHANGED;
-
-				spin_lock(&server->cb_lock);
-				list_del_init(&vnode->cb_link);
-				spin_unlock(&server->cb_lock);
-
-				spin_lock(&afs_cb_hash_lock);
-				list_del_init(&vnode->cb_hash_link);
-				spin_unlock(&afs_cb_hash_lock);
-			}
-			spin_unlock(&vnode->lock);
-
-			if (valid) {
-				invalidate_remote_inode(inode);
-				afs_put_server(server);
-			}
-			iput(inode);
+#if 0
+/*
+ * update a bunch of callbacks
+ */
+static void afs_callback_updater(struct work_struct *work)
+{
+	struct afs_server *server;
+	struct afs_vnode *vnode, *xvnode;
+	time_t now;
+	long timeout;
+	int ret;
+
+	server = container_of(work, struct afs_server, updater);
+
+	_enter("");
+
+	now = get_seconds();
+
+	/* find the first vnode to update */
+	spin_lock(&server->cb_lock);
+	for (;;) {
+		if (RB_EMPTY_ROOT(&server->cb_promises)) {
+			spin_unlock(&server->cb_lock);
+			_leave(" [nothing]");
+			return;
 		}
+
+		vnode = rb_entry(rb_first(&server->cb_promises),
+				 struct afs_vnode, cb_promise);
+		if (atomic_read(&vnode->usage) > 0)
+			break;
+		rb_erase(&vnode->cb_promise, &server->cb_promises);
+		vnode->cb_promised = false;
 	}
 
-	_leave(" = 0");
-	return 0;
-} /* end SRXAFSCM_CallBack() */
+	timeout = vnode->update_at - now;
+	if (timeout > 0) {
+		queue_delayed_work(afs_vnode_update_worker,
+				   &afs_vnode_update, timeout * HZ);
+		spin_unlock(&server->cb_lock);
+		_leave(" [nothing]");
+		return;
+	}
+
+	list_del_init(&vnode->update);
+	atomic_inc(&vnode->usage);
+	spin_unlock(&server->cb_lock);
+
+	/* we can now perform the update */
+	_debug("update %s", vnode->vldb.name);
+	vnode->state = AFS_VL_UPDATING;
+	vnode->upd_rej_cnt = 0;
+	vnode->upd_busy_cnt = 0;
+
+	ret = afs_vnode_update_record(vl, &vldb);
+	switch (ret) {
+	case 0:
+		afs_vnode_apply_update(vl, &vldb);
+		vnode->state = AFS_VL_UPDATING;
+		break;
+	case -ENOMEDIUM:
+		vnode->state = AFS_VL_VOLUME_DELETED;
+		break;
+	default:
+		vnode->state = AFS_VL_UNCERTAIN;
+		break;
+	}
+
+	/* and then reschedule */
+	_debug("reschedule");
+	vnode->update_at = get_seconds() + afs_vnode_update_timeout;
+
+	spin_lock(&server->cb_lock);
+
+	if (!list_empty(&server->cb_promises)) {
+		/* next update in 10 minutes, but wait at least 1 second more
+		 * than the newest record already queued so that we don't spam
+		 * the VL server suddenly with lots of requests
+		 */
+		xvnode = list_entry(server->cb_promises.prev,
+				    struct afs_vnode, update);
+		if (vnode->update_at <= xvnode->update_at)
+			vnode->update_at = xvnode->update_at + 1;
+		xvnode = list_entry(server->cb_promises.next,
+				    struct afs_vnode, update);
+		timeout = xvnode->update_at - now;
+		if (timeout < 0)
+			timeout = 0;
+	} else {
+		timeout = afs_vnode_update_timeout;
+	}
+
+	list_add_tail(&vnode->update, &server->cb_promises);
+
+	_debug("timeout %ld", timeout);
+	queue_delayed_work(afs_vnode_update_worker,
+			   &afs_vnode_update, timeout * HZ);
+	spin_unlock(&server->cb_lock);
+	afs_put_vnode(vl);
+}
+#endif
+
+/*
+ * initialise the callback update process
+ */
+int __init afs_callback_update_init(void)
+{
+	afs_callback_update_worker =
+		create_singlethread_workqueue("kafs_callbackd");
+	return afs_callback_update_worker ? 0 : -ENOMEM;
+}
 
-/*****************************************************************************/
 /*
- * allow the fileserver to see if the cache manager is still alive
+ * shut down the callback update process
  */
-int SRXAFSCM_Probe(struct afs_server *server)
+void afs_callback_update_kill(void)
 {
-	_debug("SRXAFSCM_Probe(%p)\n", server);
-	return 0;
-} /* end SRXAFSCM_Probe() */
+	destroy_workqueue(afs_callback_update_worker);
+}
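
The give-up path above treats server->cb_break as a power-of-two ring: afs_do_give_up_callback() advances cb_break_head with a mask, and afs_give_up_callback() sleeps while CIRC_SPACE() is zero. The macros from <linux/circ_buf.h> are small enough to show in isolation; a self-contained userspace sketch (the ring size of 64 is an assumption standing in for ARRAY_SIZE(server->cb_break)):

#include <stdio.h>

/* The two macros as defined in <linux/circ_buf.h>; size must be a
 * power of two so the wrap is a cheap mask. */
#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

int main(void)
{
	unsigned int head = 0, tail = 0;
	const unsigned int size = 64; /* assumed ring size */

	/* fill until no space: one slot always stays empty, so that
	 * head == tail unambiguously means "ring empty" */
	while (CIRC_SPACE(head, tail, size) > 0)
		head = (head + 1) & (size - 1);

	printf("%u of %u slots used\n", CIRC_CNT(head, tail, size), size);
	return 0;
}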
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 1fc578372759..9b1311a1df51 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -1,4 +1,4 @@
-/* cell.c: AFS cell and server record management
+/* AFS cell and server record management
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -11,15 +11,9 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <rxrpc/peer.h>
-#include <rxrpc/connection.h>
-#include "volume.h"
-#include "cell.h"
-#include "server.h"
-#include "transport.h"
-#include "vlclient.h"
-#include "kafstimod.h"
-#include "super.h"
+#include <linux/key.h>
+#include <linux/ctype.h>
+#include <keys/rxrpc-type.h>
 #include "internal.h"
 
 DECLARE_RWSEM(afs_proc_cells_sem);
@@ -28,66 +22,47 @@ LIST_HEAD(afs_proc_cells);
 static struct list_head afs_cells = LIST_HEAD_INIT(afs_cells);
 static DEFINE_RWLOCK(afs_cells_lock);
 static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */
+static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq);
 static struct afs_cell *afs_cell_root;
 
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_cache_cell_index_def = {
-	.name		= "cell_ix",
-	.data_size	= sizeof(struct afs_cache_cell),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match		= afs_cell_cache_match,
-	.update		= afs_cell_cache_update,
-};
-#endif
-
-/*****************************************************************************/
 /*
- * create a cell record
- * - "name" is the name of the cell
- * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format
+ * allocate a cell record and fill in its name, VL server address list and
+ * allocate an anonymous key
  */
-int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
+static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
 {
 	struct afs_cell *cell;
-	char *next;
+	size_t namelen;
+	char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
 	int ret;
 
-	_enter("%s", name);
+	_enter("%s,%s", name, vllist);
 
 	BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
 
+	namelen = strlen(name);
+	if (namelen > AFS_MAXCELLNAME)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	/* allocate and initialise a cell record */
-	cell = kmalloc(sizeof(struct afs_cell) + strlen(name) + 1, GFP_KERNEL);
+	cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL);
 	if (!cell) {
 		_leave(" = -ENOMEM");
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 
-	down_write(&afs_cells_sem);
-
-	memset(cell, 0, sizeof(struct afs_cell));
-	atomic_set(&cell->usage, 0);
+	memcpy(cell->name, name, namelen);
+	cell->name[namelen] = 0;
 
+	atomic_set(&cell->usage, 1);
 	INIT_LIST_HEAD(&cell->link);
-
-	rwlock_init(&cell->sv_lock);
-	INIT_LIST_HEAD(&cell->sv_list);
-	INIT_LIST_HEAD(&cell->sv_graveyard);
-	spin_lock_init(&cell->sv_gylock);
-
+	rwlock_init(&cell->servers_lock);
+	INIT_LIST_HEAD(&cell->servers);
 	init_rwsem(&cell->vl_sem);
 	INIT_LIST_HEAD(&cell->vl_list);
-	INIT_LIST_HEAD(&cell->vl_graveyard);
-	spin_lock_init(&cell->vl_gylock);
-
-	strcpy(cell->name,name);
+	spin_lock_init(&cell->vl_lock);
 
 	/* fill in the VL server list from the rest of the string */
-	ret = -EINVAL;
 	do {
 		unsigned a, b, c, d;
 
@@ -96,20 +71,75 @@ int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
96 *next++ = 0; 71 *next++ = 0;
97 72
98 if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) 73 if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
99 goto badaddr; 74 goto bad_address;
100 75
101 if (a > 255 || b > 255 || c > 255 || d > 255) 76 if (a > 255 || b > 255 || c > 255 || d > 255)
102 goto badaddr; 77 goto bad_address;
103 78
104 cell->vl_addrs[cell->vl_naddrs++].s_addr = 79 cell->vl_addrs[cell->vl_naddrs++].s_addr =
105 htonl((a << 24) | (b << 16) | (c << 8) | d); 80 htonl((a << 24) | (b << 16) | (c << 8) | d);
106 81
107 if (cell->vl_naddrs >= AFS_CELL_MAX_ADDRS) 82 } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next));
108 break; 83
84 /* create a key to represent an anonymous user */
85 memcpy(keyname, "afs@", 4);
86 dp = keyname + 4;
87 cp = cell->name;
88 do {
89 *dp++ = toupper(*cp);
90 } while (*cp++);
91 cell->anonymous_key = key_alloc(&key_type_rxrpc, keyname, 0, 0, current,
92 KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA);
93 if (IS_ERR(cell->anonymous_key)) {
94 _debug("no key");
95 ret = PTR_ERR(cell->anonymous_key);
96 goto error;
97 }
98
99 ret = key_instantiate_and_link(cell->anonymous_key, NULL, 0,
100 NULL, NULL);
101 if (ret < 0) {
102 _debug("instantiate failed");
103 goto error;
104 }
105
106 _debug("anon key %p{%x}",
107 cell->anonymous_key, key_serial(cell->anonymous_key));
108
109 _leave(" = %p", cell);
110 return cell;
111
112bad_address:
113 printk(KERN_ERR "kAFS: bad VL server IP address\n");
114 ret = -EINVAL;
115error:
116 key_put(cell->anonymous_key);
117 kfree(cell);
118 _leave(" = %d", ret);
119 return ERR_PTR(ret);
120}
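
[Editor's sketch] The loop above takes the VL server list apart with strchr()/sscanf(), splitting on colons and range-checking each dotted quad. A minimal userspace sketch of the same parsing, assuming a writable input string and a plain uint32_t result array (names here are illustrative, not from the patch):

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define MAX_ADDRS 15

	/* parse up to MAX_ADDRS colon-separated "a.b.c.d" entries in place;
	 * returns the number parsed, or -1 on a malformed entry */
	static int parse_vllist(char *vllist, uint32_t addrs[MAX_ADDRS])
	{
		unsigned a, b, c, d;
		char *next;
		int n = 0;

		do {
			next = strchr(vllist, ':');
			if (next)
				*next++ = 0;	/* terminate this entry */

			if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
				return -1;
			if (a > 255 || b > 255 || c > 255 || d > 255)
				return -1;

			/* store in network byte order, as the kernel code does */
			addrs[n++] = htonl((a << 24) | (b << 16) | (c << 8) | d);
		} while (n < MAX_ADDRS && (vllist = next));

		return n;
	}
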
121
122/*
123 * create a cell record
124 * - "name" is the name of the cell
125 * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format
126 */
127struct afs_cell *afs_cell_create(const char *name, char *vllist)
128{
129 struct afs_cell *cell;
130 int ret;
131
132 _enter("%s,%s", name, vllist);
109 133
110 } while(vllist = next, vllist); 134 cell = afs_cell_alloc(name, vllist);
135 if (IS_ERR(cell)) {
136 _leave(" = %ld", PTR_ERR(cell));
137 return cell;
138 }
139
140 down_write(&afs_cells_sem);
111 141
112 /* add a proc dir for this cell */ 142 /* add a proc directory for this cell */
113 ret = afs_proc_cell_setup(cell); 143 ret = afs_proc_cell_setup(cell);
114 if (ret < 0) 144 if (ret < 0)
115 goto error; 145 goto error;
@@ -130,31 +160,28 @@ int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
130 down_write(&afs_proc_cells_sem); 160 down_write(&afs_proc_cells_sem);
131 list_add_tail(&cell->proc_link, &afs_proc_cells); 161 list_add_tail(&cell->proc_link, &afs_proc_cells);
132 up_write(&afs_proc_cells_sem); 162 up_write(&afs_proc_cells_sem);
133
134 *_cell = cell;
135 up_write(&afs_cells_sem); 163 up_write(&afs_cells_sem);
136 164
137 _leave(" = 0 (%p)", cell); 165 _leave(" = %p", cell);
138 return 0; 166 return cell;
139 167
140 badaddr: 168error:
141 printk(KERN_ERR "kAFS: bad VL server IP address: '%s'\n", vllist);
142 error:
143 up_write(&afs_cells_sem); 169 up_write(&afs_cells_sem);
170 key_put(cell->anonymous_key);
144 kfree(cell); 171 kfree(cell);
145 _leave(" = %d", ret); 172 _leave(" = %d", ret);
146 return ret; 173 return ERR_PTR(ret);
147} /* end afs_cell_create() */ 174}
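
[Editor's sketch] A recurring change in this patch is the switch from returning 0/-errno through an output parameter to returning the object itself with the errno encoded in the pointer. A minimal kernel-style sketch of that ERR_PTR() idiom (the foo names are illustrative):

	#include <linux/err.h>
	#include <linux/slab.h>

	struct foo { int x; };

	static struct foo *foo_create(int x)
	{
		struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

		if (!f)
			return ERR_PTR(-ENOMEM);  /* errno encoded in the pointer */
		f->x = x;
		return f;
	}

	/* caller side: test with IS_ERR(), decode with PTR_ERR() */
	static int foo_user(void)
	{
		struct foo *f = foo_create(1);

		if (IS_ERR(f))
			return PTR_ERR(f);
		kfree(f);
		return 0;
	}

This removes the possibility of the out-parameter and the return code disagreeing, which is why afs_cell_create() and afs_cell_lookup() are both converted.
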
148 175
149/*****************************************************************************/
150/* 176/*
151 * initialise the cell database from module parameters 177 * set the root cell information
178 * - can be called with a module parameter string
179 * - can be called from a write to /proc/fs/afs/rootcell
152 */ 180 */
153int afs_cell_init(char *rootcell) 181int afs_cell_init(char *rootcell)
154{ 182{
155 struct afs_cell *old_root, *new_root; 183 struct afs_cell *old_root, *new_root;
156 char *cp; 184 char *cp;
157 int ret;
158 185
159 _enter(""); 186 _enter("");
160 187
@@ -162,82 +189,60 @@ int afs_cell_init(char *rootcell)
162 /* module is loaded with no parameters, or built statically. 189 /* module is loaded with no parameters, or built statically.
163 * - in the future we might initialize cell DB here. 190 * - in the future we might initialize cell DB here.
164 */ 191 */
165 _leave(" = 0 (but no root)"); 192 _leave(" = 0 [no root]");
166 return 0; 193 return 0;
167 } 194 }
168 195
169 cp = strchr(rootcell, ':'); 196 cp = strchr(rootcell, ':');
170 if (!cp) { 197 if (!cp) {
171 printk(KERN_ERR "kAFS: no VL server IP addresses specified\n"); 198 printk(KERN_ERR "kAFS: no VL server IP addresses specified\n");
172 _leave(" = %d (no colon)", -EINVAL); 199 _leave(" = -EINVAL");
173 return -EINVAL; 200 return -EINVAL;
174 } 201 }
175 202
176 /* allocate a cell record for the root cell */ 203 /* allocate a cell record for the root cell */
177 *cp++ = 0; 204 *cp++ = 0;
178 ret = afs_cell_create(rootcell, cp, &new_root); 205 new_root = afs_cell_create(rootcell, cp);
179 if (ret < 0) { 206 if (IS_ERR(new_root)) {
180 _leave(" = %d", ret); 207 _leave(" = %ld", PTR_ERR(new_root));
181 return ret; 208 return PTR_ERR(new_root);
182 } 209 }
183 210
184 /* as afs_put_cell() takes locks by itself, we have to do 211 /* install the new cell */
185 * a little gymnastics to be race-free.
186 */
187 afs_get_cell(new_root);
188
189 write_lock(&afs_cells_lock); 212 write_lock(&afs_cells_lock);
190 while (afs_cell_root) { 213 old_root = afs_cell_root;
191 old_root = afs_cell_root;
192 afs_cell_root = NULL;
193 write_unlock(&afs_cells_lock);
194 afs_put_cell(old_root);
195 write_lock(&afs_cells_lock);
196 }
197 afs_cell_root = new_root; 214 afs_cell_root = new_root;
198 write_unlock(&afs_cells_lock); 215 write_unlock(&afs_cells_lock);
216 afs_put_cell(old_root);
199 217
200 _leave(" = %d", ret); 218 _leave(" = 0");
201 return ret; 219 return 0;
202 220}
203} /* end afs_cell_init() */
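
[Editor's sketch] afs_cell_init() cuts the "cellname:vlserverlist" string at the first colon, in place, before handing the two halves to afs_cell_create(); it can then install the result with a plain pointer swap under afs_cells_lock, because afs_put_cell() on the old root is only called after the lock is dropped. The split, in sketch form, assuming a writable buffer:

	#include <string.h>

	/* split "name:vllist" in place; returns 0 and sets *vllist,
	 * or -1 if no colon is present */
	static int split_rootcell(char *rootcell, char **vllist)
	{
		char *cp = strchr(rootcell, ':');

		if (!cp)
			return -1;	/* no VL server addresses given */
		*cp++ = 0;		/* NUL-terminate the cell name */
		*vllist = cp;		/* remainder is the address list */
		return 0;
	}
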
204 221
205/*****************************************************************************/
206/* 222/*
207 * lookup a cell record 223 * lookup a cell record
208 */ 224 */
209int afs_cell_lookup(const char *name, unsigned namesz, struct afs_cell **_cell) 225struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
210{ 226{
211 struct afs_cell *cell; 227 struct afs_cell *cell;
212 int ret;
213 228
214 _enter("\"%*.*s\",", namesz, namesz, name ? name : ""); 229 _enter("\"%*.*s\",", namesz, namesz, name ? name : "");
215 230
216 *_cell = NULL; 231 down_read(&afs_cells_sem);
232 read_lock(&afs_cells_lock);
217 233
218 if (name) { 234 if (name) {
219 /* if the cell was named, look for it in the cell record list */ 235 /* if the cell was named, look for it in the cell record list */
220 ret = -ENOENT;
221 cell = NULL;
222 read_lock(&afs_cells_lock);
223
224 list_for_each_entry(cell, &afs_cells, link) { 236 list_for_each_entry(cell, &afs_cells, link) {
225 if (strncmp(cell->name, name, namesz) == 0) { 237 if (strncmp(cell->name, name, namesz) == 0) {
226 afs_get_cell(cell); 238 afs_get_cell(cell);
227 goto found; 239 goto found;
228 } 240 }
229 } 241 }
230 cell = NULL; 242 cell = ERR_PTR(-ENOENT);
231 found: 243 found:
232 244 ;
233 read_unlock(&afs_cells_lock); 245 } else {
234
235 if (cell)
236 ret = 0;
237 }
238 else {
239 read_lock(&afs_cells_lock);
240
241 cell = afs_cell_root; 246 cell = afs_cell_root;
242 if (!cell) { 247 if (!cell) {
243 /* this should not happen unless user tries to mount 248 /* this should not happen unless user tries to mount
@@ -246,44 +251,35 @@ int afs_cell_lookup(const char *name, unsigned namesz, struct afs_cell **_cell)
246 * ENOENT might be "more appropriate" but they happen 251 * ENOENT might be "more appropriate" but they happen
247 * for other reasons. 252 * for other reasons.
248 */ 253 */
249 ret = -EDESTADDRREQ; 254 cell = ERR_PTR(-EDESTADDRREQ);
250 } 255 } else {
251 else {
252 afs_get_cell(cell); 256 afs_get_cell(cell);
253 ret = 0;
254 } 257 }
255 258
256 read_unlock(&afs_cells_lock);
257 } 259 }
258 260
259 *_cell = cell; 261 read_unlock(&afs_cells_lock);
260 _leave(" = %d (%p)", ret, cell); 262 up_read(&afs_cells_sem);
261 return ret; 263 _leave(" = %p", cell);
262 264 return cell;
263} /* end afs_cell_lookup() */ 265}
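
[Editor's sketch] The lookup bumps the usage count while still inside the read-locked list walk, so a racing afs_put_cell() cannot free the cell between the name match and the reference grab. Condensed to its essentials (types and names illustrative, not buildable standalone):

	#include <linux/atomic.h>
	#include <linux/err.h>
	#include <linux/list.h>
	#include <linux/spinlock.h>
	#include <linux/string.h>

	struct cell {
		atomic_t		usage;
		struct list_head	link;
		char			name[];
	};

	static LIST_HEAD(cells);
	static DEFINE_RWLOCK(cells_lock);

	/* sketch: take the reference before dropping the list lock */
	static struct cell *cell_lookup(const char *name, size_t len)
	{
		struct cell *c, *ret = ERR_PTR(-ENOENT);

		read_lock(&cells_lock);
		list_for_each_entry(c, &cells, link) {
			if (strncmp(c->name, name, len) == 0) {
				atomic_inc(&c->usage);	/* ref taken under lock */
				ret = c;
				break;
			}
		}
		read_unlock(&cells_lock);
		return ret;
	}
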
264 266
265/*****************************************************************************/
266/* 267/*
267 * try and get a cell record 268 * try and get a cell record
268 */ 269 */
269struct afs_cell *afs_get_cell_maybe(struct afs_cell **_cell) 270struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell)
270{ 271{
271 struct afs_cell *cell;
272
273 write_lock(&afs_cells_lock); 272 write_lock(&afs_cells_lock);
274 273
275 cell = *_cell;
276 if (cell && !list_empty(&cell->link)) 274 if (cell && !list_empty(&cell->link))
277 afs_get_cell(cell); 275 afs_get_cell(cell);
278 else 276 else
279 cell = NULL; 277 cell = NULL;
280 278
281 write_unlock(&afs_cells_lock); 279 write_unlock(&afs_cells_lock);
282
283 return cell; 280 return cell;
284} /* end afs_get_cell_maybe() */ 281}
285 282
286/*****************************************************************************/
287/* 283/*
288 * destroy a cell record 284 * destroy a cell record
289 */ 285 */
@@ -294,8 +290,7 @@ void afs_put_cell(struct afs_cell *cell)
294 290
295 _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); 291 _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
296 292
297 /* sanity check */ 293 ASSERTCMP(atomic_read(&cell->usage), >, 0);
298 BUG_ON(atomic_read(&cell->usage) <= 0);
299 294
300 /* to prevent a race, the decrement and the dequeue must be effectively 295 /* to prevent a race, the decrement and the dequeue must be effectively
301 * atomic */ 296 * atomic */
@@ -307,36 +302,49 @@ void afs_put_cell(struct afs_cell *cell)
307 return; 302 return;
308 } 303 }
309 304
305 ASSERT(list_empty(&cell->servers));
306 ASSERT(list_empty(&cell->vl_list));
307
310 write_unlock(&afs_cells_lock); 308 write_unlock(&afs_cells_lock);
311 309
312 BUG_ON(!list_empty(&cell->sv_list)); 310 wake_up(&afs_cells_freeable_wq);
313 BUG_ON(!list_empty(&cell->sv_graveyard));
314 BUG_ON(!list_empty(&cell->vl_list));
315 BUG_ON(!list_empty(&cell->vl_graveyard));
316 311
317 _leave(" [unused]"); 312 _leave(" [unused]");
318} /* end afs_put_cell() */ 313}
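
[Editor's sketch] afs_put_cell() keeps the decrement and the final-reference check under the cells write lock, so afs_get_cell_maybe() can never resurrect a cell whose count has already reached zero; the wake_up() afterwards unblocks a destroyer sleeping in afs_cell_destroy(). The shape of the pattern, reduced (same illustrative types as the lookup sketch above):

	static DECLARE_WAIT_QUEUE_HEAD(cells_freeable_wq);

	static void cell_put(struct cell *cell)
	{
		write_lock(&cells_lock);
		if (!atomic_dec_and_test(&cell->usage)) {
			write_unlock(&cells_lock);	/* still in use */
			return;
		}
		/* count reached zero under the lock: no revival possible */
		write_unlock(&cells_lock);
		wake_up(&cells_freeable_wq);		/* see cell_destroy() */
	}
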
319 314
320/*****************************************************************************/
321/* 315/*
322 * destroy a cell record 316 * destroy a cell record
317 * - must be called with the afs_cells_sem write-locked
318 * - cell->link should have been broken by the caller
323 */ 319 */
324static void afs_cell_destroy(struct afs_cell *cell) 320static void afs_cell_destroy(struct afs_cell *cell)
325{ 321{
326 _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); 322 _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
327 323
328 /* to prevent a race, the decrement and the dequeue must be effectively 324 ASSERTCMP(atomic_read(&cell->usage), >=, 0);
329 * atomic */ 325 ASSERT(list_empty(&cell->link));
330 write_lock(&afs_cells_lock);
331 326
332 /* sanity check */ 327 /* wait for everyone to stop using the cell */
333 BUG_ON(atomic_read(&cell->usage) != 0); 328 if (atomic_read(&cell->usage) > 0) {
329 DECLARE_WAITQUEUE(myself, current);
334 330
335 list_del_init(&cell->link); 331 _debug("wait for cell %s", cell->name);
332 set_current_state(TASK_UNINTERRUPTIBLE);
333 add_wait_queue(&afs_cells_freeable_wq, &myself);
336 334
337 write_unlock(&afs_cells_lock); 335 while (atomic_read(&cell->usage) > 0) {
336 schedule();
337 set_current_state(TASK_UNINTERRUPTIBLE);
338 }
338 339
339 down_write(&afs_cells_sem); 340 remove_wait_queue(&afs_cells_freeable_wq, &myself);
341 set_current_state(TASK_RUNNING);
342 }
343
344 _debug("cell dead");
345 ASSERTCMP(atomic_read(&cell->usage), ==, 0);
346 ASSERT(list_empty(&cell->servers));
347 ASSERT(list_empty(&cell->vl_list));
340 348
341 afs_proc_cell_remove(cell); 349 afs_proc_cell_remove(cell);
342 350
@@ -348,104 +356,26 @@ static void afs_cell_destroy(struct afs_cell *cell)
348 cachefs_relinquish_cookie(cell->cache, 0); 356 cachefs_relinquish_cookie(cell->cache, 0);
349#endif 357#endif
350 358
351 up_write(&afs_cells_sem); 359 key_put(cell->anonymous_key);
352
353 BUG_ON(!list_empty(&cell->sv_list));
354 BUG_ON(!list_empty(&cell->sv_graveyard));
355 BUG_ON(!list_empty(&cell->vl_list));
356 BUG_ON(!list_empty(&cell->vl_graveyard));
357
358 /* finish cleaning up the cell */
359 kfree(cell); 360 kfree(cell);
360 361
361 _leave(" [destroyed]"); 362 _leave(" [destroyed]");
362} /* end afs_cell_destroy() */ 363}
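
[Editor's sketch] The destroy path now sleeps on afs_cells_freeable_wq until the last reference goes away, using the classic open-coded wait loop; setting the task state before re-testing the condition is what prevents a lost wakeup between the check and the schedule(). In sketch form:

	/* sketch: block until the usage count drains to zero */
	static void wait_for_cell_unused(struct cell *cell)
	{
		DECLARE_WAITQUEUE(myself, current);

		set_current_state(TASK_UNINTERRUPTIBLE);
		add_wait_queue(&cells_freeable_wq, &myself);

		while (atomic_read(&cell->usage) > 0) {
			schedule();	/* woken by cell_put() hitting zero */
			set_current_state(TASK_UNINTERRUPTIBLE);
		}

		remove_wait_queue(&cells_freeable_wq, &myself);
		set_current_state(TASK_RUNNING);
	}
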
363
364/*****************************************************************************/
365/*
366 * lookup the server record corresponding to an Rx RPC peer
367 */
368int afs_server_find_by_peer(const struct rxrpc_peer *peer,
369 struct afs_server **_server)
370{
371 struct afs_server *server;
372 struct afs_cell *cell;
373
374 _enter("%p{a=%08x},", peer, ntohl(peer->addr.s_addr));
375
376 /* search the cell list */
377 read_lock(&afs_cells_lock);
378
379 list_for_each_entry(cell, &afs_cells, link) {
380
381 _debug("? cell %s",cell->name);
382
383 write_lock(&cell->sv_lock);
384
385 /* check the active list */
386 list_for_each_entry(server, &cell->sv_list, link) {
387 _debug("?? server %08x", ntohl(server->addr.s_addr));
388
389 if (memcmp(&server->addr, &peer->addr,
390 sizeof(struct in_addr)) == 0)
391 goto found_server;
392 }
393 364
394 /* check the inactive list */
395 spin_lock(&cell->sv_gylock);
396 list_for_each_entry(server, &cell->sv_graveyard, link) {
397 _debug("?? dead server %08x",
398 ntohl(server->addr.s_addr));
399
400 if (memcmp(&server->addr, &peer->addr,
401 sizeof(struct in_addr)) == 0)
402 goto found_dead_server;
403 }
404 spin_unlock(&cell->sv_gylock);
405
406 write_unlock(&cell->sv_lock);
407 }
408 read_unlock(&afs_cells_lock);
409
410 _leave(" = -ENOENT");
411 return -ENOENT;
412
413 /* we found it in the graveyard - resurrect it */
414 found_dead_server:
415 list_move_tail(&server->link, &cell->sv_list);
416 afs_get_server(server);
417 afs_kafstimod_del_timer(&server->timeout);
418 spin_unlock(&cell->sv_gylock);
419 goto success;
420
421 /* we found it - increment its ref count and return it */
422 found_server:
423 afs_get_server(server);
424
425 success:
426 write_unlock(&cell->sv_lock);
427 read_unlock(&afs_cells_lock);
428
429 *_server = server;
430 _leave(" = 0 (s=%p c=%p)", server, cell);
431 return 0;
432
433} /* end afs_server_find_by_peer() */
434
435/*****************************************************************************/
436/* 365/*
437 * purge in-memory cell database on module unload or afs_init() failure 366 * purge in-memory cell database on module unload or afs_init() failure
438 * - the timeout daemon is stopped before calling this 367 * - the timeout daemon is stopped before calling this
439 */ 368 */
440void afs_cell_purge(void) 369void afs_cell_purge(void)
441{ 370{
442 struct afs_vlocation *vlocation;
443 struct afs_cell *cell; 371 struct afs_cell *cell;
444 372
445 _enter(""); 373 _enter("");
446 374
447 afs_put_cell(afs_cell_root); 375 afs_put_cell(afs_cell_root);
448 376
377 down_write(&afs_cells_sem);
378
449 while (!list_empty(&afs_cells)) { 379 while (!list_empty(&afs_cells)) {
450 cell = NULL; 380 cell = NULL;
451 381
@@ -464,104 +394,11 @@ void afs_cell_purge(void)
464 _debug("PURGING CELL %s (%d)", 394 _debug("PURGING CELL %s (%d)",
465 cell->name, atomic_read(&cell->usage)); 395 cell->name, atomic_read(&cell->usage));
466 396
467 BUG_ON(!list_empty(&cell->sv_list));
468 BUG_ON(!list_empty(&cell->vl_list));
469
470 /* purge the cell's VL graveyard list */
471 _debug(" - clearing VL graveyard");
472
473 spin_lock(&cell->vl_gylock);
474
475 while (!list_empty(&cell->vl_graveyard)) {
476 vlocation = list_entry(cell->vl_graveyard.next,
477 struct afs_vlocation,
478 link);
479 list_del_init(&vlocation->link);
480
481 afs_kafstimod_del_timer(&vlocation->timeout);
482
483 spin_unlock(&cell->vl_gylock);
484
485 afs_vlocation_do_timeout(vlocation);
486 /* TODO: race if move to use krxtimod instead
487 * of kafstimod */
488
489 spin_lock(&cell->vl_gylock);
490 }
491
492 spin_unlock(&cell->vl_gylock);
493
494 /* purge the cell's server graveyard list */
495 _debug(" - clearing server graveyard");
496
497 spin_lock(&cell->sv_gylock);
498
499 while (!list_empty(&cell->sv_graveyard)) {
500 struct afs_server *server;
501
502 server = list_entry(cell->sv_graveyard.next,
503 struct afs_server, link);
504 list_del_init(&server->link);
505
506 afs_kafstimod_del_timer(&server->timeout);
507
508 spin_unlock(&cell->sv_gylock);
509
510 afs_server_do_timeout(server);
511
512 spin_lock(&cell->sv_gylock);
513 }
514
515 spin_unlock(&cell->sv_gylock);
516
517 /* now the cell should be left with no references */ 397 /* now the cell should be left with no references */
518 afs_cell_destroy(cell); 398 afs_cell_destroy(cell);
519 } 399 }
520 } 400 }
521 401
402 up_write(&afs_cells_sem);
522 _leave(""); 403 _leave("");
523} /* end afs_cell_purge() */ 404}
524
525/*****************************************************************************/
526/*
527 * match a cell record obtained from the cache
528 */
529#ifdef AFS_CACHING_SUPPORT
530static cachefs_match_val_t afs_cell_cache_match(void *target,
531 const void *entry)
532{
533 const struct afs_cache_cell *ccell = entry;
534 struct afs_cell *cell = target;
535
536 _enter("{%s},{%s}", ccell->name, cell->name);
537
538 if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
539 _leave(" = SUCCESS");
540 return CACHEFS_MATCH_SUCCESS;
541 }
542
543 _leave(" = FAILED");
544 return CACHEFS_MATCH_FAILED;
545} /* end afs_cell_cache_match() */
546#endif
547
548/*****************************************************************************/
549/*
550 * update a cell record in the cache
551 */
552#ifdef AFS_CACHING_SUPPORT
553static void afs_cell_cache_update(void *source, void *entry)
554{
555 struct afs_cache_cell *ccell = entry;
556 struct afs_cell *cell = source;
557
558 _enter("%p,%p", source, entry);
559
560 strncpy(ccell->name, cell->name, sizeof(ccell->name));
561
562 memcpy(ccell->vl_servers,
563 cell->vl_addrs,
564 min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
565
566} /* end afs_cell_cache_update() */
567#endif
diff --git a/fs/afs/cell.h b/fs/afs/cell.h
deleted file mode 100644
index 48349108fb00..000000000000
--- a/fs/afs/cell.h
+++ /dev/null
@@ -1,78 +0,0 @@
1/* cell.h: AFS cell record
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_CELL_H
13#define _LINUX_AFS_CELL_H
14
15#include "types.h"
16#include "cache.h"
17
18#define AFS_CELL_MAX_ADDRS 15
19
20extern volatile int afs_cells_being_purged; /* T when cells are being purged by rmmod */
21
22/*****************************************************************************/
23/*
24 * entry in the cached cell catalogue
25 */
26struct afs_cache_cell
27{
28 char name[64]; /* cell name (padded with NULs) */
29 struct in_addr vl_servers[15]; /* cached cell VL servers */
30};
31
32/*****************************************************************************/
33/*
34 * AFS cell record
35 */
36struct afs_cell
37{
38 atomic_t usage;
39 struct list_head link; /* main cell list link */
40 struct list_head proc_link; /* /proc cell list link */
41 struct proc_dir_entry *proc_dir; /* /proc dir for this cell */
42#ifdef AFS_CACHING_SUPPORT
43 struct cachefs_cookie *cache; /* caching cookie */
44#endif
45
46 /* server record management */
47 rwlock_t sv_lock; /* active server list lock */
48 struct list_head sv_list; /* active server list */
49 struct list_head sv_graveyard; /* inactive server list */
50 spinlock_t sv_gylock; /* inactive server list lock */
51
52 /* volume location record management */
53 struct rw_semaphore vl_sem; /* volume management serialisation semaphore */
54 struct list_head vl_list; /* cell's active VL record list */
55 struct list_head vl_graveyard; /* cell's inactive VL record list */
56 spinlock_t vl_gylock; /* graveyard lock */
57 unsigned short vl_naddrs; /* number of VL servers in addr list */
58 unsigned short vl_curr_svix; /* current server index */
59 struct in_addr vl_addrs[AFS_CELL_MAX_ADDRS]; /* cell VL server addresses */
60
61 char name[0]; /* cell name - must go last */
62};
63
64extern int afs_cell_init(char *rootcell);
65
66extern int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell);
67
68extern int afs_cell_lookup(const char *name, unsigned nmsize, struct afs_cell **_cell);
69
70#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
71
72extern struct afs_cell *afs_get_cell_maybe(struct afs_cell **_cell);
73
74extern void afs_put_cell(struct afs_cell *cell);
75
76extern void afs_cell_purge(void);
77
78#endif /* _LINUX_AFS_CELL_H */
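
[Editor's sketch] Note the char name[0] member closing struct afs_cell above: the name travels in the same allocation as the record, which is why afs_cell_alloc() sizes its kzalloc() as sizeof(struct afs_cell) + namelen + 1. A standalone sketch of the idiom (modern C spells the member name[]):

	#include <stdlib.h>
	#include <string.h>

	struct record {
		int usage;
		char name[];	/* flexible array member, must be last */
	};

	static struct record *record_alloc(const char *name)
	{
		size_t namelen = strlen(name);
		struct record *r = calloc(1, sizeof(*r) + namelen + 1);

		if (!r)
			return NULL;
		memcpy(r->name, name, namelen);
		r->name[namelen] = 0;	/* explicit NUL, as the patch does */
		return r;
	}
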
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 3d097fddcb7a..d5b2ad6575bc 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -1,4 +1,4 @@
1/* cmservice.c: AFS Cache Manager Service 1/* AFS Cache Manager Service
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
@@ -12,641 +12,464 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/completion.h> 15#include <linux/ip.h>
16#include "server.h"
17#include "cell.h"
18#include "transport.h"
19#include <rxrpc/rxrpc.h>
20#include <rxrpc/transport.h>
21#include <rxrpc/connection.h>
22#include <rxrpc/call.h>
23#include "cmservice.h"
24#include "internal.h" 16#include "internal.h"
17#include "afs_cm.h"
25 18
26static unsigned afscm_usage; /* AFS cache manager usage count */ 19struct workqueue_struct *afs_cm_workqueue;
27static struct rw_semaphore afscm_sem; /* AFS cache manager start/stop semaphore */
28
29static int afscm_new_call(struct rxrpc_call *call);
30static void afscm_attention(struct rxrpc_call *call);
31static void afscm_error(struct rxrpc_call *call);
32static void afscm_aemap(struct rxrpc_call *call);
33
34static void _SRXAFSCM_CallBack(struct rxrpc_call *call);
35static void _SRXAFSCM_InitCallBackState(struct rxrpc_call *call);
36static void _SRXAFSCM_Probe(struct rxrpc_call *call);
37
38typedef void (*_SRXAFSCM_xxxx_t)(struct rxrpc_call *call);
39
40static const struct rxrpc_operation AFSCM_ops[] = {
41 {
42 .id = 204,
43 .asize = RXRPC_APP_MARK_EOF,
44 .name = "CallBack",
45 .user = _SRXAFSCM_CallBack,
46 },
47 {
48 .id = 205,
49 .asize = RXRPC_APP_MARK_EOF,
50 .name = "InitCallBackState",
51 .user = _SRXAFSCM_InitCallBackState,
52 },
53 {
54 .id = 206,
55 .asize = RXRPC_APP_MARK_EOF,
56 .name = "Probe",
57 .user = _SRXAFSCM_Probe,
58 },
59#if 0
60 {
61 .id = 207,
62 .asize = RXRPC_APP_MARK_EOF,
63 .name = "GetLock",
64 .user = _SRXAFSCM_GetLock,
65 },
66 {
67 .id = 208,
68 .asize = RXRPC_APP_MARK_EOF,
69 .name = "GetCE",
70 .user = _SRXAFSCM_GetCE,
71 },
72 {
73 .id = 209,
74 .asize = RXRPC_APP_MARK_EOF,
75 .name = "GetXStatsVersion",
76 .user = _SRXAFSCM_GetXStatsVersion,
77 },
78 {
79 .id = 210,
80 .asize = RXRPC_APP_MARK_EOF,
81 .name = "GetXStats",
82 .user = _SRXAFSCM_GetXStats,
83 }
84#endif
85};
86 20
87static struct rxrpc_service AFSCM_service = { 21static int afs_deliver_cb_init_call_back_state(struct afs_call *,
88 .name = "AFS/CM", 22 struct sk_buff *, bool);
89 .owner = THIS_MODULE, 23static int afs_deliver_cb_init_call_back_state3(struct afs_call *,
90 .link = LIST_HEAD_INIT(AFSCM_service.link), 24 struct sk_buff *, bool);
91 .new_call = afscm_new_call, 25static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool);
92 .service_id = 1, 26static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool);
93 .attn_func = afscm_attention, 27static int afs_deliver_cb_get_capabilities(struct afs_call *, struct sk_buff *,
94 .error_func = afscm_error, 28 bool);
95 .aemap_func = afscm_aemap, 29static void afs_cm_destructor(struct afs_call *);
96 .ops_begin = &AFSCM_ops[0],
97 .ops_end = &AFSCM_ops[ARRAY_SIZE(AFSCM_ops)],
98};
99 30
100static DECLARE_COMPLETION(kafscmd_alive);
101static DECLARE_COMPLETION(kafscmd_dead);
102static DECLARE_WAIT_QUEUE_HEAD(kafscmd_sleepq);
103static LIST_HEAD(kafscmd_attention_list);
104static LIST_HEAD(afscm_calls);
105static DEFINE_SPINLOCK(afscm_calls_lock);
106static DEFINE_SPINLOCK(kafscmd_attention_lock);
107static int kafscmd_die;
108
109/*****************************************************************************/
110/* 31/*
111 * AFS Cache Manager kernel thread 32 * CB.CallBack operation type
112 */ 33 */
113static int kafscmd(void *arg) 34static const struct afs_call_type afs_SRXCBCallBack = {
114{ 35 .name = "CB.CallBack",
115 DECLARE_WAITQUEUE(myself, current); 36 .deliver = afs_deliver_cb_callback,
116 37 .abort_to_error = afs_abort_to_error,
117 struct rxrpc_call *call; 38 .destructor = afs_cm_destructor,
118 _SRXAFSCM_xxxx_t func; 39};
119 int die;
120
121 printk(KERN_INFO "kAFS: Started kafscmd %d\n", current->pid);
122
123 daemonize("kafscmd");
124
125 complete(&kafscmd_alive);
126
127 /* loop around looking for things to attend to */
128 do {
129 if (list_empty(&kafscmd_attention_list)) {
130 set_current_state(TASK_INTERRUPTIBLE);
131 add_wait_queue(&kafscmd_sleepq, &myself);
132
133 for (;;) {
134 set_current_state(TASK_INTERRUPTIBLE);
135 if (!list_empty(&kafscmd_attention_list) ||
136 signal_pending(current) ||
137 kafscmd_die)
138 break;
139
140 schedule();
141 }
142
143 remove_wait_queue(&kafscmd_sleepq, &myself);
144 set_current_state(TASK_RUNNING);
145 }
146
147 die = kafscmd_die;
148
149 /* dequeue the next call requiring attention */
150 call = NULL;
151 spin_lock(&kafscmd_attention_lock);
152
153 if (!list_empty(&kafscmd_attention_list)) {
154 call = list_entry(kafscmd_attention_list.next,
155 struct rxrpc_call,
156 app_attn_link);
157 list_del_init(&call->app_attn_link);
158 die = 0;
159 }
160
161 spin_unlock(&kafscmd_attention_lock);
162
163 if (call) {
164 /* act upon it */
165 _debug("@@@ Begin Attend Call %p", call);
166
167 func = call->app_user;
168 if (func)
169 func(call);
170
171 rxrpc_put_call(call);
172
173 _debug("@@@ End Attend Call %p", call);
174 }
175
176 } while(!die);
177
178 /* and that's all */
179 complete_and_exit(&kafscmd_dead, 0);
180
181} /* end kafscmd() */
182 40
183/*****************************************************************************/
184/* 41/*
185 * handle a call coming in to the cache manager 42 * CB.InitCallBackState operation type
186 * - if I want to keep the call, I must increment its usage count
187 * - the return value will be negated and passed back in an abort packet if
188 * non-zero
189 * - serialised by virtue of there only being one krxiod
190 */ 43 */
191static int afscm_new_call(struct rxrpc_call *call) 44static const struct afs_call_type afs_SRXCBInitCallBackState = {
192{ 45 .name = "CB.InitCallBackState",
193 _enter("%p{cid=%u u=%d}", 46 .deliver = afs_deliver_cb_init_call_back_state,
194 call, ntohl(call->call_id), atomic_read(&call->usage)); 47 .abort_to_error = afs_abort_to_error,
195 48 .destructor = afs_cm_destructor,
196 rxrpc_get_call(call); 49};
197
198 /* add to my current call list */
199 spin_lock(&afscm_calls_lock);
200 list_add(&call->app_link,&afscm_calls);
201 spin_unlock(&afscm_calls_lock);
202
203 _leave(" = 0");
204 return 0;
205
206} /* end afscm_new_call() */
207 50
208/*****************************************************************************/
209/* 51/*
210 * queue on the kafscmd queue for attention 52 * CB.InitCallBackState3 operation type
211 */ 53 */
212static void afscm_attention(struct rxrpc_call *call) 54static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
213{ 55 .name = "CB.InitCallBackState3",
214 _enter("%p{cid=%u u=%d}", 56 .deliver = afs_deliver_cb_init_call_back_state3,
215 call, ntohl(call->call_id), atomic_read(&call->usage)); 57 .abort_to_error = afs_abort_to_error,
216 58 .destructor = afs_cm_destructor,
217 spin_lock(&kafscmd_attention_lock); 59};
218
219 if (list_empty(&call->app_attn_link)) {
220 list_add_tail(&call->app_attn_link, &kafscmd_attention_list);
221 rxrpc_get_call(call);
222 }
223
224 spin_unlock(&kafscmd_attention_lock);
225
226 wake_up(&kafscmd_sleepq);
227
228 _leave(" {u=%d}", atomic_read(&call->usage));
229} /* end afscm_attention() */
230 60
231/*****************************************************************************/
232/* 61/*
233 * handle my call being aborted 62 * CB.Probe operation type
234 * - clean up, dequeue and put my ref to the call
235 */ 63 */
236static void afscm_error(struct rxrpc_call *call) 64static const struct afs_call_type afs_SRXCBProbe = {
237{ 65 .name = "CB.Probe",
238 int removed; 66 .deliver = afs_deliver_cb_probe,
239 67 .abort_to_error = afs_abort_to_error,
240 _enter("%p{est=%s ac=%u er=%d}", 68 .destructor = afs_cm_destructor,
241 call, 69};
242 rxrpc_call_error_states[call->app_err_state],
243 call->app_abort_code,
244 call->app_errno);
245
246 spin_lock(&kafscmd_attention_lock);
247
248 if (list_empty(&call->app_attn_link)) {
249 list_add_tail(&call->app_attn_link, &kafscmd_attention_list);
250 rxrpc_get_call(call);
251 }
252
253 spin_unlock(&kafscmd_attention_lock);
254
255 removed = 0;
256 spin_lock(&afscm_calls_lock);
257 if (!list_empty(&call->app_link)) {
258 list_del_init(&call->app_link);
259 removed = 1;
260 }
261 spin_unlock(&afscm_calls_lock);
262
263 if (removed)
264 rxrpc_put_call(call);
265
266 wake_up(&kafscmd_sleepq);
267 70
268 _leave(""); 71/*
269} /* end afscm_error() */ 72 * CB.GetCapabilities operation type
73 */
74static const struct afs_call_type afs_SRXCBGetCapabilites = {
75 .name = "CB.GetCapabilities",
76 .deliver = afs_deliver_cb_get_capabilities,
77 .abort_to_error = afs_abort_to_error,
78 .destructor = afs_cm_destructor,
79};
270 80
271/*****************************************************************************/
272/* 81/*
273 * map afs abort codes to/from Linux error codes 82 * route an incoming cache manager call
274 * - called with call->lock held 83 * - return T if supported, F if not
275 */ 84 */
276static void afscm_aemap(struct rxrpc_call *call) 85bool afs_cm_incoming_call(struct afs_call *call)
277{ 86{
278 switch (call->app_err_state) { 87 u32 operation_id = ntohl(call->operation_ID);
279 case RXRPC_ESTATE_LOCAL_ABORT: 88
280 call->app_abort_code = -call->app_errno; 89 _enter("{CB.OP %u}", operation_id);
281 break; 90
282 case RXRPC_ESTATE_PEER_ABORT: 91 switch (operation_id) {
283 call->app_errno = -ECONNABORTED; 92 case CBCallBack:
284 break; 93 call->type = &afs_SRXCBCallBack;
94 return true;
95 case CBInitCallBackState:
96 call->type = &afs_SRXCBInitCallBackState;
97 return true;
98 case CBInitCallBackState3:
99 call->type = &afs_SRXCBInitCallBackState3;
100 return true;
101 case CBProbe:
102 call->type = &afs_SRXCBProbe;
103 return true;
104 case CBGetCapabilities:
105 call->type = &afs_SRXCBGetCapabilites;
106 return true;
285 default: 107 default:
286 break; 108 return false;
287 } 109 }
288} /* end afscm_aemap() */ 110}
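
[Editor's sketch] The rewritten cache manager replaces the kafscmd daemon and its rxrpc_operation table with per-operation afs_call_type descriptors, selected by a switch on the wire operation ID. A reduced sketch of that dispatch shape (op number 206 is CB.Probe's ID, visible in the old AFSCM_ops table above; everything else illustrative):

	#include <stdbool.h>

	struct call;

	struct call_type {
		const char *name;
		int (*deliver)(struct call *call);
	};

	struct call {
		const struct call_type *type;
	};

	static int deliver_probe(struct call *call)
	{
		return 0;	/* nothing to unmarshall for a probe */
	}

	static const struct call_type srv_probe = {
		.name	 = "CB.Probe",
		.deliver = deliver_probe,
	};

	/* route an incoming call by operation ID; false if unsupported */
	static bool incoming_call(struct call *call, unsigned op)
	{
		switch (op) {
		case 206:	/* CB.Probe */
			call->type = &srv_probe;
			return true;
		default:
			return false;
		}
	}

Keeping the handlers in const type objects means the generic rxrpc glue can drive delivery, abort mapping and destruction without knowing which operation it is running.
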
289 111
290/*****************************************************************************/
291/* 112/*
292 * start the cache manager service if not already started 113 * clean up a cache manager call
293 */ 114 */
294int afscm_start(void) 115static void afs_cm_destructor(struct afs_call *call)
295{ 116{
296 int ret; 117 _enter("");
297
298 down_write(&afscm_sem);
299 if (!afscm_usage) {
300 ret = kernel_thread(kafscmd, NULL, 0);
301 if (ret < 0)
302 goto out;
303
304 wait_for_completion(&kafscmd_alive);
305
306 ret = rxrpc_add_service(afs_transport, &AFSCM_service);
307 if (ret < 0)
308 goto kill;
309
310 afs_kafstimod_add_timer(&afs_mntpt_expiry_timer,
311 afs_mntpt_expiry_timeout * HZ);
312 }
313
314 afscm_usage++;
315 up_write(&afscm_sem);
316
317 return 0;
318
319 kill:
320 kafscmd_die = 1;
321 wake_up(&kafscmd_sleepq);
322 wait_for_completion(&kafscmd_dead);
323
324 out:
325 up_write(&afscm_sem);
326 return ret;
327 118
328} /* end afscm_start() */ 119 afs_put_server(call->server);
120 call->server = NULL;
121 kfree(call->buffer);
122 call->buffer = NULL;
123}
329 124
330/*****************************************************************************/
331/* 125/*
332 * stop the cache manager service 126 * allow the fileserver to see if the cache manager is still alive
333 */ 127 */
334void afscm_stop(void) 128static void SRXAFSCB_CallBack(struct work_struct *work)
335{ 129{
336 struct rxrpc_call *call; 130 struct afs_call *call = container_of(work, struct afs_call, work);
337 131
338 down_write(&afscm_sem); 132 _enter("");
339 133
340 BUG_ON(afscm_usage == 0); 134 /* be sure to send the reply *before* attempting to spam the AFS server
341 afscm_usage--; 135 * with FSFetchStatus requests on the vnodes with broken callbacks lest
136 * the AFS server get into a vicious cycle of trying to break further
137 * callbacks because it hadn't received completion of the CBCallBack op
138 * yet */
139 afs_send_empty_reply(call);
342 140
343 if (afscm_usage == 0) { 141 afs_break_callbacks(call->server, call->count, call->request);
344 /* don't want more incoming calls */ 142 _leave("");
345 rxrpc_del_service(afs_transport, &AFSCM_service); 143}
346
347 /* abort any calls I've still got open (the afscm_error() will
348 * dequeue them) */
349 spin_lock(&afscm_calls_lock);
350 while (!list_empty(&afscm_calls)) {
351 call = list_entry(afscm_calls.next,
352 struct rxrpc_call,
353 app_link);
354 144
355 list_del_init(&call->app_link); 145/*
356 rxrpc_get_call(call); 146 * deliver request data to a CB.CallBack call
357 spin_unlock(&afscm_calls_lock); 147 */
148static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
149 bool last)
150{
151 struct afs_callback *cb;
152 struct afs_server *server;
153 struct in_addr addr;
154 __be32 *bp;
155 u32 tmp;
156 int ret, loop;
157
158 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
159
160 switch (call->unmarshall) {
161 case 0:
162 call->offset = 0;
163 call->unmarshall++;
164
165 /* extract the FID array and its count in two steps */
166 case 1:
167 _debug("extract FID count");
168 ret = afs_extract_data(call, skb, last, &call->tmp, 4);
169 switch (ret) {
170 case 0: break;
171 case -EAGAIN: return 0;
172 default: return ret;
173 }
358 174
359 rxrpc_call_abort(call, -ESRCH); /* abort, dequeue and 175 call->count = ntohl(call->tmp);
360 * put */ 176 _debug("FID count: %u", call->count);
177 if (call->count > AFSCBMAX)
178 return -EBADMSG;
179
180 call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL);
181 if (!call->buffer)
182 return -ENOMEM;
183 call->offset = 0;
184 call->unmarshall++;
185
186 case 2:
187 _debug("extract FID array");
188 ret = afs_extract_data(call, skb, last, call->buffer,
189 call->count * 3 * 4);
190 switch (ret) {
191 case 0: break;
192 case -EAGAIN: return 0;
193 default: return ret;
194 }
361 195
362 _debug("nuking active call %08x.%d", 196 _debug("unmarshall FID array");
363 ntohl(call->conn->conn_id), 197 call->request = kcalloc(call->count,
364 ntohl(call->call_id)); 198 sizeof(struct afs_callback),
365 rxrpc_put_call(call); 199 GFP_KERNEL);
366 rxrpc_put_call(call); 200 if (!call->request)
201 return -ENOMEM;
202
203 cb = call->request;
204 bp = call->buffer;
205 for (loop = call->count; loop > 0; loop--, cb++) {
206 cb->fid.vid = ntohl(*bp++);
207 cb->fid.vnode = ntohl(*bp++);
208 cb->fid.unique = ntohl(*bp++);
209 cb->type = AFSCM_CB_UNTYPED;
210 }
367 211
368 spin_lock(&afscm_calls_lock); 212 call->offset = 0;
213 call->unmarshall++;
214
215 /* extract the callback array and its count in two steps */
216 case 3:
217 _debug("extract CB count");
218 ret = afs_extract_data(call, skb, last, &call->tmp, 4);
219 switch (ret) {
220 case 0: break;
221 case -EAGAIN: return 0;
222 default: return ret;
369 } 223 }
370 spin_unlock(&afscm_calls_lock);
371 224
372 /* get rid of my daemon */ 225 tmp = ntohl(call->tmp);
373 kafscmd_die = 1; 226 _debug("CB count: %u", tmp);
374 wake_up(&kafscmd_sleepq); 227 if (tmp != call->count && tmp != 0)
375 wait_for_completion(&kafscmd_dead); 228 return -EBADMSG;
229 call->offset = 0;
230 call->unmarshall++;
231 if (tmp == 0)
232 goto empty_cb_array;
233
234 case 4:
235 _debug("extract CB array");
236 ret = afs_extract_data(call, skb, last, call->request,
237 call->count * 3 * 4);
238 switch (ret) {
239 case 0: break;
240 case -EAGAIN: return 0;
241 default: return ret;
242 }
376 243
377 /* dispose of any calls waiting for attention */ 244 _debug("unmarshall CB array");
378 spin_lock(&kafscmd_attention_lock); 245 cb = call->request;
379 while (!list_empty(&kafscmd_attention_list)) { 246 bp = call->buffer;
380 call = list_entry(kafscmd_attention_list.next, 247 for (loop = call->count; loop > 0; loop--, cb++) {
381 struct rxrpc_call, 248 cb->version = ntohl(*bp++);
382 app_attn_link); 249 cb->expiry = ntohl(*bp++);
250 cb->type = ntohl(*bp++);
251 }
383 252
384 list_del_init(&call->app_attn_link); 253 empty_cb_array:
385 spin_unlock(&kafscmd_attention_lock); 254 call->offset = 0;
255 call->unmarshall++;
386 256
387 rxrpc_put_call(call); 257 case 5:
258 _debug("trailer");
259 if (skb->len != 0)
260 return -EBADMSG;
261 break;
262 }
388 263
389 spin_lock(&kafscmd_attention_lock); 264 if (!last)
390 } 265 return 0;
391 spin_unlock(&kafscmd_attention_lock);
392 266
393 afs_kafstimod_del_timer(&afs_mntpt_expiry_timer); 267 call->state = AFS_CALL_REPLYING;
394 }
395 268
396 up_write(&afscm_sem); 269 /* we'll need the file server record as that tells us which set of
270 * vnodes to operate upon */
271 memcpy(&addr, &ip_hdr(skb)->saddr, 4);
272 server = afs_find_server(&addr);
273 if (!server)
274 return -ENOTCONN;
275 call->server = server;
397 276
398} /* end afscm_stop() */ 277 INIT_WORK(&call->work, SRXAFSCB_CallBack);
278 schedule_work(&call->work);
279 return 0;
280}
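
[Editor's sketch] afs_deliver_cb_callback() is re-entered each time another skb arrives, so call->unmarshall records which field is being extracted and call->offset how much of it has been copied so far. A stripped-down userspace sketch of the same resumable extraction, assuming a 4-byte count followed by count 12-byte records (byte-order conversion omitted):

	#include <stddef.h>
	#include <string.h>

	enum { GET_COUNT, GET_ARRAY, DONE };

	struct ctx {
		int phase;		/* resumption point (call->unmarshall) */
		size_t offset;		/* bytes of the current field received */
		unsigned count;		/* record count */
		unsigned char buf[50 * 12];
	};

	/* copy more of a field; returns 1 when complete, 0 if short */
	static int extract(struct ctx *c, void *field, size_t want,
			   const unsigned char **data, size_t *len)
	{
		size_t chunk = want - c->offset;

		if (chunk > *len)
			chunk = *len;
		memcpy((unsigned char *)field + c->offset, *data, chunk);
		c->offset += chunk;
		*data += chunk;
		*len -= chunk;
		return c->offset == want;
	}

	/* returns 1 when fully unmarshalled, 0 to wait, -1 on bad message */
	static int deliver(struct ctx *c, const unsigned char *data, size_t len)
	{
		switch (c->phase) {
		case GET_COUNT:
			if (!extract(c, &c->count, 4, &data, &len))
				return 0;	/* wait for the next packet */
			if (c->count > 50)	/* cf. the AFSCBMAX check */
				return -1;
			c->offset = 0;
			c->phase = GET_ARRAY;
			/* fall through */
		case GET_ARRAY:
			if (!extract(c, c->buf, c->count * 12, &data, &len))
				return 0;
			c->phase = DONE;
		}
		return 1;
	}
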
399 281
400/*****************************************************************************/
401/* 282/*
402 * handle the fileserver breaking a set of callbacks 283 * allow the fileserver to request callback state (re-)initialisation
403 */ 284 */
404static void _SRXAFSCM_CallBack(struct rxrpc_call *call) 285static void SRXAFSCB_InitCallBackState(struct work_struct *work)
405{ 286{
406 struct afs_server *server; 287 struct afs_call *call = container_of(work, struct afs_call, work);
407 size_t count, qty, tmp;
408 int ret = 0, removed;
409
410 _enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]);
411
412 server = afs_server_get_from_peer(call->conn->peer);
413
414 switch (call->app_call_state) {
415 /* we've received the last packet
416 * - drain all the data from the call and send the reply
417 */
418 case RXRPC_CSTATE_SRVR_GOT_ARGS:
419 ret = -EBADMSG;
420 qty = call->app_ready_qty;
421 if (qty < 8 || qty > 50 * (6 * 4) + 8)
422 break;
423
424 {
425 struct afs_callback *cb, *pcb;
426 int loop;
427 __be32 *fp, *bp;
428
429 fp = rxrpc_call_alloc_scratch(call, qty);
430
431 /* drag the entire argument block out to the scratch
432 * space */
433 ret = rxrpc_call_read_data(call, fp, qty, 0);
434 if (ret < 0)
435 break;
436
437 /* and unmarshall the parameter block */
438 ret = -EBADMSG;
439 count = ntohl(*fp++);
440 if (count>AFSCBMAX ||
441 (count * (3 * 4) + 8 != qty &&
442 count * (6 * 4) + 8 != qty))
443 break;
444
445 bp = fp + count*3;
446 tmp = ntohl(*bp++);
447 if (tmp > 0 && tmp != count)
448 break;
449 if (tmp == 0)
450 bp = NULL;
451
452 pcb = cb = rxrpc_call_alloc_scratch_s(
453 call, struct afs_callback);
454
455 for (loop = count - 1; loop >= 0; loop--) {
456 pcb->fid.vid = ntohl(*fp++);
457 pcb->fid.vnode = ntohl(*fp++);
458 pcb->fid.unique = ntohl(*fp++);
459 if (bp) {
460 pcb->version = ntohl(*bp++);
461 pcb->expiry = ntohl(*bp++);
462 pcb->type = ntohl(*bp++);
463 }
464 else {
465 pcb->version = 0;
466 pcb->expiry = 0;
467 pcb->type = AFSCM_CB_UNTYPED;
468 }
469 pcb++;
470 }
471
472 /* invoke the actual service routine */
473 ret = SRXAFSCM_CallBack(server, count, cb);
474 if (ret < 0)
475 break;
476 }
477 288
478 /* send the reply */ 289 _enter("{%p}", call->server);
479 ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
480 GFP_KERNEL, 0, &count);
481 if (ret < 0)
482 break;
483 break;
484
485 /* operation complete */
486 case RXRPC_CSTATE_COMPLETE:
487 call->app_user = NULL;
488 removed = 0;
489 spin_lock(&afscm_calls_lock);
490 if (!list_empty(&call->app_link)) {
491 list_del_init(&call->app_link);
492 removed = 1;
493 }
494 spin_unlock(&afscm_calls_lock);
495 290
496 if (removed) 291 afs_init_callback_state(call->server);
497 rxrpc_put_call(call); 292 afs_send_empty_reply(call);
498 break; 293 _leave("");
294}
499 295
500 /* operation terminated on error */ 296/*
501 case RXRPC_CSTATE_ERROR: 297 * deliver request data to a CB.InitCallBackState call
502 call->app_user = NULL; 298 */
503 break; 299static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
300 struct sk_buff *skb,
301 bool last)
302{
303 struct afs_server *server;
304 struct in_addr addr;
504 305
505 default: 306 _enter(",{%u},%d", skb->len, last);
506 break;
507 }
508 307
509 if (ret < 0) 308 if (skb->len > 0)
510 rxrpc_call_abort(call, ret); 309 return -EBADMSG;
310 if (!last)
311 return 0;
511 312
512 afs_put_server(server); 313 /* no unmarshalling required */
314 call->state = AFS_CALL_REPLYING;
513 315
514 _leave(" = %d", ret); 316 /* we'll need the file server record as that tells us which set of
317 * vnodes to operate upon */
318 memcpy(&addr, &ip_hdr(skb)->saddr, 4);
319 server = afs_find_server(&addr);
320 if (!server)
321 return -ENOTCONN;
322 call->server = server;
515 323
516} /* end _SRXAFSCM_CallBack() */ 324 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
325 schedule_work(&call->work);
326 return 0;
327}
517 328
518/*****************************************************************************/
519/* 329/*
520 * handle the fileserver asking us to initialise our callback state 330 * deliver request data to a CB.InitCallBackState3 call
521 */ 331 */
522static void _SRXAFSCM_InitCallBackState(struct rxrpc_call *call) 332static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
333 struct sk_buff *skb,
334 bool last)
523{ 335{
524 struct afs_server *server; 336 struct afs_server *server;
525 size_t count; 337 struct in_addr addr;
526 int ret = 0, removed;
527 338
528 _enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]); 339 _enter(",{%u},%d", skb->len, last);
529 340
530 server = afs_server_get_from_peer(call->conn->peer); 341 if (!last)
342 return 0;
531 343
532 switch (call->app_call_state) { 344 /* no unmarshalling required */
533 /* we've received the last packet - drain all the data from the 345 call->state = AFS_CALL_REPLYING;
534 * call */
535 case RXRPC_CSTATE_SRVR_GOT_ARGS:
536 /* shouldn't be any args */
537 ret = -EBADMSG;
538 break;
539
540 /* send the reply when asked for it */
541 case RXRPC_CSTATE_SRVR_SND_REPLY:
542 /* invoke the actual service routine */
543 ret = SRXAFSCM_InitCallBackState(server);
544 if (ret < 0)
545 break;
546
547 ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
548 GFP_KERNEL, 0, &count);
549 if (ret < 0)
550 break;
551 break;
552 346
553 /* operation complete */ 347 /* we'll need the file server record as that tells us which set of
554 case RXRPC_CSTATE_COMPLETE: 348 * vnodes to operate upon */
555 call->app_user = NULL; 349 memcpy(&addr, &ip_hdr(skb)->saddr, 4);
556 removed = 0; 350 server = afs_find_server(&addr);
557 spin_lock(&afscm_calls_lock); 351 if (!server)
558 if (!list_empty(&call->app_link)) { 352 return -ENOTCONN;
559 list_del_init(&call->app_link); 353 call->server = server;
560 removed = 1;
561 }
562 spin_unlock(&afscm_calls_lock);
563 354
564 if (removed) 355 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
565 rxrpc_put_call(call); 356 schedule_work(&call->work);
566 break; 357 return 0;
567 358}
568 /* operation terminated on error */
569 case RXRPC_CSTATE_ERROR:
570 call->app_user = NULL;
571 break;
572
573 default:
574 break;
575 }
576
577 if (ret < 0)
578 rxrpc_call_abort(call, ret);
579
580 afs_put_server(server);
581 359
582 _leave(" = %d", ret); 360/*
361 * allow the fileserver to see if the cache manager is still alive
362 */
363static void SRXAFSCB_Probe(struct work_struct *work)
364{
365 struct afs_call *call = container_of(work, struct afs_call, work);
583 366
584} /* end _SRXAFSCM_InitCallBackState() */ 367 _enter("");
368 afs_send_empty_reply(call);
369 _leave("");
370}
585 371
586/*****************************************************************************/
587/* 372/*
588 * handle a probe from a fileserver 373 * deliver request data to a CB.Probe call
589 */ 374 */
590static void _SRXAFSCM_Probe(struct rxrpc_call *call) 375static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
376 bool last)
591{ 377{
592 struct afs_server *server; 378 _enter(",{%u},%d", skb->len, last);
593 size_t count;
594 int ret = 0, removed;
595
596 _enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]);
597 379
598 server = afs_server_get_from_peer(call->conn->peer); 380 if (skb->len > 0)
381 return -EBADMSG;
382 if (!last)
383 return 0;
599 384
600 switch (call->app_call_state) { 385 /* no unmarshalling required */
601 /* we've received the last packet - drain all the data from the 386 call->state = AFS_CALL_REPLYING;
602 * call */
603 case RXRPC_CSTATE_SRVR_GOT_ARGS:
604 /* shouldn't be any args */
605 ret = -EBADMSG;
606 break;
607 387
608 /* send the reply when asked for it */ 388 INIT_WORK(&call->work, SRXAFSCB_Probe);
609 case RXRPC_CSTATE_SRVR_SND_REPLY: 389 schedule_work(&call->work);
610 /* invoke the actual service routine */ 390 return 0;
611 ret = SRXAFSCM_Probe(server); 391}
612 if (ret < 0)
613 break;
614
615 ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
616 GFP_KERNEL, 0, &count);
617 if (ret < 0)
618 break;
619 break;
620 392
621 /* operation complete */ 393/*
622 case RXRPC_CSTATE_COMPLETE: 394 * allow the fileserver to ask about the cache manager's capabilities
623 call->app_user = NULL; 395 */
624 removed = 0; 396static void SRXAFSCB_GetCapabilities(struct work_struct *work)
625 spin_lock(&afscm_calls_lock); 397{
626 if (!list_empty(&call->app_link)) { 398 struct afs_interface *ifs;
627 list_del_init(&call->app_link); 399 struct afs_call *call = container_of(work, struct afs_call, work);
628 removed = 1; 400 int loop, nifs;
401
402 struct {
403 struct /* InterfaceAddr */ {
404 __be32 nifs;
405 __be32 uuid[11];
406 __be32 ifaddr[32];
407 __be32 netmask[32];
408 __be32 mtu[32];
409 } ia;
410 struct /* Capabilities */ {
411 __be32 capcount;
412 __be32 caps[1];
413 } cap;
414 } reply;
415
416 _enter("");
417
418 nifs = 0;
419 ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL);
420 if (ifs) {
421 nifs = afs_get_ipv4_interfaces(ifs, 32, false);
422 if (nifs < 0) {
423 kfree(ifs);
424 ifs = NULL;
425 nifs = 0;
629 } 426 }
630 spin_unlock(&afscm_calls_lock); 427 }
631 428
632 if (removed) 429 memset(&reply, 0, sizeof(reply));
633 rxrpc_put_call(call); 430 reply.ia.nifs = htonl(nifs);
634 break; 431
432 reply.ia.uuid[0] = htonl(afs_uuid.time_low);
433 reply.ia.uuid[1] = htonl(afs_uuid.time_mid);
434 reply.ia.uuid[2] = htonl(afs_uuid.time_hi_and_version);
435 reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved);
436 reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low);
437 for (loop = 0; loop < 6; loop++)
438 reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]);
439
440 if (ifs) {
441 for (loop = 0; loop < nifs; loop++) {
442 reply.ia.ifaddr[loop] = ifs[loop].address.s_addr;
443 reply.ia.netmask[loop] = ifs[loop].netmask.s_addr;
444 reply.ia.mtu[loop] = htonl(ifs[loop].mtu);
445 }
446 kfree(ifs);
447 }
635 448
636 /* operation terminated on error */ 449 reply.cap.capcount = htonl(1);
637 case RXRPC_CSTATE_ERROR: 450 reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION);
638 call->app_user = NULL; 451 afs_send_simple_reply(call, &reply, sizeof(reply));
639 break;
640 452
641 default: 453 _leave("");
642 break; 454}
643 }
644 455
645 if (ret < 0) 456/*
646 rxrpc_call_abort(call, ret); 457 * deliver request data to a CB.GetCapabilities call
458 */
459static int afs_deliver_cb_get_capabilities(struct afs_call *call,
460 struct sk_buff *skb, bool last)
461{
462 _enter(",{%u},%d", skb->len, last);
647 463
648 afs_put_server(server); 464 if (skb->len > 0)
465 return -EBADMSG;
466 if (!last)
467 return 0;
649 468
650 _leave(" = %d", ret); 469 /* no unmarshalling required */
470 call->state = AFS_CALL_REPLYING;
651 471
652} /* end _SRXAFSCM_Probe() */ 472 INIT_WORK(&call->work, SRXAFSCB_GetCapabilities);
473 schedule_work(&call->work);
474 return 0;
475}
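
[Editor's sketch] SRXAFSCB_GetCapabilities() flattens the UUID into eleven 32-bit big-endian XDR elements, sign-extending the single-byte fields just as the (s8) casts above do. A standalone sketch of that marshalling, assuming a plain POSIX environment:

	#include <arpa/inet.h>
	#include <stdint.h>

	struct uuid {
		uint32_t time_low;
		uint16_t time_mid;
		uint16_t time_hi_and_version;
		int8_t   clock_seq_hi_and_reserved;
		int8_t   clock_seq_low;
		int8_t   node[6];
	};

	/* wire image: one big-endian 32-bit XDR element per field */
	static void marshal_uuid(const struct uuid *u, uint32_t wire[11])
	{
		int i;

		wire[0] = htonl(u->time_low);
		wire[1] = htonl(u->time_mid);
		wire[2] = htonl(u->time_hi_and_version);
		wire[3] = htonl((int8_t)u->clock_seq_hi_and_reserved);
		wire[4] = htonl((int8_t)u->clock_seq_low);
		for (i = 0; i < 6; i++)	/* sign-extended, as the (s8) casts do */
			wire[i + 5] = htonl((int8_t)u->node[i]);
	}
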
diff --git a/fs/afs/cmservice.h b/fs/afs/cmservice.h
deleted file mode 100644
index af8d4d689cb2..000000000000
--- a/fs/afs/cmservice.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/* cmservice.h: AFS Cache Manager Service declarations
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_CMSERVICE_H
13#define _LINUX_AFS_CMSERVICE_H
14
15#include <rxrpc/transport.h>
16#include "types.h"
17
18/* cache manager start/stop */
19extern int afscm_start(void);
20extern void afscm_stop(void);
21
22/* cache manager server functions */
23extern int SRXAFSCM_InitCallBackState(struct afs_server *server);
24extern int SRXAFSCM_CallBack(struct afs_server *server,
25 size_t count,
26 struct afs_callback callbacks[]);
27extern int SRXAFSCM_Probe(struct afs_server *server);
28
29#endif /* _LINUX_AFS_CMSERVICE_H */
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index b6dc2ebe47a8..0c1e902f17a3 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -15,45 +15,53 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include <linux/smp_lock.h> 18#include <linux/ctype.h>
19#include "vnode.h"
20#include "volume.h"
21#include <rxrpc/call.h>
22#include "super.h"
23#include "internal.h" 19#include "internal.h"
24 20
25static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry, 21static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
26 struct nameidata *nd); 22 struct nameidata *nd);
27static int afs_dir_open(struct inode *inode, struct file *file); 23static int afs_dir_open(struct inode *inode, struct file *file);
28static int afs_dir_readdir(struct file *file, void *dirent, filldir_t filldir); 24static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
29static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd); 25static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
30static int afs_d_delete(struct dentry *dentry); 26static int afs_d_delete(struct dentry *dentry);
31static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen, 27static void afs_d_release(struct dentry *dentry);
28static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
32 loff_t fpos, u64 ino, unsigned dtype); 29 loff_t fpos, u64 ino, unsigned dtype);
30static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
31 struct nameidata *nd);
32static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode);
33static int afs_rmdir(struct inode *dir, struct dentry *dentry);
34static int afs_unlink(struct inode *dir, struct dentry *dentry);
35static int afs_link(struct dentry *from, struct inode *dir,
36 struct dentry *dentry);
37static int afs_symlink(struct inode *dir, struct dentry *dentry,
38 const char *content);
39static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
40 struct inode *new_dir, struct dentry *new_dentry);
33 41
34const struct file_operations afs_dir_file_operations = { 42const struct file_operations afs_dir_file_operations = {
35 .open = afs_dir_open, 43 .open = afs_dir_open,
36 .readdir = afs_dir_readdir, 44 .release = afs_release,
45 .readdir = afs_readdir,
37}; 46};
38 47
39const struct inode_operations afs_dir_inode_operations = { 48const struct inode_operations afs_dir_inode_operations = {
40 .lookup = afs_dir_lookup, 49 .create = afs_create,
50 .lookup = afs_lookup,
51 .link = afs_link,
52 .unlink = afs_unlink,
53 .symlink = afs_symlink,
54 .mkdir = afs_mkdir,
55 .rmdir = afs_rmdir,
56 .rename = afs_rename,
57 .permission = afs_permission,
41 .getattr = afs_inode_getattr, 58 .getattr = afs_inode_getattr,
42#if 0 /* TODO */
43 .create = afs_dir_create,
44 .link = afs_dir_link,
45 .unlink = afs_dir_unlink,
46 .symlink = afs_dir_symlink,
47 .mkdir = afs_dir_mkdir,
48 .rmdir = afs_dir_rmdir,
49 .mknod = afs_dir_mknod,
50 .rename = afs_dir_rename,
51#endif
52}; 59};
53 60
54static struct dentry_operations afs_fs_dentry_operations = { 61static struct dentry_operations afs_fs_dentry_operations = {
55 .d_revalidate = afs_d_revalidate, 62 .d_revalidate = afs_d_revalidate,
56 .d_delete = afs_d_delete, 63 .d_delete = afs_d_delete,
64 .d_release = afs_d_release,
57}; 65};
58 66
59#define AFS_DIR_HASHTBL_SIZE 128 67#define AFS_DIR_HASHTBL_SIZE 128
@@ -105,14 +113,13 @@ struct afs_dir_page {
105 union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)]; 113 union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)];
106}; 114};
107 115
108struct afs_dir_lookup_cookie { 116struct afs_lookup_cookie {
109 struct afs_fid fid; 117 struct afs_fid fid;
110 const char *name; 118 const char *name;
111 size_t nlen; 119 size_t nlen;
112 int found; 120 int found;
113}; 121};
114 122
115/*****************************************************************************/
116/* 123/*
117 * check that a directory page is valid 124 * check that a directory page is valid
118 */ 125 */
@@ -128,9 +135,10 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page)
128 if (qty == 0) 135 if (qty == 0)
129 goto error; 136 goto error;
130 137
131 if (page->index==0 && qty!=ntohs(dbuf->blocks[0].pagehdr.npages)) { 138 if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) {
132 printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n", 139 printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n",
133 __FUNCTION__,dir->i_ino,qty,ntohs(dbuf->blocks[0].pagehdr.npages)); 140 __FUNCTION__, dir->i_ino, qty,
141 ntohs(dbuf->blocks[0].pagehdr.npages));
134 goto error; 142 goto error;
135 } 143 }
136#endif 144#endif
@@ -157,13 +165,11 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page)
157 SetPageChecked(page); 165 SetPageChecked(page);
158 return; 166 return;
159 167
160 error: 168error:
161 SetPageChecked(page); 169 SetPageChecked(page);
162 SetPageError(page); 170 SetPageError(page);
171}
163 172
164} /* end afs_dir_check_page() */
165
166/*****************************************************************************/
167/* 173/*
168 * discard a page cached in the pagecache 174 * discard a page cached in the pagecache
169 */ 175 */
@@ -171,25 +177,24 @@ static inline void afs_dir_put_page(struct page *page)
171{ 177{
172 kunmap(page); 178 kunmap(page);
173 page_cache_release(page); 179 page_cache_release(page);
180}
174 181
175} /* end afs_dir_put_page() */
176
177/*****************************************************************************/
178/* 182/*
179 * get a page into the pagecache 183 * get a page into the pagecache
180 */ 184 */
181static struct page *afs_dir_get_page(struct inode *dir, unsigned long index) 185static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
186 struct key *key)
182{ 187{
183 struct page *page; 188 struct page *page;
189 struct file file = {
190 .private_data = key,
191 };
184 192
185 _enter("{%lu},%lu", dir->i_ino, index); 193 _enter("{%lu},%lu", dir->i_ino, index);
186 194
187 page = read_mapping_page(dir->i_mapping, index, NULL); 195 page = read_mapping_page(dir->i_mapping, index, &file);
188 if (!IS_ERR(page)) { 196 if (!IS_ERR(page)) {
189 wait_on_page_locked(page);
190 kmap(page); 197 kmap(page);
191 if (!PageUptodate(page))
192 goto fail;
193 if (!PageChecked(page)) 198 if (!PageChecked(page))
194 afs_dir_check_page(dir, page); 199 afs_dir_check_page(dir, page);
195 if (PageError(page)) 200 if (PageError(page))
@@ -197,12 +202,12 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index)
197 } 202 }
198 return page; 203 return page;
199 204
200 fail: 205fail:
201 afs_dir_put_page(page); 206 afs_dir_put_page(page);
207 _leave(" = -EIO");
202 return ERR_PTR(-EIO); 208 return ERR_PTR(-EIO);
203} /* end afs_dir_get_page() */ 209}
204 210
205/*****************************************************************************/
206/* 211/*
207 * open an AFS directory file 212 * open an AFS directory file
208 */ 213 */
@@ -213,15 +218,12 @@ static int afs_dir_open(struct inode *inode, struct file *file)
213 BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); 218 BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
214 BUILD_BUG_ON(sizeof(union afs_dirent) != 32); 219 BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
215 220
216 if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED) 221 if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags))
217 return -ENOENT; 222 return -ENOENT;
218 223
219 _leave(" = 0"); 224 return afs_open(inode, file);
220 return 0; 225}
221 226
222} /* end afs_dir_open() */
223
224/*****************************************************************************/
225/* 227/*
226 * deal with one block in an AFS directory 228 * deal with one block in an AFS directory
227 */ 229 */
@@ -250,7 +252,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
250 /* skip entries marked unused in the bitmap */ 252 /* skip entries marked unused in the bitmap */
251 if (!(block->pagehdr.bitmap[offset / 8] & 253 if (!(block->pagehdr.bitmap[offset / 8] &
252 (1 << (offset % 8)))) { 254 (1 << (offset % 8)))) {
253 _debug("ENT[%Zu.%u]: unused\n", 255 _debug("ENT[%Zu.%u]: unused",
254 blkoff / sizeof(union afs_dir_block), offset); 256 blkoff / sizeof(union afs_dir_block), offset);
255 if (offset >= curr) 257 if (offset >= curr)
256 *fpos = blkoff + 258 *fpos = blkoff +
@@ -264,7 +266,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
264 sizeof(*block) - 266 sizeof(*block) -
265 offset * sizeof(union afs_dirent)); 267 offset * sizeof(union afs_dirent));
266 268
267 _debug("ENT[%Zu.%u]: %s %Zu \"%s\"\n", 269 _debug("ENT[%Zu.%u]: %s %Zu \"%s\"",
268 blkoff / sizeof(union afs_dir_block), offset, 270 blkoff / sizeof(union afs_dir_block), offset,
269 (offset < curr ? "skip" : "fill"), 271 (offset < curr ? "skip" : "fill"),
270 nlen, dire->u.name); 272 nlen, dire->u.name);
@@ -274,7 +276,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
274 if (next >= AFS_DIRENT_PER_BLOCK) { 276 if (next >= AFS_DIRENT_PER_BLOCK) {
275 _debug("ENT[%Zu.%u]:" 277 _debug("ENT[%Zu.%u]:"
276 " %u travelled beyond end dir block" 278 " %u travelled beyond end dir block"
277 " (len %u/%Zu)\n", 279 " (len %u/%Zu)",
278 blkoff / sizeof(union afs_dir_block), 280 blkoff / sizeof(union afs_dir_block),
279 offset, next, tmp, nlen); 281 offset, next, tmp, nlen);
280 return -EIO; 282 return -EIO;
@@ -282,13 +284,13 @@ static int afs_dir_iterate_block(unsigned *fpos,
282 if (!(block->pagehdr.bitmap[next / 8] & 284 if (!(block->pagehdr.bitmap[next / 8] &
283 (1 << (next % 8)))) { 285 (1 << (next % 8)))) {
284 _debug("ENT[%Zu.%u]:" 286 _debug("ENT[%Zu.%u]:"
285 " %u unmarked extension (len %u/%Zu)\n", 287 " %u unmarked extension (len %u/%Zu)",
286 blkoff / sizeof(union afs_dir_block), 288 blkoff / sizeof(union afs_dir_block),
287 offset, next, tmp, nlen); 289 offset, next, tmp, nlen);
288 return -EIO; 290 return -EIO;
289 } 291 }
290 292
291 _debug("ENT[%Zu.%u]: ext %u/%Zu\n", 293 _debug("ENT[%Zu.%u]: ext %u/%Zu",
292 blkoff / sizeof(union afs_dir_block), 294 blkoff / sizeof(union afs_dir_block),
293 next, tmp, nlen); 295 next, tmp, nlen);
294 next++; 296 next++;
@@ -304,7 +306,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
304 nlen, 306 nlen,
305 blkoff + offset * sizeof(union afs_dirent), 307 blkoff + offset * sizeof(union afs_dirent),
306 ntohl(dire->u.vnode), 308 ntohl(dire->u.vnode),
307 filldir == afs_dir_lookup_filldir ? 309 filldir == afs_lookup_filldir ?
308 ntohl(dire->u.unique) : DT_UNKNOWN); 310 ntohl(dire->u.unique) : DT_UNKNOWN);
309 if (ret < 0) { 311 if (ret < 0) {
310 _leave(" = 0 [full]"); 312 _leave(" = 0 [full]");
@@ -316,16 +318,15 @@ static int afs_dir_iterate_block(unsigned *fpos,
316 318
317 _leave(" = 1 [more]"); 319 _leave(" = 1 [more]");
318 return 1; 320 return 1;
319} /* end afs_dir_iterate_block() */ 321}
320 322
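Both branches above consult the block header's allocation bitmap, which carries one bit per 32-byte dirent slot. The test in isolation (kernel types assumed):

	static bool afs_dirent_slot_used(const u8 *bitmap, unsigned int slot)
	{
		/* bit N of the bitmap marks dirent slot N as in use */
		return bitmap[slot / 8] & (1 << (slot % 8));
	}
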
321/*****************************************************************************/
322/* 323/*
323 * read an AFS directory 324 * iterate through the data blob that lists the contents of an AFS directory
324 */ 325 */
325static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, 326static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
326 filldir_t filldir) 327 filldir_t filldir, struct key *key)
327{ 328{
328 union afs_dir_block *dblock; 329 union afs_dir_block *dblock;
329 struct afs_dir_page *dbuf; 330 struct afs_dir_page *dbuf;
330 struct page *page; 331 struct page *page;
331 unsigned blkoff, limit; 332 unsigned blkoff, limit;
@@ -333,7 +334,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
333 334
334 _enter("{%lu},%u,,", dir->i_ino, *fpos); 335 _enter("{%lu},%u,,", dir->i_ino, *fpos);
335 336
336 if (AFS_FS_I(dir)->flags & AFS_VNODE_DELETED) { 337 if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
337 _leave(" = -ESTALE"); 338 _leave(" = -ESTALE");
338 return -ESTALE; 339 return -ESTALE;
339 } 340 }
@@ -348,7 +349,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
348 blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1); 349 blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1);
349 350
350 /* fetch the appropriate page from the directory */ 351 /* fetch the appropriate page from the directory */
351 page = afs_dir_get_page(dir, blkoff / PAGE_SIZE); 352 page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
352 if (IS_ERR(page)) { 353 if (IS_ERR(page)) {
353 ret = PTR_ERR(page); 354 ret = PTR_ERR(page);
354 break; 355 break;
@@ -377,43 +378,50 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
377 ret = 0; 378 ret = 0;
378 } 379 }
379 380
380 out: 381out:
381 _leave(" = %d", ret); 382 _leave(" = %d", ret);
382 return ret; 383 return ret;
383} /* end afs_dir_iterate() */ 384}
384 385
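Because union afs_dir_block is 2048 bytes and union afs_dirent 32 bytes (the BUILD_BUG_ONs in afs_dir_open() pin both), the file position driven through this loop encodes a (block, slot) pair. A sketch of the decomposition matching the masking above:

	static void afs_fpos_decompose(unsigned int fpos,
				       unsigned int *blkoff, unsigned int *slot)
	{
		/* block start: fpos with the low 11 bits cleared (2048 = 2^11) */
		*blkoff = fpos & ~2047u;
		/* the remainder indexes the 32-byte dirent slots in that block */
		*slot = (fpos & 2047u) / 32;
	}
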
385/*****************************************************************************/
386/* 386/*
387 * read an AFS directory 387 * read an AFS directory
388 */ 388 */
389static int afs_dir_readdir(struct file *file, void *cookie, filldir_t filldir) 389static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
390{ 390{
391 unsigned fpos; 391 unsigned fpos;
392 int ret; 392 int ret;
393 393
394 _enter("{%Ld,{%lu}}", file->f_pos, file->f_path.dentry->d_inode->i_ino); 394 _enter("{%Ld,{%lu}}",
395 file->f_pos, file->f_path.dentry->d_inode->i_ino);
396
397 ASSERT(file->private_data != NULL);
395 398
396 fpos = file->f_pos; 399 fpos = file->f_pos;
397 ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos, cookie, filldir); 400 ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos,
401 cookie, filldir, file->private_data);
398 file->f_pos = fpos; 402 file->f_pos = fpos;
399 403
400 _leave(" = %d", ret); 404 _leave(" = %d", ret);
401 return ret; 405 return ret;
402} /* end afs_dir_readdir() */ 406}
403 407
404/*****************************************************************************/
405/* 408/*
406 * search the directory for a name 409 * search the directory for a name
407 * - if afs_dir_iterate_block() spots this function, it'll pass the FID 410 * - if afs_dir_iterate_block() spots this function, it'll pass the FID
408 * uniquifier through dtype 411 * uniquifier through dtype
409 */ 412 */
410static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen, 413static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
411 loff_t fpos, u64 ino, unsigned dtype) 414 loff_t fpos, u64 ino, unsigned dtype)
412{ 415{
413 struct afs_dir_lookup_cookie *cookie = _cookie; 416 struct afs_lookup_cookie *cookie = _cookie;
417
418 _enter("{%s,%Zu},%s,%u,,%llu,%u",
419 cookie->name, cookie->nlen, name, nlen,
420 (unsigned long long) ino, dtype);
414 421
415 _enter("{%s,%Zu},%s,%u,,%lu,%u", 422 /* insanity checks first */
416 cookie->name, cookie->nlen, name, nlen, ino, dtype); 423 BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
424 BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
417 425
418 if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) { 426 if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) {
419 _leave(" = 0 [no]"); 427 _leave(" = 0 [no]");
@@ -426,216 +434,254 @@ static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
426 434
427 _leave(" = -1 [found]"); 435 _leave(" = -1 [found]");
428 return -1; 436 return -1;
429} /* end afs_dir_lookup_filldir() */ 437}
430 438
431/*****************************************************************************/
432/* 439/*
433 * look up an entry in a directory 440 * do a lookup in a directory
441 * - just returns the FID the dentry name maps to if found
434 */ 442 */
435static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry, 443static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
436 struct nameidata *nd) 444 struct afs_fid *fid, struct key *key)
437{ 445{
438 struct afs_dir_lookup_cookie cookie; 446 struct afs_lookup_cookie cookie;
439 struct afs_super_info *as; 447 struct afs_super_info *as;
448 unsigned fpos;
449 int ret;
450
451 _enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);
452
453 as = dir->i_sb->s_fs_info;
454
455 /* search the directory */
456 cookie.name = dentry->d_name.name;
457 cookie.nlen = dentry->d_name.len;
458 cookie.fid.vid = as->volume->vid;
459 cookie.found = 0;
460
461 fpos = 0;
462 ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir,
463 key);
464 if (ret < 0) {
465 _leave(" = %d [iter]", ret);
466 return ret;
467 }
468
469 ret = -ENOENT;
470 if (!cookie.found) {
471 _leave(" = -ENOENT [not found]");
472 return -ENOENT;
473 }
474
475 *fid = cookie.fid;
476 _leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique);
477 return 0;
478}
479
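afs_do_lookup() drives the directory iterator with a private cookie that afs_lookup_filldir() completes on a name match, returning -1 to stop the walk early. Reconstructed from its uses in this diff (the authoritative definition lives in the AFS internal headers):

	struct afs_lookup_cookie {
		const char	*name;	/* name being sought */
		size_t		nlen;	/* length of that name */
		struct afs_fid	fid;	/* vid preset by the caller; vnode and
					 * unique filled in on a match */
		int		found;	/* non-zero once a match is seen */
	};
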
480/*
481 * look up an entry in a directory
482 */
483static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
484 struct nameidata *nd)
485{
440 struct afs_vnode *vnode; 486 struct afs_vnode *vnode;
487 struct afs_fid fid;
441 struct inode *inode; 488 struct inode *inode;
442 unsigned fpos; 489 struct key *key;
443 int ret; 490 int ret;
444 491
445 _enter("{%lu},%p{%s}", dir->i_ino, dentry, dentry->d_name.name); 492 vnode = AFS_FS_I(dir);
446 493
447 /* insanity checks first */ 494 _enter("{%x:%d},%p{%s},",
448 BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); 495 vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name);
449 BUILD_BUG_ON(sizeof(union afs_dirent) != 32); 496
497 ASSERTCMP(dentry->d_inode, ==, NULL);
450 498
451 if (dentry->d_name.len > 255) { 499 if (dentry->d_name.len > 255) {
452 _leave(" = -ENAMETOOLONG"); 500 _leave(" = -ENAMETOOLONG");
453 return ERR_PTR(-ENAMETOOLONG); 501 return ERR_PTR(-ENAMETOOLONG);
454 } 502 }
455 503
456 vnode = AFS_FS_I(dir); 504 if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
457 if (vnode->flags & AFS_VNODE_DELETED) {
458 _leave(" = -ESTALE"); 505 _leave(" = -ESTALE");
459 return ERR_PTR(-ESTALE); 506 return ERR_PTR(-ESTALE);
460 } 507 }
461 508
462 as = dir->i_sb->s_fs_info; 509 key = afs_request_key(vnode->volume->cell);
463 510 if (IS_ERR(key)) {
464 /* search the directory */ 511 _leave(" = %ld [key]", PTR_ERR(key));
465 cookie.name = dentry->d_name.name; 512 return ERR_PTR(PTR_ERR(key));
466 cookie.nlen = dentry->d_name.len; 513 }
467 cookie.fid.vid = as->volume->vid;
468 cookie.found = 0;
469 514
470 fpos = 0; 515 ret = afs_validate(vnode, key);
471 ret = afs_dir_iterate(dir, &fpos, &cookie, afs_dir_lookup_filldir);
472 if (ret < 0) { 516 if (ret < 0) {
473 _leave(" = %d", ret); 517 key_put(key);
518 _leave(" = %d [val]", ret);
474 return ERR_PTR(ret); 519 return ERR_PTR(ret);
475 } 520 }
476 521
477 ret = -ENOENT; 522 ret = afs_do_lookup(dir, dentry, &fid, key);
478 if (!cookie.found) { 523 if (ret < 0) {
479 _leave(" = %d", ret); 524 key_put(key);
525 if (ret == -ENOENT) {
526 d_add(dentry, NULL);
527 _leave(" = NULL [negative]");
528 return NULL;
529 }
530 _leave(" = %d [do]", ret);
480 return ERR_PTR(ret); 531 return ERR_PTR(ret);
481 } 532 }
533 dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version;
482 534
483 /* instantiate the dentry */ 535 /* instantiate the dentry */
484 ret = afs_iget(dir->i_sb, &cookie.fid, &inode); 536 inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL);
485 if (ret < 0) { 537 key_put(key);
486 _leave(" = %d", ret); 538 if (IS_ERR(inode)) {
487 return ERR_PTR(ret); 539 _leave(" = %ld", PTR_ERR(inode));
540 return ERR_PTR(PTR_ERR(inode));
488 } 541 }
489 542
490 dentry->d_op = &afs_fs_dentry_operations; 543 dentry->d_op = &afs_fs_dentry_operations;
491 dentry->d_fsdata = (void *) (unsigned long) vnode->status.version;
492 544
493 d_add(dentry, inode); 545 d_add(dentry, inode);
494 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%lu }", 546 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%lu }",
495 cookie.fid.vnode, 547 fid.vnode,
496 cookie.fid.unique, 548 fid.unique,
497 dentry->d_inode->i_ino, 549 dentry->d_inode->i_ino,
498 dentry->d_inode->i_version); 550 dentry->d_inode->i_version);
499 551
500 return NULL; 552 return NULL;
501} /* end afs_dir_lookup() */ 553}
502 554
503/*****************************************************************************/
504/* 555/*
505 * check that a dentry lookup hit has found a valid entry 556 * check that a dentry lookup hit has found a valid entry
506 * - NOTE! the hit can be a negative hit too, so we can't assume we have an 557 * - NOTE! the hit can be a negative hit too, so we can't assume we have an
507 * inode 558 * inode
508 * (derived from nfs_lookup_revalidate)
509 */ 559 */
510static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) 560static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
511{ 561{
512 struct afs_dir_lookup_cookie cookie; 562 struct afs_vnode *vnode, *dir;
563 struct afs_fid fid;
513 struct dentry *parent; 564 struct dentry *parent;
514 struct inode *inode, *dir; 565 struct key *key;
515 unsigned fpos; 566 void *dir_version;
516 int ret; 567 int ret;
517 568
518 _enter("{sb=%p n=%s},", dentry->d_sb, dentry->d_name.name); 569 vnode = AFS_FS_I(dentry->d_inode);
519 570
520 /* lock down the parent dentry so we can peer at it */ 571 if (dentry->d_inode)
521 parent = dget_parent(dentry->d_parent); 572 _enter("{v={%x:%u} n=%s fl=%lx},",
573 vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
574 vnode->flags);
575 else
576 _enter("{neg n=%s}", dentry->d_name.name);
522 577
523 dir = parent->d_inode; 578 key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell);
524 inode = dentry->d_inode; 579 if (IS_ERR(key))
580 key = NULL;
525 581
526 /* handle a negative dentry */ 582 /* lock down the parent dentry so we can peer at it */
527 if (!inode) 583 parent = dget_parent(dentry);
584 if (!parent->d_inode)
528 goto out_bad; 585 goto out_bad;
529 586
530 /* handle a bad inode */ 587 dir = AFS_FS_I(parent->d_inode);
531 if (is_bad_inode(inode)) {
532 printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n",
533 dentry->d_parent->d_name.name, dentry->d_name.name);
534 goto out_bad;
535 }
536 588
537 /* force a full look up if the parent directory changed since last the 589 /* validate the parent directory */
538 * server was consulted 590 if (test_bit(AFS_VNODE_MODIFIED, &dir->flags))
539 * - otherwise this inode must still exist, even if the inode details 591 afs_validate(dir, key);
540 * themselves have changed
541 */
542 if (AFS_FS_I(dir)->flags & AFS_VNODE_CHANGED)
543 afs_vnode_fetch_status(AFS_FS_I(dir));
544 592
545 if (AFS_FS_I(dir)->flags & AFS_VNODE_DELETED) { 593 if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
546 _debug("%s: parent dir deleted", dentry->d_name.name); 594 _debug("%s: parent dir deleted", dentry->d_name.name);
547 goto out_bad; 595 goto out_bad;
548 } 596 }
549 597
550 if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED) { 598 dir_version = (void *) (unsigned long) dir->status.data_version;
551 _debug("%s: file already deleted", dentry->d_name.name); 599 if (dentry->d_fsdata == dir_version)
552 goto out_bad; 600 goto out_valid; /* the dir contents are unchanged */
553 }
554
555 if ((unsigned long) dentry->d_fsdata !=
556 (unsigned long) AFS_FS_I(dir)->status.version) {
557 _debug("%s: parent changed %lu -> %u",
558 dentry->d_name.name,
559 (unsigned long) dentry->d_fsdata,
560 (unsigned) AFS_FS_I(dir)->status.version);
561 601
562 /* search the directory for this vnode */ 602 _debug("dir modified");
563 cookie.name = dentry->d_name.name;
564 cookie.nlen = dentry->d_name.len;
565 cookie.fid.vid = AFS_FS_I(inode)->volume->vid;
566 cookie.found = 0;
567 603
568 fpos = 0; 604 /* search the directory for this vnode */
569 ret = afs_dir_iterate(dir, &fpos, &cookie, 605 ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key);
570 afs_dir_lookup_filldir); 606 switch (ret) {
571 if (ret < 0) { 607 case 0:
572 _debug("failed to iterate dir %s: %d", 608 /* the filename maps to something */
573 parent->d_name.name, ret); 609 if (!dentry->d_inode)
610 goto out_bad;
611 if (is_bad_inode(dentry->d_inode)) {
612 printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n",
613 parent->d_name.name, dentry->d_name.name);
574 goto out_bad; 614 goto out_bad;
575 }
576
577 if (!cookie.found) {
578 _debug("%s: dirent not found", dentry->d_name.name);
579 goto not_found;
580 } 615 }
581 616
582 /* if the vnode ID has changed, then the dirent points to a 617 /* if the vnode ID has changed, then the dirent points to a
583 * different file */ 618 * different file */
584 if (cookie.fid.vnode != AFS_FS_I(inode)->fid.vnode) { 619 if (fid.vnode != vnode->fid.vnode) {
585 _debug("%s: dirent changed", dentry->d_name.name); 620 _debug("%s: dirent changed [%u != %u]",
621 dentry->d_name.name, fid.vnode,
622 vnode->fid.vnode);
586 goto not_found; 623 goto not_found;
587 } 624 }
588 625
589 /* if the vnode ID uniquifier has changed, then the file has 626 * been deleted and replaced, and the original vnode ID has
590 * been deleted */ 627 * been deleted and replaced, and the original vnode ID has
591 if (cookie.fid.unique != AFS_FS_I(inode)->fid.unique) { 628 * been reused */
629 if (fid.unique != vnode->fid.unique) {
592 _debug("%s: file deleted (uq %u -> %u I:%lu)", 630 _debug("%s: file deleted (uq %u -> %u I:%lu)",
593 dentry->d_name.name, 631 dentry->d_name.name, fid.unique,
594 cookie.fid.unique, 632 vnode->fid.unique, dentry->d_inode->i_version);
595 AFS_FS_I(inode)->fid.unique, 633 spin_lock(&vnode->lock);
596 inode->i_version); 634 set_bit(AFS_VNODE_DELETED, &vnode->flags);
597 spin_lock(&AFS_FS_I(inode)->lock); 635 spin_unlock(&vnode->lock);
598 AFS_FS_I(inode)->flags |= AFS_VNODE_DELETED; 636 goto not_found;
599 spin_unlock(&AFS_FS_I(inode)->lock);
600 invalidate_remote_inode(inode);
601 goto out_bad;
602 } 637 }
638 goto out_valid;
603 639
604 dentry->d_fsdata = 640 case -ENOENT:
605 (void *) (unsigned long) AFS_FS_I(dir)->status.version; 641 /* the filename is unknown */
642 _debug("%s: dirent not found", dentry->d_name.name);
643 if (dentry->d_inode)
644 goto not_found;
645 goto out_valid;
646
647 default:
648 _debug("failed to iterate dir %s: %d",
649 parent->d_name.name, ret);
650 goto out_bad;
606 } 651 }
607 652
608 out_valid: 653out_valid:
654 dentry->d_fsdata = dir_version;
655out_skip:
609 dput(parent); 656 dput(parent);
657 key_put(key);
610 _leave(" = 1 [valid]"); 658 _leave(" = 1 [valid]");
611 return 1; 659 return 1;
612 660
613 /* the dirent, if it exists, now points to a different vnode */ 661 /* the dirent, if it exists, now points to a different vnode */
614 not_found: 662not_found:
615 spin_lock(&dentry->d_lock); 663 spin_lock(&dentry->d_lock);
616 dentry->d_flags |= DCACHE_NFSFS_RENAMED; 664 dentry->d_flags |= DCACHE_NFSFS_RENAMED;
617 spin_unlock(&dentry->d_lock); 665 spin_unlock(&dentry->d_lock);
618 666
619 out_bad: 667out_bad:
620 if (inode) { 668 if (dentry->d_inode) {
621 /* don't unhash if we have submounts */ 669 /* don't unhash if we have submounts */
622 if (have_submounts(dentry)) 670 if (have_submounts(dentry))
623 goto out_valid; 671 goto out_skip;
624 } 672 }
625 673
626 shrink_dcache_parent(dentry);
627
628 _debug("dropping dentry %s/%s", 674 _debug("dropping dentry %s/%s",
629 dentry->d_parent->d_name.name, dentry->d_name.name); 675 parent->d_name.name, dentry->d_name.name);
676 shrink_dcache_parent(dentry);
630 d_drop(dentry); 677 d_drop(dentry);
631
632 dput(parent); 678 dput(parent);
679 key_put(key);
633 680
634 _leave(" = 0 [bad]"); 681 _leave(" = 0 [bad]");
635 return 0; 682 return 0;
636} /* end afs_d_revalidate() */ 683}
637 684
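The fast path through afs_d_revalidate() rests on one invariant: a directory's data version moves only when its contents change, so a dentry stamped with the version seen at lookup time needs no further checking while the stamp still matches. The check in isolation:

	static bool afs_dentry_fresh(const struct dentry *dentry,
				     const struct afs_vnode *dir)
	{
		/* d_fsdata was set to the directory's data version when the
		 * dentry was looked up or last revalidated */
		void *v = (void *)(unsigned long) dir->status.data_version;

		return dentry->d_fsdata == v;
	}
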
638/*****************************************************************************/
639/* 685/*
640 * allow the VFS to enquire as to whether a dentry should be unhashed (mustn't 686 * allow the VFS to enquire as to whether a dentry should be unhashed (mustn't
641 * sleep) 687 * sleep)
@@ -649,15 +695,444 @@ static int afs_d_delete(struct dentry *dentry)
649 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) 695 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
650 goto zap; 696 goto zap;
651 697
652 if (dentry->d_inode) { 698 if (dentry->d_inode &&
653 if (AFS_FS_I(dentry->d_inode)->flags & AFS_VNODE_DELETED) 699 test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags))
654 goto zap; 700 goto zap;
655 }
656 701
657 _leave(" = 0 [keep]"); 702 _leave(" = 0 [keep]");
658 return 0; 703 return 0;
659 704
660 zap: 705zap:
661 _leave(" = 1 [zap]"); 706 _leave(" = 1 [zap]");
662 return 1; 707 return 1;
663} /* end afs_d_delete() */ 708}
709
710/*
711 * handle dentry release
712 */
713static void afs_d_release(struct dentry *dentry)
714{
715 _enter("%s", dentry->d_name.name);
716}
717
718/*
719 * create a directory on an AFS filesystem
720 */
721static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
722{
723 struct afs_file_status status;
724 struct afs_callback cb;
725 struct afs_server *server;
726 struct afs_vnode *dvnode, *vnode;
727 struct afs_fid fid;
728 struct inode *inode;
729 struct key *key;
730 int ret;
731
732 dvnode = AFS_FS_I(dir);
733
734 _enter("{%x:%d},{%s},%o",
735 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
736
737 ret = -ENAMETOOLONG;
738 if (dentry->d_name.len > 255)
739 goto error;
740
741 key = afs_request_key(dvnode->volume->cell);
742 if (IS_ERR(key)) {
743 ret = PTR_ERR(key);
744 goto error;
745 }
746
747 mode |= S_IFDIR;
748 ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
749 mode, &fid, &status, &cb, &server);
750 if (ret < 0)
751 goto mkdir_error;
752
753 inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
754 if (IS_ERR(inode)) {
755 /* ENOMEM at a really inconvenient time - just abandon the new
756 * directory on the server */
757 ret = PTR_ERR(inode);
758 goto iget_error;
759 }
760
761 /* apply the status report we've got for the new vnode */
762 vnode = AFS_FS_I(inode);
763 spin_lock(&vnode->lock);
764 vnode->update_cnt++;
765 spin_unlock(&vnode->lock);
766 afs_vnode_finalise_status_update(vnode, server);
767 afs_put_server(server);
768
769 d_instantiate(dentry, inode);
770 if (d_unhashed(dentry)) {
771 _debug("not hashed");
772 d_rehash(dentry);
773 }
774 key_put(key);
775 _leave(" = 0");
776 return 0;
777
778iget_error:
779 afs_put_server(server);
780mkdir_error:
781 key_put(key);
782error:
783 d_drop(dentry);
784 _leave(" = %d", ret);
785 return ret;
786}
787
788/*
789 * remove a directory from an AFS filesystem
790 */
791static int afs_rmdir(struct inode *dir, struct dentry *dentry)
792{
793 struct afs_vnode *dvnode, *vnode;
794 struct key *key;
795 int ret;
796
797 dvnode = AFS_FS_I(dir);
798
799 _enter("{%x:%d},{%s}",
800 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
801
802 ret = -ENAMETOOLONG;
803 if (dentry->d_name.len > 255)
804 goto error;
805
806 key = afs_request_key(dvnode->volume->cell);
807 if (IS_ERR(key)) {
808 ret = PTR_ERR(key);
809 goto error;
810 }
811
812 ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true);
813 if (ret < 0)
814 goto rmdir_error;
815
816 if (dentry->d_inode) {
817 vnode = AFS_FS_I(dentry->d_inode);
818 clear_nlink(&vnode->vfs_inode);
819 set_bit(AFS_VNODE_DELETED, &vnode->flags);
820 afs_discard_callback_on_delete(vnode);
821 }
822
823 key_put(key);
824 _leave(" = 0");
825 return 0;
826
827rmdir_error:
828 key_put(key);
829error:
830 _leave(" = %d", ret);
831 return ret;
832}
833
834/*
835 * remove a file from an AFS filesystem
836 */
837static int afs_unlink(struct inode *dir, struct dentry *dentry)
838{
839 struct afs_vnode *dvnode, *vnode;
840 struct key *key;
841 int ret;
842
843 dvnode = AFS_FS_I(dir);
844
845 _enter("{%x:%d},{%s}",
846 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
847
848 ret = -ENAMETOOLONG;
849 if (dentry->d_name.len > 255)
850 goto error;
851
852 key = afs_request_key(dvnode->volume->cell);
853 if (IS_ERR(key)) {
854 ret = PTR_ERR(key);
855 goto error;
856 }
857
858 if (dentry->d_inode) {
859 vnode = AFS_FS_I(dentry->d_inode);
860
861 /* make sure we have a callback promise on the victim */
862 ret = afs_validate(vnode, key);
863 if (ret < 0)
864 goto error;
865 }
866
867 ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false);
868 if (ret < 0)
869 goto remove_error;
870
871 if (dentry->d_inode) {
 872 /* if the file wasn't deleted because it still had hard links, the
 873 * fileserver will break the callback promise on the file - if
 874 * it had one - before it returns to us; if the file was deleted,
 875 * it won't
876 *
877 * however, if we didn't have a callback promise outstanding,
878 * or it was outstanding on a different server, then it won't
879 * break it either...
880 */
881 vnode = AFS_FS_I(dentry->d_inode);
882 if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
883 _debug("AFS_VNODE_DELETED");
884 if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
885 _debug("AFS_VNODE_CB_BROKEN");
886 set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
887 ret = afs_validate(vnode, key);
888 _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
889 }
890
891 key_put(key);
892 _leave(" = 0");
893 return 0;
894
895remove_error:
896 key_put(key);
897error:
898 _leave(" = %d", ret);
899 return ret;
900}
901
902/*
903 * create a regular file on an AFS filesystem
904 */
905static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
906 struct nameidata *nd)
907{
908 struct afs_file_status status;
909 struct afs_callback cb;
910 struct afs_server *server;
911 struct afs_vnode *dvnode, *vnode;
912 struct afs_fid fid;
913 struct inode *inode;
914 struct key *key;
915 int ret;
916
917 dvnode = AFS_FS_I(dir);
918
919 _enter("{%x:%d},{%s},%o,",
920 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
921
922 ret = -ENAMETOOLONG;
923 if (dentry->d_name.len > 255)
924 goto error;
925
926 key = afs_request_key(dvnode->volume->cell);
927 if (IS_ERR(key)) {
928 ret = PTR_ERR(key);
929 goto error;
930 }
931
932 mode |= S_IFREG;
933 ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
934 mode, &fid, &status, &cb, &server);
935 if (ret < 0)
936 goto create_error;
937
938 inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
939 if (IS_ERR(inode)) {
940 /* ENOMEM at a really inconvenient time - just abandon the new
 941 * file on the server */
942 ret = PTR_ERR(inode);
943 goto iget_error;
944 }
945
946 /* apply the status report we've got for the new vnode */
947 vnode = AFS_FS_I(inode);
948 spin_lock(&vnode->lock);
949 vnode->update_cnt++;
950 spin_unlock(&vnode->lock);
951 afs_vnode_finalise_status_update(vnode, server);
952 afs_put_server(server);
953
954 d_instantiate(dentry, inode);
955 if (d_unhashed(dentry)) {
956 _debug("not hashed");
957 d_rehash(dentry);
958 }
959 key_put(key);
960 _leave(" = 0");
961 return 0;
962
963iget_error:
964 afs_put_server(server);
965create_error:
966 key_put(key);
967error:
968 d_drop(dentry);
969 _leave(" = %d", ret);
970 return ret;
971}
972
973/*
974 * create a hard link between files in an AFS filesystem
975 */
976static int afs_link(struct dentry *from, struct inode *dir,
977 struct dentry *dentry)
978{
979 struct afs_vnode *dvnode, *vnode;
980 struct key *key;
981 int ret;
982
983 vnode = AFS_FS_I(from->d_inode);
984 dvnode = AFS_FS_I(dir);
985
986 _enter("{%x:%d},{%x:%d},{%s}",
987 vnode->fid.vid, vnode->fid.vnode,
988 dvnode->fid.vid, dvnode->fid.vnode,
989 dentry->d_name.name);
990
991 ret = -ENAMETOOLONG;
992 if (dentry->d_name.len > 255)
993 goto error;
994
995 key = afs_request_key(dvnode->volume->cell);
996 if (IS_ERR(key)) {
997 ret = PTR_ERR(key);
998 goto error;
999 }
1000
1001 ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name);
1002 if (ret < 0)
1003 goto link_error;
1004
1005 atomic_inc(&vnode->vfs_inode.i_count);
1006 d_instantiate(dentry, &vnode->vfs_inode);
1007 key_put(key);
1008 _leave(" = 0");
1009 return 0;
1010
1011link_error:
1012 key_put(key);
1013error:
1014 d_drop(dentry);
1015 _leave(" = %d", ret);
1016 return ret;
1017}
1018
1019/*
1020 * create a symlink in an AFS filesystem
1021 */
1022static int afs_symlink(struct inode *dir, struct dentry *dentry,
1023 const char *content)
1024{
1025 struct afs_file_status status;
1026 struct afs_server *server;
1027 struct afs_vnode *dvnode, *vnode;
1028 struct afs_fid fid;
1029 struct inode *inode;
1030 struct key *key;
1031 int ret;
1032
1033 dvnode = AFS_FS_I(dir);
1034
1035 _enter("{%x:%d},{%s},%s",
1036 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name,
1037 content);
1038
1039 ret = -ENAMETOOLONG;
1040 if (dentry->d_name.len > 255)
1041 goto error;
1042
1043 ret = -EINVAL;
1044 if (strlen(content) > 1023)
1045 goto error;
1046
1047 key = afs_request_key(dvnode->volume->cell);
1048 if (IS_ERR(key)) {
1049 ret = PTR_ERR(key);
1050 goto error;
1051 }
1052
1053 ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content,
1054 &fid, &status, &server);
1055 if (ret < 0)
1056 goto create_error;
1057
1058 inode = afs_iget(dir->i_sb, key, &fid, &status, NULL);
1059 if (IS_ERR(inode)) {
1060 /* ENOMEM at a really inconvenient time - just abandon the new
1061 * symlink on the server */
1062 ret = PTR_ERR(inode);
1063 goto iget_error;
1064 }
1065
1066 /* apply the status report we've got for the new vnode */
1067 vnode = AFS_FS_I(inode);
1068 spin_lock(&vnode->lock);
1069 vnode->update_cnt++;
1070 spin_unlock(&vnode->lock);
1071 afs_vnode_finalise_status_update(vnode, server);
1072 afs_put_server(server);
1073
1074 d_instantiate(dentry, inode);
1075 if (d_unhashed(dentry)) {
1076 _debug("not hashed");
1077 d_rehash(dentry);
1078 }
1079 key_put(key);
1080 _leave(" = 0");
1081 return 0;
1082
1083iget_error:
1084 afs_put_server(server);
1085create_error:
1086 key_put(key);
1087error:
1088 d_drop(dentry);
1089 _leave(" = %d", ret);
1090 return ret;
1091}
1092
1093/*
1094 * rename a file in an AFS filesystem and/or move it between directories
1095 */
1096static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
1097 struct inode *new_dir, struct dentry *new_dentry)
1098{
1099 struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
1100 struct key *key;
1101 int ret;
1102
1103 vnode = AFS_FS_I(old_dentry->d_inode);
1104 orig_dvnode = AFS_FS_I(old_dir);
1105 new_dvnode = AFS_FS_I(new_dir);
1106
1107 _enter("{%x:%d},{%x:%d},{%x:%d},{%s}",
1108 orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
1109 vnode->fid.vid, vnode->fid.vnode,
1110 new_dvnode->fid.vid, new_dvnode->fid.vnode,
1111 new_dentry->d_name.name);
1112
1113 ret = -ENAMETOOLONG;
1114 if (new_dentry->d_name.len > 255)
1115 goto error;
1116
1117 key = afs_request_key(orig_dvnode->volume->cell);
1118 if (IS_ERR(key)) {
1119 ret = PTR_ERR(key);
1120 goto error;
1121 }
1122
1123 ret = afs_vnode_rename(orig_dvnode, new_dvnode, key,
1124 old_dentry->d_name.name,
1125 new_dentry->d_name.name);
1126 if (ret < 0)
1127 goto rename_error;
1128 key_put(key);
1129 _leave(" = 0");
1130 return 0;
1131
1132rename_error:
1133 key_put(key);
1134error:
1135 d_drop(new_dentry);
1136 _leave(" = %d", ret);
1137 return ret;
1138}
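afs_mkdir(), afs_create(), afs_link(), afs_symlink() and afs_rename() above all share one shape: obtain a key for the cell, ask the vnode layer to perform the operation against the server, then release the key on every exit path and drop the dentry on failure. Condensed to a skeleton - afs_vnode_operation() is a hypothetical stand-in for the afs_vnode_*() call each op makes:

	static int afs_dir_op_skeleton(struct afs_vnode *dvnode,
				       struct dentry *dentry)
	{
		struct key *key;
		int ret;

		key = afs_request_key(dvnode->volume->cell);
		if (IS_ERR(key))
			return PTR_ERR(key);

		/* stand-in for afs_vnode_create/remove/link/symlink/rename */
		ret = afs_vnode_operation(dvnode, key, dentry->d_name.name);

		key_put(key);		/* balanced on success and failure */
		if (ret < 0)
			d_drop(dentry);
		return ret;
	}
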
diff --git a/fs/afs/errors.h b/fs/afs/errors.h
deleted file mode 100644
index 574d94ac8d05..000000000000
--- a/fs/afs/errors.h
+++ /dev/null
@@ -1,34 +0,0 @@
1/* errors.h: AFS abort/error codes
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_ERRORS_H
13#define _LINUX_AFS_ERRORS_H
14
15#include "types.h"
16
17/* file server abort codes */
18typedef enum {
19 VSALVAGE = 101, /* volume needs salvaging */
20 VNOVNODE = 102, /* no such file/dir (vnode) */
21 VNOVOL = 103, /* no such volume or volume unavailable */
22 VVOLEXISTS = 104, /* volume name already exists */
23 VNOSERVICE = 105, /* volume not currently in service */
24 VOFFLINE = 106, /* volume is currently offline (more info available [VVL-spec]) */
25 VONLINE = 107, /* volume is already online */
26 VDISKFULL = 108, /* disk partition is full */
27 VOVERQUOTA = 109, /* volume's maximum quota exceeded */
28 VBUSY = 110, /* volume is temporarily unavailable */
29 VMOVED = 111, /* volume moved to new server - ask this FS where */
30} afs_rxfs_abort_t;
31
32extern int afs_abort_to_error(int abortcode);
33
34#endif /* _LINUX_AFS_ERRORS_H */
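afs_abort_to_error(), declared above, is what turns these wire abort codes into errnos for the VFS; its table lives in fs/afs/misc.c and is not part of this hunk. A plausible sketch of its shape - the individual mappings here are assumptions, not quotations:

	int afs_abort_to_error_sketch(int abortcode)
	{
		switch (abortcode) {
		case VNOVNODE:		return -ENOENT;	/* assumed */
		case VDISKFULL:		return -ENOSPC;	/* assumed */
		case VOVERQUOTA:	return -EDQUOT;	/* assumed */
		default:		return -EIO;	/* assumed fallback */
		}
	}
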
diff --git a/fs/afs/file.c b/fs/afs/file.c
index b17634541f67..ae256498f4f7 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -1,6 +1,6 @@
1/* file.c: AFS filesystem file handling 1/* AFS filesystem file handling
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -15,22 +15,25 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include "volume.h"
19#include "vnode.h"
20#include <rxrpc/call.h>
21#include "internal.h" 18#include "internal.h"
22 19
23#if 0
24static int afs_file_open(struct inode *inode, struct file *file);
25static int afs_file_release(struct inode *inode, struct file *file);
26#endif
27
28static int afs_file_readpage(struct file *file, struct page *page); 20static int afs_file_readpage(struct file *file, struct page *page);
29static void afs_file_invalidatepage(struct page *page, unsigned long offset); 21static void afs_file_invalidatepage(struct page *page, unsigned long offset);
30static int afs_file_releasepage(struct page *page, gfp_t gfp_flags); 22static int afs_file_releasepage(struct page *page, gfp_t gfp_flags);
31 23
24const struct file_operations afs_file_operations = {
25 .open = afs_open,
26 .release = afs_release,
27 .llseek = generic_file_llseek,
28 .read = do_sync_read,
29 .aio_read = generic_file_aio_read,
30 .mmap = generic_file_readonly_mmap,
31 .sendfile = generic_file_sendfile,
32};
33
32const struct inode_operations afs_file_inode_operations = { 34const struct inode_operations afs_file_inode_operations = {
33 .getattr = afs_inode_getattr, 35 .getattr = afs_inode_getattr,
36 .permission = afs_permission,
34}; 37};
35 38
36const struct address_space_operations afs_fs_aops = { 39const struct address_space_operations afs_fs_aops = {
@@ -40,7 +43,48 @@ const struct address_space_operations afs_fs_aops = {
40 .invalidatepage = afs_file_invalidatepage, 43 .invalidatepage = afs_file_invalidatepage,
41}; 44};
42 45
43/*****************************************************************************/ 46/*
47 * open an AFS file or directory and attach a key to it
48 */
49int afs_open(struct inode *inode, struct file *file)
50{
51 struct afs_vnode *vnode = AFS_FS_I(inode);
52 struct key *key;
53 int ret;
54
55 _enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
56
57 key = afs_request_key(vnode->volume->cell);
58 if (IS_ERR(key)) {
59 _leave(" = %ld [key]", PTR_ERR(key));
60 return PTR_ERR(key);
61 }
62
63 ret = afs_validate(vnode, key);
64 if (ret < 0) {
65 _leave(" = %d [val]", ret);
66 return ret;
67 }
68
69 file->private_data = key;
70 _leave(" = 0");
71 return 0;
72}
73
74/*
75 * release an AFS file or directory and discard its key
76 */
77int afs_release(struct inode *inode, struct file *file)
78{
79 struct afs_vnode *vnode = AFS_FS_I(inode);
80
81 _enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
82
83 key_put(file->private_data);
84 _leave(" = 0");
85 return 0;
86}
87
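One wrinkle in afs_open() as shown: the afs_validate() failure branch returns without dropping the reference taken by afs_request_key(). A sketch of the same function with the reference balanced, using only the helpers already in this diff:

	int afs_open_balanced(struct inode *inode, struct file *file)
	{
		struct afs_vnode *vnode = AFS_FS_I(inode);
		struct key *key;
		int ret;

		key = afs_request_key(vnode->volume->cell);
		if (IS_ERR(key))
			return PTR_ERR(key);

		ret = afs_validate(vnode, key);
		if (ret < 0) {
			key_put(key);	/* drop the reference on failure too */
			return ret;
		}

		file->private_data = key;	/* released in afs_release() */
		return 0;
	}
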
44/* 88/*
45 * deal with notification that a page was read from the cache 89 * deal with notification that a page was read from the cache
46 */ 90 */
@@ -58,10 +102,9 @@ static void afs_file_readpage_read_complete(void *cookie_data,
58 SetPageUptodate(page); 102 SetPageUptodate(page);
59 unlock_page(page); 103 unlock_page(page);
60 104
61} /* end afs_file_readpage_read_complete() */ 105}
62#endif 106#endif
63 107
64/*****************************************************************************/
65/* 108/*
66 * deal with notification that a page was written to the cache 109 * deal with notification that a page was written to the cache
67 */ 110 */
@@ -74,41 +117,38 @@ static void afs_file_readpage_write_complete(void *cookie_data,
74 _enter("%p,%p,%p,%d", cookie_data, page, data, error); 117 _enter("%p,%p,%p,%d", cookie_data, page, data, error);
75 118
76 unlock_page(page); 119 unlock_page(page);
77 120}
78} /* end afs_file_readpage_write_complete() */
79#endif 121#endif
80 122
81/*****************************************************************************/
82/* 123/*
83 * AFS read page from file (or symlink) 124 * AFS read page from file (or symlink)
84 */ 125 */
85static int afs_file_readpage(struct file *file, struct page *page) 126static int afs_file_readpage(struct file *file, struct page *page)
86{ 127{
87 struct afs_rxfs_fetch_descriptor desc;
88#ifdef AFS_CACHING_SUPPORT
89 struct cachefs_page *pageio;
90#endif
91 struct afs_vnode *vnode; 128 struct afs_vnode *vnode;
92 struct inode *inode; 129 struct inode *inode;
130 struct key *key;
131 size_t len;
132 off_t offset;
93 int ret; 133 int ret;
94 134
95 inode = page->mapping->host; 135 inode = page->mapping->host;
96 136
97 _enter("{%lu},{%lu}", inode->i_ino, page->index); 137 ASSERT(file != NULL);
138 key = file->private_data;
139 ASSERT(key != NULL);
140
141 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
98 142
99 vnode = AFS_FS_I(inode); 143 vnode = AFS_FS_I(inode);
100 144
101 BUG_ON(!PageLocked(page)); 145 BUG_ON(!PageLocked(page));
102 146
103 ret = -ESTALE; 147 ret = -ESTALE;
104 if (vnode->flags & AFS_VNODE_DELETED) 148 if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
105 goto error; 149 goto error;
106 150
107#ifdef AFS_CACHING_SUPPORT 151#ifdef AFS_CACHING_SUPPORT
108 ret = cachefs_page_get_private(page, &pageio, GFP_NOIO);
109 if (ret < 0)
110 goto error;
111
112 /* is it cached? */ 152 /* is it cached? */
113 ret = cachefs_read_or_alloc_page(vnode->cache, 153 ret = cachefs_read_or_alloc_page(vnode->cache,
114 page, 154 page,
@@ -132,26 +172,19 @@ static int afs_file_readpage(struct file *file, struct page *page)
132 case -ENOBUFS: 172 case -ENOBUFS:
133 case -ENODATA: 173 case -ENODATA:
134 default: 174 default:
135 desc.fid = vnode->fid; 175 offset = page->index << PAGE_CACHE_SHIFT;
136 desc.offset = page->index << PAGE_CACHE_SHIFT; 176 len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
137 desc.size = min((size_t) (inode->i_size - desc.offset),
138 (size_t) PAGE_SIZE);
139 desc.buffer = kmap(page);
140
141 clear_page(desc.buffer);
142 177
143 /* read the contents of the file from the server into the 178 /* read the contents of the file from the server into the
144 * page */ 179 * page */
145 ret = afs_vnode_fetch_data(vnode, &desc); 180 ret = afs_vnode_fetch_data(vnode, key, offset, len, page);
146 kunmap(page);
147 if (ret < 0) { 181 if (ret < 0) {
148 if (ret==-ENOENT) { 182 if (ret == -ENOENT) {
149 _debug("got NOENT from server" 183 _debug("got NOENT from server"
150 " - marking file deleted and stale"); 184 " - marking file deleted and stale");
151 vnode->flags |= AFS_VNODE_DELETED; 185 set_bit(AFS_VNODE_DELETED, &vnode->flags);
152 ret = -ESTALE; 186 ret = -ESTALE;
153 } 187 }
154
155#ifdef AFS_CACHING_SUPPORT 188#ifdef AFS_CACHING_SUPPORT
156 cachefs_uncache_page(vnode->cache, page); 189 cachefs_uncache_page(vnode->cache, page);
157#endif 190#endif
@@ -178,16 +211,13 @@ static int afs_file_readpage(struct file *file, struct page *page)
178 _leave(" = 0"); 211 _leave(" = 0");
179 return 0; 212 return 0;
180 213
181 error: 214error:
182 SetPageError(page); 215 SetPageError(page);
183 unlock_page(page); 216 unlock_page(page);
184
185 _leave(" = %d", ret); 217 _leave(" = %d", ret);
186 return ret; 218 return ret;
219}
187 220
188} /* end afs_file_readpage() */
189
190/*****************************************************************************/
191/* 221/*
192 * get a page cookie for the specified page 222 * get a page cookie for the specified page
193 */ 223 */
@@ -202,10 +232,9 @@ int afs_cache_get_page_cookie(struct page *page,
202 232
203 _leave(" = %d", ret); 233 _leave(" = %d", ret);
204 return ret; 234 return ret;
205} /* end afs_cache_get_page_cookie() */ 235}
206#endif 236#endif
207 237
208/*****************************************************************************/
209/* 238/*
210 * invalidate part or all of a page 239 * invalidate part or all of a page
211 */ 240 */
@@ -240,9 +269,8 @@ static void afs_file_invalidatepage(struct page *page, unsigned long offset)
240 } 269 }
241 270
242 _leave(" = %d", ret); 271 _leave(" = %d", ret);
243} /* end afs_file_invalidatepage() */ 272}
244 273
245/*****************************************************************************/
246/* 274/*
247 * release a page and cleanup its private data 275 * release a page and cleanup its private data
248 */ 276 */
@@ -267,4 +295,4 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
267 295
268 _leave(" = 0"); 296 _leave(" = 0");
269 return 0; 297 return 0;
270} /* end afs_file_releasepage() */ 298}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 61bc371532ab..e54e6c2ad343 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -1,6 +1,6 @@
1/* fsclient.c: AFS File Server client stubs 1/* AFS File Server client stubs
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -11,827 +11,928 @@
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <rxrpc/rxrpc.h> 14#include <linux/circ_buf.h>
15#include <rxrpc/transport.h>
16#include <rxrpc/connection.h>
17#include <rxrpc/call.h>
18#include "fsclient.h"
19#include "cmservice.h"
20#include "vnode.h"
21#include "server.h"
22#include "errors.h"
23#include "internal.h" 15#include "internal.h"
16#include "afs_fs.h"
24 17
25#define FSFETCHSTATUS 132 /* AFS Fetch file status */
26#define FSFETCHDATA 130 /* AFS Fetch file data */
27#define FSGIVEUPCALLBACKS 147 /* AFS Discard callback promises */
28#define FSGETVOLUMEINFO 148 /* AFS Get root volume information */
29#define FSGETROOTVOLUME 151 /* AFS Get root volume name */
30#define FSLOOKUP 161 /* AFS lookup file in directory */
31
32/*****************************************************************************/
33/* 18/*
34 * map afs abort codes to/from Linux error codes 19 * decode an AFSFid block
35 * - called with call->lock held
36 */ 20 */
37static void afs_rxfs_aemap(struct rxrpc_call *call) 21static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid)
38{ 22{
39 switch (call->app_err_state) { 23 const __be32 *bp = *_bp;
40 case RXRPC_ESTATE_LOCAL_ABORT: 24
41 call->app_abort_code = -call->app_errno; 25 fid->vid = ntohl(*bp++);
42 break; 26 fid->vnode = ntohl(*bp++);
43 case RXRPC_ESTATE_PEER_ABORT: 27 fid->unique = ntohl(*bp++);
44 call->app_errno = afs_abort_to_error(call->app_abort_code); 28 *_bp = bp;
45 break; 29}
46 default:
47 break;
48 }
49} /* end afs_rxfs_aemap() */
50 30
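All of the xdr_decode_*() helpers that follow share one convention: the caller holds a cursor into the reply buffer and each helper advances it past exactly the words it consumed, so decoders compose by plain sequencing. Usage sketch:

	static void decode_reply_sketch(struct afs_call *call,
					struct afs_fid *fid)
	{
		const __be32 *bp = call->buffer;

		xdr_decode_AFSFid(&bp, fid);
		/* bp now points at whatever follows the FID on the wire */
	}
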
51/*****************************************************************************/
52/* 31/*
53 * get the root volume name from a fileserver 32 * decode an AFSFetchStatus block
54 * - this operation doesn't seem to work correctly in OpenAFS server 1.2.2
55 */ 33 */
56#if 0 34static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
57int afs_rxfs_get_root_volume(struct afs_server *server, 35 struct afs_file_status *status,
58 char *buf, size_t *buflen) 36 struct afs_vnode *vnode)
59{ 37{
60 struct rxrpc_connection *conn; 38 const __be32 *bp = *_bp;
61 struct rxrpc_call *call; 39 umode_t mode;
62 struct kvec piov[2]; 40 u64 data_version, size;
63 size_t sent; 41 u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
64 int ret; 42
65 u32 param[1]; 43#define EXTRACT(DST) \
44 do { \
45 u32 x = ntohl(*bp++); \
46 changed |= DST - x; \
47 DST = x; \
48 } while (0)
49
50 status->if_version = ntohl(*bp++);
51 EXTRACT(status->type);
52 EXTRACT(status->nlink);
53 size = ntohl(*bp++);
54 data_version = ntohl(*bp++);
55 EXTRACT(status->author);
56 EXTRACT(status->owner);
57 EXTRACT(status->caller_access); /* call ticket dependent */
58 EXTRACT(status->anon_access);
59 EXTRACT(status->mode);
60 EXTRACT(status->parent.vnode);
61 EXTRACT(status->parent.unique);
62 bp++; /* seg size */
63 status->mtime_client = ntohl(*bp++);
64 status->mtime_server = ntohl(*bp++);
65 EXTRACT(status->group);
66 bp++; /* sync counter */
67 data_version |= (u64) ntohl(*bp++) << 32;
68 bp++; /* lock count */
69 size |= (u64) ntohl(*bp++) << 32;
70 bp++; /* spare 4 */
71 *_bp = bp;
72
73 if (size != status->size) {
74 status->size = size;
75 changed |= true;
76 }
77 status->mode &= S_IALLUGO;
78
79 _debug("vnode time %lx, %lx",
80 status->mtime_client, status->mtime_server);
81
82 if (vnode) {
83 status->parent.vid = vnode->fid.vid;
84 if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
85 _debug("vnode changed");
86 i_size_write(&vnode->vfs_inode, size);
87 vnode->vfs_inode.i_uid = status->owner;
88 vnode->vfs_inode.i_gid = status->group;
89 vnode->vfs_inode.i_version = vnode->fid.unique;
90 vnode->vfs_inode.i_nlink = status->nlink;
91
92 mode = vnode->vfs_inode.i_mode;
93 mode &= ~S_IALLUGO;
94 mode |= status->mode;
95 barrier();
96 vnode->vfs_inode.i_mode = mode;
97 }
66 98
67 DECLARE_WAITQUEUE(myself, current); 99 vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server;
100 vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime;
101 vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime;
102 }
68 103
69 kenter("%p,%p,%u",server, buf, *buflen); 104 if (status->data_version != data_version) {
105 status->data_version = data_version;
106 if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
107 _debug("vnode modified %llx on {%x:%u}",
108 (unsigned long long) data_version,
109 vnode->fid.vid, vnode->fid.vnode);
110 set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
111 set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
112 }
113 }
114}
70 115
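The EXTRACT() macro above doubles as a branch-free change detector: subtracting the wire value from the cached one ORs a non-zero residue into 'changed' whenever any field differs, and a single test at the end decides whether the inode needs refreshing. The trick in isolation (XOR would serve equally well):

	static u32 extract_u32(u32 *cached, __be32 wire, u32 changed)
	{
		u32 x = ntohl(wire);

		changed |= *cached - x;	/* non-zero iff the field moved */
		*cached = x;
		return changed;
	}
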
71 /* get hold of the fileserver connection */ 116/*
72 ret = afs_server_get_fsconn(server, &conn); 117 * decode an AFSCallBack block
73 if (ret < 0) 118 */
74 goto out; 119static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode)
120{
121 const __be32 *bp = *_bp;
75 122
76 /* create a call through that connection */ 123 vnode->cb_version = ntohl(*bp++);
77 ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call); 124 vnode->cb_expiry = ntohl(*bp++);
78 if (ret < 0) { 125 vnode->cb_type = ntohl(*bp++);
79 printk("kAFS: Unable to create call: %d\n", ret); 126 vnode->cb_expires = vnode->cb_expiry + get_seconds();
80 goto out_put_conn; 127 *_bp = bp;
81 } 128}
82 call->app_opcode = FSGETROOTVOLUME;
83 129
84 /* we want to get event notifications from the call */ 130static void xdr_decode_AFSCallBack_raw(const __be32 **_bp,
85 add_wait_queue(&call->waitq, &myself); 131 struct afs_callback *cb)
132{
133 const __be32 *bp = *_bp;
86 134
87 /* marshall the parameters */ 135 cb->version = ntohl(*bp++);
88 param[0] = htonl(FSGETROOTVOLUME); 136 cb->expiry = ntohl(*bp++);
89 137 cb->type = ntohl(*bp++);
90 piov[0].iov_len = sizeof(param); 138 *_bp = bp;
91 piov[0].iov_base = param; 139}
92
93 /* send the parameters to the server */
94 ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
95 0, &sent);
96 if (ret < 0)
97 goto abort;
98
99 /* wait for the reply to completely arrive */
100 for (;;) {
101 set_current_state(TASK_INTERRUPTIBLE);
102 if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
103 signal_pending(current))
104 break;
105 schedule();
106 }
107 set_current_state(TASK_RUNNING);
108 140
109 ret = -EINTR; 141/*
110 if (signal_pending(current)) 142 * decode an AFSVolSync block
111 goto abort; 143 */
144static void xdr_decode_AFSVolSync(const __be32 **_bp,
145 struct afs_volsync *volsync)
146{
147 const __be32 *bp = *_bp;
112 148
113 switch (call->app_call_state) { 149 volsync->creation = ntohl(*bp++);
114 case RXRPC_CSTATE_ERROR: 150 bp++; /* spare2 */
115 ret = call->app_errno; 151 bp++; /* spare3 */
116 kdebug("Got Error: %d", ret); 152 bp++; /* spare4 */
117 goto out_unwait; 153 bp++; /* spare5 */
154 bp++; /* spare6 */
155 *_bp = bp;
156}
118 157
119 case RXRPC_CSTATE_CLNT_GOT_REPLY: 158/*
120 /* read the reply */ 159 * deliver reply data to an FS.FetchStatus
121 kdebug("Got Reply: qty=%d", call->app_ready_qty); 160 */
161static int afs_deliver_fs_fetch_status(struct afs_call *call,
162 struct sk_buff *skb, bool last)
163{
164 struct afs_vnode *vnode = call->reply;
165 const __be32 *bp;
122 166
123 ret = -EBADMSG; 167 _enter(",,%u", last);
124 if (call->app_ready_qty <= 4)
125 goto abort;
126 168
127 ret = rxrpc_call_read_data(call, NULL, call->app_ready_qty, 0); 169 afs_transfer_reply(call, skb);
128 if (ret < 0) 170 if (!last)
129 goto abort; 171 return 0;
130 172
131#if 0 173 if (call->reply_size != call->reply_max)
132 /* unmarshall the reply */ 174 return -EBADMSG;
133 bp = buffer;
134 for (loop = 0; loop < 65; loop++)
135 entry->name[loop] = ntohl(*bp++);
136 entry->name[64] = 0;
137 175
138 entry->type = ntohl(*bp++); 176 /* unmarshall the reply once we've received all of it */
139 entry->num_servers = ntohl(*bp++); 177 bp = call->buffer;
178 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
179 xdr_decode_AFSCallBack(&bp, vnode);
180 if (call->reply2)
181 xdr_decode_AFSVolSync(&bp, call->reply2);
140 182
141 for (loop = 0; loop < 8; loop++) 183 _leave(" = 0 [done]");
142 entry->servers[loop].addr.s_addr = *bp++; 184 return 0;
185}
143 186
144 for (loop = 0; loop < 8; loop++) 187/*
145 entry->servers[loop].partition = ntohl(*bp++); 188 * FS.FetchStatus operation type
189 */
190static const struct afs_call_type afs_RXFSFetchStatus = {
191 .name = "FS.FetchStatus",
192 .deliver = afs_deliver_fs_fetch_status,
193 .abort_to_error = afs_abort_to_error,
194 .destructor = afs_flat_call_destructor,
195};
146 196
147 for (loop = 0; loop < 8; loop++) 197/*
148 entry->servers[loop].flags = ntohl(*bp++); 198 * fetch the status information for a file
199 */
200int afs_fs_fetch_file_status(struct afs_server *server,
201 struct key *key,
202 struct afs_vnode *vnode,
203 struct afs_volsync *volsync,
204 const struct afs_wait_mode *wait_mode)
205{
206 struct afs_call *call;
207 __be32 *bp;
149 208
150 for (loop = 0; loop < 3; loop++) 209 _enter(",%x,{%x:%d},,",
151 entry->volume_ids[loop] = ntohl(*bp++); 210 key_serial(key), vnode->fid.vid, vnode->fid.vnode);
152 211
153 entry->clone_id = ntohl(*bp++); 212 call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
154 entry->flags = ntohl(*bp); 213 if (!call)
155#endif 214 return -ENOMEM;
156 215
157 /* success */ 216 call->key = key;
158 ret = 0; 217 call->reply = vnode;
159 goto out_unwait; 218 call->reply2 = volsync;
219 call->service_id = FS_SERVICE;
220 call->port = htons(AFS_FS_PORT);
160 221
161 default: 222 /* marshall the parameters */
162 BUG(); 223 bp = call->request;
163 } 224 bp[0] = htonl(FSFETCHSTATUS);
225 bp[1] = htonl(vnode->fid.vid);
226 bp[2] = htonl(vnode->fid.vnode);
227 bp[3] = htonl(vnode->fid.unique);
228
229 return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
230}
164 231
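The buffer sizes passed to afs_alloc_flat_call() above fall straight out of the wire format: the request is four words (the opcode plus a three-word AFSFid), hence 16 bytes, and the reply is an AFSFetchStatus (21 words, exactly what xdr_decode_AFSFetchStatus() consumes) followed by an AFSCallBack (3 words) and an AFSVolSync (6 words), hence (21 + 3 + 6) * 4 = 120 bytes - the same 120 bytes the deleted rxrpc code read into its scratch buffer below.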
165 abort:
166 set_current_state(TASK_UNINTERRUPTIBLE);
167 rxrpc_call_abort(call, ret);
168 schedule();
169 out_unwait:
170 set_current_state(TASK_RUNNING);
171 remove_wait_queue(&call->waitq, &myself);
172 rxrpc_put_call(call);
173 out_put_conn:
174 afs_server_release_fsconn(server, conn);
175 out:
176 kleave("");
177 return ret;
178} /* end afs_rxfs_get_root_volume() */
179#endif
180
181/*****************************************************************************/
182/* 232/*
183 * get information about a volume 233 * deliver reply data to an FS.FetchData
184 */ 234 */
185#if 0 235static int afs_deliver_fs_fetch_data(struct afs_call *call,
186int afs_rxfs_get_volume_info(struct afs_server *server, 236 struct sk_buff *skb, bool last)
187 const char *name,
188 struct afs_volume_info *vinfo)
189{ 237{
190 struct rxrpc_connection *conn; 238 struct afs_vnode *vnode = call->reply;
191 struct rxrpc_call *call; 239 const __be32 *bp;
192 struct kvec piov[3]; 240 struct page *page;
193 size_t sent; 241 void *buffer;
194 int ret; 242 int ret;
195 u32 param[2], *bp, zero;
196 243
197 DECLARE_WAITQUEUE(myself, current); 244 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
245
246 switch (call->unmarshall) {
247 case 0:
248 call->offset = 0;
249 call->unmarshall++;
250
251 /* extract the returned data length */
252 case 1:
253 _debug("extract data length");
254 ret = afs_extract_data(call, skb, last, &call->tmp, 4);
255 switch (ret) {
256 case 0: break;
257 case -EAGAIN: return 0;
258 default: return ret;
259 }
-
-	_enter("%p,%s,%p", server, name, vinfo);
-
-	/* get hold of the fileserver connection */
-	ret = afs_server_get_fsconn(server, &conn);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSGETVOLUMEINFO;
-
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
-
-	/* marshall the parameters */
-	piov[1].iov_len = strlen(name);
-	piov[1].iov_base = (char *) name;
-
-	zero = 0;
-	piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
-	piov[2].iov_base = &zero;
-
-	param[0] = htonl(FSGETVOLUMEINFO);
-	param[1] = htonl(piov[1].iov_len);
-
-	piov[0].iov_len = sizeof(param);
-	piov[0].iov_base = param;
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	bp = rxrpc_call_alloc_scratch(call, 64);
-
-	ret = rxrpc_call_read_data(call, bp, 64,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0) {
-		if (ret == -ECONNABORTED) {
-			ret = call->app_errno;
-			goto out_unwait;
-		}
-		goto abort;
-	}
-
-	/* unmarshall the reply */
-	vinfo->vid = ntohl(*bp++);
-	vinfo->type = ntohl(*bp++);
-
-	vinfo->type_vids[0] = ntohl(*bp++);
-	vinfo->type_vids[1] = ntohl(*bp++);
-	vinfo->type_vids[2] = ntohl(*bp++);
-	vinfo->type_vids[3] = ntohl(*bp++);
-	vinfo->type_vids[4] = ntohl(*bp++);
-
-	vinfo->nservers = ntohl(*bp++);
-	vinfo->servers[0].addr.s_addr = *bp++;
-	vinfo->servers[1].addr.s_addr = *bp++;
-	vinfo->servers[2].addr.s_addr = *bp++;
-	vinfo->servers[3].addr.s_addr = *bp++;
-	vinfo->servers[4].addr.s_addr = *bp++;
-	vinfo->servers[5].addr.s_addr = *bp++;
-	vinfo->servers[6].addr.s_addr = *bp++;
-	vinfo->servers[7].addr.s_addr = *bp++;
-
-	ret = -EBADMSG;
-	if (vinfo->nservers > 8)
-		goto abort;
-
-	/* success */
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_fsconn(server, conn);
- out:
-	_leave("");
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-
-} /* end afs_rxfs_get_volume_info() */
-#endif
-
+
+		call->count = ntohl(call->tmp);
+		_debug("DATA length: %u", call->count);
+		if (call->count > PAGE_SIZE)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+		if (call->count < PAGE_SIZE) {
+			page = call->reply3;
+			buffer = kmap_atomic(page, KM_USER0);
+			memset(buffer + PAGE_SIZE - call->count, 0,
+			       call->count);
+			kunmap_atomic(buffer, KM_USER0);
+		}
+
+		/* extract the returned data */
+	case 2:
+		_debug("extract data");
+		page = call->reply3;
+		buffer = kmap_atomic(page, KM_USER0);
+		ret = afs_extract_data(call, skb, last, buffer, call->count);
+		kunmap_atomic(buffer, KM_USER0);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the metadata */
+	case 3:
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       (21 + 3 + 6) * 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		bp = call->buffer;
+		xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+		xdr_decode_AFSCallBack(&bp, vnode);
+		if (call->reply2)
+			xdr_decode_AFSVolSync(&bp, call->reply2);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+	case 4:
+		_debug("trailer");
+		if (skb->len != 0)
+			return -EBADMSG;
+		break;
+	}
+
+	if (!last)
+		return 0;
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
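The new deliver routine above is a resumable state machine: each received packet advances call->unmarshall, and a short read returns to the caller until more data arrives. The following is a minimal, standalone userspace sketch of the same accumulate-and-advance pattern; the struct and function names (model_call, extract) are invented for illustration and are not the kernel code.

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct model_call {
	unsigned unmarshall;	/* current unmarshalling phase */
	size_t offset;		/* bytes gathered so far in this phase */
	unsigned char buf[16];	/* staging area for the current item */
};

/* gather up to "want" bytes for the current phase; -EAGAIN = need more */
static int extract(struct model_call *call, const unsigned char **data,
		   size_t *len, size_t want)
{
	size_t n = want - call->offset;

	if (n > *len)
		n = *len;
	memcpy(call->buf + call->offset, *data, n);
	call->offset += n;
	*data += n;
	*len -= n;
	return call->offset < want ? -EAGAIN : 0;
}

int main(void)
{
	static const unsigned char pkt1[] = { 0, 0, 0, 4, 'a', 'b' };
	static const unsigned char pkt2[] = { 'c', 'd' };
	struct model_call call = { .unmarshall = 1, .offset = 0 };
	const unsigned char *p = pkt1;
	size_t n = sizeof(pkt1);

	while (call.unmarshall < 3) {	/* two 4-byte phases in this demo */
		if (extract(&call, &p, &n, 4) == -EAGAIN) {
			/* out of data: the "next packet" arrives; in this
			 * demo pkt2 supplies everything still needed */
			p = pkt2;
			n = sizeof(pkt2);
			continue;
		}
		printf("phase %u complete\n", call.unmarshall);
		call.unmarshall++;
		call.offset = 0;
	}
	return 0;
}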
-/*****************************************************************************/
-/*
- * fetch the status information for a file
- */
-int afs_rxfs_fetch_file_status(struct afs_server *server,
-			       struct afs_vnode *vnode,
-			       struct afs_volsync *volsync)
-{
-	struct afs_server_callslot callslot;
-	struct rxrpc_call *call;
-	struct kvec piov[1];
-	size_t sent;
-	int ret;
-	__be32 *bp;
-
-	DECLARE_WAITQUEUE(myself, current);
-
-	_enter("%p,{%u,%u,%u}",
-	       server, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
-
-	/* get hold of the fileserver connection */
-	ret = afs_server_request_callslot(server, &callslot);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap,
-				&call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSFETCHSTATUS;
-
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
-
-	/* marshall the parameters */
-	bp = rxrpc_call_alloc_scratch(call, 16);
-	bp[0] = htonl(FSFETCHSTATUS);
-	bp[1] = htonl(vnode->fid.vid);
-	bp[2] = htonl(vnode->fid.vnode);
-	bp[3] = htonl(vnode->fid.unique);
-
-	piov[0].iov_len = 16;
-	piov[0].iov_base = bp;
+/*
+ * FS.FetchData operation type
+ */
+static const struct afs_call_type afs_RXFSFetchData = {
+	.name		= "FS.FetchData",
+	.deliver	= afs_deliver_fs_fetch_data,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * fetch data from a file
+ */
+int afs_fs_fetch_data(struct afs_server *server,
+		      struct key *key,
+		      struct afs_vnode *vnode,
+		      off_t offset, size_t length,
+		      struct page *buffer,
+		      const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = NULL; /* volsync */
+	call->reply3 = buffer;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp[0] = htonl(FSFETCHDATA);
+	bp[1] = htonl(vnode->fid.vid);
+	bp[2] = htonl(vnode->fid.vnode);
+	bp[3] = htonl(vnode->fid.unique);
+	bp[4] = htonl(offset);
+	bp[5] = htonl(length);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
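The request side of such a flat call is just a run of 32-bit big-endian words: the opcode, the FID, then the operation's arguments. A standalone userspace sketch of the same marshalling follows; the FSFETCHDATA opcode value is an assumption taken from the AFS protocol tables, and the local buffer stands in for call->request.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define FSFETCHDATA 130		/* assumed RXFS opcode; illustrative only */

struct fid { uint32_t vid, vnode, unique; };

/* write the six-word FS.FetchData request; returns its size in bytes */
static size_t marshal_fetch_data(uint32_t *bp, const struct fid *fid,
				 uint32_t offset, uint32_t length)
{
	uint32_t *start = bp;

	*bp++ = htonl(FSFETCHDATA);
	*bp++ = htonl(fid->vid);
	*bp++ = htonl(fid->vnode);
	*bp++ = htonl(fid->unique);
	*bp++ = htonl(offset);
	*bp++ = htonl(length);
	return (bp - start) * sizeof(uint32_t);	/* 24, matching the call */
}

int main(void)
{
	uint32_t req[6];
	struct fid fid = { .vid = 1, .vnode = 2, .unique = 3 };

	printf("request size = %zu bytes\n",
	       marshal_fetch_data(req, &fid, 0, 4096));
	return 0;
}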
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	bp = rxrpc_call_alloc_scratch(call, 120);
-
-	ret = rxrpc_call_read_data(call, bp, 120,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0) {
-		if (ret == -ECONNABORTED) {
-			ret = call->app_errno;
-			goto out_unwait;
-		}
-		goto abort;
-	}
-
-	/* unmarshall the reply */
-	vnode->status.if_version = ntohl(*bp++);
-	vnode->status.type = ntohl(*bp++);
-	vnode->status.nlink = ntohl(*bp++);
-	vnode->status.size = ntohl(*bp++);
-	vnode->status.version = ntohl(*bp++);
-	vnode->status.author = ntohl(*bp++);
-	vnode->status.owner = ntohl(*bp++);
-	vnode->status.caller_access = ntohl(*bp++);
-	vnode->status.anon_access = ntohl(*bp++);
-	vnode->status.mode = ntohl(*bp++);
-	vnode->status.parent.vid = vnode->fid.vid;
-	vnode->status.parent.vnode = ntohl(*bp++);
-	vnode->status.parent.unique = ntohl(*bp++);
-	bp++; /* seg size */
-	vnode->status.mtime_client = ntohl(*bp++);
-	vnode->status.mtime_server = ntohl(*bp++);
-	bp++; /* group */
-	bp++; /* sync counter */
-	vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-	bp++; /* spare2 */
-	bp++; /* spare3 */
-	bp++; /* spare4 */
-
-	vnode->cb_version = ntohl(*bp++);
-	vnode->cb_expiry = ntohl(*bp++);
-	vnode->cb_type = ntohl(*bp++);
-
+
+/*
+ * deliver reply data to an FS.GiveUpCallBacks
+ */
+static int afs_deliver_fs_give_up_callbacks(struct afs_call *call,
+					    struct sk_buff *skb, bool last)
+{
+	_enter(",{%u},%d", skb->len, last);
+
+	if (skb->len > 0)
+		return -EBADMSG; /* shouldn't be any reply data */
+	return 0;
+}
+
-	if (volsync) {
-		volsync->creation = ntohl(*bp++);
-		bp++; /* spare2 */
-		bp++; /* spare3 */
-		bp++; /* spare4 */
-		bp++; /* spare5 */
-		bp++; /* spare6 */
-	}
-
-	/* success */
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_callslot(server, &callslot);
- out:
-	_leave("");
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-} /* end afs_rxfs_fetch_file_status() */
-
-/*****************************************************************************/
-/*
- * fetch the contents of a file or directory
- */
-int afs_rxfs_fetch_file_data(struct afs_server *server,
-			     struct afs_vnode *vnode,
-			     struct afs_rxfs_fetch_descriptor *desc,
-			     struct afs_volsync *volsync)
-{
-	struct afs_server_callslot callslot;
-	struct rxrpc_call *call;
-	struct kvec piov[1];
-	size_t sent;
-	int ret;
-	__be32 *bp;
-
-	DECLARE_WAITQUEUE(myself, current);
-
-	_enter("%p,{fid={%u,%u,%u},sz=%Zu,of=%lu}",
-	       server,
-	       desc->fid.vid,
-	       desc->fid.vnode,
-	       desc->fid.unique,
-	       desc->size,
-	       desc->offset);
-
-	/* get hold of the fileserver connection */
-	ret = afs_server_request_callslot(server, &callslot);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSFETCHDATA;
-
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
-
-	/* marshall the parameters */
-	bp = rxrpc_call_alloc_scratch(call, 24);
-	bp[0] = htonl(FSFETCHDATA);
-	bp[1] = htonl(desc->fid.vid);
-	bp[2] = htonl(desc->fid.vnode);
-	bp[3] = htonl(desc->fid.unique);
-	bp[4] = htonl(desc->offset);
-	bp[5] = htonl(desc->size);
-
-	piov[0].iov_len = 24;
-	piov[0].iov_base = bp;
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the data count to arrive */
-	ret = rxrpc_call_read_data(call, bp, 4, RXRPC_CALL_READ_BLOCK);
-	if (ret < 0)
-		goto read_failed;
-
-	desc->actual = ntohl(bp[0]);
-	if (desc->actual != desc->size) {
-		ret = -EBADMSG;
-		goto abort;
-	}
-
-	/* call the app to read the actual data */
-	rxrpc_call_reset_scratch(call);
-
-	ret = rxrpc_call_read_data(call, desc->buffer, desc->actual,
-				   RXRPC_CALL_READ_BLOCK);
-	if (ret < 0)
-		goto read_failed;
-
-	/* wait for the rest of the reply to completely arrive */
-	rxrpc_call_reset_scratch(call);
-	bp = rxrpc_call_alloc_scratch(call, 120);
-
-	ret = rxrpc_call_read_data(call, bp, 120,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0)
-		goto read_failed;
-
-	/* unmarshall the reply */
-	vnode->status.if_version = ntohl(*bp++);
-	vnode->status.type = ntohl(*bp++);
-	vnode->status.nlink = ntohl(*bp++);
-	vnode->status.size = ntohl(*bp++);
-	vnode->status.version = ntohl(*bp++);
-	vnode->status.author = ntohl(*bp++);
-	vnode->status.owner = ntohl(*bp++);
-	vnode->status.caller_access = ntohl(*bp++);
-	vnode->status.anon_access = ntohl(*bp++);
-	vnode->status.mode = ntohl(*bp++);
-	vnode->status.parent.vid = desc->fid.vid;
-	vnode->status.parent.vnode = ntohl(*bp++);
-	vnode->status.parent.unique = ntohl(*bp++);
-	bp++; /* seg size */
-	vnode->status.mtime_client = ntohl(*bp++);
-	vnode->status.mtime_server = ntohl(*bp++);
-	bp++; /* group */
-	bp++; /* sync counter */
-	vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-	bp++; /* spare2 */
-	bp++; /* spare3 */
-	bp++; /* spare4 */
-
-	vnode->cb_version = ntohl(*bp++);
-	vnode->cb_expiry = ntohl(*bp++);
+
+/*
+ * FS.GiveUpCallBacks operation type
+ */
+static const struct afs_call_type afs_RXFSGiveUpCallBacks = {
+	.name		= "FS.GiveUpCallBacks",
+	.deliver	= afs_deliver_fs_give_up_callbacks,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * give up a set of callbacks
+ * - the callbacks are held in the server->cb_break ring
+ */
+int afs_fs_give_up_callbacks(struct afs_server *server,
+			     const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t ncallbacks;
+	__be32 *bp, *tp;
+	int loop;
+
+	ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail,
+			      ARRAY_SIZE(server->cb_break));
+
+	_enter("{%zu},", ncallbacks);
+
+	if (ncallbacks == 0)
+		return 0;
+	if (ncallbacks > AFSCBMAX)
+		ncallbacks = AFSCBMAX;
+
+	_debug("break %zu callbacks", ncallbacks);
+
+	call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks,
+				   12 + ncallbacks * 6 * 4, 0);
+	if (!call)
+		return -ENOMEM;
+
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	tp = bp + 2 + ncallbacks * 3;
+	*bp++ = htonl(FSGIVEUPCALLBACKS);
+	*bp++ = htonl(ncallbacks);
+	*tp++ = htonl(ncallbacks);
+
+	atomic_sub(ncallbacks, &server->cb_break_n);
+	for (loop = ncallbacks; loop > 0; loop--) {
+		struct afs_callback *cb =
+			&server->cb_break[server->cb_break_tail];
+
+		*bp++ = htonl(cb->fid.vid);
+		*bp++ = htonl(cb->fid.vnode);
+		*bp++ = htonl(cb->fid.unique);
+		*tp++ = htonl(cb->version);
+		*tp++ = htonl(cb->expiry);
+		*tp++ = htonl(cb->type);
+		smp_mb();
+		server->cb_break_tail =
+			(server->cb_break_tail + 1) &
+			(ARRAY_SIZE(server->cb_break) - 1);
+	}
+
+	ASSERT(ncallbacks > 0);
+	wake_up_nr(&server->cb_break_waitq, ncallbacks);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
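The cb_break ring drained here is a power-of-two circular buffer counted and masked exactly as above. The following standalone userspace sketch models the consumer side; the CIRC_CNT macro is redefined locally with the same formula as the kernel's <linux/circ_buf.h>, and the batch cap plays the role AFSCBMAX plays in the call.

#include <stdio.h>

#define RING_SIZE 64	/* must be a power of two for the mask to work */
#define CIRC_CNT(head, tail, size) (((head) - (tail)) & ((size) - 1))

struct ring {
	int slots[RING_SIZE];
	unsigned head;		/* producer writes here */
	unsigned tail;		/* consumer reads here */
};

/* drain up to "max" entries, advancing the tail with a mask, not a modulo */
static unsigned consume_batch(struct ring *r, unsigned max)
{
	unsigned n = CIRC_CNT(r->head, r->tail, RING_SIZE);

	if (n > max)
		n = max;	/* cap the batch, as AFSCBMAX caps one RPC */
	for (unsigned i = 0; i < n; i++) {
		printf("giving up callback %d\n",
		       r->slots[r->tail & (RING_SIZE - 1)]);
		r->tail = (r->tail + 1) & (RING_SIZE - 1);
	}
	return n;
}

int main(void)
{
	struct ring r = { .head = 0, .tail = 0 };

	for (int i = 0; i < 5; i++)
		r.slots[r.head++ & (RING_SIZE - 1)] = i;
	r.head &= RING_SIZE - 1;	/* keep the index masked, like the u8 ring heads */
	consume_batch(&r, 50);
	return 0;
}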
-	vnode->cb_type = ntohl(*bp++);
-
-	if (volsync) {
-		volsync->creation = ntohl(*bp++);
-		bp++; /* spare2 */
-		bp++; /* spare3 */
-		bp++; /* spare4 */
-		bp++; /* spare5 */
-		bp++; /* spare6 */
-	}
-
-	/* success */
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq,&myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_callslot(server, &callslot);
- out:
-	_leave(" = %d", ret);
-	return ret;
-
- read_failed:
-	if (ret == -ECONNABORTED) {
-		ret = call->app_errno;
-		goto out_unwait;
-	}
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-
-} /* end afs_rxfs_fetch_file_data() */
-
+
+/*
+ * deliver reply data to an FS.CreateFile or an FS.MakeDir
+ */
+static int afs_deliver_fs_create_vnode(struct afs_call *call,
+				       struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFid(&bp, call->reply2);
+	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	xdr_decode_AFSCallBack_raw(&bp, call->reply4);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.CreateFile and FS.MakeDir operation type
+ */
+static const struct afs_call_type afs_RXFSCreateXXXX = {
+	.name		= "FS.CreateXXXX",
+	.deliver	= afs_deliver_fs_create_vnode,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
-/*****************************************************************************/
-/*
- * ask the AFS fileserver to discard a callback request on a file
- */
-int afs_rxfs_give_up_callback(struct afs_server *server,
-			      struct afs_vnode *vnode)
-{
-	struct afs_server_callslot callslot;
-	struct rxrpc_call *call;
-	struct kvec piov[1];
-	size_t sent;
-	int ret;
-	__be32 *bp;
-
-	DECLARE_WAITQUEUE(myself, current);
-
-	_enter("%p,{%u,%u,%u}",
-	       server, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
-
-	/* get hold of the fileserver connection */
-	ret = afs_server_request_callslot(server, &callslot);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSGIVEUPCALLBACKS;
-
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
-
+/*
+ * create a file or make a directory
+ */
+int afs_fs_create(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *vnode,
+		  const char *name,
+		  umode_t mode,
+		  struct afs_fid *newfid,
+		  struct afs_file_status *newstatus,
+		  struct afs_callback *newcb,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+	reqsz = (5 * 4) + namesz + padsz + (6 * 4);
+
+	call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz,
+				   (3 + 21 + 21 + 3 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = newfid;
+	call->reply3 = newstatus;
+	call->reply4 = newcb;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(S_ISDIR(mode) ? FSMAKEDIR : FSCREATEFILE);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+	*bp++ = htonl(AFS_SET_MODE);
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = htonl(mode & S_IALLUGO); /* unix mode */
+	*bp++ = 0; /* segment size */
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
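Every name put on the wire above is padded with NUL bytes to the next 4-byte boundary, which is why each marshaller computes padsz = (4 - (namesz & 3)) & 3 before sizing the request. A standalone check of that arithmetic:

#include <stdio.h>
#include <string.h>

/* bytes of NUL padding needed to round a name up to a 4-byte boundary */
static size_t xdr_padsz(size_t namesz)
{
	return (4 - (namesz & 3)) & 3;
}

int main(void)
{
	const char *names[] = { "a", "ab", "abc", "abcd" };

	for (int i = 0; i < 4; i++) {
		size_t n = strlen(names[i]);
		/* "a" needs 3 pad bytes; "abcd" needs none */
		printf("%-4s namesz=%zu padsz=%zu\n",
		       names[i], n, xdr_padsz(n));
	}
	return 0;
}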
-	/* marshall the parameters */
-	bp = rxrpc_call_alloc_scratch(call, (1 + 4 + 4) * 4);
-
-	piov[0].iov_len = (1 + 4 + 4) * 4;
-	piov[0].iov_base = bp;
-
-	*bp++ = htonl(FSGIVEUPCALLBACKS);
-	*bp++ = htonl(1);
+/*
+ * deliver reply data to an FS.RemoveFile or FS.RemoveDir
+ */
+static int afs_deliver_fs_remove(struct afs_call *call,
+				 struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.RemoveDir/FS.RemoveFile operation type
+ */
+static const struct afs_call_type afs_RXFSRemoveXXXX = {
+	.name		= "FS.RemoveXXXX",
+	.deliver	= afs_deliver_fs_remove,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
-	*bp++ = htonl(vnode->fid.vid);
-	*bp++ = htonl(vnode->fid.vnode);
-	*bp++ = htonl(vnode->fid.unique);
-	*bp++ = htonl(1);
-	*bp++ = htonl(vnode->cb_version);
-	*bp++ = htonl(vnode->cb_expiry);
-	*bp++ = htonl(vnode->cb_type);
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
-		    signal_pending(current))
-			break;
-		schedule();
-	}
-	set_current_state(TASK_RUNNING);
-
-	ret = -EINTR;
-	if (signal_pending(current))
-		goto abort;
-
+/*
+ * remove a file or directory
+ */
+int afs_fs_remove(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *vnode,
+		  const char *name,
+		  bool isdir,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+	reqsz = (5 * 4) + namesz + padsz;
+
+	call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(isdir ? FSREMOVEDIR : FSREMOVEFILE);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
-	switch (call->app_call_state) {
-	case RXRPC_CSTATE_ERROR:
-		ret = call->app_errno;
-		goto out_unwait;
-
-	case RXRPC_CSTATE_CLNT_GOT_REPLY:
-		ret = 0;
-		goto out_unwait;
-
-	default:
-		BUG();
-	}
-
+/*
+ * deliver reply data to an FS.Link
+ */
+static int afs_deliver_fs_link(struct afs_call *call,
+			       struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Link operation type
+ */
+static const struct afs_call_type afs_RXFSLink = {
+	.name		= "FS.Link",
+	.deliver	= afs_deliver_fs_link,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
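All of the xdr_decode_*() helpers called from these deliver routines share one style: a cursor pointer walks the reply buffer one big-endian word at a time and is handed back to the caller. A standalone sketch of that pointer-walk, decoding the three-word AFSFid layout used throughout this file (invented helper name, local buffer):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

struct fid { uint32_t vid, vnode, unique; };

/* decode one FID and advance the caller's cursor past it */
static void xdr_decode_fid(const uint32_t **_bp, struct fid *fid)
{
	const uint32_t *bp = *_bp;

	fid->vid = ntohl(*bp++);
	fid->vnode = ntohl(*bp++);
	fid->unique = ntohl(*bp++);
	*_bp = bp;	/* next decoder continues from here */
}

int main(void)
{
	uint32_t reply[3] = { htonl(7), htonl(42), htonl(1) };
	const uint32_t *bp = reply;
	struct fid fid;

	xdr_decode_fid(&bp, &fid);
	printf("fid %u:%u.%u\n", fid.vid, fid.vnode, fid.unique);
	return 0;
}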
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_callslot(server, &callslot);
- out:
-	_leave("");
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-} /* end afs_rxfs_give_up_callback() */
-
-/*****************************************************************************/
-/*
- * look a filename up in a directory
- * - this operation doesn't seem to work correctly in OpenAFS server 1.2.2
- */
-#if 0
-int afs_rxfs_lookup(struct afs_server *server,
-		    struct afs_vnode *dir,
-		    const char *filename,
-		    struct afs_vnode *vnode,
-		    struct afs_volsync *volsync)
-{
-	struct rxrpc_connection *conn;
-	struct rxrpc_call *call;
-	struct kvec piov[3];
-	size_t sent;
-	int ret;
-	u32 *bp, zero;
-
-	DECLARE_WAITQUEUE(myself, current);
-
-	kenter("%p,{%u,%u,%u},%s",
-	       server, fid->vid, fid->vnode, fid->unique, filename);
-
-	/* get hold of the fileserver connection */
-	ret = afs_server_get_fsconn(server, &conn);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSLOOKUP;
+/*
+ * make a hard link
+ */
+int afs_fs_link(struct afs_server *server,
+		struct key *key,
+		struct afs_vnode *dvnode,
+		struct afs_vnode *vnode,
+		const char *name,
+		const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+	reqsz = (5 * 4) + namesz + padsz + (3 * 4);
+
+	call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = dvnode;
+	call->reply2 = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSLINK);
+	*bp++ = htonl(dvnode->fid.vid);
+	*bp++ = htonl(dvnode->fid.vnode);
+	*bp++ = htonl(dvnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
-
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq,&myself);
-
-	/* marshall the parameters */
-	bp = rxrpc_call_alloc_scratch(call, 20);
-
-	zero = 0;
-
-	piov[0].iov_len = 20;
-	piov[0].iov_base = bp;
-	piov[1].iov_len = strlen(filename);
-	piov[1].iov_base = (char *) filename;
-	piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
-	piov[2].iov_base = &zero;
-
-	*bp++ = htonl(FSLOOKUP);
-	*bp++ = htonl(dirfid->vid);
-	*bp++ = htonl(dirfid->vnode);
-	*bp++ = htonl(dirfid->unique);
-	*bp++ = htonl(piov[1].iov_len);
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	bp = rxrpc_call_alloc_scratch(call, 220);
-
-	ret = rxrpc_call_read_data(call, bp, 220,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0) {
-		if (ret == -ECONNABORTED) {
-			ret = call->app_errno;
-			goto out_unwait;
-		}
-		goto abort;
-	}
-
-	/* unmarshall the reply */
-	fid->vid = ntohl(*bp++);
-	fid->vnode = ntohl(*bp++);
-	fid->unique = ntohl(*bp++);
-
+/*
+ * deliver reply data to an FS.Symlink
+ */
+static int afs_deliver_fs_symlink(struct afs_call *call,
+				  struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFid(&bp, call->reply2);
+	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Symlink operation type
+ */
+static const struct afs_call_type afs_RXFSSymlink = {
+	.name		= "FS.Symlink",
+	.deliver	= afs_deliver_fs_symlink,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * create a symbolic link
+ */
+int afs_fs_symlink(struct afs_server *server,
+		   struct key *key,
+		   struct afs_vnode *vnode,
+		   const char *name,
+		   const char *contents,
+		   struct afs_fid *newfid,
+		   struct afs_file_status *newstatus,
+		   const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz, c_namesz, c_padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+
+	c_namesz = strlen(contents);
+	c_padsz = (4 - (c_namesz & 3)) & 3;
+
+	reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4);
+
+	call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz,
+				   (3 + 21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = newfid;
+	call->reply3 = newstatus;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSYMLINK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+	*bp++ = htonl(c_namesz);
+	memcpy(bp, contents, c_namesz);
+	bp = (void *) bp + c_namesz;
+	if (c_padsz > 0) {
+		memset(bp, 0, c_padsz);
+		bp = (void *) bp + c_padsz;
+	}
+	*bp++ = htonl(AFS_SET_MODE);
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = htonl(S_IRWXUGO); /* unix mode */
+	*bp++ = 0; /* segment size */
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
-	vnode->status.if_version = ntohl(*bp++);
-	vnode->status.type = ntohl(*bp++);
-	vnode->status.nlink = ntohl(*bp++);
-	vnode->status.size = ntohl(*bp++);
-	vnode->status.version = ntohl(*bp++);
-	vnode->status.author = ntohl(*bp++);
-	vnode->status.owner = ntohl(*bp++);
-	vnode->status.caller_access = ntohl(*bp++);
-	vnode->status.anon_access = ntohl(*bp++);
-	vnode->status.mode = ntohl(*bp++);
-	vnode->status.parent.vid = dirfid->vid;
-	vnode->status.parent.vnode = ntohl(*bp++);
-	vnode->status.parent.unique = ntohl(*bp++);
-	bp++; /* seg size */
-	vnode->status.mtime_client = ntohl(*bp++);
-	vnode->status.mtime_server = ntohl(*bp++);
-	bp++; /* group */
-	bp++; /* sync counter */
-	vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-	bp++; /* spare2 */
-	bp++; /* spare3 */
-	bp++; /* spare4 */
-
-	dir->status.if_version = ntohl(*bp++);
-	dir->status.type = ntohl(*bp++);
-	dir->status.nlink = ntohl(*bp++);
-	dir->status.size = ntohl(*bp++);
-	dir->status.version = ntohl(*bp++);
-	dir->status.author = ntohl(*bp++);
-	dir->status.owner = ntohl(*bp++);
-	dir->status.caller_access = ntohl(*bp++);
-	dir->status.anon_access = ntohl(*bp++);
-	dir->status.mode = ntohl(*bp++);
-	dir->status.parent.vid = dirfid->vid;
-	dir->status.parent.vnode = ntohl(*bp++);
-	dir->status.parent.unique = ntohl(*bp++);
-	bp++; /* seg size */
-	dir->status.mtime_client = ntohl(*bp++);
-	dir->status.mtime_server = ntohl(*bp++);
-	bp++; /* group */
-	bp++; /* sync counter */
-	dir->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-	bp++; /* spare2 */
-	bp++; /* spare3 */
-	bp++; /* spare4 */
-
-	callback->fid = *fid;
-	callback->version = ntohl(*bp++);
-	callback->expiry = ntohl(*bp++);
-	callback->type = ntohl(*bp++);
-
-	if (volsync) {
-		volsync->creation = ntohl(*bp++);
-		bp++; /* spare2 */
-		bp++; /* spare3 */
-		bp++; /* spare4 */
-		bp++; /* spare5 */
-		bp++; /* spare6 */
-	}
-
-	/* success */
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_fsconn(server, conn);
- out:
-	kleave("");
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-} /* end afs_rxfs_lookup() */
-#endif
+/*
+ * deliver reply data to an FS.Rename
+ */
+static int afs_deliver_fs_rename(struct afs_call *call,
+				 struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode);
+	if (new_dvnode != orig_dvnode)
+		xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Rename operation type
+ */
+static const struct afs_call_type afs_RXFSRename = {
+	.name		= "FS.Rename",
+	.deliver	= afs_deliver_fs_rename,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * rename a file or directory
+ */
+int afs_fs_rename(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *orig_dvnode,
+		  const char *orig_name,
+		  struct afs_vnode *new_dvnode,
+		  const char *new_name,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	o_namesz = strlen(orig_name);
+	o_padsz = (4 - (o_namesz & 3)) & 3;
+
+	n_namesz = strlen(new_name);
+	n_padsz = (4 - (n_namesz & 3)) & 3;
+
+	reqsz = (4 * 4) +
+		4 + o_namesz + o_padsz +
+		(3 * 4) +
+		4 + n_namesz + n_padsz;
+
+	call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = orig_dvnode;
+	call->reply2 = new_dvnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSRENAME);
+	*bp++ = htonl(orig_dvnode->fid.vid);
+	*bp++ = htonl(orig_dvnode->fid.vnode);
+	*bp++ = htonl(orig_dvnode->fid.unique);
+	*bp++ = htonl(o_namesz);
+	memcpy(bp, orig_name, o_namesz);
+	bp = (void *) bp + o_namesz;
+	if (o_padsz > 0) {
+		memset(bp, 0, o_padsz);
+		bp = (void *) bp + o_padsz;
+	}
+
+	*bp++ = htonl(new_dvnode->fid.vid);
+	*bp++ = htonl(new_dvnode->fid.vnode);
+	*bp++ = htonl(new_dvnode->fid.unique);
+	*bp++ = htonl(n_namesz);
+	memcpy(bp, new_name, n_namesz);
+	bp = (void *) bp + n_namesz;
+	if (n_padsz > 0) {
+		memset(bp, 0, n_padsz);
+		bp = (void *) bp + n_padsz;
+	}
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
diff --git a/fs/afs/fsclient.h b/fs/afs/fsclient.h
deleted file mode 100644
index 8ba3e749ee3c..000000000000
--- a/fs/afs/fsclient.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* fsclient.h: AFS File Server client stub declarations
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_FSCLIENT_H
-#define _LINUX_AFS_FSCLIENT_H
-
-#include "server.h"
-
-extern int afs_rxfs_get_volume_info(struct afs_server *server,
-				    const char *name,
-				    struct afs_volume_info *vinfo);
-
-extern int afs_rxfs_fetch_file_status(struct afs_server *server,
-				      struct afs_vnode *vnode,
-				      struct afs_volsync *volsync);
-
-struct afs_rxfs_fetch_descriptor {
-	struct afs_fid fid;	/* file ID to fetch */
-	size_t size;		/* total number of bytes to fetch */
-	off_t offset;		/* offset in file to start from */
-	void *buffer;		/* read buffer */
-	size_t actual;		/* actual size sent back by server */
-};
-
-extern int afs_rxfs_fetch_file_data(struct afs_server *server,
-				    struct afs_vnode *vnode,
-				    struct afs_rxfs_fetch_descriptor *desc,
-				    struct afs_volsync *volsync);
-
-extern int afs_rxfs_give_up_callback(struct afs_server *server,
-				     struct afs_vnode *vnode);
-
-/* this doesn't appear to work in OpenAFS server */
-extern int afs_rxfs_lookup(struct afs_server *server,
-			   struct afs_vnode *dir,
-			   const char *filename,
-			   struct afs_vnode *vnode,
-			   struct afs_volsync *volsync);
-
-/* this is apparently mis-implemented in OpenAFS server */
-extern int afs_rxfs_get_root_volume(struct afs_server *server,
-				    char *buf,
-				    size_t *buflen);
-
-
-#endif /* _LINUX_AFS_FSCLIENT_H */
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 9d9bca6c28b5..c184a4ee5995 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -19,9 +19,6 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include "volume.h"
-#include "vnode.h"
-#include "super.h"
 #include "internal.h"
 
 struct afs_iget_data {
@@ -29,26 +26,25 @@ struct afs_iget_data {
 	struct afs_volume *volume;	/* volume on which resides */
 };
 
-/*****************************************************************************/
 /*
  * map the AFS file status to the inode member variables
  */
-static int afs_inode_map_status(struct afs_vnode *vnode)
+static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 {
 	struct inode *inode = AFS_VNODE_TO_I(vnode);
 
-	_debug("FS: ft=%d lk=%d sz=%Zu ver=%Lu mod=%hu",
+	_debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu",
 	       vnode->status.type,
 	       vnode->status.nlink,
-	       vnode->status.size,
-	       vnode->status.version,
+	       (unsigned long long) vnode->status.size,
+	       vnode->status.data_version,
 	       vnode->status.mode);
 
 	switch (vnode->status.type) {
 	case AFS_FTYPE_FILE:
 		inode->i_mode	= S_IFREG | vnode->status.mode;
 		inode->i_op	= &afs_file_inode_operations;
-		inode->i_fop	= &generic_ro_fops;
+		inode->i_fop	= &afs_file_operations;
 		break;
 	case AFS_FTYPE_DIR:
 		inode->i_mode	= S_IFDIR | vnode->status.mode;
@@ -77,9 +73,9 @@ static int afs_inode_map_status(struct afs_vnode *vnode)
 
 	/* check to see whether a symbolic link is really a mountpoint */
 	if (vnode->status.type == AFS_FTYPE_SYMLINK) {
-		afs_mntpt_check_symlink(vnode);
+		afs_mntpt_check_symlink(vnode, key);
 
-		if (vnode->flags & AFS_VNODE_MOUNTPOINT) {
+		if (test_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags)) {
 			inode->i_mode = S_IFDIR | vnode->status.mode;
 			inode->i_op = &afs_mntpt_inode_operations;
 			inode->i_fop = &afs_mntpt_file_operations;
@@ -87,30 +83,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode)
 	}
 
 	return 0;
-} /* end afs_inode_map_status() */
+}
 
-/*****************************************************************************/
-/*
- * attempt to fetch the status of an inode, coelescing multiple simultaneous
- * fetches
- */
-static int afs_inode_fetch_status(struct inode *inode)
-{
-	struct afs_vnode *vnode;
-	int ret;
-
-	vnode = AFS_FS_I(inode);
-
-	ret = afs_vnode_fetch_status(vnode);
-
-	if (ret == 0)
-		ret = afs_inode_map_status(vnode);
-
-	return ret;
-
-} /* end afs_inode_fetch_status() */
-
-/*****************************************************************************/
 /*
  * iget5() comparator
  */
@@ -120,9 +94,8 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
 
 	return inode->i_ino == data->fid.vnode &&
 		inode->i_version == data->fid.unique;
-} /* end afs_iget5_test() */
+}
 
-/*****************************************************************************/
 /*
  * iget5() inode initialiser
  */
@@ -137,14 +110,14 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
 	vnode->volume = data->volume;
 
 	return 0;
-} /* end afs_iget5_set() */
+}
 
-/*****************************************************************************/
 /*
  * inode retrieval
  */
-inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
-		    struct inode **_inode)
+struct inode *afs_iget(struct super_block *sb, struct key *key,
+		       struct afs_fid *fid, struct afs_file_status *status,
+		       struct afs_callback *cb)
 {
 	struct afs_iget_data data = { .fid = *fid };
 	struct afs_super_info *as;
@@ -161,20 +134,18 @@ inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
 			 &data);
 	if (!inode) {
 		_leave(" = -ENOMEM");
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 
+	_debug("GOT INODE %p { vl=%x vn=%x, u=%x }",
+	       inode, fid->vid, fid->vnode, fid->unique);
+
 	vnode = AFS_FS_I(inode);
 
 	/* deal with an existing inode */
 	if (!(inode->i_state & I_NEW)) {
-		ret = afs_vnode_fetch_status(vnode);
-		if (ret==0)
-			*_inode = inode;
-		else
-			iput(inode);
-		_leave(" = %d", ret);
-		return ret;
+		_leave(" = %p", inode);
+		return inode;
 	}
 
 #ifdef AFS_CACHING_SUPPORT
@@ -186,100 +157,185 @@ inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
 			  &vnode->cache);
 #endif
 
-	/* okay... it's a new inode */
-	inode->i_flags |= S_NOATIME;
-	vnode->flags |= AFS_VNODE_CHANGED;
-	ret = afs_inode_fetch_status(inode);
-	if (ret<0)
+	if (!status) {
+		/* it's a remotely extant inode */
+		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto bad_inode;
+	} else {
+		/* it's an inode we just created */
+		memcpy(&vnode->status, status, sizeof(vnode->status));
+
+		if (!cb) {
+			/* it's a symlink we just created (the fileserver
+			 * didn't give us a callback) */
+			vnode->cb_version = 0;
+			vnode->cb_expiry = 0;
+			vnode->cb_type = 0;
+			vnode->cb_expires = get_seconds();
+		} else {
+			vnode->cb_version = cb->version;
+			vnode->cb_expiry = cb->expiry;
+			vnode->cb_type = cb->type;
+			vnode->cb_expires = vnode->cb_expiry + get_seconds();
+		}
+	}
+
+	ret = afs_inode_map_status(vnode, key);
+	if (ret < 0)
 		goto bad_inode;
 
 	/* success */
+	clear_bit(AFS_VNODE_UNSET, &vnode->flags);
+	inode->i_flags |= S_NOATIME;
 	unlock_new_inode(inode);
-
-	*_inode = inode;
-	_leave(" = 0 [CB { v=%u x=%lu t=%u }]",
-	       vnode->cb_version,
-	       vnode->cb_timeout.timo_jif,
-	       vnode->cb_type);
-	return 0;
+	_leave(" = %p [CB { v=%u t=%u }]", inode, vnode->cb_version, vnode->cb_type);
+	return inode;
 
 	/* failure */
- bad_inode:
+bad_inode:
 	make_bad_inode(inode);
 	unlock_new_inode(inode);
 	iput(inode);
 
 	_leave(" = %d [bad]", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * validate a vnode/inode
+ * - there are several things we need to check
+ *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
+ *     symlink)
+ *   - parent dir metadata changed (security changes)
+ *   - dentry data changed (write, truncate)
+ *   - dentry metadata changed (security changes)
+ */
+int afs_validate(struct afs_vnode *vnode, struct key *key)
+{
+	int ret;
+
+	_enter("{v={%x:%u} fl=%lx},%x",
+	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
+	       key_serial(key));
+
+	if (vnode->cb_promised &&
+	    !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+	    !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) &&
+	    !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+		if (vnode->cb_expires < get_seconds() + 10) {
+			_debug("callback expired");
+			set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		} else {
+			goto valid;
+		}
+	}
+
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+		goto valid;
+
+	mutex_lock(&vnode->validate_lock);
+
+	/* if the promise has expired, we need to check the server again to get
+	 * a new promise - note that if the (parent) directory's metadata was
+	 * changed then the security may be different and we may no longer have
+	 * access */
+	if (!vnode->cb_promised ||
+	    test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+		_debug("not promised");
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto error_unlock;
+		_debug("new promise [fl=%lx]", vnode->flags);
+	}
+
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+		_debug("file already deleted");
+		ret = -ESTALE;
+		goto error_unlock;
+	}
+
+	/* if the vnode's data version number changed then its contents are
+	 * different */
+	if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+		_debug("zap data {%x:%d}", vnode->fid.vid, vnode->fid.vnode);
+		invalidate_remote_inode(&vnode->vfs_inode);
+	}
+
+	clear_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+	mutex_unlock(&vnode->validate_lock);
+valid:
+	_leave(" = 0");
+	return 0;
+
+error_unlock:
+	mutex_unlock(&vnode->validate_lock);
+	_leave(" = %d", ret);
 	return ret;
-} /* end afs_iget() */
+}
 
-/*****************************************************************************/
 /*
  * read the attributes of an inode
  */
 int afs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		      struct kstat *stat)
 {
-	struct afs_vnode *vnode;
 	struct inode *inode;
-	int ret;
 
 	inode = dentry->d_inode;
 
 	_enter("{ ino=%lu v=%lu }", inode->i_ino, inode->i_version);
 
-	vnode = AFS_FS_I(inode);
-
-	ret = afs_inode_fetch_status(inode);
-	if (ret == -ENOENT) {
-		_leave(" = %d [%d %p]",
-		       ret, atomic_read(&dentry->d_count), dentry->d_inode);
-		return ret;
-	}
-	else if (ret < 0) {
-		make_bad_inode(inode);
-		_leave(" = %d", ret);
-		return ret;
-	}
-
-	/* transfer attributes from the inode structure to the stat
-	 * structure */
 	generic_fillattr(inode, stat);
-
-	_leave(" = 0 CB { v=%u x=%u t=%u }",
-	       vnode->cb_version,
-	       vnode->cb_expiry,
-	       vnode->cb_type);
-
 	return 0;
-} /* end afs_inode_getattr() */
+}
 
-/*****************************************************************************/
 /*
  * clear an AFS inode
  */
 void afs_clear_inode(struct inode *inode)
 {
+	struct afs_permits *permits;
 	struct afs_vnode *vnode;
 
 	vnode = AFS_FS_I(inode);
 
-	_enter("ino=%lu { vn=%08x v=%u x=%u t=%u }",
-	       inode->i_ino,
+	_enter("{%x:%d.%d} v=%u x=%u t=%u }",
+	       vnode->fid.vid,
 	       vnode->fid.vnode,
+	       vnode->fid.unique,
 	       vnode->cb_version,
 	       vnode->cb_expiry,
-	       vnode->cb_type
-	       );
+	       vnode->cb_type);
 
-	BUG_ON(inode->i_ino != vnode->fid.vnode);
+	_debug("CLEAR INODE %p", inode);
 
-	afs_vnode_give_up_callback(vnode);
+	ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
+
+	afs_give_up_callback(vnode);
+
+	if (vnode->server) {
+		spin_lock(&vnode->server->fs_lock);
+		rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
+		spin_unlock(&vnode->server->fs_lock);
+		afs_put_server(vnode->server);
+		vnode->server = NULL;
+	}
+
+	ASSERT(!vnode->cb_promised);
 
 #ifdef AFS_CACHING_SUPPORT
 	cachefs_relinquish_cookie(vnode->cache, 0);
 	vnode->cache = NULL;
 #endif
 
+	mutex_lock(&vnode->permits_lock);
+	permits = vnode->permits;
+	rcu_assign_pointer(vnode->permits, NULL);
+	mutex_unlock(&vnode->permits_lock);
+	if (permits)
+		call_rcu(&permits->rcu, afs_zap_permits);
+
 	_leave("");
-} /* end afs_clear_inode() */
+}
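The heart of the new afs_validate() above is a cheap decision: trust the cached status while an unbroken callback promise has not expired (with a 10-second safety margin), otherwise refetch from the server. A standalone userspace model of just that decision, with field names mirroring the vnode flags used in the hunk:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct vnode_model {
	bool cb_promised;	/* server promised to notify us of changes */
	bool cb_broken;		/* a callback break arrived */
	time_t cb_expires;	/* when the promise lapses */
};

/* true if the cached status can no longer be trusted */
static bool needs_fetch(const struct vnode_model *v, time_t now)
{
	if (!v->cb_promised || v->cb_broken)
		return true;
	return v->cb_expires < now + 10;	/* about to expire: refetch */
}

int main(void)
{
	time_t now = time(NULL);
	struct vnode_model v = {
		.cb_promised = true, .cb_broken = false,
		.cb_expires = now + 300,
	};

	printf("fetch? %d\n", needs_fetch(&v, now));	/* 0: still valid */
	v.cb_broken = true;
	printf("fetch? %d\n", needs_fetch(&v, now));	/* 1: promise broken */
	return 0;
}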
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5151d5da2c2f..d90c158cd934 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1,6 +1,6 @@
-/* internal.h: internal AFS stuff
+/* internal AFS stuff
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,48 +9,390 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef AFS_INTERNAL_H
-#define AFS_INTERNAL_H
-
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/skbuff.h>
+#include <linux/rxrpc.h>
+#include <linux/key.h>
+#include "afs.h"
+#include "afs_vl.h"
+
+#define AFS_CELL_MAX_ADDRS 15
+
+struct afs_call;
+
+typedef enum {
+	AFS_VL_NEW,			/* new, uninitialised record */
+	AFS_VL_CREATING,		/* creating record */
+	AFS_VL_VALID,			/* record is pending */
+	AFS_VL_NO_VOLUME,		/* no such volume available */
+	AFS_VL_UPDATING,		/* update in progress */
+	AFS_VL_VOLUME_DELETED,		/* volume was deleted */
+	AFS_VL_UNCERTAIN,		/* uncertain state (update failed) */
+} __attribute__((packed)) afs_vlocation_state_t;
+
+struct afs_mount_params {
+	bool			rwpath;		/* T if the parent should be considered R/W */
+	bool			force;		/* T to force cell type */
+	afs_voltype_t		type;		/* type of volume requested */
+	int			volnamesz;	/* size of volume name */
+	const char		*volname;	/* name of volume to mount */
+	struct afs_cell		*cell;		/* cell in which to find volume */
+	struct afs_volume	*volume;	/* volume record */
+	struct key		*key;		/* key to use for secure mounting */
+};
 
 /*
- * debug tracing
+ * definition of how to wait for the completion of an operation
  */
-#define kenter(FMT, a...) printk("==> %s("FMT")\n",__FUNCTION__ , ## a)
-#define kleave(FMT, a...) printk("<== %s()"FMT"\n",__FUNCTION__ , ## a)
-#define kdebug(FMT, a...) printk(FMT"\n" , ## a)
-#define kproto(FMT, a...) printk("### "FMT"\n" , ## a)
-#define knet(FMT, a...) printk(FMT"\n" , ## a)
-
-#ifdef __KDEBUG
-#define _enter(FMT, a...) kenter(FMT , ## a)
-#define _leave(FMT, a...) kleave(FMT , ## a)
-#define _debug(FMT, a...) kdebug(FMT , ## a)
-#define _proto(FMT, a...) kproto(FMT , ## a)
-#define _net(FMT, a...) knet(FMT , ## a)
-#else
-#define _enter(FMT, a...) do { } while(0)
-#define _leave(FMT, a...) do { } while(0)
-#define _debug(FMT, a...) do { } while(0)
-#define _proto(FMT, a...) do { } while(0)
-#define _net(FMT, a...) do { } while(0)
-#endif
+struct afs_wait_mode {
+	/* RxRPC received message notification */
+	void (*rx_wakeup)(struct afs_call *call);
 
-static inline void afs_discard_my_signals(void)
-{
-	while (signal_pending(current)) {
-		siginfo_t sinfo;
+	/* synchronous call waiter and call dispatched notification */
+	int (*wait)(struct afs_call *call);
+
+	/* asynchronous call completion */
+	void (*async_complete)(void *reply, int error);
+};
+
+extern const struct afs_wait_mode afs_sync_call;
+extern const struct afs_wait_mode afs_async_call;
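The wait-mode struct above lets one call engine serve both blocking and asynchronous callers by swapping a table of function pointers. A standalone sketch of the idea follows; the names (wait_mode, make_call) and wiring are invented for illustration, not the kernel's.

#include <stdio.h>

struct wait_mode {
	const char *name;
	int (*wait)(int call_id);	/* how to wait for completion */
};

static int wait_sync(int call_id)
{
	printf("call %d: sleep until the reply is delivered\n", call_id);
	return 0;
}

static int wait_async(int call_id)
{
	printf("call %d: return now; completion runs from a work item\n",
	       call_id);
	return 0;
}

static const struct wait_mode sync_call = { "sync", wait_sync };
static const struct wait_mode async_call = { "async", wait_async };

/* one transmit path; the caller picks the completion style */
static int make_call(int call_id, const struct wait_mode *wm)
{
	/* ...marshal and transmit the request here... */
	return wm->wait(call_id);
}

int main(void)
{
	make_call(1, &sync_call);
	make_call(2, &async_call);
	return 0;
}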
 
-		spin_lock_irq(&current->sighand->siglock);
-		dequeue_signal(current,&current->blocked, &sinfo);
-		spin_unlock_irq(&current->sighand->siglock);
-	}
+/*
+ * a record of an in-progress RxRPC call
+ */
+struct afs_call {
+	const struct afs_call_type *type;	/* type of call */
+	const struct afs_wait_mode *wait_mode;	/* completion wait mode */
+	wait_queue_head_t	waitq;		/* processes awaiting completion */
+	struct work_struct	async_work;	/* asynchronous work processor */
+	struct work_struct	work;		/* actual work processor */
+	struct sk_buff_head	rx_queue;	/* received packets */
+	struct rxrpc_call	*rxcall;	/* RxRPC call handle */
+	struct key		*key;		/* security for this call */
+	struct afs_server	*server;	/* server affected by incoming CM call */
+	void			*request;	/* request data (first part) */
+	void			*request2;	/* request data (second part) */
+	void			*buffer;	/* reply receive buffer */
+	void			*reply;		/* reply buffer (first part) */
+	void			*reply2;	/* reply buffer (second part) */
+	void			*reply3;	/* reply buffer (third part) */
+	void			*reply4;	/* reply buffer (fourth part) */
+	enum {					/* call state */
+		AFS_CALL_REQUESTING,	/* request is being sent for outgoing call */
+		AFS_CALL_AWAIT_REPLY,	/* awaiting reply to outgoing call */
+		AFS_CALL_AWAIT_OP_ID,	/* awaiting op ID on incoming call */
+		AFS_CALL_AWAIT_REQUEST,	/* awaiting request data on incoming call */
+		AFS_CALL_REPLYING,	/* replying to incoming call */
+		AFS_CALL_AWAIT_ACK,	/* awaiting final ACK of incoming call */
+		AFS_CALL_COMPLETE,	/* successfully completed */
+		AFS_CALL_BUSY,		/* server was busy */
+		AFS_CALL_ABORTED,	/* call was aborted */
+		AFS_CALL_ERROR,		/* call failed due to error */
+	}			state;
+	int			error;		/* error code */
+	unsigned		request_size;	/* size of request data */
+	unsigned		reply_max;	/* maximum size of reply */
+	unsigned		reply_size;	/* current size of reply */
+	unsigned short		offset;		/* offset into received data store */
+	unsigned char		unmarshall;	/* unmarshalling phase */
+	bool			incoming;	/* T if incoming call */
+	u16			service_id;	/* RxRPC service ID to call */
+	__be16			port;		/* target UDP port */
+	__be32			operation_ID;	/* operation ID for an incoming call */
+	u32			count;		/* count for use in unmarshalling */
+	__be32			tmp;		/* place to extract temporary data */
+};
+
+struct afs_call_type {
+	const char *name;
+
+	/* deliver request or reply data to a call
+	 * - returning an error will cause the call to be aborted
+	 */
+	int (*deliver)(struct afs_call *call, struct sk_buff *skb,
+		       bool last);
+
+	/* map an abort code to an error number */
+	int (*abort_to_error)(u32 abort_code);
+
+	/* clean up a call */
+	void (*destructor)(struct afs_call *call);
+};
+
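This per-operation afs_call_type table is what all the FS.* operation types seen earlier instantiate: each RPC supplies its own deliver, abort-mapping and destructor hooks while the engine stays generic. A standalone sketch of the dispatch pattern, with invented names and a deliberately crude abort mapping:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct call;

struct call_type {
	const char *name;
	int (*deliver)(struct call *call, const void *data, int last);
	int (*abort_to_error)(uint32_t abort_code);
};

struct call {
	const struct call_type *type;	/* selects the operation's hooks */
};

static int demo_deliver(struct call *call, const void *data, int last)
{
	printf("%s: got %s packet\n", call->type->name,
	       last ? "final" : "partial");
	return 0;
}

static int demo_abort_to_error(uint32_t abort_code)
{
	return abort_code ? -EREMOTEIO : 0;	/* crude stand-in mapping */
}

static const struct call_type demo_type = {
	.name		= "FS.Demo",
	.deliver	= demo_deliver,
	.abort_to_error	= demo_abort_to_error,
};

int main(void)
{
	struct call call = { .type = &demo_type };

	call.type->deliver(&call, "x", 1);	/* engine-style dispatch */
	return 0;
}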
126/*
127 * AFS superblock private data
128 * - there's one superblock per volume
129 */
130struct afs_super_info {
131 struct afs_volume *volume; /* volume record */
132 char rwparent; /* T if parent is R/W AFS volume */
133};
134
135static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
136{
137 return sb->s_fs_info;
52} 138}
53 139
140extern struct file_system_type afs_fs_type;
141
142/*
143 * entry in the cached cell catalogue
144 */
145struct afs_cache_cell {
146 char name[AFS_MAXCELLNAME]; /* cell name (padded with NULs) */
147 struct in_addr vl_servers[15]; /* cached cell VL servers */
148};
149
150/*
151 * AFS cell record
152 */
153struct afs_cell {
154 atomic_t usage;
155 struct list_head link; /* main cell list link */
156 struct key *anonymous_key; /* anonymous user key for this cell */
157 struct list_head proc_link; /* /proc cell list link */
158 struct proc_dir_entry *proc_dir; /* /proc dir for this cell */
159#ifdef AFS_CACHING_SUPPORT
160 struct cachefs_cookie *cache; /* caching cookie */
161#endif
162
163 /* server record management */
164 rwlock_t servers_lock; /* active server list lock */
165 struct list_head servers; /* active server list */
166
167 /* volume location record management */
168 struct rw_semaphore vl_sem; /* volume management serialisation semaphore */
169 struct list_head vl_list; /* cell's active VL record list */
170 spinlock_t vl_lock; /* vl_list lock */
171 unsigned short vl_naddrs; /* number of VL servers in addr list */
172 unsigned short vl_curr_svix; /* current server index */
173 struct in_addr vl_addrs[AFS_CELL_MAX_ADDRS]; /* cell VL server addresses */
174
175 char name[0]; /* cell name - must go last */
176};
177
178/*
179 * entry in the cached volume location catalogue
180 */
181struct afs_cache_vlocation {
182 /* volume name (lowercase, padded with NULs) */
183 uint8_t name[AFS_MAXVOLNAME + 1];
184
185 uint8_t nservers; /* number of entries used in servers[] */
186 uint8_t vidmask; /* voltype mask for vid[] */
187 uint8_t srvtmask[8]; /* voltype masks for servers[] */
188#define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */
189#define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */
190#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */
191
192 afs_volid_t vid[3]; /* volume IDs for R/W, R/O and Bak volumes */
193 struct in_addr servers[8]; /* fileserver addresses */
194 time_t rtime; /* last retrieval time */
195};
196
197/*
198 * volume -> vnode hash table entry
199 */
200struct afs_cache_vhash {
201 afs_voltype_t vtype; /* which volume variation */
202 uint8_t hash_bucket; /* which hash bucket this represents */
203} __attribute__((packed));
204
205/*
206 * AFS volume location record
207 */
208struct afs_vlocation {
209 atomic_t usage;
210 time_t time_of_death; /* time at which put reduced usage to 0 */
211 struct list_head link; /* link in cell volume location list */
212 struct list_head grave; /* link in master graveyard list */
213 struct list_head update; /* link in master update list */
214 struct afs_cell *cell; /* cell to which volume belongs */
215#ifdef AFS_CACHING_SUPPORT
216 struct cachefs_cookie *cache; /* caching cookie */
217#endif
218 struct afs_cache_vlocation vldb; /* volume information DB record */
219 struct afs_volume *vols[3]; /* volume access record pointer (index by type) */
220 wait_queue_head_t waitq; /* status change waitqueue */
221 time_t update_at; /* time at which record should be updated */
222 spinlock_t lock; /* access lock */
223 afs_vlocation_state_t state; /* volume location state */
224 unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */
225 unsigned short upd_busy_cnt; /* EBUSY count during update */
226 bool valid; /* T if valid */
227};
228
229/*
230 * AFS fileserver record
231 */
232struct afs_server {
233 atomic_t usage;
234 time_t time_of_death; /* time at which put reduced usage to 0 */
235 struct in_addr addr; /* server address */
236 struct afs_cell *cell; /* cell in which server resides */
237 struct list_head link; /* link in cell's server list */
238 struct list_head grave; /* link in master graveyard list */
239 struct rb_node master_rb; /* link in master by-addr tree */
240 struct rw_semaphore sem; /* access lock */
241
242 /* file service access */
243 struct rb_root fs_vnodes; /* vnodes backed by this server (ordered by FID) */
244 unsigned long fs_act_jif; /* time at which last activity occurred */
245 unsigned long fs_dead_jif; /* time at which no longer to be considered dead */
246 spinlock_t fs_lock; /* access lock */
247 int fs_state; /* 0 or reason FS currently marked dead (-errno) */
248
249 /* callback promise management */
250 struct rb_root cb_promises; /* vnode expiration list (ordered earliest first) */
251 struct delayed_work cb_updater; /* callback updater */
252 struct delayed_work cb_break_work; /* collected break dispatcher */
253 wait_queue_head_t cb_break_waitq; /* space available in cb_break waitqueue */
254 spinlock_t cb_lock; /* access lock */
255 struct afs_callback cb_break[64]; /* ring of callbacks awaiting breaking */
256 atomic_t cb_break_n; /* number of pending breaks */
257 u8 cb_break_head; /* head of callback breaking ring */
258 u8 cb_break_tail; /* tail of callback breaking ring */
259};
260
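cb_break[] is a 64-slot ring indexed by free-running head/tail counters, with cb_break_n tracking occupancy; since the ring size is a power of two, the index arithmetic is a simple mask. A sketch of the producer side under cb_lock (the real enqueue lives in callback.c):

	spin_lock(&server->cb_lock);
	if (atomic_read(&server->cb_break_n) < ARRAY_SIZE(server->cb_break)) {
		server->cb_break[server->cb_break_head] = *cb;
		server->cb_break_head = (server->cb_break_head + 1) &
			(ARRAY_SIZE(server->cb_break) - 1);
		atomic_inc(&server->cb_break_n);
	}
	spin_unlock(&server->cb_lock);
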
261/*
262 * AFS volume access record
263 */
264struct afs_volume {
265 atomic_t usage;
266 struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */
267 struct afs_vlocation *vlocation; /* volume location */
268#ifdef AFS_CACHING_SUPPORT
269 struct cachefs_cookie *cache; /* caching cookie */
270#endif
271 afs_volid_t vid; /* volume ID */
272 afs_voltype_t type; /* type of volume */
273 char type_force; /* force volume type (suppress R/O -> R/W) */
274 unsigned short nservers; /* number of server slots filled */
275 unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
276 struct afs_server *servers[8]; /* servers on which volume resides (ordered) */
277 struct rw_semaphore server_sem; /* lock for accessing current server */
278};
279
280/*
281 * vnode catalogue entry
282 */
283struct afs_cache_vnode {
284 afs_vnodeid_t vnode_id; /* vnode ID */
285 unsigned vnode_unique; /* vnode ID uniquifier */
286 afs_dataversion_t data_version; /* data version */
287};
288
289/*
290 * AFS inode private data
291 */
292struct afs_vnode {
293 struct inode vfs_inode; /* the VFS's inode record */
294
295 struct afs_volume *volume; /* volume on which vnode resides */
296 struct afs_server *server; /* server currently supplying this file */
297 struct afs_fid fid; /* the file identifier for this inode */
298 struct afs_file_status status; /* AFS status info for this file */
299#ifdef AFS_CACHING_SUPPORT
300 struct cachefs_cookie *cache; /* caching cookie */
301#endif
302 struct afs_permits *permits; /* cache of permits so far obtained */
303 struct mutex permits_lock; /* lock for altering permits list */
304 struct mutex validate_lock; /* lock for validating this vnode */
305 wait_queue_head_t update_waitq; /* status fetch waitqueue */
306 int update_cnt; /* number of outstanding ops that will update the
307 * status */
308 spinlock_t lock; /* waitqueue/flags lock */
309 unsigned long flags;
310#define AFS_VNODE_CB_BROKEN 0 /* set if vnode's callback was broken */
311#define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */
312#define AFS_VNODE_MODIFIED 2 /* set if vnode's data modified */
313#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */
314#define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */
315#define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */
316
317 long acl_order; /* ACL check count (callback break count) */
318
319 /* outstanding callback notification on this file */
320 struct rb_node server_rb; /* link in server->fs_vnodes */
321 struct rb_node cb_promise; /* link in server->cb_promises */
322 struct work_struct cb_broken_work; /* work to be done on callback break */
323 time_t cb_expires; /* time at which callback expires */
324 time_t cb_expires_at; /* time used to order cb_promise */
325 unsigned cb_version; /* callback version */
326 unsigned cb_expiry; /* callback expiry time */
327 afs_callback_type_t cb_type; /* type of callback */
328 bool cb_promised; /* true if promise still holds */
329};
330
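Note that the AFS_VNODE_* values are bit numbers for the atomic bitops, not masks; the mntpt.c hunk below switches to set_bit() for exactly this reason. Typical tests might look like:

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
		return -ESTALE;
	if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
		invalidate_remote_inode(&vnode->vfs_inode);
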
331/*
332 * cached security record for one user's attempt to access a vnode
333 */
334struct afs_permit {
335 struct key *key; /* RxRPC ticket holding a security context */
336 afs_access_t access_mask; /* access mask for this key */
337};
338
339/*
340 * cache of security records from attempts to access a vnode
341 */
342struct afs_permits {
343 struct rcu_head rcu; /* disposal procedure */
344 int count; /* number of records */
345 struct afs_permit permits[0]; /* the permits so far examined */
346};
347
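The permit cache is swapped wholesale and freed through RCU, which is why the struct embeds an rcu_head and afs_zap_permits() (declared below) takes one. A sketch of the replacement step, assuming xpermits is a newly built array:

	struct afs_permits *permits = vnode->permits;

	rcu_assign_pointer(vnode->permits, xpermits);
	if (permits)
		call_rcu(&permits->rcu, afs_zap_permits);
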
348/*
349 * record of one of a system's set of network interfaces
350 */
351struct afs_interface {
352 struct in_addr address; /* IPv4 address bound to interface */
353 struct in_addr netmask; /* netmask applied to address */
354 unsigned mtu; /* MTU of interface */
355};
356
357/*
358 * UUID definition [internet draft]
359 * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns
360 * increments since midnight 15th October 1582
361 * - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID
362 * time
363 * - the clock sequence is a 14-bit counter to avoid duplicate times
364 */
365struct afs_uuid {
366 u32 time_low; /* low part of timestamp */
367 u16 time_mid; /* mid part of timestamp */
368 u16 time_hi_and_version; /* high part of timestamp and version */
369#define AFS_UUID_TO_UNIX_TIME 0x01b21dd213814000ULL
370#define AFS_UUID_TIMEHI_MASK 0x0fff
371#define AFS_UUID_VERSION_TIME 0x1000 /* time-based UUID */
372#define AFS_UUID_VERSION_NAME 0x3000 /* name-based UUID */
373#define AFS_UUID_VERSION_RANDOM 0x4000 /* (pseudo-)random generated UUID */
374 u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */
375#define AFS_UUID_CLOCKHI_MASK 0x3f
376#define AFS_UUID_VARIANT_STD 0x80
377 u8 clock_seq_low; /* clock seq low */
378 u8 node[6]; /* spatially unique node ID (MAC addr) */
379};
380
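AFS_UUID_TO_UNIX_TIME is simply the number of 100 ns intervals between the UUID epoch (1582-10-15) and the Unix epoch: 141427 days * 86400 s * 10^7 = 122192928000000000 = 0x01b21dd213814000. Converting a Unix timestamp is therefore a multiply and an add, as afs_get_client_UUID() in main.c does below:

	u64 uuidtime;

	uuidtime  = (u64) ts.tv_sec * 10000000;	/* seconds -> 100 ns units */
	uuidtime += ts.tv_nsec / 100;
	uuidtime += AFS_UUID_TO_UNIX_TIME;	/* rebase onto 1582-10-15 */
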
381/*****************************************************************************/
382/*
383 * callback.c
384 */
385extern void afs_init_callback_state(struct afs_server *);
386extern void afs_broken_callback_work(struct work_struct *);
387extern void afs_break_callbacks(struct afs_server *, size_t,
388 struct afs_callback[]);
389extern void afs_discard_callback_on_delete(struct afs_vnode *);
390extern void afs_give_up_callback(struct afs_vnode *);
391extern void afs_dispatch_give_up_callbacks(struct work_struct *);
392extern void afs_flush_callback_breaks(struct afs_server *);
393extern int __init afs_callback_update_init(void);
394extern void afs_callback_update_kill(void);
395
54/* 396/*
55 * cell.c 397 * cell.c
56 */ 398 */
@@ -60,57 +402,156 @@ extern struct list_head afs_proc_cells;
60extern struct cachefs_index_def afs_cache_cell_index_def; 402extern struct cachefs_index_def afs_cache_cell_index_def;
61#endif 403#endif
62 404
405#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
406extern int afs_cell_init(char *);
407extern struct afs_cell *afs_cell_create(const char *, char *);
408extern struct afs_cell *afs_cell_lookup(const char *, unsigned);
409extern struct afs_cell *afs_grab_cell(struct afs_cell *);
410extern void afs_put_cell(struct afs_cell *);
411extern void afs_cell_purge(void);
412
413/*
414 * cmservice.c
415 */
416extern bool afs_cm_incoming_call(struct afs_call *);
417
63/* 418/*
64 * dir.c 419 * dir.c
65 */ 420 */
66extern const struct inode_operations afs_dir_inode_operations; 421extern const struct inode_operations afs_dir_inode_operations;
67extern const struct file_operations afs_dir_file_operations; 422extern const struct file_operations afs_dir_file_operations;
68 423
424extern int afs_permission(struct inode *, int, struct nameidata *);
425
69/* 426/*
70 * file.c 427 * file.c
71 */ 428 */
72extern const struct address_space_operations afs_fs_aops; 429extern const struct address_space_operations afs_fs_aops;
73extern const struct inode_operations afs_file_inode_operations; 430extern const struct inode_operations afs_file_inode_operations;
431extern const struct file_operations afs_file_operations;
432
433extern int afs_open(struct inode *, struct file *);
434extern int afs_release(struct inode *, struct file *);
74 435
75#ifdef AFS_CACHING_SUPPORT 436#ifdef AFS_CACHING_SUPPORT
76extern int afs_cache_get_page_cookie(struct page *page, 437extern int afs_cache_get_page_cookie(struct page *, struct cachefs_page **);
77 struct cachefs_page **_page_cookie);
78#endif 438#endif
79 439
80/* 440/*
81 * inode.c 441 * fsclient.c
82 */ 442 */
83extern int afs_iget(struct super_block *sb, struct afs_fid *fid, 443extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
84 struct inode **_inode); 444 struct afs_vnode *, struct afs_volsync *,
85extern int afs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry, 445 const struct afs_wait_mode *);
86 struct kstat *stat); 446extern int afs_fs_give_up_callbacks(struct afs_server *,
87extern void afs_clear_inode(struct inode *inode); 447 const struct afs_wait_mode *);
448extern int afs_fs_fetch_data(struct afs_server *, struct key *,
449 struct afs_vnode *, off_t, size_t, struct page *,
450 const struct afs_wait_mode *);
451extern int afs_fs_create(struct afs_server *, struct key *,
452 struct afs_vnode *, const char *, umode_t,
453 struct afs_fid *, struct afs_file_status *,
454 struct afs_callback *,
455 const struct afs_wait_mode *);
456extern int afs_fs_remove(struct afs_server *, struct key *,
457 struct afs_vnode *, const char *, bool,
458 const struct afs_wait_mode *);
459extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *,
460 struct afs_vnode *, const char *,
461 const struct afs_wait_mode *);
462extern int afs_fs_symlink(struct afs_server *, struct key *,
463 struct afs_vnode *, const char *, const char *,
464 struct afs_fid *, struct afs_file_status *,
465 const struct afs_wait_mode *);
466extern int afs_fs_rename(struct afs_server *, struct key *,
467 struct afs_vnode *, const char *,
468 struct afs_vnode *, const char *,
469 const struct afs_wait_mode *);
88 470
89/* 471/*
90 * key_afs.c 472 * inode.c
91 */ 473 */
92#ifdef CONFIG_KEYS 474extern struct inode *afs_iget(struct super_block *, struct key *,
93extern int afs_key_register(void); 475 struct afs_fid *, struct afs_file_status *,
94extern void afs_key_unregister(void); 476 struct afs_callback *);
95#endif 477extern int afs_validate(struct afs_vnode *, struct key *);
478extern int afs_inode_getattr(struct vfsmount *, struct dentry *,
479 struct kstat *);
480extern void afs_zap_permits(struct rcu_head *);
481extern void afs_clear_inode(struct inode *);
96 482
97/* 483/*
98 * main.c 484 * main.c
99 */ 485 */
486extern struct afs_uuid afs_uuid;
100#ifdef AFS_CACHING_SUPPORT 487#ifdef AFS_CACHING_SUPPORT
101extern struct cachefs_netfs afs_cache_netfs; 488extern struct cachefs_netfs afs_cache_netfs;
102#endif 489#endif
103 490
104/* 491/*
492 * misc.c
493 */
494extern int afs_abort_to_error(u32);
495
496/*
105 * mntpt.c 497 * mntpt.c
106 */ 498 */
107extern const struct inode_operations afs_mntpt_inode_operations; 499extern const struct inode_operations afs_mntpt_inode_operations;
108extern const struct file_operations afs_mntpt_file_operations; 500extern const struct file_operations afs_mntpt_file_operations;
109extern struct afs_timer afs_mntpt_expiry_timer;
110extern struct afs_timer_ops afs_mntpt_expiry_timer_ops;
111extern unsigned long afs_mntpt_expiry_timeout; 501extern unsigned long afs_mntpt_expiry_timeout;
112 502
113extern int afs_mntpt_check_symlink(struct afs_vnode *vnode); 503extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
504extern void afs_mntpt_kill_timer(void);
505extern void afs_umount_begin(struct vfsmount *, int);
506
507/*
508 * proc.c
509 */
510extern int afs_proc_init(void);
511extern void afs_proc_cleanup(void);
512extern int afs_proc_cell_setup(struct afs_cell *);
513extern void afs_proc_cell_remove(struct afs_cell *);
514
515/*
516 * rxrpc.c
517 */
518extern int afs_open_socket(void);
519extern void afs_close_socket(void);
520extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
521 const struct afs_wait_mode *);
522extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
523 size_t, size_t);
524extern void afs_flat_call_destructor(struct afs_call *);
525extern void afs_transfer_reply(struct afs_call *, struct sk_buff *);
526extern void afs_send_empty_reply(struct afs_call *);
527extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
528extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
529 size_t);
530
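afs_alloc_flat_call() pairs a call-type definition with fixed-size request and reply buffers, and afs_make_call() transmits the result. A condensed sketch of how an fsclient.c-style user strings these together; the call-type name, the request-buffer field and the opcode constant are assumptions for illustration:

	struct afs_call *call;
	__be32 *bp;

	call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks, 4, 0);
	if (!call)
		return -ENOMEM;

	bp = call->request;			/* assumed flat request buffer */
	*bp++ = htonl(FSGIVEUPCALLBACKS);	/* assumed RPC opcode */

	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
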
531/*
532 * security.c
533 */
534extern void afs_clear_permits(struct afs_vnode *);
535extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
536extern struct key *afs_request_key(struct afs_cell *);
537extern int afs_permission(struct inode *, int, struct nameidata *);
538
539/*
540 * server.c
541 */
542extern spinlock_t afs_server_peer_lock;
543
544#define afs_get_server(S) \
545do { \
546 _debug("GET SERVER %d", atomic_read(&(S)->usage)); \
547 atomic_inc(&(S)->usage); \
548} while(0)
549
550extern struct afs_server *afs_lookup_server(struct afs_cell *,
551 const struct in_addr *);
552extern struct afs_server *afs_find_server(const struct in_addr *);
553extern void afs_put_server(struct afs_server *);
554extern void __exit afs_purge_servers(void);
114 555
115/* 556/*
116 * super.c 557 * super.c
@@ -118,22 +559,211 @@ extern int afs_mntpt_check_symlink(struct afs_vnode *vnode);
118extern int afs_fs_init(void); 559extern int afs_fs_init(void);
119extern void afs_fs_exit(void); 560extern void afs_fs_exit(void);
120 561
121#define AFS_CB_HASH_COUNT (PAGE_SIZE / sizeof(struct list_head)) 562/*
563 * use-rtnetlink.c
564 */
565extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool);
566extern int afs_get_MAC_address(u8 *, size_t);
122 567
123extern struct list_head afs_cb_hash_tbl[]; 568/*
124extern spinlock_t afs_cb_hash_lock; 569 * vlclient.c
570 */
571#ifdef AFS_CACHING_SUPPORT
572extern struct cachefs_index_def afs_vlocation_cache_index_def;
573#endif
125 574
126#define afs_cb_hash(SRV,FID) \ 575extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
127 afs_cb_hash_tbl[((unsigned long)(SRV) + \ 576 const char *, struct afs_cache_vlocation *,
128 (FID)->vid + (FID)->vnode + (FID)->unique) % \ 577 const struct afs_wait_mode *);
129 AFS_CB_HASH_COUNT] 578extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *,
579 afs_volid_t, afs_voltype_t,
580 struct afs_cache_vlocation *,
581 const struct afs_wait_mode *);
130 582
131/* 583/*
132 * proc.c 584 * vlocation.c
133 */ 585 */
134extern int afs_proc_init(void); 586#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0)
135extern void afs_proc_cleanup(void); 587
136extern int afs_proc_cell_setup(struct afs_cell *cell); 588extern int __init afs_vlocation_update_init(void);
137extern void afs_proc_cell_remove(struct afs_cell *cell); 589extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *,
590 struct key *,
591 const char *, size_t);
592extern void afs_put_vlocation(struct afs_vlocation *);
593extern void afs_vlocation_purge(void);
594
595/*
596 * vnode.c
597 */
598#ifdef AFS_CACHING_SUPPORT
599extern struct cachefs_index_def afs_vnode_cache_index_def;
600#endif
601
602extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
603
604static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
605{
606 return container_of(inode, struct afs_vnode, vfs_inode);
607}
608
609static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
610{
611 return &vnode->vfs_inode;
612}
613
614extern void afs_vnode_finalise_status_update(struct afs_vnode *,
615 struct afs_server *);
616extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *,
617 struct key *);
618extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *,
619 off_t, size_t, struct page *);
620extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *,
621 umode_t, struct afs_fid *, struct afs_file_status *,
622 struct afs_callback *, struct afs_server **);
623extern int afs_vnode_remove(struct afs_vnode *, struct key *, const char *,
624 bool);
625extern int afs_vnode_link(struct afs_vnode *, struct afs_vnode *, struct key *,
626 const char *);
627extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *,
628 const char *, struct afs_fid *,
629 struct afs_file_status *, struct afs_server **);
630extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *,
631 struct key *, const char *, const char *);
632
633/*
634 * volume.c
635 */
636#ifdef AFS_CACHING_SUPPORT
637extern struct cachefs_index_def afs_volume_cache_index_def;
638#endif
639
640#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
641
642extern void afs_put_volume(struct afs_volume *);
643extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *);
644extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *);
645extern int afs_volume_release_fileserver(struct afs_vnode *,
646 struct afs_server *, int);
647
648/*****************************************************************************/
649/*
650 * debug tracing
651 */
652extern unsigned afs_debug;
653
654#define dbgprintk(FMT,...) \
655 printk("[%x%-6.6s] "FMT"\n", smp_processor_id(), current->comm ,##__VA_ARGS__)
656
657/* make sure we maintain the format strings, even when debugging is disabled */
658static inline __attribute__((format(printf,1,2)))
659void _dbprintk(const char *fmt, ...)
660{
661}
662
663#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__)
664#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__)
665#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__)
666
667
668#if defined(__KDEBUG)
669#define _enter(FMT,...) kenter(FMT,##__VA_ARGS__)
670#define _leave(FMT,...) kleave(FMT,##__VA_ARGS__)
671#define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__)
672
673#elif defined(CONFIG_AFS_DEBUG)
674#define AFS_DEBUG_KENTER 0x01
675#define AFS_DEBUG_KLEAVE 0x02
676#define AFS_DEBUG_KDEBUG 0x04
677
678#define _enter(FMT,...) \
679do { \
680 if (unlikely(afs_debug & AFS_DEBUG_KENTER)) \
681 kenter(FMT,##__VA_ARGS__); \
682} while (0)
683
684#define _leave(FMT,...) \
685do { \
686 if (unlikely(afs_debug & AFS_DEBUG_KLEAVE)) \
687 kleave(FMT,##__VA_ARGS__); \
688} while (0)
689
690#define _debug(FMT,...) \
691do { \
692 if (unlikely(afs_debug & AFS_DEBUG_KDEBUG)) \
693 kdebug(FMT,##__VA_ARGS__); \
694} while (0)
695
696#else
697#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__)
698#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__)
699#define _debug(FMT,...) _dbprintk(" "FMT ,##__VA_ARGS__)
700#endif
701
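With CONFIG_AFS_DEBUG, each class is gated individually by the afs_debug mask (bit 0 = enter, bit 1 = leave, bit 2 = debug), which main.c below exposes as a writable module parameter. Typical instrumentation elsewhere in the code looks like this sketch:

	static int afs_do_thing(struct afs_vnode *vnode)
	{
		_enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
		/* ... */
		_leave(" = 0");
		return 0;
	}

At runtime, assuming the module is built as kafs.ko, writing 7 to /sys/module/kafs/parameters/debug would turn all three classes on.
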
702/*
703 * debug assertion checking
704 */
705#if 1 // defined(__KDEBUGALL)
706
707#define ASSERT(X) \
708do { \
709 if (unlikely(!(X))) { \
710 printk(KERN_ERR "\n"); \
711 printk(KERN_ERR "AFS: Assertion failed\n"); \
712 BUG(); \
713 } \
714} while(0)
715
716#define ASSERTCMP(X, OP, Y) \
717do { \
718 if (unlikely(!((X) OP (Y)))) { \
719 printk(KERN_ERR "\n"); \
720 printk(KERN_ERR "AFS: Assertion failed\n"); \
721 printk(KERN_ERR "%lu " #OP " %lu is false\n", \
722 (unsigned long)(X), (unsigned long)(Y)); \
723 printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \
724 (unsigned long)(X), (unsigned long)(Y)); \
725 BUG(); \
726 } \
727} while(0)
728
729#define ASSERTIF(C, X) \
730do { \
731 if (unlikely((C) && !(X))) { \
732 printk(KERN_ERR "\n"); \
733 printk(KERN_ERR "AFS: Assertion failed\n"); \
734 BUG(); \
735 } \
736} while(0)
737
738#define ASSERTIFCMP(C, X, OP, Y) \
739do { \
740 if (unlikely((C) && !((X) OP (Y)))) { \
741 printk(KERN_ERR "\n"); \
742 printk(KERN_ERR "AFS: Assertion failed\n"); \
743 printk(KERN_ERR "%lu " #OP " %lu is false\n", \
744 (unsigned long)(X), (unsigned long)(Y)); \
745 printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \
746 (unsigned long)(X), (unsigned long)(Y)); \
747 BUG(); \
748 } \
749} while(0)
750
751#else
752
753#define ASSERT(X) \
754do { \
755} while(0)
756
757#define ASSERTCMP(X, OP, Y) \
758do { \
759} while(0)
760
761#define ASSERTIF(C, X) \
762do { \
763} while(0)
764
765#define ASSERTIFCMP(C, X, OP, Y) \
766do { \
767} while(0)
138 768
139#endif /* AFS_INTERNAL_H */ 769#endif /* __KDEBUGALL */
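The comparison variants record the failing expression in both decimal and hex before hitting BUG(); call sites pass the operator as a bare token, for example:

	ASSERT(server != NULL);
	ASSERTCMP(atomic_read(&server->usage), >, 0);
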
diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c
deleted file mode 100644
index 615df2407cb2..000000000000
--- a/fs/afs/kafsasyncd.c
+++ /dev/null
@@ -1,255 +0,0 @@
1/* kafsasyncd.c: AFS asynchronous operation daemon
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 *
12 * The AFS async daemon is used to do the following:
13 * - probe "dead" servers to see whether they've come back to life yet.
14 * - probe "live" servers that we haven't talked to for a while to see if they are better
15 *   candidates for serving than the one we're currently using
16 * - poll volume location servers to keep volume location lists up to date
17 */
18
19#include <linux/module.h>
20#include <linux/init.h>
21#include <linux/sched.h>
22#include <linux/completion.h>
23#include <linux/freezer.h>
24#include "cell.h"
25#include "server.h"
26#include "volume.h"
27#include "kafsasyncd.h"
28#include "kafstimod.h"
29#include <rxrpc/call.h>
30#include <asm/errno.h>
31#include "internal.h"
32
33static DECLARE_COMPLETION(kafsasyncd_alive);
34static DECLARE_COMPLETION(kafsasyncd_dead);
35static DECLARE_WAIT_QUEUE_HEAD(kafsasyncd_sleepq);
36static struct task_struct *kafsasyncd_task;
37static int kafsasyncd_die;
38
39static int kafsasyncd(void *arg);
40
41static LIST_HEAD(kafsasyncd_async_attnq);
42static LIST_HEAD(kafsasyncd_async_busyq);
43static DEFINE_SPINLOCK(kafsasyncd_async_lock);
44
45static void kafsasyncd_null_call_attn_func(struct rxrpc_call *call)
46{
47}
48
49static void kafsasyncd_null_call_error_func(struct rxrpc_call *call)
50{
51}
52
53/*****************************************************************************/
54/*
55 * start the async daemon
56 */
57int afs_kafsasyncd_start(void)
58{
59 int ret;
60
61 ret = kernel_thread(kafsasyncd, NULL, 0);
62 if (ret < 0)
63 return ret;
64
65 wait_for_completion(&kafsasyncd_alive);
66
67 return ret;
68} /* end afs_kafsasyncd_start() */
69
70/*****************************************************************************/
71/*
72 * stop the async daemon
73 */
74void afs_kafsasyncd_stop(void)
75{
76 /* get rid of my daemon */
77 kafsasyncd_die = 1;
78 wake_up(&kafsasyncd_sleepq);
79 wait_for_completion(&kafsasyncd_dead);
80
81} /* end afs_kafsasyncd_stop() */
82
83/*****************************************************************************/
84/*
85 * probing daemon
86 */
87static int kafsasyncd(void *arg)
88{
89 struct afs_async_op *op;
90 int die;
91
92 DECLARE_WAITQUEUE(myself, current);
93
94 kafsasyncd_task = current;
95
96 printk("kAFS: Started kafsasyncd %d\n", current->pid);
97
98 daemonize("kafsasyncd");
99
100 complete(&kafsasyncd_alive);
101
102 /* loop around looking for things to attend to */
103 do {
104 set_current_state(TASK_INTERRUPTIBLE);
105 add_wait_queue(&kafsasyncd_sleepq, &myself);
106
107 for (;;) {
108 if (!list_empty(&kafsasyncd_async_attnq) ||
109 signal_pending(current) ||
110 kafsasyncd_die)
111 break;
112
113 schedule();
114 set_current_state(TASK_INTERRUPTIBLE);
115 }
116
117 remove_wait_queue(&kafsasyncd_sleepq, &myself);
118 set_current_state(TASK_RUNNING);
119
120 try_to_freeze();
121
122 /* discard pending signals */
123 afs_discard_my_signals();
124
125 die = kafsasyncd_die;
126
127 /* deal with the next asynchronous operation requiring
128 * attention */
129 if (!list_empty(&kafsasyncd_async_attnq)) {
130 struct afs_async_op *op;
131
132 _debug("@@@ Begin Asynchronous Operation");
133
134 op = NULL;
135 spin_lock(&kafsasyncd_async_lock);
136
137 if (!list_empty(&kafsasyncd_async_attnq)) {
138 op = list_entry(kafsasyncd_async_attnq.next,
139 struct afs_async_op, link);
140 list_move_tail(&op->link,
141 &kafsasyncd_async_busyq);
142 }
143
144 spin_unlock(&kafsasyncd_async_lock);
145
146 _debug("@@@ Operation %p {%p}\n",
147 op, op ? op->ops : NULL);
148
149 if (op)
150 op->ops->attend(op);
151
152 _debug("@@@ End Asynchronous Operation");
153 }
154
155 } while(!die);
156
157 /* need to kill all outstanding asynchronous operations before
158 * exiting */
159 kafsasyncd_task = NULL;
160 spin_lock(&kafsasyncd_async_lock);
161
162 /* fold the busy and attention queues together */
163 list_splice_init(&kafsasyncd_async_busyq,
164 &kafsasyncd_async_attnq);
165
166 /* dequeue kafsasyncd from all their wait queues */
167 list_for_each_entry(op, &kafsasyncd_async_attnq, link) {
168 op->call->app_attn_func = kafsasyncd_null_call_attn_func;
169 op->call->app_error_func = kafsasyncd_null_call_error_func;
170 remove_wait_queue(&op->call->waitq, &op->waiter);
171 }
172
173 spin_unlock(&kafsasyncd_async_lock);
174
175 /* abort all the operations */
176 while (!list_empty(&kafsasyncd_async_attnq)) {
177 op = list_entry(kafsasyncd_async_attnq.next, struct afs_async_op, link);
178 list_del_init(&op->link);
179
180 rxrpc_call_abort(op->call, -EIO);
181 rxrpc_put_call(op->call);
182 op->call = NULL;
183
184 op->ops->discard(op);
185 }
186
187 /* and that's all */
188 _leave("");
189 complete_and_exit(&kafsasyncd_dead, 0);
190
191} /* end kafsasyncd() */
192
193/*****************************************************************************/
194/*
195 * begin an operation
196 * - place operation on busy queue
197 */
198void afs_kafsasyncd_begin_op(struct afs_async_op *op)
199{
200 _enter("");
201
202 spin_lock(&kafsasyncd_async_lock);
203
204 init_waitqueue_entry(&op->waiter, kafsasyncd_task);
205 add_wait_queue(&op->call->waitq, &op->waiter);
206
207 list_move_tail(&op->link, &kafsasyncd_async_busyq);
208
209 spin_unlock(&kafsasyncd_async_lock);
210
211 _leave("");
212} /* end afs_kafsasyncd_begin_op() */
213
214/*****************************************************************************/
215/*
216 * request attention for an operation
217 * - move to attention queue
218 */
219void afs_kafsasyncd_attend_op(struct afs_async_op *op)
220{
221 _enter("");
222
223 spin_lock(&kafsasyncd_async_lock);
224
225 list_move_tail(&op->link, &kafsasyncd_async_attnq);
226
227 spin_unlock(&kafsasyncd_async_lock);
228
229 wake_up(&kafsasyncd_sleepq);
230
231 _leave("");
232} /* end afs_kafsasyncd_attend_op() */
233
234/*****************************************************************************/
235/*
236 * terminate an operation
237 * - remove from either queue
238 */
239void afs_kafsasyncd_terminate_op(struct afs_async_op *op)
240{
241 _enter("");
242
243 spin_lock(&kafsasyncd_async_lock);
244
245 if (!list_empty(&op->link)) {
246 list_del_init(&op->link);
247 remove_wait_queue(&op->call->waitq, &op->waiter);
248 }
249
250 spin_unlock(&kafsasyncd_async_lock);
251
252 wake_up(&kafsasyncd_sleepq);
253
254 _leave("");
255} /* end afs_kafsasyncd_terminate_op() */
diff --git a/fs/afs/kafsasyncd.h b/fs/afs/kafsasyncd.h
deleted file mode 100644
index 791803f9a6fb..000000000000
--- a/fs/afs/kafsasyncd.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/* kafsasyncd.h: AFS asynchronous operation daemon
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_KAFSASYNCD_H
13#define _LINUX_AFS_KAFSASYNCD_H
14
15#include "types.h"
16
17struct afs_async_op;
18
19struct afs_async_op_ops {
20 void (*attend)(struct afs_async_op *op);
21 void (*discard)(struct afs_async_op *op);
22};
23
24/*****************************************************************************/
25/*
26 * asynchronous operation record
27 */
28struct afs_async_op
29{
30 struct list_head link;
31 struct afs_server *server; /* server being contacted */
32 struct rxrpc_call *call; /* RxRPC call performing op */
33 wait_queue_t waiter; /* wait queue for kafsasyncd */
34 const struct afs_async_op_ops *ops; /* operations */
35};
36
37static inline void afs_async_op_init(struct afs_async_op *op,
38 const struct afs_async_op_ops *ops)
39{
40 INIT_LIST_HEAD(&op->link);
41 op->call = NULL;
42 op->ops = ops;
43}
44
45extern int afs_kafsasyncd_start(void);
46extern void afs_kafsasyncd_stop(void);
47
48extern void afs_kafsasyncd_begin_op(struct afs_async_op *op);
49extern void afs_kafsasyncd_attend_op(struct afs_async_op *op);
50extern void afs_kafsasyncd_terminate_op(struct afs_async_op *op);
51
52#endif /* _LINUX_AFS_KAFSASYNCD_H */
diff --git a/fs/afs/kafstimod.c b/fs/afs/kafstimod.c
deleted file mode 100644
index 694344e4d3c7..000000000000
--- a/fs/afs/kafstimod.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/* kafstimod.c: AFS timeout daemon
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/freezer.h>
17#include "cell.h"
18#include "volume.h"
19#include "kafstimod.h"
20#include <asm/errno.h>
21#include "internal.h"
22
23static DECLARE_COMPLETION(kafstimod_alive);
24static DECLARE_COMPLETION(kafstimod_dead);
25static DECLARE_WAIT_QUEUE_HEAD(kafstimod_sleepq);
26static int kafstimod_die;
27
28static LIST_HEAD(kafstimod_list);
29static DEFINE_SPINLOCK(kafstimod_lock);
30
31static int kafstimod(void *arg);
32
33/*****************************************************************************/
34/*
35 * start the timeout daemon
36 */
37int afs_kafstimod_start(void)
38{
39 int ret;
40
41 ret = kernel_thread(kafstimod, NULL, 0);
42 if (ret < 0)
43 return ret;
44
45 wait_for_completion(&kafstimod_alive);
46
47 return ret;
48} /* end afs_kafstimod_start() */
49
50/*****************************************************************************/
51/*
52 * stop the timeout daemon
53 */
54void afs_kafstimod_stop(void)
55{
56 /* get rid of my daemon */
57 kafstimod_die = 1;
58 wake_up(&kafstimod_sleepq);
59 wait_for_completion(&kafstimod_dead);
60
61} /* end afs_kafstimod_stop() */
62
63/*****************************************************************************/
64/*
65 * timeout processing daemon
66 */
67static int kafstimod(void *arg)
68{
69 struct afs_timer *timer;
70
71 DECLARE_WAITQUEUE(myself, current);
72
73 printk("kAFS: Started kafstimod %d\n", current->pid);
74
75 daemonize("kafstimod");
76
77 complete(&kafstimod_alive);
78
79 /* loop around looking for things to attend to */
80 loop:
81 set_current_state(TASK_INTERRUPTIBLE);
82 add_wait_queue(&kafstimod_sleepq, &myself);
83
84 for (;;) {
85 unsigned long jif;
86 signed long timeout;
87
88 /* deal with the server being asked to die */
89 if (kafstimod_die) {
90 remove_wait_queue(&kafstimod_sleepq, &myself);
91 _leave("");
92 complete_and_exit(&kafstimod_dead, 0);
93 }
94
95 try_to_freeze();
96
97 /* discard pending signals */
98 afs_discard_my_signals();
99
100 /* work out the time to elapse before the next event */
101 spin_lock(&kafstimod_lock);
102 if (list_empty(&kafstimod_list)) {
103 timeout = MAX_SCHEDULE_TIMEOUT;
104 }
105 else {
106 timer = list_entry(kafstimod_list.next,
107 struct afs_timer, link);
108 timeout = timer->timo_jif;
109 jif = jiffies;
110
111 if (time_before_eq((unsigned long) timeout, jif))
112 goto immediate;
113
114 else {
115 timeout = (long) timeout - (long) jiffies;
116 }
117 }
118 spin_unlock(&kafstimod_lock);
119
120 schedule_timeout(timeout);
121
122 set_current_state(TASK_INTERRUPTIBLE);
123 }
124
125 /* the thing on the front of the queue needs processing
126 * - we come here with the lock held and timer pointing to the expired
127 * entry
128 */
129 immediate:
130 remove_wait_queue(&kafstimod_sleepq, &myself);
131 set_current_state(TASK_RUNNING);
132
133 _debug("@@@ Begin Timeout of %p", timer);
134
135 /* dequeue the timer */
136 list_del_init(&timer->link);
137 spin_unlock(&kafstimod_lock);
138
139 /* call the timeout function */
140 timer->ops->timed_out(timer);
141
142 _debug("@@@ End Timeout");
143 goto loop;
144
145} /* end kafstimod() */
146
147/*****************************************************************************/
148/*
149 * (re-)queue a timer
150 */
151void afs_kafstimod_add_timer(struct afs_timer *timer, unsigned long timeout)
152{
153 struct afs_timer *ptimer;
154 struct list_head *_p;
155
156 _enter("%p,%lu", timer, timeout);
157
158 spin_lock(&kafstimod_lock);
159
160 list_del(&timer->link);
161
162 /* the timer was deferred or reset - put it back in the queue at the
163 * right place */
164 timer->timo_jif = jiffies + timeout;
165
166 list_for_each(_p, &kafstimod_list) {
167 ptimer = list_entry(_p, struct afs_timer, link);
168 if (time_before(timer->timo_jif, ptimer->timo_jif))
169 break;
170 }
171
172 list_add_tail(&timer->link, _p); /* insert before stopping point */
173
174 spin_unlock(&kafstimod_lock);
175
176 wake_up(&kafstimod_sleepq);
177
178 _leave("");
179} /* end afs_kafstimod_add_timer() */
180
181/*****************************************************************************/
182/*
183 * dequeue a timer
184 * - returns 0 if the timer was deleted or -ENOENT if it wasn't queued
185 */
186int afs_kafstimod_del_timer(struct afs_timer *timer)
187{
188 int ret = 0;
189
190 _enter("%p", timer);
191
192 spin_lock(&kafstimod_lock);
193
194 if (list_empty(&timer->link))
195 ret = -ENOENT;
196 else
197 list_del_init(&timer->link);
198
199 spin_unlock(&kafstimod_lock);
200
201 wake_up(&kafstimod_sleepq);
202
203 _leave(" = %d", ret);
204 return ret;
205} /* end afs_kafstimod_del_timer() */
diff --git a/fs/afs/kafstimod.h b/fs/afs/kafstimod.h
deleted file mode 100644
index e312f1a61a7f..000000000000
--- a/fs/afs/kafstimod.h
+++ /dev/null
@@ -1,49 +0,0 @@
1/* kafstimod.h: AFS timeout daemon
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_KAFSTIMOD_H
13#define _LINUX_AFS_KAFSTIMOD_H
14
15#include "types.h"
16
17struct afs_timer;
18
19struct afs_timer_ops {
20 /* called when the front of the timer queue has timed out */
21 void (*timed_out)(struct afs_timer *timer);
22};
23
24/*****************************************************************************/
25/*
26 * AFS timer/timeout record
27 */
28struct afs_timer
29{
30 struct list_head link; /* link in timer queue */
31 unsigned long timo_jif; /* timeout time */
32 const struct afs_timer_ops *ops; /* timeout expiry function */
33};
34
35static inline void afs_timer_init(struct afs_timer *timer,
36 const struct afs_timer_ops *ops)
37{
38 INIT_LIST_HEAD(&timer->link);
39 timer->ops = ops;
40}
41
42extern int afs_kafstimod_start(void);
43extern void afs_kafstimod_stop(void);
44
45extern void afs_kafstimod_add_timer(struct afs_timer *timer,
46 unsigned long timeout);
47extern int afs_kafstimod_del_timer(struct afs_timer *timer);
48
49#endif /* _LINUX_AFS_KAFSTIMOD_H */
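Both daemons removed above are superseded by the shared kernel workqueue: the rewritten code hangs struct delayed_work items off its objects (see cb_updater and cb_break_work in struct afs_server, and the mntpt.c hunk below) instead of keeping a hand-sorted timer list. What used to be afs_kafstimod_add_timer()/del_timer() collapses to the standard calls, sketched here with the mountpoint timer:

	static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer,
				    afs_mntpt_expiry_timed_out);

	/* (re)arm to fire after the expiry timeout */
	schedule_delayed_work(&afs_mntpt_expiry_timer,
			      afs_mntpt_expiry_timeout * HZ);

	/* and on teardown */
	cancel_delayed_work(&afs_mntpt_expiry_timer);
	flush_scheduled_work();
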
diff --git a/fs/afs/main.c b/fs/afs/main.c
index f2704ba53857..80ec6fd19a73 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,4 +1,4 @@
1/* main.c: AFS client file system 1/* AFS client file system
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
@@ -13,43 +13,21 @@
13#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/completion.h> 15#include <linux/completion.h>
16#include <rxrpc/rxrpc.h>
17#include <rxrpc/transport.h>
18#include <rxrpc/call.h>
19#include <rxrpc/peer.h>
20#include "cache.h"
21#include "cell.h"
22#include "server.h"
23#include "fsclient.h"
24#include "cmservice.h"
25#include "kafstimod.h"
26#include "kafsasyncd.h"
27#include "internal.h" 16#include "internal.h"
28 17
29struct rxrpc_transport *afs_transport;
30
31static int afs_adding_peer(struct rxrpc_peer *peer);
32static void afs_discarding_peer(struct rxrpc_peer *peer);
33
34
35MODULE_DESCRIPTION("AFS Client File System"); 18MODULE_DESCRIPTION("AFS Client File System");
36MODULE_AUTHOR("Red Hat, Inc."); 19MODULE_AUTHOR("Red Hat, Inc.");
37MODULE_LICENSE("GPL"); 20MODULE_LICENSE("GPL");
38 21
22unsigned afs_debug;
23module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO);
24MODULE_PARM_DESC(afs_debug, "AFS debugging mask");
25
39static char *rootcell; 26static char *rootcell;
40 27
41module_param(rootcell, charp, 0); 28module_param(rootcell, charp, 0);
42MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); 29MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
43 30
44
45static struct rxrpc_peer_ops afs_peer_ops = {
46 .adding = afs_adding_peer,
47 .discarding = afs_discarding_peer,
48};
49
50struct list_head afs_cb_hash_tbl[AFS_CB_HASH_COUNT];
51DEFINE_SPINLOCK(afs_cb_hash_lock);
52
53#ifdef AFS_CACHING_SUPPORT 31#ifdef AFS_CACHING_SUPPORT
54static struct cachefs_netfs_operations afs_cache_ops = { 32static struct cachefs_netfs_operations afs_cache_ops = {
55 .get_page_cookie = afs_cache_get_page_cookie, 33 .get_page_cookie = afs_cache_get_page_cookie,
@@ -62,20 +40,63 @@ struct cachefs_netfs afs_cache_netfs = {
62}; 40};
63#endif 41#endif
64 42
65/*****************************************************************************/ 43struct afs_uuid afs_uuid;
44
45/*
46 * get a client UUID
47 */
48static int __init afs_get_client_UUID(void)
49{
50 struct timespec ts;
51 u64 uuidtime;
52 u16 clockseq;
53 int ret;
54
55 /* read the MAC address of one of the external interfaces and construct
56 * a UUID from it */
57 ret = afs_get_MAC_address(afs_uuid.node, sizeof(afs_uuid.node));
58 if (ret < 0)
59 return ret;
60
61 getnstimeofday(&ts);
62 uuidtime = (u64) ts.tv_sec * 1000 * 1000 * 10;
63 uuidtime += ts.tv_nsec / 100;
64 uuidtime += AFS_UUID_TO_UNIX_TIME;
65 afs_uuid.time_low = uuidtime;
66 afs_uuid.time_mid = uuidtime >> 32;
67 afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK;
 68 afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME;
69
70 get_random_bytes(&clockseq, 2);
71 afs_uuid.clock_seq_low = clockseq;
72 afs_uuid.clock_seq_hi_and_reserved =
73 (clockseq >> 8) & AFS_UUID_CLOCKHI_MASK;
 74 afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD;
75
76 _debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
77 afs_uuid.time_low,
78 afs_uuid.time_mid,
79 afs_uuid.time_hi_and_version,
80 afs_uuid.clock_seq_hi_and_reserved,
81 afs_uuid.clock_seq_low,
82 afs_uuid.node[0], afs_uuid.node[1], afs_uuid.node[2],
83 afs_uuid.node[3], afs_uuid.node[4], afs_uuid.node[5]);
84
85 return 0;
86}
87
66/* 88/*
67 * initialise the AFS client FS module 89 * initialise the AFS client FS module
68 */ 90 */
69static int __init afs_init(void) 91static int __init afs_init(void)
70{ 92{
71 int loop, ret; 93 int ret;
72 94
73 printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); 95 printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
74 96
75 /* initialise the callback hash table */ 97 ret = afs_get_client_UUID();
76 spin_lock_init(&afs_cb_hash_lock); 98 if (ret < 0)
77 for (loop = AFS_CB_HASH_COUNT - 1; loop >= 0; loop--) 99 return ret;
78 INIT_LIST_HEAD(&afs_cb_hash_tbl[loop]);
79 100
80 /* register the /proc stuff */ 101 /* register the /proc stuff */
81 ret = afs_proc_init(); 102 ret = afs_proc_init();
@@ -87,70 +108,56 @@ static int __init afs_init(void)
87 ret = cachefs_register_netfs(&afs_cache_netfs, 108 ret = cachefs_register_netfs(&afs_cache_netfs,
88 &afs_cache_cell_index_def); 109 &afs_cache_cell_index_def);
89 if (ret < 0) 110 if (ret < 0)
90 goto error;
91#endif
92
93#ifdef CONFIG_KEYS_TURNED_OFF
94 ret = afs_key_register();
95 if (ret < 0)
96 goto error_cache; 111 goto error_cache;
97#endif 112#endif
98 113
99 /* initialise the cell DB */ 114 /* initialise the cell DB */
100 ret = afs_cell_init(rootcell); 115 ret = afs_cell_init(rootcell);
101 if (ret < 0) 116 if (ret < 0)
102 goto error_keys; 117 goto error_cell_init;
103 118
104 /* start the timeout daemon */ 119 /* initialise the VL update process */
105 ret = afs_kafstimod_start(); 120 ret = afs_vlocation_update_init();
106 if (ret < 0) 121 if (ret < 0)
107 goto error_keys; 122 goto error_vl_update_init;
108 123
109 /* start the async operation daemon */ 124 /* initialise the callback update process */
110 ret = afs_kafsasyncd_start(); 125 ret = afs_callback_update_init();
111 if (ret < 0)
112 goto error_kafstimod;
113 126
114 /* create the RxRPC transport */ 127 /* create the RxRPC transport */
115 ret = rxrpc_create_transport(7001, &afs_transport); 128 ret = afs_open_socket();
116 if (ret < 0) 129 if (ret < 0)
117 goto error_kafsasyncd; 130 goto error_open_socket;
118
119 afs_transport->peer_ops = &afs_peer_ops;
120 131
121 /* register the filesystems */ 132 /* register the filesystems */
122 ret = afs_fs_init(); 133 ret = afs_fs_init();
123 if (ret < 0) 134 if (ret < 0)
124 goto error_transport; 135 goto error_fs;
125 136
126 return ret; 137 return ret;
127 138
128 error_transport: 139error_fs:
129 rxrpc_put_transport(afs_transport); 140 afs_close_socket();
130 error_kafsasyncd: 141error_open_socket:
131 afs_kafsasyncd_stop(); 142error_vl_update_init:
132 error_kafstimod: 143error_cell_init:
133 afs_kafstimod_stop();
134 error_keys:
135#ifdef CONFIG_KEYS_TURNED_OFF
136 afs_key_unregister();
137 error_cache:
138#endif
139#ifdef AFS_CACHING_SUPPORT 144#ifdef AFS_CACHING_SUPPORT
140 cachefs_unregister_netfs(&afs_cache_netfs); 145 cachefs_unregister_netfs(&afs_cache_netfs);
141 error: 146error_cache:
142#endif 147#endif
148 afs_callback_update_kill();
149 afs_vlocation_purge();
143 afs_cell_purge(); 150 afs_cell_purge();
144 afs_proc_cleanup(); 151 afs_proc_cleanup();
145 printk(KERN_ERR "kAFS: failed to register: %d\n", ret); 152 printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
146 return ret; 153 return ret;
147} /* end afs_init() */ 154}
148 155
149/* XXX late_initcall is kludgy, but the only alternative seems to create 156/* XXX late_initcall is kludgy, but the only alternative seems to create
150 * a transport upon the first mount, which is worse. Or is it? 157 * a transport upon the first mount, which is worse. Or is it?
151 */ 158 */
152late_initcall(afs_init); /* must be called after net/ to create socket */ 159late_initcall(afs_init); /* must be called after net/ to create socket */
153/*****************************************************************************/ 160
154/* 161/*
155 * clean up on module removal 162 * clean up on module removal
156 */ 163 */
@@ -159,127 +166,16 @@ static void __exit afs_exit(void)
159 printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n"); 166 printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n");
160 167
161 afs_fs_exit(); 168 afs_fs_exit();
162 rxrpc_put_transport(afs_transport); 169 afs_close_socket();
163 afs_kafstimod_stop(); 170 afs_purge_servers();
164 afs_kafsasyncd_stop(); 171 afs_callback_update_kill();
172 afs_vlocation_purge();
173 flush_scheduled_work();
165 afs_cell_purge(); 174 afs_cell_purge();
166#ifdef CONFIG_KEYS_TURNED_OFF
167 afs_key_unregister();
168#endif
169#ifdef AFS_CACHING_SUPPORT 175#ifdef AFS_CACHING_SUPPORT
170 cachefs_unregister_netfs(&afs_cache_netfs); 176 cachefs_unregister_netfs(&afs_cache_netfs);
171#endif 177#endif
172 afs_proc_cleanup(); 178 afs_proc_cleanup();
173
174} /* end afs_exit() */
175
176module_exit(afs_exit);
177
178/*****************************************************************************/
179/*
180 * notification that new peer record is being added
181 * - called from krxsecd
182 * - return an error to induce an abort
183 * - mustn't sleep (caller holds an rwlock)
184 */
185static int afs_adding_peer(struct rxrpc_peer *peer)
186{
187 struct afs_server *server;
188 int ret;
189
190 _debug("kAFS: Adding new peer %08x\n", ntohl(peer->addr.s_addr));
191
192 /* determine which server the peer resides in (if any) */
193 ret = afs_server_find_by_peer(peer, &server);
194 if (ret < 0)
195 return ret; /* none that we recognise, so abort */
196
197 _debug("Server %p{u=%d}\n", server, atomic_read(&server->usage));
198
199 _debug("Cell %p{u=%d}\n",
200 server->cell, atomic_read(&server->cell->usage));
201
202 /* cross-point the structs under a global lock */
203 spin_lock(&afs_server_peer_lock);
204 peer->user = server;
205 server->peer = peer;
206 spin_unlock(&afs_server_peer_lock);
207
208 afs_put_server(server);
209
210 return 0;
211} /* end afs_adding_peer() */
212
213/*****************************************************************************/
214/*
215 * notification that a peer record is being discarded
216 * - called from krxiod or krxsecd
217 */
218static void afs_discarding_peer(struct rxrpc_peer *peer)
219{
220 struct afs_server *server;
221
222 _enter("%p",peer);
223
224 _debug("Discarding peer %08x (rtt=%lu.%lumS)\n",
225 ntohl(peer->addr.s_addr),
226 (long) (peer->rtt / 1000),
227 (long) (peer->rtt % 1000));
228
229 /* uncross-point the structs under a global lock */
230 spin_lock(&afs_server_peer_lock);
231 server = peer->user;
232 if (server) {
233 peer->user = NULL;
234 server->peer = NULL;
235 }
236 spin_unlock(&afs_server_peer_lock);
237
238 _leave("");
239
240} /* end afs_discarding_peer() */
241
242/*****************************************************************************/
243/*
244 * clear the dead space between task_struct and kernel stack
245 * - called by supplying -finstrument-functions to gcc
246 */
247#if 0
248void __cyg_profile_func_enter (void *this_fn, void *call_site)
249__attribute__((no_instrument_function));
250
251void __cyg_profile_func_enter (void *this_fn, void *call_site)
252{
253 asm volatile(" movl %%esp,%%edi \n"
254 " andl %0,%%edi \n"
255 " addl %1,%%edi \n"
256 " movl %%esp,%%ecx \n"
257 " subl %%edi,%%ecx \n"
258 " shrl $2,%%ecx \n"
259 " movl $0xedededed,%%eax \n"
260 " rep stosl \n"
261 :
262 : "i"(~(THREAD_SIZE - 1)), "i"(sizeof(struct thread_info))
263 : "eax", "ecx", "edi", "memory", "cc"
264 );
265} 179}
266 180
267void __cyg_profile_func_exit(void *this_fn, void *call_site) 181module_exit(afs_exit);
268__attribute__((no_instrument_function));
269
270void __cyg_profile_func_exit(void *this_fn, void *call_site)
271{
272 asm volatile(" movl %%esp,%%edi \n"
273 " andl %0,%%edi \n"
274 " addl %1,%%edi \n"
275 " movl %%esp,%%ecx \n"
276 " subl %%edi,%%ecx \n"
277 " shrl $2,%%ecx \n"
278 " movl $0xdadadada,%%eax \n"
279 " rep stosl \n"
280 :
281 : "i"(~(THREAD_SIZE - 1)), "i"(sizeof(struct thread_info))
282 : "eax", "ecx", "edi", "memory", "cc"
283 );
284}
285#endif
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index e4fce66d76e0..cdb9792d8161 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -1,6 +1,6 @@
1/* misc.c: miscellaneous bits 1/* miscellaneous bits
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -12,19 +12,20 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include "errors.h"
16#include "internal.h" 15#include "internal.h"
16#include "afs_fs.h"
17 17
18/*****************************************************************************/
19/* 18/*
20 * convert an AFS abort code to a Linux error number 19 * convert an AFS abort code to a Linux error number
21 */ 20 */
22int afs_abort_to_error(int abortcode) 21int afs_abort_to_error(u32 abort_code)
23{ 22{
24 switch (abortcode) { 23 switch (abort_code) {
24 case 13: return -EACCES;
25 case 30: return -EROFS;
25 case VSALVAGE: return -EIO; 26 case VSALVAGE: return -EIO;
26 case VNOVNODE: return -ENOENT; 27 case VNOVNODE: return -ENOENT;
27 case VNOVOL: return -ENXIO; 28 case VNOVOL: return -ENOMEDIUM;
28 case VVOLEXISTS: return -EEXIST; 29 case VVOLEXISTS: return -EEXIST;
29 case VNOSERVICE: return -EIO; 30 case VNOSERVICE: return -EIO;
30 case VOFFLINE: return -ENOENT; 31 case VOFFLINE: return -ENOENT;
@@ -33,7 +34,24 @@ int afs_abort_to_error(int abortcode)
33 case VOVERQUOTA: return -EDQUOT; 34 case VOVERQUOTA: return -EDQUOT;
34 case VBUSY: return -EBUSY; 35 case VBUSY: return -EBUSY;
35 case VMOVED: return -ENXIO; 36 case VMOVED: return -ENXIO;
36 default: return -EIO; 37 case 0x2f6df0c: return -EACCES;
38 case 0x2f6df0f: return -EBUSY;
39 case 0x2f6df10: return -EEXIST;
40 case 0x2f6df11: return -EXDEV;
41 case 0x2f6df13: return -ENOTDIR;
42 case 0x2f6df14: return -EISDIR;
43 case 0x2f6df15: return -EINVAL;
44 case 0x2f6df1a: return -EFBIG;
45 case 0x2f6df1b: return -ENOSPC;
46 case 0x2f6df1d: return -EROFS;
47 case 0x2f6df1e: return -EMLINK;
48 case 0x2f6df20: return -EDOM;
49 case 0x2f6df21: return -ERANGE;
50 case 0x2f6df22: return -EDEADLK;
51 case 0x2f6df23: return -ENAMETOOLONG;
52 case 0x2f6df24: return -ENOLCK;
53 case 0x2f6df26: return -ENOTEMPTY;
54 case 0x2f6df78: return -EDQUOT;
55 default: return -EREMOTEIO;
37 } 56 }
38 57}
39} /* end afs_abort_to_error() */
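afs_abort_to_error() is the single choke point for turning wire abort codes into errnos; the 0x2f6dfxx entries appear to be an extended abort range encoding Unix-style errors. A caller in the RxRPC glue would funnel a received abort straight through it (field names assumed for illustration):

	/* e.g. a VNOVOL abort becomes -ENOMEDIUM, which volume.c
	 * counts in afs_volume.rjservers */
	call->error = afs_abort_to_error(call->abort_code);
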
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 68495f0de7b3..034fcfd4e330 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -1,4 +1,4 @@
1/* mntpt.c: mountpoint management 1/* mountpoint management
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
@@ -18,10 +18,6 @@
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/namei.h> 19#include <linux/namei.h>
20#include <linux/mnt_namespace.h> 20#include <linux/mnt_namespace.h>
21#include "super.h"
22#include "cell.h"
23#include "volume.h"
24#include "vnode.h"
25#include "internal.h" 21#include "internal.h"
26 22
27 23
@@ -30,6 +26,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
30 struct nameidata *nd); 26 struct nameidata *nd);
31static int afs_mntpt_open(struct inode *inode, struct file *file); 27static int afs_mntpt_open(struct inode *inode, struct file *file);
32static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd); 28static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
29static void afs_mntpt_expiry_timed_out(struct work_struct *work);
33 30
34const struct file_operations afs_mntpt_file_operations = { 31const struct file_operations afs_mntpt_file_operations = {
35 .open = afs_mntpt_open, 32 .open = afs_mntpt_open,
@@ -43,24 +40,19 @@ const struct inode_operations afs_mntpt_inode_operations = {
43}; 40};
44 41
45static LIST_HEAD(afs_vfsmounts); 42static LIST_HEAD(afs_vfsmounts);
43static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
46 44
47static void afs_mntpt_expiry_timed_out(struct afs_timer *timer); 45unsigned long afs_mntpt_expiry_timeout = 10 * 60;
48 46
49struct afs_timer_ops afs_mntpt_expiry_timer_ops = {
50 .timed_out = afs_mntpt_expiry_timed_out,
51};
52
53struct afs_timer afs_mntpt_expiry_timer;
54
55unsigned long afs_mntpt_expiry_timeout = 20;
56
57/*****************************************************************************/
58/* 47/*
  * check a symbolic link to see whether it actually encodes a mountpoint
  * - sets the AFS_VNODE_MOUNTPOINT flag on the vnode appropriately
  */
-int afs_mntpt_check_symlink(struct afs_vnode *vnode)
+int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 {
+        struct file file = {
+                .private_data = key,
+        };
         struct page *page;
         size_t size;
         char *buf;
@@ -69,23 +61,21 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
         _enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
 
         /* read the contents of the symlink into the pagecache */
-        page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, NULL);
+        page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file);
         if (IS_ERR(page)) {
                 ret = PTR_ERR(page);
                 goto out;
         }
 
         ret = -EIO;
-        wait_on_page_locked(page);
-        buf = kmap(page);
-        if (!PageUptodate(page))
-                goto out_free;
         if (PageError(page))
                 goto out_free;
 
+        buf = kmap(page);
+
         /* examine the symlink's contents */
         size = vnode->status.size;
-        _debug("symlink to %*.*s", size, (int) size, buf);
+        _debug("symlink to %*.*s", (int) size, (int) size, buf);
 
         if (size > 2 &&
             (buf[0] == '%' || buf[0] == '#') &&
@@ -93,22 +83,20 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
             ) {
                 _debug("symlink is a mountpoint");
                 spin_lock(&vnode->lock);
-                vnode->flags |= AFS_VNODE_MOUNTPOINT;
+                set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
                 spin_unlock(&vnode->lock);
         }
 
         ret = 0;
 
- out_free:
         kunmap(page);
+out_free:
         page_cache_release(page);
- out:
+out:
         _leave(" = %d", ret);
         return ret;
+}
 
-} /* end afs_mntpt_check_symlink() */
-
-/*****************************************************************************/
 /*
  * no valid lookup procedure on this sort of dir
  */
@@ -116,7 +104,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
                                        struct dentry *dentry,
                                        struct nameidata *nd)
 {
-        kenter("%p,%p{%p{%s},%s}",
+        _enter("%p,%p{%p{%s},%s}",
                dir,
                dentry,
                dentry->d_parent,
@@ -125,15 +113,14 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
                dentry->d_name.name);
 
         return ERR_PTR(-EREMOTE);
-} /* end afs_mntpt_lookup() */
+}
 
-/*****************************************************************************/
 /*
  * no valid open procedure on this sort of dir
  */
 static int afs_mntpt_open(struct inode *inode, struct file *file)
 {
-        kenter("%p,%p{%p{%s},%s}",
+        _enter("%p,%p{%p{%s},%s}",
                inode, file,
                file->f_path.dentry->d_parent,
                file->f_path.dentry->d_parent ?
@@ -142,9 +129,8 @@ static int afs_mntpt_open(struct inode *inode, struct file *file)
                file->f_path.dentry->d_name.name);
 
         return -EREMOTE;
-} /* end afs_mntpt_open() */
+}
 
-/*****************************************************************************/
 /*
  * create a vfsmount to be automounted
  */
@@ -157,7 +143,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
         char *buf, *devname = NULL, *options = NULL;
         int ret;
 
-        kenter("{%s}", mntpt->d_name.name);
+        _enter("{%s}", mntpt->d_name.name);
 
         BUG_ON(!mntpt->d_inode);
 
@@ -183,8 +169,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
         }
 
         ret = -EIO;
-        wait_on_page_locked(page);
-        if (!PageUptodate(page) || PageError(page))
+        if (PageError(page))
                 goto error;
 
         buf = kmap(page);
@@ -201,79 +186,108 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
         strcat(options, ",rwpath");
 
         /* try and do the mount */
-        kdebug("--- attempting mount %s -o %s ---", devname, options);
+        _debug("--- attempting mount %s -o %s ---", devname, options);
         mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
-        kdebug("--- mount result %p ---", mnt);
+        _debug("--- mount result %p ---", mnt);
 
         free_page((unsigned long) devname);
         free_page((unsigned long) options);
-        kleave(" = %p", mnt);
+        _leave(" = %p", mnt);
         return mnt;
 
- error:
+error:
         if (page)
                 page_cache_release(page);
         if (devname)
                 free_page((unsigned long) devname);
         if (options)
                 free_page((unsigned long) options);
-        kleave(" = %d", ret);
+        _leave(" = %d", ret);
         return ERR_PTR(ret);
-} /* end afs_mntpt_do_automount() */
+}
 
-/*****************************************************************************/
 /*
  * follow a link from a mountpoint directory, thus causing it to be mounted
  */
 static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
         struct vfsmount *newmnt;
-        struct dentry *old_dentry;
         int err;
 
-        kenter("%p{%s},{%s:%p{%s}}",
+        _enter("%p{%s},{%s:%p{%s},}",
                dentry,
                dentry->d_name.name,
                nd->mnt->mnt_devname,
                dentry,
                nd->dentry->d_name.name);
 
-        newmnt = afs_mntpt_do_automount(dentry);
+        dput(nd->dentry);
+        nd->dentry = dget(dentry);
+
+        newmnt = afs_mntpt_do_automount(nd->dentry);
         if (IS_ERR(newmnt)) {
                 path_release(nd);
                 return (void *)newmnt;
         }
 
-        old_dentry = nd->dentry;
-        nd->dentry = dentry;
-        err = do_add_mount(newmnt, nd, 0, &afs_vfsmounts);
-        nd->dentry = old_dentry;
-
-        path_release(nd);
-
-        if (!err) {
-                mntget(newmnt);
+        mntget(newmnt);
+        err = do_add_mount(newmnt, nd, MNT_SHRINKABLE, &afs_vfsmounts);
+        switch (err) {
+        case 0:
+                mntput(nd->mnt);
+                dput(nd->dentry);
                 nd->mnt = newmnt;
-                dget(newmnt->mnt_root);
-                nd->dentry = newmnt->mnt_root;
+                nd->dentry = dget(newmnt->mnt_root);
+                schedule_delayed_work(&afs_mntpt_expiry_timer,
+                                      afs_mntpt_expiry_timeout * HZ);
+                break;
+        case -EBUSY:
+                /* someone else made a mount here whilst we were busy */
+                while (d_mountpoint(nd->dentry) &&
+                       follow_down(&nd->mnt, &nd->dentry))
+                        ;
+                err = 0;
+        default:
+                mntput(newmnt);
+                break;
         }
 
-        kleave(" = %d", err);
+        _leave(" = %d", err);
         return ERR_PTR(err);
-} /* end afs_mntpt_follow_link() */
+}
 
-/*****************************************************************************/
 /*
  * handle mountpoint expiry timer going off
  */
-static void afs_mntpt_expiry_timed_out(struct afs_timer *timer)
+static void afs_mntpt_expiry_timed_out(struct work_struct *work)
 {
-        kenter("");
+        _enter("");
 
-        mark_mounts_for_expiry(&afs_vfsmounts);
+        if (!list_empty(&afs_vfsmounts)) {
+                mark_mounts_for_expiry(&afs_vfsmounts);
+                schedule_delayed_work(&afs_mntpt_expiry_timer,
+                                      afs_mntpt_expiry_timeout * HZ);
+        }
 
-        afs_kafstimod_add_timer(&afs_mntpt_expiry_timer,
-                                afs_mntpt_expiry_timeout * HZ);
+        _leave("");
+}
 
-        kleave("");
-} /* end afs_mntpt_expiry_timed_out() */
+/*
+ * kill the AFS mountpoint timer if it's still running
+ */
+void afs_mntpt_kill_timer(void)
+{
+        _enter("");
+
+        ASSERT(list_empty(&afs_vfsmounts));
+        cancel_delayed_work(&afs_mntpt_expiry_timer);
+        flush_scheduled_work();
+}
+
+/*
+ * begin unmount by attempting to remove all automounted mountpoints we added
+ */
+void afs_umount_begin(struct vfsmount *vfsmnt, int flags)
+{
+        shrink_submounts(vfsmnt, &afs_vfsmounts);
+}
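The expiry logic above moves from the old kafstimod timer to the generic delayed-work API: afs_mntpt_expiry_timed_out() re-arms itself only while afs_vfsmounts is non-empty, and afs_mntpt_kill_timer() cancels the pending work and flushes any instance already running. A minimal sketch of that self-rearming pattern, with invented example_* names and a stand-in condition (kernel-style, not tied to any particular tree):

    #include <linux/workqueue.h>

    static int example_have_mounts;         /* stand-in for !list_empty(...) */

    static void example_timed_out(struct work_struct *work);
    static DECLARE_DELAYED_WORK(example_timer, example_timed_out);

    static void example_timed_out(struct work_struct *work)
    {
            /* do the periodic scan, then re-arm only while work remains */
            if (example_have_mounts)
                    schedule_delayed_work(&example_timer, 10 * HZ);
    }

    static void example_shutdown(void)
    {
            /* stop a pending timer and wait out one that already started */
            cancel_delayed_work(&example_timer);
            flush_scheduled_work();
    }

Re-arming from inside the handler, rather than using a free-running periodic timer, guarantees that after cancel+flush no further instance can be queued.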
diff --git a/fs/afs/mount.h b/fs/afs/mount.h
deleted file mode 100644
index 9d2f46ec549f..000000000000
--- a/fs/afs/mount.h
+++ /dev/null
@@ -1,23 +0,0 @@
1/* mount.h: mount parameters
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_MOUNT_H
13#define _LINUX_AFS_MOUNT_H
14
15struct afs_mountdata {
16 const char *volume; /* name of volume */
17 const char *cell; /* name of cell containing volume */
18 const char *cache; /* name of cache block device */
19 size_t nservers; /* number of server addresses listed */
20 uint32_t servers[10]; /* IP addresses of servers in this cell */
21};
22
23#endif /* _LINUX_AFS_MOUNT_H */
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
new file mode 100644
index 000000000000..fc27d4b52e5f
--- /dev/null
+++ b/fs/afs/netdevices.c
@@ -0,0 +1,68 @@
1/* AFS network device helpers
2 *
3 * Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
4 */
5
6#include <linux/string.h>
7#include <linux/rtnetlink.h>
8#include <linux/inetdevice.h>
9#include <linux/netdevice.h>
10#include <linux/if_arp.h>
11#include "internal.h"
12
13/*
14 * get a MAC address from a random ethernet interface that has a real one
15 * - the buffer will normally be 6 bytes in size
16 */
17int afs_get_MAC_address(u8 *mac, size_t maclen)
18{
19 struct net_device *dev;
20 int ret = -ENODEV;
21
22 if (maclen != ETH_ALEN)
23 BUG();
24
25 rtnl_lock();
26 dev = __dev_getfirstbyhwtype(ARPHRD_ETHER);
27 if (dev) {
28 memcpy(mac, dev->dev_addr, maclen);
29 ret = 0;
30 }
31 rtnl_unlock();
32 return ret;
33}
34
35/*
36 * get a list of this system's interface IPv4 addresses, netmasks and MTUs
37 * - maxbufs must be at least 1
38 * - returns the number of interface records in the buffer
39 */
40int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs,
41 bool wantloopback)
42{
43 struct net_device *dev;
44 struct in_device *idev;
45 int n = 0;
46
47 ASSERT(maxbufs > 0);
48
49 rtnl_lock();
50 for_each_netdev(dev) {
51 if (dev->type == ARPHRD_LOOPBACK && !wantloopback)
52 continue;
53 idev = __in_dev_get_rtnl(dev);
54 if (!idev)
55 continue;
56 for_primary_ifa(idev) {
57 bufs[n].address.s_addr = ifa->ifa_address;
58 bufs[n].netmask.s_addr = ifa->ifa_mask;
59 bufs[n].mtu = dev->mtu;
60 n++;
61 if (n >= maxbufs)
62 goto out;
63 } endfor_ifa(idev);
64 }
65out:
66 rtnl_unlock();
67 return n;
68}
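A caller of afs_get_ipv4_interfaces() supplies the buffer and treats the return value as the record count. A sketch of a plausible consumer (the buffer size and the debug line are illustrative, not taken from the source):

    struct afs_interface bufs[32];
    int loop, n;

    n = afs_get_ipv4_interfaces(bufs, ARRAY_SIZE(bufs), false);
    for (loop = 0; loop < n; loop++)
            _debug("iface %d: addr %08x mask %08x mtu %u",
                   loop,
                   ntohl(bufs[loop].address.s_addr),
                   ntohl(bufs[loop].netmask.s_addr),
                   bufs[loop].mtu);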
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index ae6b85b1e484..d5601f617cdb 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -1,4 +1,4 @@
-/* proc.c: /proc interface for AFS
+/* /proc interface for AFS
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -13,8 +13,6 @@
 #include <linux/module.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include "cell.h"
-#include "volume.h"
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -130,7 +128,6 @@ static const struct file_operations afs_proc_cell_servers_fops = {
         .release = afs_proc_cell_servers_release,
 };
 
-/*****************************************************************************/
 /*
  * initialise the /proc/fs/afs/ directory
  */
@@ -142,47 +139,43 @@ int afs_proc_init(void)
 
         proc_afs = proc_mkdir("fs/afs", NULL);
         if (!proc_afs)
-                goto error;
+                goto error_dir;
         proc_afs->owner = THIS_MODULE;
 
         p = create_proc_entry("cells", 0, proc_afs);
         if (!p)
-                goto error_proc;
+                goto error_cells;
         p->proc_fops = &afs_proc_cells_fops;
         p->owner = THIS_MODULE;
 
         p = create_proc_entry("rootcell", 0, proc_afs);
         if (!p)
-                goto error_cells;
+                goto error_rootcell;
         p->proc_fops = &afs_proc_rootcell_fops;
         p->owner = THIS_MODULE;
 
         _leave(" = 0");
         return 0;
 
- error_cells:
+error_rootcell:
         remove_proc_entry("cells", proc_afs);
- error_proc:
+error_cells:
         remove_proc_entry("fs/afs", NULL);
- error:
+error_dir:
         _leave(" = -ENOMEM");
         return -ENOMEM;
+}
 
-} /* end afs_proc_init() */
-
-/*****************************************************************************/
 /*
  * clean up the /proc/fs/afs/ directory
  */
 void afs_proc_cleanup(void)
 {
+        remove_proc_entry("rootcell", proc_afs);
         remove_proc_entry("cells", proc_afs);
-
         remove_proc_entry("fs/afs", NULL);
+}
 
-} /* end afs_proc_cleanup() */
-
-/*****************************************************************************/
 /*
  * open "/proc/fs/afs/cells" which provides a summary of extant cells
  */
@@ -199,9 +192,8 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
         m->private = PDE(inode)->data;
 
         return 0;
-} /* end afs_proc_cells_open() */
+}
 
-/*****************************************************************************/
 /*
  * set up the iterator to start reading from the cells list and return the
  * first item
@@ -225,9 +217,8 @@ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
                 break;
 
         return _p != &afs_proc_cells ? _p : NULL;
-} /* end afs_proc_cells_start() */
+}
 
-/*****************************************************************************/
 /*
  * move to next cell in cells list
  */
@@ -241,19 +232,16 @@ static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos)
         _p = v == (void *) 1 ? afs_proc_cells.next : _p->next;
 
         return _p != &afs_proc_cells ? _p : NULL;
-} /* end afs_proc_cells_next() */
+}
 
-/*****************************************************************************/
 /*
  * clean up after reading from the cells list
  */
 static void afs_proc_cells_stop(struct seq_file *p, void *v)
 {
         up_read(&afs_proc_cells_sem);
+}
 
-} /* end afs_proc_cells_stop() */
-
-/*****************************************************************************/
 /*
  * display a header line followed by a load of cell lines
  */
@@ -261,19 +249,18 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
 {
         struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
 
-        /* display header on line 1 */
         if (v == (void *) 1) {
+                /* display header on line 1 */
                 seq_puts(m, "USE NAME\n");
                 return 0;
         }
 
         /* display one cell per line on subsequent lines */
-        seq_printf(m, "%3d %s\n", atomic_read(&cell->usage), cell->name);
-
+        seq_printf(m, "%3d %s\n",
                    atomic_read(&cell->usage), cell->name);
         return 0;
-} /* end afs_proc_cells_show() */
+}
 
-/*****************************************************************************/
 /*
  * handle writes to /proc/fs/afs/cells
  * - to add cells: echo "add <cellname> <IP>[:<IP>][:<IP>]"
@@ -326,30 +313,32 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
 
         if (strcmp(kbuf, "add") == 0) {
                 struct afs_cell *cell;
-                ret = afs_cell_create(name, args, &cell);
-                if (ret < 0)
+
+                cell = afs_cell_create(name, args);
+                if (IS_ERR(cell)) {
+                        ret = PTR_ERR(cell);
                         goto done;
+                }
 
+                afs_put_cell(cell);
                 printk("kAFS: Added new cell '%s'\n", name);
-        }
-        else {
+        } else {
                 goto inval;
         }
 
         ret = size;
 
- done:
+done:
         kfree(kbuf);
         _leave(" = %d", ret);
         return ret;
 
- inval:
+inval:
         ret = -EINVAL;
         printk("kAFS: Invalid Command on /proc/fs/afs/cells file\n");
         goto done;
-} /* end afs_proc_cells_write() */
+}
 
-/*****************************************************************************/
 /*
  * Stubs for /proc/fs/afs/rootcell
  */
@@ -369,7 +358,6 @@ static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
         return 0;
 }
 
-/*****************************************************************************/
 /*
  * handle writes to /proc/fs/afs/rootcell
  * - to initialize rootcell: echo "cell.name:192.168.231.14"
@@ -407,14 +395,13 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
         if (ret >= 0)
                 ret = size;     /* consume everything, always */
 
- infault:
+infault:
         kfree(kbuf);
- nomem:
+nomem:
         _leave(" = %d", ret);
         return ret;
-} /* end afs_proc_rootcell_write() */
+}
 
-/*****************************************************************************/
 /*
  * initialise /proc/fs/afs/<cell>/
  */
@@ -426,25 +413,25 @@ int afs_proc_cell_setup(struct afs_cell *cell)
 
         cell->proc_dir = proc_mkdir(cell->name, proc_afs);
         if (!cell->proc_dir)
-                return -ENOMEM;
+                goto error_dir;
 
         p = create_proc_entry("servers", 0, cell->proc_dir);
         if (!p)
-                goto error_proc;
+                goto error_servers;
         p->proc_fops = &afs_proc_cell_servers_fops;
         p->owner = THIS_MODULE;
         p->data = cell;
 
         p = create_proc_entry("vlservers", 0, cell->proc_dir);
         if (!p)
-                goto error_servers;
+                goto error_vlservers;
         p->proc_fops = &afs_proc_cell_vlservers_fops;
         p->owner = THIS_MODULE;
         p->data = cell;
 
         p = create_proc_entry("volumes", 0, cell->proc_dir);
         if (!p)
-                goto error_vlservers;
+                goto error_volumes;
         p->proc_fops = &afs_proc_cell_volumes_fops;
         p->owner = THIS_MODULE;
         p->data = cell;
@@ -452,17 +439,17 @@ int afs_proc_cell_setup(struct afs_cell *cell)
         _leave(" = 0");
         return 0;
 
- error_vlservers:
+error_volumes:
         remove_proc_entry("vlservers", cell->proc_dir);
- error_servers:
+error_vlservers:
         remove_proc_entry("servers", cell->proc_dir);
- error_proc:
+error_servers:
         remove_proc_entry(cell->name, proc_afs);
+error_dir:
         _leave(" = -ENOMEM");
         return -ENOMEM;
-} /* end afs_proc_cell_setup() */
+}
 
-/*****************************************************************************/
 /*
  * remove /proc/fs/afs/<cell>/
  */
@@ -476,9 +463,8 @@ void afs_proc_cell_remove(struct afs_cell *cell)
         remove_proc_entry(cell->name, proc_afs);
 
         _leave("");
-} /* end afs_proc_cell_remove() */
+}
 
-/*****************************************************************************/
 /*
  * open "/proc/fs/afs/<cell>/volumes" which provides a summary of extant cells
  */
@@ -488,7 +474,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
         struct seq_file *m;
         int ret;
 
-        cell = afs_get_cell_maybe((struct afs_cell **) &PDE(inode)->data);
+        cell = PDE(inode)->data;
         if (!cell)
                 return -ENOENT;
 
@@ -500,25 +486,16 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
         m->private = cell;
 
         return 0;
-} /* end afs_proc_cell_volumes_open() */
+}
 
-/*****************************************************************************/
 /*
  * close the file and release the ref to the cell
  */
 static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file)
 {
-        struct afs_cell *cell = PDE(inode)->data;
-        int ret;
-
-        ret = seq_release(inode,file);
-
-        afs_put_cell(cell);
-
-        return ret;
-} /* end afs_proc_cell_volumes_release() */
+        return seq_release(inode, file);
+}
 
-/*****************************************************************************/
 /*
  * set up the iterator to start reading from the cells list and return the
  * first item
@@ -545,9 +522,8 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
                 break;
 
         return _p != &cell->vl_list ? _p : NULL;
-} /* end afs_proc_cell_volumes_start() */
+}
 
-/*****************************************************************************/
 /*
  * move to next cell in cells list
  */
@@ -562,12 +538,11 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
         (*_pos)++;
 
         _p = v;
-        _p = v == (void *) 1 ? cell->vl_list.next : _p->next;
+        _p = (v == (void *) 1) ? cell->vl_list.next : _p->next;
 
-        return _p != &cell->vl_list ? _p : NULL;
-} /* end afs_proc_cell_volumes_next() */
+        return (_p != &cell->vl_list) ? _p : NULL;
+}
 
-/*****************************************************************************/
 /*
  * clean up after reading from the cells list
  */
@@ -576,10 +551,18 @@ static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v)
         struct afs_cell *cell = p->private;
 
         up_read(&cell->vl_sem);
+}
 
-} /* end afs_proc_cell_volumes_stop() */
+const char afs_vlocation_states[][4] = {
+        [AFS_VL_NEW]            = "New",
+        [AFS_VL_CREATING]       = "Crt",
+        [AFS_VL_VALID]          = "Val",
+        [AFS_VL_NO_VOLUME]      = "NoV",
+        [AFS_VL_UPDATING]       = "Upd",
+        [AFS_VL_VOLUME_DELETED] = "Del",
+        [AFS_VL_UNCERTAIN]      = "Unc",
+};
 
-/*****************************************************************************/
 /*
  * display a header line followed by a load of volume lines
  */
@@ -590,23 +573,22 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
 
         /* display header on line 1 */
         if (v == (void *) 1) {
-                seq_puts(m, "USE VLID[0] VLID[1] VLID[2] NAME\n");
+                seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n");
                 return 0;
         }
 
         /* display one cell per line on subsequent lines */
-        seq_printf(m, "%3d %08x %08x %08x %s\n",
+        seq_printf(m, "%3d %s %08x %08x %08x %s\n",
                    atomic_read(&vlocation->usage),
+                   afs_vlocation_states[vlocation->state],
                    vlocation->vldb.vid[0],
                    vlocation->vldb.vid[1],
                    vlocation->vldb.vid[2],
-                   vlocation->vldb.name
-                   );
+                   vlocation->vldb.name);
 
         return 0;
-} /* end afs_proc_cell_volumes_show() */
+}
 
-/*****************************************************************************/
 /*
  * open "/proc/fs/afs/<cell>/vlservers" which provides a list of volume
  * location server
@@ -617,11 +599,11 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
         struct seq_file *m;
         int ret;
 
-        cell = afs_get_cell_maybe((struct afs_cell**)&PDE(inode)->data);
+        cell = PDE(inode)->data;
         if (!cell)
                 return -ENOENT;
 
-        ret = seq_open(file,&afs_proc_cell_vlservers_ops);
+        ret = seq_open(file, &afs_proc_cell_vlservers_ops);
         if (ret<0)
                 return ret;
 
@@ -629,26 +611,17 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
         m->private = cell;
 
         return 0;
-} /* end afs_proc_cell_vlservers_open() */
+}
 
-/*****************************************************************************/
 /*
  * close the file and release the ref to the cell
  */
 static int afs_proc_cell_vlservers_release(struct inode *inode,
                                            struct file *file)
 {
-        struct afs_cell *cell = PDE(inode)->data;
-        int ret;
-
-        ret = seq_release(inode,file);
-
-        afs_put_cell(cell);
-
-        return ret;
-} /* end afs_proc_cell_vlservers_release() */
+        return seq_release(inode, file);
+}
 
-/*****************************************************************************/
 /*
  * set up the iterator to start reading from the cells list and return the
  * first item
@@ -672,9 +645,8 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
                 return NULL;
 
         return &cell->vl_addrs[pos];
-} /* end afs_proc_cell_vlservers_start() */
+}
 
-/*****************************************************************************/
 /*
  * move to next cell in cells list
  */
@@ -692,9 +664,8 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
                 return NULL;
 
         return &cell->vl_addrs[pos];
-} /* end afs_proc_cell_vlservers_next() */
+}
 
-/*****************************************************************************/
 /*
  * clean up after reading from the cells list
  */
@@ -703,10 +674,8 @@ static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v)
         struct afs_cell *cell = p->private;
 
         up_read(&cell->vl_sem);
+}
 
-} /* end afs_proc_cell_vlservers_stop() */
-
-/*****************************************************************************/
 /*
  * display a header line followed by a load of volume lines
  */
@@ -722,11 +691,9 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
 
         /* display one cell per line on subsequent lines */
         seq_printf(m, "%u.%u.%u.%u\n", NIPQUAD(addr->s_addr));
-
         return 0;
-} /* end afs_proc_cell_vlservers_show() */
+}
 
-/*****************************************************************************/
 /*
  * open "/proc/fs/afs/<cell>/servers" which provides a summary of active
  * servers
@@ -737,7 +704,7 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
         struct seq_file *m;
         int ret;
 
-        cell = afs_get_cell_maybe((struct afs_cell **) &PDE(inode)->data);
+        cell = PDE(inode)->data;
         if (!cell)
                 return -ENOENT;
 
@@ -747,34 +714,24 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
 
         m = file->private_data;
         m->private = cell;
-
         return 0;
-} /* end afs_proc_cell_servers_open() */
+}
 
-/*****************************************************************************/
 /*
  * close the file and release the ref to the cell
  */
 static int afs_proc_cell_servers_release(struct inode *inode,
                                          struct file *file)
 {
-        struct afs_cell *cell = PDE(inode)->data;
-        int ret;
-
-        ret = seq_release(inode, file);
-
-        afs_put_cell(cell);
-
-        return ret;
-} /* end afs_proc_cell_servers_release() */
+        return seq_release(inode, file);
+}
 
-/*****************************************************************************/
 /*
  * set up the iterator to start reading from the cells list and return the
  * first item
  */
 static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
-        __acquires(m->private->sv_lock)
+        __acquires(m->private->servers_lock)
 {
         struct list_head *_p;
         struct afs_cell *cell = m->private;
@@ -783,7 +740,7 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
         _enter("cell=%p pos=%Ld", cell, *_pos);
 
         /* lock the list against modification */
-        read_lock(&cell->sv_lock);
+        read_lock(&cell->servers_lock);
 
         /* allow for the header line */
         if (!pos)
@@ -791,14 +748,13 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
         pos--;
 
         /* find the n'th element in the list */
-        list_for_each(_p, &cell->sv_list)
+        list_for_each(_p, &cell->servers)
                 if (!pos--)
                         break;
 
-        return _p != &cell->sv_list ? _p : NULL;
-} /* end afs_proc_cell_servers_start() */
+        return _p != &cell->servers ? _p : NULL;
+}
 
-/*****************************************************************************/
 /*
  * move to next cell in cells list
  */
@@ -813,25 +769,22 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
         (*_pos)++;
 
         _p = v;
-        _p = v == (void *) 1 ? cell->sv_list.next : _p->next;
+        _p = v == (void *) 1 ? cell->servers.next : _p->next;
 
-        return _p != &cell->sv_list ? _p : NULL;
-} /* end afs_proc_cell_servers_next() */
+        return _p != &cell->servers ? _p : NULL;
+}
 
-/*****************************************************************************/
 /*
  * clean up after reading from the cells list
  */
 static void afs_proc_cell_servers_stop(struct seq_file *p, void *v)
-        __releases(p->private->sv_lock)
+        __releases(p->private->servers_lock)
 {
         struct afs_cell *cell = p->private;
 
-        read_unlock(&cell->sv_lock);
-
-} /* end afs_proc_cell_servers_stop() */
+        read_unlock(&cell->servers_lock);
+}
 
-/*****************************************************************************/
 /*
  * display a header line followed by a load of volume lines
  */
@@ -849,10 +802,7 @@ static int afs_proc_cell_servers_show(struct seq_file *m, void *v)
         /* display one cell per line on subsequent lines */
         sprintf(ipaddr, "%u.%u.%u.%u", NIPQUAD(server->addr));
         seq_printf(m, "%3d %-15.15s %5d\n",
-                   atomic_read(&server->usage),
-                   ipaddr,
-                   server->fs_state
-                   );
+                   atomic_read(&server->usage), ipaddr, server->fs_state);
 
         return 0;
-} /* end afs_proc_cell_servers_show() */
+}
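All of the /proc files above follow the same seq_file recipe: a seq_operations table drives iteration, open() attaches it with seq_open() and stashes the subject in seq_file::private, and the generic seq_* helpers do the rest. Wiring the cells file, for instance, comes down to something like this (the table shapes are reconstructed for illustration from the handlers shown above, not quoted from the patch):

    static const struct seq_operations afs_proc_cells_ops = {
            .start  = afs_proc_cells_start, /* take the lock, find n'th item */
            .next   = afs_proc_cells_next,  /* advance; NULL ends the walk */
            .stop   = afs_proc_cells_stop,  /* drop the lock */
            .show   = afs_proc_cells_show,  /* format one item (or header) */
    };

    static const struct file_operations afs_proc_cells_fops = {
            .open           = afs_proc_cells_open,  /* seq_open() + private */
            .read           = seq_read,
            .llseek         = seq_lseek,
            .release        = seq_release,
            .write          = afs_proc_cells_write,
    };

Because the open routines now take PDE(inode)->data directly instead of afs_get_cell_maybe(), the release routines no longer need to drop a cell reference, which is why they collapse to plain seq_release().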
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
new file mode 100644
index 000000000000..222c1a3abbb8
--- /dev/null
+++ b/fs/afs/rxrpc.c
@@ -0,0 +1,782 @@
1/* Maintain an RxRPC server socket to do AFS communications through
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <net/sock.h>
13#include <net/af_rxrpc.h>
14#include <rxrpc/packet.h>
15#include "internal.h"
16#include "afs_cm.h"
17
18static struct socket *afs_socket; /* my RxRPC socket */
19static struct workqueue_struct *afs_async_calls;
20static atomic_t afs_outstanding_calls;
21static atomic_t afs_outstanding_skbs;
22
23static void afs_wake_up_call_waiter(struct afs_call *);
24static int afs_wait_for_call_to_complete(struct afs_call *);
25static void afs_wake_up_async_call(struct afs_call *);
26static int afs_dont_wait_for_call_to_complete(struct afs_call *);
27static void afs_process_async_call(struct work_struct *);
28static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
29static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
30
31/* synchronous call management */
32const struct afs_wait_mode afs_sync_call = {
33 .rx_wakeup = afs_wake_up_call_waiter,
34 .wait = afs_wait_for_call_to_complete,
35};
36
37/* asynchronous call management */
38const struct afs_wait_mode afs_async_call = {
39 .rx_wakeup = afs_wake_up_async_call,
40 .wait = afs_dont_wait_for_call_to_complete,
41};
42
43/* asynchronous incoming call management */
44static const struct afs_wait_mode afs_async_incoming_call = {
45 .rx_wakeup = afs_wake_up_async_call,
46};
47
48/* asynchronous incoming call initial processing */
49static const struct afs_call_type afs_RXCMxxxx = {
50 .name = "CB.xxxx",
51 .deliver = afs_deliver_cm_op_id,
52 .abort_to_error = afs_abort_to_error,
53};
54
55static void afs_collect_incoming_call(struct work_struct *);
56
57static struct sk_buff_head afs_incoming_calls;
58static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
59
60/*
61 * open an RxRPC socket and bind it to be a server for callback notifications
62 * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
63 */
64int afs_open_socket(void)
65{
66 struct sockaddr_rxrpc srx;
67 struct socket *socket;
68 int ret;
69
70 _enter("");
71
72 skb_queue_head_init(&afs_incoming_calls);
73
74 afs_async_calls = create_singlethread_workqueue("kafsd");
75 if (!afs_async_calls) {
76 _leave(" = -ENOMEM [wq]");
77 return -ENOMEM;
78 }
79
80 ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
81 if (ret < 0) {
82 destroy_workqueue(afs_async_calls);
83 _leave(" = %d [socket]", ret);
84 return ret;
85 }
86
87 socket->sk->sk_allocation = GFP_NOFS;
88
89 /* bind the callback manager's address to make this a server socket */
90 srx.srx_family = AF_RXRPC;
91 srx.srx_service = CM_SERVICE;
92 srx.transport_type = SOCK_DGRAM;
93 srx.transport_len = sizeof(srx.transport.sin);
94 srx.transport.sin.sin_family = AF_INET;
95 srx.transport.sin.sin_port = htons(AFS_CM_PORT);
96 memset(&srx.transport.sin.sin_addr, 0,
97 sizeof(srx.transport.sin.sin_addr));
98
99 ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
100 if (ret < 0) {
101 sock_release(socket);
102 _leave(" = %d [bind]", ret);
103 return ret;
104 }
105
106 rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor);
107
108 afs_socket = socket;
109 _leave(" = 0");
110 return 0;
111}
112
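afs_open_socket() is the standard in-kernel server-socket recipe: sock_create_kern(), fill in a transport address, kernel_bind(), and sock_release() on failure. The same shape for an ordinary UDP socket, reduced to essentials (function name and port are invented; a sketch, not buildable against any particular tree):

    static int example_open_udp_socket(struct socket **_sock)
    {
            struct sockaddr_in sin;
            struct socket *sock;
            int ret;

            ret = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
            if (ret < 0)
                    return ret;

            memset(&sin, 0, sizeof(sin));
            sin.sin_family = AF_INET;
            sin.sin_port = htons(7001);     /* illustrative port */

            ret = kernel_bind(sock, (struct sockaddr *) &sin, sizeof(sin));
            if (ret < 0) {
                    sock_release(sock);
                    return ret;
            }

            *_sock = sock;
            return 0;
    }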
113/*
114 * close the RxRPC socket AFS was using
115 */
116void afs_close_socket(void)
117{
118 _enter("");
119
120 sock_release(afs_socket);
121
122 _debug("dework");
123 destroy_workqueue(afs_async_calls);
124
125 ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0);
126 ASSERTCMP(atomic_read(&afs_outstanding_calls), ==, 0);
127 _leave("");
128}
129
130/*
131 * note that the data in a socket buffer is now delivered and that the buffer
132 * should be freed
133 */
134static void afs_data_delivered(struct sk_buff *skb)
135{
136 if (!skb) {
137 _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs));
138 dump_stack();
139 } else {
140 _debug("DLVR %p{%u} [%d]",
141 skb, skb->mark, atomic_read(&afs_outstanding_skbs));
142 if (atomic_dec_return(&afs_outstanding_skbs) == -1)
143 BUG();
144 rxrpc_kernel_data_delivered(skb);
145 }
146}
147
148/*
149 * free a socket buffer
150 */
151static void afs_free_skb(struct sk_buff *skb)
152{
153 if (!skb) {
154 _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs));
155 dump_stack();
156 } else {
157 _debug("FREE %p{%u} [%d]",
158 skb, skb->mark, atomic_read(&afs_outstanding_skbs));
159 if (atomic_dec_return(&afs_outstanding_skbs) == -1)
160 BUG();
161 rxrpc_kernel_free_skb(skb);
162 }
163}
164
165/*
166 * free a call
167 */
168static void afs_free_call(struct afs_call *call)
169{
170 _debug("DONE %p{%s} [%d]",
171 call, call->type->name, atomic_read(&afs_outstanding_calls));
172 if (atomic_dec_return(&afs_outstanding_calls) == -1)
173 BUG();
174
175 ASSERTCMP(call->rxcall, ==, NULL);
176 ASSERT(!work_pending(&call->async_work));
177 ASSERT(skb_queue_empty(&call->rx_queue));
178 ASSERT(call->type->name != NULL);
179
180 kfree(call->request);
181 kfree(call);
182}
183
184/*
185 * allocate a call with flat request and reply buffers
186 */
187struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
188 size_t request_size, size_t reply_size)
189{
190 struct afs_call *call;
191
192 call = kzalloc(sizeof(*call), GFP_NOFS);
193 if (!call)
194 goto nomem_call;
195
196 _debug("CALL %p{%s} [%d]",
197 call, type->name, atomic_read(&afs_outstanding_calls));
198 atomic_inc(&afs_outstanding_calls);
199
200 call->type = type;
201 call->request_size = request_size;
202 call->reply_max = reply_size;
203
204 if (request_size) {
205 call->request = kmalloc(request_size, GFP_NOFS);
206 if (!call->request)
207 goto nomem_free;
208 }
209
210 if (reply_size) {
211 call->buffer = kmalloc(reply_size, GFP_NOFS);
212 if (!call->buffer)
213 goto nomem_free;
214 }
215
216 init_waitqueue_head(&call->waitq);
217 skb_queue_head_init(&call->rx_queue);
218 return call;
219
220nomem_free:
221 afs_free_call(call);
222nomem_call:
223 return NULL;
224}
225
226/*
227 * clean up a call with flat buffer
228 */
229void afs_flat_call_destructor(struct afs_call *call)
230{
231 _enter("");
232
233 kfree(call->request);
234 call->request = NULL;
235 kfree(call->buffer);
236 call->buffer = NULL;
237}
238
239/*
240 * initiate a call
241 */
242int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
243 const struct afs_wait_mode *wait_mode)
244{
245 struct sockaddr_rxrpc srx;
246 struct rxrpc_call *rxcall;
247 struct msghdr msg;
248 struct kvec iov[1];
249 int ret;
250
251 _enter("%x,{%d},", addr->s_addr, ntohs(call->port));
252
253 ASSERT(call->type != NULL);
254 ASSERT(call->type->name != NULL);
255
256 _debug("MAKE %p{%s} [%d]",
257 call, call->type->name, atomic_read(&afs_outstanding_calls));
258
259 call->wait_mode = wait_mode;
260 INIT_WORK(&call->async_work, afs_process_async_call);
261
262 memset(&srx, 0, sizeof(srx));
263 srx.srx_family = AF_RXRPC;
264 srx.srx_service = call->service_id;
265 srx.transport_type = SOCK_DGRAM;
266 srx.transport_len = sizeof(srx.transport.sin);
267 srx.transport.sin.sin_family = AF_INET;
268 srx.transport.sin.sin_port = call->port;
269 memcpy(&srx.transport.sin.sin_addr, addr, 4);
270
271 /* create a call */
272 rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key,
273 (unsigned long) call, gfp);
274 call->key = NULL;
275 if (IS_ERR(rxcall)) {
276 ret = PTR_ERR(rxcall);
277 goto error_kill_call;
278 }
279
280 call->rxcall = rxcall;
281
282 /* send the request */
283 iov[0].iov_base = call->request;
284 iov[0].iov_len = call->request_size;
285
286 msg.msg_name = NULL;
287 msg.msg_namelen = 0;
288 msg.msg_iov = (struct iovec *) iov;
289 msg.msg_iovlen = 1;
290 msg.msg_control = NULL;
291 msg.msg_controllen = 0;
292 msg.msg_flags = 0;
293
294 /* have to change the state *before* sending the last packet as RxRPC
295 * might give us the reply before it returns from sending the
296 * request */
297 call->state = AFS_CALL_AWAIT_REPLY;
298 ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size);
299 if (ret < 0)
300 goto error_do_abort;
301
302 /* at this point, an async call may no longer exist as it may have
303 * already completed */
304 return wait_mode->wait(call);
305
306error_do_abort:
307 rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
308 rxrpc_kernel_end_call(rxcall);
309 call->rxcall = NULL;
310error_kill_call:
311 call->type->destructor(call);
312 afs_free_call(call);
313 _leave(" = %d", ret);
314 return ret;
315}
316
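The iovec/msghdr assembly in afs_make_call() is the usual in-kernel sendmsg setup. On an ordinary socket the equivalent is often written with kernel_sendmsg(), which points the msghdr at a kvec array before calling the socket's sendmsg operation; a sketch (the function name is invented):

    static int example_send(struct socket *sock, const void *buf, size_t len)
    {
            struct msghdr msg;
            struct kvec iov;

            memset(&msg, 0, sizeof(msg));
            iov.iov_base = (void *) buf;
            iov.iov_len = len;

            /* attaches the kvec to msg and calls the socket's sendmsg op */
            return kernel_sendmsg(sock, &msg, &iov, 1, len);
    }

Note the ordering comment in afs_make_call() itself: the call state must be advanced to AFS_CALL_AWAIT_REPLY before the final send, because the reply may be delivered before rxrpc_kernel_send_data() returns.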
317/*
318 * handles intercepted messages that were arriving in the socket's Rx queue
319 * - called with the socket receive queue lock held to ensure message ordering
320 * - called with softirqs disabled
321 */
322static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
323 struct sk_buff *skb)
324{
325 struct afs_call *call = (struct afs_call *) user_call_ID;
326
327 _enter("%p,,%u", call, skb->mark);
328
329 _debug("ICPT %p{%u} [%d]",
330 skb, skb->mark, atomic_read(&afs_outstanding_skbs));
331
332 ASSERTCMP(sk, ==, afs_socket->sk);
333 atomic_inc(&afs_outstanding_skbs);
334
335 if (!call) {
	336		/* it's an incoming call for our callback service */
337 skb_queue_tail(&afs_incoming_calls, skb);
338 schedule_work(&afs_collect_incoming_call_work);
339 } else {
340 /* route the messages directly to the appropriate call */
341 skb_queue_tail(&call->rx_queue, skb);
342 call->wait_mode->rx_wakeup(call);
343 }
344
345 _leave("");
346}
347
348/*
349 * deliver messages to a call
350 */
351static void afs_deliver_to_call(struct afs_call *call)
352{
353 struct sk_buff *skb;
354 bool last;
355 u32 abort_code;
356 int ret;
357
358 _enter("");
359
360 while ((call->state == AFS_CALL_AWAIT_REPLY ||
361 call->state == AFS_CALL_AWAIT_OP_ID ||
362 call->state == AFS_CALL_AWAIT_REQUEST ||
363 call->state == AFS_CALL_AWAIT_ACK) &&
364 (skb = skb_dequeue(&call->rx_queue))) {
365 switch (skb->mark) {
366 case RXRPC_SKB_MARK_DATA:
367 _debug("Rcv DATA");
368 last = rxrpc_kernel_is_data_last(skb);
369 ret = call->type->deliver(call, skb, last);
370 switch (ret) {
371 case 0:
372 if (last &&
373 call->state == AFS_CALL_AWAIT_REPLY)
374 call->state = AFS_CALL_COMPLETE;
375 break;
376 case -ENOTCONN:
377 abort_code = RX_CALL_DEAD;
378 goto do_abort;
379 case -ENOTSUPP:
380 abort_code = RX_INVALID_OPERATION;
381 goto do_abort;
382 default:
383 abort_code = RXGEN_CC_UNMARSHAL;
384 if (call->state != AFS_CALL_AWAIT_REPLY)
385 abort_code = RXGEN_SS_UNMARSHAL;
386 do_abort:
387 rxrpc_kernel_abort_call(call->rxcall,
388 abort_code);
389 call->error = ret;
390 call->state = AFS_CALL_ERROR;
391 break;
392 }
393 afs_data_delivered(skb);
394 skb = NULL;
395 continue;
396 case RXRPC_SKB_MARK_FINAL_ACK:
397 _debug("Rcv ACK");
398 call->state = AFS_CALL_COMPLETE;
399 break;
400 case RXRPC_SKB_MARK_BUSY:
401 _debug("Rcv BUSY");
402 call->error = -EBUSY;
403 call->state = AFS_CALL_BUSY;
404 break;
405 case RXRPC_SKB_MARK_REMOTE_ABORT:
406 abort_code = rxrpc_kernel_get_abort_code(skb);
407 call->error = call->type->abort_to_error(abort_code);
408 call->state = AFS_CALL_ABORTED;
409 _debug("Rcv ABORT %u -> %d", abort_code, call->error);
410 break;
411 case RXRPC_SKB_MARK_NET_ERROR:
412 call->error = -rxrpc_kernel_get_error_number(skb);
413 call->state = AFS_CALL_ERROR;
414 _debug("Rcv NET ERROR %d", call->error);
415 break;
416 case RXRPC_SKB_MARK_LOCAL_ERROR:
417 call->error = -rxrpc_kernel_get_error_number(skb);
418 call->state = AFS_CALL_ERROR;
419 _debug("Rcv LOCAL ERROR %d", call->error);
420 break;
421 default:
422 BUG();
423 break;
424 }
425
426 afs_free_skb(skb);
427 }
428
429 /* make sure the queue is empty if the call is done with (we might have
430 * aborted the call early because of an unmarshalling error) */
431 if (call->state >= AFS_CALL_COMPLETE) {
432 while ((skb = skb_dequeue(&call->rx_queue)))
433 afs_free_skb(skb);
434 if (call->incoming) {
435 rxrpc_kernel_end_call(call->rxcall);
436 call->rxcall = NULL;
437 call->type->destructor(call);
438 afs_free_call(call);
439 }
440 }
441
442 _leave("");
443}
444
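afs_deliver_to_call() is effectively a message pump: dequeue while the call state is non-terminal, switch on the message type, and once a terminal state is reached discard anything still queued. Stripped of the RxRPC details, the control shape looks like this (plain, runnable C with invented types):

    #include <stdio.h>

    enum call_state { CALL_AWAITING, CALL_COMPLETE, CALL_ERROR };

    struct msg { int bad; struct msg *next; };

    static void pump(struct msg **queue, enum call_state *state)
    {
            struct msg *m;

            /* deliver messages only while the call is still live */
            while (*state == CALL_AWAITING && (m = *queue) != NULL) {
                    *queue = m->next;
                    *state = m->bad ? CALL_ERROR : CALL_COMPLETE;
            }

            /* the call reached a terminal state: drop whatever is left */
            while ((m = *queue) != NULL)
                    *queue = m->next;
    }

    int main(void)
    {
            struct msg b = { 0, NULL }, a = { 0, &b };
            struct msg *queue = &a;
            enum call_state state = CALL_AWAITING;

            pump(&queue, &state);
            printf("state=%d drained=%d\n", state, queue == NULL);
            return 0;
    }

The trailing drain matters for the same reason as in the real code: an unmarshalling error can abort the call while data packets are still queued, and those must not be leaked.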
445/*
446 * wait synchronously for a call to complete
447 */
448static int afs_wait_for_call_to_complete(struct afs_call *call)
449{
450 struct sk_buff *skb;
451 int ret;
452
453 DECLARE_WAITQUEUE(myself, current);
454
455 _enter("");
456
457 add_wait_queue(&call->waitq, &myself);
458 for (;;) {
459 set_current_state(TASK_INTERRUPTIBLE);
460
461 /* deliver any messages that are in the queue */
462 if (!skb_queue_empty(&call->rx_queue)) {
463 __set_current_state(TASK_RUNNING);
464 afs_deliver_to_call(call);
465 continue;
466 }
467
468 ret = call->error;
469 if (call->state >= AFS_CALL_COMPLETE)
470 break;
471 ret = -EINTR;
472 if (signal_pending(current))
473 break;
474 schedule();
475 }
476
477 remove_wait_queue(&call->waitq, &myself);
478 __set_current_state(TASK_RUNNING);
479
480 /* kill the call */
481 if (call->state < AFS_CALL_COMPLETE) {
482 _debug("call incomplete");
483 rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD);
484 while ((skb = skb_dequeue(&call->rx_queue)))
485 afs_free_skb(skb);
486 }
487
488 _debug("call complete");
489 rxrpc_kernel_end_call(call->rxcall);
490 call->rxcall = NULL;
491 call->type->destructor(call);
492 afs_free_call(call);
493 _leave(" = %d", ret);
494 return ret;
495}
496
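The hand-rolled wait loop above exists because the waiter must also drain call->rx_queue while it waits; without that requirement it would collapse to the standard helper, roughly as below (note that the helper reports a signal as -ERESTARTSYS rather than the -EINTR used above):

    /* equivalent wait, minus the message-draining step */
    ret = wait_event_interruptible(call->waitq,
                                   call->state >= AFS_CALL_COMPLETE);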
497/*
498 * wake up a waiting call
499 */
500static void afs_wake_up_call_waiter(struct afs_call *call)
501{
502 wake_up(&call->waitq);
503}
504
505/*
506 * wake up an asynchronous call
507 */
508static void afs_wake_up_async_call(struct afs_call *call)
509{
510 _enter("");
511 queue_work(afs_async_calls, &call->async_work);
512}
513
514/*
515 * put a call into asynchronous mode
	516 * - mustn't touch the call descriptor as the call may have completed by the
517 * time we get here
518 */
519static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
520{
521 _enter("");
522 return -EINPROGRESS;
523}
524
525/*
526 * delete an asynchronous call
527 */
528static void afs_delete_async_call(struct work_struct *work)
529{
530 struct afs_call *call =
531 container_of(work, struct afs_call, async_work);
532
533 _enter("");
534
535 afs_free_call(call);
536
537 _leave("");
538}
539
540/*
541 * perform processing on an asynchronous call
542 * - on a multiple-thread workqueue this work item may try to run on several
543 * CPUs at the same time
544 */
545static void afs_process_async_call(struct work_struct *work)
546{
547 struct afs_call *call =
548 container_of(work, struct afs_call, async_work);
549
550 _enter("");
551
552 if (!skb_queue_empty(&call->rx_queue))
553 afs_deliver_to_call(call);
554
555 if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) {
556 if (call->wait_mode->async_complete)
557 call->wait_mode->async_complete(call->reply,
558 call->error);
559 call->reply = NULL;
560
561 /* kill the call */
562 rxrpc_kernel_end_call(call->rxcall);
563 call->rxcall = NULL;
564 if (call->type->destructor)
565 call->type->destructor(call);
566
567 /* we can't just delete the call because the work item may be
568 * queued */
569 PREPARE_WORK(&call->async_work, afs_delete_async_call);
570 queue_work(afs_async_calls, &call->async_work);
571 }
572
573 _leave("");
574}
575
(The PREPARE_WORK() trick above re-points the still-queued work item at afs_delete_async_call() so the final free runs from the workqueue rather than from a context that might still be executing this handler.)
576/*
577 * empty a socket buffer into a flat reply buffer
578 */
579void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
580{
581 size_t len = skb->len;
582
583 if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0)
584 BUG();
585 call->reply_size += len;
586}
587
588/*
589 * accept the backlog of incoming calls
590 */
591static void afs_collect_incoming_call(struct work_struct *work)
592{
593 struct rxrpc_call *rxcall;
594 struct afs_call *call = NULL;
595 struct sk_buff *skb;
596
597 while ((skb = skb_dequeue(&afs_incoming_calls))) {
598 _debug("new call");
599
600 /* don't need the notification */
601 afs_free_skb(skb);
602
603 if (!call) {
604 call = kzalloc(sizeof(struct afs_call), GFP_KERNEL);
605 if (!call) {
606 rxrpc_kernel_reject_call(afs_socket);
607 return;
608 }
609
610 INIT_WORK(&call->async_work, afs_process_async_call);
611 call->wait_mode = &afs_async_incoming_call;
612 call->type = &afs_RXCMxxxx;
613 init_waitqueue_head(&call->waitq);
614 skb_queue_head_init(&call->rx_queue);
615 call->state = AFS_CALL_AWAIT_OP_ID;
616
617 _debug("CALL %p{%s} [%d]",
618 call, call->type->name,
619 atomic_read(&afs_outstanding_calls));
620 atomic_inc(&afs_outstanding_calls);
621 }
622
623 rxcall = rxrpc_kernel_accept_call(afs_socket,
624 (unsigned long) call);
625 if (!IS_ERR(rxcall)) {
626 call->rxcall = rxcall;
627 call = NULL;
628 }
629 }
630
631 if (call)
632 afs_free_call(call);
633}
634
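afs_collect_incoming_call() allocates one call record ahead of rxrpc_kernel_accept_call() and reuses it if the accept fails, so an allocation failure can only reject a call, never lose track of one. The allocate-once-and-reuse loop, reduced to a runnable userspace sketch (all names invented; accepted records stand in for handed-over ownership and are not freed by the loop):

    #include <stdio.h>
    #include <stdlib.h>

    struct call { int id; };

    /* simulate three notifications, with one accept failing */
    static int notifications = 3;
    static int have_notification(void) { return notifications-- > 0; }
    static int try_accept(struct call *c) { return notifications == 1 ? -1 : 0; }

    int main(void)
    {
            struct call *call = NULL;

            while (have_notification()) {
                    if (!call) {
                            call = calloc(1, sizeof(*call));
                            if (!call)
                                    break;  /* the kernel version rejects here */
                    }
                    if (try_accept(call) == 0)
                            call = NULL;    /* accepted: ownership handed over */
                    /* on failure, keep the record for the next iteration */
            }
            free(call);     /* release the unused spare, if any */
            return 0;
    }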
635/*
636 * grab the operation ID from an incoming cache manager call
637 */
638static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
639 bool last)
640{
641 size_t len = skb->len;
642 void *oibuf = (void *) &call->operation_ID;
643
644 _enter("{%u},{%zu},%d", call->offset, len, last);
645
646 ASSERTCMP(call->offset, <, 4);
647
648 /* the operation ID forms the first four bytes of the request data */
649 len = min_t(size_t, len, 4 - call->offset);
650 if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0)
651 BUG();
652 if (!pskb_pull(skb, len))
653 BUG();
654 call->offset += len;
655
656 if (call->offset < 4) {
657 if (last) {
658 _leave(" = -EBADMSG [op ID short]");
659 return -EBADMSG;
660 }
661 _leave(" = 0 [incomplete]");
662 return 0;
663 }
664
665 call->state = AFS_CALL_AWAIT_REQUEST;
666
667 /* ask the cache manager to route the call (it'll change the call type
668 * if successful) */
669 if (!afs_cm_incoming_call(call))
670 return -ENOTSUPP;
671
	672	/* pass responsibility for the remainder of this message off to the
673 * cache manager op */
674 return call->type->deliver(call, skb, last);
675}
676
677/*
678 * send an empty reply
679 */
680void afs_send_empty_reply(struct afs_call *call)
681{
682 struct msghdr msg;
683 struct iovec iov[1];
684
685 _enter("");
686
687 iov[0].iov_base = NULL;
688 iov[0].iov_len = 0;
689 msg.msg_name = NULL;
690 msg.msg_namelen = 0;
691 msg.msg_iov = iov;
692 msg.msg_iovlen = 0;
693 msg.msg_control = NULL;
694 msg.msg_controllen = 0;
695 msg.msg_flags = 0;
696
697 call->state = AFS_CALL_AWAIT_ACK;
698 switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) {
699 case 0:
700 _leave(" [replied]");
701 return;
702
703 case -ENOMEM:
704 _debug("oom");
705 rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
706 default:
707 rxrpc_kernel_end_call(call->rxcall);
708 call->rxcall = NULL;
709 call->type->destructor(call);
710 afs_free_call(call);
711 _leave(" [error]");
712 return;
713 }
714}
715
716/*
717 * send a simple reply
718 */
719void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
720{
721 struct msghdr msg;
722 struct iovec iov[1];
723
724 _enter("");
725
726 iov[0].iov_base = (void *) buf;
727 iov[0].iov_len = len;
728 msg.msg_name = NULL;
729 msg.msg_namelen = 0;
730 msg.msg_iov = iov;
731 msg.msg_iovlen = 1;
732 msg.msg_control = NULL;
733 msg.msg_controllen = 0;
734 msg.msg_flags = 0;
735
736 call->state = AFS_CALL_AWAIT_ACK;
737 switch (rxrpc_kernel_send_data(call->rxcall, &msg, len)) {
738 case 0:
739 _leave(" [replied]");
740 return;
741
742 case -ENOMEM:
743 _debug("oom");
744 rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
745 default:
746 rxrpc_kernel_end_call(call->rxcall);
747 call->rxcall = NULL;
748 call->type->destructor(call);
749 afs_free_call(call);
750 _leave(" [error]");
751 return;
752 }
753}
754
755/*
756 * extract a piece of data from the received data socket buffers
757 */
758int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
759 bool last, void *buf, size_t count)
760{
761 size_t len = skb->len;
762
763 _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count);
764
765 ASSERTCMP(call->offset, <, count);
766
767 len = min_t(size_t, len, count - call->offset);
768 if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 ||
769 !pskb_pull(skb, len))
770 BUG();
771 call->offset += len;
772
773 if (call->offset < count) {
774 if (last) {
775 _leave(" = -EBADMSG [%d < %zu]", call->offset, count);
776 return -EBADMSG;
777 }
778 _leave(" = -EAGAIN");
779 return -EAGAIN;
780 }
781 return 0;
782}
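afs_extract_data() accumulates a fixed-size field across an arbitrary packet split, returning -EAGAIN until the field is complete and -EBADMSG if the stream ends short. The same accumulation logic in plain, runnable C (1 stands in for -EAGAIN, -1 for -EBADMSG):

    #include <stdio.h>
    #include <string.h>

    /* gather 'count' bytes into buf across packets; 0 when complete,
     * 1 if another packet is needed, -1 if the stream ended short */
    static int extract(char *buf, size_t *offset, size_t count,
                       const char *pkt, size_t pktlen, int last)
    {
            size_t len = pktlen < count - *offset ? pktlen : count - *offset;

            memcpy(buf + *offset, pkt, len);
            *offset += len;

            if (*offset < count)
                    return last ? -1 : 1;
            return 0;
    }

    int main(void)
    {
            char buf[8];
            size_t offset = 0;

            printf("%d\n", extract(buf, &offset, sizeof(buf), "abcd", 4, 0));
            printf("%d\n", extract(buf, &offset, sizeof(buf), "efgh", 4, 1));
            return 0;
    }

afs_deliver_cm_op_id() uses exactly this pattern to assemble the four-byte operation ID before routing the rest of the message to the cache manager.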
diff --git a/fs/afs/security.c b/fs/afs/security.c
new file mode 100644
index 000000000000..f9f424d80458
--- /dev/null
+++ b/fs/afs/security.c
@@ -0,0 +1,356 @@
1/* AFS security handling
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/fs.h>
15#include <linux/ctype.h>
16#include <keys/rxrpc-type.h>
17#include "internal.h"
18
19/*
20 * get a key
21 */
22struct key *afs_request_key(struct afs_cell *cell)
23{
24 struct key *key;
25
26 _enter("{%x}", key_serial(cell->anonymous_key));
27
28 _debug("key %s", cell->anonymous_key->description);
29 key = request_key(&key_type_rxrpc, cell->anonymous_key->description,
30 NULL);
31 if (IS_ERR(key)) {
32 if (PTR_ERR(key) != -ENOKEY) {
33 _leave(" = %ld", PTR_ERR(key));
34 return key;
35 }
36
37 /* act as anonymous user */
38 _leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
39 return key_get(cell->anonymous_key);
40 } else {
41 /* act as authorised user */
42 _leave(" = {%x} [auth]", key_serial(key));
43 return key;
44 }
45}
46
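afs_request_key() always hands back a usable key, falling back to the cell's anonymous key when the caller holds none, so users of it follow a uniform get/use/put pattern; a sketch (afs_do_operation() is a hypothetical stand-in for an RPC):

    struct key *key;
    int ret;

    key = afs_request_key(vnode->volume->cell);
    if (IS_ERR(key))
            return PTR_ERR(key);

    ret = afs_do_operation(vnode, key);     /* hypothetical RPC call */
    key_put(key);
    return ret;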
47/*
48 * dispose of a permits list
49 */
50void afs_zap_permits(struct rcu_head *rcu)
51{
52 struct afs_permits *permits =
53 container_of(rcu, struct afs_permits, rcu);
54 int loop;
55
56 _enter("{%d}", permits->count);
57
58 for (loop = permits->count - 1; loop >= 0; loop--)
59 key_put(permits->permits[loop].key);
60 kfree(permits);
61}
62
63/*
64 * dispose of a permits list in which all the key pointers have been copied
65 */
66static void afs_dispose_of_permits(struct rcu_head *rcu)
67{
68 struct afs_permits *permits =
69 container_of(rcu, struct afs_permits, rcu);
70
71 _enter("{%d}", permits->count);
72
73 kfree(permits);
74}
75
76/*
 77 * get the authorising vnode - this is the specified inode itself if it's a
 78 * directory, or the parent directory if the specified inode is a file or
 79 * symlink
80 * - the caller must release the ref on the inode
81 */
82static struct afs_vnode *afs_get_auth_inode(struct afs_vnode *vnode,
83 struct key *key)
84{
85 struct afs_vnode *auth_vnode;
86 struct inode *auth_inode;
87
88 _enter("");
89
90 if (S_ISDIR(vnode->vfs_inode.i_mode)) {
91 auth_inode = igrab(&vnode->vfs_inode);
92 ASSERT(auth_inode != NULL);
93 } else {
94 auth_inode = afs_iget(vnode->vfs_inode.i_sb, key,
95 &vnode->status.parent, NULL, NULL);
96 if (IS_ERR(auth_inode))
97 return ERR_PTR(PTR_ERR(auth_inode));
98 }
99
100 auth_vnode = AFS_FS_I(auth_inode);
101 _leave(" = {%x}", auth_vnode->fid.vnode);
102 return auth_vnode;
103}
104
105/*
106 * clear the permit cache on a directory vnode
107 */
108void afs_clear_permits(struct afs_vnode *vnode)
109{
110 struct afs_permits *permits;
111
112 _enter("{%x}", vnode->fid.vnode);
113
114 mutex_lock(&vnode->permits_lock);
115 permits = vnode->permits;
116 rcu_assign_pointer(vnode->permits, NULL);
117 mutex_unlock(&vnode->permits_lock);
118
119 if (permits)
120 call_rcu(&permits->rcu, afs_zap_permits);
121 _leave("");
122}
123
124/*
 125 * add the result obtained for a vnode to its own cache or to its parent
 126 * directory's cache for the key used to access it
127 */
128void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
129{
130 struct afs_permits *permits, *xpermits;
131 struct afs_permit *permit;
132 struct afs_vnode *auth_vnode;
133 int count, loop;
134
135 _enter("{%x},%x,%lx", vnode->fid.vnode, key_serial(key), acl_order);
136
137 auth_vnode = afs_get_auth_inode(vnode, key);
138 if (IS_ERR(auth_vnode)) {
139 _leave(" [get error %ld]", PTR_ERR(auth_vnode));
140 return;
141 }
142
143 mutex_lock(&auth_vnode->permits_lock);
144
145 /* guard against a rename being detected whilst we waited for the
146 * lock */
147 if (memcmp(&auth_vnode->fid, &vnode->status.parent,
148 sizeof(struct afs_fid)) != 0) {
149 _debug("renamed");
150 goto out_unlock;
151 }
152
153 /* have to be careful as the directory's callback may be broken between
154 * us receiving the status we're trying to cache and us getting the
155 * lock to update the cache for the status */
156 if (auth_vnode->acl_order - acl_order > 0) {
157 _debug("ACL changed?");
158 goto out_unlock;
159 }
160
161 /* always update the anonymous mask */
162 _debug("anon access %x", vnode->status.anon_access);
163 auth_vnode->status.anon_access = vnode->status.anon_access;
164 if (key == vnode->volume->cell->anonymous_key)
165 goto out_unlock;
166
167 xpermits = auth_vnode->permits;
168 count = 0;
169 if (xpermits) {
170 /* see if the permit is already in the list
171 * - if it is then we just amend the list
172 */
173 count = xpermits->count;
174 permit = xpermits->permits;
175 for (loop = count; loop > 0; loop--) {
176 if (permit->key == key) {
177 permit->access_mask =
178 vnode->status.caller_access;
179 goto out_unlock;
180 }
181 permit++;
182 }
183 }
184
185 permits = kmalloc(sizeof(*permits) + sizeof(*permit) * (count + 1),
186 GFP_NOFS);
187 if (!permits)
188 goto out_unlock;
189
190 memcpy(permits->permits, xpermits->permits,
191 count * sizeof(struct afs_permit));
192
193 _debug("key %x access %x",
194 key_serial(key), vnode->status.caller_access);
195 permits->permits[count].access_mask = vnode->status.caller_access;
196 permits->permits[count].key = key_get(key);
197 permits->count = count + 1;
198
199 rcu_assign_pointer(auth_vnode->permits, permits);
200 if (xpermits)
201 call_rcu(&xpermits->rcu, afs_dispose_of_permits);
202
203out_unlock:
204 mutex_unlock(&auth_vnode->permits_lock);
205 iput(&auth_vnode->vfs_inode);
206 _leave("");
207}
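
Note that afs_cache_permit() never modifies the published permits array in place: a superset copy is built under permits_lock, published with rcu_assign_pointer(), and the displaced array is freed only after an RCU grace period. A runnable approximation of the writer side, with C11 atomics standing in for RCU publication (real reclamation would still need RCU or reference counting; the old array is deliberately leaked here):

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct permit { int key; unsigned access_mask; };
struct permits {
	size_t count;
	struct permit permits[];        /* flexible array, as in the kernel */
};

static _Atomic(struct permits *) published;

/* writer side only: build a superset copy and swap the pointer; the
 * kernel frees the displaced array via call_rcu(), which this sketch
 * omits */
static int cache_permit(int key, unsigned access_mask)
{
	struct permits *old = atomic_load(&published);
	size_t count = old ? old->count : 0;
	struct permits *new = malloc(sizeof(*new) +
				     sizeof(struct permit) * (count + 1));

	if (!new)
		return -1;
	if (old)                        /* guard the empty initial case */
		memcpy(new->permits, old->permits,
		       count * sizeof(struct permit));
	new->permits[count].key = key;
	new->permits[count].access_mask = access_mask;
	new->count = count + 1;
	atomic_store(&published, new);  /* publication point */
	return 0;
}
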
208
209/*
210 * check with the fileserver to see if the directory or parent directory is
211 * permitted to be accessed with this authorisation, and if so, what access it
212 * is granted
213 */
214static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
215 afs_access_t *_access)
216{
217 struct afs_permits *permits;
218 struct afs_permit *permit;
219 struct afs_vnode *auth_vnode;
220 bool valid;
221 int loop, ret;
222
223 _enter("");
224
225 auth_vnode = afs_get_auth_inode(vnode, key);
226 if (IS_ERR(auth_vnode)) {
227 *_access = 0;
228 _leave(" = %ld", PTR_ERR(auth_vnode));
229 return PTR_ERR(auth_vnode);
230 }
231
232 ASSERT(S_ISDIR(auth_vnode->vfs_inode.i_mode));
233
234 /* check the permits to see if we've got one yet */
235 if (key == auth_vnode->volume->cell->anonymous_key) {
236 _debug("anon");
237 *_access = auth_vnode->status.anon_access;
238 valid = true;
239 } else {
240 valid = false;
241 rcu_read_lock();
242 permits = rcu_dereference(auth_vnode->permits);
243 if (permits) {
244 permit = permits->permits;
245 for (loop = permits->count; loop > 0; loop--) {
246 if (permit->key == key) {
247 _debug("found in cache");
248 *_access = permit->access_mask;
249 valid = true;
250 break;
251 }
252 permit++;
253 }
254 }
255 rcu_read_unlock();
256 }
257
258 if (!valid) {
259 /* check the status on the file we're actually interested in
260 * (the post-processing will cache the result on auth_vnode) */
261 _debug("no valid permit");
262
263 set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
264 ret = afs_vnode_fetch_status(vnode, auth_vnode, key);
265 if (ret < 0) {
266 iput(&auth_vnode->vfs_inode);
267 *_access = 0;
268 _leave(" = %d", ret);
269 return ret;
270 }
271 }
272
273 *_access = vnode->status.caller_access;
274 iput(&auth_vnode->vfs_inode);
275 _leave(" = 0 [access %x]", *_access);
276 return 0;
277}
278
279/*
280 * check the permissions on an AFS file
281 * - AFS ACLs are attached to directories only, and a file is controlled by its
282 * parent directory's ACL
283 */
284int afs_permission(struct inode *inode, int mask, struct nameidata *nd)
285{
286 struct afs_vnode *vnode = AFS_FS_I(inode);
287 afs_access_t access;
288 struct key *key;
289 int ret;
290
291 _enter("{{%x:%x},%lx},%x,",
292 vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
293
294 key = afs_request_key(vnode->volume->cell);
295 if (IS_ERR(key)) {
296 _leave(" = %ld [key]", PTR_ERR(key));
297 return PTR_ERR(key);
298 }
299
300 /* if the promise has expired, we need to check the server again */
301 if (!vnode->cb_promised) {
302 _debug("not promised");
303 ret = afs_vnode_fetch_status(vnode, NULL, key);
304 if (ret < 0)
305 goto error;
306 _debug("new promise [fl=%lx]", vnode->flags);
307 }
308
309 /* check the permits to see if we've got one yet */
310 ret = afs_check_permit(vnode, key, &access);
311 if (ret < 0)
312 goto error;
313
314 /* interpret the access mask */
315 _debug("REQ %x ACC %x on %s",
316 mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file");
317
318 if (S_ISDIR(inode->i_mode)) {
319 if (mask & MAY_EXEC) {
320 if (!(access & AFS_ACE_LOOKUP))
321 goto permission_denied;
322 } else if (mask & MAY_READ) {
323 if (!(access & AFS_ACE_READ))
324 goto permission_denied;
325 } else if (mask & MAY_WRITE) {
326 if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */
327 AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */
328 AFS_ACE_WRITE))) /* chmod */
329 goto permission_denied;
330 } else {
331 BUG();
332 }
333 } else {
334 if (!(access & AFS_ACE_LOOKUP))
335 goto permission_denied;
336 if (mask & (MAY_EXEC | MAY_READ)) {
337 if (!(access & AFS_ACE_READ))
338 goto permission_denied;
339 } else if (mask & MAY_WRITE) {
340 if (!(access & AFS_ACE_WRITE))
341 goto permission_denied;
342 }
343 }
344
345 key_put(key);
346 ret = generic_permission(inode, mask, NULL);
347 _leave(" = %d", ret);
348 return ret;
349
350permission_denied:
351 ret = -EACCES;
352error:
353 key_put(key);
354 _leave(" = %d", ret);
355 return ret;
356}
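
The mapping from VFS MAY_* bits to AFS ACE bits above is the heart of the function and easy to get wrong. A standalone restatement of the same decision tree, with stand-in constants (the real bit values live in the kernel headers, not here):

#include <stdbool.h>

/* stand-in constants; the kernel's real AFS_ACE_* and MAY_* values
 * are defined elsewhere */
#define ACE_READ   0x01
#define ACE_WRITE  0x02
#define ACE_LOOKUP 0x04
#define ACE_DELETE 0x08
#define ACE_INSERT 0x10

#define MAY_EXEC  0x1
#define MAY_WRITE 0x2
#define MAY_READ  0x4

/* same decision tree as afs_permission(): a directory is checked
 * against its own ACL; a file first needs LOOKUP on its parent's ACL
 * and is then checked for READ or WRITE */
static bool ace_allows(bool is_dir, unsigned mask, unsigned access)
{
	if (is_dir) {
		if (mask & MAY_EXEC)
			return access & ACE_LOOKUP;
		if (mask & MAY_READ)
			return access & ACE_READ;
		if (mask & MAY_WRITE)
			return access & (ACE_DELETE | ACE_INSERT | ACE_WRITE);
		return false;           /* the kernel BUG()s here */
	}
	if (!(access & ACE_LOOKUP))
		return false;
	if (mask & (MAY_EXEC | MAY_READ))
		return access & ACE_READ;
	if (mask & MAY_WRITE)
		return access & ACE_WRITE;
	return true;                    /* defer to generic_permission() */
}
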
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 44aff81dc6a7..96bb23b476a2 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -1,6 +1,6 @@
1/* server.c: AFS server record management 1/* AFS server record management
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -11,489 +11,314 @@
11 11
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <rxrpc/peer.h>
15#include <rxrpc/connection.h>
16#include "volume.h"
17#include "cell.h"
18#include "server.h"
19#include "transport.h"
20#include "vlclient.h"
21#include "kafstimod.h"
22#include "internal.h" 14#include "internal.h"
23 15
24DEFINE_SPINLOCK(afs_server_peer_lock); 16unsigned afs_server_timeout = 10; /* server timeout in seconds */
25 17
 26#define FS_SERVICE_ID 1 /* AFS File Service ID */ 18static void afs_reap_server(struct work_struct *);
27#define VL_SERVICE_ID 52 /* AFS Volume Location Service ID */
28 19
29static void __afs_server_timeout(struct afs_timer *timer) 20/* tree of all the servers, indexed by IP address */
21static struct rb_root afs_servers = RB_ROOT;
22static DEFINE_RWLOCK(afs_servers_lock);
23
24/* LRU list of all the servers not currently in use */
25static LIST_HEAD(afs_server_graveyard);
26static DEFINE_SPINLOCK(afs_server_graveyard_lock);
27static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server);
28
29/*
30 * install a server record in the master tree
31 */
32static int afs_install_server(struct afs_server *server)
30{ 33{
31 struct afs_server *server = 34 struct afs_server *xserver;
32 list_entry(timer, struct afs_server, timeout); 35 struct rb_node **pp, *p;
36 int ret;
33 37
34 _debug("SERVER TIMEOUT [%p{u=%d}]", 38 _enter("%p", server);
35 server, atomic_read(&server->usage));
36 39
37 afs_server_do_timeout(server); 40 write_lock(&afs_servers_lock);
38} 41
42 ret = -EEXIST;
43 pp = &afs_servers.rb_node;
44 p = NULL;
45 while (*pp) {
46 p = *pp;
47 _debug("- consider %p", p);
48 xserver = rb_entry(p, struct afs_server, master_rb);
49 if (server->addr.s_addr < xserver->addr.s_addr)
50 pp = &(*pp)->rb_left;
51 else if (server->addr.s_addr > xserver->addr.s_addr)
52 pp = &(*pp)->rb_right;
53 else
54 goto error;
55 }
39 56
40static const struct afs_timer_ops afs_server_timer_ops = { 57 rb_link_node(&server->master_rb, p, pp);
41 .timed_out = __afs_server_timeout, 58 rb_insert_color(&server->master_rb, &afs_servers);
42}; 59 ret = 0;
60
61error:
62 write_unlock(&afs_servers_lock);
63 return ret;
64}
43 65
44/*****************************************************************************/
45/* 66/*
46 * lookup a server record in a cell 67 * allocate a new server record
47 * - TODO: search the cell's server list
48 */ 68 */
49int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr, 69static struct afs_server *afs_alloc_server(struct afs_cell *cell,
50 struct afs_server **_server) 70 const struct in_addr *addr)
51{ 71{
52 struct afs_server *server, *active, *zombie; 72 struct afs_server *server;
53 int loop;
54 73
55 _enter("%p,%08x,", cell, ntohl(addr->s_addr)); 74 _enter("");
56 75
57 /* allocate and initialise a server record */
58 server = kzalloc(sizeof(struct afs_server), GFP_KERNEL); 76 server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
59 if (!server) { 77 if (server) {
60 _leave(" = -ENOMEM"); 78 atomic_set(&server->usage, 1);
61 return -ENOMEM; 79 server->cell = cell;
80
81 INIT_LIST_HEAD(&server->link);
82 INIT_LIST_HEAD(&server->grave);
83 init_rwsem(&server->sem);
84 spin_lock_init(&server->fs_lock);
85 server->fs_vnodes = RB_ROOT;
86 server->cb_promises = RB_ROOT;
87 spin_lock_init(&server->cb_lock);
88 init_waitqueue_head(&server->cb_break_waitq);
89 INIT_DELAYED_WORK(&server->cb_break_work,
90 afs_dispatch_give_up_callbacks);
91
92 memcpy(&server->addr, addr, sizeof(struct in_addr));
93 server->addr.s_addr = addr->s_addr;
62 } 94 }
63 95
64 atomic_set(&server->usage, 1); 96 _leave(" = %p{%d}", server, atomic_read(&server->usage));
65 97 return server;
66 INIT_LIST_HEAD(&server->link); 98}
67 init_rwsem(&server->sem);
68 INIT_LIST_HEAD(&server->fs_callq);
69 spin_lock_init(&server->fs_lock);
70 INIT_LIST_HEAD(&server->cb_promises);
71 spin_lock_init(&server->cb_lock);
72
73 for (loop = 0; loop < AFS_SERVER_CONN_LIST_SIZE; loop++)
74 server->fs_conn_cnt[loop] = 4;
75 99
76 memcpy(&server->addr, addr, sizeof(struct in_addr)); 100/*
77 server->addr.s_addr = addr->s_addr; 101 * get an FS-server record for a cell
102 */
103struct afs_server *afs_lookup_server(struct afs_cell *cell,
104 const struct in_addr *addr)
105{
106 struct afs_server *server, *candidate;
78 107
79 afs_timer_init(&server->timeout, &afs_server_timer_ops); 108 _enter("%p,"NIPQUAD_FMT, cell, NIPQUAD(addr->s_addr));
80 109
81 /* add to the cell */ 110 /* quick scan of the list to see if we already have the server */
82 write_lock(&cell->sv_lock); 111 read_lock(&cell->servers_lock);
83 112
84 /* check the active list */ 113 list_for_each_entry(server, &cell->servers, link) {
85 list_for_each_entry(active, &cell->sv_list, link) { 114 if (server->addr.s_addr == addr->s_addr)
86 if (active->addr.s_addr == addr->s_addr) 115 goto found_server_quickly;
87 goto use_active_server;
88 } 116 }
117 read_unlock(&cell->servers_lock);
89 118
90 /* check the inactive list */ 119 candidate = afs_alloc_server(cell, addr);
91 spin_lock(&cell->sv_gylock); 120 if (!candidate) {
92 list_for_each_entry(zombie, &cell->sv_graveyard, link) { 121 _leave(" = -ENOMEM");
93 if (zombie->addr.s_addr == addr->s_addr) 122 return ERR_PTR(-ENOMEM);
94 goto resurrect_server;
95 } 123 }
96 spin_unlock(&cell->sv_gylock);
97 124
98 afs_get_cell(cell); 125 write_lock(&cell->servers_lock);
99 server->cell = cell;
100 list_add_tail(&server->link, &cell->sv_list);
101 126
102 write_unlock(&cell->sv_lock); 127 /* check the cell's server list again */
128 list_for_each_entry(server, &cell->servers, link) {
129 if (server->addr.s_addr == addr->s_addr)
130 goto found_server;
131 }
103 132
104 *_server = server; 133 _debug("new");
105 _leave(" = 0 (%p)", server); 134 server = candidate;
106 return 0; 135 if (afs_install_server(server) < 0)
136 goto server_in_two_cells;
107 137
108 /* found a matching active server */ 138 afs_get_cell(cell);
109 use_active_server: 139 list_add_tail(&server->link, &cell->servers);
110 _debug("active server"); 140
111 afs_get_server(active); 141 write_unlock(&cell->servers_lock);
112 write_unlock(&cell->sv_lock); 142 _leave(" = %p{%d}", server, atomic_read(&server->usage));
143 return server;
144
145 /* found a matching server quickly */
146found_server_quickly:
147 _debug("found quickly");
148 afs_get_server(server);
149 read_unlock(&cell->servers_lock);
150no_longer_unused:
151 if (!list_empty(&server->grave)) {
152 spin_lock(&afs_server_graveyard_lock);
153 list_del_init(&server->grave);
154 spin_unlock(&afs_server_graveyard_lock);
155 }
156 _leave(" = %p{%d}", server, atomic_read(&server->usage));
157 return server;
158
159 /* found a matching server on the second pass */
160found_server:
161 _debug("found");
162 afs_get_server(server);
163 write_unlock(&cell->servers_lock);
164 kfree(candidate);
165 goto no_longer_unused;
166
167 /* found a server that seems to be in two cells */
168server_in_two_cells:
169 write_unlock(&cell->servers_lock);
170 kfree(candidate);
171 printk(KERN_NOTICE "kAFS:"
172 " Server "NIPQUAD_FMT" appears to be in two cells\n",
173 NIPQUAD(*addr));
174 _leave(" = -EEXIST");
175 return ERR_PTR(-EEXIST);
176}
113 177
114 kfree(server); 178/*
179 * look up a server by its IP address
180 */
181struct afs_server *afs_find_server(const struct in_addr *_addr)
182{
183 struct afs_server *server = NULL;
184 struct rb_node *p;
185 struct in_addr addr = *_addr;
115 186
116 *_server = active; 187 _enter(NIPQUAD_FMT, NIPQUAD(addr.s_addr));
117 _leave(" = 0 (%p)", active);
118 return 0;
119 188
120 /* found a matching server in the graveyard, so resurrect it and 189 read_lock(&afs_servers_lock);
121 * dispose of the new record */
122 resurrect_server:
123 _debug("resurrecting server");
124 190
125 list_move_tail(&zombie->link, &cell->sv_list); 191 p = afs_servers.rb_node;
126 afs_get_server(zombie); 192 while (p) {
127 afs_kafstimod_del_timer(&zombie->timeout); 193 server = rb_entry(p, struct afs_server, master_rb);
128 spin_unlock(&cell->sv_gylock);
129 write_unlock(&cell->sv_lock);
130 194
131 kfree(server); 195 _debug("- consider %p", p);
132 196
133 *_server = zombie; 197 if (addr.s_addr < server->addr.s_addr) {
134 _leave(" = 0 (%p)", zombie); 198 p = p->rb_left;
135 return 0; 199 } else if (addr.s_addr > server->addr.s_addr) {
200 p = p->rb_right;
201 } else {
202 afs_get_server(server);
203 goto found;
204 }
205 }
136 206
137} /* end afs_server_lookup() */ 207 server = NULL;
208found:
209 read_unlock(&afs_servers_lock);
210 ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr);
211 _leave(" = %p", server);
212 return server;
213}
138 214
139/*****************************************************************************/
140/* 215/*
141 * destroy a server record 216 * destroy a server record
142 * - removes from the cell list 217 * - removes from the cell list
143 */ 218 */
144void afs_put_server(struct afs_server *server) 219void afs_put_server(struct afs_server *server)
145{ 220{
146 struct afs_cell *cell;
147
148 if (!server) 221 if (!server)
149 return; 222 return;
150 223
151 _enter("%p", server); 224 _enter("%p{%d}", server, atomic_read(&server->usage));
152
153 cell = server->cell;
154 225
155 /* sanity check */ 226 _debug("PUT SERVER %d", atomic_read(&server->usage));
156 BUG_ON(atomic_read(&server->usage) <= 0);
157 227
158 /* to prevent a race, the decrement and the dequeue must be effectively 228 ASSERTCMP(atomic_read(&server->usage), >, 0);
159 * atomic */
160 write_lock(&cell->sv_lock);
161 229
162 if (likely(!atomic_dec_and_test(&server->usage))) { 230 if (likely(!atomic_dec_and_test(&server->usage))) {
163 write_unlock(&cell->sv_lock);
164 _leave(""); 231 _leave("");
165 return; 232 return;
166 } 233 }
167 234
168 spin_lock(&cell->sv_gylock); 235 afs_flush_callback_breaks(server);
169 list_move_tail(&server->link, &cell->sv_graveyard);
170 236
171 /* time out in 10 secs */ 237 spin_lock(&afs_server_graveyard_lock);
172 afs_kafstimod_add_timer(&server->timeout, 10 * HZ); 238 if (atomic_read(&server->usage) == 0) {
173 239 list_move_tail(&server->grave, &afs_server_graveyard);
174 spin_unlock(&cell->sv_gylock); 240 server->time_of_death = get_seconds();
175 write_unlock(&cell->sv_lock); 241 schedule_delayed_work(&afs_server_reaper,
176 242 afs_server_timeout * HZ);
177 _leave(" [killed]"); 243 }
178} /* end afs_put_server() */ 244 spin_unlock(&afs_server_graveyard_lock);
245 _leave(" [dead]");
246}
179 247
180/*****************************************************************************/
181/* 248/*
182 * timeout server record 249 * destroy a dead server
183 * - removes from the cell's graveyard if the usage count is zero
184 */ 250 */
185void afs_server_do_timeout(struct afs_server *server) 251static void afs_destroy_server(struct afs_server *server)
186{ 252{
187 struct rxrpc_peer *peer;
188 struct afs_cell *cell;
189 int loop;
190
191 _enter("%p", server); 253 _enter("%p", server);
192 254
193 cell = server->cell; 255 ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL);
194 256 ASSERTCMP(server->cb_promises.rb_node, ==, NULL);
195 BUG_ON(atomic_read(&server->usage) < 0); 257 ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail);
196 258 ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0);
197 /* remove from graveyard if still dead */
198 spin_lock(&cell->vl_gylock);
199 if (atomic_read(&server->usage) == 0)
200 list_del_init(&server->link);
201 else
202 server = NULL;
203 spin_unlock(&cell->vl_gylock);
204
205 if (!server) {
206 _leave("");
207 return; /* resurrected */
208 }
209
210 /* we can now destroy it properly */
211 afs_put_cell(cell);
212
213 /* uncross-point the structs under a global lock */
214 spin_lock(&afs_server_peer_lock);
215 peer = server->peer;
216 if (peer) {
217 server->peer = NULL;
218 peer->user = NULL;
219 }
220 spin_unlock(&afs_server_peer_lock);
221
222 /* finish cleaning up the server */
223 for (loop = AFS_SERVER_CONN_LIST_SIZE - 1; loop >= 0; loop--)
224 if (server->fs_conn[loop])
225 rxrpc_put_connection(server->fs_conn[loop]);
226
227 if (server->vlserver)
228 rxrpc_put_connection(server->vlserver);
229 259
260 afs_put_cell(server->cell);
230 kfree(server); 261 kfree(server);
262}
231 263
232 _leave(" [destroyed]");
233} /* end afs_server_do_timeout() */
234
235/*****************************************************************************/
236/* 264/*
237 * get a callslot on a connection to the fileserver on the specified server 265 * reap dead server records
238 */ 266 */
239int afs_server_request_callslot(struct afs_server *server, 267static void afs_reap_server(struct work_struct *work)
240 struct afs_server_callslot *callslot)
241{ 268{
242 struct afs_server_callslot *pcallslot; 269 LIST_HEAD(corpses);
243 struct rxrpc_connection *conn; 270 struct afs_server *server;
244 int nconn, ret; 271 unsigned long delay, expiry;
245 272 time_t now;
246 _enter("%p,",server); 273
247 274 now = get_seconds();
248 INIT_LIST_HEAD(&callslot->link); 275 spin_lock(&afs_server_graveyard_lock);
249 callslot->task = current; 276
250 callslot->conn = NULL; 277 while (!list_empty(&afs_server_graveyard)) {
251 callslot->nconn = -1; 278 server = list_entry(afs_server_graveyard.next,
252 callslot->ready = 0; 279 struct afs_server, grave);
253 280
254 ret = 0; 281 /* the queue is ordered most dead first */
255 conn = NULL; 282 expiry = server->time_of_death + afs_server_timeout;
256 283 if (expiry > now) {
257 /* get hold of a callslot first */ 284 delay = (expiry - now) * HZ;
258 spin_lock(&server->fs_lock); 285 if (!schedule_delayed_work(&afs_server_reaper, delay)) {
259 286 cancel_delayed_work(&afs_server_reaper);
 260 /* resurrect the server if its death timeout has expired */ 286 if (!schedule_delayed_work(&afs_server_reaper, delay)) {
261 if (server->fs_state) { 288 delay);
262 if (time_before(jiffies, server->fs_dead_jif)) { 289 }
263 ret = server->fs_state; 290 break;
264 spin_unlock(&server->fs_lock);
265 _leave(" = %d [still dead]", ret);
266 return ret;
267 } 291 }
268 292
269 server->fs_state = 0; 293 write_lock(&server->cell->servers_lock);
270 } 294 write_lock(&afs_servers_lock);
271 295 if (atomic_read(&server->usage) > 0) {
272 /* try and find a connection that has spare callslots */ 296 list_del_init(&server->grave);
273 for (nconn = 0; nconn < AFS_SERVER_CONN_LIST_SIZE; nconn++) { 297 } else {
274 if (server->fs_conn_cnt[nconn] > 0) { 298 list_move_tail(&server->grave, &corpses);
275 server->fs_conn_cnt[nconn]--; 299 list_del_init(&server->link);
276 spin_unlock(&server->fs_lock); 300 rb_erase(&server->master_rb, &afs_servers);
277 callslot->nconn = nconn;
278 goto obtained_slot;
279 } 301 }
302 write_unlock(&afs_servers_lock);
303 write_unlock(&server->cell->servers_lock);
280 } 304 }
281 305
282 /* none were available - wait interruptibly for one to become 306 spin_unlock(&afs_server_graveyard_lock);
283 * available */
284 set_current_state(TASK_INTERRUPTIBLE);
285 list_add_tail(&callslot->link, &server->fs_callq);
286 spin_unlock(&server->fs_lock);
287
288 while (!callslot->ready && !signal_pending(current)) {
289 schedule();
290 set_current_state(TASK_INTERRUPTIBLE);
291 }
292
293 set_current_state(TASK_RUNNING);
294
295 /* even if we were interrupted we may still be queued */
296 if (!callslot->ready) {
297 spin_lock(&server->fs_lock);
298 list_del_init(&callslot->link);
299 spin_unlock(&server->fs_lock);
300 }
301
302 nconn = callslot->nconn;
303 307
304 /* if interrupted, we must release any slot we also got before 308 /* now reap the corpses we've extracted */
305 * returning an error */ 309 while (!list_empty(&corpses)) {
306 if (signal_pending(current)) { 310 server = list_entry(corpses.next, struct afs_server, grave);
307 ret = -EINTR; 311 list_del(&server->grave);
308 goto error_release; 312 afs_destroy_server(server);
309 } 313 }
314}
310 315
311 /* if we were woken up with an error, then pass that error back to the
 312 * caller */
313 if (nconn < 0) {
314 _leave(" = %d", callslot->errno);
315 return callslot->errno;
316 }
317
318 /* were we given a connection directly? */
319 if (callslot->conn) {
320 /* yes - use it */
321 _leave(" = 0 (nc=%d)", nconn);
322 return 0;
323 }
324
325 /* got a callslot, but no connection */
326 obtained_slot:
327
328 /* need to get hold of the RxRPC connection */
329 down_write(&server->sem);
330
331 /* quick check to see if there's an outstanding error */
332 ret = server->fs_state;
333 if (ret)
334 goto error_release_upw;
335
336 if (server->fs_conn[nconn]) {
337 /* reuse an existing connection */
338 rxrpc_get_connection(server->fs_conn[nconn]);
339 callslot->conn = server->fs_conn[nconn];
340 }
341 else {
342 /* create a new connection */
343 ret = rxrpc_create_connection(afs_transport,
344 htons(7000),
345 server->addr.s_addr,
346 FS_SERVICE_ID,
347 NULL,
348 &server->fs_conn[nconn]);
349
350 if (ret < 0)
351 goto error_release_upw;
352
 353 callslot->conn = server->fs_conn[nconn];
354 rxrpc_get_connection(callslot->conn);
355 }
356
357 up_write(&server->sem);
358
359 _leave(" = 0");
360 return 0;
361
362 /* handle an error occurring */
363 error_release_upw:
364 up_write(&server->sem);
365
366 error_release:
367 /* either release the callslot or pass it along to another deserving
368 * task */
369 spin_lock(&server->fs_lock);
370
371 if (nconn < 0) {
372 /* no callslot allocated */
373 }
374 else if (list_empty(&server->fs_callq)) {
375 /* no one waiting */
376 server->fs_conn_cnt[nconn]++;
377 spin_unlock(&server->fs_lock);
378 }
379 else {
380 /* someone's waiting - dequeue them and wake them up */
381 pcallslot = list_entry(server->fs_callq.next,
382 struct afs_server_callslot, link);
383 list_del_init(&pcallslot->link);
384
385 pcallslot->errno = server->fs_state;
386 if (!pcallslot->errno) {
387 /* pass them out callslot details */
388 callslot->conn = xchg(&pcallslot->conn,
389 callslot->conn);
390 pcallslot->nconn = nconn;
391 callslot->nconn = nconn = -1;
392 }
393 pcallslot->ready = 1;
394 wake_up_process(pcallslot->task);
395 spin_unlock(&server->fs_lock);
396 }
397
398 rxrpc_put_connection(callslot->conn);
399 callslot->conn = NULL;
400
401 _leave(" = %d", ret);
402 return ret;
403
404} /* end afs_server_request_callslot() */
405
406/*****************************************************************************/
407/*
408 * release a callslot back to the server
409 * - transfers the RxRPC connection to the next pending callslot if possible
410 */
411void afs_server_release_callslot(struct afs_server *server,
412 struct afs_server_callslot *callslot)
413{
414 struct afs_server_callslot *pcallslot;
415
416 _enter("{ad=%08x,cnt=%u},{%d}",
417 ntohl(server->addr.s_addr),
418 server->fs_conn_cnt[callslot->nconn],
419 callslot->nconn);
420
421 BUG_ON(callslot->nconn < 0);
422
423 spin_lock(&server->fs_lock);
424
425 if (list_empty(&server->fs_callq)) {
426 /* no one waiting */
427 server->fs_conn_cnt[callslot->nconn]++;
428 spin_unlock(&server->fs_lock);
429 }
430 else {
431 /* someone's waiting - dequeue them and wake them up */
432 pcallslot = list_entry(server->fs_callq.next,
433 struct afs_server_callslot, link);
434 list_del_init(&pcallslot->link);
435
436 pcallslot->errno = server->fs_state;
437 if (!pcallslot->errno) {
438 /* pass them out callslot details */
439 callslot->conn = xchg(&pcallslot->conn, callslot->conn);
440 pcallslot->nconn = callslot->nconn;
441 callslot->nconn = -1;
442 }
443
444 pcallslot->ready = 1;
445 wake_up_process(pcallslot->task);
446 spin_unlock(&server->fs_lock);
447 }
448
449 rxrpc_put_connection(callslot->conn);
450
451 _leave("");
452} /* end afs_server_release_callslot() */
453
454/*****************************************************************************/
455/* 316/*
456 * get a handle to a connection to the vlserver (volume location) on the 317 * discard all the server records for rmmod
457 * specified server
458 */ 318 */
459int afs_server_get_vlconn(struct afs_server *server, 319void __exit afs_purge_servers(void)
460 struct rxrpc_connection **_conn)
461{ 320{
462 struct rxrpc_connection *conn; 321 afs_server_timeout = 0;
463 int ret; 322 cancel_delayed_work(&afs_server_reaper);
464 323 schedule_delayed_work(&afs_server_reaper, 0);
465 _enter("%p,", server); 324}
466
467 ret = 0;
468 conn = NULL;
469 down_read(&server->sem);
470
471 if (server->vlserver) {
472 /* reuse an existing connection */
473 rxrpc_get_connection(server->vlserver);
474 conn = server->vlserver;
475 up_read(&server->sem);
476 }
477 else {
478 /* create a new connection */
479 up_read(&server->sem);
480 down_write(&server->sem);
481 if (!server->vlserver) {
482 ret = rxrpc_create_connection(afs_transport,
483 htons(7003),
484 server->addr.s_addr,
485 VL_SERVICE_ID,
486 NULL,
487 &server->vlserver);
488 }
489 if (ret == 0) {
490 rxrpc_get_connection(server->vlserver);
491 conn = server->vlserver;
492 }
493 up_write(&server->sem);
494 }
495
496 *_conn = conn;
497 _leave(" = %d", ret);
498 return ret;
499} /* end afs_server_get_vlconn() */
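
The new graveyard scheme on the right-hand side replaces the per-server kafstimod timers with one delayed work item: afs_put_server() timestamps the corpse, and the reaper walks the oldest-first queue, destroying what has expired and re-arming itself for the rest. The core of that walk, reduced to a userspace sketch (locking and the kernel list API omitted):

#include <time.h>

struct server {
	int usage;                      /* refcount; 0 means reapable */
	time_t time_of_death;           /* set when usage first hits 0 */
	struct server *grave;           /* graveyard queue linkage */
};

#define SERVER_TIMEOUT 10               /* seconds, as afs_server_timeout */

static struct server *graveyard;        /* oldest corpse first */

/* reap expired corpses; returns the number of seconds until the next
 * corpse expires (the delay to re-arm the reaper with), or 0 if the
 * queue has been emptied */
static long reap(time_t now, void (*destroy)(struct server *))
{
	while (graveyard) {
		struct server *s = graveyard;
		time_t expiry = s->time_of_death + SERVER_TIMEOUT;

		if (expiry > now)
			return expiry - now;   /* nothing later is ripe yet */
		graveyard = s->grave;          /* unlink the head */
		if (s->usage == 0)             /* not resurrected meanwhile */
			destroy(s);
	}
	return 0;
}
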
diff --git a/fs/afs/server.h b/fs/afs/server.h
deleted file mode 100644
index c3d24115578f..000000000000
--- a/fs/afs/server.h
+++ /dev/null
@@ -1,102 +0,0 @@
1/* server.h: AFS server record
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_SERVER_H
13#define _LINUX_AFS_SERVER_H
14
15#include "types.h"
16#include "kafstimod.h"
17#include <rxrpc/peer.h>
18#include <linux/rwsem.h>
19
20extern spinlock_t afs_server_peer_lock;
21
22/*****************************************************************************/
23/*
24 * AFS server record
25 */
26struct afs_server
27{
28 atomic_t usage;
29 struct afs_cell *cell; /* cell in which server resides */
30 struct list_head link; /* link in cell's server list */
31 struct rw_semaphore sem; /* access lock */
32 struct afs_timer timeout; /* graveyard timeout */
33 struct in_addr addr; /* server address */
34 struct rxrpc_peer *peer; /* peer record for this server */
35 struct rxrpc_connection *vlserver; /* connection to the volume location service */
36
37 /* file service access */
38#define AFS_SERVER_CONN_LIST_SIZE 2
39 struct rxrpc_connection *fs_conn[AFS_SERVER_CONN_LIST_SIZE]; /* FS connections */
40 unsigned fs_conn_cnt[AFS_SERVER_CONN_LIST_SIZE]; /* per conn call count */
41 struct list_head fs_callq; /* queue of processes waiting to make a call */
42 spinlock_t fs_lock; /* access lock */
43 int fs_state; /* 0 or reason FS currently marked dead (-errno) */
44 unsigned fs_rtt; /* FS round trip time */
45 unsigned long fs_act_jif; /* time at which last activity occurred */
46 unsigned long fs_dead_jif; /* time at which no longer to be considered dead */
47
48 /* callback promise management */
49 struct list_head cb_promises; /* as yet unbroken promises from this server */
50 spinlock_t cb_lock; /* access lock */
51};
52
53extern int afs_server_lookup(struct afs_cell *cell,
54 const struct in_addr *addr,
55 struct afs_server **_server);
56
57#define afs_get_server(S) do { atomic_inc(&(S)->usage); } while(0)
58
59extern void afs_put_server(struct afs_server *server);
60extern void afs_server_do_timeout(struct afs_server *server);
61
62extern int afs_server_find_by_peer(const struct rxrpc_peer *peer,
63 struct afs_server **_server);
64
65extern int afs_server_get_vlconn(struct afs_server *server,
66 struct rxrpc_connection **_conn);
67
68static inline
69struct afs_server *afs_server_get_from_peer(struct rxrpc_peer *peer)
70{
71 struct afs_server *server;
72
73 spin_lock(&afs_server_peer_lock);
74 server = peer->user;
75 if (server)
76 afs_get_server(server);
77 spin_unlock(&afs_server_peer_lock);
78
79 return server;
80}
81
82/*****************************************************************************/
83/*
84 * AFS server callslot grant record
85 */
86struct afs_server_callslot
87{
88 struct list_head link; /* link in server's list */
89 struct task_struct *task; /* process waiting to make call */
90 struct rxrpc_connection *conn; /* connection to use (or NULL on error) */
91 short nconn; /* connection slot number (-1 on error) */
92 char ready; /* T when ready */
93 int errno; /* error number if nconn==-1 */
94};
95
96extern int afs_server_request_callslot(struct afs_server *server,
97 struct afs_server_callslot *callslot);
98
99extern void afs_server_release_callslot(struct afs_server *server,
100 struct afs_server_callslot *callslot);
101
102#endif /* _LINUX_AFS_SERVER_H */
diff --git a/fs/afs/super.c b/fs/afs/super.c
index eb7e32349da3..7030d76155fc 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -1,5 +1,6 @@
1/* 1/* AFS superblock handling
2 * Copyright (c) 2002 Red Hat, Inc. All rights reserved. 2 *
3 * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
3 * 4 *
4 * This software may be freely redistributed under the terms of the 5 * This software may be freely redistributed under the terms of the
5 * GNU General Public License. 6 * GNU General Public License.
@@ -9,7 +10,7 @@
9 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 10 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
10 * 11 *
11 * Authors: David Howells <dhowells@redhat.com> 12 * Authors: David Howells <dhowells@redhat.com>
12 * David Woodhouse <dwmw2@cambridge.redhat.com> 13 * David Woodhouse <dwmw2@redhat.com>
13 * 14 *
14 */ 15 */
15 16
@@ -19,22 +20,11 @@
19#include <linux/slab.h> 20#include <linux/slab.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include "vnode.h" 23#include <linux/parser.h>
23#include "volume.h"
24#include "cell.h"
25#include "cmservice.h"
26#include "fsclient.h"
27#include "super.h"
28#include "internal.h" 24#include "internal.h"
29 25
30#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ 26#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
31 27
32struct afs_mount_params {
33 int rwpath;
34 struct afs_cell *default_cell;
35 struct afs_volume *volume;
36};
37
38static void afs_i_init_once(void *foo, struct kmem_cache *cachep, 28static void afs_i_init_once(void *foo, struct kmem_cache *cachep,
39 unsigned long flags); 29 unsigned long flags);
40 30
@@ -53,7 +43,7 @@ struct file_system_type afs_fs_type = {
53 .name = "afs", 43 .name = "afs",
54 .get_sb = afs_get_sb, 44 .get_sb = afs_get_sb,
55 .kill_sb = kill_anon_super, 45 .kill_sb = kill_anon_super,
56 .fs_flags = FS_BINARY_MOUNTDATA, 46 .fs_flags = 0,
57}; 47};
58 48
59static const struct super_operations afs_super_ops = { 49static const struct super_operations afs_super_ops = {
@@ -62,13 +52,27 @@ static const struct super_operations afs_super_ops = {
62 .drop_inode = generic_delete_inode, 52 .drop_inode = generic_delete_inode,
63 .destroy_inode = afs_destroy_inode, 53 .destroy_inode = afs_destroy_inode,
64 .clear_inode = afs_clear_inode, 54 .clear_inode = afs_clear_inode,
55 .umount_begin = afs_umount_begin,
65 .put_super = afs_put_super, 56 .put_super = afs_put_super,
66}; 57};
67 58
68static struct kmem_cache *afs_inode_cachep; 59static struct kmem_cache *afs_inode_cachep;
69static atomic_t afs_count_active_inodes; 60static atomic_t afs_count_active_inodes;
70 61
71/*****************************************************************************/ 62enum {
63 afs_no_opt,
64 afs_opt_cell,
65 afs_opt_rwpath,
66 afs_opt_vol,
67};
68
69static const match_table_t afs_options_list = {
70 { afs_opt_cell, "cell=%s" },
71 { afs_opt_rwpath, "rwpath" },
72 { afs_opt_vol, "vol=%s" },
73 { afs_no_opt, NULL },
74};
75
72/* 76/*
73 * initialise the filesystem 77 * initialise the filesystem
74 */ 78 */
@@ -78,8 +82,6 @@ int __init afs_fs_init(void)
78 82
79 _enter(""); 83 _enter("");
80 84
81 afs_timer_init(&afs_mntpt_expiry_timer, &afs_mntpt_expiry_timer_ops);
82
83 /* create ourselves an inode cache */ 85 /* create ourselves an inode cache */
84 atomic_set(&afs_count_active_inodes, 0); 86 atomic_set(&afs_count_active_inodes, 0);
85 87
@@ -99,20 +101,22 @@ int __init afs_fs_init(void)
99 ret = register_filesystem(&afs_fs_type); 101 ret = register_filesystem(&afs_fs_type);
100 if (ret < 0) { 102 if (ret < 0) {
101 kmem_cache_destroy(afs_inode_cachep); 103 kmem_cache_destroy(afs_inode_cachep);
102 kleave(" = %d", ret); 104 _leave(" = %d", ret);
103 return ret; 105 return ret;
104 } 106 }
105 107
106 kleave(" = 0"); 108 _leave(" = 0");
107 return 0; 109 return 0;
108} /* end afs_fs_init() */ 110}
109 111
110/*****************************************************************************/
111/* 112/*
112 * clean up the filesystem 113 * clean up the filesystem
113 */ 114 */
114void __exit afs_fs_exit(void) 115void __exit afs_fs_exit(void)
115{ 116{
117 _enter("");
118
119 afs_mntpt_kill_timer();
116 unregister_filesystem(&afs_fs_type); 120 unregister_filesystem(&afs_fs_type);
117 121
118 if (atomic_read(&afs_count_active_inodes) != 0) { 122 if (atomic_read(&afs_count_active_inodes) != 0) {
@@ -122,99 +126,153 @@ void __exit afs_fs_exit(void)
122 } 126 }
123 127
124 kmem_cache_destroy(afs_inode_cachep); 128 kmem_cache_destroy(afs_inode_cachep);
129 _leave("");
130}
125 131
126} /* end afs_fs_exit() */
127
128/*****************************************************************************/
129/*
130 * check that an argument has a value
131 */
132static int want_arg(char **_value, const char *option)
133{
134 if (!_value || !*_value || !**_value) {
135 printk(KERN_NOTICE "kAFS: %s: argument missing\n", option);
136 return 0;
137 }
138 return 1;
139} /* end want_arg() */
140
141/*****************************************************************************/
142/*
143 * check that there's no subsequent value
144 */
145static int want_no_value(char *const *_value, const char *option)
146{
147 if (*_value && **_value) {
148 printk(KERN_NOTICE "kAFS: %s: Invalid argument: %s\n",
149 option, *_value);
150 return 0;
151 }
152 return 1;
153} /* end want_no_value() */
154
155/*****************************************************************************/
156/* 132/*
157 * parse the mount options 133 * parse the mount options
158 * - this function has been shamelessly adapted from the ext3 fs which 134 * - this function has been shamelessly adapted from the ext3 fs which
159 * shamelessly adapted it from the msdos fs 135 * shamelessly adapted it from the msdos fs
160 */ 136 */
161static int afs_super_parse_options(struct afs_mount_params *params, 137static int afs_parse_options(struct afs_mount_params *params,
162 char *options, 138 char *options, const char **devname)
163 const char **devname)
164{ 139{
165 char *key, *value; 140 struct afs_cell *cell;
166 int ret; 141 substring_t args[MAX_OPT_ARGS];
142 char *p;
143 int token;
167 144
168 _enter("%s", options); 145 _enter("%s", options);
169 146
170 options[PAGE_SIZE - 1] = 0; 147 options[PAGE_SIZE - 1] = 0;
171 148
172 ret = 0; 149 while ((p = strsep(&options, ","))) {
173 while ((key = strsep(&options, ",")) != 0) 150 if (!*p)
174 { 151 continue;
175 value = strchr(key, '=');
176 if (value)
177 *value++ = 0;
178
179 printk("kAFS: KEY: %s, VAL:%s\n", key, value ?: "-");
180 152
181 if (strcmp(key, "rwpath") == 0) { 153 token = match_token(p, afs_options_list, args);
182 if (!want_no_value(&value, "rwpath")) 154 switch (token) {
183 return -EINVAL; 155 case afs_opt_cell:
156 cell = afs_cell_lookup(args[0].from,
157 args[0].to - args[0].from);
158 if (IS_ERR(cell))
159 return PTR_ERR(cell);
160 afs_put_cell(params->cell);
161 params->cell = cell;
162 break;
163
164 case afs_opt_rwpath:
184 params->rwpath = 1; 165 params->rwpath = 1;
185 continue; 166 break;
186 } 167
187 else if (strcmp(key, "vol") == 0) { 168 case afs_opt_vol:
188 if (!want_arg(&value, "vol")) 169 *devname = args[0].from;
189 return -EINVAL; 170 break;
190 *devname = value; 171
191 continue; 172 default:
173 printk(KERN_ERR "kAFS:"
174 " Unknown or invalid mount option: '%s'\n", p);
175 return -EINVAL;
192 } 176 }
193 else if (strcmp(key, "cell") == 0) { 177 }
194 if (!want_arg(&value, "cell")) 178
195 return -EINVAL; 179 _leave(" = 0");
196 afs_put_cell(params->default_cell); 180 return 0;
197 ret = afs_cell_lookup(value, 181}
198 strlen(value), 182
199 &params->default_cell); 183/*
200 if (ret < 0) 184 * parse a device name to get cell name, volume name, volume type and R/W
201 return -EINVAL; 185 * selector
202 continue; 186 * - this can be one of the following:
187 * "%[cell:]volume[.]" R/W volume
188 * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0),
189 * or R/W (rwpath=1) volume
190 * "%[cell:]volume.readonly" R/O volume
191 * "#[cell:]volume.readonly" R/O volume
192 * "%[cell:]volume.backup" Backup volume
193 * "#[cell:]volume.backup" Backup volume
194 */
195static int afs_parse_device_name(struct afs_mount_params *params,
196 const char *name)
197{
198 struct afs_cell *cell;
199 const char *cellname, *suffix;
200 int cellnamesz;
201
202 _enter(",%s", name);
203
204 if (!name) {
205 printk(KERN_ERR "kAFS: no volume name specified\n");
206 return -EINVAL;
207 }
208
209 if ((name[0] != '%' && name[0] != '#') || !name[1]) {
210 printk(KERN_ERR "kAFS: unparsable volume name\n");
211 return -EINVAL;
212 }
213
214 /* determine the type of volume we're looking for */
215 params->type = AFSVL_ROVOL;
216 params->force = false;
217 if (params->rwpath || name[0] == '%') {
218 params->type = AFSVL_RWVOL;
219 params->force = true;
220 }
221 name++;
222
223 /* split the cell name out if there is one */
224 params->volname = strchr(name, ':');
225 if (params->volname) {
226 cellname = name;
227 cellnamesz = params->volname - name;
228 params->volname++;
229 } else {
230 params->volname = name;
231 cellname = NULL;
232 cellnamesz = 0;
233 }
234
235 /* the volume type is further affected by a possible suffix */
236 suffix = strrchr(params->volname, '.');
237 if (suffix) {
238 if (strcmp(suffix, ".readonly") == 0) {
239 params->type = AFSVL_ROVOL;
240 params->force = true;
241 } else if (strcmp(suffix, ".backup") == 0) {
242 params->type = AFSVL_BACKVOL;
243 params->force = true;
244 } else if (suffix[1] == 0) {
245 } else {
246 suffix = NULL;
203 } 247 }
248 }
204 249
205 printk("kAFS: Unknown mount option: '%s'\n", key); 250 params->volnamesz = suffix ?
206 ret = -EINVAL; 251 suffix - params->volname : strlen(params->volname);
207 goto error; 252
253 _debug("cell %*.*s [%p]",
254 cellnamesz, cellnamesz, cellname ?: "", params->cell);
255
256 /* lookup the cell record */
257 if (cellname || !params->cell) {
258 cell = afs_cell_lookup(cellname, cellnamesz);
259 if (IS_ERR(cell)) {
260 printk(KERN_ERR "kAFS: unable to lookup cell '%s'\n",
261 cellname ?: "");
262 return PTR_ERR(cell);
263 }
264 afs_put_cell(params->cell);
265 params->cell = cell;
208 } 266 }
209 267
210 ret = 0; 268 _debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
269 params->cell->name, params->cell,
270 params->volnamesz, params->volnamesz, params->volname,
271 suffix ?: "-", params->type, params->force ? " FORCE" : "");
211 272
212 error: 273 return 0;
213 _leave(" = %d", ret); 274}
214 return ret;
215} /* end afs_super_parse_options() */
216 275
217/*****************************************************************************/
218/* 276/*
219 * check a superblock to see if it's the one we're looking for 277 * check a superblock to see if it's the one we're looking for
220 */ 278 */
@@ -224,13 +282,12 @@ static int afs_test_super(struct super_block *sb, void *data)
224 struct afs_super_info *as = sb->s_fs_info; 282 struct afs_super_info *as = sb->s_fs_info;
225 283
226 return as->volume == params->volume; 284 return as->volume == params->volume;
227} /* end afs_test_super() */ 285}
228 286
229/*****************************************************************************/
230/* 287/*
231 * fill in the superblock 288 * fill in the superblock
232 */ 289 */
233static int afs_fill_super(struct super_block *sb, void *data, int silent) 290static int afs_fill_super(struct super_block *sb, void *data)
234{ 291{
235 struct afs_mount_params *params = data; 292 struct afs_mount_params *params = data;
236 struct afs_super_info *as = NULL; 293 struct afs_super_info *as = NULL;
@@ -239,7 +296,7 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
239 struct inode *inode = NULL; 296 struct inode *inode = NULL;
240 int ret; 297 int ret;
241 298
242 kenter(""); 299 _enter("");
243 300
244 /* allocate a superblock info record */ 301 /* allocate a superblock info record */
245 as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); 302 as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
@@ -262,9 +319,9 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
262 fid.vid = as->volume->vid; 319 fid.vid = as->volume->vid;
263 fid.vnode = 1; 320 fid.vnode = 1;
264 fid.unique = 1; 321 fid.unique = 1;
265 ret = afs_iget(sb, &fid, &inode); 322 inode = afs_iget(sb, params->key, &fid, NULL, NULL);
266 if (ret < 0) 323 if (IS_ERR(inode))
267 goto error; 324 goto error_inode;
268 325
269 ret = -ENOMEM; 326 ret = -ENOMEM;
270 root = d_alloc_root(inode); 327 root = d_alloc_root(inode);
@@ -273,24 +330,25 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
273 330
274 sb->s_root = root; 331 sb->s_root = root;
275 332
276 kleave(" = 0"); 333 _leave(" = 0");
277 return 0; 334 return 0;
278 335
279 error: 336error_inode:
337 ret = PTR_ERR(inode);
338 inode = NULL;
339error:
280 iput(inode); 340 iput(inode);
281 afs_put_volume(as->volume); 341 afs_put_volume(as->volume);
282 kfree(as); 342 kfree(as);
283 343
284 sb->s_fs_info = NULL; 344 sb->s_fs_info = NULL;
285 345
286 kleave(" = %d", ret); 346 _leave(" = %d", ret);
287 return ret; 347 return ret;
288} /* end afs_fill_super() */ 348}
289 349
290/*****************************************************************************/
291/* 350/*
292 * get an AFS superblock 351 * get an AFS superblock
293 * - TODO: don't use get_sb_nodev(), but rather call sget() directly
294 */ 352 */
295static int afs_get_sb(struct file_system_type *fs_type, 353static int afs_get_sb(struct file_system_type *fs_type,
296 int flags, 354 int flags,
@@ -300,69 +358,79 @@ static int afs_get_sb(struct file_system_type *fs_type,
300{ 358{
301 struct afs_mount_params params; 359 struct afs_mount_params params;
302 struct super_block *sb; 360 struct super_block *sb;
361 struct afs_volume *vol;
362 struct key *key;
303 int ret; 363 int ret;
304 364
305 _enter(",,%s,%p", dev_name, options); 365 _enter(",,%s,%p", dev_name, options);
306 366
307 memset(&params, 0, sizeof(params)); 367 memset(&params, 0, sizeof(params));
308 368
309 /* start the cache manager */ 369 /* parse the options and device name */
310 ret = afscm_start();
311 if (ret < 0) {
312 _leave(" = %d", ret);
313 return ret;
314 }
315
316 /* parse the options */
317 if (options) { 370 if (options) {
318 ret = afs_super_parse_options(&params, options, &dev_name); 371 ret = afs_parse_options(&params, options, &dev_name);
319 if (ret < 0) 372 if (ret < 0)
320 goto error; 373 goto error;
321 if (!dev_name) {
322 printk("kAFS: no volume name specified\n");
323 ret = -EINVAL;
324 goto error;
325 }
326 } 374 }
327 375
328 /* parse the device name */ 376 ret = afs_parse_device_name(&params, dev_name);
329 ret = afs_volume_lookup(dev_name,
330 params.default_cell,
331 params.rwpath,
332 &params.volume);
333 if (ret < 0) 377 if (ret < 0)
334 goto error; 378 goto error;
335 379
336 /* allocate a deviceless superblock */ 380 /* try and do the mount securely */
337 sb = sget(fs_type, afs_test_super, set_anon_super, &params); 381 key = afs_request_key(params.cell);
338 if (IS_ERR(sb)) 382 if (IS_ERR(key)) {
383 _leave(" = %ld [key]", PTR_ERR(key));
384 ret = PTR_ERR(key);
339 goto error; 385 goto error;
386 }
387 params.key = key;
340 388
341 sb->s_flags = flags; 389 /* parse the device name */
390 vol = afs_volume_lookup(&params);
391 if (IS_ERR(vol)) {
392 ret = PTR_ERR(vol);
393 goto error;
394 }
395 params.volume = vol;
342 396
343 ret = afs_fill_super(sb, &params, flags & MS_SILENT ? 1 : 0); 397 /* allocate a deviceless superblock */
344 if (ret < 0) { 398 sb = sget(fs_type, afs_test_super, set_anon_super, &params);
345 up_write(&sb->s_umount); 399 if (IS_ERR(sb)) {
346 deactivate_super(sb); 400 ret = PTR_ERR(sb);
347 goto error; 401 goto error;
348 } 402 }
349 sb->s_flags |= MS_ACTIVE;
350 simple_set_mnt(mnt, sb);
351 403
404 if (!sb->s_root) {
405 /* initial superblock/root creation */
406 _debug("create");
407 sb->s_flags = flags;
408 ret = afs_fill_super(sb, &params);
409 if (ret < 0) {
410 up_write(&sb->s_umount);
411 deactivate_super(sb);
412 goto error;
413 }
414 sb->s_flags |= MS_ACTIVE;
415 } else {
416 _debug("reuse");
417 ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
418 }
419
420 simple_set_mnt(mnt, sb);
352 afs_put_volume(params.volume); 421 afs_put_volume(params.volume);
353 afs_put_cell(params.default_cell); 422 afs_put_cell(params.cell);
354 _leave(" = 0 [%p]", 0, sb); 423 _leave(" = 0 [%p]", sb);
355 return 0; 424 return 0;
356 425
357 error: 426error:
358 afs_put_volume(params.volume); 427 afs_put_volume(params.volume);
359 afs_put_cell(params.default_cell); 428 afs_put_cell(params.cell);
360 afscm_stop(); 429 key_put(params.key);
361 _leave(" = %d", ret); 430 _leave(" = %d", ret);
362 return ret; 431 return ret;
363} /* end afs_get_sb() */ 432}
364 433
365/*****************************************************************************/
366/* 434/*
367 * finish the unmounting process on the superblock 435 * finish the unmounting process on the superblock
368 */ 436 */
@@ -373,35 +441,29 @@ static void afs_put_super(struct super_block *sb)
373 _enter(""); 441 _enter("");
374 442
375 afs_put_volume(as->volume); 443 afs_put_volume(as->volume);
376 afscm_stop();
377 444
378 _leave(""); 445 _leave("");
379} /* end afs_put_super() */ 446}
380 447
381/*****************************************************************************/
382/* 448/*
383 * initialise an inode cache slab element prior to any use 449 * initialise an inode cache slab element prior to any use
384 */ 450 */
385static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep, 451static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep,
386 unsigned long flags) 452 unsigned long flags)
387{ 453{
388 struct afs_vnode *vnode = (struct afs_vnode *) _vnode; 454 struct afs_vnode *vnode = _vnode;
389 455
390 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 456 if (flags & SLAB_CTOR_CONSTRUCTOR) {
391 SLAB_CTOR_CONSTRUCTOR) {
392 memset(vnode, 0, sizeof(*vnode)); 457 memset(vnode, 0, sizeof(*vnode));
393 inode_init_once(&vnode->vfs_inode); 458 inode_init_once(&vnode->vfs_inode);
394 init_waitqueue_head(&vnode->update_waitq); 459 init_waitqueue_head(&vnode->update_waitq);
460 mutex_init(&vnode->permits_lock);
461 mutex_init(&vnode->validate_lock);
395 spin_lock_init(&vnode->lock); 462 spin_lock_init(&vnode->lock);
396 INIT_LIST_HEAD(&vnode->cb_link); 463 INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
397 INIT_LIST_HEAD(&vnode->cb_hash_link);
398 afs_timer_init(&vnode->cb_timeout,
399 &afs_vnode_cb_timed_out_ops);
400 } 464 }
465}
401 466
402} /* end afs_i_init_once() */
403
404/*****************************************************************************/
405/* 467/*
406 * allocate an AFS inode struct from our slab cache 468 * allocate an AFS inode struct from our slab cache
407 */ 469 */
@@ -409,8 +471,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
409{ 471{
410 struct afs_vnode *vnode; 472 struct afs_vnode *vnode;
411 473
412 vnode = (struct afs_vnode *) 474 vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
413 kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
414 if (!vnode) 475 if (!vnode)
415 return NULL; 476 return NULL;
416 477
@@ -421,21 +482,25 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
421 482
422 vnode->volume = NULL; 483 vnode->volume = NULL;
423 vnode->update_cnt = 0; 484 vnode->update_cnt = 0;
424 vnode->flags = 0; 485 vnode->flags = 1 << AFS_VNODE_UNSET;
486 vnode->cb_promised = false;
425 487
426 return &vnode->vfs_inode; 488 return &vnode->vfs_inode;
427} /* end afs_alloc_inode() */ 489}
428 490
429/*****************************************************************************/
430/* 491/*
431 * destroy an AFS inode struct 492 * destroy an AFS inode struct
432 */ 493 */
433static void afs_destroy_inode(struct inode *inode) 494static void afs_destroy_inode(struct inode *inode)
434{ 495{
496 struct afs_vnode *vnode = AFS_FS_I(inode);
497
435 _enter("{%lu}", inode->i_ino); 498 _enter("{%lu}", inode->i_ino);
436 499
437 kmem_cache_free(afs_inode_cachep, AFS_FS_I(inode)); 500 _debug("DESTROY INODE %p", inode);
438 501
439 atomic_dec(&afs_count_active_inodes); 502 ASSERTCMP(vnode->server, ==, NULL);
440 503
441} /* end afs_destroy_inode() */ 504 kmem_cache_free(afs_inode_cachep, vnode);
505 atomic_dec(&afs_count_active_inodes);
506}
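
The device-name grammar documented in afs_parse_device_name() above is compact but fiddly. A self-contained parser following the same rules, runnable in userspace, shows how a name decomposes ("example.com" and "root.cell" are illustrative; output buffers are assumed to be 64 bytes):

#include <stdio.h>
#include <string.h>

enum voltype { RWVOL, ROVOL, BACKVOL };

/* standalone restatement of the grammar: '%' forces R/W, '#' prefers
 * R/O unless rwpath is set; optional "cell:" prefix; an optional
 * ".readonly"/".backup" suffix overrides the type, and a bare trailing
 * dot is stripped */
static int parse_device_name(const char *name, int rwpath,
			     char *cell, char *vol, enum voltype *type)
{
	const char *colon, *suffix;
	size_t vlen;

	if ((name[0] != '%' && name[0] != '#') || !name[1])
		return -1;
	*type = (rwpath || name[0] == '%') ? RWVOL : ROVOL;
	name++;

	cell[0] = '\0';
	colon = strchr(name, ':');
	if (colon) {
		snprintf(cell, 64, "%.*s", (int)(colon - name), name);
		name = colon + 1;
	}

	vlen = strlen(name);
	suffix = strrchr(name, '.');
	if (suffix) {
		if (strcmp(suffix, ".readonly") == 0)
			*type = ROVOL;
		else if (strcmp(suffix, ".backup") == 0)
			*type = BACKVOL;
		else if (suffix[1] != '\0')
			suffix = NULL;  /* the dot is part of the name */
		if (suffix)
			vlen = suffix - name;
	}
	snprintf(vol, 64, "%.*s", (int)vlen, name);
	return 0;
}

int main(void)
{
	char cell[64], vol[64];
	enum voltype type;

	parse_device_name("#example.com:root.cell.readonly", 0,
			  cell, vol, &type);
	printf("cell=%s vol=%s type=%d\n", cell, vol, type);
	/* prints: cell=example.com vol=root.cell type=1 (ROVOL) */
	return 0;
}
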
diff --git a/fs/afs/super.h b/fs/afs/super.h
deleted file mode 100644
index 32de8cc6fae8..000000000000
--- a/fs/afs/super.h
+++ /dev/null
@@ -1,45 +0,0 @@
1/* super.h: AFS filesystem internal private data
2 *
3 * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
4 *
5 * This software may be freely redistributed under the terms of the
6 * GNU General Public License.
7 *
8 * You should have received a copy of the GNU General Public License
9 * along with this program; if not, write to the Free Software
10 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
11 *
12 * Authors: David Woodhouse <dwmw2@cambridge.redhat.com>
13 * David Howells <dhowells@redhat.com>
14 *
15 */
16
17#ifndef _LINUX_AFS_SUPER_H
18#define _LINUX_AFS_SUPER_H
19
20#include <linux/fs.h>
21#include "server.h"
22
23#ifdef __KERNEL__
24
25/*****************************************************************************/
26/*
27 * AFS superblock private data
28 * - there's one superblock per volume
29 */
30struct afs_super_info
31{
32 struct afs_volume *volume; /* volume record */
33 char rwparent; /* T if parent is R/W AFS volume */
34};
35
36static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
37{
38 return sb->s_fs_info;
39}
40
41extern struct file_system_type afs_fs_type;
42
43#endif /* __KERNEL__ */
44
45#endif /* _LINUX_AFS_SUPER_H */
diff --git a/fs/afs/transport.h b/fs/afs/transport.h
deleted file mode 100644
index 7013ae6ccc8c..000000000000
--- a/fs/afs/transport.h
+++ /dev/null
@@ -1,21 +0,0 @@
1/* transport.h: AFS transport management
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_TRANSPORT_H
13#define _LINUX_AFS_TRANSPORT_H
14
15#include "types.h"
16#include <rxrpc/transport.h>
17
18/* the cache manager transport endpoint */
19extern struct rxrpc_transport *afs_transport;
20
21#endif /* _LINUX_AFS_TRANSPORT_H */
diff --git a/fs/afs/types.h b/fs/afs/types.h
deleted file mode 100644
index b1a2367c7587..000000000000
--- a/fs/afs/types.h
+++ /dev/null
@@ -1,125 +0,0 @@
1/* types.h: AFS types
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_TYPES_H
13#define _LINUX_AFS_TYPES_H
14
15#ifdef __KERNEL__
16#include <rxrpc/types.h>
17#endif /* __KERNEL__ */
18
19typedef unsigned afs_volid_t;
20typedef unsigned afs_vnodeid_t;
21typedef unsigned long long afs_dataversion_t;
22
23typedef enum {
24 AFSVL_RWVOL, /* read/write volume */
25 AFSVL_ROVOL, /* read-only volume */
26 AFSVL_BACKVOL, /* backup volume */
27} __attribute__((packed)) afs_voltype_t;
28
29typedef enum {
30 AFS_FTYPE_INVALID = 0,
31 AFS_FTYPE_FILE = 1,
32 AFS_FTYPE_DIR = 2,
33 AFS_FTYPE_SYMLINK = 3,
34} afs_file_type_t;
35
36#ifdef __KERNEL__
37
38struct afs_cell;
39struct afs_vnode;
40
41/*****************************************************************************/
42/*
43 * AFS file identifier
44 */
45struct afs_fid
46{
47 afs_volid_t vid; /* volume ID */
48 afs_vnodeid_t vnode; /* file index within volume */
49 unsigned unique; /* unique ID number (file index version) */
50};
51
52/*****************************************************************************/
53/*
54 * AFS callback notification
55 */
56typedef enum {
57 AFSCM_CB_UNTYPED = 0, /* no type set on CB break */
58 AFSCM_CB_EXCLUSIVE = 1, /* CB exclusive to CM [not implemented] */
59 AFSCM_CB_SHARED = 2, /* CB shared by other CM's */
60 AFSCM_CB_DROPPED = 3, /* CB promise cancelled by file server */
61} afs_callback_type_t;
62
63struct afs_callback
64{
65 struct afs_server *server; /* server that made the promise */
66 struct afs_fid fid; /* file identifier */
67 unsigned version; /* callback version */
68 unsigned expiry; /* time at which expires */
69 afs_callback_type_t type; /* type of callback */
70};
71
72#define AFSCBMAX 50
73
74/*****************************************************************************/
75/*
76 * AFS volume information
77 */
78struct afs_volume_info
79{
80 afs_volid_t vid; /* volume ID */
81 afs_voltype_t type; /* type of this volume */
82 afs_volid_t type_vids[5]; /* volume ID's for possible types for this vol */
83
84 /* list of fileservers serving this volume */
85 size_t nservers; /* number of entries used in servers[] */
86 struct {
87 struct in_addr addr; /* fileserver address */
88 } servers[8];
89};
90
91/*****************************************************************************/
92/*
93 * AFS file status information
94 */
95struct afs_file_status
96{
97 unsigned if_version; /* interface version */
98#define AFS_FSTATUS_VERSION 1
99
100 afs_file_type_t type; /* file type */
101 unsigned nlink; /* link count */
102 size_t size; /* file size */
103 afs_dataversion_t version; /* current data version */
104 unsigned author; /* author ID */
105 unsigned owner; /* owner ID */
106 unsigned caller_access; /* access rights for authenticated caller */
107 unsigned anon_access; /* access rights for unauthenticated caller */
108 umode_t mode; /* UNIX mode */
109 struct afs_fid parent; /* parent file ID */
110 time_t mtime_client; /* last time client changed data */
111 time_t mtime_server; /* last time server changed data */
112};
113
114/*****************************************************************************/
115/*
116 * AFS volume synchronisation information
117 */
118struct afs_volsync
119{
120 time_t creation; /* volume creation time */
121};
122
123#endif /* __KERNEL__ */
124
125#endif /* _LINUX_AFS_TYPES_H */
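
The load-bearing type in this deleted header is struct afs_fid: a volume ID, a vnode index within the volume, and a uniquifier so that a new file reusing a freed vnode slot is distinguishable from the old occupant. A small sketch of the triple used as an ordered lookup key (the comparison order is an assumption for illustration, not AFS-mandated):

#include <stdio.h>

typedef unsigned afs_volid_t;
typedef unsigned afs_vnodeid_t;

struct afs_fid {
        afs_volid_t   vid;              /* volume ID */
        afs_vnodeid_t vnode;            /* file index within volume */
        unsigned      unique;           /* bumped when a vnode slot is reused */
};

static int afs_fid_cmp(const struct afs_fid *a, const struct afs_fid *b)
{
        if (a->vid != b->vid)           return a->vid < b->vid ? -1 : 1;
        if (a->vnode != b->vnode)       return a->vnode < b->vnode ? -1 : 1;
        if (a->unique != b->unique)     return a->unique < b->unique ? -1 : 1;
        return 0;
}

int main(void)
{
        struct afs_fid old_file = { 0x2000a, 42, 1 };
        struct afs_fid new_file = { 0x2000a, 42, 2 };   /* same slot, later file */
        printf("%d\n", afs_fid_cmp(&old_file, &new_file));      /* -1 */
        return 0;
}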
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 7b0e3192ee39..36c1306e09e0 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -1,4 +1,4 @@
1/* vlclient.c: AFS Volume Location Service client 1/* AFS Volume Location Service client
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
@@ -11,247 +11,76 @@
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <rxrpc/rxrpc.h>
15#include <rxrpc/transport.h>
16#include <rxrpc/connection.h>
17#include <rxrpc/call.h>
18#include "server.h"
19#include "volume.h"
20#include "vlclient.h"
21#include "kafsasyncd.h"
22#include "kafstimod.h"
23#include "errors.h"
24#include "internal.h" 14#include "internal.h"
25 15
26#define VLGETENTRYBYID 503 /* AFS Get Cache Entry By ID operation ID */
27#define VLGETENTRYBYNAME 504 /* AFS Get Cache Entry By Name operation ID */
28#define VLPROBE 514 /* AFS Probe Volume Location Service operation ID */
29
30static void afs_rxvl_get_entry_by_id_attn(struct rxrpc_call *call);
31static void afs_rxvl_get_entry_by_id_error(struct rxrpc_call *call);
32
33/*****************************************************************************/
34/* 16/*
35 * map afs VL abort codes to/from Linux error codes 17 * map volume locator abort codes to error codes
36 * - called with call->lock held
37 */ 18 */
38static void afs_rxvl_aemap(struct rxrpc_call *call) 19static int afs_vl_abort_to_error(u32 abort_code)
39{ 20{
40 int err; 21 _enter("%u", abort_code);
41 22
42 _enter("{%u,%u,%d}", 23 switch (abort_code) {
43 call->app_err_state, call->app_abort_code, call->app_errno); 24 case AFSVL_IDEXIST: return -EEXIST;
44 25 case AFSVL_IO: return -EREMOTEIO;
45 switch (call->app_err_state) { 26 case AFSVL_NAMEEXIST: return -EEXIST;
46 case RXRPC_ESTATE_LOCAL_ABORT: 27 case AFSVL_CREATEFAIL: return -EREMOTEIO;
47 call->app_abort_code = -call->app_errno; 28 case AFSVL_NOENT: return -ENOMEDIUM;
48 return; 29 case AFSVL_EMPTY: return -ENOMEDIUM;
49 30 case AFSVL_ENTDELETED: return -ENOMEDIUM;
50 case RXRPC_ESTATE_PEER_ABORT: 31 case AFSVL_BADNAME: return -EINVAL;
51 switch (call->app_abort_code) { 32 case AFSVL_BADINDEX: return -EINVAL;
52 case AFSVL_IDEXIST: err = -EEXIST; break; 33 case AFSVL_BADVOLTYPE: return -EINVAL;
53 case AFSVL_IO: err = -EREMOTEIO; break; 34 case AFSVL_BADSERVER: return -EINVAL;
54 case AFSVL_NAMEEXIST: err = -EEXIST; break; 35 case AFSVL_BADPARTITION: return -EINVAL;
55 case AFSVL_CREATEFAIL: err = -EREMOTEIO; break; 36 case AFSVL_REPSFULL: return -EFBIG;
56 case AFSVL_NOENT: err = -ENOMEDIUM; break; 37 case AFSVL_NOREPSERVER: return -ENOENT;
57 case AFSVL_EMPTY: err = -ENOMEDIUM; break; 38 case AFSVL_DUPREPSERVER: return -EEXIST;
58 case AFSVL_ENTDELETED: err = -ENOMEDIUM; break; 39 case AFSVL_RWNOTFOUND: return -ENOENT;
59 case AFSVL_BADNAME: err = -EINVAL; break; 40 case AFSVL_BADREFCOUNT: return -EINVAL;
60 case AFSVL_BADINDEX: err = -EINVAL; break; 41 case AFSVL_SIZEEXCEEDED: return -EINVAL;
61 case AFSVL_BADVOLTYPE: err = -EINVAL; break; 42 case AFSVL_BADENTRY: return -EINVAL;
62 case AFSVL_BADSERVER: err = -EINVAL; break; 43 case AFSVL_BADVOLIDBUMP: return -EINVAL;
63 case AFSVL_BADPARTITION: err = -EINVAL; break; 44 case AFSVL_IDALREADYHASHED: return -EINVAL;
64 case AFSVL_REPSFULL: err = -EFBIG; break; 45 case AFSVL_ENTRYLOCKED: return -EBUSY;
65 case AFSVL_NOREPSERVER: err = -ENOENT; break; 46 case AFSVL_BADVOLOPER: return -EBADRQC;
66 case AFSVL_DUPREPSERVER: err = -EEXIST; break; 47 case AFSVL_BADRELLOCKTYPE: return -EINVAL;
67 case AFSVL_RWNOTFOUND: err = -ENOENT; break; 48 case AFSVL_RERELEASE: return -EREMOTEIO;
68 case AFSVL_BADREFCOUNT: err = -EINVAL; break; 49 case AFSVL_BADSERVERFLAG: return -EINVAL;
69 case AFSVL_SIZEEXCEEDED: err = -EINVAL; break; 50 case AFSVL_PERM: return -EACCES;
70 case AFSVL_BADENTRY: err = -EINVAL; break; 51 case AFSVL_NOMEM: return -EREMOTEIO;
71 case AFSVL_BADVOLIDBUMP: err = -EINVAL; break;
72 case AFSVL_IDALREADYHASHED: err = -EINVAL; break;
73 case AFSVL_ENTRYLOCKED: err = -EBUSY; break;
74 case AFSVL_BADVOLOPER: err = -EBADRQC; break;
75 case AFSVL_BADRELLOCKTYPE: err = -EINVAL; break;
76 case AFSVL_RERELEASE: err = -EREMOTEIO; break;
77 case AFSVL_BADSERVERFLAG: err = -EINVAL; break;
78 case AFSVL_PERM: err = -EACCES; break;
79 case AFSVL_NOMEM: err = -EREMOTEIO; break;
80 default:
81 err = afs_abort_to_error(call->app_abort_code);
82 break;
83 }
84 call->app_errno = err;
85 return;
86
87 default: 52 default:
88 return; 53 return afs_abort_to_error(abort_code);
89 } 54 }
90} /* end afs_rxvl_aemap() */ 55}
91 56
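
The replacement above turns the old per-call error remapper into a pure function of the abort code, which each afs_call_type can then reference directly. The dense switch compiles to good code, but the same mapping can also be expressed as data, which makes the abort-to-errno table easier to audit; a hedged sketch of that alternative using Linux errno values (only a few entries shown, and the fallback is a constant here where the kernel defers to afs_abort_to_error()):

#include <errno.h>
#include <stddef.h>

/* a few VL abort codes; the kernel's AFSVL_* constants start at 363520 */
enum {
        VL_IDEXIST = 363520,
        VL_IO,
        VL_NAMEEXIST,
        VL_CREATEFAIL,
        VL_NOENT,
};

static const struct { unsigned abort_code; int error; } vl_errors[] = {
        { VL_IDEXIST,    -EEXIST },
        { VL_IO,         -EREMOTEIO },
        { VL_NAMEEXIST,  -EEXIST },
        { VL_CREATEFAIL, -EREMOTEIO },
        { VL_NOENT,      -ENOMEDIUM },
};

static int vl_abort_to_error(unsigned abort_code)
{
        for (size_t i = 0; i < sizeof(vl_errors) / sizeof(vl_errors[0]); i++)
                if (vl_errors[i].abort_code == abort_code)
                        return vl_errors[i].error;
        return -EREMOTEIO;              /* simplified fallback */
}

int main(void)
{
        return vl_abort_to_error(VL_NOENT) == -ENOMEDIUM ? 0 : 1;
}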
92#if 0
93/*****************************************************************************/
94/* 57/*
95 * probe a volume location server to see if it is still alive -- unused 58 * deliver reply data to a VL.GetEntryByXXX call
96 */ 59 */
97static int afs_rxvl_probe(struct afs_server *server, int alloc_flags) 60static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call,
61 struct sk_buff *skb, bool last)
98{ 62{
99 struct rxrpc_connection *conn; 63 struct afs_cache_vlocation *entry;
100 struct rxrpc_call *call; 64 __be32 *bp;
101 struct kvec piov[1]; 65 u32 tmp;
102 size_t sent; 66 int loop;
103 int ret;
104 __be32 param[1];
105
106 DECLARE_WAITQUEUE(myself, current);
107
108 /* get hold of the vlserver connection */
109 ret = afs_server_get_vlconn(server, &conn);
110 if (ret < 0)
111 goto out;
112
113 /* create a call through that connection */
114 ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
115 if (ret < 0) {
116 printk("kAFS: Unable to create call: %d\n", ret);
117 goto out_put_conn;
118 }
119 call->app_opcode = VLPROBE;
120
121 /* we want to get event notifications from the call */
122 add_wait_queue(&call->waitq, &myself);
123
124 /* marshall the parameters */
125 param[0] = htonl(VLPROBE);
126 piov[0].iov_len = sizeof(param);
127 piov[0].iov_base = param;
128
129 /* send the parameters to the server */
130 ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET,
131 alloc_flags, 0, &sent);
132 if (ret < 0)
133 goto abort;
134
135 /* wait for the reply to completely arrive */
136 for (;;) {
137 set_current_state(TASK_INTERRUPTIBLE);
138 if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
139 signal_pending(current))
140 break;
141 schedule();
142 }
143 set_current_state(TASK_RUNNING);
144
145 ret = -EINTR;
146 if (signal_pending(current))
147 goto abort;
148
149 switch (call->app_call_state) {
150 case RXRPC_CSTATE_ERROR:
151 ret = call->app_errno;
152 goto out_unwait;
153
154 case RXRPC_CSTATE_CLNT_GOT_REPLY:
155 ret = 0;
156 goto out_unwait;
157
158 default:
159 BUG();
160 }
161
162 abort:
163 set_current_state(TASK_UNINTERRUPTIBLE);
164 rxrpc_call_abort(call, ret);
165 schedule();
166
167 out_unwait:
168 set_current_state(TASK_RUNNING);
169 remove_wait_queue(&call->waitq, &myself);
170 rxrpc_put_call(call);
171 out_put_conn:
172 rxrpc_put_connection(conn);
173 out:
174 return ret;
175 67
176} /* end afs_rxvl_probe() */ 68 _enter(",,%u", last);
177#endif
178 69
179/*****************************************************************************/ 70 afs_transfer_reply(call, skb);
180/* 71 if (!last)
181 * look up a volume location database entry by name 72 return 0;
182 */
183int afs_rxvl_get_entry_by_name(struct afs_server *server,
184 const char *volname,
185 unsigned volnamesz,
186 struct afs_cache_vlocation *entry)
187{
188 DECLARE_WAITQUEUE(myself, current);
189
190 struct rxrpc_connection *conn;
191 struct rxrpc_call *call;
192 struct kvec piov[3];
193 unsigned tmp;
194 size_t sent;
195 int ret, loop;
196 __be32 *bp, param[2], zero;
197
198 _enter(",%*.*s,%u,", volnamesz, volnamesz, volname, volnamesz);
199
200 memset(entry, 0, sizeof(*entry));
201
202 /* get hold of the vlserver connection */
203 ret = afs_server_get_vlconn(server, &conn);
204 if (ret < 0)
205 goto out;
206
207 /* create a call through that connection */
208 ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
209 if (ret < 0) {
210 printk("kAFS: Unable to create call: %d\n", ret);
211 goto out_put_conn;
212 }
213 call->app_opcode = VLGETENTRYBYNAME;
214 73
215 /* we want to get event notifications from the call */ 74 if (call->reply_size != call->reply_max)
216 add_wait_queue(&call->waitq, &myself); 75 return -EBADMSG;
217 76
218 /* marshall the parameters */ 77 /* unmarshall the reply once we've received all of it */
219 piov[1].iov_len = volnamesz; 78 entry = call->reply;
220 piov[1].iov_base = (char *) volname; 79 bp = call->buffer;
221
222 zero = 0;
223 piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
224 piov[2].iov_base = &zero;
225
226 param[0] = htonl(VLGETENTRYBYNAME);
227 param[1] = htonl(piov[1].iov_len);
228
229 piov[0].iov_len = sizeof(param);
230 piov[0].iov_base = param;
231
232 /* send the parameters to the server */
233 ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
234 0, &sent);
235 if (ret < 0)
236 goto abort;
237
238 /* wait for the reply to completely arrive */
239 bp = rxrpc_call_alloc_scratch(call, 384);
240
241 ret = rxrpc_call_read_data(call, bp, 384,
242 RXRPC_CALL_READ_BLOCK |
243 RXRPC_CALL_READ_ALL);
244 if (ret < 0) {
245 if (ret == -ECONNABORTED) {
246 ret = call->app_errno;
247 goto out_unwait;
248 }
249 goto abort;
250 }
251 80
252 /* unmarshall the reply */
253 for (loop = 0; loop < 64; loop++) 81 for (loop = 0; loop < 64; loop++)
254 entry->name[loop] = ntohl(*bp++); 82 entry->name[loop] = ntohl(*bp++);
83 entry->name[loop] = 0;
255 bp++; /* final NUL */ 84 bp++; /* final NUL */
256 85
257 bp++; /* type */ 86 bp++; /* type */
@@ -264,6 +93,7 @@ int afs_rxvl_get_entry_by_name(struct afs_server *server,
264 93
265 for (loop = 0; loop < 8; loop++) { 94 for (loop = 0; loop < 8; loop++) {
266 tmp = ntohl(*bp++); 95 tmp = ntohl(*bp++);
96 entry->srvtmask[loop] = 0;
267 if (tmp & AFS_VLSF_RWVOL) 97 if (tmp & AFS_VLSF_RWVOL)
268 entry->srvtmask[loop] |= AFS_VOL_VTM_RW; 98 entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
269 if (tmp & AFS_VLSF_ROVOL) 99 if (tmp & AFS_VLSF_ROVOL)
@@ -279,417 +109,110 @@ int afs_rxvl_get_entry_by_name(struct afs_server *server,
279 bp++; /* clone ID */ 109 bp++; /* clone ID */
280 110
281 tmp = ntohl(*bp++); /* flags */ 111 tmp = ntohl(*bp++); /* flags */
112 entry->vidmask = 0;
282 if (tmp & AFS_VLF_RWEXISTS) 113 if (tmp & AFS_VLF_RWEXISTS)
283 entry->vidmask |= AFS_VOL_VTM_RW; 114 entry->vidmask |= AFS_VOL_VTM_RW;
284 if (tmp & AFS_VLF_ROEXISTS) 115 if (tmp & AFS_VLF_ROEXISTS)
285 entry->vidmask |= AFS_VOL_VTM_RO; 116 entry->vidmask |= AFS_VOL_VTM_RO;
286 if (tmp & AFS_VLF_BACKEXISTS) 117 if (tmp & AFS_VLF_BACKEXISTS)
287 entry->vidmask |= AFS_VOL_VTM_BAK; 118 entry->vidmask |= AFS_VOL_VTM_BAK;
288
289 ret = -ENOMEDIUM;
290 if (!entry->vidmask) 119 if (!entry->vidmask)
291 goto abort; 120 return -EBADMSG;
292
293 /* success */
294 entry->rtime = get_seconds();
295 ret = 0;
296
297 out_unwait:
298 set_current_state(TASK_RUNNING);
299 remove_wait_queue(&call->waitq, &myself);
300 rxrpc_put_call(call);
301 out_put_conn:
302 rxrpc_put_connection(conn);
303 out:
304 _leave(" = %d", ret);
305 return ret;
306
307 abort:
308 set_current_state(TASK_UNINTERRUPTIBLE);
309 rxrpc_call_abort(call, ret);
310 schedule();
311 goto out_unwait;
312} /* end afs_rxvl_get_entry_by_name() */
313
314/*****************************************************************************/
315/*
316 * look up a volume location database entry by ID
317 */
318int afs_rxvl_get_entry_by_id(struct afs_server *server,
319 afs_volid_t volid,
320 afs_voltype_t voltype,
321 struct afs_cache_vlocation *entry)
322{
323 DECLARE_WAITQUEUE(myself, current);
324
325 struct rxrpc_connection *conn;
326 struct rxrpc_call *call;
327 struct kvec piov[1];
328 unsigned tmp;
329 size_t sent;
330 int ret, loop;
331 __be32 *bp, param[3];
332
333 _enter(",%x,%d,", volid, voltype);
334
335 memset(entry, 0, sizeof(*entry));
336
337 /* get hold of the vlserver connection */
338 ret = afs_server_get_vlconn(server, &conn);
339 if (ret < 0)
340 goto out;
341
342 /* create a call through that connection */
343 ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
344 if (ret < 0) {
345 printk("kAFS: Unable to create call: %d\n", ret);
346 goto out_put_conn;
347 }
348 call->app_opcode = VLGETENTRYBYID;
349
350 /* we want to get event notifications from the call */
351 add_wait_queue(&call->waitq, &myself);
352
353 /* marshall the parameters */
354 param[0] = htonl(VLGETENTRYBYID);
355 param[1] = htonl(volid);
356 param[2] = htonl(voltype);
357
358 piov[0].iov_len = sizeof(param);
359 piov[0].iov_base = param;
360
361 /* send the parameters to the server */
362 ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
363 0, &sent);
364 if (ret < 0)
365 goto abort;
366
367 /* wait for the reply to completely arrive */
368 bp = rxrpc_call_alloc_scratch(call, 384);
369
370 ret = rxrpc_call_read_data(call, bp, 384,
371 RXRPC_CALL_READ_BLOCK |
372 RXRPC_CALL_READ_ALL);
373 if (ret < 0) {
374 if (ret == -ECONNABORTED) {
375 ret = call->app_errno;
376 goto out_unwait;
377 }
378 goto abort;
379 }
380
381 /* unmarshall the reply */
382 for (loop = 0; loop < 64; loop++)
383 entry->name[loop] = ntohl(*bp++);
384 bp++; /* final NUL */
385 121
386 bp++; /* type */ 122 _leave(" = 0 [done]");
387 entry->nservers = ntohl(*bp++); 123 return 0;
388 124}
389 for (loop = 0; loop < 8; loop++)
390 entry->servers[loop].s_addr = *bp++;
391
392 bp += 8; /* partition IDs */
393 125
394 for (loop = 0; loop < 8; loop++) {
395 tmp = ntohl(*bp++);
396 if (tmp & AFS_VLSF_RWVOL)
397 entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
398 if (tmp & AFS_VLSF_ROVOL)
399 entry->srvtmask[loop] |= AFS_VOL_VTM_RO;
400 if (tmp & AFS_VLSF_BACKVOL)
401 entry->srvtmask[loop] |= AFS_VOL_VTM_BAK;
402 }
403
404 entry->vid[0] = ntohl(*bp++);
405 entry->vid[1] = ntohl(*bp++);
406 entry->vid[2] = ntohl(*bp++);
407
408 bp++; /* clone ID */
409
410 tmp = ntohl(*bp++); /* flags */
411 if (tmp & AFS_VLF_RWEXISTS)
412 entry->vidmask |= AFS_VOL_VTM_RW;
413 if (tmp & AFS_VLF_ROEXISTS)
414 entry->vidmask |= AFS_VOL_VTM_RO;
415 if (tmp & AFS_VLF_BACKEXISTS)
416 entry->vidmask |= AFS_VOL_VTM_BAK;
417
418 ret = -ENOMEDIUM;
419 if (!entry->vidmask)
420 goto abort;
421
422#if 0 /* TODO: remove */
423 entry->nservers = 3;
424 entry->servers[0].s_addr = htonl(0xac101249);
425 entry->servers[1].s_addr = htonl(0xac101243);
426 entry->servers[2].s_addr = htonl(0xac10125b /*0xac10125b*/);
427
428 entry->srvtmask[0] = AFS_VOL_VTM_RO;
429 entry->srvtmask[1] = AFS_VOL_VTM_RO;
430 entry->srvtmask[2] = AFS_VOL_VTM_RO | AFS_VOL_VTM_RW;
431#endif
432
433 /* success */
434 entry->rtime = get_seconds();
435 ret = 0;
436
437 out_unwait:
438 set_current_state(TASK_RUNNING);
439 remove_wait_queue(&call->waitq, &myself);
440 rxrpc_put_call(call);
441 out_put_conn:
442 rxrpc_put_connection(conn);
443 out:
444 _leave(" = %d", ret);
445 return ret;
446
447 abort:
448 set_current_state(TASK_UNINTERRUPTIBLE);
449 rxrpc_call_abort(call, ret);
450 schedule();
451 goto out_unwait;
452} /* end afs_rxvl_get_entry_by_id() */
453
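
Old and new code unmarshal the GetEntryByXXX reply the same way: the buffer is walked as a cursor of big-endian 32-bit words, each interesting field is pulled off with ntohl(), and uninteresting words are skipped by bumping the pointer. One subtlety visible in both versions: server addresses are copied without byte-swapping, because struct in_addr stays in network order. A userspace sketch of the cursor style (the three-word layout is illustrative, not the real VLDB wire format):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* pretend reply: nservers, one IPv4 address, flags (all big-endian) */
        uint32_t reply[3] = { htonl(1), htonl(0xc0a80001), htonl(0x2) };
        const uint32_t *bp = reply;

        unsigned nservers = ntohl(*bp++);       /* consume a word */
        uint32_t addr = *bp++;                  /* keep network order, like s_addr */
        unsigned flags = ntohl(*bp++);

        printf("nservers=%u addr=%#x flags=%#x\n",
               nservers, (unsigned) ntohl(addr), flags);
        return 0;
}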
454/*****************************************************************************/
455/* 126/*
456 * look up a volume location database entry by ID asynchronously 127 * VL.GetEntryByName operation type
457 */ 128 */
458int afs_rxvl_get_entry_by_id_async(struct afs_async_op *op, 129static const struct afs_call_type afs_RXVLGetEntryByName = {
459 afs_volid_t volid, 130 .name = "VL.GetEntryByName",
460 afs_voltype_t voltype) 131 .deliver = afs_deliver_vl_get_entry_by_xxx,
461{ 132 .abort_to_error = afs_vl_abort_to_error,
462 struct rxrpc_connection *conn; 133 .destructor = afs_flat_call_destructor,
463 struct rxrpc_call *call; 134};
464 struct kvec piov[1];
465 size_t sent;
466 int ret;
467 __be32 param[3];
468
469 _enter(",%x,%d,", volid, voltype);
470
471 /* get hold of the vlserver connection */
472 ret = afs_server_get_vlconn(op->server, &conn);
473 if (ret < 0) {
474 _leave(" = %d", ret);
475 return ret;
476 }
477
478 /* create a call through that connection */
479 ret = rxrpc_create_call(conn,
480 afs_rxvl_get_entry_by_id_attn,
481 afs_rxvl_get_entry_by_id_error,
482 afs_rxvl_aemap,
483 &op->call);
484 rxrpc_put_connection(conn);
485
486 if (ret < 0) {
487 printk("kAFS: Unable to create call: %d\n", ret);
488 _leave(" = %d", ret);
489 return ret;
490 }
491 135
492 op->call->app_opcode = VLGETENTRYBYID; 136/*
493 op->call->app_user = op; 137 * VL.GetEntryById operation type
494 138 */
495 call = op->call; 139static const struct afs_call_type afs_RXVLGetEntryById = {
496 rxrpc_get_call(call); 140 .name = "VL.GetEntryById",
497 141 .deliver = afs_deliver_vl_get_entry_by_xxx,
498 /* send event notifications from the call to kafsasyncd */ 142 .abort_to_error = afs_vl_abort_to_error,
499 afs_kafsasyncd_begin_op(op); 143 .destructor = afs_flat_call_destructor,
500 144};
501 /* marshall the parameters */
502 param[0] = htonl(VLGETENTRYBYID);
503 param[1] = htonl(volid);
504 param[2] = htonl(voltype);
505
506 piov[0].iov_len = sizeof(param);
507 piov[0].iov_base = param;
508
509 /* allocate result read buffer in scratch space */
510 call->app_scr_ptr = rxrpc_call_alloc_scratch(op->call, 384);
511
512 /* send the parameters to the server */
513 ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
514 0, &sent);
515 if (ret < 0) {
516 rxrpc_call_abort(call, ret); /* handle from kafsasyncd */
517 ret = 0;
518 goto out;
519 }
520
521 /* wait for the reply to completely arrive */
522 ret = rxrpc_call_read_data(call, call->app_scr_ptr, 384, 0);
523 switch (ret) {
524 case 0:
525 case -EAGAIN:
526 case -ECONNABORTED:
527 ret = 0;
528 break; /* all handled by kafsasyncd */
529
530 default:
531 rxrpc_call_abort(call, ret); /* make kafsasyncd handle it */
532 ret = 0;
533 break;
534 }
535
536 out:
537 rxrpc_put_call(call);
538 _leave(" = %d", ret);
539 return ret;
540
541} /* end afs_rxvl_get_entry_by_id_async() */
542 145
543/*****************************************************************************/
544/* 146/*
545 * attend to the asynchronous get VLDB entry by ID 147 * dispatch a get volume entry by name operation
546 */ 148 */
547int afs_rxvl_get_entry_by_id_async2(struct afs_async_op *op, 149int afs_vl_get_entry_by_name(struct in_addr *addr,
548 struct afs_cache_vlocation *entry) 150 struct key *key,
151 const char *volname,
152 struct afs_cache_vlocation *entry,
153 const struct afs_wait_mode *wait_mode)
549{ 154{
155 struct afs_call *call;
156 size_t volnamesz, reqsz, padsz;
550 __be32 *bp; 157 __be32 *bp;
551 __u32 tmp;
552 int loop, ret;
553
554 _enter("{op=%p cst=%u}", op, op->call->app_call_state);
555
556 memset(entry, 0, sizeof(*entry));
557
558 if (op->call->app_call_state == RXRPC_CSTATE_COMPLETE) {
559 /* operation finished */
560 afs_kafsasyncd_terminate_op(op);
561
562 bp = op->call->app_scr_ptr;
563
564 /* unmarshall the reply */
565 for (loop = 0; loop < 64; loop++)
566 entry->name[loop] = ntohl(*bp++);
567 bp++; /* final NUL */
568
569 bp++; /* type */
570 entry->nservers = ntohl(*bp++);
571
572 for (loop = 0; loop < 8; loop++)
573 entry->servers[loop].s_addr = *bp++;
574
575 bp += 8; /* partition IDs */
576
577 for (loop = 0; loop < 8; loop++) {
578 tmp = ntohl(*bp++);
579 if (tmp & AFS_VLSF_RWVOL)
580 entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
581 if (tmp & AFS_VLSF_ROVOL)
582 entry->srvtmask[loop] |= AFS_VOL_VTM_RO;
583 if (tmp & AFS_VLSF_BACKVOL)
584 entry->srvtmask[loop] |= AFS_VOL_VTM_BAK;
585 }
586
587 entry->vid[0] = ntohl(*bp++);
588 entry->vid[1] = ntohl(*bp++);
589 entry->vid[2] = ntohl(*bp++);
590
591 bp++; /* clone ID */
592
593 tmp = ntohl(*bp++); /* flags */
594 if (tmp & AFS_VLF_RWEXISTS)
595 entry->vidmask |= AFS_VOL_VTM_RW;
596 if (tmp & AFS_VLF_ROEXISTS)
597 entry->vidmask |= AFS_VOL_VTM_RO;
598 if (tmp & AFS_VLF_BACKEXISTS)
599 entry->vidmask |= AFS_VOL_VTM_BAK;
600
601 ret = -ENOMEDIUM;
602 if (!entry->vidmask) {
603 rxrpc_call_abort(op->call, ret);
604 goto done;
605 }
606
607#if 0 /* TODO: remove */
608 entry->nservers = 3;
609 entry->servers[0].s_addr = htonl(0xac101249);
610 entry->servers[1].s_addr = htonl(0xac101243);
611 entry->servers[2].s_addr = htonl(0xac10125b /*0xac10125b*/);
612
613 entry->srvtmask[0] = AFS_VOL_VTM_RO;
614 entry->srvtmask[1] = AFS_VOL_VTM_RO;
615 entry->srvtmask[2] = AFS_VOL_VTM_RO | AFS_VOL_VTM_RW;
616#endif
617
618 /* success */
619 entry->rtime = get_seconds();
620 ret = 0;
621 goto done;
622 }
623 158
624 if (op->call->app_call_state == RXRPC_CSTATE_ERROR) { 159 _enter("");
625 /* operation error */
626 ret = op->call->app_errno;
627 goto done;
628 }
629 160
630 _leave(" = -EAGAIN"); 161 volnamesz = strlen(volname);
631 return -EAGAIN; 162 padsz = (4 - (volnamesz & 3)) & 3;
163 reqsz = 8 + volnamesz + padsz;
632 164
633 done: 165 call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384);
634 rxrpc_put_call(op->call); 166 if (!call)
635 op->call = NULL; 167 return -ENOMEM;
636 _leave(" = %d", ret);
637 return ret;
638} /* end afs_rxvl_get_entry_by_id_async2() */
639 168
640/*****************************************************************************/ 169 call->key = key;
641/* 170 call->reply = entry;
642 * handle attention events on an async get-entry-by-ID op 171 call->service_id = VL_SERVICE;
643 * - called from krxiod 172 call->port = htons(AFS_VL_PORT);
644 */
645static void afs_rxvl_get_entry_by_id_attn(struct rxrpc_call *call)
646{
647 struct afs_async_op *op = call->app_user;
648
649 _enter("{op=%p cst=%u}", op, call->app_call_state);
650
651 switch (call->app_call_state) {
652 case RXRPC_CSTATE_COMPLETE:
653 afs_kafsasyncd_attend_op(op);
654 break;
655 case RXRPC_CSTATE_CLNT_RCV_REPLY:
656 if (call->app_async_read)
657 break;
658 case RXRPC_CSTATE_CLNT_GOT_REPLY:
659 if (call->app_read_count == 0)
660 break;
661 printk("kAFS: Reply bigger than expected"
662 " {cst=%u asyn=%d mark=%Zu rdy=%Zu pr=%u%s}",
663 call->app_call_state,
664 call->app_async_read,
665 call->app_mark,
666 call->app_ready_qty,
667 call->pkt_rcv_count,
668 call->app_last_rcv ? " last" : "");
669
670 rxrpc_call_abort(call, -EBADMSG);
671 break;
672 default:
673 BUG();
674 }
675 173
676 _leave(""); 174 /* marshall the parameters */
175 bp = call->request;
176 *bp++ = htonl(VLGETENTRYBYNAME);
177 *bp++ = htonl(volnamesz);
178 memcpy(bp, volname, volnamesz);
179 if (padsz > 0)
180 memset((void *) bp + volnamesz, 0, padsz);
677 181
678} /* end afs_rxvl_get_entry_by_id_attn() */ 182 /* initiate the call */
183 return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
184}
679 185
680/*****************************************************************************/
681/* 186/*
682 * handle error events on an async get-entry-by-ID op 187 * dispatch a get volume entry by ID operation
683 * - called from krxiod
684 */ 188 */
685static void afs_rxvl_get_entry_by_id_error(struct rxrpc_call *call) 189int afs_vl_get_entry_by_id(struct in_addr *addr,
190 struct key *key,
191 afs_volid_t volid,
192 afs_voltype_t voltype,
193 struct afs_cache_vlocation *entry,
194 const struct afs_wait_mode *wait_mode)
686{ 195{
687 struct afs_async_op *op = call->app_user; 196 struct afs_call *call;
197 __be32 *bp;
688 198
689 _enter("{op=%p cst=%u}", op, call->app_call_state); 199 _enter("");
690 200
691 afs_kafsasyncd_attend_op(op); 201 call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384);
202 if (!call)
203 return -ENOMEM;
692 204
693 _leave(""); 205 call->key = key;
206 call->reply = entry;
207 call->service_id = VL_SERVICE;
208 call->port = htons(AFS_VL_PORT);
694 209
695} /* end afs_rxvl_get_entry_by_id_error() */ 210 /* marshall the parameters */
211 bp = call->request;
212 *bp++ = htonl(VLGETENTRYBYID);
213 *bp++ = htonl(volid);
214 *bp = htonl(voltype);
215
216 /* initiate the call */
217 return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
218}
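
Both dispatchers marshal Rx/XDR-style: 32-bit big-endian words, with strings length-prefixed and zero-padded to a 4-byte boundary, which is where afs_vl_get_entry_by_name() gets reqsz = 8 + volnamesz + padsz and padsz = (4 - (volnamesz & 3)) & 3. A standalone check of that arithmetic:

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *names[] = { "root.cell", "root.afs", "a", "abcd" };

        for (int i = 0; i < 4; i++) {
                size_t len = strlen(names[i]);
                size_t pad = (4 - (len & 3)) & 3;       /* 0..3 zero bytes */
                printf("%-10s len=%zu pad=%zu reqsz=%zu\n",
                       names[i], len, pad, 8 + len + pad);
        }
        return 0;
}

The trailing & 3 is what keeps an already-aligned name ("root.afs", "abcd") from receiving four bytes of useless padding.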
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 782ee7c600ca..3370cdb72566 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -1,6 +1,6 @@
1/* vlocation.c: volume location management 1/* AFS volume location management
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -12,131 +12,61 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h>
17#include <linux/pagemap.h>
18#include "volume.h"
19#include "cell.h"
20#include "cmservice.h"
21#include "fsclient.h"
22#include "vlclient.h"
23#include "kafstimod.h"
24#include <rxrpc/connection.h>
25#include "internal.h" 15#include "internal.h"
26 16
27#define AFS_VLDB_TIMEOUT HZ*1000 17unsigned afs_vlocation_timeout = 10; /* volume location timeout in seconds */
18unsigned afs_vlocation_update_timeout = 10 * 60;
28 19
29static void afs_vlocation_update_timer(struct afs_timer *timer); 20static void afs_vlocation_reaper(struct work_struct *);
30static void afs_vlocation_update_attend(struct afs_async_op *op); 21static void afs_vlocation_updater(struct work_struct *);
31static void afs_vlocation_update_discard(struct afs_async_op *op);
32static void __afs_put_vlocation(struct afs_vlocation *vlocation);
33 22
34static void __afs_vlocation_timeout(struct afs_timer *timer) 23static LIST_HEAD(afs_vlocation_updates);
35{ 24static LIST_HEAD(afs_vlocation_graveyard);
36 struct afs_vlocation *vlocation = 25static DEFINE_SPINLOCK(afs_vlocation_updates_lock);
37 list_entry(timer, struct afs_vlocation, timeout); 26static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock);
38 27static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper);
39 _debug("VL TIMEOUT [%s{u=%d}]", 28static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater);
40 vlocation->vldb.name, atomic_read(&vlocation->usage)); 29static struct workqueue_struct *afs_vlocation_update_worker;
41
42 afs_vlocation_do_timeout(vlocation);
43}
44
45static const struct afs_timer_ops afs_vlocation_timer_ops = {
46 .timed_out = __afs_vlocation_timeout,
47};
48 30
49static const struct afs_timer_ops afs_vlocation_update_timer_ops = {
50 .timed_out = afs_vlocation_update_timer,
51};
52
53static const struct afs_async_op_ops afs_vlocation_update_op_ops = {
54 .attend = afs_vlocation_update_attend,
55 .discard = afs_vlocation_update_discard,
56};
57
58static LIST_HEAD(afs_vlocation_update_pendq); /* queue of VLs awaiting update */
59static struct afs_vlocation *afs_vlocation_update; /* VL currently being updated */
60static DEFINE_SPINLOCK(afs_vlocation_update_lock); /* lock guarding update queue */
61
62#ifdef AFS_CACHING_SUPPORT
63static cachefs_match_val_t afs_vlocation_cache_match(void *target,
64 const void *entry);
65static void afs_vlocation_cache_update(void *source, void *entry);
66
67struct cachefs_index_def afs_vlocation_cache_index_def = {
68 .name = "vldb",
69 .data_size = sizeof(struct afs_cache_vlocation),
70 .keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
71 .match = afs_vlocation_cache_match,
72 .update = afs_vlocation_cache_update,
73};
74#endif
75
76/*****************************************************************************/
77/* 31/*
78 * iterate through the VL servers in a cell until one of them admits knowing 32 * iterate through the VL servers in a cell until one of them admits knowing
79 * about the volume in question 33 * about the volume in question
80 * - caller must have cell->vl_sem write-locked
81 */ 34 */
82static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vlocation, 35static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
83 const char *name, 36 struct key *key,
84 unsigned namesz,
85 struct afs_cache_vlocation *vldb) 37 struct afs_cache_vlocation *vldb)
86{ 38{
87 struct afs_server *server = NULL; 39 struct afs_cell *cell = vl->cell;
88 struct afs_cell *cell = vlocation->cell; 40 struct in_addr addr;
89 int count, ret; 41 int count, ret;
90 42
91 _enter("%s,%*.*s,%u", cell->name, namesz, namesz, name, namesz); 43 _enter("%s,%s", cell->name, vl->vldb.name);
92 44
45 down_write(&vl->cell->vl_sem);
93 ret = -ENOMEDIUM; 46 ret = -ENOMEDIUM;
94 for (count = cell->vl_naddrs; count > 0; count--) { 47 for (count = cell->vl_naddrs; count > 0; count--) {
95 _debug("CellServ[%hu]: %08x", 48 addr = cell->vl_addrs[cell->vl_curr_svix];
96 cell->vl_curr_svix, 49
97 cell->vl_addrs[cell->vl_curr_svix].s_addr); 50 _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
98
99 /* try and create a server */
100 ret = afs_server_lookup(cell,
101 &cell->vl_addrs[cell->vl_curr_svix],
102 &server);
103 switch (ret) {
104 case 0:
105 break;
106 case -ENOMEM:
107 case -ENONET:
108 goto out;
109 default:
110 goto rotate;
111 }
112 51
113 /* attempt to access the VL server */ 52 /* attempt to access the VL server */
114 ret = afs_rxvl_get_entry_by_name(server, name, namesz, vldb); 53 ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb,
54 &afs_sync_call);
115 switch (ret) { 55 switch (ret) {
116 case 0: 56 case 0:
117 afs_put_server(server);
118 goto out; 57 goto out;
119 case -ENOMEM: 58 case -ENOMEM:
120 case -ENONET: 59 case -ENONET:
121 case -ENETUNREACH: 60 case -ENETUNREACH:
122 case -EHOSTUNREACH: 61 case -EHOSTUNREACH:
123 case -ECONNREFUSED: 62 case -ECONNREFUSED:
124 down_write(&server->sem);
125 if (server->vlserver) {
126 rxrpc_put_connection(server->vlserver);
127 server->vlserver = NULL;
128 }
129 up_write(&server->sem);
130 afs_put_server(server);
131 if (ret == -ENOMEM || ret == -ENONET) 63 if (ret == -ENOMEM || ret == -ENONET)
132 goto out; 64 goto out;
133 goto rotate; 65 goto rotate;
134 case -ENOMEDIUM: 66 case -ENOMEDIUM:
135 afs_put_server(server);
136 goto out; 67 goto out;
137 default: 68 default:
138 afs_put_server(server); 69 ret = -EIO;
139 ret = -ENOMEDIUM;
140 goto rotate; 70 goto rotate;
141 } 71 }
142 72
@@ -146,76 +76,66 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vlocation,
146 cell->vl_curr_svix %= cell->vl_naddrs; 76 cell->vl_curr_svix %= cell->vl_naddrs;
147 } 77 }
148 78
149 out: 79out:
80 up_write(&vl->cell->vl_sem);
150 _leave(" = %d", ret); 81 _leave(" = %d", ret);
151 return ret; 82 return ret;
83}
152 84
153} /* end afs_vlocation_access_vl_by_name() */
154
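
With the per-server objects gone, what remains of the by-name loop above is plain round-robin failover: each address in the cell gets one attempt per pass, hard failures rotate vl_curr_svix, and the index is left pointing at whichever server last answered so the next lookup starts there. The rotation in isolation (try_server() is a stand-in for the RPC):

#include <stdbool.h>
#include <stdio.h>

#define NADDRS 3

static unsigned curr_svix;                      /* like cell->vl_curr_svix */

static bool try_server(unsigned ix)
{
        return ix == 2;                         /* pretend only server 2 answers */
}

int main(void)
{
        for (int count = NADDRS; count > 0; count--) {
                if (try_server(curr_svix)) {
                        printf("server %u answered\n", curr_svix);
                        return 0;
                }
                curr_svix = (curr_svix + 1) % NADDRS;   /* rotate */
        }
        return 1;                               /* every server refused: -ENOMEDIUM */
}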
155/*****************************************************************************/
156/* 85/*
157 * iterate through the VL servers in a cell until one of them admits knowing 86 * iterate through the VL servers in a cell until one of them admits knowing
158 * about the volume in question 87 * about the volume in question
159 * - caller must have cell->vl_sem write-locked
160 */ 88 */
161static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vlocation, 89static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
90 struct key *key,
162 afs_volid_t volid, 91 afs_volid_t volid,
163 afs_voltype_t voltype, 92 afs_voltype_t voltype,
164 struct afs_cache_vlocation *vldb) 93 struct afs_cache_vlocation *vldb)
165{ 94{
166 struct afs_server *server = NULL; 95 struct afs_cell *cell = vl->cell;
167 struct afs_cell *cell = vlocation->cell; 96 struct in_addr addr;
168 int count, ret; 97 int count, ret;
169 98
170 _enter("%s,%x,%d,", cell->name, volid, voltype); 99 _enter("%s,%x,%d,", cell->name, volid, voltype);
171 100
101 down_write(&vl->cell->vl_sem);
172 ret = -ENOMEDIUM; 102 ret = -ENOMEDIUM;
173 for (count = cell->vl_naddrs; count > 0; count--) { 103 for (count = cell->vl_naddrs; count > 0; count--) {
174 _debug("CellServ[%hu]: %08x", 104 addr = cell->vl_addrs[cell->vl_curr_svix];
175 cell->vl_curr_svix, 105
176 cell->vl_addrs[cell->vl_curr_svix].s_addr); 106 _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
177
178 /* try and create a server */
179 ret = afs_server_lookup(cell,
180 &cell->vl_addrs[cell->vl_curr_svix],
181 &server);
182 switch (ret) {
183 case 0:
184 break;
185 case -ENOMEM:
186 case -ENONET:
187 goto out;
188 default:
189 goto rotate;
190 }
191 107
192 /* attempt to access the VL server */ 108 /* attempt to access the VL server */
193 ret = afs_rxvl_get_entry_by_id(server, volid, voltype, vldb); 109 ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb,
110 &afs_sync_call);
194 switch (ret) { 111 switch (ret) {
195 case 0: 112 case 0:
196 afs_put_server(server);
197 goto out; 113 goto out;
198 case -ENOMEM: 114 case -ENOMEM:
199 case -ENONET: 115 case -ENONET:
200 case -ENETUNREACH: 116 case -ENETUNREACH:
201 case -EHOSTUNREACH: 117 case -EHOSTUNREACH:
202 case -ECONNREFUSED: 118 case -ECONNREFUSED:
203 down_write(&server->sem);
204 if (server->vlserver) {
205 rxrpc_put_connection(server->vlserver);
206 server->vlserver = NULL;
207 }
208 up_write(&server->sem);
209 afs_put_server(server);
210 if (ret == -ENOMEM || ret == -ENONET) 119 if (ret == -ENOMEM || ret == -ENONET)
211 goto out; 120 goto out;
212 goto rotate; 121 goto rotate;
122 case -EBUSY:
123 vl->upd_busy_cnt++;
124 if (vl->upd_busy_cnt <= 3) {
125 if (vl->upd_busy_cnt > 1) {
126 /* second+ BUSY - sleep a little bit */
127 set_current_state(TASK_UNINTERRUPTIBLE);
128 schedule_timeout(1);
129 __set_current_state(TASK_RUNNING);
130 }
131 continue;
132 }
133 break;
213 case -ENOMEDIUM: 134 case -ENOMEDIUM:
214 afs_put_server(server); 135 vl->upd_rej_cnt++;
215 goto out; 136 goto rotate;
216 default: 137 default:
217 afs_put_server(server); 138 ret = -EIO;
218 ret = -ENOMEDIUM;
219 goto rotate; 139 goto rotate;
220 } 140 }
221 141
@@ -223,729 +143,579 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vlocation,
223 rotate: 143 rotate:
224 cell->vl_curr_svix++; 144 cell->vl_curr_svix++;
225 cell->vl_curr_svix %= cell->vl_naddrs; 145 cell->vl_curr_svix %= cell->vl_naddrs;
146 vl->upd_busy_cnt = 0;
226 } 147 }
227 148
228 out: 149out:
150 if (ret < 0 && vl->upd_rej_cnt > 0) {
151 printk(KERN_NOTICE "kAFS:"
152 " Active volume no longer valid '%s'\n",
153 vl->vldb.name);
154 vl->valid = 0;
155 ret = -ENOMEDIUM;
156 }
157
158 up_write(&vl->cell->vl_sem);
229 _leave(" = %d", ret); 159 _leave(" = %d", ret);
230 return ret; 160 return ret;
161}
231 162
232} /* end afs_vlocation_access_vl_by_id() */
233
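
New in the by-ID variant is the -EBUSY escalation: the first busy reply retries the same server at once, the second and third sleep a jiffy before retrying, and past three the loop gives up on that server and rotates (upd_busy_cnt resets on rotation). A userspace sketch of that ladder, with nanosleep() standing in for schedule_timeout(1):

#include <stdio.h>
#include <time.h>

static int upd_busy_cnt;                        /* like vl->upd_busy_cnt */

/* returns 0 to retry the same server, -1 to give up and rotate */
static int handle_busy(void)
{
        upd_busy_cnt++;
        if (upd_busy_cnt > 3)
                return -1;
        if (upd_busy_cnt > 1) {
                /* second+ BUSY: back off briefly first */
                struct timespec ts = { 0, 10 * 1000 * 1000 };   /* ~1 jiffy at HZ=100 */
                nanosleep(&ts, NULL);
        }
        return 0;
}

int main(void)
{
        while (handle_busy() == 0)
                printf("retrying after BUSY #%d\n", upd_busy_cnt);
        return 0;
}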
234/*****************************************************************************/
235/* 163/*
236 * lookup volume location 164 * allocate a volume location record
237 * - caller must have cell->vol_sem write-locked
238 * - iterate through the VL servers in a cell until one of them admits knowing
239 * about the volume in question
240 * - lookup in the local cache if not able to find on the VL server
241 * - insert/update in the local cache if did get a VL response
242 */ 165 */
243int afs_vlocation_lookup(struct afs_cell *cell, 166static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell,
244 const char *name, 167 const char *name,
245 unsigned namesz, 168 size_t namesz)
246 struct afs_vlocation **_vlocation)
247{ 169{
248 struct afs_cache_vlocation vldb; 170 struct afs_vlocation *vl;
249 struct afs_vlocation *vlocation; 171
250 afs_voltype_t voltype; 172 vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
251 afs_volid_t vid; 173 if (vl) {
252 int active = 0, ret; 174 vl->cell = cell;
253 175 vl->state = AFS_VL_NEW;
254 _enter("{%s},%*.*s,%u,", cell->name, namesz, namesz, name, namesz); 176 atomic_set(&vl->usage, 1);
255 177 INIT_LIST_HEAD(&vl->link);
256 if (namesz > sizeof(vlocation->vldb.name)) { 178 INIT_LIST_HEAD(&vl->grave);
257 _leave(" = -ENAMETOOLONG"); 179 INIT_LIST_HEAD(&vl->update);
258 return -ENAMETOOLONG; 180 init_waitqueue_head(&vl->waitq);
259 } 181 spin_lock_init(&vl->lock);
260 182 memcpy(vl->vldb.name, name, namesz);
261 /* search the cell's active list first */
262 list_for_each_entry(vlocation, &cell->vl_list, link) {
263 if (namesz < sizeof(vlocation->vldb.name) &&
264 vlocation->vldb.name[namesz] != '\0')
265 continue;
266
267 if (memcmp(vlocation->vldb.name, name, namesz) == 0)
268 goto found_in_memory;
269 }
270
271 /* search the cell's graveyard list second */
272 spin_lock(&cell->vl_gylock);
273 list_for_each_entry(vlocation, &cell->vl_graveyard, link) {
274 if (namesz < sizeof(vlocation->vldb.name) &&
275 vlocation->vldb.name[namesz] != '\0')
276 continue;
277
278 if (memcmp(vlocation->vldb.name, name, namesz) == 0)
279 goto found_in_graveyard;
280 }
281 spin_unlock(&cell->vl_gylock);
282
283 /* not in the cell's in-memory lists - create a new record */
284 vlocation = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
285 if (!vlocation)
286 return -ENOMEM;
287
288 atomic_set(&vlocation->usage, 1);
289 INIT_LIST_HEAD(&vlocation->link);
290 rwlock_init(&vlocation->lock);
291 memcpy(vlocation->vldb.name, name, namesz);
292
293 afs_timer_init(&vlocation->timeout, &afs_vlocation_timer_ops);
294 afs_timer_init(&vlocation->upd_timer, &afs_vlocation_update_timer_ops);
295 afs_async_op_init(&vlocation->upd_op, &afs_vlocation_update_op_ops);
296
297 afs_get_cell(cell);
298 vlocation->cell = cell;
299
300 list_add_tail(&vlocation->link, &cell->vl_list);
301
302#ifdef AFS_CACHING_SUPPORT
303 /* we want to store it in the cache, plus it might already be
304 * encached */
305 cachefs_acquire_cookie(cell->cache,
306 &afs_volume_cache_index_def,
307 vlocation,
308 &vlocation->cache);
309
310 if (vlocation->valid)
311 goto found_in_cache;
312#endif
313
314 /* try to look up an unknown volume in the cell VL databases by name */
315 ret = afs_vlocation_access_vl_by_name(vlocation, name, namesz, &vldb);
316 if (ret < 0) {
317 printk("kAFS: failed to locate '%*.*s' in cell '%s'\n",
318 namesz, namesz, name, cell->name);
319 goto error;
320 } 183 }
321 184
322 goto found_on_vlserver; 185 _leave(" = %p", vl);
323 186 return vl;
324 found_in_graveyard: 187}
325 /* found in the graveyard - resurrect */
326 _debug("found in graveyard");
327 atomic_inc(&vlocation->usage);
328 list_move_tail(&vlocation->link, &cell->vl_list);
329 spin_unlock(&cell->vl_gylock);
330
331 afs_kafstimod_del_timer(&vlocation->timeout);
332 goto active;
333
334 found_in_memory:
335 /* found in memory - check to see if it's active */
336 _debug("found in memory");
337 atomic_inc(&vlocation->usage);
338 188
339 active: 189/*
340 active = 1; 190 * update record if we found it in the cache
191 */
192static int afs_vlocation_update_record(struct afs_vlocation *vl,
193 struct key *key,
194 struct afs_cache_vlocation *vldb)
195{
196 afs_voltype_t voltype;
197 afs_volid_t vid;
198 int ret;
341 199
342#ifdef AFS_CACHING_SUPPORT
343 found_in_cache:
344#endif
345 /* try to look up a cached volume in the cell VL databases by ID */ 200 /* try to look up a cached volume in the cell VL databases by ID */
346 _debug("found in cache");
347
348 _debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", 201 _debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
349 vlocation->vldb.name, 202 vl->vldb.name,
350 vlocation->vldb.vidmask, 203 vl->vldb.vidmask,
351 ntohl(vlocation->vldb.servers[0].s_addr), 204 ntohl(vl->vldb.servers[0].s_addr),
352 vlocation->vldb.srvtmask[0], 205 vl->vldb.srvtmask[0],
353 ntohl(vlocation->vldb.servers[1].s_addr), 206 ntohl(vl->vldb.servers[1].s_addr),
354 vlocation->vldb.srvtmask[1], 207 vl->vldb.srvtmask[1],
355 ntohl(vlocation->vldb.servers[2].s_addr), 208 ntohl(vl->vldb.servers[2].s_addr),
356 vlocation->vldb.srvtmask[2] 209 vl->vldb.srvtmask[2]);
357 );
358 210
359 _debug("Vids: %08x %08x %08x", 211 _debug("Vids: %08x %08x %08x",
360 vlocation->vldb.vid[0], 212 vl->vldb.vid[0],
361 vlocation->vldb.vid[1], 213 vl->vldb.vid[1],
362 vlocation->vldb.vid[2]); 214 vl->vldb.vid[2]);
363 215
364 if (vlocation->vldb.vidmask & AFS_VOL_VTM_RW) { 216 if (vl->vldb.vidmask & AFS_VOL_VTM_RW) {
365 vid = vlocation->vldb.vid[0]; 217 vid = vl->vldb.vid[0];
366 voltype = AFSVL_RWVOL; 218 voltype = AFSVL_RWVOL;
367 } 219 } else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) {
368 else if (vlocation->vldb.vidmask & AFS_VOL_VTM_RO) { 220 vid = vl->vldb.vid[1];
369 vid = vlocation->vldb.vid[1];
370 voltype = AFSVL_ROVOL; 221 voltype = AFSVL_ROVOL;
371 } 222 } else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) {
372 else if (vlocation->vldb.vidmask & AFS_VOL_VTM_BAK) { 223 vid = vl->vldb.vid[2];
373 vid = vlocation->vldb.vid[2];
374 voltype = AFSVL_BACKVOL; 224 voltype = AFSVL_BACKVOL;
375 } 225 } else {
376 else {
377 BUG(); 226 BUG();
378 vid = 0; 227 vid = 0;
379 voltype = 0; 228 voltype = 0;
380 } 229 }
381 230
382 ret = afs_vlocation_access_vl_by_id(vlocation, vid, voltype, &vldb); 231 /* contact the server to make sure the volume is still available
232 * - TODO: need to handle disconnected operation here
233 */
234 ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb);
383 switch (ret) { 235 switch (ret) {
384 /* net error */ 236 /* net error */
385 default: 237 default:
386 printk("kAFS: failed to volume '%*.*s' (%x) up in '%s': %d\n", 238 printk(KERN_WARNING "kAFS:"
387 namesz, namesz, name, vid, cell->name, ret); 239 " failed to update volume '%s' (%x) up in '%s': %d\n",
388 goto error; 240 vl->vldb.name, vid, vl->cell->name, ret);
241 _leave(" = %d", ret);
242 return ret;
389 243
390 /* pulled from local cache into memory */ 244 /* pulled from local cache into memory */
391 case 0: 245 case 0:
392 goto found_on_vlserver; 246 _leave(" = 0");
247 return 0;
393 248
394 /* uh oh... looks like the volume got deleted */ 249 /* uh oh... looks like the volume got deleted */
395 case -ENOMEDIUM: 250 case -ENOMEDIUM:
396 printk("kAFS: volume '%*.*s' (%x) does not exist '%s'\n", 251 printk(KERN_ERR "kAFS:"
397 namesz, namesz, name, vid, cell->name); 252 " volume '%s' (%x) does not exist '%s'\n",
253 vl->vldb.name, vid, vl->cell->name);
398 254
399 /* TODO: make existing record unavailable */ 255 /* TODO: make existing record unavailable */
400 goto error; 256 _leave(" = %d", ret);
257 return ret;
401 } 258 }
259}
402 260
403 found_on_vlserver: 261/*
404 _debug("Done VL Lookup: %*.*s %02x { %08x(%x) %08x(%x) %08x(%x) }", 262 * apply the update to a VL record
405 namesz, namesz, name, 263 */
406 vldb.vidmask, 264static void afs_vlocation_apply_update(struct afs_vlocation *vl,
407 ntohl(vldb.servers[0].s_addr), vldb.srvtmask[0], 265 struct afs_cache_vlocation *vldb)
408 ntohl(vldb.servers[1].s_addr), vldb.srvtmask[1], 266{
409 ntohl(vldb.servers[2].s_addr), vldb.srvtmask[2] 267 _debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
410 ); 268 vldb->name, vldb->vidmask,
411 269 ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0],
412 _debug("Vids: %08x %08x %08x", vldb.vid[0], vldb.vid[1], vldb.vid[2]); 270 ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1],
271 ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]);
413 272
414 if ((namesz < sizeof(vlocation->vldb.name) && 273 _debug("Vids: %08x %08x %08x",
415 vlocation->vldb.name[namesz] != '\0') || 274 vldb->vid[0], vldb->vid[1], vldb->vid[2]);
416 memcmp(vldb.name, name, namesz) != 0)
417 printk("kAFS: name of volume '%*.*s' changed to '%s' on server\n",
418 namesz, namesz, name, vldb.name);
419 275
420 memcpy(&vlocation->vldb, &vldb, sizeof(vlocation->vldb)); 276 if (strcmp(vldb->name, vl->vldb.name) != 0)
277 printk(KERN_NOTICE "kAFS:"
278 " name of volume '%s' changed to '%s' on server\n",
279 vl->vldb.name, vldb->name);
421 280
422 afs_kafstimod_add_timer(&vlocation->upd_timer, 10 * HZ); 281 vl->vldb = *vldb;
423 282
424#ifdef AFS_CACHING_SUPPORT 283#ifdef AFS_CACHING_SUPPORT
425 /* update volume entry in local cache */ 284 /* update volume entry in local cache */
426 cachefs_update_cookie(vlocation->cache); 285 cachefs_update_cookie(vl->cache);
427#endif
428
429 *_vlocation = vlocation;
430 _leave(" = 0 (%p)",vlocation);
431 return 0;
432
433 error:
434 if (vlocation) {
435 if (active) {
436 __afs_put_vlocation(vlocation);
437 }
438 else {
439 list_del(&vlocation->link);
440#ifdef AFS_CACHING_SUPPORT
441 cachefs_relinquish_cookie(vlocation->cache, 0);
442#endif 286#endif
443 afs_put_cell(vlocation->cell); 287}
444 kfree(vlocation);
445 }
446 }
447
448 _leave(" = %d", ret);
449 return ret;
450} /* end afs_vlocation_lookup() */
451 288
452/*****************************************************************************/
453/* 289/*
454 * finish using a volume location record 290 * fill in a volume location record, consulting the cache and the VL server
455 * - caller must have cell->vol_sem write-locked 291 * both
456 */ 292 */
457static void __afs_put_vlocation(struct afs_vlocation *vlocation) 293static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
294 struct key *key)
458{ 295{
459 struct afs_cell *cell; 296 struct afs_cache_vlocation vldb;
297 int ret;
460 298
461 if (!vlocation) 299 _enter("");
462 return;
463 300
464 _enter("%s", vlocation->vldb.name); 301 ASSERTCMP(vl->valid, ==, 0);
465 302
466 cell = vlocation->cell; 303 memset(&vldb, 0, sizeof(vldb));
467 304
468 /* sanity check */ 305 /* see if we have an in-cache copy (will set vl->valid if there is) */
469 BUG_ON(atomic_read(&vlocation->usage) <= 0); 306#ifdef AFS_CACHING_SUPPORT
307 cachefs_acquire_cookie(cell->cache,
308 &afs_volume_cache_index_def,
309 vlocation,
310 &vl->cache);
311#endif
470 312
471 spin_lock(&cell->vl_gylock); 313 if (vl->valid) {
472 if (likely(!atomic_dec_and_test(&vlocation->usage))) { 314 /* try to update a known volume in the cell VL databases by
473 spin_unlock(&cell->vl_gylock); 315 * ID as the name may have changed */
474 _leave(""); 316 _debug("found in cache");
475 return; 317 ret = afs_vlocation_update_record(vl, key, &vldb);
318 } else {
319 /* try to look up an unknown volume in the cell VL databases by
320 * name */
321 ret = afs_vlocation_access_vl_by_name(vl, key, &vldb);
322 if (ret < 0) {
323 printk("kAFS: failed to locate '%s' in cell '%s'\n",
324 vl->vldb.name, vl->cell->name);
325 return ret;
326 }
476 } 327 }
477 328
478 /* move to graveyard queue */ 329 afs_vlocation_apply_update(vl, &vldb);
479 list_move_tail(&vlocation->link,&cell->vl_graveyard); 330 _leave(" = 0");
480 331 return 0;
481 /* remove from pending timeout queue (refcounted if actually being 332}
482 * updated) */
483 list_del_init(&vlocation->upd_op.link);
484
485 /* time out in 10 secs */
486 afs_kafstimod_del_timer(&vlocation->upd_timer);
487 afs_kafstimod_add_timer(&vlocation->timeout, 10 * HZ);
488
489 spin_unlock(&cell->vl_gylock);
490
491 _leave(" [killed]");
492} /* end __afs_put_vlocation() */
493
494/*****************************************************************************/
495/*
496 * finish using a volume location record
497 */
498void afs_put_vlocation(struct afs_vlocation *vlocation)
499{
500 if (vlocation) {
501 struct afs_cell *cell = vlocation->cell;
502
503 down_write(&cell->vl_sem);
504 __afs_put_vlocation(vlocation);
505 up_write(&cell->vl_sem);
506 }
507} /* end afs_put_vlocation() */
508 333
509/*****************************************************************************/
510/* 334/*
511 * timeout vlocation record 335 * queue a vlocation record for updates
512 * - removes from the cell's graveyard if the usage count is zero
513 */ 336 */
514void afs_vlocation_do_timeout(struct afs_vlocation *vlocation) 337void afs_vlocation_queue_for_updates(struct afs_vlocation *vl)
515{ 338{
516 struct afs_cell *cell; 339 struct afs_vlocation *xvl;
517 340
518 _enter("%s", vlocation->vldb.name); 341 /* wait at least 10 minutes before updating... */
342 vl->update_at = get_seconds() + afs_vlocation_update_timeout;
519 343
520 cell = vlocation->cell; 344 spin_lock(&afs_vlocation_updates_lock);
521 345
522 BUG_ON(atomic_read(&vlocation->usage) < 0); 346 if (!list_empty(&afs_vlocation_updates)) {
523 347 /* ... but wait at least 1 second more than the newest record
524 /* remove from graveyard if still dead */ 348 * already queued so that we don't spam the VL server suddenly
525 spin_lock(&cell->vl_gylock); 349 * with lots of requests
526 if (atomic_read(&vlocation->usage) == 0) 350 */
527 list_del_init(&vlocation->link); 351 xvl = list_entry(afs_vlocation_updates.prev,
528 else 352 struct afs_vlocation, update);
529 vlocation = NULL; 353 if (vl->update_at <= xvl->update_at)
530 spin_unlock(&cell->vl_gylock); 354 vl->update_at = xvl->update_at + 1;
531 355 } else {
532 if (!vlocation) { 356 queue_delayed_work(afs_vlocation_update_worker,
533 _leave(""); 357 &afs_vlocation_update,
534 return; /* resurrected */ 358 afs_vlocation_update_timeout * HZ);
535 } 359 }
536 360
537 /* we can now destroy it properly */ 361 list_add_tail(&vl->update, &afs_vlocation_updates);
538#ifdef AFS_CACHING_SUPPORT 362 spin_unlock(&afs_vlocation_updates_lock);
539 cachefs_relinquish_cookie(vlocation->cache, 0); 363}
540#endif
541 afs_put_cell(cell);
542
543 kfree(vlocation);
544
545 _leave(" [destroyed]");
546} /* end afs_vlocation_do_timeout() */
547 364
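
afs_vlocation_queue_for_updates() above encodes two rate limits in one assignment: a record becomes due for update no sooner than afs_vlocation_update_timeout (ten minutes) from now, and no sooner than one second after the newest record already queued, so a burst of fresh lookups cannot become a burst of VL traffic ten minutes later. The arithmetic in isolation (times in seconds):

#include <stdio.h>
#include <time.h>

#define UPDATE_TIMEOUT (10 * 60)        /* like afs_vlocation_update_timeout */

/* when may a newly queued record update, given the queue's newest deadline? */
static time_t stagger(time_t now, const time_t *newest_queued)
{
        time_t at = now + UPDATE_TIMEOUT;
        if (newest_queued && at <= *newest_queued)
                at = *newest_queued + 1;        /* trail the queue tail by 1s */
        return at;
}

int main(void)
{
        time_t now = time(NULL);
        time_t first = stagger(now, NULL);
        time_t second = stagger(now, &first);
        printf("gap = %ld\n", (long) (second - first));         /* 1 */
        return 0;
}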
548/*****************************************************************************/
549/* 365/*
550 * send an update operation to the currently selected server 366 * lookup volume location
367 * - iterate through the VL servers in a cell until one of them admits knowing
368 * about the volume in question
369 * - lookup in the local cache if not able to find on the VL server
370 * - insert/update in the local cache if did get a VL response
551 */ 371 */
552static int afs_vlocation_update_begin(struct afs_vlocation *vlocation) 372struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell,
373 struct key *key,
374 const char *name,
375 size_t namesz)
553{ 376{
554 afs_voltype_t voltype; 377 struct afs_vlocation *vl;
555 afs_volid_t vid;
556 int ret; 378 int ret;
557 379
558 _enter("%s{ufs=%u ucs=%u}", 380 _enter("{%s},{%x},%*.*s,%zu",
559 vlocation->vldb.name, 381 cell->name, key_serial(key),
560 vlocation->upd_first_svix, 382 (int) namesz, (int) namesz, name, namesz);
561 vlocation->upd_curr_svix);
562 383
563 /* try to look up a cached volume in the cell VL databases by ID */ 384 if (namesz > sizeof(vl->vldb.name)) {
564 if (vlocation->vldb.vidmask & AFS_VOL_VTM_RW) { 385 _leave(" = -ENAMETOOLONG");
565 vid = vlocation->vldb.vid[0]; 386 return ERR_PTR(-ENAMETOOLONG);
566 voltype = AFSVL_RWVOL;
567 }
568 else if (vlocation->vldb.vidmask & AFS_VOL_VTM_RO) {
569 vid = vlocation->vldb.vid[1];
570 voltype = AFSVL_ROVOL;
571 } 387 }
572 else if (vlocation->vldb.vidmask & AFS_VOL_VTM_BAK) { 388
573 vid = vlocation->vldb.vid[2]; 389 /* see if we have an in-memory copy first */
574 voltype = AFSVL_BACKVOL; 390 down_write(&cell->vl_sem);
391 spin_lock(&cell->vl_lock);
392 list_for_each_entry(vl, &cell->vl_list, link) {
393 if (vl->vldb.name[namesz] != '\0')
394 continue;
395 if (memcmp(vl->vldb.name, name, namesz) == 0)
396 goto found_in_memory;
575 } 397 }
576 else { 398 spin_unlock(&cell->vl_lock);
577 BUG(); 399
578 vid = 0; 400 /* not in the cell's in-memory lists - create a new record */
579 voltype = 0; 401 vl = afs_vlocation_alloc(cell, name, namesz);
402 if (!vl) {
403 up_write(&cell->vl_sem);
404 return ERR_PTR(-ENOMEM);
580 } 405 }
581 406
582 /* contact the chosen server */ 407 afs_get_cell(cell);
583 ret = afs_server_lookup(
584 vlocation->cell,
585 &vlocation->cell->vl_addrs[vlocation->upd_curr_svix],
586 &vlocation->upd_op.server);
587 408
588 switch (ret) { 409 list_add_tail(&vl->link, &cell->vl_list);
589 case 0: 410 vl->state = AFS_VL_CREATING;
590 break; 411 up_write(&cell->vl_sem);
591 case -ENOMEM:
592 case -ENONET:
593 default:
594 _leave(" = %d", ret);
595 return ret;
596 }
597 412
598 /* initiate the update operation */ 413fill_in_record:
599 ret = afs_rxvl_get_entry_by_id_async(&vlocation->upd_op, vid, voltype); 414 ret = afs_vlocation_fill_in_record(vl, key);
600 if (ret < 0) { 415 if (ret < 0)
601 _leave(" = %d", ret); 416 goto error_abandon;
602 return ret; 417 spin_lock(&vl->lock);
418 vl->state = AFS_VL_VALID;
419 spin_unlock(&vl->lock);
420 wake_up(&vl->waitq);
421
422 /* schedule for regular updates */
423 afs_vlocation_queue_for_updates(vl);
424 goto success;
425
426found_in_memory:
427 /* found in memory */
428 _debug("found in memory");
429 atomic_inc(&vl->usage);
430 spin_unlock(&cell->vl_lock);
431 if (!list_empty(&vl->grave)) {
432 spin_lock(&afs_vlocation_graveyard_lock);
433 list_del_init(&vl->grave);
434 spin_unlock(&afs_vlocation_graveyard_lock);
603 } 435 }
436 up_write(&cell->vl_sem);
437
438 /* see if it was an abandoned record that we might try filling in */
439 spin_lock(&vl->lock);
440 while (vl->state != AFS_VL_VALID) {
441 afs_vlocation_state_t state = vl->state;
442
443 _debug("invalid [state %d]", state);
444
445 if (state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME) {
446 vl->state = AFS_VL_CREATING;
447 spin_unlock(&vl->lock);
448 goto fill_in_record;
449 }
450
451 /* must now wait for creation or update by someone else to
452 * complete */
453 _debug("wait");
604 454
455 spin_unlock(&vl->lock);
456 ret = wait_event_interruptible(vl->waitq,
457 vl->state == AFS_VL_NEW ||
458 vl->state == AFS_VL_VALID ||
459 vl->state == AFS_VL_NO_VOLUME);
460 if (ret < 0)
461 goto error;
462 spin_lock(&vl->lock);
463 }
464 spin_unlock(&vl->lock);
465
466success:
467 _leave(" = %p",vl);
468 return vl;
469
470error_abandon:
471 spin_lock(&vl->lock);
472 vl->state = AFS_VL_NEW;
473 spin_unlock(&vl->lock);
474 wake_up(&vl->waitq);
475error:
476 ASSERT(vl != NULL);
477 afs_put_vlocation(vl);
605 _leave(" = %d", ret); 478 _leave(" = %d", ret);
606 return ret; 479 return ERR_PTR(ret);
607} /* end afs_vlocation_update_begin() */ 480}

-/*****************************************************************************/
 /*
- * abandon updating a VL record
- * - does not restart the update timer
+ * finish using a volume location record
  */
-static void afs_vlocation_update_abandon(struct afs_vlocation *vlocation,
-					 afs_vlocation_upd_t state,
-					 int ret)
-{
-	_enter("%s,%u", vlocation->vldb.name, state);
-
-	if (ret < 0)
-		printk("kAFS: Abandoning VL update '%s': %d\n",
-		       vlocation->vldb.name, ret);
-
-	/* discard the server record */
-	afs_put_server(vlocation->upd_op.server);
-	vlocation->upd_op.server = NULL;
-
-	spin_lock(&afs_vlocation_update_lock);
-	afs_vlocation_update = NULL;
-	vlocation->upd_state = state;
-
-	/* TODO: start updating next VL record on pending list */
-
-	spin_unlock(&afs_vlocation_update_lock);
-
-	_leave("");
-} /* end afs_vlocation_update_abandon() */
+void afs_put_vlocation(struct afs_vlocation *vl)
+{
+	if (!vl)
+		return;
+
+	_enter("%s", vl->vldb.name);
+
+	ASSERTCMP(atomic_read(&vl->usage), >, 0);
+
+	if (likely(!atomic_dec_and_test(&vl->usage))) {
+		_leave("");
+		return;
+	}
+
+	spin_lock(&afs_vlocation_graveyard_lock);
+	if (atomic_read(&vl->usage) == 0) {
+		_debug("buried");
+		list_move_tail(&vl->grave, &afs_vlocation_graveyard);
+		vl->time_of_death = get_seconds();
+		schedule_delayed_work(&afs_vlocation_reap,
+				      afs_vlocation_timeout * HZ);
+
+		/* suspend updates on this record */
+		if (!list_empty(&vl->update)) {
+			spin_lock(&afs_vlocation_updates_lock);
+			list_del_init(&vl->update);
+			spin_unlock(&afs_vlocation_updates_lock);
+		}
+	}
+	spin_unlock(&afs_vlocation_graveyard_lock);
+	_leave(" [killed?]");
+}
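The put path added above defers destruction: the final reference moves the record onto a graveyard list rather than freeing it, so a quick re-lookup can dig it back up before the reaper runs. A hedged sketch of that pattern outside the kernel (C11, illustrative names, not the kAFS API):

	#include <stdatomic.h>
	#include <pthread.h>
	#include <time.h>

	struct record {
		atomic_int usage;
		time_t time_of_death;
		struct record *grave_next;	/* graveyard linkage */
	};

	static pthread_mutex_t graveyard_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct record *graveyard;

	void put_record(struct record *rec)
	{
		/* drop a reference; only the last putter buries the record */
		if (atomic_fetch_sub(&rec->usage, 1) != 1)
			return;

		pthread_mutex_lock(&graveyard_lock);
		if (atomic_load(&rec->usage) == 0) {
			/* bury it; a reaper frees it after a grace period,
			 * and a concurrent lookup may still resurrect it */
			rec->time_of_death = time(NULL);
			rec->grave_next = graveyard;
			graveyard = rec;
		}
		pthread_mutex_unlock(&graveyard_lock);
	}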

-/*****************************************************************************/
 /*
- * handle periodic update timeouts and busy retry timeouts
- * - called from kafstimod
+ * destroy a dead volume location record
  */
-static void afs_vlocation_update_timer(struct afs_timer *timer)
-{
-	struct afs_vlocation *vlocation =
-		list_entry(timer, struct afs_vlocation, upd_timer);
-	int ret;
-
-	_enter("%s", vlocation->vldb.name);
-
-	/* only update if not in the graveyard (defend against putting too) */
-	spin_lock(&vlocation->cell->vl_gylock);
-
-	if (!atomic_read(&vlocation->usage))
-		goto out_unlock1;
-
-	spin_lock(&afs_vlocation_update_lock);
-
-	/* if we were woken up due to EBUSY sleep then restart immediately if
-	 * possible or else jump to front of pending queue */
-	if (vlocation->upd_state == AFS_VLUPD_BUSYSLEEP) {
-		if (afs_vlocation_update) {
-			list_add(&vlocation->upd_op.link,
-				 &afs_vlocation_update_pendq);
-		}
-		else {
-			afs_get_vlocation(vlocation);
-			afs_vlocation_update = vlocation;
-			vlocation->upd_state = AFS_VLUPD_INPROGRESS;
-		}
-		goto out_unlock2;
-	}
-
-	/* put on pending queue if there's already another update in progress */
-	if (afs_vlocation_update) {
-		vlocation->upd_state = AFS_VLUPD_PENDING;
-		list_add_tail(&vlocation->upd_op.link,
-			      &afs_vlocation_update_pendq);
-		goto out_unlock2;
-	}
-
-	/* hold a ref on it while actually updating */
-	afs_get_vlocation(vlocation);
-	afs_vlocation_update = vlocation;
-	vlocation->upd_state = AFS_VLUPD_INPROGRESS;
-
-	spin_unlock(&afs_vlocation_update_lock);
-	spin_unlock(&vlocation->cell->vl_gylock);
-
-	/* okay... we can start the update */
-	_debug("BEGIN VL UPDATE [%s]", vlocation->vldb.name);
-	vlocation->upd_first_svix = vlocation->cell->vl_curr_svix;
-	vlocation->upd_curr_svix = vlocation->upd_first_svix;
-	vlocation->upd_rej_cnt = 0;
-	vlocation->upd_busy_cnt = 0;
-
-	ret = afs_vlocation_update_begin(vlocation);
-	if (ret < 0) {
-		afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, ret);
-		afs_kafstimod_add_timer(&vlocation->upd_timer,
-					AFS_VLDB_TIMEOUT);
-		afs_put_vlocation(vlocation);
-	}
-
-	_leave("");
-	return;
-
- out_unlock2:
-	spin_unlock(&afs_vlocation_update_lock);
- out_unlock1:
-	spin_unlock(&vlocation->cell->vl_gylock);
-	_leave("");
-	return;
-
-} /* end afs_vlocation_update_timer() */
+static void afs_vlocation_destroy(struct afs_vlocation *vl)
+{
+	_enter("%p", vl);
+
+#ifdef AFS_CACHING_SUPPORT
+	cachefs_relinquish_cookie(vl->cache, 0);
+#endif
+
+	afs_put_cell(vl->cell);
+	kfree(vl);
+}
+
+/*
+ * reap dead volume location records
+ */
+static void afs_vlocation_reaper(struct work_struct *work)
+{
+	LIST_HEAD(corpses);
+	struct afs_vlocation *vl;
+	unsigned long delay, expiry;
+	time_t now;
+
+	_enter("");
+
+	now = get_seconds();
+	spin_lock(&afs_vlocation_graveyard_lock);
+
+	while (!list_empty(&afs_vlocation_graveyard)) {
+		vl = list_entry(afs_vlocation_graveyard.next,
+				struct afs_vlocation, grave);
+
+		_debug("check %p", vl);
+
+		/* the queue is ordered most dead first */
+		expiry = vl->time_of_death + afs_vlocation_timeout;
+		if (expiry > now) {
+			delay = (expiry - now) * HZ;
+			_debug("delay %lu", delay);
+			if (!schedule_delayed_work(&afs_vlocation_reap,
+						   delay)) {
+				cancel_delayed_work(&afs_vlocation_reap);
+				schedule_delayed_work(&afs_vlocation_reap,
+						      delay);
+			}
+			break;
+		}
+
+		spin_lock(&vl->cell->vl_lock);
+		if (atomic_read(&vl->usage) > 0) {
+			_debug("no reap");
+			list_del_init(&vl->grave);
+		} else {
+			_debug("reap");
+			list_move_tail(&vl->grave, &corpses);
+			list_del_init(&vl->link);
+		}
+		spin_unlock(&vl->cell->vl_lock);
+	}
+
+	spin_unlock(&afs_vlocation_graveyard_lock);
+
+	/* now reap the corpses we've extracted */
+	while (!list_empty(&corpses)) {
+		vl = list_entry(corpses.next, struct afs_vlocation, grave);
+		list_del(&vl->grave);
+		afs_vlocation_destroy(vl);
+	}
+
+	_leave("");
+}
+
+/*
+ * initialise the VL update process
+ */
+int __init afs_vlocation_update_init(void)
+{
+	afs_vlocation_update_worker =
+		create_singlethread_workqueue("kafs_vlupdated");
+	return afs_vlocation_update_worker ? 0 : -ENOMEM;
+}
+
+/*
+ * discard all the volume location records for rmmod
+ */
+void afs_vlocation_purge(void)
+{
+	afs_vlocation_timeout = 0;
+
+	spin_lock(&afs_vlocation_updates_lock);
+	list_del_init(&afs_vlocation_updates);
+	spin_unlock(&afs_vlocation_updates_lock);
+	cancel_delayed_work(&afs_vlocation_update);
+	queue_delayed_work(afs_vlocation_update_worker,
+			   &afs_vlocation_update, 0);
+	destroy_workqueue(afs_vlocation_update_worker);
+
+	cancel_delayed_work(&afs_vlocation_reap);
+	schedule_delayed_work(&afs_vlocation_reap, 0);
+}

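One detail worth noting in the reaper above: schedule_delayed_work() fails when the work item is already queued, so the code cancels and requeues to force the shorter delay. The same step isolated as a helper (a pseudo-kernel sketch of the idiom used above, not a function from the patch):

	/* Re-arm a delayed work item even if it is already pending.
	 * schedule_delayed_work() returns false for an already-queued
	 * item, in which case the old (possibly longer) timer must be
	 * cancelled before the new delay can take effect. */
	static void rearm_delayed_work(struct delayed_work *work,
				       unsigned long delay)
	{
		if (!schedule_delayed_work(work, delay)) {
			cancel_delayed_work(work);
			schedule_delayed_work(work, delay);
		}
	}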
-/*****************************************************************************/
 /*
- * attend to an update operation upon which an event happened
- * - called in kafsasyncd context
+ * update a volume location
  */
-static void afs_vlocation_update_attend(struct afs_async_op *op)
-{
-	struct afs_cache_vlocation vldb;
-	struct afs_vlocation *vlocation =
-		list_entry(op, struct afs_vlocation, upd_op);
-	unsigned tmp;
-	int ret;
-
-	_enter("%s", vlocation->vldb.name);
-
-	ret = afs_rxvl_get_entry_by_id_async2(op, &vldb);
-	switch (ret) {
-	case -EAGAIN:
-		_leave(" [unfinished]");
-		return;
-
-	case 0:
-		_debug("END VL UPDATE: %d\n", ret);
-		vlocation->valid = 1;
-
-		_debug("Done VL Lookup: %02x { %08x(%x) %08x(%x) %08x(%x) }",
-		       vldb.vidmask,
-		       ntohl(vldb.servers[0].s_addr), vldb.srvtmask[0],
-		       ntohl(vldb.servers[1].s_addr), vldb.srvtmask[1],
-		       ntohl(vldb.servers[2].s_addr), vldb.srvtmask[2]
-		       );
-
-		_debug("Vids: %08x %08x %08x",
-		       vldb.vid[0], vldb.vid[1], vldb.vid[2]);
-
-		afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, 0);
-
-		down_write(&vlocation->cell->vl_sem);
-
-		/* actually update the cache */
-		if (strncmp(vldb.name, vlocation->vldb.name,
-			    sizeof(vlocation->vldb.name)) != 0)
-			printk("kAFS: name of volume '%s'"
-			       " changed to '%s' on server\n",
-			       vlocation->vldb.name, vldb.name);
-
-		memcpy(&vlocation->vldb, &vldb, sizeof(vlocation->vldb));
-
-#if 0
-		/* TODO update volume entry in local cache */
-#endif
-
-		up_write(&vlocation->cell->vl_sem);
-
-		if (ret < 0)
-			printk("kAFS: failed to update local cache: %d\n", ret);
-
-		afs_kafstimod_add_timer(&vlocation->upd_timer,
-					AFS_VLDB_TIMEOUT);
-		afs_put_vlocation(vlocation);
-		_leave(" [found]");
-		return;
-
-	case -ENOMEDIUM:
-		vlocation->upd_rej_cnt++;
-		goto try_next;
-
-		/* the server is locked - retry in a very short while */
-	case -EBUSY:
-		vlocation->upd_busy_cnt++;
-		if (vlocation->upd_busy_cnt > 3)
-			goto try_next; /* too many retries */
-
-		afs_vlocation_update_abandon(vlocation,
-					     AFS_VLUPD_BUSYSLEEP, 0);
-		afs_kafstimod_add_timer(&vlocation->upd_timer, HZ / 2);
-		afs_put_vlocation(vlocation);
-		_leave(" [busy]");
-		return;
-
-	case -ENETUNREACH:
-	case -EHOSTUNREACH:
-	case -ECONNREFUSED:
-	case -EREMOTEIO:
-		/* record bad vlserver info in the cell too
-		 * - TODO: use down_write_trylock() if available
-		 */
-		if (vlocation->upd_curr_svix == vlocation->cell->vl_curr_svix)
-			vlocation->cell->vl_curr_svix =
-				vlocation->cell->vl_curr_svix %
-				vlocation->cell->vl_naddrs;
-
-	case -EBADRQC:
-	case -EINVAL:
-	case -EACCES:
-	case -EBADMSG:
-		goto try_next;
-
-	default:
-		goto abandon;
-	}
-
-	/* try contacting the next server */
- try_next:
-	vlocation->upd_busy_cnt = 0;
-
-	/* discard the server record */
-	afs_put_server(vlocation->upd_op.server);
-	vlocation->upd_op.server = NULL;
-
-	tmp = vlocation->cell->vl_naddrs;
-	if (tmp == 0)
-		goto abandon;
-
-	vlocation->upd_curr_svix++;
-	if (vlocation->upd_curr_svix >= tmp)
-		vlocation->upd_curr_svix = 0;
-	if (vlocation->upd_first_svix >= tmp)
-		vlocation->upd_first_svix = tmp - 1;
-
-	/* move to the next server */
-	if (vlocation->upd_curr_svix != vlocation->upd_first_svix) {
-		afs_vlocation_update_begin(vlocation);
-		_leave(" [next]");
-		return;
-	}
-
-	/* run out of servers to try - was the volume rejected? */
-	if (vlocation->upd_rej_cnt > 0) {
-		printk("kAFS: Active volume no longer valid '%s'\n",
-		       vlocation->vldb.name);
-		vlocation->valid = 0;
-		afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, 0);
-		afs_kafstimod_add_timer(&vlocation->upd_timer,
-					AFS_VLDB_TIMEOUT);
-		afs_put_vlocation(vlocation);
-		_leave(" [invalidated]");
-		return;
-	}
-
-	/* abandon the update */
- abandon:
-	afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, ret);
-	afs_kafstimod_add_timer(&vlocation->upd_timer, HZ * 10);
-	afs_put_vlocation(vlocation);
-	_leave(" [abandoned]");
-
-} /* end afs_vlocation_update_attend() */
-
-/*****************************************************************************/
-/*
- * deal with an update operation being discarded
- * - called in kafsasyncd context when it's dying due to rmmod
- * - the call has already been aborted and put()'d
- */
-static void afs_vlocation_update_discard(struct afs_async_op *op)
-{
-	struct afs_vlocation *vlocation =
-		list_entry(op, struct afs_vlocation, upd_op);
-
-	_enter("%s", vlocation->vldb.name);
-
-	afs_put_server(op->server);
-	op->server = NULL;
-
-	afs_put_vlocation(vlocation);
-
-	_leave("");
-} /* end afs_vlocation_update_discard() */
-
-/*****************************************************************************/
-/*
- * match a VLDB record stored in the cache
- * - may also load target from entry
- */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry)
-{
-	const struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = target;
-
-	_enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
-
-	if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
-	    ) {
-		if (!vlocation->valid ||
-		    vlocation->vldb.rtime == vldb->rtime
-		    ) {
-			vlocation->vldb = *vldb;
-			vlocation->valid = 1;
-			_leave(" = SUCCESS [c->m]");
-			return CACHEFS_MATCH_SUCCESS;
-		}
-		/* need to update cache if cached info differs */
-		else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
-			/* delete if VIDs for this name differ */
-			if (memcmp(&vlocation->vldb.vid,
-				   &vldb->vid,
-				   sizeof(vldb->vid)) != 0) {
-				_leave(" = DELETE");
-				return CACHEFS_MATCH_SUCCESS_DELETE;
-			}
-
-			_leave(" = UPDATE");
-			return CACHEFS_MATCH_SUCCESS_UPDATE;
-		}
-		else {
-			_leave(" = SUCCESS");
-			return CACHEFS_MATCH_SUCCESS;
-		}
-	}
-
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
-} /* end afs_vlocation_cache_match() */
-#endif
-
-/*****************************************************************************/
-/*
- * update a VLDB record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
-{
-	struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = source;
-
-	_enter("");
-
-	*vldb = vlocation->vldb;
-
-} /* end afs_vlocation_cache_update() */
-#endif
+static void afs_vlocation_updater(struct work_struct *work)
+{
+	struct afs_cache_vlocation vldb;
+	struct afs_vlocation *vl, *xvl;
+	time_t now;
+	long timeout;
+	int ret;
+
+	_enter("");
+
+	now = get_seconds();
+
+	/* find a record to update */
+	spin_lock(&afs_vlocation_updates_lock);
+	for (;;) {
+		if (list_empty(&afs_vlocation_updates)) {
+			spin_unlock(&afs_vlocation_updates_lock);
+			_leave(" [nothing]");
+			return;
+		}
+
+		vl = list_entry(afs_vlocation_updates.next,
+				struct afs_vlocation, update);
+		if (atomic_read(&vl->usage) > 0)
+			break;
+		list_del_init(&vl->update);
+	}
+
+	timeout = vl->update_at - now;
+	if (timeout > 0) {
+		queue_delayed_work(afs_vlocation_update_worker,
+				   &afs_vlocation_update, timeout * HZ);
+		spin_unlock(&afs_vlocation_updates_lock);
+		_leave(" [nothing]");
+		return;
+	}
+
+	list_del_init(&vl->update);
+	atomic_inc(&vl->usage);
+	spin_unlock(&afs_vlocation_updates_lock);
+
+	/* we can now perform the update */
+	_debug("update %s", vl->vldb.name);
+	vl->state = AFS_VL_UPDATING;
+	vl->upd_rej_cnt = 0;
+	vl->upd_busy_cnt = 0;
+
+	ret = afs_vlocation_update_record(vl, NULL, &vldb);
+	spin_lock(&vl->lock);
+	switch (ret) {
+	case 0:
+		afs_vlocation_apply_update(vl, &vldb);
+		vl->state = AFS_VL_VALID;
+		break;
+	case -ENOMEDIUM:
+		vl->state = AFS_VL_VOLUME_DELETED;
+		break;
+	default:
+		vl->state = AFS_VL_UNCERTAIN;
+		break;
+	}
+	spin_unlock(&vl->lock);
+	wake_up(&vl->waitq);
+
+	/* and then reschedule */
+	_debug("reschedule");
+	vl->update_at = get_seconds() + afs_vlocation_update_timeout;
+
+	spin_lock(&afs_vlocation_updates_lock);
+
+	if (!list_empty(&afs_vlocation_updates)) {
+		/* next update in 10 minutes, but wait at least 1 second more
+		 * than the newest record already queued so that we don't spam
+		 * the VL server suddenly with lots of requests
+		 */
+		xvl = list_entry(afs_vlocation_updates.prev,
+				 struct afs_vlocation, update);
+		if (vl->update_at <= xvl->update_at)
+			vl->update_at = xvl->update_at + 1;
+		xvl = list_entry(afs_vlocation_updates.next,
+				 struct afs_vlocation, update);
+		timeout = xvl->update_at - now;
+		if (timeout < 0)
+			timeout = 0;
+	} else {
+		timeout = afs_vlocation_update_timeout;
+	}
+
+	ASSERT(list_empty(&vl->update));
+
+	list_add_tail(&vl->update, &afs_vlocation_updates);
+
+	_debug("timeout %ld", timeout);
+	queue_delayed_work(afs_vlocation_update_worker,
+			   &afs_vlocation_update, timeout * HZ);
+	spin_unlock(&afs_vlocation_updates_lock);
+	afs_put_vlocation(vl);
+}
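The rescheduling logic above deliberately spaces records at least one second apart, so a burst of volumes coming due at once does not hammer the VL server. The effect in isolation (plain C, illustrative name only):

	#include <time.h>

	/* Queue discipline used by the updater above, in miniature: a new
	 * deadline is pushed back behind the newest one already queued. */
	time_t space_out_update(time_t wanted_at, time_t newest_queued_at)
	{
		if (wanted_at <= newest_queued_at)
			wanted_at = newest_queued_at + 1;	/* >= 1s apart */
		return wanted_at;
	}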
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index cf62da5d7825..a1904ab8426a 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -1,6 +1,6 @@
-/* vnode.c: AFS vnode management
+/* AFS vnode management
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -14,142 +14,237 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
-#include <linux/pagemap.h>
-#include "volume.h"
-#include "cell.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "vlclient.h"
-#include "vnode.h"
 #include "internal.h"
 
-static void afs_vnode_cb_timed_out(struct afs_timer *timer);
-
-struct afs_timer_ops afs_vnode_cb_timed_out_ops = {
-	.timed_out = afs_vnode_cb_timed_out,
-};
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vnode_cache_index_def = {
-	.name = "vnode",
-	.data_size = sizeof(struct afs_cache_vnode),
-	.keys[0] = { CACHEFS_INDEX_KEYS_BIN, 4 },
-	.match = afs_vnode_cache_match,
-	.update = afs_vnode_cache_update,
-};
-#endif
+#if 0
+static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent,
+				   int depth, char lr)
+{
+	struct afs_vnode *vnode;
+	bool bad = false;
+
+	if (!node)
+		return false;
+
+	if (node->rb_left)
+		bad = dump_tree_aux(node->rb_left, node, depth + 2, '/');
+
+	vnode = rb_entry(node, struct afs_vnode, cb_promise);
+	_debug("%c %*.*s%c%p {%d}",
+	       rb_is_red(node) ? 'R' : 'B',
+	       depth, depth, "", lr,
+	       vnode, vnode->cb_expires_at);
+	if (rb_parent(node) != parent) {
+		printk("BAD: %p != %p\n", rb_parent(node), parent);
+		bad = true;
+	}
+
+	if (node->rb_right)
+		bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\');
+
+	return bad;
+}
+
+static noinline void dump_tree(const char *name, struct afs_server *server)
+{
+	_enter("%s", name);
+	if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-'))
+		BUG();
+}
+#endif
 
-/*****************************************************************************/
-/*
- * handle a callback timing out
- * TODO: retain a ref to vnode struct for an outstanding callback timeout
- */
-static void afs_vnode_cb_timed_out(struct afs_timer *timer)
-{
-	struct afs_server *oldserver;
-	struct afs_vnode *vnode;
-
-	vnode = list_entry(timer, struct afs_vnode, cb_timeout);
-
-	_enter("%p", vnode);
-
-	/* set the changed flag in the vnode and release the server */
-	spin_lock(&vnode->lock);
-
-	oldserver = xchg(&vnode->cb_server, NULL);
-	if (oldserver) {
-		vnode->flags |= AFS_VNODE_CHANGED;
-
-		spin_lock(&afs_cb_hash_lock);
-		list_del_init(&vnode->cb_hash_link);
-		spin_unlock(&afs_cb_hash_lock);
-
-		spin_lock(&oldserver->cb_lock);
-		list_del_init(&vnode->cb_link);
-		spin_unlock(&oldserver->cb_lock);
-	}
-
-	spin_unlock(&vnode->lock);
-
-	afs_put_server(oldserver);
-
-	_leave("");
-} /* end afs_vnode_cb_timed_out() */
-
-/*****************************************************************************/
-/*
- * finish off updating the recorded status of a file
- * - starts callback expiry timer
- * - adds to server's callback list
- */
-static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
-					     struct afs_server *server,
-					     int ret)
-{
-	struct afs_server *oldserver = NULL;
-
-	_enter("%p,%p,%d", vnode, server, ret);
-
-	spin_lock(&vnode->lock);
-
-	vnode->flags &= ~AFS_VNODE_CHANGED;
-
-	if (ret == 0) {
-		/* adjust the callback timeout appropriately */
-		afs_kafstimod_add_timer(&vnode->cb_timeout,
-					vnode->cb_expiry * HZ);
-
-		spin_lock(&afs_cb_hash_lock);
-		list_move_tail(&vnode->cb_hash_link,
-			       &afs_cb_hash(server, &vnode->fid));
-		spin_unlock(&afs_cb_hash_lock);
-
-		/* swap ref to old callback server with that for new callback
-		 * server */
-		oldserver = xchg(&vnode->cb_server, server);
-		if (oldserver != server) {
-			if (oldserver) {
-				spin_lock(&oldserver->cb_lock);
-				list_del_init(&vnode->cb_link);
-				spin_unlock(&oldserver->cb_lock);
-			}
-
-			afs_get_server(server);
-			spin_lock(&server->cb_lock);
-			list_add_tail(&vnode->cb_link, &server->cb_promises);
-			spin_unlock(&server->cb_lock);
-		}
-		else {
-			/* same server */
-			oldserver = NULL;
-		}
-	}
-	else if (ret == -ENOENT) {
-		/* the file was deleted - clear the callback timeout */
-		oldserver = xchg(&vnode->cb_server, NULL);
-		afs_kafstimod_del_timer(&vnode->cb_timeout);
-
-		_debug("got NOENT from server - marking file deleted");
-		vnode->flags |= AFS_VNODE_DELETED;
-	}
-
-	vnode->update_cnt--;
-
-	spin_unlock(&vnode->lock);
-
-	wake_up_all(&vnode->update_waitq);
-
-	afs_put_server(oldserver);
-
-	_leave("");
-
-} /* end afs_vnode_finalise_status_update() */
-
-/*****************************************************************************/
+/*
+ * insert a vnode into the backing server's vnode tree
+ */
+static void afs_install_vnode(struct afs_vnode *vnode,
+			      struct afs_server *server)
+{
+	struct afs_server *old_server = vnode->server;
+	struct afs_vnode *xvnode;
+	struct rb_node *parent, **p;
+
+	_enter("%p,%p", vnode, server);
+
+	if (old_server) {
+		spin_lock(&old_server->fs_lock);
+		rb_erase(&vnode->server_rb, &old_server->fs_vnodes);
+		spin_unlock(&old_server->fs_lock);
+	}
+
+	afs_get_server(server);
+	vnode->server = server;
+	afs_put_server(old_server);
+
+	/* insert into the server's vnode tree in FID order */
+	spin_lock(&server->fs_lock);
+
+	parent = NULL;
+	p = &server->fs_vnodes.rb_node;
+	while (*p) {
+		parent = *p;
+		xvnode = rb_entry(parent, struct afs_vnode, server_rb);
+		if (vnode->fid.vid < xvnode->fid.vid)
+			p = &(*p)->rb_left;
+		else if (vnode->fid.vid > xvnode->fid.vid)
+			p = &(*p)->rb_right;
+		else if (vnode->fid.vnode < xvnode->fid.vnode)
+			p = &(*p)->rb_left;
+		else if (vnode->fid.vnode > xvnode->fid.vnode)
+			p = &(*p)->rb_right;
+		else if (vnode->fid.unique < xvnode->fid.unique)
+			p = &(*p)->rb_left;
+		else if (vnode->fid.unique > xvnode->fid.unique)
+			p = &(*p)->rb_right;
+		else
+			BUG(); /* can't happen unless afs_iget() malfunctions */
+	}
+
+	rb_link_node(&vnode->server_rb, parent, p);
+	rb_insert_color(&vnode->server_rb, &server->fs_vnodes);
+
+	spin_unlock(&server->fs_lock);
+	_leave("");
+}
+
+/*
+ * insert a vnode into the promising server's update/expiration tree
+ * - caller must hold vnode->lock
+ */
+static void afs_vnode_note_promise(struct afs_vnode *vnode,
+				   struct afs_server *server)
+{
+	struct afs_server *old_server;
+	struct afs_vnode *xvnode;
+	struct rb_node *parent, **p;
+
+	_enter("%p,%p", vnode, server);
+
+	ASSERT(server != NULL);
+
+	old_server = vnode->server;
+	if (vnode->cb_promised) {
+		if (server == old_server &&
+		    vnode->cb_expires == vnode->cb_expires_at) {
+			_leave(" [no change]");
+			return;
+		}
+
+		spin_lock(&old_server->cb_lock);
+		if (vnode->cb_promised) {
+			_debug("delete");
+			rb_erase(&vnode->cb_promise, &old_server->cb_promises);
+			vnode->cb_promised = false;
+		}
+		spin_unlock(&old_server->cb_lock);
+	}
+
+	if (vnode->server != server)
+		afs_install_vnode(vnode, server);
+
+	vnode->cb_expires_at = vnode->cb_expires;
+	_debug("PROMISE on %p {%lu}",
+	       vnode, (unsigned long) vnode->cb_expires_at);
+
+	/* abuse an RB-tree to hold the expiration order (we may have multiple
+	 * items with the same expiration time) */
+	spin_lock(&server->cb_lock);
+
+	parent = NULL;
+	p = &server->cb_promises.rb_node;
+	while (*p) {
+		parent = *p;
+		xvnode = rb_entry(parent, struct afs_vnode, cb_promise);
+		if (vnode->cb_expires_at < xvnode->cb_expires_at)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&vnode->cb_promise, parent, p);
+	rb_insert_color(&vnode->cb_promise, &server->cb_promises);
+	vnode->cb_promised = true;
+
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
+
+/*
+ * handle remote file deletion by discarding the callback promise
+ */
+static void afs_vnode_deleted_remotely(struct afs_vnode *vnode)
+{
+	struct afs_server *server;
+
+	set_bit(AFS_VNODE_DELETED, &vnode->flags);
+
+	server = vnode->server;
+	if (vnode->cb_promised) {
+		spin_lock(&server->cb_lock);
+		if (vnode->cb_promised) {
+			rb_erase(&vnode->cb_promise, &server->cb_promises);
+			vnode->cb_promised = false;
+		}
+		spin_unlock(&server->cb_lock);
+	}
+
+	spin_lock(&vnode->server->fs_lock);
+	rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
+	spin_unlock(&vnode->server->fs_lock);
+
+	vnode->server = NULL;
+	afs_put_server(server);
+}
+
+/*
+ * finish off updating the recorded status of a file after a successful
+ * operation completion
+ * - starts callback expiry timer
+ * - adds to server's callback list
+ */
+void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
+				      struct afs_server *server)
+{
+	struct afs_server *oldserver = NULL;
+
+	_enter("%p,%p", vnode, server);
+
+	spin_lock(&vnode->lock);
+	clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+	afs_vnode_note_promise(vnode, server);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+
+	wake_up_all(&vnode->update_waitq);
+	afs_put_server(oldserver);
+	_leave("");
+}
+
+/*
+ * finish off updating the recorded status of a file after an operation failed
+ */
+static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret)
+{
+	_enter("%p,%d", vnode, ret);
+
+	spin_lock(&vnode->lock);
+
+	clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+
+	if (ret == -ENOENT) {
+		/* the file was deleted on the server */
+		_debug("got NOENT from server - marking file deleted");
+		afs_vnode_deleted_remotely(vnode);
+	}
+
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+
+	wake_up_all(&vnode->update_waitq);
+	_leave("");
+}
+
 /*
  * fetch file status from the volume
  * - don't issue a fetch if:
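afs_install_vnode() above keys its rb-tree on the (vid, vnode, unique) triple with a cascade of comparisons. The same ordering is easier to see folded into a single comparator; a sketch in plain C (illustrative struct, not the kernel's):

	struct fid { unsigned vid, vnode, unique; };

	/* Compare two FIDs in the precedence order the insertion loop above
	 * uses: volume ID first, then vnode number, then uniquifier. */
	static int fid_cmp(const struct fid *a, const struct fid *b)
	{
		if (a->vid != b->vid)
			return a->vid < b->vid ? -1 : 1;
		if (a->vnode != b->vnode)
			return a->vnode < b->vnode ? -1 : 1;
		if (a->unique != b->unique)
			return a->unique < b->unique ? -1 : 1;
		return 0;	/* duplicate FID - the kernel code BUG()s here */
	}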
@@ -157,9 +252,11 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
  * - there are any outstanding ops that will fetch the status
  * - TODO implement local caching
  */
-int afs_vnode_fetch_status(struct afs_vnode *vnode)
+int afs_vnode_fetch_status(struct afs_vnode *vnode,
+			   struct afs_vnode *auth_vnode, struct key *key)
 {
 	struct afs_server *server;
+	unsigned long acl_order;
 	int ret;
 
 	DECLARE_WAITQUEUE(myself, current);
@@ -168,38 +265,49 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
 	       vnode->volume->vlocation->vldb.name,
 	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
 
-	if (!(vnode->flags & AFS_VNODE_CHANGED) && vnode->cb_server) {
+	if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+	    vnode->cb_promised) {
 		_leave(" [unchanged]");
 		return 0;
 	}
 
-	if (vnode->flags & AFS_VNODE_DELETED) {
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
 		_leave(" [deleted]");
 		return -ENOENT;
 	}
 
+	acl_order = 0;
+	if (auth_vnode)
+		acl_order = auth_vnode->acl_order;
+
 	spin_lock(&vnode->lock);
 
-	if (!(vnode->flags & AFS_VNODE_CHANGED)) {
+	if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+	    vnode->cb_promised) {
 		spin_unlock(&vnode->lock);
 		_leave(" [unchanged]");
 		return 0;
 	}
 
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+
 	if (vnode->update_cnt > 0) {
 		/* someone else started a fetch */
+		_debug("wait on fetch %d", vnode->update_cnt);
+
 		set_current_state(TASK_UNINTERRUPTIBLE);
+		ASSERT(myself.func != NULL);
 		add_wait_queue(&vnode->update_waitq, &myself);
 
 		/* wait for the status to be updated */
 		for (;;) {
-			if (!(vnode->flags & AFS_VNODE_CHANGED))
+			if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
 				break;
-			if (vnode->flags & AFS_VNODE_DELETED)
+			if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 				break;
 
-			/* it got updated and invalidated all before we saw
-			 * it */
+			/* check to see if it got updated and invalidated all
+			 * before we saw it */
 			if (vnode->update_cnt == 0) {
 				remove_wait_queue(&vnode->update_waitq,
 						  &myself);
@@ -219,10 +327,11 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
 	spin_unlock(&vnode->lock);
 	set_current_state(TASK_RUNNING);
 
-	return vnode->flags & AFS_VNODE_DELETED ? -ENOENT : 0;
+	return test_bit(AFS_VNODE_DELETED, &vnode->flags) ?
+		-ENOENT : 0;
 	}
 
- get_anyway:
+get_anyway:
 	/* okay... we're going to have to initiate the op */
 	vnode->update_cnt++;
 
@@ -232,39 +341,60 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
 	 * vnode */
 	do {
 		/* pick a server to query */
-		ret = afs_volume_pick_fileserver(vnode->volume, &server);
-		if (ret<0)
-			return ret;
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
 
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+		_debug("USING SERVER: %p{%08x}",
+		       server, ntohl(server->addr.s_addr));
 
-		ret = afs_rxfs_fetch_file_status(server, vnode, NULL);
+		ret = afs_fs_fetch_file_status(server, key, vnode, NULL,
+					       &afs_sync_call);
 
-	} while (!afs_volume_release_fileserver(vnode->volume, server, ret));
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
 	/* adjust the flags */
-	afs_vnode_finalise_status_update(vnode, server, ret);
+	if (ret == 0) {
+		_debug("adjust");
+		if (auth_vnode)
+			afs_cache_permit(vnode, key, acl_order);
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		_debug("failed [%d]", ret);
+		afs_vnode_status_update_failed(vnode, ret);
+	}
 
-	_leave(" = %d", ret);
-	return ret;
-} /* end afs_vnode_fetch_status() */
+	ASSERTCMP(vnode->update_cnt, >=, 0);
 
-/*****************************************************************************/
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
 /*
  * fetch file data from the volume
- * - TODO implement caching and server failover
+ * - TODO implement caching
  */
-int afs_vnode_fetch_data(struct afs_vnode *vnode,
-			 struct afs_rxfs_fetch_descriptor *desc)
+int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
+			 off_t offset, size_t length, struct page *page)
 {
 	struct afs_server *server;
 	int ret;
 
-	_enter("%s,{%u,%u,%u}",
+	_enter("%s{%u,%u,%u},%x,,,",
 	       vnode->volume->vlocation->vldb.name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
-	       vnode->fid.unique);
+	       vnode->fid.unique,
+	       key_serial(key));
 
 	/* this op will fetch the status */
 	spin_lock(&vnode->lock);
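The do/while in the hunk above is the standard kAFS server-selection idiom: pick a server (taking a reference), try the RPC, and let afs_volume_release_fileserver() decide whether the error merits rotating to another server. Schematically (a sketch assembled from the calls in this patch, not a function the patch itself adds):

	/* Schematic retry loop: pick() elevates the chosen server's usage
	 * count, and release() returns true once the result is final. */
	int try_on_some_server(struct afs_vnode *vnode, struct key *key)
	{
		struct afs_server *server;
		int ret;

		do {
			server = afs_volume_pick_fileserver(vnode);
			if (IS_ERR(server))
				return PTR_ERR(server);	/* no usable server */

			ret = afs_fs_fetch_file_status(server, key, vnode, NULL,
						       &afs_sync_call);
		} while (!afs_volume_release_fileserver(vnode, server, ret));

		return ret;
	}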
@@ -275,120 +405,351 @@ int afs_vnode_fetch_data(struct afs_vnode *vnode,
 	 * vnode */
 	do {
 		/* pick a server to query */
-		ret = afs_volume_pick_fileserver(vnode->volume, &server);
-		if (ret < 0)
-			return ret;
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
 
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
-		ret = afs_rxfs_fetch_file_data(server, vnode, desc, NULL);
+		ret = afs_fs_fetch_data(server, key, vnode, offset, length,
+					page, &afs_sync_call);
 
-	} while (!afs_volume_release_fileserver(vnode->volume, server, ret));
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
 	/* adjust the flags */
-	afs_vnode_finalise_status_update(vnode, server, ret);
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+	}
 
 	_leave(" = %d", ret);
 	return ret;
 
-} /* end afs_vnode_fetch_data() */
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	return PTR_ERR(server);
+}
 
-/*****************************************************************************/
 /*
- * break any outstanding callback on a vnode
- * - only relevent to server that issued it
+ * make a file or a directory
  */
-int afs_vnode_give_up_callback(struct afs_vnode *vnode)
-{
-	struct afs_server *server;
-	int ret;
-
-	_enter("%s,{%u,%u,%u}",
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid,
-	       vnode->fid.vnode,
-	       vnode->fid.unique);
-
-	spin_lock(&afs_cb_hash_lock);
-	list_del_init(&vnode->cb_hash_link);
-	spin_unlock(&afs_cb_hash_lock);
-
-	/* set the changed flag in the vnode and release the server */
-	spin_lock(&vnode->lock);
-
-	afs_kafstimod_del_timer(&vnode->cb_timeout);
-
-	server = xchg(&vnode->cb_server, NULL);
-	if (server) {
-		vnode->flags |= AFS_VNODE_CHANGED;
-
-		spin_lock(&server->cb_lock);
-		list_del_init(&vnode->cb_link);
-		spin_unlock(&server->cb_lock);
-	}
-
-	spin_unlock(&vnode->lock);
-
-	ret = 0;
-	if (server) {
-		ret = afs_rxfs_give_up_callback(server, vnode);
-		afs_put_server(server);
-	}
-
-	_leave(" = %d", ret);
-	return ret;
-} /* end afs_vnode_give_up_callback() */
-
-/*****************************************************************************/
-/*
- * match a vnode record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry)
-{
-	const struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = target;
-
-	_enter("{%x,%x,%Lx},{%x,%x,%Lx}",
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       vnode->status.version,
-	       cvnode->vnode_id,
-	       cvnode->vnode_unique,
-	       cvnode->data_version);
-
-	if (vnode->fid.vnode != cvnode->vnode_id) {
-		_leave(" = FAILED");
-		return CACHEFS_MATCH_FAILED;
-	}
-
-	if (vnode->fid.unique != cvnode->vnode_unique ||
-	    vnode->status.version != cvnode->data_version) {
-		_leave(" = DELETE");
-		return CACHEFS_MATCH_SUCCESS_DELETE;
-	}
-
-	_leave(" = SUCCESS");
-	return CACHEFS_MATCH_SUCCESS;
-} /* end afs_vnode_cache_match() */
-#endif
-
-/*****************************************************************************/
-/*
- * update a vnode record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vnode_cache_update(void *source, void *entry)
-{
-	struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = source;
-
-	_enter("");
-
-	cvnode->vnode_id = vnode->fid.vnode;
-	cvnode->vnode_unique = vnode->fid.unique;
-	cvnode->data_version = vnode->status.version;
-
-} /* end afs_vnode_cache_update() */
-#endif
+int afs_vnode_create(struct afs_vnode *vnode, struct key *key,
+		     const char *name, umode_t mode, struct afs_fid *newfid,
+		     struct afs_file_status *newstatus,
+		     struct afs_callback *newcb, struct afs_server **_server)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%u,%u,%u},%x,%s,,",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name);
+
+	/* this op will fetch the status on the directory we're creating in */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_create(server, key, vnode, name, mode, newfid,
+				    newstatus, newcb, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		*_server = server;
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+		*_server = NULL;
+	}
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * remove a file or directory
+ */
+int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name,
+		     bool isdir)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%u,%u,%u},%x,%s",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name);
+
+	/* this op will fetch the status on the directory we're removing from */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_remove(server, key, vnode, name, isdir,
+				    &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+	}
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * create a hard link
+ */
+extern int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
+			  struct key *key, const char *name)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s",
+	       dvnode->volume->vlocation->vldb.name,
+	       dvnode->fid.vid,
+	       dvnode->fid.vnode,
+	       dvnode->fid.unique,
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name);
+
+	/* this op will fetch the status on the directory we're removing from */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+	spin_lock(&dvnode->lock);
+	dvnode->update_cnt++;
+	spin_unlock(&dvnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(dvnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_link(server, key, dvnode, vnode, name,
+				  &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(dvnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_vnode_finalise_status_update(dvnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+		afs_vnode_status_update_failed(dvnode, ret);
+	}
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	spin_lock(&dvnode->lock);
+	dvnode->update_cnt--;
+	ASSERTCMP(dvnode->update_cnt, >=, 0);
+	spin_unlock(&dvnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * create a symbolic link
+ */
+int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key,
+		      const char *name, const char *content,
+		      struct afs_fid *newfid,
+		      struct afs_file_status *newstatus,
+		      struct afs_server **_server)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%u,%u,%u},%x,%s,%s,,,",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name, content);
+
+	/* this op will fetch the status on the directory we're creating in */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_symlink(server, key, vnode, name, content,
+				     newfid, newstatus, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		*_server = server;
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+		*_server = NULL;
+	}
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * rename a file
+ */
+int afs_vnode_rename(struct afs_vnode *orig_dvnode,
+		     struct afs_vnode *new_dvnode,
+		     struct key *key,
+		     const char *orig_name,
+		     const char *new_name)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s,%s",
+	       orig_dvnode->volume->vlocation->vldb.name,
+	       orig_dvnode->fid.vid,
+	       orig_dvnode->fid.vnode,
+	       orig_dvnode->fid.unique,
+	       new_dvnode->volume->vlocation->vldb.name,
+	       new_dvnode->fid.vid,
+	       new_dvnode->fid.vnode,
+	       new_dvnode->fid.unique,
+	       key_serial(key),
+	       orig_name,
+	       new_name);
+
+	/* this op will fetch the status on both the directories we're dealing
+	 * with */
+	spin_lock(&orig_dvnode->lock);
+	orig_dvnode->update_cnt++;
+	spin_unlock(&orig_dvnode->lock);
+	if (new_dvnode != orig_dvnode) {
+		spin_lock(&new_dvnode->lock);
+		new_dvnode->update_cnt++;
+		spin_unlock(&new_dvnode->lock);
+	}
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(orig_dvnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_rename(server, key, orig_dvnode, orig_name,
+				    new_dvnode, new_name, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(orig_dvnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(orig_dvnode, server);
+		if (new_dvnode != orig_dvnode)
+			afs_vnode_finalise_status_update(new_dvnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(orig_dvnode, ret);
+		if (new_dvnode != orig_dvnode)
+			afs_vnode_status_update_failed(new_dvnode, ret);
+	}
+
+	_leave(" = %d [cnt %d]", ret, orig_dvnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&orig_dvnode->lock);
+	orig_dvnode->update_cnt--;
+	ASSERTCMP(orig_dvnode->update_cnt, >=, 0);
+	spin_unlock(&orig_dvnode->lock);
+	if (new_dvnode != orig_dvnode) {
+		spin_lock(&new_dvnode->lock);
+		new_dvnode->update_cnt--;
+		ASSERTCMP(new_dvnode->update_cnt, >=, 0);
+		spin_unlock(&new_dvnode->lock);
+	}
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt);
+	return PTR_ERR(server);
+}
diff --git a/fs/afs/vnode.h b/fs/afs/vnode.h
deleted file mode 100644
index b86a97102e8b..000000000000
--- a/fs/afs/vnode.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* vnode.h: AFS vnode record
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_VNODE_H
-#define _LINUX_AFS_VNODE_H
-
-#include <linux/fs.h>
-#include "server.h"
-#include "kafstimod.h"
-#include "cache.h"
-
-#ifdef __KERNEL__
-
-struct afs_rxfs_fetch_descriptor;
-
-/*****************************************************************************/
-/*
- * vnode catalogue entry
- */
-struct afs_cache_vnode
-{
-	afs_vnodeid_t vnode_id;		/* vnode ID */
-	unsigned vnode_unique;		/* vnode ID uniquifier */
-	afs_dataversion_t data_version;	/* data version */
-};
-
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vnode_cache_index_def;
-#endif
-
-/*****************************************************************************/
-/*
- * AFS inode private data
- */
-struct afs_vnode
-{
-	struct inode		vfs_inode;	/* the VFS's inode record */
-
-	struct afs_volume	*volume;	/* volume on which vnode resides */
-	struct afs_fid		fid;		/* the file identifier for this inode */
-	struct afs_file_status	status;		/* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
-#endif
-
-	wait_queue_head_t	update_waitq;	/* status fetch waitqueue */
-	unsigned		update_cnt;	/* number of outstanding ops that will update the
-						 * status */
-	spinlock_t		lock;		/* waitqueue/flags lock */
-	unsigned		flags;
-#define AFS_VNODE_CHANGED	0x00000001	/* set if vnode reported changed by callback */
-#define AFS_VNODE_DELETED	0x00000002	/* set if vnode deleted on server */
-#define AFS_VNODE_MOUNTPOINT	0x00000004	/* set if vnode is a mountpoint symlink */
-
-	/* outstanding callback notification on this file */
-	struct afs_server	*cb_server;	/* server that made the current promise */
-	struct list_head	cb_link;	/* link in server's promises list */
-	struct list_head	cb_hash_link;	/* link in master callback hash */
-	struct afs_timer	cb_timeout;	/* timeout on promise */
-	unsigned		cb_version;	/* callback version */
-	unsigned		cb_expiry;	/* callback expiry time */
-	afs_callback_type_t	cb_type;	/* type of callback */
-};
-
-static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
-{
-	return container_of(inode,struct afs_vnode,vfs_inode);
-}
-
-static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
-{
-	return &vnode->vfs_inode;
-}
-
-extern int afs_vnode_fetch_status(struct afs_vnode *vnode);
-
-extern int afs_vnode_fetch_data(struct afs_vnode *vnode,
-				struct afs_rxfs_fetch_descriptor *desc);
-
-extern int afs_vnode_give_up_callback(struct afs_vnode *vnode);
-
-extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_AFS_VNODE_H */
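The AFS_FS_I() helper in the deleted header is the usual container_of() accessor tying a filesystem-private structure to the VFS inode embedded inside it. The pattern in isolation (self-contained C; the struct names here are illustrative stand-ins):

	#include <stddef.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct inode { int dummy; };

	struct private_inode {
		struct inode vfs_inode;		/* embedded VFS inode */
		unsigned long private_state;	/* filesystem-private data */
	};

	/* Map an embedded inode pointer back to its containing structure,
	 * exactly as AFS_FS_I() does for struct afs_vnode. */
	static inline struct private_inode *to_private(struct inode *inode)
	{
		return container_of(inode, struct private_inode, vfs_inode);
	}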
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 768c6dbd323a..dd160cada45d 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -1,6 +1,6 @@
-/* volume.c: AFS volume management
+/* AFS volume management
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -15,35 +15,10 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include "volume.h"
-#include "vnode.h"
-#include "cell.h"
-#include "cache.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "vlclient.h"
 #include "internal.h"
 
-#ifdef __KDEBUG
 static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" };
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_volume_cache_index_def = {
-	.name = "volume",
-	.data_size = sizeof(struct afs_cache_vhash),
-	.keys[0] = { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.keys[1] = { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.match = afs_volume_cache_match,
-	.update = afs_volume_cache_update,
-};
-#endif
 
-/*****************************************************************************/
 /*
  * lookup a volume by name
  * - this can be one of the following:
@@ -66,118 +41,52 @@ struct cachefs_index_def afs_volume_cache_index_def = {
  * - Rule 3: If parent volume is R/W, then only mount R/W volume unless
  *	    explicitly told otherwise
  */
-int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
-		      struct afs_volume **_volume)
+struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 {
 	struct afs_vlocation *vlocation = NULL;
 	struct afs_volume *volume = NULL;
-	afs_voltype_t type;
-	const char *cellname, *volname, *suffix;
+	struct afs_server *server = NULL;
 	char srvtmask;
-	int force, ret, loop, cellnamesz, volnamesz;
-
-	_enter("%s,,%d,", name, rwpath);
-
-	if (!name || (name[0] != '%' && name[0] != '#') || !name[1]) {
-		printk("kAFS: unparsable volume name\n");
-		return -EINVAL;
-	}
-
-	/* determine the type of volume we're looking for */
-	force = 0;
-	type = AFSVL_ROVOL;
-
-	if (rwpath || name[0] == '%') {
-		type = AFSVL_RWVOL;
-		force = 1;
-	}
-
-	suffix = strrchr(name, '.');
-	if (suffix) {
-		if (strcmp(suffix, ".readonly") == 0) {
-			type = AFSVL_ROVOL;
-			force = 1;
-		}
-		else if (strcmp(suffix, ".backup") == 0) {
-			type = AFSVL_BACKVOL;
-			force = 1;
-		}
-		else if (suffix[1] == 0) {
-		}
-		else {
-			suffix = NULL;
-		}
-	}
-
-	/* split the cell and volume names */
-	name++;
-	volname = strchr(name, ':');
-	if (volname) {
-		cellname = name;
-		cellnamesz = volname - name;
-		volname++;
-	}
-	else {
-		volname = name;
-		cellname = NULL;
-		cellnamesz = 0;
-	}
-
-	volnamesz = suffix ? suffix - volname : strlen(volname);
-
-	_debug("CELL:%*.*s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
-	       cellnamesz, cellnamesz, cellname ?: "", cell,
-	       volnamesz, volnamesz, volname, suffix ?: "-",
-	       type,
-	       force ? " FORCE" : "");
-
-	/* lookup the cell record */
-	if (cellname || !cell) {
-		ret = afs_cell_lookup(cellname, cellnamesz, &cell);
-		if (ret<0) {
-			printk("kAFS: unable to lookup cell '%s'\n",
-			       cellname ?: "");
-			goto error;
-		}
-	}
-	else {
-		afs_get_cell(cell);
-	}
+	int ret, loop;
+
+	_enter("{%*.*s,%d}",
+	       params->volnamesz, params->volnamesz, params->volname, params->rwpath);
 
 	/* lookup the volume location record */
-	ret = afs_vlocation_lookup(cell, volname, volnamesz, &vlocation);
-	if (ret < 0)
+	vlocation = afs_vlocation_lookup(params->cell, params->key,
+					 params->volname, params->volnamesz);
+	if (IS_ERR(vlocation)) {
+		ret = PTR_ERR(vlocation);
+		vlocation = NULL;
 		goto error;
+	}
 
 	/* make the final decision on the type we want */
 	ret = -ENOMEDIUM;
-	if (force && !(vlocation->vldb.vidmask & (1 << type)))
+	if (params->force && !(vlocation->vldb.vidmask & (1 << params->type)))
 		goto error;
 
 	srvtmask = 0;
 	for (loop = 0; loop < vlocation->vldb.nservers; loop++)
 		srvtmask |= vlocation->vldb.srvtmask[loop];
 
-	if (force) {
-		if (!(srvtmask & (1 << type)))
+	if (params->force) {
+		if (!(srvtmask & (1 << params->type)))
 			goto error;
-	}
-	else if (srvtmask & AFS_VOL_VTM_RO) {
-		type = AFSVL_ROVOL;
-	}
-	else if (srvtmask & AFS_VOL_VTM_RW) {
-		type = AFSVL_RWVOL;
-	}
-	else {
+	} else if (srvtmask & AFS_VOL_VTM_RO) {
+		params->type = AFSVL_ROVOL;
+	} else if (srvtmask & AFS_VOL_VTM_RW) {
+		params->type = AFSVL_RWVOL;
+	} else {
 		goto error;
 	}
 
-	down_write(&cell->vl_sem);
+	down_write(&params->cell->vl_sem);
 
 	/* is the volume already active? */
-	if (vlocation->vols[type]) {
+	if (vlocation->vols[params->type]) {
 		/* yes - re-use it */
-		volume = vlocation->vols[type];
+		volume = vlocation->vols[params->type];
 		afs_get_volume(volume);
 		goto success;
 	}
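The type selection above works on a bitmask with one bit per volume flavour (R/W, R/O, backup) OR'd together across all servers holding the volume. The decision in isolation (plain C; constants mirror the AFS_VOL_VTM_* masks but are illustrative here):

	enum voltype { VOL_RW = 0, VOL_RO = 1, VOL_BAK = 2 };

	#define VTM_RW	0x01	/* an R/W instance of the volume exists */
	#define VTM_RO	0x02	/* an R/O instance of the volume exists */

	/* Prefer the read-only flavour when nothing was forced, as the
	 * lookup above does; fall back to R/W; otherwise fail. */
	static int choose_voltype(unsigned char srvtmask, int force, int type)
	{
		if (force)
			return (srvtmask & (1 << type)) ? type : -1;
		if (srvtmask & VTM_RO)
			return VOL_RO;
		if (srvtmask & VTM_RW)
			return VOL_RW;
		return -1;	/* -ENOMEDIUM in the kernel code */
	}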
@@ -191,23 +100,24 @@ int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
 		goto error_up;
 
 	atomic_set(&volume->usage, 1);
-	volume->type = type;
-	volume->type_force = force;
-	volume->cell = cell;
-	volume->vid = vlocation->vldb.vid[type];
+	volume->type = params->type;
+	volume->type_force = params->force;
+	volume->cell = params->cell;
+	volume->vid = vlocation->vldb.vid[params->type];
 
 	init_rwsem(&volume->server_sem);
 
 	/* look up all the applicable server records */
 	for (loop = 0; loop < 8; loop++) {
 		if (vlocation->vldb.srvtmask[loop] & (1 << volume->type)) {
-			ret = afs_server_lookup(
-				volume->cell,
-				&vlocation->vldb.servers[loop],
-				&volume->servers[volume->nservers]);
-			if (ret < 0)
+			server = afs_lookup_server(
+				volume->cell, &vlocation->vldb.servers[loop]);
+			if (IS_ERR(server)) {
+				ret = PTR_ERR(server);
 				goto error_discard;
+			}
 
+			volume->servers[volume->nservers] = server;
 			volume->nservers++;
 		}
 	}
@@ -223,35 +133,34 @@ int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
 	afs_get_vlocation(vlocation);
 	volume->vlocation = vlocation;
 
-	vlocation->vols[type] = volume;
+	vlocation->vols[volume->type] = volume;
 
- success:
+success:
 	_debug("kAFS selected %s volume %08x",
 	       afs_voltypes[volume->type], volume->vid);
-	*_volume = volume;
-	ret = 0;
+	up_write(&params->cell->vl_sem);
+	afs_put_vlocation(vlocation);
+	_leave(" = %p", volume);
+	return volume;
 
 	/* clean up */
- error_up:
-	up_write(&cell->vl_sem);
- error:
+error_up:
+	up_write(&params->cell->vl_sem);
+error:
 	afs_put_vlocation(vlocation);
-	afs_put_cell(cell);
-
-	_leave(" = %d (%p)", ret, volume);
-	return ret;
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
 
- error_discard:
-	up_write(&cell->vl_sem);
+error_discard:
+	up_write(&params->cell->vl_sem);
 
 	for (loop = volume->nservers - 1; loop >= 0; loop--)
 		afs_put_server(volume->servers[loop]);
 
 	kfree(volume);
 	goto error;
-} /* end afs_volume_lookup() */
+}
@@ -265,10 +174,9 @@ void afs_put_volume(struct afs_volume *volume)
265 174
266 _enter("%p", volume); 175 _enter("%p", volume);
267 176
268 vlocation = volume->vlocation; 177 ASSERTCMP(atomic_read(&volume->usage), >, 0);
269 178
270 /* sanity check */ 179 vlocation = volume->vlocation;
271 BUG_ON(atomic_read(&volume->usage) <= 0);
272 180
273 /* to prevent a race, the decrement and the dequeue must be effectively 181 /* to prevent a race, the decrement and the dequeue must be effectively
274 * atomic */ 182 * atomic */
@@ -296,21 +204,27 @@ void afs_put_volume(struct afs_volume *volume)
296 kfree(volume); 204 kfree(volume);
297 205
298 _leave(" [destroyed]"); 206 _leave(" [destroyed]");
299} /* end afs_put_volume() */ 207}
300 208
301/*****************************************************************************/
302/* 209/*
303 * pick a server to use to try accessing this volume 210 * pick a server to use to try accessing this volume
304 * - returns with an elevated usage count on the server chosen 211 * - returns with an elevated usage count on the server chosen
305 */ 212 */
306int afs_volume_pick_fileserver(struct afs_volume *volume, 213struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode)
307 struct afs_server **_server)
308{ 214{
215 struct afs_volume *volume = vnode->volume;
309 struct afs_server *server; 216 struct afs_server *server;
310 int ret, state, loop; 217 int ret, state, loop;
311 218
312 _enter("%s", volume->vlocation->vldb.name); 219 _enter("%s", volume->vlocation->vldb.name);
313 220
221 /* stick with the server we're already using if we can */
222 if (vnode->server && vnode->server->fs_state == 0) {
223 afs_get_server(vnode->server);
224 _leave(" = %p [current]", vnode->server);
225 return vnode->server;
226 }
227
314 down_read(&volume->server_sem); 228 down_read(&volume->server_sem);
315 229
316 /* handle the no-server case */ 230 /* handle the no-server case */
@@ -318,7 +232,7 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
318 ret = volume->rjservers ? -ENOMEDIUM : -ESTALE; 232 ret = volume->rjservers ? -ENOMEDIUM : -ESTALE;
319 up_read(&volume->server_sem); 233 up_read(&volume->server_sem);
320 _leave(" = %d [no servers]", ret); 234 _leave(" = %d [no servers]", ret);
321 return ret; 235 return ERR_PTR(ret);
322 } 236 }
323 237
324 /* basically, just search the list for the first live server and use 238 /* basically, just search the list for the first live server and use
@@ -328,15 +242,16 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
328 server = volume->servers[loop]; 242 server = volume->servers[loop];
329 state = server->fs_state; 243 state = server->fs_state;
330 244
245 _debug("consider %d [%d]", loop, state);
246
331 switch (state) { 247 switch (state) {
332 /* found an apparently healthy server */ 248 /* found an apparently healthy server */
333 case 0: 249 case 0:
334 afs_get_server(server); 250 afs_get_server(server);
335 up_read(&volume->server_sem); 251 up_read(&volume->server_sem);
336 *_server = server; 252 _leave(" = %p (picked %08x)",
337 _leave(" = 0 (picked %08x)", 253 server, ntohl(server->addr.s_addr));
338 ntohl(server->addr.s_addr)); 254 return server;
339 return 0;
340 255
341 case -ENETUNREACH: 256 case -ENETUNREACH:
342 if (ret == 0) 257 if (ret == 0)
@@ -372,20 +287,21 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
372 */ 287 */
373 up_read(&volume->server_sem); 288 up_read(&volume->server_sem);
374 _leave(" = %d", ret); 289 _leave(" = %d", ret);
375 return ret; 290 return ERR_PTR(ret);
376} /* end afs_volume_pick_fileserver() */ 291}
377 292
378/*****************************************************************************/
379/* 293/*
380 * release a server after use 294 * release a server after use
381 * - releases the ref on the server struct that was acquired by picking 295 * - releases the ref on the server struct that was acquired by picking
382 * - records result of using a particular server to access a volume 296 * - records result of using a particular server to access a volume
383 * - return 0 to try again, 1 if okay or to issue error 297 * - return 0 to try again, 1 if okay or to issue error
298 * - the caller must release the server struct if result was 0
384 */ 299 */
385int afs_volume_release_fileserver(struct afs_volume *volume, 300int afs_volume_release_fileserver(struct afs_vnode *vnode,
386 struct afs_server *server, 301 struct afs_server *server,
387 int result) 302 int result)
388{ 303{
304 struct afs_volume *volume = vnode->volume;
389 unsigned loop; 305 unsigned loop;
390 306
391 _enter("%s,%08x,%d", 307 _enter("%s,%08x,%d",
@@ -396,14 +312,16 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
396 /* success */ 312 /* success */
397 case 0: 313 case 0:
398 server->fs_act_jif = jiffies; 314 server->fs_act_jif = jiffies;
399 break; 315 server->fs_state = 0;
316 _leave("");
317 return 1;
400 318
401 /* the fileserver denied all knowledge of the volume */ 319 /* the fileserver denied all knowledge of the volume */
402 case -ENOMEDIUM: 320 case -ENOMEDIUM:
403 server->fs_act_jif = jiffies; 321 server->fs_act_jif = jiffies;
404 down_write(&volume->server_sem); 322 down_write(&volume->server_sem);
405 323
406 /* first, find where the server is in the active list (if it 324 /* firstly, find where the server is in the active list (if it
407 * is) */ 325 * is) */
408 for (loop = 0; loop < volume->nservers; loop++) 326 for (loop = 0; loop < volume->nservers; loop++)
409 if (volume->servers[loop] == server) 327 if (volume->servers[loop] == server)
@@ -441,6 +359,7 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
441 case -ENETUNREACH: 359 case -ENETUNREACH:
442 case -EHOSTUNREACH: 360 case -EHOSTUNREACH:
443 case -ECONNREFUSED: 361 case -ECONNREFUSED:
362 case -ETIME:
444 case -ETIMEDOUT: 363 case -ETIMEDOUT:
445 case -EREMOTEIO: 364 case -EREMOTEIO:
446 /* mark the server as dead 365 /* mark the server as dead
@@ -460,60 +379,17 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
460 server->fs_act_jif = jiffies; 379 server->fs_act_jif = jiffies;
461 case -ENOMEM: 380 case -ENOMEM:
462 case -ENONET: 381 case -ENONET:
463 break; 382 /* tell the caller to accept the result */
383 afs_put_server(server);
384 _leave(" [local failure]");
385 return 1;
464 } 386 }
465 387
466 /* tell the caller to accept the result */
467 afs_put_server(server);
468 _leave("");
469 return 1;
470
471 /* tell the caller to loop around and try the next server */ 388 /* tell the caller to loop around and try the next server */
472 try_next_server_upw: 389try_next_server_upw:
473 up_write(&volume->server_sem); 390 up_write(&volume->server_sem);
474 try_next_server: 391try_next_server:
475 afs_put_server(server); 392 afs_put_server(server);
476 _leave(" [try next server]"); 393 _leave(" [try next server]");
477 return 0; 394 return 0;
478 395}
479} /* end afs_volume_release_fileserver() */
480
481/*****************************************************************************/
482/*
483 * match a volume hash record stored in the cache
484 */
485#ifdef AFS_CACHING_SUPPORT
486static cachefs_match_val_t afs_volume_cache_match(void *target,
487 const void *entry)
488{
489 const struct afs_cache_vhash *vhash = entry;
490 struct afs_volume *volume = target;
491
492 _enter("{%u},{%u}", volume->type, vhash->vtype);
493
494 if (volume->type == vhash->vtype) {
495 _leave(" = SUCCESS");
496 return CACHEFS_MATCH_SUCCESS;
497 }
498
499 _leave(" = FAILED");
500 return CACHEFS_MATCH_FAILED;
501} /* end afs_volume_cache_match() */
502#endif
503
504/*****************************************************************************/
505/*
506 * update a volume hash record stored in the cache
507 */
508#ifdef AFS_CACHING_SUPPORT
509static void afs_volume_cache_update(void *source, void *entry)
510{
511 struct afs_cache_vhash *vhash = entry;
512 struct afs_volume *volume = source;
513
514 _enter("");
515
516 vhash->vtype = volume->type;
517
518} /* end afs_volume_cache_update() */
519#endif
diff --git a/fs/afs/volume.h b/fs/afs/volume.h
deleted file mode 100644
index bfdcf19ba3f3..000000000000
--- a/fs/afs/volume.h
+++ /dev/null
@@ -1,140 +0,0 @@
1/* volume.h: AFS volume management
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_VOLUME_H
13#define _LINUX_AFS_VOLUME_H
14
15#include "types.h"
16#include "fsclient.h"
17#include "kafstimod.h"
18#include "kafsasyncd.h"
19#include "cache.h"
20
21typedef enum {
22 AFS_VLUPD_SLEEP, /* sleeping waiting for update timer to fire */
23 AFS_VLUPD_PENDING, /* on pending queue */
24 AFS_VLUPD_INPROGRESS, /* op in progress */
25 AFS_VLUPD_BUSYSLEEP, /* sleeping because server returned EBUSY */
26
27} __attribute__((packed)) afs_vlocation_upd_t;
28
29/*****************************************************************************/
30/*
31 * entry in the cached volume location catalogue
32 */
33struct afs_cache_vlocation
34{
35 uint8_t name[64]; /* volume name (lowercase, padded with NULs) */
36 uint8_t nservers; /* number of entries used in servers[] */
37 uint8_t vidmask; /* voltype mask for vid[] */
38 uint8_t srvtmask[8]; /* voltype masks for servers[] */
39#define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */
40#define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */
41#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */
42
43 afs_volid_t vid[3]; /* volume IDs for R/W, R/O and Bak volumes */
44 struct in_addr servers[8]; /* fileserver addresses */
45 time_t rtime; /* last retrieval time */
46};
47
48#ifdef AFS_CACHING_SUPPORT
49extern struct cachefs_index_def afs_vlocation_cache_index_def;
50#endif
51
52/*****************************************************************************/
53/*
54 * volume -> vnode hash table entry
55 */
56struct afs_cache_vhash
57{
58 afs_voltype_t vtype; /* which volume variation */
59 uint8_t hash_bucket; /* which hash bucket this represents */
60} __attribute__((packed));
61
62#ifdef AFS_CACHING_SUPPORT
63extern struct cachefs_index_def afs_volume_cache_index_def;
64#endif
65
66/*****************************************************************************/
67/*
68 * AFS volume location record
69 */
70struct afs_vlocation
71{
72 atomic_t usage;
73 struct list_head link; /* link in cell volume location list */
74 struct afs_timer timeout; /* decaching timer */
75 struct afs_cell *cell; /* cell to which volume belongs */
76#ifdef AFS_CACHING_SUPPORT
77 struct cachefs_cookie *cache; /* caching cookie */
78#endif
79 struct afs_cache_vlocation vldb; /* volume information DB record */
80 struct afs_volume *vols[3]; /* volume access record pointer (index by type) */
81 rwlock_t lock; /* access lock */
82 unsigned long read_jif; /* time at which last read from vlserver */
83 struct afs_timer upd_timer; /* update timer */
84 struct afs_async_op upd_op; /* update operation */
85 afs_vlocation_upd_t upd_state; /* update state */
86 unsigned short upd_first_svix; /* first server index during update */
87 unsigned short upd_curr_svix; /* current server index during update */
88 unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */
89 unsigned short upd_busy_cnt; /* EBUSY count during update */
90 unsigned short valid; /* T if valid */
91};
92
93extern int afs_vlocation_lookup(struct afs_cell *cell,
94 const char *name,
95 unsigned namesz,
96 struct afs_vlocation **_vlocation);
97
98#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0)
99
100extern void afs_put_vlocation(struct afs_vlocation *vlocation);
101extern void afs_vlocation_do_timeout(struct afs_vlocation *vlocation);
102
103/*****************************************************************************/
104/*
105 * AFS volume access record
106 */
107struct afs_volume
108{
109 atomic_t usage;
110 struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */
111 struct afs_vlocation *vlocation; /* volume location */
112#ifdef AFS_CACHING_SUPPORT
113 struct cachefs_cookie *cache; /* caching cookie */
114#endif
115 afs_volid_t vid; /* volume ID */
116 afs_voltype_t type; /* type of volume */
117 char type_force; /* force volume type (suppress R/O -> R/W) */
118 unsigned short nservers; /* number of server slots filled */
119 unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
120 struct afs_server *servers[8]; /* servers on which volume resides (ordered) */
121 struct rw_semaphore server_sem; /* lock for accessing current server */
122};
123
124extern int afs_volume_lookup(const char *name,
125 struct afs_cell *cell,
126 int rwpath,
127 struct afs_volume **_volume);
128
129#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
130
131extern void afs_put_volume(struct afs_volume *volume);
132
133extern int afs_volume_pick_fileserver(struct afs_volume *volume,
134 struct afs_server **_server);
135
136extern int afs_volume_release_fileserver(struct afs_volume *volume,
137 struct afs_server *server,
138 int result);
139
140#endif /* _LINUX_AFS_VOLUME_H */
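The header deleted above declared the old out-parameter interfaces (afs_volume_lookup() and afs_volume_pick_fileserver() returned an int and filled in a struct pointer); the rewritten volume.c returns the object directly and encodes failures with ERR_PTR(), which is what the new IS_ERR()/PTR_ERR() checks earlier in this diff are testing. A minimal sketch of the idiom, using a hypothetical object type rather than the real afs structures:

#include <linux/err.h>
#include <linux/slab.h>

struct example_obj {
	int key;
};

/* On success return the object; on failure encode -errno in the pointer. */
static struct example_obj *example_lookup(int key)
{
	struct example_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return ERR_PTR(-ENOMEM);
	obj->key = key;
	return obj;
}

/* Callers test with IS_ERR() and recover the errno with PTR_ERR(),
 * with no separate int return or output parameter to keep in sync. */
static int example_caller(void)
{
	struct example_obj *obj = example_lookup(42);

	if (IS_ERR(obj))
		return PTR_ERR(obj);
	kfree(obj);
	return 0;
}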
diff --git a/fs/aio.c b/fs/aio.c
index e4598d6d49dd..b97ab8028b6d 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -68,10 +68,8 @@ static void aio_queue_work(struct kioctx *);
68 */ 68 */
69static int __init aio_setup(void) 69static int __init aio_setup(void)
70{ 70{
71 kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb), 71 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
72 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 72 kioctx_cachep = KMEM_CACHE(kioctx, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
73 kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx),
74 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
75 73
76 aio_wq = create_workqueue("aio"); 74 aio_wq = create_workqueue("aio");
77 75
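The aio.c hunk above folds two open-coded kmem_cache_create() calls into the KMEM_CACHE() helper. A sketch of roughly what the helper expands to in this kernel series (an assumption based on <linux/slab.h> of the era; kmem_cache_create() still took constructor and destructor arguments, hence the two NULLs visible in the removed lines):

/* Derive the cache name, object size and alignment from the type itself,
 * so the arguments cannot drift out of sync with the struct definition. */
#define KMEM_CACHE(__struct, __flags)					\
	kmem_cache_create(#__struct, sizeof(struct __struct),		\
			  __alignof__(struct __struct), (__flags),	\
			  NULL, NULL)

Besides the brevity, passing __alignof__ requests the type's natural alignment, where the open-coded calls passed 0 and took the allocator's default.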
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index cc6cc8ed2e39..fe96108a788d 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -293,8 +293,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
293{ 293{
294 struct befs_inode_info *bi = (struct befs_inode_info *) foo; 294 struct befs_inode_info *bi = (struct befs_inode_info *) foo;
295 295
296 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 296 if (flags & SLAB_CTOR_CONSTRUCTOR) {
297 SLAB_CTOR_CONSTRUCTOR) {
298 inode_init_once(&bi->vfs_inode); 297 inode_init_once(&bi->vfs_inode);
299 } 298 }
300} 299}
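The same two-line simplification recurs in bfs, block_dev.c, buffer.c and cifs below. The apparent reason (an assumption about this kernel series) is that the slab allocator's verify pass is gone: constructors are no longer re-invoked with SLAB_CTOR_VERIFY set for debugging, so the single-flag test suffices. The generic shape, with a hypothetical cache object:

#include <linux/fs.h>
#include <linux/slab.h>

static void example_init_once(void *foo, struct kmem_cache *cachep,
			      unsigned long flags)
{
	struct inode *inode = foo;

	/* was: if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
	 *		SLAB_CTOR_CONSTRUCTOR) */
	if (flags & SLAB_CTOR_CONSTRUCTOR)
		inode_init_once(inode);
}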
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 93d6219243ad..edc08d89aabc 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -248,8 +248,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
248{ 248{
249 struct bfs_inode_info *bi = foo; 249 struct bfs_inode_info *bi = foo;
250 250
251 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 251 if (flags & SLAB_CTOR_CONSTRUCTOR)
252 SLAB_CTOR_CONSTRUCTOR)
253 inode_init_once(&bi->vfs_inode); 252 inode_init_once(&bi->vfs_inode);
254} 253}
255 254
diff --git a/fs/bio.c b/fs/bio.c
index 7618bcb18368..093345f00128 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,7 +28,7 @@
28#include <linux/blktrace_api.h> 28#include <linux/blktrace_api.h>
29#include <scsi/sg.h> /* for struct sg_iovec */ 29#include <scsi/sg.h> /* for struct sg_iovec */
30 30
31#define BIO_POOL_SIZE 256 31#define BIO_POOL_SIZE 2
32 32
33static struct kmem_cache *bio_slab __read_mostly; 33static struct kmem_cache *bio_slab __read_mostly;
34 34
@@ -38,7 +38,7 @@ static struct kmem_cache *bio_slab __read_mostly;
38 * a small number of entries is fine, not going to be performance critical. 38 * a small number of entries is fine, not going to be performance critical.
39 * basically we just need to survive 39 * basically we just need to survive
40 */ 40 */
41#define BIO_SPLIT_ENTRIES 8 41#define BIO_SPLIT_ENTRIES 2
42mempool_t *bio_split_pool __read_mostly; 42mempool_t *bio_split_pool __read_mostly;
43 43
44struct biovec_slab { 44struct biovec_slab {
@@ -1120,7 +1120,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
1120 * create memory pools for biovec's in a bio_set. 1120 * create memory pools for biovec's in a bio_set.
1121 * use the global biovec slabs created for general use. 1121 * use the global biovec slabs created for general use.
1122 */ 1122 */
1123static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale) 1123static int biovec_create_pools(struct bio_set *bs, int pool_entries)
1124{ 1124{
1125 int i; 1125 int i;
1126 1126
@@ -1128,9 +1128,6 @@ static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
1128 struct biovec_slab *bp = bvec_slabs + i; 1128 struct biovec_slab *bp = bvec_slabs + i;
1129 mempool_t **bvp = bs->bvec_pools + i; 1129 mempool_t **bvp = bs->bvec_pools + i;
1130 1130
1131 if (pool_entries > 1 && i >= scale)
1132 pool_entries >>= 1;
1133
1134 *bvp = mempool_create_slab_pool(pool_entries, bp->slab); 1131 *bvp = mempool_create_slab_pool(pool_entries, bp->slab);
1135 if (!*bvp) 1132 if (!*bvp)
1136 return -ENOMEM; 1133 return -ENOMEM;
@@ -1161,7 +1158,7 @@ void bioset_free(struct bio_set *bs)
1161 kfree(bs); 1158 kfree(bs);
1162} 1159}
1163 1160
1164struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale) 1161struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
1165{ 1162{
1166 struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL); 1163 struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);
1167 1164
@@ -1172,7 +1169,7 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
1172 if (!bs->bio_pool) 1169 if (!bs->bio_pool)
1173 goto bad; 1170 goto bad;
1174 1171
1175 if (!biovec_create_pools(bs, bvec_pool_size, scale)) 1172 if (!biovec_create_pools(bs, bvec_pool_size))
1176 return bs; 1173 return bs;
1177 1174
1178bad: 1175bad:
@@ -1196,38 +1193,11 @@ static void __init biovec_init_slabs(void)
1196 1193
1197static int __init init_bio(void) 1194static int __init init_bio(void)
1198{ 1195{
1199 int megabytes, bvec_pool_entries; 1196 bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
1200 int scale = BIOVEC_NR_POOLS;
1201
1202 bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0,
1203 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1204 1197
1205 biovec_init_slabs(); 1198 biovec_init_slabs();
1206 1199
1207 megabytes = nr_free_pages() >> (20 - PAGE_SHIFT); 1200 fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
1208
1209 /*
1210 * find out where to start scaling
1211 */
1212 if (megabytes <= 16)
1213 scale = 0;
1214 else if (megabytes <= 32)
1215 scale = 1;
1216 else if (megabytes <= 64)
1217 scale = 2;
1218 else if (megabytes <= 96)
1219 scale = 3;
1220 else if (megabytes <= 128)
1221 scale = 4;
1222
1223 /*
1224 * Limit number of entries reserved -- mempools are only used when
1225 * the system is completely unable to allocate memory, so we only
1226 * need enough to make progress.
1227 */
1228 bvec_pool_entries = 1 + scale;
1229
1230 fs_bio_set = bioset_create(BIO_POOL_SIZE, bvec_pool_entries, scale);
1231 if (!fs_bio_set) 1201 if (!fs_bio_set)
1232 panic("bio: can't allocate bios\n"); 1202 panic("bio: can't allocate bios\n");
1233 1203
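bioset_create() loses its scale argument above because the memory-size-based shrinking of the biovec mempool reserves is removed: as the deleted comment says, mempools are only used when the system cannot allocate at all, so a small constant reserve is enough to guarantee forward progress. A hypothetical out-of-tree caller would adapt like this (pool sizes illustrative):

#include <linux/bio.h>

static struct bio_set *example_bioset(void)
{
	/* was: bioset_create(4, 4, scale) */
	return bioset_create(4 /* bio reserve */, 2 /* biovec reserve */);
}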
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 575076c018f4..f02b7bdd9864 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -55,10 +55,12 @@ static sector_t max_block(struct block_device *bdev)
55 return retval; 55 return retval;
56} 56}
57 57
58/* Kill _all_ buffers, dirty or not.. */ 58/* Kill _all_ buffers and pagecache, dirty or not.. */
59static void kill_bdev(struct block_device *bdev) 59static void kill_bdev(struct block_device *bdev)
60{ 60{
61 invalidate_bdev(bdev, 1); 61 if (bdev->bd_inode->i_mapping->nrpages == 0)
62 return;
63 invalidate_bh_lrus();
62 truncate_inode_pages(bdev->bd_inode->i_mapping, 0); 64 truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
63} 65}
64 66
@@ -455,9 +457,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
455 struct bdev_inode *ei = (struct bdev_inode *) foo; 457 struct bdev_inode *ei = (struct bdev_inode *) foo;
456 struct block_device *bdev = &ei->bdev; 458 struct block_device *bdev = &ei->bdev;
457 459
458 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 460 if (flags & SLAB_CTOR_CONSTRUCTOR) {
459 SLAB_CTOR_CONSTRUCTOR)
460 {
461 memset(bdev, 0, sizeof(*bdev)); 461 memset(bdev, 0, sizeof(*bdev));
462 mutex_init(&bdev->bd_mutex); 462 mutex_init(&bdev->bd_mutex);
463 sema_init(&bdev->bd_mount_sem, 1); 463 sema_init(&bdev->bd_mount_sem, 1);
@@ -1478,7 +1478,7 @@ int __invalidate_device(struct block_device *bdev)
1478 res = invalidate_inodes(sb); 1478 res = invalidate_inodes(sb);
1479 drop_super(sb); 1479 drop_super(sb);
1480 } 1480 }
1481 invalidate_bdev(bdev, 0); 1481 invalidate_bdev(bdev);
1482 return res; 1482 return res;
1483} 1483}
1484EXPORT_SYMBOL(__invalidate_device); 1484EXPORT_SYMBOL(__invalidate_device);
diff --git a/fs/buffer.c b/fs/buffer.c
index 1d0852fa728b..7db24b9e5449 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -44,7 +44,6 @@
44#include <linux/bit_spinlock.h> 44#include <linux/bit_spinlock.h>
45 45
46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); 46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47static void invalidate_bh_lrus(void);
48 47
49#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) 48#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
50 49
@@ -333,7 +332,7 @@ out:
333 we think the disk contains more recent information than the buffercache. 332 we think the disk contains more recent information than the buffercache.
334 The update == 1 pass marks the buffers we need to update, the update == 2 333 The update == 1 pass marks the buffers we need to update, the update == 2
335 pass does the actual I/O. */ 334 pass does the actual I/O. */
336void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) 335void invalidate_bdev(struct block_device *bdev)
337{ 336{
338 struct address_space *mapping = bdev->bd_inode->i_mapping; 337 struct address_space *mapping = bdev->bd_inode->i_mapping;
339 338
@@ -341,11 +340,6 @@ void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
341 return; 340 return;
342 341
343 invalidate_bh_lrus(); 342 invalidate_bh_lrus();
344 /*
345 * FIXME: what about destroy_dirty_buffers?
346 * We really want to use invalidate_inode_pages2() for
347 * that, but not until that's cleaned up.
348 */
349 invalidate_mapping_pages(mapping, 0, -1); 343 invalidate_mapping_pages(mapping, 0, -1);
350} 344}
351 345
@@ -1408,7 +1402,7 @@ static void invalidate_bh_lru(void *arg)
1408 put_cpu_var(bh_lrus); 1402 put_cpu_var(bh_lrus);
1409} 1403}
1410 1404
1411static void invalidate_bh_lrus(void) 1405void invalidate_bh_lrus(void)
1412{ 1406{
1413 on_each_cpu(invalidate_bh_lru, NULL, 1, 1); 1407 on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1414} 1408}
@@ -1700,17 +1694,8 @@ done:
1700 * clean. Someone wrote them back by hand with 1694 * clean. Someone wrote them back by hand with
1701 * ll_rw_block/submit_bh. A rare case. 1695 * ll_rw_block/submit_bh. A rare case.
1702 */ 1696 */
1703 int uptodate = 1;
1704 do {
1705 if (!buffer_uptodate(bh)) {
1706 uptodate = 0;
1707 break;
1708 }
1709 bh = bh->b_this_page;
1710 } while (bh != head);
1711 if (uptodate)
1712 SetPageUptodate(page);
1713 end_page_writeback(page); 1697 end_page_writeback(page);
1698
1714 /* 1699 /*
1715 * The page and buffer_heads can be released at any time from 1700 * The page and buffer_heads can be released at any time from
1716 * here on. 1701 * here on.
@@ -2968,8 +2953,7 @@ EXPORT_SYMBOL(free_buffer_head);
2968static void 2953static void
2969init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags) 2954init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags)
2970{ 2955{
2971 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 2956 if (flags & SLAB_CTOR_CONSTRUCTOR) {
2972 SLAB_CTOR_CONSTRUCTOR) {
2973 struct buffer_head * bh = (struct buffer_head *)data; 2957 struct buffer_head * bh = (struct buffer_head *)data;
2974 2958
2975 memset(bh, 0, sizeof(*bh)); 2959 memset(bh, 0, sizeof(*bh));
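With the FIXME comment resolved, invalidate_bdev() drops the destroy_dirty_buffers flag that no caller meaningfully used, and invalidate_bh_lrus() becomes non-static so that kill_bdev() in block_dev.c (earlier in this diff) can flush the per-CPU buffer-head LRUs itself. A sketch of how a caller adapts, assuming the new declarations land in <linux/buffer_head.h>:

#include <linux/buffer_head.h>
#include <linux/fs.h>

static void example_drop_caches(struct block_device *bdev)
{
	invalidate_bh_lrus();	/* previously static to fs/buffer.c */
	invalidate_bdev(bdev);	/* was: invalidate_bdev(bdev, 1) */
}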
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 5d1f4873d701..a9b6bc5157b8 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,4 +1,16 @@
1Verison 1.48 1Version 1.49
2------------
3IPv6 support. Enable ipv6 addresses to be passed on mount (put the ipv6
4address after the "ip=" mount option, at least until mount.cifs is fixed to
5handle DNS host to ipv6 name translation). Accept override of uid or gid
6on mount even when Unix Extensions are negotiated (it used to be ignored
7when Unix Extensions were ignored). This allows users to override the
8default uid and gid for files when they are certain that the uids or
9gids on the server do not match those of the client. Make "sec=none"
10mount override username (so that null user connection is attempted)
11to match what documentation said.
12
13Version 1.48
2------------ 14------------
3Fix mtime bouncing around from local idea of last write times to remote time. 15Fix mtime bouncing around from local idea of last write times to remote time.
4Fix hang (in i_size_read) when simultaneous size update of same remote file 16Fix hang (in i_size_read) when simultaneous size update of same remote file
@@ -9,7 +21,13 @@ from read-only back to read-write, reflect this change in default file mode
9(we had been leaving a file's mode read-only until the inode were reloaded). 21(we had been leaving a file's mode read-only until the inode were reloaded).
10Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute 22Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute
11when archive dos attribute not set and we are changing mode back to writeable 23when archive dos attribute not set and we are changing mode back to writeable
12on server which does not support the Unix Extensions). 24on server which does not support the Unix Extensions). Remove read only dos
25attribute on chmod when adding any write permission (ie on any of
26user/group/other, not all of user/group/other ie 0222) when
27mounted to windows. Add support for POSIX MkDir (slight performance
28enhancement and eliminates the network race between the mkdir and set
29path info of the mode).
30
13 31
14Version 1.47 32Version 1.47
15------------ 33------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 080c5eba112b..4d01697722cc 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -257,13 +257,19 @@ A partial list of the supported mount options follows:
257 mount. 257 mount.
258 domain Set the SMB/CIFS workgroup name prepended to the 258 domain Set the SMB/CIFS workgroup name prepended to the
259 username during CIFS session establishment 259 username during CIFS session establishment
260 uid If CIFS Unix extensions are not supported by the server 260 uid Set the default uid for inodes. For mounts to servers
261 this overrides the default uid for inodes. For mounts to 261 which do support the CIFS Unix extensions, such as a
262 servers which do support the CIFS Unix extensions, such 262 properly configured Samba server, the server provides
263 as a properly configured Samba server, the server provides 263 the uid, gid and mode so this parameter should not be
264 the uid, gid and mode. For servers which do not support 264 specified unless the server and client uid and gid
265 the Unix extensions, the default uid (and gid) returned on 265 numbering differ. If the server and client are in the
266 lookup of existing files is the uid (gid) of the person 266 same domain (e.g. running winbind or nss_ldap) and
267 the server supports the Unix Extensions then the uid
268 and gid can be retrieved from the server (and uid
269 and gid would not have to be specified on the mount).
270 For servers which do not support the CIFS Unix
271 extensions, the default uid (and gid) returned on lookup
272 of existing files will be the uid (gid) of the person
267 who executed the mount (root, except when mount.cifs 273 who executed the mount (root, except when mount.cifs
268 is configured setuid for user mounts) unless the "uid=" 274 is configured setuid for user mounts) unless the "uid="
269 (gid) mount option is specified. For the uid (gid) of newly 275 (gid) mount option is specified. For the uid (gid) of newly
@@ -281,8 +287,7 @@ A partial list of the supported mount options follows:
281 the client. Note that the mount.cifs helper must be 287 the client. Note that the mount.cifs helper must be
282 at version 1.10 or higher to support specifying the uid 288 at version 1.10 or higher to support specifying the uid
283 (or gid) in non-numeric form. 289 (or gid) in non-numeric form.
284 gid If CIFS Unix extensions are not supported by the server 290 gid Set the default gid for inodes (similar to above).
285 this overrides the default gid for inodes.
286 file_mode If CIFS Unix extensions are not supported by the server 291 file_mode If CIFS Unix extensions are not supported by the server
287 this overrides the default mode for file inodes. 292 this overrides the default mode for file inodes.
288 dir_mode If CIFS Unix extensions are not supported by the server 293 dir_mode If CIFS Unix extensions are not supported by the server
@@ -467,7 +472,7 @@ including:
467 -V print mount.cifs version 472 -V print mount.cifs version
468 -? display simple usage information 473 -? display simple usage information
469 474
470With recent 2.6 kernel versions of modutils, the version of the cifs kernel 475With most 2.6 kernel versions of modutils, the version of the cifs kernel
471module can be displayed via modinfo. 476module can be displayed via modinfo.
472 477
473Misc /proc/fs/cifs Flags and Debug Info 478Misc /proc/fs/cifs Flags and Debug Info
@@ -516,8 +521,22 @@ SecurityFlags Flags which control security negotiation and
516 must use plaintext passwords 0x20020 521 must use plaintext passwords 0x20020
517 (reserved for future packet encryption) 0x00040 522 (reserved for future packet encryption) 0x00040
518 523
519cifsFYI If set to one, additional debug information is 524cifsFYI If set to a non-zero value, additional debug information
520 logged to the system error log. (default 0) 525 will be logged to the system error log. This field
526 contains three flags controlling different classes of
527 debugging entries. The maximum value it can be set
528 to is 7 which enables all debugging points (default 0).
529 Some debugging statements are not compiled into the
530 cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the
531 kernel configuration. cifsFYI may be set to one or
532 more of the following flags (7 sets them all):
533
534 log cifs informational messages 0x01
535 log return codes from cifs entry points 0x02
536 log slow responses (ie those taking longer than 1 second)
537 CONFIG_CIFS_STATS2 must be enabled in .config 0x04
538
539
521traceSMB If set to one, debug information is logged to the 540traceSMB If set to one, debug information is logged to the
522 system error log with the start of smb requests 541 system error log with the start of smb requests
523 and responses (default 0) 542 and responses (default 0)
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index d7b9c27c942d..78b620e332bd 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -1,4 +1,4 @@
1Version 1.39 November 30, 2005 1Version 1.49 April 26, 2007
2 2
3A Partial List of Missing Features 3A Partial List of Missing Features
4================================== 4==================================
@@ -18,7 +18,7 @@ better)
18 18
19d) Kerberos/SPNEGO session setup support - (started) 19d) Kerberos/SPNEGO session setup support - (started)
20 20
21e) NTLMv2 authentication (mostly implemented - double check 21e) More testing of NTLMv2 authentication (mostly implemented - double check
22that NTLMv2 signing works, also need to cleanup now unneeded SessSetup code in 22that NTLMv2 signing works, also need to cleanup now unneeded SessSetup code in
23fs/cifs/connect.c) 23fs/cifs/connect.c)
24 24
@@ -27,55 +27,44 @@ used (Kerberos or NTLMSSP). Signing already implemented for NTLM
27and raw NTLMSSP already. This is important when enabling 27and raw NTLMSSP already. This is important when enabling
28extended security and mounting to Windows 2003 Servers 28extended security and mounting to Windows 2003 Servers
29 29
30f) Directory entry caching relies on a 1 second timer, rather than 30g) Directory entry caching relies on a 1 second timer, rather than
31using FindNotify or equivalent. - (started) 31using FindNotify or equivalent. - (started)
32 32
33g) A few byte range testcases fail due to POSIX vs. Windows/CIFS 33h) quota support (needs minor kernel change since quota calls
34style byte range lock differences. Save byte range locks so
35reconnect can replay them.
36
37h) Support unlock all (unlock 0,MAX_OFFSET)
38by unlocking all known byte range locks that we locked on the file.
39
40i) quota support (needs minor kernel change since quota calls
41to make it to network filesystems or deviceless filesystems) 34to make it to network filesystems or deviceless filesystems)
42 35
43j) investigate sync behavior (including syncpage) and check 36i) investigate sync behavior (including syncpage) and check
44for proper behavior of intr/nointr 37for proper behavior of intr/nointr
45 38
46k) hook lower into the sockets api (as NFS/SunRPC does) to avoid the 39j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
47extra copy in/out of the socket buffers in some cases. 40extra copy in/out of the socket buffers in some cases.
48 41
49l) finish support for IPv6. This is mostly complete but 42k) Better optimize open (and pathbased setfilesize) to reduce the
50needs a simple conversion of ipv6 to sin6_addr from the
51address in string representation.
52
53m) Better optimize open (and pathbased setfilesize) to reduce the
54oplock breaks coming from windows srv. Piggyback identical file 43oplock breaks coming from windows srv. Piggyback identical file
55opens on top of each other by incrementing reference count rather 44opens on top of each other by incrementing reference count rather
56than resending (helps reduce server resource utilization and avoid 45than resending (helps reduce server resource utilization and avoid
57spurious oplock breaks). 46spurious oplock breaks).
58 47
59o) Improve performance of readpages by sending more than one read 48l) Improve performance of readpages by sending more than one read
60at a time when 8 pages or more are requested. In conjunction 49at a time when 8 pages or more are requested. In conjunction
61add support for async_cifs_readpages. 50add support for async_cifs_readpages.
62 51
63p) Add support for storing symlink info to Windows servers 52m) Add support for storing symlink info to Windows servers
64in the Extended Attribute format their SFU clients would recognize. 53in the Extended Attribute format their SFU clients would recognize.
65 54
66q) Finish fcntl D_NOTIFY support so kde and gnome file list windows 55n) Finish fcntl D_NOTIFY support so kde and gnome file list windows
67will autorefresh (partially complete by Asser). Needs minor kernel 56will autorefresh (partially complete by Asser). Needs minor kernel
68vfs change to support removing D_NOTIFY on a file. 57vfs change to support removing D_NOTIFY on a file.
69 58
70r) Add GUI tool to configure /proc/fs/cifs settings and for display of 59o) Add GUI tool to configure /proc/fs/cifs settings and for display of
71the CIFS statistics (started) 60the CIFS statistics (started)
72 61
73s) implement support for security and trusted categories of xattrs 62p) implement support for security and trusted categories of xattrs
74(requires minor protocol extension) to enable better support for SELINUX 63(requires minor protocol extension) to enable better support for SELINUX
75 64
76t) Implement O_DIRECT flag on open (already supported on mount) 65q) Implement O_DIRECT flag on open (already supported on mount)
77 66
78u) Create UID mapping facility so server UIDs can be mapped on a per 67r) Create UID mapping facility so server UIDs can be mapped on a per
79mount or a per server basis to client UIDs or nobody if no mapping 68mount or a per server basis to client UIDs or nobody if no mapping
80exists. This is helpful when Unix extensions are negotiated to 69exists. This is helpful when Unix extensions are negotiated to
81allow better permission checking when UIDs differ on the server 70allow better permission checking when UIDs differ on the server
@@ -83,19 +72,26 @@ and client. Add new protocol request to the CIFS protocol
83standard for asking the server for the corresponding name of a 72standard for asking the server for the corresponding name of a
84particular uid. 73particular uid.
85 74
86v) Add support for CIFS Unix and also the newer POSIX extensions to the 75s) Add support for CIFS Unix and also the newer POSIX extensions to the
87server side for Samba 4. 76server side for Samba 4.
88 77
89w) Finish up the dos time conversion routines needed to return old server 78t) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers)
90time to the client (default time, of now or time 0 is used now for these
91very old servers)
92
93x) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers)
94need to add ability to set time to server (utimes command) 79need to add ability to set time to server (utimes command)
95 80
96y) Finish testing of Windows 9x/Windows ME server support (started). 81u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
82
83v) mount check for unmatched uids
84
85w) Add mount option for Linux extension disable per mount, and partial
86disable per mount (uid off, symlink/fifo/mknod on but what about posix acls?)
97 87
98KNOWN BUGS (updated February 26, 2007) 88x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
89processes can proceed better in parallel (on the server)
90
91y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount
92restriction of wsize max being 127K)
93
94KNOWN BUGS (updated April 24, 2007)
99==================================== 95====================================
100See http://bugzilla.samba.org - search on product "CifsVFS" for 96See http://bugzilla.samba.org - search on product "CifsVFS" for
101current bug list. 97current bug list.
@@ -127,10 +123,3 @@ negotiated size) and send larger write sizes to modern servers.
1274) More exhaustively test against less common servers. More testing 1234) More exhaustively test against less common servers. More testing
128against Windows 9x, Windows ME servers. 124against Windows 9x, Windows ME servers.
129 125
130DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
131
132mount check for unmatched uids - and uid override
133
134Add mount option for Linux extension disable per mount, and partial disable per mount (uid off, symlink/fifo/mknod on but what about posix acls?)
135
136Free threads at umount --force that are stuck on the sesSem
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index fd1e52ebcee6..4cc2012e9322 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -22,12 +22,14 @@
22#define CIFS_MOUNT_SET_UID 2 /* set current->euid in create etc. */ 22#define CIFS_MOUNT_SET_UID 2 /* set current->euid in create etc. */
23#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ 23#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */
24#define CIFS_MOUNT_DIRECT_IO 8 /* do not write nor read through page cache */ 24#define CIFS_MOUNT_DIRECT_IO 8 /* do not write nor read through page cache */
25#define CIFS_MOUNT_NO_XATTR 0x10 /* if set - disable xattr support */ 25#define CIFS_MOUNT_NO_XATTR 0x10 /* if set - disable xattr support */
26#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames */ 26#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames */
27#define CIFS_MOUNT_POSIX_PATHS 0x40 /* Negotiate posix pathnames if possible. */ 27#define CIFS_MOUNT_POSIX_PATHS 0x40 /* Negotiate posix pathnames if possible*/
28#define CIFS_MOUNT_UNX_EMUL 0x80 /* Network compat with SFUnix emulation */ 28#define CIFS_MOUNT_UNX_EMUL 0x80 /* Network compat with SFUnix emulation */
29#define CIFS_MOUNT_NO_BRL 0x100 /* No sending byte range locks to srv */ 29#define CIFS_MOUNT_NO_BRL 0x100 /* No sending byte range locks to srv */
30#define CIFS_MOUNT_CIFS_ACL 0x200 /* send ACL requests to non-POSIX srv */ 30#define CIFS_MOUNT_CIFS_ACL 0x200 /* send ACL requests to non-POSIX srv */
31#define CIFS_MOUNT_OVERR_UID 0x400 /* override uid returned from server */
32#define CIFS_MOUNT_OVERR_GID 0x800 /* override gid returned from server */
31 33
32struct cifs_sb_info { 34struct cifs_sb_info {
33 struct cifsTconInfo *tcon; /* primary mount */ 35 struct cifsTconInfo *tcon; /* primary mount */
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index d2a8b2941fc2..793c4b95c164 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -74,8 +74,8 @@ cifs_strtoUCS(__le16 * to, const char *from, int len,
74 charlen = codepage->char2uni(from, len, &wchar_to[i]); 74 charlen = codepage->char2uni(from, len, &wchar_to[i]);
75 if (charlen < 1) { 75 if (charlen < 1) {
76 cERROR(1, 76 cERROR(1,
77 ("cifs_strtoUCS: char2uni returned %d", 77 ("strtoUCS: char2uni of %d returned %d",
78 charlen)); 78 (int)*from, charlen));
79 /* A question mark */ 79 /* A question mark */
80 to[i] = cpu_to_le16(0x003f); 80 to[i] = cpu_to_le16(0x003f);
81 charlen = 1; 81 charlen = 1;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index faba4d69fe91..8568e100953c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -100,7 +100,7 @@ cifs_read_super(struct super_block *sb, void *data,
100 sb->s_flags |= MS_NODIRATIME | MS_NOATIME; 100 sb->s_flags |= MS_NODIRATIME | MS_NOATIME;
101 sb->s_fs_info = kzalloc(sizeof(struct cifs_sb_info),GFP_KERNEL); 101 sb->s_fs_info = kzalloc(sizeof(struct cifs_sb_info),GFP_KERNEL);
102 cifs_sb = CIFS_SB(sb); 102 cifs_sb = CIFS_SB(sb);
103 if(cifs_sb == NULL) 103 if (cifs_sb == NULL)
104 return -ENOMEM; 104 return -ENOMEM;
105 105
106 rc = cifs_mount(sb, cifs_sb, data, devname); 106 rc = cifs_mount(sb, cifs_sb, data, devname);
@@ -115,10 +115,10 @@ cifs_read_super(struct super_block *sb, void *data,
115 sb->s_magic = CIFS_MAGIC_NUMBER; 115 sb->s_magic = CIFS_MAGIC_NUMBER;
116 sb->s_op = &cifs_super_ops; 116 sb->s_op = &cifs_super_ops;
117#ifdef CONFIG_CIFS_EXPERIMENTAL 117#ifdef CONFIG_CIFS_EXPERIMENTAL
118 if(experimEnabled != 0) 118 if (experimEnabled != 0)
119 sb->s_export_op = &cifs_export_ops; 119 sb->s_export_op = &cifs_export_ops;
120#endif /* EXPERIMENTAL */ 120#endif /* EXPERIMENTAL */
121/* if(cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512) 121/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
122 sb->s_blocksize = cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */ 122 sb->s_blocksize = cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
123#ifdef CONFIG_CIFS_QUOTA 123#ifdef CONFIG_CIFS_QUOTA
124 sb->s_qcop = &cifs_quotactl_ops; 124 sb->s_qcop = &cifs_quotactl_ops;
@@ -147,8 +147,8 @@ out_no_root:
147 iput(inode); 147 iput(inode);
148 148
149out_mount_failed: 149out_mount_failed:
150 if(cifs_sb) { 150 if (cifs_sb) {
151 if(cifs_sb->local_nls) 151 if (cifs_sb->local_nls)
152 unload_nls(cifs_sb->local_nls); 152 unload_nls(cifs_sb->local_nls);
153 kfree(cifs_sb); 153 kfree(cifs_sb);
154 } 154 }
@@ -163,7 +163,7 @@ cifs_put_super(struct super_block *sb)
163 163
164 cFYI(1, ("In cifs_put_super")); 164 cFYI(1, ("In cifs_put_super"));
165 cifs_sb = CIFS_SB(sb); 165 cifs_sb = CIFS_SB(sb);
166 if(cifs_sb == NULL) { 166 if (cifs_sb == NULL) {
167 cFYI(1,("Empty cifs superblock info passed to unmount")); 167 cFYI(1,("Empty cifs superblock info passed to unmount"));
168 return; 168 return;
169 } 169 }
@@ -208,14 +208,14 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
208 208
209 /* Only need to call the old QFSInfo if failed 209 /* Only need to call the old QFSInfo if failed
210 on newer one */ 210 on newer one */
211 if(rc) 211 if (rc)
212 if(pTcon->ses->capabilities & CAP_NT_SMBS) 212 if (pTcon->ses->capabilities & CAP_NT_SMBS)
213 rc = CIFSSMBQFSInfo(xid, pTcon, buf); /* not supported by OS2 */ 213 rc = CIFSSMBQFSInfo(xid, pTcon, buf); /* not supported by OS2 */
214 214
215 /* Some old Windows servers also do not support level 103, retry with 215 /* Some old Windows servers also do not support level 103, retry with
216 older level one if old server failed the previous call or we 216 older level one if old server failed the previous call or we
217 bypassed it because we detected that this was an older LANMAN sess */ 217 bypassed it because we detected that this was an older LANMAN sess */
218 if(rc) 218 if (rc)
219 rc = SMBOldQFSInfo(xid, pTcon, buf); 219 rc = SMBOldQFSInfo(xid, pTcon, buf);
220 /* 220 /*
221 int f_type; 221 int f_type;
@@ -301,11 +301,19 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
301 if (cifs_sb->tcon->ses->userName) 301 if (cifs_sb->tcon->ses->userName)
302 seq_printf(s, ",username=%s", 302 seq_printf(s, ",username=%s",
303 cifs_sb->tcon->ses->userName); 303 cifs_sb->tcon->ses->userName);
304 if(cifs_sb->tcon->ses->domainName) 304 if (cifs_sb->tcon->ses->domainName)
305 seq_printf(s, ",domain=%s", 305 seq_printf(s, ",domain=%s",
306 cifs_sb->tcon->ses->domainName); 306 cifs_sb->tcon->ses->domainName);
307 } 307 }
308 } 308 }
309 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
310 seq_printf(s, ",posixpaths");
311 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) ||
312 !(cifs_sb->tcon->ses->capabilities & CAP_UNIX))
313 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
314 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) ||
315 !(cifs_sb->tcon->ses->capabilities & CAP_UNIX))
316 seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
309 seq_printf(s, ",rsize=%d",cifs_sb->rsize); 317 seq_printf(s, ",rsize=%d",cifs_sb->rsize);
310 seq_printf(s, ",wsize=%d",cifs_sb->wsize); 318 seq_printf(s, ",wsize=%d",cifs_sb->wsize);
311 } 319 }
@@ -321,14 +329,14 @@ int cifs_xquota_set(struct super_block * sb, int quota_type, qid_t qid,
321 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 329 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
322 struct cifsTconInfo *pTcon; 330 struct cifsTconInfo *pTcon;
323 331
324 if(cifs_sb) 332 if (cifs_sb)
325 pTcon = cifs_sb->tcon; 333 pTcon = cifs_sb->tcon;
326 else 334 else
327 return -EIO; 335 return -EIO;
328 336
329 337
330 xid = GetXid(); 338 xid = GetXid();
331 if(pTcon) { 339 if (pTcon) {
332 cFYI(1,("set type: 0x%x id: %d",quota_type,qid)); 340 cFYI(1,("set type: 0x%x id: %d",quota_type,qid));
333 } else { 341 } else {
334 return -EIO; 342 return -EIO;
@@ -346,13 +354,13 @@ int cifs_xquota_get(struct super_block * sb, int quota_type, qid_t qid,
346 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 354 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
347 struct cifsTconInfo *pTcon; 355 struct cifsTconInfo *pTcon;
348 356
349 if(cifs_sb) 357 if (cifs_sb)
350 pTcon = cifs_sb->tcon; 358 pTcon = cifs_sb->tcon;
351 else 359 else
352 return -EIO; 360 return -EIO;
353 361
354 xid = GetXid(); 362 xid = GetXid();
355 if(pTcon) { 363 if (pTcon) {
356 cFYI(1,("set type: 0x%x id: %d",quota_type,qid)); 364 cFYI(1,("set type: 0x%x id: %d",quota_type,qid));
357 } else { 365 } else {
358 rc = -EIO; 366 rc = -EIO;
@@ -369,13 +377,13 @@ int cifs_xstate_set(struct super_block * sb, unsigned int flags, int operation)
369 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 377 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
370 struct cifsTconInfo *pTcon; 378 struct cifsTconInfo *pTcon;
371 379
372 if(cifs_sb) 380 if (cifs_sb)
373 pTcon = cifs_sb->tcon; 381 pTcon = cifs_sb->tcon;
374 else 382 else
375 return -EIO; 383 return -EIO;
376 384
377 xid = GetXid(); 385 xid = GetXid();
378 if(pTcon) { 386 if (pTcon) {
379 cFYI(1,("flags: 0x%x operation: 0x%x",flags,operation)); 387 cFYI(1,("flags: 0x%x operation: 0x%x",flags,operation));
380 } else { 388 } else {
381 rc = -EIO; 389 rc = -EIO;
@@ -392,13 +400,13 @@ int cifs_xstate_get(struct super_block * sb, struct fs_quota_stat *qstats)
392 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 400 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
393 struct cifsTconInfo *pTcon; 401 struct cifsTconInfo *pTcon;
394 402
395 if(cifs_sb) { 403 if (cifs_sb) {
396 pTcon = cifs_sb->tcon; 404 pTcon = cifs_sb->tcon;
397 } else { 405 } else {
398 return -EIO; 406 return -EIO;
399 } 407 }
400 xid = GetXid(); 408 xid = GetXid();
401 if(pTcon) { 409 if (pTcon) {
402 cFYI(1,("pqstats %p",qstats)); 410 cFYI(1,("pqstats %p",qstats));
403 } else { 411 } else {
404 rc = -EIO; 412 rc = -EIO;
@@ -424,11 +432,11 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
424 if (!(flags & MNT_FORCE)) 432 if (!(flags & MNT_FORCE))
425 return; 433 return;
426 cifs_sb = CIFS_SB(vfsmnt->mnt_sb); 434 cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
427 if(cifs_sb == NULL) 435 if (cifs_sb == NULL)
428 return; 436 return;
429 437
430 tcon = cifs_sb->tcon; 438 tcon = cifs_sb->tcon;
431 if(tcon == NULL) 439 if (tcon == NULL)
432 return; 440 return;
433 down(&tcon->tconSem); 441 down(&tcon->tconSem);
434 if (atomic_read(&tcon->useCount) == 1) 442 if (atomic_read(&tcon->useCount) == 1)
@@ -437,7 +445,7 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
437 445
438 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 446 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
439 /* cancel_notify_requests(tcon); */ 447 /* cancel_notify_requests(tcon); */
440 if(tcon->ses && tcon->ses->server) 448 if (tcon->ses && tcon->ses->server)
441 { 449 {
442 cFYI(1,("wake up tasks now - umount begin not complete")); 450 cFYI(1,("wake up tasks now - umount begin not complete"));
443 wake_up_all(&tcon->ses->server->request_q); 451 wake_up_all(&tcon->ses->server->request_q);
@@ -529,8 +537,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
529 /* some applications poll for the file length in this strange 537 /* some applications poll for the file length in this strange
530 way so we must seek to end on non-oplocked files by 538 way so we must seek to end on non-oplocked files by
531 setting the revalidate time to zero */ 539 setting the revalidate time to zero */
532 if(file->f_path.dentry->d_inode) 540 CIFS_I(file->f_path.dentry->d_inode)->time = 0;
533 CIFS_I(file->f_path.dentry->d_inode)->time = 0;
534 541
535 retval = cifs_revalidate(file->f_path.dentry); 542 retval = cifs_revalidate(file->f_path.dentry);
536 if (retval < 0) 543 if (retval < 0)
@@ -694,8 +701,7 @@ cifs_init_once(void *inode, struct kmem_cache * cachep, unsigned long flags)
694{ 701{
695 struct cifsInodeInfo *cifsi = inode; 702 struct cifsInodeInfo *cifsi = inode;
696 703
697 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == 704 if (flags & SLAB_CTOR_CONSTRUCTOR) {
698 SLAB_CTOR_CONSTRUCTOR) {
699 inode_init_once(&cifsi->vfs_inode); 705 inode_init_once(&cifsi->vfs_inode);
700 INIT_LIST_HEAD(&cifsi->lockList); 706 INIT_LIST_HEAD(&cifsi->lockList);
701 } 707 }
@@ -724,7 +730,7 @@ cifs_destroy_inodecache(void)
724static int 730static int
725cifs_init_request_bufs(void) 731cifs_init_request_bufs(void)
726{ 732{
727 if(CIFSMaxBufSize < 8192) { 733 if (CIFSMaxBufSize < 8192) {
728 /* Buffer size can not be smaller than 2 * PATH_MAX since maximum 734 /* Buffer size can not be smaller than 2 * PATH_MAX since maximum
729 Unicode path name has to fit in any SMB/CIFS path based frames */ 735 Unicode path name has to fit in any SMB/CIFS path based frames */
730 CIFSMaxBufSize = 8192; 736 CIFSMaxBufSize = 8192;
@@ -741,7 +747,7 @@ cifs_init_request_bufs(void)
741 if (cifs_req_cachep == NULL) 747 if (cifs_req_cachep == NULL)
742 return -ENOMEM; 748 return -ENOMEM;
743 749
744 if(cifs_min_rcv < 1) 750 if (cifs_min_rcv < 1)
745 cifs_min_rcv = 1; 751 cifs_min_rcv = 1;
746 else if (cifs_min_rcv > 64) { 752 else if (cifs_min_rcv > 64) {
747 cifs_min_rcv = 64; 753 cifs_min_rcv = 64;
@@ -751,7 +757,7 @@ cifs_init_request_bufs(void)
751 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, 757 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
752 cifs_req_cachep); 758 cifs_req_cachep);
753 759
754 if(cifs_req_poolp == NULL) { 760 if (cifs_req_poolp == NULL) {
755 kmem_cache_destroy(cifs_req_cachep); 761 kmem_cache_destroy(cifs_req_cachep);
756 return -ENOMEM; 762 return -ENOMEM;
757 } 763 }
@@ -772,7 +778,7 @@ cifs_init_request_bufs(void)
772 return -ENOMEM; 778 return -ENOMEM;
773 } 779 }
774 780
775 if(cifs_min_small < 2) 781 if (cifs_min_small < 2)
776 cifs_min_small = 2; 782 cifs_min_small = 2;
777 else if (cifs_min_small > 256) { 783 else if (cifs_min_small > 256) {
778 cifs_min_small = 256; 784 cifs_min_small = 256;
@@ -782,7 +788,7 @@ cifs_init_request_bufs(void)
782 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, 788 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
783 cifs_sm_req_cachep); 789 cifs_sm_req_cachep);
784 790
785 if(cifs_sm_req_poolp == NULL) { 791 if (cifs_sm_req_poolp == NULL) {
786 mempool_destroy(cifs_req_poolp); 792 mempool_destroy(cifs_req_poolp);
787 kmem_cache_destroy(cifs_req_cachep); 793 kmem_cache_destroy(cifs_req_cachep);
788 kmem_cache_destroy(cifs_sm_req_cachep); 794 kmem_cache_destroy(cifs_sm_req_cachep);
@@ -812,7 +818,7 @@ cifs_init_mids(void)
812 818
813 /* 3 is a reasonable minimum number of simultaneous operations */ 819 /* 3 is a reasonable minimum number of simultaneous operations */
814 cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep); 820 cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep);
815 if(cifs_mid_poolp == NULL) { 821 if (cifs_mid_poolp == NULL) {
816 kmem_cache_destroy(cifs_mid_cachep); 822 kmem_cache_destroy(cifs_mid_cachep);
817 return -ENOMEM; 823 return -ENOMEM;
818 } 824 }
@@ -850,14 +856,14 @@ static int cifs_oplock_thread(void * dummyarg)
850 continue; 856 continue;
851 857
852 spin_lock(&GlobalMid_Lock); 858 spin_lock(&GlobalMid_Lock);
853 if(list_empty(&GlobalOplock_Q)) { 859 if (list_empty(&GlobalOplock_Q)) {
854 spin_unlock(&GlobalMid_Lock); 860 spin_unlock(&GlobalMid_Lock);
855 set_current_state(TASK_INTERRUPTIBLE); 861 set_current_state(TASK_INTERRUPTIBLE);
856 schedule_timeout(39*HZ); 862 schedule_timeout(39*HZ);
857 } else { 863 } else {
858 oplock_item = list_entry(GlobalOplock_Q.next, 864 oplock_item = list_entry(GlobalOplock_Q.next,
859 struct oplock_q_entry, qhead); 865 struct oplock_q_entry, qhead);
860 if(oplock_item) { 866 if (oplock_item) {
861 cFYI(1,("found oplock item to write out")); 867 cFYI(1,("found oplock item to write out"));
862 pTcon = oplock_item->tcon; 868 pTcon = oplock_item->tcon;
863 inode = oplock_item->pinode; 869 inode = oplock_item->pinode;
@@ -871,7 +877,7 @@ static int cifs_oplock_thread(void * dummyarg)
871 /* mutex_lock(&inode->i_mutex);*/ 877 /* mutex_lock(&inode->i_mutex);*/
872 if (S_ISREG(inode->i_mode)) { 878 if (S_ISREG(inode->i_mode)) {
873 rc = filemap_fdatawrite(inode->i_mapping); 879 rc = filemap_fdatawrite(inode->i_mapping);
874 if(CIFS_I(inode)->clientCanCacheRead == 0) { 880 if (CIFS_I(inode)->clientCanCacheRead == 0) {
875 filemap_fdatawait(inode->i_mapping); 881 filemap_fdatawait(inode->i_mapping);
876 invalidate_remote_inode(inode); 882 invalidate_remote_inode(inode);
877 } 883 }
@@ -888,7 +894,7 @@ static int cifs_oplock_thread(void * dummyarg)
888 not bother sending an oplock release if session 894 not bother sending an oplock release if session
889 to server still is disconnected since oplock 895 to server still is disconnected since oplock
890 already released by the server in that case */ 896 already released by the server in that case */
891 if(pTcon->tidStatus != CifsNeedReconnect) { 897 if (pTcon->tidStatus != CifsNeedReconnect) {
892 rc = CIFSSMBLock(0, pTcon, netfid, 898 rc = CIFSSMBLock(0, pTcon, netfid,
893 0 /* len */ , 0 /* offset */, 0, 899 0 /* len */ , 0 /* offset */, 0,
894 0, LOCKING_ANDX_OPLOCK_RELEASE, 900 0, LOCKING_ANDX_OPLOCK_RELEASE,
@@ -922,7 +928,7 @@ static int cifs_dnotify_thread(void * dummyarg)
922 list_for_each(tmp, &GlobalSMBSessionList) { 928 list_for_each(tmp, &GlobalSMBSessionList) {
923 ses = list_entry(tmp, struct cifsSesInfo, 929 ses = list_entry(tmp, struct cifsSesInfo,
924 cifsSessionList); 930 cifsSessionList);
925 if(ses && ses->server && 931 if (ses && ses->server &&
926 atomic_read(&ses->server->inFlight)) 932 atomic_read(&ses->server->inFlight))
927 wake_up_all(&ses->server->response_q); 933 wake_up_all(&ses->server->response_q);
928 } 934 }
@@ -971,10 +977,10 @@ init_cifs(void)
971 rwlock_init(&GlobalSMBSeslock); 977 rwlock_init(&GlobalSMBSeslock);
972 spin_lock_init(&GlobalMid_Lock); 978 spin_lock_init(&GlobalMid_Lock);
973 979
974 if(cifs_max_pending < 2) { 980 if (cifs_max_pending < 2) {
975 cifs_max_pending = 2; 981 cifs_max_pending = 2;
976 cFYI(1,("cifs_max_pending set to min of 2")); 982 cFYI(1,("cifs_max_pending set to min of 2"));
977 } else if(cifs_max_pending > 256) { 983 } else if (cifs_max_pending > 256) {
978 cifs_max_pending = 256; 984 cifs_max_pending = 256;
979 cFYI(1,("cifs_max_pending set to max of 256")); 985 cFYI(1,("cifs_max_pending set to max of 256"));
980 } 986 }
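
The request-buffer setup in cifs_init_request_bufs() above pairs a slab cache with a mempool so a minimum number of receive buffers stays reserved even under memory pressure. A minimal sketch of that pattern, assuming an already-created kmem_cache (names here are illustrative, not the cifs symbols):

    #include <linux/mempool.h>
    #include <linux/slab.h>

    /* hypothetical helper mirroring cifs_init_request_bufs() */
    static mempool_t *make_req_pool(struct kmem_cache *cachep, int min_bufs)
    {
            /* clamp the tunable the same way cifs_min_rcv is clamped */
            if (min_bufs < 1)
                    min_bufs = 1;
            else if (min_bufs > 64)
                    min_bufs = 64;

            /* pre-reserves min_bufs objects from cachep so pool
               allocations can always make forward progress */
            return mempool_create_slab_pool(min_bufs, cachep);
    }

On failure the caller must destroy the backing cache, which is exactly the kmem_cache_destroy() unwind the code above performs.
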
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2c2c384894d8..c235d32ad4a8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
100extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); 100extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
101extern int cifs_ioctl (struct inode * inode, struct file * filep, 101extern int cifs_ioctl (struct inode * inode, struct file * filep,
102 unsigned int command, unsigned long arg); 102 unsigned int command, unsigned long arg);
103#define CIFS_VERSION "1.48" 103#define CIFS_VERSION "1.49"
104#endif /* _CIFSFS_H */ 104#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e4de8eba4780..23655de2f4a4 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -311,7 +311,7 @@ struct cifsFileInfo {
311 /* lock scope id (0 if none) */ 311 /* lock scope id (0 if none) */
312 struct file * pfile; /* needed for writepage */ 312 struct file * pfile; /* needed for writepage */
313 struct inode * pInode; /* needed for oplock break */ 313 struct inode * pInode; /* needed for oplock break */
314 struct semaphore lock_sem; 314 struct mutex lock_mutex;
315 struct list_head llist; /* list of byte range locks we have. */ 315 struct list_head llist; /* list of byte range locks we have. */
316 unsigned closePend:1; /* file is marked to close */ 316 unsigned closePend:1; /* file is marked to close */
317 unsigned invalidHandle:1; /* file closed via session abend */ 317 unsigned invalidHandle:1; /* file closed via session abend */
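
The lock_sem semaphore becomes a struct mutex here; the dir.c and file.c hunks below convert the matching init_MUTEX()/down()/up() calls. A minimal sketch of the conversion pattern, using a hypothetical structure rather than the real cifsFileInfo:

    #include <linux/mutex.h>
    #include <linux/list.h>

    struct example_file {                   /* stand-in for cifsFileInfo */
            struct mutex lock_mutex;        /* was: struct semaphore lock_sem */
            struct list_head llist;         /* byte-range locks it protects */
    };

    static void example_init(struct example_file *f)
    {
            mutex_init(&f->lock_mutex);     /* was: init_MUTEX(&f->lock_sem) */
            INIT_LIST_HEAD(&f->llist);
    }

    static void example_walk_locks(struct example_file *f)
    {
            mutex_lock(&f->lock_mutex);     /* was: down(&f->lock_sem) */
            /* add, walk, or free f->llist entries here */
            mutex_unlock(&f->lock_mutex);   /* was: up(&f->lock_sem) */
    }

A mutex is the natural primitive since the semaphore was only ever used with a count of one, and the switch gains lock debugging under CONFIG_DEBUG_MUTEXES.
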
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 4d8948e8762c..d619ca7d1416 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1388,7 +1388,7 @@ struct smb_t2_rsp {
1388#define SMB_SET_POSIX_LOCK 0x208 1388#define SMB_SET_POSIX_LOCK 0x208
1389#define SMB_POSIX_OPEN 0x209 1389#define SMB_POSIX_OPEN 0x209
1390#define SMB_POSIX_UNLINK 0x20a 1390#define SMB_POSIX_UNLINK 0x20a
1391#define SMB_SET_FILE_UNIX_INFO2 1391#define SMB_SET_FILE_UNIX_INFO2 0x20b
1392#define SMB_SET_FILE_BASIC_INFO2 0x3ec 1392#define SMB_SET_FILE_BASIC_INFO2 0x3ec
1393#define SMB_SET_FILE_RENAME_INFORMATION 0x3f2 /* BB check if qpathinfo too */ 1393#define SMB_SET_FILE_RENAME_INFORMATION 0x3f2 /* BB check if qpathinfo too */
1394#define SMB_FILE_ALL_INFO2 0x3fa 1394#define SMB_FILE_ALL_INFO2 0x3fa
@@ -2109,22 +2109,40 @@ struct cifs_posix_acl { /* access conrol list (ACL) */
2109 2109
2110/* end of POSIX ACL definitions */ 2110/* end of POSIX ACL definitions */
2111 2111
2112/* POSIX Open Flags */
2113#define SMB_O_RDONLY 0x1
2114#define SMB_O_WRONLY 0x2
2115#define SMB_O_RDWR 0x4
2116#define SMB_O_CREAT 0x10
2117#define SMB_O_EXCL 0x20
2118#define SMB_O_TRUNC 0x40
2119#define SMB_O_APPEND 0x80
2120#define SMB_O_SYNC 0x100
2121#define SMB_O_DIRECTORY 0x200
2122#define SMB_O_NOFOLLOW 0x400
2123#define SMB_O_DIRECT 0x800
2124
2112typedef struct { 2125typedef struct {
2113 __u32 OpenFlags; /* same as NT CreateX */ 2126 __le32 OpenFlags; /* same as NT CreateX */
2114 __u32 PosixOpenFlags; 2127 __le32 PosixOpenFlags;
2115 __u32 Mode; 2128 __le64 Permissions;
2116 __u16 Level; /* reply level requested (see QPathInfo levels) */ 2129 __le16 Level; /* reply level requested (see QPathInfo levels) */
2117 __u16 Pad; /* reserved - MBZ */
2118} __attribute__((packed)) OPEN_PSX_REQ; /* level 0x209 SetPathInfo data */ 2130} __attribute__((packed)) OPEN_PSX_REQ; /* level 0x209 SetPathInfo data */
2119 2131
2120typedef struct { 2132typedef struct {
2121 /* reply varies based on requested level */ 2133 __le16 OplockFlags;
2134 __u16 Fid;
2135 __le32 CreateAction;
2136 __le16 ReturnedLevel;
2137 __le16 Pad;
2138 /* struct following varies based on requested level */
2122} __attribute__((packed)) OPEN_PSX_RSP; /* level 0x209 SetPathInfo data */ 2139} __attribute__((packed)) OPEN_PSX_RSP; /* level 0x209 SetPathInfo data */
2123 2140
2124 2141
2125struct file_internal_info { 2142struct file_internal_info {
2126 __u64 UniqueId; /* inode number */ 2143 __u64 UniqueId; /* inode number */
2127} __attribute__((packed)); /* level 0x3ee */ 2144} __attribute__((packed)); /* level 0x3ee */
2145
2128struct file_mode_info { 2146struct file_mode_info {
2129 __le32 Mode; 2147 __le32 Mode;
2130} __attribute__((packed)); /* level 0x3f8 */ 2148} __attribute__((packed)); /* level 0x3f8 */
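
Note the change from __u32/__u16 to __le32/__le16 in the wire structures above: the fields travel little-endian, so callers convert with cpu_to_le*()/le*_to_cpu(). A minimal sketch of filling the request, mirroring what CIFSPOSIXCreate() does later in this patch (the helper itself is hypothetical):

    /* assumes the OPEN_PSX_REQ layout and constants from cifspdu.h */
    static void fill_psx_open_req(OPEN_PSX_REQ *pdata, __u32 posix_flags,
                                  __u64 mode, __u32 oplock)
    {
            pdata->OpenFlags = cpu_to_le32(oplock);
            pdata->PosixOpenFlags = cpu_to_le32(posix_flags);
            pdata->Permissions = cpu_to_le64(mode);
            /* requested reply level; cpu_to_le16() is the sparse-clean
               form (the patch below assigns the constant directly) */
            pdata->Level = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
    }
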
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 32eb1acab630..5d163e2b6143 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifsproto.h 2 * fs/cifs/cifsproto.h
3 * 3 *
4 * Copyright (c) International Business Machines Corp., 2002,2006 4 * Copyright (c) International Business Machines Corp., 2002,2007
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -244,6 +244,11 @@ extern int SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon,
244 const int access_flags, const int omode, 244 const int access_flags, const int omode,
245 __u16 * netfid, int *pOplock, FILE_ALL_INFO *, 245 __u16 * netfid, int *pOplock, FILE_ALL_INFO *,
246 const struct nls_table *nls_codepage, int remap); 246 const struct nls_table *nls_codepage, int remap);
247extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon,
248 u32 posix_flags, __u64 mode, __u16 * netfid,
249 FILE_UNIX_BASIC_INFO *pRetData,
250 __u32 *pOplock, const char *name,
251 const struct nls_table *nls_codepage, int remap);
247extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, 252extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon,
248 const int smb_file_id); 253 const int smb_file_id);
249 254
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 48fc0c2ab0e5..14de58fa1437 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifssmb.c 2 * fs/cifs/cifssmb.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2006 4 * Copyright (C) International Business Machines Corp., 2002,2007
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * Contains the routines for constructing the SMB PDUs themselves 7 * Contains the routines for constructing the SMB PDUs themselves
@@ -24,8 +24,8 @@
24 /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c */ 24 /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c */
25 /* These are mostly routines that operate on a pathname, or on a tree id */ 25 /* These are mostly routines that operate on a pathname, or on a tree id */
26 /* (mounted volume), but there are eight handle based routines which must be */ 26 /* (mounted volume), but there are eight handle based routines which must be */
27 /* treated slightly different for reconnection purposes since we never want */ 27 /* treated slightly differently for reconnection purposes since we never */
28 /* to reuse a stale file handle and the caller knows the file handle */ 28 /* want to reuse a stale file handle and only the caller knows the file info */
29 29
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
@@ -913,6 +913,130 @@ MkDirRetry:
913 return rc; 913 return rc;
914} 914}
915 915
916int
917CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
918 __u64 mode, __u16 * netfid, FILE_UNIX_BASIC_INFO *pRetData,
919 __u32 *pOplock, const char *name,
920 const struct nls_table *nls_codepage, int remap)
921{
922 TRANSACTION2_SPI_REQ *pSMB = NULL;
923 TRANSACTION2_SPI_RSP *pSMBr = NULL;
924 int name_len;
925 int rc = 0;
926 int bytes_returned = 0;
927 char *data_offset;
928 __u16 params, param_offset, offset, byte_count, count;
929 OPEN_PSX_REQ * pdata;
930 OPEN_PSX_RSP * psx_rsp;
931
932 cFYI(1, ("In POSIX Create"));
933PsxCreat:
934 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
935 (void **) &pSMBr);
936 if (rc)
937 return rc;
938
939 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
940 name_len =
941 cifsConvertToUCS((__le16 *) pSMB->FileName, name,
942 PATH_MAX, nls_codepage, remap);
943 name_len++; /* trailing null */
944 name_len *= 2;
945 } else { /* BB improve the check for buffer overruns BB */
946 name_len = strnlen(name, PATH_MAX);
947 name_len++; /* trailing null */
948 strncpy(pSMB->FileName, name, name_len);
949 }
950
951 params = 6 + name_len;
952 count = sizeof(OPEN_PSX_REQ);
953 pSMB->MaxParameterCount = cpu_to_le16(2);
954 pSMB->MaxDataCount = cpu_to_le16(1000); /* large enough */
955 pSMB->MaxSetupCount = 0;
956 pSMB->Reserved = 0;
957 pSMB->Flags = 0;
958 pSMB->Timeout = 0;
959 pSMB->Reserved2 = 0;
960 param_offset = offsetof(struct smb_com_transaction2_spi_req,
961 InformationLevel) - 4;
962 offset = param_offset + params;
963 data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
964 pdata = (OPEN_PSX_REQ *)(((char *)&pSMB->hdr.Protocol) + offset);
965 pdata->Level = SMB_QUERY_FILE_UNIX_BASIC;
966 pdata->Permissions = cpu_to_le64(mode);
967 pdata->PosixOpenFlags = cpu_to_le32(posix_flags);
968 pdata->OpenFlags = cpu_to_le32(*pOplock);
969 pSMB->ParameterOffset = cpu_to_le16(param_offset);
970 pSMB->DataOffset = cpu_to_le16(offset);
971 pSMB->SetupCount = 1;
972 pSMB->Reserved3 = 0;
973 pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION);
974 byte_count = 3 /* pad */ + params + count;
975
976 pSMB->DataCount = cpu_to_le16(count);
977 pSMB->ParameterCount = cpu_to_le16(params);
978 pSMB->TotalDataCount = pSMB->DataCount;
979 pSMB->TotalParameterCount = pSMB->ParameterCount;
980 pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_OPEN);
981 pSMB->Reserved4 = 0;
982 pSMB->hdr.smb_buf_length += byte_count;
983 pSMB->ByteCount = cpu_to_le16(byte_count);
984 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
985 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
986 if (rc) {
987 cFYI(1, ("Posix create returned %d", rc));
988 goto psx_create_err;
989 }
990
991 cFYI(1,("copying inode info"));
992 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
993
994 if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
995 rc = -EIO; /* bad smb */
996 goto psx_create_err;
997 }
998
999 /* copy return information to pRetData */
1000 psx_rsp = (OPEN_PSX_RSP *)((char *) &pSMBr->hdr.Protocol
1001 + le16_to_cpu(pSMBr->t2.DataOffset));
1002
1003 *pOplock = le16_to_cpu(psx_rsp->OplockFlags);
1004 if(netfid)
1005 *netfid = psx_rsp->Fid; /* cifs fid stays in le */
1006 /* Let caller know file was created so we can set the mode. */
1007 /* Do we care about the CreateAction in any other cases? */
1008 if(cpu_to_le32(FILE_CREATE) == psx_rsp->CreateAction)
1009 *pOplock |= CIFS_CREATE_ACTION;
1010 /* check to make sure response data is there */
1011 if(psx_rsp->ReturnedLevel != SMB_QUERY_FILE_UNIX_BASIC) {
1012 pRetData->Type = -1; /* unknown */
1013#ifdef CONFIG_CIFS_DEBUG2
1014 cFYI(1,("unknown type"));
1015#endif
1016 } else {
1017 if(pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
1018 + sizeof(FILE_UNIX_BASIC_INFO)) {
1019 cERROR(1,("Open response data too small"));
1020 pRetData->Type = -1;
1021 goto psx_create_err;
1022 }
1023 memcpy((char *) pRetData,
1024 (char *)psx_rsp + sizeof(OPEN_PSX_RSP),
1025 sizeof (FILE_UNIX_BASIC_INFO));
1026 }
1027
1028
1029psx_create_err:
1030 cifs_buf_release(pSMB);
1031
1032 cifs_stats_inc(&tcon->num_mkdirs);
1033
1034 if (rc == -EAGAIN)
1035 goto PsxCreat;
1036
1037 return rc;
1038}
1039
916static __u16 convert_disposition(int disposition) 1040static __u16 convert_disposition(int disposition)
917{ 1041{
918 __u16 ofun = 0; 1042 __u16 ofun = 0;
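
CIFSPOSIXCreate() takes the SMB_O_* bits defined in cifspdu.h rather than the Linux O_* values, so a caller has to translate the open flags first. A plausible mapping helper, assuming the flag definitions added above (this exact function is not part of the patch):

    #include <linux/fcntl.h>

    /* hypothetical translation of Linux f_flags to SMB_O_* bits */
    static __u32 map_posix_open_flags(int f_flags)
    {
            __u32 psx = 0;

            if ((f_flags & O_ACCMODE) == O_RDONLY)
                    psx |= SMB_O_RDONLY;
            else if ((f_flags & O_ACCMODE) == O_WRONLY)
                    psx |= SMB_O_WRONLY;
            else
                    psx |= SMB_O_RDWR;

            if (f_flags & O_CREAT)
                    psx |= SMB_O_CREAT;
            if (f_flags & O_EXCL)
                    psx |= SMB_O_EXCL;
            if (f_flags & O_TRUNC)
                    psx |= SMB_O_TRUNC;
            if (f_flags & O_APPEND)
                    psx |= SMB_O_APPEND;
            if (f_flags & O_SYNC)
                    psx |= SMB_O_SYNC;
            if (f_flags & O_DIRECTORY)
                    psx |= SMB_O_DIRECTORY;
            if (f_flags & O_NOFOLLOW)
                    psx |= SMB_O_NOFOLLOW;
            if (f_flags & O_DIRECT)
                    psx |= SMB_O_DIRECT;

            return psx;
    }

The result would then be passed as the posix_flags argument, with *pOplock seeded to the desired oplock request before the call.
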
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 20ba7dcc9959..216fb625843f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -30,6 +30,7 @@
30#include <linux/mempool.h> 30#include <linux/mempool.h>
31#include <linux/delay.h> 31#include <linux/delay.h>
32#include <linux/completion.h> 32#include <linux/completion.h>
33#include <linux/kthread.h>
33#include <linux/pagevec.h> 34#include <linux/pagevec.h>
34#include <linux/freezer.h> 35#include <linux/freezer.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
@@ -74,6 +75,8 @@ struct smb_vol {
74 unsigned retry:1; 75 unsigned retry:1;
75 unsigned intr:1; 76 unsigned intr:1;
76 unsigned setuids:1; 77 unsigned setuids:1;
78 unsigned override_uid:1;
79 unsigned override_gid:1;
77 unsigned noperm:1; 80 unsigned noperm:1;
78 unsigned no_psx_acl:1; /* set if posix acl support should be disabled */ 81 unsigned no_psx_acl:1; /* set if posix acl support should be disabled */
79 unsigned cifs_acl:1; 82 unsigned cifs_acl:1;
@@ -120,7 +123,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
120 struct mid_q_entry * mid_entry; 123 struct mid_q_entry * mid_entry;
121 124
122 spin_lock(&GlobalMid_Lock); 125 spin_lock(&GlobalMid_Lock);
123 if(server->tcpStatus == CifsExiting) { 126 if( kthread_should_stop() ) {
124 /* the demux thread will exit normally 127 /* the demux thread will exit normally
125 next time through the loop */ 128 next time through the loop */
126 spin_unlock(&GlobalMid_Lock); 129 spin_unlock(&GlobalMid_Lock);
@@ -182,7 +185,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
182 spin_unlock(&GlobalMid_Lock); 185 spin_unlock(&GlobalMid_Lock);
183 up(&server->tcpSem); 186 up(&server->tcpSem);
184 187
185 while ((server->tcpStatus != CifsExiting) && (server->tcpStatus != CifsGood)) 188 while ( (!kthread_should_stop()) && (server->tcpStatus != CifsGood))
186 { 189 {
187 try_to_freeze(); 190 try_to_freeze();
188 if(server->protocolType == IPV6) { 191 if(server->protocolType == IPV6) {
@@ -199,7 +202,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
199 } else { 202 } else {
200 atomic_inc(&tcpSesReconnectCount); 203 atomic_inc(&tcpSesReconnectCount);
201 spin_lock(&GlobalMid_Lock); 204 spin_lock(&GlobalMid_Lock);
202 if(server->tcpStatus != CifsExiting) 205 if( !kthread_should_stop() )
203 server->tcpStatus = CifsGood; 206 server->tcpStatus = CifsGood;
204 server->sequence_number = 0; 207 server->sequence_number = 0;
205 spin_unlock(&GlobalMid_Lock); 208 spin_unlock(&GlobalMid_Lock);
@@ -345,7 +348,6 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
345 int isMultiRsp; 348 int isMultiRsp;
346 int reconnect; 349 int reconnect;
347 350
348 daemonize("cifsd");
349 allow_signal(SIGKILL); 351 allow_signal(SIGKILL);
350 current->flags |= PF_MEMALLOC; 352 current->flags |= PF_MEMALLOC;
351 server->tsk = current; /* save process info to wake at shutdown */ 353 server->tsk = current; /* save process info to wake at shutdown */
@@ -361,7 +363,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
361 GFP_KERNEL); 363 GFP_KERNEL);
362 } 364 }
363 365
364 while (server->tcpStatus != CifsExiting) { 366 while (!kthread_should_stop()) {
365 if (try_to_freeze()) 367 if (try_to_freeze())
366 continue; 368 continue;
367 if (bigbuf == NULL) { 369 if (bigbuf == NULL) {
@@ -400,7 +402,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
400 kernel_recvmsg(csocket, &smb_msg, 402 kernel_recvmsg(csocket, &smb_msg,
401 &iov, 1, 4, 0 /* BB see socket.h flags */); 403 &iov, 1, 4, 0 /* BB see socket.h flags */);
402 404
403 if (server->tcpStatus == CifsExiting) { 405 if ( kthread_should_stop() ) {
404 break; 406 break;
405 } else if (server->tcpStatus == CifsNeedReconnect) { 407 } else if (server->tcpStatus == CifsNeedReconnect) {
406 cFYI(1, ("Reconnect after server stopped responding")); 408 cFYI(1, ("Reconnect after server stopped responding"));
@@ -524,7 +526,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
524 total_read += length) { 526 total_read += length) {
525 length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, 527 length = kernel_recvmsg(csocket, &smb_msg, &iov, 1,
526 pdu_length - total_read, 0); 528 pdu_length - total_read, 0);
527 if((server->tcpStatus == CifsExiting) || 529 if( kthread_should_stop() ||
528 (length == -EINTR)) { 530 (length == -EINTR)) {
529 /* then will exit */ 531 /* then will exit */
530 reconnect = 2; 532 reconnect = 2;
@@ -757,7 +759,6 @@ multi_t2_fnd:
757 GFP_KERNEL); 759 GFP_KERNEL);
758 } 760 }
759 761
760 complete_and_exit(&cifsd_complete, 0);
761 return 0; 762 return 0;
762} 763}
763 764
@@ -973,7 +974,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
973 } 974 }
974 if ((temp_len = strnlen(value, 300)) < 300) { 975 if ((temp_len = strnlen(value, 300)) < 300) {
975 vol->UNC = kmalloc(temp_len+1,GFP_KERNEL); 976 vol->UNC = kmalloc(temp_len+1,GFP_KERNEL);
976 if(vol->UNC == NULL) 977 if (vol->UNC == NULL)
977 return 1; 978 return 1;
978 strcpy(vol->UNC,value); 979 strcpy(vol->UNC,value);
979 if (strncmp(vol->UNC, "//", 2) == 0) { 980 if (strncmp(vol->UNC, "//", 2) == 0) {
@@ -1010,12 +1011,12 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
1010 return 1; /* needs_arg; */ 1011 return 1; /* needs_arg; */
1011 } 1012 }
1012 if ((temp_len = strnlen(value, 1024)) < 1024) { 1013 if ((temp_len = strnlen(value, 1024)) < 1024) {
1013 if(value[0] != '/') 1014 if (value[0] != '/')
1014 temp_len++; /* missing leading slash */ 1015 temp_len++; /* missing leading slash */
1015 vol->prepath = kmalloc(temp_len+1,GFP_KERNEL); 1016 vol->prepath = kmalloc(temp_len+1,GFP_KERNEL);
1016 if(vol->prepath == NULL) 1017 if (vol->prepath == NULL)
1017 return 1; 1018 return 1;
1018 if(value[0] != '/') { 1019 if (value[0] != '/') {
1019 vol->prepath[0] = '/'; 1020 vol->prepath[0] = '/';
1020 strcpy(vol->prepath+1,value); 1021 strcpy(vol->prepath+1,value);
1021 } else 1022 } else
@@ -1031,7 +1032,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
1031 return 1; /* needs_arg; */ 1032 return 1; /* needs_arg; */
1032 } 1033 }
1033 if (strnlen(value, 65) < 65) { 1034 if (strnlen(value, 65) < 65) {
1034 if(strnicmp(value,"default",7)) 1035 if (strnicmp(value,"default",7))
1035 vol->iocharset = value; 1036 vol->iocharset = value;
1036 /* if iocharset not set load_nls_default used by caller */ 1037 /* if iocharset not set load_nls_default used by caller */
1037 cFYI(1, ("iocharset set to %s",value)); 1038 cFYI(1, ("iocharset set to %s",value));
@@ -1043,11 +1044,13 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
1043 if (value && *value) { 1044 if (value && *value) {
1044 vol->linux_uid = 1045 vol->linux_uid =
1045 simple_strtoul(value, &value, 0); 1046 simple_strtoul(value, &value, 0);
1047 vol->override_uid = 1;
1046 } 1048 }
1047 } else if (strnicmp(data, "gid", 3) == 0) { 1049 } else if (strnicmp(data, "gid", 3) == 0) {
1048 if (value && *value) { 1050 if (value && *value) {
1049 vol->linux_gid = 1051 vol->linux_gid =
1050 simple_strtoul(value, &value, 0); 1052 simple_strtoul(value, &value, 0);
1053 vol->override_gid = 1;
1051 } 1054 }
1052 } else if (strnicmp(data, "file_mode", 4) == 0) { 1055 } else if (strnicmp(data, "file_mode", 4) == 0) {
1053 if (value && *value) { 1056 if (value && *value) {
@@ -1102,7 +1105,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
1102 } 1105 }
1103 /* The string has 16th byte zero still from 1106 /* The string has 16th byte zero still from
1104 set at top of the function */ 1107 set at top of the function */
1105 if((i==15) && (value[i] != 0)) 1108 if ((i==15) && (value[i] != 0))
1106 printk(KERN_WARNING "CIFS: netbiosname longer than 15 truncated.\n"); 1109 printk(KERN_WARNING "CIFS: netbiosname longer than 15 truncated.\n");
1107 } 1110 }
1108 } else if (strnicmp(data, "servern", 7) == 0) { 1111 } else if (strnicmp(data, "servern", 7) == 0) {
@@ -1126,7 +1129,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
1126 } 1129 }
1127 /* The string has 16th byte zero still from 1130 /* The string has 16th byte zero still from
1128 set at top of the function */ 1131 set at top of the function */
1129 if((i==15) && (value[i] != 0)) 1132 if ((i==15) && (value[i] != 0))
1130 printk(KERN_WARNING "CIFS: server netbiosname longer than 15 truncated.\n"); 1133 printk(KERN_WARNING "CIFS: server netbiosname longer than 15 truncated.\n");
1131 } 1134 }
1132 } else if (strnicmp(data, "credentials", 4) == 0) { 1135 } else if (strnicmp(data, "credentials", 4) == 0) {
@@ -1233,13 +1236,13 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
1233 printk(KERN_WARNING "CIFS: Unknown mount option %s\n",data); 1236 printk(KERN_WARNING "CIFS: Unknown mount option %s\n",data);
1234 } 1237 }
1235 if (vol->UNC == NULL) { 1238 if (vol->UNC == NULL) {
1236 if(devname == NULL) { 1239 if (devname == NULL) {
1237 printk(KERN_WARNING "CIFS: Missing UNC name for mount target\n"); 1240 printk(KERN_WARNING "CIFS: Missing UNC name for mount target\n");
1238 return 1; 1241 return 1;
1239 } 1242 }
1240 if ((temp_len = strnlen(devname, 300)) < 300) { 1243 if ((temp_len = strnlen(devname, 300)) < 300) {
1241 vol->UNC = kmalloc(temp_len+1,GFP_KERNEL); 1244 vol->UNC = kmalloc(temp_len+1,GFP_KERNEL);
1242 if(vol->UNC == NULL) 1245 if (vol->UNC == NULL)
1243 return 1; 1246 return 1;
1244 strcpy(vol->UNC,devname); 1247 strcpy(vol->UNC,devname);
1245 if (strncmp(vol->UNC, "//", 2) == 0) { 1248 if (strncmp(vol->UNC, "//", 2) == 0) {
@@ -1663,7 +1666,13 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo * tcon,
1663 CIFS_SB(sb)->mnt_cifs_flags |= 1666 CIFS_SB(sb)->mnt_cifs_flags |=
1664 CIFS_MOUNT_POSIX_PATHS; 1667 CIFS_MOUNT_POSIX_PATHS;
1665 } 1668 }
1666 1669
1670 /* We might be setting the path sep back to a different
1671 form if we are reconnecting and the server switched its
1672 posix path capability for this share */
1673 if(sb && (CIFS_SB(sb)->prepathlen > 0))
1674 CIFS_SB(sb)->prepath[0] = CIFS_DIR_SEP(CIFS_SB(sb));
1675
1667 cFYI(1,("Negotiate caps 0x%x",(int)cap)); 1676 cFYI(1,("Negotiate caps 0x%x",(int)cap));
1668#ifdef CONFIG_CIFS_DEBUG2 1677#ifdef CONFIG_CIFS_DEBUG2
1669 if(cap & CIFS_UNIX_FCNTL_CAP) 1678 if(cap & CIFS_UNIX_FCNTL_CAP)
@@ -1712,12 +1721,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1712 return -EINVAL; 1721 return -EINVAL;
1713 } 1722 }
1714 1723
1715 if (volume_info.username) { 1724 if (volume_info.nullauth) {
1725 cFYI(1,("null user"));
1726 volume_info.username = NULL;
1727 } else if (volume_info.username) {
1716 /* BB fixme parse for domain name here */ 1728 /* BB fixme parse for domain name here */
1717 cFYI(1, ("Username: %s ", volume_info.username)); 1729 cFYI(1, ("Username: %s ", volume_info.username));
1718
1719 } else if (volume_info.nullauth) {
1720 cFYI(1,("null user"));
1721 } else { 1730 } else {
1722 cifserror("No username specified"); 1731 cifserror("No username specified");
1723 /* In userspace mount helper we can get user name from alternate 1732 /* In userspace mount helper we can get user name from alternate
@@ -1791,11 +1800,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1791 existingCifsSes = cifs_find_tcp_session(&sin_server.sin_addr, 1800 existingCifsSes = cifs_find_tcp_session(&sin_server.sin_addr,
1792 NULL /* no ipv6 addr */, 1801 NULL /* no ipv6 addr */,
1793 volume_info.username, &srvTcp); 1802 volume_info.username, &srvTcp);
1794 else if(address_type == AF_INET6) 1803 else if(address_type == AF_INET6) {
1804 cFYI(1,("looking for ipv6 address"));
1795 existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */, 1805 existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */,
1796 &sin_server6.sin6_addr, 1806 &sin_server6.sin6_addr,
1797 volume_info.username, &srvTcp); 1807 volume_info.username, &srvTcp);
1798 else { 1808 } else {
1799 kfree(volume_info.UNC); 1809 kfree(volume_info.UNC);
1800 kfree(volume_info.password); 1810 kfree(volume_info.password);
1801 kfree(volume_info.prepath); 1811 kfree(volume_info.prepath);
@@ -1807,17 +1817,23 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1807 if (srvTcp) { 1817 if (srvTcp) {
1808 cFYI(1, ("Existing tcp session with server found")); 1818 cFYI(1, ("Existing tcp session with server found"));
1809 } else { /* create socket */ 1819 } else { /* create socket */
1810 if(volume_info.port) 1820 if (volume_info.port)
1811 sin_server.sin_port = htons(volume_info.port); 1821 sin_server.sin_port = htons(volume_info.port);
1812 else 1822 else
1813 sin_server.sin_port = 0; 1823 sin_server.sin_port = 0;
1814 rc = ipv4_connect(&sin_server,&csocket, 1824 if (address_type == AF_INET6) {
1825 cFYI(1,("attempting ipv6 connect"));
1826 /* BB should we allow ipv6 on port 139? */
1827 /* other OS never observed in Wild doing 139 with v6 */
1828 rc = ipv6_connect(&sin_server6,&csocket);
1829 } else
1830 rc = ipv4_connect(&sin_server,&csocket,
1815 volume_info.source_rfc1001_name, 1831 volume_info.source_rfc1001_name,
1816 volume_info.target_rfc1001_name); 1832 volume_info.target_rfc1001_name);
1817 if (rc < 0) { 1833 if (rc < 0) {
1818 cERROR(1, 1834 cERROR(1,
1819 ("Error connecting to IPv4 socket. Aborting operation")); 1835 ("Error connecting to IPv4 socket. Aborting operation"));
1820 if(csocket != NULL) 1836 if (csocket != NULL)
1821 sock_release(csocket); 1837 sock_release(csocket);
1822 kfree(volume_info.UNC); 1838 kfree(volume_info.UNC);
1823 kfree(volume_info.password); 1839 kfree(volume_info.password);
@@ -1850,10 +1866,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1850 so no need to spinlock this init of tcpStatus */ 1866 so no need to spinlock this init of tcpStatus */
1851 srvTcp->tcpStatus = CifsNew; 1867 srvTcp->tcpStatus = CifsNew;
1852 init_MUTEX(&srvTcp->tcpSem); 1868 init_MUTEX(&srvTcp->tcpSem);
1853 rc = (int)kernel_thread((void *)(void *)cifs_demultiplex_thread, srvTcp, 1869 srvTcp->tsk = kthread_run((void *)(void *)cifs_demultiplex_thread, srvTcp, "cifsd");
1854 CLONE_FS | CLONE_FILES | CLONE_VM); 1870 if ( IS_ERR(srvTcp->tsk) ) {
1855 if(rc < 0) { 1871 rc = PTR_ERR(srvTcp->tsk);
1856 rc = -ENOMEM; 1872 cERROR(1,("error %d create cifsd thread", rc));
1873 srvTcp->tsk = NULL;
1857 sock_release(csocket); 1874 sock_release(csocket);
1858 kfree(volume_info.UNC); 1875 kfree(volume_info.UNC);
1859 kfree(volume_info.password); 1876 kfree(volume_info.password);
@@ -1896,7 +1913,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1896 int len = strlen(volume_info.domainname); 1913 int len = strlen(volume_info.domainname);
1897 pSesInfo->domainName = 1914 pSesInfo->domainName =
1898 kmalloc(len + 1, GFP_KERNEL); 1915 kmalloc(len + 1, GFP_KERNEL);
1899 if(pSesInfo->domainName) 1916 if (pSesInfo->domainName)
1900 strcpy(pSesInfo->domainName, 1917 strcpy(pSesInfo->domainName,
1901 volume_info.domainname); 1918 volume_info.domainname);
1902 } 1919 }
@@ -1906,7 +1923,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1906 /* BB FIXME need to pass vol->secFlgs BB */ 1923 /* BB FIXME need to pass vol->secFlgs BB */
1907 rc = cifs_setup_session(xid,pSesInfo, cifs_sb->local_nls); 1924 rc = cifs_setup_session(xid,pSesInfo, cifs_sb->local_nls);
1908 up(&pSesInfo->sesSem); 1925 up(&pSesInfo->sesSem);
1909 if(!rc) 1926 if (!rc)
1910 atomic_inc(&srvTcp->socketUseCount); 1927 atomic_inc(&srvTcp->socketUseCount);
1911 } else 1928 } else
1912 kfree(volume_info.password); 1929 kfree(volume_info.password);
@@ -1914,7 +1931,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1914 1931
1915 /* search for existing tcon to this server share */ 1932 /* search for existing tcon to this server share */
1916 if (!rc) { 1933 if (!rc) {
1917 if(volume_info.rsize > CIFSMaxBufSize) { 1934 if (volume_info.rsize > CIFSMaxBufSize) {
1918 cERROR(1,("rsize %d too large, using MaxBufSize", 1935 cERROR(1,("rsize %d too large, using MaxBufSize",
1919 volume_info.rsize)); 1936 volume_info.rsize));
1920 cifs_sb->rsize = CIFSMaxBufSize; 1937 cifs_sb->rsize = CIFSMaxBufSize;
@@ -1923,11 +1940,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1923 else /* default */ 1940 else /* default */
1924 cifs_sb->rsize = CIFSMaxBufSize; 1941 cifs_sb->rsize = CIFSMaxBufSize;
1925 1942
1926 if(volume_info.wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) { 1943 if (volume_info.wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
1927 cERROR(1,("wsize %d too large using 4096 instead", 1944 cERROR(1,("wsize %d too large using 4096 instead",
1928 volume_info.wsize)); 1945 volume_info.wsize));
1929 cifs_sb->wsize = 4096; 1946 cifs_sb->wsize = 4096;
1930 } else if(volume_info.wsize) 1947 } else if (volume_info.wsize)
1931 cifs_sb->wsize = volume_info.wsize; 1948 cifs_sb->wsize = volume_info.wsize;
1932 else 1949 else
1933 cifs_sb->wsize = 1950 cifs_sb->wsize =
@@ -1940,14 +1957,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1940 conjunction with 52K kvec constraint on arch with 4K 1957 conjunction with 52K kvec constraint on arch with 4K
1941 page size */ 1958 page size */
1942 1959
1943 if(cifs_sb->rsize < 2048) { 1960 if (cifs_sb->rsize < 2048) {
1944 cifs_sb->rsize = 2048; 1961 cifs_sb->rsize = 2048;
1945 /* Windows ME may prefer this */ 1962 /* Windows ME may prefer this */
1946 cFYI(1,("readsize set to minimum 2048")); 1963 cFYI(1,("readsize set to minimum 2048"));
1947 } 1964 }
1948 /* calculate prepath */ 1965 /* calculate prepath */
1949 cifs_sb->prepath = volume_info.prepath; 1966 cifs_sb->prepath = volume_info.prepath;
1950 if(cifs_sb->prepath) { 1967 if (cifs_sb->prepath) {
1951 cifs_sb->prepathlen = strlen(cifs_sb->prepath); 1968 cifs_sb->prepathlen = strlen(cifs_sb->prepath);
1952 cifs_sb->prepath[0] = CIFS_DIR_SEP(cifs_sb); 1969 cifs_sb->prepath[0] = CIFS_DIR_SEP(cifs_sb);
1953 volume_info.prepath = NULL; 1970 volume_info.prepath = NULL;
@@ -1960,24 +1977,27 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1960 cFYI(1,("file mode: 0x%x dir mode: 0x%x", 1977 cFYI(1,("file mode: 0x%x dir mode: 0x%x",
1961 cifs_sb->mnt_file_mode,cifs_sb->mnt_dir_mode)); 1978 cifs_sb->mnt_file_mode,cifs_sb->mnt_dir_mode));
1962 1979
1963 if(volume_info.noperm) 1980 if (volume_info.noperm)
1964 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; 1981 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
1965 if(volume_info.setuids) 1982 if (volume_info.setuids)
1966 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID; 1983 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
1967 if(volume_info.server_ino) 1984 if (volume_info.server_ino)
1968 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; 1985 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
1969 if(volume_info.remap) 1986 if (volume_info.remap)
1970 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR; 1987 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
1971 if(volume_info.no_xattr) 1988 if (volume_info.no_xattr)
1972 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR; 1989 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
1973 if(volume_info.sfu_emul) 1990 if (volume_info.sfu_emul)
1974 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL; 1991 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
1975 if(volume_info.nobrl) 1992 if (volume_info.nobrl)
1976 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL; 1993 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
1977 if(volume_info.cifs_acl) 1994 if (volume_info.cifs_acl)
1978 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; 1995 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
1979 1996 if (volume_info.override_uid)
1980 if(volume_info.direct_io) { 1997 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
1998 if (volume_info.override_gid)
1999 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
2000 if (volume_info.direct_io) {
1981 cFYI(1,("mounting share using direct i/o")); 2001 cFYI(1,("mounting share using direct i/o"));
1982 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2002 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
1983 } 2003 }
@@ -2030,7 +2050,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2030 } 2050 }
2031 } 2051 }
2032 } 2052 }
2033 if(pSesInfo) { 2053 if (pSesInfo) {
2034 if (pSesInfo->capabilities & CAP_LARGE_FILES) { 2054 if (pSesInfo->capabilities & CAP_LARGE_FILES) {
2035 sb->s_maxbytes = (u64) 1 << 63; 2055 sb->s_maxbytes = (u64) 1 << 63;
2036 } else 2056 } else
@@ -2044,13 +2064,13 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2044 if (rc) { 2064 if (rc) {
2045 /* if session setup failed, use count is zero but 2065 /* if session setup failed, use count is zero but
2046 we still need to free cifsd thread */ 2066 we still need to free cifsd thread */
2047 if(atomic_read(&srvTcp->socketUseCount) == 0) { 2067 if (atomic_read(&srvTcp->socketUseCount) == 0) {
2048 spin_lock(&GlobalMid_Lock); 2068 spin_lock(&GlobalMid_Lock);
2049 srvTcp->tcpStatus = CifsExiting; 2069 srvTcp->tcpStatus = CifsExiting;
2050 spin_unlock(&GlobalMid_Lock); 2070 spin_unlock(&GlobalMid_Lock);
2051 if(srvTcp->tsk) { 2071 if (srvTcp->tsk) {
2052 send_sig(SIGKILL,srvTcp->tsk,1); 2072 send_sig(SIGKILL,srvTcp->tsk,1);
2053 wait_for_completion(&cifsd_complete); 2073 kthread_stop(srvTcp->tsk);
2054 } 2074 }
2055 } 2075 }
2056 /* If find_unc succeeded then rc == 0 so we can not end */ 2076 /* If find_unc succeeded then rc == 0 so we can not end */
@@ -2063,10 +2083,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2063 int temp_rc; 2083 int temp_rc;
2064 temp_rc = CIFSSMBLogoff(xid, pSesInfo); 2084 temp_rc = CIFSSMBLogoff(xid, pSesInfo);
2065 /* if the socketUseCount is now zero */ 2085 /* if the socketUseCount is now zero */
2066 if((temp_rc == -ESHUTDOWN) && 2086 if ((temp_rc == -ESHUTDOWN) &&
2067 (pSesInfo->server->tsk)) { 2087 (pSesInfo->server) && (pSesInfo->server->tsk)) {
2068 send_sig(SIGKILL,pSesInfo->server->tsk,1); 2088 send_sig(SIGKILL,pSesInfo->server->tsk,1);
2069 wait_for_completion(&cifsd_complete); 2089 kthread_stop(pSesInfo->server->tsk);
2070 } 2090 }
2071 } else 2091 } else
2072 cFYI(1, ("No session or bad tcon")); 2092 cFYI(1, ("No session or bad tcon"));
@@ -2127,7 +2147,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2127 __u16 count; 2147 __u16 count;
2128 2148
2129 cFYI(1, ("In sesssetup")); 2149 cFYI(1, ("In sesssetup"));
2130 if(ses == NULL) 2150 if (ses == NULL)
2131 return -EINVAL; 2151 return -EINVAL;
2132 user = ses->userName; 2152 user = ses->userName;
2133 domain = ses->domainName; 2153 domain = ses->domainName;
@@ -2182,7 +2202,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2182 *bcc_ptr = 0; 2202 *bcc_ptr = 0;
2183 bcc_ptr++; 2203 bcc_ptr++;
2184 } 2204 }
2185 if(user == NULL) 2205 if (user == NULL)
2186 bytes_returned = 0; /* skip null user */ 2206 bytes_returned = 0; /* skip null user */
2187 else 2207 else
2188 bytes_returned = 2208 bytes_returned =
@@ -2216,7 +2236,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2216 bcc_ptr += 2 * bytes_returned; 2236 bcc_ptr += 2 * bytes_returned;
2217 bcc_ptr += 2; 2237 bcc_ptr += 2;
2218 } else { 2238 } else {
2219 if(user != NULL) { 2239 if (user != NULL) {
2220 strncpy(bcc_ptr, user, 200); 2240 strncpy(bcc_ptr, user, 200);
2221 bcc_ptr += strnlen(user, 200); 2241 bcc_ptr += strnlen(user, 200);
2222 } 2242 }
@@ -3316,7 +3336,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
3316 cFYI(1,("Waking up socket by sending it signal")); 3336 cFYI(1,("Waking up socket by sending it signal"));
3317 if(cifsd_task) { 3337 if(cifsd_task) {
3318 send_sig(SIGKILL,cifsd_task,1); 3338 send_sig(SIGKILL,cifsd_task,1);
3319 wait_for_completion(&cifsd_complete); 3339 kthread_stop(cifsd_task);
3320 } 3340 }
3321 rc = 0; 3341 rc = 0;
3322 } /* else - we have an smb session 3342 } /* else - we have an smb session
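
The connect.c changes above swap the hand-rolled kernel_thread()/daemonize()/complete_and_exit() lifecycle for the kthread API: the demultiplex loop polls kthread_should_stop() where it used to test tcpStatus == CifsExiting, and teardown becomes kthread_stop() instead of signalling and waiting on cifsd_complete. A minimal sketch of the pattern, independent of the cifs specifics:

    #include <linux/kthread.h>
    #include <linux/err.h>

    static int demux_thread(void *data)
    {
            /* run until kthread_stop() asks us to exit; the real thread
               blocks in kernel_recvmsg(), so this is not a busy loop */
            while (!kthread_should_stop()) {
                    /* receive and dispatch one SMB response here */
            }
            return 0;       /* value handed back by kthread_stop() */
    }

    static struct task_struct *start_demux(void *server)
    {
            /* kthread_run() creates the thread and wakes it immediately */
            struct task_struct *tsk = kthread_run(demux_thread, server,
                                                  "cifsd");
            if (IS_ERR(tsk))
                    return NULL;    /* PTR_ERR(tsk) holds the errno */
            return tsk;
    }

    static void stop_demux(struct task_struct *tsk)
    {
            /* sets the stop flag, wakes the thread, waits for its exit */
            kthread_stop(tsk);
    }

One constraint the patch has to respect: kthread_stop() expects the thread to keep running until told to stop, which is why the CifsExiting exit conditions inside the loop all become kthread_should_stop() checks.
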
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3fad638d26d3..e5210519ac4b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -274,7 +274,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
274 pCifsFile->invalidHandle = FALSE; 274 pCifsFile->invalidHandle = FALSE;
275 pCifsFile->closePend = FALSE; 275 pCifsFile->closePend = FALSE;
276 init_MUTEX(&pCifsFile->fh_sem); 276 init_MUTEX(&pCifsFile->fh_sem);
277 init_MUTEX(&pCifsFile->lock_sem); 277 mutex_init(&pCifsFile->lock_mutex);
278 INIT_LIST_HEAD(&pCifsFile->llist); 278 INIT_LIST_HEAD(&pCifsFile->llist);
279 atomic_set(&pCifsFile->wrtPending,0); 279 atomic_set(&pCifsFile->wrtPending,0);
280 280
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 2d3275bedb55..b570530f97bf 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -48,7 +48,7 @@ static inline struct cifsFileInfo *cifs_init_private(
48 private_data->netfid = netfid; 48 private_data->netfid = netfid;
49 private_data->pid = current->tgid; 49 private_data->pid = current->tgid;
50 init_MUTEX(&private_data->fh_sem); 50 init_MUTEX(&private_data->fh_sem);
51 init_MUTEX(&private_data->lock_sem); 51 mutex_init(&private_data->lock_mutex);
52 INIT_LIST_HEAD(&private_data->llist); 52 INIT_LIST_HEAD(&private_data->llist);
53 private_data->pfile = file; /* needed for writepage */ 53 private_data->pfile = file; /* needed for writepage */
54 private_data->pInode = inode; 54 private_data->pInode = inode;
@@ -338,8 +338,7 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
338 return rc; 338 return rc;
339} 339}
340 340
341static int cifs_reopen_file(struct inode *inode, struct file *file, 341static int cifs_reopen_file(struct file *file, int can_flush)
342 int can_flush)
343{ 342{
344 int rc = -EACCES; 343 int rc = -EACCES;
345 int xid, oplock; 344 int xid, oplock;
@@ -347,13 +346,12 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
347 struct cifsTconInfo *pTcon; 346 struct cifsTconInfo *pTcon;
348 struct cifsFileInfo *pCifsFile; 347 struct cifsFileInfo *pCifsFile;
349 struct cifsInodeInfo *pCifsInode; 348 struct cifsInodeInfo *pCifsInode;
349 struct inode * inode;
350 char *full_path = NULL; 350 char *full_path = NULL;
351 int desiredAccess; 351 int desiredAccess;
352 int disposition = FILE_OPEN; 352 int disposition = FILE_OPEN;
353 __u16 netfid; 353 __u16 netfid;
354 354
355 if (inode == NULL)
356 return -EBADF;
357 if (file->private_data) { 355 if (file->private_data) {
358 pCifsFile = (struct cifsFileInfo *)file->private_data; 356 pCifsFile = (struct cifsFileInfo *)file->private_data;
359 } else 357 } else
@@ -368,25 +366,37 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
368 } 366 }
369 367
370 if (file->f_path.dentry == NULL) { 368 if (file->f_path.dentry == NULL) {
371 up(&pCifsFile->fh_sem); 369 cERROR(1, ("no valid name if dentry freed"));
372 cFYI(1, ("failed file reopen, no valid name if dentry freed")); 370 dump_stack();
373 FreeXid(xid); 371 rc = -EBADF;
374 return -EBADF; 372 goto reopen_error_exit;
375 } 373 }
374
375 inode = file->f_path.dentry->d_inode;
376 if(inode == NULL) {
377 cERROR(1, ("inode not valid"));
378 dump_stack();
379 rc = -EBADF;
380 goto reopen_error_exit;
381 }
382
376 cifs_sb = CIFS_SB(inode->i_sb); 383 cifs_sb = CIFS_SB(inode->i_sb);
377 pTcon = cifs_sb->tcon; 384 pTcon = cifs_sb->tcon;
385
378/* can not grab rename sem here because various ops, including 386/* can not grab rename sem here because various ops, including
379 those that already have the rename sem can end up causing writepage 387 those that already have the rename sem can end up causing writepage
380 to get called and if the server was down that means we end up here, 388 to get called and if the server was down that means we end up here,
381 and we can never tell if the caller already has the rename_sem */ 389 and we can never tell if the caller already has the rename_sem */
382 full_path = build_path_from_dentry(file->f_path.dentry); 390 full_path = build_path_from_dentry(file->f_path.dentry);
383 if (full_path == NULL) { 391 if (full_path == NULL) {
392 rc = -ENOMEM;
393reopen_error_exit:
384 up(&pCifsFile->fh_sem); 394 up(&pCifsFile->fh_sem);
385 FreeXid(xid); 395 FreeXid(xid);
386 return -ENOMEM; 396 return rc;
387 } 397 }
388 398
389 cFYI(1, (" inode = 0x%p file flags are 0x%x for %s", 399 cFYI(1, ("inode = 0x%p file flags 0x%x for %s",
390 inode, file->f_flags,full_path)); 400 inode, file->f_flags,full_path));
391 desiredAccess = cifs_convert_flags(file->f_flags); 401 desiredAccess = cifs_convert_flags(file->f_flags);
392 402
@@ -401,13 +411,6 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
401 and server version of file size can be stale. If we knew for sure 411 and server version of file size can be stale. If we knew for sure
402 that inode was not dirty locally we could do this */ 412 that inode was not dirty locally we could do this */
403 413
404/* buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
405 if (buf == 0) {
406 up(&pCifsFile->fh_sem);
407 kfree(full_path);
408 FreeXid(xid);
409 return -ENOMEM;
410 } */
411 rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess, 414 rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess,
412 CREATE_NOT_DIR, &netfid, &oplock, NULL, 415 CREATE_NOT_DIR, &netfid, &oplock, NULL,
413 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 416 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
@@ -508,12 +511,12 @@ int cifs_close(struct inode *inode, struct file *file)
508 511
509 /* Delete any outstanding lock records. 512 /* Delete any outstanding lock records.
510 We'll lose them when the file is closed anyway. */ 513 We'll lose them when the file is closed anyway. */
511 down(&pSMBFile->lock_sem); 514 mutex_lock(&pSMBFile->lock_mutex);
512 list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) { 515 list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) {
513 list_del(&li->llist); 516 list_del(&li->llist);
514 kfree(li); 517 kfree(li);
515 } 518 }
516 up(&pSMBFile->lock_sem); 519 mutex_unlock(&pSMBFile->lock_mutex);
517 520
518 write_lock(&GlobalSMBSeslock); 521 write_lock(&GlobalSMBSeslock);
519 list_del(&pSMBFile->flist); 522 list_del(&pSMBFile->flist);
@@ -598,9 +601,9 @@ static int store_file_lock(struct cifsFileInfo *fid, __u64 len,
598 li->offset = offset; 601 li->offset = offset;
599 li->length = len; 602 li->length = len;
600 li->type = lockType; 603 li->type = lockType;
601 down(&fid->lock_sem); 604 mutex_lock(&fid->lock_mutex);
602 list_add(&li->llist, &fid->llist); 605 list_add(&li->llist, &fid->llist);
603 up(&fid->lock_sem); 606 mutex_unlock(&fid->lock_mutex);
604 return 0; 607 return 0;
605} 608}
606 609
@@ -757,7 +760,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
757 struct cifsLockInfo *li, *tmp; 760 struct cifsLockInfo *li, *tmp;
758 761
759 rc = 0; 762 rc = 0;
760 down(&fid->lock_sem); 763 mutex_lock(&fid->lock_mutex);
761 list_for_each_entry_safe(li, tmp, &fid->llist, llist) { 764 list_for_each_entry_safe(li, tmp, &fid->llist, llist) {
762 if (pfLock->fl_start <= li->offset && 765 if (pfLock->fl_start <= li->offset &&
763 length >= li->length) { 766 length >= li->length) {
@@ -771,7 +774,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
771 kfree(li); 774 kfree(li);
772 } 775 }
773 } 776 }
774 up(&fid->lock_sem); 777 mutex_unlock(&fid->lock_mutex);
775 } 778 }
776 } 779 }
777 780
@@ -792,12 +795,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
792 int xid, long_op; 795 int xid, long_op;
793 struct cifsFileInfo *open_file; 796 struct cifsFileInfo *open_file;
794 797
795 if (file->f_path.dentry == NULL)
796 return -EBADF;
797
798 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 798 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
799 if (cifs_sb == NULL)
800 return -EBADF;
801 799
802 pTcon = cifs_sb->tcon; 800 pTcon = cifs_sb->tcon;
803 801
@@ -807,14 +805,9 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
807 805
808 if (file->private_data == NULL) 806 if (file->private_data == NULL)
809 return -EBADF; 807 return -EBADF;
810 else 808 open_file = (struct cifsFileInfo *) file->private_data;
811 open_file = (struct cifsFileInfo *) file->private_data;
812 809
813 xid = GetXid(); 810 xid = GetXid();
814 if (file->f_path.dentry->d_inode == NULL) {
815 FreeXid(xid);
816 return -EBADF;
817 }
818 811
819 if (*poffset > file->f_path.dentry->d_inode->i_size) 812 if (*poffset > file->f_path.dentry->d_inode->i_size)
820 long_op = 2; /* writes past end of file can take a long time */ 813 long_op = 2; /* writes past end of file can take a long time */
@@ -841,17 +834,11 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
841 return -EBADF; 834 return -EBADF;
842 } 835 }
843 if (open_file->invalidHandle) { 836 if (open_file->invalidHandle) {
844 if ((file->f_path.dentry == NULL) ||
845 (file->f_path.dentry->d_inode == NULL)) {
846 FreeXid(xid);
847 return total_written;
848 }
849 /* we could deadlock if we called 837 /* we could deadlock if we called
850 filemap_fdatawait from here so tell 838 filemap_fdatawait from here so tell
851 reopen_file not to flush data to server 839 reopen_file not to flush data to server
852 now */ 840 now */
853 rc = cifs_reopen_file(file->f_path.dentry->d_inode, 841 rc = cifs_reopen_file(file, FALSE);
854 file, FALSE);
855 if (rc != 0) 842 if (rc != 0)
856 break; 843 break;
857 } 844 }
@@ -908,12 +895,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
908 int xid, long_op; 895 int xid, long_op;
909 struct cifsFileInfo *open_file; 896 struct cifsFileInfo *open_file;
910 897
911 if (file->f_path.dentry == NULL)
912 return -EBADF;
913
914 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 898 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
915 if (cifs_sb == NULL)
916 return -EBADF;
917 899
918 pTcon = cifs_sb->tcon; 900 pTcon = cifs_sb->tcon;
919 901
@@ -922,14 +904,9 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
922 904
923 if (file->private_data == NULL) 905 if (file->private_data == NULL)
924 return -EBADF; 906 return -EBADF;
925 else 907 open_file = (struct cifsFileInfo *)file->private_data;
926 open_file = (struct cifsFileInfo *)file->private_data;
927 908
928 xid = GetXid(); 909 xid = GetXid();
929 if (file->f_path.dentry->d_inode == NULL) {
930 FreeXid(xid);
931 return -EBADF;
932 }
933 910
934 if (*poffset > file->f_path.dentry->d_inode->i_size) 911 if (*poffset > file->f_path.dentry->d_inode->i_size)
935 long_op = 2; /* writes past end of file can take a long time */ 912 long_op = 2; /* writes past end of file can take a long time */
@@ -957,17 +934,11 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
957 return -EBADF; 934 return -EBADF;
958 } 935 }
959 if (open_file->invalidHandle) { 936 if (open_file->invalidHandle) {
960 if ((file->f_path.dentry == NULL) ||
961 (file->f_path.dentry->d_inode == NULL)) {
962 FreeXid(xid);
963 return total_written;
964 }
965 /* we could deadlock if we called 937 /* we could deadlock if we called
966 filemap_fdatawait from here so tell 938 filemap_fdatawait from here so tell
967 reopen_file not to flush data to 939 reopen_file not to flush data to
968 server now */ 940 server now */
969 rc = cifs_reopen_file(file->f_path.dentry->d_inode, 941 rc = cifs_reopen_file(file, FALSE);
970 file, FALSE);
971 if (rc != 0) 942 if (rc != 0)
972 break; 943 break;
973 } 944 }
@@ -1056,8 +1027,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1056 read_unlock(&GlobalSMBSeslock); 1027 read_unlock(&GlobalSMBSeslock);
1057 if((open_file->invalidHandle) && 1028 if((open_file->invalidHandle) &&
1058 (!open_file->closePend) /* BB fixme -since the second clause can not be true remove it BB */) { 1029 (!open_file->closePend) /* BB fixme -since the second clause can not be true remove it BB */) {
1059 rc = cifs_reopen_file(&cifs_inode->vfs_inode, 1030 rc = cifs_reopen_file(open_file->pfile, FALSE);
1060 open_file->pfile, FALSE);
1061 /* if it fails, try another handle - might be */ 1031 /* if it fails, try another handle - might be */
1062 /* dangerous to hold up writepages with retry */ 1032 /* dangerous to hold up writepages with retry */
1063 if(rc) { 1033 if(rc) {
@@ -1404,32 +1374,6 @@ static int cifs_commit_write(struct file *file, struct page *page,
1404 spin_lock(&inode->i_lock); 1374 spin_lock(&inode->i_lock);
1405 if (position > inode->i_size) { 1375 if (position > inode->i_size) {
1406 i_size_write(inode, position); 1376 i_size_write(inode, position);
1407 /* if (file->private_data == NULL) {
1408 rc = -EBADF;
1409 } else {
1410 open_file = (struct cifsFileInfo *)file->private_data;
1411 cifs_sb = CIFS_SB(inode->i_sb);
1412 rc = -EAGAIN;
1413 while (rc == -EAGAIN) {
1414 if ((open_file->invalidHandle) &&
1415 (!open_file->closePend)) {
1416 rc = cifs_reopen_file(
1417 file->f_path.dentry->d_inode, file);
1418 if (rc != 0)
1419 break;
1420 }
1421 if (!open_file->closePend) {
1422 rc = CIFSSMBSetFileSize(xid,
1423 cifs_sb->tcon, position,
1424 open_file->netfid,
1425 open_file->pid, FALSE);
1426 } else {
1427 rc = -EBADF;
1428 break;
1429 }
1430 }
1431 cFYI(1, (" SetEOF (commit write) rc = %d", rc));
1432 } */
1433 } 1377 }
1434 spin_unlock(&inode->i_lock); 1378 spin_unlock(&inode->i_lock);
1435 if (!PageUptodate(page)) { 1379 if (!PageUptodate(page)) {
@@ -1573,8 +1517,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1573 int buf_type = CIFS_NO_BUFFER; 1517 int buf_type = CIFS_NO_BUFFER;
1574 if ((open_file->invalidHandle) && 1518 if ((open_file->invalidHandle) &&
1575 (!open_file->closePend)) { 1519 (!open_file->closePend)) {
1576 rc = cifs_reopen_file(file->f_path.dentry->d_inode, 1520 rc = cifs_reopen_file(file, TRUE);
1577 file, TRUE);
1578 if (rc != 0) 1521 if (rc != 0)
1579 break; 1522 break;
1580 } 1523 }
@@ -1660,8 +1603,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1660 while (rc == -EAGAIN) { 1603 while (rc == -EAGAIN) {
1661 if ((open_file->invalidHandle) && 1604 if ((open_file->invalidHandle) &&
1662 (!open_file->closePend)) { 1605 (!open_file->closePend)) {
1663 rc = cifs_reopen_file(file->f_path.dentry->d_inode, 1606 rc = cifs_reopen_file(file, TRUE);
1664 file, TRUE);
1665 if (rc != 0) 1607 if (rc != 0)
1666 break; 1608 break;
1667 } 1609 }
@@ -1817,8 +1759,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1817 while (rc == -EAGAIN) { 1759 while (rc == -EAGAIN) {
1818 if ((open_file->invalidHandle) && 1760 if ((open_file->invalidHandle) &&
1819 (!open_file->closePend)) { 1761 (!open_file->closePend)) {
1820 rc = cifs_reopen_file(file->f_path.dentry->d_inode, 1762 rc = cifs_reopen_file(file, TRUE);
1821 file, TRUE);
1822 if (rc != 0) 1763 if (rc != 0)
1823 break; 1764 break;
1824 } 1765 }
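
cifs_reopen_file() above loses its inode parameter, deriving the inode from file->f_path.dentry instead, and the repeated unlock/FreeXid/return sequences collapse into a single reopen_error_exit label. A stripped-down sketch of that single-exit shape (hypothetical function, heavily simplified from the code above):

    /* assumes build_path_from_dentry() from cifsproto.h */
    static int reopen_sketch(struct file *file)
    {
            int rc = 0;
            char *full_path = NULL;
            struct inode *inode;

            if (file->f_path.dentry == NULL) {
                    rc = -EBADF;
                    goto reopen_error_exit;
            }
            inode = file->f_path.dentry->d_inode;   /* was a parameter */
            if (inode == NULL) {
                    rc = -EBADF;
                    goto reopen_error_exit;
            }
            full_path = build_path_from_dentry(file->f_path.dentry);
            if (full_path == NULL) {
                    rc = -ENOMEM;
                    goto reopen_error_exit;
            }
            /* the real function goes on to CIFSSMBOpen() here */

    reopen_error_exit:
            /* in the real function this is where fh_sem is released
               and FreeXid(xid) runs, whatever the failure was */
            kfree(full_path);       /* kfree(NULL) is a no-op */
            return rc;
    }
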
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f414526e476a..3e87dad3367c 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/inode.c 2 * fs/cifs/inode.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2005 4 * Copyright (C) International Business Machines Corp., 2002,2007
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -90,7 +90,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
90 (*pinode)->i_ino = 90 (*pinode)->i_ino =
91 (unsigned long)findData.UniqueId; 91 (unsigned long)findData.UniqueId;
92 } /* note ino incremented to unique num in new_inode */ 92 } /* note ino incremented to unique num in new_inode */
93 if(sb->s_flags & MS_NOATIME) 93 if (sb->s_flags & MS_NOATIME)
94 (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; 94 (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
95 95
96 insert_inode_hash(*pinode); 96 insert_inode_hash(*pinode);
@@ -139,8 +139,17 @@ int cifs_get_inode_info_unix(struct inode **pinode,
139 inode->i_mode |= S_IFREG; 139 inode->i_mode |= S_IFREG;
140 cFYI(1,("unknown type %d",type)); 140 cFYI(1,("unknown type %d",type));
141 } 141 }
142 inode->i_uid = le64_to_cpu(findData.Uid); 142
143 inode->i_gid = le64_to_cpu(findData.Gid); 143 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
144 inode->i_uid = cifs_sb->mnt_uid;
145 else
146 inode->i_uid = le64_to_cpu(findData.Uid);
147
148 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
149 inode->i_gid = cifs_sb->mnt_gid;
150 else
151 inode->i_gid = le64_to_cpu(findData.Gid);
152
144 inode->i_nlink = le64_to_cpu(findData.Nlinks); 153 inode->i_nlink = le64_to_cpu(findData.Nlinks);
145 154
146 spin_lock(&inode->i_lock); 155 spin_lock(&inode->i_lock);
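
With the new override flags, ownership the server reports is ignored whenever uid= or gid= was given at mount time (the connect.c hunks above set vol->override_uid/override_gid, which cifs_mount() turns into CIFS_MOUNT_OVERR_UID/GID). The hunk condenses to this decision, shown as a hypothetical helper:

    /* sketch only; mirrors the inode.c hunk above */
    static void set_owner(struct inode *inode, struct cifs_sb_info *cifs_sb,
                          __le64 srv_uid, __le64 srv_gid)
    {
            /* uid=/gid= mount options win over server-reported ids */
            inode->i_uid = (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
                            ? cifs_sb->mnt_uid : (uid_t)le64_to_cpu(srv_uid);
            inode->i_gid = (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
                            ? cifs_sb->mnt_gid : (gid_t)le64_to_cpu(srv_gid);
    }
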
@@ -178,13 +187,13 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 						&cifs_file_direct_nobrl_ops;
 			else
 				inode->i_fop = &cifs_file_direct_ops;
-		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			inode->i_fop = &cifs_file_nobrl_ops;
 		else /* not direct, send byte range locks */
 			inode->i_fop = &cifs_file_ops;
 
 		/* check if server can support readpages */
-		if(pTcon->ses->server->maxBuf <
+		if (pTcon->ses->server->maxBuf <
 		    PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 			inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
@@ -220,7 +229,7 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 
 	pbuf = buf;
 
-	if(size == 0) {
+	if (size == 0) {
 		inode->i_mode |= S_IFIFO;
 		return 0;
 	} else if (size < 8) {
@@ -239,11 +248,11 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 				 netfid,
 				 24 /* length */, 0 /* offset */,
 				 &bytes_read, &pbuf, &buf_type);
-		if((rc == 0) && (bytes_read >= 8)) {
-			if(memcmp("IntxBLK", pbuf, 8) == 0) {
+		if ((rc == 0) && (bytes_read >= 8)) {
+			if (memcmp("IntxBLK", pbuf, 8) == 0) {
 				cFYI(1,("Block device"));
 				inode->i_mode |= S_IFBLK;
-				if(bytes_read == 24) {
+				if (bytes_read == 24) {
 					/* we have enough to decode dev num */
 					__u64 mjr; /* major */
 					__u64 mnr; /* minor */
@@ -251,10 +260,10 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
 					inode->i_rdev = MKDEV(mjr, mnr);
 				}
-			} else if(memcmp("IntxCHR", pbuf, 8) == 0) {
+			} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
 				cFYI(1,("Char device"));
 				inode->i_mode |= S_IFCHR;
-				if(bytes_read == 24) {
+				if (bytes_read == 24) {
 					/* we have enough to decode dev num */
 					__u64 mjr; /* major */
 					__u64 mnr; /* minor */
@@ -262,7 +271,7 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
 					inode->i_rdev = MKDEV(mjr, mnr);
 				}
-			} else if(memcmp("IntxLNK", pbuf, 7) == 0) {
+			} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
 				cFYI(1,("Symlink"));
 				inode->i_mode |= S_IFLNK;
 			} else {
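
For reference, the Services-for-UNIX blob decoded above is an 8-byte tag ("IntxBLK", "IntxCHR", "IntxLNK") optionally followed by two little-endian 64-bit words holding the device major and minor. A condensed sketch of that layout (decode_sfu_dev is a hypothetical helper, not a kernel function):

/* Hypothetical helper illustrating the 24-byte SFU device layout:
 * bytes 0-7 tag, bytes 8-15 major (__le64), bytes 16-23 minor (__le64). */
static int decode_sfu_dev(const char *pbuf, unsigned int bytes_read, dev_t *dev)
{
	__u64 mjr, mnr;

	if (bytes_read < 24)
		return -EINVAL;	/* tag alone; no device number present */
	mjr = le64_to_cpu(*(__le64 *)(pbuf + 8));
	mnr = le64_to_cpu(*(__le64 *)(pbuf + 16));
	*dev = MKDEV(mjr, mnr);
	return 0;
}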
@@ -293,7 +302,7 @@ static int get_sfu_uid_mode(struct inode * inode,
 	rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS",
 			ea_value, 4 /* size of buf */, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if(rc < 0)
+	if (rc < 0)
 		return (int)rc;
 	else if (rc > 3) {
 		mode = le32_to_cpu(*((__le32 *)ea_value));
@@ -348,7 +357,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		/* BB optimize code so we do not make the above call
 		when server claims no NT SMB support and the above call
 		failed at least once - set flag in tcon or mount */
-		if((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
+		if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
 			rc = SMBQueryInformation(xid, pTcon, search_path,
 					pfindData, cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
@@ -425,7 +434,7 @@ int cifs_get_inode_info(struct inode **pinode,
 			} else /* do we need cast or hash to ino? */
 				(*pinode)->i_ino = inode_num;
 		} /* else ino incremented to unique num in new_inode*/
-		if(sb->s_flags & MS_NOATIME)
+		if (sb->s_flags & MS_NOATIME)
 			(*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
 		insert_inode_hash(*pinode);
 	}
@@ -442,7 +451,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		(pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
 
 		/* Linux can not store file creation time so ignore it */
-		if(pfindData->LastAccessTime)
+		if (pfindData->LastAccessTime)
 			inode->i_atime = cifs_NTtimeToUnix
 				(le64_to_cpu(pfindData->LastAccessTime));
 		else /* do not need to use current_fs_time - time not stored */
@@ -452,7 +461,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		inode->i_ctime =
 			cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
 		cFYI(0, ("Attributes came in as 0x%x", attr));
-		if(adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
+		if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
 			inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
 			inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj;
 		}
@@ -521,8 +530,10 @@ int cifs_get_inode_info(struct inode **pinode,
 
 		/* BB fill in uid and gid here? with help from winbind?
 		   or retrieve from NTFS stream extended attribute */
-		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
 			/* fill in uid, gid, mode from server ACL */
+			/* BB FIXME this should also take into account the
+			 * default uid specified on mount if present */
 			get_sfu_uid_mode(inode, search_path, cifs_sb, xid);
 		} else if (atomic_read(&cifsInfo->inUse) == 0) {
 			inode->i_uid = cifs_sb->mnt_uid;
@@ -541,12 +552,12 @@ int cifs_get_inode_info(struct inode **pinode,
 						&cifs_file_direct_nobrl_ops;
 			else
 				inode->i_fop = &cifs_file_direct_ops;
-		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			inode->i_fop = &cifs_file_nobrl_ops;
 		else /* not direct, send byte range locks */
 			inode->i_fop = &cifs_file_ops;
 
-		if(pTcon->ses->server->maxBuf <
+		if (pTcon->ses->server->maxBuf <
 		    PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 			inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
@@ -597,7 +608,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 
 	xid = GetXid();
 
-	if(inode)
+	if (inode)
 		cifs_sb = CIFS_SB(inode->i_sb);
 	else
 		cifs_sb = CIFS_SB(direntry->d_sb);
@@ -723,7 +734,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 		   when needed */
 		direntry->d_inode->i_ctime = current_fs_time(inode->i_sb);
 	}
-	if(inode) {
+	if (inode) {
 		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 		cifsInode = CIFS_I(inode);
 		cifsInode->time = 0; /* force revalidate of dir as well */
@@ -734,6 +745,136 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 	return rc;
 }
 
+static void posix_fill_in_inode(struct inode *tmp_inode,
+	FILE_UNIX_BASIC_INFO *pData, int *pobject_type, int isNewInode)
+{
+	loff_t local_size;
+	struct timespec local_mtime;
+
+	struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb);
+
+	__u32 type = le32_to_cpu(pData->Type);
+	__u64 num_of_bytes = le64_to_cpu(pData->NumOfBytes);
+	__u64 end_of_file = le64_to_cpu(pData->EndOfFile);
+	cifsInfo->time = jiffies;
+	atomic_inc(&cifsInfo->inUse);
+
+	/* save mtime and size */
+	local_mtime = tmp_inode->i_mtime;
+	local_size = tmp_inode->i_size;
+
+	tmp_inode->i_atime =
+		cifs_NTtimeToUnix(le64_to_cpu(pData->LastAccessTime));
+	tmp_inode->i_mtime =
+		cifs_NTtimeToUnix(le64_to_cpu(pData->LastModificationTime));
+	tmp_inode->i_ctime =
+		cifs_NTtimeToUnix(le64_to_cpu(pData->LastStatusChange));
+
+	tmp_inode->i_mode = le64_to_cpu(pData->Permissions);
+	/* since we set the inode type below we need to mask off type
+	   to avoid strange results if bits above were corrupt */
+	tmp_inode->i_mode &= ~S_IFMT;
+	if (type == UNIX_FILE) {
+		*pobject_type = DT_REG;
+		tmp_inode->i_mode |= S_IFREG;
+	} else if (type == UNIX_SYMLINK) {
+		*pobject_type = DT_LNK;
+		tmp_inode->i_mode |= S_IFLNK;
+	} else if (type == UNIX_DIR) {
+		*pobject_type = DT_DIR;
+		tmp_inode->i_mode |= S_IFDIR;
+	} else if (type == UNIX_CHARDEV) {
+		*pobject_type = DT_CHR;
+		tmp_inode->i_mode |= S_IFCHR;
+		tmp_inode->i_rdev = MKDEV(le64_to_cpu(pData->DevMajor),
+				le64_to_cpu(pData->DevMinor) & MINORMASK);
+	} else if (type == UNIX_BLOCKDEV) {
+		*pobject_type = DT_BLK;
+		tmp_inode->i_mode |= S_IFBLK;
+		tmp_inode->i_rdev = MKDEV(le64_to_cpu(pData->DevMajor),
+				le64_to_cpu(pData->DevMinor) & MINORMASK);
+	} else if (type == UNIX_FIFO) {
+		*pobject_type = DT_FIFO;
+		tmp_inode->i_mode |= S_IFIFO;
+	} else if (type == UNIX_SOCKET) {
+		*pobject_type = DT_SOCK;
+		tmp_inode->i_mode |= S_IFSOCK;
+	} else {
+		/* safest to just call it a file */
+		*pobject_type = DT_REG;
+		tmp_inode->i_mode |= S_IFREG;
+		cFYI(1,("unknown inode type %d",type));
+	}
+
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1,("object type: %d", type));
+#endif
+	tmp_inode->i_uid = le64_to_cpu(pData->Uid);
+	tmp_inode->i_gid = le64_to_cpu(pData->Gid);
+	tmp_inode->i_nlink = le64_to_cpu(pData->Nlinks);
+
+	spin_lock(&tmp_inode->i_lock);
+	if (is_size_safe_to_change(cifsInfo, end_of_file)) {
+		/* can not safely change the file size here if the
+		   client is writing to it due to potential races */
+		i_size_write(tmp_inode, end_of_file);
+
+		/* 512 bytes (2**9) is the fake blocksize that must be used */
+		/* for this calculation, not the real blocksize */
+		tmp_inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
+	}
+	spin_unlock(&tmp_inode->i_lock);
+
+	if (S_ISREG(tmp_inode->i_mode)) {
+		cFYI(1, ("File inode"));
+		tmp_inode->i_op = &cifs_file_inode_ops;
+
+		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+				tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
+			else
+				tmp_inode->i_fop = &cifs_file_direct_ops;
+
+		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			tmp_inode->i_fop = &cifs_file_nobrl_ops;
+		else
+			tmp_inode->i_fop = &cifs_file_ops;
+
+		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
+		   (cifs_sb->tcon->ses->server->maxBuf <
+			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
+			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+		else
+			tmp_inode->i_data.a_ops = &cifs_addr_ops;
+
+		if(isNewInode)
+			return; /* No sense invalidating pages for new inode since we
+				   have not started caching readahead file data yet */
+
+		if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
+			(local_size == tmp_inode->i_size)) {
+			cFYI(1, ("inode exists but unchanged"));
+		} else {
+			/* file may have changed on server */
+			cFYI(1, ("invalidate inode, readdir detected change"));
+			invalidate_remote_inode(tmp_inode);
+		}
+	} else if (S_ISDIR(tmp_inode->i_mode)) {
+		cFYI(1, ("Directory inode"));
+		tmp_inode->i_op = &cifs_dir_inode_ops;
+		tmp_inode->i_fop = &cifs_dir_ops;
+	} else if (S_ISLNK(tmp_inode->i_mode)) {
+		cFYI(1, ("Symbolic Link inode"));
+		tmp_inode->i_op = &cifs_symlink_inode_ops;
+/*		tmp_inode->i_fop = */ /* do not need to set to anything */
+	} else {
+		cFYI(1, ("Special inode"));
+		init_special_inode(tmp_inode, tmp_inode->i_mode,
+				   tmp_inode->i_rdev);
+	}
+}
+
 int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 {
 	int rc = 0;
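
The type dispatch in posix_fill_in_inode() above can be summarized as a table; this condensed form is for reference only (the function itself uses the if/else chain and additionally masks the S_IFMT bits out of the server-supplied mode first):

/* Illustrative summary of the UNIX_* -> inode type mapping above. */
static const struct {
	__u32   unix_type;	/* wire value from FILE_UNIX_BASIC_INFO */
	umode_t ifmt;		/* S_IFMT bits set on the inode */
	int     dtype;		/* DT_* value stored in *pobject_type */
} posix_type_map[] = {
	{ UNIX_FILE,     S_IFREG,  DT_REG  },
	{ UNIX_SYMLINK,  S_IFLNK,  DT_LNK  },
	{ UNIX_DIR,      S_IFDIR,  DT_DIR  },
	{ UNIX_CHARDEV,  S_IFCHR,  DT_CHR  },	/* also fills i_rdev */
	{ UNIX_BLOCKDEV, S_IFBLK,  DT_BLK  },	/* also fills i_rdev */
	{ UNIX_FIFO,     S_IFIFO,  DT_FIFO },
	{ UNIX_SOCKET,   S_IFSOCK, DT_SOCK },
	/* anything else is treated as a regular file */
};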
@@ -755,6 +896,71 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		FreeXid(xid);
 		return -ENOMEM;
 	}
+
+	if((pTcon->ses->capabilities & CAP_UNIX) &&
+		(CIFS_UNIX_POSIX_PATH_OPS_CAP &
+			le64_to_cpu(pTcon->fsUnixInfo.Capability))) {
+		u32 oplock = 0;
+		FILE_UNIX_BASIC_INFO * pInfo =
+			kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+		if(pInfo == NULL) {
+			rc = -ENOMEM;
+			goto mkdir_out;
+		}
+
+		rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
+				mode, NULL /* netfid */, pInfo, &oplock,
+				full_path, cifs_sb->local_nls,
+				cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc) {
+			cFYI(1, ("posix mkdir returned 0x%x", rc));
+			d_drop(direntry);
+		} else {
+			int obj_type;
+			if (pInfo->Type == -1) /* no return info - go query */
+				goto mkdir_get_info;
+/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need to set uid/gid */
+			inc_nlink(inode);
+			if (pTcon->nocase)
+				direntry->d_op = &cifs_ci_dentry_ops;
+			else
+				direntry->d_op = &cifs_dentry_ops;
+
+			newinode = new_inode(inode->i_sb);
+			if (newinode == NULL)
+				goto mkdir_get_info;
+			/* Is an i_ino of zero legal? */
+			/* Are there sanity checks we can use to ensure that
+			   the server is really filling in that field? */
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+				newinode->i_ino =
+					(unsigned long)pInfo->UniqueId;
+			} /* note ino incremented to unique num in new_inode */
+			if(inode->i_sb->s_flags & MS_NOATIME)
+				newinode->i_flags |= S_NOATIME | S_NOCMTIME;
+			newinode->i_nlink = 2;
+
+			insert_inode_hash(newinode);
+			d_instantiate(direntry, newinode);
+
+			/* we already checked in POSIXCreate whether
+			   frame was long enough */
+			posix_fill_in_inode(direntry->d_inode,
+					pInfo, &obj_type, 1 /* NewInode */);
+#ifdef CONFIG_CIFS_DEBUG2
+			cFYI(1,("instantiated dentry %p %s to inode %p",
+				direntry, direntry->d_name.name, newinode));
+
+			if(newinode->i_nlink != 2)
+				cFYI(1,("unexpected number of links %d",
+					newinode->i_nlink));
+#endif
+		}
+		kfree(pInfo);
+		goto mkdir_out;
+	}
+
 	/* BB add setting the equivalent of mode via CreateX w/ACLs */
 	rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -762,6 +968,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		cFYI(1, ("cifs_mkdir returned 0x%x", rc));
 		d_drop(direntry);
 	} else {
+mkdir_get_info:
 		inc_nlink(inode);
 		if (pTcon->ses->capabilities & CAP_UNIX)
 			rc = cifs_get_inode_info_unix(&newinode, full_path,
@@ -775,8 +982,10 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		else
 			direntry->d_op = &cifs_dentry_ops;
 		d_instantiate(direntry, newinode);
-		if (direntry->d_inode)
-			direntry->d_inode->i_nlink = 2;
+		/* setting nlink not necessary except in cases where we
+		 * failed to get it from the server or was set bogus */
+		if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
+			direntry->d_inode->i_nlink = 2;
 		if (cifs_sb->tcon->ses->capabilities & CAP_UNIX)
 			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
 				CIFSSMBUnixSetPerms(xid, pTcon, full_path,
@@ -812,6 +1021,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			}
 		}
 	}
+mkdir_out:
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -1339,17 +1549,17 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 					cpu_to_le32(cifsInode->cifsAttrs |
 							ATTR_READONLY);
 			}
-		} else if ((mode & S_IWUGO) == S_IWUGO) {
-			if (cifsInode->cifsAttrs & ATTR_READONLY) {
-				set_dosattr = TRUE;
-				time_buf.Attributes =
-					cpu_to_le32(cifsInode->cifsAttrs &
-						(~ATTR_READONLY));
-				/* Windows ignores set to zero */
-				if(time_buf.Attributes == 0)
-					time_buf.Attributes |=
-						cpu_to_le32(ATTR_NORMAL);
-			}
+		} else if (cifsInode->cifsAttrs & ATTR_READONLY) {
+			/* If file is readonly on server, we would
+			not be able to write to it - so if any write
+			bit is enabled for user or group or other we
+			need to at least try to remove r/o dos attr */
+			set_dosattr = TRUE;
+			time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs &
+					    (~ATTR_READONLY));
+			/* Windows ignores set to zero */
+			if(time_buf.Attributes == 0)
+				time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL);
 		}
 		/* BB to be implemented -
 			via Windows security descriptors or streams */
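
The rewritten branch above changes when the DOS read-only attribute is cleared: previously all three write bits (S_IWUGO) had to be set, now any write bit for user, group or other is enough. A hedged condensation of the resulting decision (hypothetical helper; the real code also tracks set_dosattr alongside the attribute value):

/* Hypothetical condensation of the ATTR_READONLY handling above. */
static __le32 choose_dos_attrs(__u32 cifs_attrs, umode_t mode)
{
	if ((mode & S_IWUGO) == 0)	/* no write bits: mark read-only */
		return cpu_to_le32(cifs_attrs | ATTR_READONLY);
	if (cifs_attrs & ATTR_READONLY) {
		/* any write bit set: try to drop the r/o attribute */
		__u32 attrs = cifs_attrs & ~ATTR_READONLY;
		/* Windows ignores an attribute value of zero */
		return cpu_to_le32(attrs ? attrs : ATTR_NORMAL);
	}
	return cpu_to_le32(cifs_attrs);	/* nothing to change */
}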
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 992e80edc720..53e304d59544 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -30,6 +30,9 @@
 #include <linux/fs.h>
 #include <asm/div64.h>
 #include <asm/byteorder.h>
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+#include <linux/inet.h>
+#endif
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -129,11 +132,27 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
 /* Convert string containing dotted ip address to binary form */
 /* returns 0 if invalid address */
 
-/* BB add address family, change rc to status flag and return union or for ipv6 */
-/* will need parent to call something like inet_pton to convert ipv6 address BB */
 int
 cifs_inet_pton(int address_family, char *cp,void *dst)
 {
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+	int ret = 0;
+
+	/* calculate length by finding first slash or NULL */
+	/* BB Should we convert '/' slash to '\' here since it seems already done
+	   before this */
+	if( address_family == AF_INET ){
+		ret = in4_pton(cp, -1 /* len */, dst , '\\', NULL);
+	} else if( address_family == AF_INET6 ){
+		ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
+	}
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1,("address conversion returned %d for %s", ret, cp));
+#endif
+	if (ret > 0)
+		ret = 1;
+	return ret;
+#else
 	int value;
 	int digit;
 	int i;
@@ -192,6 +211,7 @@ cifs_inet_pton(int address_family, char *cp,void *dst)
 
 	*((__be32 *)dst) = *((__be32 *) bytes) | htonl(value);
 	return 1; /* success */
+#endif /* EXPERIMENTAL */
 }
 
 /*****************************************************************************
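
in4_pton() and in6_pton() adopted in the experimental path above are the stock helpers from net/core/utils.c: both take a source string, a length (or -1 to stop at the delimiter/NUL), a destination buffer, a delimiter character and an optional end pointer, returning 1 on success and 0 on failure. A usage sketch; the surrounding sockaddr handling is illustrative, not CIFS code:

#include <linux/inet.h>

/* Illustrative wrapper: try IPv4 first, then IPv6, stopping at '\\'
 * just as the CIFS caller does for UNC-style names. */
static int parse_server_addr(const char *str, struct sockaddr_storage *ss)
{
	struct sockaddr_in  *s4 = (struct sockaddr_in *)ss;
	struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)ss;

	if (in4_pton(str, -1, (u8 *)&s4->sin_addr.s_addr, '\\', NULL)) {
		s4->sin_family = AF_INET;
		return 1;
	}
	if (in6_pton(str, -1, s6->sin6_addr.s6_addr, '\\', NULL)) {
		s6->sin6_family = AF_INET6;
		return 1;
	}
	return 0;	/* not a literal address */
}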
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 2a374d5215ab..b5364f90d551 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -37,19 +37,19 @@ static void dump_cifs_file_struct(struct file *file, char *label)
 {
 	struct cifsFileInfo * cf;
 
-	if(file) {
+	if (file) {
 		cf = file->private_data;
-		if(cf == NULL) {
+		if (cf == NULL) {
 			cFYI(1,("empty cifs private file data"));
 			return;
 		}
-		if(cf->invalidHandle) {
+		if (cf->invalidHandle) {
 			cFYI(1,("invalid handle"));
 		}
-		if(cf->srch_inf.endOfSearch) {
+		if (cf->srch_inf.endOfSearch) {
 			cFYI(1,("end of search"));
 		}
-		if(cf->srch_inf.emptyDir) {
+		if (cf->srch_inf.emptyDir) {
 			cFYI(1,("empty dir"));
 		}
 
@@ -77,17 +77,17 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 		cFYI(0, ("existing dentry with inode 0x%p", tmp_dentry->d_inode));
 		*ptmp_inode = tmp_dentry->d_inode;
 /* BB overwrite old name? i.e. tmp_dentry->d_name and tmp_dentry->d_name.len??*/
-		if(*ptmp_inode == NULL) {
+		if (*ptmp_inode == NULL) {
 			*ptmp_inode = new_inode(file->f_path.dentry->d_sb);
-			if(*ptmp_inode == NULL)
+			if (*ptmp_inode == NULL)
 				return rc;
 			rc = 1;
 		}
-		if(file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
+		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
 			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
 	} else {
 		tmp_dentry = d_alloc(file->f_path.dentry, qstring);
-		if(tmp_dentry == NULL) {
+		if (tmp_dentry == NULL) {
 			cERROR(1,("Failed allocating dentry"));
 			*ptmp_inode = NULL;
 			return rc;
@@ -98,9 +98,9 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 			tmp_dentry->d_op = &cifs_ci_dentry_ops;
 		else
 			tmp_dentry->d_op = &cifs_dentry_ops;
-		if(*ptmp_inode == NULL)
+		if (*ptmp_inode == NULL)
 			return rc;
-		if(file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
+		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
 			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
 		rc = 2;
 	}
@@ -112,7 +112,7 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 
 static void AdjustForTZ(struct cifsTconInfo * tcon, struct inode * inode)
 {
-	if((tcon) && (tcon->ses) && (tcon->ses->server)) {
+	if ((tcon) && (tcon->ses) && (tcon->ses->server)) {
 		inode->i_ctime.tv_sec += tcon->ses->server->timeAdj;
 		inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
 		inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
@@ -137,7 +137,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 	local_mtime = tmp_inode->i_mtime;
 	local_size = tmp_inode->i_size;
 
-	if(new_buf_type) {
+	if (new_buf_type) {
 		FILE_DIRECTORY_INFO *pfindData = (FILE_DIRECTORY_INFO *)buf;
 
 		attr = le32_to_cpu(pfindData->ExtFileAttributes);
@@ -193,7 +193,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 	if (attr & ATTR_DIRECTORY) {
 		*pobject_type = DT_DIR;
 		/* override default perms since we do not lock dirs */
-		if(atomic_read(&cifsInfo->inUse) == 0) {
+		if (atomic_read(&cifsInfo->inUse) == 0) {
 			tmp_inode->i_mode = cifs_sb->mnt_dir_mode;
 		}
 		tmp_inode->i_mode |= S_IFDIR;
@@ -250,25 +250,25 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 	if (S_ISREG(tmp_inode->i_mode)) {
 		cFYI(1, ("File inode"));
 		tmp_inode->i_op = &cifs_file_inode_ops;
-		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
-			if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 				tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
 			else
 				tmp_inode->i_fop = &cifs_file_direct_ops;
 
-		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			tmp_inode->i_fop = &cifs_file_nobrl_ops;
 		else
 			tmp_inode->i_fop = &cifs_file_ops;
 
-		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
+		if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
 		   (cifs_sb->tcon->ses->server->maxBuf <
 			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
 			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
 			tmp_inode->i_data.a_ops = &cifs_addr_ops;
 
-		if(isNewInode)
+		if (isNewInode)
 			return; /* No sense invalidating pages for new inode
 				   since have not started caching readahead file
 				   data yet */
@@ -357,8 +357,14 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
 		cFYI(1,("unknown inode type %d",type));
 	}
 
-	tmp_inode->i_uid = le64_to_cpu(pfindData->Uid);
-	tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
+		tmp_inode->i_uid = cifs_sb->mnt_uid;
+	else
+		tmp_inode->i_uid = le64_to_cpu(pfindData->Uid);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
+		tmp_inode->i_gid = cifs_sb->mnt_gid;
+	else
+		tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
 	tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks);
 
 	spin_lock(&tmp_inode->i_lock);
@@ -377,25 +383,24 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
 		cFYI(1, ("File inode"));
 		tmp_inode->i_op = &cifs_file_inode_ops;
 
-		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
-			if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 				tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
 			else
 				tmp_inode->i_fop = &cifs_file_direct_ops;
-
-		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			tmp_inode->i_fop = &cifs_file_nobrl_ops;
 		else
 			tmp_inode->i_fop = &cifs_file_ops;
 
-		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
+		if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
 		   (cifs_sb->tcon->ses->server->maxBuf <
 			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
 			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
 			tmp_inode->i_data.a_ops = &cifs_addr_ops;
 
-		if(isNewInode)
+		if (isNewInode)
 			return; /* No sense invalidating pages for new inode since we
 				   have not started caching readahead file data yet */
 
@@ -430,34 +435,28 @@ static int initiate_cifs_search(const int xid, struct file *file)
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
 
-	if(file->private_data == NULL) {
+	if (file->private_data == NULL) {
 		file->private_data =
-			kmalloc(sizeof(struct cifsFileInfo),GFP_KERNEL);
+			kzalloc(sizeof(struct cifsFileInfo),GFP_KERNEL);
 	}
 
-	if(file->private_data == NULL) {
+	if (file->private_data == NULL)
 		return -ENOMEM;
-	} else {
-		memset(file->private_data,0,sizeof(struct cifsFileInfo));
-	}
 	cifsFile = file->private_data;
 	cifsFile->invalidHandle = TRUE;
 	cifsFile->srch_inf.endOfSearch = FALSE;
 
-	if(file->f_path.dentry == NULL)
-		return -ENOENT;
-
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	if(cifs_sb == NULL)
+	if (cifs_sb == NULL)
 		return -EINVAL;
 
 	pTcon = cifs_sb->tcon;
-	if(pTcon == NULL)
+	if (pTcon == NULL)
 		return -EINVAL;
 
 	full_path = build_path_from_dentry(file->f_path.dentry);
 
-	if(full_path == NULL) {
+	if (full_path == NULL) {
 		return -ENOMEM;
 	}
 
@@ -480,9 +479,9 @@ ffirst_retry:
 		&cifsFile->netfid, &cifsFile->srch_inf,
 		cifs_sb->mnt_cifs_flags &
 			CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb));
-	if(rc == 0)
+	if (rc == 0)
 		cifsFile->invalidHandle = FALSE;
-	if((rc == -EOPNOTSUPP) &&
+	if ((rc == -EOPNOTSUPP) &&
 		(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
 		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
 		goto ffirst_retry;
@@ -498,7 +497,7 @@ static int cifs_unicode_bytelen(char *str)
 	__le16 * ustr = (__le16 *)str;
 
 	for(len=0;len <= PATH_MAX;len++) {
-		if(ustr[len] == 0)
+		if (ustr[len] == 0)
 			return len << 1;
 	}
 	cFYI(1,("Unicode string longer than PATH_MAX found"));
@@ -510,7 +509,7 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 	char * new_entry;
 	FILE_DIRECTORY_INFO * pDirInfo = (FILE_DIRECTORY_INFO *)old_entry;
 
-	if(level == SMB_FIND_FILE_INFO_STANDARD) {
+	if (level == SMB_FIND_FILE_INFO_STANDARD) {
 		FIND_FILE_STANDARD_INFO * pfData;
 		pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo;
 
@@ -520,12 +519,12 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 	new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
 	cFYI(1,("new entry %p old entry %p",new_entry,old_entry));
 	/* validate that new_entry is not past end of SMB */
-	if(new_entry >= end_of_smb) {
+	if (new_entry >= end_of_smb) {
 		cERROR(1,
 		      ("search entry %p began after end of SMB %p old entry %p",
 			new_entry, end_of_smb, old_entry));
 		return NULL;
-	} else if(((level == SMB_FIND_FILE_INFO_STANDARD) &&
+	} else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
 		  (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) ||
 		  ((level != SMB_FIND_FILE_INFO_STANDARD) &&
 		   (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) {
@@ -546,39 +545,39 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
 	char * filename = NULL;
 	int len = 0;
 
-	if(cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
+	if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
 		FILE_UNIX_INFO * pFindData = (FILE_UNIX_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
-		if(cfile->srch_inf.unicode) {
+		if (cfile->srch_inf.unicode) {
 			len = cifs_unicode_bytelen(filename);
 		} else {
 			/* BB should we make this strnlen of PATH_MAX? */
 			len = strnlen(filename, 5);
 		}
-	} else if(cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
+	} else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
 		FILE_DIRECTORY_INFO * pFindData =
 			(FILE_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level ==
+	} else if (cfile->srch_inf.info_level ==
 			SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
 		FILE_FULL_DIRECTORY_INFO * pFindData =
 			(FILE_FULL_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level ==
+	} else if (cfile->srch_inf.info_level ==
 			SMB_FIND_FILE_ID_FULL_DIR_INFO) {
 		SEARCH_ID_FULL_DIR_INFO * pFindData =
 			(SEARCH_ID_FULL_DIR_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level ==
+	} else if (cfile->srch_inf.info_level ==
 			SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
 		FILE_BOTH_DIRECTORY_INFO * pFindData =
 			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
+	} else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
 		FIND_FILE_STANDARD_INFO * pFindData =
 			(FIND_FILE_STANDARD_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
@@ -587,25 +586,25 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
 		cFYI(1,("Unknown findfirst level %d",cfile->srch_inf.info_level));
 	}
 
-	if(filename) {
-		if(cfile->srch_inf.unicode) {
+	if (filename) {
+		if (cfile->srch_inf.unicode) {
 			__le16 *ufilename = (__le16 *)filename;
-			if(len == 2) {
+			if (len == 2) {
 				/* check for . */
-				if(ufilename[0] == UNICODE_DOT)
+				if (ufilename[0] == UNICODE_DOT)
 					rc = 1;
-			} else if(len == 4) {
+			} else if (len == 4) {
 				/* check for .. */
-				if((ufilename[0] == UNICODE_DOT)
+				if ((ufilename[0] == UNICODE_DOT)
 				   &&(ufilename[1] == UNICODE_DOT))
 					rc = 2;
 			}
 		} else /* ASCII */ {
-			if(len == 1) {
-				if(filename[0] == '.')
+			if (len == 1) {
+				if (filename[0] == '.')
 					rc = 1;
-			} else if(len == 2) {
+			} else if (len == 2) {
 				if((filename[0] == '.') && (filename[1] == '.'))
 					rc = 2;
 			}
 		}
@@ -618,20 +617,10 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
    whether we can use the cached search results from the previous search */
 static int is_dir_changed(struct file * file)
 {
-	struct inode * inode;
-	struct cifsInodeInfo *cifsInfo;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
 
-	if(file->f_path.dentry == NULL)
-		return 0;
-
-	inode = file->f_path.dentry->d_inode;
-
-	if(inode == NULL)
-		return 0;
-
-	cifsInfo = CIFS_I(inode);
-
-	if(cifsInfo->time == 0)
+	if (cifsInfo->time == 0)
 		return 1; /* directory was changed, perhaps due to unlink */
 	else
 		return 0;
@@ -654,7 +643,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	struct cifsFileInfo * cifsFile = file->private_data;
 	/* check if index in the buffer */
 
-	if((cifsFile == NULL) || (ppCurrentEntry == NULL) ||
+	if ((cifsFile == NULL) || (ppCurrentEntry == NULL) ||
 	   (num_to_ret == NULL))
 		return -ENOENT;
 
@@ -672,7 +661,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 #ifdef CONFIG_CIFS_DEBUG2
 	dump_cifs_file_struct(file, "In fce ");
 #endif
-	if(((index_to_find < cifsFile->srch_inf.index_of_last_entry) &&
+	if (((index_to_find < cifsFile->srch_inf.index_of_last_entry) &&
 	     is_dir_changed(file)) ||
 	   (index_to_find < first_entry_in_buffer)) {
 		/* close and restart search */
@@ -681,9 +670,9 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		CIFSFindClose(xid, pTcon, cifsFile->netfid);
 		kfree(cifsFile->search_resume_name);
 		cifsFile->search_resume_name = NULL;
-		if(cifsFile->srch_inf.ntwrk_buf_start) {
+		if (cifsFile->srch_inf.ntwrk_buf_start) {
 			cFYI(1,("freeing SMB ff cache buf on search rewind"));
-			if(cifsFile->srch_inf.smallBuf)
+			if (cifsFile->srch_inf.smallBuf)
 				cifs_small_buf_release(cifsFile->srch_inf.
 						ntwrk_buf_start);
 			else
@@ -691,7 +680,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 						ntwrk_buf_start);
 		}
 		rc = initiate_cifs_search(xid,file);
-		if(rc) {
+		if (rc) {
 			cFYI(1,("error %d reinitiating a search on rewind",rc));
 			return rc;
 		}
@@ -702,10 +691,10 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		cFYI(1,("calling findnext2"));
 		rc = CIFSFindNext(xid,pTcon,cifsFile->netfid,
 				  &cifsFile->srch_inf);
-		if(rc)
+		if (rc)
 			return -ENOENT;
 	}
-	if(index_to_find < cifsFile->srch_inf.index_of_last_entry) {
+	if (index_to_find < cifsFile->srch_inf.index_of_last_entry) {
 		/* we found the buffer that contains the entry */
 		/* scan and find it */
 		int i;
@@ -851,9 +840,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	if((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL))
 		return -ENOENT;
 
-	if(file->f_path.dentry == NULL)
-		return -ENOENT;
-
 	rc = cifs_entry_is_dot(pfindEntry,pCifsF);
 	/* skip . and .. since we added them first */
 	if(rc != 0)
@@ -997,11 +983,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 
 	xid = GetXid();
 
-	if(file->f_path.dentry == NULL) {
-		FreeXid(xid);
-		return -EIO;
-	}
-
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	pTcon = cifs_sb->tcon;
 	if(pTcon == NULL)
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 614175a3b02e..0aaff3651d14 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -62,8 +62,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct coda_inode_info *ei = (struct coda_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
 
diff --git a/fs/compat.c b/fs/compat.c
index 040a8be38a48..72e5e6923828 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -371,13 +371,14 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
 		fn = "?";
 	}
 
-	sprintf(buf,"'%c'", (cmd>>24) & 0x3f);
+	sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK);
 	if (!isprint(buf[1]))
 		sprintf(buf, "%02x", buf[1]);
 	compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
-			"cmd(%08x){%s} arg(%08x) on %s\n",
+			"cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n",
 			current->comm, current->pid,
 			(int)fd, (unsigned int)cmd, buf,
+			(cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
 			(unsigned int)arg, fn);
 
 	if (path)
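
The switch from the hand-rolled "(cmd>>24) & 0x3f" to the _IOC_* macros matters because the hard-coded shift and mask did not match how <asm/ioctl.h> actually packs a command number. A sketch of the standard decomposition using the same macros the message now uses (field widths vary slightly on a few architectures):

/* Illustrative decoder, not kernel code. */
static void decode_ioctl_cmd(unsigned int cmd)
{
	unsigned int dir  = (cmd >> _IOC_DIRSHIFT)  & _IOC_DIRMASK;
	unsigned int size = (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK;
	unsigned int type = (cmd >> _IOC_TYPESHIFT) & _IOC_TYPEMASK;
	unsigned int nr   = (cmd >> _IOC_NRSHIFT)   & _IOC_NRMASK;

	printk(KERN_DEBUG "ioctl dir %u type '%c' nr %u size %u\n",
	       dir, type, nr, size);
}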
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 8b1c5d8bf4ef..464c04a9541d 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -266,6 +266,23 @@ static int do_siocgstamp(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
+static int do_siocgstampns(unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+	struct compat_timespec __user *up = compat_ptr(arg);
+	struct timespec kts;
+	mm_segment_t old_fs = get_fs();
+	int err;
+
+	set_fs(KERNEL_DS);
+	err = sys_ioctl(fd, cmd, (unsigned long)&kts);
+	set_fs(old_fs);
+	if (!err) {
+		err = put_user(kts.tv_sec, &up->tv_sec);
+		err |= __put_user(kts.tv_nsec, &up->tv_nsec);
+	}
+	return err;
+}
+
 struct ifmap32 {
 	compat_ulong_t mem_start;
 	compat_ulong_t mem_end;
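
do_siocgstampns() above follows the standard compat pattern: run the native ioctl against a kernel struct timespec under set_fs(KERNEL_DS), then copy the result field by field into the 32-bit compat_timespec layout. From user space the effect is simply that SIOCGSTAMPNS now works for 32-bit programs on 64-bit kernels; a minimal caller (ordinary user-space C, for illustration):

#include <stdio.h>
#include <sys/ioctl.h>
#include <time.h>
#include <linux/sockios.h>

/* Print the receive timestamp of the last packet on a socket with
 * nanosecond resolution (valid after at least one packet arrived). */
static void print_rx_timestamp(int sock)
{
	struct timespec ts;

	if (ioctl(sock, SIOCGSTAMPNS, &ts) == 0)
		printf("last packet: %ld.%09ld\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	else
		perror("SIOCGSTAMPNS");
}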
@@ -2379,6 +2396,14 @@ lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 #define ULONG_IOCTL(cmd) \
 	{ (cmd), (ioctl_trans_handler_t)sys_ioctl },
 
+/* ioctl should not be warned about even if it's not implemented.
+   Valid reasons to use this:
+   - It is implemented with ->compat_ioctl on some device, but programs
+   call it on others too.
+   - The ioctl is not implemented in the native kernel, but programs
+   call it commonly anyways.
+   Most other reasons are not valid. */
+#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd)
 
 struct ioctl_trans ioctl_start[] = {
 #include <linux/compat_ioctl.h>
@@ -2437,6 +2462,7 @@ HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc)
 /* Note SIOCRTMSG is no longer, so this is safe and * the user would have seen just an -EINVAL anyways. */
 HANDLE_IOCTL(SIOCRTMSG, ret_einval)
 HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
+HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
 #endif
 #ifdef CONFIG_BLOCK
 HANDLE_IOCTL(HDIO_GETGEO, hdio_getgeo)
@@ -2576,6 +2602,8 @@ HANDLE_IOCTL(SIOCGIWENCODEEXT, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIWPMKSA, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl)
 HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl)
+/* Not implemented in the native kernel */
+IGNORE_IOCTL(SIOCGIFCOUNT)
 HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl)
 HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl)
 HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl)
@@ -2599,6 +2627,15 @@ COMPATIBLE_IOCTL(LPRESET)
 /*LPGETSTATS not implemented, but no kernels seem to compile it in anyways*/
 COMPATIBLE_IOCTL(LPGETFLAGS)
 HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
+
+/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
+   but we don't want warnings on other file systems. So declare
+   them as compatible here. */
+#define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2])
+#define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2])
+
+IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
+IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
 };
 
 int ioctl_table_size = ARRAY_SIZE(ioctl_start);
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 6f573004cd7d..b00d962de833 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -140,7 +140,7 @@ static int __init configfs_init(void)
 	if (!configfs_dir_cachep)
 		goto out;
 
-	kset_set_kset_s(&config_subsys, kernel_subsys);
+	kobj_set_kset_s(&config_subsys, kernel_subsys);
 	err = subsystem_register(&config_subsys);
 	if (err) {
 		kmem_cache_destroy(configfs_dir_cachep);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index facd0c89be8f..3d194a2be3f5 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -180,7 +180,8 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 			struct page *page = NULL;
 
 			if (blocknr + i < devsize) {
-				page = read_mapping_page(mapping, blocknr + i, NULL);
+				page = read_mapping_page_async(mapping, blocknr + i,
+					NULL);
 				/* synchronous error? */
 				if (IS_ERR(page))
 					page = NULL;
diff --git a/fs/dcache.c b/fs/dcache.c
index d68631f18df1..d1bf5d8aeb5a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2052,12 +2052,8 @@ static void __init dcache_init(unsigned long mempages)
 	 * but it is probably not worth it because of the cache nature
 	 * of the dcache.
 	 */
-	dentry_cache = kmem_cache_create("dentry_cache",
-					 sizeof(struct dentry),
-					 0,
-					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
-					 SLAB_MEM_SPREAD),
-					 NULL, NULL);
+	dentry_cache = KMEM_CACHE(dentry,
+		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
 
 	set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);
 
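
KMEM_CACHE() is the then-new shorthand from <linux/slab.h>; it derives the cache name, object size and alignment from the struct type itself, which is why the open-coded arguments could be dropped. Its expansion is approximately the following (the _SKETCH name marks this as an illustration, not the real macro):

/* Approximate expansion for reference; this sketch mirrors the
 * six-argument kmem_cache_create() of this kernel generation. */
#define KMEM_CACHE_SKETCH(__struct, __flags)				\
	kmem_cache_create(#__struct, sizeof(struct __struct),		\
			  __alignof__(struct __struct), (__flags),	\
			  NULL, NULL)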
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 682f928b7f4d..2e124e0075c5 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -179,6 +179,48 @@ struct dentry *debugfs_create_u32(const char *name, mode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u32);
 
+static void debugfs_u64_set(void *data, u64 val)
+{
+	*(u64 *)data = val;
+}
+
+static u64 debugfs_u64_get(void *data)
+{
+	return *(u64 *)data;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
+
+/**
+ * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value.  If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.  It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+				 struct dentry *parent, u64 *value)
+{
+	return debugfs_create_file(name, mode, parent, value, &fops_u64);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_u64);
+
 static ssize_t read_file_bool(struct file *file, char __user *user_buf,
 			      size_t count, loff_t *ppos)
 {
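
Usage sketch for the new debugfs_create_u64() (module, directory and variable names below are made up for illustration); the file reads and writes the u64 directly through the fops_u64 attribute defined above:

static u64 my_counter;
static struct dentry *my_dir, *my_file;

static int __init my_debugfs_init(void)
{
	my_dir = debugfs_create_dir("mydrv", NULL);
	if (!my_dir)
		return -ENODEV;
	/* world-readable, owner-writable 64-bit counter */
	my_file = debugfs_create_u64("counter", 0644, my_dir, &my_counter);
	return my_file ? 0 : -ENODEV;
}

static void __exit my_debugfs_exit(void)
{
	debugfs_remove(my_file);	/* files first, then the directory */
	debugfs_remove(my_dir);
}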
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 7b324cfebcb1..ec8896b264de 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -374,7 +374,7 @@ static int __init debugfs_init(void)
 {
 	int retval;
 
-	kset_set_kset_s(&debug_subsys, kernel_subsys);
+	kobj_set_kset_s(&debug_subsys, kernel_subsys);
 	retval = subsystem_register(&debug_subsys);
 	if (retval)
 		return retval;
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 6fa7b0d5c043..69a94690e493 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -3,36 +3,19 @@ menu "Distributed Lock Manager"
 
 config DLM
 	tristate "Distributed Lock Manager (DLM)"
-	depends on SYSFS && (IPV6 || IPV6=n)
+	depends on IPV6 || IPV6=n
 	select CONFIGFS_FS
-	select IP_SCTP if DLM_SCTP
+	select IP_SCTP
 	help
 	  A general purpose distributed lock manager for kernel or userspace
 	  applications.
-
-choice
-	prompt "Select DLM communications protocol"
-	depends on DLM
-	default DLM_TCP
-	help
-	  The DLM Can use TCP or SCTP for it's network communications.
-	  SCTP supports multi-homed operations whereas TCP doesn't.
-	  However, SCTP seems to have stability problems at the moment.
-
-config DLM_TCP
-	bool "TCP/IP"
-
-config DLM_SCTP
-	bool "SCTP"
-
-endchoice
 
 config DLM_DEBUG
 	bool "DLM debugging"
 	depends on DLM
 	help
 	  Under the debugfs mount point, the name of each lockspace will
 	  appear as a file in the "dlm" directory.  The output is the
 	  list of resource and locks the local node knows about.
 
 endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index 65388944eba0..604cf7dc5f39 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -8,14 +8,12 @@ dlm-y := ast.o \
 				member.o \
 				memory.o \
 				midcomms.o \
+				lowcomms.o \
 				rcom.o \
 				recover.o \
 				recoverd.o \
 				requestqueue.o \
 				user.o \
 				util.o
 dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
 
-dlm-$(CONFIG_DLM_TCP) += lowcomms-tcp.o
-
-dlm-$(CONFIG_DLM_SCTP) += lowcomms-sctp.o
\ No newline at end of file
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index f91d39cb1e0b..6308122890ca 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -14,6 +14,7 @@
 #include "dlm_internal.h"
 #include "lock.h"
 #include "user.h"
+#include "ast.h"
 
 #define WAKE_ASTS  0
 
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 8665c88e5af2..822abdcd1434 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -89,6 +89,7 @@ struct cluster {
89 unsigned int cl_toss_secs; 89 unsigned int cl_toss_secs;
90 unsigned int cl_scan_secs; 90 unsigned int cl_scan_secs;
91 unsigned int cl_log_debug; 91 unsigned int cl_log_debug;
92 unsigned int cl_protocol;
92}; 93};
93 94
94enum { 95enum {
@@ -101,6 +102,7 @@ enum {
101 CLUSTER_ATTR_TOSS_SECS, 102 CLUSTER_ATTR_TOSS_SECS,
102 CLUSTER_ATTR_SCAN_SECS, 103 CLUSTER_ATTR_SCAN_SECS,
103 CLUSTER_ATTR_LOG_DEBUG, 104 CLUSTER_ATTR_LOG_DEBUG,
105 CLUSTER_ATTR_PROTOCOL,
104}; 106};
105 107
106struct cluster_attribute { 108struct cluster_attribute {
@@ -159,6 +161,7 @@ CLUSTER_ATTR(recover_timer, 1);
159CLUSTER_ATTR(toss_secs, 1); 161CLUSTER_ATTR(toss_secs, 1);
160CLUSTER_ATTR(scan_secs, 1); 162CLUSTER_ATTR(scan_secs, 1);
161CLUSTER_ATTR(log_debug, 0); 163CLUSTER_ATTR(log_debug, 0);
164CLUSTER_ATTR(protocol, 0);
162 165
163static struct configfs_attribute *cluster_attrs[] = { 166static struct configfs_attribute *cluster_attrs[] = {
164 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, 167 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -170,6 +173,7 @@ static struct configfs_attribute *cluster_attrs[] = {
170 [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr, 173 [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
171 [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr, 174 [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
172 [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, 175 [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
176 [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
173 NULL, 177 NULL,
174}; 178};
175 179
@@ -904,6 +908,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
904#define DEFAULT_TOSS_SECS 10 908#define DEFAULT_TOSS_SECS 10
905#define DEFAULT_SCAN_SECS 5 909#define DEFAULT_SCAN_SECS 5
906#define DEFAULT_LOG_DEBUG 0 910#define DEFAULT_LOG_DEBUG 0
911#define DEFAULT_PROTOCOL 0
907 912
908struct dlm_config_info dlm_config = { 913struct dlm_config_info dlm_config = {
909 .ci_tcp_port = DEFAULT_TCP_PORT, 914 .ci_tcp_port = DEFAULT_TCP_PORT,
@@ -914,6 +919,7 @@ struct dlm_config_info dlm_config = {
914 .ci_recover_timer = DEFAULT_RECOVER_TIMER, 919 .ci_recover_timer = DEFAULT_RECOVER_TIMER,
915 .ci_toss_secs = DEFAULT_TOSS_SECS, 920 .ci_toss_secs = DEFAULT_TOSS_SECS,
916 .ci_scan_secs = DEFAULT_SCAN_SECS, 921 .ci_scan_secs = DEFAULT_SCAN_SECS,
917 .ci_log_debug = DEFAULT_LOG_DEBUG 922 .ci_log_debug = DEFAULT_LOG_DEBUG,
923 .ci_protocol = DEFAULT_PROTOCOL
918}; 924};
919 925
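
The new ci_protocol knob is the runtime replacement for the compile-time TCP/SCTP choice removed from Kconfig above: 0 (the default) keeps TCP and nonzero selects SCTP, set from userspace through the cluster configfs attribute before the lockspace starts. A minimal sketch of the consumer in the merged lowcomms.c, reconstructed for illustration rather than quoted from the patch:

/* sketch: dispatch on dlm_config.ci_protocol at listen time;
 * tcp_listen_for_all()/sctp_listen_for_all() are assumed to be the
 * per-transport entry points in the merged lowcomms.c */
static int listen_for_all(void)
{
	if (dlm_config.ci_protocol == 0)
		return tcp_listen_for_all();	/* default: TCP */
	return sctp_listen_for_all();		/* nonzero: SCTP */
}
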
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 1e978611a96e..967cc3d72e5e 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -26,6 +26,7 @@ struct dlm_config_info {
26 int ci_toss_secs; 26 int ci_toss_secs;
27 int ci_scan_secs; 27 int ci_scan_secs;
28 int ci_log_debug; 28 int ci_log_debug;
29 int ci_protocol;
29}; 30};
30 31
31extern struct dlm_config_info dlm_config; 32extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 61d93201e1b2..30994d68f6a0 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -210,6 +210,9 @@ struct dlm_args {
210#define DLM_IFL_MSTCPY 0x00010000 210#define DLM_IFL_MSTCPY 0x00010000
211#define DLM_IFL_RESEND 0x00020000 211#define DLM_IFL_RESEND 0x00020000
212#define DLM_IFL_DEAD 0x00040000 212#define DLM_IFL_DEAD 0x00040000
213#define DLM_IFL_OVERLAP_UNLOCK 0x00080000
214#define DLM_IFL_OVERLAP_CANCEL 0x00100000
215#define DLM_IFL_ENDOFLIFE 0x00200000
213#define DLM_IFL_USER 0x00000001 216#define DLM_IFL_USER 0x00000001
214#define DLM_IFL_ORPHAN 0x00000002 217#define DLM_IFL_ORPHAN 0x00000002
215 218
@@ -230,8 +233,8 @@ struct dlm_lkb {
230 int8_t lkb_grmode; /* granted lock mode */ 233 int8_t lkb_grmode; /* granted lock mode */
231 int8_t lkb_bastmode; /* requested mode */ 234 int8_t lkb_bastmode; /* requested mode */
232 int8_t lkb_highbast; /* highest mode bast sent for */ 235 int8_t lkb_highbast; /* highest mode bast sent for */
233
234 int8_t lkb_wait_type; /* type of reply waiting for */ 236 int8_t lkb_wait_type; /* type of reply waiting for */
237 int8_t lkb_wait_count;
235 int8_t lkb_ast_type; /* type of ast queued for */ 238 int8_t lkb_ast_type; /* type of ast queued for */
236 239
237 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ 240 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
@@ -339,6 +342,7 @@ struct dlm_header {
339#define DLM_MSG_LOOKUP 11 342#define DLM_MSG_LOOKUP 11
340#define DLM_MSG_REMOVE 12 343#define DLM_MSG_REMOVE 12
341#define DLM_MSG_LOOKUP_REPLY 13 344#define DLM_MSG_LOOKUP_REPLY 13
345#define DLM_MSG_PURGE 14
342 346
343struct dlm_message { 347struct dlm_message {
344 struct dlm_header m_header; 348 struct dlm_header m_header;
@@ -440,6 +444,9 @@ struct dlm_ls {
440 struct mutex ls_waiters_mutex; 444 struct mutex ls_waiters_mutex;
441 struct list_head ls_waiters; /* lkbs needing a reply */ 445 struct list_head ls_waiters; /* lkbs needing a reply */
442 446
447 struct mutex ls_orphans_mutex;
448 struct list_head ls_orphans;
449
443 struct list_head ls_nodes; /* current nodes in ls */ 450 struct list_head ls_nodes; /* current nodes in ls */
444 struct list_head ls_nodes_gone; /* dead node list, recovery */ 451 struct list_head ls_nodes_gone; /* dead node list, recovery */
445 int ls_num_nodes; /* number of nodes in ls */ 452 int ls_num_nodes; /* number of nodes in ls */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e725005fafd0..d8d6e729f96b 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -85,6 +85,7 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
85static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, 85static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
86 struct dlm_message *ms); 86 struct dlm_message *ms);
87static int receive_extralen(struct dlm_message *ms); 87static int receive_extralen(struct dlm_message *ms);
88static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
88 89
89/* 90/*
90 * Lock compatibility matrix - thanks Steve 91
@@ -223,6 +224,16 @@ static inline int is_demoted(struct dlm_lkb *lkb)
223 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED); 224 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
224} 225}
225 226
227static inline int is_altmode(struct dlm_lkb *lkb)
228{
229 return (lkb->lkb_sbflags & DLM_SBF_ALTMODE);
230}
231
232static inline int is_granted(struct dlm_lkb *lkb)
233{
234 return (lkb->lkb_status == DLM_LKSTS_GRANTED);
235}
236
226static inline int is_remote(struct dlm_rsb *r) 237static inline int is_remote(struct dlm_rsb *r)
227{ 238{
228 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r);); 239 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
@@ -254,6 +265,22 @@ static inline int down_conversion(struct dlm_lkb *lkb)
254 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode); 265 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
255} 266}
256 267
268static inline int is_overlap_unlock(struct dlm_lkb *lkb)
269{
270 return lkb->lkb_flags & DLM_IFL_OVERLAP_UNLOCK;
271}
272
273static inline int is_overlap_cancel(struct dlm_lkb *lkb)
274{
275 return lkb->lkb_flags & DLM_IFL_OVERLAP_CANCEL;
276}
277
278static inline int is_overlap(struct dlm_lkb *lkb)
279{
280 return (lkb->lkb_flags & (DLM_IFL_OVERLAP_UNLOCK |
281 DLM_IFL_OVERLAP_CANCEL));
282}
283
257static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) 284static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
258{ 285{
259 if (is_master_copy(lkb)) 286 if (is_master_copy(lkb))
@@ -267,6 +294,12 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
267 dlm_add_ast(lkb, AST_COMP); 294 dlm_add_ast(lkb, AST_COMP);
268} 295}
269 296
297static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
298{
299 queue_cast(r, lkb,
300 is_overlap_unlock(lkb) ? -DLM_EUNLOCK : -DLM_ECANCEL);
301}
302
270static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) 303static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
271{ 304{
272 if (is_master_copy(lkb)) 305 if (is_master_copy(lkb))
@@ -547,6 +580,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
547 lkb->lkb_grmode = DLM_LOCK_IV; 580 lkb->lkb_grmode = DLM_LOCK_IV;
548 kref_init(&lkb->lkb_ref); 581 kref_init(&lkb->lkb_ref);
549 INIT_LIST_HEAD(&lkb->lkb_ownqueue); 582 INIT_LIST_HEAD(&lkb->lkb_ownqueue);
583 INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
550 584
551 get_random_bytes(&bucket, sizeof(bucket)); 585 get_random_bytes(&bucket, sizeof(bucket));
552 bucket &= (ls->ls_lkbtbl_size - 1); 586 bucket &= (ls->ls_lkbtbl_size - 1);
@@ -556,7 +590,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
556 /* counter can roll over so we must verify lkid is not in use */ 590 /* counter can roll over so we must verify lkid is not in use */
557 591
558 while (lkid == 0) { 592 while (lkid == 0) {
559 lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16); 593 lkid = (bucket << 16) | ls->ls_lkbtbl[bucket].counter++;
560 594
561 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list, 595 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
562 lkb_idtbl_list) { 596 lkb_idtbl_list) {
@@ -577,8 +611,8 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
577 611
578static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid) 612static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
579{ 613{
580 uint16_t bucket = lkid & 0xFFFF;
581 struct dlm_lkb *lkb; 614 struct dlm_lkb *lkb;
615 uint16_t bucket = (lkid >> 16);
582 616
583 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) { 617 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
584 if (lkb->lkb_id == lkid) 618 if (lkb->lkb_id == lkid)
@@ -590,7 +624,7 @@ static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
590static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret) 624static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
591{ 625{
592 struct dlm_lkb *lkb; 626 struct dlm_lkb *lkb;
593 uint16_t bucket = lkid & 0xFFFF; 627 uint16_t bucket = (lkid >> 16);
594 628
595 if (bucket >= ls->ls_lkbtbl_size) 629 if (bucket >= ls->ls_lkbtbl_size)
596 return -EBADSLT; 630 return -EBADSLT;
@@ -620,7 +654,7 @@ static void kill_lkb(struct kref *kref)
620 654
621static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) 655static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
622{ 656{
623 uint16_t bucket = lkb->lkb_id & 0xFFFF; 657 uint16_t bucket = (lkb->lkb_id >> 16);
624 658
625 write_lock(&ls->ls_lkbtbl[bucket].lock); 659 write_lock(&ls->ls_lkbtbl[bucket].lock);
626 if (kref_put(&lkb->lkb_ref, kill_lkb)) { 660 if (kref_put(&lkb->lkb_ref, kill_lkb)) {
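
The three lkid hunks above flip the lock id layout: the hash bucket moves from the low 16 bits to the high 16 bits, the per-bucket counter takes the low half, and create_lkb()/__find_lkb()/__put_lkb() all agree on the new decoding. Hypothetical helpers restating both layouts (these names are illustrations, not part of the patch):

/* illustrative only: lkid layout before and after this change */
static inline uint32_t make_lkid_old(uint16_t bucket, uint16_t counter)
{
	return bucket | ((uint32_t)counter << 16);  /* bucket in low half */
}

static inline uint32_t make_lkid_new(uint16_t bucket, uint16_t counter)
{
	return ((uint32_t)bucket << 16) | counter;  /* bucket in high half */
}

static inline uint16_t lkid_bucket_new(uint32_t lkid)
{
	return lkid >> 16;                          /* was: lkid & 0xFFFF */
}
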
@@ -735,23 +769,75 @@ static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
735 unhold_lkb(lkb); 769 unhold_lkb(lkb);
736} 770}
737 771
772static int msg_reply_type(int mstype)
773{
774 switch (mstype) {
775 case DLM_MSG_REQUEST:
776 return DLM_MSG_REQUEST_REPLY;
777 case DLM_MSG_CONVERT:
778 return DLM_MSG_CONVERT_REPLY;
779 case DLM_MSG_UNLOCK:
780 return DLM_MSG_UNLOCK_REPLY;
781 case DLM_MSG_CANCEL:
782 return DLM_MSG_CANCEL_REPLY;
783 case DLM_MSG_LOOKUP:
784 return DLM_MSG_LOOKUP_REPLY;
785 }
786 return -1;
787}
788
738/* add/remove lkb from global waiters list of lkb's waiting for 789/* add/remove lkb from global waiters list of lkb's waiting for
739 a reply from a remote node */ 790 a reply from a remote node */
740 791
741static void add_to_waiters(struct dlm_lkb *lkb, int mstype) 792static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
742{ 793{
743 struct dlm_ls *ls = lkb->lkb_resource->res_ls; 794 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
795 int error = 0;
744 796
745 mutex_lock(&ls->ls_waiters_mutex); 797 mutex_lock(&ls->ls_waiters_mutex);
746 if (lkb->lkb_wait_type) { 798
747 log_print("add_to_waiters error %d", lkb->lkb_wait_type); 799 if (is_overlap_unlock(lkb) ||
800 (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) {
801 error = -EINVAL;
802 goto out;
803 }
804
805 if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) {
806 switch (mstype) {
807 case DLM_MSG_UNLOCK:
808 lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
809 break;
810 case DLM_MSG_CANCEL:
811 lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
812 break;
813 default:
814 error = -EBUSY;
815 goto out;
816 }
817 lkb->lkb_wait_count++;
818 hold_lkb(lkb);
819
820 log_debug(ls, "add overlap %x cur %d new %d count %d flags %x",
821 lkb->lkb_id, lkb->lkb_wait_type, mstype,
822 lkb->lkb_wait_count, lkb->lkb_flags);
748 goto out; 823 goto out;
749 } 824 }
825
826 DLM_ASSERT(!lkb->lkb_wait_count,
827 dlm_print_lkb(lkb);
828 printk("wait_count %d\n", lkb->lkb_wait_count););
829
830 lkb->lkb_wait_count++;
750 lkb->lkb_wait_type = mstype; 831 lkb->lkb_wait_type = mstype;
751 kref_get(&lkb->lkb_ref); 832 hold_lkb(lkb);
752 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); 833 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
753 out: 834 out:
835 if (error)
836 log_error(ls, "add_to_waiters %x error %d flags %x %d %d %s",
837 lkb->lkb_id, error, lkb->lkb_flags, mstype,
838 lkb->lkb_wait_type, lkb->lkb_resource->res_name);
754 mutex_unlock(&ls->ls_waiters_mutex); 839 mutex_unlock(&ls->ls_waiters_mutex);
840 return error;
755} 841}
756 842
757/* We clear the RESEND flag because we might be taking an lkb off the waiters 843/* We clear the RESEND flag because we might be taking an lkb off the waiters
@@ -759,34 +845,85 @@ static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
759 request reply on the requestqueue) between dlm_recover_waiters_pre() which 845 request reply on the requestqueue) between dlm_recover_waiters_pre() which
760 set RESEND and dlm_recover_waiters_post() */ 846 set RESEND and dlm_recover_waiters_post() */
761 847
762static int _remove_from_waiters(struct dlm_lkb *lkb) 848static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype)
763{ 849{
764 int error = 0; 850 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
851 int overlap_done = 0;
765 852
766 if (!lkb->lkb_wait_type) { 853 if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) {
767 log_print("remove_from_waiters error"); 854 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
768 error = -EINVAL; 855 overlap_done = 1;
769 goto out; 856 goto out_del;
857 }
858
859 if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) {
860 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
861 overlap_done = 1;
862 goto out_del;
863 }
864
865 /* N.B. type of reply may not always correspond to type of original
866 msg due to lookup->request optimization, verify others? */
867
868 if (lkb->lkb_wait_type) {
869 lkb->lkb_wait_type = 0;
870 goto out_del;
871 }
872
873 log_error(ls, "remove_from_waiters lkid %x flags %x types %d %d",
874 lkb->lkb_id, lkb->lkb_flags, mstype, lkb->lkb_wait_type);
875 return -1;
876
877 out_del:
878 /* the force-unlock/cancel has completed and we haven't recvd a reply
879 to the op that was in progress prior to the unlock/cancel; we
880 give up on any reply to the earlier op. FIXME: not sure when/how
881 this would happen */
882
883 if (overlap_done && lkb->lkb_wait_type) {
884 log_error(ls, "remove_from_waiters %x reply %d give up on %d",
885 lkb->lkb_id, mstype, lkb->lkb_wait_type);
886 lkb->lkb_wait_count--;
887 lkb->lkb_wait_type = 0;
770 } 888 }
771 lkb->lkb_wait_type = 0; 889
890 DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb););
891
772 lkb->lkb_flags &= ~DLM_IFL_RESEND; 892 lkb->lkb_flags &= ~DLM_IFL_RESEND;
773 list_del(&lkb->lkb_wait_reply); 893 lkb->lkb_wait_count--;
894 if (!lkb->lkb_wait_count)
895 list_del_init(&lkb->lkb_wait_reply);
774 unhold_lkb(lkb); 896 unhold_lkb(lkb);
775 out: 897 return 0;
776 return error;
777} 898}
778 899
779static int remove_from_waiters(struct dlm_lkb *lkb) 900static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
780{ 901{
781 struct dlm_ls *ls = lkb->lkb_resource->res_ls; 902 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
782 int error; 903 int error;
783 904
784 mutex_lock(&ls->ls_waiters_mutex); 905 mutex_lock(&ls->ls_waiters_mutex);
785 error = _remove_from_waiters(lkb); 906 error = _remove_from_waiters(lkb, mstype);
786 mutex_unlock(&ls->ls_waiters_mutex); 907 mutex_unlock(&ls->ls_waiters_mutex);
787 return error; 908 return error;
788} 909}
789 910
911/* Handles situations where we might be processing a "fake" or "stub" reply in
912 which we can't try to take waiters_mutex again. */
913
914static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
915{
916 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
917 int error;
918
919 if (ms != &ls->ls_stub_ms)
920 mutex_lock(&ls->ls_waiters_mutex);
921 error = _remove_from_waiters(lkb, ms->m_type);
922 if (ms != &ls->ls_stub_ms)
923 mutex_unlock(&ls->ls_waiters_mutex);
924 return error;
925}
926
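
Taken together, the reworked add_to_waiters()/_remove_from_waiters() let a single lkb wait for the reply to its original op plus one overlapping unlock and/or cancel, tracked by lkb_wait_count and the two OVERLAP flags rather than extra waiters-list entries. A simplified standalone model of that bookkeeping; the flag values mirror dlm_internal.h, everything else is invented for illustration:

#include <stdio.h>

#define DLM_IFL_OVERLAP_UNLOCK 0x00080000
#define DLM_IFL_OVERLAP_CANCEL 0x00100000

enum { MSG_REQUEST = 1, MSG_UNLOCK, MSG_CANCEL,
       MSG_REQUEST_REPLY, MSG_UNLOCK_REPLY, MSG_CANCEL_REPLY };

struct lkb { int wait_type; int wait_count; unsigned int flags; };

/* models add_to_waiters(): an unlock/cancel overlapping a pending op
 * sets a flag and bumps wait_count instead of queueing a second entry */
static int add_waiter(struct lkb *lkb, int mstype)
{
	if (lkb->wait_type) {
		if (mstype == MSG_UNLOCK)
			lkb->flags |= DLM_IFL_OVERLAP_UNLOCK;
		else if (mstype == MSG_CANCEL)
			lkb->flags |= DLM_IFL_OVERLAP_CANCEL;
		else
			return -1;	/* -EBUSY in the kernel */
		lkb->wait_count++;
		return 0;
	}
	lkb->wait_type = mstype;
	lkb->wait_count = 1;
	return 0;
}

/* models _remove_from_waiters(): each reply retires one count; the lkb
 * leaves the waiters list only when the last outstanding op is answered */
static void take_reply(struct lkb *lkb, int mstype)
{
	if (mstype == MSG_UNLOCK_REPLY &&
	    (lkb->flags & DLM_IFL_OVERLAP_UNLOCK))
		lkb->flags &= ~DLM_IFL_OVERLAP_UNLOCK;
	else if (mstype == MSG_CANCEL_REPLY &&
		 (lkb->flags & DLM_IFL_OVERLAP_CANCEL))
		lkb->flags &= ~DLM_IFL_OVERLAP_CANCEL;
	else
		lkb->wait_type = 0;
	lkb->wait_count--;
}

int main(void)
{
	struct lkb lkb = { 0, 0, 0 };

	add_waiter(&lkb, MSG_REQUEST);		/* request in flight */
	add_waiter(&lkb, MSG_CANCEL);		/* overlapping cancel */
	take_reply(&lkb, MSG_REQUEST_REPLY);	/* 2 -> 1, still waiting */
	take_reply(&lkb, MSG_CANCEL_REPLY);	/* 1 -> 0, off the list */
	printf("wait_count %d flags %x\n", lkb.wait_count, lkb.flags);
	return 0;
}
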
790static void dir_remove(struct dlm_rsb *r) 927static void dir_remove(struct dlm_rsb *r)
791{ 928{
792 int to_nodeid; 929 int to_nodeid;
@@ -988,8 +1125,14 @@ static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
988 _remove_lock(r, lkb); 1125 _remove_lock(r, lkb);
989} 1126}
990 1127
991static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 1128/* returns: 0 did nothing
1129 1 moved lock to granted
1130 -1 removed lock */
1131
1132static int revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
992{ 1133{
1134 int rv = 0;
1135
993 lkb->lkb_rqmode = DLM_LOCK_IV; 1136 lkb->lkb_rqmode = DLM_LOCK_IV;
994 1137
995 switch (lkb->lkb_status) { 1138 switch (lkb->lkb_status) {
@@ -997,6 +1140,7 @@ static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
997 break; 1140 break;
998 case DLM_LKSTS_CONVERT: 1141 case DLM_LKSTS_CONVERT:
999 move_lkb(r, lkb, DLM_LKSTS_GRANTED); 1142 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
1143 rv = 1;
1000 break; 1144 break;
1001 case DLM_LKSTS_WAITING: 1145 case DLM_LKSTS_WAITING:
1002 del_lkb(r, lkb); 1146 del_lkb(r, lkb);
@@ -1004,15 +1148,17 @@ static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1004 /* this unhold undoes the original ref from create_lkb() 1148 /* this unhold undoes the original ref from create_lkb()
1005 so this leads to the lkb being freed */ 1149 so this leads to the lkb being freed */
1006 unhold_lkb(lkb); 1150 unhold_lkb(lkb);
1151 rv = -1;
1007 break; 1152 break;
1008 default: 1153 default:
1009 log_print("invalid status for revert %d", lkb->lkb_status); 1154 log_print("invalid status for revert %d", lkb->lkb_status);
1010 } 1155 }
1156 return rv;
1011} 1157}
1012 1158
1013static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb) 1159static int revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
1014{ 1160{
1015 revert_lock(r, lkb); 1161 return revert_lock(r, lkb);
1016} 1162}
1017 1163
1018static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 1164static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -1055,6 +1201,50 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1055 queue_cast(r, lkb, 0); 1201 queue_cast(r, lkb, 0);
1056} 1202}
1057 1203
1204/* The special CONVDEADLK, ALTPR and ALTCW flags allow the master to
1205 change the granted/requested modes. We're munging things accordingly in
1206 the process copy.
1207 CONVDEADLK: our grmode may have been forced down to NL to resolve a
1208 conversion deadlock
1209 ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
1210 compatible with other granted locks */
1211
1212static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms)
1213{
1214 if (ms->m_type != DLM_MSG_CONVERT_REPLY) {
1215 log_print("munge_demoted %x invalid reply type %d",
1216 lkb->lkb_id, ms->m_type);
1217 return;
1218 }
1219
1220 if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
1221 log_print("munge_demoted %x invalid modes gr %d rq %d",
1222 lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
1223 return;
1224 }
1225
1226 lkb->lkb_grmode = DLM_LOCK_NL;
1227}
1228
1229static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms)
1230{
1231 if (ms->m_type != DLM_MSG_REQUEST_REPLY &&
1232 ms->m_type != DLM_MSG_GRANT) {
1233 log_print("munge_altmode %x invalid reply type %d",
1234 lkb->lkb_id, ms->m_type);
1235 return;
1236 }
1237
1238 if (lkb->lkb_exflags & DLM_LKF_ALTPR)
1239 lkb->lkb_rqmode = DLM_LOCK_PR;
1240 else if (lkb->lkb_exflags & DLM_LKF_ALTCW)
1241 lkb->lkb_rqmode = DLM_LOCK_CW;
1242 else {
1243 log_print("munge_altmode invalid exflags %x", lkb->lkb_exflags);
1244 dlm_print_lkb(lkb);
1245 }
1246}
1247
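
For callers, the ALTMODE munging means a request made with DLM_LKF_ALTPR or DLM_LKF_ALTCW may come back granted in the alternate mode, reported through DLM_SBF_ALTMODE in the lksb. A kernel-API sketch; the lockspace, resource name and ast are assumptions and error handling is trimmed:

#include <linux/dlm.h>

static struct dlm_lksb my_lksb;

static void my_ast(void *astarg)
{
	struct dlm_lksb *lksb = astarg;

	/* granted, but possibly in PR rather than the EX we asked for */
	if (lksb->sb_status == 0 && (lksb->sb_flags & DLM_SBF_ALTMODE))
		printk(KERN_INFO "granted in alternate mode\n");
}

static int try_ex_or_pr(dlm_lockspace_t *ls)
{
	return dlm_lock(ls, DLM_LOCK_EX, &my_lksb, DLM_LKF_ALTPR,
			"myres", 5, 0, my_ast, &my_lksb, NULL);
}
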
1058static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head) 1248static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
1059{ 1249{
1060 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, 1250 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
@@ -1499,7 +1689,7 @@ static void process_lookup_list(struct dlm_rsb *r)
1499 struct dlm_lkb *lkb, *safe; 1689 struct dlm_lkb *lkb, *safe;
1500 1690
1501 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) { 1691 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1502 list_del(&lkb->lkb_rsb_lookup); 1692 list_del_init(&lkb->lkb_rsb_lookup);
1503 _request_lock(r, lkb); 1693 _request_lock(r, lkb);
1504 schedule(); 1694 schedule();
1505 } 1695 }
@@ -1530,7 +1720,7 @@ static void confirm_master(struct dlm_rsb *r, int error)
1530 if (!list_empty(&r->res_lookup)) { 1720 if (!list_empty(&r->res_lookup)) {
1531 lkb = list_entry(r->res_lookup.next, struct dlm_lkb, 1721 lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
1532 lkb_rsb_lookup); 1722 lkb_rsb_lookup);
1533 list_del(&lkb->lkb_rsb_lookup); 1723 list_del_init(&lkb->lkb_rsb_lookup);
1534 r->res_first_lkid = lkb->lkb_id; 1724 r->res_first_lkid = lkb->lkb_id;
1535 _request_lock(r, lkb); 1725 _request_lock(r, lkb);
1536 } else 1726 } else
@@ -1614,6 +1804,9 @@ static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
1614 DLM_LKF_FORCEUNLOCK)) 1804 DLM_LKF_FORCEUNLOCK))
1615 return -EINVAL; 1805 return -EINVAL;
1616 1806
1807 if (flags & DLM_LKF_CANCEL && flags & DLM_LKF_FORCEUNLOCK)
1808 return -EINVAL;
1809
1617 args->flags = flags; 1810 args->flags = flags;
1618 args->astparam = (long) astarg; 1811 args->astparam = (long) astarg;
1619 return 0; 1812 return 0;
@@ -1638,6 +1831,9 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1638 1831
1639 if (lkb->lkb_wait_type) 1832 if (lkb->lkb_wait_type)
1640 goto out; 1833 goto out;
1834
1835 if (is_overlap(lkb))
1836 goto out;
1641 } 1837 }
1642 1838
1643 lkb->lkb_exflags = args->flags; 1839 lkb->lkb_exflags = args->flags;
@@ -1654,35 +1850,126 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1654 return rv; 1850 return rv;
1655} 1851}
1656 1852
1853/* when dlm_unlock() sees -EBUSY with CANCEL/FORCEUNLOCK it returns 0
1854 for success */
1855
1856/* note: it's valid for lkb_nodeid/res_nodeid to be -1 when we get here
1857 because there may be a lookup in progress and it's valid to do
1858 cancel/unlockf on it */
1859
1657static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) 1860static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1658{ 1861{
1862 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
1659 int rv = -EINVAL; 1863 int rv = -EINVAL;
1660 1864
1661 if (lkb->lkb_flags & DLM_IFL_MSTCPY) 1865 if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
1866 log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
1867 dlm_print_lkb(lkb);
1662 goto out; 1868 goto out;
1869 }
1663 1870
1664 if (args->flags & DLM_LKF_FORCEUNLOCK) 1871 /* an lkb may still exist even though the lock is EOL'ed due to a
1665 goto out_ok; 1872 cancel, unlock or failed noqueue request; an app can't use these
1873 locks; return same error as if the lkid had not been found at all */
1666 1874
1667 if (args->flags & DLM_LKF_CANCEL && 1875 if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
1668 lkb->lkb_status == DLM_LKSTS_GRANTED) 1876 log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
1877 rv = -ENOENT;
1669 goto out; 1878 goto out;
1879 }
1670 1880
1671 if (!(args->flags & DLM_LKF_CANCEL) && 1881 /* an lkb may be waiting for an rsb lookup to complete where the
1672 lkb->lkb_status != DLM_LKSTS_GRANTED) 1882 lookup was initiated by another lock */
1673 goto out; 1883
1884 if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
1885 if (!list_empty(&lkb->lkb_rsb_lookup)) {
1886 log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
1887 list_del_init(&lkb->lkb_rsb_lookup);
1888 queue_cast(lkb->lkb_resource, lkb,
1889 args->flags & DLM_LKF_CANCEL ?
1890 -DLM_ECANCEL : -DLM_EUNLOCK);
1891 unhold_lkb(lkb); /* undoes create_lkb() */
1892 rv = -EBUSY;
1893 goto out;
1894 }
1895 }
1896
1897 /* cancel not allowed with another cancel/unlock in progress */
1898
1899 if (args->flags & DLM_LKF_CANCEL) {
1900 if (lkb->lkb_exflags & DLM_LKF_CANCEL)
1901 goto out;
1902
1903 if (is_overlap(lkb))
1904 goto out;
1905
1906 if (lkb->lkb_flags & DLM_IFL_RESEND) {
1907 lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
1908 rv = -EBUSY;
1909 goto out;
1910 }
1911
1912 switch (lkb->lkb_wait_type) {
1913 case DLM_MSG_LOOKUP:
1914 case DLM_MSG_REQUEST:
1915 lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
1916 rv = -EBUSY;
1917 goto out;
1918 case DLM_MSG_UNLOCK:
1919 case DLM_MSG_CANCEL:
1920 goto out;
1921 }
1922 /* add_to_waiters() will set OVERLAP_CANCEL */
1923 goto out_ok;
1924 }
1925
1926 /* do we need to allow a force-unlock if there's a normal unlock
1927 already in progress? in what conditions could the normal unlock
1928 fail such that we'd want to send a force-unlock to be sure? */
1929
1930 if (args->flags & DLM_LKF_FORCEUNLOCK) {
1931 if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK)
1932 goto out;
1933
1934 if (is_overlap_unlock(lkb))
1935 goto out;
1674 1936
1937 if (lkb->lkb_flags & DLM_IFL_RESEND) {
1938 lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
1939 rv = -EBUSY;
1940 goto out;
1941 }
1942
1943 switch (lkb->lkb_wait_type) {
1944 case DLM_MSG_LOOKUP:
1945 case DLM_MSG_REQUEST:
1946 lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
1947 rv = -EBUSY;
1948 goto out;
1949 case DLM_MSG_UNLOCK:
1950 goto out;
1951 }
1952 /* add_to_waiters() will set OVERLAP_UNLOCK */
1953 goto out_ok;
1954 }
1955
1956 /* normal unlock not allowed if there's any op in progress */
1675 rv = -EBUSY; 1957 rv = -EBUSY;
1676 if (lkb->lkb_wait_type) 1958 if (lkb->lkb_wait_type || lkb->lkb_wait_count)
1677 goto out; 1959 goto out;
1678 1960
1679 out_ok: 1961 out_ok:
1680 lkb->lkb_exflags = args->flags; 1962 /* an overlapping op shouldn't blow away exflags from other op */
1963 lkb->lkb_exflags |= args->flags;
1681 lkb->lkb_sbflags = 0; 1964 lkb->lkb_sbflags = 0;
1682 lkb->lkb_astparam = args->astparam; 1965 lkb->lkb_astparam = args->astparam;
1683
1684 rv = 0; 1966 rv = 0;
1685 out: 1967 out:
1968 if (rv)
1969 log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv,
1970 lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
1971 args->flags, lkb->lkb_wait_type,
1972 lkb->lkb_resource->res_name);
1686 return rv; 1973 return rv;
1687} 1974}
1688 1975
@@ -1732,9 +2019,24 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
1732 goto out; 2019 goto out;
1733 } 2020 }
1734 2021
1735 if (can_be_queued(lkb)) { 2022 /* is_demoted() means the can_be_granted() above set the grmode
1736 if (is_demoted(lkb)) 2023 to NL, and left us on the granted queue. This auto-demotion
2024 (due to CONVDEADLK) might mean other locks, and/or this lock, are
2025 now grantable. We have to try to grant other converting locks
2026 before we try again to grant this one. */
2027
2028 if (is_demoted(lkb)) {
2029 grant_pending_convert(r, DLM_LOCK_IV);
2030 if (_can_be_granted(r, lkb, 1)) {
2031 grant_lock(r, lkb);
2032 queue_cast(r, lkb, 0);
1737 grant_pending_locks(r); 2033 grant_pending_locks(r);
2034 goto out;
2035 }
2036 /* else fall through and move to convert queue */
2037 }
2038
2039 if (can_be_queued(lkb)) {
1738 error = -EINPROGRESS; 2040 error = -EINPROGRESS;
1739 del_lkb(r, lkb); 2041 del_lkb(r, lkb);
1740 add_lkb(r, lkb, DLM_LKSTS_CONVERT); 2042 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
@@ -1759,17 +2061,19 @@ static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1759 return -DLM_EUNLOCK; 2061 return -DLM_EUNLOCK;
1760} 2062}
1761 2063
1762/* FIXME: if revert_lock() finds that the lkb is granted, we should 2064/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
1763 skip the queue_cast(ECANCEL). It indicates that the request/convert
1764 completed (and queued a normal ast) just before the cancel; we don't
1765 want to clobber the sb_result for the normal ast with ECANCEL. */
1766 2065
1767static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) 2066static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
1768{ 2067{
1769 revert_lock(r, lkb); 2068 int error;
1770 queue_cast(r, lkb, -DLM_ECANCEL); 2069
1771 grant_pending_locks(r); 2070 error = revert_lock(r, lkb);
1772 return -DLM_ECANCEL; 2071 if (error) {
2072 queue_cast(r, lkb, -DLM_ECANCEL);
2073 grant_pending_locks(r);
2074 return -DLM_ECANCEL;
2075 }
2076 return 0;
1773} 2077}
1774 2078
1775/* 2079/*
@@ -2035,6 +2339,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
2035 2339
2036 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL) 2340 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
2037 error = 0; 2341 error = 0;
2342 if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)))
2343 error = 0;
2038 out_put: 2344 out_put:
2039 dlm_put_lkb(lkb); 2345 dlm_put_lkb(lkb);
2040 out: 2346 out:
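
The net effect for API users: with DLM_LKF_CANCEL or DLM_LKF_FORCEUNLOCK, dlm_unlock() now also returns 0 when the op could only be recorded as overlapping (-EBUSY internally), and the caller learns the actual outcome from the completion ast. A hedged sketch; ls, lkid and lksb are assumed to come from an earlier dlm_lock():

#include <linux/dlm.h>

static int cancel_pending(dlm_lockspace_t *ls, uint32_t lkid,
			  struct dlm_lksb *lksb)
{
	int error = dlm_unlock(ls, lkid, DLM_LKF_CANCEL, lksb, lksb);

	/* 0 covers both "cancel sent" and "another op already in
	 * progress"; the real outcome arrives in the completion ast as
	 * sb_status: -DLM_ECANCEL if the cancel won, or the original
	 * request's result if it completed first */
	return error;
}
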
@@ -2065,31 +2371,14 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
2065 * receive_lookup_reply send_lookup_reply 2371 * receive_lookup_reply send_lookup_reply
2066 */ 2372 */
2067 2373
2068static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, 2374static int _create_message(struct dlm_ls *ls, int mb_len,
2069 int to_nodeid, int mstype, 2375 int to_nodeid, int mstype,
2070 struct dlm_message **ms_ret, 2376 struct dlm_message **ms_ret,
2071 struct dlm_mhandle **mh_ret) 2377 struct dlm_mhandle **mh_ret)
2072{ 2378{
2073 struct dlm_message *ms; 2379 struct dlm_message *ms;
2074 struct dlm_mhandle *mh; 2380 struct dlm_mhandle *mh;
2075 char *mb; 2381 char *mb;
2076 int mb_len = sizeof(struct dlm_message);
2077
2078 switch (mstype) {
2079 case DLM_MSG_REQUEST:
2080 case DLM_MSG_LOOKUP:
2081 case DLM_MSG_REMOVE:
2082 mb_len += r->res_length;
2083 break;
2084 case DLM_MSG_CONVERT:
2085 case DLM_MSG_UNLOCK:
2086 case DLM_MSG_REQUEST_REPLY:
2087 case DLM_MSG_CONVERT_REPLY:
2088 case DLM_MSG_GRANT:
2089 if (lkb && lkb->lkb_lvbptr)
2090 mb_len += r->res_ls->ls_lvblen;
2091 break;
2092 }
2093 2382
2094 /* get_buffer gives us a message handle (mh) that we need to 2383 /* get_buffer gives us a message handle (mh) that we need to
2095 pass into lowcomms_commit and a message buffer (mb) that we 2384 pass into lowcomms_commit and a message buffer (mb) that we
@@ -2104,7 +2393,7 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2104 ms = (struct dlm_message *) mb; 2393 ms = (struct dlm_message *) mb;
2105 2394
2106 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); 2395 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
2107 ms->m_header.h_lockspace = r->res_ls->ls_global_id; 2396 ms->m_header.h_lockspace = ls->ls_global_id;
2108 ms->m_header.h_nodeid = dlm_our_nodeid(); 2397 ms->m_header.h_nodeid = dlm_our_nodeid();
2109 ms->m_header.h_length = mb_len; 2398 ms->m_header.h_length = mb_len;
2110 ms->m_header.h_cmd = DLM_MSG; 2399 ms->m_header.h_cmd = DLM_MSG;
@@ -2116,6 +2405,33 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2116 return 0; 2405 return 0;
2117} 2406}
2118 2407
2408static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2409 int to_nodeid, int mstype,
2410 struct dlm_message **ms_ret,
2411 struct dlm_mhandle **mh_ret)
2412{
2413 int mb_len = sizeof(struct dlm_message);
2414
2415 switch (mstype) {
2416 case DLM_MSG_REQUEST:
2417 case DLM_MSG_LOOKUP:
2418 case DLM_MSG_REMOVE:
2419 mb_len += r->res_length;
2420 break;
2421 case DLM_MSG_CONVERT:
2422 case DLM_MSG_UNLOCK:
2423 case DLM_MSG_REQUEST_REPLY:
2424 case DLM_MSG_CONVERT_REPLY:
2425 case DLM_MSG_GRANT:
2426 if (lkb && lkb->lkb_lvbptr)
2427 mb_len += r->res_ls->ls_lvblen;
2428 break;
2429 }
2430
2431 return _create_message(r->res_ls, mb_len, to_nodeid, mstype,
2432 ms_ret, mh_ret);
2433}
2434
2119/* further lowcomms enhancements or alternate implementations may make 2435/* further lowcomms enhancements or alternate implementations may make
2120 the return value from this function useful at some point */ 2436 the return value from this function useful at some point */
2121 2437
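
Splitting the buffer setup out into _create_message() lets callers that have no rsb at hand, such as the purge path, build a message directly. A sketch of how the new DLM_MSG_PURGE sender plausibly uses it; the helper body is reconstructed, not quoted from this hunk:

/* assumed sender for DLM_MSG_PURGE; pairs with receive_purge() below,
 * which reads m_nodeid and m_pid */
static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
{
	struct dlm_message *ms;
	struct dlm_mhandle *mh;
	int error;

	error = _create_message(ls, sizeof(struct dlm_message), nodeid,
				DLM_MSG_PURGE, &ms, &mh);
	if (error)
		return error;
	ms->m_nodeid = nodeid;
	ms->m_pid = pid;

	return send_message(mh, ms);
}
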
@@ -2176,7 +2492,9 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2176 struct dlm_mhandle *mh; 2492 struct dlm_mhandle *mh;
2177 int to_nodeid, error; 2493 int to_nodeid, error;
2178 2494
2179 add_to_waiters(lkb, mstype); 2495 error = add_to_waiters(lkb, mstype);
2496 if (error)
2497 return error;
2180 2498
2181 to_nodeid = r->res_nodeid; 2499 to_nodeid = r->res_nodeid;
2182 2500
@@ -2192,7 +2510,7 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2192 return 0; 2510 return 0;
2193 2511
2194 fail: 2512 fail:
2195 remove_from_waiters(lkb); 2513 remove_from_waiters(lkb, msg_reply_type(mstype));
2196 return error; 2514 return error;
2197} 2515}
2198 2516
@@ -2209,7 +2527,8 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2209 2527
2210 /* down conversions go without a reply from the master */ 2528 /* down conversions go without a reply from the master */
2211 if (!error && down_conversion(lkb)) { 2529 if (!error && down_conversion(lkb)) {
2212 remove_from_waiters(lkb); 2530 remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
2531 r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
2213 r->res_ls->ls_stub_ms.m_result = 0; 2532 r->res_ls->ls_stub_ms.m_result = 0;
2214 r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags; 2533 r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
2215 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms); 2534 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
@@ -2280,7 +2599,9 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2280 struct dlm_mhandle *mh; 2599 struct dlm_mhandle *mh;
2281 int to_nodeid, error; 2600 int to_nodeid, error;
2282 2601
2283 add_to_waiters(lkb, DLM_MSG_LOOKUP); 2602 error = add_to_waiters(lkb, DLM_MSG_LOOKUP);
2603 if (error)
2604 return error;
2284 2605
2285 to_nodeid = dlm_dir_nodeid(r); 2606 to_nodeid = dlm_dir_nodeid(r);
2286 2607
@@ -2296,7 +2617,7 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2296 return 0; 2617 return 0;
2297 2618
2298 fail: 2619 fail:
2299 remove_from_waiters(lkb); 2620 remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
2300 return error; 2621 return error;
2301} 2622}
2302 2623
@@ -2656,6 +2977,8 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
2656 lock_rsb(r); 2977 lock_rsb(r);
2657 2978
2658 receive_flags_reply(lkb, ms); 2979 receive_flags_reply(lkb, ms);
2980 if (is_altmode(lkb))
2981 munge_altmode(lkb, ms);
2659 grant_lock_pc(r, lkb, ms); 2982 grant_lock_pc(r, lkb, ms);
2660 queue_cast(r, lkb, 0); 2983 queue_cast(r, lkb, 0);
2661 2984
@@ -2736,11 +3059,16 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
2736 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len); 3059 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
2737} 3060}
2738 3061
3062static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
3063{
3064 do_purge(ls, ms->m_nodeid, ms->m_pid);
3065}
3066
2739static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) 3067static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2740{ 3068{
2741 struct dlm_lkb *lkb; 3069 struct dlm_lkb *lkb;
2742 struct dlm_rsb *r; 3070 struct dlm_rsb *r;
2743 int error, mstype; 3071 int error, mstype, result;
2744 3072
2745 error = find_lkb(ls, ms->m_remid, &lkb); 3073 error = find_lkb(ls, ms->m_remid, &lkb);
2746 if (error) { 3074 if (error) {
@@ -2749,20 +3077,15 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2749 } 3077 }
2750 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); 3078 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2751 3079
2752 mstype = lkb->lkb_wait_type;
2753 error = remove_from_waiters(lkb);
2754 if (error) {
2755 log_error(ls, "receive_request_reply not on waiters");
2756 goto out;
2757 }
2758
2759 /* this is the value returned from do_request() on the master */
2760 error = ms->m_result;
2761
2762 r = lkb->lkb_resource; 3080 r = lkb->lkb_resource;
2763 hold_rsb(r); 3081 hold_rsb(r);
2764 lock_rsb(r); 3082 lock_rsb(r);
2765 3083
3084 mstype = lkb->lkb_wait_type;
3085 error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
3086 if (error)
3087 goto out;
3088
2766 /* Optimization: the dir node was also the master, so it took our 3089 /* Optimization: the dir node was also the master, so it took our
2767 lookup as a request and sent request reply instead of lookup reply */ 3090 lookup as a request and sent request reply instead of lookup reply */
2768 if (mstype == DLM_MSG_LOOKUP) { 3091 if (mstype == DLM_MSG_LOOKUP) {
@@ -2770,14 +3093,15 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2770 lkb->lkb_nodeid = r->res_nodeid; 3093 lkb->lkb_nodeid = r->res_nodeid;
2771 } 3094 }
2772 3095
2773 switch (error) { 3096 /* this is the value returned from do_request() on the master */
3097 result = ms->m_result;
3098
3099 switch (result) {
2774 case -EAGAIN: 3100 case -EAGAIN:
2775 /* request would block (be queued) on remote master; 3101 /* request would block (be queued) on remote master */
2776 the unhold undoes the original ref from create_lkb()
2777 so it leads to the lkb being freed */
2778 queue_cast(r, lkb, -EAGAIN); 3102 queue_cast(r, lkb, -EAGAIN);
2779 confirm_master(r, -EAGAIN); 3103 confirm_master(r, -EAGAIN);
2780 unhold_lkb(lkb); 3104 unhold_lkb(lkb); /* undoes create_lkb() */
2781 break; 3105 break;
2782 3106
2783 case -EINPROGRESS: 3107 case -EINPROGRESS:
@@ -2785,41 +3109,64 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2785 /* request was queued or granted on remote master */ 3109 /* request was queued or granted on remote master */
2786 receive_flags_reply(lkb, ms); 3110 receive_flags_reply(lkb, ms);
2787 lkb->lkb_remid = ms->m_lkid; 3111 lkb->lkb_remid = ms->m_lkid;
2788 if (error) 3112 if (is_altmode(lkb))
3113 munge_altmode(lkb, ms);
3114 if (result)
2789 add_lkb(r, lkb, DLM_LKSTS_WAITING); 3115 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2790 else { 3116 else {
2791 grant_lock_pc(r, lkb, ms); 3117 grant_lock_pc(r, lkb, ms);
2792 queue_cast(r, lkb, 0); 3118 queue_cast(r, lkb, 0);
2793 } 3119 }
2794 confirm_master(r, error); 3120 confirm_master(r, result);
2795 break; 3121 break;
2796 3122
2797 case -EBADR: 3123 case -EBADR:
2798 case -ENOTBLK: 3124 case -ENOTBLK:
2799 /* find_rsb failed to find rsb or rsb wasn't master */ 3125 /* find_rsb failed to find rsb or rsb wasn't master */
3126 log_debug(ls, "receive_request_reply %x %x master diff %d %d",
3127 lkb->lkb_id, lkb->lkb_flags, r->res_nodeid, result);
2800 r->res_nodeid = -1; 3128 r->res_nodeid = -1;
2801 lkb->lkb_nodeid = -1; 3129 lkb->lkb_nodeid = -1;
2802 _request_lock(r, lkb); 3130
3131 if (is_overlap(lkb)) {
3132 /* we'll ignore error in cancel/unlock reply */
3133 queue_cast_overlap(r, lkb);
3134 unhold_lkb(lkb); /* undoes create_lkb() */
3135 } else
3136 _request_lock(r, lkb);
2803 break; 3137 break;
2804 3138
2805 default: 3139 default:
2806 log_error(ls, "receive_request_reply error %d", error); 3140 log_error(ls, "receive_request_reply %x error %d",
3141 lkb->lkb_id, result);
2807 } 3142 }
2808 3143
3144 if (is_overlap_unlock(lkb) && (result == 0 || result == -EINPROGRESS)) {
3145 log_debug(ls, "receive_request_reply %x result %d unlock",
3146 lkb->lkb_id, result);
3147 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
3148 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
3149 send_unlock(r, lkb);
3150 } else if (is_overlap_cancel(lkb) && (result == -EINPROGRESS)) {
3151 log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id);
3152 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
3153 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
3154 send_cancel(r, lkb);
3155 } else {
3156 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
3157 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
3158 }
3159 out:
2809 unlock_rsb(r); 3160 unlock_rsb(r);
2810 put_rsb(r); 3161 put_rsb(r);
2811 out:
2812 dlm_put_lkb(lkb); 3162 dlm_put_lkb(lkb);
2813} 3163}
2814 3164
2815static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, 3165static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2816 struct dlm_message *ms) 3166 struct dlm_message *ms)
2817{ 3167{
2818 int error = ms->m_result;
2819
2820 /* this is the value returned from do_convert() on the master */ 3168 /* this is the value returned from do_convert() on the master */
2821 3169 switch (ms->m_result) {
2822 switch (error) {
2823 case -EAGAIN: 3170 case -EAGAIN:
2824 /* convert would block (be queued) on remote master */ 3171 /* convert would block (be queued) on remote master */
2825 queue_cast(r, lkb, -EAGAIN); 3172 queue_cast(r, lkb, -EAGAIN);
@@ -2827,6 +3174,9 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2827 3174
2828 case -EINPROGRESS: 3175 case -EINPROGRESS:
2829 /* convert was queued on remote master */ 3176 /* convert was queued on remote master */
3177 receive_flags_reply(lkb, ms);
3178 if (is_demoted(lkb))
3179 munge_demoted(lkb, ms);
2830 del_lkb(r, lkb); 3180 del_lkb(r, lkb);
2831 add_lkb(r, lkb, DLM_LKSTS_CONVERT); 3181 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2832 break; 3182 break;
@@ -2834,24 +3184,33 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2834 case 0: 3184 case 0:
2835 /* convert was granted on remote master */ 3185 /* convert was granted on remote master */
2836 receive_flags_reply(lkb, ms); 3186 receive_flags_reply(lkb, ms);
3187 if (is_demoted(lkb))
3188 munge_demoted(lkb, ms);
2837 grant_lock_pc(r, lkb, ms); 3189 grant_lock_pc(r, lkb, ms);
2838 queue_cast(r, lkb, 0); 3190 queue_cast(r, lkb, 0);
2839 break; 3191 break;
2840 3192
2841 default: 3193 default:
2842 log_error(r->res_ls, "receive_convert_reply error %d", error); 3194 log_error(r->res_ls, "receive_convert_reply %x error %d",
3195 lkb->lkb_id, ms->m_result);
2843 } 3196 }
2844} 3197}
2845 3198
2846static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms) 3199static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2847{ 3200{
2848 struct dlm_rsb *r = lkb->lkb_resource; 3201 struct dlm_rsb *r = lkb->lkb_resource;
3202 int error;
2849 3203
2850 hold_rsb(r); 3204 hold_rsb(r);
2851 lock_rsb(r); 3205 lock_rsb(r);
2852 3206
2853 __receive_convert_reply(r, lkb, ms); 3207 /* stub reply can happen with waiters_mutex held */
3208 error = remove_from_waiters_ms(lkb, ms);
3209 if (error)
3210 goto out;
2854 3211
3212 __receive_convert_reply(r, lkb, ms);
3213 out:
2855 unlock_rsb(r); 3214 unlock_rsb(r);
2856 put_rsb(r); 3215 put_rsb(r);
2857} 3216}
@@ -2868,37 +3227,38 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
2868 } 3227 }
2869 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); 3228 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2870 3229
2871 error = remove_from_waiters(lkb);
2872 if (error) {
2873 log_error(ls, "receive_convert_reply not on waiters");
2874 goto out;
2875 }
2876
2877 _receive_convert_reply(lkb, ms); 3230 _receive_convert_reply(lkb, ms);
2878 out:
2879 dlm_put_lkb(lkb); 3231 dlm_put_lkb(lkb);
2880} 3232}
2881 3233
2882static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) 3234static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2883{ 3235{
2884 struct dlm_rsb *r = lkb->lkb_resource; 3236 struct dlm_rsb *r = lkb->lkb_resource;
2885 int error = ms->m_result; 3237 int error;
2886 3238
2887 hold_rsb(r); 3239 hold_rsb(r);
2888 lock_rsb(r); 3240 lock_rsb(r);
2889 3241
3242 /* stub reply can happen with waiters_mutex held */
3243 error = remove_from_waiters_ms(lkb, ms);
3244 if (error)
3245 goto out;
3246
2890 /* this is the value returned from do_unlock() on the master */ 3247 /* this is the value returned from do_unlock() on the master */
2891 3248
2892 switch (error) { 3249 switch (ms->m_result) {
2893 case -DLM_EUNLOCK: 3250 case -DLM_EUNLOCK:
2894 receive_flags_reply(lkb, ms); 3251 receive_flags_reply(lkb, ms);
2895 remove_lock_pc(r, lkb); 3252 remove_lock_pc(r, lkb);
2896 queue_cast(r, lkb, -DLM_EUNLOCK); 3253 queue_cast(r, lkb, -DLM_EUNLOCK);
2897 break; 3254 break;
3255 case -ENOENT:
3256 break;
2898 default: 3257 default:
2899 log_error(r->res_ls, "receive_unlock_reply error %d", error); 3258 log_error(r->res_ls, "receive_unlock_reply %x error %d",
3259 lkb->lkb_id, ms->m_result);
2900 } 3260 }
2901 3261 out:
2902 unlock_rsb(r); 3262 unlock_rsb(r);
2903 put_rsb(r); 3263 put_rsb(r);
2904} 3264}
@@ -2915,37 +3275,39 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
2915 } 3275 }
2916 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); 3276 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2917 3277
2918 error = remove_from_waiters(lkb);
2919 if (error) {
2920 log_error(ls, "receive_unlock_reply not on waiters");
2921 goto out;
2922 }
2923
2924 _receive_unlock_reply(lkb, ms); 3278 _receive_unlock_reply(lkb, ms);
2925 out:
2926 dlm_put_lkb(lkb); 3279 dlm_put_lkb(lkb);
2927} 3280}
2928 3281
2929static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) 3282static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2930{ 3283{
2931 struct dlm_rsb *r = lkb->lkb_resource; 3284 struct dlm_rsb *r = lkb->lkb_resource;
2932 int error = ms->m_result; 3285 int error;
2933 3286
2934 hold_rsb(r); 3287 hold_rsb(r);
2935 lock_rsb(r); 3288 lock_rsb(r);
2936 3289
3290 /* stub reply can happen with waiters_mutex held */
3291 error = remove_from_waiters_ms(lkb, ms);
3292 if (error)
3293 goto out;
3294
2937 /* this is the value returned from do_cancel() on the master */ 3295 /* this is the value returned from do_cancel() on the master */
2938 3296
2939 switch (error) { 3297 switch (ms->m_result) {
2940 case -DLM_ECANCEL: 3298 case -DLM_ECANCEL:
2941 receive_flags_reply(lkb, ms); 3299 receive_flags_reply(lkb, ms);
2942 revert_lock_pc(r, lkb); 3300 revert_lock_pc(r, lkb);
2943 queue_cast(r, lkb, -DLM_ECANCEL); 3301 if (ms->m_result)
3302 queue_cast(r, lkb, -DLM_ECANCEL);
3303 break;
3304 case 0:
2944 break; 3305 break;
2945 default: 3306 default:
2946 log_error(r->res_ls, "receive_cancel_reply error %d", error); 3307 log_error(r->res_ls, "receive_cancel_reply %x error %d",
3308 lkb->lkb_id, ms->m_result);
2947 } 3309 }
2948 3310 out:
2949 unlock_rsb(r); 3311 unlock_rsb(r);
2950 put_rsb(r); 3312 put_rsb(r);
2951} 3313}
@@ -2962,14 +3324,7 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
2962 } 3324 }
2963 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); 3325 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2964 3326
2965 error = remove_from_waiters(lkb);
2966 if (error) {
2967 log_error(ls, "receive_cancel_reply not on waiters");
2968 goto out;
2969 }
2970
2971 _receive_cancel_reply(lkb, ms); 3327 _receive_cancel_reply(lkb, ms);
2972 out:
2973 dlm_put_lkb(lkb); 3328 dlm_put_lkb(lkb);
2974} 3329}
2975 3330
@@ -2985,20 +3340,17 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
2985 return; 3340 return;
2986 } 3341 }
2987 3342
2988 error = remove_from_waiters(lkb); 3343 /* ms->m_result is the value returned by dlm_dir_lookup on dir node
2989 if (error) {
2990 log_error(ls, "receive_lookup_reply not on waiters");
2991 goto out;
2992 }
2993
2994 /* this is the value returned by dlm_dir_lookup on dir node
2995 FIXME: will a non-zero error ever be returned? */ 3344 FIXME: will a non-zero error ever be returned? */
2996 error = ms->m_result;
2997 3345
2998 r = lkb->lkb_resource; 3346 r = lkb->lkb_resource;
2999 hold_rsb(r); 3347 hold_rsb(r);
3000 lock_rsb(r); 3348 lock_rsb(r);
3001 3349
3350 error = remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
3351 if (error)
3352 goto out;
3353
3002 ret_nodeid = ms->m_nodeid; 3354 ret_nodeid = ms->m_nodeid;
3003 if (ret_nodeid == dlm_our_nodeid()) { 3355 if (ret_nodeid == dlm_our_nodeid()) {
3004 r->res_nodeid = 0; 3356 r->res_nodeid = 0;
@@ -3009,14 +3361,22 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
3009 r->res_nodeid = ret_nodeid; 3361 r->res_nodeid = ret_nodeid;
3010 } 3362 }
3011 3363
3364 if (is_overlap(lkb)) {
3365 log_debug(ls, "receive_lookup_reply %x unlock %x",
3366 lkb->lkb_id, lkb->lkb_flags);
3367 queue_cast_overlap(r, lkb);
3368 unhold_lkb(lkb); /* undoes create_lkb() */
3369 goto out_list;
3370 }
3371
3012 _request_lock(r, lkb); 3372 _request_lock(r, lkb);
3013 3373
3374 out_list:
3014 if (!ret_nodeid) 3375 if (!ret_nodeid)
3015 process_lookup_list(r); 3376 process_lookup_list(r);
3016 3377 out:
3017 unlock_rsb(r); 3378 unlock_rsb(r);
3018 put_rsb(r); 3379 put_rsb(r);
3019 out:
3020 dlm_put_lkb(lkb); 3380 dlm_put_lkb(lkb);
3021} 3381}
3022 3382
@@ -3133,6 +3493,12 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
3133 receive_lookup_reply(ls, ms); 3493 receive_lookup_reply(ls, ms);
3134 break; 3494 break;
3135 3495
3496 /* other messages */
3497
3498 case DLM_MSG_PURGE:
3499 receive_purge(ls, ms);
3500 break;
3501
3136 default: 3502 default:
3137 log_error(ls, "unknown message type %d", ms->m_type); 3503 log_error(ls, "unknown message type %d", ms->m_type);
3138 } 3504 }
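
receive_purge() hands off to do_purge(), which this file only forward-declares near the top of the diff. A reconstruction of what it plausibly does, built from the new ls_orphans list and mutex added in dlm_internal.h; treat the helper names and details as assumptions, not quotation:

static void do_purge(struct dlm_ls *ls, int nodeid, int pid)
{
	struct dlm_lkb *lkb, *safe;

	mutex_lock(&ls->ls_orphans_mutex);
	list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) {
		if (pid && lkb->lkb_ownpid != pid)
			continue;
		/* unlock_proc_lock() assumed: releases an orphaned lock
		 * left behind by an exited process */
		unlock_proc_lock(ls, lkb);
		list_del_init(&lkb->lkb_ownqueue);
		dlm_put_lkb(lkb);
	}
	mutex_unlock(&ls->ls_orphans_mutex);
}
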
@@ -3153,9 +3519,9 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3153{ 3519{
3154 if (middle_conversion(lkb)) { 3520 if (middle_conversion(lkb)) {
3155 hold_lkb(lkb); 3521 hold_lkb(lkb);
3522 ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
3156 ls->ls_stub_ms.m_result = -EINPROGRESS; 3523 ls->ls_stub_ms.m_result = -EINPROGRESS;
3157 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3524 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3158 _remove_from_waiters(lkb);
3159 _receive_convert_reply(lkb, &ls->ls_stub_ms); 3525 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3160 3526
3161 /* Same special case as in receive_rcom_lock_args() */ 3527 /* Same special case as in receive_rcom_lock_args() */
@@ -3227,18 +3593,18 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3227 3593
3228 case DLM_MSG_UNLOCK: 3594 case DLM_MSG_UNLOCK:
3229 hold_lkb(lkb); 3595 hold_lkb(lkb);
3596 ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
3230 ls->ls_stub_ms.m_result = -DLM_EUNLOCK; 3597 ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
3231 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3598 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3232 _remove_from_waiters(lkb);
3233 _receive_unlock_reply(lkb, &ls->ls_stub_ms); 3599 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3234 dlm_put_lkb(lkb); 3600 dlm_put_lkb(lkb);
3235 break; 3601 break;
3236 3602
3237 case DLM_MSG_CANCEL: 3603 case DLM_MSG_CANCEL:
3238 hold_lkb(lkb); 3604 hold_lkb(lkb);
3605 ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
3239 ls->ls_stub_ms.m_result = -DLM_ECANCEL; 3606 ls->ls_stub_ms.m_result = -DLM_ECANCEL;
3240 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3607 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3241 _remove_from_waiters(lkb);
3242 _receive_cancel_reply(lkb, &ls->ls_stub_ms); 3608 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3243 dlm_put_lkb(lkb); 3609 dlm_put_lkb(lkb);
3244 break; 3610 break;
@@ -3252,37 +3618,47 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3252 mutex_unlock(&ls->ls_waiters_mutex); 3618 mutex_unlock(&ls->ls_waiters_mutex);
3253} 3619}
3254 3620
3255static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) 3621static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
3256{ 3622{
3257 struct dlm_lkb *lkb; 3623 struct dlm_lkb *lkb;
3258 int rv = 0; 3624 int found = 0;
3259 3625
3260 mutex_lock(&ls->ls_waiters_mutex); 3626 mutex_lock(&ls->ls_waiters_mutex);
3261 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { 3627 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
3262 if (lkb->lkb_flags & DLM_IFL_RESEND) { 3628 if (lkb->lkb_flags & DLM_IFL_RESEND) {
3263 rv = lkb->lkb_wait_type; 3629 hold_lkb(lkb);
3264 _remove_from_waiters(lkb); 3630 found = 1;
3265 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3266 break; 3631 break;
3267 } 3632 }
3268 } 3633 }
3269 mutex_unlock(&ls->ls_waiters_mutex); 3634 mutex_unlock(&ls->ls_waiters_mutex);
3270 3635
3271 if (!rv) 3636 if (!found)
3272 lkb = NULL; 3637 lkb = NULL;
3273 *lkb_ret = lkb; 3638 return lkb;
3274 return rv;
3275} 3639}
3276 3640
3277/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the 3641/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
3278 master or dir-node for r. Processing the lkb may result in it being placed 3642 master or dir-node for r. Processing the lkb may result in it being placed
3279 back on waiters. */ 3643 back on waiters. */
3280 3644
3645/* We do this after normal locking has been enabled and any saved messages
3646 (in requestqueue) have been processed. We should be confident that at
3647 this point we won't get or process a reply to any of these waiting
3648 operations. But, new ops may be coming in on the rsbs/locks here from
3649 userspace or remotely. */
3650
3651/* there may have been an overlap unlock/cancel prior to recovery or after
3652 recovery. if before, the lkb may still have a positive wait_count; if after, the
3653 overlap flag would just have been set and nothing new sent. we can be
3654 confident here that any replies to either the initial op or overlap ops
3655 prior to recovery have been received. */
3656
3281int dlm_recover_waiters_post(struct dlm_ls *ls) 3657int dlm_recover_waiters_post(struct dlm_ls *ls)
3282{ 3658{
3283 struct dlm_lkb *lkb; 3659 struct dlm_lkb *lkb;
3284 struct dlm_rsb *r; 3660 struct dlm_rsb *r;
3285 int error = 0, mstype; 3661 int error = 0, mstype, err, oc, ou;
3286 3662
3287 while (1) { 3663 while (1) {
3288 if (dlm_locking_stopped(ls)) { 3664 if (dlm_locking_stopped(ls)) {
@@ -3291,48 +3667,78 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
3291 break; 3667 break;
3292 } 3668 }
3293 3669
3294 mstype = remove_resend_waiter(ls, &lkb); 3670 lkb = find_resend_waiter(ls);
3295 if (!mstype) 3671 if (!lkb)
3296 break; 3672 break;
3297 3673
3298 r = lkb->lkb_resource; 3674 r = lkb->lkb_resource;
3675 hold_rsb(r);
3676 lock_rsb(r);
3677
3678 mstype = lkb->lkb_wait_type;
3679 oc = is_overlap_cancel(lkb);
3680 ou = is_overlap_unlock(lkb);
3681 err = 0;
3299 3682
3300 log_debug(ls, "recover_waiters_post %x type %d flags %x %s", 3683 log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
3301 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name); 3684 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
3302 3685
3303 switch (mstype) { 3686 /* At this point we assume that we won't get a reply to any
3304 3687 previous op or overlap op on this lock. First, do a big
3305 case DLM_MSG_LOOKUP: 3688 remove_from_waiters() for all previous ops. */
3306 hold_rsb(r); 3689
3307 lock_rsb(r); 3690 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3308 _request_lock(r, lkb); 3691 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
3309 if (is_master(r)) 3692 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
3310 confirm_master(r, 0); 3693 lkb->lkb_wait_type = 0;
3311 unlock_rsb(r); 3694 lkb->lkb_wait_count = 0;
3312 put_rsb(r); 3695 mutex_lock(&ls->ls_waiters_mutex);
3313 break; 3696 list_del_init(&lkb->lkb_wait_reply);
3314 3697 mutex_unlock(&ls->ls_waiters_mutex);
3315 case DLM_MSG_REQUEST: 3698 unhold_lkb(lkb); /* for waiters list */
3316 hold_rsb(r); 3699
3317 lock_rsb(r); 3700 if (oc || ou) {
3318 _request_lock(r, lkb); 3701 /* do an unlock or cancel instead of resending */
3319 if (is_master(r)) 3702 switch (mstype) {
3320 confirm_master(r, 0); 3703 case DLM_MSG_LOOKUP:
3321 unlock_rsb(r); 3704 case DLM_MSG_REQUEST:
3322 put_rsb(r); 3705 queue_cast(r, lkb, ou ? -DLM_EUNLOCK :
3323 break; 3706 -DLM_ECANCEL);
3324 3707 unhold_lkb(lkb); /* undoes create_lkb() */
3325 case DLM_MSG_CONVERT: 3708 break;
3326 hold_rsb(r); 3709 case DLM_MSG_CONVERT:
3327 lock_rsb(r); 3710 if (oc) {
3328 _convert_lock(r, lkb); 3711 queue_cast(r, lkb, -DLM_ECANCEL);
3329 unlock_rsb(r); 3712 } else {
3330 put_rsb(r); 3713 lkb->lkb_exflags |= DLM_LKF_FORCEUNLOCK;
3331 break; 3714 _unlock_lock(r, lkb);
3332 3715 }
3333 default: 3716 break;
3334 log_error(ls, "recover_waiters_post type %d", mstype); 3717 default:
3718 err = 1;
3719 }
3720 } else {
3721 switch (mstype) {
3722 case DLM_MSG_LOOKUP:
3723 case DLM_MSG_REQUEST:
3724 _request_lock(r, lkb);
3725 if (is_master(r))
3726 confirm_master(r, 0);
3727 break;
3728 case DLM_MSG_CONVERT:
3729 _convert_lock(r, lkb);
3730 break;
3731 default:
3732 err = 1;
3733 }
3335 } 3734 }
3735
3736 if (err)
3737 log_error(ls, "recover_waiters_post %x %d %x %d %d",
3738 lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou);
3739 unlock_rsb(r);
3740 put_rsb(r);
3741 dlm_put_lkb(lkb);
3336 } 3742 }
3337 3743
3338 return error; 3744 return error;
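After clearing the waiter state, the rewritten loop branches on just three inputs: the saved wait type and the two overlap flags. If an overlap unlock/cancel was queued, the operation is completed locally with a cast instead of being resent; only a convert with an overlapping unlock turns into a forced unlock. A compact model of that decision (an assumed simplification; the real code also handles a lookup/request becoming master via confirm_master()):

    #include <stdio.h>

    enum op  { OP_LOOKUP, OP_REQUEST, OP_CONVERT };
    enum act { RESEND, CAST_EUNLOCK, CAST_ECANCEL, FORCE_UNLOCK, BAD };

    static enum act decide(enum op mstype, int overlap_cancel, int overlap_unlock)
    {
            if (overlap_cancel || overlap_unlock) {
                    switch (mstype) {
                    case OP_LOOKUP:
                    case OP_REQUEST:
                            return overlap_unlock ? CAST_EUNLOCK : CAST_ECANCEL;
                    case OP_CONVERT:
                            /* cancel wins; unlock falls back to forced unlock */
                            return overlap_cancel ? CAST_ECANCEL : FORCE_UNLOCK;
                    }
                    return BAD;
            }
            return RESEND;   /* no overlap: replay the original op */
    }

    int main(void)
    {
            printf("%d %d\n", decide(OP_REQUEST, 0, 1), decide(OP_CONVERT, 1, 0));
            return 0;
    }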
@@ -3684,7 +4090,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
3684 4090
3685 /* add this new lkb to the per-process list of locks */ 4091 /* add this new lkb to the per-process list of locks */
3686 spin_lock(&ua->proc->locks_spin); 4092 spin_lock(&ua->proc->locks_spin);
3687 kref_get(&lkb->lkb_ref); 4093 hold_lkb(lkb);
3688 list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); 4094 list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
3689 spin_unlock(&ua->proc->locks_spin); 4095 spin_unlock(&ua->proc->locks_spin);
3690 out: 4096 out:
@@ -3774,6 +4180,9 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3774 4180
3775 if (error == -DLM_EUNLOCK) 4181 if (error == -DLM_EUNLOCK)
3776 error = 0; 4182 error = 0;
4183 /* from validate_unlock_args() */
4184 if (error == -EBUSY && (flags & DLM_LKF_FORCEUNLOCK))
4185 error = 0;
3777 if (error) 4186 if (error)
3778 goto out_put; 4187 goto out_put;
3779 4188
@@ -3786,6 +4195,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3786 dlm_put_lkb(lkb); 4195 dlm_put_lkb(lkb);
3787 out: 4196 out:
3788 unlock_recovery(ls); 4197 unlock_recovery(ls);
4198 kfree(ua_tmp);
3789 return error; 4199 return error;
3790} 4200}
3791 4201
@@ -3815,33 +4225,37 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3815 4225
3816 if (error == -DLM_ECANCEL) 4226 if (error == -DLM_ECANCEL)
3817 error = 0; 4227 error = 0;
3818 if (error) 4228 /* from validate_unlock_args() */
3819 goto out_put; 4229 if (error == -EBUSY)
3820 4230 error = 0;
3821 /* this lkb was removed from the WAITING queue */
3822 if (lkb->lkb_grmode == DLM_LOCK_IV) {
3823 spin_lock(&ua->proc->locks_spin);
3824 list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
3825 spin_unlock(&ua->proc->locks_spin);
3826 }
3827 out_put: 4231 out_put:
3828 dlm_put_lkb(lkb); 4232 dlm_put_lkb(lkb);
3829 out: 4233 out:
3830 unlock_recovery(ls); 4234 unlock_recovery(ls);
4235 kfree(ua_tmp);
3831 return error; 4236 return error;
3832} 4237}
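Both user-facing paths above now follow the same two conventions: the internal completion codes (-DLM_EUNLOCK, -DLM_ECANCEL) and the -EBUSY that validate_unlock_args() returns for an already-overlapped op are folded into 0 before returning to userspace, and ua_tmp is freed here on every exit path, so ownership of the scratch argument block passes to the callee. A toy sketch of the convention (hypothetical names, placeholder code values):

    #include <stdlib.h>
    #include <errno.h>

    #define DLM_ECANCEL 1   /* placeholder for the value in linux/dlm.h */

    /* hypothetical sketch: callee owns and frees its scratch argument */
    static int user_cancel(void *ua_tmp)
    {
            int error = -DLM_ECANCEL;        /* pretend the op completed */

            if (error == -DLM_ECANCEL)
                    error = 0;               /* internal code, not an error */
            if (error == -EBUSY)
                    error = 0;               /* overlap already queued: fine */

            free(ua_tmp);                    /* freed on every path */
            return error;
    }

    int main(void)
    {
            return user_cancel(malloc(16));
    }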
3833 4238
4239/* lkb's that are removed from the waiters list by revert are just left on the
4240 orphans list with the granted orphan locks, to be freed by purge */
4241
3834static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) 4242static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3835{ 4243{
3836 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam; 4244 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
4245 struct dlm_args args;
4246 int error;
3837 4247
3838 if (ua->lksb.sb_lvbptr) 4248 hold_lkb(lkb);
3839 kfree(ua->lksb.sb_lvbptr); 4249 mutex_lock(&ls->ls_orphans_mutex);
3840 kfree(ua); 4250 list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans);
3841 lkb->lkb_astparam = (long)NULL; 4251 mutex_unlock(&ls->ls_orphans_mutex);
3842 4252
3843 /* TODO: propagate to master if needed */ 4253 set_unlock_args(0, ua, &args);
3844 return 0; 4254
4255 error = cancel_lock(ls, lkb, &args);
4256 if (error == -DLM_ECANCEL)
4257 error = 0;
4258 return error;
3845} 4259}
3846 4260
3847/* The force flag allows the unlock to go ahead even if the lkb isn't granted. 4261/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
@@ -3853,10 +4267,6 @@ static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3853 struct dlm_args args; 4267 struct dlm_args args;
3854 int error; 4268 int error;
3855 4269
3856 /* FIXME: we need to handle the case where the lkb is in limbo
3857 while the rsb is being looked up, currently we assert in
3858 _unlock_lock/is_remote because rsb nodeid is -1. */
3859
3860 set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args); 4270 set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
3861 4271
3862 error = unlock_lock(ls, lkb, &args); 4272 error = unlock_lock(ls, lkb, &args);
@@ -3865,6 +4275,31 @@ static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3865 return error; 4275 return error;
3866} 4276}
3867 4277
4278/* We have to release clear_proc_locks mutex before calling unlock_proc_lock()
4279 (which does lock_rsb) due to deadlock with receiving a message that does
4280 lock_rsb followed by dlm_user_add_ast() */
4281
4282static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
4283 struct dlm_user_proc *proc)
4284{
4285 struct dlm_lkb *lkb = NULL;
4286
4287 mutex_lock(&ls->ls_clear_proc_locks);
4288 if (list_empty(&proc->locks))
4289 goto out;
4290
4291 lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue);
4292 list_del_init(&lkb->lkb_ownqueue);
4293
4294 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
4295 lkb->lkb_flags |= DLM_IFL_ORPHAN;
4296 else
4297 lkb->lkb_flags |= DLM_IFL_DEAD;
4298 out:
4299 mutex_unlock(&ls->ls_clear_proc_locks);
4300 return lkb;
4301}
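del_proc_lock() exists purely for lock ordering, per the comment above: pop one entry while ls_clear_proc_locks is held, then process it with the mutex dropped, so unlock_proc_lock() can take lock_rsb() without deadlocking against the message-receive path. A userspace sketch of the pop-one-under-the-lock loop:

    #include <pthread.h>
    #include <stddef.h>

    struct lock_ent { struct lock_ent *next; };

    static pthread_mutex_t clear_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct lock_ent *proc_locks;

    static struct lock_ent *del_one(void)
    {
            struct lock_ent *e = NULL;

            pthread_mutex_lock(&clear_lock);
            if (proc_locks) {
                    e = proc_locks;
                    proc_locks = e->next;   /* list_del_init() in the real code */
            }
            pthread_mutex_unlock(&clear_lock);
            return e;
    }

    static void clear_all(void)
    {
            struct lock_ent *e;

            while ((e = del_one()) != NULL)
                    ; /* unlock/orphan e here, with clear_lock dropped */
    }

    int main(void)
    {
            struct lock_ent a = { NULL }, b = { &a };

            proc_locks = &b;
            clear_all();
            return proc_locks != NULL;
    }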
4302
3868/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which 4303/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
3869 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts, 4304 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
3870 which we clear here. */ 4305 which we clear here. */
@@ -3880,18 +4315,15 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3880 struct dlm_lkb *lkb, *safe; 4315 struct dlm_lkb *lkb, *safe;
3881 4316
3882 lock_recovery(ls); 4317 lock_recovery(ls);
3883 mutex_lock(&ls->ls_clear_proc_locks);
3884 4318
3885 list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) { 4319 while (1) {
3886 list_del_init(&lkb->lkb_ownqueue); 4320 lkb = del_proc_lock(ls, proc);
3887 4321 if (!lkb)
3888 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) { 4322 break;
3889 lkb->lkb_flags |= DLM_IFL_ORPHAN; 4323 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
3890 orphan_proc_lock(ls, lkb); 4324 orphan_proc_lock(ls, lkb);
3891 } else { 4325 else
3892 lkb->lkb_flags |= DLM_IFL_DEAD;
3893 unlock_proc_lock(ls, lkb); 4326 unlock_proc_lock(ls, lkb);
3894 }
3895 4327
3896 /* this removes the reference for the proc->locks list 4328 /* this removes the reference for the proc->locks list
3897 added by dlm_user_request, it may result in the lkb 4329 added by dlm_user_request, it may result in the lkb
@@ -3900,6 +4332,8 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3900 dlm_put_lkb(lkb); 4332 dlm_put_lkb(lkb);
3901 } 4333 }
3902 4334
4335 mutex_lock(&ls->ls_clear_proc_locks);
4336
3903 /* in-progress unlocks */ 4337 /* in-progress unlocks */
3904 list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { 4338 list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
3905 list_del_init(&lkb->lkb_ownqueue); 4339 list_del_init(&lkb->lkb_ownqueue);
@@ -3916,3 +4350,92 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3916 unlock_recovery(ls); 4350 unlock_recovery(ls);
3917} 4351}
3918 4352
4353static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4354{
4355 struct dlm_lkb *lkb, *safe;
4356
4357 while (1) {
4358 lkb = NULL;
4359 spin_lock(&proc->locks_spin);
4360 if (!list_empty(&proc->locks)) {
4361 lkb = list_entry(proc->locks.next, struct dlm_lkb,
4362 lkb_ownqueue);
4363 list_del_init(&lkb->lkb_ownqueue);
4364 }
4365 spin_unlock(&proc->locks_spin);
4366
4367 if (!lkb)
4368 break;
4369
4370 lkb->lkb_flags |= DLM_IFL_DEAD;
4371 unlock_proc_lock(ls, lkb);
4372 dlm_put_lkb(lkb); /* ref from proc->locks list */
4373 }
4374
4375 spin_lock(&proc->locks_spin);
4376 list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
4377 list_del_init(&lkb->lkb_ownqueue);
4378 lkb->lkb_flags |= DLM_IFL_DEAD;
4379 dlm_put_lkb(lkb);
4380 }
4381 spin_unlock(&proc->locks_spin);
4382
4383 spin_lock(&proc->asts_spin);
4384 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4385 list_del(&lkb->lkb_astqueue);
4386 dlm_put_lkb(lkb);
4387 }
4388 spin_unlock(&proc->asts_spin);
4389}
4390
4391/* pid of 0 means purge all orphans */
4392
4393static void do_purge(struct dlm_ls *ls, int nodeid, int pid)
4394{
4395 struct dlm_lkb *lkb, *safe;
4396
4397 mutex_lock(&ls->ls_orphans_mutex);
4398 list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) {
4399 if (pid && lkb->lkb_ownpid != pid)
4400 continue;
4401 unlock_proc_lock(ls, lkb);
4402 list_del_init(&lkb->lkb_ownqueue);
4403 dlm_put_lkb(lkb);
4404 }
4405 mutex_unlock(&ls->ls_orphans_mutex);
4406}
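do_purge() applies a simple filter over ls_orphans: a nonzero pid purges only that owner's orphans, and pid 0 (per the comment above) purges everything. A standalone sketch of the filter:

    #include <stdio.h>

    struct orphan { int ownpid; int dead; };

    /* pid == 0 means "purge all" */
    static void purge(struct orphan *o, int n, int pid)
    {
            int i;

            for (i = 0; i < n; i++) {
                    if (pid && o[i].ownpid != pid)
                            continue;       /* someone else's orphan: keep it */
                    o[i].dead = 1;          /* unlock + drop ref in the real code */
            }
    }

    int main(void)
    {
            struct orphan o[2] = { { 100, 0 }, { 200, 0 } };

            purge(o, 2, 100);
            printf("%d %d\n", o[0].dead, o[1].dead);   /* prints: 1 0 */
            return 0;
    }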
4407
4408static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
4409{
4410 struct dlm_message *ms;
4411 struct dlm_mhandle *mh;
4412 int error;
4413
4414 error = _create_message(ls, sizeof(struct dlm_message), nodeid,
4415 DLM_MSG_PURGE, &ms, &mh);
4416 if (error)
4417 return error;
4418 ms->m_nodeid = nodeid;
4419 ms->m_pid = pid;
4420
4421 return send_message(mh, ms);
4422}
4423
4424int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
4425 int nodeid, int pid)
4426{
4427 int error = 0;
4428
4429 if (nodeid != dlm_our_nodeid()) {
4430 error = send_purge(ls, nodeid, pid);
4431 } else {
4432 lock_recovery(ls);
4433 if (pid == current->pid)
4434 purge_proc_locks(ls, proc);
4435 else
4436 do_purge(ls, nodeid, pid);
4437 unlock_recovery(ls);
4438 }
4439 return error;
4440}
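dlm_user_purge() is just routing: a purge aimed at another node becomes a DLM_MSG_PURGE, a purge of the caller's own pid clears the live proc lists, and anything else falls through to the orphan sweep. A hedged sketch with stub helpers standing in for send_purge()/purge_proc_locks()/do_purge():

    /* hypothetical stand-ins for the three real helpers */
    static int send_purge_msg(int nodeid, int pid) { (void)nodeid; (void)pid; return 0; }
    static void purge_own_locks(void) { }
    static void purge_orphans(int pid) { (void)pid; }

    static int user_purge(int our_nodeid, int our_pid, int nodeid, int pid)
    {
            if (nodeid != our_nodeid)
                    return send_purge_msg(nodeid, pid);   /* remote: message */
            if (pid == our_pid)
                    purge_own_locks();     /* live locks of the calling proc */
            else
                    purge_orphans(pid);    /* orphans left by a dead proc */
            return 0;
    }

    int main(void)
    {
            return user_purge(1, 100, 1, 100);
    }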
4441
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 0843a3073ec3..64fc4ec40668 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -41,6 +41,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
41 uint32_t flags, uint32_t lkid, char *lvb_in); 41 uint32_t flags, uint32_t lkid, char *lvb_in);
42int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, 42int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
43 uint32_t flags, uint32_t lkid); 43 uint32_t flags, uint32_t lkid);
44int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
45 int nodeid, int pid);
44void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc); 46void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
45 47
46static inline int is_master(struct dlm_rsb *r) 48static inline int is_master(struct dlm_rsb *r)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f40817b53c6f..a677b2a5eed4 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -167,7 +167,6 @@ static struct kobj_type dlm_ktype = {
167}; 167};
168 168
169static struct kset dlm_kset = { 169static struct kset dlm_kset = {
170 .subsys = &kernel_subsys,
171 .kobj = {.name = "dlm",}, 170 .kobj = {.name = "dlm",},
172 .ktype = &dlm_ktype, 171 .ktype = &dlm_ktype,
173}; 172};
@@ -218,6 +217,7 @@ int dlm_lockspace_init(void)
218 INIT_LIST_HEAD(&lslist); 217 INIT_LIST_HEAD(&lslist);
219 spin_lock_init(&lslist_lock); 218 spin_lock_init(&lslist_lock);
220 219
220 kobj_set_kset_s(&dlm_kset, kernel_subsys);
221 error = kset_register(&dlm_kset); 221 error = kset_register(&dlm_kset);
222 if (error) 222 if (error)
223 printk("dlm_lockspace_init: cannot register kset %d\n", error); 223 printk("dlm_lockspace_init: cannot register kset %d\n", error);
@@ -459,6 +459,8 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
459 459
460 INIT_LIST_HEAD(&ls->ls_waiters); 460 INIT_LIST_HEAD(&ls->ls_waiters);
461 mutex_init(&ls->ls_waiters_mutex); 461 mutex_init(&ls->ls_waiters_mutex);
462 INIT_LIST_HEAD(&ls->ls_orphans);
463 mutex_init(&ls->ls_orphans_mutex);
462 464
463 INIT_LIST_HEAD(&ls->ls_nodes); 465 INIT_LIST_HEAD(&ls->ls_nodes);
464 INIT_LIST_HEAD(&ls->ls_nodes_gone); 466 INIT_LIST_HEAD(&ls->ls_nodes_gone);
diff --git a/fs/dlm/lowcomms-sctp.c b/fs/dlm/lowcomms-sctp.c
deleted file mode 100644
index dc83a9d979b5..000000000000
--- a/fs/dlm/lowcomms-sctp.c
+++ /dev/null
@@ -1,1210 +0,0 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * lowcomms.c
16 *
17 * This is the "low-level" comms layer.
18 *
19 * It is responsible for sending/receiving messages
20 * from other nodes in the cluster.
21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32 bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is its
25 * responsibility. It is this layer's
26 * responsibility to resolve these into IP addresses or
27 * whatever it needs for inter-node communication.
28 *
29 * The comms level is two kernel threads that deal mainly with
30 * the receiving of messages from other nodes and passing them
31 * up to the mid-level comms layer (which understands the
32 * message format) for execution by the locking core, and
33 * a send thread which does all the setting up of connections
34 * to remote nodes and the sending of data. Threads are not allowed
35 * to send their own data because it may cause them to wait in times
36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block.
38 *
39 * I don't see any problem with the recv thread executing the locking
40 * code on behalf of remote processes as the locking code is
41 * short, efficient and never (well, hardly ever) waits.
42 *
43 */
44
45#include <asm/ioctls.h>
46#include <net/sock.h>
47#include <net/tcp.h>
48#include <net/sctp/user.h>
49#include <linux/pagemap.h>
50#include <linux/socket.h>
51#include <linux/idr.h>
52
53#include "dlm_internal.h"
54#include "lowcomms.h"
55#include "config.h"
56#include "midcomms.h"
57
58static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
59static int dlm_local_count;
60static int dlm_local_nodeid;
61
62/* One of these per connected node */
63
64#define NI_INIT_PENDING 1
65#define NI_WRITE_PENDING 2
66
67struct nodeinfo {
68 spinlock_t lock;
69 sctp_assoc_t assoc_id;
70 unsigned long flags;
71 struct list_head write_list; /* nodes with pending writes */
72 struct list_head writequeue; /* outgoing writequeue_entries */
73 spinlock_t writequeue_lock;
74 int nodeid;
75 struct work_struct swork; /* Send workqueue */
76 struct work_struct lwork; /* Locking workqueue */
77};
78
79static DEFINE_IDR(nodeinfo_idr);
80static DECLARE_RWSEM(nodeinfo_lock);
81static int max_nodeid;
82
83struct cbuf {
84 unsigned int base;
85 unsigned int len;
86 unsigned int mask;
87};
88
89/* Just the one of these, now. But this struct keeps
90 the connection-specific variables together */
91
92#define CF_READ_PENDING 1
93
94struct connection {
95 struct socket *sock;
96 unsigned long flags;
97 struct page *rx_page;
98 atomic_t waiting_requests;
99 struct cbuf cb;
100 int eagain_flag;
101 struct work_struct work; /* Send workqueue */
102};
103
104/* An entry waiting to be sent */
105
106struct writequeue_entry {
107 struct list_head list;
108 struct page *page;
109 int offset;
110 int len;
111 int end;
112 int users;
113 struct nodeinfo *ni;
114};
115
116static void cbuf_add(struct cbuf *cb, int n)
117{
118 cb->len += n;
119}
120
121static int cbuf_data(struct cbuf *cb)
122{
123 return ((cb->base + cb->len) & cb->mask);
124}
125
126static void cbuf_init(struct cbuf *cb, int size)
127{
128 cb->base = cb->len = 0;
129 cb->mask = size-1;
130}
131
132static void cbuf_eat(struct cbuf *cb, int n)
133{
134 cb->len -= n;
135 cb->base += n;
136 cb->base &= cb->mask;
137}
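The four helpers above implement a power-of-two ring over the receive page: base is the first unconsumed byte, len the amount buffered, and mask wraps the index. A standalone check of the arithmetic (mirrors the helpers; size must be a power of two):

    #include <assert.h>

    struct cbuf { unsigned int base, len, mask; };

    static void cbuf_init(struct cbuf *cb, unsigned int size) /* size = 2^k */
    {
            cb->base = cb->len = 0;
            cb->mask = size - 1;
    }

    static unsigned int cbuf_data(struct cbuf *cb)  /* index of next free byte */
    {
            return (cb->base + cb->len) & cb->mask;
    }

    static void cbuf_add(struct cbuf *cb, unsigned int n) { cb->len += n; }

    static void cbuf_eat(struct cbuf *cb, unsigned int n)
    {
            cb->len -= n;
            cb->base = (cb->base + n) & cb->mask;
    }

    int main(void)
    {
            struct cbuf cb;

            cbuf_init(&cb, 16);
            cbuf_add(&cb, 14);     /* producer filled bytes 0..13 */
            cbuf_eat(&cb, 10);     /* consumer handled 10: base=10, len=4 */
            cbuf_add(&cb, 8);      /* len=12; free index wraps past the end */
            assert(cbuf_data(&cb) == ((10 + 12) & 15));   /* 22 & 15 == 6 */
            return 0;
    }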
138
139/* List of nodes which have writes pending */
140static LIST_HEAD(write_nodes);
141static DEFINE_SPINLOCK(write_nodes_lock);
142
143
144/* Maximum number of incoming messages to process before
145 * doing a schedule()
146 */
147#define MAX_RX_MSG_COUNT 25
148
149/* Work queues */
150static struct workqueue_struct *recv_workqueue;
151static struct workqueue_struct *send_workqueue;
152static struct workqueue_struct *lock_workqueue;
153
154/* The SCTP connection */
155static struct connection sctp_con;
156
157static void process_send_sockets(struct work_struct *work);
158static void process_recv_sockets(struct work_struct *work);
159static void process_lock_request(struct work_struct *work);
160
161static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
162{
163 struct sockaddr_storage addr;
164 int error;
165
166 if (!dlm_local_count)
167 return -1;
168
169 error = dlm_nodeid_to_addr(nodeid, &addr);
170 if (error)
171 return error;
172
173 if (dlm_local_addr[0]->ss_family == AF_INET) {
174 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
175 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
176 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
177 } else {
178 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
179 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
180 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
181 sizeof(in6->sin6_addr));
182 }
183
184 return 0;
185}
186
187/* If alloc is 0 here we will not attempt to allocate a new
188 nodeinfo struct */
189static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
190{
191 struct nodeinfo *ni;
192 int r;
193 int n;
194
195 down_read(&nodeinfo_lock);
196 ni = idr_find(&nodeinfo_idr, nodeid);
197 up_read(&nodeinfo_lock);
198
199 if (ni || !alloc)
200 return ni;
201
202 down_write(&nodeinfo_lock);
203
204 ni = idr_find(&nodeinfo_idr, nodeid);
205 if (ni)
206 goto out_up;
207
208 r = idr_pre_get(&nodeinfo_idr, alloc);
209 if (!r)
210 goto out_up;
211
212 ni = kmalloc(sizeof(struct nodeinfo), alloc);
213 if (!ni)
214 goto out_up;
215
216 r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
217 if (r) {
218 kfree(ni);
219 ni = NULL;
220 goto out_up;
221 }
222 if (n != nodeid) {
223 idr_remove(&nodeinfo_idr, n);
224 kfree(ni);
225 ni = NULL;
226 goto out_up;
227 }
228 memset(ni, 0, sizeof(struct nodeinfo));
229 spin_lock_init(&ni->lock);
230 INIT_LIST_HEAD(&ni->writequeue);
231 spin_lock_init(&ni->writequeue_lock);
232 INIT_WORK(&ni->lwork, process_lock_request);
233 INIT_WORK(&ni->swork, process_send_sockets);
234 ni->nodeid = nodeid;
235
236 if (nodeid > max_nodeid)
237 max_nodeid = nodeid;
238out_up:
239 up_write(&nodeinfo_lock);
240
241 return ni;
242}
243
244/* Don't call this too often... */
245static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
246{
247 int i;
248 struct nodeinfo *ni;
249
250 for (i=1; i<=max_nodeid; i++) {
251 ni = nodeid2nodeinfo(i, 0);
252 if (ni && ni->assoc_id == assoc)
253 return ni;
254 }
255 return NULL;
256}
257
258/* Data or notification available on socket */
259static void lowcomms_data_ready(struct sock *sk, int count_unused)
260{
261 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
262 queue_work(recv_workqueue, &sctp_con.work);
263}
264
265
266/* Add the port number to an IP6 or 4 sockaddr and return the address length.
267 Also pad out the struct with zeros to make comparisons meaningful */
268
269static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
270 int *addr_len)
271{
272 struct sockaddr_in *local4_addr;
273 struct sockaddr_in6 *local6_addr;
274
275 if (!dlm_local_count)
276 return;
277
278 if (!port) {
279 if (dlm_local_addr[0]->ss_family == AF_INET) {
280 local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
281 port = be16_to_cpu(local4_addr->sin_port);
282 } else {
283 local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
284 port = be16_to_cpu(local6_addr->sin6_port);
285 }
286 }
287
288 saddr->ss_family = dlm_local_addr[0]->ss_family;
289 if (dlm_local_addr[0]->ss_family == AF_INET) {
290 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
291 in4_addr->sin_port = cpu_to_be16(port);
292 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
293 memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
294 sizeof(struct sockaddr_in));
295 *addr_len = sizeof(struct sockaddr_in);
296 } else {
297 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
298 in6_addr->sin6_port = cpu_to_be16(port);
299 memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
300 sizeof(struct sockaddr_in6));
301 *addr_len = sizeof(struct sockaddr_in6);
302 }
303}
304
305/* Close the connection and tidy up */
306static void close_connection(void)
307{
308 if (sctp_con.sock) {
309 sock_release(sctp_con.sock);
310 sctp_con.sock = NULL;
311 }
312
313 if (sctp_con.rx_page) {
314 __free_page(sctp_con.rx_page);
315 sctp_con.rx_page = NULL;
316 }
317}
318
319/* We only send shutdown messages to nodes that are not part of the cluster */
320static void send_shutdown(sctp_assoc_t associd)
321{
322 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
323 struct msghdr outmessage;
324 struct cmsghdr *cmsg;
325 struct sctp_sndrcvinfo *sinfo;
326 int ret;
327
328 outmessage.msg_name = NULL;
329 outmessage.msg_namelen = 0;
330 outmessage.msg_control = outcmsg;
331 outmessage.msg_controllen = sizeof(outcmsg);
332 outmessage.msg_flags = MSG_EOR;
333
334 cmsg = CMSG_FIRSTHDR(&outmessage);
335 cmsg->cmsg_level = IPPROTO_SCTP;
336 cmsg->cmsg_type = SCTP_SNDRCV;
337 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
338 outmessage.msg_controllen = cmsg->cmsg_len;
339 sinfo = CMSG_DATA(cmsg);
340 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
341
342 sinfo->sinfo_flags |= MSG_EOF;
343 sinfo->sinfo_assoc_id = associd;
344
345 ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
346
347 if (ret != 0)
348 log_print("send EOF to node failed: %d", ret);
349}
350
351
352/* INIT failed but we don't know which node...
353 restart INIT on all pending nodes */
354static void init_failed(void)
355{
356 int i;
357 struct nodeinfo *ni;
358
359 for (i=1; i<=max_nodeid; i++) {
360 ni = nodeid2nodeinfo(i, 0);
361 if (!ni)
362 continue;
363
364 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
365 ni->assoc_id = 0;
366 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
367 spin_lock_bh(&write_nodes_lock);
368 list_add_tail(&ni->write_list, &write_nodes);
369 spin_unlock_bh(&write_nodes_lock);
370 queue_work(send_workqueue, &ni->swork);
371 }
372 }
373 }
374}
375
376/* Something happened to an association */
377static void process_sctp_notification(struct msghdr *msg, char *buf)
378{
379 union sctp_notification *sn = (union sctp_notification *)buf;
380
381 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
382 switch (sn->sn_assoc_change.sac_state) {
383
384 case SCTP_COMM_UP:
385 case SCTP_RESTART:
386 {
387 /* Check that the new node is in the lockspace */
388 struct sctp_prim prim;
389 mm_segment_t fs;
390 int nodeid;
391 int prim_len, ret;
392 int addr_len;
393 struct nodeinfo *ni;
394
395 /* This seems to happen when we received a connection
396 * too early... or something... anyway, it happens but
397 * we always seem to get a real message too, see
398 * receive_from_sock */
399
400 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
401 log_print("COMM_UP for invalid assoc ID %d",
402 (int)sn->sn_assoc_change.sac_assoc_id);
403 init_failed();
404 return;
405 }
406 memset(&prim, 0, sizeof(struct sctp_prim));
407 prim_len = sizeof(struct sctp_prim);
408 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
409
410 fs = get_fs();
411 set_fs(get_ds());
412 ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
413 IPPROTO_SCTP,
414 SCTP_PRIMARY_ADDR,
415 (char*)&prim,
416 &prim_len);
417 set_fs(fs);
418 if (ret < 0) {
419 struct nodeinfo *ni;
420
421 log_print("getsockopt/sctp_primary_addr on "
422 "new assoc %d failed : %d",
423 (int)sn->sn_assoc_change.sac_assoc_id,
424 ret);
425
426 /* Retry INIT later */
427 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
428 if (ni)
429 clear_bit(NI_INIT_PENDING, &ni->flags);
430 return;
431 }
432 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
433 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
434 log_print("reject connect from unknown addr");
435 send_shutdown(prim.ssp_assoc_id);
436 return;
437 }
438
439 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
440 if (!ni)
441 return;
442
443 /* Save the assoc ID */
444 ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
445
446 log_print("got new/restarted association %d nodeid %d",
447 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
448
449 /* Send any pending writes */
450 clear_bit(NI_INIT_PENDING, &ni->flags);
451 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
452 spin_lock_bh(&write_nodes_lock);
453 list_add_tail(&ni->write_list, &write_nodes);
454 spin_unlock_bh(&write_nodes_lock);
455 queue_work(send_workqueue, &ni->swork);
456 }
457 }
458 break;
459
460 case SCTP_COMM_LOST:
461 case SCTP_SHUTDOWN_COMP:
462 {
463 struct nodeinfo *ni;
464
465 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
466 if (ni) {
467 spin_lock(&ni->lock);
468 ni->assoc_id = 0;
469 spin_unlock(&ni->lock);
470 }
471 }
472 break;
473
474 /* We don't know which INIT failed, so clear the PENDING flags
475 * on them all. if assoc_id is zero then it will then try
476 * again */
477
478 case SCTP_CANT_STR_ASSOC:
479 {
480 log_print("Can't start SCTP association - retrying");
481 init_failed();
482 }
483 break;
484
485 default:
486 log_print("unexpected SCTP assoc change id=%d state=%d",
487 (int)sn->sn_assoc_change.sac_assoc_id,
488 sn->sn_assoc_change.sac_state);
489 }
490 }
491}
492
493/* Data received from remote end */
494static int receive_from_sock(void)
495{
496 int ret = 0;
497 struct msghdr msg;
498 struct kvec iov[2];
499 unsigned len;
500 int r;
501 struct sctp_sndrcvinfo *sinfo;
502 struct cmsghdr *cmsg;
503 struct nodeinfo *ni;
504
505 /* These two are marginally too big for stack allocation, but this
506 * function is (currently) only called by dlm_recvd so static should be
507 * OK.
508 */
509 static struct sockaddr_storage msgname;
510 static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
511
512 if (sctp_con.sock == NULL)
513 goto out;
514
515 if (sctp_con.rx_page == NULL) {
516 /*
517 * This doesn't need to be atomic, but I think it should
518 * improve performance if it is.
519 */
520 sctp_con.rx_page = alloc_page(GFP_ATOMIC);
521 if (sctp_con.rx_page == NULL)
522 goto out_resched;
523 cbuf_init(&sctp_con.cb, PAGE_CACHE_SIZE);
524 }
525
526 memset(&incmsg, 0, sizeof(incmsg));
527 memset(&msgname, 0, sizeof(msgname));
528
529 msg.msg_name = &msgname;
530 msg.msg_namelen = sizeof(msgname);
531 msg.msg_flags = 0;
532 msg.msg_control = incmsg;
533 msg.msg_controllen = sizeof(incmsg);
534 msg.msg_iovlen = 1;
535
536 /* I don't see why this circular buffer stuff is necessary for SCTP
537 * which is a packet-based protocol, but the whole thing breaks under
538 * load without it! The overhead is minimal (and is in the TCP lowcomms
539 * anyway, of course) so I'll leave it in until I can figure out what's
540 * really happening.
541 */
542
543 /*
544 * iov[0] is the bit of the circular buffer between the current end
545 * point (cb.base + cb.len) and the end of the buffer.
546 */
547 iov[0].iov_len = sctp_con.cb.base - cbuf_data(&sctp_con.cb);
548 iov[0].iov_base = page_address(sctp_con.rx_page) +
549 cbuf_data(&sctp_con.cb);
550 iov[1].iov_len = 0;
551
552 /*
553 * iov[1] is the bit of the circular buffer between the start of the
554 * buffer and the start of the currently used section (cb.base)
555 */
556 if (cbuf_data(&sctp_con.cb) >= sctp_con.cb.base) {
557 iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&sctp_con.cb);
558 iov[1].iov_len = sctp_con.cb.base;
559 iov[1].iov_base = page_address(sctp_con.rx_page);
560 msg.msg_iovlen = 2;
561 }
562 len = iov[0].iov_len + iov[1].iov_len;
563
564 r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len,
565 MSG_NOSIGNAL | MSG_DONTWAIT);
566 if (ret <= 0)
567 goto out_close;
568
569 msg.msg_control = incmsg;
570 msg.msg_controllen = sizeof(incmsg);
571 cmsg = CMSG_FIRSTHDR(&msg);
572 sinfo = CMSG_DATA(cmsg);
573
574 if (msg.msg_flags & MSG_NOTIFICATION) {
575 process_sctp_notification(&msg, page_address(sctp_con.rx_page));
576 return 0;
577 }
578
579 /* Is this a new association ? */
580 ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
581 if (ni) {
582 ni->assoc_id = sinfo->sinfo_assoc_id;
583 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
584
585 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
586 spin_lock_bh(&write_nodes_lock);
587 list_add_tail(&ni->write_list, &write_nodes);
588 spin_unlock_bh(&write_nodes_lock);
589 queue_work(send_workqueue, &ni->swork);
590 }
591 }
592 }
593
594 /* INIT sends a message with length of 1 - ignore it */
595 if (r == 1)
596 return 0;
597
598 cbuf_add(&sctp_con.cb, ret);
599 // PJC: TODO: Add to node's workqueue....can we ??
600 ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
601 page_address(sctp_con.rx_page),
602 sctp_con.cb.base, sctp_con.cb.len,
603 PAGE_CACHE_SIZE);
604 if (ret < 0)
605 goto out_close;
606 cbuf_eat(&sctp_con.cb, ret);
607
608out:
609 ret = 0;
610 goto out_ret;
611
612out_resched:
613 lowcomms_data_ready(sctp_con.sock->sk, 0);
614 ret = 0;
615 cond_resched();
616 goto out_ret;
617
618out_close:
619 if (ret != -EAGAIN)
620 log_print("error reading from sctp socket: %d", ret);
621out_ret:
622 return ret;
623}
624
625/* Bind to an IP address. SCTP allows multiple addresses so it can do multi-homing */
626static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
627{
628 mm_segment_t fs;
629 int result = 0;
630
631 fs = get_fs();
632 set_fs(get_ds());
633 if (num == 1)
634 result = sctp_con.sock->ops->bind(sctp_con.sock,
635 (struct sockaddr *) addr,
636 addr_len);
637 else
638 result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
639 SCTP_SOCKOPT_BINDX_ADD,
640 (char *)addr, addr_len);
641 set_fs(fs);
642
643 if (result < 0)
644 log_print("Can't bind to port %d addr number %d",
645 dlm_config.ci_tcp_port, num);
646
647 return result;
648}
649
650static void init_local(void)
651{
652 struct sockaddr_storage sas, *addr;
653 int i;
654
655 dlm_local_nodeid = dlm_our_nodeid();
656
657 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
658 if (dlm_our_addr(&sas, i))
659 break;
660
661 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
662 if (!addr)
663 break;
664 memcpy(addr, &sas, sizeof(*addr));
665 dlm_local_addr[dlm_local_count++] = addr;
666 }
667}
668
669/* Initialise SCTP socket and bind to all interfaces */
670static int init_sock(void)
671{
672 mm_segment_t fs;
673 struct socket *sock = NULL;
674 struct sockaddr_storage localaddr;
675 struct sctp_event_subscribe subscribe;
676 int result = -EINVAL, num = 1, i, addr_len;
677
678 if (!dlm_local_count) {
679 init_local();
680 if (!dlm_local_count) {
681 log_print("no local IP address has been set");
682 goto out;
683 }
684 }
685
686 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
687 IPPROTO_SCTP, &sock);
688 if (result < 0) {
689 log_print("Can't create comms socket, check SCTP is loaded");
690 goto out;
691 }
692
693 /* Listen for events */
694 memset(&subscribe, 0, sizeof(subscribe));
695 subscribe.sctp_data_io_event = 1;
696 subscribe.sctp_association_event = 1;
697 subscribe.sctp_send_failure_event = 1;
698 subscribe.sctp_shutdown_event = 1;
699 subscribe.sctp_partial_delivery_event = 1;
700
701 fs = get_fs();
702 set_fs(get_ds());
703 result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
704 (char *)&subscribe, sizeof(subscribe));
705 set_fs(fs);
706
707 if (result < 0) {
708 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
709 result);
710 goto create_delsock;
711 }
712
713 /* Init con struct */
714 sock->sk->sk_user_data = &sctp_con;
715 sctp_con.sock = sock;
716 sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
717
718 /* Bind to all interfaces. */
719 for (i = 0; i < dlm_local_count; i++) {
720 memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
721 make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
722
723 result = add_bind_addr(&localaddr, addr_len, num);
724 if (result)
725 goto create_delsock;
726 ++num;
727 }
728
729 result = sock->ops->listen(sock, 5);
730 if (result < 0) {
731 log_print("Can't set socket listening");
732 goto create_delsock;
733 }
734
735 return 0;
736
737create_delsock:
738 sock_release(sock);
739 sctp_con.sock = NULL;
740out:
741 return result;
742}
743
744
745static struct writequeue_entry *new_writequeue_entry(gfp_t allocation)
746{
747 struct writequeue_entry *entry;
748
749 entry = kmalloc(sizeof(struct writequeue_entry), allocation);
750 if (!entry)
751 return NULL;
752
753 entry->page = alloc_page(allocation);
754 if (!entry->page) {
755 kfree(entry);
756 return NULL;
757 }
758
759 entry->offset = 0;
760 entry->len = 0;
761 entry->end = 0;
762 entry->users = 0;
763
764 return entry;
765}
766
767void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
768{
769 struct writequeue_entry *e;
770 int offset = 0;
771 int users = 0;
772 struct nodeinfo *ni;
773
774 ni = nodeid2nodeinfo(nodeid, allocation);
775 if (!ni)
776 return NULL;
777
778 spin_lock(&ni->writequeue_lock);
779 e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
780 if ((&e->list == &ni->writequeue) ||
781 (PAGE_CACHE_SIZE - e->end < len)) {
782 e = NULL;
783 } else {
784 offset = e->end;
785 e->end += len;
786 users = e->users++;
787 }
788 spin_unlock(&ni->writequeue_lock);
789
790 if (e) {
791 got_one:
792 if (users == 0)
793 kmap(e->page);
794 *ppc = page_address(e->page) + offset;
795 return e;
796 }
797
798 e = new_writequeue_entry(allocation);
799 if (e) {
800 spin_lock(&ni->writequeue_lock);
801 offset = e->end;
802 e->end += len;
803 e->ni = ni;
804 users = e->users++;
805 list_add_tail(&e->list, &ni->writequeue);
806 spin_unlock(&ni->writequeue_lock);
807 goto got_one;
808 }
809 return NULL;
810}
811
812void dlm_lowcomms_commit_buffer(void *arg)
813{
814 struct writequeue_entry *e = (struct writequeue_entry *) arg;
815 int users;
816 struct nodeinfo *ni = e->ni;
817
818 spin_lock(&ni->writequeue_lock);
819 users = --e->users;
820 if (users)
821 goto out;
822 e->len = e->end - e->offset;
823 kunmap(e->page);
824 spin_unlock(&ni->writequeue_lock);
825
826 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
827 spin_lock_bh(&write_nodes_lock);
828 list_add_tail(&ni->write_list, &write_nodes);
829 spin_unlock_bh(&write_nodes_lock);
830
831 queue_work(send_workqueue, &ni->swork);
832 }
833 return;
834
835out:
836 spin_unlock(&ni->writequeue_lock);
837 return;
838}
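dlm_lowcomms_get_buffer()/dlm_lowcomms_commit_buffer() above form a two-phase reservation: get claims [end, end+len) in the tail page and bumps users; commit drops users, and only the last committer publishes the accumulated length and schedules the send, so concurrent writers can fill one page without blocking each other. A userspace sketch of the protocol (single entry, hypothetical names):

    #include <pthread.h>

    struct wq_entry {
            int end;        /* bytes reserved so far */
            int len;        /* bytes committed and sendable */
            int users;      /* writers between get and commit */
            pthread_mutex_t lock;
    };

    static int get_buffer(struct wq_entry *e, int len)
    {
            int offset;

            pthread_mutex_lock(&e->lock);
            offset = e->end;       /* claim [offset, offset + len) */
            e->end += len;
            e->users++;
            pthread_mutex_unlock(&e->lock);
            return offset;         /* caller fills its span unlocked */
    }

    static int commit_buffer(struct wq_entry *e)
    {
            int sendable = 0;

            pthread_mutex_lock(&e->lock);
            if (--e->users == 0) {
                    e->len = e->end;   /* everything reserved is now valid */
                    sendable = 1;      /* real code queues the send work here */
            }
            pthread_mutex_unlock(&e->lock);
            return sendable;
    }

    int main(void)
    {
            struct wq_entry e = { 0, 0, 0, PTHREAD_MUTEX_INITIALIZER };

            get_buffer(&e, 64);
            return commit_buffer(&e) ? 0 : 1;
    }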
839
840static void free_entry(struct writequeue_entry *e)
841{
842 __free_page(e->page);
843 kfree(e);
844}
845
846/* Initiate an SCTP association. In theory we could just use sendmsg() on
847 the first IP address and it should work, but this allows us to set up the
848 association before sending any valuable data that we can't afford to lose.
849 It also keeps the send path clean as it can now always use the association ID */
850static void initiate_association(int nodeid)
851{
852 struct sockaddr_storage rem_addr;
853 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
854 struct msghdr outmessage;
855 struct cmsghdr *cmsg;
856 struct sctp_sndrcvinfo *sinfo;
857 int ret;
858 int addrlen;
859 char buf[1];
860 struct kvec iov[1];
861 struct nodeinfo *ni;
862
863 log_print("Initiating association with node %d", nodeid);
864
865 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
866 if (!ni)
867 return;
868
869 if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
870 log_print("no address for nodeid %d", nodeid);
871 return;
872 }
873
874 make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
875
876 outmessage.msg_name = &rem_addr;
877 outmessage.msg_namelen = addrlen;
878 outmessage.msg_control = outcmsg;
879 outmessage.msg_controllen = sizeof(outcmsg);
880 outmessage.msg_flags = MSG_EOR;
881
882 iov[0].iov_base = buf;
883 iov[0].iov_len = 1;
884
885 /* Real INIT messages seem to cause trouble. Just send a 1 byte message
886 we can afford to lose */
887 cmsg = CMSG_FIRSTHDR(&outmessage);
888 cmsg->cmsg_level = IPPROTO_SCTP;
889 cmsg->cmsg_type = SCTP_SNDRCV;
890 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
891 sinfo = CMSG_DATA(cmsg);
892 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
893 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
894
895 outmessage.msg_controllen = cmsg->cmsg_len;
896 ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
897 if (ret < 0) {
898 log_print("send INIT to node failed: %d", ret);
899 /* Try again later */
900 clear_bit(NI_INIT_PENDING, &ni->flags);
901 }
902}
903
904/* Send a message */
905static void send_to_sock(struct nodeinfo *ni)
906{
907 int ret = 0;
908 struct writequeue_entry *e;
909 int len, offset;
910 struct msghdr outmsg;
911 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
912 struct cmsghdr *cmsg;
913 struct sctp_sndrcvinfo *sinfo;
914 struct kvec iov;
915
916 /* See if we need to init an association before we start
917 sending precious messages */
918 spin_lock(&ni->lock);
919 if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
920 spin_unlock(&ni->lock);
921 initiate_association(ni->nodeid);
922 return;
923 }
924 spin_unlock(&ni->lock);
925
926 outmsg.msg_name = NULL; /* We use assoc_id */
927 outmsg.msg_namelen = 0;
928 outmsg.msg_control = outcmsg;
929 outmsg.msg_controllen = sizeof(outcmsg);
930 outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
931
932 cmsg = CMSG_FIRSTHDR(&outmsg);
933 cmsg->cmsg_level = IPPROTO_SCTP;
934 cmsg->cmsg_type = SCTP_SNDRCV;
935 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
936 sinfo = CMSG_DATA(cmsg);
937 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
938 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
939 sinfo->sinfo_assoc_id = ni->assoc_id;
940 outmsg.msg_controllen = cmsg->cmsg_len;
941
942 spin_lock(&ni->writequeue_lock);
943 for (;;) {
944 if (list_empty(&ni->writequeue))
945 break;
946 e = list_entry(ni->writequeue.next, struct writequeue_entry,
947 list);
948 len = e->len;
949 offset = e->offset;
950 BUG_ON(len == 0 && e->users == 0);
951 spin_unlock(&ni->writequeue_lock);
952 kmap(e->page);
953
954 ret = 0;
955 if (len) {
956 iov.iov_base = page_address(e->page)+offset;
957 iov.iov_len = len;
958
959 ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
960 len);
961 if (ret == -EAGAIN) {
962 sctp_con.eagain_flag = 1;
963 goto out;
964 } else if (ret < 0)
965 goto send_error;
966 } else {
967 /* Don't starve people filling buffers */
968 cond_resched();
969 }
970
971 spin_lock(&ni->writequeue_lock);
972 e->offset += ret;
973 e->len -= ret;
974
975 if (e->len == 0 && e->users == 0) {
976 list_del(&e->list);
977 kunmap(e->page);
978 free_entry(e);
979 continue;
980 }
981 }
982 spin_unlock(&ni->writequeue_lock);
983out:
984 return;
985
986send_error:
987 log_print("Error sending to node %d %d", ni->nodeid, ret);
988 spin_lock(&ni->lock);
989 if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
990 ni->assoc_id = 0;
991 spin_unlock(&ni->lock);
992 initiate_association(ni->nodeid);
993 } else
994 spin_unlock(&ni->lock);
995
996 return;
997}
998
999/* Try to send any messages that are pending */
1000static void process_output_queue(void)
1001{
1002 struct list_head *list;
1003 struct list_head *temp;
1004
1005 spin_lock_bh(&write_nodes_lock);
1006 list_for_each_safe(list, temp, &write_nodes) {
1007 struct nodeinfo *ni =
1008 list_entry(list, struct nodeinfo, write_list);
1009 clear_bit(NI_WRITE_PENDING, &ni->flags);
1010 list_del(&ni->write_list);
1011
1012 spin_unlock_bh(&write_nodes_lock);
1013
1014 send_to_sock(ni);
1015 spin_lock_bh(&write_nodes_lock);
1016 }
1017 spin_unlock_bh(&write_nodes_lock);
1018}
1019
1020/* Called after we've had -EAGAIN and been woken up */
1021static void refill_write_queue(void)
1022{
1023 int i;
1024
1025 for (i=1; i<=max_nodeid; i++) {
1026 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1027
1028 if (ni) {
1029 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
1030 spin_lock_bh(&write_nodes_lock);
1031 list_add_tail(&ni->write_list, &write_nodes);
1032 spin_unlock_bh(&write_nodes_lock);
1033 }
1034 }
1035 }
1036}
1037
1038static void clean_one_writequeue(struct nodeinfo *ni)
1039{
1040 struct list_head *list;
1041 struct list_head *temp;
1042
1043 spin_lock(&ni->writequeue_lock);
1044 list_for_each_safe(list, temp, &ni->writequeue) {
1045 struct writequeue_entry *e =
1046 list_entry(list, struct writequeue_entry, list);
1047 list_del(&e->list);
1048 free_entry(e);
1049 }
1050 spin_unlock(&ni->writequeue_lock);
1051}
1052
1053static void clean_writequeues(void)
1054{
1055 int i;
1056
1057 for (i=1; i<=max_nodeid; i++) {
1058 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1059 if (ni)
1060 clean_one_writequeue(ni);
1061 }
1062}
1063
1064
1065static void dealloc_nodeinfo(void)
1066{
1067 int i;
1068
1069 for (i=1; i<=max_nodeid; i++) {
1070 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1071 if (ni) {
1072 idr_remove(&nodeinfo_idr, i);
1073 kfree(ni);
1074 }
1075 }
1076}
1077
1078int dlm_lowcomms_close(int nodeid)
1079{
1080 struct nodeinfo *ni;
1081
1082 ni = nodeid2nodeinfo(nodeid, 0);
1083 if (!ni)
1084 return -1;
1085
1086 spin_lock(&ni->lock);
1087 if (ni->assoc_id) {
1088 ni->assoc_id = 0;
1089 /* Don't send shutdown here, sctp will just queue it
1090 till the node comes back up! */
1091 }
1092 spin_unlock(&ni->lock);
1093
1094 clean_one_writequeue(ni);
1095 clear_bit(NI_INIT_PENDING, &ni->flags);
1096 return 0;
1097}
1098
1099// PJC: The work queue function for receiving.
1100static void process_recv_sockets(struct work_struct *work)
1101{
1102 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1103 int ret;
1104 int count = 0;
1105
1106 do {
1107 ret = receive_from_sock();
1108
1109 /* Don't starve out everyone else */
1110 if (++count >= MAX_RX_MSG_COUNT) {
1111 cond_resched();
1112 count = 0;
1113 }
1114 } while (!kthread_should_stop() && ret >=0);
1115 }
1116 cond_resched();
1117}
1118
1119// PJC: the work queue function for sending
1120static void process_send_sockets(struct work_struct *work)
1121{
1122 if (sctp_con.eagain_flag) {
1123 sctp_con.eagain_flag = 0;
1124 refill_write_queue();
1125 }
1126 process_output_queue();
1127}
1128
1129// PJC: Process lock requests from a particular node.
1130// TODO: can we optimise this out on UP ??
1131static void process_lock_request(struct work_struct *work)
1132{
1133}
1134
1135static void daemons_stop(void)
1136{
1137 destroy_workqueue(recv_workqueue);
1138 destroy_workqueue(send_workqueue);
1139 destroy_workqueue(lock_workqueue);
1140}
1141
1142static int daemons_start(void)
1143{
1144 int error;
1145 recv_workqueue = create_workqueue("dlm_recv");
1146 error = IS_ERR(recv_workqueue);
1147 if (error) {
1148 log_print("can't start dlm_recv %d", error);
1149 return error;
1150 }
1151
1152 send_workqueue = create_singlethread_workqueue("dlm_send");
1153 error = IS_ERR(send_workqueue);
1154 if (error) {
1155 log_print("can't start dlm_send %d", error);
1156 destroy_workqueue(recv_workqueue);
1157 return error;
1158 }
1159
1160 lock_workqueue = create_workqueue("dlm_rlock");
1161 error = IS_ERR(lock_workqueue);
1162 if (error) {
1163 log_print("can't start dlm_rlock %d", error);
1164 destroy_workqueue(send_workqueue);
1165 destroy_workqueue(recv_workqueue);
1166 return error;
1167 }
1168
1169 return 0;
1170}
1171
1172/*
1173 * This is quite likely to sleep...
1174 */
1175int dlm_lowcomms_start(void)
1176{
1177 int error;
1178
1179 INIT_WORK(&sctp_con.work, process_recv_sockets);
1180
1181 error = init_sock();
1182 if (error)
1183 goto fail_sock;
1184 error = daemons_start();
1185 if (error)
1186 goto fail_sock;
1187 return 0;
1188
1189fail_sock:
1190 close_connection();
1191 return error;
1192}
1193
1194void dlm_lowcomms_stop(void)
1195{
1196 int i;
1197
1198 sctp_con.flags = 0x7;
1199 daemons_stop();
1200 clean_writequeues();
1201 close_connection();
1202 dealloc_nodeinfo();
1203 max_nodeid = 0;
1204
1205 dlm_local_count = 0;
1206 dlm_local_nodeid = 0;
1207
1208 for (i = 0; i < dlm_local_count; i++)
1209 kfree(dlm_local_addr[i]);
1210}
diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms.c
index 07e0a122c32f..27970a58d29b 100644
--- a/fs/dlm/lowcomms-tcp.c
+++ b/fs/dlm/lowcomms.c
@@ -36,30 +36,36 @@
36 * of high load. Also, this way, the sending thread can collect together 36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block. 37 * messages bound for one node and send them in one block.
38 * 38 *
39 * I don't see any problem with the recv thread executing the locking 39 * lowcomms will choose to use either TCP or SCTP as its transport layer
40 * code on behalf of remote processes as the locking code is 40 * depending on the configuration variable 'protocol'. This should be set
41 * short, efficient and never waits. 41 * to 0 (default) for TCP or 1 for SCTP. It should be configured using a
42 * cluster-wide mechanism as it must be the same on all nodes of the cluster
43 * for the DLM to function.
42 * 44 *
43 */ 45 */
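Since both transports now share this file, the choice is made once from the cluster-wide 'protocol' value and then dispatched through per-connection function pointers (the connect_action/rx_action fields added below). A minimal sketch of that dispatch, with hypothetical names; the real code stores the pointer in con->connect_action:

    enum { DLM_PROTO_TCP = 0, DLM_PROTO_SCTP = 1 };

    struct connection;     /* opaque here; see the struct below */

    static void tcp_connect(struct connection *con)  { (void)con; /* ... */ }
    static void sctp_connect(struct connection *con) { (void)con; /* ... */ }

    /* pick the connect action once, from the cluster-wide knob */
    static void (*pick_connect(int protocol))(struct connection *)
    {
            return protocol == DLM_PROTO_SCTP ? sctp_connect : tcp_connect;
    }

    int main(void)
    {
            return pick_connect(DLM_PROTO_TCP) == tcp_connect ? 0 : 1;
    }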
44 46
45
46#include <asm/ioctls.h> 47#include <asm/ioctls.h>
47#include <net/sock.h> 48#include <net/sock.h>
48#include <net/tcp.h> 49#include <net/tcp.h>
49#include <linux/pagemap.h> 50#include <linux/pagemap.h>
51#include <linux/idr.h>
52#include <linux/file.h>
53#include <linux/sctp.h>
54#include <net/sctp/user.h>
50 55
51#include "dlm_internal.h" 56#include "dlm_internal.h"
52#include "lowcomms.h" 57#include "lowcomms.h"
53#include "midcomms.h" 58#include "midcomms.h"
54#include "config.h" 59#include "config.h"
55 60
61#define NEEDED_RMEM (4*1024*1024)
62
56struct cbuf { 63struct cbuf {
57 unsigned int base; 64 unsigned int base;
58 unsigned int len; 65 unsigned int len;
59 unsigned int mask; 66 unsigned int mask;
60}; 67};
61 68
62#define NODE_INCREMENT 32
63static void cbuf_add(struct cbuf *cb, int n) 69static void cbuf_add(struct cbuf *cb, int n)
64{ 70{
65 cb->len += n; 71 cb->len += n;
@@ -88,28 +94,25 @@ static bool cbuf_empty(struct cbuf *cb)
88 return cb->len == 0; 94 return cb->len == 0;
89} 95}
90 96
91/* Maximum number of incoming messages to process before
92 doing a cond_resched()
93*/
94#define MAX_RX_MSG_COUNT 25
95
96struct connection { 97struct connection {
97 struct socket *sock; /* NULL if not connected */ 98 struct socket *sock; /* NULL if not connected */
98 uint32_t nodeid; /* So we know who we are in the list */ 99 uint32_t nodeid; /* So we know who we are in the list */
99 struct mutex sock_mutex; 100 struct mutex sock_mutex;
100 unsigned long flags; /* bit 1,2 = We are on the read/write lists */ 101 unsigned long flags;
101#define CF_READ_PENDING 1 102#define CF_READ_PENDING 1
102#define CF_WRITE_PENDING 2 103#define CF_WRITE_PENDING 2
103#define CF_CONNECT_PENDING 3 104#define CF_CONNECT_PENDING 3
104#define CF_IS_OTHERCON 4 105#define CF_INIT_PENDING 4
106#define CF_IS_OTHERCON 5
105 struct list_head writequeue; /* List of outgoing writequeue_entries */ 107 struct list_head writequeue; /* List of outgoing writequeue_entries */
106 struct list_head listenlist; /* List of allocated listening sockets */
107 spinlock_t writequeue_lock; 108 spinlock_t writequeue_lock;
108 int (*rx_action) (struct connection *); /* What to do when active */ 109 int (*rx_action) (struct connection *); /* What to do when active */
110 void (*connect_action) (struct connection *); /* What to do to connect */
109 struct page *rx_page; 111 struct page *rx_page;
110 struct cbuf cb; 112 struct cbuf cb;
111 int retries; 113 int retries;
112#define MAX_CONNECT_RETRIES 3 114#define MAX_CONNECT_RETRIES 3
115 int sctp_assoc;
113 struct connection *othercon; 116 struct connection *othercon;
114 struct work_struct rwork; /* Receive workqueue */ 117 struct work_struct rwork; /* Receive workqueue */
115 struct work_struct swork; /* Send workqueue */ 118 struct work_struct swork; /* Send workqueue */
@@ -127,68 +130,136 @@ struct writequeue_entry {
127 struct connection *con; 130 struct connection *con;
128}; 131};
129 132
130static struct sockaddr_storage dlm_local_addr; 133static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
134static int dlm_local_count;
131 135
132/* Work queues */ 136/* Work queues */
133static struct workqueue_struct *recv_workqueue; 137static struct workqueue_struct *recv_workqueue;
134static struct workqueue_struct *send_workqueue; 138static struct workqueue_struct *send_workqueue;
135 139
136/* An array of pointers to connections, indexed by NODEID */ 140static DEFINE_IDR(connections_idr);
137static struct connection **connections;
138static DECLARE_MUTEX(connections_lock); 141static DECLARE_MUTEX(connections_lock);
142static int max_nodeid;
139static struct kmem_cache *con_cache; 143static struct kmem_cache *con_cache;
140static int conn_array_size;
141 144
142static void process_recv_sockets(struct work_struct *work); 145static void process_recv_sockets(struct work_struct *work);
143static void process_send_sockets(struct work_struct *work); 146static void process_send_sockets(struct work_struct *work);
144 147
145static struct connection *nodeid2con(int nodeid, gfp_t allocation) 148/*
149 * If 'allocation' is zero then we don't attempt to create a new
150 * connection structure for this node.
151 */
152static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
146{ 153{
147 struct connection *con = NULL; 154 struct connection *con = NULL;
155 int r;
156 int n;
148 157
149 down(&connections_lock); 158 con = idr_find(&connections_idr, nodeid);
150 if (nodeid >= conn_array_size) { 159 if (con || !alloc)
151 int new_size = nodeid + NODE_INCREMENT; 160 return con;
152 struct connection **new_conns;
153 161
154 new_conns = kzalloc(sizeof(struct connection *) * 162 r = idr_pre_get(&connections_idr, alloc);
155 new_size, allocation); 163 if (!r)
156 if (!new_conns) 164 return NULL;
157 goto finish; 165
166 con = kmem_cache_zalloc(con_cache, alloc);
167 if (!con)
168 return NULL;
158 169
159 memcpy(new_conns, connections, sizeof(struct connection *) * conn_array_size); 170 r = idr_get_new_above(&connections_idr, con, nodeid, &n);
160 conn_array_size = new_size; 171 if (r) {
161 kfree(connections); 172 kmem_cache_free(con_cache, con);
162 connections = new_conns; 173 return NULL;
174 }
163 175
176 if (n != nodeid) {
177 idr_remove(&connections_idr, n);
178 kmem_cache_free(con_cache, con);
179 return NULL;
164 } 180 }
165 181
166 con = connections[nodeid]; 182 con->nodeid = nodeid;
167 if (con == NULL && allocation) { 183 mutex_init(&con->sock_mutex);
168 con = kmem_cache_zalloc(con_cache, allocation); 184 INIT_LIST_HEAD(&con->writequeue);
169 if (!con) 185 spin_lock_init(&con->writequeue_lock);
170 goto finish; 186 INIT_WORK(&con->swork, process_send_sockets);
187 INIT_WORK(&con->rwork, process_recv_sockets);
171 188
172 con->nodeid = nodeid; 189 /* Setup action pointers for child sockets */
173 mutex_init(&con->sock_mutex); 190 if (con->nodeid) {
174 INIT_LIST_HEAD(&con->writequeue); 191 struct connection *zerocon = idr_find(&connections_idr, 0);
175 spin_lock_init(&con->writequeue_lock);
176 INIT_WORK(&con->swork, process_send_sockets);
177 INIT_WORK(&con->rwork, process_recv_sockets);
178 192
179 connections[nodeid] = con; 193 con->connect_action = zerocon->connect_action;
194 if (!con->rx_action)
195 con->rx_action = zerocon->rx_action;
180 } 196 }
181 197
182finish: 198 if (nodeid > max_nodeid)
199 max_nodeid = nodeid;
200
201 return con;
202}
203
204static struct connection *nodeid2con(int nodeid, gfp_t allocation)
205{
206 struct connection *con;
207
208 down(&connections_lock);
209 con = __nodeid2con(nodeid, allocation);
183 up(&connections_lock); 210 up(&connections_lock);
211
184 return con; 212 return con;
185} 213}
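__nodeid2con() replaces the grow-on-demand array with an idr: look up first, and only when the caller passed an allocation mask create, insert via idr_pre_get()/idr_get_new_above(), and back out if the idr hands back a different id. A userspace stand-in using a flat array as the map (an assumed simplification of the idr calls):

    #include <stdlib.h>
    #include <pthread.h>

    #define MAX_NODES 128

    struct connection { int nodeid; };

    static struct connection *map[MAX_NODES];
    static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;

    static struct connection *nodeid2con(int nodeid, int alloc)
    {
            struct connection *con;

            pthread_mutex_lock(&map_lock);
            con = map[nodeid];
            if (con || !alloc)
                    goto out;                 /* found, or lookup-only call */

            con = calloc(1, sizeof(*con));    /* kmem_cache_zalloc() */
            if (con) {
                    con->nodeid = nodeid;
                    map[nodeid] = con;        /* idr_get_new_above() */
            }
    out:
            pthread_mutex_unlock(&map_lock);
            return con;
    }

    int main(void)
    {
            return nodeid2con(3, 1) != nodeid2con(3, 0);
    }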
186 214
215/* This is a bit drastic, but only called when things go wrong */
216static struct connection *assoc2con(int assoc_id)
217{
218 int i;
219 struct connection *con;
220
221 down(&connections_lock);
222 for (i=0; i<=max_nodeid; i++) {
223 con = __nodeid2con(i, 0);
224 if (con && con->sctp_assoc == assoc_id) {
225 up(&connections_lock);
226 return con;
227 }
228 }
229 up(&connections_lock);
230 return NULL;
231}
232
233static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
234{
235 struct sockaddr_storage addr;
236 int error;
237
238 if (!dlm_local_count)
239 return -1;
240
241 error = dlm_nodeid_to_addr(nodeid, &addr);
242 if (error)
243 return error;
244
245 if (dlm_local_addr[0]->ss_family == AF_INET) {
246 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
247 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
248 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
249 } else {
250 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
251 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
252 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
253 sizeof(in6->sin6_addr));
254 }
255
256 return 0;
257}
258
187/* Data available on socket or listen socket received a connect */ 259/* Data available on socket or listen socket received a connect */
188static void lowcomms_data_ready(struct sock *sk, int count_unused) 260static void lowcomms_data_ready(struct sock *sk, int count_unused)
189{ 261{
190 struct connection *con = sock2con(sk); 262 struct connection *con = sock2con(sk);
191
192 if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) 263 if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
193 queue_work(recv_workqueue, &con->rwork); 264 queue_work(recv_workqueue, &con->rwork);
194} 265}
@@ -222,20 +293,21 @@ static int add_sock(struct socket *sock, struct connection *con)
222 con->sock->sk->sk_data_ready = lowcomms_data_ready; 293 con->sock->sk->sk_data_ready = lowcomms_data_ready;
223 con->sock->sk->sk_write_space = lowcomms_write_space; 294 con->sock->sk->sk_write_space = lowcomms_write_space;
224 con->sock->sk->sk_state_change = lowcomms_state_change; 295 con->sock->sk->sk_state_change = lowcomms_state_change;
225 296 con->sock->sk->sk_user_data = con;
226 return 0; 297 return 0;
227} 298}
228 299
 229/* Add the port number to an IP6 or 4 sockaddr and return the address 300/* Add the port number to an IPv4 or IPv6 sockaddr and return the address
230 length */ 301 length */
231static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, 302static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
232 int *addr_len) 303 int *addr_len)
233{ 304{
234 saddr->ss_family = dlm_local_addr.ss_family; 305 saddr->ss_family = dlm_local_addr[0]->ss_family;
235 if (saddr->ss_family == AF_INET) { 306 if (saddr->ss_family == AF_INET) {
236 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; 307 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
237 in4_addr->sin_port = cpu_to_be16(port); 308 in4_addr->sin_port = cpu_to_be16(port);
238 *addr_len = sizeof(struct sockaddr_in); 309 *addr_len = sizeof(struct sockaddr_in);
310 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
239 } else { 311 } else {
240 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; 312 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
241 in6_addr->sin6_port = cpu_to_be16(port); 313 in6_addr->sin6_port = cpu_to_be16(port);
@@ -264,6 +336,193 @@ static void close_connection(struct connection *con, bool and_other)
264 mutex_unlock(&con->sock_mutex); 336 mutex_unlock(&con->sock_mutex);
265} 337}
266 338
339/* We only send shutdown messages to nodes that are not part of the cluster */
340static void sctp_send_shutdown(sctp_assoc_t associd)
341{
342 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
343 struct msghdr outmessage;
344 struct cmsghdr *cmsg;
345 struct sctp_sndrcvinfo *sinfo;
346 int ret;
347 struct connection *con;
348
 349 con = nodeid2con(0, 0);
350 BUG_ON(con == NULL);
351
352 outmessage.msg_name = NULL;
353 outmessage.msg_namelen = 0;
354 outmessage.msg_control = outcmsg;
355 outmessage.msg_controllen = sizeof(outcmsg);
356 outmessage.msg_flags = MSG_EOR;
357
358 cmsg = CMSG_FIRSTHDR(&outmessage);
359 cmsg->cmsg_level = IPPROTO_SCTP;
360 cmsg->cmsg_type = SCTP_SNDRCV;
361 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
362 outmessage.msg_controllen = cmsg->cmsg_len;
363 sinfo = CMSG_DATA(cmsg);
364 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
365
366 sinfo->sinfo_flags |= MSG_EOF;
367 sinfo->sinfo_assoc_id = associd;
368
369 ret = kernel_sendmsg(con->sock, &outmessage, NULL, 0, 0);
370
371 if (ret != 0)
372 log_print("send EOF to node failed: %d", ret);
373}
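
For comparison, and as an assumption about the userspace analogue rather
than part of the patch: the same zero-byte shutdown message can be built
against a one-to-many SCTP socket with the lksctp headers.

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

/* Ask the stack to SHUTDOWN (not ABORT) one association by sending a
 * zero-byte message whose sinfo_flags carry SCTP_EOF; the kernel-side
 * header used by the patch spells the same flag MSG_EOF. */
static int assoc_shutdown(int sd, sctp_assoc_t assoc_id)
{
	char cbuf[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
	struct msghdr msg;
	struct cmsghdr *cmsg;
	struct sctp_sndrcvinfo *sinfo;

	memset(&msg, 0, sizeof(msg));
	memset(cbuf, 0, sizeof(cbuf));
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = IPPROTO_SCTP;
	cmsg->cmsg_type = SCTP_SNDRCV;
	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
	msg.msg_controllen = cmsg->cmsg_len;

	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
	sinfo->sinfo_flags = SCTP_EOF;
	sinfo->sinfo_assoc_id = assoc_id;

	return (int)sendmsg(sd, &msg, 0);
}
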
374
375/* INIT failed but we don't know which node...
376 restart INIT on all pending nodes */
377static void sctp_init_failed(void)
378{
379 int i;
380 struct connection *con;
381
382 down(&connections_lock);
 383 for (i = 1; i <= max_nodeid; i++) {
384 con = __nodeid2con(i, 0);
385 if (!con)
386 continue;
387 con->sctp_assoc = 0;
388 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
389 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
390 queue_work(send_workqueue, &con->swork);
391 }
392 }
393 }
394 up(&connections_lock);
395}
396
397/* Something happened to an association */
398static void process_sctp_notification(struct connection *con,
399 struct msghdr *msg, char *buf)
400{
401 union sctp_notification *sn = (union sctp_notification *)buf;
402
403 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
404 switch (sn->sn_assoc_change.sac_state) {
405
406 case SCTP_COMM_UP:
407 case SCTP_RESTART:
408 {
409 /* Check that the new node is in the lockspace */
410 struct sctp_prim prim;
411 int nodeid;
412 int prim_len, ret;
413 int addr_len;
414 struct connection *new_con;
415 struct file *file;
416 sctp_peeloff_arg_t parg;
417 int parglen = sizeof(parg);
418
419 /*
420 * We get this before any data for an association.
421 * We verify that the node is in the cluster and
422 * then peel off a socket for it.
423 */
424 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
425 log_print("COMM_UP for invalid assoc ID %d",
426 (int)sn->sn_assoc_change.sac_assoc_id);
427 sctp_init_failed();
428 return;
429 }
430 memset(&prim, 0, sizeof(struct sctp_prim));
431 prim_len = sizeof(struct sctp_prim);
432 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
433
434 ret = kernel_getsockopt(con->sock,
435 IPPROTO_SCTP,
436 SCTP_PRIMARY_ADDR,
437 (char*)&prim,
438 &prim_len);
439 if (ret < 0) {
440 log_print("getsockopt/sctp_primary_addr on "
441 "new assoc %d failed : %d",
442 (int)sn->sn_assoc_change.sac_assoc_id,
443 ret);
444
445 /* Retry INIT later */
446 new_con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
447 if (new_con)
448 clear_bit(CF_CONNECT_PENDING, &con->flags);
449 return;
450 }
451 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
452 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
453 int i;
 454 unsigned char *b = (unsigned char *)&prim.ssp_addr;
 455 log_print("reject connect from unknown addr");
 456 for (i = 0; i < sizeof(struct sockaddr_storage); i++)
457 printk("%02x ", b[i]);
458 printk("\n");
459 sctp_send_shutdown(prim.ssp_assoc_id);
460 return;
461 }
462
463 new_con = nodeid2con(nodeid, GFP_KERNEL);
464 if (!new_con)
465 return;
466
467 /* Peel off a new sock */
468 parg.associd = sn->sn_assoc_change.sac_assoc_id;
469 ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
470 SCTP_SOCKOPT_PEELOFF,
471 (void *)&parg, &parglen);
472 if (ret) {
473 log_print("Can't peel off a socket for "
474 "connection %d to node %d: err=%d\n",
475 parg.associd, nodeid, ret);
476 }
477 file = fget(parg.sd);
478 new_con->sock = SOCKET_I(file->f_dentry->d_inode);
479 add_sock(new_con->sock, new_con);
480 fput(file);
481 put_unused_fd(parg.sd);
482
483 log_print("got new/restarted association %d nodeid %d",
484 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
485
486 /* Send any pending writes */
487 clear_bit(CF_CONNECT_PENDING, &new_con->flags);
488 clear_bit(CF_INIT_PENDING, &con->flags);
489 if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
490 queue_work(send_workqueue, &new_con->swork);
491 }
492 if (!test_and_set_bit(CF_READ_PENDING, &new_con->flags))
493 queue_work(recv_workqueue, &new_con->rwork);
494 }
495 break;
496
497 case SCTP_COMM_LOST:
498 case SCTP_SHUTDOWN_COMP:
499 {
500 con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
501 if (con) {
502 con->sctp_assoc = 0;
503 }
504 }
505 break;
506
 507 /* We don't know which INIT failed, so clear the PENDING flags
 508 * on them all. If the assoc_id is zero it will then try
 509 * again */
510
511 case SCTP_CANT_STR_ASSOC:
512 {
513 log_print("Can't start SCTP association - retrying");
514 sctp_init_failed();
515 }
516 break;
517
518 default:
519 log_print("unexpected SCTP assoc change id=%d state=%d",
520 (int)sn->sn_assoc_change.sac_assoc_id,
521 sn->sn_assoc_change.sac_state);
522 }
523 }
524}
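
The SCTP_SOCKOPT_PEELOFF getsockopt used above is what lksctp-tools wraps
as sctp_peeloff(); a one-call userspace sketch (illustrative, link with
-lsctp):

#include <netinet/sctp.h>

/* Branch one established association off a one-to-many socket into
 * its own fd, which then behaves like a connected TCP-style socket. */
static int peel_one(int one_to_many_sd, sctp_assoc_t assoc_id)
{
	return sctp_peeloff(one_to_many_sd, assoc_id);
}
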
525
267/* Data received from remote end */ 526/* Data received from remote end */
268static int receive_from_sock(struct connection *con) 527static int receive_from_sock(struct connection *con)
269{ 528{
@@ -274,6 +533,7 @@ static int receive_from_sock(struct connection *con)
274 int r; 533 int r;
275 int call_again_soon = 0; 534 int call_again_soon = 0;
276 int nvec; 535 int nvec;
536 char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
277 537
278 mutex_lock(&con->sock_mutex); 538 mutex_lock(&con->sock_mutex);
279 539
@@ -293,12 +553,18 @@ static int receive_from_sock(struct connection *con)
293 cbuf_init(&con->cb, PAGE_CACHE_SIZE); 553 cbuf_init(&con->cb, PAGE_CACHE_SIZE);
294 } 554 }
295 555
556 /* Only SCTP needs these really */
557 memset(&incmsg, 0, sizeof(incmsg));
558 msg.msg_control = incmsg;
559 msg.msg_controllen = sizeof(incmsg);
560
296 /* 561 /*
297 * iov[0] is the bit of the circular buffer between the current end 562 * iov[0] is the bit of the circular buffer between the current end
298 * point (cb.base + cb.len) and the end of the buffer. 563 * point (cb.base + cb.len) and the end of the buffer.
299 */ 564 */
300 iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); 565 iov[0].iov_len = con->cb.base - cbuf_data(&con->cb);
301 iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); 566 iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb);
567 iov[1].iov_len = 0;
302 nvec = 1; 568 nvec = 1;
303 569
304 /* 570 /*
@@ -315,11 +581,20 @@ static int receive_from_sock(struct connection *con)
315 581
316 r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len, 582 r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len,
317 MSG_DONTWAIT | MSG_NOSIGNAL); 583 MSG_DONTWAIT | MSG_NOSIGNAL);
318
319 if (ret <= 0) 584 if (ret <= 0)
320 goto out_close; 585 goto out_close;
321 if (ret == -EAGAIN) 586
322 goto out_resched; 587 /* Process SCTP notifications */
588 if (msg.msg_flags & MSG_NOTIFICATION) {
589 msg.msg_control = incmsg;
590 msg.msg_controllen = sizeof(incmsg);
591
592 process_sctp_notification(con, &msg,
593 page_address(con->rx_page) + con->cb.base);
594 mutex_unlock(&con->sock_mutex);
595 return 0;
596 }
597 BUG_ON(con->nodeid == 0);
323 598
324 if (ret == len) 599 if (ret == len)
325 call_again_soon = 1; 600 call_again_soon = 1;
@@ -329,10 +604,10 @@ static int receive_from_sock(struct connection *con)
329 con->cb.base, con->cb.len, 604 con->cb.base, con->cb.len,
330 PAGE_CACHE_SIZE); 605 PAGE_CACHE_SIZE);
331 if (ret == -EBADMSG) { 606 if (ret == -EBADMSG) {
332 printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, " 607 log_print("lowcomms: addr=%p, base=%u, len=%u, "
333 "iov_len=%u, iov_base[0]=%p, read=%d\n", 608 "iov_len=%u, iov_base[0]=%p, read=%d",
334 page_address(con->rx_page), con->cb.base, con->cb.len, 609 page_address(con->rx_page), con->cb.base, con->cb.len,
335 len, iov[0].iov_base, r); 610 len, iov[0].iov_base, r);
336 } 611 }
337 if (ret < 0) 612 if (ret < 0)
338 goto out_close; 613 goto out_close;
@@ -368,7 +643,7 @@ out_close:
368} 643}
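
A userspace-flavoured sketch of the receive discipline above (illustrative,
not from the patch): SCTP delivers association events in-band, so every
read must test MSG_NOTIFICATION before treating the bytes as payload.

#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/sctp.h>

static void drain(int sd)
{
	char buf[8192];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	ssize_t n;

	while ((n = recvmsg(sd, &msg, MSG_DONTWAIT)) > 0) {
		if (msg.msg_flags & MSG_NOTIFICATION) {
			const union sctp_notification *sn =
				(const union sctp_notification *)buf;
			/* sn->sn_header.sn_type identifies the event,
			 * e.g. SCTP_ASSOC_CHANGE as in the switch above */
			(void)sn;
			continue;
		}
		/* otherwise buf holds n bytes of ordinary payload */
	}
}
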
369 644
370/* Listening socket is busy, accept a connection */ 645/* Listening socket is busy, accept a connection */
371static int accept_from_sock(struct connection *con) 646static int tcp_accept_from_sock(struct connection *con)
372{ 647{
373 int result; 648 int result;
374 struct sockaddr_storage peeraddr; 649 struct sockaddr_storage peeraddr;
@@ -379,7 +654,7 @@ static int accept_from_sock(struct connection *con)
379 struct connection *addcon; 654 struct connection *addcon;
380 655
381 memset(&peeraddr, 0, sizeof(peeraddr)); 656 memset(&peeraddr, 0, sizeof(peeraddr));
382 result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, 657 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
383 IPPROTO_TCP, &newsock); 658 IPPROTO_TCP, &newsock);
384 if (result < 0) 659 if (result < 0)
385 return -ENOMEM; 660 return -ENOMEM;
@@ -408,7 +683,7 @@ static int accept_from_sock(struct connection *con)
408 /* Get the new node's NODEID */ 683 /* Get the new node's NODEID */
409 make_sockaddr(&peeraddr, 0, &len); 684 make_sockaddr(&peeraddr, 0, &len);
410 if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) { 685 if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) {
411 printk("dlm: connect from non cluster node\n"); 686 log_print("connect from non cluster node");
412 sock_release(newsock); 687 sock_release(newsock);
413 mutex_unlock(&con->sock_mutex); 688 mutex_unlock(&con->sock_mutex);
414 return -1; 689 return -1;
@@ -419,7 +694,6 @@ static int accept_from_sock(struct connection *con)
419 /* Check to see if we already have a connection to this node. This 694 /* Check to see if we already have a connection to this node. This
420 * could happen if the two nodes initiate a connection at roughly 695 * could happen if the two nodes initiate a connection at roughly
421 * the same time and the connections cross on the wire. 696 * the same time and the connections cross on the wire.
422 * TEMPORARY FIX:
423 * In this case we store the incoming one in "othercon" 697 * In this case we store the incoming one in "othercon"
424 */ 698 */
425 newcon = nodeid2con(nodeid, GFP_KERNEL); 699 newcon = nodeid2con(nodeid, GFP_KERNEL);
@@ -434,7 +708,7 @@ static int accept_from_sock(struct connection *con)
434 if (!othercon) { 708 if (!othercon) {
435 othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL); 709 othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL);
436 if (!othercon) { 710 if (!othercon) {
437 printk("dlm: failed to allocate incoming socket\n"); 711 log_print("failed to allocate incoming socket");
438 mutex_unlock(&newcon->sock_mutex); 712 mutex_unlock(&newcon->sock_mutex);
439 result = -ENOMEM; 713 result = -ENOMEM;
440 goto accept_err; 714 goto accept_err;
@@ -477,12 +751,107 @@ accept_err:
477 sock_release(newsock); 751 sock_release(newsock);
478 752
479 if (result != -EAGAIN) 753 if (result != -EAGAIN)
480 printk("dlm: error accepting connection from node: %d\n", result); 754 log_print("error accepting connection from node: %d", result);
481 return result; 755 return result;
482} 756}
483 757
758static void free_entry(struct writequeue_entry *e)
759{
760 __free_page(e->page);
761 kfree(e);
762}
763
764/* Initiate an SCTP association.
765 This is a special case of send_to_sock() in that we don't yet have a
766 peeled-off socket for this association, so we use the listening socket
767 and add the primary IP address of the remote node.
768 */
769static void sctp_init_assoc(struct connection *con)
770{
771 struct sockaddr_storage rem_addr;
772 char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
773 struct msghdr outmessage;
774 struct cmsghdr *cmsg;
775 struct sctp_sndrcvinfo *sinfo;
776 struct connection *base_con;
777 struct writequeue_entry *e;
778 int len, offset;
779 int ret;
780 int addrlen;
781 struct kvec iov[1];
782
783 if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
784 return;
785
786 if (con->retries++ > MAX_CONNECT_RETRIES)
787 return;
788
789 log_print("Initiating association with node %d", con->nodeid);
790
791 if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) {
792 log_print("no address for nodeid %d", con->nodeid);
793 return;
794 }
795 base_con = nodeid2con(0, 0);
796 BUG_ON(base_con == NULL);
797
798 make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
799
800 outmessage.msg_name = &rem_addr;
801 outmessage.msg_namelen = addrlen;
802 outmessage.msg_control = outcmsg;
803 outmessage.msg_controllen = sizeof(outcmsg);
804 outmessage.msg_flags = MSG_EOR;
805
806 spin_lock(&con->writequeue_lock);
807 e = list_entry(con->writequeue.next, struct writequeue_entry,
808 list);
809
810 BUG_ON((struct list_head *) e == &con->writequeue);
811
812 len = e->len;
813 offset = e->offset;
814 spin_unlock(&con->writequeue_lock);
815 kmap(e->page);
816
817 /* Send the first block off the write queue */
818 iov[0].iov_base = page_address(e->page)+offset;
819 iov[0].iov_len = len;
820
821 cmsg = CMSG_FIRSTHDR(&outmessage);
822 cmsg->cmsg_level = IPPROTO_SCTP;
823 cmsg->cmsg_type = SCTP_SNDRCV;
824 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
825 sinfo = CMSG_DATA(cmsg);
826 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
827 sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid());
828 outmessage.msg_controllen = cmsg->cmsg_len;
829
830 ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
831 if (ret < 0) {
832 log_print("Send first packet to node %d failed: %d",
833 con->nodeid, ret);
834
835 /* Try again later */
836 clear_bit(CF_CONNECT_PENDING, &con->flags);
837 clear_bit(CF_INIT_PENDING, &con->flags);
838 }
839 else {
840 spin_lock(&con->writequeue_lock);
841 e->offset += ret;
842 e->len -= ret;
843
844 if (e->len == 0 && e->users == 0) {
845 list_del(&e->list);
846 kunmap(e->page);
847 free_entry(e);
848 }
849 spin_unlock(&con->writequeue_lock);
850 }
851}
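
The point of sctp_init_assoc() is that on a one-to-many SCTP socket,
sending to a new peer address implicitly performs the INIT handshake; no
connect() is needed. A hedged userspace sketch of the same move through
lksctp's sctp_sendmsg() (names illustrative, link with -lsctp):

#include <stdint.h>
#include <sys/socket.h>
#include <netinet/sctp.h>

static int first_contact(int sd, struct sockaddr *peer, socklen_t peerlen,
			 const void *data, size_t len, uint32_t ppid)
{
	/* The first send both INITs the association and queues data;
	 * the patch similarly stamps ppid with the local nodeid. */
	return sctp_sendmsg(sd, data, len, peer, peerlen, ppid,
			    0 /* flags */, 0 /* stream */,
			    0 /* ttl */, 0 /* context */);
}
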
852
484/* Connect a new socket to its peer */ 853/* Connect a new socket to its peer */
485static void connect_to_sock(struct connection *con) 854static void tcp_connect_to_sock(struct connection *con)
486{ 855{
487 int result = -EHOSTUNREACH; 856 int result = -EHOSTUNREACH;
488 struct sockaddr_storage saddr; 857 struct sockaddr_storage saddr;
@@ -505,7 +874,7 @@ static void connect_to_sock(struct connection *con)
505 } 874 }
506 875
507 /* Create a socket to communicate with */ 876 /* Create a socket to communicate with */
508 result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, 877 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
509 IPPROTO_TCP, &sock); 878 IPPROTO_TCP, &sock);
510 if (result < 0) 879 if (result < 0)
511 goto out_err; 880 goto out_err;
@@ -516,11 +885,11 @@ static void connect_to_sock(struct connection *con)
516 885
517 sock->sk->sk_user_data = con; 886 sock->sk->sk_user_data = con;
518 con->rx_action = receive_from_sock; 887 con->rx_action = receive_from_sock;
888 con->connect_action = tcp_connect_to_sock;
889 add_sock(sock, con);
519 890
520 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); 891 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
521 892
522 add_sock(sock, con);
523
524 log_print("connecting to %d", con->nodeid); 893 log_print("connecting to %d", con->nodeid);
525 result = 894 result =
526 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, 895 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
@@ -550,64 +919,57 @@ out:
550 return; 919 return;
551} 920}
552 921
553static struct socket *create_listen_sock(struct connection *con, 922static struct socket *tcp_create_listen_sock(struct connection *con,
554 struct sockaddr_storage *saddr) 923 struct sockaddr_storage *saddr)
555{ 924{
556 struct socket *sock = NULL; 925 struct socket *sock = NULL;
557 mm_segment_t fs;
558 int result = 0; 926 int result = 0;
559 int one = 1; 927 int one = 1;
560 int addr_len; 928 int addr_len;
561 929
562 if (dlm_local_addr.ss_family == AF_INET) 930 if (dlm_local_addr[0]->ss_family == AF_INET)
563 addr_len = sizeof(struct sockaddr_in); 931 addr_len = sizeof(struct sockaddr_in);
564 else 932 else
565 addr_len = sizeof(struct sockaddr_in6); 933 addr_len = sizeof(struct sockaddr_in6);
566 934
567 /* Create a socket to communicate with */ 935 /* Create a socket to communicate with */
568 result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); 936 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
937 IPPROTO_TCP, &sock);
569 if (result < 0) { 938 if (result < 0) {
570 printk("dlm: Can't create listening comms socket\n"); 939 log_print("Can't create listening comms socket");
571 goto create_out; 940 goto create_out;
572 } 941 }
573 942
574 fs = get_fs(); 943 result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
575 set_fs(get_ds()); 944 (char *)&one, sizeof(one));
576 result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, 945
577 (char *)&one, sizeof(one));
578 set_fs(fs);
579 if (result < 0) { 946 if (result < 0) {
580 printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n", 947 log_print("Failed to set SO_REUSEADDR on socket: %d", result);
581 result);
582 } 948 }
583 sock->sk->sk_user_data = con; 949 sock->sk->sk_user_data = con;
584 con->rx_action = accept_from_sock; 950 con->rx_action = tcp_accept_from_sock;
951 con->connect_action = tcp_connect_to_sock;
585 con->sock = sock; 952 con->sock = sock;
586 953
587 /* Bind to our port */ 954 /* Bind to our port */
588 make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); 955 make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
589 result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); 956 result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
590 if (result < 0) { 957 if (result < 0) {
591 printk("dlm: Can't bind to port %d\n", dlm_config.ci_tcp_port); 958 log_print("Can't bind to port %d", dlm_config.ci_tcp_port);
592 sock_release(sock); 959 sock_release(sock);
593 sock = NULL; 960 sock = NULL;
594 con->sock = NULL; 961 con->sock = NULL;
595 goto create_out; 962 goto create_out;
596 } 963 }
597 964 result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
598 fs = get_fs();
599 set_fs(get_ds());
600
601 result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
602 (char *)&one, sizeof(one)); 965 (char *)&one, sizeof(one));
603 set_fs(fs);
604 if (result < 0) { 966 if (result < 0) {
605 printk("dlm: Set keepalive failed: %d\n", result); 967 log_print("Set keepalive failed: %d", result);
606 } 968 }
607 969
608 result = sock->ops->listen(sock, 5); 970 result = sock->ops->listen(sock, 5);
609 if (result < 0) { 971 if (result < 0) {
610 printk("dlm: Can't listen on port %d\n", dlm_config.ci_tcp_port); 972 log_print("Can't listen on port %d", dlm_config.ci_tcp_port);
611 sock_release(sock); 973 sock_release(sock);
612 sock = NULL; 974 sock = NULL;
613 goto create_out; 975 goto create_out;
@@ -617,18 +979,146 @@ create_out:
617 return sock; 979 return sock;
618} 980}
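
The deleted set_fs()/sock_setsockopt() pairs and their kernel_setsockopt()
replacement, reduced to a side-by-side sketch (illustrative, using the
era's in-kernel APIs):

#include <linux/net.h>
#include <net/sock.h>
#include <asm/uaccess.h>

/* Old way: widen the address-limit check so a kernel pointer survives
 * the user-copy inside sock_setsockopt(). */
static int keepalive_old(struct socket *sock)
{
	int one = 1;
	int ret;
	mm_segment_t fs = get_fs();

	set_fs(get_ds());
	ret = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
			      (char *)&one, sizeof(one));
	set_fs(fs);
	return ret;
}

/* New way: kernel_setsockopt() hides the set_fs() dance. */
static int keepalive_new(struct socket *sock)
{
	int one = 1;

	return kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
				 (char *)&one, sizeof(one));
}
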
619 981
982/* Get local addresses */
983static void init_local(void)
984{
985 struct sockaddr_storage sas, *addr;
986 int i;
987
988 dlm_local_count = 0;
989 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
990 if (dlm_our_addr(&sas, i))
991 break;
992
993 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
994 if (!addr)
995 break;
996 memcpy(addr, &sas, sizeof(*addr));
997 dlm_local_addr[dlm_local_count++] = addr;
998 }
999}
1000
 1001/* Bind to an IP address. SCTP allows multiple addresses so it can do
1002 multi-homing */
1003static int add_sctp_bind_addr(struct connection *sctp_con,
1004 struct sockaddr_storage *addr,
1005 int addr_len, int num)
1006{
1007 int result = 0;
1008
1009 if (num == 1)
1010 result = kernel_bind(sctp_con->sock,
1011 (struct sockaddr *) addr,
1012 addr_len);
1013 else
1014 result = kernel_setsockopt(sctp_con->sock, SOL_SCTP,
1015 SCTP_SOCKOPT_BINDX_ADD,
1016 (char *)addr, addr_len);
1017
1018 if (result < 0)
1019 log_print("Can't bind to port %d addr number %d",
1020 dlm_config.ci_tcp_port, num);
1021
1022 return result;
1023}
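
Userspace sketch of the same bind-then-extend pattern via lksctp's
sctp_bindx() (illustrative, link with -lsctp):

#include <sys/socket.h>
#include <netinet/sctp.h>

/* Bind the first local address normally, then attach the rest so the
 * association can fail over between interfaces. */
static int bind_all(int sd, struct sockaddr *addrs[], socklen_t lens[],
		    int count)
{
	int i;

	if (bind(sd, addrs[0], lens[0]) < 0)
		return -1;
	for (i = 1; i < count; i++)
		if (sctp_bindx(sd, addrs[i], 1, SCTP_BINDX_ADD_ADDR) < 0)
			return -1;
	return 0;
}
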
620 1024
621/* Listen on all interfaces */ 1025/* Initialise SCTP socket and bind to all interfaces */
622static int listen_for_all(void) 1026static int sctp_listen_for_all(void)
1027{
1028 struct socket *sock = NULL;
1029 struct sockaddr_storage localaddr;
1030 struct sctp_event_subscribe subscribe;
1031 int result = -EINVAL, num = 1, i, addr_len;
1032 struct connection *con = nodeid2con(0, GFP_KERNEL);
1033 int bufsize = NEEDED_RMEM;
1034
1035 if (!con)
1036 return -ENOMEM;
1037
1038 log_print("Using SCTP for communications");
1039
1040 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
1041 IPPROTO_SCTP, &sock);
1042 if (result < 0) {
1043 log_print("Can't create comms socket, check SCTP is loaded");
1044 goto out;
1045 }
1046
1047 /* Listen for events */
1048 memset(&subscribe, 0, sizeof(subscribe));
1049 subscribe.sctp_data_io_event = 1;
1050 subscribe.sctp_association_event = 1;
1051 subscribe.sctp_send_failure_event = 1;
1052 subscribe.sctp_shutdown_event = 1;
1053 subscribe.sctp_partial_delivery_event = 1;
1054
1055 result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
1056 (char *)&bufsize, sizeof(bufsize));
1057 if (result)
1058 log_print("Error increasing buffer space on socket %d", result);
1059
1060 result = kernel_setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
1061 (char *)&subscribe, sizeof(subscribe));
1062 if (result < 0) {
1063 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
1064 result);
1065 goto create_delsock;
1066 }
1067
1068 /* Init con struct */
1069 sock->sk->sk_user_data = con;
1070 con->sock = sock;
1071 con->sock->sk->sk_data_ready = lowcomms_data_ready;
1072 con->rx_action = receive_from_sock;
1073 con->connect_action = sctp_init_assoc;
1074
1075 /* Bind to all interfaces. */
1076 for (i = 0; i < dlm_local_count; i++) {
1077 memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
1078 make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
1079
1080 result = add_sctp_bind_addr(con, &localaddr, addr_len, num);
1081 if (result)
1082 goto create_delsock;
1083 ++num;
1084 }
1085
1086 result = sock->ops->listen(sock, 5);
1087 if (result < 0) {
1088 log_print("Can't set socket listening");
1089 goto create_delsock;
1090 }
1091
1092 return 0;
1093
1094create_delsock:
1095 sock_release(sock);
1096 con->sock = NULL;
1097out:
1098 return result;
1099}
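
The subscription block above uses the same SCTP_EVENTS socket option that
userspace does; a minimal sketch (illustrative):

#include <string.h>
#include <sys/socket.h>
#include <netinet/sctp.h>

static int subscribe_events(int sd)
{
	struct sctp_event_subscribe ev;

	memset(&ev, 0, sizeof(ev));
	ev.sctp_data_io_event = 1;	/* sndrcvinfo cmsg with each read */
	ev.sctp_association_event = 1;	/* COMM_UP / COMM_LOST / RESTART */
	ev.sctp_send_failure_event = 1;
	ev.sctp_shutdown_event = 1;
	ev.sctp_partial_delivery_event = 1;

	return setsockopt(sd, IPPROTO_SCTP, SCTP_EVENTS, &ev, sizeof(ev));
}
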
1100
1101static int tcp_listen_for_all(void)
623{ 1102{
624 struct socket *sock = NULL; 1103 struct socket *sock = NULL;
625 struct connection *con = nodeid2con(0, GFP_KERNEL); 1104 struct connection *con = nodeid2con(0, GFP_KERNEL);
626 int result = -EINVAL; 1105 int result = -EINVAL;
627 1106
1107 if (!con)
1108 return -ENOMEM;
1109
628 /* We don't support multi-homed hosts */ 1110 /* We don't support multi-homed hosts */
1111 if (dlm_local_addr[1] != NULL) {
1112 log_print("TCP protocol can't handle multi-homed hosts, "
1113 "try SCTP");
1114 return -EINVAL;
1115 }
1116
1117 log_print("Using TCP for communications");
1118
629 set_bit(CF_IS_OTHERCON, &con->flags); 1119 set_bit(CF_IS_OTHERCON, &con->flags);
630 1120
631 sock = create_listen_sock(con, &dlm_local_addr); 1121 sock = tcp_create_listen_sock(con, dlm_local_addr[0]);
632 if (sock) { 1122 if (sock) {
633 add_sock(sock, con); 1123 add_sock(sock, con);
634 result = 0; 1124 result = 0;
@@ -666,8 +1156,7 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con,
666 return entry; 1156 return entry;
667} 1157}
668 1158
669void *dlm_lowcomms_get_buffer(int nodeid, int len, 1159void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
670 gfp_t allocation, char **ppc)
671{ 1160{
672 struct connection *con; 1161 struct connection *con;
673 struct writequeue_entry *e; 1162 struct writequeue_entry *e;
@@ -735,12 +1224,6 @@ out:
735 return; 1224 return;
736} 1225}
737 1226
738static void free_entry(struct writequeue_entry *e)
739{
740 __free_page(e->page);
741 kfree(e);
742}
743
744/* Send a message */ 1227/* Send a message */
745static void send_to_sock(struct connection *con) 1228static void send_to_sock(struct connection *con)
746{ 1229{
@@ -777,8 +1260,7 @@ static void send_to_sock(struct connection *con)
777 goto out; 1260 goto out;
778 if (ret <= 0) 1261 if (ret <= 0)
779 goto send_error; 1262 goto send_error;
780 } 1263 } else {
781 else {
782 /* Don't starve people filling buffers */ 1264 /* Don't starve people filling buffers */
783 cond_resched(); 1265 cond_resched();
784 } 1266 }
@@ -807,7 +1289,8 @@ send_error:
807 1289
808out_connect: 1290out_connect:
809 mutex_unlock(&con->sock_mutex); 1291 mutex_unlock(&con->sock_mutex);
810 connect_to_sock(con); 1292 if (!test_bit(CF_INIT_PENDING, &con->flags))
1293 lowcomms_connect_sock(con);
811 return; 1294 return;
812} 1295}
813 1296
@@ -832,9 +1315,6 @@ int dlm_lowcomms_close(int nodeid)
832{ 1315{
833 struct connection *con; 1316 struct connection *con;
834 1317
835 if (!connections)
836 goto out;
837
838 log_print("closing connection to node %d", nodeid); 1318 log_print("closing connection to node %d", nodeid);
839 con = nodeid2con(nodeid, 0); 1319 con = nodeid2con(nodeid, 0);
840 if (con) { 1320 if (con) {
@@ -842,12 +1322,9 @@ int dlm_lowcomms_close(int nodeid)
842 close_connection(con, true); 1322 close_connection(con, true);
843 } 1323 }
844 return 0; 1324 return 0;
845
846out:
847 return -1;
848} 1325}
849 1326
850/* Look for activity on active sockets */ 1327/* Receive workqueue function */
851static void process_recv_sockets(struct work_struct *work) 1328static void process_recv_sockets(struct work_struct *work)
852{ 1329{
853 struct connection *con = container_of(work, struct connection, rwork); 1330 struct connection *con = container_of(work, struct connection, rwork);
@@ -859,15 +1336,14 @@ static void process_recv_sockets(struct work_struct *work)
859 } while (!err); 1336 } while (!err);
860} 1337}
861 1338
862 1339/* Send workqueue function */
863static void process_send_sockets(struct work_struct *work) 1340static void process_send_sockets(struct work_struct *work)
864{ 1341{
865 struct connection *con = container_of(work, struct connection, swork); 1342 struct connection *con = container_of(work, struct connection, swork);
866 1343
867 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { 1344 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
868 connect_to_sock(con); 1345 con->connect_action(con);
869 } 1346 }
870
871 clear_bit(CF_WRITE_PENDING, &con->flags); 1347 clear_bit(CF_WRITE_PENDING, &con->flags);
872 send_to_sock(con); 1348 send_to_sock(con);
873} 1349}
@@ -878,8 +1354,8 @@ static void clean_writequeues(void)
878{ 1354{
879 int nodeid; 1355 int nodeid;
880 1356
881 for (nodeid = 1; nodeid < conn_array_size; nodeid++) { 1357 for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
882 struct connection *con = nodeid2con(nodeid, 0); 1358 struct connection *con = __nodeid2con(nodeid, 0);
883 1359
884 if (con) 1360 if (con)
885 clean_one_writequeue(con); 1361 clean_one_writequeue(con);
@@ -916,64 +1392,67 @@ static int work_start(void)
916void dlm_lowcomms_stop(void) 1392void dlm_lowcomms_stop(void)
917{ 1393{
918 int i; 1394 int i;
1395 struct connection *con;
919 1396
920 /* Set all the flags to prevent any 1397 /* Set all the flags to prevent any
921 socket activity. 1398 socket activity.
922 */ 1399 */
923 for (i = 0; i < conn_array_size; i++) { 1400 down(&connections_lock);
924 if (connections[i]) 1401 for (i = 0; i <= max_nodeid; i++) {
925 connections[i]->flags |= 0xFF; 1402 con = __nodeid2con(i, 0);
1403 if (con)
1404 con->flags |= 0xFF;
926 } 1405 }
1406 up(&connections_lock);
927 1407
928 work_stop(); 1408 work_stop();
1409
1410 down(&connections_lock);
929 clean_writequeues(); 1411 clean_writequeues();
930 1412
931 for (i = 0; i < conn_array_size; i++) { 1413 for (i = 0; i <= max_nodeid; i++) {
932 if (connections[i]) { 1414 con = __nodeid2con(i, 0);
933 close_connection(connections[i], true); 1415 if (con) {
934 if (connections[i]->othercon) 1416 close_connection(con, true);
935 kmem_cache_free(con_cache, connections[i]->othercon); 1417 if (con->othercon)
936 kmem_cache_free(con_cache, connections[i]); 1418 kmem_cache_free(con_cache, con->othercon);
1419 kmem_cache_free(con_cache, con);
937 } 1420 }
938 } 1421 }
939 1422 max_nodeid = 0;
940 kfree(connections); 1423 up(&connections_lock);
941 connections = NULL;
942
943 kmem_cache_destroy(con_cache); 1424 kmem_cache_destroy(con_cache);
1425 idr_init(&connections_idr);
944} 1426}
945 1427
946/* This is quite likely to sleep... */
947int dlm_lowcomms_start(void) 1428int dlm_lowcomms_start(void)
948{ 1429{
949 int error = 0; 1430 int error = -EINVAL;
950 1431 struct connection *con;
951 error = -ENOMEM;
952 connections = kzalloc(sizeof(struct connection *) *
953 NODE_INCREMENT, GFP_KERNEL);
954 if (!connections)
955 goto out;
956
957 conn_array_size = NODE_INCREMENT;
958 1432
959 if (dlm_our_addr(&dlm_local_addr, 0)) { 1433 init_local();
1434 if (!dlm_local_count) {
1435 error = -ENOTCONN;
960 log_print("no local IP address has been set"); 1436 log_print("no local IP address has been set");
961 goto fail_free_conn; 1437 goto out;
962 }
963 if (!dlm_our_addr(&dlm_local_addr, 1)) {
964 log_print("This dlm comms module does not support multi-homed clustering");
965 goto fail_free_conn;
966 } 1438 }
967 1439
1440 error = -ENOMEM;
968 con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), 1441 con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
969 __alignof__(struct connection), 0, 1442 __alignof__(struct connection), 0,
970 NULL, NULL); 1443 NULL, NULL);
971 if (!con_cache) 1444 if (!con_cache)
972 goto fail_free_conn; 1445 goto out;
973 1446
1447 /* Set some sysctl minima */
1448 if (sysctl_rmem_max < NEEDED_RMEM)
1449 sysctl_rmem_max = NEEDED_RMEM;
974 1450
975 /* Start listening */ 1451 /* Start listening */
976 error = listen_for_all(); 1452 if (dlm_config.ci_protocol == 0)
1453 error = tcp_listen_for_all();
1454 else
1455 error = sctp_listen_for_all();
977 if (error) 1456 if (error)
978 goto fail_unlisten; 1457 goto fail_unlisten;
979 1458
@@ -984,24 +1463,13 @@ int dlm_lowcomms_start(void)
984 return 0; 1463 return 0;
985 1464
986fail_unlisten: 1465fail_unlisten:
 987 close_connection(connections[0], false); 1466 con = nodeid2con(0, 0);
988 kmem_cache_free(con_cache, connections[0]); 1467 if (con) {
1468 close_connection(con, false);
1469 kmem_cache_free(con_cache, con);
1470 }
989 kmem_cache_destroy(con_cache); 1471 kmem_cache_destroy(con_cache);
990 1472
991fail_free_conn:
992 kfree(connections);
993
994out: 1473out:
995 return error; 1474 return error;
996} 1475}
997
998/*
999 * Overrides for Emacs so that we follow Linus's tabbing style.
1000 * Emacs will notice this stuff at the end of the file and automatically
1001 * adjust the settings for this buffer only. This must remain at the end
1002 * of the file.
1003 * ---------------------------------------------------------------------------
1004 * Local variables:
1005 * c-file-style: "linux"
1006 * End:
1007 */
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 3870150b83a4..b0201ec325a7 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -56,6 +56,7 @@ struct dlm_write_request32 {
56 union { 56 union {
57 struct dlm_lock_params32 lock; 57 struct dlm_lock_params32 lock;
58 struct dlm_lspace_params lspace; 58 struct dlm_lspace_params lspace;
59 struct dlm_purge_params purge;
59 } i; 60 } i;
60}; 61};
61 62
@@ -92,6 +93,9 @@ static void compat_input(struct dlm_write_request *kb,
92 kb->i.lspace.flags = kb32->i.lspace.flags; 93 kb->i.lspace.flags = kb32->i.lspace.flags;
93 kb->i.lspace.minor = kb32->i.lspace.minor; 94 kb->i.lspace.minor = kb32->i.lspace.minor;
94 strcpy(kb->i.lspace.name, kb32->i.lspace.name); 95 strcpy(kb->i.lspace.name, kb32->i.lspace.name);
96 } else if (kb->cmd == DLM_USER_PURGE) {
97 kb->i.purge.nodeid = kb32->i.purge.nodeid;
98 kb->i.purge.pid = kb32->i.purge.pid;
95 } else { 99 } else {
96 kb->i.lock.mode = kb32->i.lock.mode; 100 kb->i.lock.mode = kb32->i.lock.mode;
97 kb->i.lock.namelen = kb32->i.lock.namelen; 101 kb->i.lock.namelen = kb32->i.lock.namelen;
@@ -111,8 +115,6 @@ static void compat_input(struct dlm_write_request *kb,
111static void compat_output(struct dlm_lock_result *res, 115static void compat_output(struct dlm_lock_result *res,
112 struct dlm_lock_result32 *res32) 116 struct dlm_lock_result32 *res32)
113{ 117{
114 res32->length = res->length - (sizeof(struct dlm_lock_result) -
115 sizeof(struct dlm_lock_result32));
116 res32->user_astaddr = (__u32)(long)res->user_astaddr; 118 res32->user_astaddr = (__u32)(long)res->user_astaddr;
117 res32->user_astparam = (__u32)(long)res->user_astparam; 119 res32->user_astparam = (__u32)(long)res->user_astparam;
118 res32->user_lksb = (__u32)(long)res->user_lksb; 120 res32->user_lksb = (__u32)(long)res->user_lksb;
@@ -128,35 +130,30 @@ static void compat_output(struct dlm_lock_result *res,
128} 130}
129#endif 131#endif
130 132
 133/* We could possibly check if the cancel of an orphan has resulted in the lkb
134 being removed and then remove that lkb from the orphans list and free it */
131 135
132void dlm_user_add_ast(struct dlm_lkb *lkb, int type) 136void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
133{ 137{
134 struct dlm_ls *ls; 138 struct dlm_ls *ls;
135 struct dlm_user_args *ua; 139 struct dlm_user_args *ua;
136 struct dlm_user_proc *proc; 140 struct dlm_user_proc *proc;
137 int remove_ownqueue = 0; 141 int eol = 0, ast_type;
138 142
139 /* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each 143 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
140 lkb before dealing with it. We need to check this
141 flag before taking ls_clear_proc_locks mutex because if
142 it's set, dlm_clear_proc_locks() holds the mutex. */
143
144 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
145 /* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
146 return; 144 return;
147 }
148 145
149 ls = lkb->lkb_resource->res_ls; 146 ls = lkb->lkb_resource->res_ls;
150 mutex_lock(&ls->ls_clear_proc_locks); 147 mutex_lock(&ls->ls_clear_proc_locks);
151 148
152 /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast 149 /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
153 can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed 150 can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed
154 lkb->ua so we can't try to use it. */ 151 lkb->ua so we can't try to use it. This second check is necessary
152 for cases where a completion ast is received for an operation that
153 began before clear_proc_locks did its cancel/unlock. */
155 154
156 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) { 155 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
157 /* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
158 goto out; 156 goto out;
159 }
160 157
161 DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb);); 158 DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
162 ua = (struct dlm_user_args *)lkb->lkb_astparam; 159 ua = (struct dlm_user_args *)lkb->lkb_astparam;
@@ -166,28 +163,42 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
166 goto out; 163 goto out;
167 164
168 spin_lock(&proc->asts_spin); 165 spin_lock(&proc->asts_spin);
169 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { 166
167 ast_type = lkb->lkb_ast_type;
168 lkb->lkb_ast_type |= type;
169
170 if (!ast_type) {
170 kref_get(&lkb->lkb_ref); 171 kref_get(&lkb->lkb_ref);
171 list_add_tail(&lkb->lkb_astqueue, &proc->asts); 172 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
172 lkb->lkb_ast_type |= type;
173 wake_up_interruptible(&proc->wait); 173 wake_up_interruptible(&proc->wait);
174 } 174 }
175 175 if (type == AST_COMP && (ast_type & AST_COMP))
176 /* noqueue requests that fail may need to be removed from the 176 log_debug(ls, "ast overlap %x status %x %x",
177 proc's locks list, there should be a better way of detecting 177 lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags);
178 this situation than checking all these things... */ 178
179 179 /* Figure out if this lock is at the end of its life and no longer
180 if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV && 180 available for the application to use. The lkb still exists until
181 ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue)) 181 the final ast is read. A lock becomes EOL in three situations:
182 remove_ownqueue = 1; 182 1. a noqueue request fails with EAGAIN
183 183 2. an unlock completes with EUNLOCK
184 /* unlocks or cancels of waiting requests need to be removed from the 184 3. a cancel of a waiting request completes with ECANCEL
185 proc's unlocking list, again there must be a better way... */ 185 An EOL lock needs to be removed from the process's list of locks.
186 186 And we can't allow any new operation on an EOL lock. This is
187 if (ua->lksb.sb_status == -DLM_EUNLOCK || 187 not related to the lifetime of the lkb struct which is managed
188 entirely by refcount. */
189
190 if (type == AST_COMP &&
191 lkb->lkb_grmode == DLM_LOCK_IV &&
192 ua->lksb.sb_status == -EAGAIN)
193 eol = 1;
194 else if (ua->lksb.sb_status == -DLM_EUNLOCK ||
188 (ua->lksb.sb_status == -DLM_ECANCEL && 195 (ua->lksb.sb_status == -DLM_ECANCEL &&
189 lkb->lkb_grmode == DLM_LOCK_IV)) 196 lkb->lkb_grmode == DLM_LOCK_IV))
190 remove_ownqueue = 1; 197 eol = 1;
198 if (eol) {
199 lkb->lkb_ast_type &= ~AST_BAST;
200 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
201 }
191 202
192 /* We want to copy the lvb to userspace when the completion 203 /* We want to copy the lvb to userspace when the completion
193 ast is read if the status is 0, the lock has an lvb and 204 ast is read if the status is 0, the lock has an lvb and
@@ -204,11 +215,13 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
204 215
205 spin_unlock(&proc->asts_spin); 216 spin_unlock(&proc->asts_spin);
206 217
207 if (remove_ownqueue) { 218 if (eol) {
208 spin_lock(&ua->proc->locks_spin); 219 spin_lock(&ua->proc->locks_spin);
209 list_del_init(&lkb->lkb_ownqueue); 220 if (!list_empty(&lkb->lkb_ownqueue)) {
221 list_del_init(&lkb->lkb_ownqueue);
222 dlm_put_lkb(lkb);
223 }
210 spin_unlock(&ua->proc->locks_spin); 224 spin_unlock(&ua->proc->locks_spin);
211 dlm_put_lkb(lkb);
212 } 225 }
213 out: 226 out:
214 mutex_unlock(&ls->ls_clear_proc_locks); 227 mutex_unlock(&ls->ls_clear_proc_locks);
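
A condensed restatement (illustrative helper, not a function the patch
adds) of the three end-of-life cases the new comment enumerates; AST_COMP
is private to fs/dlm, so this only compiles there:

#include <linux/dlm.h>	/* DLM_LOCK_IV, DLM_EUNLOCK, DLM_ECANCEL */

static int lkb_is_eol(int ast_type, int grmode, int sb_status)
{
	/* 1. a noqueue request failed with EAGAIN */
	if (ast_type == AST_COMP && grmode == DLM_LOCK_IV &&
	    sb_status == -EAGAIN)
		return 1;
	/* 2. an unlock completed */
	if (sb_status == -DLM_EUNLOCK)
		return 1;
	/* 3. a cancel of a waiting request completed */
	if (sb_status == -DLM_ECANCEL && grmode == DLM_LOCK_IV)
		return 1;
	return 0;
}
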
@@ -286,47 +299,71 @@ static int device_user_unlock(struct dlm_user_proc *proc,
286 return error; 299 return error;
287} 300}
288 301
289static int device_create_lockspace(struct dlm_lspace_params *params) 302static int create_misc_device(struct dlm_ls *ls, char *name)
290{ 303{
291 dlm_lockspace_t *lockspace;
292 struct dlm_ls *ls;
293 int error, len; 304 int error, len;
294 305
295 if (!capable(CAP_SYS_ADMIN))
296 return -EPERM;
297
298 error = dlm_new_lockspace(params->name, strlen(params->name),
299 &lockspace, 0, DLM_USER_LVB_LEN);
300 if (error)
301 return error;
302
303 ls = dlm_find_lockspace_local(lockspace);
304 if (!ls)
305 return -ENOENT;
306
307 error = -ENOMEM; 306 error = -ENOMEM;
308 len = strlen(params->name) + strlen(name_prefix) + 2; 307 len = strlen(name) + strlen(name_prefix) + 2;
309 ls->ls_device.name = kzalloc(len, GFP_KERNEL); 308 ls->ls_device.name = kzalloc(len, GFP_KERNEL);
310 if (!ls->ls_device.name) 309 if (!ls->ls_device.name)
311 goto fail; 310 goto fail;
311
312 snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix, 312 snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
313 params->name); 313 name);
314 ls->ls_device.fops = &device_fops; 314 ls->ls_device.fops = &device_fops;
315 ls->ls_device.minor = MISC_DYNAMIC_MINOR; 315 ls->ls_device.minor = MISC_DYNAMIC_MINOR;
316 316
317 error = misc_register(&ls->ls_device); 317 error = misc_register(&ls->ls_device);
318 if (error) { 318 if (error) {
319 kfree(ls->ls_device.name); 319 kfree(ls->ls_device.name);
320 goto fail;
321 } 320 }
321fail:
322 return error;
323}
324
325static int device_user_purge(struct dlm_user_proc *proc,
326 struct dlm_purge_params *params)
327{
328 struct dlm_ls *ls;
329 int error;
330
331 ls = dlm_find_lockspace_local(proc->lockspace);
332 if (!ls)
333 return -ENOENT;
334
335 error = dlm_user_purge(ls, proc, params->nodeid, params->pid);
322 336
323 error = ls->ls_device.minor;
324 dlm_put_lockspace(ls); 337 dlm_put_lockspace(ls);
325 return error; 338 return error;
339}
340
341static int device_create_lockspace(struct dlm_lspace_params *params)
342{
343 dlm_lockspace_t *lockspace;
344 struct dlm_ls *ls;
345 int error;
326 346
327 fail: 347 if (!capable(CAP_SYS_ADMIN))
348 return -EPERM;
349
350 error = dlm_new_lockspace(params->name, strlen(params->name),
351 &lockspace, 0, DLM_USER_LVB_LEN);
352 if (error)
353 return error;
354
355 ls = dlm_find_lockspace_local(lockspace);
356 if (!ls)
357 return -ENOENT;
358
359 error = create_misc_device(ls, params->name);
328 dlm_put_lockspace(ls); 360 dlm_put_lockspace(ls);
329 dlm_release_lockspace(lockspace, 0); 361
362 if (error)
363 dlm_release_lockspace(lockspace, 0);
364 else
365 error = ls->ls_device.minor;
366
330 return error; 367 return error;
331} 368}
332 369
@@ -343,6 +380,10 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
343 if (!ls) 380 if (!ls)
344 return -ENOENT; 381 return -ENOENT;
345 382
383 /* Deregister the misc device first, so we don't have
384 * a device that's not attached to a lockspace. If
385 * dlm_release_lockspace fails then we can recreate it
386 */
346 error = misc_deregister(&ls->ls_device); 387 error = misc_deregister(&ls->ls_device);
347 if (error) { 388 if (error) {
348 dlm_put_lockspace(ls); 389 dlm_put_lockspace(ls);
@@ -361,6 +402,8 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
361 402
362 dlm_put_lockspace(ls); 403 dlm_put_lockspace(ls);
363 error = dlm_release_lockspace(lockspace, force); 404 error = dlm_release_lockspace(lockspace, force);
405 if (error)
406 create_misc_device(ls, ls->ls_name);
364 out: 407 out:
365 return error; 408 return error;
366} 409}
@@ -497,6 +540,14 @@ static ssize_t device_write(struct file *file, const char __user *buf,
497 error = device_remove_lockspace(&kbuf->i.lspace); 540 error = device_remove_lockspace(&kbuf->i.lspace);
498 break; 541 break;
499 542
543 case DLM_USER_PURGE:
544 if (!proc) {
545 log_print("no locking on control device");
546 goto out_sig;
547 }
548 error = device_user_purge(proc, &kbuf->i.purge);
549 break;
550
500 default: 551 default:
501 log_print("Unknown command passed to DLM device : %d\n", 552 log_print("Unknown command passed to DLM device : %d\n",
502 kbuf->cmd); 553 kbuf->cmd);
diff --git a/fs/dquot.c b/fs/dquot.c
index b16f991662c1..0a5febc159f2 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -1432,7 +1432,7 @@ int vfs_quota_off(struct super_block *sb, int type)
1432 mutex_unlock(&dqopt->dqonoff_mutex); 1432 mutex_unlock(&dqopt->dqonoff_mutex);
1433 } 1433 }
1434 if (sb->s_bdev) 1434 if (sb->s_bdev)
1435 invalidate_bdev(sb->s_bdev, 0); 1435 invalidate_bdev(sb->s_bdev);
1436 return 0; 1436 return 0;
1437} 1437}
1438 1438
@@ -1468,7 +1468,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
1468 * we see all the changes from userspace... */ 1468 * we see all the changes from userspace... */
1469 write_inode_now(inode, 1); 1469 write_inode_now(inode, 1);
1470 /* And now flush the block cache so that kernel sees the changes */ 1470 /* And now flush the block cache so that kernel sees the changes */
1471 invalidate_bdev(sb->s_bdev, 0); 1471 invalidate_bdev(sb->s_bdev);
1472 mutex_lock(&inode->i_mutex); 1472 mutex_lock(&inode->i_mutex);
1473 mutex_lock(&dqopt->dqonoff_mutex); 1473 mutex_lock(&dqopt->dqonoff_mutex);
1474 if (sb_has_quota_enabled(sb, type)) { 1474 if (sb_has_quota_enabled(sb, type)) {
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fc4a3a224641..8cbf3f69ebe5 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -583,8 +583,7 @@ inode_info_init_once(void *vptr, struct kmem_cache *cachep, unsigned long flags)
583{ 583{
584 struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; 584 struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr;
585 585
586 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == 586 if (flags & SLAB_CTOR_CONSTRUCTOR)
587 SLAB_CTOR_CONSTRUCTOR)
588 inode_init_once(&ei->vfs_inode); 587 inode_init_once(&ei->vfs_inode);
589} 588}
590 589
@@ -793,7 +792,7 @@ static int do_sysfs_registration(void)
793 "Unable to register ecryptfs sysfs subsystem\n"); 792 "Unable to register ecryptfs sysfs subsystem\n");
794 goto out; 793 goto out;
795 } 794 }
796 rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj, 795 rc = sysfs_create_file(&ecryptfs_subsys.kobj,
797 &sysfs_attr_version.attr); 796 &sysfs_attr_version.attr);
798 if (rc) { 797 if (rc) {
799 printk(KERN_ERR 798 printk(KERN_ERR
@@ -801,12 +800,12 @@ static int do_sysfs_registration(void)
801 subsystem_unregister(&ecryptfs_subsys); 800 subsystem_unregister(&ecryptfs_subsys);
802 goto out; 801 goto out;
803 } 802 }
804 rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj, 803 rc = sysfs_create_file(&ecryptfs_subsys.kobj,
805 &sysfs_attr_version_str.attr); 804 &sysfs_attr_version_str.attr);
806 if (rc) { 805 if (rc) {
807 printk(KERN_ERR 806 printk(KERN_ERR
808 "Unable to create ecryptfs version_str attribute\n"); 807 "Unable to create ecryptfs version_str attribute\n");
809 sysfs_remove_file(&ecryptfs_subsys.kset.kobj, 808 sysfs_remove_file(&ecryptfs_subsys.kobj,
810 &sysfs_attr_version.attr); 809 &sysfs_attr_version.attr);
811 subsystem_unregister(&ecryptfs_subsys); 810 subsystem_unregister(&ecryptfs_subsys);
812 goto out; 811 goto out;
@@ -841,7 +840,7 @@ static int __init ecryptfs_init(void)
841 ecryptfs_free_kmem_caches(); 840 ecryptfs_free_kmem_caches();
842 goto out; 841 goto out;
843 } 842 }
844 kset_set_kset_s(&ecryptfs_subsys, fs_subsys); 843 kobj_set_kset_s(&ecryptfs_subsys, fs_subsys);
845 sysfs_attr_version.attr.owner = THIS_MODULE; 844 sysfs_attr_version.attr.owner = THIS_MODULE;
846 sysfs_attr_version_str.attr.owner = THIS_MODULE; 845 sysfs_attr_version_str.attr.owner = THIS_MODULE;
847 rc = do_sysfs_registration(); 846 rc = do_sysfs_registration();
@@ -862,9 +861,9 @@ out:
862 861
863static void __exit ecryptfs_exit(void) 862static void __exit ecryptfs_exit(void)
864{ 863{
865 sysfs_remove_file(&ecryptfs_subsys.kset.kobj, 864 sysfs_remove_file(&ecryptfs_subsys.kobj,
866 &sysfs_attr_version.attr); 865 &sysfs_attr_version.attr);
867 sysfs_remove_file(&ecryptfs_subsys.kset.kobj, 866 sysfs_remove_file(&ecryptfs_subsys.kobj,
868 &sysfs_attr_version_str.attr); 867 &sysfs_attr_version_str.attr);
869 subsystem_unregister(&ecryptfs_subsys); 868 subsystem_unregister(&ecryptfs_subsys);
870 ecryptfs_release_messaging(ecryptfs_transport); 869 ecryptfs_release_messaging(ecryptfs_transport);
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index b731b09499cb..0770c4b66f53 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -46,7 +46,6 @@ struct kmem_cache *ecryptfs_lower_page_cache;
46 */ 46 */
47static struct page *ecryptfs_get1page(struct file *file, int index) 47static struct page *ecryptfs_get1page(struct file *file, int index)
48{ 48{
49 struct page *page;
50 struct dentry *dentry; 49 struct dentry *dentry;
51 struct inode *inode; 50 struct inode *inode;
52 struct address_space *mapping; 51 struct address_space *mapping;
@@ -54,14 +53,7 @@ static struct page *ecryptfs_get1page(struct file *file, int index)
54 dentry = file->f_path.dentry; 53 dentry = file->f_path.dentry;
55 inode = dentry->d_inode; 54 inode = dentry->d_inode;
56 mapping = inode->i_mapping; 55 mapping = inode->i_mapping;
57 page = read_cache_page(mapping, index, 56 return read_mapping_page(mapping, index, (void *)file);
58 (filler_t *)mapping->a_ops->readpage,
59 (void *)file);
60 if (IS_ERR(page))
61 goto out;
62 wait_on_page_locked(page);
63out:
64 return page;
65} 57}
66 58
67static 59static
@@ -233,7 +225,6 @@ int ecryptfs_do_readpage(struct file *file, struct page *page,
233 ecryptfs_printk(KERN_ERR, "Error reading from page cache\n"); 225 ecryptfs_printk(KERN_ERR, "Error reading from page cache\n");
234 goto out; 226 goto out;
235 } 227 }
236 wait_on_page_locked(lower_page);
237 page_data = kmap_atomic(page, KM_USER0); 228 page_data = kmap_atomic(page, KM_USER0);
238 lower_page_data = kmap_atomic(lower_page, KM_USER1); 229 lower_page_data = kmap_atomic(lower_page, KM_USER1);
239 memcpy(page_data, lower_page_data, PAGE_CACHE_SIZE); 230 memcpy(page_data, lower_page_data, PAGE_CACHE_SIZE);
diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
index e3aa2253c850..fe9186312d7c 100644
--- a/fs/ecryptfs/netlink.c
+++ b/fs/ecryptfs/netlink.c
@@ -97,7 +97,7 @@ out:
97 */ 97 */
98static int ecryptfs_process_nl_response(struct sk_buff *skb) 98static int ecryptfs_process_nl_response(struct sk_buff *skb)
99{ 99{
100 struct nlmsghdr *nlh = (struct nlmsghdr*)skb->data; 100 struct nlmsghdr *nlh = nlmsg_hdr(skb);
101 struct ecryptfs_message *msg = NLMSG_DATA(nlh); 101 struct ecryptfs_message *msg = NLMSG_DATA(nlh);
102 int rc; 102 int rc;
103 103
@@ -181,7 +181,7 @@ receive:
181 "rc = [%d]\n", rc); 181 "rc = [%d]\n", rc);
182 return; 182 return;
183 } 183 }
184 nlh = (struct nlmsghdr *)skb->data; 184 nlh = nlmsg_hdr(skb);
185 if (!NLMSG_OK(nlh, skb->len)) { 185 if (!NLMSG_OK(nlh, skb->len)) {
186 ecryptfs_printk(KERN_ERR, "Received corrupt netlink " 186 ecryptfs_printk(KERN_ERR, "Received corrupt netlink "
187 "message\n"); 187 "message\n");
@@ -229,7 +229,7 @@ int ecryptfs_init_netlink(void)
229 229
230 ecryptfs_nl_sock = netlink_kernel_create(NETLINK_ECRYPTFS, 0, 230 ecryptfs_nl_sock = netlink_kernel_create(NETLINK_ECRYPTFS, 0,
231 ecryptfs_receive_nl_message, 231 ecryptfs_receive_nl_message,
232 THIS_MODULE); 232 NULL, THIS_MODULE);
233 if (!ecryptfs_nl_sock) { 233 if (!ecryptfs_nl_sock) {
234 rc = -EIO; 234 rc = -EIO;
235 ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n"); 235 ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n");
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c2235e46edcd..ba7a8b9da0c1 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -72,8 +72,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
72{ 72{
73 struct efs_inode_info *ei = (struct efs_inode_info *) foo; 73 struct efs_inode_info *ei = (struct efs_inode_info *) foo;
74 74
75 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 75 if (flags & SLAB_CTOR_CONSTRUCTOR)
76 SLAB_CTOR_CONSTRUCTOR)
77 inode_init_once(&ei->vfs_inode); 76 inode_init_once(&ei->vfs_inode);
78} 77}
79 78
diff --git a/fs/exec.c b/fs/exec.c
index 7e36c6f6f538..3155e915307a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1244,13 +1244,17 @@ EXPORT_SYMBOL(set_binfmt);
1244 * name into corename, which must have space for at least 1244 * name into corename, which must have space for at least
1245 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. 1245 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
1246 */ 1246 */
1247static void format_corename(char *corename, const char *pattern, long signr) 1247static int format_corename(char *corename, const char *pattern, long signr)
1248{ 1248{
1249 const char *pat_ptr = pattern; 1249 const char *pat_ptr = pattern;
1250 char *out_ptr = corename; 1250 char *out_ptr = corename;
1251 char *const out_end = corename + CORENAME_MAX_SIZE; 1251 char *const out_end = corename + CORENAME_MAX_SIZE;
1252 int rc; 1252 int rc;
1253 int pid_in_pattern = 0; 1253 int pid_in_pattern = 0;
1254 int ispipe = 0;
1255
1256 if (*pattern == '|')
1257 ispipe = 1;
1254 1258
1255 /* Repeat as long as we have more pattern to process and more output 1259 /* Repeat as long as we have more pattern to process and more output
1256 space */ 1260 space */
@@ -1341,8 +1345,8 @@ static void format_corename(char *corename, const char *pattern, long signr)
1341 * 1345 *
1342 * If core_pattern does not include a %p (as is the default) 1346 * If core_pattern does not include a %p (as is the default)
1343 * and core_uses_pid is set, then .%pid will be appended to 1347 * and core_uses_pid is set, then .%pid will be appended to
1344 * the filename */ 1348 * the filename. Do not do this for piped commands. */
1345 if (!pid_in_pattern 1349 if (!ispipe && !pid_in_pattern
1346 && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) { 1350 && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
1347 rc = snprintf(out_ptr, out_end - out_ptr, 1351 rc = snprintf(out_ptr, out_end - out_ptr,
1348 ".%d", current->tgid); 1352 ".%d", current->tgid);
@@ -1350,8 +1354,9 @@ static void format_corename(char *corename, const char *pattern, long signr)
1350 goto out; 1354 goto out;
1351 out_ptr += rc; 1355 out_ptr += rc;
1352 } 1356 }
1353 out: 1357out:
1354 *out_ptr = 0; 1358 *out_ptr = 0;
1359 return ispipe;
1355} 1360}
1356 1361
1357static void zap_process(struct task_struct *start) 1362static void zap_process(struct task_struct *start)
@@ -1502,16 +1507,15 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1502 * uses lock_kernel() 1507 * uses lock_kernel()
1503 */ 1508 */
1504 lock_kernel(); 1509 lock_kernel();
1505 format_corename(corename, core_pattern, signr); 1510 ispipe = format_corename(corename, core_pattern, signr);
1506 unlock_kernel(); 1511 unlock_kernel();
1507 if (corename[0] == '|') { 1512 if (ispipe) {
1508 /* SIGPIPE can happen, but it's just never processed */ 1513 /* SIGPIPE can happen, but it's just never processed */
1509 if(call_usermodehelper_pipe(corename+1, NULL, NULL, &file)) { 1514 if(call_usermodehelper_pipe(corename+1, NULL, NULL, &file)) {
1510 printk(KERN_INFO "Core dump to %s pipe failed\n", 1515 printk(KERN_INFO "Core dump to %s pipe failed\n",
1511 corename); 1516 corename);
1512 goto fail_unlock; 1517 goto fail_unlock;
1513 } 1518 }
1514 ispipe = 1;
1515 } else 1519 } else
1516 file = filp_open(corename, 1520 file = filp_open(corename,
1517 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1521 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
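[Editor's note] format_corename() now reports directly whether core_pattern names a pipe instead of do_coredump() re-inspecting corename[0], and the ".<pid>" suffix is suppressed for piped commands. A userspace sketch of the two pattern styles being distinguished; the paths and helper name are assumptions, and writing core_pattern requires root.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/core_pattern", "w");
	if (!f) {
		perror("core_pattern");
		return 1;
	}
	/* Plain file pattern: %p expands to the PID; when no %p appears
	 * and core_uses_pid is set, ".<pid>" may still be appended. */
	fprintf(f, "/tmp/core.%%e.%%p\n");

	/* Piped pattern: a leading '|' makes format_corename() return
	 * ispipe != 0, the rest is run as a usermode helper, and no
	 * ".<pid>" suffix is ever appended. (Hypothetical helper path.) */
	/* fprintf(f, "|/usr/local/bin/core-helper %%p\n"); */
	fclose(f);
	return 0;
}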
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index e89bfc8cf957..1d1e7e30d70e 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -161,10 +161,7 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n)
161 struct address_space *mapping = dir->i_mapping; 161 struct address_space *mapping = dir->i_mapping;
162 struct page *page = read_mapping_page(mapping, n, NULL); 162 struct page *page = read_mapping_page(mapping, n, NULL);
163 if (!IS_ERR(page)) { 163 if (!IS_ERR(page)) {
164 wait_on_page_locked(page);
165 kmap(page); 164 kmap(page);
166 if (!PageUptodate(page))
167 goto fail;
168 if (!PageChecked(page)) 165 if (!PageChecked(page))
169 ext2_check_page(page); 166 ext2_check_page(page);
170 if (PageError(page)) 167 if (PageError(page))
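[Editor's note] The deleted wait_on_page_locked()/PageUptodate() pair (here, and in the identical vxfs hunk below) follows from read_mapping_page() now returning either an up-to-date page or an ERR_PTR, so callers no longer lock-wait and re-check by hand. A sketch of the resulting caller shape; my_get_page() is a hypothetical stand-in for ext2_get_page().

#include <linux/pagemap.h>

static struct page *my_get_page(struct inode *dir, unsigned long n)
{
	struct page *page = read_mapping_page(dir->i_mapping, n, NULL);

	if (!IS_ERR(page)) {
		kmap(page);		/* already uptodate on return */
		if (PageError(page)) {	/* an earlier I/O error may still
					 * be flagged on the page */
			kunmap(page);
			page_cache_release(page);
			return ERR_PTR(-EIO);
		}
	}
	return page;
}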
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a046a419d8af..685a1c287177 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -160,8 +160,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
160{ 160{
161 struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; 161 struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
162 162
163 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 163 if (flags & SLAB_CTOR_CONSTRUCTOR) {
164 SLAB_CTOR_CONSTRUCTOR) {
165 rwlock_init(&ei->i_meta_lock); 164 rwlock_init(&ei->i_meta_lock);
166#ifdef CONFIG_EXT2_FS_XATTR 165#ifdef CONFIG_EXT2_FS_XATTR
167 init_rwsem(&ei->xattr_sem); 166 init_rwsem(&ei->xattr_sem);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 4a4fcd6868c7..54d3c9041259 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -420,7 +420,7 @@ static void ext3_put_super (struct super_block * sb)
420 dump_orphan_list(sb, sbi); 420 dump_orphan_list(sb, sbi);
421 J_ASSERT(list_empty(&sbi->s_orphan)); 421 J_ASSERT(list_empty(&sbi->s_orphan));
422 422
423 invalidate_bdev(sb->s_bdev, 0); 423 invalidate_bdev(sb->s_bdev);
424 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { 424 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
425 /* 425 /*
426 * Invalidate the journal device's buffers. We don't want them 426 * Invalidate the journal device's buffers. We don't want them
@@ -428,7 +428,7 @@ static void ext3_put_super (struct super_block * sb)
428 * hotswapped, and it breaks the `ro-after' testing code. 428 * hotswapped, and it breaks the `ro-after' testing code.
429 */ 429 */
430 sync_blockdev(sbi->journal_bdev); 430 sync_blockdev(sbi->journal_bdev);
431 invalidate_bdev(sbi->journal_bdev, 0); 431 invalidate_bdev(sbi->journal_bdev);
432 ext3_blkdev_remove(sbi); 432 ext3_blkdev_remove(sbi);
433 } 433 }
434 sb->s_fs_info = NULL; 434 sb->s_fs_info = NULL;
@@ -466,8 +466,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
466{ 466{
467 struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; 467 struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
468 468
469 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 469 if (flags & SLAB_CTOR_CONSTRUCTOR) {
470 SLAB_CTOR_CONSTRUCTOR) {
471 INIT_LIST_HEAD(&ei->i_orphan); 470 INIT_LIST_HEAD(&ei->i_orphan);
472#ifdef CONFIG_EXT3_FS_XATTR 471#ifdef CONFIG_EXT3_FS_XATTR
473 init_rwsem(&ei->xattr_sem); 472 init_rwsem(&ei->xattr_sem);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 61c4718e4a53..719126932354 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -470,7 +470,7 @@ static void ext4_put_super (struct super_block * sb)
470 dump_orphan_list(sb, sbi); 470 dump_orphan_list(sb, sbi);
471 J_ASSERT(list_empty(&sbi->s_orphan)); 471 J_ASSERT(list_empty(&sbi->s_orphan));
472 472
473 invalidate_bdev(sb->s_bdev, 0); 473 invalidate_bdev(sb->s_bdev);
474 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { 474 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
475 /* 475 /*
476 * Invalidate the journal device's buffers. We don't want them 476 * Invalidate the journal device's buffers. We don't want them
@@ -478,7 +478,7 @@ static void ext4_put_super (struct super_block * sb)
478 * hotswapped, and it breaks the `ro-after' testing code. 478 * hotswapped, and it breaks the `ro-after' testing code.
479 */ 479 */
480 sync_blockdev(sbi->journal_bdev); 480 sync_blockdev(sbi->journal_bdev);
481 invalidate_bdev(sbi->journal_bdev, 0); 481 invalidate_bdev(sbi->journal_bdev);
482 ext4_blkdev_remove(sbi); 482 ext4_blkdev_remove(sbi);
483 } 483 }
484 sb->s_fs_info = NULL; 484 sb->s_fs_info = NULL;
@@ -517,8 +517,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
517{ 517{
518 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; 518 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
519 519
520 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 520 if (flags & SLAB_CTOR_CONSTRUCTOR) {
521 SLAB_CTOR_CONSTRUCTOR) {
522 INIT_LIST_HEAD(&ei->i_orphan); 521 INIT_LIST_HEAD(&ei->i_orphan);
523#ifdef CONFIG_EXT4DEV_FS_XATTR 522#ifdef CONFIG_EXT4DEV_FS_XATTR
524 init_rwsem(&ei->xattr_sem); 523 init_rwsem(&ei->xattr_sem);
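[Editor's note] invalidate_bdev() loses its destroy_dirty_buffers argument in this series (callers here always passed 0), so both journal-teardown paths simply drop the trailing flag. Sketched against the updated prototype; my_drop_cached_blocks() is a hypothetical caller.

#include <linux/buffer_head.h>

/* Before: invalidate_bdev(bdev, 0);  the flag is gone from the API now. */
static void my_drop_cached_blocks(struct block_device *bdev)
{
	sync_blockdev(bdev);	/* flush dirty buffers first */
	invalidate_bdev(bdev);	/* then toss the clean cached ones */
}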
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 05c2941c74f2..1959143c1d27 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -40,8 +40,7 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
40{ 40{
41 struct fat_cache *cache = (struct fat_cache *)foo; 41 struct fat_cache *cache = (struct fat_cache *)foo;
42 42
43 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 43 if (flags & SLAB_CTOR_CONSTRUCTOR)
44 SLAB_CTOR_CONSTRUCTOR)
45 INIT_LIST_HEAD(&cache->cache_list); 44 INIT_LIST_HEAD(&cache->cache_list);
46} 45}
47 46
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 9bfe607c892e..65cb54bde481 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -499,8 +499,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
499{ 499{
500 struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; 500 struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
501 501
502 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 502 if (flags & SLAB_CTOR_CONSTRUCTOR) {
503 SLAB_CTOR_CONSTRUCTOR) {
504 spin_lock_init(&ei->cache_lru_lock); 503 spin_lock_init(&ei->cache_lru_lock);
505 ei->nr_caches = 0; 504 ei->nr_caches = 0;
506 ei->cache_valid_id = FAT_CACHE_VALID + 1; 505 ei->cache_valid_id = FAT_CACHE_VALID + 1;
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index decac62efe57..ed8f0b0dd880 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -74,10 +74,7 @@ vxfs_get_page(struct address_space *mapping, u_long n)
74 pp = read_mapping_page(mapping, n, NULL); 74 pp = read_mapping_page(mapping, n, NULL);
75 75
76 if (!IS_ERR(pp)) { 76 if (!IS_ERR(pp)) {
77 wait_on_page_locked(pp);
78 kmap(pp); 77 kmap(pp);
79 if (!PageUptodate(pp))
80 goto fail;
81 /** if (!PageChecked(pp)) **/ 78 /** if (!PageChecked(pp)) **/
82 /** vxfs_check_page(pp); **/ 79 /** vxfs_check_page(pp); **/
83 if (PageError(pp)) 80 if (PageError(pp))
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 608db81219a0..d8003be56e05 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -685,8 +685,7 @@ static void fuse_inode_init_once(void *foo, struct kmem_cache *cachep,
685{ 685{
686 struct inode * inode = foo; 686 struct inode * inode = foo;
687 687
688 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 688 if (flags & SLAB_CTOR_CONSTRUCTOR)
689 SLAB_CTOR_CONSTRUCTOR)
690 inode_init_once(inode); 689 inode_init_once(inode);
691} 690}
692 691
@@ -731,12 +730,12 @@ static int fuse_sysfs_init(void)
731{ 730{
732 int err; 731 int err;
733 732
734 kset_set_kset_s(&fuse_subsys, fs_subsys); 733 kobj_set_kset_s(&fuse_subsys, fs_subsys);
735 err = subsystem_register(&fuse_subsys); 734 err = subsystem_register(&fuse_subsys);
736 if (err) 735 if (err)
737 goto out_err; 736 goto out_err;
738 737
739 kset_set_kset_s(&connections_subsys, fuse_subsys); 738 kobj_set_kset_s(&connections_subsys, fuse_subsys);
740 err = subsystem_register(&connections_subsys); 739 err = subsystem_register(&connections_subsys);
741 if (err) 740 if (err)
742 goto out_fuse_unregister; 741 goto out_fuse_unregister;
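[Editor's note] kset_set_kset_s() gives way to kobj_set_kset_s(), which parents an object by pointing its embedded kobject's kset at the named subsystem before registration (the lock_dlm sysfs hunk further down makes the same change). A hedged sketch, assuming fs_subsys and the 2.6.21-era kobject macros are in scope; the "demo" subsystem is made up.

#include <linux/kobject.h>

static decl_subsys(demo, NULL, NULL);	/* declares struct kset demo_subsys */

static int __init demo_sysfs_init(void)
{
	/* Parent demo_subsys under fs_subsys (i.e. /sys/fs/demo): the
	 * macro sets demo_subsys.kobj.kset = &fs_subsys. */
	kobj_set_kset_s(&demo_subsys, fs_subsys);
	return subsystem_register(&demo_subsys);
}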
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 82a1ac7895a2..a96fa07b3f3b 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1262,9 +1262,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1262 u64 leaf_no) 1262 u64 leaf_no)
1263{ 1263{
1264 struct gfs2_inode *ip = GFS2_I(inode); 1264 struct gfs2_inode *ip = GFS2_I(inode);
1265 struct gfs2_sbd *sdp = GFS2_SB(inode);
1265 struct buffer_head *bh; 1266 struct buffer_head *bh;
1266 struct gfs2_leaf *lf; 1267 struct gfs2_leaf *lf;
1267 unsigned entries = 0; 1268 unsigned entries = 0, entries2 = 0;
1268 unsigned leaves = 0; 1269 unsigned leaves = 0;
1269 const struct gfs2_dirent **darr, *dent; 1270 const struct gfs2_dirent **darr, *dent;
1270 struct dirent_gather g; 1271 struct dirent_gather g;
@@ -1290,7 +1291,13 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1290 return 0; 1291 return 0;
1291 1292
1292 error = -ENOMEM; 1293 error = -ENOMEM;
1293 larr = vmalloc((leaves + entries) * sizeof(void *)); 1294 /*
1295 * The extra 99 entries are not normally used, but are a buffer
1296 * zone in case the number of entries in the leaf is corrupt.
1297 * 99 is the maximum number of entries that can fit in a single
1298 * leaf block.
1299 */
1300 larr = vmalloc((leaves + entries + 99) * sizeof(void *));
1294 if (!larr) 1301 if (!larr)
1295 goto out; 1302 goto out;
1296 darr = (const struct gfs2_dirent **)(larr + leaves); 1303 darr = (const struct gfs2_dirent **)(larr + leaves);
@@ -1305,10 +1312,20 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1305 lf = (struct gfs2_leaf *)bh->b_data; 1312 lf = (struct gfs2_leaf *)bh->b_data;
1306 lfn = be64_to_cpu(lf->lf_next); 1313 lfn = be64_to_cpu(lf->lf_next);
1307 if (lf->lf_entries) { 1314 if (lf->lf_entries) {
1315 entries2 += be16_to_cpu(lf->lf_entries);
1308 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, 1316 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
1309 gfs2_dirent_gather, NULL, &g); 1317 gfs2_dirent_gather, NULL, &g);
1310 error = PTR_ERR(dent); 1318 error = PTR_ERR(dent);
1311 if (IS_ERR(dent)) { 1319 if (IS_ERR(dent))
1320 goto out_kfree;
1321 if (entries2 != g.offset) {
1322 fs_warn(sdp, "Number of entries corrupt in dir "
1323 "leaf %llu, entries2 (%u) != "
1324 "g.offset (%u)\n",
1325 (unsigned long long)bh->b_blocknr,
1326 entries2, g.offset);
1327
1328 error = -EIO;
1312 goto out_kfree; 1329 goto out_kfree;
1313 } 1330 }
1314 error = 0; 1331 error = 0;
@@ -1318,6 +1335,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1318 } 1335 }
1319 } while(lfn); 1336 } while(lfn);
1320 1337
1338 BUG_ON(entries2 != entries);
1321 error = do_filldir_main(ip, offset, opaque, filldir, darr, 1339 error = do_filldir_main(ip, offset, opaque, filldir, darr,
1322 entries, copied); 1340 entries, copied);
1323out_kfree: 1341out_kfree:
@@ -1401,6 +1419,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1401 filldir_t filldir) 1419 filldir_t filldir)
1402{ 1420{
1403 struct gfs2_inode *dip = GFS2_I(inode); 1421 struct gfs2_inode *dip = GFS2_I(inode);
1422 struct gfs2_sbd *sdp = GFS2_SB(inode);
1404 struct dirent_gather g; 1423 struct dirent_gather g;
1405 const struct gfs2_dirent **darr, *dent; 1424 const struct gfs2_dirent **darr, *dent;
1406 struct buffer_head *dibh; 1425 struct buffer_head *dibh;
@@ -1423,8 +1442,8 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1423 return error; 1442 return error;
1424 1443
1425 error = -ENOMEM; 1444 error = -ENOMEM;
1426 darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *), 1445 /* 96 is max number of dirents which can be stuffed into an inode */
1427 GFP_KERNEL); 1446 darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_KERNEL);
1428 if (darr) { 1447 if (darr) {
1429 g.pdent = darr; 1448 g.pdent = darr;
1430 g.offset = 0; 1449 g.offset = 0;
@@ -1434,6 +1453,15 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1434 error = PTR_ERR(dent); 1453 error = PTR_ERR(dent);
1435 goto out; 1454 goto out;
1436 } 1455 }
1456 if (dip->i_di.di_entries != g.offset) {
1457 fs_warn(sdp, "Number of entries corrupt in dir %llu, "
1458 "ip->i_di.di_entries (%u) != g.offset (%u)\n",
1459 (unsigned long long)dip->i_num.no_addr,
1460 dip->i_di.di_entries,
1461 g.offset);
1462 error = -EIO;
1463 goto out;
1464 }
1437 error = do_filldir_main(dip, offset, opaque, filldir, darr, 1465 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1438 dip->i_di.di_entries, &copied); 1466 dip->i_di.di_entries, &copied);
1439out: 1467out:
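[Editor's note] Both new checks in this file share one defensive shape: count what the on-disk metadata claims, count what gfs2_dirent_scan() actually gathered, and return -EIO with a warning rather than trust a corrupt directory (the added BUG_ON() then asserts the two leaf-walk counters agree). A condensed sketch of that shape; check_entry_count() is a hypothetical helper, not a function in this patch.

/* "expected" comes from on-disk metadata, "gathered" from walking the
 * dirents ourselves; any mismatch means the directory is corrupt. */
static int check_entry_count(struct gfs2_sbd *sdp, u64 blkno,
			     unsigned int expected, unsigned int gathered)
{
	if (expected == gathered)
		return 0;
	fs_warn(sdp, "Number of entries corrupt in dir %llu, "
		"expected (%u) != gathered (%u)\n",
		(unsigned long long)blkno, expected, gathered);
	return -EIO;	/* refuse to use the corrupt directory */
}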
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 12accb08fe02..1815429a2978 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -23,6 +23,10 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/rwsem.h> 24#include <linux/rwsem.h>
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26#include <linux/seq_file.h>
27#include <linux/debugfs.h>
28#include <linux/module.h>
29#include <linux/kallsyms.h>
26 30
27#include "gfs2.h" 31#include "gfs2.h"
28#include "incore.h" 32#include "incore.h"
@@ -40,20 +44,30 @@ struct gfs2_gl_hash_bucket {
40 struct hlist_head hb_list; 44 struct hlist_head hb_list;
41}; 45};
42 46
47struct glock_iter {
48 int hash; /* hash bucket index */
49 struct gfs2_sbd *sdp; /* incore superblock */
50 struct gfs2_glock *gl; /* current glock struct */
51 struct hlist_head *hb_list; /* current hash bucket ptr */
52 struct seq_file *seq; /* sequence file for debugfs */
53 char string[512]; /* scratch space */
54};
55
43typedef void (*glock_examiner) (struct gfs2_glock * gl); 56typedef void (*glock_examiner) (struct gfs2_glock * gl);
44 57
45static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); 58static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
46static int dump_glock(struct gfs2_glock *gl); 59static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
47static int dump_inode(struct gfs2_inode *ip); 60static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
48static void gfs2_glock_xmote_th(struct gfs2_holder *gh);
49static void gfs2_glock_drop_th(struct gfs2_glock *gl); 61static void gfs2_glock_drop_th(struct gfs2_glock *gl);
50static DECLARE_RWSEM(gfs2_umount_flush_sem); 62static DECLARE_RWSEM(gfs2_umount_flush_sem);
63static struct dentry *gfs2_root;
51 64
52#define GFS2_GL_HASH_SHIFT 15 65#define GFS2_GL_HASH_SHIFT 15
53#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 66#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
54#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) 67#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
55 68
56static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE]; 69static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
70static struct dentry *gfs2_root;
57 71
58/* 72/*
59 * Despite what you might think, the numbers below are not arbitrary :-) 73 * Despite what you might think, the numbers below are not arbitrary :-)
@@ -202,7 +216,6 @@ int gfs2_glock_put(struct gfs2_glock *gl)
202 gfs2_assert(sdp, list_empty(&gl->gl_reclaim)); 216 gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
203 gfs2_assert(sdp, list_empty(&gl->gl_holders)); 217 gfs2_assert(sdp, list_empty(&gl->gl_holders));
204 gfs2_assert(sdp, list_empty(&gl->gl_waiters1)); 218 gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
205 gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
206 gfs2_assert(sdp, list_empty(&gl->gl_waiters3)); 219 gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
207 glock_free(gl); 220 glock_free(gl);
208 rv = 1; 221 rv = 1;
@@ -303,7 +316,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
303 atomic_set(&gl->gl_ref, 1); 316 atomic_set(&gl->gl_ref, 1);
304 gl->gl_state = LM_ST_UNLOCKED; 317 gl->gl_state = LM_ST_UNLOCKED;
305 gl->gl_hash = hash; 318 gl->gl_hash = hash;
306 gl->gl_owner = NULL; 319 gl->gl_owner_pid = 0;
307 gl->gl_ip = 0; 320 gl->gl_ip = 0;
308 gl->gl_ops = glops; 321 gl->gl_ops = glops;
309 gl->gl_req_gh = NULL; 322 gl->gl_req_gh = NULL;
@@ -367,7 +380,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
367 INIT_LIST_HEAD(&gh->gh_list); 380 INIT_LIST_HEAD(&gh->gh_list);
368 gh->gh_gl = gl; 381 gh->gh_gl = gl;
369 gh->gh_ip = (unsigned long)__builtin_return_address(0); 382 gh->gh_ip = (unsigned long)__builtin_return_address(0);
370 gh->gh_owner = current; 383 gh->gh_owner_pid = current->pid;
371 gh->gh_state = state; 384 gh->gh_state = state;
372 gh->gh_flags = flags; 385 gh->gh_flags = flags;
373 gh->gh_error = 0; 386 gh->gh_error = 0;
@@ -389,7 +402,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
389{ 402{
390 gh->gh_state = state; 403 gh->gh_state = state;
391 gh->gh_flags = flags; 404 gh->gh_flags = flags;
392 gh->gh_iflags &= 1 << HIF_ALLOCED; 405 gh->gh_iflags = 0;
393 gh->gh_ip = (unsigned long)__builtin_return_address(0); 406 gh->gh_ip = (unsigned long)__builtin_return_address(0);
394} 407}
395 408
@@ -406,54 +419,8 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
406 gh->gh_ip = 0; 419 gh->gh_ip = 0;
407} 420}
408 421
409/** 422static void gfs2_holder_wake(struct gfs2_holder *gh)
410 * gfs2_holder_get - get a struct gfs2_holder structure
411 * @gl: the glock
412 * @state: the state we're requesting
413 * @flags: the modifier flags
414 * @gfp_flags:
415 *
416 * Figure out how big an impact this function has. Either:
417 * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
418 * 2) Leave it like it is
419 *
420 * Returns: the holder structure, NULL on ENOMEM
421 */
422
423static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
424 unsigned int state,
425 int flags, gfp_t gfp_flags)
426{
427 struct gfs2_holder *gh;
428
429 gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
430 if (!gh)
431 return NULL;
432
433 gfs2_holder_init(gl, state, flags, gh);
434 set_bit(HIF_ALLOCED, &gh->gh_iflags);
435 gh->gh_ip = (unsigned long)__builtin_return_address(0);
436 return gh;
437}
438
439/**
440 * gfs2_holder_put - get rid of a struct gfs2_holder structure
441 * @gh: the holder structure
442 *
443 */
444
445static void gfs2_holder_put(struct gfs2_holder *gh)
446{ 423{
447 gfs2_holder_uninit(gh);
448 kfree(gh);
449}
450
451static void gfs2_holder_dispose_or_wake(struct gfs2_holder *gh)
452{
453 if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) {
454 gfs2_holder_put(gh);
455 return;
456 }
457 clear_bit(HIF_WAIT, &gh->gh_iflags); 424 clear_bit(HIF_WAIT, &gh->gh_iflags);
458 smp_mb(); 425 smp_mb();
459 wake_up_bit(&gh->gh_iflags, HIF_WAIT); 426 wake_up_bit(&gh->gh_iflags, HIF_WAIT);
@@ -519,7 +486,7 @@ static int rq_promote(struct gfs2_holder *gh)
519 gfs2_reclaim_glock(sdp); 486 gfs2_reclaim_glock(sdp);
520 } 487 }
521 488
522 gfs2_glock_xmote_th(gh); 489 gfs2_glock_xmote_th(gh->gh_gl, gh);
523 spin_lock(&gl->gl_spin); 490 spin_lock(&gl->gl_spin);
524 } 491 }
525 return 1; 492 return 1;
@@ -542,7 +509,7 @@ static int rq_promote(struct gfs2_holder *gh)
542 gh->gh_error = 0; 509 gh->gh_error = 0;
543 set_bit(HIF_HOLDER, &gh->gh_iflags); 510 set_bit(HIF_HOLDER, &gh->gh_iflags);
544 511
545 gfs2_holder_dispose_or_wake(gh); 512 gfs2_holder_wake(gh);
546 513
547 return 0; 514 return 0;
548} 515}
@@ -554,32 +521,24 @@ static int rq_promote(struct gfs2_holder *gh)
554 * Returns: 1 if the queue is blocked 521 * Returns: 1 if the queue is blocked
555 */ 522 */
556 523
557static int rq_demote(struct gfs2_holder *gh) 524static int rq_demote(struct gfs2_glock *gl)
558{ 525{
559 struct gfs2_glock *gl = gh->gh_gl;
560
561 if (!list_empty(&gl->gl_holders)) 526 if (!list_empty(&gl->gl_holders))
562 return 1; 527 return 1;
563 528
564 if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) { 529 if (gl->gl_state == gl->gl_demote_state ||
565 list_del_init(&gh->gh_list); 530 gl->gl_state == LM_ST_UNLOCKED) {
566 gh->gh_error = 0; 531 clear_bit(GLF_DEMOTE, &gl->gl_flags);
567 spin_unlock(&gl->gl_spin); 532 return 0;
568 gfs2_holder_dispose_or_wake(gh);
569 spin_lock(&gl->gl_spin);
570 } else {
571 gl->gl_req_gh = gh;
572 set_bit(GLF_LOCK, &gl->gl_flags);
573 spin_unlock(&gl->gl_spin);
574
575 if (gh->gh_state == LM_ST_UNLOCKED ||
576 gl->gl_state != LM_ST_EXCLUSIVE)
577 gfs2_glock_drop_th(gl);
578 else
579 gfs2_glock_xmote_th(gh);
580
581 spin_lock(&gl->gl_spin);
582 } 533 }
534 set_bit(GLF_LOCK, &gl->gl_flags);
535 spin_unlock(&gl->gl_spin);
536 if (gl->gl_demote_state == LM_ST_UNLOCKED ||
537 gl->gl_state != LM_ST_EXCLUSIVE)
538 gfs2_glock_drop_th(gl);
539 else
540 gfs2_glock_xmote_th(gl, NULL);
541 spin_lock(&gl->gl_spin);
583 542
584 return 0; 543 return 0;
585} 544}
@@ -607,16 +566,8 @@ static void run_queue(struct gfs2_glock *gl)
607 else 566 else
608 gfs2_assert_warn(gl->gl_sbd, 0); 567 gfs2_assert_warn(gl->gl_sbd, 0);
609 568
610 } else if (!list_empty(&gl->gl_waiters2) && 569 } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
611 !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) { 570 blocked = rq_demote(gl);
612 gh = list_entry(gl->gl_waiters2.next,
613 struct gfs2_holder, gh_list);
614
615 if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
616 blocked = rq_demote(gh);
617 else
618 gfs2_assert_warn(gl->gl_sbd, 0);
619
620 } else if (!list_empty(&gl->gl_waiters3)) { 571 } else if (!list_empty(&gl->gl_waiters3)) {
621 gh = list_entry(gl->gl_waiters3.next, 572 gh = list_entry(gl->gl_waiters3.next,
622 struct gfs2_holder, gh_list); 573 struct gfs2_holder, gh_list);
@@ -654,7 +605,7 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl)
654 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { 605 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
655 list_add_tail(&gh.gh_list, &gl->gl_waiters1); 606 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
656 } else { 607 } else {
657 gl->gl_owner = current; 608 gl->gl_owner_pid = current->pid;
658 gl->gl_ip = (unsigned long)__builtin_return_address(0); 609 gl->gl_ip = (unsigned long)__builtin_return_address(0);
659 clear_bit(HIF_WAIT, &gh.gh_iflags); 610 clear_bit(HIF_WAIT, &gh.gh_iflags);
660 smp_mb(); 611 smp_mb();
@@ -681,7 +632,7 @@ static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
681 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { 632 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
682 acquired = 0; 633 acquired = 0;
683 } else { 634 } else {
684 gl->gl_owner = current; 635 gl->gl_owner_pid = current->pid;
685 gl->gl_ip = (unsigned long)__builtin_return_address(0); 636 gl->gl_ip = (unsigned long)__builtin_return_address(0);
686 } 637 }
687 spin_unlock(&gl->gl_spin); 638 spin_unlock(&gl->gl_spin);
@@ -699,7 +650,7 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
699{ 650{
700 spin_lock(&gl->gl_spin); 651 spin_lock(&gl->gl_spin);
701 clear_bit(GLF_LOCK, &gl->gl_flags); 652 clear_bit(GLF_LOCK, &gl->gl_flags);
702 gl->gl_owner = NULL; 653 gl->gl_owner_pid = 0;
703 gl->gl_ip = 0; 654 gl->gl_ip = 0;
704 run_queue(gl); 655 run_queue(gl);
705 BUG_ON(!spin_is_locked(&gl->gl_spin)); 656 BUG_ON(!spin_is_locked(&gl->gl_spin));
@@ -707,50 +658,24 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
707} 658}
708 659
709/** 660/**
710 * handle_callback - add a demote request to a lock's queue 661 * handle_callback - process a demote request
711 * @gl: the glock 662 * @gl: the glock
712 * @state: the state the caller wants us to change to 663 * @state: the state the caller wants us to change to
713 * 664 *
714 * Note: This may fail silently if we are out of memory. 665 * There are only two requests that we are going to see in actual
 666 * practice: LM_ST_SHARED and LM_ST_UNLOCKED
715 */ 667 */
716 668
717static void handle_callback(struct gfs2_glock *gl, unsigned int state) 669static void handle_callback(struct gfs2_glock *gl, unsigned int state)
718{ 670{
719 struct gfs2_holder *gh, *new_gh = NULL;
720
721restart:
722 spin_lock(&gl->gl_spin); 671 spin_lock(&gl->gl_spin);
723 672 if (test_and_set_bit(GLF_DEMOTE, &gl->gl_flags) == 0) {
724 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) { 673 gl->gl_demote_state = state;
725 if (test_bit(HIF_DEMOTE, &gh->gh_iflags) && 674 gl->gl_demote_time = jiffies;
726 gl->gl_req_gh != gh) { 675 } else if (gl->gl_demote_state != LM_ST_UNLOCKED) {
727 if (gh->gh_state != state) 676 gl->gl_demote_state = state;
728 gh->gh_state = LM_ST_UNLOCKED;
729 goto out;
730 }
731 }
732
733 if (new_gh) {
734 list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
735 new_gh = NULL;
736 } else {
737 spin_unlock(&gl->gl_spin);
738
739 new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_NOFS);
740 if (!new_gh)
741 return;
742 set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
743 set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
744 set_bit(HIF_WAIT, &new_gh->gh_iflags);
745
746 goto restart;
747 } 677 }
748
749out:
750 spin_unlock(&gl->gl_spin); 678 spin_unlock(&gl->gl_spin);
751
752 if (new_gh)
753 gfs2_holder_put(new_gh);
754} 679}
755 680
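[Editor's note] Demote requests are no longer queued as allocated holders on gl_waiters2; they collapse into a single GLF_DEMOTE bit plus a (state, time) pair, which also removes the silent ENOMEM failure mode the old comment warned about. A stand-alone sketch of just the merging rule, with a trimmed hypothetical glock so it reads in isolation; GLF_DEMOTE and LM_ST_UNLOCKED keep their meanings from the patch.

#include <linux/bitops.h>
#include <linux/jiffies.h>
#include <linux/spinlock.h>

struct demo_glock {			/* trimmed stand-in for gfs2_glock */
	spinlock_t gl_spin;
	unsigned long gl_flags;
	unsigned int gl_demote_state;
	unsigned long gl_demote_time;
};

static void record_demote(struct demo_glock *gl, unsigned int state)
{
	spin_lock(&gl->gl_spin);
	if (!test_and_set_bit(GLF_DEMOTE, &gl->gl_flags)) {
		/* First request: remember what was asked for, and when. */
		gl->gl_demote_state = state;
		gl->gl_demote_time = jiffies;
	} else if (gl->gl_demote_state != LM_ST_UNLOCKED) {
		/* A request is already pending: the newer state replaces
		 * it unless the pending demand is already LM_ST_UNLOCKED,
		 * the strongest possible demotion. */
		gl->gl_demote_state = state;
	}
	spin_unlock(&gl->gl_spin);
}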
756/** 681/**
@@ -810,56 +735,37 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
810 735
811 /* Deal with each possible exit condition */ 736 /* Deal with each possible exit condition */
812 737
813 if (!gh) 738 if (!gh) {
814 gl->gl_stamp = jiffies; 739 gl->gl_stamp = jiffies;
815 else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { 740 if (ret & LM_OUT_CANCELED)
741 op_done = 0;
742 else
743 clear_bit(GLF_DEMOTE, &gl->gl_flags);
744 } else {
816 spin_lock(&gl->gl_spin); 745 spin_lock(&gl->gl_spin);
817 list_del_init(&gh->gh_list); 746 list_del_init(&gh->gh_list);
818 gh->gh_error = -EIO; 747 gh->gh_error = -EIO;
819 spin_unlock(&gl->gl_spin); 748 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
820 } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) { 749 goto out;
821 spin_lock(&gl->gl_spin); 750 gh->gh_error = GLR_CANCELED;
822 list_del_init(&gh->gh_list); 751 if (ret & LM_OUT_CANCELED)
823 if (gl->gl_state == gh->gh_state || 752 goto out;
824 gl->gl_state == LM_ST_UNLOCKED) { 753 if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
754 list_add_tail(&gh->gh_list, &gl->gl_holders);
825 gh->gh_error = 0; 755 gh->gh_error = 0;
826 } else { 756 set_bit(HIF_HOLDER, &gh->gh_iflags);
827 if (gfs2_assert_warn(sdp, gh->gh_flags & 757 set_bit(HIF_FIRST, &gh->gh_iflags);
828 (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1) 758 op_done = 0;
829 fs_warn(sdp, "ret = 0x%.8X\n", ret); 759 goto out;
830 gh->gh_error = GLR_TRYFAILED;
831 } 760 }
832 spin_unlock(&gl->gl_spin);
833
834 if (ret & LM_OUT_CANCELED)
835 handle_callback(gl, LM_ST_UNLOCKED);
836
837 } else if (ret & LM_OUT_CANCELED) {
838 spin_lock(&gl->gl_spin);
839 list_del_init(&gh->gh_list);
840 gh->gh_error = GLR_CANCELED;
841 spin_unlock(&gl->gl_spin);
842
843 } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
844 spin_lock(&gl->gl_spin);
845 list_move_tail(&gh->gh_list, &gl->gl_holders);
846 gh->gh_error = 0;
847 set_bit(HIF_HOLDER, &gh->gh_iflags);
848 spin_unlock(&gl->gl_spin);
849
850 set_bit(HIF_FIRST, &gh->gh_iflags);
851
852 op_done = 0;
853
854 } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
855 spin_lock(&gl->gl_spin);
856 list_del_init(&gh->gh_list);
857 gh->gh_error = GLR_TRYFAILED; 761 gh->gh_error = GLR_TRYFAILED;
858 spin_unlock(&gl->gl_spin); 762 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
859 763 goto out;
860 } else { 764 gh->gh_error = -EINVAL;
861 if (gfs2_assert_withdraw(sdp, 0) == -1) 765 if (gfs2_assert_withdraw(sdp, 0) == -1)
862 fs_err(sdp, "ret = 0x%.8X\n", ret); 766 fs_err(sdp, "ret = 0x%.8X\n", ret);
767out:
768 spin_unlock(&gl->gl_spin);
863 } 769 }
864 770
865 if (glops->go_xmote_bh) 771 if (glops->go_xmote_bh)
@@ -877,7 +783,7 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
877 gfs2_glock_put(gl); 783 gfs2_glock_put(gl);
878 784
879 if (gh) 785 if (gh)
880 gfs2_holder_dispose_or_wake(gh); 786 gfs2_holder_wake(gh);
881} 787}
882 788
883/** 789/**
@@ -888,12 +794,11 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
888 * 794 *
889 */ 795 */
890 796
891void gfs2_glock_xmote_th(struct gfs2_holder *gh) 797void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
892{ 798{
893 struct gfs2_glock *gl = gh->gh_gl;
894 struct gfs2_sbd *sdp = gl->gl_sbd; 799 struct gfs2_sbd *sdp = gl->gl_sbd;
895 int flags = gh->gh_flags; 800 int flags = gh ? gh->gh_flags : 0;
896 unsigned state = gh->gh_state; 801 unsigned state = gh ? gh->gh_state : gl->gl_demote_state;
897 const struct gfs2_glock_operations *glops = gl->gl_ops; 802 const struct gfs2_glock_operations *glops = gl->gl_ops;
898 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB | 803 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
899 LM_FLAG_NOEXP | LM_FLAG_ANY | 804 LM_FLAG_NOEXP | LM_FLAG_ANY |
@@ -943,6 +848,7 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
943 gfs2_assert_warn(sdp, !ret); 848 gfs2_assert_warn(sdp, !ret);
944 849
945 state_change(gl, LM_ST_UNLOCKED); 850 state_change(gl, LM_ST_UNLOCKED);
851 clear_bit(GLF_DEMOTE, &gl->gl_flags);
946 852
947 if (glops->go_inval) 853 if (glops->go_inval)
948 glops->go_inval(gl, DIO_METADATA); 854 glops->go_inval(gl, DIO_METADATA);
@@ -964,7 +870,7 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
964 gfs2_glock_put(gl); 870 gfs2_glock_put(gl);
965 871
966 if (gh) 872 if (gh)
967 gfs2_holder_dispose_or_wake(gh); 873 gfs2_holder_wake(gh);
968} 874}
969 875
970/** 876/**
@@ -1097,18 +1003,32 @@ static int glock_wait_internal(struct gfs2_holder *gh)
1097} 1003}
1098 1004
1099static inline struct gfs2_holder * 1005static inline struct gfs2_holder *
1100find_holder_by_owner(struct list_head *head, struct task_struct *owner) 1006find_holder_by_owner(struct list_head *head, pid_t pid)
1101{ 1007{
1102 struct gfs2_holder *gh; 1008 struct gfs2_holder *gh;
1103 1009
1104 list_for_each_entry(gh, head, gh_list) { 1010 list_for_each_entry(gh, head, gh_list) {
1105 if (gh->gh_owner == owner) 1011 if (gh->gh_owner_pid == pid)
1106 return gh; 1012 return gh;
1107 } 1013 }
1108 1014
1109 return NULL; 1015 return NULL;
1110} 1016}
1111 1017
1018static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
1019{
1020 va_list args;
1021
1022 va_start(args, fmt);
1023 if (gi) {
1024 vsprintf(gi->string, fmt, args);
1025 seq_printf(gi->seq, gi->string);
1026 }
1027 else
1028 vprintk(fmt, args);
1029 va_end(args);
1030}
1031
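[Editor's note] print_dbg() fans one message out to either a seq_file (on the debugfs path, via gi->seq) or the console (when gfs2_dump_lockstate() passes a NULL iterator). A cautious variant of the same idea is sketched below; unlike the code above it bounds the scratch buffer with vsnprintf() and emits the result as data via "%s". That hardening is editorial, not part of this patch.

#include <linux/kernel.h>
#include <linux/seq_file.h>

static void demo_print_dbg(struct seq_file *seq, char *buf, size_t len,
			   const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	if (seq) {
		vsnprintf(buf, len, fmt, args);	/* never overruns buf */
		seq_printf(seq, "%s", buf);	/* buf is data, not a format */
	} else {
		vprintk(fmt, args);
	}
	va_end(args);
}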
1112/** 1032/**
1113 * add_to_queue - Add a holder to the wait queue (but look for recursion) 1033 * add_to_queue - Add a holder to the wait queue (but look for recursion)
1114 * @gh: the holder structure to add 1034 * @gh: the holder structure to add
@@ -1120,24 +1040,24 @@ static void add_to_queue(struct gfs2_holder *gh)
1120 struct gfs2_glock *gl = gh->gh_gl; 1040 struct gfs2_glock *gl = gh->gh_gl;
1121 struct gfs2_holder *existing; 1041 struct gfs2_holder *existing;
1122 1042
1123 BUG_ON(!gh->gh_owner); 1043 BUG_ON(!gh->gh_owner_pid);
1124 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) 1044 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
1125 BUG(); 1045 BUG();
1126 1046
1127 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner); 1047 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner_pid);
1128 if (existing) { 1048 if (existing) {
1129 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip); 1049 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1130 printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid); 1050 printk(KERN_INFO "pid : %d\n", existing->gh_owner_pid);
1131 printk(KERN_INFO "lock type : %d lock state : %d\n", 1051 printk(KERN_INFO "lock type : %d lock state : %d\n",
1132 existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state); 1052 existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
1133 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); 1053 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1134 printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid); 1054 printk(KERN_INFO "pid : %d\n", gh->gh_owner_pid);
1135 printk(KERN_INFO "lock type : %d lock state : %d\n", 1055 printk(KERN_INFO "lock type : %d lock state : %d\n",
1136 gl->gl_name.ln_type, gl->gl_state); 1056 gl->gl_name.ln_type, gl->gl_state);
1137 BUG(); 1057 BUG();
1138 } 1058 }
1139 1059
1140 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner); 1060 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner_pid);
1141 if (existing) { 1061 if (existing) {
1142 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip); 1062 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1143 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); 1063 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
@@ -1267,9 +1187,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1267 if (glops->go_unlock) 1187 if (glops->go_unlock)
1268 glops->go_unlock(gh); 1188 glops->go_unlock(gh);
1269 1189
1270 gl->gl_stamp = jiffies;
1271
1272 spin_lock(&gl->gl_spin); 1190 spin_lock(&gl->gl_spin);
1191 gl->gl_stamp = jiffies;
1273 } 1192 }
1274 1193
1275 clear_bit(GLF_LOCK, &gl->gl_flags); 1194 clear_bit(GLF_LOCK, &gl->gl_flags);
@@ -1841,6 +1760,15 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
1841 * Diagnostic routines to help debug distributed deadlock 1760 * Diagnostic routines to help debug distributed deadlock
1842 */ 1761 */
1843 1762
1763static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt,
1764 unsigned long address)
1765{
1766 char buffer[KSYM_SYMBOL_LEN];
1767
1768 sprint_symbol(buffer, address);
1769 print_dbg(gi, fmt, buffer);
1770}
1771
1844/** 1772/**
1845 * dump_holder - print information about a glock holder 1773 * dump_holder - print information about a glock holder
1846 * @str: a string naming the type of holder 1774 * @str: a string naming the type of holder
@@ -1849,31 +1777,37 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
1849 * Returns: 0 on success, -ENOBUFS when we run out of space 1777 * Returns: 0 on success, -ENOBUFS when we run out of space
1850 */ 1778 */
1851 1779
1852static int dump_holder(char *str, struct gfs2_holder *gh) 1780static int dump_holder(struct glock_iter *gi, char *str,
1781 struct gfs2_holder *gh)
1853{ 1782{
1854 unsigned int x; 1783 unsigned int x;
1855 int error = -ENOBUFS; 1784 struct task_struct *gh_owner;
1856 1785
1857 printk(KERN_INFO " %s\n", str); 1786 print_dbg(gi, " %s\n", str);
1858 printk(KERN_INFO " owner = %ld\n", 1787 if (gh->gh_owner_pid) {
1859 (gh->gh_owner) ? (long)gh->gh_owner->pid : -1); 1788 print_dbg(gi, " owner = %ld ", (long)gh->gh_owner_pid);
1860 printk(KERN_INFO " gh_state = %u\n", gh->gh_state); 1789 gh_owner = find_task_by_pid(gh->gh_owner_pid);
1861 printk(KERN_INFO " gh_flags ="); 1790 if (gh_owner)
1791 print_dbg(gi, "(%s)\n", gh_owner->comm);
1792 else
1793 print_dbg(gi, "(ended)\n");
1794 } else
1795 print_dbg(gi, " owner = -1\n");
1796 print_dbg(gi, " gh_state = %u\n", gh->gh_state);
1797 print_dbg(gi, " gh_flags =");
1862 for (x = 0; x < 32; x++) 1798 for (x = 0; x < 32; x++)
1863 if (gh->gh_flags & (1 << x)) 1799 if (gh->gh_flags & (1 << x))
1864 printk(" %u", x); 1800 print_dbg(gi, " %u", x);
1865 printk(" \n"); 1801 print_dbg(gi, " \n");
1866 printk(KERN_INFO " error = %d\n", gh->gh_error); 1802 print_dbg(gi, " error = %d\n", gh->gh_error);
1867 printk(KERN_INFO " gh_iflags ="); 1803 print_dbg(gi, " gh_iflags =");
1868 for (x = 0; x < 32; x++) 1804 for (x = 0; x < 32; x++)
1869 if (test_bit(x, &gh->gh_iflags)) 1805 if (test_bit(x, &gh->gh_iflags))
1870 printk(" %u", x); 1806 print_dbg(gi, " %u", x);
1871 printk(" \n"); 1807 print_dbg(gi, " \n");
1872 print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip); 1808 gfs2_print_symbol(gi, " initialized at: %s\n", gh->gh_ip);
1873
1874 error = 0;
1875 1809
1876 return error; 1810 return 0;
1877} 1811}
1878 1812
1879/** 1813/**
@@ -1883,25 +1817,20 @@ static int dump_holder(char *str, struct gfs2_holder *gh)
1883 * Returns: 0 on success, -ENOBUFS when we run out of space 1817 * Returns: 0 on success, -ENOBUFS when we run out of space
1884 */ 1818 */
1885 1819
1886static int dump_inode(struct gfs2_inode *ip) 1820static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip)
1887{ 1821{
1888 unsigned int x; 1822 unsigned int x;
1889 int error = -ENOBUFS;
1890 1823
1891 printk(KERN_INFO " Inode:\n"); 1824 print_dbg(gi, " Inode:\n");
1892 printk(KERN_INFO " num = %llu %llu\n", 1825 print_dbg(gi, " num = %llu/%llu\n",
1893 (unsigned long long)ip->i_num.no_formal_ino, 1826 ip->i_num.no_formal_ino, ip->i_num.no_addr);
1894 (unsigned long long)ip->i_num.no_addr); 1827 print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode));
1895 printk(KERN_INFO " type = %u\n", IF2DT(ip->i_inode.i_mode)); 1828 print_dbg(gi, " i_flags =");
1896 printk(KERN_INFO " i_flags =");
1897 for (x = 0; x < 32; x++) 1829 for (x = 0; x < 32; x++)
1898 if (test_bit(x, &ip->i_flags)) 1830 if (test_bit(x, &ip->i_flags))
1899 printk(" %u", x); 1831 print_dbg(gi, " %u", x);
1900 printk(" \n"); 1832 print_dbg(gi, " \n");
1901 1833 return 0;
1902 error = 0;
1903
1904 return error;
1905} 1834}
1906 1835
1907/** 1836/**
@@ -1912,74 +1841,86 @@ static int dump_inode(struct gfs2_inode *ip)
1912 * Returns: 0 on success, -ENOBUFS when we run out of space 1841 * Returns: 0 on success, -ENOBUFS when we run out of space
1913 */ 1842 */
1914 1843
1915static int dump_glock(struct gfs2_glock *gl) 1844static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
1916{ 1845{
1917 struct gfs2_holder *gh; 1846 struct gfs2_holder *gh;
1918 unsigned int x; 1847 unsigned int x;
1919 int error = -ENOBUFS; 1848 int error = -ENOBUFS;
1849 struct task_struct *gl_owner;
1920 1850
1921 spin_lock(&gl->gl_spin); 1851 spin_lock(&gl->gl_spin);
1922 1852
1923 printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type, 1853 print_dbg(gi, "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
1924 (unsigned long long)gl->gl_name.ln_number); 1854 (unsigned long long)gl->gl_name.ln_number);
1925 printk(KERN_INFO " gl_flags ="); 1855 print_dbg(gi, " gl_flags =");
1926 for (x = 0; x < 32; x++) { 1856 for (x = 0; x < 32; x++) {
1927 if (test_bit(x, &gl->gl_flags)) 1857 if (test_bit(x, &gl->gl_flags))
1928 printk(" %u", x); 1858 print_dbg(gi, " %u", x);
1929 } 1859 }
1930 printk(" \n"); 1860 if (!test_bit(GLF_LOCK, &gl->gl_flags))
1931 printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref)); 1861 print_dbg(gi, " (unlocked)");
1932 printk(KERN_INFO " gl_state = %u\n", gl->gl_state); 1862 print_dbg(gi, " \n");
1933 printk(KERN_INFO " gl_owner = %s\n", gl->gl_owner->comm); 1863 print_dbg(gi, " gl_ref = %d\n", atomic_read(&gl->gl_ref));
1934 print_symbol(KERN_INFO " gl_ip = %s\n", gl->gl_ip); 1864 print_dbg(gi, " gl_state = %u\n", gl->gl_state);
1935 printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no"); 1865 if (gl->gl_owner_pid) {
1936 printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no"); 1866 gl_owner = find_task_by_pid(gl->gl_owner_pid);
1937 printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); 1867 if (gl_owner)
1938 printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no"); 1868 print_dbg(gi, " gl_owner = pid %d (%s)\n",
1939 printk(KERN_INFO " le = %s\n", 1869 gl->gl_owner_pid, gl_owner->comm);
1870 else
1871 print_dbg(gi, " gl_owner = %d (ended)\n",
1872 gl->gl_owner_pid);
1873 } else
1874 print_dbg(gi, " gl_owner = -1\n");
1875 print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip);
1876 print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
1877 print_dbg(gi, " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
1878 print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
1879 print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no");
1880 print_dbg(gi, " le = %s\n",
1940 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes"); 1881 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
1941 printk(KERN_INFO " reclaim = %s\n", 1882 print_dbg(gi, " reclaim = %s\n",
1942 (list_empty(&gl->gl_reclaim)) ? "no" : "yes"); 1883 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
1943 if (gl->gl_aspace) 1884 if (gl->gl_aspace)
1944 printk(KERN_INFO " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace, 1885 print_dbg(gi, " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
1945 gl->gl_aspace->i_mapping->nrpages); 1886 gl->gl_aspace->i_mapping->nrpages);
1946 else 1887 else
1947 printk(KERN_INFO " aspace = no\n"); 1888 print_dbg(gi, " aspace = no\n");
1948 printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count)); 1889 print_dbg(gi, " ail = %d\n", atomic_read(&gl->gl_ail_count));
1949 if (gl->gl_req_gh) { 1890 if (gl->gl_req_gh) {
1950 error = dump_holder("Request", gl->gl_req_gh); 1891 error = dump_holder(gi, "Request", gl->gl_req_gh);
1951 if (error) 1892 if (error)
1952 goto out; 1893 goto out;
1953 } 1894 }
1954 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 1895 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
1955 error = dump_holder("Holder", gh); 1896 error = dump_holder(gi, "Holder", gh);
1956 if (error) 1897 if (error)
1957 goto out; 1898 goto out;
1958 } 1899 }
1959 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) { 1900 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
1960 error = dump_holder("Waiter1", gh); 1901 error = dump_holder(gi, "Waiter1", gh);
1961 if (error)
1962 goto out;
1963 }
1964 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
1965 error = dump_holder("Waiter2", gh);
1966 if (error) 1902 if (error)
1967 goto out; 1903 goto out;
1968 } 1904 }
1969 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) { 1905 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
1970 error = dump_holder("Waiter3", gh); 1906 error = dump_holder(gi, "Waiter3", gh);
1971 if (error) 1907 if (error)
1972 goto out; 1908 goto out;
1973 } 1909 }
1910 if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
1911 print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n",
1912 gl->gl_demote_state,
1913 (u64)(jiffies - gl->gl_demote_time)*(1000000/HZ));
1914 }
1974 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) { 1915 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
1975 if (!test_bit(GLF_LOCK, &gl->gl_flags) && 1916 if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
1976 list_empty(&gl->gl_holders)) { 1917 list_empty(&gl->gl_holders)) {
1977 error = dump_inode(gl->gl_object); 1918 error = dump_inode(gi, gl->gl_object);
1978 if (error) 1919 if (error)
1979 goto out; 1920 goto out;
1980 } else { 1921 } else {
1981 error = -ENOBUFS; 1922 error = -ENOBUFS;
1982 printk(KERN_INFO " Inode: busy\n"); 1923 print_dbg(gi, " Inode: busy\n");
1983 } 1924 }
1984 } 1925 }
1985 1926
@@ -2014,7 +1955,7 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
2014 if (gl->gl_sbd != sdp) 1955 if (gl->gl_sbd != sdp)
2015 continue; 1956 continue;
2016 1957
2017 error = dump_glock(gl); 1958 error = dump_glock(NULL, gl);
2018 if (error) 1959 if (error)
2019 break; 1960 break;
2020 } 1961 }
@@ -2043,3 +1984,189 @@ int __init gfs2_glock_init(void)
2043 return 0; 1984 return 0;
2044} 1985}
2045 1986
1987static int gfs2_glock_iter_next(struct glock_iter *gi)
1988{
1989 read_lock(gl_lock_addr(gi->hash));
1990 while (1) {
1991 if (!gi->hb_list) { /* If we don't have a hash bucket yet */
1992 gi->hb_list = &gl_hash_table[gi->hash].hb_list;
1993 if (hlist_empty(gi->hb_list)) {
1994 read_unlock(gl_lock_addr(gi->hash));
1995 gi->hash++;
1996 read_lock(gl_lock_addr(gi->hash));
1997 gi->hb_list = NULL;
1998 if (gi->hash >= GFS2_GL_HASH_SIZE) {
1999 read_unlock(gl_lock_addr(gi->hash));
2000 return 1;
2001 }
2002 else
2003 continue;
2004 }
2005 if (!hlist_empty(gi->hb_list)) {
2006 gi->gl = list_entry(gi->hb_list->first,
2007 struct gfs2_glock,
2008 gl_list);
2009 }
2010 } else {
2011 if (gi->gl->gl_list.next == NULL) {
2012 read_unlock(gl_lock_addr(gi->hash));
2013 gi->hash++;
2014 read_lock(gl_lock_addr(gi->hash));
2015 gi->hb_list = NULL;
2016 continue;
2017 }
2018 gi->gl = list_entry(gi->gl->gl_list.next,
2019 struct gfs2_glock, gl_list);
2020 }
2021 if (gi->gl)
2022 break;
2023 }
2024 read_unlock(gl_lock_addr(gi->hash));
2025 return 0;
2026}
2027
2028static void gfs2_glock_iter_free(struct glock_iter *gi)
2029{
2030 kfree(gi);
2031}
2032
2033static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
2034{
2035 struct glock_iter *gi;
2036
2037 gi = kmalloc(sizeof (*gi), GFP_KERNEL);
2038 if (!gi)
2039 return NULL;
2040
2041 gi->sdp = sdp;
2042 gi->hash = 0;
2043 gi->gl = NULL;
2044 gi->hb_list = NULL;
2045 gi->seq = NULL;
2046 memset(gi->string, 0, sizeof(gi->string));
2047
2048 if (gfs2_glock_iter_next(gi)) {
2049 gfs2_glock_iter_free(gi);
2050 return NULL;
2051 }
2052
2053 return gi;
2054}
2055
2056static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
2057{
2058 struct glock_iter *gi;
2059 loff_t n = *pos;
2060
2061 gi = gfs2_glock_iter_init(file->private);
2062 if (!gi)
2063 return NULL;
2064
2065 while (n--) {
2066 if (gfs2_glock_iter_next(gi)) {
2067 gfs2_glock_iter_free(gi);
2068 return NULL;
2069 }
2070 }
2071
2072 return gi;
2073}
2074
2075static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
2076 loff_t *pos)
2077{
2078 struct glock_iter *gi = iter_ptr;
2079
2080 (*pos)++;
2081
2082 if (gfs2_glock_iter_next(gi)) {
2083 gfs2_glock_iter_free(gi);
2084 return NULL;
2085 }
2086
2087 return gi;
2088}
2089
2090static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
2091{
2092 /* nothing for now */
2093}
2094
2095static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
2096{
2097 struct glock_iter *gi = iter_ptr;
2098
2099 gi->seq = file;
2100 dump_glock(gi, gi->gl);
2101
2102 return 0;
2103}
2104
2105static struct seq_operations gfs2_glock_seq_ops = {
2106 .start = gfs2_glock_seq_start,
2107 .next = gfs2_glock_seq_next,
2108 .stop = gfs2_glock_seq_stop,
2109 .show = gfs2_glock_seq_show,
2110};
2111
2112static int gfs2_debugfs_open(struct inode *inode, struct file *file)
2113{
2114 struct seq_file *seq;
2115 int ret;
2116
2117 ret = seq_open(file, &gfs2_glock_seq_ops);
2118 if (ret)
2119 return ret;
2120
2121 seq = file->private_data;
2122 seq->private = inode->i_private;
2123
2124 return 0;
2125}
2126
2127static const struct file_operations gfs2_debug_fops = {
2128 .owner = THIS_MODULE,
2129 .open = gfs2_debugfs_open,
2130 .read = seq_read,
2131 .llseek = seq_lseek,
2132 .release = seq_release
2133};
2134
2135int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
2136{
2137 sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
2138 if (!sdp->debugfs_dir)
2139 return -ENOMEM;
2140 sdp->debugfs_dentry_glocks = debugfs_create_file("glocks",
2141 S_IFREG | S_IRUGO,
2142 sdp->debugfs_dir, sdp,
2143 &gfs2_debug_fops);
2144 if (!sdp->debugfs_dentry_glocks)
2145 return -ENOMEM;
2146
2147 return 0;
2148}
2149
2150void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
2151{
2152 if (sdp && sdp->debugfs_dir) {
2153 if (sdp->debugfs_dentry_glocks) {
2154 debugfs_remove(sdp->debugfs_dentry_glocks);
2155 sdp->debugfs_dentry_glocks = NULL;
2156 }
2157 debugfs_remove(sdp->debugfs_dir);
2158 sdp->debugfs_dir = NULL;
2159 }
2160}
2161
2162int gfs2_register_debugfs(void)
2163{
2164 gfs2_root = debugfs_create_dir("gfs2", NULL);
2165 return gfs2_root ? 0 : -ENOMEM;
2166}
2167
2168void gfs2_unregister_debugfs(void)
2169{
2170 debugfs_remove(gfs2_root);
2171 gfs2_root = NULL;
2172}
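[Editor's note] All of the iterator and seq_file plumbing above surfaces as a single read-only debugfs file per mount, created by gfs2_create_debugfs_file(). A userspace sketch of consuming it; the debugfs mount point and the "mycluster:myfs" table name are assumptions.

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/kernel/debug/gfs2/mycluster:myfs/glocks", "r");
	if (!f) {
		perror("glocks");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* one dump_glock() record per glock */
	fclose(f);
	return 0;
}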
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index f50e40ceca43..11477ca3a3c0 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -38,7 +38,7 @@ static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
38 /* Look in glock's list of holders for one with current task as owner */ 38 /* Look in glock's list of holders for one with current task as owner */
39 spin_lock(&gl->gl_spin); 39 spin_lock(&gl->gl_spin);
40 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 40 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
41 if (gh->gh_owner == current) { 41 if (gh->gh_owner_pid == current->pid) {
42 locked = 1; 42 locked = 1;
43 break; 43 break;
44 } 44 }
@@ -67,7 +67,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
67{ 67{
68 int ret; 68 int ret;
69 spin_lock(&gl->gl_spin); 69 spin_lock(&gl->gl_spin);
70 ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3); 70 ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3);
71 spin_unlock(&gl->gl_spin); 71 spin_unlock(&gl->gl_spin);
72 return ret; 72 return ret;
73} 73}
@@ -135,5 +135,9 @@ void gfs2_scand_internal(struct gfs2_sbd *sdp);
135void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); 135void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
136 136
137int __init gfs2_glock_init(void); 137int __init gfs2_glock_init(void);
138int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
139void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
140int gfs2_register_debugfs(void);
141void gfs2_unregister_debugfs(void);
138 142
139#endif /* __GLOCK_DOT_H__ */ 143#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 49f0dbf40d86..d995441373ab 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -115,11 +115,8 @@ enum {
115 /* Actions */ 115 /* Actions */
116 HIF_MUTEX = 0, 116 HIF_MUTEX = 0,
117 HIF_PROMOTE = 1, 117 HIF_PROMOTE = 1,
118 HIF_DEMOTE = 2,
119 118
120 /* States */ 119 /* States */
121 HIF_ALLOCED = 4,
122 HIF_DEALLOC = 5,
123 HIF_HOLDER = 6, 120 HIF_HOLDER = 6,
124 HIF_FIRST = 7, 121 HIF_FIRST = 7,
125 HIF_ABORTED = 9, 122 HIF_ABORTED = 9,
@@ -130,7 +127,7 @@ struct gfs2_holder {
130 struct list_head gh_list; 127 struct list_head gh_list;
131 128
132 struct gfs2_glock *gh_gl; 129 struct gfs2_glock *gh_gl;
133 struct task_struct *gh_owner; 130 pid_t gh_owner_pid;
134 unsigned int gh_state; 131 unsigned int gh_state;
135 unsigned gh_flags; 132 unsigned gh_flags;
136 133
@@ -142,8 +139,8 @@ struct gfs2_holder {
142enum { 139enum {
143 GLF_LOCK = 1, 140 GLF_LOCK = 1,
144 GLF_STICKY = 2, 141 GLF_STICKY = 2,
142 GLF_DEMOTE = 3,
145 GLF_DIRTY = 5, 143 GLF_DIRTY = 5,
146 GLF_SKIP_WAITERS2 = 6,
147}; 144};
148 145
149struct gfs2_glock { 146struct gfs2_glock {
@@ -156,11 +153,12 @@ struct gfs2_glock {
156 153
157 unsigned int gl_state; 154 unsigned int gl_state;
158 unsigned int gl_hash; 155 unsigned int gl_hash;
159 struct task_struct *gl_owner; 156 unsigned int gl_demote_state; /* state requested by remote node */
157 unsigned long gl_demote_time; /* time of first demote request */
158 pid_t gl_owner_pid;
160 unsigned long gl_ip; 159 unsigned long gl_ip;
161 struct list_head gl_holders; 160 struct list_head gl_holders;
162 struct list_head gl_waiters1; /* HIF_MUTEX */ 161 struct list_head gl_waiters1; /* HIF_MUTEX */
163 struct list_head gl_waiters2; /* HIF_DEMOTE */
164 struct list_head gl_waiters3; /* HIF_PROMOTE */ 162 struct list_head gl_waiters3; /* HIF_PROMOTE */
165 163
166 const struct gfs2_glock_operations *gl_ops; 164 const struct gfs2_glock_operations *gl_ops;
@@ -611,6 +609,8 @@ struct gfs2_sbd {
611 609
612 unsigned long sd_last_warning; 610 unsigned long sd_last_warning;
613 struct vfsmount *sd_gfs2mnt; 611 struct vfsmount *sd_gfs2mnt;
612 struct dentry *debugfs_dir; /* debugfs directory */
613 struct dentry *debugfs_dentry_glocks; /* for debugfs */
614}; 614};
615 615
616#endif /* __INCORE_DOT_H__ */ 616#endif /* __INCORE_DOT_H__ */
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index b167addf9fd1..c305255bfe8a 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -151,7 +151,7 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
151 151
152/* make_strname - convert GFS lock numbers to a string */ 152/* make_strname - convert GFS lock numbers to a string */
153 153
154static inline void make_strname(struct lm_lockname *lockname, 154static inline void make_strname(const struct lm_lockname *lockname,
155 struct gdlm_strname *str) 155 struct gdlm_strname *str)
156{ 156{
157 sprintf(str->name, "%8x%16llx", lockname->ln_type, 157 sprintf(str->name, "%8x%16llx", lockname->ln_type,
@@ -169,6 +169,7 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
169 return -ENOMEM; 169 return -ENOMEM;
170 170
171 lp->lockname = *name; 171 lp->lockname = *name;
172 make_strname(name, &lp->strname);
172 lp->ls = ls; 173 lp->ls = ls;
173 lp->cur = DLM_LOCK_IV; 174 lp->cur = DLM_LOCK_IV;
174 lp->lvb = NULL; 175 lp->lvb = NULL;
@@ -227,7 +228,6 @@ void gdlm_put_lock(void *lock)
227unsigned int gdlm_do_lock(struct gdlm_lock *lp) 228unsigned int gdlm_do_lock(struct gdlm_lock *lp)
228{ 229{
229 struct gdlm_ls *ls = lp->ls; 230 struct gdlm_ls *ls = lp->ls;
230 struct gdlm_strname str;
231 int error, bast = 1; 231 int error, bast = 1;
232 232
233 /* 233 /*
@@ -249,8 +249,6 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
249 if (test_bit(LFL_NOBAST, &lp->flags)) 249 if (test_bit(LFL_NOBAST, &lp->flags))
250 bast = 0; 250 bast = 0;
251 251
252 make_strname(&lp->lockname, &str);
253
254 set_bit(LFL_ACTIVE, &lp->flags); 252 set_bit(LFL_ACTIVE, &lp->flags);
255 253
256 log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type, 254 log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
@@ -258,8 +256,8 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
258 lp->cur, lp->req, lp->lkf); 256 lp->cur, lp->req, lp->lkf);
259 257
260 error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf, 258 error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
261 str.name, str.namelen, 0, gdlm_ast, lp, 259 lp->strname.name, lp->strname.namelen, 0, gdlm_ast,
262 bast ? gdlm_bast : NULL); 260 lp, bast ? gdlm_bast : NULL);
263 261
264 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) { 262 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
265 lp->lksb.sb_status = -EAGAIN; 263 lp->lksb.sb_status = -EAGAIN;
@@ -268,7 +266,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
268 } 266 }
269 267
270 if (error) { 268 if (error) {
271 log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x " 269 log_error("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
272 "flags=%lx", ls->fsname, lp->lockname.ln_type, 270 "flags=%lx", ls->fsname, lp->lockname.ln_type,
273 (unsigned long long)lp->lockname.ln_number, error, 271 (unsigned long long)lp->lockname.ln_number, error,
274 lp->cur, lp->req, lp->lkf, lp->flags); 272 lp->cur, lp->req, lp->lkf, lp->flags);
@@ -296,7 +294,7 @@ static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
296 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp); 294 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
297 295
298 if (error) { 296 if (error) {
299 log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x " 297 log_error("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
300 "flags=%lx", ls->fsname, lp->lockname.ln_type, 298 "flags=%lx", ls->fsname, lp->lockname.ln_type,
301 (unsigned long long)lp->lockname.ln_number, error, 299 (unsigned long long)lp->lockname.ln_number, error,
302 lp->cur, lp->req, lp->lkf, lp->flags); 300 lp->cur, lp->req, lp->lkf, lp->flags);
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index a87c7bf3c568..d074c6e6f9bf 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -36,7 +36,7 @@
36 36
37#define GDLM_STRNAME_BYTES 24 37#define GDLM_STRNAME_BYTES 24
38#define GDLM_LVB_SIZE 32 38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 200000 39#define GDLM_DROP_COUNT 0
40#define GDLM_DROP_PERIOD 60 40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128 41#define GDLM_NAME_LEN 128
42 42
@@ -106,6 +106,7 @@ enum {
106struct gdlm_lock { 106struct gdlm_lock {
107 struct gdlm_ls *ls; 107 struct gdlm_ls *ls;
108 struct lm_lockname lockname; 108 struct lm_lockname lockname;
109 struct gdlm_strname strname;
109 char *lvb; 110 char *lvb;
110 struct dlm_lksb lksb; 111 struct dlm_lksb lksb;
111 112
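[Editor's note] The new strname field caches the DLM string name once, at gdlm_create_lp() time, so gdlm_do_lock() stops re-encoding the same fixed-width 24 bytes on every request. A stand-alone sketch of that encoding; the demo types are simplified, and the sketch's buffer gains one byte for sprintf's terminating NUL.

#include <stdio.h>

struct demo_strname {
	char name[24 + 1];	/* GDLM_STRNAME_BYTES plus NUL */
	unsigned short namelen;
};

/* Encode (type, number) once and cache it, as gdlm_create_lp() now does. */
static void demo_make_strname(unsigned int ln_type,
			      unsigned long long ln_number,
			      struct demo_strname *str)
{
	sprintf(str->name, "%8x%16llx", ln_type, ln_number);
	str->namelen = 24;	/* fixed width: 8 + 16 characters */
}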
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 4746b884662d..d9fe3ca40e18 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -190,7 +190,6 @@ static struct kobj_type gdlm_ktype = {
190}; 190};
191 191
192static struct kset gdlm_kset = { 192static struct kset gdlm_kset = {
193 .subsys = &kernel_subsys,
194 .kobj = {.name = "lock_dlm",}, 193 .kobj = {.name = "lock_dlm",},
195 .ktype = &gdlm_ktype, 194 .ktype = &gdlm_ktype,
196}; 195};
@@ -225,6 +224,7 @@ int gdlm_sysfs_init(void)
225{ 224{
226 int error; 225 int error;
227 226
227 kobj_set_kset_s(&gdlm_kset, kernel_subsys);
228 error = kset_register(&gdlm_kset); 228 error = kset_register(&gdlm_kset);
229 if (error) 229 if (error)
230 printk("lock_dlm: cannot register kset %d\n", error); 230 printk("lock_dlm: cannot register kset %d\n", error);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 16bb4b4561ae..f82d84d05d23 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -33,16 +33,17 @@ static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
33 33
34 tr->tr_touched = 1; 34 tr->tr_touched = 1;
35 35
36 if (!list_empty(&le->le_list))
37 return;
38
39 gl = container_of(le, struct gfs2_glock, gl_le); 36 gl = container_of(le, struct gfs2_glock, gl_le);
40 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl))) 37 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
41 return; 38 return;
42 gfs2_glock_hold(gl);
43 set_bit(GLF_DIRTY, &gl->gl_flags);
44 39
45 gfs2_log_lock(sdp); 40 gfs2_log_lock(sdp);
 41 if (!list_empty(&le->le_list)) {
42 gfs2_log_unlock(sdp);
43 return;
44 }
45 gfs2_glock_hold(gl);
46 set_bit(GLF_DIRTY, &gl->gl_flags);
46 sdp->sd_log_num_gl++; 47 sdp->sd_log_num_gl++;
47 list_add(&le->le_list, &sdp->sd_log_le_gl); 48 list_add(&le->le_list, &sdp->sd_log_le_gl);
48 gfs2_log_unlock(sdp); 49 gfs2_log_unlock(sdp);
@@ -415,13 +416,14 @@ static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
415 416
416 tr->tr_touched = 1; 417 tr->tr_touched = 1;
417 418
418 if (!list_empty(&le->le_list))
419 return;
420
421 rgd = container_of(le, struct gfs2_rgrpd, rd_le); 419 rgd = container_of(le, struct gfs2_rgrpd, rd_le);
422 gfs2_rgrp_bh_hold(rgd);
423 420
424 gfs2_log_lock(sdp); 421 gfs2_log_lock(sdp);
 422 if (!list_empty(&le->le_list)) {
423 gfs2_log_unlock(sdp);
424 return;
425 }
426 gfs2_rgrp_bh_hold(rgd);
425 sdp->sd_log_num_rg++; 427 sdp->sd_log_num_rg++;
426 list_add(&le->le_list, &sdp->sd_log_le_rg); 428 list_add(&le->le_list, &sdp->sd_log_le_rg);
427 gfs2_log_unlock(sdp); 429 gfs2_log_unlock(sdp);
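Both lops.c hunks move the list_empty() test (and the reference taking that follows it) under gfs2_log_lock(), closing a window in which two concurrent callers could each see the element unqueued and add it twice. A minimal sketch of the check-then-insert-under-lock pattern they adopt; log_element, pending and log_lock are illustrative names, not from the patch:

	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct log_element {
		struct list_head le_list;
	};

	static LIST_HEAD(pending);
	static DEFINE_SPINLOCK(log_lock);

	/* le_list must be set up with INIT_LIST_HEAD() and unlinked with
	 * list_del_init() so list_empty() is a reliable "not queued" test. */
	static void log_element_add(struct log_element *le)
	{
		spin_lock(&log_lock);
		if (!list_empty(&le->le_list)) {
			/* Already queued by a concurrent caller. */
			spin_unlock(&log_lock);
			return;
		}
		list_add(&le->le_list, &pending);
		spin_unlock(&log_lock);
	}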
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 6e8a59809abf..e460487c0557 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -27,8 +27,7 @@
27static void gfs2_init_inode_once(void *foo, struct kmem_cache *cachep, unsigned long flags) 27static void gfs2_init_inode_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
28{ 28{
29 struct gfs2_inode *ip = foo; 29 struct gfs2_inode *ip = foo;
30 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 30 if (flags & SLAB_CTOR_CONSTRUCTOR) {
31 SLAB_CTOR_CONSTRUCTOR) {
32 inode_init_once(&ip->i_inode); 31 inode_init_once(&ip->i_inode);
33 spin_lock_init(&ip->i_spin); 32 spin_lock_init(&ip->i_spin);
34 init_rwsem(&ip->i_rw_mutex); 33 init_rwsem(&ip->i_rw_mutex);
@@ -39,13 +38,11 @@ static void gfs2_init_inode_once(void *foo, struct kmem_cache *cachep, unsigned
39static void gfs2_init_glock_once(void *foo, struct kmem_cache *cachep, unsigned long flags) 38static void gfs2_init_glock_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
40{ 39{
41 struct gfs2_glock *gl = foo; 40 struct gfs2_glock *gl = foo;
42 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 41 if (flags & SLAB_CTOR_CONSTRUCTOR) {
43 SLAB_CTOR_CONSTRUCTOR) {
44 INIT_HLIST_NODE(&gl->gl_list); 42 INIT_HLIST_NODE(&gl->gl_list);
45 spin_lock_init(&gl->gl_spin); 43 spin_lock_init(&gl->gl_spin);
46 INIT_LIST_HEAD(&gl->gl_holders); 44 INIT_LIST_HEAD(&gl->gl_holders);
47 INIT_LIST_HEAD(&gl->gl_waiters1); 45 INIT_LIST_HEAD(&gl->gl_waiters1);
48 INIT_LIST_HEAD(&gl->gl_waiters2);
49 INIT_LIST_HEAD(&gl->gl_waiters3); 46 INIT_LIST_HEAD(&gl->gl_waiters3);
50 gl->gl_lvb = NULL; 47 gl->gl_lvb = NULL;
51 atomic_set(&gl->gl_lvb_count, 0); 48 atomic_set(&gl->gl_lvb_count, 0);
@@ -103,6 +100,8 @@ static int __init init_gfs2_fs(void)
103 if (error) 100 if (error)
104 goto fail_unregister; 101 goto fail_unregister;
105 102
103 gfs2_register_debugfs();
104
106 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); 105 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
107 106
108 return 0; 107 return 0;
@@ -130,6 +129,7 @@ fail:
130 129
131static void __exit exit_gfs2_fs(void) 130static void __exit exit_gfs2_fs(void)
132{ 131{
132 gfs2_unregister_debugfs();
133 unregister_filesystem(&gfs2_fs_type); 133 unregister_filesystem(&gfs2_fs_type);
134 unregister_filesystem(&gfs2meta_fs_type); 134 unregister_filesystem(&gfs2meta_fs_type);
135 135
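The constructor hunks in main.c, and the matching ones in hfs, hfsplus, hpfs, hugetlbfs, isofs and fs/inode.c below, all follow from the removal of SLAB_CTOR_VERIFY from the slab API: with only one constructor flag left, a plain bit test suffices. A sketch of the before/after, assuming the 2.6.21-era three-argument ctor signature that the diff itself shows:

	#include <linux/fs.h>
	#include <linux/slab.h>

	static void example_init_once(void *foo, struct kmem_cache *cachep,
				      unsigned long flags)
	{
		struct inode *inode = foo;

		/* Old form:
		 *   (flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR))
		 *       == SLAB_CTOR_CONSTRUCTOR
		 * With SLAB_CTOR_VERIFY gone, this is all that remains: */
		if (flags & SLAB_CTOR_CONSTRUCTOR)
			inode_init_once(inode);
	}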
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 32caecd20300..4864659555d4 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -13,6 +13,7 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/lm_interface.h> 15#include <linux/lm_interface.h>
16#include <linux/parser.h>
16 17
17#include "gfs2.h" 18#include "gfs2.h"
18#include "incore.h" 19#include "incore.h"
@@ -20,6 +21,52 @@
20#include "sys.h" 21#include "sys.h"
21#include "util.h" 22#include "util.h"
22 23
24enum {
25 Opt_lockproto,
26 Opt_locktable,
27 Opt_hostdata,
28 Opt_spectator,
29 Opt_ignore_local_fs,
30 Opt_localflocks,
31 Opt_localcaching,
32 Opt_debug,
33 Opt_nodebug,
34 Opt_upgrade,
35 Opt_num_glockd,
36 Opt_acl,
37 Opt_noacl,
38 Opt_quota_off,
39 Opt_quota_account,
40 Opt_quota_on,
41 Opt_suiddir,
42 Opt_nosuiddir,
43 Opt_data_writeback,
44 Opt_data_ordered,
45};
46
47static match_table_t tokens = {
48 {Opt_lockproto, "lockproto=%s"},
49 {Opt_locktable, "locktable=%s"},
50 {Opt_hostdata, "hostdata=%s"},
51 {Opt_spectator, "spectator"},
52 {Opt_ignore_local_fs, "ignore_local_fs"},
53 {Opt_localflocks, "localflocks"},
54 {Opt_localcaching, "localcaching"},
55 {Opt_debug, "debug"},
56 {Opt_nodebug, "nodebug"},
57 {Opt_upgrade, "upgrade"},
58 {Opt_num_glockd, "num_glockd=%d"},
59 {Opt_acl, "acl"},
60 {Opt_noacl, "noacl"},
61 {Opt_quota_off, "quota=off"},
62 {Opt_quota_account, "quota=account"},
63 {Opt_quota_on, "quota=on"},
64 {Opt_suiddir, "suiddir"},
65 {Opt_nosuiddir, "nosuiddir"},
66 {Opt_data_writeback, "data=writeback"},
67 {Opt_data_ordered, "data=ordered"}
68};
69
23/** 70/**
24 * gfs2_mount_args - Parse mount options 71 * gfs2_mount_args - Parse mount options
25 * @sdp: 72 * @sdp:
@@ -54,146 +101,150 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
54 process them */ 101 process them */
55 102
56 for (options = data; (o = strsep(&options, ",")); ) { 103 for (options = data; (o = strsep(&options, ",")); ) {
104 int token, option;
105 substring_t tmp[MAX_OPT_ARGS];
106
57 if (!*o) 107 if (!*o)
58 continue; 108 continue;
59 109
60 v = strchr(o, '='); 110 token = match_token(o, tokens, tmp);
61 if (v) 111 switch (token) {
62 *v++ = 0; 112 case Opt_lockproto:
113 v = match_strdup(&tmp[0]);
114 if (!v) {
115 fs_info(sdp, "no memory for lockproto\n");
116 error = -ENOMEM;
117 goto out_error;
118 }
63 119
64 if (!strcmp(o, "lockproto")) { 120 if (remount && strcmp(v, args->ar_lockproto)) {
65 if (!v) 121 kfree(v);
66 goto need_value;
67 if (remount && strcmp(v, args->ar_lockproto))
68 goto cant_remount; 122 goto cant_remount;
123 }
124
69 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN); 125 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
70 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0; 126 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
71 } 127 kfree(v);
128 break;
129 case Opt_locktable:
130 v = match_strdup(&tmp[0]);
131 if (!v) {
132 fs_info(sdp, "no memory for locktable\n");
133 error = -ENOMEM;
134 goto out_error;
135 }
72 136
73 else if (!strcmp(o, "locktable")) { 137 if (remount && strcmp(v, args->ar_locktable)) {
74 if (!v) 138 kfree(v);
75 goto need_value;
76 if (remount && strcmp(v, args->ar_locktable))
77 goto cant_remount; 139 goto cant_remount;
140 }
141
78 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN); 142 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
79 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0; 143 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
80 } 144 kfree(v);
145 break;
146 case Opt_hostdata:
147 v = match_strdup(&tmp[0]);
148 if (!v) {
149 fs_info(sdp, "no memory for hostdata\n");
150 error = -ENOMEM;
151 goto out_error;
152 }
81 153
82 else if (!strcmp(o, "hostdata")) { 154 if (remount && strcmp(v, args->ar_hostdata)) {
83 if (!v) 155 kfree(v);
84 goto need_value;
85 if (remount && strcmp(v, args->ar_hostdata))
86 goto cant_remount; 156 goto cant_remount;
157 }
158
87 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN); 159 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
88 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0; 160 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
89 } 161 kfree(v);
90 162 break;
91 else if (!strcmp(o, "spectator")) { 163 case Opt_spectator:
92 if (remount && !args->ar_spectator) 164 if (remount && !args->ar_spectator)
93 goto cant_remount; 165 goto cant_remount;
94 args->ar_spectator = 1; 166 args->ar_spectator = 1;
95 sdp->sd_vfs->s_flags |= MS_RDONLY; 167 sdp->sd_vfs->s_flags |= MS_RDONLY;
96 } 168 break;
97 169 case Opt_ignore_local_fs:
98 else if (!strcmp(o, "ignore_local_fs")) {
99 if (remount && !args->ar_ignore_local_fs) 170 if (remount && !args->ar_ignore_local_fs)
100 goto cant_remount; 171 goto cant_remount;
101 args->ar_ignore_local_fs = 1; 172 args->ar_ignore_local_fs = 1;
102 } 173 break;
103 174 case Opt_localflocks:
104 else if (!strcmp(o, "localflocks")) {
105 if (remount && !args->ar_localflocks) 175 if (remount && !args->ar_localflocks)
106 goto cant_remount; 176 goto cant_remount;
107 args->ar_localflocks = 1; 177 args->ar_localflocks = 1;
108 } 178 break;
109 179 case Opt_localcaching:
110 else if (!strcmp(o, "localcaching")) {
111 if (remount && !args->ar_localcaching) 180 if (remount && !args->ar_localcaching)
112 goto cant_remount; 181 goto cant_remount;
113 args->ar_localcaching = 1; 182 args->ar_localcaching = 1;
114 } 183 break;
115 184 case Opt_debug:
116 else if (!strcmp(o, "debug"))
117 args->ar_debug = 1; 185 args->ar_debug = 1;
118 186 break;
119 else if (!strcmp(o, "nodebug")) 187 case Opt_nodebug:
120 args->ar_debug = 0; 188 args->ar_debug = 0;
121 189 break;
122 else if (!strcmp(o, "upgrade")) { 190 case Opt_upgrade:
123 if (remount && !args->ar_upgrade) 191 if (remount && !args->ar_upgrade)
124 goto cant_remount; 192 goto cant_remount;
125 args->ar_upgrade = 1; 193 args->ar_upgrade = 1;
126 } 194 break;
195 case Opt_num_glockd:
196 if ((error = match_int(&tmp[0], &option))) {
197 fs_info(sdp, "problem getting num_glockd\n");
198 goto out_error;
199 }
127 200
128 else if (!strcmp(o, "num_glockd")) { 201 if (remount && option != args->ar_num_glockd)
129 unsigned int x;
130 if (!v)
131 goto need_value;
132 sscanf(v, "%u", &x);
133 if (remount && x != args->ar_num_glockd)
134 goto cant_remount; 202 goto cant_remount;
135 if (!x || x > GFS2_GLOCKD_MAX) { 203 if (!option || option > GFS2_GLOCKD_MAX) {
136 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n", 204 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
137 GFS2_GLOCKD_MAX, x); 205 GFS2_GLOCKD_MAX, option);
138 error = -EINVAL; 206 error = -EINVAL;
139 break; 207 goto out_error;
140 } 208 }
141 args->ar_num_glockd = x; 209 args->ar_num_glockd = option;
142 } 210 break;
143 211 case Opt_acl:
144 else if (!strcmp(o, "acl")) {
145 args->ar_posix_acl = 1; 212 args->ar_posix_acl = 1;
146 sdp->sd_vfs->s_flags |= MS_POSIXACL; 213 sdp->sd_vfs->s_flags |= MS_POSIXACL;
147 } 214 break;
148 215 case Opt_noacl:
149 else if (!strcmp(o, "noacl")) {
150 args->ar_posix_acl = 0; 216 args->ar_posix_acl = 0;
151 sdp->sd_vfs->s_flags &= ~MS_POSIXACL; 217 sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
152 } 218 break;
153 219 case Opt_quota_off:
154 else if (!strcmp(o, "quota")) { 220 args->ar_quota = GFS2_QUOTA_OFF;
155 if (!v) 221 break;
156 goto need_value; 222 case Opt_quota_account:
157 if (!strcmp(v, "off")) 223 args->ar_quota = GFS2_QUOTA_ACCOUNT;
158 args->ar_quota = GFS2_QUOTA_OFF; 224 break;
159 else if (!strcmp(v, "account")) 225 case Opt_quota_on:
160 args->ar_quota = GFS2_QUOTA_ACCOUNT; 226 args->ar_quota = GFS2_QUOTA_ON;
161 else if (!strcmp(v, "on")) 227 break;
162 args->ar_quota = GFS2_QUOTA_ON; 228 case Opt_suiddir:
163 else {
164 fs_info(sdp, "invalid value for quota\n");
165 error = -EINVAL;
166 break;
167 }
168 }
169
170 else if (!strcmp(o, "suiddir"))
171 args->ar_suiddir = 1; 229 args->ar_suiddir = 1;
172 230 break;
173 else if (!strcmp(o, "nosuiddir")) 231 case Opt_nosuiddir:
174 args->ar_suiddir = 0; 232 args->ar_suiddir = 0;
175 233 break;
176 else if (!strcmp(o, "data")) { 234 case Opt_data_writeback:
177 if (!v) 235 args->ar_data = GFS2_DATA_WRITEBACK;
178 goto need_value; 236 break;
179 if (!strcmp(v, "writeback")) 237 case Opt_data_ordered:
180 args->ar_data = GFS2_DATA_WRITEBACK; 238 args->ar_data = GFS2_DATA_ORDERED;
181 else if (!strcmp(v, "ordered")) 239 break;
182 args->ar_data = GFS2_DATA_ORDERED; 240 default:
183 else {
184 fs_info(sdp, "invalid value for data\n");
185 error = -EINVAL;
186 break;
187 }
188 }
189
190 else {
191 fs_info(sdp, "unknown option: %s\n", o); 241 fs_info(sdp, "unknown option: %s\n", o);
192 error = -EINVAL; 242 error = -EINVAL;
193 break; 243 goto out_error;
194 } 244 }
195 } 245 }
196 246
247out_error:
197 if (error) 248 if (error)
198 fs_info(sdp, "invalid mount option(s)\n"); 249 fs_info(sdp, "invalid mount option(s)\n");
199 250
@@ -202,10 +253,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
202 253
203 return error; 254 return error;
204 255
205need_value:
206 fs_info(sdp, "need value for option %s\n", o);
207 return -EINVAL;
208
209cant_remount: 256cant_remount:
210 fs_info(sdp, "can't remount with option %s\n", o); 257 fs_info(sdp, "can't remount with option %s\n", o);
211 return -EINVAL; 258 return -EINVAL;
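The mount.c rewrite above replaces open-coded strchr()/strcmp() option parsing with the token tables from <linux/parser.h>. A minimal sketch of that idiom with a hypothetical two-option table; note the {Opt_err, NULL} terminator, which match_token() relies on to fall through to the default case on unknown strings (the table added by the patch ends at "data=ordered" without such a sentinel):

	#include <linux/errno.h>
	#include <linux/parser.h>
	#include <linux/slab.h>
	#include <linux/string.h>

	enum { Opt_mode, Opt_size, Opt_err };

	static match_table_t example_tokens = {
		{Opt_mode, "mode=%s"},
		{Opt_size, "size=%d"},
		{Opt_err, NULL}
	};

	static int example_parse(char *options)
	{
		substring_t args[MAX_OPT_ARGS];
		char *p;

		while ((p = strsep(&options, ",")) != NULL) {
			int token, size;
			char *mode;

			if (!*p)
				continue;

			token = match_token(p, example_tokens, args);
			switch (token) {
			case Opt_mode:
				/* match_strdup() kmalloc()s a copy of the
				 * matched %s argument; callers must kfree(). */
				mode = match_strdup(&args[0]);
				if (!mode)
					return -ENOMEM;
				kfree(mode);
				break;
			case Opt_size:
				/* match_int() converts the %d argument. */
				if (match_int(&args[0], &size))
					return -EINVAL;
				break;
			default:
				return -EINVAL;
			}
		}
		return 0;
	}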
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index b3b7e8475359..30c15622174f 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -197,7 +197,19 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
197 void *kaddr; 197 void *kaddr;
198 int error; 198 int error;
199 199
200 BUG_ON(page->index); 200 /*
201 * Due to the order of unstuffing files and ->nopage(), we can be
202 * asked for a zero page in the case of a stuffed file being extended,
203 * so we need to supply one here. It doesn't happen often.
204 */
205 if (unlikely(page->index)) {
206 kaddr = kmap_atomic(page, KM_USER0);
207 memset(kaddr, 0, PAGE_CACHE_SIZE);
208 kunmap_atomic(kaddr, KM_USER0);
209 flush_dcache_page(page);
210 SetPageUptodate(page);
211 return 0;
212 }
201 213
202 error = gfs2_meta_inode_buffer(ip, &dibh); 214 error = gfs2_meta_inode_buffer(ip, &dibh);
203 if (error) 215 if (error)
@@ -208,9 +220,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
208 ip->i_di.di_size); 220 ip->i_di.di_size);
209 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size); 221 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
210 kunmap_atomic(kaddr, KM_USER0); 222 kunmap_atomic(kaddr, KM_USER0);
211 223 flush_dcache_page(page);
212 brelse(dibh); 224 brelse(dibh);
213
214 SetPageUptodate(page); 225 SetPageUptodate(page);
215 226
216 return 0; 227 return 0;
@@ -507,7 +518,9 @@ static int gfs2_commit_write(struct file *file, struct page *page,
507 gfs2_quota_unlock(ip); 518 gfs2_quota_unlock(ip);
508 gfs2_alloc_put(ip); 519 gfs2_alloc_put(ip);
509 } 520 }
521 unlock_page(page);
510 gfs2_glock_dq_m(1, &ip->i_gh); 522 gfs2_glock_dq_m(1, &ip->i_gh);
523 lock_page(page);
511 gfs2_holder_uninit(&ip->i_gh); 524 gfs2_holder_uninit(&ip->i_gh);
512 return 0; 525 return 0;
513 526
@@ -520,7 +533,9 @@ fail_endtrans:
520 gfs2_quota_unlock(ip); 533 gfs2_quota_unlock(ip);
521 gfs2_alloc_put(ip); 534 gfs2_alloc_put(ip);
522 } 535 }
536 unlock_page(page);
523 gfs2_glock_dq_m(1, &ip->i_gh); 537 gfs2_glock_dq_m(1, &ip->i_gh);
538 lock_page(page);
524 gfs2_holder_uninit(&ip->i_gh); 539 gfs2_holder_uninit(&ip->i_gh);
525fail_nounlock: 540fail_nounlock:
526 ClearPageUptodate(page); 541 ClearPageUptodate(page);
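The two gfs2_commit_write() hunks bracket gfs2_glock_dq_m() with unlock_page()/lock_page(): dropping the glock can block on work (such as flushing the inode's pages on behalf of another node) that itself needs the page lock, so the page lock must not be held across the dequeue. A generic sketch of that release-around pattern; the two mutexes are illustrative stand-ins for the page lock and the glock:

	#include <linux/mutex.h>

	static DEFINE_MUTEX(page_like_lock);	/* stands in for the page lock */
	static DEFINE_MUTEX(glock_like_lock);	/* stands in for the glock */

	/* Called with page_like_lock held: drop it around the release of
	 * the outer lock, then retake it for the caller. */
	static void release_outer_lock(void)
	{
		mutex_unlock(&page_like_lock);
		mutex_unlock(&glock_like_lock);
		mutex_lock(&page_like_lock);
	}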
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ee54cb667083..2c5f8e7def0d 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -690,6 +690,8 @@ static int fill_super(struct super_block *sb, void *data, int silent)
690 if (error) 690 if (error)
691 goto fail; 691 goto fail;
692 692
693 gfs2_create_debugfs_file(sdp);
694
693 error = gfs2_sys_fs_add(sdp); 695 error = gfs2_sys_fs_add(sdp);
694 if (error) 696 if (error)
695 goto fail; 697 goto fail;
@@ -754,6 +756,7 @@ fail_lm:
754fail_sys: 756fail_sys:
755 gfs2_sys_fs_del(sdp); 757 gfs2_sys_fs_del(sdp);
756fail: 758fail:
759 gfs2_delete_debugfs_file(sdp);
757 kfree(sdp); 760 kfree(sdp);
758 sb->s_fs_info = NULL; 761 sb->s_fs_info = NULL;
759 return error; 762 return error;
@@ -896,6 +899,7 @@ error:
896 899
897static void gfs2_kill_sb(struct super_block *sb) 900static void gfs2_kill_sb(struct super_block *sb)
898{ 901{
902 gfs2_delete_debugfs_file(sb->s_fs_info);
899 kill_block_super(sb); 903 kill_block_super(sb);
900} 904}
901 905
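The debugfs hooks added in main.c (module init/exit) and ops_fstype.c (mount/unmount) pair each creation with a deletion, including on the error path of fill_super(). A minimal sketch of such a pairing with the stock debugfs API; the directory name and function names are illustrative, and gfs2_register_debugfs() itself is not shown in these hunks:

	#include <linux/debugfs.h>
	#include <linux/errno.h>

	static struct dentry *example_debugfs_root;

	int example_register_debugfs(void)		/* module init */
	{
		example_debugfs_root = debugfs_create_dir("gfs2", NULL);
		return example_debugfs_root ? 0 : -ENOMEM;
	}

	void example_unregister_debugfs(void)		/* module exit */
	{
		debugfs_remove(example_debugfs_root);
		example_debugfs_root = NULL;
	}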
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index b89999d3a767..485ce3d49923 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -284,6 +284,31 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
284} 284}
285 285
286/** 286/**
287 * gfs2_drop_inode - Drop an inode (test for remote unlink)
288 * @inode: The inode to drop
289 *
 290 * If we've received a callback on an iopen lock then it's because a
291 * remote node tried to deallocate the inode but failed due to this node
292 * still having the inode open. Here we mark the link count zero
293 * since we know that it must have reached zero if the GLF_DEMOTE flag
294 * is set on the iopen glock. If we didn't do a disk read since the
295 * remote node removed the final link then we might otherwise miss
296 * this event. This check ensures that this node will deallocate the
297 * inode's blocks, or alternatively pass the baton on to another
298 * node for later deallocation.
299 */
300static void gfs2_drop_inode(struct inode *inode)
301{
302 if (inode->i_private && inode->i_nlink) {
303 struct gfs2_inode *ip = GFS2_I(inode);
304 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
305 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
306 clear_nlink(inode);
307 }
308 generic_drop_inode(inode);
309}
310
311/**
287 * gfs2_clear_inode - Deallocate an inode when VFS is done with it 312 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
288 * @inode: The VFS inode 313 * @inode: The VFS inode
289 * 314 *
@@ -441,7 +466,7 @@ out_unlock:
441out_uninit: 466out_uninit:
442 gfs2_holder_uninit(&ip->i_iopen_gh); 467 gfs2_holder_uninit(&ip->i_iopen_gh);
443 gfs2_glock_dq_uninit(&gh); 468 gfs2_glock_dq_uninit(&gh);
444 if (error) 469 if (error && error != GLR_TRYFAILED)
445 fs_warn(sdp, "gfs2_delete_inode: %d\n", error); 470 fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
446out: 471out:
447 truncate_inode_pages(&inode->i_data, 0); 472 truncate_inode_pages(&inode->i_data, 0);
@@ -481,6 +506,7 @@ const struct super_operations gfs2_super_ops = {
481 .statfs = gfs2_statfs, 506 .statfs = gfs2_statfs,
482 .remount_fs = gfs2_remount_fs, 507 .remount_fs = gfs2_remount_fs,
483 .clear_inode = gfs2_clear_inode, 508 .clear_inode = gfs2_clear_inode,
509 .drop_inode = gfs2_drop_inode,
484 .show_options = gfs2_show_options, 510 .show_options = gfs2_show_options,
485}; 511};
486 512
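For context on the new ->drop_inode hook above: the VFS invokes it from iput() when the last reference to an inode goes away, and generic_drop_inode() then decides between caching and deleting based on i_nlink, which is why forcing the link count to zero steers the inode into the deallocation path. A minimal sketch of a hook of this shape; remote_unlink_pending() is a hypothetical stand-in for the GLF_DEMOTE test in the patch:

	#include <linux/fs.h>

	static int remote_unlink_pending(struct inode *inode);	/* hypothetical */

	static void example_drop_inode(struct inode *inode)
	{
		/* Another node may have removed the final link while our
		 * cached i_nlink is stale; force the deletion path. */
		if (inode->i_nlink && remote_unlink_pending(inode))
			clear_nlink(inode);
		generic_drop_inode(inode);
	}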
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8d9c08b5c4b6..1727f5012efe 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -27,6 +27,7 @@
27#include "trans.h" 27#include "trans.h"
28#include "ops_file.h" 28#include "ops_file.h"
29#include "util.h" 29#include "util.h"
30#include "log.h"
30 31
31#define BFITNOENT ((u32)~0) 32#define BFITNOENT ((u32)~0)
32 33
@@ -697,8 +698,6 @@ struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
697 * @al: the struct gfs2_alloc structure describing the reservation 698 * @al: the struct gfs2_alloc structure describing the reservation
698 * 699 *
699 * If there's room for the requested blocks to be allocated from the RG: 700 * If there's room for the requested blocks to be allocated from the RG:
700 * Sets the $al_reserved_data field in @al.
701 * Sets the $al_reserved_meta field in @al.
702 * Sets the $al_rgd field in @al. 701 * Sets the $al_rgd field in @al.
703 * 702 *
704 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) 703 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
@@ -709,6 +708,9 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
709 struct gfs2_sbd *sdp = rgd->rd_sbd; 708 struct gfs2_sbd *sdp = rgd->rd_sbd;
710 int ret = 0; 709 int ret = 0;
711 710
711 if (rgd->rd_rg.rg_flags & GFS2_RGF_NOALLOC)
712 return 0;
713
712 spin_lock(&sdp->sd_rindex_spin); 714 spin_lock(&sdp->sd_rindex_spin);
713 if (rgd->rd_free_clone >= al->al_requested) { 715 if (rgd->rd_free_clone >= al->al_requested) {
714 al->al_rgd = rgd; 716 al->al_rgd = rgd;
@@ -941,9 +943,13 @@ static int get_local_rgrp(struct gfs2_inode *ip)
941 rgd = gfs2_rgrpd_get_first(sdp); 943 rgd = gfs2_rgrpd_get_first(sdp);
942 944
943 if (rgd == begin) { 945 if (rgd == begin) {
944 if (++loops >= 2 || !skipped) 946 if (++loops >= 3)
945 return -ENOSPC; 947 return -ENOSPC;
948 if (!skipped)
949 loops++;
946 flags = 0; 950 flags = 0;
951 if (loops == 2)
952 gfs2_log_flush(sdp, NULL);
947 } 953 }
948 } 954 }
949 955
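The get_local_rgrp() hunk allows a third sweep of the resource groups and flushes the log before that final pass, so blocks freed earlier but not yet released by the journal become allocatable again. A simplified sketch of the retry shape (the skipped/loops bookkeeping is elided); try_alloc_pass() and flush_log() are hypothetical stand-ins:

	#include <linux/errno.h>

	static int try_alloc_pass(void);	/* hypothetical: one rgrp sweep */
	static void flush_log(void);		/* hypothetical: gfs2_log_flush() */

	static int example_get_space(void)
	{
		int loops;

		for (loops = 0; loops < 3; loops++) {
			if (try_alloc_pass())
				return 0;
			/* Flush before the last pass so freed-but-pinned
			 * blocks can be reused. */
			if (loops == 1)
				flush_log();
		}
		return -ENOSPC;
	}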
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d01f9f0fda26..c26c21b53c19 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -222,7 +222,6 @@ static struct kobj_type gfs2_ktype = {
222}; 222};
223 223
224static struct kset gfs2_kset = { 224static struct kset gfs2_kset = {
225 .subsys = &fs_subsys,
226 .kobj = {.name = "gfs2"}, 225 .kobj = {.name = "gfs2"},
227 .ktype = &gfs2_ktype, 226 .ktype = &gfs2_ktype,
228}; 227};
@@ -554,6 +553,7 @@ int gfs2_sys_init(void)
554{ 553{
555 gfs2_sys_margs = NULL; 554 gfs2_sys_margs = NULL;
556 spin_lock_init(&gfs2_sys_margs_lock); 555 spin_lock_init(&gfs2_sys_margs_lock);
556 kobj_set_kset_s(&gfs2_kset, fs_subsys);
557 return kset_register(&gfs2_kset); 557 return kset_register(&gfs2_kset);
558} 558}
559 559
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 623f509f1d47..4f1888f16cf0 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -434,7 +434,7 @@ static void hfs_init_once(void *p, struct kmem_cache *cachep, unsigned long flag
434{ 434{
435 struct hfs_inode_info *i = p; 435 struct hfs_inode_info *i = p;
436 436
437 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) 437 if (flags & SLAB_CTOR_CONSTRUCTOR)
438 inode_init_once(&i->vfs_inode); 438 inode_init_once(&i->vfs_inode);
439} 439}
440 440
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 1a97f9293447..37afbec8a761 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -470,7 +470,7 @@ static void hfsplus_init_once(void *p, struct kmem_cache *cachep, unsigned long
470{ 470{
471 struct hfsplus_inode_info *i = p; 471 struct hfsplus_inode_info *i = p;
472 472
473 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) 473 if (flags & SLAB_CTOR_CONSTRUCTOR)
474 inode_init_once(&i->vfs_inode); 474 inode_init_once(&i->vfs_inode);
475} 475}
476 476
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index e0174e338526..1b95f39fbc37 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -176,8 +176,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
176{ 176{
177 struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; 177 struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
178 178
179 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 179 if (flags & SLAB_CTOR_CONSTRUCTOR) {
180 SLAB_CTOR_CONSTRUCTOR) {
181 mutex_init(&ei->i_mutex); 180 mutex_init(&ei->i_mutex);
182 mutex_init(&ei->i_parent_mutex); 181 mutex_init(&ei->i_parent_mutex);
183 inode_init_once(&ei->vfs_inode); 182 inode_init_once(&ei->vfs_inode);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c718a3d413f..98959b87cdf8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -22,6 +22,7 @@
22#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
23#include <linux/hugetlb.h> 23#include <linux/hugetlb.h>
24#include <linux/pagevec.h> 24#include <linux/pagevec.h>
25#include <linux/mman.h>
25#include <linux/quotaops.h> 26#include <linux/quotaops.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
27#include <linux/dnotify.h> 28#include <linux/dnotify.h>
@@ -98,10 +99,7 @@ out:
98 * Called under down_write(mmap_sem). 99 * Called under down_write(mmap_sem).
99 */ 100 */
100 101
101#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 102#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
102unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
103 unsigned long len, unsigned long pgoff, unsigned long flags);
104#else
105static unsigned long 103static unsigned long
106hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 104hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
107 unsigned long len, unsigned long pgoff, unsigned long flags) 105 unsigned long len, unsigned long pgoff, unsigned long flags)
@@ -115,6 +113,12 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
115 if (len > TASK_SIZE) 113 if (len > TASK_SIZE)
116 return -ENOMEM; 114 return -ENOMEM;
117 115
116 if (flags & MAP_FIXED) {
117 if (prepare_hugepage_range(addr, len, pgoff))
118 return -EINVAL;
119 return addr;
120 }
121
118 if (addr) { 122 if (addr) {
119 addr = ALIGN(addr, HPAGE_SIZE); 123 addr = ALIGN(addr, HPAGE_SIZE);
120 vma = find_vma(mm, addr); 124 vma = find_vma(mm, addr);
@@ -453,7 +457,7 @@ static int hugetlbfs_symlink(struct inode *dir,
453 */ 457 */
454static int hugetlbfs_set_page_dirty(struct page *page) 458static int hugetlbfs_set_page_dirty(struct page *page)
455{ 459{
456 struct page *head = (struct page *)page_private(page); 460 struct page *head = compound_head(page);
457 461
458 SetPageDirty(head); 462 SetPageDirty(head);
459 return 0; 463 return 0;
@@ -552,8 +556,7 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
552{ 556{
553 struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; 557 struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
554 558
555 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 559 if (flags & SLAB_CTOR_CONSTRUCTOR)
556 SLAB_CTOR_CONSTRUCTOR)
557 inode_init_once(&ei->vfs_inode); 560 inode_init_once(&ei->vfs_inode);
558} 561}
559 562
@@ -744,6 +747,9 @@ struct file *hugetlb_zero_setup(size_t size)
744 char buf[16]; 747 char buf[16];
745 static atomic_t counter; 748 static atomic_t counter;
746 749
750 if (!hugetlbfs_vfsmount)
751 return ERR_PTR(-ENOENT);
752
747 if (!can_do_hugetlb_shm()) 753 if (!can_do_hugetlb_shm())
748 return ERR_PTR(-EPERM); 754 return ERR_PTR(-EPERM);
749 755
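The hugetlb_get_unmapped_area() hunk adds the MAP_FIXED convention: with that flag set, a get_unmapped_area implementation must not search for a free range, only validate the caller's address and return it. A minimal sketch of that shape; range_ok() is a hypothetical stand-in for prepare_hugepage_range():

	#include <linux/errno.h>
	#include <linux/mman.h>

	static int range_ok(unsigned long addr, unsigned long len);	/* hypothetical */

	static unsigned long example_get_unmapped_area(unsigned long addr,
						       unsigned long len,
						       unsigned long flags)
	{
		if (flags & MAP_FIXED)
			return range_ok(addr, len) ? addr : -EINVAL;

		/* ... otherwise fall through to the usual search ... */
		return -ENOMEM;
	}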
diff --git a/fs/inode.c b/fs/inode.c
index 5abb097ab1b0..b4296bf62739 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -213,8 +213,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
213{ 213{
214 struct inode * inode = (struct inode *) foo; 214 struct inode * inode = (struct inode *) foo;
215 215
216 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 216 if (flags & SLAB_CTOR_CONSTRUCTOR)
217 SLAB_CTOR_CONSTRUCTOR)
218 inode_init_once(inode); 217 inode_init_once(inode);
219} 218}
220 219
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 64a96cdfe3a4..e99f7ff4ecb4 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -77,8 +77,7 @@ static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags
77{ 77{
78 struct iso_inode_info *ei = foo; 78 struct iso_inode_info *ei = foo;
79 79
80 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 80 if (flags & SLAB_CTOR_CONSTRUCTOR)
81 SLAB_CTOR_CONSTRUCTOR)
82 inode_init_once(&ei->vfs_inode); 81 inode_init_once(&ei->vfs_inode);
83} 82}
84 83
diff --git a/fs/jffs2/LICENCE b/fs/jffs2/LICENCE
index cd81d83e4ad2..562885908135 100644
--- a/fs/jffs2/LICENCE
+++ b/fs/jffs2/LICENCE
@@ -1,7 +1,7 @@
1The files in this directory and elsewhere which refer to this LICENCE 1The files in this directory and elsewhere which refer to this LICENCE
2file are part of JFFS2, the Journalling Flash File System v2. 2file are part of JFFS2, the Journalling Flash File System v2.
3 3
4 Copyright (C) 2001, 2002 Red Hat, Inc. 4 Copyright © 2001-2007 Red Hat, Inc. and others
5 5
6JFFS2 is free software; you can redistribute it and/or modify it under 6JFFS2 is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free 7the terms of the GNU General Public License as published by the Free
@@ -28,8 +28,3 @@ of the GNU General Public License.
28This exception does not invalidate any other reasons why a work based on 28This exception does not invalidate any other reasons why a work based on
29this file might be covered by the GNU General Public License. 29this file might be covered by the GNU General Public License.
30 30
31For information on obtaining alternative licences for JFFS2, see
32http://sources.redhat.com/jffs2/jffs2-licence.html
33
34
35 $Id: LICENCE,v 1.1 2002/05/20 14:56:37 dwmw2 Exp $
diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile
index 7f28ee0bd132..c32b241e3d91 100644
--- a/fs/jffs2/Makefile
+++ b/fs/jffs2/Makefile
@@ -1,7 +1,6 @@
1# 1#
2# Makefile for the Linux Journalling Flash File System v2 (JFFS2) 2# Makefile for the Linux Journalling Flash File System v2 (JFFS2)
3# 3#
4# $Id: Makefile.common,v 1.11 2005/09/07 08:34:53 havasi Exp $
5# 4#
6 5
7obj-$(CONFIG_JFFS2_FS) += jffs2.o 6obj-$(CONFIG_JFFS2_FS) += jffs2.o
diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking
index c8f0bd64e53e..d14d5a4dc5ac 100644
--- a/fs/jffs2/README.Locking
+++ b/fs/jffs2/README.Locking
@@ -1,4 +1,3 @@
1 $Id: README.Locking,v 1.12 2005/04/13 13:22:35 dwmw2 Exp $
2 1
3 JFFS2 LOCKING DOCUMENTATION 2 JFFS2 LOCKING DOCUMENTATION
4 --------------------------- 3 ---------------------------
diff --git a/fs/jffs2/TODO b/fs/jffs2/TODO
index d0e23b26fa50..5d3ea4070f01 100644
--- a/fs/jffs2/TODO
+++ b/fs/jffs2/TODO
@@ -1,4 +1,3 @@
1$Id: TODO,v 1.18 2005/09/22 11:24:56 dedekind Exp $
2 1
3 - support asynchronous operation -- add a per-fs 'reserved_space' count, 2 - support asynchronous operation -- add a per-fs 'reserved_space' count,
4 let each outstanding write reserve the _maximum_ amount of physical 3 let each outstanding write reserve the _maximum_ amount of physical
@@ -30,8 +29,6 @@ $Id: TODO,v 1.18 2005/09/22 11:24:56 dedekind Exp $
30 the full dirent, we only need to go to the flash in lookup() when we think we've 29 the full dirent, we only need to go to the flash in lookup() when we think we've
31 got a match, and in readdir(). 30 got a match, and in readdir().
32 - Doubly-linked next_in_ino list to allow us to free obsoleted raw_node_refs immediately? 31 - Doubly-linked next_in_ino list to allow us to free obsoleted raw_node_refs immediately?
33 - Remove totlen from jffs2_raw_node_ref? Need to have totlen passed into
34 jffs2_mark_node_obsolete(). Can all callers work it out?
35 - Remove size from jffs2_raw_node_frag. 32 - Remove size from jffs2_raw_node_frag.
36 33
37dedekind: 34dedekind:
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 73f0d60f73a5..a46101ee867a 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index fa327dbd3171..c84378cee82a 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11struct jffs2_acl_entry { 12struct jffs2_acl_entry {
12 jint16_t e_tag; 13 jint16_t e_tag;
13 jint16_t e_perm; 14 jint16_t e_perm;
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 888f236e5494..0c82dfcfd246 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: background.c,v 1.54 2005/05/20 21:37:12 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 07119c42a861..0ca2fff2617f 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: build.c,v 1.85 2005/11/07 11:14:38 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 7001ba26c067..485d065de41f 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -1,16 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Created by Arjan van de Ven <arjanv@redhat.com> 5 * Created by Arjan van de Ven <arjanv@redhat.com>
6 * 6 *
7 * Copyright (C) 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 7 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
8 * University of Szeged, Hungary 8 * University of Szeged, Hungary
9 * 9 *
10 * For licensing information, see the file 'LICENCE' in this directory. 10 * For licensing information, see the file 'LICENCE' in this directory.
11 * 11 *
12 * $Id: compr.c,v 1.46 2005/11/07 11:14:38 gleixner Exp $
13 *
14 */ 12 */
15 13
16#include "compr.h" 14#include "compr.h"
@@ -268,144 +266,6 @@ int jffs2_unregister_compressor(struct jffs2_compressor *comp)
268 return 0; 266 return 0;
269} 267}
270 268
271#ifdef CONFIG_JFFS2_PROC
272
273#define JFFS2_STAT_BUF_SIZE 16000
274
275char *jffs2_list_compressors(void)
276{
277 struct jffs2_compressor *this;
278 char *buf, *act_buf;
279
280 act_buf = buf = kmalloc(JFFS2_STAT_BUF_SIZE,GFP_KERNEL);
281 list_for_each_entry(this, &jffs2_compressor_list, list) {
282 act_buf += sprintf(act_buf, "%10s priority:%d ", this->name, this->priority);
283 if ((this->disabled)||(!this->compress))
284 act_buf += sprintf(act_buf,"disabled");
285 else
286 act_buf += sprintf(act_buf,"enabled");
287 act_buf += sprintf(act_buf,"\n");
288 }
289 return buf;
290}
291
292char *jffs2_stats(void)
293{
294 struct jffs2_compressor *this;
295 char *buf, *act_buf;
296
297 act_buf = buf = kmalloc(JFFS2_STAT_BUF_SIZE,GFP_KERNEL);
298
299 act_buf += sprintf(act_buf,"JFFS2 compressor statistics:\n");
300 act_buf += sprintf(act_buf,"%10s ","none");
301 act_buf += sprintf(act_buf,"compr: %d blocks (%d) decompr: %d blocks\n", none_stat_compr_blocks,
302 none_stat_compr_size, none_stat_decompr_blocks);
303 spin_lock(&jffs2_compressor_list_lock);
304 list_for_each_entry(this, &jffs2_compressor_list, list) {
305 act_buf += sprintf(act_buf,"%10s ",this->name);
306 if ((this->disabled)||(!this->compress))
307 act_buf += sprintf(act_buf,"- ");
308 else
309 act_buf += sprintf(act_buf,"+ ");
310 act_buf += sprintf(act_buf,"compr: %d blocks (%d/%d) decompr: %d blocks ", this->stat_compr_blocks,
311 this->stat_compr_new_size, this->stat_compr_orig_size,
312 this->stat_decompr_blocks);
313 act_buf += sprintf(act_buf,"\n");
314 }
315 spin_unlock(&jffs2_compressor_list_lock);
316
317 return buf;
318}
319
320char *jffs2_get_compression_mode_name(void)
321{
322 switch (jffs2_compression_mode) {
323 case JFFS2_COMPR_MODE_NONE:
324 return "none";
325 case JFFS2_COMPR_MODE_PRIORITY:
326 return "priority";
327 case JFFS2_COMPR_MODE_SIZE:
328 return "size";
329 }
330 return "unkown";
331}
332
333int jffs2_set_compression_mode_name(const char *name)
334{
335 if (!strcmp("none",name)) {
336 jffs2_compression_mode = JFFS2_COMPR_MODE_NONE;
337 return 0;
338 }
339 if (!strcmp("priority",name)) {
340 jffs2_compression_mode = JFFS2_COMPR_MODE_PRIORITY;
341 return 0;
342 }
343 if (!strcmp("size",name)) {
344 jffs2_compression_mode = JFFS2_COMPR_MODE_SIZE;
345 return 0;
346 }
347 return 1;
348}
349
350static int jffs2_compressor_Xable(const char *name, int disabled)
351{
352 struct jffs2_compressor *this;
353 spin_lock(&jffs2_compressor_list_lock);
354 list_for_each_entry(this, &jffs2_compressor_list, list) {
355 if (!strcmp(this->name, name)) {
356 this->disabled = disabled;
357 spin_unlock(&jffs2_compressor_list_lock);
358 return 0;
359 }
360 }
361 spin_unlock(&jffs2_compressor_list_lock);
362 printk(KERN_WARNING "JFFS2: compressor %s not found.\n",name);
363 return 1;
364}
365
366int jffs2_enable_compressor_name(const char *name)
367{
368 return jffs2_compressor_Xable(name, 0);
369}
370
371int jffs2_disable_compressor_name(const char *name)
372{
373 return jffs2_compressor_Xable(name, 1);
374}
375
376int jffs2_set_compressor_priority(const char *name, int priority)
377{
378 struct jffs2_compressor *this,*comp;
379 spin_lock(&jffs2_compressor_list_lock);
380 list_for_each_entry(this, &jffs2_compressor_list, list) {
381 if (!strcmp(this->name, name)) {
382 this->priority = priority;
383 comp = this;
384 goto reinsert;
385 }
386 }
387 spin_unlock(&jffs2_compressor_list_lock);
388 printk(KERN_WARNING "JFFS2: compressor %s not found.\n",name);
389 return 1;
390reinsert:
391 /* list is sorted in the order of priority, so if
392 we change it we have to reinsert it into the
393 good place */
394 list_del(&comp->list);
395 list_for_each_entry(this, &jffs2_compressor_list, list) {
396 if (this->priority < comp->priority) {
397 list_add(&comp->list, this->list.prev);
398 spin_unlock(&jffs2_compressor_list_lock);
399 return 0;
400 }
401 }
402 list_add_tail(&comp->list, &jffs2_compressor_list);
403 spin_unlock(&jffs2_compressor_list_lock);
404 return 0;
405}
406
407#endif
408
409void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig) 269void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig)
410{ 270{
411 if (orig != comprbuf) 271 if (orig != comprbuf)
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 509b8b1c0811..68cc7010dbdf 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -1,13 +1,10 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 4 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
5 * University of Szeged, Hungary 5 * University of Szeged, Hungary
6 * 6 *
7 * For licensing information, see the file 'LICENCE' in the 7 * For licensing information, see the file 'LICENCE' in this directory.
8 * jffs2 directory.
9 *
10 * $Id: compr.h,v 1.9 2005/11/07 11:14:38 gleixner Exp $
11 * 8 *
12 */ 9 */
13 10
@@ -76,16 +73,6 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
76 73
77void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig); 74void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig);
78 75
79#ifdef CONFIG_JFFS2_PROC
80int jffs2_enable_compressor_name(const char *name);
81int jffs2_disable_compressor_name(const char *name);
82int jffs2_set_compression_mode_name(const char *mode_name);
83char *jffs2_get_compression_mode_name(void);
84int jffs2_set_compressor_priority(const char *mode_name, int priority);
85char *jffs2_list_compressors(void);
86char *jffs2_stats(void);
87#endif
88
89/* Compressor modules */ 76/* Compressor modules */
90/* These functions will be called by jffs2_compressors_init/exit */ 77/* These functions will be called by jffs2_compressors_init/exit */
91 78
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 2eb1b7428d16..0d0bfd2e4e0d 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -1,13 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by Arjan van de Ven <arjanv@redhat.com> 6 * Created by Arjan van de Ven <arjanv@redhat.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: compr_rtime.c,v 1.14 2004/06/23 16:34:40 havasi Exp $
11 * 10 *
12 * 11 *
13 * Very simple lz77-ish encoder. 12 * Very simple lz77-ish encoder.
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index e792e675d624..ea0431e047d5 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -1,23 +1,94 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001, 2002 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by Arjan van de Ven <arjanv@redhat.com> 6 * Created by Arjan van de Ven <arjanv@redhat.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: compr_rubin.c,v 1.20 2004/06/23 16:34:40 havasi Exp $
11 *
12 */ 10 */
13 11
14#include <linux/string.h> 12#include <linux/string.h>
15#include <linux/types.h> 13#include <linux/types.h>
16#include <linux/jffs2.h> 14#include <linux/jffs2.h>
17#include "compr_rubin.h" 15#include <linux/errno.h>
18#include "histo_mips.h"
19#include "compr.h" 16#include "compr.h"
20 17
18
19#define RUBIN_REG_SIZE 16
20#define UPPER_BIT_RUBIN (((long) 1)<<(RUBIN_REG_SIZE-1))
21#define LOWER_BITS_RUBIN ((((long) 1)<<(RUBIN_REG_SIZE-1))-1)
22
23
24#define BIT_DIVIDER_MIPS 1043
25static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */
26
27#include <linux/errno.h>
28
29struct pushpull {
30 unsigned char *buf;
31 unsigned int buflen;
32 unsigned int ofs;
33 unsigned int reserve;
34};
35
36struct rubin_state {
37 unsigned long p;
38 unsigned long q;
39 unsigned long rec_q;
40 long bit_number;
41 struct pushpull pp;
42 int bit_divider;
43 int bits[8];
44};
45
46static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve)
47{
48 pp->buf = buf;
49 pp->buflen = buflen;
50 pp->ofs = ofs;
51 pp->reserve = reserve;
52}
53
54static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
55{
56 if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) {
57 return -ENOSPC;
58 }
59
60 if (bit) {
61 pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7)));
62 }
63 else {
64 pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7)));
65 }
66 pp->ofs++;
67
68 return 0;
69}
70
71static inline int pushedbits(struct pushpull *pp)
72{
73 return pp->ofs;
74}
75
76static inline int pullbit(struct pushpull *pp)
77{
78 int bit;
79
80 bit = (pp->buf[pp->ofs >> 3] >> (7-(pp->ofs & 7))) & 1;
81
82 pp->ofs++;
83 return bit;
84}
85
86static inline int pulledbits(struct pushpull *pp)
87{
88 return pp->ofs;
89}
90
91
21static void init_rubin(struct rubin_state *rs, int div, int *bits) 92static void init_rubin(struct rubin_state *rs, int div, int *bits)
22{ 93{
23 int c; 94 int c;
diff --git a/fs/jffs2/compr_rubin.h b/fs/jffs2/compr_rubin.h
deleted file mode 100644
index bf1a93451621..000000000000
--- a/fs/jffs2/compr_rubin.h
+++ /dev/null
@@ -1,21 +0,0 @@
1/* Rubin encoder/decoder header */
2/* work started at : aug 3, 1994 */
3/* last modification : aug 15, 1994 */
4/* $Id: compr_rubin.h,v 1.7 2005/11/07 11:14:38 gleixner Exp $ */
5
6#include "pushpull.h"
7
8#define RUBIN_REG_SIZE 16
9#define UPPER_BIT_RUBIN (((long) 1)<<(RUBIN_REG_SIZE-1))
10#define LOWER_BITS_RUBIN ((((long) 1)<<(RUBIN_REG_SIZE-1))-1)
11
12
13struct rubin_state {
14 unsigned long p;
15 unsigned long q;
16 unsigned long rec_q;
17 long bit_number;
18 struct pushpull pp;
19 int bit_divider;
20 int bits[8];
21};
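With the pushpull bit-stream helpers now inlined into compr_rubin.c (see the hunk above), an illustrative round trip through them; buffer size and bit pattern are arbitrary, and lengths are in bits, as pushbit()'s bounds check implies:

	#include <linux/errno.h>

	static int pushpull_demo(void)
	{
		unsigned char buf[4] = {0};
		struct pushpull pp;
		int i;

		/* Write eight alternating bits... */
		init_pushpull(&pp, (char *)buf, sizeof(buf) * 8, 0, 0);
		for (i = 0; i < 8; i++)
			if (pushbit(&pp, i & 1, 0))
				return -ENOSPC;	/* ran past buflen */

		/* ...then rewind and read them back. */
		init_pushpull(&pp, (char *)buf, sizeof(buf) * 8, 0, 0);
		for (i = 0; i < 8; i++)
			if (pullbit(&pp) != (i & 1))
				return -EINVAL;	/* mismatch */

		return 0;
	}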
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 0c1fc6e20b43..2b87fccc1557 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: compr_zlib.c,v 1.32 2005/11/07 11:14:38 gleixner Exp $
11 *
12 */ 10 */
13 11
14#if !defined(__KERNEL__) && !defined(__ECOS) 12#if !defined(__KERNEL__) && !defined(__ECOS)
diff --git a/fs/jffs2/comprtest.c b/fs/jffs2/comprtest.c
deleted file mode 100644
index f0fb8be7740c..000000000000
--- a/fs/jffs2/comprtest.c
+++ /dev/null
@@ -1,307 +0,0 @@
1/* $Id: comprtest.c,v 1.6 2005/11/07 11:14:38 gleixner Exp $ */
2
3#include <linux/kernel.h>
4#include <linux/string.h>
5#include <linux/module.h>
6#include <asm/types.h>
7#if 0
8#define TESTDATA_LEN 512
9static unsigned char testdata[TESTDATA_LEN] = {
10 0x7f, 0x45, 0x4c, 0x46, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
11 0x02, 0x00, 0x03, 0x00, 0x01, 0x00, 0x00, 0x00, 0x60, 0x83, 0x04, 0x08, 0x34, 0x00, 0x00, 0x00,
12 0xb0, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x34, 0x00, 0x20, 0x00, 0x06, 0x00, 0x28, 0x00,
13 0x1e, 0x00, 0x1b, 0x00, 0x06, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x34, 0x80, 0x04, 0x08,
14 0x34, 0x80, 0x04, 0x08, 0xc0, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
15 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, 0xf4, 0x80, 0x04, 0x08,
16 0xf4, 0x80, 0x04, 0x08, 0x13, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
17 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x04, 0x08,
18 0x00, 0x80, 0x04, 0x08, 0x0d, 0x05, 0x00, 0x00, 0x0d, 0x05, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
19 0x00, 0x10, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x05, 0x00, 0x00, 0x10, 0x95, 0x04, 0x08,
20 0x10, 0x95, 0x04, 0x08, 0xe8, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
21 0x00, 0x10, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x58, 0x05, 0x00, 0x00, 0x58, 0x95, 0x04, 0x08,
22 0x58, 0x95, 0x04, 0x08, 0xa0, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
23 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x08, 0x81, 0x04, 0x08,
24 0x08, 0x81, 0x04, 0x08, 0x20, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
25 0x04, 0x00, 0x00, 0x00, 0x2f, 0x6c, 0x69, 0x62, 0x2f, 0x6c, 0x64, 0x2d, 0x6c, 0x69, 0x6e, 0x75,
26 0x78, 0x2e, 0x73, 0x6f, 0x2e, 0x32, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
27 0x01, 0x00, 0x00, 0x00, 0x47, 0x4e, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
28 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
29 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
30 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
31 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
32 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00,
33 0x0c, 0x83, 0x04, 0x08, 0x81, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00,
34 0x1c, 0x83, 0x04, 0x08, 0xac, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00,
35 0x2c, 0x83, 0x04, 0x08, 0xdd, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00,
36 0x3c, 0x83, 0x04, 0x08, 0x2e, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
37 0x4c, 0x83, 0x04, 0x08, 0x7d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00,
38 0x00, 0x85, 0x04, 0x08, 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x0e, 0x00, 0x01, 0x00, 0x00, 0x00,
39 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x5f, 0x5f, 0x67,
40 0x6d, 0x6f, 0x6e, 0x5f, 0x73, 0x74, 0x61, 0x72, 0x74, 0x5f, 0x5f, 0x00, 0x6c, 0x69, 0x62, 0x63,
41 0x2e, 0x73, 0x6f, 0x2e, 0x36, 0x00, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x00, 0x5f, 0x5f, 0x63};
42#else
43#define TESTDATA_LEN 3481
44static unsigned char testdata[TESTDATA_LEN] = {
45 0x23, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x20, 0x22, 0x64, 0x62, 0x65, 0x6e, 0x63, 0x68,
46 0x2e, 0x68, 0x22, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x41, 0x58,
47 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x20, 0x31, 0x30, 0x30, 0x30, 0x0a, 0x0a, 0x73, 0x74, 0x61,
48 0x74, 0x69, 0x63, 0x20, 0x63, 0x68, 0x61, 0x72, 0x20, 0x62, 0x75, 0x66, 0x5b, 0x37, 0x30, 0x30,
49 0x30, 0x30, 0x5d, 0x3b, 0x0a, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x69, 0x6e, 0x74, 0x20,
50 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x73, 0x74, 0x61,
51 0x74, 0x69, 0x63, 0x20, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x7b, 0x0a, 0x09, 0x69, 0x6e,
52 0x74, 0x20, 0x66, 0x64, 0x3b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c,
53 0x65, 0x3b, 0x0a, 0x7d, 0x20, 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x4d, 0x41, 0x58, 0x5f,
54 0x46, 0x49, 0x4c, 0x45, 0x53, 0x5d, 0x3b, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f,
55 0x5f, 0x75, 0x6e, 0x6c, 0x69, 0x6e, 0x6b, 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e,
56 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72,
57 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x75,
58 0x6e, 0x6c, 0x69, 0x6e, 0x6b, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x21, 0x3d, 0x20,
59 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28,
60 0x25, 0x64, 0x29, 0x20, 0x75, 0x6e, 0x6c, 0x69, 0x6e, 0x6b, 0x20, 0x25, 0x73, 0x20, 0x66, 0x61,
61 0x69, 0x6c, 0x65, 0x64, 0x20, 0x28, 0x25, 0x73, 0x29, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09,
62 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75,
63 0x6e, 0x74, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72,
64 0x72, 0x6f, 0x72, 0x28, 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a,
65 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x5f, 0x66,
66 0x69, 0x6c, 0x65, 0x28, 0x69, 0x6e, 0x74, 0x20, 0x66, 0x64, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20,
67 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x73, 0x3b, 0x0a,
68 0x09, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x7b, 0x0a,
69 0x09, 0x09, 0x73, 0x20, 0x3d, 0x20, 0x4d, 0x49, 0x4e, 0x28, 0x73, 0x69, 0x7a, 0x65, 0x6f, 0x66,
70 0x28, 0x62, 0x75, 0x66, 0x29, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x09,
71 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x66, 0x64, 0x2c, 0x20, 0x62, 0x75, 0x66, 0x2c, 0x20, 0x73,
72 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x2d, 0x3d, 0x20, 0x73, 0x3b, 0x0a,
73 0x09, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x6f, 0x70,
74 0x65, 0x6e, 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20,
75 0x69, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20,
76 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x66, 0x64, 0x2c,
77 0x20, 0x69, 0x3b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x20, 0x3d,
78 0x20, 0x4f, 0x5f, 0x52, 0x44, 0x57, 0x52, 0x7c, 0x4f, 0x5f, 0x43, 0x52, 0x45, 0x41, 0x54, 0x3b,
79 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x73, 0x74, 0x61, 0x74, 0x20, 0x73, 0x74,
80 0x3b, 0x0a, 0x09, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f,
81 0x75, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28,
82 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x73, 0x69,
83 0x7a, 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x20, 0x7c,
84 0x3d, 0x20, 0x4f, 0x5f, 0x54, 0x52, 0x55, 0x4e, 0x43, 0x3b, 0x0a, 0x0a, 0x09, 0x66, 0x64, 0x20,
85 0x3d, 0x20, 0x6f, 0x70, 0x65, 0x6e, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x66, 0x6c,
86 0x61, 0x67, 0x73, 0x2c, 0x20, 0x30, 0x36, 0x30, 0x30, 0x29, 0x3b, 0x0a, 0x09, 0x69, 0x66, 0x20,
87 0x28, 0x66, 0x64, 0x20, 0x3d, 0x3d, 0x20, 0x2d, 0x31, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70,
88 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x6f, 0x70, 0x65, 0x6e,
89 0x20, 0x25, 0x73, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x68,
90 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x25, 0x64, 0x20, 0x28, 0x25, 0x73, 0x29, 0x5c, 0x6e, 0x22,
91 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65,
92 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x68,
93 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28,
94 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74, 0x75, 0x72,
95 0x6e, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x66, 0x73, 0x74, 0x61, 0x74, 0x28, 0x66, 0x64, 0x2c,
96 0x20, 0x26, 0x73, 0x74, 0x29, 0x3b, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x73, 0x69, 0x7a, 0x65,
97 0x20, 0x3e, 0x20, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x7b,
98 0x0a, 0x23, 0x69, 0x66, 0x20, 0x44, 0x45, 0x42, 0x55, 0x47, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69,
99 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64,
100 0x69, 0x6e, 0x67, 0x20, 0x25, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x25, 0x64, 0x20, 0x66, 0x72, 0x6f,
101 0x6d, 0x20, 0x25, 0x64, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20,
102 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x66,
103 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x28, 0x69, 0x6e, 0x74,
104 0x29, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x23, 0x65,
105 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x09, 0x09, 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x5f, 0x66, 0x69,
106 0x6c, 0x65, 0x28, 0x66, 0x64, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x2d, 0x20, 0x73, 0x74,
107 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x20, 0x65, 0x6c,
108 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x3c, 0x20, 0x73, 0x74,
109 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72,
110 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x74, 0x72, 0x75, 0x6e, 0x63, 0x61, 0x74, 0x69, 0x6e, 0x67,
111 0x20, 0x25, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x25, 0x64, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x25,
112 0x64, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
113 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x28, 0x69, 0x6e,
114 0x74, 0x29, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09,
115 0x09, 0x66, 0x74, 0x72, 0x75, 0x6e, 0x63, 0x61, 0x74, 0x65, 0x28, 0x66, 0x64, 0x2c, 0x20, 0x73,
116 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69,
117 0x3d, 0x30, 0x3b, 0x69, 0x3c, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x3b, 0x69,
118 0x2b, 0x2b, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x66, 0x74, 0x61, 0x62,
119 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d, 0x3d, 0x20,
120 0x30, 0x29, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x69, 0x66,
121 0x20, 0x28, 0x69, 0x20, 0x3d, 0x3d, 0x20, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53,
122 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x66, 0x69,
123 0x6c, 0x65, 0x20, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x20, 0x66, 0x75, 0x6c, 0x6c, 0x20, 0x66, 0x6f,
124 0x72, 0x20, 0x25, 0x73, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x3b,
125 0x0a, 0x09, 0x09, 0x65, 0x78, 0x69, 0x74, 0x28, 0x31, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09,
126 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65,
127 0x20, 0x3d, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x3b, 0x0a, 0x09, 0x66, 0x74, 0x61, 0x62,
128 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66, 0x64, 0x20, 0x3d, 0x20, 0x66, 0x64, 0x3b, 0x0a, 0x09,
129 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2b, 0x2b, 0x20, 0x25, 0x20, 0x31, 0x30,
130 0x30, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e,
131 0x74, 0x66, 0x28, 0x22, 0x2e, 0x22, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x76,
132 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x69, 0x6e, 0x74,
133 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x73, 0x69, 0x7a,
134 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x29, 0x0a, 0x7b,
135 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x62,
136 0x75, 0x66, 0x5b, 0x30, 0x5d, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x6d, 0x65, 0x6d, 0x73,
137 0x65, 0x74, 0x28, 0x62, 0x75, 0x66, 0x2c, 0x20, 0x31, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x6f,
138 0x66, 0x28, 0x62, 0x75, 0x66, 0x29, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x66, 0x6f, 0x72, 0x20, 0x28,
139 0x69, 0x3d, 0x30, 0x3b, 0x69, 0x3c, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x3b,
140 0x69, 0x2b, 0x2b, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x66, 0x74, 0x61,
141 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d, 0x3d,
142 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a,
143 0x09, 0x7d, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3d, 0x3d, 0x20, 0x4d, 0x41, 0x58,
144 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x29, 0x20, 0x7b, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x31, 0x0a,
145 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x64,
146 0x6f, 0x5f, 0x77, 0x72, 0x69, 0x74, 0x65, 0x3a, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20,
147 0x25, 0x64, 0x20, 0x77, 0x61, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x6f, 0x70, 0x65, 0x6e, 0x20,
148 0x73, 0x69, 0x7a, 0x65, 0x3d, 0x25, 0x64, 0x20, 0x6f, 0x66, 0x73, 0x3d, 0x25, 0x64, 0x5c, 0x6e,
149 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e,
150 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c,
151 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x29, 0x3b, 0x0a,
152 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b,
153 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x6c, 0x73, 0x65, 0x65, 0x6b, 0x28, 0x66, 0x74, 0x61, 0x62, 0x6c,
154 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66, 0x64, 0x2c, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x2c,
155 0x20, 0x53, 0x45, 0x45, 0x4b, 0x5f, 0x53, 0x45, 0x54, 0x29, 0x3b, 0x0a, 0x09, 0x69, 0x66, 0x20,
156 0x28, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d,
157 0x2e, 0x66, 0x64, 0x2c, 0x20, 0x62, 0x75, 0x66, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20,
158 0x21, 0x3d, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69,
159 0x6e, 0x74, 0x66, 0x28, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65,
160 0x64, 0x20, 0x6f, 0x6e, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x25, 0x64, 0x5c, 0x6e,
161 0x22, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d,
162 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x72, 0x65, 0x61, 0x64, 0x28, 0x69,
163 0x6e, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x73,
164 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x29,
165 0x0a, 0x7b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x0a, 0x09, 0x66, 0x6f, 0x72, 0x20,
166 0x28, 0x69, 0x3d, 0x30, 0x3b, 0x69, 0x3c, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53,
167 0x3b, 0x69, 0x2b, 0x2b, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x66, 0x74,
168 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d,
169 0x3d, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b,
170 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3d, 0x3d, 0x20, 0x4d, 0x41,
171 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69,
172 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x64, 0x6f, 0x5f, 0x72, 0x65, 0x61,
173 0x64, 0x3a, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x25, 0x64, 0x20, 0x77, 0x61, 0x73,
174 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x6f, 0x70, 0x65, 0x6e, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x3d, 0x25,
175 0x64, 0x20, 0x6f, 0x66, 0x73, 0x3d, 0x25, 0x64, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09,
176 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e,
177 0x74, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c,
178 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74, 0x75,
179 0x72, 0x6e, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x6c, 0x73, 0x65, 0x65, 0x6b, 0x28, 0x66, 0x74,
180 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66, 0x64, 0x2c, 0x20, 0x6f, 0x66, 0x66, 0x73,
181 0x65, 0x74, 0x2c, 0x20, 0x53, 0x45, 0x45, 0x4b, 0x5f, 0x53, 0x45, 0x54, 0x29, 0x3b, 0x0a, 0x09,
182 0x72, 0x65, 0x61, 0x64, 0x28, 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66,
183 0x64, 0x2c, 0x20, 0x62, 0x75, 0x66, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x7d,
184 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x63, 0x6c, 0x6f, 0x73, 0x65, 0x28,
185 0x69, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x69,
186 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x0a, 0x09, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, 0x3d, 0x30, 0x3b,
187 0x69, 0x3c, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x3b, 0x69, 0x2b, 0x2b, 0x29,
188 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b,
189 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x68, 0x61, 0x6e,
190 0x64, 0x6c, 0x65, 0x29, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09,
191 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3d, 0x3d, 0x20, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c,
192 0x45, 0x53, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22,
193 0x28, 0x25, 0x64, 0x29, 0x20, 0x64, 0x6f, 0x5f, 0x63, 0x6c, 0x6f, 0x73, 0x65, 0x3a, 0x20, 0x68,
194 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x25, 0x64, 0x20, 0x77, 0x61, 0x73, 0x20, 0x6e, 0x6f, 0x74,
195 0x20, 0x6f, 0x70, 0x65, 0x6e, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20,
196 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20,
197 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74, 0x75, 0x72,
198 0x6e, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x63, 0x6c, 0x6f, 0x73, 0x65, 0x28, 0x66, 0x74, 0x61,
199 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66, 0x64, 0x29, 0x3b, 0x0a, 0x09, 0x66, 0x74, 0x61,
200 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d, 0x20,
201 0x30, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x6d, 0x6b,
202 0x64, 0x69, 0x72, 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29,
203 0x0a, 0x7b, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28, 0x66, 0x6e, 0x61,
204 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x6b, 0x64, 0x69, 0x72,
205 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x30, 0x37, 0x30, 0x30, 0x29, 0x20, 0x21, 0x3d,
206 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x44, 0x45, 0x42, 0x55, 0x47, 0x0a,
207 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x6d, 0x6b, 0x64, 0x69, 0x72, 0x20,
208 0x25, 0x73, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x28, 0x25, 0x73, 0x29, 0x5c, 0x6e,
209 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6e, 0x61,
210 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x65, 0x72, 0x72,
211 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x09, 0x7d, 0x0a,
212 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x72, 0x6d, 0x64, 0x69, 0x72,
213 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x7b, 0x0a,
214 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29,
215 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x72, 0x6d, 0x64, 0x69, 0x72, 0x28, 0x66, 0x6e,
216 0x61, 0x6d, 0x65, 0x29, 0x20, 0x21, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70,
217 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x72, 0x6d, 0x64, 0x69, 0x72, 0x20, 0x25, 0x73, 0x20,
218 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x28, 0x25, 0x73, 0x29, 0x5c, 0x6e, 0x22, 0x2c, 0x20,
219 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c,
220 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29,
221 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f,
222 0x5f, 0x72, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x6f, 0x6c,
223 0x64, 0x2c, 0x20, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x6e, 0x65, 0x77, 0x29, 0x0a, 0x7b, 0x0a,
224 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a,
225 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28, 0x6e, 0x65, 0x77, 0x29, 0x3b, 0x0a,
226 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x6f, 0x6c, 0x64,
227 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x29, 0x20, 0x21, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09,
228 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x72, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x20,
229 0x25, 0x73, 0x20, 0x25, 0x73, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x28, 0x25, 0x73,
230 0x29, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
231 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72,
232 0x6f, 0x72, 0x28, 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d,
233 0x0a, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x28,
234 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74,
235 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74,
236 0x20, 0x73, 0x74, 0x61, 0x74, 0x20, 0x73, 0x74, 0x3b, 0x0a, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75,
237 0x70, 0x70, 0x65, 0x72, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x69,
238 0x66, 0x20, 0x28, 0x73, 0x74, 0x61, 0x74, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x26,
239 0x73, 0x74, 0x29, 0x20, 0x21, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72,
240 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x64, 0x6f, 0x5f, 0x73, 0x74,
241 0x61, 0x74, 0x3a, 0x20, 0x25, 0x73, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x3d, 0x25, 0x64, 0x20, 0x25,
242 0x73, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
243 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d,
244 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72, 0x6f,
245 0x72, 0x28, 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74,
246 0x75, 0x72, 0x6e, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x53, 0x5f, 0x49,
247 0x53, 0x44, 0x49, 0x52, 0x28, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x29,
248 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28,
249 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x21, 0x3d, 0x20, 0x73, 0x69,
250 0x7a, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22,
251 0x28, 0x25, 0x64, 0x29, 0x20, 0x64, 0x6f, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x3a, 0x20, 0x25, 0x73,
252 0x20, 0x77, 0x72, 0x6f, 0x6e, 0x67, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x25, 0x64, 0x20, 0x25,
253 0x64, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
254 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d,
255 0x65, 0x2c, 0x20, 0x28, 0x69, 0x6e, 0x74, 0x29, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69,
256 0x7a, 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d, 0x0a,
257 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x28,
258 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74,
259 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x64, 0x6f, 0x5f, 0x6f, 0x70, 0x65,
260 0x6e, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x35, 0x30, 0x30, 0x30, 0x2c, 0x20, 0x73,
261 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x64, 0x6f, 0x5f, 0x63, 0x6c, 0x6f, 0x73, 0x65, 0x28,
262 0x35, 0x30, 0x30, 0x30, 0x29, 0x3b, 0x0a, 0x7d, 0x0a
263};
264#endif
265static unsigned char comprbuf[TESTDATA_LEN];
266static unsigned char decomprbuf[TESTDATA_LEN];
267
268int jffs2_decompress(unsigned char comprtype, unsigned char *cdata_in,
269 unsigned char *data_out, uint32_t cdatalen, uint32_t datalen);
270unsigned char jffs2_compress(unsigned char *data_in, unsigned char *cpage_out,
271 uint32_t *datalen, uint32_t *cdatalen);
272
273int init_module(void) {
274 unsigned char comprtype;
275 uint32_t c, d;
276 int ret;
277
278 printk("Original data: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
279 testdata[0],testdata[1],testdata[2],testdata[3],
280 testdata[4],testdata[5],testdata[6],testdata[7],
281 testdata[8],testdata[9],testdata[10],testdata[11],
282 testdata[12],testdata[13],testdata[14],testdata[15]);
283 d = TESTDATA_LEN;
284 c = TESTDATA_LEN;
285 comprtype = jffs2_compress(testdata, comprbuf, &d, &c);
286
287 printk("jffs2_compress used compression type %d. Compressed size %d, uncompressed size %d\n",
288 comprtype, c, d);
289 printk("Compressed data: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
290 comprbuf[0],comprbuf[1],comprbuf[2],comprbuf[3],
291 comprbuf[4],comprbuf[5],comprbuf[6],comprbuf[7],
292 comprbuf[8],comprbuf[9],comprbuf[10],comprbuf[11],
293 comprbuf[12],comprbuf[13],comprbuf[14],comprbuf[15]);
294
295 ret = jffs2_decompress(comprtype, comprbuf, decomprbuf, c, d);
296 printk("jffs2_decompress returned %d\n", ret);
297 printk("Decompressed data: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
298 decomprbuf[0],decomprbuf[1],decomprbuf[2],decomprbuf[3],
299 decomprbuf[4],decomprbuf[5],decomprbuf[6],decomprbuf[7],
300 decomprbuf[8],decomprbuf[9],decomprbuf[10],decomprbuf[11],
301 decomprbuf[12],decomprbuf[13],decomprbuf[14],decomprbuf[15]);
302 if (memcmp(decomprbuf, testdata, d))
303 printk("Compression and decompression corrupted data\n");
304 else
305 printk("Compression good for %d bytes\n", d);
306 return 1;
307}
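
For reference, the testdata[] array above is plain ASCII: it encodes the C source of a small dbench-style client (the do_open/do_write/do_read/do_close/do_mkdir/do_rmdir/do_rename/do_stat/do_create routines visible in the byte values), so the compression test runs against realistic, compressible input. A minimal userspace sketch to dump it back out as readable source; the helper itself is hypothetical and not part of the tree:

/* decode_testdata.c (hypothetical helper): print the embedded ASCII
 * bytes of testdata[] so the client source can be read directly.
 * Assumes it is compiled together with the array definition. */
#include <stdio.h>

extern const unsigned char testdata[];
extern const unsigned int testdata_len;	/* stand-in for TESTDATA_LEN */

int main(void)
{
	fwrite(testdata, 1, testdata_len, stdout);
	return 0;
}
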
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 4189e4a36050..3a32c64ed497 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -1,15 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: debug.c,v 1.12 2005/11/07 11:14:39 gleixner Exp $
11 *
12 */ 10 */
11
13#include <linux/kernel.h> 12#include <linux/kernel.h>
14#include <linux/types.h> 13#include <linux/types.h>
15#include <linux/pagemap.h> 14#include <linux/pagemap.h>
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index f89c85d5a3f8..2a49f2c51a9f 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -1,15 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: debug.h,v 1.21 2005/11/07 11:14:39 gleixner Exp $
11 *
12 */ 10 */
11
13#ifndef _JFFS2_DEBUG_H_ 12#ifndef _JFFS2_DEBUG_H_
14#define _JFFS2_DEBUG_H_ 13#define _JFFS2_DEBUG_H_
15 14
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 9fa2e27f0641..c1dfca310dd6 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: dir.c,v 1.90 2005/11/07 11:14:39 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index ad0121088dde..66e7c2f1e644 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: erase.c,v 1.85 2005/09/20 14:53:15 dedekind Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -333,7 +331,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
333 331
334 *bad_offset = ofs; 332 *bad_offset = ofs;
335 333
336 ret = jffs2_flash_read(c, ofs, readlen, &retlen, ebuf); 334 ret = c->mtd->read(c->mtd, ofs, readlen, &retlen, ebuf);
337 if (ret) { 335 if (ret) {
338 printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret); 336 printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret);
339 goto fail; 337 goto fail;
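
Switching from jffs2_flash_read() to the raw c->mtd->read() here presumably ensures the post-erase check reads the device itself rather than data satisfied from the JFFS2 write-buffer layer. As a sketch of how that raw MTD hook of this era is driven (the wrapper function is illustrative, not from the patch):

/* Sketch: raw MTD read, bypassing the JFFS2 wbuf layer. The hook
 * returns 0 or -errno and reports the bytes read via *retlen. */
static int raw_flash_read(struct jffs2_sb_info *c, loff_t ofs,
			  size_t len, unsigned char *buf)
{
	size_t retlen;
	int err = c->mtd->read(c->mtd, ofs, len, &retlen, buf);

	if (!err && retlen != len)
		err = -EIO;	/* treat a short read as failure */
	return err;
}
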
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e82eeaf7590d..99871279a1ed 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: file.c,v 1.104 2005/10/18 23:29:35 tpoynor Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index abb90c0c09cc..1d3b7a9fc828 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: fs.c,v 1.66 2005/09/27 13:17:29 dedekind Exp $
11 *
12 */ 10 */
13 11
14#include <linux/capability.h> 12#include <linux/capability.h>
@@ -672,6 +670,13 @@ static int jffs2_flash_setup(struct jffs2_sb_info *c) {
672 return ret; 670 return ret;
673 } 671 }
674 672
673 /* and an UBI volume */
674 if (jffs2_ubivol(c)) {
675 ret = jffs2_ubivol_setup(c);
676 if (ret)
677 return ret;
678 }
679
675 return ret; 680 return ret;
676} 681}
677 682
@@ -690,4 +695,9 @@ void jffs2_flash_cleanup(struct jffs2_sb_info *c) {
690 if (jffs2_nor_wbuf_flash(c)) { 695 if (jffs2_nor_wbuf_flash(c)) {
691 jffs2_nor_wbuf_flash_cleanup(c); 696 jffs2_nor_wbuf_flash_cleanup(c);
692 } 697 }
698
699 /* and an UBI volume */
700 if (jffs2_ubivol(c)) {
701 jffs2_ubivol_cleanup(c);
702 }
693} 703}
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 3a3cf225981f..2d99e06ab223 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: gc.c,v 1.155 2005/11/07 11:14:39 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -144,7 +142,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
144 c->unchecked_size); 142 c->unchecked_size);
145 jffs2_dbg_dump_block_lists_nolock(c); 143 jffs2_dbg_dump_block_lists_nolock(c);
146 spin_unlock(&c->erase_completion_lock); 144 spin_unlock(&c->erase_completion_lock);
147 BUG(); 145 up(&c->alloc_sem);
146 return -ENOSPC;
148 } 147 }
149 148
150 spin_unlock(&c->erase_completion_lock); 149 spin_unlock(&c->erase_completion_lock);
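
The BUG() here used to take the whole kernel down when the garbage collector found nothing collectable; returning -ENOSPC instead lets the caller fail the allocation gracefully. Note the added up(&c->alloc_sem): jffs2_garbage_collect_pass() takes that semaphore near its start, so every new early return must release it. A generic sketch of the shape of such an error path (the ctx struct and predicate are invented for illustration):

/* Sketch: release locks in reverse order of acquisition on the new
 * error path instead of calling BUG(). 'ctx' and the predicate are
 * illustrative stand-ins, not the actual gc.c code. */
struct ctx {
	struct semaphore alloc_sem;
	spinlock_t erase_completion_lock;
};

static int nothing_collectable(struct ctx *c);	/* hypothetical */

static int collect_pass(struct ctx *c)
{
	down(&c->alloc_sem);
	spin_lock(&c->erase_completion_lock);

	if (nothing_collectable(c)) {
		spin_unlock(&c->erase_completion_lock);
		up(&c->alloc_sem);	/* previously: BUG() */
		return -ENOSPC;
	}
	spin_unlock(&c->erase_completion_lock);

	/* ... do the actual garbage collection ... */
	up(&c->alloc_sem);
	return 0;
}
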
diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c
index 69099835de1c..f4d525b0ea53 100644
--- a/fs/jffs2/ioctl.c
+++ b/fs/jffs2/ioctl.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: ioctl.c,v 1.10 2005/11/07 11:14:40 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/fs.h> 12#include <linux/fs.h>
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 3a566077ac95..0b78fdc9773b 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -1,4 +1,13 @@
1/* $Id: jffs2_fs_i.h,v 1.19 2005/11/07 11:14:52 gleixner Exp $ */ 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2.
3 *
4 * Copyright © 2001-2007 Red Hat, Inc.
5 *
6 * Created by David Woodhouse <dwmw2@infradead.org>
7 *
8 * For licensing information, see the file 'LICENCE' in this directory.
9 *
10 */
2 11
3#ifndef _JFFS2_FS_I 12#ifndef _JFFS2_FS_I
4#define _JFFS2_FS_I 13#define _JFFS2_FS_I
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index ea88f69af130..b13298a824ed 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -1,4 +1,13 @@
1/* $Id: jffs2_fs_sb.h,v 1.54 2005/09/21 13:37:34 dedekind Exp $ */ 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2.
3 *
4 * Copyright © 2001-2007 Red Hat, Inc.
5 *
6 * Created by David Woodhouse <dwmw2@infradead.org>
7 *
8 * For licensing information, see the file 'LICENCE' in this directory.
9 *
10 */
2 11
3#ifndef _JFFS2_FS_SB 12#ifndef _JFFS2_FS_SB
4#define _JFFS2_FS_SB 13#define _JFFS2_FS_SB
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 83f9881ec4cc..35c1a5e30ba1 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: malloc.c,v 1.31 2005/11/07 11:14:40 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 5a6b4d64206c..4bf86088b3ae 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: nodelist.c,v 1.115 2005/11/07 11:14:40 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -54,7 +52,7 @@ void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new
54 *prev = new; 52 *prev = new;
55} 53}
56 54
57void jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint32_t size) 55uint32_t jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint32_t size)
58{ 56{
59 struct jffs2_node_frag *frag = jffs2_lookup_node_frag(list, size); 57 struct jffs2_node_frag *frag = jffs2_lookup_node_frag(list, size);
60 58
@@ -76,18 +74,24 @@ void jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint
76 } 74 }
77 75
78 if (size == 0) 76 if (size == 0)
79 return; 77 return 0;
80 78
81 /*
82 * If the last fragment starts at the RAM page boundary, it is
83 * REF_PRISTINE irrespective of its size.
84 */
85 frag = frag_last(list); 79 frag = frag_last(list);
80
81 /* Sanity check for truncation to longer than we started with... */
82 if (!frag)
83 return 0;
84 if (frag->ofs + frag->size < size)
85 return frag->ofs + frag->size;
86
87 /* If the last fragment starts at the RAM page boundary, it is
88 * REF_PRISTINE irrespective of its size. */
86 if (frag->node && (frag->ofs & (PAGE_CACHE_SIZE - 1)) == 0) { 89 if (frag->node && (frag->ofs & (PAGE_CACHE_SIZE - 1)) == 0) {
87 dbg_fragtree2("marking the last fragment 0x%08x-0x%08x REF_PRISTINE.\n", 90 dbg_fragtree2("marking the last fragment 0x%08x-0x%08x REF_PRISTINE.\n",
88 frag->ofs, frag->ofs + frag->size); 91 frag->ofs, frag->ofs + frag->size);
89 frag->node->raw->flash_offset = ref_offset(frag->node->raw) | REF_PRISTINE; 92 frag->node->raw->flash_offset = ref_offset(frag->node->raw) | REF_PRISTINE;
90 } 93 }
94 return size;
91} 95}
92 96
93static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, 97static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c,
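
With this change jffs2_truncate_fragtree() reports the size the tree actually ends at: 0 for a full truncation, frag->ofs + frag->size if the caller asked for more data than exists, otherwise the requested size. A hypothetical caller-side sketch (the function and its use are invented for illustration):

/* Sketch: let the fragtree, not the requested size, decide the new
 * authoritative data length after truncation. */
static uint32_t truncate_inode_data(struct jffs2_sb_info *c,
				    struct jffs2_inode_info *f,
				    uint32_t requested)
{
	uint32_t got = jffs2_truncate_fragtree(c, &f->fragtree, requested);

	if (got != requested)
		dbg_fragtree("fragtree ends at %#x, not %#x\n",
			     got, requested);
	return got;
}
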
@@ -397,466 +401,6 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in
397 return 0; 401 return 0;
398} 402}
399 403
400/*
401 * Check the data CRC of the node.
402 *
403 * Returns: 0 if the data CRC is correct;
404 * 1 - if incorrect;
405 * error code if an error occurred.
406 */
407static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
408{
409 struct jffs2_raw_node_ref *ref = tn->fn->raw;
410 int err = 0, pointed = 0;
411 struct jffs2_eraseblock *jeb;
412 unsigned char *buffer;
413 uint32_t crc, ofs, len;
414 size_t retlen;
415
416 BUG_ON(tn->csize == 0);
417
418 if (!jffs2_is_writebuffered(c))
419 goto adj_acc;
420
421 /* Calculate how many bytes were already checked */
422 ofs = ref_offset(ref) + sizeof(struct jffs2_raw_inode);
423 len = ofs % c->wbuf_pagesize;
424 if (likely(len))
425 len = c->wbuf_pagesize - len;
426
427 if (len >= tn->csize) {
428 dbg_readinode("no need to check node at %#08x, data length %u, data starts at %#08x - it has already been checked.\n",
429 ref_offset(ref), tn->csize, ofs);
430 goto adj_acc;
431 }
432
433 ofs += len;
434 len = tn->csize - len;
435
436 dbg_readinode("check node at %#08x, data length %u, partial CRC %#08x, correct CRC %#08x, data starts at %#08x, start checking from %#08x - %u bytes.\n",
437 ref_offset(ref), tn->csize, tn->partial_crc, tn->data_crc, ofs - len, ofs, len);
438
439#ifndef __ECOS
440 /* TODO: instead, encapsulate the point() stuff in jffs2_flash_read(),
441 * adding a jffs2_flash_read_end() interface. */
442 if (c->mtd->point) {
443 err = c->mtd->point(c->mtd, ofs, len, &retlen, &buffer);
444 if (!err && retlen < tn->csize) {
445 JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
446 c->mtd->unpoint(c->mtd, buffer, ofs, len);
447 } else if (err)
448 JFFS2_WARNING("MTD point failed: error code %d.\n", err);
449 else
450 pointed = 1; /* successfully pointed to device */
451 }
452#endif
453
454 if (!pointed) {
455 buffer = kmalloc(len, GFP_KERNEL);
456 if (unlikely(!buffer))
457 return -ENOMEM;
458
459 /* TODO: this is a very frequent pattern, make it a separate
460 * routine */
461 err = jffs2_flash_read(c, ofs, len, &retlen, buffer);
462 if (err) {
463 JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ofs, err);
464 goto free_out;
465 }
466
467 if (retlen != len) {
468 JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", ofs, retlen, len);
469 err = -EIO;
470 goto free_out;
471 }
472 }
473
474 /* Continue calculating CRC */
475 crc = crc32(tn->partial_crc, buffer, len);
476 if(!pointed)
477 kfree(buffer);
478#ifndef __ECOS
479 else
480 c->mtd->unpoint(c->mtd, buffer, ofs, len);
481#endif
482
483 if (crc != tn->data_crc) {
484 JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n",
485 ofs, tn->data_crc, crc);
486 return 1;
487 }
488
489adj_acc:
490 jeb = &c->blocks[ref->flash_offset / c->sector_size];
491 len = ref_totlen(c, jeb, ref);
492
493 /*
494 * Mark the node as having been checked and fix the
495 * accounting accordingly.
496 */
497 spin_lock(&c->erase_completion_lock);
498 jeb->used_size += len;
499 jeb->unchecked_size -= len;
500 c->used_size += len;
501 c->unchecked_size -= len;
502 spin_unlock(&c->erase_completion_lock);
503
504 return 0;
505
506free_out:
507 if(!pointed)
508 kfree(buffer);
509#ifndef __ECOS
510 else
511 c->mtd->unpoint(c->mtd, buffer, ofs, len);
512#endif
513 return err;
514}
515
516/*
517 * Helper function for jffs2_add_older_frag_to_fragtree().
518 *
519 * Checks the node if we are in the checking stage.
520 */
521static int check_node(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn)
522{
523 int ret;
524
525 BUG_ON(ref_obsolete(tn->fn->raw));
526
527 /* We only check the data CRC of unchecked nodes */
528 if (ref_flags(tn->fn->raw) != REF_UNCHECKED)
529 return 0;
530
531 dbg_fragtree2("check node %#04x-%#04x, phys offs %#08x.\n",
532 tn->fn->ofs, tn->fn->ofs + tn->fn->size, ref_offset(tn->fn->raw));
533
534 ret = check_node_data(c, tn);
535 if (unlikely(ret < 0)) {
536 JFFS2_ERROR("check_node_data() returned error: %d.\n",
537 ret);
538 } else if (unlikely(ret > 0)) {
539 dbg_fragtree2("CRC error, mark it obsolete.\n");
540 jffs2_mark_node_obsolete(c, tn->fn->raw);
541 }
542
543 return ret;
544}
545
546/*
547 * Helper function for jffs2_add_older_frag_to_fragtree().
548 *
549 * Called when the new fragment that is being inserted
550 * splits a hole fragment.
551 */
552static int split_hole(struct jffs2_sb_info *c, struct rb_root *root,
553 struct jffs2_node_frag *newfrag, struct jffs2_node_frag *hole)
554{
555 dbg_fragtree2("fragment %#04x-%#04x splits the hole %#04x-%#04x\n",
556 newfrag->ofs, newfrag->ofs + newfrag->size, hole->ofs, hole->ofs + hole->size);
557
558 if (hole->ofs == newfrag->ofs) {
559 /*
560 * Well, the new fragment actually starts at the same offset as
561 * the hole.
562 */
563 if (hole->ofs + hole->size > newfrag->ofs + newfrag->size) {
564 /*
565 * We replace the overlapped left part of the hole by
566 * the new node.
567 */
568
569 dbg_fragtree2("insert fragment %#04x-%#04x and cut the left part of the hole\n",
570 newfrag->ofs, newfrag->ofs + newfrag->size);
571 rb_replace_node(&hole->rb, &newfrag->rb, root);
572
573 hole->ofs += newfrag->size;
574 hole->size -= newfrag->size;
575
576 /*
577 * We know that 'hole' should be the right hand
578 * fragment.
579 */
580 jffs2_fragtree_insert(hole, newfrag);
581 rb_insert_color(&hole->rb, root);
582 } else {
583 /*
584 * Ah, the new fragment is of the same size as the hole.
585 * Replace the hole with it.
586 */
587 dbg_fragtree2("insert fragment %#04x-%#04x and overwrite hole\n",
588 newfrag->ofs, newfrag->ofs + newfrag->size);
589 rb_replace_node(&hole->rb, &newfrag->rb, root);
590 jffs2_free_node_frag(hole);
591 }
592 } else {
593 /* The new fragment leaves some hole space at the left */
594
595 struct jffs2_node_frag * newfrag2 = NULL;
596
597 if (hole->ofs + hole->size > newfrag->ofs + newfrag->size) {
598 /* The new frag also leaves some space at the right */
599 newfrag2 = new_fragment(NULL, newfrag->ofs +
600 newfrag->size, hole->ofs + hole->size
601 - newfrag->ofs - newfrag->size);
602 if (unlikely(!newfrag2)) {
603 jffs2_free_node_frag(newfrag);
604 return -ENOMEM;
605 }
606 }
607
608 hole->size = newfrag->ofs - hole->ofs;
609 dbg_fragtree2("left the hole %#04x-%#04x at the left and inserd fragment %#04x-%#04x\n",
610 hole->ofs, hole->ofs + hole->size, newfrag->ofs, newfrag->ofs + newfrag->size);
611
612 jffs2_fragtree_insert(newfrag, hole);
613 rb_insert_color(&newfrag->rb, root);
614
615 if (newfrag2) {
616 dbg_fragtree2("left the hole %#04x-%#04x at the right\n",
617 newfrag2->ofs, newfrag2->ofs + newfrag2->size);
618 jffs2_fragtree_insert(newfrag2, newfrag);
619 rb_insert_color(&newfrag2->rb, root);
620 }
621 }
622
623 return 0;
624}
625
626/*
627 * This function is used when we build an inode. It expects the nodes to be passed
628 * in decreasing version order. The whole point of this is to improve the
629 * inode checking on NAND: we check the nodes' data CRC only when they are not
630 * obsoleted. Previously, the add_frag_to_fragtree() function was used and
631 * nodes were passed to it in increasing version order, and the CRCs of all
632 * nodes were checked.
633 *
634 * Note: tn->fn->size shouldn't be zero.
635 *
636 * Returns 0 if the node was inserted
637 * 1 if it wasn't inserted (since it is obsolete)
638 * < 0 an if error occured
639 */
640int jffs2_add_older_frag_to_fragtree(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
641 struct jffs2_tmp_dnode_info *tn)
642{
643 struct jffs2_node_frag *this, *newfrag;
644 uint32_t lastend;
645 struct jffs2_full_dnode *fn = tn->fn;
646 struct rb_root *root = &f->fragtree;
647 uint32_t fn_size = fn->size, fn_ofs = fn->ofs;
648 int err, checked = 0;
649 int ref_flag;
650
651 dbg_fragtree("insert fragment %#04x-%#04x, ver %u\n", fn_ofs, fn_ofs + fn_size, tn->version);
652
653 /* Skip all the nodes which are completed before this one starts */
654 this = jffs2_lookup_node_frag(root, fn_ofs);
655 if (this)
656 dbg_fragtree2("'this' found %#04x-%#04x (%s)\n", this->ofs, this->ofs + this->size, this->node ? "data" : "hole");
657
658 if (this)
659 lastend = this->ofs + this->size;
660 else
661 lastend = 0;
662
663 /* Detect the preliminary type of node */
664 if (fn->size >= PAGE_CACHE_SIZE)
665 ref_flag = REF_PRISTINE;
666 else
667 ref_flag = REF_NORMAL;
668
669 /* See if we ran off the end of the root */
670 if (lastend <= fn_ofs) {
671 /* We did */
672
673 /*
674 * We are going to insert the new node into the
675 * fragment tree, so check it.
676 */
677 err = check_node(c, f, tn);
678 if (err != 0)
679 return err;
680
681 fn->frags = 1;
682
683 newfrag = new_fragment(fn, fn_ofs, fn_size);
684 if (unlikely(!newfrag))
685 return -ENOMEM;
686
687 err = no_overlapping_node(c, root, newfrag, this, lastend);
688 if (unlikely(err != 0)) {
689 jffs2_free_node_frag(newfrag);
690 return err;
691 }
692
693 goto out_ok;
694 }
695
696 fn->frags = 0;
697
698 while (1) {
699 /*
700 * Here we have:
701 * fn_ofs < this->ofs + this->size && fn_ofs >= this->ofs.
702 *
703 * Remember, 'this' has higher version, any non-hole node
704 * which is already in the fragtree is newer than the newly
705 * inserted.
706 */
707 if (!this->node) {
708 /*
709 * 'this' is the hole fragment, so at least the
710 * beginning of the new fragment is valid.
711 */
712
713 /*
714 * We are going to insert the new node into the
715 * fragment tree, so check it.
716 */
717 if (!checked) {
718 err = check_node(c, f, tn);
719 if (unlikely(err != 0))
720 return err;
721 checked = 1;
722 }
723
724 if (this->ofs + this->size >= fn_ofs + fn_size) {
725 /* We split the hole on two parts */
726
727 fn->frags += 1;
728 newfrag = new_fragment(fn, fn_ofs, fn_size);
729 if (unlikely(!newfrag))
730 return -ENOMEM;
731
732 err = split_hole(c, root, newfrag, this);
733 if (unlikely(err))
734 return err;
735 goto out_ok;
736 }
737
738 /*
739 * The beginning of the new fragment is valid since it
740 * overlaps the hole node.
741 */
742
743 ref_flag = REF_NORMAL;
744
745 fn->frags += 1;
746 newfrag = new_fragment(fn, fn_ofs,
747 this->ofs + this->size - fn_ofs);
748 if (unlikely(!newfrag))
749 return -ENOMEM;
750
751 if (fn_ofs == this->ofs) {
752 /*
753 * The new node starts at the same offset as
754 * the hole and supersedes it.
755 */
756 dbg_fragtree2("add the new fragment instead of hole %#04x-%#04x, refcnt %d\n",
757 fn_ofs, fn_ofs + this->ofs + this->size - fn_ofs, fn->frags);
758
759 rb_replace_node(&this->rb, &newfrag->rb, root);
760 jffs2_free_node_frag(this);
761 } else {
762 /*
763 * The hole becomes shorter as its right part
764 * is superseded by the new fragment.
765 */
766 dbg_fragtree2("reduce size of hole %#04x-%#04x to %#04x-%#04x\n",
767 this->ofs, this->ofs + this->size, this->ofs, this->ofs + this->size - newfrag->size);
768
769 dbg_fragtree2("add new fragment %#04x-%#04x, refcnt %d\n", fn_ofs,
770 fn_ofs + this->ofs + this->size - fn_ofs, fn->frags);
771
772 this->size -= newfrag->size;
773 jffs2_fragtree_insert(newfrag, this);
774 rb_insert_color(&newfrag->rb, root);
775 }
776
777 fn_ofs += newfrag->size;
778 fn_size -= newfrag->size;
779 this = rb_entry(rb_next(&newfrag->rb),
780 struct jffs2_node_frag, rb);
781
782 dbg_fragtree2("switch to the next 'this' fragment: %#04x-%#04x %s\n",
783 this->ofs, this->ofs + this->size, this->node ? "(data)" : "(hole)");
784 }
785
786 /*
787 * 'This' node is not the hole so it obsoletes the new fragment
788 * either fully or partially.
789 */
790 if (this->ofs + this->size >= fn_ofs + fn_size) {
791 /* The new node is obsolete, drop it */
792 if (fn->frags == 0) {
793 dbg_fragtree2("%#04x-%#04x is obsolete, mark it obsolete\n", fn_ofs, fn_ofs + fn_size);
794 ref_flag = REF_OBSOLETE;
795 }
796 goto out_ok;
797 } else {
798 struct jffs2_node_frag *new_this;
799
800 /* 'This' node obsoletes the beginning of the new node */
801 dbg_fragtree2("the beginning %#04x-%#04x is obsolete\n", fn_ofs, this->ofs + this->size);
802
803 ref_flag = REF_NORMAL;
804
805 fn_size -= this->ofs + this->size - fn_ofs;
806 fn_ofs = this->ofs + this->size;
807 dbg_fragtree2("now considering %#04x-%#04x\n", fn_ofs, fn_ofs + fn_size);
808
809 new_this = rb_entry(rb_next(&this->rb), struct jffs2_node_frag, rb);
810 if (!new_this) {
811 /*
812 * There is no next fragment. Add the rest of
813 * the new node as the right-hand child.
814 */
815 if (!checked) {
816 err = check_node(c, f, tn);
817 if (unlikely(err != 0))
818 return err;
819 checked = 1;
820 }
821
822 fn->frags += 1;
823 newfrag = new_fragment(fn, fn_ofs, fn_size);
824 if (unlikely(!newfrag))
825 return -ENOMEM;
826
827 dbg_fragtree2("there are no more fragments, insert %#04x-%#04x\n",
828 newfrag->ofs, newfrag->ofs + newfrag->size);
829 rb_link_node(&newfrag->rb, &this->rb, &this->rb.rb_right);
830 rb_insert_color(&newfrag->rb, root);
831 goto out_ok;
832 } else {
833 this = new_this;
834 dbg_fragtree2("switch to the next 'this' fragment: %#04x-%#04x %s\n",
835 this->ofs, this->ofs + this->size, this->node ? "(data)" : "(hole)");
836 }
837 }
838 }
839
840out_ok:
841 BUG_ON(fn->size < PAGE_CACHE_SIZE && ref_flag == REF_PRISTINE);
842
843 if (ref_flag == REF_OBSOLETE) {
844 dbg_fragtree2("the node is obsolete now\n");
845 /* jffs2_mark_node_obsolete() will adjust space accounting */
846 jffs2_mark_node_obsolete(c, fn->raw);
847 return 1;
848 }
849
850 dbg_fragtree2("the node is \"%s\" now\n", ref_flag == REF_NORMAL ? "REF_NORMAL" : "REF_PRISTINE");
851
852 /* Space accounting was adjusted by check_node_data() */
853 spin_lock(&c->erase_completion_lock);
854 fn->raw->flash_offset = ref_offset(fn->raw) | ref_flag;
855 spin_unlock(&c->erase_completion_lock);
856
857 return 0;
858}
859
860void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state) 404void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state)
861{ 405{
862 spin_lock(&c->inocache_lock); 406 spin_lock(&c->inocache_lock);
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 4178b4b55948..25126a062cae 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: nodelist.h,v 1.140 2005/09/07 08:34:54 havasi Exp $
11 *
12 */ 10 */
13 11
14#ifndef __JFFS2_NODELIST_H__ 12#ifndef __JFFS2_NODELIST_H__
@@ -40,6 +38,9 @@
40#define cpu_to_je32(x) ((jint32_t){x}) 38#define cpu_to_je32(x) ((jint32_t){x})
41#define cpu_to_jemode(x) ((jmode_t){os_to_jffs2_mode(x)}) 39#define cpu_to_jemode(x) ((jmode_t){os_to_jffs2_mode(x)})
42 40
41#define constant_cpu_to_je16(x) ((jint16_t){x})
42#define constant_cpu_to_je32(x) ((jint32_t){x})
43
43#define je16_to_cpu(x) ((x).v16) 44#define je16_to_cpu(x) ((x).v16)
44#define je32_to_cpu(x) ((x).v32) 45#define je32_to_cpu(x) ((x).v32)
45#define jemode_to_cpu(x) (jffs2_to_os_mode((x).m)) 46#define jemode_to_cpu(x) (jffs2_to_os_mode((x).m))
@@ -48,6 +49,9 @@
48#define cpu_to_je32(x) ((jint32_t){cpu_to_be32(x)}) 49#define cpu_to_je32(x) ((jint32_t){cpu_to_be32(x)})
49#define cpu_to_jemode(x) ((jmode_t){cpu_to_be32(os_to_jffs2_mode(x))}) 50#define cpu_to_jemode(x) ((jmode_t){cpu_to_be32(os_to_jffs2_mode(x))})
50 51
52#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_be16(x)})
53#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_be32(x)})
54
51#define je16_to_cpu(x) (be16_to_cpu(x.v16)) 55#define je16_to_cpu(x) (be16_to_cpu(x.v16))
52#define je32_to_cpu(x) (be32_to_cpu(x.v32)) 56#define je32_to_cpu(x) (be32_to_cpu(x.v32))
53#define jemode_to_cpu(x) (be32_to_cpu(jffs2_to_os_mode((x).m))) 57#define jemode_to_cpu(x) (be32_to_cpu(jffs2_to_os_mode((x).m)))
@@ -56,6 +60,9 @@
56#define cpu_to_je32(x) ((jint32_t){cpu_to_le32(x)}) 60#define cpu_to_je32(x) ((jint32_t){cpu_to_le32(x)})
57#define cpu_to_jemode(x) ((jmode_t){cpu_to_le32(os_to_jffs2_mode(x))}) 61#define cpu_to_jemode(x) ((jmode_t){cpu_to_le32(os_to_jffs2_mode(x))})
58 62
63#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_le16(x)})
64#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_le32(x)})
65
59#define je16_to_cpu(x) (le16_to_cpu(x.v16)) 66#define je16_to_cpu(x) (le16_to_cpu(x.v16))
60#define je32_to_cpu(x) (le32_to_cpu(x.v32)) 67#define je32_to_cpu(x) (le32_to_cpu(x.v32))
61#define jemode_to_cpu(x) (le32_to_cpu(jffs2_to_os_mode((x).m))) 68#define jemode_to_cpu(x) (le32_to_cpu(jffs2_to_os_mode((x).m)))
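
The new constant_cpu_to_je16/32 macros wrap the __constant_* byte-swap forms, so endian-converted values can appear in static initializers where the runtime helpers cannot. A hedged usage sketch (the initializer is illustrative, not part of this hunk):

/* Sketch: a compile-time endian-converted static node header, which
 * is exactly what the constant_cpu_to_je* variants make possible. */
static const struct jffs2_unknown_node example_marker = {
	.magic    = constant_cpu_to_je16(JFFS2_MAGIC_BITMASK),
	.nodetype = constant_cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER),
	.totlen   = constant_cpu_to_je32(8),
};
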
@@ -216,7 +223,20 @@ struct jffs2_tmp_dnode_info
216 uint32_t version; 223 uint32_t version;
217 uint32_t data_crc; 224 uint32_t data_crc;
218 uint32_t partial_crc; 225 uint32_t partial_crc;
219 uint32_t csize; 226 uint16_t csize;
227 uint16_t overlapped;
228};
229
230/* Temporary data structure used during readinode. */
231struct jffs2_readinode_info
232{
233 struct rb_root tn_root;
234 struct jffs2_tmp_dnode_info *mdata_tn;
235 uint32_t highest_version;
236 uint32_t latest_mctime;
237 uint32_t mctime_ver;
238 struct jffs2_full_dirent *fds;
239 struct jffs2_raw_node_ref *latest_ref;
220}; 240};
221 241
222struct jffs2_full_dirent 242struct jffs2_full_dirent
@@ -319,6 +339,15 @@ static inline struct jffs2_node_frag *frag_last(struct rb_root *root)
319#define frag_right(frag) rb_entry((frag)->rb.rb_right, struct jffs2_node_frag, rb) 339#define frag_right(frag) rb_entry((frag)->rb.rb_right, struct jffs2_node_frag, rb)
320#define frag_erase(frag, list) rb_erase(&frag->rb, list); 340#define frag_erase(frag, list) rb_erase(&frag->rb, list);
321 341
342#define tn_next(tn) rb_entry(rb_next(&(tn)->rb), struct jffs2_tmp_dnode_info, rb)
343#define tn_prev(tn) rb_entry(rb_prev(&(tn)->rb), struct jffs2_tmp_dnode_info, rb)
344#define tn_parent(tn) rb_entry(rb_parent(&(tn)->rb), struct jffs2_tmp_dnode_info, rb)
345#define tn_left(tn) rb_entry((tn)->rb.rb_left, struct jffs2_tmp_dnode_info, rb)
346#define tn_right(tn) rb_entry((tn)->rb.rb_right, struct jffs2_tmp_dnode_info, rb)
347#define tn_erase(tn, list) rb_erase(&tn->rb, list);
348#define tn_last(list) rb_entry(rb_last(list), struct jffs2_tmp_dnode_info, rb)
349#define tn_first(list) rb_entry(rb_first(list), struct jffs2_tmp_dnode_info, rb)
350
322/* nodelist.c */ 351/* nodelist.c */
323void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list); 352void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list);
324void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state); 353void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state);
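
The tn_* wrappers mirror the frag_* accessors just above them, giving typed rb-tree navigation over the temporary dnode info used during readinode. A sketch of an in-order walk with them (the walker is illustrative; it also assumes, as elsewhere in JFFS2, that the rb node is the struct's first member so rb_entry() of NULL stays NULL):

/* Sketch: ascending walk of a tmp_dnode_info tree using the
 * tn_first()/tn_next() wrappers added above. */
static void walk_tn_tree(struct rb_root *root)
{
	struct jffs2_tmp_dnode_info *tn;

	for (tn = tn_first(root); tn; tn = tn_next(tn))
		dbg_readinode("tn version %u\n", tn->version);
}
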
@@ -333,8 +362,7 @@ struct rb_node *rb_next(struct rb_node *);
333struct rb_node *rb_prev(struct rb_node *); 362struct rb_node *rb_prev(struct rb_node *);
334void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); 363void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root);
335int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn); 364int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
336void jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size); 365uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
337int jffs2_add_older_frag_to_fragtree(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn);
338struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, 366struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
339 struct jffs2_eraseblock *jeb, 367 struct jffs2_eraseblock *jeb,
340 uint32_t ofs, uint32_t len, 368 uint32_t ofs, uint32_t len,
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index d88376992ed9..dbc908ad622b 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: nodemgmt.c,v 1.127 2005/09/20 15:49:12 dedekind Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -172,6 +170,11 @@ int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
172static void jffs2_close_nextblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) 170static void jffs2_close_nextblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
173{ 171{
174 172
173 if (c->nextblock == NULL) {
174 D1(printk(KERN_DEBUG "jffs2_close_nextblock: Erase block at 0x%08x has already been placed in a list\n",
175 jeb->offset));
176 return;
177 }
175 /* Check, if we have a dirty block now, or if it was dirty already */ 178 /* Check, if we have a dirty block now, or if it was dirty already */
176 if (ISDIRTY (jeb->wasted_size + jeb->dirty_size)) { 179 if (ISDIRTY (jeb->wasted_size + jeb->dirty_size)) {
177 c->dirty_size += jeb->wasted_size; 180 c->dirty_size += jeb->wasted_size;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index e07a0edcdb4f..80daea96bbc2 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2002-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: os-linux.h,v 1.64 2005/09/30 13:59:13 dedekind Exp $
11 *
12 */ 10 */
13 11
14#ifndef __JFFS2_OS_LINUX_H__ 12#ifndef __JFFS2_OS_LINUX_H__
@@ -98,6 +96,9 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
98#define jffs2_nor_wbuf_flash(c) (0) 96#define jffs2_nor_wbuf_flash(c) (0)
99#define jffs2_nor_wbuf_flash_setup(c) (0) 97#define jffs2_nor_wbuf_flash_setup(c) (0)
100#define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0) 98#define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0)
99#define jffs2_ubivol(c) (0)
100#define jffs2_ubivol_setup(c) (0)
101#define jffs2_ubivol_cleanup(c) do {} while (0)
101 102
102#else /* NAND and/or ECC'd NOR support present */ 103#else /* NAND and/or ECC'd NOR support present */
103 104
@@ -133,6 +134,9 @@ void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c);
133#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH) 134#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH)
134int jffs2_dataflash_setup(struct jffs2_sb_info *c); 135int jffs2_dataflash_setup(struct jffs2_sb_info *c);
135void jffs2_dataflash_cleanup(struct jffs2_sb_info *c); 136void jffs2_dataflash_cleanup(struct jffs2_sb_info *c);
137#define jffs2_ubivol(c) (c->mtd->type == MTD_UBIVOLUME)
138int jffs2_ubivol_setup(struct jffs2_sb_info *c);
139void jffs2_ubivol_cleanup(struct jffs2_sb_info *c);
136 140
137#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE)) 141#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
138int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c); 142int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/pushpull.h b/fs/jffs2/pushpull.h
deleted file mode 100644
index c0c2a9158dff..000000000000
--- a/fs/jffs2/pushpull.h
+++ /dev/null
@@ -1,72 +0,0 @@
1/*
2 * JFFS2 -- Journalling Flash File System, Version 2.
3 *
4 * Copyright (C) 2001, 2002 Red Hat, Inc.
5 *
6 * Created by David Woodhouse <dwmw2@infradead.org>
7 *
8 * For licensing information, see the file 'LICENCE' in this directory.
9 *
10 * $Id: pushpull.h,v 1.10 2004/11/16 20:36:11 dwmw2 Exp $
11 *
12 */
13
14#ifndef __PUSHPULL_H__
15#define __PUSHPULL_H__
16
17#include <linux/errno.h>
18
19struct pushpull {
20 unsigned char *buf;
21 unsigned int buflen;
22 unsigned int ofs;
23 unsigned int reserve;
24};
25
26
27static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve)
28{
29 pp->buf = buf;
30 pp->buflen = buflen;
31 pp->ofs = ofs;
32 pp->reserve = reserve;
33}
34
35static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
36{
37 if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) {
38 return -ENOSPC;
39 }
40
41 if (bit) {
42 pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7)));
43 }
44 else {
45 pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7)));
46 }
47 pp->ofs++;
48
49 return 0;
50}
51
52static inline int pushedbits(struct pushpull *pp)
53{
54 return pp->ofs;
55}
56
57static inline int pullbit(struct pushpull *pp)
58{
59 int bit;
60
61 bit = (pp->buf[pp->ofs >> 3] >> (7-(pp->ofs & 7))) & 1;
62
63 pp->ofs++;
64 return bit;
65}
66
67static inline int pulledbits(struct pushpull *pp)
68{
69 return pp->ofs;
70}
71
72#endif /* __PUSHPULL_H__ */
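
The removed pushpull helpers implemented an MSB-first bit stream over a byte buffer (note that buflen is counted in bits), with a reserved tail that pushbit() refuses to enter unless use_reserved is set; they were used by the rubin compressor. A hypothetical round-trip sketch:

/* Hypothetical pushpull round-trip: write the bits 1,0,1 MSB-first,
 * then read them back. buflen is given in bits. */
static void pushpull_demo(void)
{
	unsigned char buf[4] = { 0 };
	struct pushpull pp;

	init_pushpull(&pp, (char *)buf, sizeof(buf) * 8, 0, 0);
	pushbit(&pp, 1, 0);
	pushbit(&pp, 0, 0);
	pushbit(&pp, 1, 0);	/* buf[0] is now 0xa0 */

	init_pushpull(&pp, (char *)buf, sizeof(buf) * 8, 0, 0);
	/* pullbit() yields 1, 0, 1; pulledbits(&pp) then reports 3. */
	pullbit(&pp);
	pullbit(&pp);
	pullbit(&pp);
}
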
diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c
index f3b86da833ba..cfe05c1966a5 100644
--- a/fs/jffs2/read.c
+++ b/fs/jffs2/read.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: read.c,v 1.42 2005/11/07 11:14:41 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 717a48cf7df2..6aff38930b50 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: readinode.c,v 1.143 2005/11/07 11:14:41 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -22,30 +20,510 @@
22#include "nodelist.h" 20#include "nodelist.h"
23 21
24/* 22/*
25 * Put a new tmp_dnode_info into the temporary RB-tree, keeping the list in 23 * Check the data CRC of the node.
26 * order of increasing version. 24 *
25 * Returns: 0 if the data CRC is correct;
26 * 1 - if incorrect;
 27 * error code if an error occurred.
27 */ 28 */
28static void jffs2_add_tn_to_tree(struct jffs2_tmp_dnode_info *tn, struct rb_root *list) 29static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
29{ 30{
30 struct rb_node **p = &list->rb_node; 31 struct jffs2_raw_node_ref *ref = tn->fn->raw;
31 struct rb_node * parent = NULL; 32 int err = 0, pointed = 0;
32 struct jffs2_tmp_dnode_info *this; 33 struct jffs2_eraseblock *jeb;
33 34 unsigned char *buffer;
34 while (*p) { 35 uint32_t crc, ofs, len;
35 parent = *p; 36 size_t retlen;
36 this = rb_entry(parent, struct jffs2_tmp_dnode_info, rb); 37
37 38 BUG_ON(tn->csize == 0);
38 /* There may actually be a collision here, but it doesn't 39
39 actually matter. As long as the two nodes with the same 40 if (!jffs2_is_writebuffered(c))
40 version are together, it's all fine. */ 41 goto adj_acc;
41 if (tn->version > this->version) 42
42 p = &(*p)->rb_left; 43 /* Calculate how many bytes were already checked */
44 ofs = ref_offset(ref) + sizeof(struct jffs2_raw_inode);
45 len = ofs % c->wbuf_pagesize;
46 if (likely(len))
47 len = c->wbuf_pagesize - len;
48
49 if (len >= tn->csize) {
50 dbg_readinode("no need to check node at %#08x, data length %u, data starts at %#08x - it has already been checked.\n",
51 ref_offset(ref), tn->csize, ofs);
52 goto adj_acc;
53 }
54
55 ofs += len;
56 len = tn->csize - len;
57
58 dbg_readinode("check node at %#08x, data length %u, partial CRC %#08x, correct CRC %#08x, data starts at %#08x, start checking from %#08x - %u bytes.\n",
59 ref_offset(ref), tn->csize, tn->partial_crc, tn->data_crc, ofs - len, ofs, len);
60
61#ifndef __ECOS
 62 /* TODO: instead, encapsulate the point() handling in jffs2_flash_read(),
 63 * adding a jffs2_flash_read_end() interface. */
64 if (c->mtd->point) {
65 err = c->mtd->point(c->mtd, ofs, len, &retlen, &buffer);
66 if (!err && retlen < tn->csize) {
67 JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
68 c->mtd->unpoint(c->mtd, buffer, ofs, len);
69 } else if (err)
70 JFFS2_WARNING("MTD point failed: error code %d.\n", err);
43 else 71 else
 44 p = &(*p)->rb_right; 72 pointed = 1; /* successfully pointed to device */
73 }
74#endif
75
76 if (!pointed) {
77 buffer = kmalloc(len, GFP_KERNEL);
78 if (unlikely(!buffer))
79 return -ENOMEM;
80
 81 /* TODO: this is a very frequent pattern; make it a separate
82 * routine */
83 err = jffs2_flash_read(c, ofs, len, &retlen, buffer);
84 if (err) {
85 JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ofs, err);
86 goto free_out;
87 }
88
89 if (retlen != len) {
90 JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", ofs, retlen, len);
91 err = -EIO;
92 goto free_out;
93 }
94 }
95
96 /* Continue calculating CRC */
97 crc = crc32(tn->partial_crc, buffer, len);
98 if(!pointed)
99 kfree(buffer);
100#ifndef __ECOS
101 else
102 c->mtd->unpoint(c->mtd, buffer, ofs, len);
103#endif
104
105 if (crc != tn->data_crc) {
106 JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n",
107 ofs, tn->data_crc, crc);
108 return 1;
45 } 109 }
46 110
47 rb_link_node(&tn->rb, parent, p); 111adj_acc:
48 rb_insert_color(&tn->rb, list); 112 jeb = &c->blocks[ref->flash_offset / c->sector_size];
113 len = ref_totlen(c, jeb, ref);
114 /* If it should be REF_NORMAL, it'll get marked as such when
115 we build the fragtree, shortly. No need to worry about GC
116 moving it while it's marked REF_PRISTINE -- GC won't happen
117 till we've finished checking every inode anyway. */
118 ref->flash_offset |= REF_PRISTINE;
119 /*
120 * Mark the node as having been checked and fix the
121 * accounting accordingly.
122 */
123 spin_lock(&c->erase_completion_lock);
124 jeb->used_size += len;
125 jeb->unchecked_size -= len;
126 c->used_size += len;
127 c->unchecked_size -= len;
128 jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
129 spin_unlock(&c->erase_completion_lock);
130
131 return 0;
132
133free_out:
134 if(!pointed)
135 kfree(buffer);
136#ifndef __ECOS
137 else
138 c->mtd->unpoint(c->mtd, buffer, ofs, len);
139#endif
140 return err;
141}
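The partial-CRC logic above depends on crc32() being resumable: continuing over the tail of a buffer with the CRC of the head as the seed gives the same value as one pass over the whole buffer. A standalone sketch using zlib's crc32() (illustrative only -- zlib's bit conventions differ from the kernel crc32() JFFS2 uses, but the resumability property is the same):

#include <assert.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
        const unsigned char data[] = "jffs2 node payload";
        size_t len = sizeof(data) - 1, split = 5;

        uLong whole = crc32(0L, data, len);
        uLong part  = crc32(0L, data, split);           /* like tn->partial_crc */
        part = crc32(part, data + split, len - split);  /* resume past the split */
        assert(part == whole);                          /* build with -lz */
        return 0;
}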
142
143/*
144 * Helper function for jffs2_add_older_frag_to_fragtree().
145 *
146 * Checks the node if we are in the checking stage.
147 */
148static int check_tn_node(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
149{
150 int ret;
151
152 BUG_ON(ref_obsolete(tn->fn->raw));
153
154 /* We only check the data CRC of unchecked nodes */
155 if (ref_flags(tn->fn->raw) != REF_UNCHECKED)
156 return 0;
157
158 dbg_readinode("check node %#04x-%#04x, phys offs %#08x\n",
159 tn->fn->ofs, tn->fn->ofs + tn->fn->size, ref_offset(tn->fn->raw));
160
161 ret = check_node_data(c, tn);
162 if (unlikely(ret < 0)) {
163 JFFS2_ERROR("check_node_data() returned error: %d.\n",
164 ret);
165 } else if (unlikely(ret > 0)) {
166 dbg_readinode("CRC error, mark it obsolete.\n");
167 jffs2_mark_node_obsolete(c, tn->fn->raw);
168 }
169
170 return ret;
171}
172
173static struct jffs2_tmp_dnode_info *jffs2_lookup_tn(struct rb_root *tn_root, uint32_t offset)
174{
175 struct rb_node *next;
176 struct jffs2_tmp_dnode_info *tn = NULL;
177
178 dbg_readinode("root %p, offset %d\n", tn_root, offset);
179
180 next = tn_root->rb_node;
181
182 while (next) {
183 tn = rb_entry(next, struct jffs2_tmp_dnode_info, rb);
184
185 if (tn->fn->ofs < offset)
186 next = tn->rb.rb_right;
187 else if (tn->fn->ofs >= offset)
188 next = tn->rb.rb_left;
189 else
190 break;
191 }
192
193 return tn;
194}
195
196
197static void jffs2_kill_tn(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
198{
199 jffs2_mark_node_obsolete(c, tn->fn->raw);
200 jffs2_free_full_dnode(tn->fn);
201 jffs2_free_tmp_dnode_info(tn);
202}
203/*
204 * This function is used when we read an inode. Data nodes arrive in
205 * arbitrary order -- they may be older or newer than the nodes which
206 * are already in the tree. Where overlaps occur, the older node can
207 * be discarded as long as the newer passes the CRC check. We don't
208 * bother to keep track of holes in this rbtree, and neither do we deal
209 * with frags -- we can have multiple entries starting at the same
210 * offset, and the one with the smallest length will come first in the
211 * ordering.
212 *
213 * Returns 0 if the node was inserted
214 * 1 if the node is obsolete (because we can't mark it so yet)
215 * < 0 an if error occurred
216 */
217static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
218 struct jffs2_readinode_info *rii,
219 struct jffs2_tmp_dnode_info *tn)
220{
221 uint32_t fn_end = tn->fn->ofs + tn->fn->size;
222 struct jffs2_tmp_dnode_info *insert_point = NULL, *this;
223
224 dbg_readinode("insert fragment %#04x-%#04x, ver %u\n", tn->fn->ofs, fn_end, tn->version);
225
 226 /* If a node has zero dsize, we only have to keep it if it might be the
227 node with highest version -- i.e. the one which will end up as f->metadata.
228 Note that such nodes won't be REF_UNCHECKED since there are no data to
229 check anyway. */
230 if (!tn->fn->size) {
231 if (rii->mdata_tn) {
232 /* We had a candidate mdata node already */
233 dbg_readinode("kill old mdata with ver %d\n", rii->mdata_tn->version);
234 jffs2_kill_tn(c, rii->mdata_tn);
235 }
236 rii->mdata_tn = tn;
237 dbg_readinode("keep new mdata with ver %d\n", tn->version);
238 return 0;
239 }
240
241 /* Find the earliest node which _may_ be relevant to this one */
242 this = jffs2_lookup_tn(&rii->tn_root, tn->fn->ofs);
243 if (!this) {
244 /* First addition to empty tree. $DEITY how I love the easy cases */
245 rb_link_node(&tn->rb, NULL, &rii->tn_root.rb_node);
246 rb_insert_color(&tn->rb, &rii->tn_root);
247 dbg_readinode("keep new frag\n");
248 return 0;
249 }
250
251 /* If we add a new node it'll be somewhere under here. */
252 insert_point = this;
253
254 /* If the node is coincident with another at a lower address,
255 back up until the other node is found. It may be relevant */
256 while (tn->overlapped)
257 tn = tn_prev(tn);
258
259 dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole");
260
261 while (this) {
262 if (this->fn->ofs > fn_end)
263 break;
264 dbg_readinode("Ponder this ver %d, 0x%x-0x%x\n",
265 this->version, this->fn->ofs, this->fn->size);
266
267 if (this->version == tn->version) {
268 /* Version number collision means REF_PRISTINE GC. Accept either of them
269 as long as the CRC is correct. Check the one we have already... */
270 if (!check_tn_node(c, this)) {
271 /* The one we already had was OK. Keep it and throw away the new one */
272 dbg_readinode("Like old node. Throw away new\n");
273 jffs2_kill_tn(c, tn);
274 return 0;
275 } else {
276 /* Who cares if the new one is good; keep it for now anyway. */
277 rb_replace_node(&this->rb, &tn->rb, &rii->tn_root);
 278 /* Same overlap state carries over, from in front and behind */
279 tn->overlapped = this->overlapped;
280 jffs2_kill_tn(c, this);
281 dbg_readinode("Like new node. Throw away old\n");
282 return 0;
283 }
284 }
285 if (this->version < tn->version &&
286 this->fn->ofs >= tn->fn->ofs &&
287 this->fn->ofs + this->fn->size <= fn_end) {
288 /* New node entirely overlaps 'this' */
289 if (check_tn_node(c, tn)) {
290 dbg_readinode("new node bad CRC\n");
291 jffs2_kill_tn(c, tn);
292 return 0;
293 }
294 /* ... and is good. Kill 'this'... */
295 rb_replace_node(&this->rb, &tn->rb, &rii->tn_root);
296 tn->overlapped = this->overlapped;
297 jffs2_kill_tn(c, this);
298 /* ... and any subsequent nodes which are also overlapped */
299 this = tn_next(tn);
300 while (this && this->fn->ofs + this->fn->size < fn_end) {
301 struct jffs2_tmp_dnode_info *next = tn_next(this);
302 if (this->version < tn->version) {
303 tn_erase(this, &rii->tn_root);
304 dbg_readinode("Kill overlapped ver %d, 0x%x-0x%x\n",
305 this->version, this->fn->ofs,
306 this->fn->ofs+this->fn->size);
307 jffs2_kill_tn(c, this);
308 }
309 this = next;
310 }
311 dbg_readinode("Done inserting new\n");
312 return 0;
313 }
314 if (this->version > tn->version &&
315 this->fn->ofs <= tn->fn->ofs &&
316 this->fn->ofs+this->fn->size >= fn_end) {
317 /* New node entirely overlapped by 'this' */
318 if (!check_tn_node(c, this)) {
319 dbg_readinode("Good CRC on old node. Kill new\n");
320 jffs2_kill_tn(c, tn);
321 return 0;
322 }
323 /* ... but 'this' was bad. Replace it... */
324 rb_replace_node(&this->rb, &tn->rb, &rii->tn_root);
325 dbg_readinode("Bad CRC on old overlapping node. Kill it\n");
326 jffs2_kill_tn(c, this);
327 return 0;
328 }
329 /* We want to be inserted under the last node which is
330 either at a lower offset _or_ has a smaller range */
331 if (this->fn->ofs < tn->fn->ofs ||
332 (this->fn->ofs == tn->fn->ofs &&
333 this->fn->size <= tn->fn->size))
334 insert_point = this;
335
336 this = tn_next(this);
337 }
338 dbg_readinode("insert_point %p, ver %d, 0x%x-0x%x, ov %d\n",
339 insert_point, insert_point->version, insert_point->fn->ofs,
340 insert_point->fn->ofs+insert_point->fn->size,
341 insert_point->overlapped);
342 /* We neither completely obsoleted nor were completely
343 obsoleted by an earlier node. Insert under insert_point */
344 {
345 struct rb_node *parent = &insert_point->rb;
346 struct rb_node **link = &parent;
347
348 while (*link) {
349 parent = *link;
350 insert_point = rb_entry(parent, struct jffs2_tmp_dnode_info, rb);
351 if (tn->fn->ofs > insert_point->fn->ofs)
352 link = &insert_point->rb.rb_right;
353 else if (tn->fn->ofs < insert_point->fn->ofs ||
354 tn->fn->size < insert_point->fn->size)
355 link = &insert_point->rb.rb_left;
356 else
357 link = &insert_point->rb.rb_right;
358 }
359 rb_link_node(&tn->rb, &insert_point->rb, link);
360 rb_insert_color(&tn->rb, &rii->tn_root);
361 }
362 /* If there's anything behind that overlaps us, note it */
363 this = tn_prev(tn);
364 if (this) {
365 while (1) {
366 if (this->fn->ofs + this->fn->size > tn->fn->ofs) {
367 dbg_readinode("Node is overlapped by %p (v %d, 0x%x-0x%x)\n",
368 this, this->version, this->fn->ofs,
369 this->fn->ofs+this->fn->size);
370 tn->overlapped = 1;
371 break;
372 }
373 if (!this->overlapped)
374 break;
375 this = tn_prev(this);
376 }
377 }
378
379 /* If the new node overlaps anything ahead, note it */
380 this = tn_next(tn);
381 while (this && this->fn->ofs < fn_end) {
382 this->overlapped = 1;
383 dbg_readinode("Node ver %d, 0x%x-0x%x is overlapped\n",
384 this->version, this->fn->ofs,
385 this->fn->ofs+this->fn->size);
386 this = tn_next(this);
387 }
388 return 0;
389}
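The loop above classifies each existing node against the incoming range [tn->fn->ofs, fn_end): an old node fully covered by a good new one dies, a good old node fully covering the new one kills the new one, and partial overlaps keep both with the overlapped flag set. A self-contained sketch of just that interval classification (names are illustrative, not JFFS2 API):

#include <stdio.h>
#include <stdint.h>

enum overlap { OV_NONE, OV_NEW_COVERS_OLD, OV_OLD_COVERS_NEW, OV_PARTIAL };

/* classify how the new range [nofs, nend) relates to an old [oofs, oend) */
static enum overlap classify(uint32_t nofs, uint32_t nend,
                             uint32_t oofs, uint32_t oend)
{
        if (oend <= nofs || nend <= oofs)
                return OV_NONE;
        if (oofs >= nofs && oend <= nend)
                return OV_NEW_COVERS_OLD;       /* old node entirely inside new */
        if (oofs <= nofs && oend >= nend)
                return OV_OLD_COVERS_NEW;       /* new node entirely inside old */
        return OV_PARTIAL;                      /* keep both, mark overlapped */
}

int main(void)
{
        printf("%d\n", classify(0x100, 0x200, 0x120, 0x180));   /* 1 */
        printf("%d\n", classify(0x140, 0x160, 0x120, 0x180));   /* 2 */
        printf("%d\n", classify(0x100, 0x150, 0x120, 0x180));   /* 3 */
        return 0;
}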
390
391/* Trivial function to remove the last node in the tree. Which by definition
 392 has no right-hand child -- so it can be removed just by making its only child (if
393 any) take its place under its parent. */
394static void eat_last(struct rb_root *root, struct rb_node *node)
395{
396 struct rb_node *parent = rb_parent(node);
397 struct rb_node **link;
398
399 /* LAST! */
400 BUG_ON(node->rb_right);
401
402 if (!parent)
403 link = &root->rb_node;
404 else if (node == parent->rb_left)
405 link = &parent->rb_left;
406 else
407 link = &parent->rb_right;
408
409 *link = node->rb_left;
410 /* Colour doesn't matter now. Only the parent pointer. */
411 if (node->rb_left)
412 node->rb_left->rb_parent_color = node->rb_parent_color;
413}
414
415/* We put this in reverse order, so we can just use eat_last */
416static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn)
417{
418 struct rb_node **link = &ver_root->rb_node;
419 struct rb_node *parent = NULL;
420 struct jffs2_tmp_dnode_info *this_tn;
421
422 while (*link) {
423 parent = *link;
424 this_tn = rb_entry(parent, struct jffs2_tmp_dnode_info, rb);
425
426 if (tn->version > this_tn->version)
427 link = &parent->rb_left;
428 else
429 link = &parent->rb_right;
430 }
431 dbg_readinode("Link new node at %p (root is %p)\n", link, ver_root);
432 rb_link_node(&tn->rb, parent, link);
433 rb_insert_color(&tn->rb, ver_root);
434}
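ver_insert() deliberately sorts with higher versions to the left, so the rightmost node -- the one eat_last() removes -- is always the lowest remaining version; popping repeatedly therefore replays a batch of overlapping nodes in increasing version order. A standalone sketch of the same trick, with a descending array standing in for the RB-tree:

#include <stdio.h>

int main(void)
{
        unsigned int tree[8], n = 0;
        unsigned int input[] = { 7, 3, 9, 5 }, i, j, v;

        for (i = 0; i < 4; i++) {       /* ver_insert(): keep descending order */
                v = input[i];
                for (j = n; j > 0 && tree[j - 1] < v; j--)
                        tree[j] = tree[j - 1];
                tree[j] = v;
                n++;
        }
        while (n)                       /* eat_last(): pop lowest version first */
                printf("%u ", tree[--n]);
        printf("\n");                   /* prints: 3 5 7 9 */
        return 0;
}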
435
436/* Build final, normal fragtree from tn tree. It doesn't matter which order
437 we add nodes to the real fragtree, as long as they don't overlap. And
438 having thrown away the majority of overlapped nodes as we went, there
439 really shouldn't be many sets of nodes which do overlap. If we start at
440 the end, we can use the overlap markers -- we can just eat nodes which
441 aren't overlapped, and when we encounter nodes which _do_ overlap we
442 sort them all into a temporary tree in version order before replaying them. */
443static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
444 struct jffs2_inode_info *f,
445 struct jffs2_readinode_info *rii)
446{
447 struct jffs2_tmp_dnode_info *pen, *last, *this;
448 struct rb_root ver_root = RB_ROOT;
449 uint32_t high_ver = 0;
450
451 if (rii->mdata_tn) {
452 dbg_readinode("potential mdata is ver %d at %p\n", rii->mdata_tn->version, rii->mdata_tn);
453 high_ver = rii->mdata_tn->version;
454 rii->latest_ref = rii->mdata_tn->fn->raw;
455 }
456#ifdef JFFS2_DBG_READINODE_MESSAGES
457 this = tn_last(&rii->tn_root);
458 while (this) {
459 dbg_readinode("tn %p ver %d range 0x%x-0x%x ov %d\n", this, this->version, this->fn->ofs,
460 this->fn->ofs+this->fn->size, this->overlapped);
461 this = tn_prev(this);
462 }
463#endif
464 pen = tn_last(&rii->tn_root);
465 while ((last = pen)) {
466 pen = tn_prev(last);
467
468 eat_last(&rii->tn_root, &last->rb);
469 ver_insert(&ver_root, last);
470
471 if (unlikely(last->overlapped))
472 continue;
473
474 /* Now we have a bunch of nodes in reverse version
475 order, in the tree at ver_root. Most of the time,
476 there'll actually be only one node in the 'tree',
477 in fact. */
478 this = tn_last(&ver_root);
479
480 while (this) {
481 struct jffs2_tmp_dnode_info *vers_next;
482 int ret;
483 vers_next = tn_prev(this);
484 eat_last(&ver_root, &this->rb);
485 if (check_tn_node(c, this)) {
486 dbg_readinode("node ver %x, 0x%x-0x%x failed CRC\n",
487 this->version, this->fn->ofs,
488 this->fn->ofs+this->fn->size);
489 jffs2_kill_tn(c, this);
490 } else {
491 if (this->version > high_ver) {
492 /* Note that this is different from the other
493 highest_version, because this one is only
494 counting _valid_ nodes which could give the
495 latest inode metadata */
496 high_ver = this->version;
497 rii->latest_ref = this->fn->raw;
498 }
499 dbg_readinode("Add %p (v %x, 0x%x-0x%x, ov %d) to fragtree\n",
500 this, this->version, this->fn->ofs,
501 this->fn->ofs+this->fn->size, this->overlapped);
502
503 ret = jffs2_add_full_dnode_to_inode(c, f, this->fn);
504 if (ret) {
505 /* Free the nodes in vers_root; let the caller
506 deal with the rest */
507 JFFS2_ERROR("Add node to tree failed %d\n", ret);
508 while (1) {
509 vers_next = tn_prev(this);
510 if (check_tn_node(c, this))
511 jffs2_mark_node_obsolete(c, this->fn->raw);
512 jffs2_free_full_dnode(this->fn);
513 jffs2_free_tmp_dnode_info(this);
514 this = vers_next;
515 if (!this)
516 break;
517 eat_last(&ver_root, &vers_next->rb);
518 }
519 return ret;
520 }
521 jffs2_free_tmp_dnode_info(this);
522 }
523 this = vers_next;
524 }
525 }
526 return 0;
49} 527}
50 528
51static void jffs2_free_tmp_dnode_info_list(struct rb_root *list) 529static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
@@ -112,8 +590,8 @@ static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_r
112 * negative error code on failure. 590 * negative error code on failure.
113 */ 591 */
114static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, 592static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
115 struct jffs2_raw_dirent *rd, size_t read, struct jffs2_full_dirent **fdp, 593 struct jffs2_raw_dirent *rd, size_t read,
116 uint32_t *latest_mctime, uint32_t *mctime_ver) 594 struct jffs2_readinode_info *rii)
117{ 595{
118 struct jffs2_full_dirent *fd; 596 struct jffs2_full_dirent *fd;
119 uint32_t crc; 597 uint32_t crc;
@@ -125,7 +603,8 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
125 if (unlikely(crc != je32_to_cpu(rd->node_crc))) { 603 if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
126 JFFS2_NOTICE("header CRC failed on dirent node at %#08x: read %#08x, calculated %#08x\n", 604 JFFS2_NOTICE("header CRC failed on dirent node at %#08x: read %#08x, calculated %#08x\n",
127 ref_offset(ref), je32_to_cpu(rd->node_crc), crc); 605 ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
128 return 1; 606 jffs2_mark_node_obsolete(c, ref);
607 return 0;
129 } 608 }
130 609
131 /* If we've never checked the CRCs on this node, check them now */ 610 /* If we've never checked the CRCs on this node, check them now */
@@ -137,7 +616,8 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
137 if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) { 616 if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) {
138 JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n", 617 JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n",
139 ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen)); 618 ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen));
140 return 1; 619 jffs2_mark_node_obsolete(c, ref);
620 return 0;
141 } 621 }
142 622
143 jeb = &c->blocks[ref->flash_offset / c->sector_size]; 623 jeb = &c->blocks[ref->flash_offset / c->sector_size];
@@ -161,10 +641,13 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
161 fd->ino = je32_to_cpu(rd->ino); 641 fd->ino = je32_to_cpu(rd->ino);
162 fd->type = rd->type; 642 fd->type = rd->type;
163 643
644 if (fd->version > rii->highest_version)
645 rii->highest_version = fd->version;
646
164 /* Pick out the mctime of the latest dirent */ 647 /* Pick out the mctime of the latest dirent */
165 if(fd->version > *mctime_ver && je32_to_cpu(rd->mctime)) { 648 if(fd->version > rii->mctime_ver && je32_to_cpu(rd->mctime)) {
166 *mctime_ver = fd->version; 649 rii->mctime_ver = fd->version;
167 *latest_mctime = je32_to_cpu(rd->mctime); 650 rii->latest_mctime = je32_to_cpu(rd->mctime);
168 } 651 }
169 652
170 /* 653 /*
@@ -201,7 +684,7 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
201 * Wheee. We now have a complete jffs2_full_dirent structure, with 684 * Wheee. We now have a complete jffs2_full_dirent structure, with
202 * the name in it and everything. Link it into the list 685 * the name in it and everything. Link it into the list
203 */ 686 */
204 jffs2_add_fd_to_list(c, fd, fdp); 687 jffs2_add_fd_to_list(c, fd, &rii->fds);
205 688
206 return 0; 689 return 0;
207} 690}
@@ -210,13 +693,13 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
210 * Helper function for jffs2_get_inode_nodes(). 693 * Helper function for jffs2_get_inode_nodes().
211 * It is called every time an inode node is found. 694 * It is called every time an inode node is found.
212 * 695 *
213 * Returns: 0 on succes; 696 * Returns: 0 on success;
214 * 1 if the node should be marked obsolete; 697 * 1 if the node should be marked obsolete;
215 * negative error code on failure. 698 * negative error code on failure.
216 */ 699 */
217static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, 700static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
218 struct jffs2_raw_inode *rd, struct rb_root *tnp, int rdlen, 701 struct jffs2_raw_inode *rd, int rdlen,
219 uint32_t *latest_mctime, uint32_t *mctime_ver) 702 struct jffs2_readinode_info *rii)
220{ 703{
221 struct jffs2_tmp_dnode_info *tn; 704 struct jffs2_tmp_dnode_info *tn;
222 uint32_t len, csize; 705 uint32_t len, csize;
@@ -230,7 +713,8 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
230 if (unlikely(crc != je32_to_cpu(rd->node_crc))) { 713 if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
231 JFFS2_NOTICE("node CRC failed on dnode at %#08x: read %#08x, calculated %#08x\n", 714 JFFS2_NOTICE("node CRC failed on dnode at %#08x: read %#08x, calculated %#08x\n",
232 ref_offset(ref), je32_to_cpu(rd->node_crc), crc); 715 ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
233 return 1; 716 jffs2_mark_node_obsolete(c, ref);
717 return 0;
234 } 718 }
235 719
236 tn = jffs2_alloc_tmp_dnode_info(); 720 tn = jffs2_alloc_tmp_dnode_info();
@@ -342,6 +826,10 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
342 tn->data_crc = je32_to_cpu(rd->data_crc); 826 tn->data_crc = je32_to_cpu(rd->data_crc);
343 tn->csize = csize; 827 tn->csize = csize;
344 tn->fn->raw = ref; 828 tn->fn->raw = ref;
829 tn->overlapped = 0;
830
831 if (tn->version > rii->highest_version)
832 rii->highest_version = tn->version;
345 833
346 /* There was a bug where we wrote hole nodes out with 834 /* There was a bug where we wrote hole nodes out with
347 csize/dsize swapped. Deal with it */ 835 csize/dsize swapped. Deal with it */
@@ -353,13 +841,25 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
353 dbg_readinode("dnode @%08x: ver %u, offset %#04x, dsize %#04x, csize %#04x\n", 841 dbg_readinode("dnode @%08x: ver %u, offset %#04x, dsize %#04x, csize %#04x\n",
354 ref_offset(ref), je32_to_cpu(rd->version), je32_to_cpu(rd->offset), je32_to_cpu(rd->dsize), csize); 842 ref_offset(ref), je32_to_cpu(rd->version), je32_to_cpu(rd->offset), je32_to_cpu(rd->dsize), csize);
355 843
356 jffs2_add_tn_to_tree(tn, tnp); 844 ret = jffs2_add_tn_to_tree(c, rii, tn);
357 845
846 if (ret) {
847 jffs2_free_full_dnode(tn->fn);
848 free_out:
849 jffs2_free_tmp_dnode_info(tn);
850 return ret;
851 }
852#ifdef JFFS2_DBG_READINODE_MESSAGES
853 dbg_readinode("After adding ver %d:\n", tn->version);
854 tn = tn_first(&rii->tn_root);
855 while (tn) {
856 dbg_readinode("%p: v %d r 0x%x-0x%x ov %d\n",
857 tn, tn->version, tn->fn->ofs,
858 tn->fn->ofs+tn->fn->size, tn->overlapped);
859 tn = tn_next(tn);
860 }
861#endif
358 return 0; 862 return 0;
359
360free_out:
361 jffs2_free_tmp_dnode_info(tn);
362 return ret;
363} 863}
364 864
365/* 865/*
@@ -379,7 +879,8 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
379 JFFS2_ERROR("Node is {%04x,%04x,%08x,%08x}. Please report this error.\n", 879 JFFS2_ERROR("Node is {%04x,%04x,%08x,%08x}. Please report this error.\n",
380 je16_to_cpu(un->magic), je16_to_cpu(un->nodetype), 880 je16_to_cpu(un->magic), je16_to_cpu(un->nodetype),
381 je32_to_cpu(un->totlen), je32_to_cpu(un->hdr_crc)); 881 je32_to_cpu(un->totlen), je32_to_cpu(un->hdr_crc));
382 return 1; 882 jffs2_mark_node_obsolete(c, ref);
883 return 0;
383 } 884 }
384 885
385 un->nodetype = cpu_to_je16(JFFS2_NODE_ACCURATE | je16_to_cpu(un->nodetype)); 886 un->nodetype = cpu_to_je16(JFFS2_NODE_ACCURATE | je16_to_cpu(un->nodetype));
@@ -407,7 +908,8 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
407 case JFFS2_FEATURE_RWCOMPAT_DELETE: 908 case JFFS2_FEATURE_RWCOMPAT_DELETE:
408 JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n", 909 JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n",
409 je16_to_cpu(un->nodetype), ref_offset(ref)); 910 je16_to_cpu(un->nodetype), ref_offset(ref));
410 return 1; 911 jffs2_mark_node_obsolete(c, ref);
912 return 0;
411 } 913 }
412 914
413 return 0; 915 return 0;
@@ -421,92 +923,62 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
421 * negative error code on failure. 923 * negative error code on failure.
422 */ 924 */
423static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, 925static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
424 int right_size, int *rdlen, unsigned char *buf, unsigned char *bufstart) 926 int needed_len, int *rdlen, unsigned char *buf)
425{ 927{
426 int right_len, err, len; 928 int err, to_read = needed_len - *rdlen;
427 size_t retlen; 929 size_t retlen;
428 uint32_t offs; 930 uint32_t offs;
429 931
430 if (jffs2_is_writebuffered(c)) { 932 if (jffs2_is_writebuffered(c)) {
431 right_len = c->wbuf_pagesize - (bufstart - buf); 933 int rem = to_read % c->wbuf_pagesize;
432 if (right_size + (int)(bufstart - buf) > c->wbuf_pagesize)
433 right_len += c->wbuf_pagesize;
434 } else
435 right_len = right_size;
436 934
437 if (*rdlen == right_len) 935 if (rem)
438 return 0; 936 to_read += c->wbuf_pagesize - rem;
937 }
439 938
440 /* We need to read more data */ 939 /* We need to read more data */
441 offs = ref_offset(ref) + *rdlen; 940 offs = ref_offset(ref) + *rdlen;
442 if (jffs2_is_writebuffered(c)) {
443 bufstart = buf + c->wbuf_pagesize;
444 len = c->wbuf_pagesize;
445 } else {
446 bufstart = buf + *rdlen;
447 len = right_size - *rdlen;
448 }
449 941
450 dbg_readinode("read more %d bytes\n", len); 942 dbg_readinode("read more %d bytes\n", to_read);
451 943
452 err = jffs2_flash_read(c, offs, len, &retlen, bufstart); 944 err = jffs2_flash_read(c, offs, to_read, &retlen, buf + *rdlen);
453 if (err) { 945 if (err) {
454 JFFS2_ERROR("can not read %d bytes from 0x%08x, " 946 JFFS2_ERROR("can not read %d bytes from 0x%08x, "
455 "error code: %d.\n", len, offs, err); 947 "error code: %d.\n", to_read, offs, err);
456 return err; 948 return err;
457 } 949 }
458 950
459 if (retlen < len) { 951 if (retlen < to_read) {
460 JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n", 952 JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n",
461 offs, retlen, len); 953 offs, retlen, to_read);
462 return -EIO; 954 return -EIO;
463 } 955 }
464 956
465 *rdlen = right_len; 957 *rdlen += to_read;
466
467 return 0; 958 return 0;
468} 959}
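On write-buffered flash, the reworked read_more() above rounds the extra read up so it always ends on a wbuf page boundary, avoiding a later re-read of the same minimal I/O unit. The arithmetic in isolation (helper name hypothetical):

#include <assert.h>
#include <stdint.h>

/* round len up to the next multiple of pagesize, as read_more() does
 * with to_read and c->wbuf_pagesize */
static uint32_t round_up_len(uint32_t len, uint32_t pagesize)
{
        uint32_t rem = len % pagesize;
        return rem ? len + (pagesize - rem) : len;
}

int main(void)
{
        assert(round_up_len(100, 512) == 512);
        assert(round_up_len(512, 512) == 512);
        assert(round_up_len(513, 512) == 1024);
        return 0;
}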
469 960
470/* Get tmp_dnode_info and full_dirent for all non-obsolete nodes associated 961/* Get tmp_dnode_info and full_dirent for all non-obsolete nodes associated
471 with this ino, returning the former in order of version */ 962 with this ino. Perform a preliminary ordering on data nodes, throwing away
 963 those which are completely obsoleted by newer ones. The naïve approach
 964 of just returning them _all_ in version order would cause us to
965 run out of memory in certain degenerate cases. */
472static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f, 966static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
473 struct rb_root *tnp, struct jffs2_full_dirent **fdp, 967 struct jffs2_readinode_info *rii)
474 uint32_t *highest_version, uint32_t *latest_mctime,
475 uint32_t *mctime_ver)
476{ 968{
477 struct jffs2_raw_node_ref *ref, *valid_ref; 969 struct jffs2_raw_node_ref *ref, *valid_ref;
478 struct rb_root ret_tn = RB_ROOT;
479 struct jffs2_full_dirent *ret_fd = NULL;
480 unsigned char *buf = NULL; 970 unsigned char *buf = NULL;
481 union jffs2_node_union *node; 971 union jffs2_node_union *node;
482 size_t retlen; 972 size_t retlen;
483 int len, err; 973 int len, err;
484 974
485 *mctime_ver = 0; 975 rii->mctime_ver = 0;
486 976
487 dbg_readinode("ino #%u\n", f->inocache->ino); 977 dbg_readinode("ino #%u\n", f->inocache->ino);
488 978
489 if (jffs2_is_writebuffered(c)) {
490 /*
491 * If we have the write buffer, we assume the minimal I/O unit
492 * is c->wbuf_pagesize. We implement some optimizations which in
493 * this case and we need a temporary buffer of size =
494 * 2*c->wbuf_pagesize bytes (see comments in read_dnode()).
495 * Basically, we want to read not only the node header, but the
496 * whole wbuf (NAND page in case of NAND) or 2, if the node
497 * header overlaps the border between the 2 wbufs.
498 */
499 len = 2*c->wbuf_pagesize;
500 } else {
501 /*
502 * When there is no write buffer, the size of the temporary
503 * buffer is the size of the larges node header.
504 */
505 len = sizeof(union jffs2_node_union);
506 }
507
508 /* FIXME: in case of NOR and available ->point() this 979 /* FIXME: in case of NOR and available ->point() this
509 * needs to be fixed. */ 980 * needs to be fixed. */
981 len = sizeof(union jffs2_node_union) + c->wbuf_pagesize;
510 buf = kmalloc(len, GFP_KERNEL); 982 buf = kmalloc(len, GFP_KERNEL);
511 if (!buf) 983 if (!buf)
512 return -ENOMEM; 984 return -ENOMEM;
@@ -516,8 +988,6 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
516 if (!valid_ref && f->inocache->ino != 1) 988 if (!valid_ref && f->inocache->ino != 1)
517 JFFS2_WARNING("Eep. No valid nodes for ino #%u.\n", f->inocache->ino); 989 JFFS2_WARNING("Eep. No valid nodes for ino #%u.\n", f->inocache->ino);
518 while (valid_ref) { 990 while (valid_ref) {
519 unsigned char *bufstart;
520
521 /* We can hold a pointer to a non-obsolete node without the spinlock, 991 /* We can hold a pointer to a non-obsolete node without the spinlock,
522 but _obsolete_ nodes may disappear at any time, if the block 992 but _obsolete_ nodes may disappear at any time, if the block
523 they're in gets erased. So if we mark 'ref' obsolete while we're 993 they're in gets erased. So if we mark 'ref' obsolete while we're
@@ -533,32 +1003,31 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
533 /* 1003 /*
534 * At this point we don't know the type of the node we're going 1004 * At this point we don't know the type of the node we're going
535 * to read, so we do not know the size of its header. In order 1005 * to read, so we do not know the size of its header. In order
536 * to minimize the amount of flash IO we assume the node has 1006 * to minimize the amount of flash IO we assume the header is
537 * size = JFFS2_MIN_NODE_HEADER. 1007 * of size = JFFS2_MIN_NODE_HEADER.
538 */ 1008 */
1009 len = JFFS2_MIN_NODE_HEADER;
539 if (jffs2_is_writebuffered(c)) { 1010 if (jffs2_is_writebuffered(c)) {
1011 int end, rem;
1012
540 /* 1013 /*
541 * We treat 'buf' as 2 adjacent wbufs. We want to 1014 * We are about to read JFFS2_MIN_NODE_HEADER bytes,
542 * adjust bufstart such as it points to the 1015 * but this flash has some minimal I/O unit. It is
543 * beginning of the node within this wbuf. 1016 * possible that we'll need to read more soon, so read
1017 * up to the next min. I/O unit, in order not to
1018 * re-read the same min. I/O unit twice.
544 */ 1019 */
545 bufstart = buf + (ref_offset(ref) % c->wbuf_pagesize); 1020 end = ref_offset(ref) + len;
546 /* We will read either one wbuf or 2 wbufs. */ 1021 rem = end % c->wbuf_pagesize;
547 len = c->wbuf_pagesize - (bufstart - buf); 1022 if (rem)
548 if (JFFS2_MIN_NODE_HEADER + (int)(bufstart - buf) > c->wbuf_pagesize) { 1023 end += c->wbuf_pagesize - rem;
549 /* The header spans the border of the first wbuf */ 1024 len = end - ref_offset(ref);
550 len += c->wbuf_pagesize;
551 }
552 } else {
553 bufstart = buf;
554 len = JFFS2_MIN_NODE_HEADER;
555 } 1025 }
556 1026
557 dbg_readinode("read %d bytes at %#08x(%d).\n", len, ref_offset(ref), ref_flags(ref)); 1027 dbg_readinode("read %d bytes at %#08x(%d).\n", len, ref_offset(ref), ref_flags(ref));
558 1028
559 /* FIXME: point() */ 1029 /* FIXME: point() */
560 err = jffs2_flash_read(c, ref_offset(ref), len, 1030 err = jffs2_flash_read(c, ref_offset(ref), len, &retlen, buf);
561 &retlen, bufstart);
562 if (err) { 1031 if (err) {
563 JFFS2_ERROR("can not read %d bytes from 0x%08x, " "error code: %d.\n", len, ref_offset(ref), err); 1032 JFFS2_ERROR("can not read %d bytes from 0x%08x, " "error code: %d.\n", len, ref_offset(ref), err);
564 goto free_out; 1033 goto free_out;
@@ -570,7 +1039,7 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
570 goto free_out; 1039 goto free_out;
571 } 1040 }
572 1041
573 node = (union jffs2_node_union *)bufstart; 1042 node = (union jffs2_node_union *)buf;
574 1043
575 /* No need to mask in the valid bit; it shouldn't be invalid */ 1044 /* No need to mask in the valid bit; it shouldn't be invalid */
576 if (je32_to_cpu(node->u.hdr_crc) != crc32(0, node, sizeof(node->u)-4)) { 1045 if (je32_to_cpu(node->u.hdr_crc) != crc32(0, node, sizeof(node->u)-4)) {
@@ -583,10 +1052,10 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
583 jffs2_mark_node_obsolete(c, ref); 1052 jffs2_mark_node_obsolete(c, ref);
584 goto cont; 1053 goto cont;
585 } 1054 }
586 /* Due to poor choice of crc32 seed, an all-zero node will have a correct CRC */ 1055 if (je16_to_cpu(node->u.magic) != JFFS2_MAGIC_BITMASK) {
587 if (!je32_to_cpu(node->u.hdr_crc) && !je16_to_cpu(node->u.nodetype) && 1056 /* Not a JFFS2 node, whinge and move on */
588 !je16_to_cpu(node->u.magic) && !je32_to_cpu(node->u.totlen)) { 1057 JFFS2_NOTICE("Wrong magic bitmask 0x%04x in node header at %#08x.\n",
589 JFFS2_NOTICE("All zero node header at %#08x.\n", ref_offset(ref)); 1058 je16_to_cpu(node->u.magic), ref_offset(ref));
590 jffs2_mark_node_obsolete(c, ref); 1059 jffs2_mark_node_obsolete(c, ref);
591 goto cont; 1060 goto cont;
592 } 1061 }
@@ -596,46 +1065,34 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
596 case JFFS2_NODETYPE_DIRENT: 1065 case JFFS2_NODETYPE_DIRENT:
597 1066
598 if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_dirent)) { 1067 if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_dirent)) {
599 err = read_more(c, ref, sizeof(struct jffs2_raw_dirent), &len, buf, bufstart); 1068 err = read_more(c, ref, sizeof(struct jffs2_raw_dirent), &len, buf);
600 if (unlikely(err)) 1069 if (unlikely(err))
601 goto free_out; 1070 goto free_out;
602 } 1071 }
603 1072
604 err = read_direntry(c, ref, &node->d, retlen, &ret_fd, latest_mctime, mctime_ver); 1073 err = read_direntry(c, ref, &node->d, retlen, rii);
605 if (err == 1) { 1074 if (unlikely(err))
606 jffs2_mark_node_obsolete(c, ref);
607 break;
608 } else if (unlikely(err))
609 goto free_out; 1075 goto free_out;
610 1076
611 if (je32_to_cpu(node->d.version) > *highest_version)
612 *highest_version = je32_to_cpu(node->d.version);
613
614 break; 1077 break;
615 1078
616 case JFFS2_NODETYPE_INODE: 1079 case JFFS2_NODETYPE_INODE:
617 1080
618 if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_inode)) { 1081 if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_inode)) {
619 err = read_more(c, ref, sizeof(struct jffs2_raw_inode), &len, buf, bufstart); 1082 err = read_more(c, ref, sizeof(struct jffs2_raw_inode), &len, buf);
620 if (unlikely(err)) 1083 if (unlikely(err))
621 goto free_out; 1084 goto free_out;
622 } 1085 }
623 1086
624 err = read_dnode(c, ref, &node->i, &ret_tn, len, latest_mctime, mctime_ver); 1087 err = read_dnode(c, ref, &node->i, len, rii);
625 if (err == 1) { 1088 if (unlikely(err))
626 jffs2_mark_node_obsolete(c, ref);
627 break;
628 } else if (unlikely(err))
629 goto free_out; 1089 goto free_out;
630 1090
631 if (je32_to_cpu(node->i.version) > *highest_version)
632 *highest_version = je32_to_cpu(node->i.version);
633
634 break; 1091 break;
635 1092
636 default: 1093 default:
637 if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_unknown_node)) { 1094 if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_unknown_node)) {
638 err = read_more(c, ref, sizeof(struct jffs2_unknown_node), &len, buf, bufstart); 1095 err = read_more(c, ref, sizeof(struct jffs2_unknown_node), &len, buf);
639 if (unlikely(err)) 1096 if (unlikely(err))
640 goto free_out; 1097 goto free_out;
641 } 1098 }
@@ -653,17 +1110,19 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
653 } 1110 }
654 1111
655 spin_unlock(&c->erase_completion_lock); 1112 spin_unlock(&c->erase_completion_lock);
656 *tnp = ret_tn;
657 *fdp = ret_fd;
658 kfree(buf); 1113 kfree(buf);
659 1114
1115 f->highest_version = rii->highest_version;
1116
660 dbg_readinode("nodes of inode #%u were read, the highest version is %u, latest_mctime %u, mctime_ver %u.\n", 1117 dbg_readinode("nodes of inode #%u were read, the highest version is %u, latest_mctime %u, mctime_ver %u.\n",
661 f->inocache->ino, *highest_version, *latest_mctime, *mctime_ver); 1118 f->inocache->ino, rii->highest_version, rii->latest_mctime,
1119 rii->mctime_ver);
662 return 0; 1120 return 0;
663 1121
664 free_out: 1122 free_out:
665 jffs2_free_tmp_dnode_info_list(&ret_tn); 1123 jffs2_free_tmp_dnode_info_list(&rii->tn_root);
666 jffs2_free_full_dirent_list(ret_fd); 1124 jffs2_free_full_dirent_list(rii->fds);
1125 rii->fds = NULL;
667 kfree(buf); 1126 kfree(buf);
668 return err; 1127 return err;
669} 1128}
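All of the per-read state that used to travel as five separate out-parameters now lives in the new struct jffs2_readinode_info, whose definition is not part of this hunk. Reconstructed purely from the fields this diff touches, it plausibly looks like the sketch below (stub forward declarations stand in for the kernel types; the authoritative definition lives in the JFFS2 headers and may differ in ordering or extra fields):

#include <stdint.h>

struct rb_node;
struct rb_root { struct rb_node *rb_node; };    /* stand-in for linux/rbtree.h */
struct jffs2_tmp_dnode_info;
struct jffs2_full_dirent;
struct jffs2_raw_node_ref;

struct jffs2_readinode_info {
        struct rb_root tn_root;                 /* tmp dnodes, offset-ordered */
        struct jffs2_tmp_dnode_info *mdata_tn;  /* candidate metadata node */
        uint32_t highest_version;
        uint32_t latest_mctime;
        uint32_t mctime_ver;
        struct jffs2_full_dirent *fds;          /* collected dirents */
        struct jffs2_raw_node_ref *latest_ref;  /* ref of highest valid version */
};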
@@ -672,20 +1131,17 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
672 struct jffs2_inode_info *f, 1131 struct jffs2_inode_info *f,
673 struct jffs2_raw_inode *latest_node) 1132 struct jffs2_raw_inode *latest_node)
674{ 1133{
675 struct jffs2_tmp_dnode_info *tn; 1134 struct jffs2_readinode_info rii;
676 struct rb_root tn_list; 1135 uint32_t crc, new_size;
677 struct rb_node *rb, *repl_rb;
678 struct jffs2_full_dirent *fd_list;
679 struct jffs2_full_dnode *fn, *first_fn = NULL;
680 uint32_t crc;
681 uint32_t latest_mctime, mctime_ver;
682 size_t retlen; 1136 size_t retlen;
683 int ret; 1137 int ret;
684 1138
685 dbg_readinode("ino #%u nlink is %d\n", f->inocache->ino, f->inocache->nlink); 1139 dbg_readinode("ino #%u nlink is %d\n", f->inocache->ino, f->inocache->nlink);
686 1140
1141 memset(&rii, 0, sizeof(rii));
1142
687 /* Grab all nodes relevant to this ino */ 1143 /* Grab all nodes relevant to this ino */
688 ret = jffs2_get_inode_nodes(c, f, &tn_list, &fd_list, &f->highest_version, &latest_mctime, &mctime_ver); 1144 ret = jffs2_get_inode_nodes(c, f, &rii);
689 1145
690 if (ret) { 1146 if (ret) {
691 JFFS2_ERROR("cannot read nodes for ino %u, returned error is %d\n", f->inocache->ino, ret); 1147 JFFS2_ERROR("cannot read nodes for ino %u, returned error is %d\n", f->inocache->ino, ret);
@@ -693,74 +1149,42 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
693 jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); 1149 jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT);
694 return ret; 1150 return ret;
695 } 1151 }
696 f->dents = fd_list;
697
698 rb = rb_first(&tn_list);
699 1152
700 while (rb) { 1153 ret = jffs2_build_inode_fragtree(c, f, &rii);
701 cond_resched(); 1154 if (ret) {
702 tn = rb_entry(rb, struct jffs2_tmp_dnode_info, rb); 1155 JFFS2_ERROR("Failed to build final fragtree for inode #%u: error %d\n",
703 fn = tn->fn; 1156 f->inocache->ino, ret);
704 ret = 1; 1157 if (f->inocache->state == INO_STATE_READING)
705 dbg_readinode("consider node ver %u, phys offset " 1158 jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT);
706 "%#08x(%d), range %u-%u.\n", tn->version, 1159 jffs2_free_tmp_dnode_info_list(&rii.tn_root);
707 ref_offset(fn->raw), ref_flags(fn->raw), 1160 /* FIXME: We could at least crc-check them all */
708 fn->ofs, fn->ofs + fn->size); 1161 if (rii.mdata_tn) {
709 1162 jffs2_free_full_dnode(rii.mdata_tn->fn);
710 if (fn->size) { 1163 jffs2_free_tmp_dnode_info(rii.mdata_tn);
711 ret = jffs2_add_older_frag_to_fragtree(c, f, tn); 1164 rii.mdata_tn = NULL;
712 /* TODO: the error code isn't checked, check it */ 1165 }
713 jffs2_dbg_fragtree_paranoia_check_nolock(f); 1166 return ret;
714 BUG_ON(ret < 0); 1167 }
715 if (!first_fn && ret == 0)
716 first_fn = fn;
717 } else if (!first_fn) {
718 first_fn = fn;
719 f->metadata = fn;
720 ret = 0; /* Prevent freeing the metadata update node */
721 } else
722 jffs2_mark_node_obsolete(c, fn->raw);
723
724 BUG_ON(rb->rb_left);
725 if (rb_parent(rb) && rb_parent(rb)->rb_left == rb) {
726 /* We were then left-hand child of our parent. We need
727 * to move our own right-hand child into our place. */
728 repl_rb = rb->rb_right;
729 if (repl_rb)
730 rb_set_parent(repl_rb, rb_parent(rb));
731 } else
732 repl_rb = NULL;
733
734 rb = rb_next(rb);
735
736 /* Remove the spent tn from the tree; don't bother rebalancing
737 * but put our right-hand child in our own place. */
738 if (rb_parent(&tn->rb)) {
739 if (rb_parent(&tn->rb)->rb_left == &tn->rb)
740 rb_parent(&tn->rb)->rb_left = repl_rb;
741 else if (rb_parent(&tn->rb)->rb_right == &tn->rb)
742 rb_parent(&tn->rb)->rb_right = repl_rb;
743 else BUG();
744 } else if (tn->rb.rb_right)
745 rb_set_parent(tn->rb.rb_right, NULL);
746 1168
747 jffs2_free_tmp_dnode_info(tn); 1169 if (rii.mdata_tn) {
748 if (ret) { 1170 if (rii.mdata_tn->fn->raw == rii.latest_ref) {
749 dbg_readinode("delete dnode %u-%u.\n", 1171 f->metadata = rii.mdata_tn->fn;
750 fn->ofs, fn->ofs + fn->size); 1172 jffs2_free_tmp_dnode_info(rii.mdata_tn);
751 jffs2_free_full_dnode(fn); 1173 } else {
1174 jffs2_kill_tn(c, rii.mdata_tn);
752 } 1175 }
1176 rii.mdata_tn = NULL;
753 } 1177 }
754 jffs2_dbg_fragtree_paranoia_check_nolock(f);
755 1178
756 BUG_ON(first_fn && ref_obsolete(first_fn->raw)); 1179 f->dents = rii.fds;
757 1180
758 fn = first_fn; 1181 jffs2_dbg_fragtree_paranoia_check_nolock(f);
759 if (unlikely(!first_fn)) { 1182
1183 if (unlikely(!rii.latest_ref)) {
760 /* No data nodes for this inode. */ 1184 /* No data nodes for this inode. */
761 if (f->inocache->ino != 1) { 1185 if (f->inocache->ino != 1) {
762 JFFS2_WARNING("no data nodes found for ino #%u\n", f->inocache->ino); 1186 JFFS2_WARNING("no data nodes found for ino #%u\n", f->inocache->ino);
763 if (!fd_list) { 1187 if (!rii.fds) {
764 if (f->inocache->state == INO_STATE_READING) 1188 if (f->inocache->state == INO_STATE_READING)
765 jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); 1189 jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT);
766 return -EIO; 1190 return -EIO;
@@ -778,7 +1202,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
778 return 0; 1202 return 0;
779 } 1203 }
780 1204
781 ret = jffs2_flash_read(c, ref_offset(fn->raw), sizeof(*latest_node), &retlen, (void *)latest_node); 1205 ret = jffs2_flash_read(c, ref_offset(rii.latest_ref), sizeof(*latest_node), &retlen, (void *)latest_node);
782 if (ret || retlen != sizeof(*latest_node)) { 1206 if (ret || retlen != sizeof(*latest_node)) {
783 JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n", 1207 JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n",
784 ret, retlen, sizeof(*latest_node)); 1208 ret, retlen, sizeof(*latest_node));
@@ -791,7 +1215,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
791 crc = crc32(0, latest_node, sizeof(*latest_node)-8); 1215 crc = crc32(0, latest_node, sizeof(*latest_node)-8);
792 if (crc != je32_to_cpu(latest_node->node_crc)) { 1216 if (crc != je32_to_cpu(latest_node->node_crc)) {
793 JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n", 1217 JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n",
794 f->inocache->ino, ref_offset(fn->raw)); 1218 f->inocache->ino, ref_offset(rii.latest_ref));
795 up(&f->sem); 1219 up(&f->sem);
796 jffs2_do_clear_inode(c, f); 1220 jffs2_do_clear_inode(c, f);
797 return -EIO; 1221 return -EIO;
@@ -799,17 +1223,22 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
799 1223
800 switch(jemode_to_cpu(latest_node->mode) & S_IFMT) { 1224 switch(jemode_to_cpu(latest_node->mode) & S_IFMT) {
801 case S_IFDIR: 1225 case S_IFDIR:
802 if (mctime_ver > je32_to_cpu(latest_node->version)) { 1226 if (rii.mctime_ver > je32_to_cpu(latest_node->version)) {
803 /* The times in the latest_node are actually older than 1227 /* The times in the latest_node are actually older than
804 mctime in the latest dirent. Cheat. */ 1228 mctime in the latest dirent. Cheat. */
805 latest_node->ctime = latest_node->mtime = cpu_to_je32(latest_mctime); 1229 latest_node->ctime = latest_node->mtime = cpu_to_je32(rii.latest_mctime);
806 } 1230 }
807 break; 1231 break;
808 1232
809 1233
810 case S_IFREG: 1234 case S_IFREG:
811 /* If it was a regular file, truncate it to the latest node's isize */ 1235 /* If it was a regular file, truncate it to the latest node's isize */
812 jffs2_truncate_fragtree(c, &f->fragtree, je32_to_cpu(latest_node->isize)); 1236 new_size = jffs2_truncate_fragtree(c, &f->fragtree, je32_to_cpu(latest_node->isize));
1237 if (new_size != je32_to_cpu(latest_node->isize)) {
1238 JFFS2_WARNING("Truncating ino #%u to %d bytes failed because it only had %d bytes to start with!\n",
1239 f->inocache->ino, je32_to_cpu(latest_node->isize), new_size);
1240 latest_node->isize = cpu_to_je32(new_size);
1241 }
813 break; 1242 break;
814 1243
815 case S_IFLNK: 1244 case S_IFLNK:
@@ -832,7 +1261,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
832 return -ENOMEM; 1261 return -ENOMEM;
833 } 1262 }
834 1263
835 ret = jffs2_flash_read(c, ref_offset(fn->raw) + sizeof(*latest_node), 1264 ret = jffs2_flash_read(c, ref_offset(rii.latest_ref) + sizeof(*latest_node),
836 je32_to_cpu(latest_node->csize), &retlen, (char *)f->target); 1265 je32_to_cpu(latest_node->csize), &retlen, (char *)f->target);
837 1266
838 if (ret || retlen != je32_to_cpu(latest_node->csize)) { 1267 if (ret || retlen != je32_to_cpu(latest_node->csize)) {
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 7fb45bd4915c..2a1c976c7924 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -1,15 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: scan.c,v 1.125 2005/09/30 13:59:13 dedekind Exp $
11 *
12 */ 10 */
11
13#include <linux/kernel.h> 12#include <linux/kernel.h>
14#include <linux/sched.h> 13#include <linux/sched.h>
15#include <linux/slab.h> 14#include <linux/slab.h>
@@ -636,16 +635,17 @@ scan_more:
636 635
637 if (*(uint32_t *)(&buf[ofs-buf_ofs]) == 0xffffffff) { 636 if (*(uint32_t *)(&buf[ofs-buf_ofs]) == 0xffffffff) {
638 uint32_t inbuf_ofs; 637 uint32_t inbuf_ofs;
639 uint32_t empty_start; 638 uint32_t empty_start, scan_end;
640 639
641 empty_start = ofs; 640 empty_start = ofs;
642 ofs += 4; 641 ofs += 4;
642 scan_end = min_t(uint32_t, EMPTY_SCAN_SIZE(c->sector_size)/8, buf_len);
643 643
644 D1(printk(KERN_DEBUG "Found empty flash at 0x%08x\n", ofs)); 644 D1(printk(KERN_DEBUG "Found empty flash at 0x%08x\n", ofs));
645 more_empty: 645 more_empty:
646 inbuf_ofs = ofs - buf_ofs; 646 inbuf_ofs = ofs - buf_ofs;
647 while (inbuf_ofs < buf_len) { 647 while (inbuf_ofs < scan_end) {
648 if (*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff) { 648 if (unlikely(*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff)) {
649 printk(KERN_WARNING "Empty flash at 0x%08x ends at 0x%08x\n", 649 printk(KERN_WARNING "Empty flash at 0x%08x ends at 0x%08x\n",
650 empty_start, ofs); 650 empty_start, ofs);
651 if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start))) 651 if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start)))
@@ -666,7 +666,11 @@ scan_more:
666 D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE(c->sector_size))); 666 D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE(c->sector_size)));
667 return BLK_STATE_CLEANMARKER; 667 return BLK_STATE_CLEANMARKER;
668 } 668 }
669 669 if (!buf_size && (scan_end != buf_len)) {/* XIP/point case */
670 scan_end = buf_len;
671 goto more_empty;
672 }
673
670 /* See how much more there is to read in this eraseblock... */ 674 /* See how much more there is to read in this eraseblock... */
671 buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); 675 buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
672 if (!buf_len) { 676 if (!buf_len) {
@@ -676,6 +680,8 @@ scan_more:
676 empty_start)); 680 empty_start));
677 break; 681 break;
678 } 682 }
683 /* point never reaches here */
684 scan_end = buf_len;
679 D1(printk(KERN_DEBUG "Reading another 0x%x at 0x%08x\n", buf_len, ofs)); 685 D1(printk(KERN_DEBUG "Reading another 0x%x at 0x%08x\n", buf_len, ofs));
680 err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); 686 err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
681 if (err) 687 if (err)
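The scan above walks candidate empty space a 32-bit word at a time, bailing as soon as a word is not 0xffffffff. A hedged standalone sketch of that check (helper name hypothetical; memcpy sidesteps the unaligned access the scan code avoids by construction):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* return 1 if the first len bytes read as erased flash (all 0xff) */
static int region_is_erased(const unsigned char *buf, uint32_t len)
{
        uint32_t i, word;

        for (i = 0; i + 4 <= len; i += 4) {
                memcpy(&word, buf + i, 4);
                if (word != 0xffffffffu)
                        return 0;
        }
        return 1;
}

int main(void)
{
        unsigned char buf[16];

        memset(buf, 0xff, sizeof(buf));
        printf("%d\n", region_is_erased(buf, sizeof(buf)));     /* 1 */
        buf[9] = 0x42;
        printf("%d\n", region_is_erased(buf, sizeof(buf)));     /* 0 */
        return 0;
}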
@@ -734,18 +740,8 @@ scan_more:
734 ofs += 4; 740 ofs += 4;
735 continue; 741 continue;
736 } 742 }
737 /* Due to poor choice of crc32 seed, an all-zero node will have a correct CRC */
738 if (!je32_to_cpu(node->hdr_crc) && !je16_to_cpu(node->nodetype) &&
739 !je16_to_cpu(node->magic) && !je32_to_cpu(node->totlen)) {
740 noisy_printk(&noise, "jffs2_scan_eraseblock(): All zero node header at 0x%08x.\n", ofs);
741 if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
742 return err;
743 ofs += 4;
744 continue;
745 }
746 743
747 if (ofs + je32_to_cpu(node->totlen) > 744 if (ofs + je32_to_cpu(node->totlen) > jeb->offset + c->sector_size) {
748 jeb->offset + c->sector_size) {
749 /* Eep. Node goes over the end of the erase block. */ 745 /* Eep. Node goes over the end of the erase block. */
750 printk(KERN_WARNING "Node at 0x%08x with length 0x%08x would run over the end of the erase block\n", 746 printk(KERN_WARNING "Node at 0x%08x with length 0x%08x would run over the end of the erase block\n",
751 ofs, je32_to_cpu(node->totlen)); 747 ofs, je32_to_cpu(node->totlen));
@@ -952,8 +948,7 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
952 struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s) 948 struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s)
953{ 949{
954 struct jffs2_inode_cache *ic; 950 struct jffs2_inode_cache *ic;
955 uint32_t ino = je32_to_cpu(ri->ino); 951 uint32_t crc, ino = je32_to_cpu(ri->ino);
956 int err;
957 952
958 D1(printk(KERN_DEBUG "jffs2_scan_inode_node(): Node at 0x%08x\n", ofs)); 953 D1(printk(KERN_DEBUG "jffs2_scan_inode_node(): Node at 0x%08x\n", ofs));
959 954
@@ -966,21 +961,22 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
966 Which means that the _full_ amount of time to get to proper write mode with GC 961 Which means that the _full_ amount of time to get to proper write mode with GC
967 operational may actually be _longer_ than before. Sucks to be me. */ 962 operational may actually be _longer_ than before. Sucks to be me. */
968 963
964 /* Check the node CRC in any case. */
965 crc = crc32(0, ri, sizeof(*ri)-8);
966 if (crc != je32_to_cpu(ri->node_crc)) {
967 printk(KERN_NOTICE "jffs2_scan_inode_node(): CRC failed on "
968 "node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
969 ofs, je32_to_cpu(ri->node_crc), crc);
970 /*
971 * We believe totlen because the CRC on the node
972 * _header_ was OK, just the node itself failed.
973 */
974 return jffs2_scan_dirty_space(c, jeb,
975 PAD(je32_to_cpu(ri->totlen)));
976 }
977
969 ic = jffs2_get_ino_cache(c, ino); 978 ic = jffs2_get_ino_cache(c, ino);
970 if (!ic) { 979 if (!ic) {
971 /* Inocache get failed. Either we read a bogus ino# or it's just genuinely the
972 first node we found for this inode. Do a CRC check to protect against the former
973 case */
974 uint32_t crc = crc32(0, ri, sizeof(*ri)-8);
975
976 if (crc != je32_to_cpu(ri->node_crc)) {
977 printk(KERN_NOTICE "jffs2_scan_inode_node(): CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
978 ofs, je32_to_cpu(ri->node_crc), crc);
979 /* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */
980 if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(ri->totlen)))))
981 return err;
982 return 0;
983 }
984 ic = jffs2_scan_make_ino_cache(c, ino); 980 ic = jffs2_scan_make_ino_cache(c, ino);
985 if (!ic) 981 if (!ic)
986 return -ENOMEM; 982 return -ENOMEM;
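The sizeof(*ri)-8 in the unconditional CRC check above excludes the two trailing 32-bit CRC fields of the raw inode, so the node CRC covers everything before them. A standalone sketch of that layout convention with a toy node and zlib's crc32() (the toy struct and values are illustrative, and zlib's crc32 conventions differ from the kernel's -- the coverage is the point):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>

struct toy_node {               /* illustrative, not the real jffs2_raw_inode */
        uint32_t magic;
        uint32_t version;
        uint32_t isize;
        uint32_t data_crc;      /* trailing CRC field */
        uint32_t node_crc;      /* trailing CRC field */
};

int main(void)
{
        struct toy_node n;

        memset(&n, 0, sizeof(n));
        n.version = 42;
        n.isize = 4096;
        /* cover everything up to, but not including, the two CRCs */
        n.node_crc = (uint32_t)crc32(0L, (const Bytef *)&n, sizeof(n) - 8);
        printf("node_crc = 0x%08x\n", (unsigned)n.node_crc);
        return 0;
}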
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 52a9894a6364..bc9f6ba10823 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 30f888414ce7..d828b296392a 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -1,16 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 4 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
5 * Zoltan Sogor <weth@inf.u-szeged.hu>, 5 * Zoltan Sogor <weth@inf.u-szeged.hu>,
6 * Patrik Kluba <pajko@halom.u-szeged.hu>, 6 * Patrik Kluba <pajko@halom.u-szeged.hu>,
7 * University of Szeged, Hungary 7 * University of Szeged, Hungary
8 * 2006 KaiGai Kohei <kaigai@ak.jp.nec.com> 8 * 2006 KaiGai Kohei <kaigai@ak.jp.nec.com>
9 * 9 *
10 * For licensing information, see the file 'LICENCE' in this directory. 10 * For licensing information, see the file 'LICENCE' in this directory.
11 * 11 *
12 * $Id: summary.c,v 1.4 2005/09/26 11:37:21 havasi Exp $
13 *
14 */ 12 */
15 13
16#include <linux/kernel.h> 14#include <linux/kernel.h>
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index 6bf1f6aa4552..0c6669e21390 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -1,15 +1,13 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 4 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
5 * Zoltan Sogor <weth@inf.u-szeged.hu>, 5 * Zoltan Sogor <weth@inf.u-szeged.hu>,
6 * Patrik Kluba <pajko@halom.u-szeged.hu>, 6 * Patrik Kluba <pajko@halom.u-szeged.hu>,
7 * University of Szeged, Hungary 7 * University of Szeged, Hungary
8 * 8 *
9 * For licensing information, see the file 'LICENCE' in this directory. 9 * For licensing information, see the file 'LICENCE' in this directory.
10 * 10 *
11 * $Id: summary.h,v 1.2 2005/09/26 11:37:21 havasi Exp $
12 *
13 */ 11 */
14 12
15#ifndef JFFS2_SUMMARY_H 13#ifndef JFFS2_SUMMARY_H
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index cc7e8e71ad46..45368f8bbe72 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: super.c,v 1.110 2005/11/07 11:14:42 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -49,8 +47,7 @@ static void jffs2_i_init_once(void * foo, struct kmem_cache * cachep, unsigned l
49{ 47{
50 struct jffs2_inode_info *ei = (struct jffs2_inode_info *) foo; 48 struct jffs2_inode_info *ei = (struct jffs2_inode_info *) foo;
51 49
52 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 50 if (flags & SLAB_CTOR_CONSTRUCTOR) {
53 SLAB_CTOR_CONSTRUCTOR) {
54 init_MUTEX(&ei->sem); 51 init_MUTEX(&ei->sem);
55 inode_init_once(&ei->vfs_inode); 52 inode_init_once(&ei->vfs_inode);
56 } 53 }
@@ -347,7 +344,7 @@ static int __init init_jffs2_fs(void)
347#ifdef CONFIG_JFFS2_SUMMARY 344#ifdef CONFIG_JFFS2_SUMMARY
348 " (SUMMARY) " 345 " (SUMMARY) "
349#endif 346#endif
350 " (C) 2001-2006 Red Hat, Inc.\n"); 347 " © 2001-2006 Red Hat, Inc.\n");
351 348
352 jffs2_inode_cachep = kmem_cache_create("jffs2_i", 349 jffs2_inode_cachep = kmem_cache_create("jffs2_i",
353 sizeof(struct jffs2_inode_info), 350 sizeof(struct jffs2_inode_info),
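
With SLAB_CTOR_VERIFY gone from the slab constructor flags, the two-flag test in init_once() collapses to a plain bit test, a change repeated for jfs, locks.c, minix, ncpfs and nfs below. A minimal userspace sketch of why the two predicates agree once VERIFY can never be passed (flag values are illustrative, not the kernel's):

#include <assert.h>
#include <stdio.h>

/* Illustrative flag values; the real SLAB_CTOR_* constants live in
 * the kernel's slab headers. */
#define SLAB_CTOR_CONSTRUCTOR 0x1UL
#define SLAB_CTOR_VERIFY      0x2UL

/* Old predicate: constructor requested and not a verify pass. */
static int old_check(unsigned long flags)
{
    return (flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
           SLAB_CTOR_CONSTRUCTOR;
}

/* New predicate: once callers can no longer pass SLAB_CTOR_VERIFY,
 * testing the constructor bit alone is equivalent. */
static int new_check(unsigned long flags)
{
    return !!(flags & SLAB_CTOR_CONSTRUCTOR);
}

int main(void)
{
    unsigned long flags;

    /* Only inputs without the retired VERIFY bit can occur now. */
    for (flags = 0; flags < 2; flags++)
        assert(old_check(flags) == new_check(flags));
    printf("predicates agree once SLAB_CTOR_VERIFY is never set\n");
    return 0;
}
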
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 7e4882c8a7ed..b7339c3b6ad9 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -1,17 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001, 2002 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: symlink.c,v 1.19 2005/11/07 11:14:42 gleixner Exp $
11 *
12 */ 10 */
13 11
14
15#include <linux/kernel.h> 12#include <linux/kernel.h>
16#include <linux/slab.h> 13#include <linux/slab.h>
17#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4fac6dd53954..c556e85a565c 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1,16 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright (C) 2004 Thomas Gleixner <tglx@linutronix.de> 5 * Copyright © 2004 Thomas Gleixner <tglx@linutronix.de>
6 * 6 *
7 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
8 * Modified, debugged and enhanced by Thomas Gleixner <tglx@linutronix.de> 8 * Modified, debugged and enhanced by Thomas Gleixner <tglx@linutronix.de>
9 * 9 *
10 * For licensing information, see the file 'LICENCE' in this directory. 10 * For licensing information, see the file 'LICENCE' in this directory.
11 * 11 *
12 * $Id: wbuf.c,v 1.100 2005/09/30 13:59:13 dedekind Exp $
13 *
14 */ 12 */
15 13
16#include <linux/kernel.h> 14#include <linux/kernel.h>
@@ -345,6 +343,9 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
345 return; 343 return;
346 } 344 }
347 345
346 /* The summary is not recovered, so it must be disabled for this erase block */
347 jffs2_sum_disable_collecting(c->summary);
348
348 ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile); 349 ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile);
349 if (ret) { 350 if (ret) {
350 printk(KERN_WARNING "Failed to allocate node refs for wbuf recovery. Data loss ensues.\n"); 351 printk(KERN_WARNING "Failed to allocate node refs for wbuf recovery. Data loss ensues.\n");
@@ -967,9 +968,9 @@ exit:
967 968
968static const struct jffs2_unknown_node oob_cleanmarker = 969static const struct jffs2_unknown_node oob_cleanmarker =
969{ 970{
970 .magic = cpu_to_je16(JFFS2_MAGIC_BITMASK), 971 .magic = constant_cpu_to_je16(JFFS2_MAGIC_BITMASK),
971 .nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER), 972 .nodetype = constant_cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER),
972 .totlen = cpu_to_je32(8) 973 .totlen = constant_cpu_to_je32(8)
973}; 974};
974 975
975/* 976/*
@@ -1208,3 +1209,27 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
1208void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { 1209void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) {
1209 kfree(c->wbuf); 1210 kfree(c->wbuf);
1210} 1211}
1212
1213int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
1214 c->cleanmarker_size = 0;
1215
1216 if (c->mtd->writesize == 1)
1217 /* We do not need write-buffer */
1218 return 0;
1219
1220 init_rwsem(&c->wbuf_sem);
1221
1222 c->wbuf_pagesize = c->mtd->writesize;
1223 c->wbuf_ofs = 0xFFFFFFFF;
1224 c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
1225 if (!c->wbuf)
1226 return -ENOMEM;
1227
1228 printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size);
1229
1230 return 0;
1231}
1232
1233void jffs2_ubivol_cleanup(struct jffs2_sb_info *c) {
1234 kfree(c->wbuf);
1235}
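
Two things happen in the wbuf.c hunks above: wbuf recovery now disables summary collection for the erase block it abandons, and the static oob_cleanmarker initializer switches to constant_cpu_to_je16()/constant_cpu_to_je32(), which fold the byte-order conversion at compile time so the expression remains a valid static initializer. A rough userspace model of the constant-swap idea (the real helpers also honour the configured JFFS2 byte order and wrap the value in jint16_t):

#include <stdint.h>
#include <stdio.h>

/* Compile-time 16-bit swap; a constant expression, unlike a call to
 * an inline conversion function. */
#define CONST_SWAB16(x) ((uint16_t)((((x) & 0x00ffU) << 8) | \
                                    (((x) & 0xff00U) >> 8)))

/* 0x1985 is JFFS2_MAGIC_BITMASK; usable in a static initializer
 * because the macro folds during compilation. */
static const uint16_t cleanmarker_magic = CONST_SWAB16(0x1985);

int main(void)
{
    printf("swapped magic: 0x%04x\n", cleanmarker_magic);
    return 0;
}
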
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index 67176792e138..c9fe0ab3a329 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: write.c,v 1.97 2005/11/07 11:14:42 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -507,8 +505,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
507 uint32_t alloclen; 505 uint32_t alloclen;
508 int ret; 506 int ret;
509 507
510 if (1 /* alternative branch needs testing */ || 508 if (!jffs2_can_mark_obsolete(c)) {
511 !jffs2_can_mark_obsolete(c)) {
512 /* We can't mark stuff obsolete on the medium. We need to write a deletion dirent */ 509 /* We can't mark stuff obsolete on the medium. We need to write a deletion dirent */
513 510
514 rd = jffs2_alloc_raw_dirent(); 511 rd = jffs2_alloc_raw_dirent();
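
The jffs2_do_unlink() hunk removes a "1 || ..." guard that had pinned execution to the write-a-deletion-dirent fallback while the alternative was untested; with it gone, media that support marking nodes obsolete in place take that cheaper branch again. The forced-branch idiom in miniature (jffs2_can_mark_obsolete() reduced to a stub):

#include <stdio.h>

static int can_mark_obsolete = 1;   /* stub for jffs2_can_mark_obsolete(c) */

int main(void)
{
    /* Before: the constant 1 short-circuits the real condition. */
    if (1 /* alternative branch needs testing */ || !can_mark_obsolete)
        printf("before: always writes a deletion dirent\n");

    /* After: the condition decides again. */
    if (!can_mark_obsolete)
        printf("after: deletion dirent path\n");
    else
        printf("after: marks the old node obsolete in place\n");
    return 0;
}
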
diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c
index c638ae1008de..b9276b11bac6 100644
--- a/fs/jffs2/writev.c
+++ b/fs/jffs2/writev.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001, 2002 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: writev.c,v 1.8 2005/09/09 15:11:58 havasi Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 4bb3f1897330..78fc08893a6c 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 06a5c69dcf8b..3b0ff2925937 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11#ifndef _JFFS2_FS_XATTR_H_ 12#ifndef _JFFS2_FS_XATTR_H_
12#define _JFFS2_FS_XATTR_H_ 13#define _JFFS2_FS_XATTR_H_
13 14
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index ed046e19dbfa..8ec5765ef348 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13#include <linux/jffs2.h> 14#include <linux/jffs2.h>
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 2f8e9aa01ea0..40942bc516bb 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13#include <linux/jffs2.h> 14#include <linux/jffs2.h>
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 58deae007507..6b3acb0b5781 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -184,8 +184,7 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
184{ 184{
185 struct metapage *mp = (struct metapage *)foo; 185 struct metapage *mp = (struct metapage *)foo;
186 186
187 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 187 if (flags & SLAB_CTOR_CONSTRUCTOR) {
188 SLAB_CTOR_CONSTRUCTOR) {
189 mp->lid = 0; 188 mp->lid = 0;
190 mp->lsn = 0; 189 mp->lsn = 0;
191 mp->flag = 0; 190 mp->flag = 0;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 52d73d54a931..ea9dc3e65dcf 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -752,8 +752,7 @@ static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags
752{ 752{
753 struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; 753 struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo;
754 754
755 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == 755 if (flags & SLAB_CTOR_CONSTRUCTOR) {
756 SLAB_CTOR_CONSTRUCTOR) {
757 memset(jfs_ip, 0, sizeof(struct jfs_inode_info)); 756 memset(jfs_ip, 0, sizeof(struct jfs_inode_info));
758 INIT_LIST_HEAD(&jfs_ip->anon_inode_list); 757 INIT_LIST_HEAD(&jfs_ip->anon_inode_list);
759 init_rwsem(&jfs_ip->rdwrlock); 758 init_rwsem(&jfs_ip->rdwrlock);
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index eb243edf8932..2102e2d0134d 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -225,16 +225,13 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
225#define SM_monres_sz 2 225#define SM_monres_sz 2
226#define SM_unmonres_sz 1 226#define SM_unmonres_sz 1
227 227
228#ifndef MAX
229# define MAX(a, b) (((a) > (b))? (a) : (b))
230#endif
231
232static struct rpc_procinfo nsm_procedures[] = { 228static struct rpc_procinfo nsm_procedures[] = {
233[SM_MON] = { 229[SM_MON] = {
234 .p_proc = SM_MON, 230 .p_proc = SM_MON,
235 .p_encode = (kxdrproc_t) xdr_encode_mon, 231 .p_encode = (kxdrproc_t) xdr_encode_mon,
236 .p_decode = (kxdrproc_t) xdr_decode_stat_res, 232 .p_decode = (kxdrproc_t) xdr_decode_stat_res,
237 .p_bufsiz = MAX(SM_mon_sz, SM_monres_sz) << 2, 233 .p_arglen = SM_mon_sz,
234 .p_replen = SM_monres_sz,
238 .p_statidx = SM_MON, 235 .p_statidx = SM_MON,
239 .p_name = "MONITOR", 236 .p_name = "MONITOR",
240 }, 237 },
@@ -242,7 +239,8 @@ static struct rpc_procinfo nsm_procedures[] = {
242 .p_proc = SM_UNMON, 239 .p_proc = SM_UNMON,
243 .p_encode = (kxdrproc_t) xdr_encode_unmon, 240 .p_encode = (kxdrproc_t) xdr_encode_unmon,
244 .p_decode = (kxdrproc_t) xdr_decode_stat, 241 .p_decode = (kxdrproc_t) xdr_decode_stat,
245 .p_bufsiz = MAX(SM_mon_id_sz, SM_unmonres_sz) << 2, 242 .p_arglen = SM_mon_id_sz,
243 .p_replen = SM_unmonres_sz,
246 .p_statidx = SM_UNMON, 244 .p_statidx = SM_UNMON,
247 .p_name = "UNMONITOR", 245 .p_name = "UNMONITOR",
248 }, 246 },
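
The nsm_procedures[] change is one instance of a tree-wide switch, continued in the lockd and nfs XDR files below, from a single p_bufsiz (bytes, sized to the larger of call and reply) to separate p_arglen and p_replen fields counted in 32-bit XDR words, which lets the RPC layer size send and receive buffers independently and retires the local MAX() macros. A sketch of the sizing change (struct modelled loosely on sunrpc's rpc_procinfo; SM_mon_sz is illustrative, SM_monres_sz matches the define visible above):

#include <stdio.h>

struct procinfo_sketch {
    unsigned int p_arglen;      /* call size, 32-bit XDR words */
    unsigned int p_replen;      /* reply size, 32-bit XDR words */
};

#define SM_mon_sz    11         /* illustrative, not the kernel value */
#define SM_monres_sz 2

int main(void)
{
    struct procinfo_sketch mon = {
        .p_arglen = SM_mon_sz,
        .p_replen = SM_monres_sz,
    };

    /* Old scheme: one buffer sized for the larger direction. */
    unsigned int old_bufsiz =
        (mon.p_arglen > mon.p_replen ? mon.p_arglen : mon.p_replen) << 2;

    printf("old p_bufsiz = %u bytes\n", old_bufsiz);
    printf("new: %u-word call, %u-word reply\n", mon.p_arglen, mon.p_replen);
    return 0;
}
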
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 34dae5d70738..9702956d206c 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -510,17 +510,20 @@ nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
510 return 0; 510 return 0;
511} 511}
512 512
513#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
514# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
515#endif
516
513/* 517/*
514 * Buffer requirements for NLM 518 * Buffer requirements for NLM
515 */ 519 */
516#define NLM_void_sz 0 520#define NLM_void_sz 0
517#define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN) 521#define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
518#define NLM_caller_sz 1+XDR_QUADLEN(sizeof(utsname()->nodename)) 522#define NLM_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
519#define NLM_netobj_sz 1+XDR_QUADLEN(XDR_MAX_NETOBJ) 523#define NLM_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
520/* #define NLM_owner_sz 1+XDR_QUADLEN(NLM_MAXOWNER) */
521#define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE) 524#define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE)
522#define NLM_lock_sz 3+NLM_caller_sz+NLM_netobj_sz+NLM_fhandle_sz 525#define NLM_lock_sz 3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz
523#define NLM_holder_sz 4+NLM_netobj_sz 526#define NLM_holder_sz 4+NLM_owner_sz
524 527
525#define NLM_testargs_sz NLM_cookie_sz+1+NLM_lock_sz 528#define NLM_testargs_sz NLM_cookie_sz+1+NLM_lock_sz
526#define NLM_lockargs_sz NLM_cookie_sz+4+NLM_lock_sz 529#define NLM_lockargs_sz NLM_cookie_sz+4+NLM_lock_sz
@@ -531,10 +534,6 @@ nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
531#define NLM_res_sz NLM_cookie_sz+1 534#define NLM_res_sz NLM_cookie_sz+1
532#define NLM_norep_sz 0 535#define NLM_norep_sz 0
533 536
534#ifndef MAX
535# define MAX(a, b) (((a) > (b))? (a) : (b))
536#endif
537
538/* 537/*
539 * For NLM, a void procedure really returns nothing 538 * For NLM, a void procedure really returns nothing
540 */ 539 */
@@ -545,7 +544,8 @@ nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
545 .p_proc = NLMPROC_##proc, \ 544 .p_proc = NLMPROC_##proc, \
546 .p_encode = (kxdrproc_t) nlmclt_encode_##argtype, \ 545 .p_encode = (kxdrproc_t) nlmclt_encode_##argtype, \
547 .p_decode = (kxdrproc_t) nlmclt_decode_##restype, \ 546 .p_decode = (kxdrproc_t) nlmclt_decode_##restype, \
548 .p_bufsiz = MAX(NLM_##argtype##_sz, NLM_##restype##_sz) << 2, \ 547 .p_arglen = NLM_##argtype##_sz, \
548 .p_replen = NLM_##restype##_sz, \
549 .p_statidx = NLMPROC_##proc, \ 549 .p_statidx = NLMPROC_##proc, \
550 .p_name = #proc, \ 550 .p_name = #proc, \
551 } 551 }
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index a78240551219..ce1efdbe1b3a 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -516,17 +516,24 @@ nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
516 return 0; 516 return 0;
517} 517}
518 518
519#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
520# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
521#endif
522
523#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
524# error "NLM host name cannot be larger than NLM's maximum string length!"
525#endif
526
519/* 527/*
520 * Buffer requirements for NLM 528 * Buffer requirements for NLM
521 */ 529 */
522#define NLM4_void_sz 0 530#define NLM4_void_sz 0
523#define NLM4_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN) 531#define NLM4_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
524#define NLM4_caller_sz 1+XDR_QUADLEN(NLM_MAXSTRLEN) 532#define NLM4_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
525#define NLM4_netobj_sz 1+XDR_QUADLEN(XDR_MAX_NETOBJ) 533#define NLM4_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
526/* #define NLM4_owner_sz 1+XDR_QUADLEN(NLM4_MAXOWNER) */
527#define NLM4_fhandle_sz 1+XDR_QUADLEN(NFS3_FHSIZE) 534#define NLM4_fhandle_sz 1+XDR_QUADLEN(NFS3_FHSIZE)
528#define NLM4_lock_sz 5+NLM4_caller_sz+NLM4_netobj_sz+NLM4_fhandle_sz 535#define NLM4_lock_sz 5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz
529#define NLM4_holder_sz 6+NLM4_netobj_sz 536#define NLM4_holder_sz 6+NLM4_owner_sz
530 537
531#define NLM4_testargs_sz NLM4_cookie_sz+1+NLM4_lock_sz 538#define NLM4_testargs_sz NLM4_cookie_sz+1+NLM4_lock_sz
532#define NLM4_lockargs_sz NLM4_cookie_sz+4+NLM4_lock_sz 539#define NLM4_lockargs_sz NLM4_cookie_sz+4+NLM4_lock_sz
@@ -537,10 +544,6 @@ nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
537#define NLM4_res_sz NLM4_cookie_sz+1 544#define NLM4_res_sz NLM4_cookie_sz+1
538#define NLM4_norep_sz 0 545#define NLM4_norep_sz 0
539 546
540#ifndef MAX
541# define MAX(a,b) (((a) > (b))? (a) : (b))
542#endif
543
544/* 547/*
545 * For NLM, a void procedure really returns nothing 548 * For NLM, a void procedure really returns nothing
546 */ 549 */
@@ -551,7 +554,8 @@ nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
551 .p_proc = NLMPROC_##proc, \ 554 .p_proc = NLMPROC_##proc, \
552 .p_encode = (kxdrproc_t) nlm4clt_encode_##argtype, \ 555 .p_encode = (kxdrproc_t) nlm4clt_encode_##argtype, \
553 .p_decode = (kxdrproc_t) nlm4clt_decode_##restype, \ 556 .p_decode = (kxdrproc_t) nlm4clt_decode_##restype, \
554 .p_bufsiz = MAX(NLM4_##argtype##_sz, NLM4_##restype##_sz) << 2, \ 557 .p_arglen = NLM4_##argtype##_sz, \
558 .p_replen = NLM4_##restype##_sz, \
555 .p_statidx = NLMPROC_##proc, \ 559 .p_statidx = NLMPROC_##proc, \
556 .p_name = #proc, \ 560 .p_name = #proc, \
557 } 561 }
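
The #if/#error pairs added to xdr.c and xdr4.c are preprocessor-time assertions: should NLMCLNT_OHSIZE ever exceed the XDR netobj limit (or, for NLMv4, NLM's maximum string length), the build fails immediately rather than truncating owner strings at run time. The same pattern with stand-in sizes:

#include <stdio.h>

#define XDR_MAX_NETOBJ 1024     /* illustrative, not the kernel value */
#define NLMCLNT_OHSIZE 256      /* illustrative */

#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
#endif

int main(void)
{
    printf("owner size checked at compile time\n");
    return 0;
}
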
diff --git a/fs/locks.c b/fs/locks.c
index 53b0cd153202..671a034dc999 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -203,8 +203,7 @@ static void init_once(void *foo, struct kmem_cache *cache, unsigned long flags)
203{ 203{
204 struct file_lock *lock = (struct file_lock *) foo; 204 struct file_lock *lock = (struct file_lock *) foo;
205 205
206 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) != 206 if (!(flags & SLAB_CTOR_CONSTRUCTOR))
207 SLAB_CTOR_CONSTRUCTOR)
208 return; 207 return;
209 208
210 locks_init_lock(lock); 209 locks_init_lock(lock);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index cb4cb571fddf..e207cbe70951 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -65,7 +65,6 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
65 struct address_space *mapping = dir->i_mapping; 65 struct address_space *mapping = dir->i_mapping;
66 struct page *page = read_mapping_page(mapping, n, NULL); 66 struct page *page = read_mapping_page(mapping, n, NULL);
67 if (!IS_ERR(page)) { 67 if (!IS_ERR(page)) {
68 wait_on_page_locked(page);
69 kmap(page); 68 kmap(page);
70 if (!PageUptodate(page)) 69 if (!PageUptodate(page))
71 goto fail; 70 goto fail;
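
dir_get_page() can drop wait_on_page_locked() because read_mapping_page() in this series waits for the read itself and reports failure through an error pointer, which is also why page_getlink() in the namei.c hunk below simply returns the failed page pointer. A userspace model of the ERR_PTR encoding those callers rely on (constants illustrative):

#include <stdio.h>

#define MAX_ERRNO 4095
#define EIO 5

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
    /* Errnos occupy the top MAX_ERRNO addresses of the pointer space. */
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *read_page_stub(int fail)
{
    static char page[4096];
    return fail ? ERR_PTR(-EIO) : (void *)page;
}

int main(void)
{
    void *page = read_page_stub(1);

    if (IS_ERR(page))
        printf("read failed: errno %ld\n", -PTR_ERR(page));
    return 0;
}
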
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 92e383af3709..2f4d43a2a310 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -73,8 +73,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
73{ 73{
74 struct minix_inode_info *ei = (struct minix_inode_info *) foo; 74 struct minix_inode_info *ei = (struct minix_inode_info *) foo;
75 75
76 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 76 if (flags & SLAB_CTOR_CONSTRUCTOR)
77 SLAB_CTOR_CONSTRUCTOR)
78 inode_init_once(&ei->vfs_inode); 77 inode_init_once(&ei->vfs_inode);
79} 78}
80 79
diff --git a/fs/namei.c b/fs/namei.c
index ee60cc4d3453..94b2f60aec22 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1243,22 +1243,13 @@ int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
1243 return err; 1243 return err;
1244} 1244}
1245 1245
1246/* 1246static inline struct dentry *__lookup_hash_kern(struct qstr *name, struct dentry *base, struct nameidata *nd)
1247 * Restricted form of lookup. Doesn't follow links, single-component only,
1248 * needs parent already locked. Doesn't follow mounts.
1249 * SMP-safe.
1250 */
1251static struct dentry * __lookup_hash(struct qstr *name, struct dentry * base, struct nameidata *nd)
1252{ 1247{
1253 struct dentry * dentry; 1248 struct dentry *dentry;
1254 struct inode *inode; 1249 struct inode *inode;
1255 int err; 1250 int err;
1256 1251
1257 inode = base->d_inode; 1252 inode = base->d_inode;
1258 err = permission(inode, MAY_EXEC, nd);
1259 dentry = ERR_PTR(err);
1260 if (err)
1261 goto out;
1262 1253
1263 /* 1254 /*
1264 * See if the low-level filesystem might want 1255 * See if the low-level filesystem might want
@@ -1287,35 +1278,76 @@ out:
1287 return dentry; 1278 return dentry;
1288} 1279}
1289 1280
1281/*
1282 * Restricted form of lookup. Doesn't follow links, single-component only,
1283 * needs parent already locked. Doesn't follow mounts.
1284 * SMP-safe.
1285 */
1286static inline struct dentry * __lookup_hash(struct qstr *name, struct dentry *base, struct nameidata *nd)
1287{
1288 struct dentry *dentry;
1289 struct inode *inode;
1290 int err;
1291
1292 inode = base->d_inode;
1293
1294 err = permission(inode, MAY_EXEC, nd);
1295 dentry = ERR_PTR(err);
1296 if (err)
1297 goto out;
1298
1299 dentry = __lookup_hash_kern(name, base, nd);
1300out:
1301 return dentry;
1302}
1303
1290static struct dentry *lookup_hash(struct nameidata *nd) 1304static struct dentry *lookup_hash(struct nameidata *nd)
1291{ 1305{
1292 return __lookup_hash(&nd->last, nd->dentry, nd); 1306 return __lookup_hash(&nd->last, nd->dentry, nd);
1293} 1307}
1294 1308
1295/* SMP-safe */ 1309/* SMP-safe */
1296struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) 1310static inline int __lookup_one_len(const char *name, struct qstr *this, struct dentry *base, int len)
1297{ 1311{
1298 unsigned long hash; 1312 unsigned long hash;
1299 struct qstr this;
1300 unsigned int c; 1313 unsigned int c;
1301 1314
1302 this.name = name; 1315 this->name = name;
1303 this.len = len; 1316 this->len = len;
1304 if (!len) 1317 if (!len)
1305 goto access; 1318 return -EACCES;
1306 1319
1307 hash = init_name_hash(); 1320 hash = init_name_hash();
1308 while (len--) { 1321 while (len--) {
1309 c = *(const unsigned char *)name++; 1322 c = *(const unsigned char *)name++;
1310 if (c == '/' || c == '\0') 1323 if (c == '/' || c == '\0')
1311 goto access; 1324 return -EACCES;
1312 hash = partial_name_hash(c, hash); 1325 hash = partial_name_hash(c, hash);
1313 } 1326 }
1314 this.hash = end_name_hash(hash); 1327 this->hash = end_name_hash(hash);
1328 return 0;
1329}
1330
1331struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1332{
1333 int err;
1334 struct qstr this;
1315 1335
1336 err = __lookup_one_len(name, &this, base, len);
1337 if (err)
1338 return ERR_PTR(err);
1316 return __lookup_hash(&this, base, NULL); 1339 return __lookup_hash(&this, base, NULL);
1317access: 1340}
1318 return ERR_PTR(-EACCES); 1341
1342struct dentry *lookup_one_len_kern(const char *name, struct dentry *base, int len)
1343{
1344 int err;
1345 struct qstr this;
1346
1347 err = __lookup_one_len(name, &this, base, len);
1348 if (err)
1349 return ERR_PTR(err);
1350 return __lookup_hash_kern(&this, base, NULL);
1319} 1351}
1320 1352
1321/* 1353/*
@@ -2639,19 +2671,9 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
2639 struct address_space *mapping = dentry->d_inode->i_mapping; 2671 struct address_space *mapping = dentry->d_inode->i_mapping;
2640 page = read_mapping_page(mapping, 0, NULL); 2672 page = read_mapping_page(mapping, 0, NULL);
2641 if (IS_ERR(page)) 2673 if (IS_ERR(page))
2642 goto sync_fail; 2674 return (char*)page;
2643 wait_on_page_locked(page);
2644 if (!PageUptodate(page))
2645 goto async_fail;
2646 *ppage = page; 2675 *ppage = page;
2647 return kmap(page); 2676 return kmap(page);
2648
2649async_fail:
2650 page_cache_release(page);
2651 return ERR_PTR(-EIO);
2652
2653sync_fail:
2654 return (char*)page;
2655} 2677}
2656 2678
2657int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) 2679int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
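
The namei.c refactor splits lookup_one_len() in two: __lookup_one_len() validates the component and computes its hash, then __lookup_hash() keeps the permission(MAY_EXEC) check for user-driven lookups while the new __lookup_hash_kern() and lookup_one_len_kern() skip it for in-kernel callers. A userspace model of the validate-and-hash step (the hash mixing is a stand-in for init_name_hash() and partial_name_hash()):

#include <stdio.h>

#define EACCES 13

static int lookup_one_len_model(const char *name, size_t len,
                                unsigned long *hash_out)
{
    unsigned long hash = 0;

    if (!len)
        return -EACCES;
    while (len--) {
        unsigned char c = (unsigned char)*name++;

        if (c == '/' || c == '\0')      /* no separators, no embedded NUL */
            return -EACCES;
        hash = hash * 31 + c;           /* stand-in for partial_name_hash */
    }
    *hash_out = hash;
    return 0;
}

int main(void)
{
    unsigned long h;

    printf("\"file\" -> %d\n", lookup_one_len_model("file", 4, &h));
    printf("\"a/b\"  -> %d\n", lookup_one_len_model("a/b", 3, &h));
    return 0;
}
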
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 7285c94956c4..c29f00ad495d 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -60,8 +60,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
60{ 60{
61 struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; 61 struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
62 62
63 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 63 if (flags & SLAB_CTOR_CONSTRUCTOR) {
64 SLAB_CTOR_CONSTRUCTOR) {
65 mutex_init(&ei->open_mutex); 64 mutex_init(&ei->open_mutex);
66 inode_init_once(&ei->vfs_inode); 65 inode_init_once(&ei->vfs_inode);
67 } 66 }
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2190e6c2792e..5bd03b97002e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -618,7 +618,8 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat
618 if (clp->cl_nfsversion == 3) { 618 if (clp->cl_nfsversion == 3) {
619 if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) 619 if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
620 server->namelen = NFS3_MAXNAMLEN; 620 server->namelen = NFS3_MAXNAMLEN;
621 server->caps |= NFS_CAP_READDIRPLUS; 621 if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
622 server->caps |= NFS_CAP_READDIRPLUS;
622 } else { 623 } else {
623 if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) 624 if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
624 server->namelen = NFS2_MAXNAMLEN; 625 server->namelen = NFS2_MAXNAMLEN;
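
The client.c hunk makes READDIRPLUS opt-out on NFSv3 mounts: the capability is set only when the new NFS_MOUNT_NORDIRPLUS flag is absent. The gating in miniature (flag values illustrative, not the kernel's constants):

#include <stdio.h>

#define MOUNT_NORDIRPLUS 0x1u
#define CAP_READDIRPLUS  0x1u

int main(void)
{
    unsigned int mount_flags = MOUNT_NORDIRPLUS;    /* "nordirplus" given */
    unsigned int caps = 0;

    if (!(mount_flags & MOUNT_NORDIRPLUS))
        caps |= CAP_READDIRPLUS;
    printf("readdirplus %s\n",
           caps & CAP_READDIRPLUS ? "enabled" : "disabled");
    return 0;
}
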
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cd3469720cbf..625d8e5fb39d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -154,6 +154,8 @@ typedef struct {
154 decode_dirent_t decode; 154 decode_dirent_t decode;
155 int plus; 155 int plus;
156 int error; 156 int error;
157 unsigned long timestamp;
158 int timestamp_valid;
157} nfs_readdir_descriptor_t; 159} nfs_readdir_descriptor_t;
158 160
159/* Now we cache directories properly, by stuffing the dirent 161/* Now we cache directories properly, by stuffing the dirent
@@ -195,6 +197,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
195 } 197 }
196 goto error; 198 goto error;
197 } 199 }
200 desc->timestamp = timestamp;
201 desc->timestamp_valid = 1;
198 SetPageUptodate(page); 202 SetPageUptodate(page);
199 spin_lock(&inode->i_lock); 203 spin_lock(&inode->i_lock);
200 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; 204 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
@@ -225,6 +229,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc)
225 if (IS_ERR(p)) 229 if (IS_ERR(p))
226 return PTR_ERR(p); 230 return PTR_ERR(p);
227 desc->ptr = p; 231 desc->ptr = p;
232 if (desc->timestamp_valid)
233 desc->entry->fattr->time_start = desc->timestamp;
234 else
235 desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
228 return 0; 236 return 0;
229} 237}
230 238
@@ -316,14 +324,16 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
316 __FUNCTION__, desc->page_index, 324 __FUNCTION__, desc->page_index,
317 (long long) *desc->dir_cookie); 325 (long long) *desc->dir_cookie);
318 326
327 /* If we find the page in the page_cache, we cannot be sure
328 * how fresh the data is, so we will ignore readdir_plus attributes.
329 */
330 desc->timestamp_valid = 0;
319 page = read_cache_page(inode->i_mapping, desc->page_index, 331 page = read_cache_page(inode->i_mapping, desc->page_index,
320 (filler_t *)nfs_readdir_filler, desc); 332 (filler_t *)nfs_readdir_filler, desc);
321 if (IS_ERR(page)) { 333 if (IS_ERR(page)) {
322 status = PTR_ERR(page); 334 status = PTR_ERR(page);
323 goto out; 335 goto out;
324 } 336 }
325 if (!PageUptodate(page))
326 goto read_error;
327 337
328 /* NOTE: Someone else may have changed the READDIRPLUS flag */ 338 /* NOTE: Someone else may have changed the READDIRPLUS flag */
329 desc->page = page; 339 desc->page = page;
@@ -337,9 +347,6 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
337 out: 347 out:
338 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, status); 348 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, status);
339 return status; 349 return status;
340 read_error:
341 page_cache_release(page);
342 return -EIO;
343} 350}
344 351
345/* 352/*
@@ -468,6 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
468 struct rpc_cred *cred = nfs_file_cred(file); 475 struct rpc_cred *cred = nfs_file_cred(file);
469 struct page *page = NULL; 476 struct page *page = NULL;
470 int status; 477 int status;
478 unsigned long timestamp;
471 479
472 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 480 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
473 (unsigned long long)*desc->dir_cookie); 481 (unsigned long long)*desc->dir_cookie);
@@ -477,6 +485,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
477 status = -ENOMEM; 485 status = -ENOMEM;
478 goto out; 486 goto out;
479 } 487 }
488 timestamp = jiffies;
480 desc->error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, *desc->dir_cookie, 489 desc->error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, *desc->dir_cookie,
481 page, 490 page,
482 NFS_SERVER(inode)->dtsize, 491 NFS_SERVER(inode)->dtsize,
@@ -487,6 +496,8 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
487 desc->page = page; 496 desc->page = page;
488 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ 497 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
489 if (desc->error >= 0) { 498 if (desc->error >= 0) {
499 desc->timestamp = timestamp;
500 desc->timestamp_valid = 1;
490 if ((status = dir_decode(desc)) == 0) 501 if ((status = dir_decode(desc)) == 0)
491 desc->entry->prev_cookie = *desc->dir_cookie; 502 desc->entry->prev_cookie = *desc->dir_cookie;
492 } else 503 } else
@@ -849,6 +860,10 @@ static int nfs_dentry_delete(struct dentry *dentry)
849static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) 860static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
850{ 861{
851 nfs_inode_return_delegation(inode); 862 nfs_inode_return_delegation(inode);
863 if (S_ISDIR(inode->i_mode))
864 /* drop any readdir cache as it could easily be old */
865 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
866
852 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 867 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
853 lock_kernel(); 868 lock_kernel();
854 drop_nlink(inode); 869 drop_nlink(inode);
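
The dir.c changes stamp each readdir page with the time the READDIR call was issued, so attributes decoded from readdirplus entries are trusted only when that timestamp is known; a page found already sitting in the page cache gets timestamp_valid = 0 and its fattr is invalidated instead. A small model of the gate dir_decode() now applies (flag value illustrative):

#include <stdio.h>

#define NFS_ATTR_FATTR 0x1u     /* illustrative */

struct fattr_model {
    unsigned int valid;
    unsigned long time_start;
};

static void decode_gate(struct fattr_model *fattr,
                        int timestamp_valid, unsigned long timestamp)
{
    if (timestamp_valid)
        fattr->time_start = timestamp;      /* attrs can be aged from here */
    else
        fattr->valid &= ~NFS_ATTR_FATTR;    /* freshness unknown: drop them */
}

int main(void)
{
    struct fattr_model f = { .valid = NFS_ATTR_FATTR };

    decode_gate(&f, 0, 0);                  /* page was found in the cache */
    printf("attributes %s\n",
           f.valid & NFS_ATTR_FATTR ? "kept" : "dropped");
    return 0;
}
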
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 2877744cb606..889de60f8a84 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -54,6 +54,7 @@
54#include <asm/uaccess.h> 54#include <asm/uaccess.h>
55#include <asm/atomic.h> 55#include <asm/atomic.h>
56 56
57#include "internal.h"
57#include "iostat.h" 58#include "iostat.h"
58 59
59#define NFSDBG_FACILITY NFSDBG_VFS 60#define NFSDBG_FACILITY NFSDBG_VFS
@@ -271,7 +272,7 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
271 bytes = min(rsize,count); 272 bytes = min(rsize,count);
272 273
273 result = -ENOMEM; 274 result = -ENOMEM;
274 data = nfs_readdata_alloc(pgbase + bytes); 275 data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes));
275 if (unlikely(!data)) 276 if (unlikely(!data))
276 break; 277 break;
277 278
@@ -602,7 +603,7 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
602 bytes = min(wsize,count); 603 bytes = min(wsize,count);
603 604
604 result = -ENOMEM; 605 result = -ENOMEM;
605 data = nfs_writedata_alloc(pgbase + bytes); 606 data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes));
606 if (unlikely(!data)) 607 if (unlikely(!data))
607 break; 608 break;
608 609
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 44aa9b726573..1e9a915d1fea 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1167,8 +1167,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
1167{ 1167{
1168 struct nfs_inode *nfsi = (struct nfs_inode *) foo; 1168 struct nfs_inode *nfsi = (struct nfs_inode *) foo;
1169 1169
1170 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 1170 if (flags & SLAB_CTOR_CONSTRUCTOR) {
1171 SLAB_CTOR_CONSTRUCTOR) {
1172 inode_init_once(&nfsi->vfs_inode); 1171 inode_init_once(&nfsi->vfs_inode);
1173 spin_lock_init(&nfsi->req_lock); 1172 spin_lock_init(&nfsi->req_lock);
1174 INIT_LIST_HEAD(&nfsi->dirty); 1173 INIT_LIST_HEAD(&nfsi->dirty);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 6610f2b02077..ad2b40db1e65 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -231,3 +231,15 @@ unsigned int nfs_page_length(struct page *page)
231 } 231 }
232 return 0; 232 return 0;
233} 233}
234
235/*
236 * Determine the number of pages in an array of length 'len' and
237 * with a base offset of 'base'
238 */
239static inline
240unsigned int nfs_page_array_len(unsigned int base, size_t len)
241{
242 return ((unsigned long)len + (unsigned long)base +
243 PAGE_SIZE - 1) >> PAGE_SHIFT;
244}
245
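
nfs_page_array_len() rounds base + len up to whole pages, which is why the direct-I/O callers above now pass nfs_page_array_len(pgbase, bytes) rather than a raw byte count: a transfer that starts mid-page can straddle one page more than len alone suggests. The arithmetic, standalone:

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

static unsigned int page_array_len(unsigned int base, size_t len)
{
    /* Same expression as the helper above. */
    return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1)
            >> PAGE_SHIFT;
}

int main(void)
{
    printf("base=0,   len=4096 -> %u page(s)\n", page_array_len(0, 4096));
    printf("base=100, len=4096 -> %u page(s)\n", page_array_len(100, 4096));
    return 0;
}
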
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index f75fe72b4160..ca5a266a3140 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -133,13 +133,15 @@ xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
133 133
134#define MNT_dirpath_sz (1 + 256) 134#define MNT_dirpath_sz (1 + 256)
135#define MNT_fhstatus_sz (1 + 8) 135#define MNT_fhstatus_sz (1 + 8)
136#define MNT_fhstatus3_sz (1 + 16)
136 137
137static struct rpc_procinfo mnt_procedures[] = { 138static struct rpc_procinfo mnt_procedures[] = {
138[MNTPROC_MNT] = { 139[MNTPROC_MNT] = {
139 .p_proc = MNTPROC_MNT, 140 .p_proc = MNTPROC_MNT,
140 .p_encode = (kxdrproc_t) xdr_encode_dirpath, 141 .p_encode = (kxdrproc_t) xdr_encode_dirpath,
141 .p_decode = (kxdrproc_t) xdr_decode_fhstatus, 142 .p_decode = (kxdrproc_t) xdr_decode_fhstatus,
142 .p_bufsiz = MNT_dirpath_sz << 2, 143 .p_arglen = MNT_dirpath_sz,
144 .p_replen = MNT_fhstatus_sz,
143 .p_statidx = MNTPROC_MNT, 145 .p_statidx = MNTPROC_MNT,
144 .p_name = "MOUNT", 146 .p_name = "MOUNT",
145 }, 147 },
@@ -150,7 +152,8 @@ static struct rpc_procinfo mnt3_procedures[] = {
150 .p_proc = MOUNTPROC3_MNT, 152 .p_proc = MOUNTPROC3_MNT,
151 .p_encode = (kxdrproc_t) xdr_encode_dirpath, 153 .p_encode = (kxdrproc_t) xdr_encode_dirpath,
152 .p_decode = (kxdrproc_t) xdr_decode_fhstatus3, 154 .p_decode = (kxdrproc_t) xdr_decode_fhstatus3,
153 .p_bufsiz = MNT_dirpath_sz << 2, 155 .p_arglen = MNT_dirpath_sz,
156 .p_replen = MNT_fhstatus3_sz,
154 .p_statidx = MOUNTPROC3_MNT, 157 .p_statidx = MOUNTPROC3_MNT,
155 .p_name = "MOUNT", 158 .p_name = "MOUNT",
156 }, 159 },
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 3be4e72a0227..abd9f8b48943 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -687,16 +687,13 @@ nfs_stat_to_errno(int stat)
687 return nfs_errtbl[i].errno; 687 return nfs_errtbl[i].errno;
688} 688}
689 689
690#ifndef MAX
691# define MAX(a, b) (((a) > (b))? (a) : (b))
692#endif
693
694#define PROC(proc, argtype, restype, timer) \ 690#define PROC(proc, argtype, restype, timer) \
695[NFSPROC_##proc] = { \ 691[NFSPROC_##proc] = { \
696 .p_proc = NFSPROC_##proc, \ 692 .p_proc = NFSPROC_##proc, \
697 .p_encode = (kxdrproc_t) nfs_xdr_##argtype, \ 693 .p_encode = (kxdrproc_t) nfs_xdr_##argtype, \
698 .p_decode = (kxdrproc_t) nfs_xdr_##restype, \ 694 .p_decode = (kxdrproc_t) nfs_xdr_##restype, \
699 .p_bufsiz = MAX(NFS_##argtype##_sz,NFS_##restype##_sz) << 2, \ 695 .p_arglen = NFS_##argtype##_sz, \
696 .p_replen = NFS_##restype##_sz, \
700 .p_timer = timer, \ 697 .p_timer = timer, \
701 .p_statidx = NFSPROC_##proc, \ 698 .p_statidx = NFSPROC_##proc, \
702 .p_name = #proc, \ 699 .p_name = #proc, \
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 0ace092d126f..b51df8eb9f01 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1102,16 +1102,13 @@ nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
1102} 1102}
1103#endif /* CONFIG_NFS_V3_ACL */ 1103#endif /* CONFIG_NFS_V3_ACL */
1104 1104
1105#ifndef MAX
1106# define MAX(a, b) (((a) > (b))? (a) : (b))
1107#endif
1108
1109#define PROC(proc, argtype, restype, timer) \ 1105#define PROC(proc, argtype, restype, timer) \
1110[NFS3PROC_##proc] = { \ 1106[NFS3PROC_##proc] = { \
1111 .p_proc = NFS3PROC_##proc, \ 1107 .p_proc = NFS3PROC_##proc, \
1112 .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \ 1108 .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \
1113 .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \ 1109 .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \
1114 .p_bufsiz = MAX(NFS3_##argtype##_sz,NFS3_##restype##_sz) << 2, \ 1110 .p_arglen = NFS3_##argtype##_sz, \
1111 .p_replen = NFS3_##restype##_sz, \
1115 .p_timer = timer, \ 1112 .p_timer = timer, \
1116 .p_statidx = NFS3PROC_##proc, \ 1113 .p_statidx = NFS3PROC_##proc, \
1117 .p_name = #proc, \ 1114 .p_name = #proc, \
@@ -1153,7 +1150,8 @@ static struct rpc_procinfo nfs3_acl_procedures[] = {
1153 .p_proc = ACLPROC3_GETACL, 1150 .p_proc = ACLPROC3_GETACL,
1154 .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs, 1151 .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs,
1155 .p_decode = (kxdrproc_t) nfs3_xdr_getaclres, 1152 .p_decode = (kxdrproc_t) nfs3_xdr_getaclres,
1156 .p_bufsiz = MAX(ACL3_getaclargs_sz, ACL3_getaclres_sz) << 2, 1153 .p_arglen = ACL3_getaclargs_sz,
1154 .p_replen = ACL3_getaclres_sz,
1157 .p_timer = 1, 1155 .p_timer = 1,
1158 .p_name = "GETACL", 1156 .p_name = "GETACL",
1159 }, 1157 },
@@ -1161,7 +1159,8 @@ static struct rpc_procinfo nfs3_acl_procedures[] = {
1161 .p_proc = ACLPROC3_SETACL, 1159 .p_proc = ACLPROC3_SETACL,
1162 .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs, 1160 .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs,
1163 .p_decode = (kxdrproc_t) nfs3_xdr_setaclres, 1161 .p_decode = (kxdrproc_t) nfs3_xdr_setaclres,
1164 .p_bufsiz = MAX(ACL3_setaclargs_sz, ACL3_setaclres_sz) << 2, 1162 .p_arglen = ACL3_setaclargs_sz,
1163 .p_replen = ACL3_setaclres_sz,
1165 .p_timer = 0, 1164 .p_timer = 0,
1166 .p_name = "SETACL", 1165 .p_name = "SETACL",
1167 }, 1166 },
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d9000ec52f72..d6a30e965787 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2647,8 +2647,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
2647 nfs_inode_return_delegation(inode); 2647 nfs_inode_return_delegation(inode);
2648 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 2648 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
2649 ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 2649 ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
2650 if (ret == 0) 2650 nfs_zap_caches(inode);
2651 nfs4_write_cached_acl(inode, buf, buflen);
2652 return ret; 2651 return ret;
2653} 2652}
2654 2653
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f02d522fd788..b8c28f2380a5 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4546,16 +4546,13 @@ nfs4_stat_to_errno(int stat)
4546 return stat; 4546 return stat;
4547} 4547}
4548 4548
4549#ifndef MAX
4550# define MAX(a, b) (((a) > (b))? (a) : (b))
4551#endif
4552
4553#define PROC(proc, argtype, restype) \ 4549#define PROC(proc, argtype, restype) \
4554[NFSPROC4_CLNT_##proc] = { \ 4550[NFSPROC4_CLNT_##proc] = { \
4555 .p_proc = NFSPROC4_COMPOUND, \ 4551 .p_proc = NFSPROC4_COMPOUND, \
4556 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 4552 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \
4557 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 4553 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \
4558 .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2, \ 4554 .p_arglen = NFS4_##argtype##_sz, \
4555 .p_replen = NFS4_##restype##_sz, \
4559 .p_statidx = NFSPROC4_CLNT_##proc, \ 4556 .p_statidx = NFSPROC4_CLNT_##proc, \
4560 .p_name = #proc, \ 4557 .p_name = #proc, \
4561 } 4558 }
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 75f819dc0255..49d1008ce1d7 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -428,7 +428,7 @@ static int __init root_nfs_getport(int program, int version, int proto)
428 printk(KERN_NOTICE "Looking up port of RPC %d/%d on %u.%u.%u.%u\n", 428 printk(KERN_NOTICE "Looking up port of RPC %d/%d on %u.%u.%u.%u\n",
429 program, version, NIPQUAD(servaddr)); 429 program, version, NIPQUAD(servaddr));
430 set_sockaddr(&sin, servaddr, 0); 430 set_sockaddr(&sin, servaddr, 0);
431 return rpc_getport_external(&sin, program, version, proto); 431 return rpcb_getport_external(&sin, program, version, proto);
432} 432}
433 433
434 434
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ca4b1d4ff42b..388950118f59 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -17,7 +17,8 @@
17#include <linux/nfs_page.h> 17#include <linux/nfs_page.h>
18#include <linux/nfs_fs.h> 18#include <linux/nfs_fs.h>
19#include <linux/nfs_mount.h> 19#include <linux/nfs_mount.h>
20#include <linux/writeback.h> 20
21#include "internal.h"
21 22
22#define NFS_PARANOIA 1 23#define NFS_PARANOIA 1
23 24
@@ -50,9 +51,7 @@ nfs_page_free(struct nfs_page *p)
50 * @count: number of bytes to read/write 51 * @count: number of bytes to read/write
51 * 52 *
52 * The page must be locked by the caller. This makes sure we never 53 * The page must be locked by the caller. This makes sure we never
53 * create two different requests for the same page, and avoids 54 * create two different requests for the same page.
54 * a possible deadlock when we reach the hard limit on the number
55 * of dirty pages.
56 * User should ensure it is safe to sleep in this function. 55 * User should ensure it is safe to sleep in this function.
57 */ 56 */
58struct nfs_page * 57struct nfs_page *
@@ -63,16 +62,12 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
63 struct nfs_server *server = NFS_SERVER(inode); 62 struct nfs_server *server = NFS_SERVER(inode);
64 struct nfs_page *req; 63 struct nfs_page *req;
65 64
66 /* Deal with hard limits. */
67 for (;;) { 65 for (;;) {
68 /* try to allocate the request struct */ 66 /* try to allocate the request struct */
69 req = nfs_page_alloc(); 67 req = nfs_page_alloc();
70 if (req != NULL) 68 if (req != NULL)
71 break; 69 break;
72 70
73 /* Try to free up at least one request in order to stay
74 * below the hard limit
75 */
76 if (signalled() && (server->flags & NFS_MOUNT_INTR)) 71 if (signalled() && (server->flags & NFS_MOUNT_INTR))
77 return ERR_PTR(-ERESTARTSYS); 72 return ERR_PTR(-ERESTARTSYS);
78 yield(); 73 yield();
@@ -223,124 +218,151 @@ out:
223} 218}
224 219
225/** 220/**
226 * nfs_coalesce_requests - Split coalesced requests out from a list. 221 * nfs_pageio_init - initialise a page io descriptor
227 * @head: source list 222 * @desc: pointer to descriptor
228 * @dst: destination list 223 * @inode: pointer to inode
229 * @nmax: maximum number of requests to coalesce 224 * @doio: pointer to io function
230 * 225 * @bsize: io block size
231 * Moves a maximum of 'nmax' elements from one list to another. 226 * @io_flags: extra parameters for the io function
232 * The elements are checked to ensure that they form a contiguous set
233 * of pages, and that the RPC credentials are the same.
234 */ 227 */
235int 228void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
236nfs_coalesce_requests(struct list_head *head, struct list_head *dst, 229 struct inode *inode,
237 unsigned int nmax) 230 int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
231 size_t bsize,
232 int io_flags)
238{ 233{
239 struct nfs_page *req = NULL; 234 INIT_LIST_HEAD(&desc->pg_list);
240 unsigned int npages = 0; 235 desc->pg_bytes_written = 0;
241 236 desc->pg_count = 0;
242 while (!list_empty(head)) { 237 desc->pg_bsize = bsize;
243 struct nfs_page *prev = req; 238 desc->pg_base = 0;
244 239 desc->pg_inode = inode;
245 req = nfs_list_entry(head->next); 240 desc->pg_doio = doio;
246 if (prev) { 241 desc->pg_ioflags = io_flags;
247 if (req->wb_context->cred != prev->wb_context->cred) 242 desc->pg_error = 0;
248 break;
249 if (req->wb_context->lockowner != prev->wb_context->lockowner)
250 break;
251 if (req->wb_context->state != prev->wb_context->state)
252 break;
253 if (req->wb_index != (prev->wb_index + 1))
254 break;
255
256 if (req->wb_pgbase != 0)
257 break;
258 }
259 nfs_list_remove_request(req);
260 nfs_list_add_request(req, dst);
261 npages++;
262 if (req->wb_pgbase + req->wb_bytes != PAGE_CACHE_SIZE)
263 break;
264 if (npages >= nmax)
265 break;
266 }
267 return npages;
268} 243}
269 244
270#define NFS_SCAN_MAXENTRIES 16
271/** 245/**
272 * nfs_scan_dirty - Scan the radix tree for dirty requests 246 * nfs_can_coalesce_requests - test two requests for compatibility
273 * @mapping: pointer to address space 247 * @prev: pointer to nfs_page
274 * @wbc: writeback_control structure 248 * @req: pointer to nfs_page
275 * @dst: Destination list
276 * 249 *
277 * Moves elements from one of the inode request lists. 250 * The nfs_page structures 'prev' and 'req' are compared to ensure that the
278 * If the number of requests is set to 0, the entire address_space 251 * page data area they describe is contiguous, and that their RPC
279 * starting at index idx_start, is scanned. 252 * credentials, NFSv4 open state, and lockowners are the same.
280 * The requests are *not* checked to ensure that they form a contiguous set. 253 *
281 * You must be holding the inode's req_lock when calling this function 254 * Return 'true' if this is the case, else return 'false'.
282 */ 255 */
283long nfs_scan_dirty(struct address_space *mapping, 256static int nfs_can_coalesce_requests(struct nfs_page *prev,
284 struct writeback_control *wbc, 257 struct nfs_page *req)
285 struct list_head *dst)
286{ 258{
287 struct nfs_inode *nfsi = NFS_I(mapping->host); 259 if (req->wb_context->cred != prev->wb_context->cred)
288 struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
289 struct nfs_page *req;
290 pgoff_t idx_start, idx_end;
291 long res = 0;
292 int found, i;
293
294 if (nfsi->ndirty == 0)
295 return 0; 260 return 0;
296 if (wbc->range_cyclic) { 261 if (req->wb_context->lockowner != prev->wb_context->lockowner)
297 idx_start = 0; 262 return 0;
298 idx_end = ULONG_MAX; 263 if (req->wb_context->state != prev->wb_context->state)
299 } else if (wbc->range_end == 0) { 264 return 0;
300 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; 265 if (req->wb_index != (prev->wb_index + 1))
301 idx_end = ULONG_MAX; 266 return 0;
302 } else { 267 if (req->wb_pgbase != 0)
303 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; 268 return 0;
304 idx_end = wbc->range_end >> PAGE_CACHE_SHIFT; 269 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
305 } 270 return 0;
271 return 1;
272}
306 273
307 for (;;) { 274/**
308 unsigned int toscan = NFS_SCAN_MAXENTRIES; 275 * nfs_pageio_do_add_request - Attempt to coalesce a request into a page list.
276 * @desc: destination io descriptor
277 * @req: request
278 *
279 * Returns true if the request 'req' was successfully coalesced into the
280 * existing list of pages 'desc'.
281 */
282static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
283 struct nfs_page *req)
284{
285 size_t newlen = req->wb_bytes;
309 286
310 found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, 287 if (desc->pg_count != 0) {
311 (void **)&pgvec[0], idx_start, toscan, 288 struct nfs_page *prev;
312 NFS_PAGE_TAG_DIRTY);
313 289
314 /* Did we make progress? */ 290 /*
315 if (found <= 0) 291 * FIXME: ideally we should be able to coalesce all requests
316 break; 292 * that are not block boundary aligned, but currently this
293 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
294 * since nfs_flush_multi and nfs_pagein_multi assume you
295 * can have only one struct nfs_page.
296 */
297 if (desc->pg_bsize < PAGE_SIZE)
298 return 0;
299 newlen += desc->pg_count;
300 if (newlen > desc->pg_bsize)
301 return 0;
302 prev = nfs_list_entry(desc->pg_list.prev);
303 if (!nfs_can_coalesce_requests(prev, req))
304 return 0;
305 } else
306 desc->pg_base = req->wb_pgbase;
307 nfs_list_remove_request(req);
308 nfs_list_add_request(req, &desc->pg_list);
309 desc->pg_count = newlen;
310 return 1;
311}
317 312
318 for (i = 0; i < found; i++) { 313/*
319 req = pgvec[i]; 314 * Helper for nfs_pageio_add_request and nfs_pageio_complete
320 if (!wbc->range_cyclic && req->wb_index > idx_end) 315 */
321 goto out; 316static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
317{
318 if (!list_empty(&desc->pg_list)) {
319 int error = desc->pg_doio(desc->pg_inode,
320 &desc->pg_list,
321 nfs_page_array_len(desc->pg_base,
322 desc->pg_count),
323 desc->pg_count,
324 desc->pg_ioflags);
325 if (error < 0)
326 desc->pg_error = error;
327 else
328 desc->pg_bytes_written += desc->pg_count;
329 }
330 if (list_empty(&desc->pg_list)) {
331 desc->pg_count = 0;
332 desc->pg_base = 0;
333 }
334}
322 335
323 /* Try to lock request and mark it for writeback */ 336/**
324 if (!nfs_set_page_writeback_locked(req)) 337 * nfs_pageio_add_request - Attempt to coalesce a request into a page list.
325 goto next; 338 * @desc: destination io descriptor
326 radix_tree_tag_clear(&nfsi->nfs_page_tree, 339 * @req: request
327 req->wb_index, NFS_PAGE_TAG_DIRTY); 340 *
328 nfsi->ndirty--; 341 * Returns true if the request 'req' was successfully coalesced into the
329 nfs_list_remove_request(req); 342 * existing list of pages 'desc'.
330 nfs_list_add_request(req, dst); 343 */
331 res++; 344int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
332 if (res == LONG_MAX) 345 struct nfs_page *req)
333 goto out; 346{
334next: 347 while (!nfs_pageio_do_add_request(desc, req)) {
335 idx_start = req->wb_index + 1; 348 nfs_pageio_doio(desc);
336 } 349 if (desc->pg_error < 0)
350 return 0;
337 } 351 }
338out: 352 return 1;
339 WARN_ON ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty));
340 return res;
341} 353}
342 354
343/** 355/**
356 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
357 * @desc: pointer to io descriptor
358 */
359void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
360{
361 nfs_pageio_doio(desc);
362}
363
364#define NFS_SCAN_MAXENTRIES 16
365/**
344 * nfs_scan_list - Scan a list for matching requests 366 * nfs_scan_list - Scan a list for matching requests
345 * @nfsi: NFS inode 367 * @nfsi: NFS inode
346 * @head: One of the NFS inode request lists 368 * @head: One of the NFS inode request lists
@@ -355,12 +377,12 @@ out:
355 * You must be holding the inode's req_lock when calling this function 377 * You must be holding the inode's req_lock when calling this function
356 */ 378 */
357int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, 379int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
358 struct list_head *dst, unsigned long idx_start, 380 struct list_head *dst, pgoff_t idx_start,
359 unsigned int npages) 381 unsigned int npages)
360{ 382{
361 struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; 383 struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
362 struct nfs_page *req; 384 struct nfs_page *req;
363 unsigned long idx_end; 385 pgoff_t idx_end;
364 int found, i; 386 int found, i;
365 int res; 387 int res;
366 388
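
The pagelist.c rewrite retires list-at-a-time coalescing (nfs_coalesce_requests) in favour of an nfs_pageio_descriptor: nfs_pageio_add_request() merges requests into the current batch until one no longer fits, flushes the batch through pg_doio, and retries; nfs_pageio_complete() flushes the tail. A userspace model of that accumulate-and-flush flow (sizes illustrative; the compatibility checks are reduced to a byte limit):

#include <stdio.h>
#include <stddef.h>

struct pgio_model {
    size_t count;           /* bytes batched so far */
    size_t bsize;           /* rsize/wsize limit */
    unsigned int flushes;
};

static void pgio_doio(struct pgio_model *d)
{
    if (d->count) {
        printf("flush %zu bytes\n", d->count);
        d->flushes++;
        d->count = 0;
    }
}

static void pgio_add(struct pgio_model *d, size_t bytes)
{
    if (d->count + bytes > d->bsize)    /* cannot coalesce: flush first */
        pgio_doio(d);
    d->count += bytes;
}

int main(void)
{
    struct pgio_model d = { .bsize = 8192 };
    size_t reqs[3] = { 4096, 4096, 4096 };  /* three page-sized requests */
    int i;

    for (i = 0; i < 3; i++)
        pgio_add(&d, reqs[i]);
    pgio_doio(&d);                          /* nfs_pageio_complete() */
    printf("%u I/O call(s)\n", d.flushes);
    return 0;
}
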
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6ab4d5a9edf2..9a55807b2a70 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -27,7 +27,8 @@
27 27
28#define NFSDBG_FACILITY NFSDBG_PAGECACHE 28#define NFSDBG_FACILITY NFSDBG_PAGECACHE
29 29
30static int nfs_pagein_one(struct list_head *, struct inode *); 30static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int);
31static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int);
31static const struct rpc_call_ops nfs_read_partial_ops; 32static const struct rpc_call_ops nfs_read_partial_ops;
32static const struct rpc_call_ops nfs_read_full_ops; 33static const struct rpc_call_ops nfs_read_full_ops;
33 34
@@ -36,9 +37,8 @@ static mempool_t *nfs_rdata_mempool;
36 37
37#define MIN_POOL_READ (32) 38#define MIN_POOL_READ (32)
38 39
39struct nfs_read_data *nfs_readdata_alloc(size_t len) 40struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
40{ 41{
41 unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
42 struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS); 42 struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
43 43
44 if (p) { 44 if (p) {
@@ -133,7 +133,10 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
133 memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); 133 memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);
134 134
135 nfs_list_add_request(new, &one_request); 135 nfs_list_add_request(new, &one_request);
136 nfs_pagein_one(&one_request, inode); 136 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
137 nfs_pagein_multi(inode, &one_request, 1, len, 0);
138 else
139 nfs_pagein_one(inode, &one_request, 1, len, 0);
137 return 0; 140 return 0;
138} 141}
139 142
@@ -230,7 +233,7 @@ static void nfs_execute_read(struct nfs_read_data *data)
230 * won't see the new data until our attribute cache is updated. This is more 233 * won't see the new data until our attribute cache is updated. This is more
231 * or less conventional NFS client behavior. 234 * or less conventional NFS client behavior.
232 */ 235 */
233static int nfs_pagein_multi(struct list_head *head, struct inode *inode) 236static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
234{ 237{
235 struct nfs_page *req = nfs_list_entry(head->next); 238 struct nfs_page *req = nfs_list_entry(head->next);
236 struct page *page = req->wb_page; 239 struct page *page = req->wb_page;
@@ -242,11 +245,11 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
242 245
243 nfs_list_remove_request(req); 246 nfs_list_remove_request(req);
244 247
245 nbytes = req->wb_bytes; 248 nbytes = count;
246 do { 249 do {
247 size_t len = min(nbytes,rsize); 250 size_t len = min(nbytes,rsize);
248 251
249 data = nfs_readdata_alloc(len); 252 data = nfs_readdata_alloc(1);
250 if (!data) 253 if (!data)
251 goto out_bad; 254 goto out_bad;
252 INIT_LIST_HEAD(&data->pages); 255 INIT_LIST_HEAD(&data->pages);
@@ -258,23 +261,19 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
258 261
259 ClearPageError(page); 262 ClearPageError(page);
260 offset = 0; 263 offset = 0;
261 nbytes = req->wb_bytes; 264 nbytes = count;
262 do { 265 do {
263 data = list_entry(list.next, struct nfs_read_data, pages); 266 data = list_entry(list.next, struct nfs_read_data, pages);
264 list_del_init(&data->pages); 267 list_del_init(&data->pages);
265 268
266 data->pagevec[0] = page; 269 data->pagevec[0] = page;
267 270
268 if (nbytes > rsize) { 271 if (nbytes < rsize)
269 nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 272 rsize = nbytes;
270 rsize, offset); 273 nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
271 offset += rsize; 274 rsize, offset);
272 nbytes -= rsize; 275 offset += rsize;
273 } else { 276 nbytes -= rsize;
274 nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
275 nbytes, offset);
276 nbytes = 0;
277 }
278 nfs_execute_read(data); 277 nfs_execute_read(data);
279 } while (nbytes != 0); 278 } while (nbytes != 0);
280 279
@@ -291,30 +290,24 @@ out_bad:
291 return -ENOMEM; 290 return -ENOMEM;
292} 291}
293 292
294static int nfs_pagein_one(struct list_head *head, struct inode *inode) 293static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
295{ 294{
296 struct nfs_page *req; 295 struct nfs_page *req;
297 struct page **pages; 296 struct page **pages;
298 struct nfs_read_data *data; 297 struct nfs_read_data *data;
299 unsigned int count;
300 298
301 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) 299 data = nfs_readdata_alloc(npages);
302 return nfs_pagein_multi(head, inode);
303
304 data = nfs_readdata_alloc(NFS_SERVER(inode)->rsize);
305 if (!data) 300 if (!data)
306 goto out_bad; 301 goto out_bad;
307 302
308 INIT_LIST_HEAD(&data->pages); 303 INIT_LIST_HEAD(&data->pages);
309 pages = data->pagevec; 304 pages = data->pagevec;
310 count = 0;
311 while (!list_empty(head)) { 305 while (!list_empty(head)) {
312 req = nfs_list_entry(head->next); 306 req = nfs_list_entry(head->next);
313 nfs_list_remove_request(req); 307 nfs_list_remove_request(req);
314 nfs_list_add_request(req, &data->pages); 308 nfs_list_add_request(req, &data->pages);
315 ClearPageError(req->wb_page); 309 ClearPageError(req->wb_page);
316 *pages++ = req->wb_page; 310 *pages++ = req->wb_page;
317 count += req->wb_bytes;
318 } 311 }
319 req = nfs_list_entry(data->pages.next); 312 req = nfs_list_entry(data->pages.next);
320 313
@@ -327,28 +320,6 @@ out_bad:
327 return -ENOMEM; 320 return -ENOMEM;
328} 321}
329 322
330static int
331nfs_pagein_list(struct list_head *head, int rpages)
332{
333 LIST_HEAD(one_request);
334 struct nfs_page *req;
335 int error = 0;
336 unsigned int pages = 0;
337
338 while (!list_empty(head)) {
339 pages += nfs_coalesce_requests(head, &one_request, rpages);
340 req = nfs_list_entry(one_request.next);
341 error = nfs_pagein_one(&one_request, req->wb_context->dentry->d_inode);
342 if (error < 0)
343 break;
344 }
345 if (error >= 0)
346 return pages;
347
348 nfs_async_read_error(head);
349 return error;
350}
351
352/* 323/*
353 * This is the callback from RPC telling us whether a reply was 324 * This is the callback from RPC telling us whether a reply was
354 * received or some error occurred (timeout or socket shutdown). 325 * received or some error occurred (timeout or socket shutdown).
@@ -538,7 +509,7 @@ out_error:
538} 509}
539 510
540struct nfs_readdesc { 511struct nfs_readdesc {
541 struct list_head *head; 512 struct nfs_pageio_descriptor *pgio;
542 struct nfs_open_context *ctx; 513 struct nfs_open_context *ctx;
543}; 514};
544 515
@@ -562,19 +533,21 @@ readpage_async_filler(void *data, struct page *page)
562 } 533 }
563 if (len < PAGE_CACHE_SIZE) 534 if (len < PAGE_CACHE_SIZE)
564 memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); 535 memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);
565 nfs_list_add_request(new, desc->head); 536 nfs_pageio_add_request(desc->pgio, new);
566 return 0; 537 return 0;
567} 538}
568 539
569int nfs_readpages(struct file *filp, struct address_space *mapping, 540int nfs_readpages(struct file *filp, struct address_space *mapping,
570 struct list_head *pages, unsigned nr_pages) 541 struct list_head *pages, unsigned nr_pages)
571{ 542{
572 LIST_HEAD(head); 543 struct nfs_pageio_descriptor pgio;
573 struct nfs_readdesc desc = { 544 struct nfs_readdesc desc = {
574 .head = &head, 545 .pgio = &pgio,
575 }; 546 };
576 struct inode *inode = mapping->host; 547 struct inode *inode = mapping->host;
577 struct nfs_server *server = NFS_SERVER(inode); 548 struct nfs_server *server = NFS_SERVER(inode);
549 size_t rsize = server->rsize;
550 unsigned long npages;
578 int ret = -ESTALE; 551 int ret = -ESTALE;
579 552
580 dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", 553 dprintk("NFS: nfs_readpages (%s/%Ld %d)\n",
@@ -593,13 +566,16 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
593 } else 566 } else
594 desc.ctx = get_nfs_open_context((struct nfs_open_context *) 567 desc.ctx = get_nfs_open_context((struct nfs_open_context *)
595 filp->private_data); 568 filp->private_data);
569 if (rsize < PAGE_CACHE_SIZE)
570 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
571 else
572 nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0);
573
596 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); 574 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
597 if (!list_empty(&head)) { 575
598 int err = nfs_pagein_list(&head, server->rpages); 576 nfs_pageio_complete(&pgio);
599 if (!ret) 577 npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
600 nfs_add_stats(inode, NFSIOS_READPAGES, err); 578 nfs_add_stats(inode, NFSIOS_READPAGES, npages);
601 ret = err;
602 }
603 put_nfs_open_context(desc.ctx); 579 put_nfs_open_context(desc.ctx);
604out: 580out:
605 return ret; 581 return ret;
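
Taken together, the read-side hunks move page coalescing out of nfs_pagein_list() and into the generic nfs_pageio_descriptor. A hedged sketch of the resulting flow, built only from calls visible in this patch (descriptor internals are assumed, not shown):

/* Sketch, not the literal function: nfs_pageio_init() picks the
 * engine -- one RPC per rsize-sized slice of a page, or one RPC per
 * run of whole pages. */
struct nfs_pageio_descriptor pgio;
size_t rsize = NFS_SERVER(inode)->rsize;

if (rsize < PAGE_CACHE_SIZE)
	nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
else
	nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0);

/* For each page, the filler queues a request; the descriptor invokes
 * the chosen nfs_pagein_* callback whenever a coalesced run fills. */
nfs_pageio_add_request(&pgio, req);

/* Flush the final, partially filled run and account the pages read. */
nfs_pageio_complete(&pgio);
npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
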
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1eae44b9a1a..ca20d3cc2609 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -204,9 +204,9 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
204 lock_kernel(); 204 lock_kernel();
205 205
206 error = server->nfs_client->rpc_ops->statfs(server, fh, &res); 206 error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
207 buf->f_type = NFS_SUPER_MAGIC;
208 if (error < 0) 207 if (error < 0)
209 goto out_err; 208 goto out_err;
209 buf->f_type = NFS_SUPER_MAGIC;
210 210
211 /* 211 /*
212 * Current versions of glibc do not correctly handle the 212 * Current versions of glibc do not correctly handle the
@@ -233,15 +233,14 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
233 buf->f_ffree = res.afiles; 233 buf->f_ffree = res.afiles;
234 234
235 buf->f_namelen = server->namelen; 235 buf->f_namelen = server->namelen;
236 out: 236
237 unlock_kernel(); 237 unlock_kernel();
238 return 0; 238 return 0;
239 239
240 out_err: 240 out_err:
241 dprintk("%s: statfs error = %d\n", __FUNCTION__, -error); 241 dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
242 buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; 242 unlock_kernel();
243 goto out; 243 return error;
244
245} 244}
246 245
247/* 246/*
@@ -291,6 +290,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
291 { NFS_MOUNT_NOAC, ",noac", "" }, 290 { NFS_MOUNT_NOAC, ",noac", "" },
292 { NFS_MOUNT_NONLM, ",nolock", "" }, 291 { NFS_MOUNT_NONLM, ",nolock", "" },
293 { NFS_MOUNT_NOACL, ",noacl", "" }, 292 { NFS_MOUNT_NOACL, ",noacl", "" },
293 { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
294 { 0, NULL, NULL } 294 { 0, NULL, NULL }
295 }; 295 };
296 const struct proc_nfs_info *nfs_infop; 296 const struct proc_nfs_info *nfs_infop;
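
The statfs hunks fix two problems at once: buf->f_type was populated before the RPC outcome was known, and the failure path returned 0 after stuffing -1 into the counters, hiding the error from callers. A minimal sketch of the corrected shape; nfs_statfs_shape() and query_server() are hypothetical stand-ins for the function and its ->statfs RPC:

static int nfs_statfs_shape(struct kstatfs *buf)
{
	int error;

	lock_kernel();
	error = query_server(buf);	/* hypothetical: the ->statfs RPC */
	if (error < 0)
		goto out_err;
	buf->f_type = NFS_SUPER_MAGIC;	/* only set on success now */
	/* ... f_bsize, f_blocks, f_bfree, f_namelen ... */
	unlock_kernel();
	return 0;

out_err:
	unlock_kernel();
	return error;	/* previously: returned 0 with -1 placeholders */
}
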
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index f4a0548b9ce8..bc2821331c29 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -61,15 +61,9 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
61 err = page; 61 err = page;
62 goto read_failed; 62 goto read_failed;
63 } 63 }
64 if (!PageUptodate(page)) {
65 err = ERR_PTR(-EIO);
66 goto getlink_read_error;
67 }
68 nd_set_link(nd, kmap(page)); 64 nd_set_link(nd, kmap(page));
69 return page; 65 return page;
70 66
71getlink_read_error:
72 page_cache_release(page);
73read_failed: 67read_failed:
74 nd_set_link(nd, err); 68 nd_set_link(nd, err);
75 return NULL; 69 return NULL;
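
The deleted PageUptodate check here (and the matching removals in the ntfs hunks below) relies on a VFS-level change: read_mapping_page() now waits for the read to finish and returns either an uptodate page or an ERR_PTR(), so the wait-and-recheck dance is dead code. A sketch of the caller pattern before and after, assuming that guarantee:

/* Before: the read might still be in flight when the helper returned,
 * so callers had to wait and then re-check the outcome. */
page = read_mapping_page(mapping, index, NULL);
if (IS_ERR(page))
	return PTR_ERR(page);
wait_on_page_locked(page);
if (!PageUptodate(page)) {	/* I/O completed with an error */
	page_cache_release(page);
	return -EIO;
}

/* After: the helper waits itself and never returns a page that is
 * not uptodate, so the two checks collapse into one. */
page = read_mapping_page(mapping, index, NULL);
if (IS_ERR(page))
	return PTR_ERR(page);
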
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ad2e91b4904f..5d44b8bd1070 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -38,8 +38,8 @@
38static struct nfs_page * nfs_update_request(struct nfs_open_context*, 38static struct nfs_page * nfs_update_request(struct nfs_open_context*,
39 struct page *, 39 struct page *,
40 unsigned int, unsigned int); 40 unsigned int, unsigned int);
41static void nfs_mark_request_dirty(struct nfs_page *req); 41static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
42static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how); 42 struct inode *inode, int ioflags);
43static const struct rpc_call_ops nfs_write_partial_ops; 43static const struct rpc_call_ops nfs_write_partial_ops;
44static const struct rpc_call_ops nfs_write_full_ops; 44static const struct rpc_call_ops nfs_write_full_ops;
45static const struct rpc_call_ops nfs_commit_ops; 45static const struct rpc_call_ops nfs_commit_ops;
@@ -72,9 +72,8 @@ void nfs_commit_free(struct nfs_write_data *wdata)
72 call_rcu_bh(&wdata->task.u.tk_rcu, nfs_commit_rcu_free); 72 call_rcu_bh(&wdata->task.u.tk_rcu, nfs_commit_rcu_free);
73} 73}
74 74
75struct nfs_write_data *nfs_writedata_alloc(size_t len) 75struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
76{ 76{
77 unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
78 struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); 77 struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
79 78
80 if (p) { 79 if (p) {
@@ -140,7 +139,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
140{ 139{
141 struct inode *inode = page->mapping->host; 140 struct inode *inode = page->mapping->host;
142 loff_t end, i_size = i_size_read(inode); 141 loff_t end, i_size = i_size_read(inode);
143 unsigned long end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; 142 pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
144 143
145 if (i_size > 0 && page->index < end_index) 144 if (i_size > 0 && page->index < end_index)
146 return; 145 return;
@@ -202,7 +201,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
202static int wb_priority(struct writeback_control *wbc) 201static int wb_priority(struct writeback_control *wbc)
203{ 202{
204 if (wbc->for_reclaim) 203 if (wbc->for_reclaim)
205 return FLUSH_HIGHPRI; 204 return FLUSH_HIGHPRI | FLUSH_STABLE;
206 if (wbc->for_kupdate) 205 if (wbc->for_kupdate)
207 return FLUSH_LOWPRI; 206 return FLUSH_LOWPRI;
208 return 0; 207 return 0;
@@ -252,10 +251,12 @@ static void nfs_end_page_writeback(struct page *page)
252 * was not tagged. 251 * was not tagged.
253 * May also return an error if the user signalled nfs_wait_on_request(). 252 * May also return an error if the user signalled nfs_wait_on_request().
254 */ 253 */
255static int nfs_page_mark_flush(struct page *page) 254static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
255 struct page *page)
256{ 256{
257 struct nfs_page *req; 257 struct nfs_page *req;
258 spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock; 258 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
259 spinlock_t *req_lock = &nfsi->req_lock;
259 int ret; 260 int ret;
260 261
261 spin_lock(req_lock); 262 spin_lock(req_lock);
@@ -273,19 +274,30 @@ static int nfs_page_mark_flush(struct page *page)
273 * request as dirty (in which case we don't care). 274 * request as dirty (in which case we don't care).
274 */ 275 */
275 spin_unlock(req_lock); 276 spin_unlock(req_lock);
277 /* Prevent deadlock! */
278 nfs_pageio_complete(pgio);
276 ret = nfs_wait_on_request(req); 279 ret = nfs_wait_on_request(req);
277 nfs_release_request(req); 280 nfs_release_request(req);
278 if (ret != 0) 281 if (ret != 0)
279 return ret; 282 return ret;
280 spin_lock(req_lock); 283 spin_lock(req_lock);
281 } 284 }
282 spin_unlock(req_lock); 285 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
283 if (nfs_set_page_writeback(page) == 0) { 286 /* This request is marked for commit */
284 nfs_list_remove_request(req); 287 spin_unlock(req_lock);
285 nfs_mark_request_dirty(req); 288 nfs_unlock_request(req);
289 nfs_pageio_complete(pgio);
290 return 1;
291 }
292 if (nfs_set_page_writeback(page) != 0) {
293 spin_unlock(req_lock);
294 BUG();
286 } 295 }
296 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
297 NFS_PAGE_TAG_WRITEBACK);
287 ret = test_bit(PG_NEED_FLUSH, &req->wb_flags); 298 ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
288 nfs_unlock_request(req); 299 spin_unlock(req_lock);
300 nfs_pageio_add_request(pgio, req);
289 return ret; 301 return ret;
290} 302}
291 303
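
The terse "Prevent deadlock!" comment deserves a gloss: the pageio descriptor may still hold a locked, queued-but-unsent request for the very page this code is about to sleep on, so the descriptor must be flushed before waiting. The ordering rule, sketched out of context:

/* pgio may hold queued-but-unsent requests, one of which can be the
 * locked request this thread is about to sleep on. Flushing first
 * guarantees that request's I/O is submitted and can complete. */
spin_unlock(req_lock);
nfs_pageio_complete(pgio);	/* submit everything queued so far */
ret = nfs_wait_on_request(req);	/* only now is sleeping safe */
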
@@ -294,6 +306,7 @@ static int nfs_page_mark_flush(struct page *page)
294 */ 306 */
295static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) 307static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
296{ 308{
309 struct nfs_pageio_descriptor mypgio, *pgio;
297 struct nfs_open_context *ctx; 310 struct nfs_open_context *ctx;
298 struct inode *inode = page->mapping->host; 311 struct inode *inode = page->mapping->host;
299 unsigned offset; 312 unsigned offset;
@@ -302,7 +315,14 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
302 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 315 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
303 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 316 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
304 317
305 err = nfs_page_mark_flush(page); 318 if (wbc->for_writepages)
319 pgio = wbc->fs_private;
320 else {
321 nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc));
322 pgio = &mypgio;
323 }
324
325 err = nfs_page_async_flush(pgio, page);
306 if (err <= 0) 326 if (err <= 0)
307 goto out; 327 goto out;
308 err = 0; 328 err = 0;
@@ -319,12 +339,12 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
319 put_nfs_open_context(ctx); 339 put_nfs_open_context(ctx);
320 if (err != 0) 340 if (err != 0)
321 goto out; 341 goto out;
322 err = nfs_page_mark_flush(page); 342 err = nfs_page_async_flush(pgio, page);
323 if (err > 0) 343 if (err > 0)
324 err = 0; 344 err = 0;
325out: 345out:
326 if (!wbc->for_writepages) 346 if (!wbc->for_writepages)
327 nfs_flush_mapping(page->mapping, wbc, FLUSH_STABLE|wb_priority(wbc)); 347 nfs_pageio_complete(pgio);
328 return err; 348 return err;
329} 349}
330 350
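
nfs_writepage_locked() is now reachable two ways, and the descriptor's lifetime differs in each. A hedged sketch of the ownership rule implied by the hunk above and the nfs_writepages() hunk that follows:

/* Batched path: nfs_writepages() owns the descriptor and completes
 * it once, after generic_writepages() has called ->writepage for
 * every dirty page, handing it down via wbc->fs_private. */
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
wbc->fs_private = &pgio;
err = generic_writepages(mapping, wbc);
nfs_pageio_complete(&pgio);

/* Single-page path: nfs_writepage_locked() sees for_writepages == 0,
 * builds a descriptor on its own stack, and must complete it before
 * returning -- no request may outlive the frame the descriptor
 * lives on. */
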
@@ -340,20 +360,20 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
340int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 360int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
341{ 361{
342 struct inode *inode = mapping->host; 362 struct inode *inode = mapping->host;
363 struct nfs_pageio_descriptor pgio;
343 int err; 364 int err;
344 365
345 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 366 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
346 367
368 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
369 wbc->fs_private = &pgio;
347 err = generic_writepages(mapping, wbc); 370 err = generic_writepages(mapping, wbc);
371 nfs_pageio_complete(&pgio);
348 if (err) 372 if (err)
349 return err; 373 return err;
350 err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc)); 374 if (pgio.pg_error)
351 if (err < 0) 375 return pgio.pg_error;
352 goto out; 376 return 0;
353 nfs_add_stats(inode, NFSIOS_WRITEPAGES, err);
354 err = 0;
355out:
356 return err;
357} 377}
358 378
359/* 379/*
@@ -376,6 +396,8 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
376 } 396 }
377 SetPagePrivate(req->wb_page); 397 SetPagePrivate(req->wb_page);
378 set_page_private(req->wb_page, (unsigned long)req); 398 set_page_private(req->wb_page, (unsigned long)req);
399 if (PageDirty(req->wb_page))
400 set_bit(PG_NEED_FLUSH, &req->wb_flags);
379 nfsi->npages++; 401 nfsi->npages++;
380 atomic_inc(&req->wb_count); 402 atomic_inc(&req->wb_count);
381 return 0; 403 return 0;
@@ -395,6 +417,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
395 set_page_private(req->wb_page, 0); 417 set_page_private(req->wb_page, 0);
396 ClearPagePrivate(req->wb_page); 418 ClearPagePrivate(req->wb_page);
397 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 419 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
420 if (test_and_clear_bit(PG_NEED_FLUSH, &req->wb_flags))
421 __set_page_dirty_nobuffers(req->wb_page);
398 nfsi->npages--; 422 nfsi->npages--;
399 if (!nfsi->npages) { 423 if (!nfsi->npages) {
400 spin_unlock(&nfsi->req_lock); 424 spin_unlock(&nfsi->req_lock);
@@ -406,24 +430,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
406 nfs_release_request(req); 430 nfs_release_request(req);
407} 431}
408 432
409/*
410 * Add a request to the inode's dirty list.
411 */
412static void
413nfs_mark_request_dirty(struct nfs_page *req)
414{
415 struct inode *inode = req->wb_context->dentry->d_inode;
416 struct nfs_inode *nfsi = NFS_I(inode);
417
418 spin_lock(&nfsi->req_lock);
419 radix_tree_tag_set(&nfsi->nfs_page_tree,
420 req->wb_index, NFS_PAGE_TAG_DIRTY);
421 nfs_list_add_request(req, &nfsi->dirty);
422 nfsi->ndirty++;
423 spin_unlock(&nfsi->req_lock);
424 __mark_inode_dirty(inode, I_DIRTY_PAGES);
425}
426
427static void 433static void
428nfs_redirty_request(struct nfs_page *req) 434nfs_redirty_request(struct nfs_page *req)
429{ 435{
@@ -438,7 +444,7 @@ nfs_dirty_request(struct nfs_page *req)
438{ 444{
439 struct page *page = req->wb_page; 445 struct page *page = req->wb_page;
440 446
441 if (page == NULL) 447 if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags))
442 return 0; 448 return 0;
443 return !PageWriteback(req->wb_page); 449 return !PageWriteback(req->wb_page);
444} 450}
@@ -456,10 +462,48 @@ nfs_mark_request_commit(struct nfs_page *req)
456 spin_lock(&nfsi->req_lock); 462 spin_lock(&nfsi->req_lock);
457 nfs_list_add_request(req, &nfsi->commit); 463 nfs_list_add_request(req, &nfsi->commit);
458 nfsi->ncommit++; 464 nfsi->ncommit++;
465 set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
459 spin_unlock(&nfsi->req_lock); 466 spin_unlock(&nfsi->req_lock);
460 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 467 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
461 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 468 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
462} 469}
470
471static inline
472int nfs_write_need_commit(struct nfs_write_data *data)
473{
474 return data->verf.committed != NFS_FILE_SYNC;
475}
476
477static inline
478int nfs_reschedule_unstable_write(struct nfs_page *req)
479{
480 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
481 nfs_mark_request_commit(req);
482 return 1;
483 }
484 if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) {
485 nfs_redirty_request(req);
486 return 1;
487 }
488 return 0;
489}
490#else
491static inline void
492nfs_mark_request_commit(struct nfs_page *req)
493{
494}
495
496static inline
497int nfs_write_need_commit(struct nfs_write_data *data)
498{
499 return 0;
500}
501
502static inline
503int nfs_reschedule_unstable_write(struct nfs_page *req)
504{
505 return 0;
506}
463#endif 507#endif
464 508
465/* 509/*
@@ -467,11 +511,11 @@ nfs_mark_request_commit(struct nfs_page *req)
467 * 511 *
468 * Interruptible by signals only if mounted with intr flag. 512 * Interruptible by signals only if mounted with intr flag.
469 */ 513 */
470static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_start, unsigned int npages) 514static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
471{ 515{
472 struct nfs_inode *nfsi = NFS_I(inode); 516 struct nfs_inode *nfsi = NFS_I(inode);
473 struct nfs_page *req; 517 struct nfs_page *req;
474 unsigned long idx_end, next; 518 pgoff_t idx_end, next;
475 unsigned int res = 0; 519 unsigned int res = 0;
476 int error; 520 int error;
477 521
@@ -500,18 +544,6 @@ static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_st
500 return res; 544 return res;
501} 545}
502 546
503static void nfs_cancel_dirty_list(struct list_head *head)
504{
505 struct nfs_page *req;
506 while(!list_empty(head)) {
507 req = nfs_list_entry(head->next);
508 nfs_list_remove_request(req);
509 nfs_end_page_writeback(req->wb_page);
510 nfs_inode_remove_request(req);
511 nfs_clear_page_writeback(req);
512 }
513}
514
515static void nfs_cancel_commit_list(struct list_head *head) 547static void nfs_cancel_commit_list(struct list_head *head)
516{ 548{
517 struct nfs_page *req; 549 struct nfs_page *req;
@@ -520,6 +552,7 @@ static void nfs_cancel_commit_list(struct list_head *head)
520 req = nfs_list_entry(head->next); 552 req = nfs_list_entry(head->next);
521 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 553 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
522 nfs_list_remove_request(req); 554 nfs_list_remove_request(req);
555 clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
523 nfs_inode_remove_request(req); 556 nfs_inode_remove_request(req);
524 nfs_unlock_request(req); 557 nfs_unlock_request(req);
525 } 558 }
@@ -537,7 +570,7 @@ static void nfs_cancel_commit_list(struct list_head *head)
537 * The requests are *not* checked to ensure that they form a contiguous set. 570 * The requests are *not* checked to ensure that they form a contiguous set.
538 */ 571 */
539static int 572static int
540nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages) 573nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
541{ 574{
542 struct nfs_inode *nfsi = NFS_I(inode); 575 struct nfs_inode *nfsi = NFS_I(inode);
543 int res = 0; 576 int res = 0;
@@ -551,40 +584,12 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_st
551 return res; 584 return res;
552} 585}
553#else 586#else
554static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages) 587static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
555{ 588{
556 return 0; 589 return 0;
557} 590}
558#endif 591#endif
559 592
560static int nfs_wait_on_write_congestion(struct address_space *mapping)
561{
562 struct inode *inode = mapping->host;
563 struct backing_dev_info *bdi = mapping->backing_dev_info;
564 int ret = 0;
565
566 might_sleep();
567
568 if (!bdi_write_congested(bdi))
569 return 0;
570
571 nfs_inc_stats(inode, NFSIOS_CONGESTIONWAIT);
572
573 do {
574 struct rpc_clnt *clnt = NFS_CLIENT(inode);
575 sigset_t oldset;
576
577 rpc_clnt_sigmask(clnt, &oldset);
578 ret = congestion_wait_interruptible(WRITE, HZ/10);
579 rpc_clnt_sigunmask(clnt, &oldset);
580 if (ret == -ERESTARTSYS)
581 break;
582 ret = 0;
583 } while (bdi_write_congested(bdi));
584
585 return ret;
586}
587
588/* 593/*
589 * Try to update any existing write request, or create one if there is none. 594 * Try to update any existing write request, or create one if there is none.
590 * In order to match, the request's credentials must match those of 595 * In order to match, the request's credentials must match those of
@@ -599,12 +604,10 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
599 struct inode *inode = mapping->host; 604 struct inode *inode = mapping->host;
600 struct nfs_inode *nfsi = NFS_I(inode); 605 struct nfs_inode *nfsi = NFS_I(inode);
601 struct nfs_page *req, *new = NULL; 606 struct nfs_page *req, *new = NULL;
602 unsigned long rqend, end; 607 pgoff_t rqend, end;
603 608
604 end = offset + bytes; 609 end = offset + bytes;
605 610
606 if (nfs_wait_on_write_congestion(mapping))
607 return ERR_PTR(-ERESTARTSYS);
608 for (;;) { 611 for (;;) {
609 /* Loop over all inode entries and see if we find 612 /* Loop over all inode entries and see if we find
610 * A request for the page we wish to update 613 * A request for the page we wish to update
@@ -746,26 +749,12 @@ int nfs_updatepage(struct file *file, struct page *page,
746 749
747static void nfs_writepage_release(struct nfs_page *req) 750static void nfs_writepage_release(struct nfs_page *req)
748{ 751{
749 nfs_end_page_writeback(req->wb_page);
750
751#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
752 if (!PageError(req->wb_page)) {
753 if (NFS_NEED_RESCHED(req)) {
754 nfs_redirty_request(req);
755 goto out;
756 } else if (NFS_NEED_COMMIT(req)) {
757 nfs_mark_request_commit(req);
758 goto out;
759 }
760 }
761 nfs_inode_remove_request(req);
762 752
763out: 753 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
764 nfs_clear_commit(req); 754 nfs_end_page_writeback(req->wb_page);
765 nfs_clear_reschedule(req); 755 nfs_inode_remove_request(req);
766#else 756 } else
767 nfs_inode_remove_request(req); 757 nfs_end_page_writeback(req->wb_page);
768#endif
769 nfs_clear_page_writeback(req); 758 nfs_clear_page_writeback(req);
770} 759}
771 760
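
nfs_writepage_release() now delegates the commit-vs-resend decision to nfs_reschedule_unstable_write(), added earlier in this patch. A compilable miniature of that decision, with plain flags standing in for the kernel's atomic bitops:

#include <stdio.h>

enum { NEED_COMMIT = 1, NEED_RESCHED = 2 };

/* Returns 1 if the request needs another pass (commit or rewrite),
 * 0 if it can be removed -- mirrors nfs_reschedule_unstable_write(). */
static int reschedule_unstable_write(unsigned *flags)
{
	if (*flags & NEED_COMMIT) {
		printf("queue for COMMIT\n");
		return 1;
	}
	if (*flags & NEED_RESCHED) {
		*flags &= ~NEED_RESCHED;	/* test_and_clear */
		printf("redirty and resend\n");
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned f = NEED_RESCHED;
	printf("done=%d\n", !reschedule_unstable_write(&f));
	return 0;
}
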
@@ -842,7 +831,7 @@ static void nfs_execute_write(struct nfs_write_data *data)
842 * Generate multiple small requests to write out a single 831 * Generate multiple small requests to write out a single
843 * contiguous dirty area on one page. 832 * contiguous dirty area on one page.
844 */ 833 */
845static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how) 834static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
846{ 835{
847 struct nfs_page *req = nfs_list_entry(head->next); 836 struct nfs_page *req = nfs_list_entry(head->next);
848 struct page *page = req->wb_page; 837 struct page *page = req->wb_page;
@@ -854,11 +843,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how)
854 843
855 nfs_list_remove_request(req); 844 nfs_list_remove_request(req);
856 845
857 nbytes = req->wb_bytes; 846 nbytes = count;
858 do { 847 do {
859 size_t len = min(nbytes, wsize); 848 size_t len = min(nbytes, wsize);
860 849
861 data = nfs_writedata_alloc(len); 850 data = nfs_writedata_alloc(1);
862 if (!data) 851 if (!data)
863 goto out_bad; 852 goto out_bad;
864 list_add(&data->pages, &list); 853 list_add(&data->pages, &list);
@@ -869,23 +858,19 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how)
869 858
870 ClearPageError(page); 859 ClearPageError(page);
871 offset = 0; 860 offset = 0;
872 nbytes = req->wb_bytes; 861 nbytes = count;
873 do { 862 do {
874 data = list_entry(list.next, struct nfs_write_data, pages); 863 data = list_entry(list.next, struct nfs_write_data, pages);
875 list_del_init(&data->pages); 864 list_del_init(&data->pages);
876 865
877 data->pagevec[0] = page; 866 data->pagevec[0] = page;
878 867
879 if (nbytes > wsize) { 868 if (nbytes < wsize)
880 nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, 869 wsize = nbytes;
881 wsize, offset, how); 870 nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
882 offset += wsize; 871 wsize, offset, how);
883 nbytes -= wsize; 872 offset += wsize;
884 } else { 873 nbytes -= wsize;
885 nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
886 nbytes, offset, how);
887 nbytes = 0;
888 }
889 nfs_execute_write(data); 874 nfs_execute_write(data);
890 } while (nbytes != 0); 875 } while (nbytes != 0);
891 876
@@ -897,8 +882,8 @@ out_bad:
897 list_del(&data->pages); 882 list_del(&data->pages);
898 nfs_writedata_release(data); 883 nfs_writedata_release(data);
899 } 884 }
900 nfs_end_page_writeback(req->wb_page);
901 nfs_redirty_request(req); 885 nfs_redirty_request(req);
886 nfs_end_page_writeback(req->wb_page);
902 nfs_clear_page_writeback(req); 887 nfs_clear_page_writeback(req);
903 return -ENOMEM; 888 return -ENOMEM;
904} 889}
@@ -911,26 +896,23 @@ out_bad:
911 * This is the case if nfs_updatepage detects a conflicting request 896 * This is the case if nfs_updatepage detects a conflicting request
912 * that has been written but not committed. 897 * that has been written but not committed.
913 */ 898 */
914static int nfs_flush_one(struct inode *inode, struct list_head *head, int how) 899static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
915{ 900{
916 struct nfs_page *req; 901 struct nfs_page *req;
917 struct page **pages; 902 struct page **pages;
918 struct nfs_write_data *data; 903 struct nfs_write_data *data;
919 unsigned int count;
920 904
921 data = nfs_writedata_alloc(NFS_SERVER(inode)->wsize); 905 data = nfs_writedata_alloc(npages);
922 if (!data) 906 if (!data)
923 goto out_bad; 907 goto out_bad;
924 908
925 pages = data->pagevec; 909 pages = data->pagevec;
926 count = 0;
927 while (!list_empty(head)) { 910 while (!list_empty(head)) {
928 req = nfs_list_entry(head->next); 911 req = nfs_list_entry(head->next);
929 nfs_list_remove_request(req); 912 nfs_list_remove_request(req);
930 nfs_list_add_request(req, &data->pages); 913 nfs_list_add_request(req, &data->pages);
931 ClearPageError(req->wb_page); 914 ClearPageError(req->wb_page);
932 *pages++ = req->wb_page; 915 *pages++ = req->wb_page;
933 count += req->wb_bytes;
934 } 916 }
935 req = nfs_list_entry(data->pages.next); 917 req = nfs_list_entry(data->pages.next);
936 918
@@ -943,47 +925,22 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, int how)
943 while (!list_empty(head)) { 925 while (!list_empty(head)) {
944 struct nfs_page *req = nfs_list_entry(head->next); 926 struct nfs_page *req = nfs_list_entry(head->next);
945 nfs_list_remove_request(req); 927 nfs_list_remove_request(req);
946 nfs_end_page_writeback(req->wb_page);
947 nfs_redirty_request(req); 928 nfs_redirty_request(req);
929 nfs_end_page_writeback(req->wb_page);
948 nfs_clear_page_writeback(req); 930 nfs_clear_page_writeback(req);
949 } 931 }
950 return -ENOMEM; 932 return -ENOMEM;
951} 933}
952 934
953static int nfs_flush_list(struct inode *inode, struct list_head *head, int npages, int how) 935static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
936 struct inode *inode, int ioflags)
954{ 937{
955 LIST_HEAD(one_request);
956 int (*flush_one)(struct inode *, struct list_head *, int);
957 struct nfs_page *req;
958 int wpages = NFS_SERVER(inode)->wpages;
959 int wsize = NFS_SERVER(inode)->wsize; 938 int wsize = NFS_SERVER(inode)->wsize;
960 int error;
961 939
962 flush_one = nfs_flush_one;
963 if (wsize < PAGE_CACHE_SIZE) 940 if (wsize < PAGE_CACHE_SIZE)
964 flush_one = nfs_flush_multi; 941 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
965 /* For single writes, FLUSH_STABLE is more efficient */ 942 else
966 if (npages <= wpages && npages == NFS_I(inode)->npages 943 nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags);
967 && nfs_list_entry(head->next)->wb_bytes <= wsize)
968 how |= FLUSH_STABLE;
969
970 do {
971 nfs_coalesce_requests(head, &one_request, wpages);
972 req = nfs_list_entry(one_request.next);
973 error = flush_one(inode, &one_request, how);
974 if (error < 0)
975 goto out_err;
976 } while (!list_empty(head));
977 return 0;
978out_err:
979 while (!list_empty(head)) {
980 req = nfs_list_entry(head->next);
981 nfs_list_remove_request(req);
982 nfs_end_page_writeback(req->wb_page);
983 nfs_redirty_request(req);
984 nfs_clear_page_writeback(req);
985 }
986 return error;
987} 944}
988 945
989/* 946/*
@@ -1008,22 +965,28 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
1008 nfs_set_pageerror(page); 965 nfs_set_pageerror(page);
1009 req->wb_context->error = task->tk_status; 966 req->wb_context->error = task->tk_status;
1010 dprintk(", error = %d\n", task->tk_status); 967 dprintk(", error = %d\n", task->tk_status);
1011 } else { 968 goto out;
1012#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1013 if (data->verf.committed < NFS_FILE_SYNC) {
1014 if (!NFS_NEED_COMMIT(req)) {
1015 nfs_defer_commit(req);
1016 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
1017 dprintk(" defer commit\n");
1018 } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) {
1019 nfs_defer_reschedule(req);
1020 dprintk(" server reboot detected\n");
1021 }
1022 } else
1023#endif
1024 dprintk(" OK\n");
1025 } 969 }
1026 970
971 if (nfs_write_need_commit(data)) {
972 spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock;
973
974 spin_lock(req_lock);
975 if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) {
 976 /* Do nothing; we need to resend the writes */

977 } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) {
978 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
979 dprintk(" defer commit\n");
980 } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) {
981 set_bit(PG_NEED_RESCHED, &req->wb_flags);
982 clear_bit(PG_NEED_COMMIT, &req->wb_flags);
983 dprintk(" server reboot detected\n");
984 }
985 spin_unlock(req_lock);
986 } else
987 dprintk(" OK\n");
988
989out:
1027 if (atomic_dec_and_test(&req->wb_complete)) 990 if (atomic_dec_and_test(&req->wb_complete))
1028 nfs_writepage_release(req); 991 nfs_writepage_release(req);
1029} 992}
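
The rewritten completion handler distinguishes the unstable-write cases by comparing write verifiers: the first completion records the verifier and defers a commit, while a later completion carrying a different verifier means the server rebooted and the data must be resent. A compilable miniature of that logic:

#include <stdio.h>
#include <string.h>

/* The verifier is an opaque cookie that changes when the server
 * reboots and loses its unstable (uncommitted) data. */
struct req { unsigned char verf[8]; int need_commit; int need_resched; };

static void on_unstable_write_done(struct req *r, const unsigned char *verf)
{
	if (r->need_resched) {
		/* already marked for resend; nothing to do */
	} else if (!r->need_commit) {
		memcpy(r->verf, verf, sizeof(r->verf));	/* defer commit */
		r->need_commit = 1;
	} else if (memcmp(r->verf, verf, sizeof(r->verf))) {
		r->need_resched = 1;	/* server reboot detected */
		r->need_commit = 0;
	}
}

int main(void)
{
	struct req r = {0};
	on_unstable_write_done(&r, (const unsigned char *)"bootgen1");
	on_unstable_write_done(&r, (const unsigned char *)"bootgen2");
	printf("resched=%d commit=%d\n", r.need_resched, r.need_commit);
	return 0;
}
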
@@ -1064,25 +1027,21 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
1064 if (task->tk_status < 0) { 1027 if (task->tk_status < 0) {
1065 nfs_set_pageerror(page); 1028 nfs_set_pageerror(page);
1066 req->wb_context->error = task->tk_status; 1029 req->wb_context->error = task->tk_status;
1067 nfs_end_page_writeback(page);
1068 nfs_inode_remove_request(req);
1069 dprintk(", error = %d\n", task->tk_status); 1030 dprintk(", error = %d\n", task->tk_status);
1070 goto next; 1031 goto remove_request;
1071 } 1032 }
1072 nfs_end_page_writeback(page);
1073 1033
1074#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1034 if (nfs_write_need_commit(data)) {
1075 if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) { 1035 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
1076 nfs_inode_remove_request(req); 1036 nfs_mark_request_commit(req);
1077 dprintk(" OK\n"); 1037 nfs_end_page_writeback(page);
1038 dprintk(" marked for commit\n");
1078 goto next; 1039 goto next;
1079 } 1040 }
1080 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); 1041 dprintk(" OK\n");
1081 nfs_mark_request_commit(req); 1042remove_request:
1082 dprintk(" marked for commit\n"); 1043 nfs_end_page_writeback(page);
1083#else
1084 nfs_inode_remove_request(req); 1044 nfs_inode_remove_request(req);
1085#endif
1086 next: 1045 next:
1087 nfs_clear_page_writeback(req); 1046 nfs_clear_page_writeback(req);
1088 } 1047 }
@@ -1270,6 +1229,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
1270 while (!list_empty(&data->pages)) { 1229 while (!list_empty(&data->pages)) {
1271 req = nfs_list_entry(data->pages.next); 1230 req = nfs_list_entry(data->pages.next);
1272 nfs_list_remove_request(req); 1231 nfs_list_remove_request(req);
1232 clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
1273 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 1233 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1274 1234
1275 dprintk("NFS: commit (%s/%Ld %d@%Ld)", 1235 dprintk("NFS: commit (%s/%Ld %d@%Ld)",
@@ -1304,31 +1264,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
1304 .rpc_call_done = nfs_commit_done, 1264 .rpc_call_done = nfs_commit_done,
1305 .rpc_release = nfs_commit_release, 1265 .rpc_release = nfs_commit_release,
1306}; 1266};
1307#else
1308static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1309{
1310 return 0;
1311}
1312#endif
1313
1314static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
1315{
1316 struct nfs_inode *nfsi = NFS_I(mapping->host);
1317 LIST_HEAD(head);
1318 long res;
1319 1267
1320 spin_lock(&nfsi->req_lock);
1321 res = nfs_scan_dirty(mapping, wbc, &head);
1322 spin_unlock(&nfsi->req_lock);
1323 if (res) {
1324 int error = nfs_flush_list(mapping->host, &head, res, how);
1325 if (error < 0)
1326 return error;
1327 }
1328 return res;
1329}
1330
1331#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1332int nfs_commit_inode(struct inode *inode, int how) 1268int nfs_commit_inode(struct inode *inode, int how)
1333{ 1269{
1334 struct nfs_inode *nfsi = NFS_I(inode); 1270 struct nfs_inode *nfsi = NFS_I(inode);
@@ -1345,13 +1281,18 @@ int nfs_commit_inode(struct inode *inode, int how)
1345 } 1281 }
1346 return res; 1282 return res;
1347} 1283}
1284#else
1285static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1286{
1287 return 0;
1288}
1348#endif 1289#endif
1349 1290
1350long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) 1291long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
1351{ 1292{
1352 struct inode *inode = mapping->host; 1293 struct inode *inode = mapping->host;
1353 struct nfs_inode *nfsi = NFS_I(inode); 1294 struct nfs_inode *nfsi = NFS_I(inode);
1354 unsigned long idx_start, idx_end; 1295 pgoff_t idx_start, idx_end;
1355 unsigned int npages = 0; 1296 unsigned int npages = 0;
1356 LIST_HEAD(head); 1297 LIST_HEAD(head);
1357 int nocommit = how & FLUSH_NOCOMMIT; 1298 int nocommit = how & FLUSH_NOCOMMIT;
@@ -1364,41 +1305,24 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
1364 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; 1305 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
1365 idx_end = wbc->range_end >> PAGE_CACHE_SHIFT; 1306 idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
1366 if (idx_end > idx_start) { 1307 if (idx_end > idx_start) {
1367 unsigned long l_npages = 1 + idx_end - idx_start; 1308 pgoff_t l_npages = 1 + idx_end - idx_start;
1368 npages = l_npages; 1309 npages = l_npages;
1369 if (sizeof(npages) != sizeof(l_npages) && 1310 if (sizeof(npages) != sizeof(l_npages) &&
1370 (unsigned long)npages != l_npages) 1311 (pgoff_t)npages != l_npages)
1371 npages = 0; 1312 npages = 0;
1372 } 1313 }
1373 } 1314 }
1374 how &= ~FLUSH_NOCOMMIT; 1315 how &= ~FLUSH_NOCOMMIT;
1375 spin_lock(&nfsi->req_lock); 1316 spin_lock(&nfsi->req_lock);
1376 do { 1317 do {
1377 wbc->pages_skipped = 0;
1378 ret = nfs_wait_on_requests_locked(inode, idx_start, npages); 1318 ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
1379 if (ret != 0) 1319 if (ret != 0)
1380 continue; 1320 continue;
1381 pages = nfs_scan_dirty(mapping, wbc, &head);
1382 if (pages != 0) {
1383 spin_unlock(&nfsi->req_lock);
1384 if (how & FLUSH_INVALIDATE) {
1385 nfs_cancel_dirty_list(&head);
1386 ret = pages;
1387 } else
1388 ret = nfs_flush_list(inode, &head, pages, how);
1389 spin_lock(&nfsi->req_lock);
1390 continue;
1391 }
1392 if (wbc->pages_skipped != 0)
1393 continue;
1394 if (nocommit) 1321 if (nocommit)
1395 break; 1322 break;
1396 pages = nfs_scan_commit(inode, &head, idx_start, npages); 1323 pages = nfs_scan_commit(inode, &head, idx_start, npages);
1397 if (pages == 0) { 1324 if (pages == 0)
1398 if (wbc->pages_skipped != 0)
1399 continue;
1400 break; 1325 break;
1401 }
1402 if (how & FLUSH_INVALIDATE) { 1326 if (how & FLUSH_INVALIDATE) {
1403 spin_unlock(&nfsi->req_lock); 1327 spin_unlock(&nfsi->req_lock);
1404 nfs_cancel_commit_list(&head); 1328 nfs_cancel_commit_list(&head);
@@ -1430,7 +1354,7 @@ int nfs_wb_all(struct inode *inode)
1430 }; 1354 };
1431 int ret; 1355 int ret;
1432 1356
1433 ret = generic_writepages(mapping, &wbc); 1357 ret = nfs_writepages(mapping, &wbc);
1434 if (ret < 0) 1358 if (ret < 0)
1435 goto out; 1359 goto out;
1436 ret = nfs_sync_mapping_wait(mapping, &wbc, 0); 1360 ret = nfs_sync_mapping_wait(mapping, &wbc, 0);
@@ -1453,11 +1377,9 @@ int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, lo
1453 }; 1377 };
1454 int ret; 1378 int ret;
1455 1379
1456 if (!(how & FLUSH_NOWRITEPAGE)) { 1380 ret = nfs_writepages(mapping, &wbc);
1457 ret = generic_writepages(mapping, &wbc); 1381 if (ret < 0)
1458 if (ret < 0) 1382 goto out;
1459 goto out;
1460 }
1461 ret = nfs_sync_mapping_wait(mapping, &wbc, how); 1383 ret = nfs_sync_mapping_wait(mapping, &wbc, how);
1462 if (ret >= 0) 1384 if (ret >= 0)
1463 return 0; 1385 return 0;
@@ -1480,7 +1402,7 @@ int nfs_wb_page_priority(struct inode *inode, struct page *page, int how)
1480 int ret; 1402 int ret;
1481 1403
1482 BUG_ON(!PageLocked(page)); 1404 BUG_ON(!PageLocked(page));
1483 if (!(how & FLUSH_NOWRITEPAGE) && clear_page_dirty_for_io(page)) { 1405 if (clear_page_dirty_for_io(page)) {
1484 ret = nfs_writepage_locked(page, &wbc); 1406 ret = nfs_writepage_locked(page, &wbc);
1485 if (ret < 0) 1407 if (ret < 0)
1486 goto out; 1408 goto out;
@@ -1505,15 +1427,32 @@ int nfs_wb_page(struct inode *inode, struct page* page)
1505 1427
1506int nfs_set_page_dirty(struct page *page) 1428int nfs_set_page_dirty(struct page *page)
1507{ 1429{
1430 struct address_space *mapping = page->mapping;
1431 struct inode *inode;
1432 spinlock_t *req_lock;
1508 struct nfs_page *req; 1433 struct nfs_page *req;
1434 int ret;
1509 1435
1510 req = nfs_page_find_request(page); 1436 if (!mapping)
1437 goto out_raced;
1438 inode = mapping->host;
1439 if (!inode)
1440 goto out_raced;
1441 req_lock = &NFS_I(inode)->req_lock;
1442 spin_lock(req_lock);
1443 req = nfs_page_find_request_locked(page);
1511 if (req != NULL) { 1444 if (req != NULL) {
1512 /* Mark any existing write requests for flushing */ 1445 /* Mark any existing write requests for flushing */
1513 set_bit(PG_NEED_FLUSH, &req->wb_flags); 1446 ret = !test_and_set_bit(PG_NEED_FLUSH, &req->wb_flags);
1447 spin_unlock(req_lock);
1514 nfs_release_request(req); 1448 nfs_release_request(req);
1449 return ret;
1515 } 1450 }
1516 return __set_page_dirty_nobuffers(page); 1451 ret = __set_page_dirty_nobuffers(page);
1452 spin_unlock(req_lock);
1453 return ret;
1454out_raced:
1455 return !TestSetPageDirty(page);
1517} 1456}
1518 1457
1519 1458
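
Besides closing the lookup race, the nfs_set_page_dirty() rewrite makes the return value meaningful: set_page_dirty() callers expect non-zero only when the page was newly dirtied, which is why the flag update became ret = !test_and_set_bit(...). A compilable miniature of that idiom:

#include <stdio.h>

/* test-and-set returns the *old* state, so negating it yields
 * "was this newly set?" -- exactly what the caller wants. */
static int test_and_set(unsigned *word, unsigned bit)
{
	unsigned old = *word & (1u << bit);
	*word |= 1u << bit;
	return old != 0;
}

int main(void)
{
	unsigned flags = 0;
	printf("newly dirtied: %d\n", !test_and_set(&flags, 0));	/* 1 */
	printf("newly dirtied: %d\n", !test_and_set(&flags, 0));	/* 0 */
	return 0;
}
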
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index fb14d68eacab..32ffea033c7a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -315,16 +315,13 @@ out:
315/* 315/*
316 * RPC procedure tables 316 * RPC procedure tables
317 */ 317 */
318#ifndef MAX
319# define MAX(a, b) (((a) > (b))? (a) : (b))
320#endif
321
322#define PROC(proc, call, argtype, restype) \ 318#define PROC(proc, call, argtype, restype) \
323[NFSPROC4_CLNT_##proc] = { \ 319[NFSPROC4_CLNT_##proc] = { \
324 .p_proc = NFSPROC4_CB_##call, \ 320 .p_proc = NFSPROC4_CB_##call, \
325 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 321 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \
326 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 322 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \
327 .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2, \ 323 .p_arglen = NFS4_##argtype##_sz, \
324 .p_replen = NFS4_##restype##_sz, \
328 .p_statidx = NFSPROC4_CB_##call, \ 325 .p_statidx = NFSPROC4_CB_##call, \
329 .p_name = #proc, \ 326 .p_name = #proc, \
330} 327}
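
Splitting p_bufsiz into p_arglen and p_replen lets the RPC layer size the send and receive buffers independently instead of allocating MAX() of the two for both directions, which is also why the local MAX macro could be deleted. A hand-expanded sketch of one resulting table entry; the CB_RECALL/COMPOUND identifiers follow the PROC() macro's naming pattern but are assumptions, not copied from the file:

[NFSPROC4_CLNT_CB_RECALL] = {
	.p_proc    = NFSPROC4_CB_COMPOUND,
	.p_encode  = (kxdrproc_t) nfs4_xdr_enc_cb_recall,
	.p_decode  = (kxdrproc_t) nfs4_xdr_dec_cb_recall,
	.p_arglen  = NFS4_enc_cb_recall_sz,	/* send size, 32-bit words */
	.p_replen  = NFS4_dec_cb_recall_sz,	/* receive size, 32-bit words */
	.p_statidx = NFSPROC4_CB_COMPOUND,
	.p_name    = "CB_RECALL",
},
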
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 9393f4b1e298..caecc58f529c 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -89,9 +89,8 @@ static inline struct page *ntfs_map_page(struct address_space *mapping,
89 struct page *page = read_mapping_page(mapping, index, NULL); 89 struct page *page = read_mapping_page(mapping, index, NULL);
90 90
91 if (!IS_ERR(page)) { 91 if (!IS_ERR(page)) {
92 wait_on_page_locked(page);
93 kmap(page); 92 kmap(page);
94 if (PageUptodate(page) && !PageError(page)) 93 if (!PageError(page))
95 return page; 94 return page;
96 ntfs_unmap_page(page); 95 ntfs_unmap_page(page);
97 return ERR_PTR(-EIO); 96 return ERR_PTR(-EIO);
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 7659cc192995..1c08fefe487a 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -2532,14 +2532,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
2532 page = read_mapping_page(mapping, idx, NULL); 2532 page = read_mapping_page(mapping, idx, NULL);
2533 if (IS_ERR(page)) { 2533 if (IS_ERR(page)) {
2534 ntfs_error(vol->sb, "Failed to read first partial " 2534 ntfs_error(vol->sb, "Failed to read first partial "
2535 "page (sync error, index 0x%lx).", idx); 2535 "page (error, index 0x%lx).", idx);
2536 return PTR_ERR(page);
2537 }
2538 wait_on_page_locked(page);
2539 if (unlikely(!PageUptodate(page))) {
2540 ntfs_error(vol->sb, "Failed to read first partial page "
2541 "(async error, index 0x%lx).", idx);
2542 page_cache_release(page);
2543 return PTR_ERR(page); 2536 return PTR_ERR(page);
2544 } 2537 }
2545 /* 2538 /*
@@ -2602,14 +2595,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
2602 page = read_mapping_page(mapping, idx, NULL); 2595 page = read_mapping_page(mapping, idx, NULL);
2603 if (IS_ERR(page)) { 2596 if (IS_ERR(page)) {
2604 ntfs_error(vol->sb, "Failed to read last partial page " 2597 ntfs_error(vol->sb, "Failed to read last partial page "
2605 "(sync error, index 0x%lx).", idx); 2598 "(error, index 0x%lx).", idx);
2606 return PTR_ERR(page);
2607 }
2608 wait_on_page_locked(page);
2609 if (unlikely(!PageUptodate(page))) {
2610 ntfs_error(vol->sb, "Failed to read last partial page "
2611 "(async error, index 0x%lx).", idx);
2612 page_cache_release(page);
2613 return PTR_ERR(page); 2599 return PTR_ERR(page);
2614 } 2600 }
2615 kaddr = kmap_atomic(page, KM_USER0); 2601 kaddr = kmap_atomic(page, KM_USER0);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index d69c4595ccd0..dbbac5593106 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -236,8 +236,7 @@ do_non_resident_extend:
236 err = PTR_ERR(page); 236 err = PTR_ERR(page);
237 goto init_err_out; 237 goto init_err_out;
238 } 238 }
239 wait_on_page_locked(page); 239 if (unlikely(PageError(page))) {
240 if (unlikely(!PageUptodate(page) || PageError(page))) {
241 page_cache_release(page); 240 page_cache_release(page);
242 err = -EIO; 241 err = -EIO;
243 goto init_err_out; 242 goto init_err_out;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 1594c90b7164..21d834e5ed73 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2471,7 +2471,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2471 s64 nr_free = vol->nr_clusters; 2471 s64 nr_free = vol->nr_clusters;
2472 u32 *kaddr; 2472 u32 *kaddr;
2473 struct address_space *mapping = vol->lcnbmp_ino->i_mapping; 2473 struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
2474 filler_t *readpage = (filler_t*)mapping->a_ops->readpage;
2475 struct page *page; 2474 struct page *page;
2476 pgoff_t index, max_index; 2475 pgoff_t index, max_index;
2477 2476
@@ -2494,24 +2493,14 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2494 * Read the page from page cache, getting it from backing store 2493 * Read the page from page cache, getting it from backing store
2495 * if necessary, and increment the use count. 2494 * if necessary, and increment the use count.
2496 */ 2495 */
2497 page = read_cache_page(mapping, index, (filler_t*)readpage, 2496 page = read_mapping_page(mapping, index, NULL);
2498 NULL);
2499 /* Ignore pages which errored synchronously. */ 2497 /* Ignore pages which errored synchronously. */
2500 if (IS_ERR(page)) { 2498 if (IS_ERR(page)) {
2501 ntfs_debug("Sync read_cache_page() error. Skipping " 2499 ntfs_debug("read_mapping_page() error. Skipping "
2502 "page (index 0x%lx).", index); 2500 "page (index 0x%lx).", index);
2503 nr_free -= PAGE_CACHE_SIZE * 8; 2501 nr_free -= PAGE_CACHE_SIZE * 8;
2504 continue; 2502 continue;
2505 } 2503 }
2506 wait_on_page_locked(page);
2507 /* Ignore pages which errored asynchronously. */
2508 if (!PageUptodate(page)) {
2509 ntfs_debug("Async read_cache_page() error. Skipping "
2510 "page (index 0x%lx).", index);
2511 page_cache_release(page);
2512 nr_free -= PAGE_CACHE_SIZE * 8;
2513 continue;
2514 }
2515 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2504 kaddr = (u32*)kmap_atomic(page, KM_USER0);
2516 /* 2505 /*
2517 * For each 4 bytes, subtract the number of set bits. If this 2506 * For each 4 bytes, subtract the number of set bits. If this
@@ -2562,7 +2551,6 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2562{ 2551{
2563 u32 *kaddr; 2552 u32 *kaddr;
2564 struct address_space *mapping = vol->mftbmp_ino->i_mapping; 2553 struct address_space *mapping = vol->mftbmp_ino->i_mapping;
2565 filler_t *readpage = (filler_t*)mapping->a_ops->readpage;
2566 struct page *page; 2554 struct page *page;
2567 pgoff_t index; 2555 pgoff_t index;
2568 2556
@@ -2576,21 +2564,11 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2576 * Read the page from page cache, getting it from backing store 2564 * Read the page from page cache, getting it from backing store
2577 * if necessary, and increment the use count. 2565 * if necessary, and increment the use count.
2578 */ 2566 */
2579 page = read_cache_page(mapping, index, (filler_t*)readpage, 2567 page = read_mapping_page(mapping, index, NULL);
2580 NULL);
2581 /* Ignore pages which errored synchronously. */ 2568 /* Ignore pages which errored synchronously. */
2582 if (IS_ERR(page)) { 2569 if (IS_ERR(page)) {
2583 ntfs_debug("Sync read_cache_page() error. Skipping " 2570 ntfs_debug("read_mapping_page() error. Skipping "
2584 "page (index 0x%lx).", index);
2585 nr_free -= PAGE_CACHE_SIZE * 8;
2586 continue;
2587 }
2588 wait_on_page_locked(page);
2589 /* Ignore pages which errored asynchronously. */
2590 if (!PageUptodate(page)) {
2591 ntfs_debug("Async read_cache_page() error. Skipping "
2592 "page (index 0x%lx).", index); 2571 "page (index 0x%lx).", index);
2593 page_cache_release(page);
2594 nr_free -= PAGE_CACHE_SIZE * 8; 2572 nr_free -= PAGE_CACHE_SIZE * 8;
2595 continue; 2573 continue;
2596 } 2574 }
@@ -3107,8 +3085,7 @@ static void ntfs_big_inode_init_once(void *foo, struct kmem_cache *cachep,
3107{ 3085{
3108 ntfs_inode *ni = (ntfs_inode *)foo; 3086 ntfs_inode *ni = (ntfs_inode *)foo;
3109 3087
3110 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 3088 if (flags & SLAB_CTOR_CONSTRUCTOR)
3111 SLAB_CTOR_CONSTRUCTOR)
3112 inode_init_once(VFS_I(ni)); 3089 inode_init_once(VFS_I(ni));
3113} 3090}
3114 3091
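
For context on the ntfs/super.c hunks: get_nr_free_clusters() walks the cluster bitmap and subtracts the set bits (allocated clusters) from the total, treating unreadable pages pessimistically as fully allocated (nr_free -= PAGE_CACHE_SIZE * 8). A compilable miniature of the counting step, using the GCC/Clang builtin where the kernel uses its hamming-weight helpers:

#include <stdio.h>
#include <stdint.h>

/* Every set bit is an allocated cluster, so
 * free = total - popcount(bitmap), 32 bits at a time. */
static int64_t count_free(const uint32_t *bitmap, size_t words,
			  int64_t total_clusters)
{
	int64_t nr_free = total_clusters;
	for (size_t i = 0; i < words; i++)
		nr_free -= __builtin_popcount(bitmap[i]);
	return nr_free;
}

int main(void)
{
	uint32_t bmp[2] = { 0xFFu, 0x1u };	/* 9 clusters in use */
	printf("free = %lld\n", (long long)count_free(bmp, 2, 64));	/* 55 */
	return 0;
}
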
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f27e5378caf2..19712a7d145f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -27,6 +27,7 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/swap.h>
30 31
31#define MLOG_MASK_PREFIX ML_DISK_ALLOC 32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
32#include <cluster/masklog.h> 33#include <cluster/masklog.h>
@@ -34,6 +35,7 @@
34#include "ocfs2.h" 35#include "ocfs2.h"
35 36
36#include "alloc.h" 37#include "alloc.h"
38#include "aops.h"
37#include "dlmglue.h" 39#include "dlmglue.h"
38#include "extent_map.h" 40#include "extent_map.h"
39#include "inode.h" 41#include "inode.h"
@@ -47,63 +49,243 @@
47 49
48#include "buffer_head_io.h" 50#include "buffer_head_io.h"
49 51
50static int ocfs2_extent_contig(struct inode *inode, 52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
51 struct ocfs2_extent_rec *ext,
52 u64 blkno);
53 53
54static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 54/*
55 handle_t *handle, 55 * Structures which describe a path through a btree, and functions to
56 struct inode *inode, 56 * manipulate them.
57 int wanted, 57 *
58 struct ocfs2_alloc_context *meta_ac, 58 * The idea here is to be as generic as possible with the tree
59 struct buffer_head *bhs[]); 59 * manipulation code.
60 */
61struct ocfs2_path_item {
62 struct buffer_head *bh;
63 struct ocfs2_extent_list *el;
64};
60 65
61static int ocfs2_add_branch(struct ocfs2_super *osb, 66#define OCFS2_MAX_PATH_DEPTH 5
62 handle_t *handle,
63 struct inode *inode,
64 struct buffer_head *fe_bh,
65 struct buffer_head *eb_bh,
66 struct buffer_head *last_eb_bh,
67 struct ocfs2_alloc_context *meta_ac);
68 67
69static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 68struct ocfs2_path {
70 handle_t *handle, 69 int p_tree_depth;
71 struct inode *inode, 70 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
72 struct buffer_head *fe_bh, 71};
73 struct ocfs2_alloc_context *meta_ac,
74 struct buffer_head **ret_new_eb_bh);
75 72
76static int ocfs2_do_insert_extent(struct ocfs2_super *osb, 73#define path_root_bh(_path) ((_path)->p_node[0].bh)
77 handle_t *handle, 74#define path_root_el(_path) ((_path)->p_node[0].el)
78 struct inode *inode, 75#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
79 struct buffer_head *fe_bh, 76#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
80 u64 blkno, 77#define path_num_items(_path) ((_path)->p_tree_depth + 1)
81 u32 new_clusters);
82 78
83static int ocfs2_find_branch_target(struct ocfs2_super *osb, 79/*
84 struct inode *inode, 80 * Reset the actual path elements so that we can re-use the structure
85 struct buffer_head *fe_bh, 81 * to build another path. Generally, this involves freeing the buffer
86 struct buffer_head **target_bh); 82 * heads.
83 */
84static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
85{
86 int i, start = 0, depth = 0;
87 struct ocfs2_path_item *node;
87 88
88static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, 89 if (keep_root)
89 struct inode *inode, 90 start = 1;
90 struct ocfs2_dinode *fe, 91
91 unsigned int new_i_clusters, 92 for(i = start; i < path_num_items(path); i++) {
92 struct buffer_head *old_last_eb, 93 node = &path->p_node[i];
93 struct buffer_head **new_last_eb); 94
95 brelse(node->bh);
96 node->bh = NULL;
97 node->el = NULL;
98 }
99
100 /*
101 * Tree depth may change during truncate, or insert. If we're
102 * keeping the root extent list, then make sure that our path
103 * structure reflects the proper depth.
104 */
105 if (keep_root)
106 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
107
108 path->p_tree_depth = depth;
109}
110
111static void ocfs2_free_path(struct ocfs2_path *path)
112{
113 if (path) {
114 ocfs2_reinit_path(path, 0);
115 kfree(path);
116 }
117}
118
119/*
120 * Make the *dest path the same as src and re-initialize src path to
121 * have a root only.
122 */
123static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
124{
125 int i;
126
127 BUG_ON(path_root_bh(dest) != path_root_bh(src));
128
129 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
130 brelse(dest->p_node[i].bh);
131
132 dest->p_node[i].bh = src->p_node[i].bh;
133 dest->p_node[i].el = src->p_node[i].el;
134
135 src->p_node[i].bh = NULL;
136 src->p_node[i].el = NULL;
137 }
138}
139
140/*
141 * Insert an extent block at given index.
142 *
143 * This will not take an additional reference on eb_bh.
144 */
145static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
146 struct buffer_head *eb_bh)
147{
148 struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
149
150 /*
151 * Right now, no root bh is an extent block, so this helps
152 * catch code errors with dinode trees. The assertion can be
153 * safely removed if we ever need to insert extent block
154 * structures at the root.
155 */
156 BUG_ON(index == 0);
157
158 path->p_node[index].bh = eb_bh;
159 path->p_node[index].el = &eb->h_list;
160}
161
162static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
163 struct ocfs2_extent_list *root_el)
164{
165 struct ocfs2_path *path;
166
167 BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
168
169 path = kzalloc(sizeof(*path), GFP_NOFS);
170 if (path) {
171 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
172 get_bh(root_bh);
173 path_root_bh(path) = root_bh;
174 path_root_el(path) = root_el;
175 }
176
177 return path;
178}
179
180/*
181 * Allocate and initialize a new path based on a disk inode tree.
182 */
183static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
184{
185 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
186 struct ocfs2_extent_list *el = &di->id2.i_list;
187
188 return ocfs2_new_path(di_bh, el);
189}
190
191/*
192 * Convenience function to journal all components in a path.
193 */
194static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
195 struct ocfs2_path *path)
196{
197 int i, ret = 0;
198
199 if (!path)
200 goto out;
201
202 for(i = 0; i < path_num_items(path); i++) {
203 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
204 OCFS2_JOURNAL_ACCESS_WRITE);
205 if (ret < 0) {
206 mlog_errno(ret);
207 goto out;
208 }
209 }
210
211out:
212 return ret;
213}
214
215enum ocfs2_contig_type {
216 CONTIG_NONE = 0,
217 CONTIG_LEFT,
218 CONTIG_RIGHT
219};
94 220
95static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
96 221
97static int ocfs2_extent_contig(struct inode *inode, 222/*
98 struct ocfs2_extent_rec *ext, 223 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
 99 u64 blkno) 224 * ocfs2_extent_contig() only work properly against leaf nodes!
225 */
226static int ocfs2_block_extent_contig(struct super_block *sb,
227 struct ocfs2_extent_rec *ext,
228 u64 blkno)
229{
230 u64 blk_end = le64_to_cpu(ext->e_blkno);
231
232 blk_end += ocfs2_clusters_to_blocks(sb,
233 le16_to_cpu(ext->e_leaf_clusters));
234
235 return blkno == blk_end;
236}
237
238static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
239 struct ocfs2_extent_rec *right)
240{
241 u32 left_range;
242
243 left_range = le32_to_cpu(left->e_cpos) +
244 le16_to_cpu(left->e_leaf_clusters);
245
246 return (left_range == le32_to_cpu(right->e_cpos));
247}
248
249static enum ocfs2_contig_type
250 ocfs2_extent_contig(struct inode *inode,
251 struct ocfs2_extent_rec *ext,
252 struct ocfs2_extent_rec *insert_rec)
100{ 253{
101 return blkno == (le64_to_cpu(ext->e_blkno) + 254 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
102 ocfs2_clusters_to_blocks(inode->i_sb, 255
103 le32_to_cpu(ext->e_clusters))); 256 if (ocfs2_extents_adjacent(ext, insert_rec) &&
257 ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
258 return CONTIG_RIGHT;
259
260 blkno = le64_to_cpu(ext->e_blkno);
261 if (ocfs2_extents_adjacent(insert_rec, ext) &&
262 ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
263 return CONTIG_LEFT;
264
265 return CONTIG_NONE;
104} 266}
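
A worked example of the classification above (editor's illustration; the numbers assume 8 blocks per cluster): with ext = { e_cpos 10, e_leaf_clusters 2, e_blkno 800 } and insert_rec = { e_cpos 12, e_leaf_clusters 1, e_blkno 816 }, the insert begins at logical cluster 10 + 2 and at block 800 + 2 * 8, so ocfs2_extents_adjacent() and ocfs2_block_extent_contig() both succeed and ocfs2_extent_contig() returns CONTIG_RIGHT. Swapping the roles of the two records yields CONTIG_LEFT. Logical adjacency alone is not enough; the physical block ranges must also touch, which is why both tests are required.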
105 267
106/* 268/*
269 * NOTE: We can have pretty much any combination of contiguousness and
270 * appending.
271 *
272 * The usefulness of APPEND_TAIL is more in that it lets us know that
273 * we'll have to update the path to that leaf.
274 */
275enum ocfs2_append_type {
276 APPEND_NONE = 0,
277 APPEND_TAIL,
278};
279
280struct ocfs2_insert_type {
281 enum ocfs2_append_type ins_appending;
282 enum ocfs2_contig_type ins_contig;
283 int ins_contig_index;
284 int ins_free_records;
285 int ins_tree_depth;
286};
287
288/*
107 * How many free extents have we got before we need more meta data? 289 * How many free extents have we got before we need more meta data?
108 */ 290 */
109int ocfs2_num_free_extents(struct ocfs2_super *osb, 291int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -242,6 +424,28 @@ bail:
242} 424}
243 425
244/* 426/*
427 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
428 *
429 * Returns the sum of the rightmost extent rec logical offset and
430 * cluster count.
431 *
432 * ocfs2_add_branch() uses this to determine what logical cluster
433 * value should be populated into the leftmost new branch records.
434 *
435 * ocfs2_shift_tree_depth() uses this to determine the # clusters
436 * value for the new topmost tree record.
437 */
438static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
439{
440 int i;
441
442 i = le16_to_cpu(el->l_next_free_rec) - 1;
443
444 return le32_to_cpu(el->l_recs[i].e_cpos) +
445 ocfs2_rec_clusters(el, &el->l_recs[i]);
446}
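
Worked example (editor's illustration): if the rightmost used record is { e_cpos = 100, 25 clusters }, ocfs2_sum_rightmost_rec() returns 125, the first logical cluster the list does not yet cover. That is exactly the e_cpos stamped into the empty records of a new branch, and, when computed over the copied root list in ocfs2_shift_tree_depth(), the e_int_clusters value for the new topmost record.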
447
448/*
245 * Add an entire tree branch to our inode. eb_bh is the extent block 449 * Add an entire tree branch to our inode. eb_bh is the extent block
246 * to start at, if we don't want to start the branch at the dinode 450 * to start at, if we don't want to start the branch at the dinode
247 * structure. 451 * structure.
@@ -250,7 +454,7 @@ bail:
250 * for the new last extent block. 454 * for the new last extent block.
251 * 455 *
252 * the new branch will be 'empty' in the sense that every block will 456 * the new branch will be 'empty' in the sense that every block will
253 * contain a single record with e_clusters == 0.
457 * contain a single record with cluster count == 0.
254 */ 458 */
255static int ocfs2_add_branch(struct ocfs2_super *osb, 459static int ocfs2_add_branch(struct ocfs2_super *osb,
256 handle_t *handle, 460 handle_t *handle,
@@ -268,6 +472,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
268 struct ocfs2_extent_block *eb; 472 struct ocfs2_extent_block *eb;
269 struct ocfs2_extent_list *eb_el; 473 struct ocfs2_extent_list *eb_el;
270 struct ocfs2_extent_list *el; 474 struct ocfs2_extent_list *el;
475 u32 new_cpos;
271 476
272 mlog_entry_void(); 477 mlog_entry_void();
273 478
@@ -302,6 +507,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
302 goto bail; 507 goto bail;
303 } 508 }
304 509
510 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
511 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
512
305 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 513 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
306 * linked with the rest of the tree. 514 * linked with the rest of the tree.
307 * conversely, new_eb_bhs[0] is the new bottommost leaf. 515 * conversely, new_eb_bhs[0] is the new bottommost leaf.
@@ -330,9 +538,18 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
330 eb->h_next_leaf_blk = 0; 538 eb->h_next_leaf_blk = 0;
331 eb_el->l_tree_depth = cpu_to_le16(i); 539 eb_el->l_tree_depth = cpu_to_le16(i);
332 eb_el->l_next_free_rec = cpu_to_le16(1); 540 eb_el->l_next_free_rec = cpu_to_le16(1);
333 eb_el->l_recs[0].e_cpos = fe->i_clusters;
541 /*
542 * This actually counts as an empty extent as
543 * the cluster count == 0
544 */
545 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
334 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
546 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
335 eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
547 /*
548 * eb_el isn't always an interior node, but even leaf
549 * nodes want a zeroed flags and reserved field so
550 * this gets the whole 32 bits regardless of use.
551 */
552 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
336 if (!eb_el->l_tree_depth) 553 if (!eb_el->l_tree_depth)
337 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 554 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
338 555
@@ -376,8 +593,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
376 * either be on the fe, or the extent block passed in. */ 593 * either be on the fe, or the extent block passed in. */
377 i = le16_to_cpu(el->l_next_free_rec); 594 i = le16_to_cpu(el->l_next_free_rec);
378 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 595 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
379 el->l_recs[i].e_cpos = fe->i_clusters;
380 el->l_recs[i].e_clusters = 0;
596 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
597 el->l_recs[i].e_int_clusters = 0;
381 le16_add_cpu(&el->l_next_free_rec, 1); 598 le16_add_cpu(&el->l_next_free_rec, 1);
382 599
383 /* fe needs a new last extent block pointer, as does the 600 /* fe needs a new last extent block pointer, as does the
@@ -425,6 +642,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
425 struct buffer_head **ret_new_eb_bh) 642 struct buffer_head **ret_new_eb_bh)
426{ 643{
427 int status, i; 644 int status, i;
645 u32 new_clusters;
428 struct buffer_head *new_eb_bh = NULL; 646 struct buffer_head *new_eb_bh = NULL;
429 struct ocfs2_dinode *fe; 647 struct ocfs2_dinode *fe;
430 struct ocfs2_extent_block *eb; 648 struct ocfs2_extent_block *eb;
@@ -461,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
461 /* copy the fe data into the new extent block */ 679 /* copy the fe data into the new extent block */
462 eb_el->l_tree_depth = fe_el->l_tree_depth; 680 eb_el->l_tree_depth = fe_el->l_tree_depth;
463 eb_el->l_next_free_rec = fe_el->l_next_free_rec; 681 eb_el->l_next_free_rec = fe_el->l_next_free_rec;
464 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
465 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
466 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
467 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
468 }
682 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
683 eb_el->l_recs[i] = fe_el->l_recs[i];
469 684
470 status = ocfs2_journal_dirty(handle, new_eb_bh); 685 status = ocfs2_journal_dirty(handle, new_eb_bh);
471 if (status < 0) { 686 if (status < 0) {
@@ -480,16 +695,15 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
480 goto bail; 695 goto bail;
481 } 696 }
482 697
698 new_clusters = ocfs2_sum_rightmost_rec(eb_el);
699
483 /* update fe now */ 700 /* update fe now */
484 le16_add_cpu(&fe_el->l_tree_depth, 1); 701 le16_add_cpu(&fe_el->l_tree_depth, 1);
485 fe_el->l_recs[0].e_cpos = 0; 702 fe_el->l_recs[0].e_cpos = 0;
486 fe_el->l_recs[0].e_blkno = eb->h_blkno; 703 fe_el->l_recs[0].e_blkno = eb->h_blkno;
487 fe_el->l_recs[0].e_clusters = fe->i_clusters;
488 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
489 fe_el->l_recs[i].e_cpos = 0;
490 fe_el->l_recs[i].e_clusters = 0;
491 fe_el->l_recs[i].e_blkno = 0;
492 }
704 fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
705 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
706 memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
493 fe_el->l_next_free_rec = cpu_to_le16(1); 707 fe_el->l_next_free_rec = cpu_to_le16(1);
494 708
495 /* If this is our 1st tree depth shift, then last_eb_blk 709 /* If this is our 1st tree depth shift, then last_eb_blk
@@ -515,199 +729,6 @@ bail:
515} 729}
516 730
517/* 731/*
518 * Expects the tree to already have room in the rightmost leaf for the
519 * extent. Updates all the extent blocks (and the dinode) on the way
520 * down.
521 */
522static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
523 handle_t *handle,
524 struct inode *inode,
525 struct buffer_head *fe_bh,
526 u64 start_blk,
527 u32 new_clusters)
528{
529 int status, i, num_bhs = 0;
530 u64 next_blkno;
531 u16 next_free;
532 struct buffer_head **eb_bhs = NULL;
533 struct ocfs2_dinode *fe;
534 struct ocfs2_extent_block *eb;
535 struct ocfs2_extent_list *el;
536
537 mlog_entry_void();
538
539 status = ocfs2_journal_access(handle, inode, fe_bh,
540 OCFS2_JOURNAL_ACCESS_WRITE);
541 if (status < 0) {
542 mlog_errno(status);
543 goto bail;
544 }
545
546 fe = (struct ocfs2_dinode *) fe_bh->b_data;
547 el = &fe->id2.i_list;
548 if (el->l_tree_depth) {
549 /* This is another operation where we want to be
550 * careful about our tree updates. An error here means
551 * none of the previous changes we made should roll
552 * forward. As a result, we have to record the buffers
553 * for this part of the tree in an array and reserve a
554 * journal write to them before making any changes. */
555 num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
556 eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
557 GFP_KERNEL);
558 if (!eb_bhs) {
559 status = -ENOMEM;
560 mlog_errno(status);
561 goto bail;
562 }
563
564 i = 0;
565 while(el->l_tree_depth) {
566 next_free = le16_to_cpu(el->l_next_free_rec);
567 if (next_free == 0) {
568 ocfs2_error(inode->i_sb,
569 "Dinode %llu has a bad extent list",
570 (unsigned long long)OCFS2_I(inode)->ip_blkno);
571 status = -EIO;
572 goto bail;
573 }
574 next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
575
576 BUG_ON(i >= num_bhs);
577 status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
578 OCFS2_BH_CACHED, inode);
579 if (status < 0) {
580 mlog_errno(status);
581 goto bail;
582 }
583 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
584 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
585 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
586 eb);
587 status = -EIO;
588 goto bail;
589 }
590
591 status = ocfs2_journal_access(handle, inode, eb_bhs[i],
592 OCFS2_JOURNAL_ACCESS_WRITE);
593 if (status < 0) {
594 mlog_errno(status);
595 goto bail;
596 }
597
598 el = &eb->h_list;
599 i++;
600 /* When we leave this loop, eb_bhs[num_bhs - 1] will
601 * hold the bottom-most leaf extent block. */
602 }
603 BUG_ON(el->l_tree_depth);
604
605 el = &fe->id2.i_list;
606 /* If we have tree depth, then the fe update is
607 * trivial, and we want to switch el out for the
608 * bottom-most leaf in order to update it with the
609 * actual extent data below. */
610 next_free = le16_to_cpu(el->l_next_free_rec);
611 if (next_free == 0) {
612 ocfs2_error(inode->i_sb,
613 "Dinode %llu has a bad extent list",
614 (unsigned long long)OCFS2_I(inode)->ip_blkno);
615 status = -EIO;
616 goto bail;
617 }
618 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
619 new_clusters);
620 /* (num_bhs - 1) to avoid the leaf */
621 for(i = 0; i < (num_bhs - 1); i++) {
622 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
623 el = &eb->h_list;
624
625 /* finally, make our actual change to the
626 * intermediate extent blocks. */
627 next_free = le16_to_cpu(el->l_next_free_rec);
628 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
629 new_clusters);
630
631 status = ocfs2_journal_dirty(handle, eb_bhs[i]);
632 if (status < 0)
633 mlog_errno(status);
634 }
635 BUG_ON(i != (num_bhs - 1));
636 /* note that the leaf block wasn't touched in
637 * the loop above */
638 eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
639 el = &eb->h_list;
640 BUG_ON(el->l_tree_depth);
641 }
642
643 /* yay, we can finally add the actual extent now! */
644 i = le16_to_cpu(el->l_next_free_rec) - 1;
645 if (le16_to_cpu(el->l_next_free_rec) &&
646 ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
647 le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
648 } else if (le16_to_cpu(el->l_next_free_rec) &&
649 (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
650 /* having an empty extent at eof is legal. */
651 if (el->l_recs[i].e_cpos != fe->i_clusters) {
652 ocfs2_error(inode->i_sb,
653 "Dinode %llu trailing extent is bad: "
654 "cpos (%u) != number of clusters (%u)",
655 (unsigned long long)OCFS2_I(inode)->ip_blkno,
656 le32_to_cpu(el->l_recs[i].e_cpos),
657 le32_to_cpu(fe->i_clusters));
658 status = -EIO;
659 goto bail;
660 }
661 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
662 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
663 } else {
664 /* No contiguous record, or no empty record at eof, so
665 * we add a new one. */
666
667 BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
668 le16_to_cpu(el->l_count));
669 i = le16_to_cpu(el->l_next_free_rec);
670
671 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
672 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
673 el->l_recs[i].e_cpos = fe->i_clusters;
674 le16_add_cpu(&el->l_next_free_rec, 1);
675 }
676
677 /*
678 * extent_map errors are not fatal, so they are ignored outside
679 * of flushing the thing.
680 */
681 status = ocfs2_extent_map_append(inode, &el->l_recs[i],
682 new_clusters);
683 if (status) {
684 mlog_errno(status);
685 ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
686 }
687
688 status = ocfs2_journal_dirty(handle, fe_bh);
689 if (status < 0)
690 mlog_errno(status);
691 if (fe->id2.i_list.l_tree_depth) {
692 status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
693 if (status < 0)
694 mlog_errno(status);
695 }
696
697 status = 0;
698bail:
699 if (eb_bhs) {
700 for (i = 0; i < num_bhs; i++)
701 if (eb_bhs[i])
702 brelse(eb_bhs[i]);
703 kfree(eb_bhs);
704 }
705
706 mlog_exit(status);
707 return status;
708}
709
710/*
711 * Should only be called when there is no space left in any of the 732 * Should only be called when there is no space left in any of the
712 * leaf nodes. What we want to do is find the lowest tree depth 733 * leaf nodes. What we want to do is find the lowest tree depth
713 * non-leaf extent block with room for new records. There are three 734 * non-leaf extent block with room for new records. There are three
@@ -807,53 +828,1548 @@ bail:
807 return status; 828 return status;
808} 829}
809 830
810/* the caller needs to update fe->i_clusters */
811int ocfs2_insert_extent(struct ocfs2_super *osb,
812 			handle_t *handle,
813 			struct inode *inode,
814 			struct buffer_head *fe_bh,
815 			u64 start_blk,
816 			u32 new_clusters,
817 			struct ocfs2_alloc_context *meta_ac)
818{
819 	int status, i, shift;
820 	struct buffer_head *last_eb_bh = NULL;
831/*
832 * This is only valid for leaf nodes, which are the only ones that can
833 * have empty extents anyway.
834 */
835static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
836{
837 	return !rec->e_leaf_clusters;
838}
839
840/*
841 * This function will discard the rightmost extent record.
842 */
843static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
844{
845 int next_free = le16_to_cpu(el->l_next_free_rec);
846 int count = le16_to_cpu(el->l_count);
847 unsigned int num_bytes;
848
849 BUG_ON(!next_free);
850 /* This will cause us to go off the end of our extent list. */
851 BUG_ON(next_free >= count);
852
853 num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
854
855 memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
856}
857
858static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
859 struct ocfs2_extent_rec *insert_rec)
860{
861 int i, insert_index, next_free, has_empty, num_bytes;
862 u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
863 struct ocfs2_extent_rec *rec;
864
865 next_free = le16_to_cpu(el->l_next_free_rec);
866 has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
867
868 BUG_ON(!next_free);
869
870 /* The tree code before us didn't allow enough room in the leaf. */
871 if (el->l_next_free_rec == el->l_count && !has_empty)
872 BUG();
873
874 /*
875 * The easiest way to approach this is to just remove the
876 * empty extent and temporarily decrement next_free.
877 */
878 if (has_empty) {
879 /*
880 * If next_free was 1 (only an empty extent), this
881 * loop won't execute, which is fine. We still want
882 * the decrement above to happen.
883 */
884 for(i = 0; i < (next_free - 1); i++)
885 el->l_recs[i] = el->l_recs[i+1];
886
887 next_free--;
888 }
889
890 /*
891 * Figure out what the new record index should be.
892 */
893 for(i = 0; i < next_free; i++) {
894 rec = &el->l_recs[i];
895
896 if (insert_cpos < le32_to_cpu(rec->e_cpos))
897 break;
898 }
899 insert_index = i;
900
901 mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
902 insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
903
904 BUG_ON(insert_index < 0);
905 BUG_ON(insert_index >= le16_to_cpu(el->l_count));
906 BUG_ON(insert_index > next_free);
907
908 /*
909 * No need to memmove if we're just adding to the tail.
910 */
911 if (insert_index != next_free) {
912 BUG_ON(next_free >= le16_to_cpu(el->l_count));
913
914 num_bytes = next_free - insert_index;
915 num_bytes *= sizeof(struct ocfs2_extent_rec);
916 memmove(&el->l_recs[insert_index + 1],
917 &el->l_recs[insert_index],
918 num_bytes);
919 }
920
921 /*
922 * Either we had an empty extent, and need to re-increment or
923 * there was no empty extent on a non-full rightmost leaf node,
924 * in which case we still need to increment.
925 */
926 next_free++;
927 el->l_next_free_rec = cpu_to_le16(next_free);
928 /*
929 * Make sure none of the math above just messed up our tree.
930 */
931 BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
932
933 el->l_recs[insert_index] = *insert_rec;
934
935}
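
To make the rotation concrete (editor's illustration): a leaf holding [ empty, (cpos 4, 4 clusters), (cpos 12, 4 clusters) ] that receives an insert at cpos 8 first drops the empty slot to get [ (4,4), (12,4) ], computes insert_index = 1, memmoves (12,4) one slot right, and ends up as [ (4,4), (8,...), (12,4) ] with l_next_free_rec back at 3. The BUG_ON()s above guarantee the memmove can never run past l_count.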
936
937/*
938 * Create an empty extent record.
939 *
940 * l_next_free_rec may be updated.
941 *
942 * If an empty extent already exists do nothing.
943 */
944static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
945{
946 int next_free = le16_to_cpu(el->l_next_free_rec);
947
948 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
949
950 if (next_free == 0)
951 goto set_and_inc;
952
953 if (ocfs2_is_empty_extent(&el->l_recs[0]))
954 return;
955
956 mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
957 "Asked to create an empty extent in a full list:\n"
958 "count = %u, tree depth = %u",
959 le16_to_cpu(el->l_count),
960 le16_to_cpu(el->l_tree_depth));
961
962 ocfs2_shift_records_right(el);
963
964set_and_inc:
965 le16_add_cpu(&el->l_next_free_rec, 1);
966 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
967}
968
969/*
970 * For a rotation which involves two leaf nodes, the "root node" is
971 * the lowest level tree node which contains a path to both leaves. This
972 * resulting set of information can be used to form a complete "subtree"
973 *
974 * This function is passed two full paths from the dinode down to a
975 * pair of adjacent leaves. Its task is to figure out which path
976 * index contains the subtree root - this can be the root index itself
977 * in a worst-case rotation.
978 *
979 * The array index of the subtree root is passed back.
980 */
981static int ocfs2_find_subtree_root(struct inode *inode,
982 struct ocfs2_path *left,
983 struct ocfs2_path *right)
984{
985 int i = 0;
986
987 /*
988 * Check that the caller passed in two paths from the same tree.
989 */
990 BUG_ON(path_root_bh(left) != path_root_bh(right));
991
992 do {
993 i++;
994
995 /*
996 * The caller didn't pass two adjacent paths.
997 */
998 mlog_bug_on_msg(i > left->p_tree_depth,
999 "Inode %lu, left depth %u, right depth %u\n"
1000 "left leaf blk %llu, right leaf blk %llu\n",
1001 inode->i_ino, left->p_tree_depth,
1002 right->p_tree_depth,
1003 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1004 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1005 } while (left->p_node[i].bh->b_blocknr ==
1006 right->p_node[i].bh->b_blocknr);
1007
1008 return i - 1;
1009}
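
Example (editor's illustration): in a depth-3 tree where the two paths share the root at index 0 and the same extent block at index 1 but diverge at index 2, the loop breaks with i == 2 and the function reports index 1, the lowest node through which both leaves are reachable.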
1010
1011typedef void (path_insert_t)(void *, struct buffer_head *);
1012
1013/*
1014 * Traverse a btree path in search of cpos, starting at root_el.
1015 *
1016 * This code can be called with a cpos larger than the tree, in which
1017 * case it will return the rightmost path.
1018 */
1019static int __ocfs2_find_path(struct inode *inode,
1020 struct ocfs2_extent_list *root_el, u32 cpos,
1021 path_insert_t *func, void *data)
1022{
1023 int i, ret = 0;
1024 u32 range;
1025 u64 blkno;
821 struct buffer_head *bh = NULL;
822 struct ocfs2_dinode *fe;
823 struct ocfs2_extent_block *eb;
824 struct ocfs2_extent_list *el;
825 
826 mlog_entry_void();
827 
828 mlog(0, "add %u clusters starting at block %llu to inode %llu\n",
829      new_clusters, (unsigned long long)start_blk,
830      (unsigned long long)OCFS2_I(inode)->ip_blkno);
831 
832 fe = (struct ocfs2_dinode *) fe_bh->b_data;
833 el = &fe->id2.i_list;
1026 struct buffer_head *bh = NULL;
1027 struct ocfs2_extent_block *eb;
1028 struct ocfs2_extent_list *el;
1029 struct ocfs2_extent_rec *rec;
1030 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1031 
1032 el = root_el;
1033 while (el->l_tree_depth) {
1034 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1035 ocfs2_error(inode->i_sb,
1036 "Inode %llu has empty extent list at "
1037 "depth %u\n",
1038 (unsigned long long)oi->ip_blkno,
1039 le16_to_cpu(el->l_tree_depth));
1040 ret = -EROFS;
1041 goto out;
1042 
1043 }
1044 
1045 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1046 rec = &el->l_recs[i];
1047
1048 /*
1049 * In the case that cpos is off the allocation
1050 * tree, this should just wind up returning the
1051 * rightmost record.
1052 */
1053 range = le32_to_cpu(rec->e_cpos) +
1054 ocfs2_rec_clusters(el, rec);
1055 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1056 break;
1057 }
834 1058
835 if (el->l_tree_depth) {
836 /* jump to end of tree */
837 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
838 &last_eb_bh, OCFS2_BH_CACHED, inode);
839 if (status < 0) {
840 mlog_exit(status);
841 goto bail;
842 }
843 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
844 el = &eb->h_list;
1059 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1060 if (blkno == 0) {
1061 ocfs2_error(inode->i_sb,
1062 "Inode %llu has bad blkno in extent list "
1063 "at depth %u (index %d)\n",
1064 (unsigned long long)oi->ip_blkno,
1065 le16_to_cpu(el->l_tree_depth), i);
1066 ret = -EROFS;
1067 goto out;
1068 }
1069 
1070 brelse(bh);
1071 bh = NULL;
1072 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1073 &bh, OCFS2_BH_CACHED, inode);
1074 if (ret) {
1075 mlog_errno(ret);
1076 goto out;
1077 }
1078 
1079 eb = (struct ocfs2_extent_block *) bh->b_data;
1080 el = &eb->h_list;
1081 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1082 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1083 ret = -EIO;
1084 goto out;
1085 }
1086
1087 if (le16_to_cpu(el->l_next_free_rec) >
1088 le16_to_cpu(el->l_count)) {
1089 ocfs2_error(inode->i_sb,
1090 "Inode %llu has bad count in extent list "
1091 "at block %llu (next free=%u, count=%u)\n",
1092 (unsigned long long)oi->ip_blkno,
1093 (unsigned long long)bh->b_blocknr,
1094 le16_to_cpu(el->l_next_free_rec),
1095 le16_to_cpu(el->l_count));
1096 ret = -EROFS;
1097 goto out;
1098 }
1099
1100 if (func)
1101 func(data, bh);
1102 }
1103
1104out:
1105 /*
1106 * Catch any trailing bh that the loop didn't handle.
1107 */
1108 brelse(bh);
1109
1110 return ret;
1111}
1112
1113/*
1114 * Given an initialized path (that is, it has a valid root extent
1115 * list), this function will traverse the btree in search of the path
1116 * which would contain cpos.
1117 *
1118 * The path traveled is recorded in the path structure.
1119 *
1120 * Note that this will not do any comparisons on leaf node extent
1121 * records, so it will work fine in the case that we just added a tree
1122 * branch.
1123 */
1124struct find_path_data {
1125 int index;
1126 struct ocfs2_path *path;
1127};
1128static void find_path_ins(void *data, struct buffer_head *bh)
1129{
1130 struct find_path_data *fp = data;
1131
1132 get_bh(bh);
1133 ocfs2_path_insert_eb(fp->path, fp->index, bh);
1134 fp->index++;
1135}
1136static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1137 u32 cpos)
1138{
1139 struct find_path_data data;
1140
1141 data.index = 1;
1142 data.path = path;
1143 return __ocfs2_find_path(inode, path_root_el(path), cpos,
1144 find_path_ins, &data);
1145}
1146
1147static void find_leaf_ins(void *data, struct buffer_head *bh)
1148{
1149 struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)bh->b_data;
1150 struct ocfs2_extent_list *el = &eb->h_list;
1151 struct buffer_head **ret = data;
1152
1153 /* We want to retain only the leaf block. */
1154 if (le16_to_cpu(el->l_tree_depth) == 0) {
1155 get_bh(bh);
1156 *ret = bh;
1157 }
1158}
1159/*
1160 * Find the leaf block in the tree which would contain cpos. No
1161 * checking of the actual leaf is done.
1162 *
1163 * Some paths want to call this instead of allocating a path structure
1164 * and calling ocfs2_find_path().
1165 *
1166 * This function doesn't handle non-btree extent lists.
1167 */
1168int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1169 u32 cpos, struct buffer_head **leaf_bh)
1170{
1171 int ret;
1172 struct buffer_head *bh = NULL;
1173
1174 ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1175 if (ret) {
1176 mlog_errno(ret);
1177 goto out;
1178 }
1179
1180 *leaf_bh = bh;
1181out:
1182 return ret;
1183}
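
A minimal caller sketch (editor's illustration, not part of this patch; example_count_leaf_recs is a hypothetical name) showing the intended use: grab just the leaf covering cpos, read it, and drop the reference ocfs2_find_leaf() took for us:

static int example_count_leaf_recs(struct inode *inode,
				   struct ocfs2_extent_list *root_el,
				   u32 cpos)
{
	int ret;
	struct buffer_head *leaf_bh = NULL;
	struct ocfs2_extent_block *eb;

	ret = ocfs2_find_leaf(inode, root_el, cpos, &leaf_bh);
	if (ret)
		return ret;

	eb = (struct ocfs2_extent_block *) leaf_bh->b_data;
	ret = le16_to_cpu(eb->h_list.l_next_free_rec);

	brelse(leaf_bh);	/* release the reference taken by find_leaf_ins() */
	return ret;
}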
1184
1185/*
1186 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1187 *
1188 * Basically, we've moved stuff around at the bottom of the tree and
1189 * we need to fix up the extent records above the changes to reflect
1190 * the new changes.
1191 *
1192 * left_rec: the record on the left.
1193 * left_child_el: the child list pointed to by left_rec
1194 * right_rec: the record to the right of left_rec
1195 * right_child_el: the child list pointed to by right_rec
1196 *
1197 * By definition, this only works on interior nodes.
1198 */
1199static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1200 struct ocfs2_extent_list *left_child_el,
1201 struct ocfs2_extent_rec *right_rec,
1202 struct ocfs2_extent_list *right_child_el)
1203{
1204 u32 left_clusters, right_end;
1205
1206 /*
1207 * Interior nodes never have holes. Their cpos is the cpos of
1208 * the leftmost record in their child list. Their cluster
1209 * count covers the full theoretical range of their child list
1210 * - the range between their cpos and the cpos of the record
1211 * immediately to their right.
1212 */
1213 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1214 left_clusters -= le32_to_cpu(left_rec->e_cpos);
1215 left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1216
1217 /*
1218 * Calculate the rightmost cluster count boundary before
1219 * moving cpos - we will need to adjust clusters after
1220 * updating e_cpos to keep the same highest cluster count.
1221 */
1222 right_end = le32_to_cpu(right_rec->e_cpos);
1223 right_end += le32_to_cpu(right_rec->e_int_clusters);
1224
1225 right_rec->e_cpos = left_rec->e_cpos;
1226 le32_add_cpu(&right_rec->e_cpos, left_clusters);
1227
1228 right_end -= le32_to_cpu(right_rec->e_cpos);
1229 right_rec->e_int_clusters = cpu_to_le32(right_end);
1230}
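
Worked example (editor's illustration): take left_rec = { e_cpos 0, e_int_clusters 100 } and right_rec = { e_cpos 100, e_int_clusters 50 }, and suppose the leaf rotation left right_child_el starting at cpos 80. Then left_clusters = 80 - 0, so left_rec shrinks to 80 clusters; right_rec->e_cpos moves back to 80, and since its old right edge was 100 + 50 = 150, right_rec->e_int_clusters grows to 150 - 80 = 70. The combined 0..150 range stays covered with no hole between the two records.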
1231
1232/*
1233 * Adjust the adjacent root node records involved in a
1234 * rotation. left_el_blkno is passed in as a key so that we can easily
1235 * find its index in the root list.
1236 */
1237static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1238 struct ocfs2_extent_list *left_el,
1239 struct ocfs2_extent_list *right_el,
1240 u64 left_el_blkno)
1241{
1242 int i;
1243
1244 BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1245 le16_to_cpu(left_el->l_tree_depth));
1246
1247 for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1248 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1249 break;
1250 }
1251
1252 /*
1253 * The path walking code should have never returned a root and
1254 * two paths which are not adjacent.
1255 */
1256 BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1257
1258 ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1259 &root_el->l_recs[i + 1], right_el);
1260}
1261
1262/*
1263 * We've changed a leaf block (in right_path) and need to reflect that
1264 * change back up the subtree.
1265 *
1266 * This happens in multiple places:
1267 * - When we've moved an extent record from the left path leaf to the right
1268 * path leaf to make room for an empty extent in the left path leaf.
1269 * - When our insert into the right path leaf is at the leftmost edge
1270 * and requires an update of the path immediately to its left. This
1271 * can occur at the end of some types of rotation and appending inserts.
1272 */
1273static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1274 struct ocfs2_path *left_path,
1275 struct ocfs2_path *right_path,
1276 int subtree_index)
1277{
1278 int ret, i, idx;
1279 struct ocfs2_extent_list *el, *left_el, *right_el;
1280 struct ocfs2_extent_rec *left_rec, *right_rec;
1281 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1282
1283 /*
1284 * Update the counts and position values within all the
1285 * interior nodes to reflect the leaf rotation we just did.
1286 *
1287 * The root node is handled below the loop.
1288 *
1289 * We begin the loop with right_el and left_el pointing to the
1290 * leaf lists and work our way up.
1291 *
1292 * NOTE: within this loop, left_el and right_el always refer
1293 * to the *child* lists.
1294 */
1295 left_el = path_leaf_el(left_path);
1296 right_el = path_leaf_el(right_path);
1297 for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1298 mlog(0, "Adjust records at index %u\n", i);
1299
1300 /*
1301 * One nice property of knowing that all of these
1302 * nodes are below the root is that we only deal with
1303 * the leftmost right node record and the rightmost
1304 * left node record.
1305 */
1306 el = left_path->p_node[i].el;
1307 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1308 left_rec = &el->l_recs[idx];
1309
1310 el = right_path->p_node[i].el;
1311 right_rec = &el->l_recs[0];
1312
1313 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1314 right_el);
1315
1316 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1317 if (ret)
1318 mlog_errno(ret);
1319
1320 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1321 if (ret)
1322 mlog_errno(ret);
1323
1324 /*
1325 * Setup our list pointers now so that the current
1326 * parents become children in the next iteration.
1327 */
1328 left_el = left_path->p_node[i].el;
1329 right_el = right_path->p_node[i].el;
1330 }
1331
1332 /*
1333 * At the root node, adjust the two adjacent records which
1334 * begin our path to the leaves.
1335 */
1336
1337 el = left_path->p_node[subtree_index].el;
1338 left_el = left_path->p_node[subtree_index + 1].el;
1339 right_el = right_path->p_node[subtree_index + 1].el;
1340
1341 ocfs2_adjust_root_records(el, left_el, right_el,
1342 left_path->p_node[subtree_index + 1].bh->b_blocknr);
1343
1344 root_bh = left_path->p_node[subtree_index].bh;
1345
1346 ret = ocfs2_journal_dirty(handle, root_bh);
1347 if (ret)
1348 mlog_errno(ret);
1349}
1350
1351static int ocfs2_rotate_subtree_right(struct inode *inode,
1352 handle_t *handle,
1353 struct ocfs2_path *left_path,
1354 struct ocfs2_path *right_path,
1355 int subtree_index)
1356{
1357 int ret, i;
1358 struct buffer_head *right_leaf_bh;
1359 struct buffer_head *left_leaf_bh = NULL;
1360 struct buffer_head *root_bh;
1361 struct ocfs2_extent_list *right_el, *left_el;
1362 struct ocfs2_extent_rec move_rec;
1363
1364 left_leaf_bh = path_leaf_bh(left_path);
1365 left_el = path_leaf_el(left_path);
1366
1367 if (left_el->l_next_free_rec != left_el->l_count) {
1368 ocfs2_error(inode->i_sb,
1369 "Inode %llu has non-full interior leaf node %llu"
1370 "(next free = %u)",
1371 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1372 (unsigned long long)left_leaf_bh->b_blocknr,
1373 le16_to_cpu(left_el->l_next_free_rec));
1374 return -EROFS;
1375 }
1376
1377 /*
1378 * This extent block may already have an empty record, so we
1379 * return early if so.
1380 */
1381 if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1382 return 0;
1383
1384 root_bh = left_path->p_node[subtree_index].bh;
1385 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1386
1387 ret = ocfs2_journal_access(handle, inode, root_bh,
1388 OCFS2_JOURNAL_ACCESS_WRITE);
1389 if (ret) {
1390 mlog_errno(ret);
1391 goto out;
1392 }
1393
1394 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1395 ret = ocfs2_journal_access(handle, inode,
1396 right_path->p_node[i].bh,
1397 OCFS2_JOURNAL_ACCESS_WRITE);
1398 if (ret) {
1399 mlog_errno(ret);
1400 goto out;
1401 }
1402
1403 ret = ocfs2_journal_access(handle, inode,
1404 left_path->p_node[i].bh,
1405 OCFS2_JOURNAL_ACCESS_WRITE);
1406 if (ret) {
1407 mlog_errno(ret);
1408 goto out;
1409 }
1410 }
1411
1412 right_leaf_bh = path_leaf_bh(right_path);
1413 right_el = path_leaf_el(right_path);
1414
1415 /* This is a code error, not a disk corruption. */
1416 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1417 "because rightmost leaf block %llu is empty\n",
1418 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1419 (unsigned long long)right_leaf_bh->b_blocknr);
1420
1421 ocfs2_create_empty_extent(right_el);
1422
1423 ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1424 if (ret) {
1425 mlog_errno(ret);
1426 goto out;
1427 }
1428
1429 /* Do the copy now. */
1430 i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1431 move_rec = left_el->l_recs[i];
1432 right_el->l_recs[0] = move_rec;
1433
1434 /*
1435 * Clear out the record we just copied and shift everything
1436 * over, leaving an empty extent in the left leaf.
1437 *
1438 * We temporarily subtract from next_free_rec so that the
1439 * shift will lose the tail record (which is now defunct).
1440 */
1441 le16_add_cpu(&left_el->l_next_free_rec, -1);
1442 ocfs2_shift_records_right(left_el);
1443 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1444 le16_add_cpu(&left_el->l_next_free_rec, 1);
1445
1446 ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1447 if (ret) {
1448 mlog_errno(ret);
1449 goto out;
1450 }
1451
1452 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1453 subtree_index);
1454
1455out:
1456 return ret;
1457}
1458
1459/*
1460 * Given a full path, determine what cpos value would return us a path
1461 * containing the leaf immediately to the left of the current one.
1462 *
1463 * Will return zero if the path passed in is already the leftmost path.
1464 */
1465static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1466 struct ocfs2_path *path, u32 *cpos)
1467{
1468 int i, j, ret = 0;
1469 u64 blkno;
1470 struct ocfs2_extent_list *el;
1471
1472 BUG_ON(path->p_tree_depth == 0);
1473
1474 *cpos = 0;
1475
1476 blkno = path_leaf_bh(path)->b_blocknr;
1477
1478 /* Start at the tree node just above the leaf and work our way up. */
1479 i = path->p_tree_depth - 1;
1480 while (i >= 0) {
1481 el = path->p_node[i].el;
1482
1483 /*
1484 * Find the extent record just before the one in our
1485 * path.
1486 */
1487 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1488 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1489 if (j == 0) {
1490 if (i == 0) {
1491 /*
1492 * We've determined that the
1493 * path specified is already
1494 * the leftmost one - return a
1495 * cpos of zero.
1496 */
1497 goto out;
1498 }
1499 /*
1500 * The leftmost record points to our
1501 * leaf - we need to travel up the
1502 * tree one level.
1503 */
1504 goto next_node;
1505 }
1506
1507 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1508 *cpos = *cpos + ocfs2_rec_clusters(el,
1509 &el->l_recs[j - 1]);
1510 *cpos = *cpos - 1;
1511 goto out;
1512 }
1513 }
1514
1515 /*
1516 * If we got here, we never found a valid node where
1517 * the tree indicated one should be.
1518 */
1519 ocfs2_error(sb,
1520 "Invalid extent tree at extent block %llu\n",
1521 (unsigned long long)blkno);
1522 ret = -EROFS;
1523 goto out;
1524
1525next_node:
1526 blkno = path->p_node[i].bh->b_blocknr;
1527 i--;
1528 }
1529
1530out:
1531 return ret;
1532}
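
Example (editor's illustration): if the parent record to the left of our leaf covers cpos 64 for 32 clusters, the function returns 64 + 32 - 1 = 95, the last cluster in the neighboring leaf's theoretical range, so a subsequent path lookup with that cpos lands in the leaf immediately to the left. A returned *cpos of 0 means the path passed in was already leftmost.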
1533
1534static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1535 struct ocfs2_path *path)
1536{
1537 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
1538
1539 if (handle->h_buffer_credits < credits)
1540 return ocfs2_extend_trans(handle, credits);
1541
1542 return 0;
1543}
1544
1545/*
1546 * Trap the case where we're inserting into the theoretical range past
1547 * the _actual_ left leaf range. Otherwise, we'll rotate a record
1548 * whose cpos is less than ours into the right leaf.
1549 *
1550 * It's only necessary to look at the rightmost record of the left
1551 * leaf because the logic that calls us should ensure that the
1552 * theoretical ranges in the path components above the leaves are
1553 * correct.
1554 */
1555static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1556 u32 insert_cpos)
1557{
1558 struct ocfs2_extent_list *left_el;
1559 struct ocfs2_extent_rec *rec;
1560 int next_free;
1561
1562 left_el = path_leaf_el(left_path);
1563 next_free = le16_to_cpu(left_el->l_next_free_rec);
1564 rec = &left_el->l_recs[next_free - 1];
1565
1566 if (insert_cpos > le32_to_cpu(rec->e_cpos))
1567 return 1;
1568 return 0;
1569}
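
Example (editor's illustration): suppose the left leaf's last record starts at cpos 40 with 10 clusters, so its named range ends at 50 while the parent's theoretical range for the leaf may extend further right. An insert at cpos 55 gives 55 > 40, so rotation stops and ocfs2_insert_path() is left to fix up the edge after the insert; an insert at cpos 40 or below returns 0 and the rotation proceeds.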
1570
1571/*
1572 * Rotate all the records in a btree right one record, starting at insert_cpos.
1573 *
1574 * The path to the rightmost leaf should be passed in.
1575 *
1576 * The array is assumed to be large enough to hold an entire path (tree depth).
1577 *
1578 * Upon successful return from this function:
1579 *
1580 * - The 'right_path' array will contain a path to the leaf block
1581 * whose range contains e_cpos.
1582 * - That leaf block will have a single empty extent in list index 0.
1583 * - In the case that the rotation requires a post-insert update,
1584 * *ret_left_path will contain a valid path which can be passed to
1585 * ocfs2_insert_path().
1586 */
1587static int ocfs2_rotate_tree_right(struct inode *inode,
1588 handle_t *handle,
1589 u32 insert_cpos,
1590 struct ocfs2_path *right_path,
1591 struct ocfs2_path **ret_left_path)
1592{
1593 int ret, start;
1594 u32 cpos;
1595 struct ocfs2_path *left_path = NULL;
1596
1597 *ret_left_path = NULL;
1598
1599 left_path = ocfs2_new_path(path_root_bh(right_path),
1600 path_root_el(right_path));
1601 if (!left_path) {
1602 ret = -ENOMEM;
1603 mlog_errno(ret);
1604 goto out;
1605 }
1606
1607 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
1608 if (ret) {
1609 mlog_errno(ret);
1610 goto out;
1611 }
1612
1613 mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
1614
1615 /*
1616 * What we want to do here is:
1617 *
1618 * 1) Start with the rightmost path.
1619 *
1620 * 2) Determine a path to the leaf block directly to the left
1621 * of that leaf.
1622 *
1623 * 3) Determine the 'subtree root' - the lowest level tree node
1624 * which contains a path to both leaves.
1625 *
1626 * 4) Rotate the subtree.
1627 *
1628 * 5) Find the next subtree by considering the left path to be
1629 * the new right path.
1630 *
1631 * The check at the top of this while loop also accepts
1632 * insert_cpos == cpos because cpos is only a _theoretical_
1633 * value to get us the left path - insert_cpos might very well
1634 * be filling that hole.
1635 *
1636 * Stop at a cpos of '0' because we either started at the
1637 * leftmost branch (i.e., a tree with one branch and a
1638 * rotation inside of it), or we've gone as far as we can in
1639 * rotating subtrees.
1640 */
1641 while (cpos && insert_cpos <= cpos) {
1642 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
1643 insert_cpos, cpos);
1644
1645 ret = ocfs2_find_path(inode, left_path, cpos);
1646 if (ret) {
1647 mlog_errno(ret);
1648 goto out;
1649 }
1650
1651 mlog_bug_on_msg(path_leaf_bh(left_path) ==
1652 path_leaf_bh(right_path),
1653 "Inode %lu: error during insert of %u "
1654 "(left path cpos %u) results in two identical "
1655 "paths ending at %llu\n",
1656 inode->i_ino, insert_cpos, cpos,
1657 (unsigned long long)
1658 path_leaf_bh(left_path)->b_blocknr);
1659
1660 if (ocfs2_rotate_requires_path_adjustment(left_path,
1661 insert_cpos)) {
1662 mlog(0, "Path adjustment required\n");
1663
1664 /*
1665 * We've rotated the tree as much as we
1666 * should. The rest is up to
1667 * ocfs2_insert_path() to complete, after the
1668 * record insertion. We indicate this
1669 * situation by returning the left path.
1670 *
1671 * The reason we don't adjust the records here
1672 * before the record insert is that an error
1673 * later might break the rule where a parent
1674 * record e_cpos will reflect the actual
1675 * e_cpos of the 1st nonempty record of the
1676 * child list.
1677 */
1678 *ret_left_path = left_path;
1679 goto out_ret_path;
1680 }
1681
1682 start = ocfs2_find_subtree_root(inode, left_path, right_path);
1683
1684 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
1685 start,
1686 (unsigned long long) right_path->p_node[start].bh->b_blocknr,
1687 right_path->p_tree_depth);
1688
1689 ret = ocfs2_extend_rotate_transaction(handle, start,
1690 right_path);
1691 if (ret) {
1692 mlog_errno(ret);
1693 goto out;
1694 }
1695
1696 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
1697 right_path, start);
1698 if (ret) {
1699 mlog_errno(ret);
1700 goto out;
1701 }
1702
1703 /*
1704 * There is no need to re-read the next right path
1705 * as we know that it'll be our current left
1706 * path. Optimize by copying values instead.
1707 */
1708 ocfs2_mv_path(right_path, left_path);
1709
1710 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1711 &cpos);
1712 if (ret) {
1713 mlog_errno(ret);
1714 goto out;
1715 }
1716 }
1717
1718out:
1719 ocfs2_free_path(left_path);
1720
1721out_ret_path:
1722 return ret;
1723}
1724
1725/*
1726 * Do the final bits of extent record insertion at the target leaf
1727 * list. If this leaf is part of an allocation tree, it is assumed
1728 * that the tree above has been prepared.
1729 */
1730static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1731 struct ocfs2_extent_list *el,
1732 struct ocfs2_insert_type *insert,
1733 struct inode *inode)
1734{
1735 int i = insert->ins_contig_index;
1736 unsigned int range;
1737 struct ocfs2_extent_rec *rec;
1738
1739 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1740
1741 /*
1742 * Contiguous insert - either left or right.
1743 */
1744 if (insert->ins_contig != CONTIG_NONE) {
1745 rec = &el->l_recs[i];
1746 if (insert->ins_contig == CONTIG_LEFT) {
1747 rec->e_blkno = insert_rec->e_blkno;
1748 rec->e_cpos = insert_rec->e_cpos;
1749 }
1750 le16_add_cpu(&rec->e_leaf_clusters,
1751 le16_to_cpu(insert_rec->e_leaf_clusters));
1752 return;
1753 }
1754
1755 /*
1756 * Handle insert into an empty leaf.
1757 */
1758 if (le16_to_cpu(el->l_next_free_rec) == 0 ||
1759 ((le16_to_cpu(el->l_next_free_rec) == 1) &&
1760 ocfs2_is_empty_extent(&el->l_recs[0]))) {
1761 el->l_recs[0] = *insert_rec;
1762 el->l_next_free_rec = cpu_to_le16(1);
1763 return;
1764 }
1765
1766 /*
1767 * Appending insert.
1768 */
1769 if (insert->ins_appending == APPEND_TAIL) {
1770 i = le16_to_cpu(el->l_next_free_rec) - 1;
1771 rec = &el->l_recs[i];
1772 range = le32_to_cpu(rec->e_cpos)
1773 + le16_to_cpu(rec->e_leaf_clusters);
1774 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
1775
1776 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
1777 le16_to_cpu(el->l_count),
1778 "inode %lu, depth %u, count %u, next free %u, "
1779 "rec.cpos %u, rec.clusters %u, "
1780 "insert.cpos %u, insert.clusters %u\n",
1781 inode->i_ino,
1782 le16_to_cpu(el->l_tree_depth),
1783 le16_to_cpu(el->l_count),
1784 le16_to_cpu(el->l_next_free_rec),
1785 le32_to_cpu(el->l_recs[i].e_cpos),
1786 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
1787 le32_to_cpu(insert_rec->e_cpos),
1788 le16_to_cpu(insert_rec->e_leaf_clusters));
1789 i++;
1790 el->l_recs[i] = *insert_rec;
1791 le16_add_cpu(&el->l_next_free_rec, 1);
1792 return;
1793 }
1794
1795 /*
1796 * Ok, we have to rotate.
1797 *
1798 * At this point, it is safe to assume that inserting into an
1799 * empty leaf and appending to a leaf have both been handled
1800 * above.
1801 *
1802 * This leaf needs to have space, either by the empty 1st
1803 * extent record, or by virtue of an l_next_free_rec < l_count.
1804 */
1805 ocfs2_rotate_leaf(el, insert_rec);
1806}
1807
1808static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1809 struct ocfs2_dinode *di,
1810 u32 clusters)
1811{
1812 le32_add_cpu(&di->i_clusters, clusters);
1813 spin_lock(&OCFS2_I(inode)->ip_lock);
1814 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
1815 spin_unlock(&OCFS2_I(inode)->ip_lock);
1816}
1817
1818static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1819 struct ocfs2_extent_rec *insert_rec,
1820 struct ocfs2_path *right_path,
1821 struct ocfs2_path **ret_left_path)
1822{
1823 int ret, i, next_free;
1824 struct buffer_head *bh;
1825 struct ocfs2_extent_list *el;
1826 struct ocfs2_path *left_path = NULL;
1827
1828 *ret_left_path = NULL;
1829
1830 /*
1831 * This shouldn't happen for non-trees. The extent rec cluster
1832 * count manipulation below only works for interior nodes.
1833 */
1834 BUG_ON(right_path->p_tree_depth == 0);
1835
1836 /*
1837 * If our appending insert is at the leftmost edge of a leaf,
1838 * then we might need to update the rightmost records of the
1839 * neighboring path.
1840 */
1841 el = path_leaf_el(right_path);
1842 next_free = le16_to_cpu(el->l_next_free_rec);
1843 if (next_free == 0 ||
1844 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
1845 u32 left_cpos;
1846
1847 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1848 &left_cpos);
1849 if (ret) {
1850 mlog_errno(ret);
1851 goto out;
1852 }
1853
1854 mlog(0, "Append may need a left path update. cpos: %u, "
1855 "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
1856 left_cpos);
1857
1858 /*
1859 * No need to worry if the append is already in the
1860 * leftmost leaf.
1861 */
1862 if (left_cpos) {
1863 left_path = ocfs2_new_path(path_root_bh(right_path),
1864 path_root_el(right_path));
1865 if (!left_path) {
1866 ret = -ENOMEM;
1867 mlog_errno(ret);
1868 goto out;
1869 }
1870
1871 ret = ocfs2_find_path(inode, left_path, left_cpos);
1872 if (ret) {
1873 mlog_errno(ret);
1874 goto out;
1875 }
1876
1877 /*
1878 * ocfs2_insert_path() will pass the left_path to the
1879 * journal for us.
1880 */
1881 }
1882 }
1883
1884 ret = ocfs2_journal_access_path(inode, handle, right_path);
1885 if (ret) {
1886 mlog_errno(ret);
1887 goto out;
1888 }
1889
1890 el = path_root_el(right_path);
1891 bh = path_root_bh(right_path);
1892 i = 0;
1893 while (1) {
1894 struct ocfs2_extent_rec *rec;
1895
1896 next_free = le16_to_cpu(el->l_next_free_rec);
1897 if (next_free == 0) {
1898 ocfs2_error(inode->i_sb,
1899 "Dinode %llu has a bad extent list",
1900 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1901 ret = -EIO;
1902 goto out;
1903 }
1904
1905 rec = &el->l_recs[next_free - 1];
1906
1907 rec->e_int_clusters = insert_rec->e_cpos;
1908 le32_add_cpu(&rec->e_int_clusters,
1909 le16_to_cpu(insert_rec->e_leaf_clusters));
1910 le32_add_cpu(&rec->e_int_clusters,
1911 -le32_to_cpu(rec->e_cpos));
1912
1913 ret = ocfs2_journal_dirty(handle, bh);
1914 if (ret)
1915 mlog_errno(ret);
1916
1917 /* Don't touch the leaf node */
1918 if (++i >= right_path->p_tree_depth)
1919 break;
1920
1921 bh = right_path->p_node[i].bh;
1922 el = right_path->p_node[i].el;
1923 }
1924
1925 *ret_left_path = left_path;
1926 ret = 0;
1927out:
1928 if (ret != 0)
1929 ocfs2_free_path(left_path);
1930
1931 return ret;
1932}
1933
1934/*
1935 * This function only does inserts on an allocation b-tree. For dinode
1936 * lists, ocfs2_insert_at_leaf() is called directly.
1937 *
1938 * right_path is the path we want to do the actual insert
1939 * in. left_path should only be passed in if we need to update that
1940 * portion of the tree after an edge insert.
1941 */
1942static int ocfs2_insert_path(struct inode *inode,
1943 handle_t *handle,
1944 struct ocfs2_path *left_path,
1945 struct ocfs2_path *right_path,
1946 struct ocfs2_extent_rec *insert_rec,
1947 struct ocfs2_insert_type *insert)
1948{
1949 int ret, subtree_index;
1950 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1951 struct ocfs2_extent_list *el;
1952
1953 /*
1954 * Pass both paths to the journal. The majority of inserts
1955 * will be touching all components anyway.
1956 */
1957 ret = ocfs2_journal_access_path(inode, handle, right_path);
1958 if (ret < 0) {
1959 mlog_errno(ret);
1960 goto out;
1961 }
1962
1963 if (left_path) {
1964 int credits = handle->h_buffer_credits;
1965
1966 /*
1967 * There's a chance that left_path got passed back to
1968 * us without being accounted for in the
1969 * journal. Extend our transaction here to be sure we
1970 * can change those blocks.
1971 */
1972 credits += left_path->p_tree_depth;
1973
1974 ret = ocfs2_extend_trans(handle, credits);
1975 if (ret < 0) {
1976 mlog_errno(ret);
1977 goto out;
1978 }
1979
1980 ret = ocfs2_journal_access_path(inode, handle, left_path);
1981 if (ret < 0) {
1982 mlog_errno(ret);
1983 goto out;
1984 }
1985 }
1986
1987 el = path_leaf_el(right_path);
1988
1989 ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1990 ret = ocfs2_journal_dirty(handle, leaf_bh);
1991 if (ret)
1992 mlog_errno(ret);
1993
1994 if (left_path) {
1995 /*
1996 * The rotate code has indicated that we need to fix
1997 * up portions of the tree after the insert.
1998 *
1999 * XXX: Should we extend the transaction here?
2000 */
2001 subtree_index = ocfs2_find_subtree_root(inode, left_path,
2002 right_path);
2003 ocfs2_complete_edge_insert(inode, handle, left_path,
2004 right_path, subtree_index);
2005 }
2006
2007 ret = 0;
2008out:
2009 return ret;
2010}
2011
2012static int ocfs2_do_insert_extent(struct inode *inode,
2013 handle_t *handle,
2014 struct buffer_head *di_bh,
2015 struct ocfs2_extent_rec *insert_rec,
2016 struct ocfs2_insert_type *type)
2017{
2018 int ret, rotate = 0;
2019 u32 cpos;
2020 struct ocfs2_path *right_path = NULL;
2021 struct ocfs2_path *left_path = NULL;
2022 struct ocfs2_dinode *di;
2023 struct ocfs2_extent_list *el;
2024
2025 di = (struct ocfs2_dinode *) di_bh->b_data;
2026 el = &di->id2.i_list;
2027
2028 ret = ocfs2_journal_access(handle, inode, di_bh,
2029 OCFS2_JOURNAL_ACCESS_WRITE);
2030 if (ret) {
2031 mlog_errno(ret);
2032 goto out;
2033 }
2034
2035 if (le16_to_cpu(el->l_tree_depth) == 0) {
2036 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
2037 goto out_update_clusters;
2038 }
2039
2040 right_path = ocfs2_new_inode_path(di_bh);
2041 if (!right_path) {
2042 ret = -ENOMEM;
2043 mlog_errno(ret);
2044 goto out;
2045 }
2046
2047 /*
2048 * Determine the path to start with. Rotations need the
2049 * rightmost path, everything else can go directly to the
2050 * target leaf.
2051 */
2052 cpos = le32_to_cpu(insert_rec->e_cpos);
2053 if (type->ins_appending == APPEND_NONE &&
2054 type->ins_contig == CONTIG_NONE) {
2055 rotate = 1;
2056 cpos = UINT_MAX;
2057 }
2058
2059 ret = ocfs2_find_path(inode, right_path, cpos);
2060 if (ret) {
2061 mlog_errno(ret);
2062 goto out;
2063 }
2064
2065 /*
2066 * Rotations and appends need special treatment - they modify
2067 * parts of the tree above them.
2068 *
2069 * Both might pass back a path immediately to the left of the
2070 * one being inserted to. This will cause
2071 * ocfs2_insert_path() to modify the rightmost records of
2072 * left_path to account for an edge insert.
2073 *
2074 * XXX: When modifying this code, keep in mind that an insert
2075 * can wind up skipping both of these two special cases...
2076 */
2077 if (rotate) {
2078 ret = ocfs2_rotate_tree_right(inode, handle,
2079 le32_to_cpu(insert_rec->e_cpos),
2080 right_path, &left_path);
2081 if (ret) {
2082 mlog_errno(ret);
2083 goto out;
2084 }
2085 } else if (type->ins_appending == APPEND_TAIL
2086 && type->ins_contig != CONTIG_LEFT) {
2087 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
2088 right_path, &left_path);
2089 if (ret) {
2090 mlog_errno(ret);
2091 goto out;
2092 }
2093 }
2094
2095 ret = ocfs2_insert_path(inode, handle, left_path, right_path,
2096 insert_rec, type);
2097 if (ret) {
2098 mlog_errno(ret);
2099 goto out;
2100 }
2101
2102out_update_clusters:
2103 ocfs2_update_dinode_clusters(inode, di,
2104 le16_to_cpu(insert_rec->e_leaf_clusters));
2105
2106 ret = ocfs2_journal_dirty(handle, di_bh);
2107 if (ret)
2108 mlog_errno(ret);
2109
2110out:
2111 ocfs2_free_path(left_path);
2112 ocfs2_free_path(right_path);
2113
2114 return ret;
2115}
2116
2117static void ocfs2_figure_contig_type(struct inode *inode,
2118 struct ocfs2_insert_type *insert,
2119 struct ocfs2_extent_list *el,
2120 struct ocfs2_extent_rec *insert_rec)
2121{
2122 int i;
2123 enum ocfs2_contig_type contig_type = CONTIG_NONE;
2124
2125 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
2126
2127 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2128 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
2129 insert_rec);
2130 if (contig_type != CONTIG_NONE) {
2131 insert->ins_contig_index = i;
2132 break;
2133 }
2134 }
2135 insert->ins_contig = contig_type;
2136}
2137
2138/*
2139 * This should only be called against the rightmost leaf extent list.
2140 *
2141 * ocfs2_figure_appending_type() will figure out whether we'll have to
2142 * insert at the tail of the rightmost leaf.
2143 *
2144 * This should also work against the dinode list for trees with 0
2145 * depth. If we consider the dinode list to be the rightmost leaf node
2146 * then the logic here makes sense.
2147 */
2148static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
2149 struct ocfs2_extent_list *el,
2150 struct ocfs2_extent_rec *insert_rec)
2151{
2152 int i;
2153 u32 cpos = le32_to_cpu(insert_rec->e_cpos);
2154 struct ocfs2_extent_rec *rec;
2155
2156 insert->ins_appending = APPEND_NONE;
2157
2158 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
2159
2160 if (!el->l_next_free_rec)
2161 goto set_tail_append;
2162
2163 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2164 /* Were all records empty? */
2165 if (le16_to_cpu(el->l_next_free_rec) == 1)
2166 goto set_tail_append;
845 }
846 
847 /* Can we allocate without adding/shifting tree bits? */
848 i = le16_to_cpu(el->l_next_free_rec) - 1;
849 if (le16_to_cpu(el->l_next_free_rec) == 0
850     || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
851     || le32_to_cpu(el->l_recs[i].e_clusters) == 0
852     || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
853 goto out_add;
2167 }
2168 
2169 i = le16_to_cpu(el->l_next_free_rec) - 1;
2170 rec = &el->l_recs[i];
2171 
2172 if (cpos >=
2173     (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
2174 goto set_tail_append;
2175
2176 return;
2177
2178set_tail_append:
2179 insert->ins_appending = APPEND_TAIL;
2180}
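
Example (editor's illustration): if the rightmost leaf ends with a record at cpos 200 covering 16 clusters, its range ends at 216. An insert at cpos 216, or anywhere beyond it (sparse files can leave a gap first), sets APPEND_TAIL; an insert at cpos 100 must be filling a hole inside the tree's existing range, so ins_appending stays APPEND_NONE.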
2181
2182/*
2183 * Helper function called at the beginning of an insert.
2184 *
2185 * This computes a few things that are commonly used in the process of
2186 * inserting into the btree:
2187 * - Whether the new extent is contiguous with an existing one.
2188 * - The current tree depth.
2189 * - Whether the insert is an appending one.
2190 * - The total # of free records in the tree.
2191 *
2192 * All of the information is stored on the ocfs2_insert_type
2193 * structure.
2194 */
2195static int ocfs2_figure_insert_type(struct inode *inode,
2196 struct buffer_head *di_bh,
2197 struct buffer_head **last_eb_bh,
2198 struct ocfs2_extent_rec *insert_rec,
2199 struct ocfs2_insert_type *insert)
2200{
2201 int ret;
2202 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2203 struct ocfs2_extent_block *eb;
2204 struct ocfs2_extent_list *el;
2205 struct ocfs2_path *path = NULL;
2206 struct buffer_head *bh = NULL;
2207
2208 el = &di->id2.i_list;
2209 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2210
2211 if (el->l_tree_depth) {
2212 /*
2213 * If we have tree depth, we read in the
2214 * rightmost extent block ahead of time as
2215 * ocfs2_figure_insert_type() and ocfs2_add_branch()
2216 * may want it later.
2217 */
2218 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
2219 le64_to_cpu(di->i_last_eb_blk), &bh,
2220 OCFS2_BH_CACHED, inode);
2221 if (ret) {
2222 mlog_exit(ret);
2223 goto out;
2224 }
2225 eb = (struct ocfs2_extent_block *) bh->b_data;
2226 el = &eb->h_list;
2227 }
2228
2229 /*
2230 * Unless we have a contiguous insert, we'll need to know if
2231 * there is room left in our allocation tree for another
2232 * extent record.
2233 *
2234 * XXX: This test is simplistic, we can search for empty
2235 * extent records too.
2236 */
2237 insert->ins_free_records = le16_to_cpu(el->l_count) -
2238 le16_to_cpu(el->l_next_free_rec);
2239
2240 if (!insert->ins_tree_depth) {
2241 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2242 ocfs2_figure_appending_type(insert, el, insert_rec);
2243 return 0;
2244 }
2245
2246 path = ocfs2_new_inode_path(di_bh);
2247 if (!path) {
2248 ret = -ENOMEM;
2249 mlog_errno(ret);
2250 goto out;
2251 }
2252
2253 /*
2254 * In the case that we're inserting past what the tree
2255 * currently accounts for, ocfs2_find_path() will return for
2256 * us the rightmost tree path. This is accounted for below in
2257 * the appending code.
2258 */
2259 ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
2260 if (ret) {
2261 mlog_errno(ret);
2262 goto out;
2263 }
2264
2265 el = path_leaf_el(path);
2266
2267 /*
2268 * Now that we have the path, there are two things we want to determine:
2269 * 1) Contiguousness (also set contig_index if this is so)
2270 *
2271 * 2) Are we doing an append? We can trivially break this up
2272 * into two types of appends: simple record append, or a
2273 * rotate inside the tail leaf.
2274 */
2275 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2276
2277 /*
2278 * The insert code isn't quite ready to deal with all cases of
2279 * left contiguousness. Specifically, if it's an insert into
2280 * the 1st record in a leaf, it will require the adjustment of
2281 * cluster count on the last record of the path directly to its
2282 * left. For now, just catch that case and fool the layers
2283 * above us. This works just fine for tree_depth == 0, which
2284 * is why we allow that above.
2285 */
2286 if (insert->ins_contig == CONTIG_LEFT &&
2287 insert->ins_contig_index == 0)
2288 insert->ins_contig = CONTIG_NONE;
2289
2290 /*
2291 * Ok, so we can simply compare against last_eb to figure out
2292 * whether the path doesn't exist. This will only happen in
2293 * the case that we're doing a tail append, so maybe we can
2294 * take advantage of that information somehow.
2295 */
2296 if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
2297 /*
2298 * Ok, ocfs2_find_path() returned us the rightmost
2299 * tree path. This might be an appending insert. There are
2300 * two cases:
2301 * 1) We're doing a true append at the tail:
2302 * -This might even be off the end of the leaf
2303 * 2) We're "appending" by rotating in the tail
2304 */
2305 ocfs2_figure_appending_type(insert, el, insert_rec);
2306 }
2307
2308out:
2309 ocfs2_free_path(path);
2310
2311 if (ret == 0)
2312 *last_eb_bh = bh;
2313 else
2314 brelse(bh);
2315 return ret;
2316}
2317
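/*
 * A user-space sketch of the free-record computation above (field
 * widths assumed from the le16 accessors; names are illustrative):
 */
#include <stdint.h>

struct list_hdr { uint16_t l_count; uint16_t l_next_free_rec; };

static uint16_t free_records(const struct list_hdr *el)
{
	/* l_count is the slot capacity; l_next_free_rec the next unused */
	return el->l_count - el->l_next_free_rec;
}
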
2318/*
2319 * Insert an extent into an inode btree.
2320 *
2321 * The caller needs to update fe->i_clusters
2322 */
2323int ocfs2_insert_extent(struct ocfs2_super *osb,
2324 handle_t *handle,
2325 struct inode *inode,
2326 struct buffer_head *fe_bh,
2327 u32 cpos,
2328 u64 start_blk,
2329 u32 new_clusters,
2330 struct ocfs2_alloc_context *meta_ac)
2331{
2332 int status, shift;
2333 struct buffer_head *last_eb_bh = NULL;
2334 struct buffer_head *bh = NULL;
2335 struct ocfs2_insert_type insert = {0, };
2336 struct ocfs2_extent_rec rec;
2337
2338 mlog(0, "add %u clusters at position %u to inode %llu\n",
2339 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
2340
2341 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
2342 (OCFS2_I(inode)->ip_clusters != cpos),
2343 "Device %s, asking for sparse allocation: inode %llu, "
2344 "cpos %u, clusters %u\n",
2345 osb->dev_str,
2346 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
2347 OCFS2_I(inode)->ip_clusters);
2348
2349 memset(&rec, 0, sizeof(rec));
2350 rec.e_cpos = cpu_to_le32(cpos);
2351 rec.e_blkno = cpu_to_le64(start_blk);
2352 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
2353
2354 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2355 &insert);
2356 if (status < 0) {
2357 mlog_errno(status);
2358 goto bail;
2359 }
854 2360
855 mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
856 "tree now.\n");
2361 mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
2362 "Insert.contig_index: %d, Insert.free_records: %d, "
2363 "Insert.tree_depth: %d\n",
2364 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2365 insert.ins_free_records, insert.ins_tree_depth);
2366
2367 /*
2368 * Avoid growing the tree unless we're out of records and the
2369 * insert type requires one.
2370 */
2371 if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
2372 goto out_add;
857 2373
858 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); 2374 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
859 if (shift < 0) { 2375 if (shift < 0) {
@@ -866,13 +2382,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
866 * and didn't find room for any more extents - we need to add 2382 * and didn't find room for any more extents - we need to add
867 * another tree level */ 2383 * another tree level */
868 if (shift) { 2384 if (shift) {
869 /* if we hit a leaf, we'd better be empty :) */
870 BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
871 le16_to_cpu(el->l_count));
872 BUG_ON(bh); 2385 BUG_ON(bh);
873 mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
874 "(current = %u)\n",
875 le16_to_cpu(fe->id2.i_list.l_tree_depth));
2386 mlog(0, "need to shift tree depth "
2387 "(current = %d)\n", insert.ins_tree_depth);
876 2388
877 /* ocfs2_shift_tree_depth will return us a buffer with 2389 /* ocfs2_shift_tree_depth will return us a buffer with
878 * the new extent block (so we can pass that to 2390 * the new extent block (so we can pass that to
@@ -883,15 +2395,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
883 mlog_errno(status); 2395 mlog_errno(status);
884 goto bail; 2396 goto bail;
885 } 2397 }
2398 insert.ins_tree_depth++;
886 /* Special case: we have room now if we shifted from 2399 /* Special case: we have room now if we shifted from
887 * tree_depth 0 */ 2400 * tree_depth 0 */
888 if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) 2401 if (insert.ins_tree_depth == 1)
889 goto out_add; 2402 goto out_add;
890 } 2403 }
891 2404
892 /* call ocfs2_add_branch to add the final part of the tree with 2405 /* call ocfs2_add_branch to add the final part of the tree with
893 * the new data. */ 2406 * the new data. */
894 mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); 2407 mlog(0, "add branch. bh = %p\n", bh);
895 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, 2408 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
896 meta_ac); 2409 meta_ac);
897 if (status < 0) { 2410 if (status < 0) {
@@ -900,11 +2413,12 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
900 } 2413 }
901 2414
902out_add: 2415out_add:
903 /* Finally, we can add clusters. */ 2416 /* Finally, we can add clusters. This might rotate the tree for us. */
904 status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, 2417 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
905 start_blk, new_clusters);
906 if (status < 0) 2418 if (status < 0)
907 mlog_errno(status); 2419 mlog_errno(status);
2420 else
2421 ocfs2_extent_map_insert_rec(inode, &rec);
908 2422
909bail: 2423bail:
910 if (bh) 2424 if (bh)
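
/*
 * A sketch of the grow-tree gate above (CONTIG_NONE modeled as 0): the
 * tree is only grown when the new record can't be merged into an
 * existing extent and no free record slots remain.
 */
static int must_grow_tree(int ins_contig, int ins_free_records)
{
	return ins_contig == 0 && ins_free_records == 0;
}
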
@@ -1355,7 +2869,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
1355 tl = &tl_copy->id2.i_dealloc; 2869 tl = &tl_copy->id2.i_dealloc;
1356 num_recs = le16_to_cpu(tl->tl_used); 2870 num_recs = le16_to_cpu(tl->tl_used);
1357 mlog(0, "cleanup %u records from %llu\n", num_recs, 2871 mlog(0, "cleanup %u records from %llu\n", num_recs,
1358 (unsigned long long)tl_copy->i_blkno); 2872 (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
1359 2873
1360 mutex_lock(&tl_inode->i_mutex); 2874 mutex_lock(&tl_inode->i_mutex);
1361 for(i = 0; i < num_recs; i++) { 2875 for(i = 0; i < num_recs; i++) {
@@ -1447,168 +2961,389 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
1447 * block will be deleted, and if it will, what the new last extent 2961 * block will be deleted, and if it will, what the new last extent
1448 * block will be so we can update his h_next_leaf_blk field, as well 2962 * block will be so we can update his h_next_leaf_blk field, as well
1449 * as the dinode's i_last_eb_blk */ 2963 * as the dinode's i_last_eb_blk */
1450static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, 2964static int ocfs2_find_new_last_ext_blk(struct inode *inode,
1451 struct inode *inode, 2965 unsigned int clusters_to_del,
1452 struct ocfs2_dinode *fe, 2966 struct ocfs2_path *path,
1453 u32 new_i_clusters,
1454 struct buffer_head *old_last_eb,
1455 struct buffer_head **new_last_eb) 2967 struct buffer_head **new_last_eb)
1456{ 2968{
1457 int i, status = 0; 2969 int next_free, ret = 0;
1458 u64 block = 0; 2970 u32 cpos;
2971 struct ocfs2_extent_rec *rec;
1459 struct ocfs2_extent_block *eb; 2972 struct ocfs2_extent_block *eb;
1460 struct ocfs2_extent_list *el; 2973 struct ocfs2_extent_list *el;
1461 struct buffer_head *bh = NULL; 2974 struct buffer_head *bh = NULL;
1462 2975
1463 *new_last_eb = NULL; 2976 *new_last_eb = NULL;
1464 2977
1465 if (!OCFS2_IS_VALID_DINODE(fe)) {
1466 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1467 status = -EIO;
1468 goto bail;
1469 }
1470
1471 /* we have no tree, so of course, no last_eb. */ 2978 /* we have no tree, so of course, no last_eb. */
1472 if (!fe->id2.i_list.l_tree_depth) 2979 if (!path->p_tree_depth)
1473 goto bail; 2980 goto out;
1474 2981
1475 /* trunc to zero special case - this makes tree_depth = 0 2982 /* trunc to zero special case - this makes tree_depth = 0
1476 * regardless of what it is. */ 2983 * regardless of what it is. */
1477 if (!new_i_clusters) 2984 if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
1478 goto bail; 2985 goto out;
1479 2986
1480 eb = (struct ocfs2_extent_block *) old_last_eb->b_data; 2987 el = path_leaf_el(path);
1481 el = &(eb->h_list);
1482 BUG_ON(!el->l_next_free_rec); 2988 BUG_ON(!el->l_next_free_rec);
1483 2989
1484 /* Make sure that this guy will actually be empty after we
1485 * clear away the data. */
1486 if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
1487 goto bail;
2990 /*
2991 * Make sure that this extent list will actually be empty
2992 * after we clear away the data. We can shortcut out if
2993 * there's more than one non-empty extent in the
2994 * list. Otherwise, a check of the remaining extent is
2995 * necessary.
2996 */
2997 next_free = le16_to_cpu(el->l_next_free_rec);
2998 rec = NULL;
2999 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
3000 if (next_free > 2)
3001 goto out;
1488 3002
1489 /* Ok, at this point, we know that last_eb will definitely 3003 /* We may have a valid extent in index 1, check it. */
1490 * change, so lets traverse the tree and find the second to 3004 if (next_free == 2)
1491 * last extent block. */ 3005 rec = &el->l_recs[1];
1492 el = &(fe->id2.i_list); 3006
1493 /* go down the tree, */ 3007 /*
1494 do { 3008 * Fall through - no more nonempty extents, so we want
1495 for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) { 3009 * to delete this leaf.
1496 if (le32_to_cpu(el->l_recs[i].e_cpos) < 3010 */
1497 new_i_clusters) { 3011 } else {
1498 block = le64_to_cpu(el->l_recs[i].e_blkno); 3012 if (next_free > 1)
1499 break; 3013 goto out;
1500 } 3014
3015 rec = &el->l_recs[0];
3016 }
3017
3018 if (rec) {
3019 /*
3020 * Check that we'll only be trimming off the end of this
3021 * cluster.
3022 */
3023 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
3024 goto out;
3025 }
3026
3027 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
3028 if (ret) {
3029 mlog_errno(ret);
3030 goto out;
3031 }
3032
3033 ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
3034 if (ret) {
3035 mlog_errno(ret);
3036 goto out;
3037 }
3038
3039 eb = (struct ocfs2_extent_block *) bh->b_data;
3040 el = &eb->h_list;
3041 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
3042 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
3043 ret = -EROFS;
3044 goto out;
3045 }
3046
3047 *new_last_eb = bh;
3048 get_bh(*new_last_eb);
3049 mlog(0, "returning block %llu, (cpos: %u)\n",
3050 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
3051out:
3052 brelse(bh);
3053
3054 return ret;
3055}
3056
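/*
 * A user-space model of the emptiness test above (illustrative types;
 * slot 0 may hold the reserved empty extent, clusters == 0):
 */
#include <stdint.h>

struct leaf_rec { uint32_t cpos; uint16_t clusters; };

static int leaf_becomes_empty(const struct leaf_rec *recs, int next_free,
			      uint32_t clusters_to_del)
{
	const struct leaf_rec *rec = NULL;

	if (recs[0].clusters == 0) {
		if (next_free > 2)
			return 0;	/* other live extents remain */
		if (next_free == 2)
			rec = &recs[1];
	} else {
		if (next_free > 1)
			return 0;
		rec = &recs[0];
	}
	/* trimming only part of the survivor keeps the leaf alive */
	if (rec && rec->clusters > clusters_to_del)
		return 0;
	return 1;
}
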
3057/*
3058 * Trim some clusters off the rightmost edge of a tree. Only called
3059 * during truncate.
3060 *
3061 * The caller needs to:
3062 * - start journaling of each path component.
3063 * - compute and fully set up any new last ext block
3064 */
3065static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
3066 handle_t *handle, struct ocfs2_truncate_context *tc,
3067 u32 clusters_to_del, u64 *delete_start)
3068{
3069 int ret, i, index = path->p_tree_depth;
3070 u32 new_edge = 0;
3071 u64 deleted_eb = 0;
3072 struct buffer_head *bh;
3073 struct ocfs2_extent_list *el;
3074 struct ocfs2_extent_rec *rec;
3075
3076 *delete_start = 0;
3077
3078 while (index >= 0) {
3079 bh = path->p_node[index].bh;
3080 el = path->p_node[index].el;
3081
3082 mlog(0, "traveling tree (index = %d, block = %llu)\n",
3083 index, (unsigned long long)bh->b_blocknr);
3084
3085 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
3086
3087 if (index !=
3088 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
3089 ocfs2_error(inode->i_sb,
3090 "Inode %lu has invalid ext. block %llu",
3091 inode->i_ino,
3092 (unsigned long long)bh->b_blocknr);
3093 ret = -EROFS;
3094 goto out;
1501 } 3095 }
1502 BUG_ON(i < 0);
1503 3096
1504 if (bh) { 3097find_tail_record:
1505 brelse(bh); 3098 i = le16_to_cpu(el->l_next_free_rec) - 1;
1506 bh = NULL; 3099 rec = &el->l_recs[i];
3100
3101 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
3102 "next = %u\n", i, le32_to_cpu(rec->e_cpos),
3103 ocfs2_rec_clusters(el, rec),
3104 (unsigned long long)le64_to_cpu(rec->e_blkno),
3105 le16_to_cpu(el->l_next_free_rec));
3106
3107 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
3108
3109 if (le16_to_cpu(el->l_tree_depth) == 0) {
3110 /*
3111 * If the leaf block contains a single empty
3112 * extent and no records, we can just remove
3113 * the block.
3114 */
3115 if (i == 0 && ocfs2_is_empty_extent(rec)) {
3116 memset(rec, 0,
3117 sizeof(struct ocfs2_extent_rec));
3118 el->l_next_free_rec = cpu_to_le16(0);
3119
3120 goto delete;
3121 }
3122
3123 /*
3124 * Remove any empty extents by shifting things
3125 * left. That should make life much easier on
3126 * the code below. This condition is rare
3127 * enough that we shouldn't see a performance
3128 * hit.
3129 */
3130 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
3131 le16_add_cpu(&el->l_next_free_rec, -1);
3132
3133 for(i = 0;
3134 i < le16_to_cpu(el->l_next_free_rec); i++)
3135 el->l_recs[i] = el->l_recs[i + 1];
3136
3137 memset(&el->l_recs[i], 0,
3138 sizeof(struct ocfs2_extent_rec));
3139
3140 /*
3141 * We've modified our extent list. The
3142 * simplest way to handle this change
3143 * is to begin the search from the
3144 * start again.
3145 */
3146 goto find_tail_record;
3147 }
3148
3149 le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
3150
3151 /*
3152 * We'll use "new_edge" on our way back up the
3153 * tree to know what our rightmost cpos is.
3154 */
3155 new_edge = le16_to_cpu(rec->e_leaf_clusters);
3156 new_edge += le32_to_cpu(rec->e_cpos);
3157
3158 /*
3159 * The caller will use this to delete data blocks.
3160 */
3161 *delete_start = le64_to_cpu(rec->e_blkno)
3162 + ocfs2_clusters_to_blocks(inode->i_sb,
3163 le16_to_cpu(rec->e_leaf_clusters));
3164
3165 /*
3166 * If it's now empty, remove this record.
3167 */
3168 if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
3169 memset(rec, 0,
3170 sizeof(struct ocfs2_extent_rec));
3171 le16_add_cpu(&el->l_next_free_rec, -1);
3172 }
3173 } else {
3174 if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
3175 memset(rec, 0,
3176 sizeof(struct ocfs2_extent_rec));
3177 le16_add_cpu(&el->l_next_free_rec, -1);
3178
3179 goto delete;
3180 }
3181
3182 /* Can this actually happen? */
3183 if (le16_to_cpu(el->l_next_free_rec) == 0)
3184 goto delete;
3185
3186 /*
3187 * We never actually deleted any clusters
3188 * because our leaf was empty. There's no
3189 * reason to adjust the rightmost edge then.
3190 */
3191 if (new_edge == 0)
3192 goto delete;
3193
3194 rec->e_int_clusters = cpu_to_le32(new_edge);
3195 le32_add_cpu(&rec->e_int_clusters,
3196 -le32_to_cpu(rec->e_cpos));
3197
3198 /*
3199 * A deleted child record should have been
3200 * caught above.
3201 */
3202 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
1507 } 3203 }
1508 3204
1509 status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, 3205delete:
1510 inode); 3206 ret = ocfs2_journal_dirty(handle, bh);
1511 if (status < 0) { 3207 if (ret) {
1512 mlog_errno(status); 3208 mlog_errno(ret);
1513 goto bail; 3209 goto out;
1514 } 3210 }
1515 eb = (struct ocfs2_extent_block *) bh->b_data; 3211
1516 el = &eb->h_list; 3212 mlog(0, "extent list container %llu, after: record %d: "
1517 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 3213 "(%u, %u, %llu), next = %u.\n",
1518 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 3214 (unsigned long long)bh->b_blocknr, i,
1519 status = -EIO; 3215 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
1520 goto bail; 3216 (unsigned long long)le64_to_cpu(rec->e_blkno),
3217 le16_to_cpu(el->l_next_free_rec));
3218
3219 /*
3220 * We must be careful to only attempt delete of an
3221 * extent block (and not the root inode block).
3222 */
3223 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
3224 struct ocfs2_extent_block *eb =
3225 (struct ocfs2_extent_block *)bh->b_data;
3226
3227 /*
3228 * Save this for use when processing the
3229 * parent block.
3230 */
3231 deleted_eb = le64_to_cpu(eb->h_blkno);
3232
3233 mlog(0, "deleting this extent block.\n");
3234
3235 ocfs2_remove_from_cache(inode, bh);
3236
3237 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
3238 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
3239 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
3240
3241 if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
3242 /*
3243 * This code only understands how to
3244 * lock the suballocator in slot 0,
3245 * which is fine because allocation is
3246 * only ever done out of that
3247 * suballocator too. A future version
3248 * might change that however, so avoid
3249 * a free if we don't know how to
3250 * handle it. This way an fs incompat
3251 * bit will not be necessary.
3252 */
3253 ret = ocfs2_free_extent_block(handle,
3254 tc->tc_ext_alloc_inode,
3255 tc->tc_ext_alloc_bh,
3256 eb);
3257
3258 /* An error here is not fatal. */
3259 if (ret < 0)
3260 mlog_errno(ret);
3261 }
3262 } else {
3263 deleted_eb = 0;
1521 } 3264 }
1522 } while (el->l_tree_depth);
1523 3265
1524 *new_last_eb = bh; 3266 index--;
1525 get_bh(*new_last_eb); 3267 }
1526 mlog(0, "returning block %llu\n",
1527 (unsigned long long)le64_to_cpu(eb->h_blkno));
1528bail:
1529 if (bh)
1530 brelse(bh);
1531 3268
1532 return status; 3269 ret = 0;
3270out:
3271 return ret;
1533} 3272}
1534 3273
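/*
 * Arithmetic sketch for the leaf case above; clusters_to_blocks() is a
 * stand-in (8 blocks per cluster assumed purely for illustration):
 */
#include <stdint.h>

static uint64_t clusters_to_blocks(uint32_t clusters)
{
	return (uint64_t)clusters * 8;	/* assumed ratio */
}

static void trim_leaf_rec(uint32_t cpos, uint16_t *clusters, uint64_t blkno,
			  uint32_t clusters_to_del,
			  uint32_t *new_edge, uint64_t *delete_start)
{
	*clusters -= clusters_to_del;	/* shorten the tail record */
	*new_edge = cpos + *clusters;	/* new rightmost cpos */
	/* first block past the shortened record; data from here is freed */
	*delete_start = blkno + clusters_to_blocks(*clusters);
}
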
1535static int ocfs2_do_truncate(struct ocfs2_super *osb, 3274static int ocfs2_do_truncate(struct ocfs2_super *osb,
1536 unsigned int clusters_to_del, 3275 unsigned int clusters_to_del,
1537 struct inode *inode, 3276 struct inode *inode,
1538 struct buffer_head *fe_bh, 3277 struct buffer_head *fe_bh,
1539 struct buffer_head *old_last_eb_bh,
1540 handle_t *handle, 3278 handle_t *handle,
1541 struct ocfs2_truncate_context *tc) 3279 struct ocfs2_truncate_context *tc,
3280 struct ocfs2_path *path)
1542{ 3281{
1543 int status, i, depth; 3282 int status;
1544 struct ocfs2_dinode *fe; 3283 struct ocfs2_dinode *fe;
1545 struct ocfs2_extent_block *eb;
1546 struct ocfs2_extent_block *last_eb = NULL; 3284 struct ocfs2_extent_block *last_eb = NULL;
1547 struct ocfs2_extent_list *el; 3285 struct ocfs2_extent_list *el;
1548 struct buffer_head *eb_bh = NULL;
1549 struct buffer_head *last_eb_bh = NULL; 3286 struct buffer_head *last_eb_bh = NULL;
1550 u64 next_eb = 0;
1551 u64 delete_blk = 0; 3287 u64 delete_blk = 0;
1552 3288
1553 fe = (struct ocfs2_dinode *) fe_bh->b_data; 3289 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1554 3290
1555 status = ocfs2_find_new_last_ext_blk(osb, 3291 status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
1556 inode, 3292 path, &last_eb_bh);
1557 fe,
1558 le32_to_cpu(fe->i_clusters) -
1559 clusters_to_del,
1560 old_last_eb_bh,
1561 &last_eb_bh);
1562 if (status < 0) { 3293 if (status < 0) {
1563 mlog_errno(status); 3294 mlog_errno(status);
1564 goto bail; 3295 goto bail;
1565 } 3296 }
1566 if (last_eb_bh)
1567 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1568 3297
1569 status = ocfs2_journal_access(handle, inode, fe_bh, 3298 /*
1570 OCFS2_JOURNAL_ACCESS_WRITE); 3299 * Each component will be touched, so we might as well journal
3300 * here to avoid having to handle errors later.
3301 */
3302 status = ocfs2_journal_access_path(inode, handle, path);
1571 if (status < 0) { 3303 if (status < 0) {
1572 mlog_errno(status); 3304 mlog_errno(status);
1573 goto bail; 3305 goto bail;
1574 } 3306 }
3307
3308 if (last_eb_bh) {
3309 status = ocfs2_journal_access(handle, inode, last_eb_bh,
3310 OCFS2_JOURNAL_ACCESS_WRITE);
3311 if (status < 0) {
3312 mlog_errno(status);
3313 goto bail;
3314 }
3315
3316 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
3317 }
3318
1575 el = &(fe->id2.i_list); 3319 el = &(fe->id2.i_list);
1576 3320
3321 /*
3322 * Lower levels depend on this never happening, but it's best
3323 * to check it up here before changing the tree.
3324 */
3325 if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
3326 ocfs2_error(inode->i_sb,
3327 "Inode %lu has an empty extent record, depth %u\n",
3328 inode->i_ino, le16_to_cpu(el->l_tree_depth));
3329 status = -EROFS;
3330 goto bail;
3331 }
3332
1577 spin_lock(&OCFS2_I(inode)->ip_lock); 3333 spin_lock(&OCFS2_I(inode)->ip_lock);
1578 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 3334 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
1579 clusters_to_del; 3335 clusters_to_del;
1580 spin_unlock(&OCFS2_I(inode)->ip_lock); 3336 spin_unlock(&OCFS2_I(inode)->ip_lock);
1581 le32_add_cpu(&fe->i_clusters, -clusters_to_del); 3337 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
1582 fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
1583 fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
1584 3338
1585 i = le16_to_cpu(el->l_next_free_rec) - 1; 3339 status = ocfs2_trim_tree(inode, path, handle, tc,
1586 3340 clusters_to_del, &delete_blk);
1587 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); 3341 if (status) {
1588 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); 3342 mlog_errno(status);
1589 /* tree depth zero, we can just delete the clusters, otherwise 3343 goto bail;
1590 * we need to record the offset of the next level extent block
1591 * as we may overwrite it. */
1592 if (!el->l_tree_depth)
1593 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1594 + ocfs2_clusters_to_blocks(osb->sb,
1595 le32_to_cpu(el->l_recs[i].e_clusters));
1596 else
1597 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1598
1599 if (!el->l_recs[i].e_clusters) {
1600 /* if we deleted the whole extent record, then clear
1601 * out the other fields and update the extent
1602 * list. For depth > 0 trees, we've already recorded
1603 * the extent block in 'next_eb' */
1604 el->l_recs[i].e_cpos = 0;
1605 el->l_recs[i].e_blkno = 0;
1606 BUG_ON(!el->l_next_free_rec);
1607 le16_add_cpu(&el->l_next_free_rec, -1);
1608 } 3344 }
1609 3345
1610 depth = le16_to_cpu(el->l_tree_depth); 3346 if (le32_to_cpu(fe->i_clusters) == 0) {
1611 if (!fe->i_clusters) {
1612 /* trunc to zero is a special case. */ 3347 /* trunc to zero is a special case. */
1613 el->l_tree_depth = 0; 3348 el->l_tree_depth = 0;
1614 fe->i_last_eb_blk = 0; 3349 fe->i_last_eb_blk = 0;
@@ -1625,12 +3360,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
1625 /* If there will be a new last extent block, then by 3360 /* If there will be a new last extent block, then by
1626 * definition, there cannot be any leaves to the right of 3361 * definition, there cannot be any leaves to the right of
1627 * him. */ 3362 * him. */
1628 status = ocfs2_journal_access(handle, inode, last_eb_bh,
1629 OCFS2_JOURNAL_ACCESS_WRITE);
1630 if (status < 0) {
1631 mlog_errno(status);
1632 goto bail;
1633 }
1634 last_eb->h_next_leaf_blk = 0; 3363 last_eb->h_next_leaf_blk = 0;
1635 status = ocfs2_journal_dirty(handle, last_eb_bh); 3364 status = ocfs2_journal_dirty(handle, last_eb_bh);
1636 if (status < 0) { 3365 if (status < 0) {
@@ -1639,123 +3368,247 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
1639 } 3368 }
1640 } 3369 }
1641 3370
1642 /* if our tree depth > 0, update all the tree blocks below us. */ 3371 if (delete_blk) {
1643 while (depth) { 3372 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
1644 mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n", 3373 clusters_to_del);
1645 depth, (unsigned long long)next_eb);
1646 status = ocfs2_read_block(osb, next_eb, &eb_bh,
1647 OCFS2_BH_CACHED, inode);
1648 if (status < 0) { 3374 if (status < 0) {
1649 mlog_errno(status); 3375 mlog_errno(status);
1650 goto bail; 3376 goto bail;
1651 } 3377 }
1652 eb = (struct ocfs2_extent_block *)eb_bh->b_data; 3378 }
1653 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 3379 status = 0;
1654 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 3380bail:
1655 status = -EIO; 3381
1656 goto bail; 3382 mlog_exit(status);
3383 return status;
3384}
3385
3386static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
3387{
3388 set_buffer_uptodate(bh);
3389 mark_buffer_dirty(bh);
3390 return 0;
3391}
3392
3393static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
3394{
3395 set_buffer_uptodate(bh);
3396 mark_buffer_dirty(bh);
3397 return ocfs2_journal_dirty_data(handle, bh);
3398}
3399
3400static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3401 struct page **pages, int numpages,
3402 u64 phys, handle_t *handle)
3403{
3404 int i, ret, partial = 0;
3405 void *kaddr;
3406 struct page *page;
3407 unsigned int from, to = PAGE_CACHE_SIZE;
3408 struct super_block *sb = inode->i_sb;
3409
3410 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
3411
3412 if (numpages == 0)
3413 goto out;
3414
3415 from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
3416 if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
3417 /*
3418 * Since 'from' has been capped to a value below page
3419 * size, this calculation won't be able to overflow
3420 * 'to'
3421 */
3422 to = ocfs2_align_bytes_to_clusters(sb, from);
3423
3424 /*
3425 * The truncated tail in this case should never span
3426 * more than one page. The loop below also
3427 * assumes this.
3428 */
3429 BUG_ON(numpages != 1);
3430 }
3431
3432 for(i = 0; i < numpages; i++) {
3433 page = pages[i];
3434
3435 BUG_ON(from > PAGE_CACHE_SIZE);
3436 BUG_ON(to > PAGE_CACHE_SIZE);
3437
3438 ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
3439 if (ret)
3440 mlog_errno(ret);
3441
3442 kaddr = kmap_atomic(page, KM_USER0);
3443 memset(kaddr + from, 0, to - from);
3444 kunmap_atomic(kaddr, KM_USER0);
3445
3446 /*
3447 * Need to set the buffers we zero'd into uptodate
3448 * here if they aren't - ocfs2_map_page_blocks()
3449 * might've skipped some
3450 */
3451 if (ocfs2_should_order_data(inode)) {
3452 ret = walk_page_buffers(handle,
3453 page_buffers(page),
3454 from, to, &partial,
3455 ocfs2_ordered_zero_func);
3456 if (ret < 0)
3457 mlog_errno(ret);
3458 } else {
3459 ret = walk_page_buffers(handle, page_buffers(page),
3460 from, to, &partial,
3461 ocfs2_writeback_zero_func);
3462 if (ret < 0)
3463 mlog_errno(ret);
1657 } 3464 }
1658 el = &(eb->h_list);
1659 3465
1660 status = ocfs2_journal_access(handle, inode, eb_bh, 3466 if (!partial)
1661 OCFS2_JOURNAL_ACCESS_WRITE); 3467 SetPageUptodate(page);
1662 if (status < 0) { 3468
1663 mlog_errno(status); 3469 flush_dcache_page(page);
1664 goto bail; 3470
3471 /*
3472 * Every page after the 1st one should be completely zero'd.
3473 */
3474 from = 0;
3475 }
3476out:
3477 if (pages) {
3478 for (i = 0; i < numpages; i++) {
3479 page = pages[i];
3480 unlock_page(page);
3481 mark_page_accessed(page);
3482 page_cache_release(page);
1665 } 3483 }
3484 }
3485}
1666 3486
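/*
 * A sketch of the zero-range math above (power-of-two sizes assumed):
 * 'from' is i_size's offset within its page; 'to' stops at the cluster
 * end when clusters are smaller than a page, else at the page end.
 */
#include <stdint.h>

static uint32_t zero_from(uint64_t isize, uint32_t page_size)
{
	return (uint32_t)(isize & (page_size - 1));
}

static uint32_t zero_to(uint32_t from, uint32_t page_size, uint32_t csize)
{
	if (csize < page_size)
		return (from + csize - 1) & ~(csize - 1);
	return page_size;
}
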
1667 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); 3487static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
1668 BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); 3488 int *num, u64 *phys)
3489{
3490 int i, numpages = 0, ret = 0;
3491 unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
3492 unsigned int ext_flags;
3493 struct super_block *sb = inode->i_sb;
3494 struct address_space *mapping = inode->i_mapping;
3495 unsigned long index;
3496 u64 next_cluster_bytes;
3497
3498 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
3499
3500 /* Cluster boundary, so we don't need to grab any pages. */
3501 if ((isize & (csize - 1)) == 0)
3502 goto out;
1669 3503
1670 i = le16_to_cpu(el->l_next_free_rec) - 1; 3504 ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
3505 phys, NULL, &ext_flags);
3506 if (ret) {
3507 mlog_errno(ret);
3508 goto out;
3509 }
1671 3510
1672 mlog(0, "extent block %llu, before: record %d: " 3511 /* Tail is a hole. */
1673 "(%u, %u, %llu), next = %u\n", 3512 if (*phys == 0)
1674 (unsigned long long)le64_to_cpu(eb->h_blkno), i, 3513 goto out;
1675 le32_to_cpu(el->l_recs[i].e_cpos),
1676 le32_to_cpu(el->l_recs[i].e_clusters),
1677 (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
1678 le16_to_cpu(el->l_next_free_rec));
1679 3514
1680 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); 3515 /* Tail is marked as unwritten, we can count on write to zero
1681 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); 3516 * in that case. */
1682 3517 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1683 next_eb = le64_to_cpu(el->l_recs[i].e_blkno); 3518 goto out;
1684 /* bottom-most block requires us to delete data.*/
1685 if (!el->l_tree_depth)
1686 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1687 + ocfs2_clusters_to_blocks(osb->sb,
1688 le32_to_cpu(el->l_recs[i].e_clusters));
1689 if (!el->l_recs[i].e_clusters) {
1690 el->l_recs[i].e_cpos = 0;
1691 el->l_recs[i].e_blkno = 0;
1692 BUG_ON(!el->l_next_free_rec);
1693 le16_add_cpu(&el->l_next_free_rec, -1);
1694 }
1695 mlog(0, "extent block %llu, after: record %d: "
1696 "(%u, %u, %llu), next = %u\n",
1697 (unsigned long long)le64_to_cpu(eb->h_blkno), i,
1698 le32_to_cpu(el->l_recs[i].e_cpos),
1699 le32_to_cpu(el->l_recs[i].e_clusters),
1700 (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
1701 le16_to_cpu(el->l_next_free_rec));
1702 3519
1703 status = ocfs2_journal_dirty(handle, eb_bh); 3520 next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
1704 if (status < 0) { 3521 index = isize >> PAGE_CACHE_SHIFT;
1705 mlog_errno(status); 3522 do {
1706 goto bail; 3523 pages[numpages] = grab_cache_page(mapping, index);
3524 if (!pages[numpages]) {
3525 ret = -ENOMEM;
3526 mlog_errno(ret);
3527 goto out;
1707 } 3528 }
1708 3529
1709 if (!el->l_next_free_rec) { 3530 numpages++;
1710 mlog(0, "deleting this extent block.\n"); 3531 index++;
1711 3532 } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
1712 ocfs2_remove_from_cache(inode, eb_bh);
1713 3533
1714 BUG_ON(el->l_recs[0].e_clusters); 3534out:
1715 BUG_ON(el->l_recs[0].e_cpos); 3535 if (ret != 0) {
1716 BUG_ON(el->l_recs[0].e_blkno); 3536 if (pages) {
1717 if (eb->h_suballoc_slot == 0) { 3537 for (i = 0; i < numpages; i++) {
1718 /* 3538 if (pages[i]) {
1719 * This code only understands how to 3539 unlock_page(pages[i]);
1720 * lock the suballocator in slot 0, 3540 page_cache_release(pages[i]);
1721 * which is fine because allocation is
1722 * only ever done out of that
1723 * suballocator too. A future version
1724 * might change that however, so avoid
1725 * a free if we don't know how to
1726 * handle it. This way an fs incompat
1727 * bit will not be necessary.
1728 */
1729 status = ocfs2_free_extent_block(handle,
1730 tc->tc_ext_alloc_inode,
1731 tc->tc_ext_alloc_bh,
1732 eb);
1733 if (status < 0) {
1734 mlog_errno(status);
1735 goto bail;
1736 } 3541 }
1737 } 3542 }
1738 } 3543 }
1739 brelse(eb_bh); 3544 numpages = 0;
1740 eb_bh = NULL;
1741 depth--;
1742 } 3545 }
1743 3546
1744 BUG_ON(!delete_blk); 3547 *num = numpages;
1745 status = ocfs2_truncate_log_append(osb, handle, delete_blk, 3548
1746 clusters_to_del); 3549 return ret;
1747 if (status < 0) { 3550}
1748 mlog_errno(status); 3551
1749 goto bail; 3552/*
3553 * Zero the area past i_size but still within an allocated
3554 * cluster. This avoids exposing nonzero data on subsequent file
3555 * extends.
3556 *
3557 * We need to call this before i_size is updated on the inode because
3558 * otherwise block_write_full_page() will skip writeout of pages past
3559 * i_size. The new_i_size parameter is passed for this reason.
3560 */
3561int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3562 u64 new_i_size)
3563{
3564 int ret, numpages;
3565 loff_t endbyte;
3566 struct page **pages = NULL;
3567 u64 phys;
3568
3569 /*
3570 * File systems which don't support sparse files zero on every
3571 * extend.
3572 */
3573 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
3574 return 0;
3575
3576 pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
3577 sizeof(struct page *), GFP_NOFS);
3578 if (pages == NULL) {
3579 ret = -ENOMEM;
3580 mlog_errno(ret);
3581 goto out;
1750 } 3582 }
1751 status = 0; 3583
1752bail: 3584 ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
1753 if (!status) 3585 if (ret) {
1754 ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); 3586 mlog_errno(ret);
1755 else 3587 goto out;
1756 ocfs2_extent_map_drop(inode, 0); 3588 }
1757 mlog_exit(status); 3589
1758 return status; 3590 if (numpages == 0)
3591 goto out;
3592
3593 ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
3594 handle);
3595
3596 /*
3597 * Initiate writeout of the pages we zero'd here. We don't
3598 * wait on them - the truncate_inode_pages() call later will
3599 * do that for us.
3600 */
3601 endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
3602 ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
3603 endbyte - 1, SYNC_FILE_RANGE_WRITE);
3604 if (ret)
3605 mlog_errno(ret);
3606
3607out:
3608 if (pages)
3609 kfree(pages);
3610
3611 return ret;
1759} 3612}
1760 3613
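/*
 * Worked example of the range zeroed above, assuming a 64K cluster:
 * truncating to new_i_size = 70000 zeroes bytes [70000, 131072), i.e.
 * to the end of the cluster that now holds i_size.
 */
#include <stdint.h>

static void tail_zero_range(uint64_t new_size, uint32_t csize,
			    uint64_t *start, uint64_t *end)
{
	*start = new_size;
	*end = (new_size + csize - 1) & ~(uint64_t)(csize - 1);
	/* on a cluster boundary *start == *end and nothing is zeroed */
}
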
1761/* 3614/*
@@ -1770,82 +3623,90 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
1770 struct ocfs2_truncate_context *tc) 3623 struct ocfs2_truncate_context *tc)
1771{ 3624{
1772 int status, i, credits, tl_sem = 0; 3625 int status, i, credits, tl_sem = 0;
1773 u32 clusters_to_del, target_i_clusters; 3626 u32 clusters_to_del, new_highest_cpos, range;
1774 u64 last_eb = 0;
1775 struct ocfs2_dinode *fe;
1776 struct ocfs2_extent_block *eb;
1777 struct ocfs2_extent_list *el; 3627 struct ocfs2_extent_list *el;
1778 struct buffer_head *last_eb_bh;
1779 handle_t *handle = NULL; 3628 handle_t *handle = NULL;
1780 struct inode *tl_inode = osb->osb_tl_inode; 3629 struct inode *tl_inode = osb->osb_tl_inode;
3630 struct ocfs2_path *path = NULL;
1781 3631
1782 mlog_entry_void(); 3632 mlog_entry_void();
1783 3633
1784 down_write(&OCFS2_I(inode)->ip_alloc_sem); 3634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1785 3635
1786 target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, 3636 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
1787 i_size_read(inode)); 3637 i_size_read(inode));
1788 3638
1789 last_eb_bh = tc->tc_last_eb_bh; 3639 path = ocfs2_new_inode_path(fe_bh);
1790 tc->tc_last_eb_bh = NULL; 3640 if (!path) {
3641 status = -ENOMEM;
3642 mlog_errno(status);
3643 goto bail;
3644 }
1791 3645
1792 fe = (struct ocfs2_dinode *) fe_bh->b_data; 3646 ocfs2_extent_map_trunc(inode, new_highest_cpos);
1793 3647
1794 if (fe->id2.i_list.l_tree_depth) {
1795 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1796 el = &eb->h_list;
1797 } else
1798 el = &fe->id2.i_list;
1799 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1800start: 3648start:
1801 mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " 3649 /*
1802 "last_eb = %llu, fe->i_last_eb_blk = %llu, " 3650 * Check that we still have allocation to delete.
1803 "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", 3651 */
1804 le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb, 3652 if (OCFS2_I(inode)->ip_clusters == 0) {
1805 (unsigned long long)le64_to_cpu(fe->i_last_eb_blk), 3653 status = 0;
1806 le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); 3654 goto bail;
1807 3655 }
1808 if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
1809 mlog(0, "last_eb changed!\n");
1810 BUG_ON(!fe->id2.i_list.l_tree_depth);
1811 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1812 /* i_last_eb_blk may have changed, read it if
1813 * necessary. We don't have to worry about the
1814 * truncate to zero case here (where there becomes no
1815 * last_eb) because we never loop back after our work
1816 * is done. */
1817 if (last_eb_bh) {
1818 brelse(last_eb_bh);
1819 last_eb_bh = NULL;
1820 }
1821 3656
1822 status = ocfs2_read_block(osb, last_eb, 3657 /*
1823 &last_eb_bh, OCFS2_BH_CACHED, 3658 * Truncate always works against the rightmost tree branch.
1824 inode); 3659 */
1825 if (status < 0) { 3660 status = ocfs2_find_path(inode, path, UINT_MAX);
1826 mlog_errno(status); 3661 if (status) {
1827 goto bail; 3662 mlog_errno(status);
1828 } 3663 goto bail;
1829 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 3664 }
1830 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 3665
1831 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 3666 mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
1832 status = -EIO; 3667 OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
1833 goto bail; 3668
1834 } 3669 /*
1835 el = &(eb->h_list); 3670 * By now, el will point to the extent list on the bottom most
3671 * portion of this tree. Only the tail record is considered in
3672 * each pass.
3673 *
3674 * We handle the following cases, in order:
3675 * - empty extent: delete the remaining branch
3676 * - remove the entire record
3677 * - remove a partial record
3678 * - no record needs to be removed (truncate has completed)
3679 */
3680 el = path_leaf_el(path);
3681 if (le16_to_cpu(el->l_next_free_rec) == 0) {
3682 ocfs2_error(inode->i_sb,
3683 "Inode %llu has empty extent block at %llu\n",
3684 (unsigned long long)OCFS2_I(inode)->ip_blkno,
3685 (unsigned long long)path_leaf_bh(path)->b_blocknr);
3686 status = -EROFS;
3687 goto bail;
1836 } 3688 }
1837 3689
1838 /* by now, el will point to the extent list on the bottom most
1839 * portion of this tree. */
1840 i = le16_to_cpu(el->l_next_free_rec) - 1; 3690 i = le16_to_cpu(el->l_next_free_rec) - 1;
1841 if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) 3691 range = le32_to_cpu(el->l_recs[i].e_cpos) +
1842 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); 3692 ocfs2_rec_clusters(el, &el->l_recs[i]);
1843 else 3693 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
1844 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + 3694 clusters_to_del = 0;
3695 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
3696 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
3697 } else if (range > new_highest_cpos) {
3698 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
1845 le32_to_cpu(el->l_recs[i].e_cpos)) - 3699 le32_to_cpu(el->l_recs[i].e_cpos)) -
1846 target_i_clusters; 3700 new_highest_cpos;
3701 } else {
3702 status = 0;
3703 goto bail;
3704 }
3705
3706 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
3707 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
1847 3708
1848 mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); 3709 BUG_ON(clusters_to_del == 0);
1849 3710
1850 mutex_lock(&tl_inode->i_mutex); 3711 mutex_lock(&tl_inode->i_mutex);
1851 tl_sem = 1; 3712 tl_sem = 1;
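
/*
 * A model of the per-pass clusters_to_del choice above (the empty
 * extent is approximated by clusters == 0; names are illustrative):
 */
#include <stdint.h>

static uint32_t pick_clusters_to_del(uint32_t cpos, uint32_t clusters,
				     uint32_t new_highest_cpos, int *done)
{
	uint32_t range = cpos + clusters;

	*done = 0;
	if (clusters == 0)
		return 0;			/* empty extent: drop branch */
	if (cpos >= new_highest_cpos)
		return clusters;		/* whole record goes */
	if (range > new_highest_cpos)
		return range - new_highest_cpos; /* partial record */
	*done = 1;				/* truncate has completed */
	return 0;
}
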
@@ -1861,7 +3722,8 @@ start:
1861 } 3722 }
1862 3723
1863 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 3724 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
1864 fe, el); 3725 (struct ocfs2_dinode *)fe_bh->b_data,
3726 el);
1865 handle = ocfs2_start_trans(osb, credits); 3727 handle = ocfs2_start_trans(osb, credits);
1866 if (IS_ERR(handle)) { 3728 if (IS_ERR(handle)) {
1867 status = PTR_ERR(handle); 3729 status = PTR_ERR(handle);
@@ -1870,13 +3732,8 @@ start:
1870 goto bail; 3732 goto bail;
1871 } 3733 }
1872 3734
1873 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 3735 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
1874 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 3736 tc, path);
1875 if (status < 0)
1876 mlog_errno(status);
1877
1878 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
1879 last_eb_bh, handle, tc);
1880 if (status < 0) { 3737 if (status < 0) {
1881 mlog_errno(status); 3738 mlog_errno(status);
1882 goto bail; 3739 goto bail;
@@ -1888,9 +3745,14 @@ start:
1888 ocfs2_commit_trans(osb, handle); 3745 ocfs2_commit_trans(osb, handle);
1889 handle = NULL; 3746 handle = NULL;
1890 3747
1891 BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); 3748 ocfs2_reinit_path(path, 1);
1892 if (le32_to_cpu(fe->i_clusters) > target_i_clusters) 3749
1893 goto start; 3750 /*
3751 * The check above will catch the case where we've truncated
3752 * away all allocation.
3753 */
3754 goto start;
3755
1894bail: 3756bail:
1895 up_write(&OCFS2_I(inode)->ip_alloc_sem); 3757 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1896 3758
@@ -1902,8 +3764,7 @@ bail:
1902 if (handle) 3764 if (handle)
1903 ocfs2_commit_trans(osb, handle); 3765 ocfs2_commit_trans(osb, handle);
1904 3766
1905 if (last_eb_bh) 3767 ocfs2_free_path(path);
1906 brelse(last_eb_bh);
1907 3768
1908 /* This will drop the ext_alloc cluster lock for us */ 3769 /* This will drop the ext_alloc cluster lock for us */
1909 ocfs2_free_truncate_context(tc); 3770 ocfs2_free_truncate_context(tc);
@@ -1912,7 +3773,6 @@ bail:
1912 return status; 3773 return status;
1913} 3774}
1914 3775
1915
1916/* 3776/*
1917 * Expects the inode to already be locked. This will figure out which 3777 * Expects the inode to already be locked. This will figure out which
1918 * inodes need to be locked and will put them on the returned truncate 3778 * inodes need to be locked and will put them on the returned truncate
@@ -1923,7 +3783,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1923 struct buffer_head *fe_bh, 3783 struct buffer_head *fe_bh,
1924 struct ocfs2_truncate_context **tc) 3784 struct ocfs2_truncate_context **tc)
1925{ 3785{
1926 int status, metadata_delete; 3786 int status, metadata_delete, i;
1927 unsigned int new_i_clusters; 3787 unsigned int new_i_clusters;
1928 struct ocfs2_dinode *fe; 3788 struct ocfs2_dinode *fe;
1929 struct ocfs2_extent_block *eb; 3789 struct ocfs2_extent_block *eb;
@@ -1941,23 +3801,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1941 fe = (struct ocfs2_dinode *) fe_bh->b_data; 3801 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1942 3802
1943 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" 3803 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
1944 "%llu\n", fe->i_clusters, new_i_clusters, 3804 "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
1945 (unsigned long long)fe->i_size); 3805 (unsigned long long)le64_to_cpu(fe->i_size));
1946
1947 if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
1948 ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
1949 "%u and size %llu whereas struct inode has "
1950 "cluster count %u and size %llu which caused an "
1951 "invalid truncate to %u clusters.",
1952 (unsigned long long)le64_to_cpu(fe->i_blkno),
1953 le32_to_cpu(fe->i_clusters),
1954 (unsigned long long)le64_to_cpu(fe->i_size),
1955 OCFS2_I(inode)->ip_clusters, i_size_read(inode),
1956 new_i_clusters);
1957 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
1958 status = -EIO;
1959 goto bail;
1960 }
1961 3806
1962 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); 3807 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
1963 if (!(*tc)) { 3808 if (!(*tc)) {
@@ -1986,7 +3831,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1986 goto bail; 3831 goto bail;
1987 } 3832 }
1988 el = &(eb->h_list); 3833 el = &(eb->h_list);
1989 if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) 3834
3835 i = 0;
3836 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3837 i = 1;
3838 /*
3839 * XXX: Should we check that next_free_rec contains
3840 * the extent?
3841 */
3842 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
1990 metadata_delete = 1; 3843 metadata_delete = 1;
1991 } 3844 }
1992 3845
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 0b82e8044325..fbcb5934a081 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
31 handle_t *handle, 31 handle_t *handle,
32 struct inode *inode, 32 struct inode *inode,
33 struct buffer_head *fe_bh, 33 struct buffer_head *fe_bh,
34 u64 blkno, 34 u32 cpos,
35 u64 start_blk,
35 u32 new_clusters, 36 u32 new_clusters,
36 struct ocfs2_alloc_context *meta_ac); 37 struct ocfs2_alloc_context *meta_ac);
37int ocfs2_num_free_extents(struct ocfs2_super *osb, 38int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -70,6 +71,8 @@ struct ocfs2_truncate_context {
70 struct buffer_head *tc_last_eb_bh; 71 struct buffer_head *tc_last_eb_bh;
71}; 72};
72 73
74int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
75 u64 new_i_size);
73int ocfs2_prepare_truncate(struct ocfs2_super *osb, 76int ocfs2_prepare_truncate(struct ocfs2_super *osb,
74 struct inode *inode, 77 struct inode *inode,
75 struct buffer_head *fe_bh, 78 struct buffer_head *fe_bh,
@@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
79 struct buffer_head *fe_bh, 82 struct buffer_head *fe_bh,
80 struct ocfs2_truncate_context *tc); 83 struct ocfs2_truncate_context *tc);
81 84
85int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
86 u32 cpos, struct buffer_head **leaf_bh);
87
88/*
89 * Helper function to look at the # of clusters in an extent record.
90 */
91static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
92 struct ocfs2_extent_rec *rec)
93{
94 /*
95 * Cluster count in extent records is slightly different
96 * between interior nodes and leaf nodes. This is to support
97 * unwritten extents which need a flags field in leaf node
98 * records, thus shrinking the available space for a clusters
99 * field.
100 */
101 if (el->l_tree_depth)
102 return le32_to_cpu(rec->e_int_clusters);
103 else
104 return le16_to_cpu(rec->e_leaf_clusters);
105}
106
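/*
 * A user-space mirror of the helper above, field widths taken from the
 * comment (32-bit interior count; 16-bit leaf count to leave room for
 * flags):
 */
#include <stdint.h>

struct xrec { uint32_t int_clusters; uint16_t leaf_clusters; };

static unsigned int rec_clusters(uint16_t tree_depth, const struct xrec *rec)
{
	return tree_depth ? rec->int_clusters : rec->leaf_clusters;
}
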
82#endif /* OCFS2_ALLOC_H */ 107#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 875c11443817..8e7cafb5fc6c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -24,6 +24,8 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/pagemap.h> 25#include <linux/pagemap.h>
26#include <asm/byteorder.h> 26#include <asm/byteorder.h>
27#include <linux/swap.h>
28#include <linux/pipe_fs_i.h>
27 29
28#define MLOG_MASK_PREFIX ML_FILE_IO 30#define MLOG_MASK_PREFIX ML_FILE_IO
29#include <cluster/masklog.h> 31#include <cluster/masklog.h>
@@ -37,6 +39,7 @@
37#include "file.h" 39#include "file.h"
38#include "inode.h" 40#include "inode.h"
39#include "journal.h" 41#include "journal.h"
42#include "suballoc.h"
40#include "super.h" 43#include "super.h"
41#include "symlink.h" 44#include "symlink.h"
42 45
@@ -75,7 +78,8 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
75 78
76 if (!OCFS2_IS_VALID_DINODE(fe)) { 79 if (!OCFS2_IS_VALID_DINODE(fe)) {
77 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", 80 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
78 (unsigned long long)fe->i_blkno, 7, fe->i_signature); 81 (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
82 fe->i_signature);
79 goto bail; 83 goto bail;
80 } 84 }
81 85
@@ -134,7 +138,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
134 struct buffer_head *bh_result, int create) 138 struct buffer_head *bh_result, int create)
135{ 139{
136 int err = 0; 140 int err = 0;
141 unsigned int ext_flags;
137 u64 p_blkno, past_eof; 142 u64 p_blkno, past_eof;
143 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
138 144
139 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 145 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
140 (unsigned long long)iblock, bh_result, create); 146 (unsigned long long)iblock, bh_result, create);
@@ -149,17 +155,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
149 goto bail; 155 goto bail;
150 } 156 }
151 157
152 /* this can happen if another node truncs after our extend! */ 158 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
153 spin_lock(&OCFS2_I(inode)->ip_lock); 159 &ext_flags);
154 if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
155 OCFS2_I(inode)->ip_clusters))
156 err = -EIO;
157 spin_unlock(&OCFS2_I(inode)->ip_lock);
158 if (err)
159 goto bail;
160
161 err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
162 NULL);
163 if (err) { 160 if (err) {
164 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " 161 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
165 "%llu, NULL)\n", err, inode, (unsigned long long)iblock, 162 "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
@@ -167,22 +164,39 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
167 goto bail; 164 goto bail;
168 } 165 }
169 166
170 map_bh(bh_result, inode->i_sb, p_blkno);
171
172 if (bh_result->b_blocknr == 0) {
173 err = -EIO;
174 mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
175 (unsigned long long)iblock,
176 (unsigned long long)p_blkno,
177 (unsigned long long)OCFS2_I(inode)->ip_blkno);
178 }
167 /*
168 * ocfs2 never allocates in this function - the only time we
169 * need to use BH_New is when we're extending i_size on a file
170 * system which doesn't support holes, in which case BH_New
171 * allows block_prepare_write() to zero.
172 */
173 mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
174 "ino %lu, iblock %llu\n", inode->i_ino,
175 (unsigned long long)iblock);
176
177 /* Treat the unwritten extent as a hole for zeroing purposes. */
178 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
179 map_bh(bh_result, inode->i_sb, p_blkno);
180
181 if (!ocfs2_sparse_alloc(osb)) {
182 if (p_blkno == 0) {
183 err = -EIO;
184 mlog(ML_ERROR,
185 "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
186 (unsigned long long)iblock,
187 (unsigned long long)p_blkno,
188 (unsigned long long)OCFS2_I(inode)->ip_blkno);
189 mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
190 dump_stack();
191 }
179 192
180 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 193 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
181 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, 194 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
182 (unsigned long long)past_eof); 195 (unsigned long long)past_eof);
183 196
184 if (create && (iblock >= past_eof)) 197 if (create && (iblock >= past_eof))
185 set_buffer_new(bh_result); 198 set_buffer_new(bh_result);
199 }
186 200
187bail: 201bail:
188 if (err < 0) 202 if (err < 0)
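
/*
 * A sketch of the mapping rule above: a zero physical block or an
 * unwritten extent is reported as a hole so the generic code zeroes
 * it; EXT_UNWRITTEN stands in for the real OCFS2_EXT_UNWRITTEN bit.
 */
#include <stdint.h>

#define EXT_UNWRITTEN 0x01

static int should_map_buffer(uint64_t p_blkno, unsigned int ext_flags)
{
	return p_blkno != 0 && !(ext_flags & EXT_UNWRITTEN);
}
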
@@ -276,8 +290,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
276 return ret; 290 return ret;
277} 291}
278 292
279/* This can also be called from ocfs2_write_zero_page() which has done 293/*
280 * it's own cluster locking. */ 294 * This is called from ocfs2_write_zero_page() which has handled its
295 * own cluster locking and has ensured allocation exists for those
296 * blocks to be written.
297 */
281int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, 298int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
282 unsigned from, unsigned to) 299 unsigned from, unsigned to)
283{ 300{
@@ -292,44 +309,17 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
292 return ret; 309 return ret;
293} 310}
294 311
295/*
296 * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
297 * from loopback. It must be able to perform its own locking around
298 * ocfs2_get_block().
299 */
300static int ocfs2_prepare_write(struct file *file, struct page *page,
301 unsigned from, unsigned to)
302{
303 struct inode *inode = page->mapping->host;
304 int ret;
305
306 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
307
308 ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
309 if (ret != 0) {
310 mlog_errno(ret);
311 goto out;
312 }
313
314 ret = ocfs2_prepare_write_nolock(inode, page, from, to);
315
316 ocfs2_meta_unlock(inode, 0);
317out:
318 mlog_exit(ret);
319 return ret;
320}
321
322/* Taken from ext3. We don't necessarily need the full blown 312/* Taken from ext3. We don't necessarily need the full blown
323 * functionality yet, but IMHO it's better to cut and paste the whole 313 * functionality yet, but IMHO it's better to cut and paste the whole
324 * thing so we can avoid introducing our own bugs (and easily pick up 314 * thing so we can avoid introducing our own bugs (and easily pick up
325 * their fixes when they happen) --Mark */ 315 * their fixes when they happen) --Mark */
326static int walk_page_buffers( handle_t *handle, 316int walk_page_buffers( handle_t *handle,
327 struct buffer_head *head, 317 struct buffer_head *head,
328 unsigned from, 318 unsigned from,
329 unsigned to, 319 unsigned to,
330 int *partial, 320 int *partial,
331 int (*fn)( handle_t *handle, 321 int (*fn)( handle_t *handle,
332 struct buffer_head *bh)) 322 struct buffer_head *bh))
333{ 323{
334 struct buffer_head *bh; 324 struct buffer_head *bh;
335 unsigned block_start, block_end; 325 unsigned block_start, block_end;
@@ -388,95 +378,6 @@ out:
388 return handle; 378 return handle;
389} 379}
390 380
391static int ocfs2_commit_write(struct file *file, struct page *page,
392 unsigned from, unsigned to)
393{
394 int ret;
395 struct buffer_head *di_bh = NULL;
396 struct inode *inode = page->mapping->host;
397 handle_t *handle = NULL;
398 struct ocfs2_dinode *di;
399
400 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
401
402 /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
403 * us to continue here without rechecking the I/O against
404 * changed inode values.
405 *
406 * 1) We're currently holding the inode alloc lock, so no
407 * nodes can change it underneath us.
408 *
409 * 2) We've had to take the metadata lock at least once
410 * already to check for extending writes, suid removal, etc.
411 * The meta data update code then ensures that we don't get a
412 * stale inode allocation image (i_size, i_clusters, etc).
413 */
414
415 ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page);
416 if (ret != 0) {
417 mlog_errno(ret);
418 goto out;
419 }
420
421 ret = ocfs2_data_lock_with_page(inode, 1, page);
422 if (ret != 0) {
423 mlog_errno(ret);
424 goto out_unlock_meta;
425 }
426
427 handle = ocfs2_start_walk_page_trans(inode, page, from, to);
428 if (IS_ERR(handle)) {
429 ret = PTR_ERR(handle);
430 goto out_unlock_data;
431 }
432
433 /* Mark our buffer early. We'd rather catch this error up here
434 * as opposed to after a successful commit_write which would
435 * require us to set back inode->i_size. */
436 ret = ocfs2_journal_access(handle, inode, di_bh,
437 OCFS2_JOURNAL_ACCESS_WRITE);
438 if (ret < 0) {
439 mlog_errno(ret);
440 goto out_commit;
441 }
442
443 /* might update i_size */
444 ret = generic_commit_write(file, page, from, to);
445 if (ret < 0) {
446 mlog_errno(ret);
447 goto out_commit;
448 }
449
450 di = (struct ocfs2_dinode *)di_bh->b_data;
451
452 /* ocfs2_mark_inode_dirty() is too heavy to use here. */
453 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
454 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
455 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
456
457 inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
458 di->i_size = cpu_to_le64((u64)i_size_read(inode));
459
460 ret = ocfs2_journal_dirty(handle, di_bh);
461 if (ret < 0) {
462 mlog_errno(ret);
463 goto out_commit;
464 }
465
466out_commit:
467 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
468out_unlock_data:
469 ocfs2_data_unlock(inode, 1);
470out_unlock_meta:
471 ocfs2_meta_unlock(inode, 1);
472out:
473 if (di_bh)
474 brelse(di_bh);
475
476 mlog_exit(ret);
477 return ret;
478}
479
480static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) 381static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
481{ 382{
482 sector_t status; 383 sector_t status;
@@ -499,8 +400,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
499 down_read(&OCFS2_I(inode)->ip_alloc_sem); 400 down_read(&OCFS2_I(inode)->ip_alloc_sem);
500 } 401 }
501 402
502 err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, 403 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
503 NULL);
504 404
505 if (!INODE_JOURNAL(inode)) { 405 if (!INODE_JOURNAL(inode)) {
506 up_read(&OCFS2_I(inode)->ip_alloc_sem); 406 up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -540,8 +440,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
540 struct buffer_head *bh_result, int create) 440 struct buffer_head *bh_result, int create)
541{ 441{
542 int ret; 442 int ret;
543 u64 p_blkno, inode_blocks; 443 u64 p_blkno, inode_blocks, contig_blocks;
544 int contig_blocks; 444 unsigned int ext_flags;
545 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 445 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
546 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 446 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
547 447
@@ -549,33 +449,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
549 * nicely aligned and of the right size, so there's no need 449 * nicely aligned and of the right size, so there's no need
550 * for us to check any of that. */ 450 * for us to check any of that. */
551 451
552 spin_lock(&OCFS2_I(inode)->ip_lock); 452 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
553 inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb,
554 OCFS2_I(inode)->ip_clusters);
555
556 /*
557 * For a read which begins past the end of file, we return a hole.
558 */
559 if (!create && (iblock >= inode_blocks)) {
560 spin_unlock(&OCFS2_I(inode)->ip_lock);
561 ret = 0;
562 goto bail;
563 }
564 453
565 /* 454 /*
566 * Any write past EOF is not allowed because we'd be extending. 455 * Any write past EOF is not allowed because we'd be extending.
567 */ 456 */
568 if (create && (iblock + max_blocks) > inode_blocks) { 457 if (create && (iblock + max_blocks) > inode_blocks) {
569 spin_unlock(&OCFS2_I(inode)->ip_lock);
570 ret = -EIO; 458 ret = -EIO;
571 goto bail; 459 goto bail;
572 } 460 }
573 spin_unlock(&OCFS2_I(inode)->ip_lock);
574 461
575 /* This figures out the size of the next contiguous block, and 462 /* This figures out the size of the next contiguous block, and
576 * our logical offset */ 463 * our logical offset */
577 ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, 464 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
578 &contig_blocks); 465 &contig_blocks, &ext_flags);
579 if (ret) { 466 if (ret) {
580 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 467 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
581 (unsigned long long)iblock); 468 (unsigned long long)iblock);
@@ -583,7 +470,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
583 goto bail; 470 goto bail;
584 } 471 }
585 472
586 map_bh(bh_result, inode->i_sb, p_blkno); 473 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
474 ocfs2_error(inode->i_sb,
475 "Inode %llu has a hole at block %llu\n",
476 (unsigned long long)OCFS2_I(inode)->ip_blkno,
477 (unsigned long long)iblock);
478 ret = -EROFS;
479 goto bail;
480 }
481
482 /*
483 * get_more_blocks() expects us to describe a hole by clearing
484 * the mapped bit on bh_result().
485 *
486 * Consider an unwritten extent as a hole.
487 */
488 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
489 map_bh(bh_result, inode->i_sb, p_blkno);
490 else {
491 /*
492 * ocfs2_prepare_inode_for_write() should have caught
493 * the case where we'd be filling a hole and triggered
494 * a buffered write instead.
495 */
496 if (create) {
497 ret = -EIO;
498 mlog_errno(ret);
499 goto bail;
500 }
501
502 clear_buffer_mapped(bh_result);
503 }
587 504
588 /* make sure we don't map more than max_blocks blocks here as 505 /* make sure we don't map more than max_blocks blocks here as
589 that's all the kernel will handle at this point. */ 506 that's all the kernel will handle at this point. */
@@ -606,12 +523,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
606 void *private) 523 void *private)
607{ 524{
608 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 525 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
526 int level;
609 527
610 /* this io's submitter should not have unlocked this before we could */ 528 /* this io's submitter should not have unlocked this before we could */
611 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 529 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
530
612 ocfs2_iocb_clear_rw_locked(iocb); 531 ocfs2_iocb_clear_rw_locked(iocb);
613 up_read(&inode->i_alloc_sem); 532
614 ocfs2_rw_unlock(inode, 0); 533 level = ocfs2_iocb_rw_locked_level(iocb);
534 if (!level)
535 up_read(&inode->i_alloc_sem);
536 ocfs2_rw_unlock(inode, level);
615} 537}
616 538
617/* 539/*
@@ -647,23 +569,27 @@ static ssize_t ocfs2_direct_IO(int rw,
647 569
648 mlog_entry_void(); 570 mlog_entry_void();
649 571
650 /* 572 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
651 * We get PR data locks even for O_DIRECT. This allows 573 /*
652 * concurrent O_DIRECT I/O but doesn't let O_DIRECT with 574 * We get PR data locks even for O_DIRECT. This
653 * extending and buffered zeroing writes race. If they did 575 * allows concurrent O_DIRECT I/O but doesn't let
654 * race then the buffered zeroing could be written back after 576 * O_DIRECT with extending and buffered zeroing writes
655 * the O_DIRECT I/O. It's one thing to tell people not to mix 577 * race. If they did race then the buffered zeroing
656 * buffered and O_DIRECT writes, but expecting them to 578 * could be written back after the O_DIRECT I/O. It's
657 * understand that file extension is also an implicit buffered 579 * one thing to tell people not to mix buffered and
658 * write is too much. By getting the PR we force writeback of 580 * O_DIRECT writes, but expecting them to understand
659 * the buffered zeroing before proceeding. 581 * that file extension is also an implicit buffered
660 */ 582 * write is too much. By getting the PR we force
661 ret = ocfs2_data_lock(inode, 0); 583 * writeback of the buffered zeroing before
662 if (ret < 0) { 584 * proceeding.
663 mlog_errno(ret); 585 */
664 goto out; 586 ret = ocfs2_data_lock(inode, 0);
587 if (ret < 0) {
588 mlog_errno(ret);
589 goto out;
590 }
591 ocfs2_data_unlock(inode, 0);
665 } 592 }
666 ocfs2_data_unlock(inode, 0);
667 593
668 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 594 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
669 inode->i_sb->s_bdev, iov, offset, 595 inode->i_sb->s_bdev, iov, offset,
@@ -675,11 +601,715 @@ out:
675 return ret; 601 return ret;
676} 602}
677 603
604static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
605 u32 cpos,
606 unsigned int *start,
607 unsigned int *end)
608{
609 unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
610
611 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
612 unsigned int cpp;
613
614 cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
615
616 cluster_start = cpos % cpp;
617 cluster_start = cluster_start << osb->s_clustersize_bits;
618
619 cluster_end = cluster_start + osb->s_clustersize;
620 }
621
622 BUG_ON(cluster_start > PAGE_SIZE);
623 BUG_ON(cluster_end > PAGE_SIZE);
624
625 if (start)
626 *start = cluster_start;
627 if (end)
628 *end = cluster_end;
629}
630
631/*
632 * 'from' and 'to' are the region in the page to avoid zeroing.
633 *
634 * If pagesize > clustersize, this function will avoid zeroing outside
635 * of the cluster boundary.
636 *
637 * from == to == 0 is code for "zero the entire cluster region"
638 */
639static void ocfs2_clear_page_regions(struct page *page,
640 struct ocfs2_super *osb, u32 cpos,
641 unsigned from, unsigned to)
642{
643 void *kaddr;
644 unsigned int cluster_start, cluster_end;
645
646 ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
647
648 kaddr = kmap_atomic(page, KM_USER0);
649
650 if (from || to) {
651 if (from > cluster_start)
652 memset(kaddr + cluster_start, 0, from - cluster_start);
653 if (to < cluster_end)
654 memset(kaddr + to, 0, cluster_end - to);
655 } else {
656 memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
657 }
658
659 kunmap_atomic(kaddr, KM_USER0);
660}
661
662/*
663 * Some of this taken from block_prepare_write(). We already have our
664 * mapping by now though, and the entire write will be allocating or
665 * it won't, so not much need to use BH_New.
666 *
667 * This will also skip zeroing, which is handled externally.
668 */
669int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
670 struct inode *inode, unsigned int from,
671 unsigned int to, int new)
672{
673 int ret = 0;
674 struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
675 unsigned int block_end, block_start;
676 unsigned int bsize = 1 << inode->i_blkbits;
677
678 if (!page_has_buffers(page))
679 create_empty_buffers(page, bsize, 0);
680
681 head = page_buffers(page);
682 for (bh = head, block_start = 0; bh != head || !block_start;
683 bh = bh->b_this_page, block_start += bsize) {
684 block_end = block_start + bsize;
685
686 /*
687 * Ignore blocks outside of our i/o range -
688 * they may belong to unallocated clusters.
689 */
690 if (block_start >= to || block_end <= from) {
691 if (PageUptodate(page))
692 set_buffer_uptodate(bh);
693 continue;
694 }
695
696 /*
697 * For an allocating write with cluster size >= page
698 * size, we always write the entire page.
699 */
700
701 if (buffer_new(bh))
702 clear_buffer_new(bh);
703
704 if (!buffer_mapped(bh)) {
705 map_bh(bh, inode->i_sb, *p_blkno);
706 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
707 }
708
709 if (PageUptodate(page)) {
710 if (!buffer_uptodate(bh))
711 set_buffer_uptodate(bh);
712 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
713 (block_start < from || block_end > to)) {
714 ll_rw_block(READ, 1, &bh);
715 *wait_bh++=bh;
716 }
717
718 *p_blkno = *p_blkno + 1;
719 }
720
721 /*
722 * If we issued read requests - let them complete.
723 */
724 while(wait_bh > wait) {
725 wait_on_buffer(*--wait_bh);
726 if (!buffer_uptodate(*wait_bh))
727 ret = -EIO;
728 }
729
730 if (ret == 0 || !new)
731 return ret;
732
733 /*
734 * If we get -EIO above, zero out any newly allocated blocks
735 * to avoid exposing stale data.
736 */
737 bh = head;
738 block_start = 0;
739 do {
740 void *kaddr;
741
742 block_end = block_start + bsize;
743 if (block_end <= from)
744 goto next_bh;
745 if (block_start >= to)
746 break;
747
748 kaddr = kmap_atomic(page, KM_USER0);
749 memset(kaddr+block_start, 0, bh->b_size);
750 flush_dcache_page(page);
751 kunmap_atomic(kaddr, KM_USER0);
752 set_buffer_uptodate(bh);
753 mark_buffer_dirty(bh);
754
755next_bh:
756 block_start = block_end;
757 bh = bh->b_this_page;
758 } while (bh != head);
759
760 return ret;
761}
762
763/*
764 * This will copy user data from the buffer page in the splice
765 * context.
766 *
767 * For now, we ignore SPLICE_F_MOVE as that would require some extra
768 * communication out all the way to ocfs2_write().
769 */
770int ocfs2_map_and_write_splice_data(struct inode *inode,
771 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
772 unsigned int *ret_from, unsigned int *ret_to)
773{
774 int ret;
775 unsigned int to, from, cluster_start, cluster_end;
776 char *src, *dst;
777 struct ocfs2_splice_write_priv *sp = wc->w_private;
778 struct pipe_buffer *buf = sp->s_buf;
779 unsigned long bytes, src_from;
780 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
781
782 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
783 &cluster_end);
784
785 from = sp->s_offset;
786 src_from = sp->s_buf_offset;
787 bytes = wc->w_count;
788
789 if (wc->w_large_pages) {
790 /*
791 * For cluster size < page size, we have to
792 * calculate pos within the cluster and obey
793 * the rightmost boundary.
794 */
795 bytes = min(bytes, (unsigned long)(osb->s_clustersize
796 - (wc->w_pos & (osb->s_clustersize - 1))));
797 }
798 to = from + bytes;
799
800 if (wc->w_this_page_new)
801 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
802 cluster_start, cluster_end, 1);
803 else
804 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
805 from, to, 0);
806 if (ret) {
807 mlog_errno(ret);
808 goto out;
809 }
810
811 BUG_ON(from > PAGE_CACHE_SIZE);
812 BUG_ON(to > PAGE_CACHE_SIZE);
813 BUG_ON(from > osb->s_clustersize);
814 BUG_ON(to > osb->s_clustersize);
815
816 src = buf->ops->map(sp->s_pipe, buf, 1);
817 dst = kmap_atomic(wc->w_this_page, KM_USER1);
818 memcpy(dst + from, src + src_from, bytes);
819 kunmap_atomic(wc->w_this_page, KM_USER1);
820 buf->ops->unmap(sp->s_pipe, buf, src);
821
822 wc->w_finished_copy = 1;
823
824 *ret_from = from;
825 *ret_to = to;
826out:
827
828 return bytes ? (unsigned int)bytes : ret;
829}
830
831/*
832 * This will copy user data from the iovec in the buffered write
833 * context.
834 */
835int ocfs2_map_and_write_user_data(struct inode *inode,
836 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
837 unsigned int *ret_from, unsigned int *ret_to)
838{
839 int ret;
840 unsigned int to, from, cluster_start, cluster_end;
841 unsigned long bytes, src_from;
842 char *dst;
843 struct ocfs2_buffered_write_priv *bp = wc->w_private;
844 const struct iovec *cur_iov = bp->b_cur_iov;
845 char __user *buf;
846 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
847
848 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
849 &cluster_end);
850
851 buf = cur_iov->iov_base + bp->b_cur_off;
852 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
853
854 from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
855
856 /*
857 * This is a lot of comparisons, but it reads quite
858 * easily, which is important here.
859 */
860 /* Stay within the src page */
861 bytes = PAGE_SIZE - src_from;
862 /* Stay within the vector */
863 bytes = min(bytes,
864 (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
865 /* Stay within count */
866 bytes = min(bytes, (unsigned long)wc->w_count);
867 /*
868 * For clustersize > page size, just stay within
869 * target page, otherwise we have to calculate pos
870 * within the cluster and obey the rightmost
871 * boundary.
872 */
873 if (wc->w_large_pages) {
874 /*
875 * For cluster size < page size, we have to
876 * calculate pos within the cluster and obey
877 * the rightmost boundary.
878 */
879 bytes = min(bytes, (unsigned long)(osb->s_clustersize
880 - (wc->w_pos & (osb->s_clustersize - 1))));
881 } else {
882 /*
883 * cluster size > page size is the most common
884 * case - we just stay within the target page
885 * boundary.
886 */
887 bytes = min(bytes, PAGE_CACHE_SIZE - from);
888 }
889
890 to = from + bytes;
891
892 if (wc->w_this_page_new)
893 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
894 cluster_start, cluster_end, 1);
895 else
896 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
897 from, to, 0);
898 if (ret) {
899 mlog_errno(ret);
900 goto out;
901 }
902
903 BUG_ON(from > PAGE_CACHE_SIZE);
904 BUG_ON(to > PAGE_CACHE_SIZE);
905 BUG_ON(from > osb->s_clustersize);
906 BUG_ON(to > osb->s_clustersize);
907
908 dst = kmap(wc->w_this_page);
909 memcpy(dst + from, bp->b_src_buf + src_from, bytes);
910 kunmap(wc->w_this_page);
911
912 /*
913 * XXX: This is slow, but simple. The caller of
914 * ocfs2_buffered_write_cluster() is responsible for
915 * passing through the iovecs, so it's difficult to
916 * predict what our next step is in here after our
917 * initial write. A future version should be pushing
918 * that iovec manipulation further down.
919 *
920 * By setting this, we indicate that a copy from user
921 * data was done, and subsequent calls for this
922 * cluster will skip copying more data.
923 */
924 wc->w_finished_copy = 1;
925
926 *ret_from = from;
927 *ret_to = to;
928out:
929
930 return bytes ? (unsigned int)bytes : ret;
931}
932
933/*
934 * Map, fill and write a page to disk.
935 *
936 * The work of copying data is done via callback. Newly allocated
937 * pages which don't take user data will be zero'd (set 'new' to
938 * indicate an allocating write)
939 *
940 * Returns a negative error code or the number of bytes copied into
941 * the page.
942 */
943static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
944 u64 *p_blkno, struct page *page,
945 struct ocfs2_write_ctxt *wc, int new)
946{
947 int ret, copied = 0;
948 unsigned int from = 0, to = 0;
949 unsigned int cluster_start, cluster_end;
950 unsigned int zero_from = 0, zero_to = 0;
951
952 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
953 &cluster_start, &cluster_end);
954
955 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
956 && !wc->w_finished_copy) {
957
958 wc->w_this_page = page;
959 wc->w_this_page_new = new;
960 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
961 if (ret < 0) {
962 mlog_errno(ret);
963 goto out;
964 }
965
966 copied = ret;
967
968 zero_from = from;
969 zero_to = to;
970 if (new) {
971 from = cluster_start;
972 to = cluster_end;
973 }
974 } else {
975 /*
976 * If we haven't allocated the new page yet, we
977 * shouldn't be writing it out without copying user
978 * data. This is likely a math error from the caller.
979 */
980 BUG_ON(!new);
981
982 from = cluster_start;
983 to = cluster_end;
984
985 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
986 cluster_start, cluster_end, 1);
987 if (ret) {
988 mlog_errno(ret);
989 goto out;
990 }
991 }
992
993 /*
994 * Parts of newly allocated pages need to be zero'd.
995 *
996 * Above, we have also rewritten 'to' and 'from' - as far as
997 * the rest of the function is concerned, the entire cluster
998 * range inside of a page needs to be written.
999 *
1000 * We can skip this if the page is up to date - it's already
1001 * been zero'd from being read in as a hole.
1002 */
1003 if (new && !PageUptodate(page))
1004 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1005 wc->w_cpos, zero_from, zero_to);
1006
1007 flush_dcache_page(page);
1008
1009 if (ocfs2_should_order_data(inode)) {
1010 ret = walk_page_buffers(handle,
1011 page_buffers(page),
1012 from, to, NULL,
1013 ocfs2_journal_dirty_data);
1014 if (ret < 0)
1015 mlog_errno(ret);
1016 }
1017
1018 /*
1019 * We don't use generic_commit_write() because we need to
1020 * handle our own i_size update.
1021 */
1022 ret = block_commit_write(page, from, to);
1023 if (ret)
1024 mlog_errno(ret);
1025out:
1026
1027 return copied ? copied : ret;
1028}
1029
1030/*
1031 * Do the actual write of some data into an inode. Optionally allocate
1032 * in order to fulfill the write.
1033 *
1034 * cpos is the logical cluster offset within the file to write at
1035 *
1036 * 'phys' is the physical mapping of that offset. a 'phys' value of
1037 * zero indicates that allocation is required. In this case, data_ac
1038 * and meta_ac should be valid (meta_ac can be null if metadata
1039 * allocation isn't required).
1040 */
1041static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
1042 struct buffer_head *di_bh,
1043 struct ocfs2_alloc_context *data_ac,
1044 struct ocfs2_alloc_context *meta_ac,
1045 struct ocfs2_write_ctxt *wc)
1046{
1047 int ret, i, numpages = 1, new;
1048 unsigned int copied = 0;
1049 u32 tmp_pos;
1050 u64 v_blkno, p_blkno;
1051 struct address_space *mapping = file->f_mapping;
1052 struct inode *inode = mapping->host;
1053 unsigned long index, start;
1054 struct page **cpages;
1055
1056 new = phys == 0 ? 1 : 0;
1057
1058 /*
1059 * Figure out how many pages we'll be manipulating here. For
1060 * non allocating write, we just change the one
1061 * page. Otherwise, we'll need a whole clusters worth.
1062 */
1063 if (new)
1064 numpages = ocfs2_pages_per_cluster(inode->i_sb);
1065
1066 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
1067 if (!cpages) {
1068 ret = -ENOMEM;
1069 mlog_errno(ret);
1070 return ret;
1071 }
1072
1073 /*
1074 * Fill our page array first. That way we've grabbed enough so
1075 * that we can zero and flush if we error after adding the
1076 * extent.
1077 */
1078 if (new) {
1079 start = ocfs2_align_clusters_to_page_index(inode->i_sb,
1080 wc->w_cpos);
1081 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
1082 } else {
1083 start = wc->w_pos >> PAGE_CACHE_SHIFT;
1084 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
1085 }
1086
1087 for(i = 0; i < numpages; i++) {
1088 index = start + i;
1089
1090 cpages[i] = find_or_create_page(mapping, index, GFP_NOFS);
1091 if (!cpages[i]) {
1092 ret = -ENOMEM;
1093 mlog_errno(ret);
1094 goto out;
1095 }
1096 }
1097
1098 if (new) {
1099 /*
1100 * This is safe to call with the page locks - it won't take
1101 * any additional semaphores or cluster locks.
1102 */
1103 tmp_pos = wc->w_cpos;
1104 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1105 &tmp_pos, 1, di_bh, handle,
1106 data_ac, meta_ac, NULL);
1107 /*
1108 * This shouldn't happen because we must have already
1109 * calculated the correct meta data allocation required. The
1110 * internal tree allocation code should know how to increase
1111 * transaction credits itself.
1112 *
1113 * If need be, we could handle -EAGAIN for a
1114 * RESTART_TRANS here.
1115 */
1116 mlog_bug_on_msg(ret == -EAGAIN,
1117 "Inode %llu: EAGAIN return during allocation.\n",
1118 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1119 if (ret < 0) {
1120 mlog_errno(ret);
1121 goto out;
1122 }
1123 }
1124
1125 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1126 NULL);
1127 if (ret < 0) {
1128
1129 /*
1130 * XXX: Should we go readonly here?
1131 */
1132
1133 mlog_errno(ret);
1134 goto out;
1135 }
1136
1137 BUG_ON(p_blkno == 0);
1138
1139 for(i = 0; i < numpages; i++) {
1140 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
1141 wc, new);
1142 if (ret < 0) {
1143 mlog_errno(ret);
1144 goto out;
1145 }
1146
1147 copied += ret;
1148 }
1149
1150out:
1151 for(i = 0; i < numpages; i++) {
1152 unlock_page(cpages[i]);
1153 mark_page_accessed(cpages[i]);
1154 page_cache_release(cpages[i]);
1155 }
1156 kfree(cpages);
1157
1158 return copied ? copied : ret;
1159}
1160
1161static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
1162 struct ocfs2_super *osb, loff_t pos,
1163 size_t count, ocfs2_page_writer *cb,
1164 void *cb_priv)
1165{
1166 wc->w_count = count;
1167 wc->w_pos = pos;
1168 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
1169 wc->w_finished_copy = 0;
1170
1171 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1172 wc->w_large_pages = 1;
1173 else
1174 wc->w_large_pages = 0;
1175
1176 wc->w_write_data_page = cb;
1177 wc->w_private = cb_priv;
1178}
1179
1180/*
1181 * Write a cluster to an inode. The cluster may not be allocated yet,
1182 * in which case it will be. This only exists for buffered writes -
1183 * O_DIRECT takes a more "traditional" path through the kernel.
1184 *
1185 * The caller is responsible for incrementing pos, written counts, etc
1186 *
1187 * For file systems that don't support sparse files, pre-allocation
1188 * and page zeroing up until cpos should be done prior to this
1189 * function call.
1190 *
1191 * Callers should be holding i_sem, and the rw cluster lock.
1192 *
1193 * Returns the number of user bytes written, or less than zero for
1194 * error.
1195 */
1196ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
1197 size_t count, ocfs2_page_writer *actor,
1198 void *priv)
1199{
1200 int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1201 ssize_t written = 0;
1202 u32 phys;
1203 struct inode *inode = file->f_mapping->host;
1204 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1205 struct buffer_head *di_bh = NULL;
1206 struct ocfs2_dinode *di;
1207 struct ocfs2_alloc_context *data_ac = NULL;
1208 struct ocfs2_alloc_context *meta_ac = NULL;
1209 handle_t *handle;
1210 struct ocfs2_write_ctxt wc;
1211
1212 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
1213
1214 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1215 if (ret) {
1216 mlog_errno(ret);
1217 goto out;
1218 }
1219 di = (struct ocfs2_dinode *)di_bh->b_data;
1220
1221 /*
1222 * Take alloc sem here to prevent concurrent lookups. That way
1223 * the mapping, zeroing and tree manipulation within
1224 * ocfs2_write() will be safe against ->readpage(). This
1225 * should also serve to lock out allocation from a shared
1226 * writeable region.
1227 */
1228 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1229
1230 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
1231 if (ret) {
1232 mlog_errno(ret);
1233 goto out_meta;
1234 }
1235
1236 /* phys == 0 means that allocation is required. */
1237 if (phys == 0) {
1238 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
1239 if (ret) {
1240 mlog_errno(ret);
1241 goto out_meta;
1242 }
1243
1244 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
1245 }
1246
1247 ret = ocfs2_data_lock(inode, 1);
1248 if (ret) {
1249 mlog_errno(ret);
1250 goto out_meta;
1251 }
1252
1253 handle = ocfs2_start_trans(osb, credits);
1254 if (IS_ERR(handle)) {
1255 ret = PTR_ERR(handle);
1256 mlog_errno(ret);
1257 goto out_data;
1258 }
1259
1260 written = ocfs2_write(file, phys, handle, di_bh, data_ac,
1261 meta_ac, &wc);
1262 if (written < 0) {
1263 ret = written;
1264 mlog_errno(ret);
1265 goto out_commit;
1266 }
1267
1268 ret = ocfs2_journal_access(handle, inode, di_bh,
1269 OCFS2_JOURNAL_ACCESS_WRITE);
1270 if (ret) {
1271 mlog_errno(ret);
1272 goto out_commit;
1273 }
1274
1275 pos += written;
1276 if (pos > inode->i_size) {
1277 i_size_write(inode, pos);
1278 mark_inode_dirty(inode);
1279 }
1280 inode->i_blocks = ocfs2_inode_sector_count(inode);
1281 di->i_size = cpu_to_le64((u64)i_size_read(inode));
1282 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1283 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1284 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1285
1286 ret = ocfs2_journal_dirty(handle, di_bh);
1287 if (ret)
1288 mlog_errno(ret);
1289
1290out_commit:
1291 ocfs2_commit_trans(osb, handle);
1292
1293out_data:
1294 ocfs2_data_unlock(inode, 1);
1295
1296out_meta:
1297 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1298 ocfs2_meta_unlock(inode, 1);
1299
1300out:
1301 brelse(di_bh);
1302 if (data_ac)
1303 ocfs2_free_alloc_context(data_ac);
1304 if (meta_ac)
1305 ocfs2_free_alloc_context(meta_ac);
1306
1307 return written ? written : ret;
1308}
1309
678const struct address_space_operations ocfs2_aops = { 1310const struct address_space_operations ocfs2_aops = {
679 .readpage = ocfs2_readpage, 1311 .readpage = ocfs2_readpage,
680 .writepage = ocfs2_writepage, 1312 .writepage = ocfs2_writepage,
681 .prepare_write = ocfs2_prepare_write,
682 .commit_write = ocfs2_commit_write,
683 .bmap = ocfs2_bmap, 1313 .bmap = ocfs2_bmap,
684 .sync_page = block_sync_page, 1314 .sync_page = block_sync_page,
685 .direct_IO = ocfs2_direct_IO, 1315 .direct_IO = ocfs2_direct_IO,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f446a15eab88..45821d479b5a 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
30 unsigned from, 30 unsigned from,
31 unsigned to); 31 unsigned to);
32 32
33int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
34 struct inode *inode, unsigned int from,
35 unsigned int to, int new);
36
37int walk_page_buffers( handle_t *handle,
38 struct buffer_head *head,
39 unsigned from,
40 unsigned to,
41 int *partial,
42 int (*fn)( handle_t *handle,
43 struct buffer_head *bh));
44
45struct ocfs2_write_ctxt;
46typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
47 u64 *, unsigned int *, unsigned int *);
48
49ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
50 size_t count, ocfs2_page_writer *actor,
51 void *priv);
52
53struct ocfs2_write_ctxt {
54 size_t w_count;
55 loff_t w_pos;
56 u32 w_cpos;
57 unsigned int w_finished_copy;
58
59 /* This is true if page_size > cluster_size */
60 unsigned int w_large_pages;
61
62 /* Filler callback and private data */
63 ocfs2_page_writer *w_write_data_page;
64 void *w_private;
65
66 /* Only valid for the filler callback */
67 struct page *w_this_page;
68 unsigned int w_this_page_new;
69};
70
71struct ocfs2_buffered_write_priv {
72 char *b_src_buf;
73 const struct iovec *b_cur_iov; /* Current iovec */
74 size_t b_cur_off; /* Offset in the
75 * current iovec */
76};
77int ocfs2_map_and_write_user_data(struct inode *inode,
78 struct ocfs2_write_ctxt *wc,
79 u64 *p_blkno,
80 unsigned int *ret_from,
81 unsigned int *ret_to);
82
83struct ocfs2_splice_write_priv {
84 struct splice_desc *s_sd;
85 struct pipe_buffer *s_buf;
86 struct pipe_inode_info *s_pipe;
87 /* Neither offset value is ever larger than one page */
88 unsigned int s_offset;
89 unsigned int s_buf_offset;
90};
91int ocfs2_map_and_write_splice_data(struct inode *inode,
92 struct ocfs2_write_ctxt *wc,
93 u64 *p_blkno,
94 unsigned int *ret_from,
95 unsigned int *ret_to);
96
33/* all ocfs2_dio_end_io()'s fault */ 97/* all ocfs2_dio_end_io()'s fault */
34#define ocfs2_iocb_is_rw_locked(iocb) \ 98#define ocfs2_iocb_is_rw_locked(iocb) \
35 test_bit(0, (unsigned long *)&iocb->private) 99 test_bit(0, (unsigned long *)&iocb->private)
36#define ocfs2_iocb_set_rw_locked(iocb) \ 100static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
37 set_bit(0, (unsigned long *)&iocb->private) 101{
102 set_bit(0, (unsigned long *)&iocb->private);
103 if (level)
104 set_bit(1, (unsigned long *)&iocb->private);
105 else
106 clear_bit(1, (unsigned long *)&iocb->private);
107}
38#define ocfs2_iocb_clear_rw_locked(iocb) \ 108#define ocfs2_iocb_clear_rw_locked(iocb) \
39 clear_bit(0, (unsigned long *)&iocb->private) 109 clear_bit(0, (unsigned long *)&iocb->private)
40 110#define ocfs2_iocb_rw_locked_level(iocb) \
111 test_bit(1, (unsigned long *)&iocb->private)
41#endif /* OCFS2_FILE_H */ 112#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index eba282da500e..979113479c66 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -438,7 +438,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg,
438 hb_block)); 438 hb_block));
439 439
440 mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n", 440 mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n",
441 (long long)cpu_to_le64(generation), 441 (long long)generation,
442 le32_to_cpu(hb_block->hb_cksum)); 442 le32_to_cpu(hb_block->hb_cksum));
443} 443}
444 444
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 636593bf4d17..2e975c0a35e1 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -147,7 +147,7 @@ static struct kset mlog_kset = {
147 .kobj = {.name = "logmask", .ktype = &mlog_ktype}, 147 .kobj = {.name = "logmask", .ktype = &mlog_ktype},
148}; 148};
149 149
150int mlog_sys_init(struct subsystem *o2cb_subsys) 150int mlog_sys_init(struct kset *o2cb_subsys)
151{ 151{
152 int i = 0; 152 int i = 0;
153 153
@@ -157,7 +157,7 @@ int mlog_sys_init(struct subsystem *o2cb_subsys)
157 } 157 }
158 mlog_attr_ptrs[i] = NULL; 158 mlog_attr_ptrs[i] = NULL;
159 159
160 mlog_kset.subsys = o2cb_subsys; 160 kobj_set_kset_s(&mlog_kset, o2cb_subsys);
161 return kset_register(&mlog_kset); 161 return kset_register(&mlog_kset);
162} 162}
163 163
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index a42628ba9ddf..75cd877f6d42 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -278,7 +278,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
278 278
279#include <linux/kobject.h> 279#include <linux/kobject.h>
280#include <linux/sysfs.h> 280#include <linux/sysfs.h>
281int mlog_sys_init(struct subsystem *o2cb_subsys); 281int mlog_sys_init(struct kset *o2cb_subsys);
282void mlog_sys_shutdown(void); 282void mlog_sys_shutdown(void);
283 283
284#endif /* O2CLUSTER_MASKLOG_H */ 284#endif /* O2CLUSTER_MASKLOG_H */
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 4705d659fe57..bbacf7da48a4 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -46,6 +46,7 @@
46#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/reboot.h>
49 50
50#include "heartbeat.h" 51#include "heartbeat.h"
51#include "nodemanager.h" 52#include "nodemanager.h"
@@ -72,7 +73,9 @@ static void o2quo_fence_self(void)
72 /* panic spins with interrupts enabled. with preempt 73 /* panic spins with interrupts enabled. with preempt
73 * threads can still schedule, etc, etc */ 74 * threads can still schedule, etc, etc */
74 o2hb_stop_all_regions(); 75 o2hb_stop_all_regions();
75 panic("ocfs2 is very sorry to be fencing this system by panicing\n"); 76
77 printk("ocfs2 is very sorry to be fencing this system by restarting\n");
78 emergency_restart();
76} 79}
77 80
78/* Indicate that a timeout occured on a hearbeat region write. The 81/* Indicate that a timeout occured on a hearbeat region write. The
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 1d9f6acafa2e..64f6f378fd09 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -42,7 +42,6 @@ struct o2cb_attribute {
42#define O2CB_ATTR(_name, _mode, _show, _store) \ 42#define O2CB_ATTR(_name, _mode, _show, _store) \
43struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store) 43struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
44 44
45#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset)
46#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr) 45#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
47 46
48static ssize_t o2cb_interface_revision_show(char *buf) 47static ssize_t o2cb_interface_revision_show(char *buf)
@@ -79,7 +78,7 @@ static ssize_t
79o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer) 78o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
80{ 79{
81 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr); 80 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
82 struct subsystem *sbs = to_o2cb_subsys(kobj); 81 struct kset *sbs = to_kset(kobj);
83 82
84 BUG_ON(sbs != &o2cb_subsys); 83 BUG_ON(sbs != &o2cb_subsys);
85 84
@@ -93,7 +92,7 @@ o2cb_store(struct kobject * kobj, struct attribute * attr,
93 const char * buffer, size_t count) 92 const char * buffer, size_t count)
94{ 93{
95 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr); 94 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
96 struct subsystem *sbs = to_o2cb_subsys(kobj); 95 struct kset *sbs = to_kset(kobj);
97 96
98 BUG_ON(sbs != &o2cb_subsys); 97 BUG_ON(sbs != &o2cb_subsys);
99 98
@@ -112,7 +111,7 @@ int o2cb_sys_init(void)
112{ 111{
113 int ret; 112 int ret;
114 113
115 o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type; 114 o2cb_subsys.kobj.ktype = &o2cb_subsys_type;
116 ret = subsystem_register(&o2cb_subsys); 115 ret = subsystem_register(&o2cb_subsys);
117 if (ret) 116 if (ret)
118 return ret; 117 return ret;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 69caf3e12fea..0b229a9c7952 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1496,7 +1496,7 @@ static void o2net_start_connect(struct work_struct *work)
1496 sock->sk->sk_allocation = GFP_ATOMIC; 1496 sock->sk->sk_allocation = GFP_ATOMIC;
1497 1497
1498 myaddr.sin_family = AF_INET; 1498 myaddr.sin_family = AF_INET;
1499 myaddr.sin_addr.s_addr = (__force u32)mynode->nd_ipv4_address; 1499 myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
1500 myaddr.sin_port = (__force u16)htons(0); /* any port */ 1500 myaddr.sin_port = (__force u16)htons(0); /* any port */
1501 1501
1502 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, 1502 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
@@ -1521,8 +1521,8 @@ static void o2net_start_connect(struct work_struct *work)
1521 spin_unlock(&nn->nn_lock); 1521 spin_unlock(&nn->nn_lock);
1522 1522
1523 remoteaddr.sin_family = AF_INET; 1523 remoteaddr.sin_family = AF_INET;
1524 remoteaddr.sin_addr.s_addr = (__force u32)node->nd_ipv4_address; 1524 remoteaddr.sin_addr.s_addr = node->nd_ipv4_address;
1525 remoteaddr.sin_port = (__force u16)node->nd_ipv4_port; 1525 remoteaddr.sin_port = node->nd_ipv4_port;
1526 1526
1527 ret = sc->sc_sock->ops->connect(sc->sc_sock, 1527 ret = sc->sc_sock->ops->connect(sc->sc_sock,
1528 (struct sockaddr *)&remoteaddr, 1528 (struct sockaddr *)&remoteaddr,
@@ -1810,8 +1810,8 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
1810 int ret; 1810 int ret;
1811 struct sockaddr_in sin = { 1811 struct sockaddr_in sin = {
1812 .sin_family = PF_INET, 1812 .sin_family = PF_INET,
1813 .sin_addr = { .s_addr = (__force u32)addr }, 1813 .sin_addr = { .s_addr = addr },
1814 .sin_port = (__force u16)port, 1814 .sin_port = port,
1815 }; 1815 };
1816 1816
1817 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 1817 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4dae5df5e467..9606111fe89d 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,9 @@
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * New in version 8:
42 * - Replace delete inode votes with a cluster lock
43 *
41 * New in version 7: 44 * New in version 7:
42 * - DLM join domain includes the live nodemap 45 * - DLM join domain includes the live nodemap
43 * 46 *
@@ -57,7 +60,7 @@
57 * - full 64 bit i_size in the metadata lock lvbs 60 * - full 64 bit i_size in the metadata lock lvbs
58 * - introduction of "rw" lock and pushing meta/data locking down 61 * - introduction of "rw" lock and pushing meta/data locking down
59 */ 62 */
60#define O2NET_PROTOCOL_VERSION 7ULL 63#define O2NET_PROTOCOL_VERSION 8ULL
61struct o2net_handshake { 64struct o2net_handshake {
62 __be64 protocol_version; 65 __be64 protocol_version;
63 __be64 connector_id; 66 __be64 connector_id;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 66821e178167..c441ef1f2bad 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb,
358{ 358{
359 int status; 359 int status;
360 int extend; 360 int extend;
361 u64 p_blkno; 361 u64 p_blkno, v_blkno;
362 362
363 spin_lock(&OCFS2_I(dir)->ip_lock); 363 spin_lock(&OCFS2_I(dir)->ip_lock);
364 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); 364 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
365 spin_unlock(&OCFS2_I(dir)->ip_lock); 365 spin_unlock(&OCFS2_I(dir)->ip_lock);
366 366
367 if (extend) { 367 if (extend) {
368 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, 368 u32 offset = OCFS2_I(dir)->ip_clusters;
369 parent_fe_bh, handle, 369
370 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
371 1, parent_fe_bh, handle,
370 data_ac, meta_ac, NULL); 372 data_ac, meta_ac, NULL);
371 BUG_ON(status == -EAGAIN); 373 BUG_ON(status == -EAGAIN);
372 if (status < 0) { 374 if (status < 0) {
@@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb,
375 } 377 }
376 } 378 }
377 379
378 status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> 380 v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
379 (sb->s_blocksize_bits - 9)), 381 status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
380 1, &p_blkno, NULL);
381 if (status < 0) { 382 if (status < 0) {
382 mlog_errno(status); 383 mlog_errno(status);
383 goto bail; 384 goto bail;
@@ -402,7 +403,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
402 struct buffer_head **new_de_bh) 403 struct buffer_head **new_de_bh)
403{ 404{
404 int status = 0; 405 int status = 0;
405 int credits, num_free_extents; 406 int credits, num_free_extents, drop_alloc_sem = 0;
406 loff_t dir_i_size; 407 loff_t dir_i_size;
407 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 408 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
408 struct ocfs2_alloc_context *data_ac = NULL; 409 struct ocfs2_alloc_context *data_ac = NULL;
@@ -451,6 +452,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
451 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; 452 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
452 } 453 }
453 454
455 down_write(&OCFS2_I(dir)->ip_alloc_sem);
456 drop_alloc_sem = 1;
457
454 handle = ocfs2_start_trans(osb, credits); 458 handle = ocfs2_start_trans(osb, credits);
455 if (IS_ERR(handle)) { 459 if (IS_ERR(handle)) {
456 status = PTR_ERR(handle); 460 status = PTR_ERR(handle);
@@ -486,7 +490,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
486 490
487 dir_i_size += dir->i_sb->s_blocksize; 491 dir_i_size += dir->i_sb->s_blocksize;
488 i_size_write(dir, dir_i_size); 492 i_size_write(dir, dir_i_size);
489 dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); 493 dir->i_blocks = ocfs2_inode_sector_count(dir);
490 status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); 494 status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
491 if (status < 0) { 495 if (status < 0) {
492 mlog_errno(status); 496 mlog_errno(status);
@@ -496,6 +500,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
496 *new_de_bh = new_bh; 500 *new_de_bh = new_bh;
497 get_bh(*new_de_bh); 501 get_bh(*new_de_bh);
498bail: 502bail:
503 if (drop_alloc_sem)
504 up_write(&OCFS2_I(dir)->ip_alloc_sem);
499 if (handle) 505 if (handle)
500 ocfs2_commit_trans(osb, handle); 506 ocfs2_commit_trans(osb, handle);
501 507
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 241cad342a48..2fd8bded38f3 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -312,8 +312,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
312 past->type != DLM_BAST) { 312 past->type != DLM_BAST) {
313 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" 313 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
314 "name=%.*s\n", past->type, 314 "name=%.*s\n", past->type,
315 dlm_get_lock_cookie_node(be64_to_cpu(cookie)), 315 dlm_get_lock_cookie_node(cookie),
316 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), 316 dlm_get_lock_cookie_seq(cookie),
317 locklen, name); 317 locklen, name);
318 ret = DLM_IVLOCKID; 318 ret = DLM_IVLOCKID;
319 goto leave; 319 goto leave;
@@ -324,8 +324,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
324 mlog(0, "got %sast for unknown lockres! " 324 mlog(0, "got %sast for unknown lockres! "
325 "cookie=%u:%llu, name=%.*s, namelen=%u\n", 325 "cookie=%u:%llu, name=%.*s, namelen=%u\n",
326 past->type == DLM_AST ? "" : "b", 326 past->type == DLM_AST ? "" : "b",
327 dlm_get_lock_cookie_node(be64_to_cpu(cookie)), 327 dlm_get_lock_cookie_node(cookie),
328 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), 328 dlm_get_lock_cookie_seq(cookie),
329 locklen, name, locklen); 329 locklen, name, locklen);
330 ret = DLM_IVLOCKID; 330 ret = DLM_IVLOCKID;
331 goto leave; 331 goto leave;
@@ -370,8 +370,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
370 370
371 mlog(0, "got %sast for unknown lock! cookie=%u:%llu, " 371 mlog(0, "got %sast for unknown lock! cookie=%u:%llu, "
372 "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 372 "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b",
373 dlm_get_lock_cookie_node(be64_to_cpu(cookie)), 373 dlm_get_lock_cookie_node(cookie),
374 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), 374 dlm_get_lock_cookie_seq(cookie),
375 locklen, name, locklen); 375 locklen, name, locklen);
376 376
377 ret = DLM_NORMAL; 377 ret = DLM_NORMAL;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index c558442a0b44..d836b98dd99a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -430,11 +430,10 @@ redo_bucket:
430 430
431 dlm_lockres_put(res); 431 dlm_lockres_put(res);
432 432
433 cond_resched_lock(&dlm->spinlock);
434
435 if (dropped) 433 if (dropped)
436 goto redo_bucket; 434 goto redo_bucket;
437 } 435 }
436 cond_resched_lock(&dlm->spinlock);
438 num += n; 437 num += n;
439 mlog(0, "%s: touched %d lockreses in bucket %d " 438 mlog(0, "%s: touched %d lockreses in bucket %d "
440 "(tot=%d)\n", dlm->name, n, i, num); 439 "(tot=%d)\n", dlm->name, n, i, num);
@@ -1035,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1035{ 1034{
1036 int status = 0, tmpstat, node; 1035 int status = 0, tmpstat, node;
1037 struct domain_join_ctxt *ctxt; 1036 struct domain_join_ctxt *ctxt;
1038 enum dlm_query_join_response response; 1037 enum dlm_query_join_response response = JOIN_DISALLOW;
1039 1038
1040 mlog_entry("%p", dlm); 1039 mlog_entry("%p", dlm);
1041 1040
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index de952eba29a9..d4e46d067edd 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -263,8 +263,7 @@ static void dlmfs_init_once(void *foo,
263 struct dlmfs_inode_private *ip = 263 struct dlmfs_inode_private *ip =
264 (struct dlmfs_inode_private *) foo; 264 (struct dlmfs_inode_private *) foo;
265 265
266 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 266 if (flags & SLAB_CTOR_CONSTRUCTOR) {
267 SLAB_CTOR_CONSTRUCTOR) {
268 ip->ip_dlm = NULL; 267 ip->ip_dlm = NULL;
269 ip->ip_parent = NULL; 268 ip->ip_parent = NULL;
270 269
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 6d4a83d50152..671c4ed58ee2 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
611 } 611 }
612 } while (status != 0); 612 } while (status != 0);
613 613
614 spin_lock(&dlm_reco_state_lock);
614 switch (ndata->state) { 615 switch (ndata->state) {
615 case DLM_RECO_NODE_DATA_INIT: 616 case DLM_RECO_NODE_DATA_INIT:
616 case DLM_RECO_NODE_DATA_FINALIZE_SENT: 617 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
@@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
641 ndata->node_num, dead_node); 642 ndata->node_num, dead_node);
642 break; 643 break;
643 } 644 }
645 spin_unlock(&dlm_reco_state_lock);
644 } 646 }
645 647
646 mlog(0, "done requesting all lock info\n"); 648 mlog(0, "done requesting all lock info\n");
@@ -1767,7 +1769,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1767 /* lock is always created locally first, and 1769 /* lock is always created locally first, and
1768 * destroyed locally last. it must be on the list */ 1770 * destroyed locally last. it must be on the list */
1769 if (!lock) { 1771 if (!lock) {
1770 u64 c = ml->cookie; 1772 __be64 c = ml->cookie;
1771 mlog(ML_ERROR, "could not find local lock " 1773 mlog(ML_ERROR, "could not find local lock "
1772 "with cookie %u:%llu!\n", 1774 "with cookie %u:%llu!\n",
1773 dlm_get_lock_cookie_node(be64_to_cpu(c)), 1775 dlm_get_lock_cookie_node(be64_to_cpu(c)),
@@ -1876,7 +1878,7 @@ skip_lvb:
1876 spin_lock(&res->spinlock); 1878 spin_lock(&res->spinlock);
1877 list_for_each_entry(lock, queue, list) { 1879 list_for_each_entry(lock, queue, list) {
1878 if (lock->ml.cookie == ml->cookie) { 1880 if (lock->ml.cookie == ml->cookie) {
1879 u64 c = lock->ml.cookie; 1881 __be64 c = lock->ml.cookie;
1880 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " 1882 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
1881 "exists on this lockres!\n", dlm->name, 1883 "exists on this lockres!\n", dlm->name,
1882 res->lockname.len, res->lockname.name, 1884 res->lockname.len, res->lockname.name,
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2b264c6ba039..cebd089f8955 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -76,7 +76,7 @@ repeat:
76 goto repeat; 76 goto repeat;
77 } 77 }
78 remove_wait_queue(&res->wq, &wait); 78 remove_wait_queue(&res->wq, &wait);
79 current->state = TASK_RUNNING; 79 __set_current_state(TASK_RUNNING);
80} 80}
81 81
82int __dlm_lockres_has_locks(struct dlm_lock_resource *res) 82int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e335541727f9..024777abc8e3 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -104,6 +104,35 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
104static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, 104static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
105 struct ocfs2_lock_res *lockres); 105 struct ocfs2_lock_res *lockres);
106 106
107
108#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
109
110/* This aids in debugging situations where a bad LVB might be involved. */
111static void ocfs2_dump_meta_lvb_info(u64 level,
112 const char *function,
113 unsigned int line,
114 struct ocfs2_lock_res *lockres)
115{
116 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
117
118 mlog(level, "LVB information for %s (called from %s:%u):\n",
119 lockres->l_name, function, line);
120 mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
121 lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
122 be32_to_cpu(lvb->lvb_igeneration));
123 mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
124 (unsigned long long)be64_to_cpu(lvb->lvb_isize),
125 be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
126 be16_to_cpu(lvb->lvb_imode));
127 mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
128 "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
129 (long long)be64_to_cpu(lvb->lvb_iatime_packed),
130 (long long)be64_to_cpu(lvb->lvb_ictime_packed),
131 (long long)be64_to_cpu(lvb->lvb_imtime_packed),
132 be32_to_cpu(lvb->lvb_iattr));
133}
134
135
107/* 136/*
108 * OCFS2 Lock Resource Operations 137 * OCFS2 Lock Resource Operations
109 * 138 *
@@ -225,11 +254,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
225 .flags = 0, 254 .flags = 0,
226}; 255};
227 256
257static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
258 .get_osb = ocfs2_get_inode_osb,
259 .flags = 0,
260};
261
228static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 262static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
229{ 263{
230 return lockres->l_type == OCFS2_LOCK_TYPE_META || 264 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
231 lockres->l_type == OCFS2_LOCK_TYPE_DATA || 265 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
232 lockres->l_type == OCFS2_LOCK_TYPE_RW; 266 lockres->l_type == OCFS2_LOCK_TYPE_RW ||
267 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
233} 268}
234 269
235static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 270static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
@@ -373,6 +408,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
373 case OCFS2_LOCK_TYPE_DATA: 408 case OCFS2_LOCK_TYPE_DATA:
374 ops = &ocfs2_inode_data_lops; 409 ops = &ocfs2_inode_data_lops;
375 break; 410 break;
411 case OCFS2_LOCK_TYPE_OPEN:
412 ops = &ocfs2_inode_open_lops;
413 break;
376 default: 414 default:
377 mlog_bug_on_msg(1, "type: %d\n", type); 415 mlog_bug_on_msg(1, "type: %d\n", type);
378 ops = NULL; /* thanks, gcc */ 416 ops = NULL; /* thanks, gcc */
@@ -1129,6 +1167,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1129 goto bail; 1167 goto bail;
1130 } 1168 }
1131 1169
1170 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
1171 if (ret) {
1172 mlog_errno(ret);
1173 goto bail;
1174 }
1175
1132bail: 1176bail:
1133 mlog_exit(ret); 1177 mlog_exit(ret);
1134 return ret; 1178 return ret;
@@ -1182,6 +1226,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
1182 mlog_exit_void(); 1226 mlog_exit_void();
1183} 1227}
1184 1228
1229/*
1230 * ocfs2_open_lock always get PR mode lock.
1231 */
1232int ocfs2_open_lock(struct inode *inode)
1233{
1234 int status = 0;
1235 struct ocfs2_lock_res *lockres;
1236 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1237
1238 BUG_ON(!inode);
1239
1240 mlog_entry_void();
1241
1242 mlog(0, "inode %llu take PRMODE open lock\n",
1243 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1244
1245 if (ocfs2_mount_local(osb))
1246 goto out;
1247
1248 lockres = &OCFS2_I(inode)->ip_open_lockres;
1249
1250 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1251 LKM_PRMODE, 0, 0);
1252 if (status < 0)
1253 mlog_errno(status);
1254
1255out:
1256 mlog_exit(status);
1257 return status;
1258}
1259
1260int ocfs2_try_open_lock(struct inode *inode, int write)
1261{
1262 int status = 0, level;
1263 struct ocfs2_lock_res *lockres;
1264 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1265
1266 BUG_ON(!inode);
1267
1268 mlog_entry_void();
1269
1270 mlog(0, "inode %llu try to take %s open lock\n",
1271 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1272 write ? "EXMODE" : "PRMODE");
1273
1274 if (ocfs2_mount_local(osb))
1275 goto out;
1276
1277 lockres = &OCFS2_I(inode)->ip_open_lockres;
1278
1279 level = write ? LKM_EXMODE : LKM_PRMODE;
1280
1281 /*
1282 * The file system may already holding a PRMODE/EXMODE open lock.
1283 * Since we pass LKM_NOQUEUE, the request won't block waiting on
1284 * other nodes and the -EAGAIN will indicate to the caller that
1285 * this inode is still in use.
1286 */
1287 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1288 level, LKM_NOQUEUE, 0);
1289
1290out:
1291 mlog_exit(status);
1292 return status;
1293}
1294
1295/*
1296 * ocfs2_open_unlock unlock PR and EX mode open locks.
1297 */
1298void ocfs2_open_unlock(struct inode *inode)
1299{
1300 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
1301 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1302
1303 mlog_entry_void();
1304
1305 mlog(0, "inode %llu drop open lock\n",
1306 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1307
1308 if (ocfs2_mount_local(osb))
1309 goto out;
1310
1311 if(lockres->l_ro_holders)
1312 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1313 LKM_PRMODE);
1314 if(lockres->l_ex_holders)
1315 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1316 LKM_EXMODE);
1317
1318out:
1319 mlog_exit_void();
1320}
1321
1185int ocfs2_data_lock_full(struct inode *inode, 1322int ocfs2_data_lock_full(struct inode *inode,
1186 int write, 1323 int write,
1187 int arg_flags) 1324 int arg_flags)
@@ -1387,8 +1524,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1387 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) 1524 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
1388 inode->i_blocks = 0; 1525 inode->i_blocks = 0;
1389 else 1526 else
1390 inode->i_blocks = 1527 inode->i_blocks = ocfs2_inode_sector_count(inode);
1391 ocfs2_align_bytes_to_sectors(i_size_read(inode));
1392 1528
1393 inode->i_uid = be32_to_cpu(lvb->lvb_iuid); 1529 inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
1394 inode->i_gid = be32_to_cpu(lvb->lvb_igid); 1530 inode->i_gid = be32_to_cpu(lvb->lvb_igid);
@@ -1479,12 +1615,15 @@ static int ocfs2_meta_lock_update(struct inode *inode,
1479{ 1615{
1480 int status = 0; 1616 int status = 0;
1481 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1617 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1482 struct ocfs2_lock_res *lockres = NULL; 1618 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1483 struct ocfs2_dinode *fe; 1619 struct ocfs2_dinode *fe;
1484 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1620 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1485 1621
1486 mlog_entry_void(); 1622 mlog_entry_void();
1487 1623
1624 if (ocfs2_mount_local(osb))
1625 goto bail;
1626
1488 spin_lock(&oi->ip_lock); 1627 spin_lock(&oi->ip_lock);
1489 if (oi->ip_flags & OCFS2_INODE_DELETED) { 1628 if (oi->ip_flags & OCFS2_INODE_DELETED) {
1490 mlog(0, "Orphaned inode %llu was deleted while we " 1629 mlog(0, "Orphaned inode %llu was deleted while we "
@@ -1496,22 +1635,16 @@ static int ocfs2_meta_lock_update(struct inode *inode,
1496 } 1635 }
1497 spin_unlock(&oi->ip_lock); 1636 spin_unlock(&oi->ip_lock);
1498 1637
1499 if (!ocfs2_mount_local(osb)) { 1638 if (!ocfs2_should_refresh_lock_res(lockres))
1500 lockres = &oi->ip_meta_lockres; 1639 goto bail;
1501
1502 if (!ocfs2_should_refresh_lock_res(lockres))
1503 goto bail;
1504 }
1505 1640
1506 /* This will discard any caching information we might have had 1641 /* This will discard any caching information we might have had
1507 * for the inode metadata. */ 1642 * for the inode metadata. */
1508 ocfs2_metadata_cache_purge(inode); 1643 ocfs2_metadata_cache_purge(inode);
1509 1644
1510 /* will do nothing for inode types that don't use the extent
1511 * map (directories, bitmap files, etc) */
1512 ocfs2_extent_map_trunc(inode, 0); 1645 ocfs2_extent_map_trunc(inode, 0);
1513 1646
1514 if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) { 1647 if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
1515 mlog(0, "Trusting LVB on inode %llu\n", 1648 mlog(0, "Trusting LVB on inode %llu\n",
1516 (unsigned long long)oi->ip_blkno); 1649 (unsigned long long)oi->ip_blkno);
1517 ocfs2_refresh_inode_from_lvb(inode); 1650 ocfs2_refresh_inode_from_lvb(inode);
@@ -1558,8 +1691,7 @@ static int ocfs2_meta_lock_update(struct inode *inode,
1558 1691
1559 status = 0; 1692 status = 0;
1560bail_refresh: 1693bail_refresh:
1561 if (lockres) 1694 ocfs2_complete_lock_res_refresh(lockres, status);
1562 ocfs2_complete_lock_res_refresh(lockres, status);
1563bail: 1695bail:
1564 mlog_exit(status); 1696 mlog_exit(status);
1565 return status; 1697 return status;
@@ -1630,7 +1762,6 @@ int ocfs2_meta_lock_full(struct inode *inode,
1630 wait_event(osb->recovery_event, 1762 wait_event(osb->recovery_event,
1631 ocfs2_node_map_is_empty(osb, &osb->recovery_map)); 1763 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1632 1764
1633 acquired = 0;
1634 lockres = &OCFS2_I(inode)->ip_meta_lockres; 1765 lockres = &OCFS2_I(inode)->ip_meta_lockres;
1635 level = ex ? LKM_EXMODE : LKM_PRMODE; 1766 level = ex ? LKM_EXMODE : LKM_PRMODE;
1636 dlm_flags = 0; 1767 dlm_flags = 0;
@@ -2458,13 +2589,20 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2458 * ocfs2_clear_inode has done it for us. */ 2589 * ocfs2_clear_inode has done it for us. */
2459 2590
2460 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2591 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2461 &OCFS2_I(inode)->ip_data_lockres); 2592 &OCFS2_I(inode)->ip_open_lockres);
2462 if (err < 0) 2593 if (err < 0)
2463 mlog_errno(err); 2594 mlog_errno(err);
2464 2595
2465 status = err; 2596 status = err;
2466 2597
2467 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2598 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2599 &OCFS2_I(inode)->ip_data_lockres);
2600 if (err < 0)
2601 mlog_errno(err);
2602 if (err < 0 && !status)
2603 status = err;
2604
2605 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2468 &OCFS2_I(inode)->ip_meta_lockres); 2606 &OCFS2_I(inode)->ip_meta_lockres);
2469 if (err < 0) 2607 if (err < 0)
2470 mlog_errno(err); 2608 mlog_errno(err);
@@ -2969,28 +3107,3 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
2969 3107
2970 mlog_exit_void(); 3108 mlog_exit_void();
2971} 3109}
2972
2973/* This aids in debugging situations where a bad LVB might be involved. */
2974void ocfs2_dump_meta_lvb_info(u64 level,
2975 const char *function,
2976 unsigned int line,
2977 struct ocfs2_lock_res *lockres)
2978{
2979 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
2980
2981 mlog(level, "LVB information for %s (called from %s:%u):\n",
2982 lockres->l_name, function, line);
2983 mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
2984 lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
2985 be32_to_cpu(lvb->lvb_igeneration));
2986 mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
2987 (unsigned long long)be64_to_cpu(lvb->lvb_isize),
2988 be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
2989 be16_to_cpu(lvb->lvb_imode));
2990 mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
2991 "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
2992 (long long)be64_to_cpu(lvb->lvb_iatime_packed),
2993 (long long)be64_to_cpu(lvb->lvb_ictime_packed),
2994 (long long)be64_to_cpu(lvb->lvb_imtime_packed),
2995 be32_to_cpu(lvb->lvb_iattr));
2996}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index c343fca68cf1..492bad32a8c0 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
80 int write); 80 int write);
81int ocfs2_rw_lock(struct inode *inode, int write); 81int ocfs2_rw_lock(struct inode *inode, int write);
82void ocfs2_rw_unlock(struct inode *inode, int write); 82void ocfs2_rw_unlock(struct inode *inode, int write);
83int ocfs2_open_lock(struct inode *inode);
84int ocfs2_try_open_lock(struct inode *inode, int write);
85void ocfs2_open_unlock(struct inode *inode);
83int ocfs2_meta_lock_atime(struct inode *inode, 86int ocfs2_meta_lock_atime(struct inode *inode,
84 struct vfsmount *vfsmnt, 87 struct vfsmount *vfsmnt,
85 int *level); 88 int *level);
@@ -116,11 +119,4 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
116struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); 119struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
117void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); 120void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
118 121
119/* aids in debugging and tracking lvbs */
120void ocfs2_dump_meta_lvb_info(u64 level,
121 const char *function,
122 unsigned int line,
123 struct ocfs2_lock_res *lockres);
124#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
125
126#endif /* DLMGLUE_H */ 122#endif /* DLMGLUE_H */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 56e1fefc1205..bc48177bd183 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -140,7 +140,7 @@ bail:
140 return parent; 140 return parent;
141} 141}
142 142
143static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len, 143static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
144 int connectable) 144 int connectable)
145{ 145{
146 struct inode *inode = dentry->d_inode; 146 struct inode *inode = dentry->d_inode;
@@ -148,6 +148,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
148 int type = 1; 148 int type = 1;
149 u64 blkno; 149 u64 blkno;
150 u32 generation; 150 u32 generation;
151 __le32 *fh = (__force __le32 *) fh_in;
151 152
152 mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry, 153 mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
153 dentry->d_name.len, dentry->d_name.name, 154 dentry->d_name.len, dentry->d_name.name,
@@ -199,7 +200,7 @@ bail:
199 return type; 200 return type;
200} 201}
201 202
202static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh, 203static struct dentry *ocfs2_decode_fh(struct super_block *sb, u32 *fh_in,
203 int fh_len, int fileid_type, 204 int fh_len, int fileid_type,
204 int (*acceptable)(void *context, 205 int (*acceptable)(void *context,
205 struct dentry *de), 206 struct dentry *de),
@@ -207,6 +208,7 @@ static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
207{ 208{
208 struct ocfs2_inode_handle handle, parent; 209 struct ocfs2_inode_handle handle, parent;
209 struct dentry *ret = NULL; 210 struct dentry *ret = NULL;
211 __le32 *fh = (__force __le32 *) fh_in;
210 212
211 mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n", 213 mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
212 sb, fh, fh_len, fileid_type, acceptable, context); 214 sb, fh, fh_len, fileid_type, acceptable, context);
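The __force casts above exist because the handle words are little-endian on disk and on the wire, while the VFS hands the filesystem plain u32 pointers. A self-contained illustration of the byte-order discipline; this models cpu_to_le32() in userspace and assumes the encode path packs a 64-bit block number into two LE words (a sketch, not the kernel code):

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for cpu_to_le32(): identity on little-endian
 * hosts, byte swap (GCC/Clang builtin) on big-endian hosts. */
static uint32_t to_le32(uint32_t v)
{
	const union { uint16_t u16; uint8_t u8; } probe = { .u16 = 1 };

	return probe.u8 ? v : __builtin_bswap32(v);
}

int main(void)
{
	uint32_t fh[2];
	uint64_t blkno = 0x1122334455667788ULL;

	fh[0] = to_le32((uint32_t)(blkno >> 32));
	fh[1] = to_le32((uint32_t)(blkno & 0xffffffff));
	printf("%08x %08x\n", fh[0], fh[1]);
	return 0;
}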
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 80ac69f11d9f..ba2b2ab1c6e4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -3,8 +3,7 @@
3 * 3 *
4 * extent_map.c 4 * extent_map.c
5 * 5 *
6 * In-memory extent map for OCFS2. Man, this code was prettier in 6 * Block/Cluster mapping functions
7 * the library.
8 * 7 *
9 * Copyright (C) 2004 Oracle. All rights reserved. 8 * Copyright (C) 2004 Oracle. All rights reserved.
10 * 9 *
@@ -26,1016 +25,528 @@
26#include <linux/fs.h> 25#include <linux/fs.h>
27#include <linux/init.h> 26#include <linux/init.h>
28#include <linux/types.h> 27#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31 28
32#define MLOG_MASK_PREFIX ML_EXTENT_MAP 29#define MLOG_MASK_PREFIX ML_EXTENT_MAP
33#include <cluster/masklog.h> 30#include <cluster/masklog.h>
34 31
35#include "ocfs2.h" 32#include "ocfs2.h"
36 33
34#include "alloc.h"
37#include "extent_map.h" 35#include "extent_map.h"
38#include "inode.h" 36#include "inode.h"
39#include "super.h" 37#include "super.h"
40 38
41#include "buffer_head_io.h" 39#include "buffer_head_io.h"
42 40
43
44/* 41/*
45 * SUCK SUCK SUCK 42 * The extent caching implementation is intentionally trivial.
46 * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
47 */
48
49struct ocfs2_extent_map_entry {
50 struct rb_node e_node;
51 int e_tree_depth;
52 struct ocfs2_extent_rec e_rec;
53};
54
55struct ocfs2_em_insert_context {
56 int need_left;
57 int need_right;
58 struct ocfs2_extent_map_entry *new_ent;
59 struct ocfs2_extent_map_entry *old_ent;
60 struct ocfs2_extent_map_entry *left_ent;
61 struct ocfs2_extent_map_entry *right_ent;
62};
63
64static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
65
66
67static struct ocfs2_extent_map_entry *
68ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
69 u32 cpos, u32 clusters,
70 struct rb_node ***ret_p,
71 struct rb_node **ret_parent);
72static int ocfs2_extent_map_insert(struct inode *inode,
73 struct ocfs2_extent_rec *rec,
74 int tree_depth);
75static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
76 struct ocfs2_extent_map_entry *ent);
77static int ocfs2_extent_map_find_leaf(struct inode *inode,
78 u32 cpos, u32 clusters,
79 struct ocfs2_extent_list *el);
80static int ocfs2_extent_map_lookup_read(struct inode *inode,
81 u32 cpos, u32 clusters,
82 struct ocfs2_extent_map_entry **ret_ent);
83static int ocfs2_extent_map_try_insert(struct inode *inode,
84 struct ocfs2_extent_rec *rec,
85 int tree_depth,
86 struct ocfs2_em_insert_context *ctxt);
87
88/* returns 1 only if the rec contains all the given clusters -- that is that
89 * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
90 * clusters) is >= the argument's endpoint */
91static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
92 u32 cpos, u32 clusters)
93{
94 if (le32_to_cpu(rec->e_cpos) > cpos)
95 return 0;
96 if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
97 le32_to_cpu(rec->e_clusters))
98 return 0;
99 return 1;
100}
101
102
103/*
104 * Find an entry in the tree that intersects the region passed in.
105 * Note that this will find straddled intervals, it is up to the
106 * callers to enforce any boundary conditions.
107 *
108 * Callers must hold ip_lock. This lookup is not guaranteed to return
109 * a tree_depth 0 match, and as such can race inserts if the lock
110 * were not held.
111 * 43 *
112 * The rb_node garbage lets insertion share the search. Trivial 44 * We only cache a small number of extents stored directly on the
113 * callers pass NULL. 45 * inode, so linear order operations are acceptable. If we ever want
46 * to increase the size of the extent map, then these algorithms must
47 * get smarter.
114 */ 48 */
115static struct ocfs2_extent_map_entry * 49
116ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, 50void ocfs2_extent_map_init(struct inode *inode)
117 u32 cpos, u32 clusters,
118 struct rb_node ***ret_p,
119 struct rb_node **ret_parent)
120{ 51{
121 struct rb_node **p = &em->em_extents.rb_node; 52 struct ocfs2_inode_info *oi = OCFS2_I(inode);
122 struct rb_node *parent = NULL;
123 struct ocfs2_extent_map_entry *ent = NULL;
124
125 while (*p)
126 {
127 parent = *p;
128 ent = rb_entry(parent, struct ocfs2_extent_map_entry,
129 e_node);
130 if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
131 p = &(*p)->rb_left;
132 ent = NULL;
133 } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
134 le32_to_cpu(ent->e_rec.e_clusters))) {
135 p = &(*p)->rb_right;
136 ent = NULL;
137 } else
138 break;
139 }
140 53
141 if (ret_p != NULL) 54 oi->ip_extent_map.em_num_items = 0;
142 *ret_p = p; 55 INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
143 if (ret_parent != NULL)
144 *ret_parent = parent;
145 return ent;
146} 56}
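The em_list consumed by the lookup below is maintained in most-recently-used order: every hit is list_move()d back to the head, so the tail is always the coldest entry. A self-contained sketch of that discipline over a plain array, standing in for the kernel's list_head machinery:

#include <string.h>

#define ITEMS 3	/* mirrors OCFS2_MAX_EXTENT_MAP_ITEMS below */

struct item { unsigned int cpos, phys, clusters; };

/* On a hit, rotate the entry to slot 0 -- the same effect the real
 * code gets from list_move() to the list head. */
static struct item *mru_lookup(struct item *mru, int used, unsigned int cpos)
{
	for (int i = 0; i < used; i++) {
		if (cpos >= mru[i].cpos &&
		    cpos < mru[i].cpos + mru[i].clusters) {
			struct item hit = mru[i];

			memmove(&mru[1], &mru[0], i * sizeof(mru[0]));
			mru[0] = hit;
			return &mru[0];
		}
	}
	return NULL;
}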
147 57
148/* 58static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
149 * Find the leaf containing the interval we want. While we're on our 59 unsigned int cpos,
150 * way down the tree, fill in every record we see at any depth, because 60 struct ocfs2_extent_map_item **ret_emi)
151 * we might want it later.
152 *
153 * Note that this code is run without ip_lock. That's because it
154 * sleeps while reading. If someone is also filling the extent list at
155 * the same time we are, we might have to restart.
156 */
157static int ocfs2_extent_map_find_leaf(struct inode *inode,
158 u32 cpos, u32 clusters,
159 struct ocfs2_extent_list *el)
160{ 61{
161 int i, ret; 62 unsigned int range;
162 struct buffer_head *eb_bh = NULL; 63 struct ocfs2_extent_map_item *emi;
163 u64 blkno;
164 u32 rec_end;
165 struct ocfs2_extent_block *eb;
166 struct ocfs2_extent_rec *rec;
167
168 /*
169 * The bh data containing the el cannot change here, because
170 * we hold alloc_sem. So we can do this without other
171 * locks.
172 */
173 while (el->l_tree_depth)
174 {
175 blkno = 0;
176 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
177 rec = &el->l_recs[i];
178 rec_end = (le32_to_cpu(rec->e_cpos) +
179 le32_to_cpu(rec->e_clusters));
180
181 ret = -EBADR;
182 if (rec_end > OCFS2_I(inode)->ip_clusters) {
183 mlog_errno(ret);
184 ocfs2_error(inode->i_sb,
185 "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
186 i,
187 (unsigned long long)le64_to_cpu(rec->e_blkno),
188 (unsigned long long)OCFS2_I(inode)->ip_blkno,
189 OCFS2_I(inode)->ip_clusters);
190 goto out_free;
191 }
192
193 if (rec_end <= cpos) {
194 ret = ocfs2_extent_map_insert(inode, rec,
195 le16_to_cpu(el->l_tree_depth));
196 if (ret && (ret != -EEXIST)) {
197 mlog_errno(ret);
198 goto out_free;
199 }
200 continue;
201 }
202 if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
203 ret = ocfs2_extent_map_insert(inode, rec,
204 le16_to_cpu(el->l_tree_depth));
205 if (ret && (ret != -EEXIST)) {
206 mlog_errno(ret);
207 goto out_free;
208 }
209 continue;
210 }
211 64
212 /* 65 *ret_emi = NULL;
213 * We've found a record that matches our
214 * interval. We don't insert it because we're
215 * about to traverse it.
216 */
217
 218 /* Check to see if we're straddling */
219 ret = -ESRCH;
220 if (!ocfs2_extent_rec_contains_clusters(rec,
221 cpos,
222 clusters)) {
223 mlog_errno(ret);
224 goto out_free;
225 }
226 66
227 /* 67 list_for_each_entry(emi, &em->em_list, ei_list) {
228 * If we've already found a record, the el has 68 range = emi->ei_cpos + emi->ei_clusters;
229 * two records covering the same interval.
230 * EEEK!
231 */
232 ret = -EBADR;
233 if (blkno) {
234 mlog_errno(ret);
235 ocfs2_error(inode->i_sb,
236 "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
237 cpos, clusters,
238 (unsigned long long)OCFS2_I(inode)->ip_blkno,
239 (unsigned long long)blkno, i,
240 (unsigned long long)le64_to_cpu(rec->e_blkno));
241 goto out_free;
242 }
243 69
244 blkno = le64_to_cpu(rec->e_blkno); 70 if (cpos >= emi->ei_cpos && cpos < range) {
245 } 71 list_move(&emi->ei_list, &em->em_list);
246 72
247 /* 73 *ret_emi = emi;
248 * We don't support holes, and we're still up 74 break;
249 * in the branches, so we'd better have found someone
250 */
251 ret = -EBADR;
252 if (!blkno) {
253 ocfs2_error(inode->i_sb,
254 "No record found for (cpos = %u, clusters = %u) on inode %llu\n",
255 cpos, clusters,
256 (unsigned long long)OCFS2_I(inode)->ip_blkno);
257 mlog_errno(ret);
258 goto out_free;
259 }
260
261 if (eb_bh) {
262 brelse(eb_bh);
263 eb_bh = NULL;
264 }
265 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
266 blkno, &eb_bh, OCFS2_BH_CACHED,
267 inode);
268 if (ret) {
269 mlog_errno(ret);
270 goto out_free;
271 }
272 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
273 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
274 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
275 ret = -EIO;
276 goto out_free;
277 } 75 }
278 el = &eb->h_list;
279 } 76 }
77}
280 78
281 BUG_ON(el->l_tree_depth); 79static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
282 80 unsigned int *phys, unsigned int *len,
283 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 81 unsigned int *flags)
284 rec = &el->l_recs[i]; 82{
285 83 unsigned int coff;
286 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > 84 struct ocfs2_inode_info *oi = OCFS2_I(inode);
287 OCFS2_I(inode)->ip_clusters) { 85 struct ocfs2_extent_map_item *emi;
288 ret = -EBADR; 86
289 mlog_errno(ret); 87 spin_lock(&oi->ip_lock);
290 ocfs2_error(inode->i_sb, 88
291 "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", 89 __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
292 i, 90 if (emi) {
293 (unsigned long long)le64_to_cpu(rec->e_blkno), 91 coff = cpos - emi->ei_cpos;
294 (unsigned long long)OCFS2_I(inode)->ip_blkno, 92 *phys = emi->ei_phys + coff;
295 OCFS2_I(inode)->ip_clusters); 93 if (len)
296 return ret; 94 *len = emi->ei_clusters - coff;
297 } 95 if (flags)
298 96 *flags = emi->ei_flags;
299 ret = ocfs2_extent_map_insert(inode, rec,
300 le16_to_cpu(el->l_tree_depth));
301 if (ret && (ret != -EEXIST)) {
302 mlog_errno(ret);
303 goto out_free;
304 }
305 } 97 }
306 98
307 ret = 0; 99 spin_unlock(&oi->ip_lock);
308 100
309out_free: 101 if (emi == NULL)
310 if (eb_bh) 102 return -ENOENT;
311 brelse(eb_bh);
312 103
313 return ret; 104 return 0;
314} 105}
315 106
316/* 107/*
317 * This lookup actually will read from disk. It has one invariant: 108 * Forget about all clusters equal to or greater than cpos.
318 * It will never re-traverse blocks. This means that all inserts should
319 * be new regions or more granular regions (both allowed by insert).
320 */ 109 */
321static int ocfs2_extent_map_lookup_read(struct inode *inode, 110void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
322 u32 cpos,
323 u32 clusters,
324 struct ocfs2_extent_map_entry **ret_ent)
325{ 111{
326 int ret; 112 struct list_head *p, *n;
327 u64 blkno; 113 struct ocfs2_extent_map_item *emi;
328 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 114 struct ocfs2_inode_info *oi = OCFS2_I(inode);
329 struct ocfs2_extent_map_entry *ent; 115 struct ocfs2_extent_map *em = &oi->ip_extent_map;
330 struct buffer_head *bh = NULL; 116 LIST_HEAD(tmp_list);
331 struct ocfs2_extent_block *eb; 117 unsigned int range;
332 struct ocfs2_dinode *di; 118
333 struct ocfs2_extent_list *el; 119 spin_lock(&oi->ip_lock);
334 120 list_for_each_safe(p, n, &em->em_list) {
335 spin_lock(&OCFS2_I(inode)->ip_lock); 121 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
336 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); 122
337 if (ent) { 123 if (emi->ei_cpos >= cpos) {
338 if (!ent->e_tree_depth) { 124 /* Full truncate of this record. */
339 spin_unlock(&OCFS2_I(inode)->ip_lock); 125 list_move(&emi->ei_list, &tmp_list);
340 *ret_ent = ent; 126 BUG_ON(em->em_num_items == 0);
341 return 0; 127 em->em_num_items--;
342 } 128 continue;
343 blkno = le64_to_cpu(ent->e_rec.e_blkno);
344 spin_unlock(&OCFS2_I(inode)->ip_lock);
345
346 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
347 OCFS2_BH_CACHED, inode);
348 if (ret) {
349 mlog_errno(ret);
350 if (bh)
351 brelse(bh);
352 return ret;
353 } 129 }
354 eb = (struct ocfs2_extent_block *)bh->b_data;
355 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
356 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
357 brelse(bh);
358 return -EIO;
359 }
360 el = &eb->h_list;
361 } else {
362 spin_unlock(&OCFS2_I(inode)->ip_lock);
363 130
364 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 131 range = emi->ei_cpos + emi->ei_clusters;
365 OCFS2_I(inode)->ip_blkno, &bh, 132 if (range > cpos) {
366 OCFS2_BH_CACHED, inode); 133 /* Partial truncate */
367 if (ret) { 134 emi->ei_clusters = cpos - emi->ei_cpos;
368 mlog_errno(ret);
369 if (bh)
370 brelse(bh);
371 return ret;
372 } 135 }
373 di = (struct ocfs2_dinode *)bh->b_data;
374 if (!OCFS2_IS_VALID_DINODE(di)) {
375 brelse(bh);
376 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
377 return -EIO;
378 }
379 el = &di->id2.i_list;
380 }
381
382 ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
383 brelse(bh);
384 if (ret) {
385 mlog_errno(ret);
386 return ret;
387 } 136 }
137 spin_unlock(&oi->ip_lock);
388 138
389 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); 139 list_for_each_safe(p, n, &tmp_list) {
390 if (!ent) { 140 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
391 ret = -ESRCH; 141 list_del(&emi->ei_list);
392 mlog_errno(ret); 142 kfree(emi);
393 return ret;
394 } 143 }
395
396 /* FIXME: Make sure this isn't a corruption */
397 BUG_ON(ent->e_tree_depth);
398
399 *ret_ent = ent;
400
401 return 0;
402} 144}
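The truncate above has two cases worth spelling out: a record starting at or past cpos is dropped entirely, while a record straddling cpos is clipped so it ends exactly at cpos. A tiny self-contained model of the per-record decision:

/* Returns the record's new length in clusters; 0 means it would be
 * dropped from the cache. Mirrors the full/partial cases above. */
static unsigned int trunc_record(unsigned int ei_cpos,
				 unsigned int ei_clusters,
				 unsigned int cpos)
{
	if (ei_cpos >= cpos)
		return 0;			/* full truncate */
	if (ei_cpos + ei_clusters > cpos)
		return cpos - ei_cpos;		/* partial: clip the tail */
	return ei_clusters;			/* untouched */
}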
403 145
404/* 146/*
405 * Callers must hold ip_lock. This can insert pieces of the tree, 147 * Is any part of emi2 contained within emi1
406 * thus racing lookup if the lock weren't held.
407 */ 148 */
408static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, 149static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
409 struct ocfs2_extent_map_entry *ent) 150 struct ocfs2_extent_map_item *emi2)
410{ 151{
411 struct rb_node **p, *parent; 152 unsigned int range1, range2;
412 struct ocfs2_extent_map_entry *old_ent;
413 153
414 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), 154 /*
415 le32_to_cpu(ent->e_rec.e_clusters), 155 * Check if logical start of emi2 is inside emi1
416 &p, &parent); 156 */
417 if (old_ent) 157 range1 = emi1->ei_cpos + emi1->ei_clusters;
418 return -EEXIST; 158 if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
159 return 1;
419 160
420 rb_link_node(&ent->e_node, parent, p); 161 /*
421 rb_insert_color(&ent->e_node, &em->em_extents); 162 * Check if logical end of emi2 is inside emi1
163 */
164 range2 = emi2->ei_cpos + emi2->ei_clusters;
165 if (range2 > emi1->ei_cpos && range2 <= range1)
166 return 1;
422 167
423 return 0; 168 return 0;
424} 169}
425 170
171static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
172 struct ocfs2_extent_map_item *src)
173{
174 dest->ei_cpos = src->ei_cpos;
175 dest->ei_phys = src->ei_phys;
176 dest->ei_clusters = src->ei_clusters;
177 dest->ei_flags = src->ei_flags;
178}
426 179
427/* 180/*
428 * Simple rule: on any return code other than -EAGAIN, anything left 181 * Try to merge emi with ins. Returns 1 if merge succeeds, zero
429 * in the insert_context will be freed. 182 * otherwise.
430 *
431 * Simple rule #2: A return code of -EEXIST from this function or
432 * its calls to ocfs2_extent_map_insert_entry() signifies that another
433 * thread beat us to the insert. It is not an actual error, but it
434 * tells the caller we have no more work to do.
435 */ 183 */
436static int ocfs2_extent_map_try_insert(struct inode *inode, 184static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
437 struct ocfs2_extent_rec *rec, 185 struct ocfs2_extent_map_item *ins)
438 int tree_depth,
439 struct ocfs2_em_insert_context *ctxt)
440{ 186{
441 int ret;
442 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
443 struct ocfs2_extent_map_entry *old_ent;
444
445 ctxt->need_left = 0;
446 ctxt->need_right = 0;
447 ctxt->old_ent = NULL;
448
449 spin_lock(&OCFS2_I(inode)->ip_lock);
450 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
451 if (!ret) {
452 ctxt->new_ent = NULL;
453 goto out_unlock;
454 }
455
456 /* Since insert_entry failed, the map MUST have old_ent */
457 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
458 le32_to_cpu(rec->e_clusters),
459 NULL, NULL);
460
461 BUG_ON(!old_ent);
462
463 if (old_ent->e_tree_depth < tree_depth) {
464 /* Another thread beat us to the lower tree_depth */
465 ret = -EEXIST;
466 goto out_unlock;
467 }
468
469 if (old_ent->e_tree_depth == tree_depth) {
470 /*
471 * Another thread beat us to this tree_depth.
472 * Let's make sure we agree with that thread (the
473 * extent_rec should be identical).
474 */
475 if (!memcmp(rec, &old_ent->e_rec,
476 sizeof(struct ocfs2_extent_rec)))
477 ret = 0;
478 else
479 /* FIXME: Should this be ESRCH/EBADR??? */
480 ret = -EEXIST;
481
482 goto out_unlock;
483 }
484
485 /* 187 /*
486 * We do it in this order specifically so that no actual tree 188 * Handle contiguousness
487 * changes occur until we have all the pieces we need. We
488 * don't want malloc failures to leave an inconsistent tree.
489 * Whenever we drop the lock, another process could be
490 * inserting. Also note that, if another process just beat us
491 * to an insert, we might not need the same pieces we needed
492 * the first go round. In the end, the pieces we need will
493 * be used, and the pieces we don't will be freed.
494 */ 189 */
495 ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > 190 if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
496 le32_to_cpu(old_ent->e_rec.e_cpos)); 191 ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
497 ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + 192 ins->ei_flags == emi->ei_flags) {
498 le32_to_cpu(old_ent->e_rec.e_clusters)) > 193 emi->ei_clusters += ins->ei_clusters;
499 (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); 194 return 1;
500 ret = -EAGAIN; 195 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
 501 if (ctxt->need_left) { 196 (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
502 if (!ctxt->left_ent) 197 ins->ei_flags == emi->ei_flags) {
503 goto out_unlock; 198 emi->ei_phys = ins->ei_phys;
504 *(ctxt->left_ent) = *old_ent; 199 emi->ei_cpos = ins->ei_cpos;
505 ctxt->left_ent->e_rec.e_clusters = 200 emi->ei_clusters += ins->ei_clusters;
506 cpu_to_le32(le32_to_cpu(rec->e_cpos) - 201 return 1;
507 le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
508 }
509 if (ctxt->need_right) {
510 if (!ctxt->right_ent)
511 goto out_unlock;
512 *(ctxt->right_ent) = *old_ent;
513 ctxt->right_ent->e_rec.e_cpos =
514 cpu_to_le32(le32_to_cpu(rec->e_cpos) +
515 le32_to_cpu(rec->e_clusters));
516 ctxt->right_ent->e_rec.e_clusters =
517 cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
518 le32_to_cpu(old_ent->e_rec.e_clusters)) -
519 le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
520 }
521
522 rb_erase(&old_ent->e_node, &em->em_extents);
523 /* Now that he's erased, set him up for deletion */
524 ctxt->old_ent = old_ent;
525
526 if (ctxt->need_left) {
527 ret = ocfs2_extent_map_insert_entry(em,
528 ctxt->left_ent);
529 if (ret)
530 goto out_unlock;
531 ctxt->left_ent = NULL;
532 } 202 }
533 203
534 if (ctxt->need_right) { 204 /*
535 ret = ocfs2_extent_map_insert_entry(em, 205 * Overlapping extents - this shouldn't happen unless we've
536 ctxt->right_ent); 206 * split an extent to change it's flags. That is exceedingly
537 if (ret) 207 * rare, so there's no sense in trying to optimize it yet.
538 goto out_unlock; 208 */
539 ctxt->right_ent = NULL; 209 if (ocfs2_ei_is_contained(emi, ins) ||
210 ocfs2_ei_is_contained(ins, emi)) {
211 ocfs2_copy_emi_fields(emi, ins);
212 return 1;
540 } 213 }
541 214
542 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); 215 /* No merge was possible. */
543 216 return 0;
544 if (!ret)
545 ctxt->new_ent = NULL;
546
547out_unlock:
548 spin_unlock(&OCFS2_I(inode)->ip_lock);
549
550 return ret;
551} 217}
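A self-contained check of the forward-contiguity branch handled first above: a new extent beginning exactly where a cached one ends, both logically and physically, simply lengthens the cached record:

#include <assert.h>

struct ext { unsigned int cpos, phys, clusters, flags; };

/* Forward-contiguous merge only; mirrors the first branch above. */
static int merge_forward(struct ext *emi, const struct ext *ins)
{
	if (ins->phys == emi->phys + emi->clusters &&
	    ins->cpos == emi->cpos + emi->clusters &&
	    ins->flags == emi->flags) {
		emi->clusters += ins->clusters;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct ext cached = { .cpos = 0, .phys = 100, .clusters = 8 };
	struct ext incoming = { .cpos = 8, .phys = 108, .clusters = 4 };

	assert(merge_forward(&cached, &incoming));
	assert(cached.clusters == 12);
	return 0;
}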
552 218
553 219/*
554static int ocfs2_extent_map_insert(struct inode *inode, 220 * In order to reduce complexity on the caller, this insert function
555 struct ocfs2_extent_rec *rec, 221 * is intentionally liberal in what it will accept.
556 int tree_depth) 222 *
223 * The only rule is that the truncate call *must* be used whenever
224 * records have been deleted. This avoids inserting overlapping
225 * records with different physical mappings.
226 */
227void ocfs2_extent_map_insert_rec(struct inode *inode,
228 struct ocfs2_extent_rec *rec)
557{ 229{
558 int ret; 230 struct ocfs2_inode_info *oi = OCFS2_I(inode);
559 struct ocfs2_em_insert_context ctxt = {0, }; 231 struct ocfs2_extent_map *em = &oi->ip_extent_map;
560 232 struct ocfs2_extent_map_item *emi, *new_emi = NULL;
561 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > 233 struct ocfs2_extent_map_item ins;
562 OCFS2_I(inode)->ip_map.em_clusters) { 234
563 ret = -EBADR; 235 ins.ei_cpos = le32_to_cpu(rec->e_cpos);
564 mlog_errno(ret); 236 ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
565 return ret; 237 le64_to_cpu(rec->e_blkno));
238 ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
239 ins.ei_flags = rec->e_flags;
240
241search:
242 spin_lock(&oi->ip_lock);
243
244 list_for_each_entry(emi, &em->em_list, ei_list) {
245 if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
246 list_move(&emi->ei_list, &em->em_list);
247 spin_unlock(&oi->ip_lock);
248 goto out;
249 }
566 } 250 }
567 251
568 /* Zero e_clusters means a truncated tail record. It better be EOF */ 252 /*
569 if (!rec->e_clusters) { 253 * No item could be merged.
570 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != 254 *
571 OCFS2_I(inode)->ip_map.em_clusters) { 255 * Either allocate and add a new item, or overwrite the last recently
572 ret = -EBADR; 256 * inserted.
573 mlog_errno(ret); 257 */
574 ocfs2_error(inode->i_sb,
575 "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
576 (unsigned long long)le64_to_cpu(rec->e_blkno),
577 (unsigned long long)OCFS2_I(inode)->ip_blkno);
578 return ret;
579 }
580 258
581 /* Ignore the truncated tail */ 259 if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
582 return 0; 260 if (new_emi == NULL) {
583 } 261 spin_unlock(&oi->ip_lock);
584 262
585 ret = -ENOMEM; 263 new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
586 ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, 264 if (new_emi == NULL)
587 GFP_NOFS); 265 goto out;
588 if (!ctxt.new_ent) {
589 mlog_errno(ret);
590 return ret;
591 }
592 266
593 ctxt.new_ent->e_rec = *rec; 267 goto search;
594 ctxt.new_ent->e_tree_depth = tree_depth;
595
596 do {
597 ret = -ENOMEM;
598 if (ctxt.need_left && !ctxt.left_ent) {
599 ctxt.left_ent =
600 kmem_cache_alloc(ocfs2_em_ent_cachep,
601 GFP_NOFS);
602 if (!ctxt.left_ent)
603 break;
604 }
605 if (ctxt.need_right && !ctxt.right_ent) {
606 ctxt.right_ent =
607 kmem_cache_alloc(ocfs2_em_ent_cachep,
608 GFP_NOFS);
609 if (!ctxt.right_ent)
610 break;
611 } 268 }
612 269
613 ret = ocfs2_extent_map_try_insert(inode, rec, 270 ocfs2_copy_emi_fields(new_emi, &ins);
614 tree_depth, &ctxt); 271 list_add(&new_emi->ei_list, &em->em_list);
615 } while (ret == -EAGAIN); 272 em->em_num_items++;
616 273 new_emi = NULL;
617 if ((ret < 0) && (ret != -EEXIST)) 274 } else {
618 mlog_errno(ret); 275 BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
276 emi = list_entry(em->em_list.prev,
277 struct ocfs2_extent_map_item, ei_list);
278 list_move(&emi->ei_list, &em->em_list);
279 ocfs2_copy_emi_fields(emi, &ins);
280 }
619 281
620 if (ctxt.left_ent) 282 spin_unlock(&oi->ip_lock);
621 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
622 if (ctxt.right_ent)
623 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
624 if (ctxt.old_ent)
625 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
626 if (ctxt.new_ent)
627 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
628 283
629 return ret; 284out:
285 if (new_emi)
286 kfree(new_emi);
630} 287}
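Once em_num_items reaches OCFS2_MAX_EXTENT_MAP_ITEMS, the function above overwrites the list tail (the least recently used entry) instead of allocating. A sketch of that eviction over the array model used earlier:

/* Eviction sketch, reusing struct item from the lookup sketch above:
 * with the cache full, reuse the coldest slot, then rotate it to the
 * front as list_move() does. */
static void insert_full(struct item *mru, int items, struct item ins)
{
	mru[items - 1] = ins;
	for (int i = items - 1; i > 0; i--) {
		struct item tmp = mru[i];

		mru[i] = mru[i - 1];
		mru[i - 1] = tmp;
	}
}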
631 288
632/* 289/*
633 * Append this record to the tail of the extent map. It must be 290 * Return the 1st index within el which contains an extent start
634 * tree_depth 0. The record might be an extension of an existing 291 * larger than v_cluster.
635 * record, and as such that needs to be handled. eg:
636 *
637 * Existing record in the extent map:
638 *
639 * cpos = 10, len = 10
640 * |---------|
641 *
642 * New Record:
643 *
644 * cpos = 10, len = 20
645 * |------------------|
646 *
647 * The passed record is the new on-disk record. The new_clusters value
648 * is how many clusters were added to the file. If the append is a
649 * contiguous append, the new_clusters has been added to
650 * rec->e_clusters. If the append is an entirely new extent, then
651 * rec->e_clusters is == new_clusters.
652 */ 292 */
653int ocfs2_extent_map_append(struct inode *inode, 293static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
654 struct ocfs2_extent_rec *rec, 294 u32 v_cluster)
655 u32 new_clusters)
656{ 295{
657 int ret; 296 int i;
658 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 297 struct ocfs2_extent_rec *rec;
659 struct ocfs2_extent_map_entry *ent;
660 struct ocfs2_extent_rec *old;
661
662 BUG_ON(!new_clusters);
663 BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
664 298
 665 if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { 299 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
666 /* 300 rec = &el->l_recs[i];
667 * Size changed underneath us on disk. Drop any
668 * straddling records and update our idea of
669 * i_clusters
670 */
671 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
672 em->em_clusters = OCFS2_I(inode)->ip_clusters;
673 }
674 301
675 mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + 302 if (v_cluster < le32_to_cpu(rec->e_cpos))
676 le32_to_cpu(rec->e_clusters)) != 303 break;
677 (em->em_clusters + new_clusters),
678 "Inode %llu:\n"
679 "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
680 "em->em_clusters = %u + new_clusters = %u = %u\n",
681 (unsigned long long)OCFS2_I(inode)->ip_blkno,
682 le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
683 le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
684 em->em_clusters, new_clusters,
685 em->em_clusters + new_clusters);
686
687 em->em_clusters += new_clusters;
688
689 ret = -ENOENT;
690 if (le32_to_cpu(rec->e_clusters) > new_clusters) {
691 /* This is a contiguous append */
692 ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
693 NULL, NULL);
694 if (ent) {
695 old = &ent->e_rec;
696 BUG_ON((le32_to_cpu(rec->e_cpos) +
697 le32_to_cpu(rec->e_clusters)) !=
698 (le32_to_cpu(old->e_cpos) +
699 le32_to_cpu(old->e_clusters) +
700 new_clusters));
701 if (ent->e_tree_depth == 0) {
702 BUG_ON(le32_to_cpu(old->e_cpos) !=
703 le32_to_cpu(rec->e_cpos));
704 BUG_ON(le64_to_cpu(old->e_blkno) !=
705 le64_to_cpu(rec->e_blkno));
706 ret = 0;
707 }
708 /*
709 * Let non-leafs fall through as -ENOENT to
710 * force insertion of the new leaf.
711 */
712 le32_add_cpu(&old->e_clusters, new_clusters);
713 }
714 } 304 }
715 305
716 if (ret == -ENOENT) 306 return i;
717 ret = ocfs2_extent_map_insert(inode, rec, 0);
718 if (ret < 0)
719 mlog_errno(ret);
720 return ret;
721} 307}
722 308
723#if 0
724/* Code here is included but defined out as it completes the extent
725 * map api and may be used in the future. */
726
727/* 309/*
728 * Look up the record containing this cluster offset. This record is 310 * Figure out the size of a hole which starts at v_cluster within the given
729 * part of the extent map. Do not free it. Any changes you make to 311 * extent list.
730 * it will reflect in the extent map. So, if your last extent
731 * is (cpos = 10, clusters = 10) and you truncate the file by 5
732 * clusters, you can do:
733 * 312 *
734 * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); 313 * If there is no more allocation past v_cluster, we return the maximum
735 * rec->e_clusters -= 5; 314 * cluster size minus v_cluster.
736 * 315 *
737 * The lookup does not read from disk. If the map isn't filled in for 316 * If we have in-inode extents, then el points to the dinode list and
738 * an entry, you won't find it. 317 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
739 * 318 * containing el.
740 * Also note that the returned record is valid until alloc_sem is
741 * dropped. After that, truncate and extend can happen. Caveat Emptor.
742 */ 319 */
743int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, 320static int ocfs2_figure_hole_clusters(struct inode *inode,
744 struct ocfs2_extent_rec **rec, 321 struct ocfs2_extent_list *el,
745 int *tree_depth) 322 struct buffer_head *eb_bh,
323 u32 v_cluster,
324 u32 *num_clusters)
746{ 325{
747 int ret = -ENOENT; 326 int ret, i;
748 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 327 struct buffer_head *next_eb_bh = NULL;
749 struct ocfs2_extent_map_entry *ent; 328 struct ocfs2_extent_block *eb, *next_eb;
750 329
751 *rec = NULL; 330 i = ocfs2_search_for_hole_index(el, v_cluster);
752 331
753 if (cpos >= OCFS2_I(inode)->ip_clusters) 332 if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
754 return -EINVAL; 333 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
755 334
756 if (cpos >= em->em_clusters) {
757 /* 335 /*
758 * Size changed underneath us on disk. Drop any 336 * Check the next leaf for any extents.
759 * straddling records and update our idea of
760 * i_clusters
761 */ 337 */
762 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
 763 em->em_clusters = OCFS2_I(inode)->ip_clusters;
764 }
765
766 ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
767 NULL, NULL);
768 338
769 if (ent) { 339 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
770 *rec = &ent->e_rec; 340 goto no_more_extents;
771 if (tree_depth)
772 *tree_depth = ent->e_tree_depth;
773 ret = 0;
774 }
775 341
776 return ret; 342 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
777} 343 le64_to_cpu(eb->h_next_leaf_blk),
344 &next_eb_bh, OCFS2_BH_CACHED, inode);
345 if (ret) {
346 mlog_errno(ret);
347 goto out;
348 }
349 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
778 350
779int ocfs2_extent_map_get_clusters(struct inode *inode, 351 if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
780 u32 v_cpos, int count, 352 ret = -EROFS;
781 u32 *p_cpos, int *ret_count) 353 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
782{ 354 goto out;
783 int ret; 355 }
784 u32 coff, ccount;
785 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
786 struct ocfs2_extent_map_entry *ent = NULL;
787 356
788 *p_cpos = ccount = 0; 357 el = &next_eb->h_list;
789 358
790 if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) 359 i = ocfs2_search_for_hole_index(el, v_cluster);
791 return -EINVAL; 360 }
792 361
793 if ((v_cpos + count) > em->em_clusters) { 362no_more_extents:
363 if (i == le16_to_cpu(el->l_next_free_rec)) {
794 /* 364 /*
795 * Size changed underneath us on disk. Drop any 365 * We're at the end of our existing allocation. Just
796 * straddling records and update our idea of 366 * return the maximum number of clusters we could
797 * i_clusters 367 * possibly allocate.
798 */ 368 */
799 ocfs2_extent_map_drop(inode, em->em_clusters - 1); 369 *num_clusters = UINT_MAX - v_cluster;
800 em->em_clusters = OCFS2_I(inode)->ip_clusters; 370 } else {
371 *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
801 } 372 }
802 373
374 ret = 0;
375out:
376 brelse(next_eb_bh);
377 return ret;
378}
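Worked example of the hole sizing above: with leaf records covering [0,8) and [16,24) and v_cluster = 10, the search index lands on the second record, so the hole is 16 - 10 = 6 clusters; past the last record it is UINT_MAX - v_cluster. A self-contained model combining the index search with the two end cases:

#include <limits.h>

struct rec { unsigned int cpos, clusters; };

/* Userspace model of ocfs2_search_for_hole_index() plus the hole
 * sizing above (ignoring the walk to the next leaf block). */
static unsigned int hole_size(const struct rec *recs, int n, unsigned int v)
{
	int i;

	for (i = 0; i < n; i++)
		if (v < recs[i].cpos)
			break;

	return (i == n) ? UINT_MAX - v : recs[i].cpos - v;
}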
803 379
804 ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); 380/*
805 if (ret) 381 * Return the index of the extent record which contains cluster #v_cluster.
806 return ret; 382 * -1 is returned if it was not found.
383 *
384 * Should work fine on interior and exterior nodes.
385 */
386static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
387 u32 v_cluster)
388{
389 int ret = -1;
390 int i;
391 struct ocfs2_extent_rec *rec;
392 u32 rec_end, rec_start, clusters;
807 393
 808 if (ent) { 394 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
809 /* We should never find ourselves straddling an interval */ 395 rec = &el->l_recs[i];
810 if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
811 v_cpos,
812 count))
813 return -ESRCH;
814 396
815 coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); 397 rec_start = le32_to_cpu(rec->e_cpos);
816 *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, 398 clusters = ocfs2_rec_clusters(el, rec);
817 le64_to_cpu(ent->e_rec.e_blkno)) +
818 coff;
819 399
820 if (ret_count) 400 rec_end = rec_start + clusters;
821 *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
822 401
823 return 0; 402 if (v_cluster >= rec_start && v_cluster < rec_end) {
403 ret = i;
404 break;
405 }
824 } 406 }
825 407
826 408 return ret;
827 return -ENOENT;
828} 409}
829 410
830#endif /* 0 */ 411int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
831 412 u32 *p_cluster, u32 *num_clusters,
832int ocfs2_extent_map_get_blocks(struct inode *inode, 413 unsigned int *extent_flags)
833 u64 v_blkno, int count,
834 u64 *p_blkno, int *ret_count)
835{ 414{
836 int ret; 415 int ret, i;
837 u64 boff; 416 unsigned int flags = 0;
838 u32 cpos, clusters; 417 struct buffer_head *di_bh = NULL;
839 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 418 struct buffer_head *eb_bh = NULL;
840 struct ocfs2_extent_map_entry *ent = NULL; 419 struct ocfs2_dinode *di;
841 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 420 struct ocfs2_extent_block *eb;
421 struct ocfs2_extent_list *el;
842 struct ocfs2_extent_rec *rec; 422 struct ocfs2_extent_rec *rec;
423 u32 coff;
843 424
844 *p_blkno = 0; 425 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
845 426 num_clusters, extent_flags);
846 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); 427 if (ret == 0)
847 clusters = ocfs2_blocks_to_clusters(inode->i_sb, 428 goto out;
848 (u64)count + bpc - 1);
849 if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
850 ret = -EINVAL;
851 mlog_errno(ret);
852 return ret;
853 }
854
855 if ((cpos + clusters) > em->em_clusters) {
856 /*
857 * Size changed underneath us on disk. Drop any
858 * straddling records and update our idea of
859 * i_clusters
860 */
861 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
862 em->em_clusters = OCFS2_I(inode)->ip_clusters;
863 }
864 429
865 ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); 430 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
431 &di_bh, OCFS2_BH_CACHED, inode);
866 if (ret) { 432 if (ret) {
867 mlog_errno(ret); 433 mlog_errno(ret);
868 return ret; 434 goto out;
869 } 435 }
870 436
871 if (ent) 437 di = (struct ocfs2_dinode *) di_bh->b_data;
872 { 438 el = &di->id2.i_list;
873 rec = &ent->e_rec;
874 439
875 /* We should never find ourselves straddling an interval */ 440 if (el->l_tree_depth) {
876 if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { 441 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
877 ret = -ESRCH; 442 if (ret) {
878 mlog_errno(ret); 443 mlog_errno(ret);
879 return ret; 444 goto out;
880 } 445 }
881 446
882 boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - 447 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
883 le32_to_cpu(rec->e_cpos)); 448 el = &eb->h_list;
884 boff += (v_blkno & (u64)(bpc - 1));
885 *p_blkno = le64_to_cpu(rec->e_blkno) + boff;
886 449
887 if (ret_count) { 450 if (el->l_tree_depth) {
888 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, 451 ocfs2_error(inode->i_sb,
889 le32_to_cpu(rec->e_clusters)) - boff; 452 "Inode %lu has non zero tree depth in "
453 "leaf block %llu\n", inode->i_ino,
454 (unsigned long long)eb_bh->b_blocknr);
455 ret = -EROFS;
456 goto out;
890 } 457 }
891
892 return 0;
893 } 458 }
894 459
895 return -ENOENT; 460 i = ocfs2_search_extent_list(el, v_cluster);
896} 461 if (i == -1) {
897 462 /*
898int ocfs2_extent_map_init(struct inode *inode) 463 * A hole was found. Return some canned values that
899{ 464 * callers can key on. If asked for, num_clusters will
900 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 465 * be populated with the size of the hole.
901 466 */
902 em->em_extents = RB_ROOT; 467 *p_cluster = 0;
903 em->em_clusters = 0; 468 if (num_clusters) {
904 469 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
905 return 0; 470 v_cluster,
906} 471 num_clusters);
907 472 if (ret) {
908/* Needs the lock */ 473 mlog_errno(ret);
909static void __ocfs2_extent_map_drop(struct inode *inode, 474 goto out;
910 u32 new_clusters, 475 }
911 struct rb_node **free_head, 476 }
912 struct ocfs2_extent_map_entry **tail_ent) 477 } else {
913{ 478 rec = &el->l_recs[i];
914 struct rb_node *node, *next;
915 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
916 struct ocfs2_extent_map_entry *ent;
917 479
918 *free_head = NULL; 480 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
919 481
920 ent = NULL; 482 if (!rec->e_blkno) {
921 node = rb_last(&em->em_extents); 483 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
922 while (node) 484 "record (%u, %u, 0)", inode->i_ino,
923 { 485 le32_to_cpu(rec->e_cpos),
924 next = rb_prev(node); 486 ocfs2_rec_clusters(el, rec));
487 ret = -EROFS;
488 goto out;
489 }
925 490
926 ent = rb_entry(node, struct ocfs2_extent_map_entry, 491 coff = v_cluster - le32_to_cpu(rec->e_cpos);
927 e_node);
928 if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
929 break;
930 492
931 rb_erase(&ent->e_node, &em->em_extents); 493 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
494 le64_to_cpu(rec->e_blkno));
495 *p_cluster = *p_cluster + coff;
932 496
933 node->rb_right = *free_head; 497 if (num_clusters)
934 *free_head = node; 498 *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
935 499
936 ent = NULL; 500 flags = rec->e_flags;
937 node = next;
938 }
939 501
940 /* Do we have an entry straddling new_clusters? */ 502 ocfs2_extent_map_insert_rec(inode, rec);
941 if (tail_ent) {
942 if (ent &&
943 ((le32_to_cpu(ent->e_rec.e_cpos) +
944 le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
945 *tail_ent = ent;
946 else
947 *tail_ent = NULL;
948 } 503 }
949}
950
951static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
952{
953 struct rb_node *node;
954 struct ocfs2_extent_map_entry *ent;
955 504
956 while (free_head) { 505 if (extent_flags)
957 node = free_head; 506 *extent_flags = flags;
958 free_head = node->rb_right;
959 507
960 ent = rb_entry(node, struct ocfs2_extent_map_entry, 508out:
961 e_node); 509 brelse(di_bh);
962 kmem_cache_free(ocfs2_em_ent_cachep, ent); 510 brelse(eb_bh);
963 } 511 return ret;
964} 512}
965 513
966/* 514/*
967 * Remove all entries past new_clusters, inclusive of an entry that 515 * This expects alloc_sem to be held. The allocation cannot change at
968 * contains new_clusters. This is effectively a cache forget. 516 * all while the map is in the process of being updated.
969 *
970 * If you want to also clip the last extent by some number of clusters,
971 * you need to call ocfs2_extent_map_trunc().
972 * This code does not check or modify ip_clusters.
973 */ 517 */
974int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) 518int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
519 u64 *ret_count, unsigned int *extent_flags)
975{ 520{
976 struct rb_node *free_head = NULL; 521 int ret;
977 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 522 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
978 struct ocfs2_extent_map_entry *ent; 523 u32 cpos, num_clusters, p_cluster;
979 524 u64 boff = 0;
980 spin_lock(&OCFS2_I(inode)->ip_lock);
981 525
982 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); 526 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
983 527
984 if (ent) { 528 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
985 rb_erase(&ent->e_node, &em->em_extents); 529 extent_flags);
986 ent->e_node.rb_right = free_head; 530 if (ret) {
987 free_head = &ent->e_node; 531 mlog_errno(ret);
532 goto out;
988 } 533 }
989 534
990 spin_unlock(&OCFS2_I(inode)->ip_lock); 535 /*
991 536 * p_cluster == 0 indicates a hole.
992 if (free_head) 537 */
993 __ocfs2_extent_map_drop_cleanup(free_head); 538 if (p_cluster) {
994 539 boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
995 return 0; 540 boff += (v_blkno & (u64)(bpc - 1));
996} 541 }
997
998/*
999 * Remove all entries past new_clusters and also clip any extent
1000 * straddling new_clusters, if there is one. This does not check
1001 * or modify ip_clusters
1002 */
1003int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
1004{
1005 struct rb_node *free_head = NULL;
1006 struct ocfs2_extent_map_entry *ent = NULL;
1007
1008 spin_lock(&OCFS2_I(inode)->ip_lock);
1009
1010 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
1011
1012 if (ent)
1013 ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
1014 le32_to_cpu(ent->e_rec.e_cpos));
1015
1016 OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
1017
1018 spin_unlock(&OCFS2_I(inode)->ip_lock);
1019
1020 if (free_head)
1021 __ocfs2_extent_map_drop_cleanup(free_head);
1022
1023 return 0;
1024}
1025 542
1026int __init init_ocfs2_extent_maps(void) 543 *p_blkno = boff;
1027{
1028 ocfs2_em_ent_cachep =
1029 kmem_cache_create("ocfs2_em_ent",
1030 sizeof(struct ocfs2_extent_map_entry),
1031 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1032 if (!ocfs2_em_ent_cachep)
1033 return -ENOMEM;
1034 544
1035 return 0; 545 if (ret_count) {
1036} 546 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
547 *ret_count -= v_blkno & (u64)(bpc - 1);
548 }
1037 549
1038void exit_ocfs2_extent_maps(void) 550out:
1039{ 551 return ret;
1040 kmem_cache_destroy(ocfs2_em_ent_cachep);
1041} 552}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index fa3745efa886..de91e3e41a22 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -25,22 +25,29 @@
25#ifndef _EXTENT_MAP_H 25#ifndef _EXTENT_MAP_H
26#define _EXTENT_MAP_H 26#define _EXTENT_MAP_H
27 27
28int init_ocfs2_extent_maps(void); 28struct ocfs2_extent_map_item {
29void exit_ocfs2_extent_maps(void); 29 unsigned int ei_cpos;
30 unsigned int ei_phys;
31 unsigned int ei_clusters;
32 unsigned int ei_flags;
30 33
31/* 34 struct list_head ei_list;
32 * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem 35};
33 * to be held. The allocation cannot change at all while the map is 36
34 * in the process of being updated. 37#define OCFS2_MAX_EXTENT_MAP_ITEMS 3
35 */ 38struct ocfs2_extent_map {
36int ocfs2_extent_map_init(struct inode *inode); 39 unsigned int em_num_items;
37int ocfs2_extent_map_append(struct inode *inode, 40 struct list_head em_list;
38 struct ocfs2_extent_rec *rec, 41};
39 u32 new_clusters); 42
40int ocfs2_extent_map_get_blocks(struct inode *inode, 43void ocfs2_extent_map_init(struct inode *inode);
41 u64 v_blkno, int count, 44void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
42 u64 *p_blkno, int *ret_count); 45void ocfs2_extent_map_insert_rec(struct inode *inode,
43int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); 46 struct ocfs2_extent_rec *rec);
44int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); 47
48int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
49 u32 *num_clusters, unsigned int *extent_flags);
50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
51 u64 *ret_count, unsigned int *extent_flags);
45 52
46#endif /* _EXTENT_MAP_H */ 53#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f2cd3bf9efb2..9395b4fa547d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -33,6 +33,7 @@
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/pipe_fs_i.h> 34#include <linux/pipe_fs_i.h>
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/writeback.h>
36 37
37#define MLOG_MASK_PREFIX ML_INODE 38#define MLOG_MASK_PREFIX ML_INODE
38#include <cluster/masklog.h> 39#include <cluster/masklog.h>
@@ -206,16 +207,16 @@ out:
206 return ret; 207 return ret;
207} 208}
208 209
209int ocfs2_set_inode_size(handle_t *handle, 210static int ocfs2_set_inode_size(handle_t *handle,
210 struct inode *inode, 211 struct inode *inode,
211 struct buffer_head *fe_bh, 212 struct buffer_head *fe_bh,
212 u64 new_i_size) 213 u64 new_i_size)
213{ 214{
214 int status; 215 int status;
215 216
216 mlog_entry_void(); 217 mlog_entry_void();
217 i_size_write(inode, new_i_size); 218 i_size_write(inode, new_i_size);
218 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 219 inode->i_blocks = ocfs2_inode_sector_count(inode);
219 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 220 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
220 221
221 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 222 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
@@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
261{ 262{
262 int status; 263 int status;
263 handle_t *handle; 264 handle_t *handle;
265 struct ocfs2_dinode *di;
264 266
265 mlog_entry_void(); 267 mlog_entry_void();
266 268
@@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
274 goto out; 276 goto out;
275 } 277 }
276 278
277 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); 279 status = ocfs2_journal_access(handle, inode, fe_bh,
280 OCFS2_JOURNAL_ACCESS_WRITE);
281 if (status < 0) {
282 mlog_errno(status);
283 goto out_commit;
284 }
285
286 /*
287 * Do this before setting i_size.
288 */
289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
290 if (status) {
291 mlog_errno(status);
292 goto out_commit;
293 }
294
295 i_size_write(inode, new_i_size);
296 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
298
299 di = (struct ocfs2_dinode *) fe_bh->b_data;
300 di->i_size = cpu_to_le64(new_i_size);
301 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
303
304 status = ocfs2_journal_dirty(handle, fe_bh);
278 if (status < 0) 305 if (status < 0)
279 mlog_errno(status); 306 mlog_errno(status);
280 307
308out_commit:
281 ocfs2_commit_trans(osb, handle); 309 ocfs2_commit_trans(osb, handle);
282out: 310out:
311
283 mlog_exit(status); 312 mlog_exit(status);
284 return status; 313 return status;
285} 314}
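The restructuring above follows the usual ocfs2 journaling discipline: declare write access to the buffer, zero the tail before i_size moves so racing readers never see stale bytes past the new EOF, update both the in-memory inode and the on-disk dinode, then mark the buffer dirty in the transaction. A stripped-down sketch of that pattern (error paths elided, field update illustrative):

/* Sketch of the access/modify/dirty pattern used above. */
static int journaled_size_update(handle_t *handle, struct inode *inode,
				 struct buffer_head *bh, u64 new_size)
{
	int status;

	status = ocfs2_journal_access(handle, inode, bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		return status;

	/* Only after access is declared may the buffer be modified. */
	((struct ocfs2_dinode *)bh->b_data)->i_size = cpu_to_le64(new_size);

	return ocfs2_journal_dirty(handle, bh);
}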
@@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
342 mlog_errno(status); 371 mlog_errno(status);
343 goto bail; 372 goto bail;
344 } 373 }
345 ocfs2_data_unlock(inode, 1);
346
347 if (le32_to_cpu(fe->i_clusters) ==
348 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
349 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
350 fe->i_clusters);
351 /* No allocation change is required, so lets fast path
352 * this truncate. */
353 status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
354 if (status < 0)
355 mlog_errno(status);
356 goto bail;
357 }
358 374
359 /* alright, we're going to need to do a full blown alloc size 375 /* alright, we're going to need to do a full blown alloc size
360 * change. Orphan the inode so that recovery can complete the 376 * change. Orphan the inode so that recovery can complete the
@@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
363 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 379 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
364 if (status < 0) { 380 if (status < 0) {
365 mlog_errno(status); 381 mlog_errno(status);
366 goto bail; 382 goto bail_unlock_data;
367 } 383 }
368 384
369 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 385 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
370 if (status < 0) { 386 if (status < 0) {
371 mlog_errno(status); 387 mlog_errno(status);
372 goto bail; 388 goto bail_unlock_data;
373 } 389 }
374 390
375 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 391 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
376 if (status < 0) { 392 if (status < 0) {
377 mlog_errno(status); 393 mlog_errno(status);
378 goto bail; 394 goto bail_unlock_data;
379 } 395 }
380 396
381 /* TODO: orphan dir cleanup here. */ 397 /* TODO: orphan dir cleanup here. */
398bail_unlock_data:
399 ocfs2_data_unlock(inode, 1);
400
382bail: 401bail:
383 402
384 mlog_exit(status); 403 mlog_exit(status);
@@ -397,6 +416,7 @@ bail:
397 */ 416 */
398int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 417int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
399 struct inode *inode, 418 struct inode *inode,
419 u32 *logical_offset,
400 u32 clusters_to_add, 420 u32 clusters_to_add,
401 struct buffer_head *fe_bh, 421 struct buffer_head *fe_bh,
402 handle_t *handle, 422 handle_t *handle,
@@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
460 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 480 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
461 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 481 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
462 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 482 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
463 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, 483 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
464 num_bits, meta_ac); 484 *logical_offset, block, num_bits,
485 meta_ac);
465 if (status < 0) { 486 if (status < 0) {
466 mlog_errno(status); 487 mlog_errno(status);
467 goto leave; 488 goto leave;
468 } 489 }
469 490
470 le32_add_cpu(&fe->i_clusters, num_bits);
471 spin_lock(&OCFS2_I(inode)->ip_lock);
472 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
473 spin_unlock(&OCFS2_I(inode)->ip_lock);
474
475 status = ocfs2_journal_dirty(handle, fe_bh); 491 status = ocfs2_journal_dirty(handle, fe_bh);
476 if (status < 0) { 492 if (status < 0) {
477 mlog_errno(status); 493 mlog_errno(status);
@@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
479 } 495 }
480 496
481 clusters_to_add -= num_bits; 497 clusters_to_add -= num_bits;
498 *logical_offset += num_bits;
482 499
483 if (clusters_to_add) { 500 if (clusters_to_add) {
484 mlog(0, "need to alloc once more, clusters = %u, wanted = " 501 mlog(0, "need to alloc once more, clusters = %u, wanted = "
@@ -494,14 +511,87 @@ leave:
494 return status; 511 return status;
495} 512}
496 513
514/*
515 * For a given allocation, determine which allocators will need to be
516 * accessed, and lock them, reserving the appropriate number of bits.
517 *
518 * Called from ocfs2_extend_allocation() for file systems which don't
519 * support holes, and from ocfs2_write() for file systems which
520 * understand sparse inodes.
521 */
522int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
523 u32 clusters_to_add,
524 struct ocfs2_alloc_context **data_ac,
525 struct ocfs2_alloc_context **meta_ac)
526{
527 int ret, num_free_extents;
528 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
529
530 *meta_ac = NULL;
531 *data_ac = NULL;
532
533 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
534 "clusters_to_add = %u\n",
535 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
536 le32_to_cpu(di->i_clusters), clusters_to_add);
537
538 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
539 if (num_free_extents < 0) {
540 ret = num_free_extents;
541 mlog_errno(ret);
542 goto out;
543 }
544
545 /*
546 * Sparse allocation file systems need to be more conservative
547 * with reserving room for expansion - the actual allocation
548 * happens while we've got a journal handle open so re-taking
549 * a cluster lock (because we ran out of room for another
550 * extent) will violate ordering rules.
551 *
552 * Most of the time we'll only be seeing this 1 cluster at a time
553 * anyway.
554 */
555 if (!num_free_extents ||
556 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
557 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
558 if (ret < 0) {
559 if (ret != -ENOSPC)
560 mlog_errno(ret);
561 goto out;
562 }
563 }
564
565 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
566 if (ret < 0) {
567 if (ret != -ENOSPC)
568 mlog_errno(ret);
569 goto out;
570 }
571
572out:
573 if (ret) {
574 if (*meta_ac) {
575 ocfs2_free_alloc_context(*meta_ac);
576 *meta_ac = NULL;
577 }
578
579 /*
580 * We cannot have an error and a non null *data_ac.
581 */
582 }
583
584 return ret;
585}
586
497static int ocfs2_extend_allocation(struct inode *inode, 587static int ocfs2_extend_allocation(struct inode *inode,
498 u32 clusters_to_add) 588 u32 clusters_to_add)
499{ 589{
500 int status = 0; 590 int status = 0;
501 int restart_func = 0; 591 int restart_func = 0;
502 int drop_alloc_sem = 0; 592 int drop_alloc_sem = 0;
503 int credits, num_free_extents; 593 int credits;
504 u32 prev_clusters; 594 u32 prev_clusters, logical_start;
505 struct buffer_head *bh = NULL; 595 struct buffer_head *bh = NULL;
506 struct ocfs2_dinode *fe = NULL; 596 struct ocfs2_dinode *fe = NULL;
507 handle_t *handle = NULL; 597 handle_t *handle = NULL;
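The new ocfs2_lock_allocators() helper above factors allocator reservation out of the extend path so the sparse-file write code can share it. A minimal sketch of the calling pattern it implies — an inference from this hunk, not a verbatim caller; error handling trimmed:

	struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
	int ret;

	/* Reserve metadata and data bits before opening a journal handle. */
	ret = ocfs2_lock_allocators(inode, di, clusters_to_add,
				    &data_ac, &meta_ac);
	if (ret) {
		/* On failure the helper has already freed *meta_ac, and it
		 * never returns an error with a non-NULL *data_ac, so there
		 * is nothing to undo here. */
		return ret;
	}

	/* ... start a transaction, call ocfs2_do_extend_allocation() ... */

	if (data_ac)
		ocfs2_free_alloc_context(data_ac);
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);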
@@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode,
512 602
513 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 603 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
514 604
605 /*
606 * This function only exists for file systems which don't
607 * support holes.
608 */
609 BUG_ON(ocfs2_sparse_alloc(osb));
610
515 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 611 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
516 OCFS2_BH_CACHED, inode); 612 OCFS2_BH_CACHED, inode);
517 if (status < 0) { 613 if (status < 0) {
@@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode,
526 goto leave; 622 goto leave;
527 } 623 }
528 624
625 logical_start = OCFS2_I(inode)->ip_clusters;
626
529restart_all: 627restart_all:
530 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 628 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
531 629
532 mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
533 "clusters_to_add = %u\n",
534 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
535 fe->i_clusters, clusters_to_add);
536
537 num_free_extents = ocfs2_num_free_extents(osb,
538 inode,
539 fe);
540 if (num_free_extents < 0) {
541 status = num_free_extents;
542 mlog_errno(status);
543 goto leave;
544 }
545
546 if (!num_free_extents) {
547 status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
548 if (status < 0) {
549 if (status != -ENOSPC)
550 mlog_errno(status);
551 goto leave;
552 }
553 }
554
555 status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac);
556 if (status < 0) {
557 if (status != -ENOSPC)
558 mlog_errno(status);
559 goto leave;
560 }
561
562 /* blocks people in read/write from reading our allocation 630
563 * until we're done changing it. We depend on i_mutex to block 631 * until we're done changing it. We depend on i_mutex to block
564 * other extend/truncate calls while we're here. Ordering wrt 632 * other extend/truncate calls while we're here. Ordering wrt
@@ -566,6 +634,13 @@ restart_all:
566 down_write(&OCFS2_I(inode)->ip_alloc_sem); 634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
567 drop_alloc_sem = 1; 635 drop_alloc_sem = 1;
568 636
637 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
638 &meta_ac);
639 if (status) {
640 mlog_errno(status);
641 goto leave;
642 }
643
569 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 644 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
570 handle = ocfs2_start_trans(osb, credits); 645 handle = ocfs2_start_trans(osb, credits);
571 if (IS_ERR(handle)) { 646 if (IS_ERR(handle)) {
@@ -590,6 +665,7 @@ restarted_transaction:
590 665
591 status = ocfs2_do_extend_allocation(osb, 666 status = ocfs2_do_extend_allocation(osb,
592 inode, 667 inode,
668 &logical_start,
593 clusters_to_add, 669 clusters_to_add,
594 bh, 670 bh,
595 handle, 671 handle,
@@ -637,7 +713,8 @@ restarted_transaction:
637 } 713 }
638 714
639 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 715 mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
640 fe->i_clusters, (unsigned long long)fe->i_size); 716 le32_to_cpu(fe->i_clusters),
717 (unsigned long long)le64_to_cpu(fe->i_size));
641 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 718 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
642 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 719 OCFS2_I(inode)->ip_clusters, i_size_read(inode));
643 720
@@ -778,7 +855,7 @@ static int ocfs2_extend_file(struct inode *inode,
778 size_t tail_to_skip) 855 size_t tail_to_skip)
779{ 856{
780 int ret = 0; 857 int ret = 0;
781 u32 clusters_to_add; 858 u32 clusters_to_add = 0;
782 859
783 BUG_ON(!tail_to_skip && !di_bh); 860 BUG_ON(!tail_to_skip && !di_bh);
784 861
@@ -790,6 +867,11 @@ static int ocfs2_extend_file(struct inode *inode,
790 goto out; 867 goto out;
791 BUG_ON(new_i_size < i_size_read(inode)); 868 BUG_ON(new_i_size < i_size_read(inode));
792 869
870 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
871 BUG_ON(tail_to_skip != 0);
872 goto out_update_size;
873 }
874
793 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 875 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
794 OCFS2_I(inode)->ip_clusters; 876 OCFS2_I(inode)->ip_clusters;
795 877
@@ -825,6 +907,7 @@ static int ocfs2_extend_file(struct inode *inode,
825 goto out_unlock; 907 goto out_unlock;
826 } 908 }
827 909
910out_update_size:
828 if (!tail_to_skip) { 911 if (!tail_to_skip) {
829 /* We're being called from ocfs2_setattr() which wants 912 /* We're being called from ocfs2_setattr() which wants
830 * us to update i_size */ 913 * us to update i_size */
@@ -834,7 +917,8 @@ static int ocfs2_extend_file(struct inode *inode,
834 } 917 }
835 918
836out_unlock: 919out_unlock:
837 ocfs2_data_unlock(inode, 1); 920 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
921 ocfs2_data_unlock(inode, 1);
838 922
839out: 923out:
840 return ret; 924 return ret;
@@ -972,7 +1056,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
972 1056
973 ret = ocfs2_meta_lock(inode, NULL, 0); 1057 ret = ocfs2_meta_lock(inode, NULL, 0);
974 if (ret) { 1058 if (ret) {
975 mlog_errno(ret); 1059 if (ret != -ENOENT)
1060 mlog_errno(ret);
976 goto out; 1061 goto out;
977 } 1062 }
978 1063
@@ -1035,10 +1120,49 @@ out:
1035 return ret; 1120 return ret;
1036} 1121}
1037 1122
1123/*
1124 * Will look for holes and unwritten extents in the range starting at
1125 * pos for count bytes (inclusive).
1126 */
1127static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1128 size_t count)
1129{
1130 int ret = 0;
1131 unsigned int extent_flags;
1132 u32 cpos, clusters, extent_len, phys_cpos;
1133 struct super_block *sb = inode->i_sb;
1134
1135 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1136 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1137
1138 while (clusters) {
1139 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1140 &extent_flags);
1141 if (ret < 0) {
1142 mlog_errno(ret);
1143 goto out;
1144 }
1145
1146 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1147 ret = 1;
1148 break;
1149 }
1150
1151 if (extent_len > clusters)
1152 extent_len = clusters;
1153
1154 clusters -= extent_len;
1155 cpos += extent_len;
1156 }
1157out:
1158 return ret;
1159}
1160
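The return convention above is tri-state: 1 when a hole or unwritten extent lies in the range, 0 when the range is fully allocated and written, and a negative errno when the extent lookup fails. A sketch of how a caller separates the cases (ocfs2_prepare_inode_for_write() below does exactly this; direct_io_ok is an illustrative flag, not a name from the patch):

	ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
	if (ret < 0)
		mlog_errno(ret);	/* extent map lookup failed */
	else if (ret == 1)
		direct_io_ok = 0;	/* hole/unwritten found: go buffered */
	else
		direct_io_ok = 1;	/* fully allocated: direct I/O is safe */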
1038static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1161static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1039 loff_t *ppos, 1162 loff_t *ppos,
1040 size_t count, 1163 size_t count,
1041 int appending) 1164 int appending,
1165 int *direct_io)
1042{ 1166{
1043 int ret = 0, meta_level = appending; 1167 int ret = 0, meta_level = appending;
1044 struct inode *inode = dentry->d_inode; 1168 struct inode *inode = dentry->d_inode;
@@ -1089,6 +1213,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1089 } else { 1213 } else {
1090 saved_pos = *ppos; 1214 saved_pos = *ppos;
1091 } 1215 }
1216
1217 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
1218 loff_t end = saved_pos + count;
1219
1220 /*
1221 * Skip the O_DIRECT checks if we don't need
1222 * them.
1223 */
1224 if (!direct_io || !(*direct_io))
1225 break;
1226
1227 /*
1228 * Allowing concurrent direct writes means
1229 * i_size changes wouldn't be synchronized, so
1230 * one node could wind up truncating another
1231 * node's writes.
1232 */
1233 if (end > i_size_read(inode)) {
1234 *direct_io = 0;
1235 break;
1236 }
1237
1238 /*
1239 * We don't fill holes during direct io, so
1240 * check for them here. If any are found, the
1241 * caller will have to retake some cluster
1242 * locks and initiate the io as buffered.
1243 */
1244 ret = ocfs2_check_range_for_holes(inode, saved_pos,
1245 count);
1246 if (ret == 1) {
1247 *direct_io = 0;
1248 ret = 0;
1249 } else if (ret < 0)
1250 mlog_errno(ret);
1251 break;
1252 }
1253
1254 /*
1255 * The rest of this loop is concerned with legacy file
1256 * systems which don't support sparse files.
1257 */
1258
1092 newsize = count + saved_pos; 1259 newsize = count + saved_pos;
1093 1260
1094 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", 1261 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
@@ -1141,55 +1308,264 @@ out:
1141 return ret; 1308 return ret;
1142} 1309}
1143 1310
1311static inline void
1312ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1313{
1314 const struct iovec *iov = *iovp;
1315 size_t base = *basep;
1316
1317 do {
1318 int copy = min(bytes, iov->iov_len - base);
1319
1320 bytes -= copy;
1321 base += copy;
1322 if (iov->iov_len == base) {
1323 iov++;
1324 base = 0;
1325 }
1326 } while (bytes);
1327 *iovp = iov;
1328 *basep = base;
1329}
1330
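A worked example of the iovec cursor above, with assumed segment sizes: given iov[0].iov_len = 8 and iov[1].iov_len = 16, advancing 12 bytes from (&iov[0], base 0) consumes the remaining 8 bytes of segment 0, steps to segment 1, and leaves the base at 4:

	/* assumed: iov[0].iov_len == 8, iov[1].iov_len == 16 */
	const struct iovec *cur = &iov[0];
	size_t base = 0;

	ocfs2_set_next_iovec(&cur, &base, 12);
	/* pass 1: copy = min(12, 8 - 0) = 8; segment 0 exhausted, cur++ */
	/* pass 2: copy = min(4, 16 - 0) = 4; base left at 4             */
	/* result: cur == &iov[1], base == 4                             */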
1331static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
1332 const struct iovec *cur_iov,
1333 size_t iov_offset)
1334{
1335 int ret;
1336 char *buf;
1337 struct page *src_page = NULL;
1338
1339 buf = cur_iov->iov_base + iov_offset;
1340
1341 if (!segment_eq(get_fs(), KERNEL_DS)) {
1342 /*
1343 * Pull in the user page. We want to do this outside
1344 * of the meta data locks in order to preserve locking
1345 * order in case of page fault.
1346 */
1347 ret = get_user_pages(current, current->mm,
1348 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1349 0, 0, &src_page, NULL);
1350 if (ret == 1)
1351 bp->b_src_buf = kmap(src_page);
1352 else
1353 src_page = ERR_PTR(-EFAULT);
1354 } else {
1355 bp->b_src_buf = buf;
1356 }
1357
1358 return src_page;
1359}
1360
1361static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
1362 struct page *page)
1363{
1364 if (page) {
1365 kunmap(page);
1366 page_cache_release(page);
1367 }
1368}
1369
1370static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1371 const struct iovec *iov,
1372 unsigned long nr_segs,
1373 size_t count,
1374 ssize_t o_direct_written)
1375{
1376 int ret = 0;
1377 ssize_t copied, total = 0;
1378 size_t iov_offset = 0;
1379 const struct iovec *cur_iov = iov;
1380 struct ocfs2_buffered_write_priv bp;
1381 struct page *page;
1382
1383 /*
1384 * handle partial DIO write. Adjust cur_iov if needed.
1385 */
1386 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1387
1388 do {
1389 bp.b_cur_off = iov_offset;
1390 bp.b_cur_iov = cur_iov;
1391
1392 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
1393 if (IS_ERR(page)) {
1394 ret = PTR_ERR(page);
1395 goto out;
1396 }
1397
1398 copied = ocfs2_buffered_write_cluster(file, *ppos, count,
1399 ocfs2_map_and_write_user_data,
1400 &bp);
1401
1402 ocfs2_put_write_source(&bp, page);
1403
1404 if (copied < 0) {
1405 mlog_errno(copied);
1406 ret = copied;
1407 goto out;
1408 }
1409
1410 total += copied;
1411 *ppos = *ppos + copied;
1412 count -= copied;
1413
1414 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
1415 } while(count);
1416
1417out:
1418 return total ? total : ret;
1419}
1420
1421static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
1422 unsigned long *nr_segs)
1423{
1424 size_t ocount; /* original count */
1425 unsigned long seg;
1426
1427 ocount = 0;
1428 for (seg = 0; seg < *nr_segs; seg++) {
1429 const struct iovec *iv = &iov[seg];
1430
1431 /*
1432 * If any segment has a negative length, or the cumulative
1433 * length ever wraps negative then return -EINVAL.
1434 */
1435 ocount += iv->iov_len;
1436 if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
1437 return -EINVAL;
1438 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1439 continue;
1440 if (seg == 0)
1441 return -EFAULT;
1442 *nr_segs = seg;
1443 ocount -= iv->iov_len; /* This segment is no good */
1444 break;
1445 }
1446
1447 *counted = ocount;
1448 return 0;
1449}
1450
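The (ssize_t)(ocount | iv->iov_len) < 0 test above is the usual kernel overflow trick: OR-ing the running total with the current segment length sets the sign bit if either value alone exceeds SSIZE_MAX, so one comparison catches both an oversized single segment and a cumulative sum that wrapped negative. Because the sum grows by at most one checked segment per pass, it cannot skip over the negative range undetected. Illustrative values (assuming 64-bit size_t):

	/* iv->iov_len == 0x8000000000000000  ->  OR is negative  ->  -EINVAL */
	/* ocount climbs past SSIZE_MAX       ->  same sign bit   ->  -EINVAL */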
1144static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 1451static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1145 const struct iovec *iov, 1452 const struct iovec *iov,
1146 unsigned long nr_segs, 1453 unsigned long nr_segs,
1147 loff_t pos) 1454 loff_t pos)
1148{ 1455{
1149 int ret, rw_level, have_alloc_sem = 0; 1456 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1150 struct file *filp = iocb->ki_filp; 1457 int can_do_direct, sync = 0;
1151 struct inode *inode = filp->f_path.dentry->d_inode; 1458 ssize_t written = 0;
1152 int appending = filp->f_flags & O_APPEND ? 1 : 0; 1459 size_t ocount; /* original count */
1153 1460 size_t count; /* after file limit checks */
1154 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 1461 loff_t *ppos = &iocb->ki_pos;
1462 struct file *file = iocb->ki_filp;
1463 struct inode *inode = file->f_path.dentry->d_inode;
1464
1465 mlog_entry("(0x%p, %u, '%.*s')\n", file,
1155 (unsigned int)nr_segs, 1466 (unsigned int)nr_segs,
1156 filp->f_path.dentry->d_name.len, 1467 file->f_path.dentry->d_name.len,
1157 filp->f_path.dentry->d_name.name); 1468 file->f_path.dentry->d_name.name);
1158 1469
1159 /* happy write of zero bytes */
1160 if (iocb->ki_left == 0) 1470 if (iocb->ki_left == 0)
1161 return 0; 1471 return 0;
1162 1472
1473 ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
1474 if (ret)
1475 return ret;
1476
1477 count = ocount;
1478
1479 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1480
1481 appending = file->f_flags & O_APPEND ? 1 : 0;
1482 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
1483
1163 mutex_lock(&inode->i_mutex); 1484 mutex_lock(&inode->i_mutex);
1485
1486relock:
1164 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 1487 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
1165 if (filp->f_flags & O_DIRECT) { 1488 if (direct_io) {
1166 have_alloc_sem = 1;
1167 down_read(&inode->i_alloc_sem); 1489 down_read(&inode->i_alloc_sem);
1490 have_alloc_sem = 1;
1168 } 1491 }
1169 1492
1170 /* concurrent O_DIRECT writes are allowed */ 1493 /* concurrent O_DIRECT writes are allowed */
1171 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; 1494 rw_level = !direct_io;
1172 ret = ocfs2_rw_lock(inode, rw_level); 1495 ret = ocfs2_rw_lock(inode, rw_level);
1173 if (ret < 0) { 1496 if (ret < 0) {
1174 rw_level = -1;
1175 mlog_errno(ret); 1497 mlog_errno(ret);
1176 goto out; 1498 goto out_sems;
1177 } 1499 }
1178 1500
1179 ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, 1501 can_do_direct = direct_io;
1180 iocb->ki_left, appending); 1502 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1503 iocb->ki_left, appending,
1504 &can_do_direct);
1181 if (ret < 0) { 1505 if (ret < 0) {
1182 mlog_errno(ret); 1506 mlog_errno(ret);
1183 goto out; 1507 goto out;
1184 } 1508 }
1185 1509
1186 /* communicate with ocfs2_dio_end_io */ 1510 /*
1187 ocfs2_iocb_set_rw_locked(iocb); 1511 * We can't complete the direct I/O as requested, so fall back to
1512 * buffered I/O.
1513 */
1514 if (direct_io && !can_do_direct) {
1515 ocfs2_rw_unlock(inode, rw_level);
1516 up_read(&inode->i_alloc_sem);
1517
1518 have_alloc_sem = 0;
1519 rw_level = -1;
1188 1520
1189 ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); 1521 direct_io = 0;
1522 sync = 1;
1523 goto relock;
1524 }
1525
1526 if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
1527 sync = 1;
1528
1529 /*
1530 * XXX: Is it ok to execute these checks a second time?
1531 */
1532 ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
1533 if (ret)
1534 goto out;
1535
1536 /*
1537 * Set pos so that sync_page_range_nolock() below understands
1538 * where to start from. We might've moved it around via the
1539 * calls above. The range we want to actually sync starts from
1540 * *ppos here.
1541 *
1542 */
1543 pos = *ppos;
1544
1545 /* communicate with ocfs2_dio_end_io */
1546 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1547
1548 if (direct_io) {
1549 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1550 ppos, count, ocount);
1551 if (written < 0) {
1552 ret = written;
1553 goto out_dio;
1554 }
1555 } else {
1556 written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
1557 count, written);
1558 if (written < 0) {
1559 ret = written;
1560 if (ret != -EFAULT && ret != -ENOSPC)
1561 mlog_errno(ret);
1562 goto out;
1563 }
1564 }
1190 1565
1566out_dio:
1191 /* buffered aio wouldn't have proper lock coverage today */ 1567 /* buffered aio wouldn't have proper lock coverage today */
1192 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1568 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1193 1569
1194 /* 1570 /*
1195 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 1571 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -1207,13 +1583,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1207 } 1583 }
1208 1584
1209out: 1585out:
1586 if (rw_level != -1)
1587 ocfs2_rw_unlock(inode, rw_level);
1588
1589out_sems:
1210 if (have_alloc_sem) 1590 if (have_alloc_sem)
1211 up_read(&inode->i_alloc_sem); 1591 up_read(&inode->i_alloc_sem);
1212 if (rw_level != -1) 1592
1213 ocfs2_rw_unlock(inode, rw_level); 1593 if (written > 0 && sync) {
1594 ssize_t err;
1595
1596 err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
1597 if (err < 0)
1598 written = err;
1599 }
1600
1214 mutex_unlock(&inode->i_mutex); 1601 mutex_unlock(&inode->i_mutex);
1215 1602
1216 mlog_exit(ret); 1603 mlog_exit(ret);
1604 return written ? written : ret;
1605}
1606
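Condensing the rewritten write path above into pseudocode — a summary sketch, not the function itself:

	/*
	 * relock:
	 *   if (direct_io) down_read(&inode->i_alloc_sem);
	 *   ocfs2_rw_lock(inode, !direct_io);
	 *   ocfs2_prepare_inode_for_write(..., &can_do_direct);
	 *   if (direct_io && !can_do_direct) {
	 *           drop both locks; direct_io = 0; sync = 1;
	 *           goto relock;     // retry as a buffered write
	 *   }
	 *   written = direct_io ? generic_file_direct_write(...)
	 *                       : ocfs2_file_buffered_write(...);
	 *   unlock; if (written > 0 && sync)
	 *           sync_page_range_nolock(inode, file->f_mapping, pos, count);
	 */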
1607static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1608 struct pipe_buffer *buf,
1609 struct splice_desc *sd)
1610{
1611 int ret, count, total = 0;
1612 ssize_t copied = 0;
1613 struct ocfs2_splice_write_priv sp;
1614
1615 ret = buf->ops->pin(pipe, buf);
1616 if (ret)
1617 goto out;
1618
1619 sp.s_sd = sd;
1620 sp.s_buf = buf;
1621 sp.s_pipe = pipe;
1622 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1623 sp.s_buf_offset = buf->offset;
1624
1625 count = sd->len;
1626 if (count + sp.s_offset > PAGE_CACHE_SIZE)
1627 count = PAGE_CACHE_SIZE - sp.s_offset;
1628
1629 do {
1630 /*
1631 * splice wants us to copy up to one page at a
1632 * time. For pagesize > cluster size, this means we
1633 * might enter ocfs2_buffered_write_cluster() more
1634 * than once, so keep track of our progress here.
1635 */
1636 copied = ocfs2_buffered_write_cluster(sd->file,
1637 (loff_t)sd->pos + total,
1638 count,
1639 ocfs2_map_and_write_splice_data,
1640 &sp);
1641 if (copied < 0) {
1642 mlog_errno(copied);
1643 ret = copied;
1644 goto out;
1645 }
1646
1647 count -= copied;
1648 sp.s_offset += copied;
1649 sp.s_buf_offset += copied;
1650 total += copied;
1651 } while (count);
1652
1653 ret = 0;
1654out:
1655
1656 return total ? total : ret;
1657}
1658
1659static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1660 struct file *out,
1661 loff_t *ppos,
1662 size_t len,
1663 unsigned int flags)
1664{
1665 int ret, err;
1666 struct address_space *mapping = out->f_mapping;
1667 struct inode *inode = mapping->host;
1668
1669 ret = __splice_from_pipe(pipe, out, ppos, len, flags,
1670 ocfs2_splice_write_actor);
1671 if (ret > 0) {
1672 *ppos += ret;
1673
1674 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
1675 err = generic_osync_inode(inode, mapping,
1676 OSYNC_METADATA|OSYNC_DATA);
1677 if (err)
1678 ret = err;
1679 }
1680 }
1681
1217 return ret; 1682 return ret;
1218} 1683}
1219 1684
@@ -1239,14 +1704,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1239 goto out; 1704 goto out;
1240 } 1705 }
1241 1706
1242 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); 1707 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
1708 NULL);
1243 if (ret < 0) { 1709 if (ret < 0) {
1244 mlog_errno(ret); 1710 mlog_errno(ret);
1245 goto out_unlock; 1711 goto out_unlock;
1246 } 1712 }
1247 1713
1248 /* ok, we're done with i_size and alloc work */ 1714 /* ok, we're done with i_size and alloc work */
1249 ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); 1715 ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
1250 1716
1251out_unlock: 1717out_unlock:
1252 ocfs2_rw_unlock(inode, 1); 1718 ocfs2_rw_unlock(inode, 1);
@@ -1323,7 +1789,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1323 } 1789 }
1324 rw_level = 0; 1790 rw_level = 0;
1325 /* communicate with ocfs2_dio_end_io */ 1791 /* communicate with ocfs2_dio_end_io */
1326 ocfs2_iocb_set_rw_locked(iocb); 1792 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1327 } 1793 }
1328 1794
1329 /* 1795 /*
@@ -1388,6 +1854,9 @@ const struct file_operations ocfs2_fops = {
1388 .aio_read = ocfs2_file_aio_read, 1854 .aio_read = ocfs2_file_aio_read,
1389 .aio_write = ocfs2_file_aio_write, 1855 .aio_write = ocfs2_file_aio_write,
1390 .ioctl = ocfs2_ioctl, 1856 .ioctl = ocfs2_ioctl,
1857#ifdef CONFIG_COMPAT
1858 .compat_ioctl = ocfs2_compat_ioctl,
1859#endif
1391 .splice_read = ocfs2_file_splice_read, 1860 .splice_read = ocfs2_file_splice_read,
1392 .splice_write = ocfs2_file_splice_write, 1861 .splice_write = ocfs2_file_splice_write,
1393}; 1862};
@@ -1397,4 +1866,7 @@ const struct file_operations ocfs2_dops = {
1397 .readdir = ocfs2_readdir, 1866 .readdir = ocfs2_readdir,
1398 .fsync = ocfs2_sync_file, 1867 .fsync = ocfs2_sync_file,
1399 .ioctl = ocfs2_ioctl, 1868 .ioctl = ocfs2_ioctl,
1869#ifdef CONFIG_COMPAT
1870 .compat_ioctl = ocfs2_compat_ioctl,
1871#endif
1400}; 1872};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index cc973f01f6ce..a4dd1fa1822b 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,23 +39,23 @@ enum ocfs2_alloc_restarted {
39}; 39};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode, 41 struct inode *inode,
42 u32 *cluster_start,
42 u32 clusters_to_add, 43 u32 clusters_to_add,
43 struct buffer_head *fe_bh, 44 struct buffer_head *fe_bh,
44 handle_t *handle, 45 handle_t *handle,
45 struct ocfs2_alloc_context *data_ac, 46 struct ocfs2_alloc_context *data_ac,
46 struct ocfs2_alloc_context *meta_ac, 47 struct ocfs2_alloc_context *meta_ac,
47 enum ocfs2_alloc_restarted *reason); 48 enum ocfs2_alloc_restarted *reason);
49int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
50 u32 clusters_to_add,
51 struct ocfs2_alloc_context **data_ac,
52 struct ocfs2_alloc_context **meta_ac);
48int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 53int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
49int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 54int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
50 struct kstat *stat); 55 struct kstat *stat);
51int ocfs2_permission(struct inode *inode, int mask, 56int ocfs2_permission(struct inode *inode, int mask,
52 struct nameidata *nd); 57 struct nameidata *nd);
53 58
54int ocfs2_set_inode_size(handle_t *handle,
55 struct inode *inode,
56 struct buffer_head *fe_bh,
57 u64 new_i_size);
58
59int ocfs2_should_update_atime(struct inode *inode, 59int ocfs2_should_update_atime(struct inode *inode,
60 struct vfsmount *vfsmnt); 60 struct vfsmount *vfsmnt);
61int ocfs2_update_inode_atime(struct inode *inode, 61int ocfs2_update_inode_atime(struct inode *inode,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 28ab56f2b98c..bc844bfe607c 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -89,22 +89,23 @@ void ocfs2_set_inode_flags(struct inode *inode)
89 inode->i_flags |= S_DIRSYNC; 89 inode->i_flags |= S_DIRSYNC;
90} 90}
91 91
92struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, 92/* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */
93 u64 blkno, 93void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
94 int delete_vote)
95{ 94{
96 struct ocfs2_find_inode_args args; 95 unsigned int flags = oi->vfs_inode.i_flags;
97 96
98 /* ocfs2_ilookup_for_vote should *only* be called from the 97 oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL|
99 * vote thread */ 98 OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL);
100 BUG_ON(current != osb->vote_task); 99 if (flags & S_SYNC)
101 100 oi->ip_attr |= OCFS2_SYNC_FL;
102 args.fi_blkno = blkno; 101 if (flags & S_APPEND)
103 args.fi_flags = OCFS2_FI_FLAG_NOWAIT; 102 oi->ip_attr |= OCFS2_APPEND_FL;
104 if (delete_vote) 103 if (flags & S_IMMUTABLE)
105 args.fi_flags |= OCFS2_FI_FLAG_DELETE; 104 oi->ip_attr |= OCFS2_IMMUTABLE_FL;
106 args.fi_ino = ino_from_blkno(osb->sb, blkno); 105 if (flags & S_NOATIME)
107 return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); 106 oi->ip_attr |= OCFS2_NOATIME_FL;
107 if (flags & S_DIRSYNC)
108 oi->ip_attr |= OCFS2_DIRSYNC_FL;
108} 109}
109 110
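ocfs2_get_inode_flags() is the inverse of ocfs2_set_inode_flags(): it folds the VFS i_flags back into the on-disk attribute word. Callers in this patch invoke it immediately before ip_attr is read or serialized, as in the ioctl.c hunk further down:

	ocfs2_get_inode_flags(OCFS2_I(inode));
	*flags = OCFS2_I(inode)->ip_attr;	/* now reflects i_flags */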
110struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) 111struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
@@ -182,28 +183,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
182 if (oi->ip_blkno != args->fi_blkno) 183 if (oi->ip_blkno != args->fi_blkno)
183 goto bail; 184 goto bail;
184 185
185 /* OCFS2_FI_FLAG_NOWAIT is *only* set from
186 * ocfs2_ilookup_for_vote which won't create an inode for one
187 * that isn't found. The vote thread which doesn't want to get
188 * an inode which is in the process of going away - otherwise
189 * the call to __wait_on_freeing_inode in find_inode_fast will
190 * cause it to deadlock on an inode which may be waiting on a
191 * vote (or lock release) in delete_inode */
192 if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
193 (inode->i_state & (I_FREEING|I_CLEAR))) {
194 /* As stated above, we're not going to return an
195 * inode. In the case of a delete vote, the voting
196 * code is going to signal the other node to go
197 * ahead. Mark that state here, so this freeing inode
198 * has the state when it gets to delete_inode. */
199 if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
200 spin_lock(&oi->ip_lock);
201 ocfs2_mark_inode_remotely_deleted(inode);
202 spin_unlock(&oi->ip_lock);
203 }
204 goto bail;
205 }
206
207 ret = 1; 186 ret = 1;
208bail: 187bail:
209 mlog_exit(ret); 188 mlog_exit(ret);
@@ -236,7 +215,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
236 int status = -EINVAL; 215 int status = -EINVAL;
237 216
238 mlog_entry("(0x%p, size:%llu)\n", inode, 217 mlog_entry("(0x%p, size:%llu)\n", inode,
239 (unsigned long long)fe->i_size); 218 (unsigned long long)le64_to_cpu(fe->i_size));
240 219
241 sb = inode->i_sb; 220 sb = inode->i_sb;
242 osb = OCFS2_SB(sb); 221 osb = OCFS2_SB(sb);
@@ -261,6 +240,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
261 goto bail; 240 goto bail;
262 } 241 }
263 242
243 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
244 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
245
264 inode->i_version = 1; 246 inode->i_version = 1;
265 inode->i_generation = le32_to_cpu(fe->i_generation); 247 inode->i_generation = le32_to_cpu(fe->i_generation);
266 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 248 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
@@ -272,8 +254,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
272 if (S_ISLNK(inode->i_mode) && !fe->i_clusters) 254 if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
273 inode->i_blocks = 0; 255 inode->i_blocks = 0;
274 else 256 else
275 inode->i_blocks = 257 inode->i_blocks = ocfs2_inode_sector_count(inode);
276 ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
277 inode->i_mapping->a_ops = &ocfs2_aops; 258 inode->i_mapping->a_ops = &ocfs2_aops;
278 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); 259 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
279 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); 260 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
@@ -286,11 +267,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
286 mlog(ML_ERROR, 267 mlog(ML_ERROR,
287 "ip_blkno %llu != i_blkno %llu!\n", 268 "ip_blkno %llu != i_blkno %llu!\n",
288 (unsigned long long)OCFS2_I(inode)->ip_blkno, 269 (unsigned long long)OCFS2_I(inode)->ip_blkno,
289 (unsigned long long)fe->i_blkno); 270 (unsigned long long)le64_to_cpu(fe->i_blkno));
290
291 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
292 OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
293 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
294 271
295 inode->i_nlink = le16_to_cpu(fe->i_links_count); 272 inode->i_nlink = le16_to_cpu(fe->i_links_count);
296 273
@@ -343,10 +320,13 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
343 * the generation argument to 320 * the generation argument to
344 * ocfs2_inode_lock_res_init() will have to change. 321 * ocfs2_inode_lock_res_init() will have to change.
345 */ 322 */
346 BUG_ON(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)); 323 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
347 324
348 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 325 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
349 OCFS2_LOCK_TYPE_META, 0, inode); 326 OCFS2_LOCK_TYPE_META, 0, inode);
327
328 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
329 OCFS2_LOCK_TYPE_OPEN, 0, inode);
350 } 330 }
351 331
352 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, 332 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
@@ -421,7 +401,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
421 * cluster lock before trusting anything anyway. 401 * cluster lock before trusting anything anyway.
422 */ 402 */
423 can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) 403 can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
424 && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK) 404 && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
425 && !ocfs2_mount_local(osb); 405 && !ocfs2_mount_local(osb);
426 406
427 /* 407 /*
@@ -438,7 +418,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
438 OCFS2_LOCK_TYPE_META, 418 OCFS2_LOCK_TYPE_META,
439 generation, inode); 419 generation, inode);
440 420
421 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
422 OCFS2_LOCK_TYPE_OPEN,
423 0, inode);
424
441 if (can_lock) { 425 if (can_lock) {
426 status = ocfs2_open_lock(inode);
427 if (status) {
428 make_bad_inode(inode);
429 mlog_errno(status);
430 return status;
431 }
442 status = ocfs2_meta_lock(inode, NULL, 0); 432 status = ocfs2_meta_lock(inode, NULL, 0);
443 if (status) { 433 if (status) {
444 make_bad_inode(inode); 434 make_bad_inode(inode);
@@ -447,6 +437,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
447 } 437 }
448 } 438 }
449 439
440 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
441 status = ocfs2_try_open_lock(inode, 0);
442 if (status) {
443 make_bad_inode(inode);
444 return status;
445 }
446 }
447
450 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, 448 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
451 can_lock ? inode : NULL); 449 can_lock ? inode : NULL);
452 if (status < 0) { 450 if (status < 0) {
@@ -458,7 +456,8 @@ static int ocfs2_read_locked_inode(struct inode *inode,
458 fe = (struct ocfs2_dinode *) bh->b_data; 456 fe = (struct ocfs2_dinode *) bh->b_data;
459 if (!OCFS2_IS_VALID_DINODE(fe)) { 457 if (!OCFS2_IS_VALID_DINODE(fe)) {
460 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", 458 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
461 (unsigned long long)fe->i_blkno, 7, fe->i_signature); 459 (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
460 fe->i_signature);
462 goto bail; 461 goto bail;
463 } 462 }
464 463
@@ -507,50 +506,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
507 struct buffer_head *fe_bh) 506 struct buffer_head *fe_bh)
508{ 507{
509 int status = 0; 508 int status = 0;
510 handle_t *handle = NULL;
511 struct ocfs2_truncate_context *tc = NULL; 509 struct ocfs2_truncate_context *tc = NULL;
512 struct ocfs2_dinode *fe; 510 struct ocfs2_dinode *fe;
511 handle_t *handle = NULL;
513 512
514 mlog_entry_void(); 513 mlog_entry_void();
515 514
516 fe = (struct ocfs2_dinode *) fe_bh->b_data; 515 fe = (struct ocfs2_dinode *) fe_bh->b_data;
517 516
518 /* zero allocation, zero truncate :) */ 517 if (fe->i_clusters) {
519 if (!fe->i_clusters) 518 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
520 goto bail; 519 if (IS_ERR(handle)) {
520 status = PTR_ERR(handle);
521 mlog_errno(status);
522 goto out;
523 }
521 524
522 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 525 status = ocfs2_journal_access(handle, inode, fe_bh,
523 if (IS_ERR(handle)) { 526 OCFS2_JOURNAL_ACCESS_WRITE);
524 status = PTR_ERR(handle); 527 if (status < 0) {
525 handle = NULL; 528 mlog_errno(status);
526 mlog_errno(status); 529 goto out;
527 goto bail; 530 }
528 }
529 531
530 status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); 532 i_size_write(inode, 0);
531 if (status < 0) {
532 mlog_errno(status);
533 goto bail;
534 }
535 533
536 ocfs2_commit_trans(osb, handle); 534 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
537 handle = NULL; 535 if (status < 0) {
536 mlog_errno(status);
537 goto out;
538 }
538 539
539 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); 540 ocfs2_commit_trans(osb, handle);
540 if (status < 0) { 541 handle = NULL;
541 mlog_errno(status);
542 goto bail;
543 }
544 542
545 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); 543 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
546 if (status < 0) { 544 if (status < 0) {
547 mlog_errno(status); 545 mlog_errno(status);
548 goto bail; 546 goto out;
547 }
548
549 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
550 if (status < 0) {
551 mlog_errno(status);
552 goto out;
553 }
549 } 554 }
550bail: 555
556out:
551 if (handle) 557 if (handle)
552 ocfs2_commit_trans(osb, handle); 558 ocfs2_commit_trans(osb, handle);
553
554 mlog_exit(status); 559 mlog_exit(status);
555 return status; 560 return status;
556} 561}
@@ -678,10 +683,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
678 struct inode *orphan_dir_inode = NULL; 683 struct inode *orphan_dir_inode = NULL;
679 struct buffer_head *orphan_dir_bh = NULL; 684 struct buffer_head *orphan_dir_bh = NULL;
680 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 685 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
686 struct ocfs2_dinode *di;
681 687
682 /* We've already voted on this so it should be readonly - no 688 di = (struct ocfs2_dinode *) di_bh->b_data;
683 * spinlock needed. */ 689 orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
684 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
685 690
686 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); 691 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
687 if (status) 692 if (status)
@@ -827,8 +832,8 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
827 "Inode %llu (on-disk %llu) not orphaned! " 832 "Inode %llu (on-disk %llu) not orphaned! "
828 "Disk flags 0x%x, inode flags 0x%x\n", 833 "Disk flags 0x%x, inode flags 0x%x\n",
829 (unsigned long long)oi->ip_blkno, 834 (unsigned long long)oi->ip_blkno,
830 (unsigned long long)di->i_blkno, di->i_flags, 835 (unsigned long long)le64_to_cpu(di->i_blkno),
831 oi->ip_flags); 836 le32_to_cpu(di->i_flags), oi->ip_flags);
832 goto bail; 837 goto bail;
833 } 838 }
834 839
@@ -839,11 +844,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
839 goto bail; 844 goto bail;
840 } 845 }
841 846
842 status = ocfs2_request_delete_vote(inode); 847 /*
843 /* -EBUSY means that other nodes are still using the 848 * This is how ocfs2 determines whether an inode is still live
844 * inode. We're done here though, so avoid doing anything on 849 * within the cluster. Every node takes a shared read lock on
845 * disk and let them worry about deleting it. */ 850 * the inode open lock in ocfs2_read_locked_inode(). When we
846 if (status == -EBUSY) { 851 * get to ->delete_inode(), each node tries to convert its
852 * lock to an exclusive. Trylocks are serialized by the inode
853 * meta data lock. If the upconvert succeeds, we know the inode
854 * is no longer live and can be deleted.
855 *
856 * Though we call this with the meta data lock held, the
857 * trylock keeps us from ABBA deadlock.
858 */
859 status = ocfs2_try_open_lock(inode, 1);
860 if (status == -EAGAIN) {
847 status = 0; 861 status = 0;
848 mlog(0, "Skipping delete of %llu because it is in use on" 862 mlog(0, "Skipping delete of %llu because it is in use on"
849 "other nodes\n", (unsigned long long)oi->ip_blkno); 863 "other nodes\n", (unsigned long long)oi->ip_blkno);
@@ -854,21 +868,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
854 goto bail; 868 goto bail;
855 } 869 }
856 870
857 spin_lock(&oi->ip_lock); 871 *wipe = 1;
858 if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { 872 mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
859 /* Nobody knew which slot this inode was orphaned 873 (unsigned long long)oi->ip_blkno,
860 * into. This may happen during node death and 874 le16_to_cpu(di->i_orphaned_slot));
861 * recovery knows how to clean it up so we can safely
862 * ignore this inode for now on. */
863 mlog(0, "Nobody knew where inode %llu was orphaned!\n",
864 (unsigned long long)oi->ip_blkno);
865 } else {
866 *wipe = 1;
867
868 mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
869 (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
870 }
871 spin_unlock(&oi->ip_lock);
872 875
873bail: 876bail:
874 return status; 877 return status;
@@ -1001,11 +1004,16 @@ void ocfs2_clear_inode(struct inode *inode)
1001 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1004 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1002 "Inode=%lu\n", inode->i_ino); 1005 "Inode=%lu\n", inode->i_ino);
1003 1006
1007 /* With the delete_inode vote removed, we hold the open lock from
1008 * iget; now it is time to drop the PR and EX open locks. */
1009 ocfs2_open_unlock(inode);
1010
1004 /* Do these before all the other work so that we don't bounce 1011 /* Do these before all the other work so that we don't bounce
1005 * the vote thread while waiting to destroy the locks. */ 1012 * the vote thread while waiting to destroy the locks. */
1006 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); 1013 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
1007 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); 1014 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
1008 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); 1015 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
1016 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1009 1017
1010 /* We very well may get a clear_inode before all of an inode's 1018
1011 * metadata has hit disk. Of course, we can't drop any cluster 1019 * metadata has hit disk. Of course, we can't drop any cluster
@@ -1020,8 +1028,7 @@ void ocfs2_clear_inode(struct inode *inode)
1020 "Clear inode of %llu, inode has io markers\n", 1028 "Clear inode of %llu, inode has io markers\n",
1021 (unsigned long long)oi->ip_blkno); 1029 (unsigned long long)oi->ip_blkno);
1022 1030
1023 ocfs2_extent_map_drop(inode, 0); 1031 ocfs2_extent_map_trunc(inode, 0);
1024 ocfs2_extent_map_init(inode);
1025 1032
1026 status = ocfs2_drop_inode_locks(inode); 1033 status = ocfs2_drop_inode_locks(inode);
1027 if (status < 0) 1034 if (status < 0)
@@ -1030,6 +1037,7 @@ void ocfs2_clear_inode(struct inode *inode)
1030 ocfs2_lock_res_free(&oi->ip_rw_lockres); 1037 ocfs2_lock_res_free(&oi->ip_rw_lockres);
1031 ocfs2_lock_res_free(&oi->ip_meta_lockres); 1038 ocfs2_lock_res_free(&oi->ip_meta_lockres);
1032 ocfs2_lock_res_free(&oi->ip_data_lockres); 1039 ocfs2_lock_res_free(&oi->ip_data_lockres);
1040 ocfs2_lock_res_free(&oi->ip_open_lockres);
1033 1041
1034 ocfs2_metadata_cache_purge(inode); 1042 ocfs2_metadata_cache_purge(inode);
1035 1043
@@ -1086,9 +1094,6 @@ void ocfs2_drop_inode(struct inode *inode)
1086 mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", 1094 mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
1087 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); 1095 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
1088 1096
1089 /* Testing ip_orphaned_slot here wouldn't work because we may
1090 * not have gotten a delete_inode vote from any other nodes
1091 * yet. */
1092 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) 1097 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
1093 generic_delete_inode(inode); 1098 generic_delete_inode(inode);
1094 else 1099 else
@@ -1121,8 +1126,10 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
1121 return NULL; 1126 return NULL;
1122 } 1127 }
1123 1128
1124 tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, 1129 down_read(&OCFS2_I(inode)->ip_alloc_sem);
1125 &p_blkno, NULL); 1130 tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
1131 NULL);
1132 up_read(&OCFS2_I(inode)->ip_alloc_sem);
1126 if (tmperr < 0) { 1133 if (tmperr < 0) {
1127 mlog_errno(tmperr); 1134 mlog_errno(tmperr);
1128 goto fail; 1135 goto fail;
@@ -1212,6 +1219,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1212 1219
1213 spin_lock(&OCFS2_I(inode)->ip_lock); 1220 spin_lock(&OCFS2_I(inode)->ip_lock);
1214 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); 1221 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1222 ocfs2_get_inode_flags(OCFS2_I(inode));
1215 fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr); 1223 fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr);
1216 spin_unlock(&OCFS2_I(inode)->ip_lock); 1224 spin_unlock(&OCFS2_I(inode)->ip_lock);
1217 1225
@@ -1259,7 +1267,7 @@ void ocfs2_refresh_inode(struct inode *inode,
1259 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) 1267 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
1260 inode->i_blocks = 0; 1268 inode->i_blocks = 0;
1261 else 1269 else
1262 inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); 1270 inode->i_blocks = ocfs2_inode_sector_count(inode);
1263 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); 1271 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
1264 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); 1272 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
1265 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); 1273 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1a7dd2945b34..a41d0817121b 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -26,6 +26,8 @@
26#ifndef OCFS2_INODE_H 26#ifndef OCFS2_INODE_H
27#define OCFS2_INODE_H 27#define OCFS2_INODE_H
28 28
29#include "extent_map.h"
30
29/* OCFS2 Inode Private Data */ 31/* OCFS2 Inode Private Data */
30struct ocfs2_inode_info 32struct ocfs2_inode_info
31{ 33{
@@ -34,6 +36,7 @@ struct ocfs2_inode_info
34 struct ocfs2_lock_res ip_rw_lockres; 36 struct ocfs2_lock_res ip_rw_lockres;
35 struct ocfs2_lock_res ip_meta_lockres; 37 struct ocfs2_lock_res ip_meta_lockres;
36 struct ocfs2_lock_res ip_data_lockres; 38 struct ocfs2_lock_res ip_data_lockres;
39 struct ocfs2_lock_res ip_open_lockres;
37 40
38 /* protects allocation changes on this inode. */ 41 /* protects allocation changes on this inode. */
39 struct rw_semaphore ip_alloc_sem; 42 struct rw_semaphore ip_alloc_sem;
@@ -42,9 +45,7 @@ struct ocfs2_inode_info
42 spinlock_t ip_lock; 45 spinlock_t ip_lock;
43 u32 ip_open_count; 46 u32 ip_open_count;
44 u32 ip_clusters; 47 u32 ip_clusters;
45 struct ocfs2_extent_map ip_map;
46 struct list_head ip_io_markers; 48 struct list_head ip_io_markers;
47 int ip_orphaned_slot;
48 49
49 struct mutex ip_io_mutex; 50 struct mutex ip_io_mutex;
50 51
@@ -64,6 +65,8 @@ struct ocfs2_inode_info
64 65
65 struct ocfs2_caching_info ip_metadata_cache; 66 struct ocfs2_caching_info ip_metadata_cache;
66 67
68 struct ocfs2_extent_map ip_extent_map;
69
67 struct inode vfs_inode; 70 struct inode vfs_inode;
68}; 71};
69 72
@@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode);
117void ocfs2_drop_inode(struct inode *inode); 120void ocfs2_drop_inode(struct inode *inode);
118 121
119/* Flags for ocfs2_iget() */ 122/* Flags for ocfs2_iget() */
120#define OCFS2_FI_FLAG_NOWAIT 0x1 123#define OCFS2_FI_FLAG_SYSFILE 0x4
121#define OCFS2_FI_FLAG_DELETE 0x2 124#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8
122#define OCFS2_FI_FLAG_SYSFILE 0x4
123#define OCFS2_FI_FLAG_NOLOCK 0x8
124struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); 125struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
125struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
126 u64 blkno,
127 int delete_vote);
128int ocfs2_inode_init_private(struct inode *inode); 126int ocfs2_inode_init_private(struct inode *inode);
129int ocfs2_inode_revalidate(struct dentry *dentry); 127int ocfs2_inode_revalidate(struct dentry *dentry);
130int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 128int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
@@ -143,5 +141,13 @@ int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
143int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); 141int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
144 142
145void ocfs2_set_inode_flags(struct inode *inode); 143void ocfs2_set_inode_flags(struct inode *inode);
144void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
145
146static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
147{
148 int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
149
150 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
151}
146 152
147#endif /* OCFS2_INODE_H */ 153#endif /* OCFS2_INODE_H */
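For concreteness, a worked instance of the new sector-count helper under an assumed 4 KB cluster size: s_clustersize_bits = 12, so c_to_s_bits = 12 - 9 = 3, and an inode with ip_clusters = 10 reports 10 << 3 = 80 512-byte sectors (40 KB):

	/* assumed: 4 KB clusters  =>  s_clustersize_bits == 12      */
	/* c_to_s_bits == 12 - 9 == 3                                */
	/* ip_clusters == 10  =>  i_blocks == 10 << 3 == 80 sectors  */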
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 4768be5f3086..f3ad21ad9aed 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -31,6 +31,7 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
31 mlog_errno(status); 31 mlog_errno(status);
32 return status; 32 return status;
33 } 33 }
34 ocfs2_get_inode_flags(OCFS2_I(inode));
34 *flags = OCFS2_I(inode)->ip_attr; 35 *flags = OCFS2_I(inode)->ip_attr;
35 ocfs2_meta_unlock(inode, 0); 36 ocfs2_meta_unlock(inode, 0);
36 37
@@ -134,3 +135,26 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
134 } 135 }
135} 136}
136 137
138#ifdef CONFIG_COMPAT
139long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
140{
141 struct inode *inode = file->f_path.dentry->d_inode;
142 int ret;
143
144 switch (cmd) {
145 case OCFS2_IOC32_GETFLAGS:
146 cmd = OCFS2_IOC_GETFLAGS;
147 break;
148 case OCFS2_IOC32_SETFLAGS:
149 cmd = OCFS2_IOC_SETFLAGS;
150 break;
151 default:
152 return -ENOIOCTLCMD;
153 }
154
155 lock_kernel();
156 ret = ocfs2_ioctl(inode, file, cmd, arg);
157 unlock_kernel();
158 return ret;
159}
160#endif
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 4a7c82931dba..4d6c4f430d0d 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -12,5 +12,6 @@
12 12
13int ocfs2_ioctl(struct inode * inode, struct file * filp, 13int ocfs2_ioctl(struct inode * inode, struct file * filp,
14 unsigned int cmd, unsigned long arg); 14 unsigned int cmd, unsigned long arg);
15long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
15 16
16#endif /* OCFS2_IOCTL_H */ 17#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 825cb0ae1b4c..dc1188081720 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -435,7 +435,8 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
435 * handle the errors in a specific manner, so no need 435 * handle the errors in a specific manner, so no need
436 * to call ocfs2_error() here. */ 436 * to call ocfs2_error() here. */
437 mlog(ML_ERROR, "Journal dinode %llu has invalid " 437 mlog(ML_ERROR, "Journal dinode %llu has invalid "
438 "signature: %.*s", (unsigned long long)fe->i_blkno, 7, 438 "signature: %.*s",
439 (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
439 fe->i_signature); 440 fe->i_signature);
440 status = -EIO; 441 status = -EIO;
441 goto out; 442 goto out;
@@ -649,29 +650,20 @@ bail:
649static int ocfs2_force_read_journal(struct inode *inode) 650static int ocfs2_force_read_journal(struct inode *inode)
650{ 651{
651 int status = 0; 652 int status = 0;
652 int i, p_blocks; 653 int i;
653 u64 v_blkno, p_blkno; 654 u64 v_blkno, p_blkno, p_blocks, num_blocks;
654#define CONCURRENT_JOURNAL_FILL 32 655#define CONCURRENT_JOURNAL_FILL 32ULL
655 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; 656 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
656 657
657 mlog_entry_void(); 658 mlog_entry_void();
658 659
659 BUG_ON(inode->i_blocks !=
660 ocfs2_align_bytes_to_sectors(i_size_read(inode)));
661
662 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); 660 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
663 661
664 mlog(0, "Force reading %llu blocks\n", 662 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
665 (unsigned long long)(inode->i_blocks >>
666 (inode->i_sb->s_blocksize_bits - 9)));
667
668 v_blkno = 0; 663 v_blkno = 0;
669 while (v_blkno < 664 while (v_blkno < num_blocks) {
670 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
671
672 status = ocfs2_extent_map_get_blocks(inode, v_blkno, 665 status = ocfs2_extent_map_get_blocks(inode, v_blkno,
673 1, &p_blkno, 666 &p_blkno, &p_blocks, NULL);
674 &p_blocks);
675 if (status < 0) { 667 if (status < 0) {
676 mlog_errno(status); 668 mlog_errno(status);
677 goto bail; 669 goto bail;
@@ -751,7 +743,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
751 la_dinode = item->lri_la_dinode; 743 la_dinode = item->lri_la_dinode;
752 if (la_dinode) { 744 if (la_dinode) {
753 mlog(0, "Clean up local alloc %llu\n", 745 mlog(0, "Clean up local alloc %llu\n",
754 (unsigned long long)la_dinode->i_blkno); 746 (unsigned long long)le64_to_cpu(la_dinode->i_blkno));
755 747
756 ret = ocfs2_complete_local_alloc_recovery(osb, 748 ret = ocfs2_complete_local_alloc_recovery(osb,
757 la_dinode); 749 la_dinode);
@@ -764,7 +756,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
764 tl_dinode = item->lri_tl_dinode; 756 tl_dinode = item->lri_tl_dinode;
765 if (tl_dinode) { 757 if (tl_dinode) {
766 mlog(0, "Clean up truncate log %llu\n", 758 mlog(0, "Clean up truncate log %llu\n",
767 (unsigned long long)tl_dinode->i_blkno); 759 (unsigned long long)le64_to_cpu(tl_dinode->i_blkno));
768 760
769 ret = ocfs2_complete_truncate_log_recovery(osb, 761 ret = ocfs2_complete_truncate_log_recovery(osb,
770 tl_dinode); 762 tl_dinode);
@@ -1306,7 +1298,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1306 continue; 1298 continue;
1307 1299
1308 iter = ocfs2_iget(osb, le64_to_cpu(de->inode), 1300 iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
1309 OCFS2_FI_FLAG_NOLOCK); 1301 OCFS2_FI_FLAG_ORPHAN_RECOVERY);
1310 if (IS_ERR(iter)) 1302 if (IS_ERR(iter))
1311 continue; 1303 continue;
1312 1304
@@ -1418,7 +1410,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1418 /* Set the proper information to get us going into 1410 /* Set the proper information to get us going into
1419 * ocfs2_delete_inode. */ 1411 * ocfs2_delete_inode. */
1420 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; 1412 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
1421 oi->ip_orphaned_slot = slot;
1422 spin_unlock(&oi->ip_lock); 1413 spin_unlock(&oi->ip_lock);
1423 1414
1424 iput(inode); 1415 iput(inode);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d026b4f27757..3db5de4506da 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
390 /* We may be deleting metadata blocks, so metadata alloc dinode + 390 /* We may be deleting metadata blocks, so metadata alloc dinode +
391 one desc. block for each possible delete. */ 391 one desc. block for each possible delete. */
392 if (tree_depth && next_free == 1 && 392 if (tree_depth && next_free == 1 &&
393 le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del) 393 ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
394 credits += 1 + tree_depth; 394 credits += 1 + tree_depth;
395 395
396 /* update to the truncate log. */ 396 /* update to the truncate log. */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 51b020447683..af01158b39f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
85 int ret = 0, lock_level = 0; 85 int ret = 0, lock_level = 0;
86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); 86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
87 87
88 /* We don't want to support shared writable mappings yet. */ 88 /*
89 if (!ocfs2_mount_local(osb) && 89 * Only support shared writeable mmap for local mounts which
90 * don't know about holes.
91 */
92 if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
90 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && 93 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
91 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { 94 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
92 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); 95 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 28dd757ff67d..36289e6295ce 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
175 175
176 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 176 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
177 if (IS_ERR(inode)) { 177 if (IS_ERR(inode)) {
178 mlog(ML_ERROR, "Unable to create inode %llu\n",
179 (unsigned long long)blkno);
180 ret = ERR_PTR(-EACCES); 178 ret = ERR_PTR(-EACCES);
181 goto bail_unlock; 179 goto bail_unlock;
182 } 180 }
@@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
189 * unlink. */ 187 * unlink. */
190 spin_lock(&oi->ip_lock); 188 spin_lock(&oi->ip_lock);
191 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; 189 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
192 oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
193 spin_unlock(&oi->ip_lock); 190 spin_unlock(&oi->ip_lock);
194 191
195bail_add: 192bail_add:
@@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
288 285
289 i_size_write(inode, inode->i_sb->s_blocksize); 286 i_size_write(inode, inode->i_sb->s_blocksize);
290 inode->i_nlink = 2; 287 inode->i_nlink = 2;
291 inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); 288 inode->i_blocks = ocfs2_inode_sector_count(inode);
292 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 289 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
293 if (status < 0) { 290 if (status < 0) {
294 mlog_errno(status); 291 mlog_errno(status);
@@ -581,8 +578,9 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
581 if (ocfs2_populate_inode(inode, fe, 1) < 0) { 578 if (ocfs2_populate_inode(inode, fe, 1) < 0) {
582 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, " 579 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
583 "i_blkno=%llu, i_ino=%lu\n", 580 "i_blkno=%llu, i_ino=%lu\n",
584 (unsigned long long) (*new_fe_bh)->b_blocknr, 581 (unsigned long long)(*new_fe_bh)->b_blocknr,
585 (unsigned long long)fe->i_blkno, inode->i_ino); 582 (unsigned long long)le64_to_cpu(fe->i_blkno),
583 inode->i_ino);
586 BUG(); 584 BUG();
587 } 585 }
588 586
@@ -1486,8 +1484,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1486 struct buffer_head **bhs = NULL; 1484 struct buffer_head **bhs = NULL;
1487 const char *c; 1485 const char *c;
1488 struct super_block *sb = osb->sb; 1486 struct super_block *sb = osb->sb;
1489 u64 p_blkno; 1487 u64 p_blkno, p_blocks;
1490 int p_blocks;
1491 int virtual, blocks, status, i, bytes_left; 1488 int virtual, blocks, status, i, bytes_left;
1492 1489
1493 bytes_left = i_size_read(inode) + 1; 1490 bytes_left = i_size_read(inode) + 1;
@@ -1514,8 +1511,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1514 goto bail; 1511 goto bail;
1515 } 1512 }
1516 1513
1517 status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, 1514 status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
1518 &p_blocks); 1515 NULL);
1519 if (status < 0) { 1516 if (status < 0) {
1520 mlog_errno(status); 1517 mlog_errno(status);
1521 goto bail; 1518 goto bail;
@@ -1674,8 +1671,11 @@ static int ocfs2_symlink(struct inode *dir,
1674 inode->i_rdev = 0; 1671 inode->i_rdev = 0;
1675 newsize = l - 1; 1672 newsize = l - 1;
1676 if (l > ocfs2_fast_symlink_chars(sb)) { 1673 if (l > ocfs2_fast_symlink_chars(sb)) {
1674 u32 offset = 0;
1675
1677 inode->i_op = &ocfs2_symlink_inode_operations; 1676 inode->i_op = &ocfs2_symlink_inode_operations;
1678 status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, 1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
1678 new_fe_bh,
1679 handle, data_ac, NULL, 1679 handle, data_ac, NULL,
1680 NULL); 1680 NULL);
1681 if (status < 0) { 1681 if (status < 0) {
@@ -1689,7 +1689,7 @@ static int ocfs2_symlink(struct inode *dir,
1689 goto bail; 1689 goto bail;
1690 } 1690 }
1691 i_size_write(inode, newsize); 1691 i_size_write(inode, newsize);
1692 inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); 1692 inode->i_blocks = ocfs2_inode_sector_count(inode);
1693 } else { 1693 } else {
1694 inode->i_op = &ocfs2_fast_symlink_inode_operations; 1694 inode->i_op = &ocfs2_fast_symlink_inode_operations;
1695 memcpy((char *) fe->id2.i_symlink, symname, l); 1695 memcpy((char *) fe->id2.i_symlink, symname, l);
@@ -2222,9 +2222,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2222 /* Record which orphan dir our inode now resides 2222 /* Record which orphan dir our inode now resides
2223 * in. delete_inode will use this to determine which orphan 2223 * in. delete_inode will use this to determine which orphan
2224 * dir to lock. */ 2224 * dir to lock. */
2225 spin_lock(&OCFS2_I(inode)->ip_lock); 2225 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
2226 OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
2227 spin_unlock(&OCFS2_I(inode)->ip_lock);
2228 2226
2229 mlog(0, "Inode %llu orphaned in slot %d\n", 2227 mlog(0, "Inode %llu orphaned in slot %d\n",
2230 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 2228 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index db8e77cd35d3..a860633e833f 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -46,11 +46,6 @@
46#include "endian.h" 46#include "endian.h"
47#include "ocfs2_lockid.h" 47#include "ocfs2_lockid.h"
48 48
49struct ocfs2_extent_map {
50 u32 em_clusters;
51 struct rb_root em_extents;
52};
53
54/* Most user visible OCFS2 inodes will have very few pieces of 49/* Most user visible OCFS2 inodes will have very few pieces of
55 * metadata, but larger files (including bitmaps, etc) must be taken 50 * metadata, but larger files (including bitmaps, etc) must be taken
56 * into account when designing an access scheme. We allow a small 51 * into account when designing an access scheme. We allow a small
@@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode)
303 return 1; 298 return 1;
304} 299}
305 300
301static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
302{
303 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
304 return 1;
305 return 0;
306}
307
306/* set / clear functions because cluster events can make these happen 308/* set / clear functions because cluster events can make these happen
307 * in parallel so we want the transitions to be atomic. this also 309 * in parallel so we want the transitions to be atomic. this also
308 * means that any future flags osb_flags must be protected by spinlock 310 * means that any future flags osb_flags must be protected by spinlock
@@ -361,9 +363,9 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
361 typeof(__di) ____di = (__di); \ 363 typeof(__di) ____di = (__di); \
362 ocfs2_error((__sb), \ 364 ocfs2_error((__sb), \
363 "Dinode # %llu has bad signature %.*s", \ 365 "Dinode # %llu has bad signature %.*s", \
364 (unsigned long long)(____di)->i_blkno, 7, \ 366 (unsigned long long)le64_to_cpu((____di)->i_blkno), 7, \
365 (____di)->i_signature); \ 367 (____di)->i_signature); \
366} while (0); 368} while (0)
367 369
368#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ 370#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
369 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) 371 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
@@ -372,9 +374,9 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
372 typeof(__eb) ____eb = (__eb); \ 374 typeof(__eb) ____eb = (__eb); \
373 ocfs2_error((__sb), \ 375 ocfs2_error((__sb), \
374 "Extent Block # %llu has bad signature %.*s", \ 376 "Extent Block # %llu has bad signature %.*s", \
375 (unsigned long long)(____eb)->h_blkno, 7, \ 377 (unsigned long long)le64_to_cpu((____eb)->h_blkno), 7, \
376 (____eb)->h_signature); \ 378 (____eb)->h_signature); \
377} while (0); 379} while (0)
378 380
379#define OCFS2_IS_VALID_GROUP_DESC(ptr) \ 381#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
380 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) 382 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
@@ -383,9 +385,9 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
383 typeof(__gd) ____gd = (__gd); \ 385 typeof(__gd) ____gd = (__gd); \
384 ocfs2_error((__sb), \ 386 ocfs2_error((__sb), \
385 "Group Descriptor # %llu has bad signature %.*s", \ 387 "Group Descriptor # %llu has bad signature %.*s", \
386 (unsigned long long)(____gd)->bg_blkno, 7, \ 388 (unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
387 (____gd)->bg_signature); \ 389 (____gd)->bg_signature); \
388} while (0); 390} while (0)
389 391
390static inline unsigned long ino_from_blkno(struct super_block *sb, 392static inline unsigned long ino_from_blkno(struct super_block *sb,
391 u64 blkno) 393 u64 blkno)
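
The three signature-validation macros above drop the stray semicolon that used to follow } while (0). With it, each macro expanded to two statements, so using the macro as the body of an if followed by an else would not compile. A small demonstration of the idiom:

#include <stdio.h>

/* Broken form: the trailing semicolon makes the expansion two
 * statements, so "if (x) BAD_LOG(...); else ..." fails to parse. */
#define BAD_LOG(msg)  do { puts(msg); } while (0);

/* Fixed form, matching the ocfs2.h change: the caller supplies the
 * semicolon and the macro behaves as a single statement. */
#define GOOD_LOG(msg) do { puts(msg); } while (0)

int main(void)
{
        int bad_sig = 1;

        if (bad_sig)
                GOOD_LOG("bad signature");      /* works with or without else */
        else
                GOOD_LOG("signature ok");

        BAD_LOG("standalone use still works")   /* note: no semicolon here */
        return 0;
}
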
@@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
461 return (unsigned long)((bytes + 511) >> 9); 463 return (unsigned long)((bytes + 511) >> 9);
462} 464}
463 465
466static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
467 unsigned long pg_index)
468{
469 u32 clusters = pg_index;
470 unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
471
472 if (unlikely(PAGE_CACHE_SHIFT > cbits))
473 clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
474 else if (PAGE_CACHE_SHIFT < cbits)
475 clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
476
477 return clusters;
478}
479
480/*
481 * Find the 1st page index which covers the given clusters.
482 */
483static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb,
484 u32 clusters)
485{
486 unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
487 unsigned long index = clusters;
488
489 if (PAGE_CACHE_SHIFT > cbits) {
490 index = clusters >> (PAGE_CACHE_SHIFT - cbits);
491 } else if (PAGE_CACHE_SHIFT < cbits) {
492 index = clusters << (cbits - PAGE_CACHE_SHIFT);
493 }
494
495 return index;
496}
497
498static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
499{
500 unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
501 unsigned int pages_per_cluster = 1;
502
503 if (PAGE_CACHE_SHIFT < cbits)
504 pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
505
506 return pages_per_cluster;
507}
508
464#define ocfs2_set_bit ext2_set_bit 509#define ocfs2_set_bit ext2_set_bit
465#define ocfs2_clear_bit ext2_clear_bit 510#define ocfs2_clear_bit ext2_clear_bit
466#define ocfs2_test_bit ext2_test_bit 511#define ocfs2_test_bit ext2_test_bit
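
The three new inline helpers convert between page indexes and cluster numbers purely by comparing PAGE_CACHE_SHIFT with the per-volume cluster shift, which the superblock check later in this patch constrains to 12..20. The arithmetic checks out in a userspace sketch; the shift values here are illustrative:

#include <stdio.h>

#define PAGE_SHIFT 12u  /* 4K pages, typical for x86 */

/* The cluster containing the page at pg_index, mirroring the math in
 * ocfs2_page_index_to_clusters with sample shift values. */
static unsigned int page_index_to_clusters(unsigned int cbits,
                                           unsigned long pg_index)
{
        if (PAGE_SHIFT > cbits)
                return pg_index << (PAGE_SHIFT - cbits);
        if (PAGE_SHIFT < cbits)
                return pg_index >> (cbits - PAGE_SHIFT);
        return pg_index;
}

/* First page index covering a given cluster, mirroring
 * ocfs2_align_clusters_to_page_index. */
static unsigned long clusters_to_page_index(unsigned int cbits,
                                            unsigned int clusters)
{
        if (PAGE_SHIFT > cbits)
                return (unsigned long)clusters >> (PAGE_SHIFT - cbits);
        if (PAGE_SHIFT < cbits)
                return (unsigned long)clusters << (cbits - PAGE_SHIFT);
        return clusters;
}

int main(void)
{
        /* 4K pages with 64K clusters (cbits = 16): 16 pages per cluster,
         * so cluster 3 starts at page index 48 and page 48 maps back. */
        printf("page 48 -> cluster %u\n", page_index_to_clusters(16, 48));
        printf("cluster 3 -> page %lu\n", clusters_to_page_index(16, 3));
        return 0;
}
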
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e61e218f5e0b..f0d9eb08547a 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -86,7 +86,8 @@
86 OCFS2_SB(sb)->s_feature_incompat &= ~(mask) 86 OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
87 87
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
90#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 91#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
91 92
92/* 93/*
@@ -155,10 +156,18 @@
155#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ 156#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */
156 157
157/* 158/*
159 * Extent record flags (e_node.leaf.flags)
160 */
161#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
162 * unwritten */
163
164/*
158 * ioctl commands 165 * ioctl commands
159 */ 166 */
160#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) 167#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
161#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long) 168#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
169#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
170#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
162 171
163/* 172/*
164 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 173 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
@@ -282,10 +291,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
282/* 291/*
283 * On disk extent record for OCFS2 292 * On disk extent record for OCFS2
284 * It describes a range of clusters on disk. 293 * It describes a range of clusters on disk.
294 *
295 * Length fields are divided into interior and leaf node versions.
296 * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
285 */ 297 */
286struct ocfs2_extent_rec { 298struct ocfs2_extent_rec {
287/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ 299/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
288 __le32 e_clusters; /* Clusters covered by this extent */ 300 union {
301 __le32 e_int_clusters; /* Clusters covered by all children */
302 struct {
303 __le16 e_leaf_clusters; /* Clusters covered by this
304 extent */
305 __u8 e_reserved1;
306 __u8 e_flags; /* Extent flags */
307 };
308 };
289 __le64 e_blkno; /* Physical disk offset, in blocks */ 309 __le64 e_blkno; /* Physical disk offset, in blocks */
290/*10*/ 310/*10*/
291}; 311};
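
Interior nodes keep a 32-bit cluster count while leaf records now split that word into a 16-bit count, a reserved byte, and the new flags byte. This is why the journal.h hunk earlier reads ocfs2_rec_clusters(last_el, ...) instead of le32_to_cpu(e_clusters): the right field depends on the tree depth. The helper's body is not part of this diff, so the sketch below is a guess at its shape, in host-endian userspace form with C11 anonymous structs standing in for the kernel's:

#include <stdint.h>
#include <stdio.h>

#define OCFS2_EXT_UNWRITTEN 0x01

/* Simplified host-endian copy of the new on-disk layout. */
struct extent_rec {
        uint32_t e_cpos;
        union {
                uint32_t e_int_clusters;        /* interior nodes */
                struct {
                        uint16_t e_leaf_clusters;       /* leaves */
                        uint8_t  e_reserved1;
                        uint8_t  e_flags;
                };
        };
        uint64_t e_blkno;
};

/* Pick the cluster count appropriate to the node's depth. */
static uint32_t rec_clusters(uint16_t tree_depth, const struct extent_rec *rec)
{
        return tree_depth ? rec->e_int_clusters : rec->e_leaf_clusters;
}

int main(void)
{
        struct extent_rec leaf = { .e_cpos = 0, .e_blkno = 100 };

        leaf.e_leaf_clusters = 8;
        leaf.e_flags = OCFS2_EXT_UNWRITTEN;

        printf("leaf clusters: %u, unwritten: %d\n",
               rec_clusters(0, &leaf),
               !!(leaf.e_flags & OCFS2_EXT_UNWRITTEN));
        return 0;
}
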
@@ -311,7 +331,10 @@ struct ocfs2_extent_list {
311/*00*/ __le16 l_tree_depth; /* Extent tree depth from this 331/*00*/ __le16 l_tree_depth; /* Extent tree depth from this
312 point. 0 means data extents 332 point. 0 means data extents
313 hang directly off this 333 hang directly off this
314 header (a leaf) */ 334 header (a leaf)
335 NOTE: The high 8 bits cannot be
336 used - tree_depth is never that big.
337 */
315 __le16 l_count; /* Number of extent records */ 338 __le16 l_count; /* Number of extent records */
316 __le16 l_next_free_rec; /* Next unused extent slot */ 339 __le16 l_next_free_rec; /* Next unused extent slot */
317 __le16 l_reserved1; 340 __le16 l_reserved1;
@@ -446,7 +469,9 @@ struct ocfs2_dinode {
446 __le32 i_ctime_nsec; 469 __le32 i_ctime_nsec;
447 __le32 i_mtime_nsec; 470 __le32 i_mtime_nsec;
448 __le32 i_attr; 471 __le32 i_attr;
449 __le32 i_reserved1; 472 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
473 was set in i_flags */
474 __le16 i_reserved1;
450/*70*/ __le64 i_reserved2[8]; 475/*70*/ __le64 i_reserved2[8];
451/*B8*/ union { 476/*B8*/ union {
452 __le64 i_pad1; /* Generic way to refer to this 477 __le64 i_pad1; /* Generic way to refer to this
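
The OCFS2_IOC32_* pair added above exists because _IOR()/_IOW() encode sizeof(type) into the command number: a 32-bit process passes long as 4 bytes where a 64-bit kernel expects 8, so the compat ioctl layer needs its own constants. A quick userspace demonstration (Linux; the printed values depend on the ABI):

#include <stdio.h>
#include <sys/ioctl.h>

/* Same ('f', 1) command pair, different argument types: the encoded
 * size field differs, so a 32-bit caller and a 64-bit kernel disagree
 * on the number unless a compat entry like OCFS2_IOC32_GETFLAGS exists. */
#define GETFLAGS_LONG _IOR('f', 1, long)
#define GETFLAGS_INT  _IOR('f', 1, int)

int main(void)
{
        printf("_IOR('f', 1, long) = %#lx (sizeof long = %zu)\n",
               (unsigned long)GETFLAGS_LONG, sizeof(long));
        printf("_IOR('f', 1, int)  = %#lx (sizeof int  = %zu)\n",
               (unsigned long)GETFLAGS_INT, sizeof(int));
        return 0;
}

On a 64-bit build the two numbers differ, which is exactly the mismatch the new IOC32 definitions paper over.
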
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4d5d5655c185..4ca02b1c38ac 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -44,6 +44,7 @@ enum ocfs2_lock_type {
44 OCFS2_LOCK_TYPE_RENAME, 44 OCFS2_LOCK_TYPE_RENAME,
45 OCFS2_LOCK_TYPE_RW, 45 OCFS2_LOCK_TYPE_RW,
46 OCFS2_LOCK_TYPE_DENTRY, 46 OCFS2_LOCK_TYPE_DENTRY,
47 OCFS2_LOCK_TYPE_OPEN,
47 OCFS2_NUM_LOCK_TYPES 48 OCFS2_NUM_LOCK_TYPES
48}; 49};
49 50
@@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
69 case OCFS2_LOCK_TYPE_DENTRY: 70 case OCFS2_LOCK_TYPE_DENTRY:
70 c = 'N'; 71 c = 'N';
71 break; 72 break;
73 case OCFS2_LOCK_TYPE_OPEN:
74 c = 'O';
75 break;
72 default: 76 default:
73 c = '\0'; 77 c = '\0';
74 } 78 }
@@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = {
85 * important job it does, anyway. */ 89 * important job it does, anyway. */
86 [OCFS2_LOCK_TYPE_RW] = "Write/Read", 90 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
87 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", 91 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
92 [OCFS2_LOCK_TYPE_OPEN] = "Open",
88}; 93};
89 94
90static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 95static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 2d3ac32cb74e..d921a28329dc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
197 goto bail; 197 goto bail;
198 } 198 }
199 199
200 status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); 200 status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
201 if (status < 0) { 201 if (status < 0) {
202 mlog_errno(status); 202 mlog_errno(status);
203 goto bail; 203 goto bail;
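
Both updated callers (here and in ocfs2_create_symlink_data earlier) now pass &p_blkno, &p_blocks, NULL rather than a requested block count, suggesting the function reports how many contiguous blocks follow the mapping and takes an optional flags out-parameter. The real prototype is not shown in this patch, so the mock below only illustrates the out-parameter convention; the types and the third parameter's meaning are assumptions:

#include <stdint.h>
#include <stdio.h>

/* Mock lookup with the post-patch calling convention: map a virtual
 * block, return the physical block and, optionally, the number of
 * contiguous blocks from that point.  Out-params may be NULL. */
static int get_blocks(uint64_t v_blkno, uint64_t *p_blkno,
                      uint64_t *p_blocks, unsigned int *ext_flags)
{
        /* Pretend virtual blocks 0..15 map to physical 1000..1015. */
        if (v_blkno > 15)
                return -1;
        if (p_blkno)
                *p_blkno = 1000 + v_blkno;
        if (p_blocks)
                *p_blocks = 16 - v_blkno;       /* run length left in extent */
        if (ext_flags)
                *ext_flags = 0;
        return 0;
}

int main(void)
{
        uint64_t blkno, blocks;

        /* Caller that needs the run length, as in the symlink code. */
        if (!get_blocks(0, &blkno, &blocks, NULL))
                printf("phys %llu, %llu contiguous\n",
                       (unsigned long long)blkno, (unsigned long long)blocks);

        /* Caller that only wants the first block, as in slot_map.c. */
        if (!get_blocks(0, &blkno, NULL, NULL))
                printf("phys %llu\n", (unsigned long long)blkno);
        return 0;
}
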
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6dbb11762759..e3437626d183 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
381 le32_to_cpu(fe->i_clusters))); 381 le32_to_cpu(fe->i_clusters)));
382 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 382 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
383 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 383 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
384 alloc_inode->i_blocks = 384 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
385 ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
386 385
387 status = 0; 386 status = 0;
388bail: 387bail:
@@ -850,9 +849,9 @@ static int ocfs2_relink_block_group(handle_t *handle,
850 } 849 }
851 850
852 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", 851 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
853 (unsigned long long)fe->i_blkno, chain, 852 (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
854 (unsigned long long)bg->bg_blkno, 853 (unsigned long long)le64_to_cpu(bg->bg_blkno),
855 (unsigned long long)prev_bg->bg_blkno); 854 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
856 855
857 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); 856 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
858 bg_ptr = le64_to_cpu(bg->bg_next_group); 857 bg_ptr = le64_to_cpu(bg->bg_next_group);
@@ -1163,7 +1162,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1163 } 1162 }
1164 1163
1165 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1164 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1166 tmp_bits, (unsigned long long)bg->bg_blkno); 1165 tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1167 1166
1168 *num_bits = tmp_bits; 1167 *num_bits = tmp_bits;
1169 1168
@@ -1228,7 +1227,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1228 } 1227 }
1229 1228
1230 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, 1229 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1231 (unsigned long long)fe->i_blkno); 1230 (unsigned long long)le64_to_cpu(fe->i_blkno));
1232 1231
1233 *bg_blkno = le64_to_cpu(bg->bg_blkno); 1232 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1234 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1233 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6534f92424dd..7c5e3f5d6634 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -806,9 +806,6 @@ static int __init ocfs2_init(void)
806 806
807 ocfs2_print_version(); 807 ocfs2_print_version();
808 808
809 if (init_ocfs2_extent_maps())
810 return -ENOMEM;
811
812 status = init_ocfs2_uptodate_cache(); 809 status = init_ocfs2_uptodate_cache();
813 if (status < 0) { 810 if (status < 0) {
814 mlog_errno(status); 811 mlog_errno(status);
@@ -837,7 +834,6 @@ leave:
837 if (status < 0) { 834 if (status < 0) {
838 ocfs2_free_mem_caches(); 835 ocfs2_free_mem_caches();
839 exit_ocfs2_uptodate_cache(); 836 exit_ocfs2_uptodate_cache();
840 exit_ocfs2_extent_maps();
841 } 837 }
842 838
843 mlog_exit(status); 839 mlog_exit(status);
@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void)
863 859
864 unregister_filesystem(&ocfs2_fs_type); 860 unregister_filesystem(&ocfs2_fs_type);
865 861
866 exit_ocfs2_extent_maps();
867
868 exit_ocfs2_uptodate_cache(); 862 exit_ocfs2_uptodate_cache();
869 863
870 mlog_exit_void(); 864 mlog_exit_void();
@@ -943,8 +937,7 @@ static void ocfs2_inode_init_once(void *data,
943{ 937{
944 struct ocfs2_inode_info *oi = data; 938 struct ocfs2_inode_info *oi = data;
945 939
946 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 940 if (flags & SLAB_CTOR_CONSTRUCTOR) {
947 SLAB_CTOR_CONSTRUCTOR) {
948 oi->ip_flags = 0; 941 oi->ip_flags = 0;
949 oi->ip_open_count = 0; 942 oi->ip_open_count = 0;
950 spin_lock_init(&oi->ip_lock); 943 spin_lock_init(&oi->ip_lock);
@@ -963,6 +956,7 @@ static void ocfs2_inode_init_once(void *data,
963 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 956 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
964 ocfs2_lock_res_init_once(&oi->ip_meta_lockres); 957 ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
965 ocfs2_lock_res_init_once(&oi->ip_data_lockres); 958 ocfs2_lock_res_init_once(&oi->ip_data_lockres);
959 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
966 960
967 ocfs2_metadata_cache_init(&oi->vfs_inode); 961 ocfs2_metadata_cache_init(&oi->vfs_inode);
968 962
@@ -1543,7 +1537,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1543 } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { 1537 } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
1544 mlog(ML_ERROR, "bad block number on superblock: " 1538 mlog(ML_ERROR, "bad block number on superblock: "
1545 "found %llu, should be %llu\n", 1539 "found %llu, should be %llu\n",
1546 (unsigned long long)di->i_blkno, 1540 (unsigned long long)le64_to_cpu(di->i_blkno),
1547 (unsigned long long)bh->b_blocknr); 1541 (unsigned long long)bh->b_blocknr);
1548 } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || 1542 } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
1549 le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { 1543 le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 40dc1a51f4a9..7134007ba22f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -67,16 +67,9 @@ static char *ocfs2_page_getlink(struct dentry * dentry,
67 page = read_mapping_page(mapping, 0, NULL); 67 page = read_mapping_page(mapping, 0, NULL);
68 if (IS_ERR(page)) 68 if (IS_ERR(page))
69 goto sync_fail; 69 goto sync_fail;
70 wait_on_page_locked(page);
71 if (!PageUptodate(page))
72 goto async_fail;
73 *ppage = page; 70 *ppage = page;
74 return kmap(page); 71 return kmap(page);
75 72
76async_fail:
77 page_cache_release(page);
78 return ERR_PTR(-EIO);
79
80sync_fail: 73sync_fail:
81 return (char*)page; 74 return (char*)page;
82} 75}
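
The wait_on_page_locked()/PageUptodate() dance can go because read_mapping_page() in this kernel returns either an uptodate page or a pointer-encoded error, so IS_ERR() is the only check the caller needs; the same simplification appears in fs/partitions/check.c below. A userspace rendition of the ERR_PTR/IS_ERR idiom used throughout these files — the helpers are reimplemented here for illustration:

#include <errno.h>
#include <stdio.h>

/* Userspace stand-ins for the kernel's pointer-encoded errors: small
 * negative values cast to pointers can never be valid addresses. */
#define MAX_ERRNO 4095
static void *err_ptr(long err) { return (void *)err; }
static int is_err(const void *p)
{
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}
static long ptr_err(const void *p) { return (long)p; }

static char page_data[] = "target-of-symlink";

/* Mimics the post-patch contract: success returns a usable buffer,
 * failure returns an encoded error -- never a half-read page. */
static char *read_page(int fail)
{
        if (fail)
                return err_ptr(-EIO);
        return page_data;
}

int main(void)
{
        char *p = read_page(0);

        if (is_err(p))
                printf("error %ld\n", ptr_err(p));
        else
                printf("link: %s\n", p);

        p = read_page(1);
        if (is_err(p))
                printf("error %ld\n", ptr_err(p));      /* prints -5 */
        return 0;
}
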
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index f30e63b9910c..4f82a2f0efef 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -63,17 +63,10 @@ struct ocfs2_msg_hdr
63 __be32 h_node_num; /* node sending this particular message. */ 63 __be32 h_node_num; /* node sending this particular message. */
64}; 64};
65 65
66/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
67 * for the network. */
68#define OCFS2_VOTE_FILENAME_LEN 256
69struct ocfs2_vote_msg 66struct ocfs2_vote_msg
70{ 67{
71 struct ocfs2_msg_hdr v_hdr; 68 struct ocfs2_msg_hdr v_hdr;
72 union { 69 __be32 v_reserved1;
73 __be32 v_generic1;
74 __be32 v_orphaned_slot; /* Used during delete votes */
75 __be32 v_nlink; /* Used during unlink votes */
76 } md1; /* Message type dependant 1 */
77}; 70};
78 71
79/* Responses are given these values to maintain backwards 72/* Responses are given these values to maintain backwards
@@ -86,7 +79,6 @@ struct ocfs2_response_msg
86{ 79{
87 struct ocfs2_msg_hdr r_hdr; 80 struct ocfs2_msg_hdr r_hdr;
88 __be32 r_response; 81 __be32 r_response;
89 __be32 r_orphaned_slot;
90}; 82};
91 83
92struct ocfs2_vote_work { 84struct ocfs2_vote_work {
@@ -96,7 +88,6 @@ struct ocfs2_vote_work {
96 88
97enum ocfs2_vote_request { 89enum ocfs2_vote_request {
98 OCFS2_VOTE_REQ_INVALID = 0, 90 OCFS2_VOTE_REQ_INVALID = 0,
99 OCFS2_VOTE_REQ_DELETE,
100 OCFS2_VOTE_REQ_MOUNT, 91 OCFS2_VOTE_REQ_MOUNT,
101 OCFS2_VOTE_REQ_UMOUNT, 92 OCFS2_VOTE_REQ_UMOUNT,
102 OCFS2_VOTE_REQ_LAST 93 OCFS2_VOTE_REQ_LAST
@@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb,
151 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); 142 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
152} 143}
153 144
154void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
155{
156 struct ocfs2_inode_info *oi = OCFS2_I(inode);
157
158 assert_spin_locked(&oi->ip_lock);
159 /* We set the SKIP_DELETE flag on the inode so we don't try to
160 * delete it in delete_inode ourselves, thus avoiding
161 * unecessary lock pinging. If the other node failed to wipe
162 * the inode as a result of a crash, then recovery will pick
163 * up the slack. */
164 oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
165}
166
167static int ocfs2_process_delete_request(struct inode *inode,
168 int *orphaned_slot)
169{
170 int response = OCFS2_RESPONSE_BUSY;
171
172 mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
173 inode->i_ino, inode->i_nlink, *orphaned_slot);
174
175 spin_lock(&OCFS2_I(inode)->ip_lock);
176
177 /* Whatever our vote response is, we want to make sure that
178 * the orphaned slot is recorded properly on this node *and*
179 * on the requesting node. Technically, if the requesting node
180 * did not know which slot the inode is orphaned in but we
181 * respond with BUSY he doesn't actually need the orphaned
182 * slot, but it doesn't hurt to do it here anyway. */
183 if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
184 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
185 OCFS2_INVALID_SLOT &&
186 OCFS2_I(inode)->ip_orphaned_slot !=
187 (*orphaned_slot),
188 "Inode %llu: This node thinks it's "
189 "orphaned in slot %d, messaged it's in %d\n",
190 (unsigned long long)OCFS2_I(inode)->ip_blkno,
191 OCFS2_I(inode)->ip_orphaned_slot,
192 *orphaned_slot);
193
194 mlog(0, "Setting orphaned slot for inode %llu to %d\n",
195 (unsigned long long)OCFS2_I(inode)->ip_blkno,
196 *orphaned_slot);
197
198 OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
199 } else {
200 mlog(0, "Sending back orphaned slot %d for inode %llu\n",
201 OCFS2_I(inode)->ip_orphaned_slot,
202 (unsigned long long)OCFS2_I(inode)->ip_blkno);
203
204 *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
205 }
206
207 /* vote no if the file is still open. */
208 if (OCFS2_I(inode)->ip_open_count) {
209 mlog(0, "open count = %u\n",
210 OCFS2_I(inode)->ip_open_count);
211 spin_unlock(&OCFS2_I(inode)->ip_lock);
212 goto done;
213 }
214 spin_unlock(&OCFS2_I(inode)->ip_lock);
215
216 /* directories are a bit ugly... What if someone is sitting in
217 * it? We want to make sure the inode is removed completely as
218 * a result of the iput in process_vote. */
219 if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
220 mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
221 goto done;
222 }
223
224 if (filemap_fdatawrite(inode->i_mapping)) {
225 mlog(ML_ERROR, "Could not sync inode %llu for delete!\n",
226 (unsigned long long)OCFS2_I(inode)->ip_blkno);
227 goto done;
228 }
229 sync_mapping_buffers(inode->i_mapping);
230 truncate_inode_pages(inode->i_mapping, 0);
231 ocfs2_extent_map_trunc(inode, 0);
232
233 spin_lock(&OCFS2_I(inode)->ip_lock);
234 /* double check open count - someone might have raced this
235 * thread into ocfs2_file_open while we were writing out
236 * data. If we're to allow a wipe of this inode now, we *must*
237 * hold the spinlock until we've marked it. */
238 if (OCFS2_I(inode)->ip_open_count) {
239 mlog(0, "Raced to wipe! open count = %u\n",
240 OCFS2_I(inode)->ip_open_count);
241 spin_unlock(&OCFS2_I(inode)->ip_lock);
242 goto done;
243 }
244
245 /* Mark the inode as being wiped from disk. */
246 ocfs2_mark_inode_remotely_deleted(inode);
247 spin_unlock(&OCFS2_I(inode)->ip_lock);
248
249 /* Not sure this is necessary anymore. */
250 d_prune_aliases(inode);
251
252 /* If we get here, then we're voting 'yes', so commit the
253 * delete on our side. */
254 response = OCFS2_RESPONSE_OK;
255done:
256 return response;
257}
258
259static void ocfs2_process_vote(struct ocfs2_super *osb, 145static void ocfs2_process_vote(struct ocfs2_super *osb,
260 struct ocfs2_vote_msg *msg) 146 struct ocfs2_vote_msg *msg)
261{ 147{
262 int net_status, vote_response; 148 int net_status, vote_response;
263 int orphaned_slot = 0; 149 unsigned int node_num;
264 unsigned int node_num, generation;
265 u64 blkno; 150 u64 blkno;
266 enum ocfs2_vote_request request; 151 enum ocfs2_vote_request request;
267 struct inode *inode = NULL;
268 struct ocfs2_msg_hdr *hdr = &msg->v_hdr; 152 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
269 struct ocfs2_response_msg response; 153 struct ocfs2_response_msg response;
270 154
271 /* decode the network mumbo jumbo into local variables. */ 155 /* decode the network mumbo jumbo into local variables. */
272 request = be32_to_cpu(hdr->h_request); 156 request = be32_to_cpu(hdr->h_request);
273 blkno = be64_to_cpu(hdr->h_blkno); 157 blkno = be64_to_cpu(hdr->h_blkno);
274 generation = be32_to_cpu(hdr->h_generation);
275 node_num = be32_to_cpu(hdr->h_node_num); 158 node_num = be32_to_cpu(hdr->h_node_num);
276 if (request == OCFS2_VOTE_REQ_DELETE)
277 orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
278 159
279 mlog(0, "processing vote: request = %u, blkno = %llu, " 160 mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
280 "generation = %u, node_num = %u, priv1 = %u\n", request, 161 request, (unsigned long long)blkno, node_num);
281 (unsigned long long)blkno, generation, node_num,
282 be32_to_cpu(msg->md1.v_generic1));
283 162
284 if (!ocfs2_is_valid_vote_request(request)) { 163 if (!ocfs2_is_valid_vote_request(request)) {
285 mlog(ML_ERROR, "Invalid vote request %d from node %u\n", 164 mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
@@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb,
302 break; 181 break;
303 } 182 }
304 183
305 /* We cannot process the remaining message types before we're
306 * fully mounted. It's perfectly safe however to send a 'yes'
307 * response as we can't possibly have any of the state they're
308 * asking us to modify yet. */
309 if (atomic_read(&osb->vol_state) == VOLUME_INIT)
310 goto respond;
311
312 /* If we get here, then the request is against an inode. */
313 inode = ocfs2_ilookup_for_vote(osb, blkno,
314 request == OCFS2_VOTE_REQ_DELETE);
315
316 /* Not finding the inode is perfectly valid - it means we're
317 * not interested in what the other node is about to do to it
318 * so in those cases we automatically respond with an
319 * affirmative. Cluster locking ensures that we won't race
320 * interest in the inode with this vote request. */
321 if (!inode)
322 goto respond;
323
324 /* Check generation values. It's possible for us to get a
325 * request against a stale inode. If so then we proceed as if
326 * we had not found an inode in the first place. */
327 if (inode->i_generation != generation) {
328 mlog(0, "generation passed %u != inode generation = %u, "
329 "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, "
330 "message type = %u\n", generation, inode->i_generation,
331 OCFS2_I(inode)->ip_flags,
332 (unsigned long long)OCFS2_I(inode)->ip_blkno,
333 (unsigned long long)blkno, atomic_read(&inode->i_count),
334 request);
335 iput(inode);
336 inode = NULL;
337 goto respond;
338 }
339
340 switch (request) {
341 case OCFS2_VOTE_REQ_DELETE:
342 vote_response = ocfs2_process_delete_request(inode,
343 &orphaned_slot);
344 break;
345 default:
346 mlog(ML_ERROR, "node %u, invalid request: %u\n",
347 node_num, request);
348 vote_response = OCFS2_RESPONSE_BAD_MSG;
349 }
350
351respond: 184respond:
352 /* Response struture is small so we just put it on the stack 185 /* Response struture is small so we just put it on the stack
353 * and stuff it inline. */ 186 * and stuff it inline. */
@@ -357,7 +190,6 @@ respond:
357 response.r_hdr.h_generation = hdr->h_generation; 190 response.r_hdr.h_generation = hdr->h_generation;
358 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); 191 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
359 response.r_response = cpu_to_be32(vote_response); 192 response.r_response = cpu_to_be32(vote_response);
360 response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
361 193
362 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, 194 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
363 osb->net_key, 195 osb->net_key,
@@ -373,9 +205,6 @@ respond:
373 && net_status != -ENOTCONN) 205 && net_status != -ENOTCONN)
374 mlog(ML_ERROR, "message to node %u fails with error %d!\n", 206 mlog(ML_ERROR, "message to node %u fails with error %d!\n",
375 node_num, net_status); 207 node_num, net_status);
376
377 if (inode)
378 iput(inode);
379} 208}
380 209
381static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) 210static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
@@ -634,8 +463,7 @@ bail:
634static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, 463static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
635 u64 blkno, 464 u64 blkno,
636 unsigned int generation, 465 unsigned int generation,
637 enum ocfs2_vote_request type, 466 enum ocfs2_vote_request type)
638 u32 priv)
639{ 467{
640 struct ocfs2_vote_msg *request; 468 struct ocfs2_vote_msg *request;
641 struct ocfs2_msg_hdr *hdr; 469 struct ocfs2_msg_hdr *hdr;
@@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
651 hdr->h_request = cpu_to_be32(type); 479 hdr->h_request = cpu_to_be32(type);
652 hdr->h_blkno = cpu_to_be64(blkno); 480 hdr->h_blkno = cpu_to_be64(blkno);
653 hdr->h_generation = cpu_to_be32(generation); 481 hdr->h_generation = cpu_to_be32(generation);
654
655 request->md1.v_generic1 = cpu_to_be32(priv);
656 } 482 }
657 483
658 return request; 484 return request;
@@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb,
664 struct ocfs2_vote_msg *request, 490 struct ocfs2_vote_msg *request,
665 struct ocfs2_net_response_cb *callback) 491 struct ocfs2_net_response_cb *callback)
666{ 492{
667 int status, response; 493 int status, response = -EBUSY;
668 unsigned int response_id; 494 unsigned int response_id;
669 struct ocfs2_msg_hdr *hdr; 495 struct ocfs2_msg_hdr *hdr;
670 496
@@ -686,109 +512,12 @@ bail:
686 return status; 512 return status;
687} 513}
688 514
689static int ocfs2_request_vote(struct inode *inode,
690 struct ocfs2_vote_msg *request,
691 struct ocfs2_net_response_cb *callback)
692{
693 int status;
694 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
695
696 if (ocfs2_inode_is_new(inode))
697 return 0;
698
699 status = -EAGAIN;
700 while (status == -EAGAIN) {
701 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
702 signal_pending(current))
703 return -ERESTARTSYS;
704
705 status = ocfs2_super_lock(osb, 0);
706 if (status < 0) {
707 mlog_errno(status);
708 break;
709 }
710
711 status = 0;
712 if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
713 osb->node_num))
714 status = ocfs2_do_request_vote(osb, request, callback);
715
716 ocfs2_super_unlock(osb, 0);
717 }
718 return status;
719}
720
721static void ocfs2_delete_response_cb(void *priv,
722 struct ocfs2_response_msg *resp)
723{
724 int orphaned_slot, node;
725 struct inode *inode = priv;
726
727 orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
728 node = be32_to_cpu(resp->r_hdr.h_node_num);
729 mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n",
730 node, (unsigned long long)OCFS2_I(inode)->ip_blkno,
731 orphaned_slot);
732
733 /* The other node may not actually know which slot the inode
734 * is orphaned in. */
735 if (orphaned_slot == OCFS2_INVALID_SLOT)
736 return;
737
738 /* Ok, the responding node knows which slot this inode is
739 * orphaned in. We verify that the information is correct and
740 * then record this in the inode. ocfs2_delete_inode will use
741 * this information to determine which lock to take. */
742 spin_lock(&OCFS2_I(inode)->ip_lock);
743 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
744 OCFS2_I(inode)->ip_orphaned_slot
745 != OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's "
746 "orphaned in slot %d, we think it's in %d\n",
747 (unsigned long long)OCFS2_I(inode)->ip_blkno,
748 be32_to_cpu(resp->r_hdr.h_node_num),
749 orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
750
751 OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
752 spin_unlock(&OCFS2_I(inode)->ip_lock);
753}
754
755int ocfs2_request_delete_vote(struct inode *inode)
756{
757 int orphaned_slot, status;
758 struct ocfs2_net_response_cb delete_cb;
759 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
760 struct ocfs2_vote_msg *request;
761
762 spin_lock(&OCFS2_I(inode)->ip_lock);
763 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
764 spin_unlock(&OCFS2_I(inode)->ip_lock);
765
766 delete_cb.rc_cb = ocfs2_delete_response_cb;
767 delete_cb.rc_priv = inode;
768
769 mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n",
770 (unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot);
771
772 status = -ENOMEM;
773 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
774 inode->i_generation,
775 OCFS2_VOTE_REQ_DELETE, orphaned_slot);
776 if (request) {
777 status = ocfs2_request_vote(inode, request, &delete_cb);
778
779 kfree(request);
780 }
781
782 return status;
783}
784
785int ocfs2_request_mount_vote(struct ocfs2_super *osb) 515int ocfs2_request_mount_vote(struct ocfs2_super *osb)
786{ 516{
787 int status; 517 int status;
788 struct ocfs2_vote_msg *request = NULL; 518 struct ocfs2_vote_msg *request = NULL;
789 519
790 request = ocfs2_new_vote_request(osb, 0ULL, 0, 520 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
791 OCFS2_VOTE_REQ_MOUNT, 0);
792 if (!request) { 521 if (!request) {
793 status = -ENOMEM; 522 status = -ENOMEM;
794 goto bail; 523 goto bail;
@@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
821 int status; 550 int status;
822 struct ocfs2_vote_msg *request = NULL; 551 struct ocfs2_vote_msg *request = NULL;
823 552
824 request = ocfs2_new_vote_request(osb, 0ULL, 0, 553 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
825 OCFS2_VOTE_REQ_UMOUNT, 0);
826 if (!request) { 554 if (!request) {
827 status = -ENOMEM; 555 status = -ENOMEM;
828 goto bail; 556 goto bail;
@@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg,
969 be32_to_cpu(work->w_msg.v_hdr.h_generation)); 697 be32_to_cpu(work->w_msg.v_hdr.h_generation));
970 mlog(0, "h_node_num = %u\n", 698 mlog(0, "h_node_num = %u\n",
971 be32_to_cpu(work->w_msg.v_hdr.h_node_num)); 699 be32_to_cpu(work->w_msg.v_hdr.h_node_num));
972 mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
973 700
974 spin_lock(&osb->vote_task_lock); 701 spin_lock(&osb->vote_task_lock);
975 list_add_tail(&work->w_list, &osb->vote_list); 702 list_add_tail(&work->w_list, &osb->vote_list);
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
index 53ebc1c69e56..9ea46f62de31 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/vote.h
@@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
38 wake_up(&osb->vote_event); 38 wake_up(&osb->vote_event);
39} 39}
40 40
41int ocfs2_request_delete_vote(struct inode *inode);
42int ocfs2_request_mount_vote(struct ocfs2_super *osb); 41int ocfs2_request_mount_vote(struct ocfs2_super *osb);
43int ocfs2_request_umount_vote(struct ocfs2_super *osb); 42int ocfs2_request_umount_vote(struct ocfs2_super *osb);
44int ocfs2_register_net_handlers(struct ocfs2_super *osb); 43int ocfs2_register_net_handlers(struct ocfs2_super *osb);
45void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); 44void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
46 45
47void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
48
49void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, 46void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
50 int node_num); 47 int node_num);
51#endif 48#endif
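
What survives of the vote protocol still keeps every header field in big-endian (__be32/__be64) form, encoded with cpu_to_be*() in ocfs2_new_vote_request() and decoded with be*_to_cpu() in ocfs2_process_vote(). A userspace equivalent using glibc's <endian.h> — htobe32()/be32toh() and friends are an assumption about the build environment:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

/* Host-side mirror of the trimmed-down header: all fields travel
 * big-endian, like the __be32/__be64 members of ocfs2_msg_hdr. */
struct vote_hdr {
        uint32_t h_response_id;
        uint32_t h_request;
        uint64_t h_blkno;
        uint32_t h_generation;
        uint32_t h_node_num;
};

int main(void)
{
        struct vote_hdr wire, decoded;

        /* Encode, as ocfs2_new_vote_request does with cpu_to_be32/64. */
        wire.h_response_id = htobe32(7);
        wire.h_request = htobe32(1);    /* mount vote in the new enum */
        wire.h_blkno = htobe64(0);
        wire.h_generation = htobe32(0);
        wire.h_node_num = htobe32(3);

        /* ...the struct would cross the wire via o2net here... */

        /* Decode, as ocfs2_process_vote does with be32_to_cpu. */
        decoded.h_request = be32toh(wire.h_request);
        decoded.h_blkno = be64toh(wire.h_blkno);
        decoded.h_node_num = be32toh(wire.h_node_num);

        printf("request %u, blkno %llu, from node %u\n",
               decoded.h_request,
               (unsigned long long)decoded.h_blkno, decoded.h_node_num);
        return 0;
}
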
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index bde1c164417d..731a90e9f0cd 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -419,8 +419,7 @@ static void op_inode_init_once(void *data, struct kmem_cache * cachep, unsigned
419{ 419{
420 struct op_inode_info *oi = (struct op_inode_info *) data; 420 struct op_inode_info *oi = (struct op_inode_info *) data;
421 421
422 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 422 if (flags & SLAB_CTOR_CONSTRUCTOR)
423 SLAB_CTOR_CONSTRUCTOR)
424 inode_init_once(&oi->vfs_inode); 423 inode_init_once(&oi->vfs_inode);
425} 424}
426 425
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index 1bc9f372c7d4..e3491328596b 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -271,7 +271,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
271 extern void xd_set_geometry(struct block_device *, 271 extern void xd_set_geometry(struct block_device *,
272 unsigned char, unsigned char, unsigned int); 272 unsigned char, unsigned char, unsigned int);
273 xd_set_geometry(bdev, dr->secspertrack, heads, 1); 273 xd_set_geometry(bdev, dr->secspertrack, heads, 1);
274 invalidate_bdev(bdev, 1); 274 invalidate_bh_lrus();
275 truncate_inode_pages(bdev->bd_inode->i_mapping, 0); 275 truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
276 } 276 }
277#endif 277#endif
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 8a7d0035ad7a..6b9dae3f0e6c 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -312,7 +312,7 @@ static struct attribute * default_attrs[] = {
312 NULL, 312 NULL,
313}; 313};
314 314
315extern struct subsystem block_subsys; 315extern struct kset block_subsys;
316 316
317static void part_release(struct kobject *kobj) 317static void part_release(struct kobject *kobj)
318{ 318{
@@ -388,7 +388,7 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
388 kobject_add(&p->kobj); 388 kobject_add(&p->kobj);
389 if (!disk->part_uevent_suppress) 389 if (!disk->part_uevent_suppress)
390 kobject_uevent(&p->kobj, KOBJ_ADD); 390 kobject_uevent(&p->kobj, KOBJ_ADD);
391 sysfs_create_link(&p->kobj, &block_subsys.kset.kobj, "subsystem"); 391 sysfs_create_link(&p->kobj, &block_subsys.kobj, "subsystem");
392 if (flags & ADDPART_FLAG_WHOLEDISK) { 392 if (flags & ADDPART_FLAG_WHOLEDISK) {
393 static struct attribute addpartattr = { 393 static struct attribute addpartattr = {
394 .name = "whole_disk", 394 .name = "whole_disk",
@@ -444,7 +444,7 @@ static int disk_sysfs_symlinks(struct gendisk *disk)
444 goto err_out_dev_link; 444 goto err_out_dev_link;
445 } 445 }
446 446
447 err = sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj, 447 err = sysfs_create_link(&disk->kobj, &block_subsys.kobj,
448 "subsystem"); 448 "subsystem");
449 if (err) 449 if (err)
450 goto err_out_disk_name_lnk; 450 goto err_out_disk_name_lnk;
@@ -569,9 +569,6 @@ unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
569 page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), 569 page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
570 NULL); 570 NULL);
571 if (!IS_ERR(page)) { 571 if (!IS_ERR(page)) {
572 wait_on_page_locked(page);
573 if (!PageUptodate(page))
574 goto fail;
575 if (PageError(page)) 572 if (PageError(page))
576 goto fail; 573 goto fail;
577 p->v = page; 574 p->v = page;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 989af5e55d1b..ec158dd02b3a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -715,6 +715,40 @@ static const struct file_operations proc_oom_adjust_operations = {
715 .write = oom_adjust_write, 715 .write = oom_adjust_write,
716}; 716};
717 717
718static ssize_t clear_refs_write(struct file *file, const char __user *buf,
719 size_t count, loff_t *ppos)
720{
721 struct task_struct *task;
722 char buffer[PROC_NUMBUF], *end;
723 struct mm_struct *mm;
724
725 memset(buffer, 0, sizeof(buffer));
726 if (count > sizeof(buffer) - 1)
727 count = sizeof(buffer) - 1;
728 if (copy_from_user(buffer, buf, count))
729 return -EFAULT;
730 if (!simple_strtol(buffer, &end, 0))
731 return -EINVAL;
732 if (*end == '\n')
733 end++;
734 task = get_proc_task(file->f_path.dentry->d_inode);
735 if (!task)
736 return -ESRCH;
737 mm = get_task_mm(task);
738 if (mm) {
739 clear_refs_smap(mm);
740 mmput(mm);
741 }
742 put_task_struct(task);
743 if (end - buffer == 0)
744 return -EIO;
745 return end - buffer;
746}
747
748static struct file_operations proc_clear_refs_operations = {
749 .write = clear_refs_write,
750};
751
718#ifdef CONFIG_AUDITSYSCALL 752#ifdef CONFIG_AUDITSYSCALL
719#define TMPBUFLEN 21 753#define TMPBUFLEN 21
720static ssize_t proc_loginuid_read(struct file * file, char __user * buf, 754static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
@@ -1851,6 +1885,7 @@ static struct pid_entry tgid_base_stuff[] = {
1851 REG("mounts", S_IRUGO, mounts), 1885 REG("mounts", S_IRUGO, mounts),
1852 REG("mountstats", S_IRUSR, mountstats), 1886 REG("mountstats", S_IRUSR, mountstats),
1853#ifdef CONFIG_MMU 1887#ifdef CONFIG_MMU
1888 REG("clear_refs", S_IWUSR, clear_refs),
1854 REG("smaps", S_IRUGO, smaps), 1889 REG("smaps", S_IRUGO, smaps),
1855#endif 1890#endif
1856#ifdef CONFIG_SECURITY 1891#ifdef CONFIG_SECURITY
@@ -2132,6 +2167,7 @@ static struct pid_entry tid_base_stuff[] = {
2132 LNK("exe", exe), 2167 LNK("exe", exe),
2133 REG("mounts", S_IRUGO, mounts), 2168 REG("mounts", S_IRUGO, mounts),
2134#ifdef CONFIG_MMU 2169#ifdef CONFIG_MMU
2170 REG("clear_refs", S_IWUSR, clear_refs),
2135 REG("smaps", S_IRUGO, smaps), 2171 REG("smaps", S_IRUGO, smaps),
2136#endif 2172#endif
2137#ifdef CONFIG_SECURITY 2173#ifdef CONFIG_SECURITY
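
The new write-only /proc/<pid>/clear_refs file accepts a non-zero number and clears the per-page accessed bits, so a monitor can zero the Referenced counters, let the workload run, and re-read smaps to estimate a working set. A minimal user of the interface — this assumes a kernel carrying this patch:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Reset the referenced bits for our own address space. */
        int fd = open("/proc/self/clear_refs", O_WRONLY);

        if (fd < 0) {
                perror("open"); /* kernel without clear_refs support */
                return 1;
        }
        if (write(fd, "1", 1) != 1)
                perror("write");
        close(fd);

        /* After the workload has run for an interval, the Referenced:
         * lines in /proc/self/smaps approximate the pages touched
         * since the reset. */
        puts("cleared; re-read /proc/self/smaps after the interval");
        return 0;
}
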
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index c372eb151a3a..22b1158389ae 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -109,8 +109,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
109{ 109{
110 struct proc_inode *ei = (struct proc_inode *) foo; 110 struct proc_inode *ei = (struct proc_inode *) foo;
111 111
112 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 112 if (flags & SLAB_CTOR_CONSTRUCTOR)
113 SLAB_CTOR_CONSTRUCTOR)
114 inode_init_once(&ei->vfs_inode); 113 inode_init_once(&ei->vfs_inode);
115} 114}
116 115
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index abdf068bc27f..eca471bc8512 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -38,7 +38,7 @@ static int property_read_proc(char *page, char **start, off_t off,
38 n = count; 38 n = count;
39 else 39 else
40 *eof = 1; 40 *eof = 1;
41 memcpy(page, pp->value + off, n); 41 memcpy(page, (char *)pp->value + off, n);
42 *start = page; 42 *start = page;
43 return n; 43 return n;
44} 44}
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index e2c4c0a5c90d..75ec6523d29a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -398,8 +398,6 @@ static const struct file_operations proc_modules_operations = {
398#endif 398#endif
399 399
400#ifdef CONFIG_SLAB 400#ifdef CONFIG_SLAB
401extern struct seq_operations slabinfo_op;
402extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *);
403static int slabinfo_open(struct inode *inode, struct file *file) 401static int slabinfo_open(struct inode *inode, struct file *file)
404{ 402{
405 return seq_open(file, &slabinfo_op); 403 return seq_open(file, &slabinfo_op);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7445980c8022..4008c060f7ef 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -120,6 +120,14 @@ struct mem_size_stats
120 unsigned long shared_dirty; 120 unsigned long shared_dirty;
121 unsigned long private_clean; 121 unsigned long private_clean;
122 unsigned long private_dirty; 122 unsigned long private_dirty;
123 unsigned long referenced;
124};
125
126struct pmd_walker {
127 struct vm_area_struct *vma;
128 void *private;
129 void (*action)(struct vm_area_struct *, pmd_t *, unsigned long,
130 unsigned long, void *);
123}; 131};
124 132
125static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) 133static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
@@ -181,18 +189,20 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
181 189
182 if (mss) 190 if (mss)
183 seq_printf(m, 191 seq_printf(m,
184 "Size: %8lu kB\n" 192 "Size: %8lu kB\n"
185 "Rss: %8lu kB\n" 193 "Rss: %8lu kB\n"
186 "Shared_Clean: %8lu kB\n" 194 "Shared_Clean: %8lu kB\n"
187 "Shared_Dirty: %8lu kB\n" 195 "Shared_Dirty: %8lu kB\n"
188 "Private_Clean: %8lu kB\n" 196 "Private_Clean: %8lu kB\n"
189 "Private_Dirty: %8lu kB\n", 197 "Private_Dirty: %8lu kB\n"
198 "Referenced: %8lu kB\n",
190 (vma->vm_end - vma->vm_start) >> 10, 199 (vma->vm_end - vma->vm_start) >> 10,
191 mss->resident >> 10, 200 mss->resident >> 10,
192 mss->shared_clean >> 10, 201 mss->shared_clean >> 10,
193 mss->shared_dirty >> 10, 202 mss->shared_dirty >> 10,
194 mss->private_clean >> 10, 203 mss->private_clean >> 10,
195 mss->private_dirty >> 10); 204 mss->private_dirty >> 10,
205 mss->referenced >> 10);
196 206
197 if (m->count < m->size) /* vma is copied successfully */ 207 if (m->count < m->size) /* vma is copied successfully */
198 m->version = (vma != get_gate_vma(task))? vma->vm_start: 0; 208 m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
@@ -205,15 +215,16 @@ static int show_map(struct seq_file *m, void *v)
205} 215}
206 216
207static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 217static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
208 unsigned long addr, unsigned long end, 218 unsigned long addr, unsigned long end,
209 struct mem_size_stats *mss) 219 void *private)
210{ 220{
221 struct mem_size_stats *mss = private;
211 pte_t *pte, ptent; 222 pte_t *pte, ptent;
212 spinlock_t *ptl; 223 spinlock_t *ptl;
213 struct page *page; 224 struct page *page;
214 225
215 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 226 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
216 do { 227 for (; addr != end; pte++, addr += PAGE_SIZE) {
217 ptent = *pte; 228 ptent = *pte;
218 if (!pte_present(ptent)) 229 if (!pte_present(ptent))
219 continue; 230 continue;
@@ -224,6 +235,9 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
224 if (!page) 235 if (!page)
225 continue; 236 continue;
226 237
238 /* Accumulate the size in pages that have been accessed. */
239 if (pte_young(ptent) || PageReferenced(page))
240 mss->referenced += PAGE_SIZE;
227 if (page_mapcount(page) >= 2) { 241 if (page_mapcount(page) >= 2) {
228 if (pte_dirty(ptent)) 242 if (pte_dirty(ptent))
229 mss->shared_dirty += PAGE_SIZE; 243 mss->shared_dirty += PAGE_SIZE;
@@ -235,57 +249,99 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
235 else 249 else
236 mss->private_clean += PAGE_SIZE; 250 mss->private_clean += PAGE_SIZE;
237 } 251 }
238 } while (pte++, addr += PAGE_SIZE, addr != end); 252 }
239 pte_unmap_unlock(pte - 1, ptl); 253 pte_unmap_unlock(pte - 1, ptl);
240 cond_resched(); 254 cond_resched();
241} 255}
242 256
243static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud, 257static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
244 unsigned long addr, unsigned long end, 258 unsigned long addr, unsigned long end,
245 struct mem_size_stats *mss) 259 void *private)
260{
261 pte_t *pte, ptent;
262 spinlock_t *ptl;
263 struct page *page;
264
265 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
266 for (; addr != end; pte++, addr += PAGE_SIZE) {
267 ptent = *pte;
268 if (!pte_present(ptent))
269 continue;
270
271 page = vm_normal_page(vma, addr, ptent);
272 if (!page)
273 continue;
274
275 /* Clear accessed and referenced bits. */
276 ptep_test_and_clear_young(vma, addr, pte);
277 ClearPageReferenced(page);
278 }
279 pte_unmap_unlock(pte - 1, ptl);
280 cond_resched();
281}
282
283static inline void walk_pmd_range(struct pmd_walker *walker, pud_t *pud,
284 unsigned long addr, unsigned long end)
246{ 285{
247 pmd_t *pmd; 286 pmd_t *pmd;
248 unsigned long next; 287 unsigned long next;
249 288
250 pmd = pmd_offset(pud, addr); 289 for (pmd = pmd_offset(pud, addr); addr != end;
251 do { 290 pmd++, addr = next) {
252 next = pmd_addr_end(addr, end); 291 next = pmd_addr_end(addr, end);
253 if (pmd_none_or_clear_bad(pmd)) 292 if (pmd_none_or_clear_bad(pmd))
254 continue; 293 continue;
255 smaps_pte_range(vma, pmd, addr, next, mss); 294 walker->action(walker->vma, pmd, addr, next, walker->private);
256 } while (pmd++, addr = next, addr != end); 295 }
257} 296}
258 297
259static inline void smaps_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 298static inline void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd,
260 unsigned long addr, unsigned long end, 299 unsigned long addr, unsigned long end)
261 struct mem_size_stats *mss)
262{ 300{
263 pud_t *pud; 301 pud_t *pud;
264 unsigned long next; 302 unsigned long next;
265 303
266 pud = pud_offset(pgd, addr); 304 for (pud = pud_offset(pgd, addr); addr != end;
267 do { 305 pud++, addr = next) {
268 next = pud_addr_end(addr, end); 306 next = pud_addr_end(addr, end);
269 if (pud_none_or_clear_bad(pud)) 307 if (pud_none_or_clear_bad(pud))
270 continue; 308 continue;
271 smaps_pmd_range(vma, pud, addr, next, mss); 309 walk_pmd_range(walker, pud, addr, next);
272 } while (pud++, addr = next, addr != end); 310 }
273} 311}
274 312
275static inline void smaps_pgd_range(struct vm_area_struct *vma, 313/*
276 unsigned long addr, unsigned long end, 314 * walk_page_range - walk the page tables of a VMA with a callback
277 struct mem_size_stats *mss) 315 * @vma - VMA to walk
316 * @action - callback invoked for every bottom-level (PTE) page table
317 * @private - private data passed to the callback function
318 *
319 * Recursively walk the page table for the memory area in a VMA, calling
320 * a callback for every bottom-level (PTE) page table.
321 */
322static inline void walk_page_range(struct vm_area_struct *vma,
323 void (*action)(struct vm_area_struct *,
324 pmd_t *, unsigned long,
325 unsigned long, void *),
326 void *private)
278{ 327{
328 unsigned long addr = vma->vm_start;
329 unsigned long end = vma->vm_end;
330 struct pmd_walker walker = {
331 .vma = vma,
332 .private = private,
333 .action = action,
334 };
279 pgd_t *pgd; 335 pgd_t *pgd;
280 unsigned long next; 336 unsigned long next;
281 337
282 pgd = pgd_offset(vma->vm_mm, addr); 338 for (pgd = pgd_offset(vma->vm_mm, addr); addr != end;
283 do { 339 pgd++, addr = next) {
284 next = pgd_addr_end(addr, end); 340 next = pgd_addr_end(addr, end);
285 if (pgd_none_or_clear_bad(pgd)) 341 if (pgd_none_or_clear_bad(pgd))
286 continue; 342 continue;
287 smaps_pud_range(vma, pgd, addr, next, mss); 343 walk_pud_range(&walker, pgd, addr, next);
288 } while (pgd++, addr = next, addr != end); 344 }
289} 345}
290 346
291static int show_smap(struct seq_file *m, void *v) 347static int show_smap(struct seq_file *m, void *v)
@@ -295,10 +351,22 @@ static int show_smap(struct seq_file *m, void *v)
295 351
296 memset(&mss, 0, sizeof mss); 352 memset(&mss, 0, sizeof mss);
297 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 353 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
298 smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss); 354 walk_page_range(vma, smaps_pte_range, &mss);
299 return show_map_internal(m, v, &mss); 355 return show_map_internal(m, v, &mss);
300} 356}
301 357
358void clear_refs_smap(struct mm_struct *mm)
359{
360 struct vm_area_struct *vma;
361
362 down_read(&mm->mmap_sem);
363 for (vma = mm->mmap; vma; vma = vma->vm_next)
364 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
365 walk_page_range(vma, clear_refs_pte_range, NULL);
366 flush_tlb_mm(mm);
367 up_read(&mm->mmap_sem);
368}
369
302static void *m_start(struct seq_file *m, loff_t *pos) 370static void *m_start(struct seq_file *m, loff_t *pos)
303{ 371{
304 struct proc_maps_private *priv = m->private; 372 struct proc_maps_private *priv = m->private;
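
The refactor collapses the smaps-specific pgd/pud/pmd walkers into a single walk_page_range() that drives a per-PTE-table callback through struct pmd_walker, which is what lets clear_refs_pte_range() reuse the same traversal. The shape of that callback pattern, reduced to a runnable userspace sketch where a flat range stands in for the page-table levels:

#include <stdio.h>

/* Parallel to struct pmd_walker: the traversal is generic, the
 * per-leaf action and its private data are supplied by the caller. */
struct walker {
        void (*action)(int leaf, void *private);
        void *private;
};

/* Stand-in for the pgd/pud/pmd descent: visit every leaf once. */
static void walk_range(int start, int end, struct walker *w)
{
        int i;

        for (i = start; i != end; i++)
                w->action(i, w->private);
}

/* One callback accumulates, like smaps_pte_range filling mem_size_stats... */
static void count_action(int leaf, void *private)
{
        *(int *)private += leaf;
}

/* ...another mutates state, like clear_refs_pte_range. */
static void clear_action(int leaf, void *private)
{
        (void)private;
        printf("clearing %d\n", leaf);
}

int main(void)
{
        int total = 0;
        struct walker w = { .action = count_action, .private = &total };

        walk_range(0, 4, &w);
        printf("total %d\n", total);

        w.action = clear_action;
        w.private = NULL;
        walk_range(0, 2, &w);
        return 0;
}
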
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index d96050728c43..523e1098ae88 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -514,7 +514,7 @@ static int __init parse_crash_elf64_headers(void)
514 /* Do some basic Verification. */ 514 /* Do some basic Verification. */
515 if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || 515 if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
516 (ehdr.e_type != ET_CORE) || 516 (ehdr.e_type != ET_CORE) ||
517 !elf_check_arch(&ehdr) || 517 !vmcore_elf_check_arch(&ehdr) ||
518 ehdr.e_ident[EI_CLASS] != ELFCLASS64 || 518 ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
519 ehdr.e_ident[EI_VERSION] != EV_CURRENT || 519 ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
520 ehdr.e_version != EV_CURRENT || 520 ehdr.e_version != EV_CURRENT ||
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 83bc8e7824cd..75fc8498f2e2 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -536,8 +536,7 @@ static void init_once(void *foo, struct kmem_cache * cachep,
536{ 536{
537 struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; 537 struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
538 538
539 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == 539 if (flags & SLAB_CTOR_CONSTRUCTOR)
540 SLAB_CTOR_CONSTRUCTOR)
541 inode_init_once(&ei->vfs_inode); 540 inode_init_once(&ei->vfs_inode);
542} 541}
543 542
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f13a7f164dc6..7054aaef0493 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -511,8 +511,7 @@ static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags
511{ 511{
512 struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; 512 struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
513 513
514 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == 514 if (flags & SLAB_CTOR_CONSTRUCTOR) {
515 SLAB_CTOR_CONSTRUCTOR) {
516 INIT_LIST_HEAD(&ei->i_prealloc_list); 515 INIT_LIST_HEAD(&ei->i_prealloc_list);
517 inode_init_once(&ei->vfs_inode); 516 inode_init_once(&ei->vfs_inode);
518#ifdef CONFIG_REISERFS_FS_POSIX_ACL 517#ifdef CONFIG_REISERFS_FS_POSIX_ACL
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index f01389fd162e..bf6e58214538 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -54,82 +54,48 @@
54static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char 54static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char
55 *prefix); 55 *prefix);
56 56
57static struct dentry *create_xa_root(struct super_block *sb) 57/* Returns the dentry referring to the root of the extended attribute
58 * directory tree. If it has already been retrieved, it is used. If it
59 * hasn't been created and the flags indicate creation is allowed, we
60 * attempt to create it. On error, we return a pointer-encoded error.
61 */
62static struct dentry *get_xa_root(struct super_block *sb, int flags)
58{ 63{
59 struct dentry *privroot = dget(REISERFS_SB(sb)->priv_root); 64 struct dentry *privroot = dget(REISERFS_SB(sb)->priv_root);
60 struct dentry *xaroot; 65 struct dentry *xaroot;
61 66
62 /* This needs to be created at mount-time */ 67 /* This needs to be created at mount-time */
63 if (!privroot) 68 if (!privroot)
64 return ERR_PTR(-EOPNOTSUPP); 69 return ERR_PTR(-ENODATA);
65 70
66 xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME)); 71 mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR);
67 if (IS_ERR(xaroot)) { 72 if (REISERFS_SB(sb)->xattr_root) {
73 xaroot = dget(REISERFS_SB(sb)->xattr_root);
68 goto out; 74 goto out;
69 } else if (!xaroot->d_inode) {
70 int err;
71 mutex_lock(&privroot->d_inode->i_mutex);
72 err =
73 privroot->d_inode->i_op->mkdir(privroot->d_inode, xaroot,
74 0700);
75 mutex_unlock(&privroot->d_inode->i_mutex);
76
77 if (err) {
78 dput(xaroot);
79 dput(privroot);
80 return ERR_PTR(err);
81 }
82 REISERFS_SB(sb)->xattr_root = dget(xaroot);
83 } 75 }
84 76
85 out:
86 dput(privroot);
87 return xaroot;
88}
89
 90/* This will return a dentry, or error, referring to the xa root directory.
91 * If the xa root doesn't exist yet, the dentry will be returned without
92 * an associated inode. This dentry can be used with ->mkdir to create
93 * the xa directory. */
94static struct dentry *__get_xa_root(struct super_block *s)
95{
96 struct dentry *privroot = dget(REISERFS_SB(s)->priv_root);
97 struct dentry *xaroot = NULL;
98
99 if (IS_ERR(privroot) || !privroot)
100 return privroot;
101
102 xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME)); 77 xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME));
103 if (IS_ERR(xaroot)) { 78 if (IS_ERR(xaroot)) {
104 goto out; 79 goto out;
105 } else if (!xaroot->d_inode) { 80 } else if (!xaroot->d_inode) {
106 dput(xaroot); 81 int err = -ENODATA;
107 xaroot = NULL; 82 if (flags == 0 || flags & XATTR_CREATE)
108 goto out; 83 err = privroot->d_inode->i_op->mkdir(privroot->d_inode,
84 xaroot, 0700);
85 if (err) {
86 dput(xaroot);
87 xaroot = ERR_PTR(err);
88 goto out;
89 }
109 } 90 }
110 91 REISERFS_SB(sb)->xattr_root = dget(xaroot);
111 REISERFS_SB(s)->xattr_root = dget(xaroot);
112 92
113 out: 93 out:
94 mutex_unlock(&privroot->d_inode->i_mutex);
114 dput(privroot); 95 dput(privroot);
115 return xaroot; 96 return xaroot;
116} 97}
117 98
118/* Returns the dentry (or NULL) referring to the root of the extended
119 * attribute directory tree. If it has already been retrieved, it is used.
120 * Otherwise, we attempt to retrieve it from disk. It may also return
121 * a pointer-encoded error.
122 */
123static inline struct dentry *get_xa_root(struct super_block *s)
124{
125 struct dentry *dentry = dget(REISERFS_SB(s)->xattr_root);
126
127 if (!dentry)
128 dentry = __get_xa_root(s);
129
130 return dentry;
131}
132
133/* Opens the directory corresponding to the inode's extended attribute store. 99/* Opens the directory corresponding to the inode's extended attribute store.
134 * If flags allow, the tree to the directory may be created. If creation is 100 * If flags allow, the tree to the directory may be created. If creation is
135 * prohibited, -ENODATA is returned. */ 101 * prohibited, -ENODATA is returned. */
@@ -138,21 +104,11 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
138 struct dentry *xaroot, *xadir; 104 struct dentry *xaroot, *xadir;
139 char namebuf[17]; 105 char namebuf[17];
140 106
141 xaroot = get_xa_root(inode->i_sb); 107 xaroot = get_xa_root(inode->i_sb, flags);
142 if (IS_ERR(xaroot)) { 108 if (IS_ERR(xaroot))
143 return xaroot; 109 return xaroot;
144 } else if (!xaroot) {
145 if (flags == 0 || flags & XATTR_CREATE) {
146 xaroot = create_xa_root(inode->i_sb);
147 if (IS_ERR(xaroot))
148 return xaroot;
149 }
150 if (!xaroot)
151 return ERR_PTR(-ENODATA);
152 }
153 110
154 /* ok, we have xaroot open */ 111 /* ok, we have xaroot open */
155
156 snprintf(namebuf, sizeof(namebuf), "%X.%X", 112 snprintf(namebuf, sizeof(namebuf), "%X.%X",
157 le32_to_cpu(INODE_PKEY(inode)->k_objectid), 113 le32_to_cpu(INODE_PKEY(inode)->k_objectid),
158 inode->i_generation); 114 inode->i_generation);
@@ -454,11 +410,7 @@ static struct page *reiserfs_get_page(struct inode *dir, unsigned long n)
454 mapping_set_gfp_mask(mapping, GFP_NOFS); 410 mapping_set_gfp_mask(mapping, GFP_NOFS);
455 page = read_mapping_page(mapping, n, NULL); 411 page = read_mapping_page(mapping, n, NULL);
456 if (!IS_ERR(page)) { 412 if (!IS_ERR(page)) {
457 wait_on_page_locked(page);
458 kmap(page); 413 kmap(page);
459 if (!PageUptodate(page))
460 goto fail;
461
462 if (PageError(page)) 414 if (PageError(page))
463 goto fail; 415 goto fail;
464 } 416 }
@@ -821,7 +773,7 @@ int reiserfs_delete_xattrs(struct inode *inode)
821 773
822 /* Leftovers besides . and .. -- that's not good. */ 774 /* Leftovers besides . and .. -- that's not good. */
823 if (dir->d_inode->i_nlink <= 2) { 775 if (dir->d_inode->i_nlink <= 2) {
824 root = get_xa_root(inode->i_sb); 776 root = get_xa_root(inode->i_sb, XATTR_REPLACE);
825 reiserfs_write_lock_xattrs(inode->i_sb); 777 reiserfs_write_lock_xattrs(inode->i_sb);
826 err = vfs_rmdir(root->d_inode, dir); 778 err = vfs_rmdir(root->d_inode, dir);
827 reiserfs_write_unlock_xattrs(inode->i_sb); 779 reiserfs_write_unlock_xattrs(inode->i_sb);
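get_xa_root() now folds lookup, optional creation, caching and locking into one function keyed on the xattr flags: 0 or XATTR_CREATE permit creating the root directory, XATTR_REPLACE is lookup-only, and the I_MUTEX_XATTR nesting annotation keeps lockdep quiet when the private root's i_mutex is taken alongside a regular inode's. A hedged sketch of the caller pattern (helper name hypothetical), mirroring the delete path above:

        /* sketch: look up the xattr root without creating it */
        static int probe_xa_root(struct super_block *sb)
        {
                struct dentry *root = get_xa_root(sb, XATTR_REPLACE);

                if (IS_ERR(root))
                        return PTR_ERR(root);   /* -ENODATA when absent */
                dput(root);
                return 0;
        }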
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index fd601014813e..804285190271 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -570,8 +570,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
570{ 570{
571 struct romfs_inode_info *ei = (struct romfs_inode_info *) foo; 571 struct romfs_inode_info *ei = (struct romfs_inode_info *) foo;
572 572
573 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 573 if (flags & SLAB_CTOR_CONSTRUCTOR)
574 SLAB_CTOR_CONSTRUCTOR)
575 inode_init_once(&ei->vfs_inode); 574 inode_init_once(&ei->vfs_inode);
576} 575}
577 576
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 5faba4f1c9ab..424a3ddf86dd 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -69,9 +69,8 @@ static void smb_destroy_inode(struct inode *inode)
69static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags) 69static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
70{ 70{
71 struct smb_inode_info *ei = (struct smb_inode_info *) foo; 71 struct smb_inode_info *ei = (struct smb_inode_info *) foo;
72 unsigned long flagmask = SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR;
73 72
74 if ((flags & flagmask) == SLAB_CTOR_CONSTRUCTOR) 73 if (flags & SLAB_CTOR_CONSTRUCTOR)
75 inode_init_once(&ei->vfs_inode); 74 inode_init_once(&ei->vfs_inode);
76} 75}
77 76
diff --git a/fs/super.c b/fs/super.c
index 60b1e50cbf53..8341e4e1d738 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -725,16 +725,6 @@ static int test_bdev_super(struct super_block *s, void *data)
725 return (void *)s->s_bdev == data; 725 return (void *)s->s_bdev == data;
726} 726}
727 727
728static void bdev_uevent(struct block_device *bdev, enum kobject_action action)
729{
730 if (bdev->bd_disk) {
731 if (bdev->bd_part)
732 kobject_uevent(&bdev->bd_part->kobj, action);
733 else
734 kobject_uevent(&bdev->bd_disk->kobj, action);
735 }
736}
737
738int get_sb_bdev(struct file_system_type *fs_type, 728int get_sb_bdev(struct file_system_type *fs_type,
739 int flags, const char *dev_name, void *data, 729 int flags, const char *dev_name, void *data,
740 int (*fill_super)(struct super_block *, void *, int), 730 int (*fill_super)(struct super_block *, void *, int),
@@ -782,7 +772,6 @@ int get_sb_bdev(struct file_system_type *fs_type,
782 } 772 }
783 773
784 s->s_flags |= MS_ACTIVE; 774 s->s_flags |= MS_ACTIVE;
785 bdev_uevent(bdev, KOBJ_MOUNT);
786 } 775 }
787 776
788 return simple_set_mnt(mnt, s); 777 return simple_set_mnt(mnt, s);
@@ -801,7 +790,6 @@ void kill_block_super(struct super_block *sb)
801{ 790{
802 struct block_device *bdev = sb->s_bdev; 791 struct block_device *bdev = sb->s_bdev;
803 792
804 bdev_uevent(bdev, KOBJ_UMOUNT);
805 generic_shutdown_super(sb); 793 generic_shutdown_super(sb);
806 sync_blockdev(bdev); 794 sync_blockdev(bdev);
807 close_bdev_excl(bdev); 795 close_bdev_excl(bdev);
diff --git a/fs/sync.c b/fs/sync.c
index d0feff61e6aa..5cb9e7e43383 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -239,13 +239,11 @@ out:
239/* 239/*
240 * `endbyte' is inclusive 240 * `endbyte' is inclusive
241 */ 241 */
242int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, 242int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
243 unsigned int flags) 243 loff_t endbyte, unsigned int flags)
244{ 244{
245 int ret; 245 int ret;
246 struct address_space *mapping;
247 246
248 mapping = file->f_mapping;
249 if (!mapping) { 247 if (!mapping) {
250 ret = -EINVAL; 248 ret = -EINVAL;
251 goto out; 249 goto out;
@@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
275out: 273out:
276 return ret; 274 return ret;
277} 275}
278EXPORT_SYMBOL_GPL(do_sync_file_range); 276EXPORT_SYMBOL_GPL(do_sync_mapping_range);
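Retargeting the helper at the address_space means a ranged writeback no longer requires a struct file, only a mapping. The old file-based entry point presumably reduces to a thin wrapper along these lines (a sketch, not the committed code):

        /* sketch: the file-based variant expressed via the new helper */
        static inline int do_sync_file_range(struct file *file, loff_t offset,
                                             loff_t endbyte, unsigned int flags)
        {
                return do_sync_mapping_range(file->f_mapping, offset,
                                             endbyte, flags);
        }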
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index fc4633378dc0..0e637adc2b87 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -13,8 +13,7 @@
13 13
14#include "sysfs.h" 14#include "sysfs.h"
15 15
16#define to_subsys(k) container_of(k,struct subsystem,kset.kobj) 16#define to_sattr(a) container_of(a,struct subsys_attribute, attr)
17#define to_sattr(a) container_of(a,struct subsys_attribute,attr)
18 17
19/* 18/*
20 * Subsystem file operations. 19 * Subsystem file operations.
@@ -24,12 +23,12 @@
24static ssize_t 23static ssize_t
25subsys_attr_show(struct kobject * kobj, struct attribute * attr, char * page) 24subsys_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
26{ 25{
27 struct subsystem * s = to_subsys(kobj); 26 struct kset *kset = to_kset(kobj);
28 struct subsys_attribute * sattr = to_sattr(attr); 27 struct subsys_attribute * sattr = to_sattr(attr);
29 ssize_t ret = -EIO; 28 ssize_t ret = -EIO;
30 29
31 if (sattr->show) 30 if (sattr->show)
32 ret = sattr->show(s,page); 31 ret = sattr->show(kset, page);
33 return ret; 32 return ret;
34} 33}
35 34
@@ -37,12 +36,12 @@ static ssize_t
37subsys_attr_store(struct kobject * kobj, struct attribute * attr, 36subsys_attr_store(struct kobject * kobj, struct attribute * attr,
38 const char * page, size_t count) 37 const char * page, size_t count)
39{ 38{
40 struct subsystem * s = to_subsys(kobj); 39 struct kset *kset = to_kset(kobj);
41 struct subsys_attribute * sattr = to_sattr(attr); 40 struct subsys_attribute * sattr = to_sattr(attr);
42 ssize_t ret = -EIO; 41 ssize_t ret = -EIO;
43 42
44 if (sattr->store) 43 if (sattr->store)
45 ret = sattr->store(s,page,count); 44 ret = sattr->store(kset, page, count);
46 return ret; 45 return ret;
47} 46}
48 47
@@ -633,6 +632,7 @@ struct sysfs_schedule_callback_struct {
633 struct kobject *kobj; 632 struct kobject *kobj;
634 void (*func)(void *); 633 void (*func)(void *);
635 void *data; 634 void *data;
635 struct module *owner;
636 struct work_struct work; 636 struct work_struct work;
637}; 637};
638 638
@@ -643,6 +643,7 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
643 643
644 (ss->func)(ss->data); 644 (ss->func)(ss->data);
645 kobject_put(ss->kobj); 645 kobject_put(ss->kobj);
646 module_put(ss->owner);
646 kfree(ss); 647 kfree(ss);
647} 648}
648 649
@@ -651,6 +652,7 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
651 * @kobj: object we're acting for. 652 * @kobj: object we're acting for.
652 * @func: callback function to invoke later. 653 * @func: callback function to invoke later.
653 * @data: argument to pass to @func. 654 * @data: argument to pass to @func.
655 * @owner: module owning the callback code
654 * 656 *
655 * sysfs attribute methods must not unregister themselves or their parent 657 * sysfs attribute methods must not unregister themselves or their parent
656 * kobject (which would amount to the same thing). Attempts to do so will 658 * kobject (which would amount to the same thing). Attempts to do so will
@@ -663,20 +665,25 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
663 * until @func returns. 665 * until @func returns.
664 * 666 *
665 * Returns 0 if the request was submitted, -ENOMEM if storage could not 667 * Returns 0 if the request was submitted, -ENOMEM if storage could not
666 * be allocated. 668 * be allocated, -ENODEV if a reference to @owner isn't available.
667 */ 669 */
668int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), 670int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
669 void *data) 671 void *data, struct module *owner)
670{ 672{
671 struct sysfs_schedule_callback_struct *ss; 673 struct sysfs_schedule_callback_struct *ss;
672 674
675 if (!try_module_get(owner))
676 return -ENODEV;
673 ss = kmalloc(sizeof(*ss), GFP_KERNEL); 677 ss = kmalloc(sizeof(*ss), GFP_KERNEL);
674 if (!ss) 678 if (!ss) {
679 module_put(owner);
675 return -ENOMEM; 680 return -ENOMEM;
681 }
676 kobject_get(kobj); 682 kobject_get(kobj);
677 ss->kobj = kobj; 683 ss->kobj = kobj;
678 ss->func = func; 684 ss->func = func;
679 ss->data = data; 685 ss->data = data;
686 ss->owner = owner;
680 INIT_WORK(&ss->work, sysfs_schedule_callback_work); 687 INIT_WORK(&ss->work, sysfs_schedule_callback_work);
681 schedule_work(&ss->work); 688 schedule_work(&ss->work);
682 return 0; 689 return 0;
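Two things happen in this sysfs hunk: subsystem attributes now operate on the embedded kset directly, and deferred callbacks pin the module that owns the callback code so it cannot be unloaded while the work is queued. A hedged usage sketch (function names hypothetical):

        static void deferred_unregister(void *data)
        {
                struct kobject *kobj = data;

                kobject_unregister(kobj);       /* legal here: workqueue context */
        }

        /* called from an attribute method, which must not unregister itself */
        static int schedule_self_removal(struct kobject *kobj)
        {
                return sysfs_schedule_callback(kobj, deferred_unregister, kobj,
                                               THIS_MODULE);
        }

try_module_get() fails once the owning module is going away, so the new -ENODEV return closes the unload race rather than papering over it.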
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index b20951c93761..52eed2a7a5ef 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -70,9 +70,11 @@ void sysfs_remove_group(struct kobject * kobj,
70{ 70{
71 struct dentry * dir; 71 struct dentry * dir;
72 72
73 if (grp->name) 73 if (grp->name) {
74 dir = lookup_one_len(grp->name, kobj->dentry, 74 dir = lookup_one_len_kern(grp->name, kobj->dentry,
75 strlen(grp->name)); 75 strlen(grp->name));
76 BUG_ON(IS_ERR(dir));
77 }
76 else 78 else
77 dir = dget(kobj->dentry); 79 dir = dget(kobj->dentry);
78 80
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index ebf7007fa161..e566b387fcf9 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -54,17 +54,9 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
54{ 54{
55 struct address_space *mapping = dir->i_mapping; 55 struct address_space *mapping = dir->i_mapping;
56 struct page *page = read_mapping_page(mapping, n, NULL); 56 struct page *page = read_mapping_page(mapping, n, NULL);
57 if (!IS_ERR(page)) { 57 if (!IS_ERR(page))
58 wait_on_page_locked(page);
59 kmap(page); 58 kmap(page);
60 if (!PageUptodate(page))
61 goto fail;
62 }
63 return page; 59 return page;
64
65fail:
66 dir_put_page(page);
67 return ERR_PTR(-EIO);
68} 60}
69 61
70static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) 62static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
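This is the same caller-side simplification seen in reiserfs above and repeated in ufs below: read_mapping_page() now returns either an ERR_PTR or a page that is already up to date, so the wait_on_page_locked()/PageUptodate() dance is dead weight. The resulting invariant, sketched:

        page = read_mapping_page(mapping, n, NULL);
        if (IS_ERR(page))
                return page;    /* read failure already folded into ERR_PTR */
        kmap(page);
        /* the page is up to date from here on */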
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9311cac186fe..3152d7415606 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -322,8 +322,7 @@ static void init_once(void *p, struct kmem_cache *cachep, unsigned long flags)
322{ 322{
323 struct sysv_inode_info *si = (struct sysv_inode_info *)p; 323 struct sysv_inode_info *si = (struct sysv_inode_info *)p;
324 324
325 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 325 if (flags & SLAB_CTOR_CONSTRUCTOR)
326 SLAB_CTOR_CONSTRUCTOR)
327 inode_init_once(&si->vfs_inode); 326 inode_init_once(&si->vfs_inode);
328} 327}
329 328
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 8672b88f7ff2..023b304fdd99 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -134,9 +134,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
134{ 134{
135 struct udf_inode_info *ei = (struct udf_inode_info *) foo; 135 struct udf_inode_info *ei = (struct udf_inode_info *) foo;
136 136
137 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 137 if (flags & SLAB_CTOR_CONSTRUCTOR) {
138 SLAB_CTOR_CONSTRUCTOR)
139 {
140 ei->i_ext.i_data = NULL; 138 ei->i_ext.i_data = NULL;
141 inode_init_once(&ei->vfs_inode); 139 inode_init_once(&ei->vfs_inode);
142 } 140 }
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 4890ddf1518e..4fb8b2e077ee 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -180,13 +180,9 @@ fail:
180static struct page *ufs_get_page(struct inode *dir, unsigned long n) 180static struct page *ufs_get_page(struct inode *dir, unsigned long n)
181{ 181{
182 struct address_space *mapping = dir->i_mapping; 182 struct address_space *mapping = dir->i_mapping;
183 struct page *page = read_cache_page(mapping, n, 183 struct page *page = read_mapping_page(mapping, n, NULL);
184 (filler_t*)mapping->a_ops->readpage, NULL);
185 if (!IS_ERR(page)) { 184 if (!IS_ERR(page)) {
186 wait_on_page_locked(page);
187 kmap(page); 185 kmap(page);
188 if (!PageUptodate(page))
189 goto fail;
190 if (!PageChecked(page)) 186 if (!PageChecked(page))
191 ufs_check_page(page); 187 ufs_check_page(page);
192 if (PageError(page)) 188 if (PageError(page))
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 013d7afe7cde..f18b79122fa3 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -601,7 +601,7 @@ static void ufs_set_inode_ops(struct inode *inode)
601 ufs_get_inode_dev(inode->i_sb, UFS_I(inode))); 601 ufs_get_inode_dev(inode->i_sb, UFS_I(inode)));
602} 602}
603 603
604static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) 604static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
605{ 605{
606 struct ufs_inode_info *ufsi = UFS_I(inode); 606 struct ufs_inode_info *ufsi = UFS_I(inode);
607 struct super_block *sb = inode->i_sb; 607 struct super_block *sb = inode->i_sb;
@@ -613,8 +613,10 @@ static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
613 */ 613 */
614 inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); 614 inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode);
615 inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink); 615 inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink);
616 if (inode->i_nlink == 0) 616 if (inode->i_nlink == 0) {
617 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); 617 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
618 return -1;
619 }
618 620
619 /* 621 /*
620 * Linux now has 32-bit uid and gid, so we can support EFT. 622 * Linux now has 32-bit uid and gid, so we can support EFT.
@@ -643,9 +645,10 @@ static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
643 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) 645 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
644 ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i]; 646 ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
645 } 647 }
648 return 0;
646} 649}
647 650
648static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) 651static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
649{ 652{
650 struct ufs_inode_info *ufsi = UFS_I(inode); 653 struct ufs_inode_info *ufsi = UFS_I(inode);
651 struct super_block *sb = inode->i_sb; 654 struct super_block *sb = inode->i_sb;
@@ -658,8 +661,10 @@ static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
658 */ 661 */
659 inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); 662 inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode);
660 inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink); 663 inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink);
661 if (inode->i_nlink == 0) 664 if (inode->i_nlink == 0) {
662 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); 665 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
666 return -1;
667 }
663 668
664 /* 669 /*
665 * Linux now has 32-bit uid and gid, so we can support EFT. 670 * Linux now has 32-bit uid and gid, so we can support EFT.
@@ -690,6 +695,7 @@ static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
690 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) 695 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
691 ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i]; 696 ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i];
692 } 697 }
698 return 0;
693} 699}
694 700
695void ufs_read_inode(struct inode * inode) 701void ufs_read_inode(struct inode * inode)
@@ -698,6 +704,7 @@ void ufs_read_inode(struct inode * inode)
698 struct super_block * sb; 704 struct super_block * sb;
699 struct ufs_sb_private_info * uspi; 705 struct ufs_sb_private_info * uspi;
700 struct buffer_head * bh; 706 struct buffer_head * bh;
707 int err;
701 708
702 UFSD("ENTER, ino %lu\n", inode->i_ino); 709 UFSD("ENTER, ino %lu\n", inode->i_ino);
703 710
@@ -720,14 +727,17 @@ void ufs_read_inode(struct inode * inode)
720 if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { 727 if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
721 struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data; 728 struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data;
722 729
723 ufs2_read_inode(inode, 730 err = ufs2_read_inode(inode,
724 ufs2_inode + ufs_inotofsbo(inode->i_ino)); 731 ufs2_inode + ufs_inotofsbo(inode->i_ino));
725 } else { 732 } else {
726 struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data; 733 struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data;
727 734
728 ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); 735 err = ufs1_read_inode(inode,
736 ufs_inode + ufs_inotofsbo(inode->i_ino));
729 } 737 }
730 738
739 if (err)
740 goto bad_inode;
731 inode->i_version++; 741 inode->i_version++;
732 ufsi->i_lastfrag = 742 ufsi->i_lastfrag =
733 (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; 743 (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -888,6 +898,8 @@ void ufs_delete_inode (struct inode * inode)
888 loff_t old_i_size; 898 loff_t old_i_size;
889 899
890 truncate_inode_pages(&inode->i_data, 0); 900 truncate_inode_pages(&inode->i_data, 0);
901 if (is_bad_inode(inode))
902 goto no_delete;
891 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ 903 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
892 lock_kernel(); 904 lock_kernel();
893 mark_inode_dirty(inode); 905 mark_inode_dirty(inode);
@@ -898,4 +910,7 @@ void ufs_delete_inode (struct inode * inode)
898 ufs_warning(inode->i_sb, __FUNCTION__, "ufs_truncate failed\n"); 910 ufs_warning(inode->i_sb, __FUNCTION__, "ufs_truncate failed\n");
899 ufs_free_inode (inode); 911 ufs_free_inode (inode);
900 unlock_kernel(); 912 unlock_kernel();
913 return;
914no_delete:
915 clear_inode(inode); /* We must guarantee clearing of inode... */
901} 916}
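ufs1_read_inode() and ufs2_read_inode() now report the zero-nlink corruption instead of pressing on, and ufs_delete_inode() refuses to touch an inode that was marked bad. The bad_inode label sits outside this hunk; the conventional body it jumps to is roughly (a sketch):

        bad_inode:
                make_bad_inode(inode);  /* i_op becomes the EIO stub set */
                return;

is_bad_inode() in the delete path then skips the truncate and free work and only clears the in-core inode, which is exactly the no_delete branch added above.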
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index b5a6461ec66b..be7c48c5f203 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1237,8 +1237,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
1237{ 1237{
1238 struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; 1238 struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
1239 1239
1240 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 1240 if (flags & SLAB_CTOR_CONSTRUCTOR)
1241 SLAB_CTOR_CONSTRUCTOR)
1242 inode_init_once(&ei->vfs_inode); 1241 inode_init_once(&ei->vfs_inode);
1243} 1242}
1244 1243
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 17437574f79c..84357f1ff0ec 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -251,13 +251,11 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
251 251
252 page = find_lock_page(mapping, index); 252 page = find_lock_page(mapping, index);
253 if (!page) { 253 if (!page) {
254 page = read_cache_page(mapping, index, 254 page = read_mapping_page(mapping, index, NULL);
255 (filler_t*)mapping->a_ops->readpage,
256 NULL);
257 255
258 if (IS_ERR(page)) { 256 if (IS_ERR(page)) {
259 printk(KERN_ERR "ufs_change_blocknr: " 257 printk(KERN_ERR "ufs_change_blocknr: "
260 "read_cache_page error: ino %lu, index: %lu\n", 258 "read_mapping_page error: ino %lu, index: %lu\n",
261 mapping->host->i_ino, index); 259 mapping->host->i_ino, index);
262 goto out; 260 goto out;
263 } 261 }
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 2f2c40db562e..14e2cbe5a8d5 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -360,8 +360,7 @@ xfs_fs_inode_init_once(
360 kmem_zone_t *zonep, 360 kmem_zone_t *zonep,
361 unsigned long flags) 361 unsigned long flags)
362{ 362{
363 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 363 if (flags & SLAB_CTOR_CONSTRUCTOR)
364 SLAB_CTOR_CONSTRUCTOR)
365 inode_init_once(vn_to_inode((bhv_vnode_t *)vnode)); 364 inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
366} 365}
367 366