aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/adfs/Kconfig1
-rw-r--r--fs/affs/super.c4
-rw-r--r--fs/autofs/Kconfig1
-rw-r--r--fs/binfmt_aout.c4
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/ceph/Kconfig14
-rw-r--r--fs/ceph/Makefile11
-rw-r--r--fs/ceph/README20
-rw-r--r--fs/ceph/addr.c65
-rw-r--r--fs/ceph/armor.c103
-rw-r--r--fs/ceph/auth.c259
-rw-r--r--fs/ceph/auth.h92
-rw-r--r--fs/ceph/auth_none.c131
-rw-r--r--fs/ceph/auth_none.h30
-rw-r--r--fs/ceph/auth_x.c687
-rw-r--r--fs/ceph/auth_x.h49
-rw-r--r--fs/ceph/auth_x_protocol.h90
-rw-r--r--fs/ceph/buffer.c65
-rw-r--r--fs/ceph/buffer.h39
-rw-r--r--fs/ceph/caps.c81
-rw-r--r--fs/ceph/ceph_debug.h37
-rw-r--r--fs/ceph/ceph_frag.c3
-rw-r--r--fs/ceph/ceph_frag.h109
-rw-r--r--fs/ceph/ceph_fs.c72
-rw-r--r--fs/ceph/ceph_fs.h728
-rw-r--r--fs/ceph/ceph_hash.c118
-rw-r--r--fs/ceph/ceph_hash.h13
-rw-r--r--fs/ceph/crush/crush.c151
-rw-r--r--fs/ceph/crush/crush.h180
-rw-r--r--fs/ceph/crush/hash.c149
-rw-r--r--fs/ceph/crush/hash.h17
-rw-r--r--fs/ceph/crush/mapper.c609
-rw-r--r--fs/ceph/crush/mapper.h20
-rw-r--r--fs/ceph/crypto.c412
-rw-r--r--fs/ceph/crypto.h48
-rw-r--r--fs/ceph/debugfs.c406
-rw-r--r--fs/ceph/decode.h196
-rw-r--r--fs/ceph/dir.c97
-rw-r--r--fs/ceph/export.c26
-rw-r--r--fs/ceph/file.c209
-rw-r--r--fs/ceph/inode.c19
-rw-r--r--fs/ceph/ioctl.c77
-rw-r--r--fs/ceph/ioctl.h4
-rw-r--r--fs/ceph/locks.c23
-rw-r--r--fs/ceph/mds_client.c129
-rw-r--r--fs/ceph/mds_client.h20
-rw-r--r--fs/ceph/mdsmap.c11
-rw-r--r--fs/ceph/mdsmap.h62
-rw-r--r--fs/ceph/messenger.c2277
-rw-r--r--fs/ceph/messenger.h253
-rw-r--r--fs/ceph/mon_client.c1018
-rw-r--r--fs/ceph/mon_client.h121
-rw-r--r--fs/ceph/msgpool.c64
-rw-r--r--fs/ceph/msgpool.h25
-rw-r--r--fs/ceph/msgr.h175
-rw-r--r--fs/ceph/osd_client.c1539
-rw-r--r--fs/ceph/osd_client.h167
-rw-r--r--fs/ceph/osdmap.c1110
-rw-r--r--fs/ceph/osdmap.h128
-rw-r--r--fs/ceph/pagelist.c63
-rw-r--r--fs/ceph/pagelist.h54
-rw-r--r--fs/ceph/rados.h405
-rw-r--r--fs/ceph/snap.c10
-rw-r--r--fs/ceph/strings.c (renamed from fs/ceph/ceph_strings.c)82
-rw-r--r--fs/ceph/super.c1154
-rw-r--r--fs/ceph/super.h400
-rw-r--r--fs/ceph/types.h29
-rw-r--r--fs/ceph/xattr.c18
-rw-r--r--fs/cifs/cifssmb.c49
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/compat_ioctl.c70
-rw-r--r--fs/exec.c40
-rw-r--r--fs/exofs/inode.c8
-rw-r--r--fs/ext3/super.c4
-rw-r--r--fs/ext4/super.c8
-rw-r--r--fs/fs-writeback.c19
-rw-r--r--fs/fuse/dev.c2
-rw-r--r--fs/gfs2/Kconfig2
-rw-r--r--fs/gfs2/aops.c24
-rw-r--r--fs/gfs2/bmap.c255
-rw-r--r--fs/gfs2/bmap.h20
-rw-r--r--fs/gfs2/dentry.c2
-rw-r--r--fs/gfs2/dir.c31
-rw-r--r--fs/gfs2/dir.h34
-rw-r--r--fs/gfs2/export.c9
-rw-r--r--fs/gfs2/file.c6
-rw-r--r--fs/gfs2/glock.c23
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c6
-rw-r--r--fs/gfs2/incore.h8
-rw-r--r--fs/gfs2/inode.c9
-rw-r--r--fs/gfs2/inode.h15
-rw-r--r--fs/gfs2/lock_dlm.c4
-rw-r--r--fs/gfs2/main.c6
-rw-r--r--fs/gfs2/ops_fstype.c79
-rw-r--r--fs/gfs2/ops_inode.c326
-rw-r--r--fs/gfs2/quota.c16
-rw-r--r--fs/gfs2/recovery.c15
-rw-r--r--fs/gfs2/rgrp.c50
-rw-r--r--fs/gfs2/rgrp.h8
-rw-r--r--fs/gfs2/super.c26
-rw-r--r--fs/gfs2/sys.c22
-rw-r--r--fs/gfs2/trace_gfs2.h3
-rw-r--r--fs/gfs2/trans.h9
-rw-r--r--fs/gfs2/xattr.c2
-rw-r--r--fs/hfs/bfind.c4
-rw-r--r--fs/hfs/btree.c2
-rw-r--r--fs/hfs/btree.h2
-rw-r--r--fs/hfsplus/bfind.c17
-rw-r--r--fs/hfsplus/bitmap.c20
-rw-r--r--fs/hfsplus/brec.c29
-rw-r--r--fs/hfsplus/btree.c67
-rw-r--r--fs/hfsplus/catalog.c50
-rw-r--r--fs/hfsplus/dir.c201
-rw-r--r--fs/hfsplus/extents.c223
-rw-r--r--fs/hfsplus/hfsplus_fs.h85
-rw-r--r--fs/hfsplus/hfsplus_raw.h3
-rw-r--r--fs/hfsplus/inode.c185
-rw-r--r--fs/hfsplus/ioctl.c153
-rw-r--r--fs/hfsplus/options.c10
-rw-r--r--fs/hfsplus/part_tbl.c5
-rw-r--r--fs/hfsplus/super.c310
-rw-r--r--fs/hfsplus/unicode.c16
-rw-r--r--fs/hfsplus/wrapper.c40
-rw-r--r--fs/hpfs/Kconfig1
-rw-r--r--fs/jbd2/journal.c4
-rw-r--r--fs/libfs.c29
-rw-r--r--fs/nfs/Kconfig1
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/nfsfh.h2
-rw-r--r--fs/notify/Kconfig2
-rw-r--r--fs/ocfs2/aops.c9
-rw-r--r--fs/ocfs2/aops.h3
-rw-r--r--fs/ocfs2/cluster/heartbeat.c532
-rw-r--r--fs/ocfs2/cluster/heartbeat.h4
-rw-r--r--fs/ocfs2/cluster/masklog.h3
-rw-r--r--fs/ocfs2/cluster/nodemanager.c5
-rw-r--r--fs/ocfs2/cluster/ocfs2_nodemanager.h6
-rw-r--r--fs/ocfs2/cluster/tcp.c5
-rw-r--r--fs/ocfs2/dcache.c33
-rw-r--r--fs/ocfs2/dcache.h1
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h29
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c12
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c400
-rw-r--r--fs/ocfs2/dlmglue.c8
-rw-r--r--fs/ocfs2/file.c73
-rw-r--r--fs/ocfs2/inode.c1
-rw-r--r--fs/ocfs2/inode.h12
-rw-r--r--fs/ocfs2/ioctl.c356
-rw-r--r--fs/ocfs2/journal.c9
-rw-r--r--fs/ocfs2/journal.h3
-rw-r--r--fs/ocfs2/mmap.c7
-rw-r--r--fs/ocfs2/namei.c3
-rw-r--r--fs/ocfs2/ocfs2.h63
-rw-r--r--fs/ocfs2/ocfs2_fs.h46
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h95
-rw-r--r--fs/ocfs2/refcounttree.c43
-rw-r--r--fs/ocfs2/refcounttree.h7
-rw-r--r--fs/ocfs2/slot_map.c2
-rw-r--r--fs/ocfs2/stack_o2cb.c2
-rw-r--r--fs/ocfs2/suballoc.c16
-rw-r--r--fs/ocfs2/super.c163
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/ocfs2/sysfile.c60
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/proc/base.c4
-rw-r--r--fs/reiserfs/ioctl.c7
-rw-r--r--fs/smbfs/Kconfig1
-rw-r--r--fs/sysfs/group.c59
-rw-r--r--fs/udf/Kconfig1
-rw-r--r--fs/ufs/Kconfig1
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c19
-rw-r--r--fs/xfs/xfs_log_cil.c12
-rw-r--r--fs/xfs/xfs_log_priv.h37
175 files changed, 4574 insertions, 15044 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 3d185308ec88..65781de44fc0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -50,6 +50,7 @@ endif # BLOCK
50config FILE_LOCKING 50config FILE_LOCKING
51 bool "Enable POSIX file locking API" if EMBEDDED 51 bool "Enable POSIX file locking API" if EMBEDDED
52 default y 52 default y
53 select BKL # while lockd still uses it.
53 help 54 help
54 This option enables standard file locking support, required 55 This option enables standard file locking support, required
55 for filesystems like NFS and for the flock() system 56 for filesystems like NFS and for the flock() system
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a74605..1dd5f34b3cf2 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,7 @@
1config ADFS_FS 1config ADFS_FS
2 tristate "ADFS file system support (EXPERIMENTAL)" 2 tristate "ADFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL 3 depends on BLOCK && EXPERIMENTAL
4 depends on BKL # need to fix
4 help 5 help
5 The Acorn Disc Filing System is the standard file system of the 6 The Acorn Disc Filing System is the standard file system of the
6 RiscOS operating system which runs on Acorn's ARM-based Risc PC 7 RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/affs/super.c b/fs/affs/super.c
index a167f96d79f7..fa4fbe1e238a 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -104,8 +104,8 @@ static void init_once(void *foo)
104{ 104{
105 struct affs_inode_info *ei = (struct affs_inode_info *) foo; 105 struct affs_inode_info *ei = (struct affs_inode_info *) foo;
106 106
107 init_MUTEX(&ei->i_link_lock); 107 sema_init(&ei->i_link_lock, 1);
108 init_MUTEX(&ei->i_ext_lock); 108 sema_init(&ei->i_ext_lock, 1);
109 inode_init_once(&ei->vfs_inode); 109 inode_init_once(&ei->vfs_inode);
110} 110}
111 111
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
index 5f3bea90911e..480e210c83ab 100644
--- a/fs/autofs/Kconfig
+++ b/fs/autofs/Kconfig
@@ -1,5 +1,6 @@
1config AUTOFS_FS 1config AUTOFS_FS
2 tristate "Kernel automounter support" 2 tristate "Kernel automounter support"
3 depends on BKL # unfixable, just use autofs4
3 help 4 help
4 The automounter is a tool to automatically mount remote file systems 5 The automounter is a tool to automatically mount remote file systems
5 on demand. This implementation is partially kernel-based to reduce 6 on demand. This implementation is partially kernel-based to reduce
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f96eff04e11a..a6395bdb26ae 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm)
134 if (!dump_write(file, dump_start, dump_size)) 134 if (!dump_write(file, dump_start, dump_size))
135 goto end_coredump; 135 goto end_coredump;
136 } 136 }
137/* Finally dump the task struct. Not be used by gdb, but could be useful */
138 set_fs(KERNEL_DS);
139 if (!dump_write(file, current, sizeof(*current)))
140 goto end_coredump;
141end_coredump: 137end_coredump:
142 set_fs(fs); 138 set_fs(fs);
143 return has_dumped; 139 return has_dumped;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 535e763ab1a6..6884e198e0c7 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -800,7 +800,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
800 * default mmap base, as well as whatever program they 800 * default mmap base, as well as whatever program they
801 * might try to exec. This is because the brk will 801 * might try to exec. This is because the brk will
802 * follow the loader, and is not movable. */ 802 * follow the loader, and is not movable. */
803#ifdef CONFIG_X86 803#if defined(CONFIG_X86) || defined(CONFIG_ARM)
804 load_bias = 0; 804 load_bias = 0;
805#else 805#else
806 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 806 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 0fcd2640c23f..9eb134ea6eb2 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,9 +1,11 @@
1config CEPH_FS 1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)" 2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select CEPH_LIB
4 select LIBCRC32C 5 select LIBCRC32C
5 select CRYPTO_AES 6 select CRYPTO_AES
6 select CRYPTO 7 select CRYPTO
8 default n
7 help 9 help
8 Choose Y or M here to include support for mounting the 10 Choose Y or M here to include support for mounting the
9 experimental Ceph distributed file system. Ceph is an extremely 11 experimental Ceph distributed file system. Ceph is an extremely
@@ -14,15 +16,3 @@ config CEPH_FS
14 16
15 If unsure, say N. 17 If unsure, say N.
16 18
17config CEPH_FS_PRETTYDEBUG
18 bool "Include file:line in ceph debug output"
19 depends on CEPH_FS
20 default n
21 help
22 If you say Y here, debug output will include a filename and
23 line to aid debugging. This icnreases kernel size and slows
24 execution slightly when debug call sites are enabled (e.g.,
25 via CONFIG_DYNAMIC_DEBUG).
26
27 If unsure, say N.
28
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 278e1172600d..9e6c4f2e8ff1 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
8 8
9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \ 11 mds_client.o mdsmap.o strings.o ceph_frag.o \
12 mds_client.o mdsmap.o \ 12 debugfs.o
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20 13
21else 14else
22#Otherwise we were called directly from the command 15#Otherwise we were called directly from the command
diff --git a/fs/ceph/README b/fs/ceph/README
deleted file mode 100644
index 18352fab37c0..000000000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index efbc604001c8..51bcc5ce3230 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
@@ -10,7 +10,8 @@
10#include <linux/task_io_accounting_ops.h> 10#include <linux/task_io_accounting_ops.h>
11 11
12#include "super.h" 12#include "super.h"
13#include "osd_client.h" 13#include "mds_client.h"
14#include <linux/ceph/osd_client.h>
14 15
15/* 16/*
16 * Ceph address space ops. 17 * Ceph address space ops.
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
193{ 194{
194 struct inode *inode = filp->f_dentry->d_inode; 195 struct inode *inode = filp->f_dentry->d_inode;
195 struct ceph_inode_info *ci = ceph_inode(inode); 196 struct ceph_inode_info *ci = ceph_inode(inode);
196 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 197 struct ceph_osd_client *osdc =
198 &ceph_inode_to_client(inode)->client->osdc;
197 int err = 0; 199 int err = 0;
198 u64 len = PAGE_CACHE_SIZE; 200 u64 len = PAGE_CACHE_SIZE;
199 201
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
265{ 267{
266 struct inode *inode = file->f_dentry->d_inode; 268 struct inode *inode = file->f_dentry->d_inode;
267 struct ceph_inode_info *ci = ceph_inode(inode); 269 struct ceph_inode_info *ci = ceph_inode(inode);
268 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 270 struct ceph_osd_client *osdc =
271 &ceph_inode_to_client(inode)->client->osdc;
269 int rc = 0; 272 int rc = 0;
270 struct page **pages; 273 struct page **pages;
271 loff_t offset; 274 loff_t offset;
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
365{ 368{
366 struct inode *inode; 369 struct inode *inode;
367 struct ceph_inode_info *ci; 370 struct ceph_inode_info *ci;
368 struct ceph_client *client; 371 struct ceph_fs_client *fsc;
369 struct ceph_osd_client *osdc; 372 struct ceph_osd_client *osdc;
370 loff_t page_off = page->index << PAGE_CACHE_SHIFT; 373 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
371 int len = PAGE_CACHE_SIZE; 374 int len = PAGE_CACHE_SIZE;
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
383 } 386 }
384 inode = page->mapping->host; 387 inode = page->mapping->host;
385 ci = ceph_inode(inode); 388 ci = ceph_inode(inode);
386 client = ceph_inode_to_client(inode); 389 fsc = ceph_inode_to_client(inode);
387 osdc = &client->osdc; 390 osdc = &fsc->client->osdc;
388 391
389 /* verify this is a writeable snap context */ 392 /* verify this is a writeable snap context */
390 snapc = (void *)page->private; 393 snapc = (void *)page->private;
@@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
414 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", 417 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
415 inode, page, page->index, page_off, len, snapc); 418 inode, page, page->index, page_off, len, snapc);
416 419
417 writeback_stat = atomic_long_inc_return(&client->writeback_count); 420 writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
418 if (writeback_stat > 421 if (writeback_stat >
419 CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) 422 CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
420 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); 423 set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
421 424
422 set_page_writeback(page); 425 set_page_writeback(page);
423 err = ceph_osdc_writepages(osdc, ceph_vino(inode), 426 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
496 struct address_space *mapping = inode->i_mapping; 499 struct address_space *mapping = inode->i_mapping;
497 __s32 rc = -EIO; 500 __s32 rc = -EIO;
498 u64 bytes = 0; 501 u64 bytes = 0;
499 struct ceph_client *client = ceph_inode_to_client(inode); 502 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
500 long writeback_stat; 503 long writeback_stat;
501 unsigned issued = ceph_caps_issued(ci); 504 unsigned issued = ceph_caps_issued(ci);
502 505
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
529 WARN_ON(!PageUptodate(page)); 532 WARN_ON(!PageUptodate(page));
530 533
531 writeback_stat = 534 writeback_stat =
532 atomic_long_dec_return(&client->writeback_count); 535 atomic_long_dec_return(&fsc->writeback_count);
533 if (writeback_stat < 536 if (writeback_stat <
534 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) 537 CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
535 clear_bdi_congested(&client->backing_dev_info, 538 clear_bdi_congested(&fsc->backing_dev_info,
536 BLK_RW_ASYNC); 539 BLK_RW_ASYNC);
537 540
538 ceph_put_snap_context((void *)page->private); 541 ceph_put_snap_context((void *)page->private);
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
569 * mempool. we avoid the mempool if we can because req->r_num_pages 572 * mempool. we avoid the mempool if we can because req->r_num_pages
570 * may be less than the maximum write size. 573 * may be less than the maximum write size.
571 */ 574 */
572static void alloc_page_vec(struct ceph_client *client, 575static void alloc_page_vec(struct ceph_fs_client *fsc,
573 struct ceph_osd_request *req) 576 struct ceph_osd_request *req)
574{ 577{
575 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, 578 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
576 GFP_NOFS); 579 GFP_NOFS);
577 if (!req->r_pages) { 580 if (!req->r_pages) {
578 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS); 581 req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
579 req->r_pages_from_pool = 1; 582 req->r_pages_from_pool = 1;
580 WARN_ON(!req->r_pages); 583 WARN_ON(!req->r_pages);
581 } 584 }
@@ -590,7 +593,7 @@ static int ceph_writepages_start(struct address_space *mapping,
590 struct inode *inode = mapping->host; 593 struct inode *inode = mapping->host;
591 struct backing_dev_info *bdi = mapping->backing_dev_info; 594 struct backing_dev_info *bdi = mapping->backing_dev_info;
592 struct ceph_inode_info *ci = ceph_inode(inode); 595 struct ceph_inode_info *ci = ceph_inode(inode);
593 struct ceph_client *client; 596 struct ceph_fs_client *fsc;
594 pgoff_t index, start, end; 597 pgoff_t index, start, end;
595 int range_whole = 0; 598 int range_whole = 0;
596 int should_loop = 1; 599 int should_loop = 1;
@@ -617,13 +620,13 @@ static int ceph_writepages_start(struct address_space *mapping,
617 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 620 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
618 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 621 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
619 622
620 client = ceph_inode_to_client(inode); 623 fsc = ceph_inode_to_client(inode);
621 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) { 624 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
622 pr_warning("writepage_start %p on forced umount\n", inode); 625 pr_warning("writepage_start %p on forced umount\n", inode);
623 return -EIO; /* we're in a forced umount, don't write! */ 626 return -EIO; /* we're in a forced umount, don't write! */
624 } 627 }
625 if (client->mount_args->wsize && client->mount_args->wsize < wsize) 628 if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
626 wsize = client->mount_args->wsize; 629 wsize = fsc->mount_options->wsize;
627 if (wsize < PAGE_CACHE_SIZE) 630 if (wsize < PAGE_CACHE_SIZE)
628 wsize = PAGE_CACHE_SIZE; 631 wsize = PAGE_CACHE_SIZE;
629 max_pages_ever = wsize >> PAGE_CACHE_SHIFT; 632 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
@@ -769,7 +772,7 @@ get_more_pages:
769 offset = (unsigned long long)page->index 772 offset = (unsigned long long)page->index
770 << PAGE_CACHE_SHIFT; 773 << PAGE_CACHE_SHIFT;
771 len = wsize; 774 len = wsize;
772 req = ceph_osdc_new_request(&client->osdc, 775 req = ceph_osdc_new_request(&fsc->client->osdc,
773 &ci->i_layout, 776 &ci->i_layout,
774 ceph_vino(inode), 777 ceph_vino(inode),
775 offset, &len, 778 offset, &len,
@@ -782,7 +785,7 @@ get_more_pages:
782 &inode->i_mtime, true, 1); 785 &inode->i_mtime, true, 1);
783 max_pages = req->r_num_pages; 786 max_pages = req->r_num_pages;
784 787
785 alloc_page_vec(client, req); 788 alloc_page_vec(fsc, req);
786 req->r_callback = writepages_finish; 789 req->r_callback = writepages_finish;
787 req->r_inode = inode; 790 req->r_inode = inode;
788 } 791 }
@@ -794,10 +797,10 @@ get_more_pages:
794 inode, page, page->index); 797 inode, page, page->index);
795 798
796 writeback_stat = 799 writeback_stat =
797 atomic_long_inc_return(&client->writeback_count); 800 atomic_long_inc_return(&fsc->writeback_count);
798 if (writeback_stat > CONGESTION_ON_THRESH( 801 if (writeback_stat > CONGESTION_ON_THRESH(
799 client->mount_args->congestion_kb)) { 802 fsc->mount_options->congestion_kb)) {
800 set_bdi_congested(&client->backing_dev_info, 803 set_bdi_congested(&fsc->backing_dev_info,
801 BLK_RW_ASYNC); 804 BLK_RW_ASYNC);
802 } 805 }
803 806
@@ -846,7 +849,7 @@ get_more_pages:
846 op->payload_len = cpu_to_le32(len); 849 op->payload_len = cpu_to_le32(len);
847 req->r_request->hdr.data_len = cpu_to_le32(len); 850 req->r_request->hdr.data_len = cpu_to_le32(len);
848 851
849 ceph_osdc_start_request(&client->osdc, req, true); 852 ceph_osdc_start_request(&fsc->client->osdc, req, true);
850 req = NULL; 853 req = NULL;
851 854
852 /* continue? */ 855 /* continue? */
@@ -915,7 +918,7 @@ static int ceph_update_writeable_page(struct file *file,
915{ 918{
916 struct inode *inode = file->f_dentry->d_inode; 919 struct inode *inode = file->f_dentry->d_inode;
917 struct ceph_inode_info *ci = ceph_inode(inode); 920 struct ceph_inode_info *ci = ceph_inode(inode);
918 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 921 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
919 loff_t page_off = pos & PAGE_CACHE_MASK; 922 loff_t page_off = pos & PAGE_CACHE_MASK;
920 int pos_in_page = pos & ~PAGE_CACHE_MASK; 923 int pos_in_page = pos & ~PAGE_CACHE_MASK;
921 int end_in_page = pos_in_page + len; 924 int end_in_page = pos_in_page + len;
@@ -1053,8 +1056,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1053 struct page *page, void *fsdata) 1056 struct page *page, void *fsdata)
1054{ 1057{
1055 struct inode *inode = file->f_dentry->d_inode; 1058 struct inode *inode = file->f_dentry->d_inode;
1056 struct ceph_client *client = ceph_inode_to_client(inode); 1059 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1057 struct ceph_mds_client *mdsc = &client->mdsc; 1060 struct ceph_mds_client *mdsc = fsc->mdsc;
1058 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1061 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1059 int check_cap = 0; 1062 int check_cap = 0;
1060 1063
@@ -1123,7 +1126,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1123{ 1126{
1124 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1127 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1125 struct page *page = vmf->page; 1128 struct page *page = vmf->page;
1126 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 1129 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1127 loff_t off = page->index << PAGE_CACHE_SHIFT; 1130 loff_t off = page->index << PAGE_CACHE_SHIFT;
1128 loff_t size, len; 1131 loff_t size, len;
1129 int ret; 1132 int ret;
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
deleted file mode 100644
index eb2a666b0be7..000000000000
--- a/fs/ceph/armor.c
+++ /dev/null
@@ -1,103 +0,0 @@
1
2#include <linux/errno.h>
3
4int ceph_armor(char *dst, const char *src, const char *end);
5int ceph_unarmor(char *dst, const char *src, const char *end);
6
7/*
8 * base64 encode/decode.
9 */
10
11static const char *pem_key =
12 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
13
14static int encode_bits(int c)
15{
16 return pem_key[c];
17}
18
19static int decode_bits(char c)
20{
21 if (c >= 'A' && c <= 'Z')
22 return c - 'A';
23 if (c >= 'a' && c <= 'z')
24 return c - 'a' + 26;
25 if (c >= '0' && c <= '9')
26 return c - '0' + 52;
27 if (c == '+')
28 return 62;
29 if (c == '/')
30 return 63;
31 if (c == '=')
32 return 0; /* just non-negative, please */
33 return -EINVAL;
34}
35
36int ceph_armor(char *dst, const char *src, const char *end)
37{
38 int olen = 0;
39 int line = 0;
40
41 while (src < end) {
42 unsigned char a, b, c;
43
44 a = *src++;
45 *dst++ = encode_bits(a >> 2);
46 if (src < end) {
47 b = *src++;
48 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
49 if (src < end) {
50 c = *src++;
51 *dst++ = encode_bits(((b & 15) << 2) |
52 (c >> 6));
53 *dst++ = encode_bits(c & 63);
54 } else {
55 *dst++ = encode_bits((b & 15) << 2);
56 *dst++ = '=';
57 }
58 } else {
59 *dst++ = encode_bits(((a & 3) << 4));
60 *dst++ = '=';
61 *dst++ = '=';
62 }
63 olen += 4;
64 line += 4;
65 if (line == 64) {
66 line = 0;
67 *(dst++) = '\n';
68 olen++;
69 }
70 }
71 return olen;
72}
73
74int ceph_unarmor(char *dst, const char *src, const char *end)
75{
76 int olen = 0;
77
78 while (src < end) {
79 int a, b, c, d;
80
81 if (src < end && src[0] == '\n')
82 src++;
83 if (src + 4 > end)
84 return -EINVAL;
85 a = decode_bits(src[0]);
86 b = decode_bits(src[1]);
87 c = decode_bits(src[2]);
88 d = decode_bits(src[3]);
89 if (a < 0 || b < 0 || c < 0 || d < 0)
90 return -EINVAL;
91
92 *dst++ = (a << 2) | (b >> 4);
93 if (src[2] == '=')
94 return olen + 1;
95 *dst++ = ((b & 15) << 4) | (c >> 2);
96 if (src[3] == '=')
97 return olen + 2;
98 *dst++ = ((c & 3) << 6) | d;
99 olen += 3;
100 src += 4;
101 }
102 return olen;
103}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
deleted file mode 100644
index 6d2e30600627..000000000000
--- a/fs/ceph/auth.c
+++ /dev/null
@@ -1,259 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/err.h>
5#include <linux/slab.h>
6
7#include "types.h"
8#include "auth_none.h"
9#include "auth_x.h"
10#include "decode.h"
11#include "super.h"
12
13#include "messenger.h"
14
15/*
16 * get protocol handler
17 */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
93
94/*
95 * Initiate protocol negotiation with monitor. Include entity name
96 * and list supported protocols.
97 */
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
135
136static int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
154 return ret;
155 }
156 dout(" built request %d bytes\n", ret);
157 ceph_encode_32(&p, ret);
158 return p + ret - msg_buf;
159}
160
161/*
162 * Handle auth message from monitor.
163 */
164int ceph_handle_auth_reply(struct ceph_auth_client *ac,
165 void *buf, size_t len,
166 void *reply_buf, size_t reply_len)
167{
168 void *p = buf;
169 void *end = buf + len;
170 int protocol;
171 s32 result;
172 u64 global_id;
173 void *payload, *payload_end;
174 int payload_len;
175 char *result_msg;
176 int result_msg_len;
177 int ret = -EINVAL;
178
179 dout("handle_auth_reply %p %p\n", p, end);
180 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
181 protocol = ceph_decode_32(&p);
182 result = ceph_decode_32(&p);
183 global_id = ceph_decode_64(&p);
184 payload_len = ceph_decode_32(&p);
185 payload = p;
186 p += payload_len;
187 ceph_decode_need(&p, end, sizeof(u32), bad);
188 result_msg_len = ceph_decode_32(&p);
189 result_msg = p;
190 p += result_msg_len;
191 if (p != end)
192 goto bad;
193
194 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
195 result_msg, global_id, payload_len);
196
197 payload_end = payload + payload_len;
198
199 if (global_id && ac->global_id != global_id) {
200 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
201 ac->global_id = global_id;
202 }
203
204 if (ac->negotiating) {
205 /* server does not support our protocols? */
206 if (!protocol && result < 0) {
207 ret = result;
208 goto out;
209 }
210 /* set up (new) protocol handler? */
211 if (ac->protocol && ac->protocol != protocol) {
212 ac->ops->destroy(ac);
213 ac->protocol = 0;
214 ac->ops = NULL;
215 }
216 if (ac->protocol != protocol) {
217 ret = ceph_auth_init_protocol(ac, protocol);
218 if (ret) {
219 pr_err("error %d on auth protocol %d init\n",
220 ret, protocol);
221 goto out;
222 }
223 }
224
225 ac->negotiating = false;
226 }
227
228 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) {
232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret;
234 }
235 return 0;
236
237bad:
238 pr_err("failed to decode auth msg\n");
239out:
240 return ret;
241}
242
243int ceph_build_auth(struct ceph_auth_client *ac,
244 void *msg_buf, size_t msg_len)
245{
246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops);
249 if (ac->ops->should_authenticate(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0;
252}
253
254int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
255{
256 if (!ac->ops)
257 return 0;
258 return ac->ops->is_authenticated(ac);
259}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
deleted file mode 100644
index d38a2fb4a137..000000000000
--- a/fs/ceph/auth.h
+++ /dev/null
@@ -1,92 +0,0 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authenticate module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 const char *name;
19
20 /*
21 * true if we are authenticated and can connect to
22 * services.
23 */
24 int (*is_authenticated)(struct ceph_auth_client *ac);
25
26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
33 * build requests and process replies during monitor
34 * handshake. if handle_reply returns -EAGAIN, we build
35 * another request.
36 */
37 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
38 int (*handle_reply)(struct ceph_auth_client *ac, int result,
39 void *buf, void *end);
40
41 /*
42 * Create authorizer for connecting to a service, and verify
43 * the response to authenticate the service.
44 */
45 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
46 struct ceph_authorizer **a,
47 void **buf, size_t *len,
48 void **reply_buf, size_t *reply_len);
49 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
50 struct ceph_authorizer *a, size_t len);
51 void (*destroy_authorizer)(struct ceph_auth_client *ac,
52 struct ceph_authorizer *a);
53 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
54 int peer_type);
55
56 /* reset when we (re)connect to a monitor */
57 void (*reset)(struct ceph_auth_client *ac);
58
59 void (*destroy)(struct ceph_auth_client *ac);
60};
61
62struct ceph_auth_client {
63 u32 protocol; /* CEPH_AUTH_* */
64 void *private; /* for use by protocol implementation */
65 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
66
67 bool negotiating; /* true if negotiating protocol */
68 const char *name; /* entity name */
69 u64 global_id; /* our unique id in system */
70 const char *secret; /* our secret key */
71 unsigned want_keys; /* which services we want */
72};
73
74extern struct ceph_auth_client *ceph_auth_init(const char *name,
75 const char *secret);
76extern void ceph_auth_destroy(struct ceph_auth_client *ac);
77
78extern void ceph_auth_reset(struct ceph_auth_client *ac);
79
80extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
81 void *buf, size_t len);
82extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
83 void *buf, size_t len,
84 void *reply_buf, size_t reply_len);
85extern int ceph_entity_name_encode(const char *name, void **p, void *end);
86
87extern int ceph_build_auth(struct ceph_auth_client *ac,
88 void *msg_buf, size_t msg_len);
89
90extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
91
92#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
deleted file mode 100644
index ad1dc21286c7..000000000000
--- a/fs/ceph/auth_none.c
+++ /dev/null
@@ -1,131 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12
13static void reset(struct ceph_auth_client *ac)
14{
15 struct ceph_auth_none_info *xi = ac->private;
16
17 xi->starting = true;
18 xi->built_authorizer = false;
19}
20
21static void destroy(struct ceph_auth_client *ac)
22{
23 kfree(ac->private);
24 ac->private = NULL;
25}
26
27static int is_authenticated(struct ceph_auth_client *ac)
28{
29 struct ceph_auth_none_info *xi = ac->private;
30
31 return !xi->starting;
32}
33
34static int should_authenticate(struct ceph_auth_client *ac)
35{
36 struct ceph_auth_none_info *xi = ac->private;
37
38 return xi->starting;
39}
40
41/*
42 * the generic auth code decode the global_id, and we carry no actual
43 * authenticate state, so nothing happens here.
44 */
45static int handle_reply(struct ceph_auth_client *ac, int result,
46 void *buf, void *end)
47{
48 struct ceph_auth_none_info *xi = ac->private;
49
50 xi->starting = false;
51 return result;
52}
53
54/*
55 * build an 'authorizer' with our entity_name and global_id. we can
56 * reuse a single static copy since it is identical for all services
57 * we connect to.
58 */
59static int ceph_auth_none_create_authorizer(
60 struct ceph_auth_client *ac, int peer_type,
61 struct ceph_authorizer **a,
62 void **buf, size_t *len,
63 void **reply_buf, size_t *reply_len)
64{
65 struct ceph_auth_none_info *ai = ac->private;
66 struct ceph_none_authorizer *au = &ai->au;
67 void *p, *end;
68 int ret;
69
70 if (!ai->built_authorizer) {
71 p = au->buf;
72 end = p + sizeof(au->buf);
73 ceph_encode_8(&p, 1);
74 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
75 if (ret < 0)
76 goto bad;
77 ceph_decode_need(&p, end, sizeof(u64), bad2);
78 ceph_encode_64(&p, ac->global_id);
79 au->buf_len = p - (void *)au->buf;
80 ai->built_authorizer = true;
81 dout("built authorizer len %d\n", au->buf_len);
82 }
83
84 *a = (struct ceph_authorizer *)au;
85 *buf = au->buf;
86 *len = au->buf_len;
87 *reply_buf = au->reply_buf;
88 *reply_len = sizeof(au->reply_buf);
89 return 0;
90
91bad2:
92 ret = -ERANGE;
93bad:
94 return ret;
95}
96
97static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
98 struct ceph_authorizer *a)
99{
100 /* nothing to do */
101}
102
103static const struct ceph_auth_client_ops ceph_auth_none_ops = {
104 .name = "none",
105 .reset = reset,
106 .destroy = destroy,
107 .is_authenticated = is_authenticated,
108 .should_authenticate = should_authenticate,
109 .handle_reply = handle_reply,
110 .create_authorizer = ceph_auth_none_create_authorizer,
111 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
112};
113
114int ceph_auth_none_init(struct ceph_auth_client *ac)
115{
116 struct ceph_auth_none_info *xi;
117
118 dout("ceph_auth_none_init %p\n", ac);
119 xi = kzalloc(sizeof(*xi), GFP_NOFS);
120 if (!xi)
121 return -ENOMEM;
122
123 xi->starting = true;
124 xi->built_authorizer = false;
125
126 ac->protocol = CEPH_AUTH_NONE;
127 ac->private = xi;
128 ac->ops = &ceph_auth_none_ops;
129 return 0;
130}
131
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
deleted file mode 100644
index 8164df1a08be..000000000000
--- a/fs/ceph/auth_none.h
+++ /dev/null
@@ -1,30 +0,0 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include <linux/slab.h>
5
6#include "auth.h"
7
8/*
9 * null security mode.
10 *
11 * we use a single static authorizer that simply encodes our entity name
12 * and global id.
13 */
14
15struct ceph_none_authorizer {
16 char buf[128];
17 int buf_len;
18 char reply_buf[0];
19};
20
21struct ceph_auth_none_info {
22 bool starting;
23 bool built_authorizer;
24 struct ceph_none_authorizer au; /* we only need one; it's static */
25};
26
27extern int ceph_auth_none_init(struct ceph_auth_client *ac);
28
29#endif
30
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
deleted file mode 100644
index a2d002cbdec2..000000000000
--- a/fs/ceph/auth_x.c
+++ /dev/null
@@ -1,687 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_x.h"
10#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14
15#define TEMP_TICKET_BUF_LEN 256
16
17static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
18
19static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
20{
21 struct ceph_x_info *xi = ac->private;
22 int need;
23
24 ceph_x_validate_tickets(ac, &need);
25 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
26 ac->want_keys, need, xi->have_keys);
27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28}
29
30static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
31{
32 struct ceph_x_info *xi = ac->private;
33 int need;
34
35 ceph_x_validate_tickets(ac, &need);
36 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
37 ac->want_keys, need, xi->have_keys);
38 return need != 0;
39}
40
41static int ceph_x_encrypt_buflen(int ilen)
42{
43 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
44 sizeof(u32);
45}
46
47static int ceph_x_encrypt(struct ceph_crypto_key *secret,
48 void *ibuf, int ilen, void *obuf, size_t olen)
49{
50 struct ceph_x_encrypt_header head = {
51 .struct_v = 1,
52 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
53 };
54 size_t len = olen - sizeof(u32);
55 int ret;
56
57 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
58 &head, sizeof(head), ibuf, ilen);
59 if (ret)
60 return ret;
61 ceph_encode_32(&obuf, len);
62 return len + sizeof(u32);
63}
64
65static int ceph_x_decrypt(struct ceph_crypto_key *secret,
66 void **p, void *end, void *obuf, size_t olen)
67{
68 struct ceph_x_encrypt_header head;
69 size_t head_len = sizeof(head);
70 int len, ret;
71
72 len = ceph_decode_32(p);
73 if (*p + len > end)
74 return -EINVAL;
75
76 dout("ceph_x_decrypt len %d\n", len);
77 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
78 *p, len);
79 if (ret)
80 return ret;
81 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
82 return -EPERM;
83 *p += len;
84 return olen;
85}
86
87/*
88 * get existing (or insert new) ticket handler
89 */
90static struct ceph_x_ticket_handler *
91get_ticket_handler(struct ceph_auth_client *ac, int service)
92{
93 struct ceph_x_ticket_handler *th;
94 struct ceph_x_info *xi = ac->private;
95 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
96
97 while (*p) {
98 parent = *p;
99 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
100 if (service < th->service)
101 p = &(*p)->rb_left;
102 else if (service > th->service)
103 p = &(*p)->rb_right;
104 else
105 return th;
106 }
107
108 /* add it */
109 th = kzalloc(sizeof(*th), GFP_NOFS);
110 if (!th)
111 return ERR_PTR(-ENOMEM);
112 th->service = service;
113 rb_link_node(&th->node, parent, p);
114 rb_insert_color(&th->node, &xi->ticket_handlers);
115 return th;
116}
117
118static void remove_ticket_handler(struct ceph_auth_client *ac,
119 struct ceph_x_ticket_handler *th)
120{
121 struct ceph_x_info *xi = ac->private;
122
123 dout("remove_ticket_handler %p %d\n", th, th->service);
124 rb_erase(&th->node, &xi->ticket_handlers);
125 ceph_crypto_key_destroy(&th->session_key);
126 if (th->ticket_blob)
127 ceph_buffer_put(th->ticket_blob);
128 kfree(th);
129}
130
131static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
132 struct ceph_crypto_key *secret,
133 void *buf, void *end)
134{
135 struct ceph_x_info *xi = ac->private;
136 int num;
137 void *p = buf;
138 int ret;
139 char *dbuf;
140 char *ticket_buf;
141 u8 reply_struct_v;
142
143 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
144 if (!dbuf)
145 return -ENOMEM;
146
147 ret = -ENOMEM;
148 ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
149 if (!ticket_buf)
150 goto out_dbuf;
151
152 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
153 reply_struct_v = ceph_decode_8(&p);
154 if (reply_struct_v != 1)
155 goto bad;
156 num = ceph_decode_32(&p);
157 dout("%d tickets\n", num);
158 while (num--) {
159 int type;
160 u8 tkt_struct_v, blob_struct_v;
161 struct ceph_x_ticket_handler *th;
162 void *dp, *dend;
163 int dlen;
164 char is_enc;
165 struct timespec validity;
166 struct ceph_crypto_key old_key;
167 void *tp, *tpend;
168 struct ceph_timespec new_validity;
169 struct ceph_crypto_key new_session_key;
170 struct ceph_buffer *new_ticket_blob;
171 unsigned long new_expires, new_renew_after;
172 u64 new_secret_id;
173
174 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
175
176 type = ceph_decode_32(&p);
177 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
178
179 tkt_struct_v = ceph_decode_8(&p);
180 if (tkt_struct_v != 1)
181 goto bad;
182
183 th = get_ticket_handler(ac, type);
184 if (IS_ERR(th)) {
185 ret = PTR_ERR(th);
186 goto out;
187 }
188
189 /* blob for me */
190 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
191 TEMP_TICKET_BUF_LEN);
192 if (dlen <= 0) {
193 ret = dlen;
194 goto out;
195 }
196 dout(" decrypted %d bytes\n", dlen);
197 dend = dbuf + dlen;
198 dp = dbuf;
199
200 tkt_struct_v = ceph_decode_8(&dp);
201 if (tkt_struct_v != 1)
202 goto bad;
203
204 memcpy(&old_key, &th->session_key, sizeof(old_key));
205 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
206 if (ret)
207 goto out;
208
209 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
210 ceph_decode_timespec(&validity, &new_validity);
211 new_expires = get_seconds() + validity.tv_sec;
212 new_renew_after = new_expires - (validity.tv_sec / 4);
213 dout(" expires=%lu renew_after=%lu\n", new_expires,
214 new_renew_after);
215
216 /* ticket blob for service */
217 ceph_decode_8_safe(&p, end, is_enc, bad);
218 tp = ticket_buf;
219 if (is_enc) {
220 /* encrypted */
221 dout(" encrypted ticket\n");
222 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
223 TEMP_TICKET_BUF_LEN);
224 if (dlen < 0) {
225 ret = dlen;
226 goto out;
227 }
228 dlen = ceph_decode_32(&tp);
229 } else {
230 /* unencrypted */
231 ceph_decode_32_safe(&p, end, dlen, bad);
232 ceph_decode_need(&p, end, dlen, bad);
233 ceph_decode_copy(&p, ticket_buf, dlen);
234 }
235 tpend = tp + dlen;
236 dout(" ticket blob is %d bytes\n", dlen);
237 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
238 blob_struct_v = ceph_decode_8(&tp);
239 new_secret_id = ceph_decode_64(&tp);
240 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
241 if (ret)
242 goto out;
243
244 /* all is well, update our ticket */
245 ceph_crypto_key_destroy(&th->session_key);
246 if (th->ticket_blob)
247 ceph_buffer_put(th->ticket_blob);
248 th->session_key = new_session_key;
249 th->ticket_blob = new_ticket_blob;
250 th->validity = new_validity;
251 th->secret_id = new_secret_id;
252 th->expires = new_expires;
253 th->renew_after = new_renew_after;
254 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
255 type, ceph_entity_type_name(type), th->secret_id,
256 (int)th->ticket_blob->vec.iov_len);
257 xi->have_keys |= th->service;
258 }
259
260 ret = 0;
261out:
262 kfree(ticket_buf);
263out_dbuf:
264 kfree(dbuf);
265 return ret;
266
267bad:
268 ret = -EINVAL;
269 goto out;
270}
271
272static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
273 struct ceph_x_ticket_handler *th,
274 struct ceph_x_authorizer *au)
275{
276 int maxlen;
277 struct ceph_x_authorize_a *msg_a;
278 struct ceph_x_authorize_b msg_b;
279 void *p, *end;
280 int ret;
281 int ticket_blob_len =
282 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
283
284 dout("build_authorizer for %s %p\n",
285 ceph_entity_type_name(th->service), au);
286
287 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
288 ceph_x_encrypt_buflen(ticket_blob_len);
289 dout(" need len %d\n", maxlen);
290 if (au->buf && au->buf->alloc_len < maxlen) {
291 ceph_buffer_put(au->buf);
292 au->buf = NULL;
293 }
294 if (!au->buf) {
295 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
296 if (!au->buf)
297 return -ENOMEM;
298 }
299 au->service = th->service;
300
301 msg_a = au->buf->vec.iov_base;
302 msg_a->struct_v = 1;
303 msg_a->global_id = cpu_to_le64(ac->global_id);
304 msg_a->service_id = cpu_to_le32(th->service);
305 msg_a->ticket_blob.struct_v = 1;
306 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
307 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
308 if (ticket_blob_len) {
309 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
310 th->ticket_blob->vec.iov_len);
311 }
312 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
313 le64_to_cpu(msg_a->ticket_blob.secret_id));
314
315 p = msg_a + 1;
316 p += ticket_blob_len;
317 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
318
319 get_random_bytes(&au->nonce, sizeof(au->nonce));
320 msg_b.struct_v = 1;
321 msg_b.nonce = cpu_to_le64(au->nonce);
322 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
323 p, end - p);
324 if (ret < 0)
325 goto out_buf;
326 p += ret;
327 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
328 dout(" built authorizer nonce %llx len %d\n", au->nonce,
329 (int)au->buf->vec.iov_len);
330 BUG_ON(au->buf->vec.iov_len > maxlen);
331 return 0;
332
333out_buf:
334 ceph_buffer_put(au->buf);
335 au->buf = NULL;
336 return ret;
337}
338
339static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
340 void **p, void *end)
341{
342 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
343 ceph_encode_8(p, 1);
344 ceph_encode_64(p, th->secret_id);
345 if (th->ticket_blob) {
346 const char *buf = th->ticket_blob->vec.iov_base;
347 u32 len = th->ticket_blob->vec.iov_len;
348
349 ceph_encode_32_safe(p, end, len, bad);
350 ceph_encode_copy_safe(p, end, buf, len, bad);
351 } else {
352 ceph_encode_32_safe(p, end, 0, bad);
353 }
354
355 return 0;
356bad:
357 return -ERANGE;
358}
359
360static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
361{
362 int want = ac->want_keys;
363 struct ceph_x_info *xi = ac->private;
364 int service;
365
366 *pneed = ac->want_keys & ~(xi->have_keys);
367
368 for (service = 1; service <= want; service <<= 1) {
369 struct ceph_x_ticket_handler *th;
370
371 if (!(ac->want_keys & service))
372 continue;
373
374 if (*pneed & service)
375 continue;
376
377 th = get_ticket_handler(ac, service);
378
379 if (IS_ERR(th)) {
380 *pneed |= service;
381 continue;
382 }
383
384 if (get_seconds() >= th->renew_after)
385 *pneed |= service;
386 if (get_seconds() >= th->expires)
387 xi->have_keys &= ~service;
388 }
389}
390
391
392static int ceph_x_build_request(struct ceph_auth_client *ac,
393 void *buf, void *end)
394{
395 struct ceph_x_info *xi = ac->private;
396 int need;
397 struct ceph_x_request_header *head = buf;
398 int ret;
399 struct ceph_x_ticket_handler *th =
400 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
401
402 if (IS_ERR(th))
403 return PTR_ERR(th);
404
405 ceph_x_validate_tickets(ac, &need);
406
407 dout("build_request want %x have %x need %x\n",
408 ac->want_keys, xi->have_keys, need);
409
410 if (need & CEPH_ENTITY_TYPE_AUTH) {
411 struct ceph_x_authenticate *auth = (void *)(head + 1);
412 void *p = auth + 1;
413 struct ceph_x_challenge_blob tmp;
414 char tmp_enc[40];
415 u64 *u;
416
417 if (p > end)
418 return -ERANGE;
419
420 dout(" get_auth_session_key\n");
421 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
422
423 /* encrypt and hash */
424 get_random_bytes(&auth->client_challenge, sizeof(u64));
425 tmp.client_challenge = auth->client_challenge;
426 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
427 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
428 tmp_enc, sizeof(tmp_enc));
429 if (ret < 0)
430 return ret;
431
432 auth->struct_v = 1;
433 auth->key = 0;
434 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
435 auth->key ^= *(__le64 *)u;
436 dout(" server_challenge %llx client_challenge %llx key %llx\n",
437 xi->server_challenge, le64_to_cpu(auth->client_challenge),
438 le64_to_cpu(auth->key));
439
440 /* now encode the old ticket if exists */
441 ret = ceph_x_encode_ticket(th, &p, end);
442 if (ret < 0)
443 return ret;
444
445 return p - buf;
446 }
447
448 if (need) {
449 void *p = head + 1;
450 struct ceph_x_service_ticket_request *req;
451
452 if (p > end)
453 return -ERANGE;
454 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
455
456 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
457 if (ret)
458 return ret;
459 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
460 xi->auth_authorizer.buf->vec.iov_len);
461
462 req = p;
463 req->keys = cpu_to_le32(need);
464 p += sizeof(*req);
465 return p - buf;
466 }
467
468 return 0;
469}
470
471static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
472 void *buf, void *end)
473{
474 struct ceph_x_info *xi = ac->private;
475 struct ceph_x_reply_header *head = buf;
476 struct ceph_x_ticket_handler *th;
477 int len = end - buf;
478 int op;
479 int ret;
480
481 if (result)
482 return result; /* XXX hmm? */
483
484 if (xi->starting) {
485 /* it's a hello */
486 struct ceph_x_server_challenge *sc = buf;
487
488 if (len != sizeof(*sc))
489 return -EINVAL;
490 xi->server_challenge = le64_to_cpu(sc->server_challenge);
491 dout("handle_reply got server challenge %llx\n",
492 xi->server_challenge);
493 xi->starting = false;
494 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
495 return -EAGAIN;
496 }
497
498 op = le16_to_cpu(head->op);
499 result = le32_to_cpu(head->result);
500 dout("handle_reply op %d result %d\n", op, result);
501 switch (op) {
502 case CEPHX_GET_AUTH_SESSION_KEY:
503 /* verify auth key */
504 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
505 buf + sizeof(*head), end);
506 break;
507
508 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
509 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
510 if (IS_ERR(th))
511 return PTR_ERR(th);
512 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
513 buf + sizeof(*head), end);
514 break;
515
516 default:
517 return -EINVAL;
518 }
519 if (ret)
520 return ret;
521 if (ac->want_keys == xi->have_keys)
522 return 0;
523 return -EAGAIN;
524}
525
526static int ceph_x_create_authorizer(
527 struct ceph_auth_client *ac, int peer_type,
528 struct ceph_authorizer **a,
529 void **buf, size_t *len,
530 void **reply_buf, size_t *reply_len)
531{
532 struct ceph_x_authorizer *au;
533 struct ceph_x_ticket_handler *th;
534 int ret;
535
536 th = get_ticket_handler(ac, peer_type);
537 if (IS_ERR(th))
538 return PTR_ERR(th);
539
540 au = kzalloc(sizeof(*au), GFP_NOFS);
541 if (!au)
542 return -ENOMEM;
543
544 ret = ceph_x_build_authorizer(ac, th, au);
545 if (ret) {
546 kfree(au);
547 return ret;
548 }
549
550 *a = (struct ceph_authorizer *)au;
551 *buf = au->buf->vec.iov_base;
552 *len = au->buf->vec.iov_len;
553 *reply_buf = au->reply_buf;
554 *reply_len = sizeof(au->reply_buf);
555 return 0;
556}
557
558static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
559 struct ceph_authorizer *a, size_t len)
560{
561 struct ceph_x_authorizer *au = (void *)a;
562 struct ceph_x_ticket_handler *th;
563 int ret = 0;
564 struct ceph_x_authorize_reply reply;
565 void *p = au->reply_buf;
566 void *end = p + sizeof(au->reply_buf);
567
568 th = get_ticket_handler(ac, au->service);
569 if (IS_ERR(th))
570 return PTR_ERR(th);
571 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
572 if (ret < 0)
573 return ret;
574 if (ret != sizeof(reply))
575 return -EPERM;
576
577 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
578 ret = -EPERM;
579 else
580 ret = 0;
581 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
582 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
583 return ret;
584}
585
586static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
587 struct ceph_authorizer *a)
588{
589 struct ceph_x_authorizer *au = (void *)a;
590
591 ceph_buffer_put(au->buf);
592 kfree(au);
593}
594
595
596static void ceph_x_reset(struct ceph_auth_client *ac)
597{
598 struct ceph_x_info *xi = ac->private;
599
600 dout("reset\n");
601 xi->starting = true;
602 xi->server_challenge = 0;
603}
604
605static void ceph_x_destroy(struct ceph_auth_client *ac)
606{
607 struct ceph_x_info *xi = ac->private;
608 struct rb_node *p;
609
610 dout("ceph_x_destroy %p\n", ac);
611 ceph_crypto_key_destroy(&xi->secret);
612
613 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
614 struct ceph_x_ticket_handler *th =
615 rb_entry(p, struct ceph_x_ticket_handler, node);
616 remove_ticket_handler(ac, th);
617 }
618
619 if (xi->auth_authorizer.buf)
620 ceph_buffer_put(xi->auth_authorizer.buf);
621
622 kfree(ac->private);
623 ac->private = NULL;
624}
625
626static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
627 int peer_type)
628{
629 struct ceph_x_ticket_handler *th;
630
631 th = get_ticket_handler(ac, peer_type);
632 if (!IS_ERR(th))
633 remove_ticket_handler(ac, th);
634}
635
636
637static const struct ceph_auth_client_ops ceph_x_ops = {
638 .name = "x",
639 .is_authenticated = ceph_x_is_authenticated,
640 .should_authenticate = ceph_x_should_authenticate,
641 .build_request = ceph_x_build_request,
642 .handle_reply = ceph_x_handle_reply,
643 .create_authorizer = ceph_x_create_authorizer,
644 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
645 .destroy_authorizer = ceph_x_destroy_authorizer,
646 .invalidate_authorizer = ceph_x_invalidate_authorizer,
647 .reset = ceph_x_reset,
648 .destroy = ceph_x_destroy,
649};
650
651
652int ceph_x_init(struct ceph_auth_client *ac)
653{
654 struct ceph_x_info *xi;
655 int ret;
656
657 dout("ceph_x_init %p\n", ac);
658 ret = -ENOMEM;
659 xi = kzalloc(sizeof(*xi), GFP_NOFS);
660 if (!xi)
661 goto out;
662
663 ret = -EINVAL;
664 if (!ac->secret) {
665 pr_err("no secret set (for auth_x protocol)\n");
666 goto out_nomem;
667 }
668
669 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
670 if (ret)
671 goto out_nomem;
672
673 xi->starting = true;
674 xi->ticket_handlers = RB_ROOT;
675
676 ac->protocol = CEPH_AUTH_CEPHX;
677 ac->private = xi;
678 ac->ops = &ceph_x_ops;
679 return 0;
680
681out_nomem:
682 kfree(xi);
683out:
684 return ret;
685}
686
687
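
The authorize handshake that ceph_x_verify_authorizer_reply implements above rests on a simple freshness check: the client encrypts a random nonce with the ticket's session key, and a genuine server must answer with nonce+1. A standalone sketch of just that check, with the encryption and wire decoding elided and a hypothetical reply struct standing in for ceph_x_authorize_reply:

    #include <stdint.h>
    #include <stdio.h>

    /* hypothetical decrypted reply body, mirroring ceph_x_authorize_reply */
    struct authorize_reply { uint64_t nonce_plus_one; };

    /* 0 on success; nonzero plays the role of -EPERM in the kernel code */
    static int verify_reply(uint64_t sent_nonce, const struct authorize_reply *r)
    {
        return (r->nonce_plus_one == sent_nonce + 1) ? 0 : -1;
    }

    int main(void)
    {
        struct authorize_reply good = { 42 + 1 };
        struct authorize_reply stale = { 42 };

        printf("good=%d stale=%d\n",
               verify_reply(42, &good), verify_reply(42, &stale)); /* 0, -1 */
        return 0;
    }

Anything other than nonce+1 is treated as a permission failure, exactly as in the kernel function above.
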
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
deleted file mode 100644
index ff6f8180e681..000000000000
--- a/fs/ceph/auth_x.h
+++ /dev/null
@@ -1,49 +0,0 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
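
ticket_handlers above is an rbtree keyed by service id; get_ticket_handler, used throughout auth_x.c, walks it and (in the real code) allocates a new handler on a miss. A userspace sketch of the lookup, with a plain binary search tree standing in for the kernel rbtree and purely illustrative service ids:

    #include <stdio.h>

    struct th { unsigned service; struct th *left, *right; };

    static struct th *find_th(struct th *root, unsigned service)
    {
        while (root) {
            if (service < root->service)
                root = root->left;
            else if (service > root->service)
                root = root->right;
            else
                return root;
        }
        return NULL;    /* the real get_ticket_handler allocates here */
    }

    int main(void)
    {
        struct th auth = { .service = 32 };                 /* illustrative ids */
        struct th root = { .service = 2, .right = &auth };

        printf("%s\n", find_th(&root, 32) ? "found" : "missing");
        return 0;
    }
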
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
deleted file mode 100644
index 671d30576c4f..000000000000
--- a/fs/ceph/auth_x_protocol.h
+++ /dev/null
@@ -1,90 +0,0 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
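
Every struct in this header is packed and little-endian on the wire, so a portable decoder reads bytes explicitly rather than casting. A minimal standalone sketch that decodes a ceph_x_reply_header (op, then result) from a raw buffer:

    #include <stdint.h>
    #include <stdio.h>

    static uint16_t get_le16(const uint8_t *p)
    {
        return (uint16_t)(p[0] | (p[1] << 8));
    }

    static uint32_t get_le32(const uint8_t *p)
    {
        return p[0] | ((uint32_t)p[1] << 8) |
               ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    }

    int main(void)
    {
        /* CEPHX_GET_AUTH_SESSION_KEY (0x0100), result 0, as sent on the wire */
        const uint8_t buf[6] = { 0x00, 0x01, 0x00, 0x00, 0x00, 0x00 };

        printf("op 0x%04x result %u\n",
               (unsigned)get_le16(buf), (unsigned)get_le32(buf + 2));
        return 0;
    }
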
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
deleted file mode 100644
index cd39f17021de..000000000000
--- a/fs/ceph/buffer.c
+++ /dev/null
@@ -1,65 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5
6#include "buffer.h"
7#include "decode.h"
8
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{
11 struct ceph_buffer *b;
12
13 b = kmalloc(sizeof(*b), gfp);
14 if (!b)
15 return NULL;
16
17 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
18 if (b->vec.iov_base) {
19 b->is_vmalloc = false;
20 } else {
21 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
22 if (!b->vec.iov_base) {
23 kfree(b);
24 return NULL;
25 }
26 b->is_vmalloc = true;
27 }
28
29 kref_init(&b->kref);
30 b->alloc_len = len;
31 b->vec.iov_len = len;
32 dout("buffer_new %p\n", b);
33 return b;
34}
35
36void ceph_buffer_release(struct kref *kref)
37{
38 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
39
40 dout("buffer_release %p\n", b);
41 if (b->vec.iov_base) {
42 if (b->is_vmalloc)
43 vfree(b->vec.iov_base);
44 else
45 kfree(b->vec.iov_base);
46 }
47 kfree(b);
48}
49
50int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
51{
52 size_t len;
53
54 ceph_decode_need(p, end, sizeof(u32), bad);
55 len = ceph_decode_32(p);
56 dout("decode_buffer len %d\n", (int)len);
57 ceph_decode_need(p, end, len, bad);
58 *b = ceph_buffer_new(len, GFP_NOFS);
59 if (!*b)
60 return -ENOMEM;
61 ceph_decode_copy(p, (*b)->vec.iov_base, len);
62 return 0;
63bad:
64 return -EINVAL;
65}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
deleted file mode 100644
index 58d19014068f..000000000000
--- a/fs/ceph/buffer.h
+++ /dev/null
@@ -1,39 +0,0 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
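
The kref lifecycle here follows the usual kernel pattern: ceph_buffer_new returns the buffer holding one reference, ceph_buffer_get takes another, and the final ceph_buffer_put fires ceph_buffer_release. A userspace analogue with a plain counter in place of kref (and no vmalloc fallback), just to show the ownership rules:

    #include <stdlib.h>

    struct buf { int refs; size_t len; void *data; };

    static struct buf *buf_new(size_t len)
    {
        struct buf *b = malloc(sizeof(*b));

        if (!b)
            return NULL;
        b->data = malloc(len);
        if (!b->data) {
            free(b);
            return NULL;
        }
        b->refs = 1;            /* the caller owns the initial reference */
        b->len = len;
        return b;
    }

    static struct buf *buf_get(struct buf *b) { b->refs++; return b; }

    static void buf_put(struct buf *b)
    {
        if (--b->refs == 0) {   /* last put frees payload and header */
            free(b->data);
            free(b);
        }
    }

    int main(void)
    {
        struct buf *b = buf_new(16);

        buf_get(b);     /* e.g. a queued message takes its own reference */
        buf_put(b);     /* original owner done */
        buf_put(b);     /* message done: buffer released here */
        return 0;
    }
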
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 73c153092f72..98ab13e2b71d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/fs.h>
 #include <linux/kernel.h>
@@ -9,8 +9,9 @@
 #include <linux/writeback.h>
 
 #include "super.h"
-#include "decode.h"
-#include "messenger.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h>
 
 /*
  * Capability management
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
         spin_unlock(&mdsc->caps_list_lock);
 }
 
-void ceph_reservation_status(struct ceph_client *client,
+void ceph_reservation_status(struct ceph_fs_client *fsc,
                              int *total, int *avail, int *used, int *reserved,
                              int *min)
 {
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_mds_client *mdsc = fsc->mdsc;
 
         if (total)
                 *total = mdsc->caps_total_count;
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
                                struct ceph_inode_info *ci)
 {
-        struct ceph_mount_args *ma = mdsc->client->mount_args;
+        struct ceph_mount_options *ma = mdsc->fsc->mount_options;
 
         ci->i_hold_caps_min = round_jiffies(jiffies +
                                             ma->caps_wanted_delay_min * HZ);
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
                  unsigned seq, unsigned mseq, u64 realmino, int flags,
                  struct ceph_cap_reservation *caps_reservation)
 {
-        struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
         struct ceph_inode_info *ci = ceph_inode(inode);
         struct ceph_cap *new_cap = NULL;
         struct ceph_cap *cap;
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
         struct ceph_mds_session *session = cap->session;
         struct ceph_inode_info *ci = cap->ci;
         struct ceph_mds_client *mdsc =
-                &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+                ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
         int removed = 0;
 
         dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
         int mds;
         struct ceph_cap_snap *capsnap;
         u32 mseq;
-        struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
         struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
                                                     session->s_mutex */
         u64 next_follows = 0;  /* keep track of how far we've gotten through the
@@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
         struct ceph_mds_client *mdsc =
-                &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+                ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
         struct inode *inode = &ci->vfs_inode;
         int was = ci->i_dirty_caps;
         int dirty = 0;
@@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 static int __mark_caps_flushing(struct inode *inode,
                                 struct ceph_mds_session *session)
 {
-        struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
         struct ceph_inode_info *ci = ceph_inode(inode);
         int flushing;
 
@@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
 /*
  * try to invalidate mapping pages without blocking.
  */
-static int mapping_is_empty(struct address_space *mapping)
-{
-        struct page *page = find_get_page(mapping, 0);
-
-        if (!page)
-                return 1;
-
-        put_page(page);
-        return 0;
-}
-
 static int try_nonblocking_invalidate(struct inode *inode)
 {
         struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1436,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
         invalidate_mapping_pages(&inode->i_data, 0, -1);
         spin_lock(&inode->i_lock);
 
-        if (mapping_is_empty(&inode->i_data) &&
+        if (inode->i_data.nrpages == 0 &&
             invalidating_gen == ci->i_rdcache_gen) {
                 /* success. */
                 dout("try_nonblocking_invalidate %p success\n", inode);
@@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
                      struct ceph_mds_session *session)
 {
-        struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
-        struct ceph_mds_client *mdsc = &client->mdsc;
+        struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+        struct ceph_mds_client *mdsc = fsc->mdsc;
         struct inode *inode = &ci->vfs_inode;
         struct ceph_cap *cap;
         int file_wanted, used;
@@ -1533,7 +1523,7 @@ retry_locked:
          */
         if ((!is_delayed || mdsc->stopping) &&
             ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
-            ci->i_rdcache_gen &&                     /* may have cached pages */
+            inode->i_data.nrpages &&                 /* have cached pages */
             (file_wanted == 0 ||                     /* no open files */
              (revoking & (CEPH_CAP_FILE_CACHE|
                           CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
@@ -1706,7 +1696,7 @@ ack:
 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
                           unsigned *flush_tid)
 {
-        struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
         struct ceph_inode_info *ci = ceph_inode(inode);
         int unlock_session = session ? 0 : 1;
         int flushing = 0;
@@ -1872,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
                            caps_are_flushed(inode, flush_tid));
         } else {
                 struct ceph_mds_client *mdsc =
-                        &ceph_sb_to_client(inode->i_sb)->mdsc;
+                        ceph_sb_to_client(inode->i_sb)->mdsc;
 
                 spin_lock(&inode->i_lock);
                 if (__ceph_caps_dirty(ci))
@@ -2283,7 +2273,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 {
         struct ceph_inode_info *ci = ceph_inode(inode);
         int mds = session->s_mds;
-        int seq = le32_to_cpu(grant->seq);
+        unsigned seq = le32_to_cpu(grant->seq);
+        unsigned issue_seq = le32_to_cpu(grant->issue_seq);
         int newcaps = le32_to_cpu(grant->caps);
         int issued, implemented, used, wanted, dirty;
         u64 size = le64_to_cpu(grant->size);
@@ -2295,8 +2286,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
         int revoked_rdcache = 0;
         int queue_invalidate = 0;
 
-        dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
-             inode, cap, mds, seq, ceph_cap_string(newcaps));
+        dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
+             inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
         dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
              inode->i_size);
 
@@ -2392,6 +2383,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
         }
 
         cap->seq = seq;
+        cap->issue_seq = issue_seq;
 
         /* file layout may have changed */
         ci->i_layout = grant->layout;
@@ -2463,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
         __releases(inode->i_lock)
 {
         struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
         unsigned seq = le32_to_cpu(m->seq);
         int dirty = le32_to_cpu(m->dirty);
         int cleaned = 0;
@@ -2711,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                       struct ceph_msg *msg)
 {
         struct ceph_mds_client *mdsc = session->s_mdsc;
-        struct super_block *sb = mdsc->client->sb;
+        struct super_block *sb = mdsc->fsc->sb;
         struct inode *inode;
         struct ceph_cap *cap;
         struct ceph_mds_caps *h;
@@ -2774,15 +2766,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                 if (op == CEPH_CAP_OP_IMPORT)
                         __queue_cap_release(session, vino.ino, cap_id,
                                             mseq, seq);
-
-                /*
-                 * send any full release message to try to move things
-                 * along for the mds (who clearly thinks we still have this
-                 * cap).
-                 */
-                ceph_add_cap_releases(mdsc, session);
-                ceph_send_cap_releases(mdsc, session);
-                goto done;
+                goto flush_cap_releases;
         }
 
         /* these will work even if we don't have a cap yet */
@@ -2810,7 +2794,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                 dout(" no cap on %p ino %llx.%llx from mds%d\n",
                      inode, ceph_ino(inode), ceph_snap(inode), mds);
                 spin_unlock(&inode->i_lock);
-                goto done;
+                goto flush_cap_releases;
         }
 
         /* note that each of these drops i_lock for us */
@@ -2834,6 +2818,17 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                        ceph_cap_op_name(op));
         }
 
+        goto done;
+
+flush_cap_releases:
+        /*
+         * send any full release message to try to move things
+         * along for the mds (who clearly thinks we still have this
+         * cap).
+         */
+        ceph_add_cap_releases(mdsc, session);
+        ceph_send_cap_releases(mdsc, session);
+
 done:
         mutex_unlock(&session->s_mutex);
 done_unlocked:
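
Most of the churn in caps.c is mechanical and follows from the libceph split: ceph_client used to embed its MDS client state, so callers took &client->mdsc, while the new ceph_fs_client holds it by pointer, so callers read fsc->mdsc. A compilable sketch of the difference, with hypothetical struct names:

    #include <stdlib.h>

    struct mdsc_state { int stopping; };   /* stand-in for ceph_mds_client */

    /* before the split: mds client state embedded in the client */
    struct old_client { struct mdsc_state mdsc; };

    /* after the split: the fs client points at separately allocated state */
    struct new_fs_client { struct mdsc_state *mdsc; };

    int main(void)
    {
        struct old_client client = { { 0 } };
        struct mdsc_state *a = &client.mdsc;    /* old style: take address */

        struct new_fs_client fsc = { malloc(sizeof(struct mdsc_state)) };
        struct mdsc_state *b = fsc.mdsc;        /* new style: plain read   */

        (void)a; (void)b;
        free(fsc.mdsc);
        return 0;
    }
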
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
deleted file mode 100644
index 1818c2305610..000000000000
--- a/fs/ceph/ceph_debug.h
+++ /dev/null
@@ -1,37 +0,0 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
21/* faux printk call just to see any compiler warnings. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
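
The CONFIG_CEPH_FS_PRETTYDEBUG branch simply prepends a fixed-width file:line tag to each message (ceph_file_part additionally trims the path). A userspace analogue that can be compiled and run as-is:

    #include <stdio.h>

    /* stand-in for the PRETTYDEBUG dout(): fixed-width file:line prefix */
    #define dout(fmt, ...) \
        fprintf(stderr, " %12.12s:%-4d : " fmt, __FILE__, __LINE__, ##__VA_ARGS__)

    int main(void)
    {
        dout("handle_reply op %d result %d\n", 0x0100, 0);
        return 0;
    }
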
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index ab6cf35c4091..bdce8b1fbd06 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
 /*
  * Ceph 'frag' type
  */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
 
 int ceph_frag_compare(__u32 a, __u32 b)
 {
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
deleted file mode 100644
index 5babb8e95352..000000000000
--- a/fs/ceph/ceph_frag.h
+++ /dev/null
@@ -1,109 +0,0 @@
1#ifndef FS_CEPH_FRAG_H
2#define FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
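
A worked example makes the encoding concrete: the root frag is ceph_frag_make(0, 0) == 0, and one split yields a left child 0x01000000 (bits=1, value=0) and a right child 0x01800000 (bits=1, value=0x800000); any 24-bit value with its top bit set lands in the right child. A standalone program reproducing the helpers above:

    #include <stdio.h>

    typedef unsigned int u32;

    static u32 frag_make(u32 b, u32 v)
    { return (b << 24) | (v & (0xffffffu << (24 - b)) & 0xffffffu); }
    static u32 frag_bits(u32 f)  { return f >> 24; }
    static u32 frag_value(u32 f) { return f & 0xffffffu; }
    static u32 frag_mask(u32 f)
    { return (0xffffffu << (24 - frag_bits(f))) & 0xffffffu; }
    static int frag_contains(u32 f, u32 v)
    { return (v & frag_mask(f)) == frag_value(f); }

    int main(void)
    {
        u32 right = frag_make(1, 0x800000);            /* 0x01800000 */

        printf("right=0x%08x contains 0xabcdef? %d\n",
               right, frag_contains(right, 0xabcdef)); /* 1: top bit set */
        return 0;
    }
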
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
deleted file mode 100644
index 3ac6cc7c1156..000000000000
--- a/fs/ceph/ceph_fs.c
+++ /dev/null
@@ -1,72 +0,0 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
28
29
30int ceph_flags_to_mode(int flags)
31{
32 int mode;
33
34#ifdef O_DIRECTORY /* fixme */
35 if ((flags & O_DIRECTORY) == O_DIRECTORY)
36 return CEPH_FILE_MODE_PIN;
37#endif
38 if ((flags & O_APPEND) == O_APPEND)
39 flags |= O_WRONLY;
40
41 if ((flags & O_ACCMODE) == O_RDWR)
42 mode = CEPH_FILE_MODE_RDWR;
43 else if ((flags & O_ACCMODE) == O_WRONLY)
44 mode = CEPH_FILE_MODE_WR;
45 else
46 mode = CEPH_FILE_MODE_RD;
47
48#ifdef O_LAZY
49 if (flags & O_LAZY)
50 mode |= CEPH_FILE_MODE_LAZY;
51#endif
52
53 return mode;
54}
55
56int ceph_caps_for_mode(int mode)
57{
58 int caps = CEPH_CAP_PIN;
59
60 if (mode & CEPH_FILE_MODE_RD)
61 caps |= CEPH_CAP_FILE_SHARED |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
63 if (mode & CEPH_FILE_MODE_WR)
64 caps |= CEPH_CAP_FILE_EXCL |
65 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
66 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
67 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
68 if (mode & CEPH_FILE_MODE_LAZY)
69 caps |= CEPH_CAP_FILE_LAZYIO;
70
71 return caps;
72}
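
Tracing one case through: O_RDWR maps to CEPH_FILE_MODE_RDWR (3), and ceph_caps_for_mode(3) ORs together the read- and write-side cap bits. A standalone check of that arithmetic, with the generic cap bits and per-lock shifts copied from the ceph_fs.h hunk that follows:

    #include <stdio.h>

    #define CEPH_CAP_PIN     1
    #define CEPH_CAP_GSHARED 1
    #define CEPH_CAP_GEXCL   2
    #define CEPH_CAP_GCACHE  4
    #define CEPH_CAP_GRD     8
    #define CEPH_CAP_GWR     16
    #define CEPH_CAP_GBUFFER 32
    #define CEPH_CAP_SAUTH   2
    #define CEPH_CAP_SXATTR  6
    #define CEPH_CAP_SFILE   8

    /* same logic as ceph_caps_for_mode(); mode: 1=RD, 2=WR, 3=RDWR */
    static int caps_for_mode(int mode)
    {
        int caps = CEPH_CAP_PIN;

        if (mode & 1)
            caps |= (CEPH_CAP_GSHARED | CEPH_CAP_GRD | CEPH_CAP_GCACHE)
                        << CEPH_CAP_SFILE;
        if (mode & 2)
            caps |= ((CEPH_CAP_GEXCL | CEPH_CAP_GWR | CEPH_CAP_GBUFFER)
                        << CEPH_CAP_SFILE) |
                    ((CEPH_CAP_GSHARED | CEPH_CAP_GEXCL) << CEPH_CAP_SAUTH) |
                    ((CEPH_CAP_GSHARED | CEPH_CAP_GEXCL) << CEPH_CAP_SXATTR);
        return caps;
    }

    int main(void)
    {
        printf("caps_for_mode(RDWR) = 0x%x\n", caps_for_mode(3)); /* 0x3fcd */
        return 0;
    }
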
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
deleted file mode 100644
index d5619ac86711..000000000000
--- a/fs/ceph/ceph_fs.h
+++ /dev/null
@@ -1,728 +0,0 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef CEPH_FS_H
13#define CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * subprotocol versions. when specific message types or high-level
20 * protocols change, bump the affected components. we rev
21 * internal cluster protocols separately from the public,
22 * client-facing protocol.
23 */
24#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
25#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
26#define CEPH_MON_PROTOCOL 5 /* cluster internal */
27#define CEPH_OSDC_PROTOCOL 24 /* server/client */
28#define CEPH_MDSC_PROTOCOL 32 /* server/client */
29#define CEPH_MONC_PROTOCOL 15 /* server/client */
30
31
32#define CEPH_INO_ROOT 1
33#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
34
35/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
36#define CEPH_MAX_MON 31
37
38
39/*
40 * feature bits
41 */
42#define CEPH_FEATURE_UID (1<<0)
43#define CEPH_FEATURE_NOSRCADDR (1<<1)
44#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
45#define CEPH_FEATURE_FLOCK (1<<3)
46
47
48/*
49 * ceph_file_layout - describe data layout for a file/inode
50 */
51struct ceph_file_layout {
52 /* file -> object mapping */
53 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
54 of page size. */
55 __le32 fl_stripe_count; /* over this many objects */
56 __le32 fl_object_size; /* until objects are this big, then move to
57 new objects */
58 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
59
60 /* pg -> disk layout */
61 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
62
63 /* object -> pg layout */
64 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
65 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
66} __attribute__ ((packed));
67
68#define CEPH_MIN_STRIPE_UNIT 65536
69
70int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
71
72
73/* crypto algorithms */
74#define CEPH_CRYPTO_NONE 0x0
75#define CEPH_CRYPTO_AES 0x1
76
77#define CEPH_AES_IV "cephsageyudagreg"
78
79/* security/authentication protocols */
80#define CEPH_AUTH_UNKNOWN 0x0
81#define CEPH_AUTH_NONE 0x1
82#define CEPH_AUTH_CEPHX 0x2
83
84#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
85
86
87/*********************************************
88 * message layer
89 */
90
91/*
92 * message types
93 */
94
95/* misc */
96#define CEPH_MSG_SHUTDOWN 1
97#define CEPH_MSG_PING 2
98
99/* client <-> monitor */
100#define CEPH_MSG_MON_MAP 4
101#define CEPH_MSG_MON_GET_MAP 5
102#define CEPH_MSG_STATFS 13
103#define CEPH_MSG_STATFS_REPLY 14
104#define CEPH_MSG_MON_SUBSCRIBE 15
105#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
106#define CEPH_MSG_AUTH 17
107#define CEPH_MSG_AUTH_REPLY 18
108
109/* client <-> mds */
110#define CEPH_MSG_MDS_MAP 21
111
112#define CEPH_MSG_CLIENT_SESSION 22
113#define CEPH_MSG_CLIENT_RECONNECT 23
114
115#define CEPH_MSG_CLIENT_REQUEST 24
116#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
117#define CEPH_MSG_CLIENT_REPLY 26
118#define CEPH_MSG_CLIENT_CAPS 0x310
119#define CEPH_MSG_CLIENT_LEASE 0x311
120#define CEPH_MSG_CLIENT_SNAP 0x312
121#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
122
123/* pool ops */
124#define CEPH_MSG_POOLOP_REPLY 48
125#define CEPH_MSG_POOLOP 49
126
127
128/* osd */
129#define CEPH_MSG_OSD_MAP 41
130#define CEPH_MSG_OSD_OP 42
131#define CEPH_MSG_OSD_OPREPLY 43
132
133/* pool operations */
134enum {
135 POOL_OP_CREATE = 0x01,
136 POOL_OP_DELETE = 0x02,
137 POOL_OP_AUID_CHANGE = 0x03,
138 POOL_OP_CREATE_SNAP = 0x11,
139 POOL_OP_DELETE_SNAP = 0x12,
140 POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
141 POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
142};
143
144struct ceph_mon_request_header {
145 __le64 have_version;
146 __le16 session_mon;
147 __le64 session_mon_tid;
148} __attribute__ ((packed));
149
150struct ceph_mon_statfs {
151 struct ceph_mon_request_header monhdr;
152 struct ceph_fsid fsid;
153} __attribute__ ((packed));
154
155struct ceph_statfs {
156 __le64 kb, kb_used, kb_avail;
157 __le64 num_objects;
158} __attribute__ ((packed));
159
160struct ceph_mon_statfs_reply {
161 struct ceph_fsid fsid;
162 __le64 version;
163 struct ceph_statfs st;
164} __attribute__ ((packed));
165
166const char *ceph_pool_op_name(int op);
167
168struct ceph_mon_poolop {
169 struct ceph_mon_request_header monhdr;
170 struct ceph_fsid fsid;
171 __le32 pool;
172 __le32 op;
173 __le64 auid;
174 __le64 snapid;
175 __le32 name_len;
176} __attribute__ ((packed));
177
178struct ceph_mon_poolop_reply {
179 struct ceph_mon_request_header monhdr;
180 struct ceph_fsid fsid;
181 __le32 reply_code;
182 __le32 epoch;
183 char has_data;
184 char data[0];
185} __attribute__ ((packed));
186
187struct ceph_mon_unmanaged_snap {
188 __le64 snapid;
189} __attribute__ ((packed));
190
191struct ceph_osd_getmap {
192 struct ceph_mon_request_header monhdr;
193 struct ceph_fsid fsid;
194 __le32 start;
195} __attribute__ ((packed));
196
197struct ceph_mds_getmap {
198 struct ceph_mon_request_header monhdr;
199 struct ceph_fsid fsid;
200} __attribute__ ((packed));
201
202struct ceph_client_mount {
203 struct ceph_mon_request_header monhdr;
204} __attribute__ ((packed));
205
206struct ceph_mon_subscribe_item {
207 __le64 have_version; __le64 have;
208 __u8 onetime;
209} __attribute__ ((packed));
210
211struct ceph_mon_subscribe_ack {
212 __le32 duration; /* seconds */
213 struct ceph_fsid fsid;
214} __attribute__ ((packed));
215
216/*
217 * mds states
218 * > 0 -> in
219 * <= 0 -> out
220 */
221#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
222#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
223 empty log. */
224#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
225#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
226#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
227#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
228#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
229
230#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
231#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
232 operations (import, rename, etc.) */
233#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
234#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
235#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
236#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
237#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
238
239extern const char *ceph_mds_state_name(int s);
240
241
242/*
243 * metadata lock types.
244 * - these are bitmasks.. we can compose them
245 * - they also define the lock ordering by the MDS
246 * - a few of these are internal to the mds
247 */
248#define CEPH_LOCK_DVERSION 1
249#define CEPH_LOCK_DN 2
250#define CEPH_LOCK_ISNAP 16
251#define CEPH_LOCK_IVERSION 32 /* mds internal */
252#define CEPH_LOCK_IFILE 64
253#define CEPH_LOCK_IAUTH 128
254#define CEPH_LOCK_ILINK 256
255#define CEPH_LOCK_IDFT 512 /* dir frag tree */
256#define CEPH_LOCK_INEST 1024 /* mds internal */
257#define CEPH_LOCK_IXATTR 2048
258#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
259#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
260
261/* client_session ops */
262enum {
263 CEPH_SESSION_REQUEST_OPEN,
264 CEPH_SESSION_OPEN,
265 CEPH_SESSION_REQUEST_CLOSE,
266 CEPH_SESSION_CLOSE,
267 CEPH_SESSION_REQUEST_RENEWCAPS,
268 CEPH_SESSION_RENEWCAPS,
269 CEPH_SESSION_STALE,
270 CEPH_SESSION_RECALL_STATE,
271};
272
273extern const char *ceph_session_op_name(int op);
274
275struct ceph_mds_session_head {
276 __le32 op;
277 __le64 seq;
278 struct ceph_timespec stamp;
279 __le32 max_caps, max_leases;
280} __attribute__ ((packed));
281
282/* client_request */
283/*
284 * metadata ops.
285 * & 0x001000 -> write op
286 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
287 & & 0x100000 -> use weird ino/path trace
288 */
289#define CEPH_MDS_OP_WRITE 0x001000
290enum {
291 CEPH_MDS_OP_LOOKUP = 0x00100,
292 CEPH_MDS_OP_GETATTR = 0x00101,
293 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
294 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
295
296 CEPH_MDS_OP_SETXATTR = 0x01105,
297 CEPH_MDS_OP_RMXATTR = 0x01106,
298 CEPH_MDS_OP_SETLAYOUT = 0x01107,
299 CEPH_MDS_OP_SETATTR = 0x01108,
300 CEPH_MDS_OP_SETFILELOCK= 0x01109,
301 CEPH_MDS_OP_GETFILELOCK= 0x00110,
302
303 CEPH_MDS_OP_MKNOD = 0x01201,
304 CEPH_MDS_OP_LINK = 0x01202,
305 CEPH_MDS_OP_UNLINK = 0x01203,
306 CEPH_MDS_OP_RENAME = 0x01204,
307 CEPH_MDS_OP_MKDIR = 0x01220,
308 CEPH_MDS_OP_RMDIR = 0x01221,
309 CEPH_MDS_OP_SYMLINK = 0x01222,
310
311 CEPH_MDS_OP_CREATE = 0x01301,
312 CEPH_MDS_OP_OPEN = 0x00302,
313 CEPH_MDS_OP_READDIR = 0x00305,
314
315 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
316 CEPH_MDS_OP_MKSNAP = 0x01400,
317 CEPH_MDS_OP_RMSNAP = 0x01401,
318 CEPH_MDS_OP_LSSNAP = 0x00402,
319};
320
321extern const char *ceph_mds_op_name(int op);
322
323
324#define CEPH_SETATTR_MODE 1
325#define CEPH_SETATTR_UID 2
326#define CEPH_SETATTR_GID 4
327#define CEPH_SETATTR_MTIME 8
328#define CEPH_SETATTR_ATIME 16
329#define CEPH_SETATTR_SIZE 32
330#define CEPH_SETATTR_CTIME 64
331
332union ceph_mds_request_args {
333 struct {
334 __le32 mask; /* CEPH_CAP_* */
335 } __attribute__ ((packed)) getattr;
336 struct {
337 __le32 mode;
338 __le32 uid;
339 __le32 gid;
340 struct ceph_timespec mtime;
341 struct ceph_timespec atime;
342 __le64 size, old_size; /* old_size needed by truncate */
343 __le32 mask; /* CEPH_SETATTR_* */
344 } __attribute__ ((packed)) setattr;
345 struct {
346 __le32 frag; /* which dir fragment */
347 __le32 max_entries; /* how many dentries to grab */
348 __le32 max_bytes;
349 } __attribute__ ((packed)) readdir;
350 struct {
351 __le32 mode;
352 __le32 rdev;
353 } __attribute__ ((packed)) mknod;
354 struct {
355 __le32 mode;
356 } __attribute__ ((packed)) mkdir;
357 struct {
358 __le32 flags;
359 __le32 mode;
360 __le32 stripe_unit; /* layout for newly created file */
361 __le32 stripe_count; /* ... */
362 __le32 object_size;
363 __le32 file_replication;
364 __le32 preferred;
365 } __attribute__ ((packed)) open;
366 struct {
367 __le32 flags;
368 } __attribute__ ((packed)) setxattr;
369 struct {
370 struct ceph_file_layout layout;
371 } __attribute__ ((packed)) setlayout;
372 struct {
373 __u8 rule; /* currently fcntl or flock */
374 __u8 type; /* shared, exclusive, remove */
375 __le64 pid; /* process id requesting the lock */
376 __le64 pid_namespace;
377 __le64 start; /* initial location to lock */
378 __le64 length; /* num bytes to lock from start */
379 __u8 wait; /* will caller wait for lock to become available? */
380 } __attribute__ ((packed)) filelock_change;
381} __attribute__ ((packed));
382
383#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
384#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
385
386struct ceph_mds_request_head {
387 __le64 oldest_client_tid;
388 __le32 mdsmap_epoch; /* on client */
389 __le32 flags; /* CEPH_MDS_FLAG_* */
390 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
391 __le16 num_releases; /* # include cap/lease release records */
392 __le32 op; /* mds op code */
393 __le32 caller_uid, caller_gid;
394 __le64 ino; /* use this ino for openc, mkdir, mknod,
395 etc. (if replaying) */
396 union ceph_mds_request_args args;
397} __attribute__ ((packed));
398
399/* cap/lease release record */
400struct ceph_mds_request_release {
401 __le64 ino, cap_id; /* ino and unique cap id */
402 __le32 caps, wanted; /* new issued, wanted */
403 __le32 seq, issue_seq, mseq;
404 __le32 dname_seq; /* if releasing a dentry lease, a */
405 __le32 dname_len; /* string follows. */
406} __attribute__ ((packed));
407
408/* client reply */
409struct ceph_mds_reply_head {
410 __le32 op;
411 __le32 result;
412 __le32 mdsmap_epoch;
413 __u8 safe; /* true if committed to disk */
414 __u8 is_dentry, is_target; /* true if dentry, target inode records
415 are included with reply */
416} __attribute__ ((packed));
417
418/* one for each node split */
419struct ceph_frag_tree_split {
420 __le32 frag; /* this frag splits... */
421 __le32 by; /* ...by this many bits */
422} __attribute__ ((packed));
423
424struct ceph_frag_tree_head {
425 __le32 nsplits; /* num ceph_frag_tree_split records */
426 struct ceph_frag_tree_split splits[];
427} __attribute__ ((packed));
428
429/* capability issue, for bundling with mds reply */
430struct ceph_mds_reply_cap {
431 __le32 caps, wanted; /* caps issued, wanted */
432 __le64 cap_id;
433 __le32 seq, mseq;
434 __le64 realm; /* snap realm */
435 __u8 flags; /* CEPH_CAP_FLAG_* */
436} __attribute__ ((packed));
437
438#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
439
440/* inode record, for bundling with mds reply */
441struct ceph_mds_reply_inode {
442 __le64 ino;
443 __le64 snapid;
444 __le32 rdev;
445 __le64 version; /* inode version */
446 __le64 xattr_version; /* version for xattr blob */
447 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
448 struct ceph_file_layout layout;
449 struct ceph_timespec ctime, mtime, atime;
450 __le32 time_warp_seq;
451 __le64 size, max_size, truncate_size;
452 __le32 truncate_seq;
453 __le32 mode, uid, gid;
454 __le32 nlink;
455 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
456 struct ceph_timespec rctime;
457 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
458} __attribute__ ((packed));
459/* followed by frag array, then symlink string, then xattr blob */
460
461/* reply_lease follows dname, and reply_inode */
462struct ceph_mds_reply_lease {
463 __le16 mask; /* lease type(s) */
464 __le32 duration_ms; /* lease duration */
465 __le32 seq;
466} __attribute__ ((packed));
467
468struct ceph_mds_reply_dirfrag {
469 __le32 frag; /* fragment */
470 __le32 auth; /* auth mds, if this is a delegation point */
471 __le32 ndist; /* number of mds' this is replicated on */
472 __le32 dist[];
473} __attribute__ ((packed));
474
475#define CEPH_LOCK_FCNTL 1
476#define CEPH_LOCK_FLOCK 2
477
478#define CEPH_LOCK_SHARED 1
479#define CEPH_LOCK_EXCL 2
480#define CEPH_LOCK_UNLOCK 4
481
482struct ceph_filelock {
483 __le64 start;/* file offset to start lock at */
484 __le64 length; /* num bytes to lock; 0 for all following start */
485 __le64 client; /* which client holds the lock */
486 __le64 pid; /* process id holding the lock on the client */
487 __le64 pid_namespace;
488 __u8 type; /* shared lock, exclusive lock, or unlock */
489} __attribute__ ((packed));
490
491
492/* file access modes */
493#define CEPH_FILE_MODE_PIN 0
494#define CEPH_FILE_MODE_RD 1
495#define CEPH_FILE_MODE_WR 2
496#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
497#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
498#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
499
500int ceph_flags_to_mode(int flags);
501
502
503/* capability bits */
504#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
505
506/* generic cap bits */
507#define CEPH_CAP_GSHARED 1 /* client can read */
508#define CEPH_CAP_GEXCL 2 /* client can read and update */
509#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
510#define CEPH_CAP_GRD 8 /* (file) client can read */
511#define CEPH_CAP_GWR 16 /* (file) client can write */
512#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
513#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
514#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
515
516/* per-lock shift */
517#define CEPH_CAP_SAUTH 2
518#define CEPH_CAP_SLINK 4
519#define CEPH_CAP_SXATTR 6
520#define CEPH_CAP_SFILE 8
521#define CEPH_CAP_SFLOCK 20
522
523#define CEPH_CAP_BITS 22
524
525/* composed values */
526#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
527#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
528#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
529#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
530#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
531#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
532#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
533#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
534#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
535#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
536#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
537#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
538#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
539#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
540#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
541#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK)
542#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK)
543
544
545/* cap masks (for getattr) */
546#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
547#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
548#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
549#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
550#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
551#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
552#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
553#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
554#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
555#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
556#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
557#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
558#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
559 CEPH_CAP_AUTH_SHARED | \
560 CEPH_CAP_LINK_SHARED | \
561 CEPH_CAP_FILE_SHARED | \
562 CEPH_CAP_XATTR_SHARED)
563
564#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
565 CEPH_CAP_LINK_SHARED | \
566 CEPH_CAP_XATTR_SHARED | \
567 CEPH_CAP_FILE_SHARED)
568#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
569 CEPH_CAP_FILE_CACHE)
570
571#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
572 CEPH_CAP_LINK_EXCL | \
573 CEPH_CAP_XATTR_EXCL | \
574 CEPH_CAP_FILE_EXCL)
575#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
576 CEPH_CAP_FILE_EXCL)
577#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
578#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
579 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
580 CEPH_CAP_PIN)
581
582#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
583 CEPH_LOCK_IXATTR)
584
585int ceph_caps_for_mode(int mode);
586
587enum {
588 CEPH_CAP_OP_GRANT, /* mds->client grant */
589 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
590 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
591 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
592 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
593 CEPH_CAP_OP_UPDATE, /* client->mds update */
594 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
595 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
596 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
597 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
598 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
599 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
600 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
601};
602
603extern const char *ceph_cap_op_name(int op);
604
605/*
606 * caps message, used for capability callbacks, acks, requests, etc.
607 */
608struct ceph_mds_caps {
609 __le32 op; /* CEPH_CAP_OP_* */
610 __le64 ino, realm;
611 __le64 cap_id;
612 __le32 seq, issue_seq;
613 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
614 __le32 migrate_seq;
615 __le64 snap_follows;
616 __le32 snap_trace_len;
617
618 /* authlock */
619 __le32 uid, gid, mode;
620
621 /* linklock */
622 __le32 nlink;
623
624 /* xattrlock */
625 __le32 xattr_len;
626 __le64 xattr_version;
627
628 /* filelock */
629 __le64 size, max_size, truncate_size;
630 __le32 truncate_seq;
631 struct ceph_timespec mtime, atime, ctime;
632 struct ceph_file_layout layout;
633 __le32 time_warp_seq;
634} __attribute__ ((packed));
635
636/* cap release msg head */
637struct ceph_mds_cap_release {
638 __le32 num; /* number of cap_items that follow */
639} __attribute__ ((packed));
640
641struct ceph_mds_cap_item {
642 __le64 ino;
643 __le64 cap_id;
644 __le32 migrate_seq, seq;
645} __attribute__ ((packed));
646
647#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
648#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
649#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
650#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
651
652extern const char *ceph_lease_op_name(int o);
653
654/* lease msg header */
655struct ceph_mds_lease {
656 __u8 action; /* CEPH_MDS_LEASE_* */
657 __le16 mask; /* which lease */
658 __le64 ino;
659 __le64 first, last; /* snap range */
660 __le32 seq;
661 __le32 duration_ms; /* duration of renewal */
662} __attribute__ ((packed));
663/* followed by a __le32+string for dname */
664
665/* client reconnect */
666struct ceph_mds_cap_reconnect {
667 __le64 cap_id;
668 __le32 wanted;
669 __le32 issued;
670 __le64 snaprealm;
671 __le64 pathbase; /* base ino for our path to this ino */
672 __le32 flock_len; /* size of flock state blob, if any */
673} __attribute__ ((packed));
674/* followed by flock blob */
675
676struct ceph_mds_cap_reconnect_v1 {
677 __le64 cap_id;
678 __le32 wanted;
679 __le32 issued;
680 __le64 size;
681 struct ceph_timespec mtime, atime;
682 __le64 snaprealm;
683 __le64 pathbase; /* base ino for our path to this ino */
684} __attribute__ ((packed));
685
686struct ceph_mds_snaprealm_reconnect {
687 __le64 ino; /* snap realm base */
688 __le64 seq; /* snap seq for this snap realm */
689 __le64 parent; /* parent realm */
690} __attribute__ ((packed));
691
692/*
693 * snaps
694 */
695enum {
696 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
697 CEPH_SNAP_OP_CREATE,
698 CEPH_SNAP_OP_DESTROY,
699 CEPH_SNAP_OP_SPLIT,
700};
701
702extern const char *ceph_snap_op_name(int o);
703
704/* snap msg header */
705struct ceph_mds_snap_head {
706 __le32 op; /* CEPH_SNAP_OP_* */
707 __le64 split; /* ino to split off, if any */
708 __le32 num_split_inos; /* # inos belonging to new child realm */
709 __le32 num_split_realms; /* # child realms under new child realm */
710 __le32 trace_len; /* size of snap trace blob */
711} __attribute__ ((packed));
712/* followed by split ino list, then split realms, then the trace blob */
713
714/*
715 * encode info about a snaprealm, as viewed by a client
716 */
717struct ceph_mds_snap_realm {
718 __le64 ino; /* ino */
719 __le64 created; /* snap: when created */
720 __le64 parent; /* ino: parent realm */
721 __le64 parent_since; /* snap: same parent since */
722 __le64 seq; /* snap: version */
723 __le32 num_snaps;
724 __le32 num_prior_parent_snaps;
725} __attribute__ ((packed));
726/* followed by my snap list, then prior parent snap list */
727
728#endif
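
The MDS op encoding above packs semantics into the number itself: any op with the CEPH_MDS_OP_WRITE bit (0x001000) set mutates the namespace, which lets callers test for mutating ops with a single mask. A quick standalone check:

    #include <stdio.h>

    #define CEPH_MDS_OP_WRITE   0x001000
    #define CEPH_MDS_OP_GETATTR 0x00101
    #define CEPH_MDS_OP_MKDIR   0x01220

    int main(void)
    {
        printf("GETATTR is write op? %d\n",
               !!(CEPH_MDS_OP_GETATTR & CEPH_MDS_OP_WRITE)); /* 0 */
        printf("MKDIR   is write op? %d\n",
               !!(CEPH_MDS_OP_MKDIR & CEPH_MDS_OP_WRITE));   /* 1 */
        return 0;
    }
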
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
deleted file mode 100644
index bd570015d147..000000000000
--- a/fs/ceph/ceph_hash.c
+++ /dev/null
@@ -1,118 +0,0 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
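
Both hash functions are pure byte arithmetic and easy to exercise in userspace. A standalone driver for the dcache-style hash above (ceph_str_hash_rjenkins can be lifted and tested the same way):

    #include <stdio.h>

    /* byte-for-byte the same loop as ceph_str_hash_linux above */
    static unsigned str_hash_linux(const char *str, unsigned length)
    {
        unsigned long hash = 0;
        unsigned char c;

        while (length--) {
            c = *str++;
            hash = (hash + (c << 4) + (c >> 4)) * 11;
        }
        return hash;
    }

    int main(void)
    {
        printf("hash(\"ceph\") = 0x%x\n", str_hash_linux("ceph", 4));
        return 0;
    }
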
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
deleted file mode 100644
index d099c3f90236..000000000000
--- a/fs/ceph/ceph_hash.h
+++ /dev/null
@@ -1,13 +0,0 @@
1#ifndef FS_CEPH_HASH_H
2#define FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
deleted file mode 100644
index fabd302e5779..000000000000
--- a/fs/ceph/crush/crush.c
+++ /dev/null
@@ -1,151 +0,0 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
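
crush_calc_parents() above relies on CRUSH's item-numbering convention: devices get non-negative ids, buckets get negative ids, and the bucket with id b lives at index -1-b of map->buckets, which is why the code writes map->bucket_parents[-1-c] for a child bucket c. A standalone sketch of that mapping:

#include <assert.h>
#include <stdio.h>

/* Bucket ids are negative; devices are non-negative.  The bucket with
 * id b is stored at index -1-b in map->buckets. */
static int bucket_index(int id)
{
	assert(id < 0);
	return -1 - id;
}

int main(void)
{
	printf("bucket id -1 -> index %d\n", bucket_index(-1));  /* 0 */
	printf("bucket id -5 -> index %d\n", bucket_index(-5));  /* 4 */
	return 0;
}
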
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
deleted file mode 100644
index 97e435b191f4..000000000000
--- a/fs/ceph/crush/crush.h
+++ /dev/null
@@ -1,180 +0,0 @@
1#ifndef CEPH_CRUSH_CRUSH_H
2#define CEPH_CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers to identify the parent bucket of a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
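
struct crush_rule above ends in a zero-length steps[] array, so a rule and its steps are allocated as one contiguous block sized by the crush_rule_size() macro. A userspace sketch of the same allocation pattern, using local stand-in types and the modern flexible-array spelling:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct step { unsigned op; int arg1, arg2; };
struct rule { unsigned len; struct step steps[]; };

/* mirrors crush_rule_size(): header plus len trailing steps */
#define rule_size(len) (sizeof(struct rule) + (len) * sizeof(struct step))

int main(void)
{
	unsigned n = 3;
	struct rule *r = malloc(rule_size(n));

	if (!r)
		return 1;
	r->len = n;
	memset(r->steps, 0, n * sizeof(struct step));
	printf("rule with %u steps occupies %zu bytes\n", n, rule_size(n));
	free(r);
	return 0;
}
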
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
deleted file mode 100644
index 5873aed694bf..000000000000
--- a/fs/ceph/crush/hash.c
+++ /dev/null
@@ -1,149 +0,0 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
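
Everything in CRUSH hinges on these hashes being pure functions of their inputs: any client that feeds the same tuple into crush_hash32_*() derives the same placement, with no coordination required. A standalone copy of the two-input variant illustrating that determinism (the mix macro and constants are taken verbatim from the file above):

#include <stdio.h>
#include <stdint.h>

#define mix(a, b, c) do {			\
	a = a-b; a = a-c; a = a^(c>>13);	\
	b = b-c; b = b-a; b = b^(a<<8);		\
	c = c-a; c = c-b; c = c^(b>>13);	\
	a = a-b; a = a-c; a = a^(c>>12);	\
	b = b-c; b = b-a; b = b^(a<<16);	\
	c = c-a; c = c-b; c = c^(b>>5);		\
	a = a-b; a = a-c; a = a^(c>>3);		\
	b = b-c; b = b-a; b = b^(a<<10);	\
	c = c-a; c = c-b; c = c^(b>>15);	\
} while (0)

#define SEED 1315423911u	/* crush_hash_seed */

static uint32_t hash32_2(uint32_t a, uint32_t b)
{
	uint32_t hash = SEED ^ a ^ b;
	uint32_t x = 231232, y = 1232;

	mix(a, b, hash);
	mix(x, a, hash);
	mix(b, y, hash);
	return hash;
}

int main(void)
{
	/* the same (x, item) pair always maps to the same 32-bit value */
	printf("%08x\n", hash32_2(12345, 6));
	printf("%08x\n", hash32_2(12345, 6));
	return 0;
}
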
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
deleted file mode 100644
index 91e884230d5d..000000000000
--- a/fs/ceph/crush/hash.h
+++ /dev/null
@@ -1,17 +0,0 @@
1#ifndef CEPH_CRUSH_HASH_H
2#define CEPH_CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
deleted file mode 100644
index a4eec133258e..000000000000
--- a/fs/ceph/crush/mapper.c
+++ /dev/null
@@ -1,609 +0,0 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" of the cluster
263 * (failed or fully offloaded)
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308
309 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
310 bucket->id, x, outpos, numrep);
311
312 for (rep = outpos; rep < numrep; rep++) {
313 /* keep trying until we get a non-out, non-colliding item */
314 ftotal = 0;
315 skip_rep = 0;
316 do {
317 retry_descent = 0;
318 in = bucket; /* initial bucket */
319
320 /* choose through intervening buckets */
321 flocal = 0;
322 do {
323 collide = 0;
324 retry_bucket = 0;
325 r = rep;
326 if (in->alg == CRUSH_BUCKET_UNIFORM) {
327 /* be careful */
328 if (firstn || numrep >= in->size)
329 /* r' = r + f_total */
330 r += ftotal;
331 else if (in->size % numrep == 0)
332 /* r'=r+(n+1)*f_local */
333 r += (numrep+1) *
334 (flocal+ftotal);
335 else
336 /* r' = r + n*f_local */
337 r += numrep * (flocal+ftotal);
338 } else {
339 if (firstn)
340 /* r' = r + f_total */
341 r += ftotal;
342 else
343 /* r' = r + n*f_local */
344 r += numrep * (flocal+ftotal);
345 }
346
347 /* bucket choose */
348 if (in->size == 0) {
349 reject = 1;
350 goto reject;
351 }
352 if (flocal >= (in->size>>1) &&
353 flocal > orig_tries)
354 item = bucket_perm_choose(in, x, r);
355 else
356 item = crush_bucket_choose(in, x, r);
357 BUG_ON(item >= map->max_devices);
358
359 /* desired type? */
360 if (item < 0)
361 itemtype = map->buckets[-1-item]->type;
362 else
363 itemtype = 0;
364 dprintk(" item %d type %d\n", item, itemtype);
365
366 /* keep going? */
367 if (itemtype != type) {
368 BUG_ON(item >= 0 ||
369 (-1-item) >= map->max_buckets);
370 in = map->buckets[-1-item];
371 retry_bucket = 1;
372 continue;
373 }
374
375 /* collision? */
376 for (i = 0; i < outpos; i++) {
377 if (out[i] == item) {
378 collide = 1;
379 break;
380 }
381 }
382
383 reject = 0;
384 if (recurse_to_leaf) {
385 if (item < 0) {
386 if (crush_choose(map,
387 map->buckets[-1-item],
388 weight,
389 x, outpos+1, 0,
390 out2, outpos,
391 firstn, 0,
392 NULL) <= outpos)
393 /* didn't get leaf */
394 reject = 1;
395 } else {
396 /* we already have a leaf! */
397 out2[outpos] = item;
398 }
399 }
400
401 if (!reject) {
402 /* out? */
403 if (itemtype == 0)
404 reject = is_out(map, weight,
405 item, x);
406 else
407 reject = 0;
408 }
409
410reject:
411 if (reject || collide) {
412 ftotal++;
413 flocal++;
414
415 if (collide && flocal < 3)
416 /* retry locally a few times */
417 retry_bucket = 1;
418 else if (flocal < in->size + orig_tries)
419 /* exhaustive bucket search */
420 retry_bucket = 1;
421 else if (ftotal < 20)
422 /* then retry descent */
423 retry_descent = 1;
424 else
425 /* else give up */
426 skip_rep = 1;
427 dprintk(" reject %d collide %d "
428 "ftotal %d flocal %d\n",
429 reject, collide, ftotal,
430 flocal);
431 }
432 } while (retry_bucket);
433 } while (retry_descent);
434
435 if (skip_rep) {
436 dprintk("skip rep\n");
437 continue;
438 }
439
440 dprintk("CHOOSE got %d\n", item);
441 out[outpos] = item;
442 outpos++;
443 }
444
445 dprintk("CHOOSE returns %d\n", outpos);
446 return outpos;
447}
448
449
450/**
451 * crush_do_rule - calculate a mapping with the given input and rule
452 * @map: the crush_map
453 * @ruleno: the rule id
454 * @x: hash input
455 * @result: pointer to result vector
456 * @result_max: maximum result size
457 * @force: force initial replica choice; -1 for none
458 */
459int crush_do_rule(struct crush_map *map,
460 int ruleno, int x, int *result, int result_max,
461 int force, __u32 *weight)
462{
463 int result_len;
464 int force_context[CRUSH_MAX_DEPTH];
465 int force_pos = -1;
466 int a[CRUSH_MAX_SET];
467 int b[CRUSH_MAX_SET];
468 int c[CRUSH_MAX_SET];
469 int recurse_to_leaf;
470 int *w;
471 int wsize = 0;
472 int *o;
473 int osize;
474 int *tmp;
475 struct crush_rule *rule;
476 int step;
477 int i, j;
478 int numrep;
479 int firstn;
480 int rc = -1;
481
482 BUG_ON(ruleno >= map->max_rules);
483
484 rule = map->rules[ruleno];
485 result_len = 0;
486 w = a;
487 o = b;
488
489 /*
490 * determine hierarchical context of force, if any. note
491 * that this may or may not correspond to the specific types
492 * referenced by the crush rule.
493 */
494 if (force >= 0) {
495 if (force >= map->max_devices ||
496 map->device_parents[force] == 0) {
497 /*dprintk("CRUSH: forcefed device dne\n");*/
498 rc = -1; /* force-fed device does not exist */
499 goto out;
500 }
501 if (!is_out(map, weight, force, x)) {
502 while (1) {
503 force_context[++force_pos] = force;
504 if (force >= 0)
505 force = map->device_parents[force];
506 else
507 force = map->bucket_parents[-1-force];
508 if (force == 0)
509 break;
510 }
511 }
512 }
513
514 for (step = 0; step < rule->len; step++) {
515 firstn = 0;
516 switch (rule->steps[step].op) {
517 case CRUSH_RULE_TAKE:
518 w[0] = rule->steps[step].arg1;
519 if (force_pos >= 0) {
520 BUG_ON(force_context[force_pos] != w[0]);
521 force_pos--;
522 }
523 wsize = 1;
524 break;
525
526 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
527 case CRUSH_RULE_CHOOSE_FIRSTN:
528 firstn = 1; /* fall through */
529 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
530 case CRUSH_RULE_CHOOSE_INDEP:
531 BUG_ON(wsize == 0);
532
533 recurse_to_leaf =
534 rule->steps[step].op ==
535 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
536 rule->steps[step].op ==
537 CRUSH_RULE_CHOOSE_LEAF_INDEP;
538
539 /* reset output */
540 osize = 0;
541
542 for (i = 0; i < wsize; i++) {
543 /*
544 * see CRUSH_CHOOSE_N, CRUSH_CHOOSE_N_MINUS macros.
545 * basically, numrep <= 0 means relative to
546 * the provided result_max
547 */
548 numrep = rule->steps[step].arg1;
549 if (numrep <= 0) {
550 numrep += result_max;
551 if (numrep <= 0)
552 continue;
553 }
554 j = 0;
555 if (osize == 0 && force_pos >= 0) {
556 /* skip any intermediate types */
557 while (force_pos &&
558 force_context[force_pos] < 0 &&
559 rule->steps[step].arg2 !=
560 map->buckets[-1 -
561 force_context[force_pos]]->type)
562 force_pos--;
563 o[osize] = force_context[force_pos];
564 if (recurse_to_leaf)
565 c[osize] = force_context[0];
566 j++;
567 force_pos--;
568 }
569 osize += crush_choose(map,
570 map->buckets[-1-w[i]],
571 weight,
572 x, numrep,
573 rule->steps[step].arg2,
574 o+osize, j,
575 firstn,
576 recurse_to_leaf, c+osize);
577 }
578
579 if (recurse_to_leaf)
580 /* copy final _leaf_ values to output set */
581 memcpy(o, c, osize*sizeof(*o));
582
583 /* swap o and w arrays */
584 tmp = o;
585 o = w;
586 w = tmp;
587 wsize = osize;
588 break;
589
590
591 case CRUSH_RULE_EMIT:
592 for (i = 0; i < wsize && result_len < result_max; i++) {
593 result[result_len] = w[i];
594 result_len++;
595 }
596 wsize = 0;
597 break;
598
599 default:
600 BUG_ON(1);
601 }
602 }
603 rc = result_len;
604
605out:
606 return rc;
607}
608
609
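
is_out() above treats device weights as 16-bit fixed point, with 0x10000 meaning 1.0: a partially weighted device stays "in" for exactly the fraction of hash values that land below its weight, which is how CRUSH gradually offloads a device rather than dropping it all at once. A standalone sketch of the same threshold test:

#include <stdio.h>
#include <stdint.h>

/* 0x10000 is weight 1.0 in 16-bit fixed point; a device with
 * fractional weight w survives when a 16-bit hash of (x, item)
 * lands below w -- the same test is_out() performs. */
static int is_out(uint32_t weight, uint32_t hash16)
{
	if (weight >= 0x10000)
		return 0;	/* full weight: never out */
	if (weight == 0)
		return 1;	/* zero weight: always out */
	return (hash16 & 0xffff) >= weight;
}

int main(void)
{
	uint32_t w = 0x8000;	/* weight 0.5 */
	unsigned kept = 0, i;

	for (i = 0; i < 0x10000; i++)
		kept += !is_out(w, i);
	printf("weight 0.5 keeps %u of 65536 hash values\n", kept);
	return 0;
}
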
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
deleted file mode 100644
index c46b99c18bb0..000000000000
--- a/fs/ceph/crush/mapper.h
+++ /dev/null
@@ -1,20 +0,0 @@
1#ifndef CEPH_CRUSH_MAPPER_H
2#define CEPH_CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
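
crush_find_rule(), declared above, is a linear scan over rule masks: the first rule matching the requested ruleset and type whose [min_size, max_size] range covers the desired output set size wins, and -1 means no match. A standalone mirror of that search:

#include <stdio.h>

struct mask { unsigned char ruleset, type, min_size, max_size; };

/* mirrors crush_find_rule(): first mask matching ruleset/type whose
 * size range contains the requested replica count */
static int find_rule(const struct mask *rules, int n,
		     int ruleset, int type, int size)
{
	int i;

	for (i = 0; i < n; i++)
		if (rules[i].ruleset == ruleset &&
		    rules[i].type == type &&
		    rules[i].min_size <= size &&
		    rules[i].max_size >= size)
			return i;
	return -1;
}

int main(void)
{
	struct mask rules[] = {
		{ 0, 1, 1, 2 },		/* replicated, 1-2 copies */
		{ 0, 1, 3, 10 },	/* replicated, 3-10 copies */
	};

	printf("size 3 -> rule %d\n", find_rule(rules, 2, 0, 1, 3)); /* 1 */
	return 0;
}
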
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
deleted file mode 100644
index a3e627f63293..000000000000
--- a/fs/ceph/crypto.c
+++ /dev/null
@@ -1,412 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include "crypto.h"
10#include "decode.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
79
80static int ceph_aes_encrypt(const void *key, int key_len,
81 void *dst, size_t *dst_len,
82 const void *src, size_t src_len)
83{
84 struct scatterlist sg_in[2], sg_out[1];
85 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
86 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
87 int ret;
88 void *iv;
89 int ivsize;
90 size_t zero_padding = (0x10 - (src_len & 0x0f));
91 char pad[16];
92
93 if (IS_ERR(tfm))
94 return PTR_ERR(tfm);
95
96 memset(pad, zero_padding, zero_padding);
97
98 *dst_len = src_len + zero_padding;
99
100 crypto_blkcipher_setkey((void *)tfm, key, key_len);
101 sg_init_table(sg_in, 2);
102 sg_set_buf(&sg_in[0], src, src_len);
103 sg_set_buf(&sg_in[1], pad, zero_padding);
104 sg_init_table(sg_out, 1);
105 sg_set_buf(sg_out, dst, *dst_len);
106 iv = crypto_blkcipher_crt(tfm)->iv;
107 ivsize = crypto_blkcipher_ivsize(tfm);
108
109 memcpy(iv, aes_iv, ivsize);
110 /*
111 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
112 key, key_len, 1);
113 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
114 src, src_len, 1);
115 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
116 pad, zero_padding, 1);
117 */
118 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
119 src_len + zero_padding);
120 crypto_free_blkcipher(tfm);
121 if (ret < 0)
122 pr_err("ceph_aes_encrypt failed %d\n", ret);
123 /*
124 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
125 dst, *dst_len, 1);
126 */
127 return ret;
128}
129
130static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
131 size_t *dst_len,
132 const void *src1, size_t src1_len,
133 const void *src2, size_t src2_len)
134{
135 struct scatterlist sg_in[3], sg_out[1];
136 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
137 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
138 int ret;
139 void *iv;
140 int ivsize;
141 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
142 char pad[16];
143
144 if (IS_ERR(tfm))
145 return PTR_ERR(tfm);
146
147 memset(pad, zero_padding, zero_padding);
148
149 *dst_len = src1_len + src2_len + zero_padding;
150
151 crypto_blkcipher_setkey((void *)tfm, key, key_len);
152 sg_init_table(sg_in, 3);
153 sg_set_buf(&sg_in[0], src1, src1_len);
154 sg_set_buf(&sg_in[1], src2, src2_len);
155 sg_set_buf(&sg_in[2], pad, zero_padding);
156 sg_init_table(sg_out, 1);
157 sg_set_buf(sg_out, dst, *dst_len);
158 iv = crypto_blkcipher_crt(tfm)->iv;
159 ivsize = crypto_blkcipher_ivsize(tfm);
160
161 memcpy(iv, aes_iv, ivsize);
162 /*
163 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
164 key, key_len, 1);
165 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
166 src1, src1_len, 1);
167 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
168 src2, src2_len, 1);
169 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
170 pad, zero_padding, 1);
171 */
172 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
173 src1_len + src2_len + zero_padding);
174 crypto_free_blkcipher(tfm);
175 if (ret < 0)
176 pr_err("ceph_aes_encrypt2 failed %d\n", ret);
177 /*
178 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
179 dst, *dst_len, 1);
180 */
181 return ret;
182}
183
184static int ceph_aes_decrypt(const void *key, int key_len,
185 void *dst, size_t *dst_len,
186 const void *src, size_t src_len)
187{
188 struct scatterlist sg_in[1], sg_out[2];
189 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
190 struct blkcipher_desc desc = { .tfm = tfm };
191 char pad[16];
192 void *iv;
193 int ivsize;
194 int ret;
195 int last_byte;
196
197 if (IS_ERR(tfm))
198 return PTR_ERR(tfm);
199
200 crypto_blkcipher_setkey((void *)tfm, key, key_len);
201 sg_init_table(sg_in, 1);
202 sg_init_table(sg_out, 2);
203 sg_set_buf(sg_in, src, src_len);
204 sg_set_buf(&sg_out[0], dst, *dst_len);
205 sg_set_buf(&sg_out[1], pad, sizeof(pad));
206
207 iv = crypto_blkcipher_crt(tfm)->iv;
208 ivsize = crypto_blkcipher_ivsize(tfm);
209
210 memcpy(iv, aes_iv, ivsize);
211
212 /*
213 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
214 key, key_len, 1);
215 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
216 src, src_len, 1);
217 */
218
219 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
220 crypto_free_blkcipher(tfm);
221 if (ret < 0) {
222 pr_err("ceph_aes_decrypt failed %d\n", ret);
223 return ret;
224 }
225
226 if (src_len <= *dst_len)
227 last_byte = ((char *)dst)[src_len - 1];
228 else
229 last_byte = pad[src_len - *dst_len - 1];
230 if (last_byte <= 16 && src_len >= last_byte) {
231 *dst_len = src_len - last_byte;
232 } else {
233 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
234 last_byte, (int)src_len);
235 return -EPERM; /* bad padding */
236 }
237 /*
238 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
239 dst, *dst_len, 1);
240 */
241 return 0;
242}
243
244static int ceph_aes_decrypt2(const void *key, int key_len,
245 void *dst1, size_t *dst1_len,
246 void *dst2, size_t *dst2_len,
247 const void *src, size_t src_len)
248{
249 struct scatterlist sg_in[1], sg_out[3];
250 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
251 struct blkcipher_desc desc = { .tfm = tfm };
252 char pad[16];
253 void *iv;
254 int ivsize;
255 int ret;
256 int last_byte;
257
258 if (IS_ERR(tfm))
259 return PTR_ERR(tfm);
260
261 sg_init_table(sg_in, 1);
262 sg_set_buf(sg_in, src, src_len);
263 sg_init_table(sg_out, 3);
264 sg_set_buf(&sg_out[0], dst1, *dst1_len);
265 sg_set_buf(&sg_out[1], dst2, *dst2_len);
266 sg_set_buf(&sg_out[2], pad, sizeof(pad));
267
268 crypto_blkcipher_setkey((void *)tfm, key, key_len);
269 iv = crypto_blkcipher_crt(tfm)->iv;
270 ivsize = crypto_blkcipher_ivsize(tfm);
271
272 memcpy(iv, aes_iv, ivsize);
273
274 /*
275 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
276 key, key_len, 1);
277 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
278 src, src_len, 1);
279 */
280
281 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
282 crypto_free_blkcipher(tfm);
283 if (ret < 0) {
284 pr_err("ceph_aes_decrypt2 failed %d\n", ret);
285 return ret;
286 }
287
288 if (src_len <= *dst1_len)
289 last_byte = ((char *)dst1)[src_len - 1];
290 else if (src_len <= *dst1_len + *dst2_len)
291 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
292 else
293 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
294 if (last_byte <= 16 && src_len >= last_byte) {
295 src_len -= last_byte;
296 } else {
297 pr_err("ceph_aes_decrypt2 got bad padding %d on src len %d\n",
298 last_byte, (int)src_len);
299 return -EPERM; /* bad padding */
300 }
301
302 if (src_len < *dst1_len) {
303 *dst1_len = src_len;
304 *dst2_len = 0;
305 } else {
306 *dst2_len = src_len - *dst1_len;
307 }
308 /*
309 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
310 dst1, *dst1_len, 1);
311 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
312 dst2, *dst2_len, 1);
313 */
314
315 return 0;
316}
317
318
319int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
320 const void *src, size_t src_len)
321{
322 switch (secret->type) {
323 case CEPH_CRYPTO_NONE:
324 if (*dst_len < src_len)
325 return -ERANGE;
326 memcpy(dst, src, src_len);
327 *dst_len = src_len;
328 return 0;
329
330 case CEPH_CRYPTO_AES:
331 return ceph_aes_decrypt(secret->key, secret->len, dst,
332 dst_len, src, src_len);
333
334 default:
335 return -EINVAL;
336 }
337}
338
339int ceph_decrypt2(struct ceph_crypto_key *secret,
340 void *dst1, size_t *dst1_len,
341 void *dst2, size_t *dst2_len,
342 const void *src, size_t src_len)
343{
344 size_t t;
345
346 switch (secret->type) {
347 case CEPH_CRYPTO_NONE:
348 if (*dst1_len + *dst2_len < src_len)
349 return -ERANGE;
350 t = min(*dst1_len, src_len);
351 memcpy(dst1, src, t);
352 *dst1_len = t;
353 src += t;
354 src_len -= t;
355 if (src_len) {
356 t = min(*dst2_len, src_len);
357 memcpy(dst2, src, t);
358 *dst2_len = t;
359 }
360 return 0;
361
362 case CEPH_CRYPTO_AES:
363 return ceph_aes_decrypt2(secret->key, secret->len,
364 dst1, dst1_len, dst2, dst2_len,
365 src, src_len);
366
367 default:
368 return -EINVAL;
369 }
370}
371
372int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
373 const void *src, size_t src_len)
374{
375 switch (secret->type) {
376 case CEPH_CRYPTO_NONE:
377 if (*dst_len < src_len)
378 return -ERANGE;
379 memcpy(dst, src, src_len);
380 *dst_len = src_len;
381 return 0;
382
383 case CEPH_CRYPTO_AES:
384 return ceph_aes_encrypt(secret->key, secret->len, dst,
385 dst_len, src, src_len);
386
387 default:
388 return -EINVAL;
389 }
390}
391
392int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
393 const void *src1, size_t src1_len,
394 const void *src2, size_t src2_len)
395{
396 switch (secret->type) {
397 case CEPH_CRYPTO_NONE:
398 if (*dst_len < src1_len + src2_len)
399 return -ERANGE;
400 memcpy(dst, src1, src1_len);
401 memcpy(dst + src1_len, src2, src2_len);
402 *dst_len = src1_len + src2_len;
403 return 0;
404
405 case CEPH_CRYPTO_AES:
406 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
407 src1, src1_len, src2, src2_len);
408
409 default:
410 return -EINVAL;
411 }
412}
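
The AES helpers above always append PKCS#7-style padding: zero_padding = 0x10 - (len & 0x0f) is between 1 and 16, every pad byte carries the pad length, and decryption recovers the payload size from the final byte, which is why last_byte values outside that range are rejected as bad padding. A standalone sketch of the arithmetic:

#include <stdio.h>
#include <string.h>

/* pad to a 16-byte boundary; the pad length is always 1..16, never 0 */
static size_t pad_len(size_t src_len)
{
	return 0x10 - (src_len & 0x0f);
}

int main(void)
{
	unsigned char buf[32] = "hello";
	size_t len = 5, pad = pad_len(len);
	size_t total;

	memset(buf + len, (int)pad, pad);	/* each pad byte = pad length */
	total = len + pad;
	printf("payload %zu + pad %zu = %zu bytes\n", len, pad, total);

	/* decrypt side: strip the length named by the final byte */
	printf("recovered payload length: %zu\n", total - buf[total - 1]);
	return 0;
}
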
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
deleted file mode 100644
index bdf38607323c..000000000000
--- a/fs/ceph/crypto.h
+++ /dev/null
@@ -1,48 +0,0 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const char *src, const char *end);
46extern int ceph_unarmor(char *dst, const char *src, const char *end);
47
48#endif
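
ceph_decrypt2(), declared above, splits one decrypted buffer across two destinations; the auth code uses this to peel two adjacent blobs out of a single ciphertext. A standalone mirror of the CEPH_CRYPTO_NONE split logic; note the sketch additionally zeroes *d2 when nothing spills over, which the original leaves untouched:

#include <stdio.h>
#include <string.h>

static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

/* mirrors the CEPH_CRYPTO_NONE branch of ceph_decrypt2(): fill dst1
 * first, then spill whatever remains into dst2 */
static int split_copy(char *dst1, size_t *d1, char *dst2, size_t *d2,
		      const char *src, size_t src_len)
{
	size_t t;

	if (*d1 + *d2 < src_len)
		return -1;		/* -ERANGE in the kernel code */
	t = min_sz(*d1, src_len);
	memcpy(dst1, src, t);
	*d1 = t;
	src += t;
	src_len -= t;
	if (src_len) {
		t = min_sz(*d2, src_len);
		memcpy(dst2, src, t);
		*d2 = t;
	} else {
		*d2 = 0;
	}
	return 0;
}

int main(void)
{
	char a[4], b[16];
	size_t al = sizeof(a), bl = sizeof(b);

	if (split_copy(a, &al, b, &bl, "ticket+session", 14) == 0)
		printf("dst1 %zu bytes, dst2 %zu bytes\n", al, bl);
	return 0;
}
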
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6fd8b20a8611..7ae1b3d55b58 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/device.h> 3#include <linux/device.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
@@ -7,143 +7,49 @@
7#include <linux/debugfs.h> 7#include <linux/debugfs.h>
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9 9
10#include <linux/ceph/libceph.h>
11#include <linux/ceph/mon_client.h>
12#include <linux/ceph/auth.h>
13#include <linux/ceph/debugfs.h>
14
10#include "super.h" 15#include "super.h"
11#include "mds_client.h"
12#include "mon_client.h"
13#include "auth.h"
14 16
15#ifdef CONFIG_DEBUG_FS 17#ifdef CONFIG_DEBUG_FS
16 18
17/* 19#include "mds_client.h"
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../mdsmap - current mdsmap
23 * .../monmap - current monmap
24 * .../osdc - active osd requests
25 * .../mdsc - active mds requests
26 * .../monc - mon client state
27 * .../dentry_lru - dump contents of dentry lru
28 * .../caps - expose cap (reservation) stats
29 * .../bdi - symlink to ../../bdi/something
30 */
31
32static struct dentry *ceph_debugfs_dir;
33
34static int monmap_show(struct seq_file *s, void *p)
35{
36 int i;
37 struct ceph_client *client = s->private;
38
39 if (client->monc.monmap == NULL)
40 return 0;
41
42 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
43 for (i = 0; i < client->monc.monmap->num_mon; i++) {
44 struct ceph_entity_inst *inst =
45 &client->monc.monmap->mon_inst[i];
46
47 seq_printf(s, "\t%s%lld\t%s\n",
48 ENTITY_NAME(inst->name),
49 pr_addr(&inst->addr.in_addr));
50 }
51 return 0;
52}
53 20
54static int mdsmap_show(struct seq_file *s, void *p) 21static int mdsmap_show(struct seq_file *s, void *p)
55{ 22{
56 int i; 23 int i;
57 struct ceph_client *client = s->private; 24 struct ceph_fs_client *fsc = s->private;
58 25
59 if (client->mdsc.mdsmap == NULL) 26 if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
60 return 0; 27 return 0;
61 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch); 28 seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
62 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root); 29 seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
63 seq_printf(s, "session_timeout %d\n", 30 seq_printf(s, "session_timeout %d\n",
64 client->mdsc.mdsmap->m_session_timeout); 31 fsc->mdsc->mdsmap->m_session_timeout);
65 seq_printf(s, "session_autoclose %d\n", 32 seq_printf(s, "session_autoclose %d\n",
66 client->mdsc.mdsmap->m_session_autoclose); 33 fsc->mdsc->mdsmap->m_session_autoclose);
67 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) { 34 for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
68 struct ceph_entity_addr *addr = 35 struct ceph_entity_addr *addr =
69 &client->mdsc.mdsmap->m_info[i].addr; 36 &fsc->mdsc->mdsmap->m_info[i].addr;
70 int state = client->mdsc.mdsmap->m_info[i].state; 37 int state = fsc->mdsc->mdsmap->m_info[i].state;
71 38
72 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr), 39 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
40 ceph_pr_addr(&addr->in_addr),
73 ceph_mds_state_name(state)); 41 ceph_mds_state_name(state));
74 } 42 }
75 return 0; 43 return 0;
76} 44}
77 45
78static int osdmap_show(struct seq_file *s, void *p) 46/*
79{ 47 * mdsc debugfs
80 int i; 48 */
81 struct ceph_client *client = s->private;
82 struct rb_node *n;
83
84 if (client->osdc.osdmap == NULL)
85 return 0;
86 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
87 seq_printf(s, "flags%s%s\n",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
89 " NEARFULL" : "",
90 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
91 " FULL" : "");
92 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
93 struct ceph_pg_pool_info *pool =
94 rb_entry(n, struct ceph_pg_pool_info, node);
95 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
96 pool->id, pool->v.pg_num, pool->pg_num_mask,
97 pool->v.lpg_num, pool->lpg_num_mask);
98 }
99 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
100 struct ceph_entity_addr *addr =
101 &client->osdc.osdmap->osd_addr[i];
102 int state = client->osdc.osdmap->osd_state[i];
103 char sb[64];
104
105 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
106 i, pr_addr(&addr->in_addr),
107 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
108 ceph_osdmap_state_str(sb, sizeof(sb), state));
109 }
110 return 0;
111}
112
113static int monc_show(struct seq_file *s, void *p)
114{
115 struct ceph_client *client = s->private;
116 struct ceph_mon_generic_request *req;
117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp;
119
120 mutex_lock(&monc->mutex);
121
122 if (monc->have_mdsmap)
123 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
124 if (monc->have_osdmap)
125 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n");
128
129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
130 __u16 op;
131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
137 }
138
139 mutex_unlock(&monc->mutex);
140 return 0;
141}
142
143static int mdsc_show(struct seq_file *s, void *p) 49static int mdsc_show(struct seq_file *s, void *p)
144{ 50{
145 struct ceph_client *client = s->private; 51 struct ceph_fs_client *fsc = s->private;
146 struct ceph_mds_client *mdsc = &client->mdsc; 52 struct ceph_mds_client *mdsc = fsc->mdsc;
147 struct ceph_mds_request *req; 53 struct ceph_mds_request *req;
148 struct rb_node *rp; 54 struct rb_node *rp;
149 int pathlen; 55 int pathlen;
@@ -214,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p)
214 return 0; 120 return 0;
215} 121}
216 122
217static int osdc_show(struct seq_file *s, void *pp)
218{
219 struct ceph_client *client = s->private;
220 struct ceph_osd_client *osdc = &client->osdc;
221 struct rb_node *p;
222
223 mutex_lock(&osdc->request_mutex);
224 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
225 struct ceph_osd_request *req;
226 struct ceph_osd_request_head *head;
227 struct ceph_osd_op *op;
228 int num_ops;
229 int opcode, olen;
230 int i;
231
232 req = rb_entry(p, struct ceph_osd_request, r_node);
233
234 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
235 req->r_osd ? req->r_osd->o_osd : -1,
236 le32_to_cpu(req->r_pgid.pool),
237 le16_to_cpu(req->r_pgid.ps));
238
239 head = req->r_request->front.iov_base;
240 op = (void *)(head + 1);
241
242 num_ops = le16_to_cpu(head->num_ops);
243 olen = le32_to_cpu(head->object_len);
244 seq_printf(s, "%.*s", olen,
245 (const char *)(head->ops + num_ops));
246
247 if (req->r_reassert_version.epoch)
248 seq_printf(s, "\t%u'%llu",
249 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
250 le64_to_cpu(req->r_reassert_version.version));
251 else
252 seq_printf(s, "\t");
253
254 for (i = 0; i < num_ops; i++) {
255 opcode = le16_to_cpu(op->op);
256 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
257 op++;
258 }
259
260 seq_printf(s, "\n");
261 }
262 mutex_unlock(&osdc->request_mutex);
263 return 0;
264}
265
266static int caps_show(struct seq_file *s, void *p) 123static int caps_show(struct seq_file *s, void *p)
267{ 124{
268 struct ceph_client *client = s->private; 125 struct ceph_fs_client *fsc = s->private;
269 int total, avail, used, reserved, min; 126 int total, avail, used, reserved, min;
270 127
271 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); 128 ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
272 seq_printf(s, "total\t\t%d\n" 129 seq_printf(s, "total\t\t%d\n"
273 "avail\t\t%d\n" 130 "avail\t\t%d\n"
274 "used\t\t%d\n" 131 "used\t\t%d\n"
@@ -280,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p)
280 137
281static int dentry_lru_show(struct seq_file *s, void *ptr) 138static int dentry_lru_show(struct seq_file *s, void *ptr)
282{ 139{
283 struct ceph_client *client = s->private; 140 struct ceph_fs_client *fsc = s->private;
284 struct ceph_mds_client *mdsc = &client->mdsc; 141 struct ceph_mds_client *mdsc = fsc->mdsc;
285 struct ceph_dentry_info *di; 142 struct ceph_dentry_info *di;
286 143
287 spin_lock(&mdsc->dentry_lru_lock); 144 spin_lock(&mdsc->dentry_lru_lock);
@@ -295,199 +152,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
295 return 0; 152 return 0;
296} 153}
297 154
298#define DEFINE_SHOW_FUNC(name) \ 155CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
299static int name##_open(struct inode *inode, struct file *file) \ 156CEPH_DEFINE_SHOW_FUNC(mdsc_show)
300{ \ 157CEPH_DEFINE_SHOW_FUNC(caps_show)
301 struct seq_file *sf; \ 158CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
302 int ret; \ 159
303 \
304 ret = single_open(file, name, NULL); \
305 sf = file->private_data; \
306 sf->private = inode->i_private; \
307 return ret; \
308} \
309 \
310static const struct file_operations name##_fops = { \
311 .open = name##_open, \
312 .read = seq_read, \
313 .llseek = seq_lseek, \
314 .release = single_release, \
315};
316
317DEFINE_SHOW_FUNC(monmap_show)
318DEFINE_SHOW_FUNC(mdsmap_show)
319DEFINE_SHOW_FUNC(osdmap_show)
320DEFINE_SHOW_FUNC(monc_show)
321DEFINE_SHOW_FUNC(mdsc_show)
322DEFINE_SHOW_FUNC(osdc_show)
323DEFINE_SHOW_FUNC(dentry_lru_show)
324DEFINE_SHOW_FUNC(caps_show)
325 160
161/*
162 * debugfs
163 */
326static int congestion_kb_set(void *data, u64 val) 164static int congestion_kb_set(void *data, u64 val)
327{ 165{
328 struct ceph_client *client = (struct ceph_client *)data; 166 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
329
330 if (client)
331 client->mount_args->congestion_kb = (int)val;
332 167
168 fsc->mount_options->congestion_kb = (int)val;
333 return 0; 169 return 0;
334} 170}
335 171
336static int congestion_kb_get(void *data, u64 *val) 172static int congestion_kb_get(void *data, u64 *val)
337{ 173{
338 struct ceph_client *client = (struct ceph_client *)data; 174 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
339
340 if (client)
341 *val = (u64)client->mount_args->congestion_kb;
342 175
176 *val = (u64)fsc->mount_options->congestion_kb;
343 return 0; 177 return 0;
344} 178}
345 179
346
347DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, 180DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
348 congestion_kb_set, "%llu\n"); 181 congestion_kb_set, "%llu\n");
349 182
350int __init ceph_debugfs_init(void)
351{
352 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
353 if (!ceph_debugfs_dir)
354 return -ENOMEM;
355 return 0;
356}
357 183
358void ceph_debugfs_cleanup(void) 184void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
359{ 185{
360 debugfs_remove(ceph_debugfs_dir); 186 dout("ceph_fs_debugfs_cleanup\n");
187 debugfs_remove(fsc->debugfs_bdi);
188 debugfs_remove(fsc->debugfs_congestion_kb);
189 debugfs_remove(fsc->debugfs_mdsmap);
190 debugfs_remove(fsc->debugfs_caps);
191 debugfs_remove(fsc->debugfs_mdsc);
192 debugfs_remove(fsc->debugfs_dentry_lru);
361} 193}
362 194
363int ceph_debugfs_client_init(struct ceph_client *client) 195int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
364{ 196{
365 int ret = 0; 197 char name[100];
366 char name[80]; 198 int err = -ENOMEM;
367
368 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
369 client->monc.auth->global_id);
370 199
371 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); 200 dout("ceph_fs_debugfs_init\n");
372 if (!client->debugfs_dir) 201 fsc->debugfs_congestion_kb =
373 goto out; 202 debugfs_create_file("writeback_congestion_kb",
374 203 0600,
375 client->monc.debugfs_file = debugfs_create_file("monc", 204 fsc->client->debugfs_dir,
376 0600, 205 fsc,
377 client->debugfs_dir, 206 &congestion_kb_fops);
378 client, 207 if (!fsc->debugfs_congestion_kb)
379 &monc_show_fops);
380 if (!client->monc.debugfs_file)
381 goto out; 208 goto out;
382 209
383 client->mdsc.debugfs_file = debugfs_create_file("mdsc", 210 dout("a\n");
384 0600,
385 client->debugfs_dir,
386 client,
387 &mdsc_show_fops);
388 if (!client->mdsc.debugfs_file)
389 goto out;
390 211
391 client->osdc.debugfs_file = debugfs_create_file("osdc", 212 snprintf(name, sizeof(name), "../../bdi/%s",
392 0600, 213 dev_name(fsc->backing_dev_info.dev));
393 client->debugfs_dir, 214 fsc->debugfs_bdi =
394 client, 215 debugfs_create_symlink("bdi",
395 &osdc_show_fops); 216 fsc->client->debugfs_dir,
396 if (!client->osdc.debugfs_file) 217 name);
218 if (!fsc->debugfs_bdi)
397 goto out; 219 goto out;
398 220
399 client->debugfs_monmap = debugfs_create_file("monmap", 221 dout("b\n");
222 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
400 0600, 223 0600,
401 client->debugfs_dir, 224 fsc->client->debugfs_dir,
402 client, 225 fsc,
403 &monmap_show_fops);
404 if (!client->debugfs_monmap)
405 goto out;
406
407 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
408 0600,
409 client->debugfs_dir,
410 client,
411 &mdsmap_show_fops); 226 &mdsmap_show_fops);
412 if (!client->debugfs_mdsmap) 227 if (!fsc->debugfs_mdsmap)
413 goto out;
414
415 client->debugfs_osdmap = debugfs_create_file("osdmap",
416 0600,
417 client->debugfs_dir,
418 client,
419 &osdmap_show_fops);
420 if (!client->debugfs_osdmap)
421 goto out; 228 goto out;
422 229
423 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru", 230 dout("ca\n");
424 0600, 231 fsc->debugfs_mdsc = debugfs_create_file("mdsc",
425 client->debugfs_dir, 232 0600,
426 client, 233 fsc->client->debugfs_dir,
427 &dentry_lru_show_fops); 234 fsc,
428 if (!client->debugfs_dentry_lru) 235 &mdsc_show_fops);
236 if (!fsc->debugfs_mdsc)
429 goto out; 237 goto out;
430 238
431 client->debugfs_caps = debugfs_create_file("caps", 239 dout("da\n");
240 fsc->debugfs_caps = debugfs_create_file("caps",
432 0400, 241 0400,
433 client->debugfs_dir, 242 fsc->client->debugfs_dir,
434 client, 243 fsc,
435 &caps_show_fops); 244 &caps_show_fops);
436 if (!client->debugfs_caps) 245 if (!fsc->debugfs_caps)
437 goto out; 246 goto out;
438 247
439 client->debugfs_congestion_kb = 248 dout("ea\n");
440 debugfs_create_file("writeback_congestion_kb", 249 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
441 0600, 250 0600,
442 client->debugfs_dir, 251 fsc->client->debugfs_dir,
443 client, 252 fsc,
444 &congestion_kb_fops); 253 &dentry_lru_show_fops);
445 if (!client->debugfs_congestion_kb) 254 if (!fsc->debugfs_dentry_lru)
446 goto out; 255 goto out;
447 256
448 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
449 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
450 name);
451
452 return 0; 257 return 0;
453 258
454out: 259out:
455 ceph_debugfs_client_cleanup(client); 260 ceph_fs_debugfs_cleanup(fsc);
456 return ret; 261 return err;
457} 262}
458 263
459void ceph_debugfs_client_cleanup(struct ceph_client *client)
460{
461 debugfs_remove(client->debugfs_bdi);
462 debugfs_remove(client->debugfs_caps);
463 debugfs_remove(client->debugfs_dentry_lru);
464 debugfs_remove(client->debugfs_osdmap);
465 debugfs_remove(client->debugfs_mdsmap);
466 debugfs_remove(client->debugfs_monmap);
467 debugfs_remove(client->osdc.debugfs_file);
468 debugfs_remove(client->mdsc.debugfs_file);
469 debugfs_remove(client->monc.debugfs_file);
470 debugfs_remove(client->debugfs_congestion_kb);
471 debugfs_remove(client->debugfs_dir);
472}
473 264
474#else /* CONFIG_DEBUG_FS */ 265#else /* CONFIG_DEBUG_FS */
475 266
476int __init ceph_debugfs_init(void) 267int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
477{
478 return 0;
479}
480
481void ceph_debugfs_cleanup(void)
482{
483}
484
485int ceph_debugfs_client_init(struct ceph_client *client)
486{ 268{
487 return 0; 269 return 0;
488} 270}
489 271
490void ceph_debugfs_client_cleanup(struct ceph_client *client) 272void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
491{ 273{
492} 274}
493 275
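
The CEPH_DEFINE_SHOW_FUNC macro that replaces the local DEFINE_SHOW_FUNC presumably generates the same single_open() boilerplate shown in the removed column, now shared via <linux/ceph/debugfs.h>: the data pointer handed to debugfs_create_file() lands in inode->i_private and is forwarded to s->private for the show function. A hypothetical kernel-side usage sketch (counter_show, demo_counter, and demo_register are made-up names, not part of this patch):

#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/ceph/debugfs.h>	/* assumed home of CEPH_DEFINE_SHOW_FUNC */

/* expose one counter the way the code above exposes its maps; the
 * data pointer passed to debugfs_create_file() comes back as
 * s->private in the show function */
static unsigned long demo_counter;

static int counter_show(struct seq_file *s, void *p)
{
	unsigned long *counter = s->private;

	seq_printf(s, "%lu\n", *counter);
	return 0;
}
CEPH_DEFINE_SHOW_FUNC(counter_show)	/* generates counter_show_fops */

static struct dentry *demo_register(struct dentry *parent)
{
	return debugfs_create_file("counter", 0400, parent,
				   &demo_counter, &counter_show_fops);
}
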
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
deleted file mode 100644
index 3d25415afe63..000000000000
--- a/fs/ceph/decode.h
+++ /dev/null
@@ -1,196 +0,0 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
80
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 __be16 ss_family = htons(a->in_addr.ss_family);
103 a->in_addr.ss_family = *(__u16 *)&ss_family;
104}
105static inline void ceph_decode_addr(struct ceph_entity_addr *a)
106{
107 __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
108 a->in_addr.ss_family = ntohs(ss_family);
109 WARN_ON(a->in_addr.ss_family == 512);
110}
111
112/*
113 * encoders
114 */
115static inline void ceph_encode_64(void **p, u64 v)
116{
117 put_unaligned_le64(v, (__le64 *)*p);
118 *p += sizeof(u64);
119}
120static inline void ceph_encode_32(void **p, u32 v)
121{
122 put_unaligned_le32(v, (__le32 *)*p);
123 *p += sizeof(u32);
124}
125static inline void ceph_encode_16(void **p, u16 v)
126{
127 put_unaligned_le16(v, (__le16 *)*p);
128 *p += sizeof(u16);
129}
130static inline void ceph_encode_8(void **p, u8 v)
131{
132 *(u8 *)*p = v;
133 (*p)++;
134}
135static inline void ceph_encode_copy(void **p, const void *s, int len)
136{
137 memcpy(*p, s, len);
138 *p += len;
139}
140
141/*
142 * filepath, string encoders
143 */
144static inline void ceph_encode_filepath(void **p, void *end,
145 u64 ino, const char *path)
146{
147 u32 len = path ? strlen(path) : 0;
148 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
149 ceph_encode_8(p, 1);
150 ceph_encode_64(p, ino);
151 ceph_encode_32(p, len);
152 if (len)
153 memcpy(*p, path, len);
154 *p += len;
155}
156
157static inline void ceph_encode_string(void **p, void *end,
158 const char *s, u32 len)
159{
160 BUG_ON(*p + sizeof(len) + len > end);
161 ceph_encode_32(p, len);
162 if (len)
163 memcpy(*p, s, len);
164 *p += len;
165}
166
167#define ceph_encode_need(p, end, n, bad) \
168 do { \
169 if (unlikely(*(p) + (n) > (end))) \
170 goto bad; \
171 } while (0)
172
173#define ceph_encode_64_safe(p, end, v, bad) \
174 do { \
175 ceph_encode_need(p, end, sizeof(u64), bad); \
176 ceph_encode_64(p, v); \
177 } while (0)
178#define ceph_encode_32_safe(p, end, v, bad) \
179 do { \
180 ceph_encode_need(p, end, sizeof(u32), bad); \
181 ceph_encode_32(p, v); \
182 } while (0)
183#define ceph_encode_16_safe(p, end, v, bad) \
184 do { \
185 ceph_encode_need(p, end, sizeof(u16), bad); \
186 ceph_encode_16(p, v); \
187 } while (0)
188
189#define ceph_encode_copy_safe(p, end, pv, n, bad) \
190 do { \
191 ceph_encode_need(p, end, n, bad); \
192 ceph_encode_copy(p, pv, n); \
193 } while (0)
194
195
196#endif
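
decode.h is deleted here because it moves to <linux/ceph/decode.h>, as the #include updates elsewhere in this patch show; the helpers themselves are unchanged. For reference, a hypothetical consumer of the bounds-checked macros above (parse_blob and the -ERANGE choice are illustrative, not from the patch):

	static int parse_blob(void **p, void *end, void *buf, u32 buflen)
	{
		u32 len;

		ceph_decode_32_safe(p, end, len, bad);	/* length prefix */
		if (len > buflen)
			goto bad;
		ceph_decode_copy_safe(p, end, buf, len, bad);
		return 0;
	bad:
		return -ERANGE;	/* ran past 'end' or past the caller's buffer */
	}
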
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a1986eb52045..e0a2dc6fcafc 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/spinlock.h> 3#include <linux/spinlock.h>
4#include <linux/fs_struct.h> 4#include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8 8
9#include "super.h" 9#include "super.h"
10#include "mds_client.h"
10 11
11/* 12/*
12 * Directory operations: readdir, lookup, create, link, unlink, 13 * Directory operations: readdir, lookup, create, link, unlink,
@@ -94,10 +95,7 @@ static unsigned fpos_off(loff_t p)
94 */ 95 */
95static int __dcache_readdir(struct file *filp, 96static int __dcache_readdir(struct file *filp,
96 void *dirent, filldir_t filldir) 97 void *dirent, filldir_t filldir)
97 __releases(inode->i_lock)
98 __acquires(inode->i_lock)
99{ 98{
100 struct inode *inode = filp->f_dentry->d_inode;
101 struct ceph_file_info *fi = filp->private_data; 99 struct ceph_file_info *fi = filp->private_data;
102 struct dentry *parent = filp->f_dentry; 100 struct dentry *parent = filp->f_dentry;
103 struct inode *dir = parent->d_inode; 101 struct inode *dir = parent->d_inode;
@@ -153,7 +151,6 @@ more:
153 151
154 atomic_inc(&dentry->d_count); 152 atomic_inc(&dentry->d_count);
155 spin_unlock(&dcache_lock); 153 spin_unlock(&dcache_lock);
156 spin_unlock(&inode->i_lock);
157 154
158 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 155 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
159 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 156 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -171,35 +168,30 @@ more:
171 } else { 168 } else {
172 dput(last); 169 dput(last);
173 } 170 }
174 last = NULL;
175 } 171 }
176
177 spin_lock(&inode->i_lock);
178 spin_lock(&dcache_lock);
179
180 last = dentry; 172 last = dentry;
181 173
182 if (err < 0) 174 if (err < 0)
183 goto out_unlock; 175 goto out;
184 176
185 p = p->prev;
186 filp->f_pos++; 177 filp->f_pos++;
187 178
188 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ 179 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
189 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE)) 180 if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
190 goto more; 181 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
191 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); 182 err = -EAGAIN;
192 err = -EAGAIN; 183 goto out;
184 }
185
186 spin_lock(&dcache_lock);
187 p = p->prev; /* advance to next dentry */
188 goto more;
193 189
194out_unlock: 190out_unlock:
195 spin_unlock(&dcache_lock); 191 spin_unlock(&dcache_lock);
196 192out:
197 if (last) { 193 if (last)
198 spin_unlock(&inode->i_lock);
199 dput(last); 194 dput(last);
200 spin_lock(&inode->i_lock);
201 }
202
203 return err; 195 return err;
204} 196}
205 197
@@ -227,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
227 struct ceph_file_info *fi = filp->private_data; 219 struct ceph_file_info *fi = filp->private_data;
228 struct inode *inode = filp->f_dentry->d_inode; 220 struct inode *inode = filp->f_dentry->d_inode;
229 struct ceph_inode_info *ci = ceph_inode(inode); 221 struct ceph_inode_info *ci = ceph_inode(inode);
230 struct ceph_client *client = ceph_inode_to_client(inode); 222 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
231 struct ceph_mds_client *mdsc = &client->mdsc; 223 struct ceph_mds_client *mdsc = fsc->mdsc;
232 unsigned frag = fpos_frag(filp->f_pos); 224 unsigned frag = fpos_frag(filp->f_pos);
233 int off = fpos_off(filp->f_pos); 225 int off = fpos_off(filp->f_pos);
234 int err; 226 int err;
235 u32 ftype; 227 u32 ftype;
236 struct ceph_mds_reply_info_parsed *rinfo; 228 struct ceph_mds_reply_info_parsed *rinfo;
237 const int max_entries = client->mount_args->max_readdir; 229 const int max_entries = fsc->mount_options->max_readdir;
238 const int max_bytes = client->mount_args->max_readdir_bytes; 230 const int max_bytes = fsc->mount_options->max_readdir_bytes;
239 231
240 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 232 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
241 if (fi->at_end) 233 if (fi->at_end)
@@ -267,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
267 /* can we use the dcache? */ 259 /* can we use the dcache? */
268 spin_lock(&inode->i_lock); 260 spin_lock(&inode->i_lock);
269 if ((filp->f_pos == 2 || fi->dentry) && 261 if ((filp->f_pos == 2 || fi->dentry) &&
270 !ceph_test_opt(client, NOASYNCREADDIR) && 262 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
271 ceph_snap(inode) != CEPH_SNAPDIR && 263 ceph_snap(inode) != CEPH_SNAPDIR &&
272 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 264 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
273 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 spin_unlock(&inode->i_lock);
274 err = __dcache_readdir(filp, dirent, filldir); 267 err = __dcache_readdir(filp, dirent, filldir);
275 if (err != -EAGAIN) { 268 if (err != -EAGAIN)
276 spin_unlock(&inode->i_lock);
277 return err; 269 return err;
278 } 270 } else {
271 spin_unlock(&inode->i_lock);
279 } 272 }
280 spin_unlock(&inode->i_lock);
281 if (fi->dentry) { 273 if (fi->dentry) {
282 err = note_last_dentry(fi, fi->dentry->d_name.name, 274 err = note_last_dentry(fi, fi->dentry->d_name.name,
283 fi->dentry->d_name.len); 275 fi->dentry->d_name.len);
@@ -487,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
487struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 479struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
488 struct dentry *dentry, int err) 480 struct dentry *dentry, int err)
489{ 481{
490 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 482 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
491 struct inode *parent = dentry->d_parent->d_inode; 483 struct inode *parent = dentry->d_parent->d_inode;
492 484
493 /* .snap dir? */ 485 /* .snap dir? */
494 if (err == -ENOENT && 486 if (err == -ENOENT &&
495 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
496 strcmp(dentry->d_name.name, 487 strcmp(dentry->d_name.name,
497 client->mount_args->snapdir_name) == 0) { 488 fsc->mount_options->snapdir_name) == 0) {
498 struct inode *inode = ceph_get_snapdir(parent); 489 struct inode *inode = ceph_get_snapdir(parent);
499 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", 490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
500 dentry, dentry->d_name.len, dentry->d_name.name, inode); 491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -539,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
539static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, 530static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
540 struct nameidata *nd) 531 struct nameidata *nd)
541{ 532{
542 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 533 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
543 struct ceph_mds_client *mdsc = &client->mdsc; 534 struct ceph_mds_client *mdsc = fsc->mdsc;
544 struct ceph_mds_request *req; 535 struct ceph_mds_request *req;
545 int op; 536 int op;
546 int err; 537 int err;
@@ -572,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
572 spin_lock(&dir->i_lock); 563 spin_lock(&dir->i_lock);
573 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); 564 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
574 if (strncmp(dentry->d_name.name, 565 if (strncmp(dentry->d_name.name,
575 client->mount_args->snapdir_name, 566 fsc->mount_options->snapdir_name,
576 dentry->d_name.len) && 567 dentry->d_name.len) &&
577 !is_root_ceph_dentry(dir, dentry) && 568 !is_root_ceph_dentry(dir, dentry) &&
578 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 569 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -629,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
629static int ceph_mknod(struct inode *dir, struct dentry *dentry, 620static int ceph_mknod(struct inode *dir, struct dentry *dentry,
630 int mode, dev_t rdev) 621 int mode, dev_t rdev)
631{ 622{
632 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 623 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
633 struct ceph_mds_client *mdsc = &client->mdsc; 624 struct ceph_mds_client *mdsc = fsc->mdsc;
634 struct ceph_mds_request *req; 625 struct ceph_mds_request *req;
635 int err; 626 int err;
636 627
@@ -685,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
685static int ceph_symlink(struct inode *dir, struct dentry *dentry, 676static int ceph_symlink(struct inode *dir, struct dentry *dentry,
686 const char *dest) 677 const char *dest)
687{ 678{
688 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 679 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
689 struct ceph_mds_client *mdsc = &client->mdsc; 680 struct ceph_mds_client *mdsc = fsc->mdsc;
690 struct ceph_mds_request *req; 681 struct ceph_mds_request *req;
691 int err; 682 int err;
692 683
@@ -716,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
716 707
717static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) 708static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
718{ 709{
719 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 710 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
720 struct ceph_mds_client *mdsc = &client->mdsc; 711 struct ceph_mds_client *mdsc = fsc->mdsc;
721 struct ceph_mds_request *req; 712 struct ceph_mds_request *req;
722 int err = -EROFS; 713 int err = -EROFS;
723 int op; 714 int op;
@@ -758,8 +749,8 @@ out:
758static int ceph_link(struct dentry *old_dentry, struct inode *dir, 749static int ceph_link(struct dentry *old_dentry, struct inode *dir,
759 struct dentry *dentry) 750 struct dentry *dentry)
760{ 751{
761 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 752 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
762 struct ceph_mds_client *mdsc = &client->mdsc; 753 struct ceph_mds_client *mdsc = fsc->mdsc;
763 struct ceph_mds_request *req; 754 struct ceph_mds_request *req;
764 int err; 755 int err;
765 756
@@ -813,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode)
813 */ 804 */
814static int ceph_unlink(struct inode *dir, struct dentry *dentry) 805static int ceph_unlink(struct inode *dir, struct dentry *dentry)
815{ 806{
816 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 807 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
817 struct ceph_mds_client *mdsc = &client->mdsc; 808 struct ceph_mds_client *mdsc = fsc->mdsc;
818 struct inode *inode = dentry->d_inode; 809 struct inode *inode = dentry->d_inode;
819 struct ceph_mds_request *req; 810 struct ceph_mds_request *req;
820 int err = -EROFS; 811 int err = -EROFS;
@@ -854,8 +845,8 @@ out:
854static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, 845static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
855 struct inode *new_dir, struct dentry *new_dentry) 846 struct inode *new_dir, struct dentry *new_dentry)
856{ 847{
857 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb); 848 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
858 struct ceph_mds_client *mdsc = &client->mdsc; 849 struct ceph_mds_client *mdsc = fsc->mdsc;
859 struct ceph_mds_request *req; 850 struct ceph_mds_request *req;
860 int err; 851 int err;
861 852
@@ -1076,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1076 struct ceph_inode_info *ci = ceph_inode(inode); 1067 struct ceph_inode_info *ci = ceph_inode(inode);
1077 int left; 1068 int left;
1078 1069
1079 if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) 1070 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1080 return -EISDIR; 1071 return -EISDIR;
1081 1072
1082 if (!cf->dir_info) { 1073 if (!cf->dir_info) {
@@ -1177,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
1177 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1168 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1178 dn->d_name.len, dn->d_name.name); 1169 dn->d_name.len, dn->d_name.name);
1179 if (di) { 1170 if (di) {
1180 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1171 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1181 spin_lock(&mdsc->dentry_lru_lock); 1172 spin_lock(&mdsc->dentry_lru_lock);
1182 list_add_tail(&di->lru, &mdsc->dentry_lru); 1173 list_add_tail(&di->lru, &mdsc->dentry_lru);
1183 mdsc->num_dentry++; 1174 mdsc->num_dentry++;
@@ -1193,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
1193 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, 1184 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1194 dn->d_name.len, dn->d_name.name, di->offset); 1185 dn->d_name.len, dn->d_name.name, di->offset);
1195 if (di) { 1186 if (di) {
1196 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1187 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1197 spin_lock(&mdsc->dentry_lru_lock); 1188 spin_lock(&mdsc->dentry_lru_lock);
1198 list_move_tail(&di->lru, &mdsc->dentry_lru); 1189 list_move_tail(&di->lru, &mdsc->dentry_lru);
1199 spin_unlock(&mdsc->dentry_lru_lock); 1190 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1208,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
1208 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1199 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1209 dn->d_name.len, dn->d_name.name); 1200 dn->d_name.len, dn->d_name.name);
1210 if (di) { 1201 if (di) {
1211 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1202 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1212 spin_lock(&mdsc->dentry_lru_lock); 1203 spin_lock(&mdsc->dentry_lru_lock);
1213 list_del_init(&di->lru); 1204 list_del_init(&di->lru);
1214 mdsc->num_dentry--; 1205 mdsc->num_dentry--;
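
The repeated `&client->mdsc` to `fsc->mdsc` substitutions in dir.c reflect the split this series introduces: generic cluster state lives in a libceph `struct ceph_client`, while the filesystem keeps its own `struct ceph_fs_client` wrapper. A rough sketch of the assumed layout, limited to fields this diff actually touches:

	struct ceph_fs_client {
		struct super_block *sb;
		struct ceph_mount_options *mount_options;

		struct ceph_client *client;	/* shared core: monc, osdc, msgr */
		struct ceph_mds_client *mdsc;	/* fs-only; now heap-allocated */
		/* plus mount state, min_caps, debugfs dentries, ... */
	};
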
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 4480cb1c63e7..2297d9426992 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/exportfs.h> 3#include <linux/exportfs.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <asm/unaligned.h> 5#include <asm/unaligned.h>
6 6
7#include "super.h" 7#include "super.h"
8#include "mds_client.h"
8 9
9/* 10/*
10 * NFS export support 11 * NFS export support
@@ -42,32 +43,37 @@ struct ceph_nfs_confh {
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, 43static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable) 44 int connectable)
44{ 45{
46 int type;
45 struct ceph_nfs_fh *fh = (void *)rawfh; 47 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh; 48 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent; 49 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode; 50 struct inode *inode = dentry->d_inode;
49 int type; 51 int connected_handle_length = sizeof(*cfh)/4;
52 int handle_length = sizeof(*fh)/4;
50 53
51 /* don't re-export snaps */ 54 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP) 55 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL; 56 return -EINVAL;
54 57
55 if (*max_len >= sizeof(*cfh)) { 58 if (*max_len >= connected_handle_length) {
56 dout("encode_fh %p connectable\n", dentry); 59 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode); 60 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode); 61 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash; 62 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh); 63 *max_len = connected_handle_length;
61 type = 2; 64 type = 2;
62 } else if (*max_len > sizeof(*fh)) { 65 } else if (*max_len >= handle_length) {
63 if (connectable) 66 if (connectable) {
64 return -ENOSPC; 67 *max_len = connected_handle_length;
68 return 255;
69 }
65 dout("encode_fh %p\n", dentry); 70 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode); 71 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh); 72 *max_len = handle_length;
68 type = 1; 73 type = 1;
69 } else { 74 } else {
70 return -ENOSPC; 75 *max_len = handle_length;
76 return 255;
71 } 77 }
72 return type; 78 return type;
73} 79}
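
The 255 return follows the exportfs convention for an undersized buffer: the caller sees handle type 255 and finds the required length (in 4-byte words) written back through *max_len. A hypothetical caller-side sketch of that contract (exportfs_encode is a stand-in wrapper, not a real symbol):

	u32 rawfh[8];
	int max_len = 1;	/* deliberately too small */
	int type = exportfs_encode(dentry, rawfh, &max_len, 0);

	if (type == 255 && max_len <= ARRAY_SIZE(rawfh))
		type = exportfs_encode(dentry, rawfh, &max_len, 0);	/* retry */
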
@@ -115,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
115static struct dentry *__cfh_to_dentry(struct super_block *sb, 121static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh) 122 struct ceph_nfs_confh *cfh)
117{ 123{
118 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; 124 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
119 struct inode *inode; 125 struct inode *inode;
120 struct dentry *dentry; 126 struct dentry *dentry;
121 struct ceph_vino vino; 127 struct ceph_vino vino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8c044a4f0457..e77c28cf3690 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h>
3#include <linux/sched.h> 4#include <linux/sched.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/file.h> 6#include <linux/file.h>
@@ -38,8 +39,8 @@
38static struct ceph_mds_request * 39static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode) 40prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{ 41{
41 struct ceph_client *client = ceph_sb_to_client(sb); 42 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc; 43 struct ceph_mds_client *mdsc = fsc->mdsc;
43 struct ceph_mds_request *req; 44 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS; 45 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; 46 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
117int ceph_open(struct inode *inode, struct file *file) 118int ceph_open(struct inode *inode, struct file *file)
118{ 119{
119 struct ceph_inode_info *ci = ceph_inode(inode); 120 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb); 121 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc; 122 struct ceph_mds_client *mdsc = fsc->mdsc;
122 struct ceph_mds_request *req; 123 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data; 124 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode; 125 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode, 217 struct nameidata *nd, int mode,
217 int locked_dir) 218 int locked_dir)
218{ 219{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 220 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc; 221 struct ceph_mds_client *mdsc = fsc->mdsc;
221 struct file *file = nd->intent.open.file; 222 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); 223 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req; 224 struct ceph_mds_request *req;
@@ -270,163 +271,6 @@ int ceph_release(struct inode *inode, struct file *file)
270} 271}
271 272
272/* 273/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
298
299static void put_page_vector(struct page **pages, int num_pages)
300{
301 int i;
302
303 for (i = 0; i < num_pages; i++)
304 put_page(pages[i]);
305 kfree(pages);
306}
307
308void ceph_release_page_vector(struct page **pages, int num_pages)
309{
310 int i;
311
312 for (i = 0; i < num_pages; i++)
313 __free_pages(pages[i], 0);
314 kfree(pages);
315}
316
317/*
 318 * allocate a vector of new pages
319 */
320static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = __page_cache_alloc(flags);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
365
366/*
 367 * copy data from a page vector into a user pointer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
427
428
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over 274 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.) 275 * objects we stripe over. (That's not atomic, but good enough for now.)
432 * 276 *
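
The helpers removed above do not disappear; they move into libceph with ceph_ prefixes (ceph_get_direct_page_vector(), ceph_put_page_vector(), ceph_copy_user_to_page_vector(), and so on), as the updated call sites below show. A minimal sketch of the renamed O_DIRECT pair under that assumption:

	static ssize_t direct_io_sketch(const char __user *data, loff_t off,
					size_t len)
	{
		int num_pages = calc_pages_for(off, len);
		struct page **pages;

		pages = ceph_get_direct_page_vector(data, num_pages, off, len);
		if (IS_ERR(pages))
			return PTR_ERR(pages);
		/* ... issue the striped OSD read or write into the pinned pages ... */
		ceph_put_page_vector(pages, num_pages);	/* drop refs, free array */
		return len;
	}
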
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode,
438 struct page **pages, int num_pages, 282 struct page **pages, int num_pages,
439 int *checkeof) 283 int *checkeof)
440{ 284{
441 struct ceph_client *client = ceph_inode_to_client(inode); 285 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode); 286 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len; 287 u64 pos, this_len;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ 288 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
@@ -459,7 +303,7 @@ static int striped_read(struct inode *inode,
459 303
460more: 304more:
461 this_len = left; 305 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), 306 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len, 307 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq, 308 ci->i_truncate_seq,
465 ci->i_truncate_size, 309 ci->i_truncate_size,
@@ -477,8 +321,8 @@ more:
477 321
478 if (read < pos - off) { 322 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos); 323 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read, 324 ceph_zero_page_vector_range(page_off + read,
481 pos - off - read, pages); 325 pos - off - read, pages);
482 } 326 }
483 pos += ret; 327 pos += ret;
484 read = pos - off; 328 read = pos - off;
@@ -495,8 +339,8 @@ more:
495 /* was original extent fully inside i_size? */ 339 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) { 340 if (pos + left <= inode->i_size) {
497 dout("zero tail\n"); 341 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read, 342 ceph_zero_page_vector_range(page_off + read, len - read,
499 pages); 343 pages);
500 read = len; 344 read = len;
501 goto out; 345 goto out;
502 } 346 }
@@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 375 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532 376
533 if (file->f_flags & O_DIRECT) { 377 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len); 378 pages = ceph_get_direct_page_vector(data, num_pages, off, len);
535 379
536 /* 380 /*
537 * flush any page cache pages in this range. this 381 * flush any page cache pages in this range. this
@@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof); 396 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
553 397
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 398 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret); 399 ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0) 400 if (ret >= 0)
557 *poff = off + ret; 401 *poff = off + ret;
558 402
559done: 403done:
560 if (file->f_flags & O_DIRECT) 404 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages); 405 ceph_put_page_vector(pages, num_pages);
562 else 406 else
563 ceph_release_page_vector(pages, num_pages); 407 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret); 408 dout("sync_read result %d\n", ret);
@@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
594{ 438{
595 struct inode *inode = file->f_dentry->d_inode; 439 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode); 440 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode); 441 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req; 442 struct ceph_osd_request *req;
599 struct page **pages; 443 struct page **pages;
600 int num_pages; 444 int num_pages;
@@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
642 */ 486 */
643more: 487more:
644 len = left; 488 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, 489 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len, 490 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags, 491 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context, 492 ci->i_snap_realm->cached_context,
@@ -655,7 +499,7 @@ more:
655 num_pages = calc_pages_for(pos, len); 499 num_pages = calc_pages_for(pos, len);
656 500
657 if (file->f_flags & O_DIRECT) { 501 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len); 502 pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
659 if (IS_ERR(pages)) { 503 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages); 504 ret = PTR_ERR(pages);
661 goto out; 505 goto out;
@@ -673,7 +517,7 @@ more:
673 ret = PTR_ERR(pages); 517 ret = PTR_ERR(pages);
674 goto out; 518 goto out;
675 } 519 }
676 ret = copy_user_to_page_vector(pages, data, pos, len); 520 ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
677 if (ret < 0) { 521 if (ret < 0) {
678 ceph_release_page_vector(pages, num_pages); 522 ceph_release_page_vector(pages, num_pages);
679 goto out; 523 goto out;
@@ -689,7 +533,7 @@ more:
689 req->r_num_pages = num_pages; 533 req->r_num_pages = num_pages;
690 req->r_inode = inode; 534 req->r_inode = inode;
691 535
692 ret = ceph_osdc_start_request(&client->osdc, req, false); 536 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
693 if (!ret) { 537 if (!ret) {
694 if (req->r_safe_callback) { 538 if (req->r_safe_callback) {
695 /* 539 /*
@@ -697,15 +541,15 @@ more:
697 * start_request so that a tid has been assigned. 541 * start_request so that a tid has been assigned.
698 */ 542 */
699 spin_lock(&ci->i_unsafe_lock); 543 spin_lock(&ci->i_unsafe_lock);
700 list_add(&ci->i_unsafe_writes, &req->r_unsafe_item); 544 list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
701 spin_unlock(&ci->i_unsafe_lock); 545 spin_unlock(&ci->i_unsafe_lock);
702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 546 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
703 } 547 }
704 ret = ceph_osdc_wait_request(&client->osdc, req); 548 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
705 } 549 }
706 550
707 if (file->f_flags & O_DIRECT) 551 if (file->f_flags & O_DIRECT)
708 put_page_vector(pages, num_pages); 552 ceph_put_page_vector(pages, num_pages);
709 else if (file->f_flags & O_SYNC) 553 else if (file->f_flags & O_SYNC)
710 ceph_release_page_vector(pages, num_pages); 554 ceph_release_page_vector(pages, num_pages);
711 555
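
One real bug fix hides in this hunk: list_add() takes the new entry first and the list head second, so the old call spliced the inode's i_unsafe_writes head onto the request instead of queueing the request on the list. The corrected shape in isolation (struct req is a stand-in):

	#include <linux/list.h>

	struct req {
		struct list_head r_unsafe_item;
	};

	static LIST_HEAD(unsafe_writes);	/* the list head */

	static void track_unsafe(struct req *r)
	{
		/* new entry first, head second */
		list_add(&r->r_unsafe_item, &unsafe_writes);
	}
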
@@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
814 struct ceph_file_info *fi = file->private_data; 658 struct ceph_file_info *fi = file->private_data;
815 struct inode *inode = file->f_dentry->d_inode; 659 struct inode *inode = file->f_dentry->d_inode;
816 struct ceph_inode_info *ci = ceph_inode(inode); 660 struct ceph_inode_info *ci = ceph_inode(inode);
817 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 661 struct ceph_osd_client *osdc =
662 &ceph_sb_to_client(inode->i_sb)->client->osdc;
818 loff_t endoff = pos + iov->iov_len; 663 loff_t endoff = pos + iov->iov_len;
819 int want, got = 0; 664 int want, got = 0;
820 int ret, err; 665 int ret, err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 62377ec37edf..1d6a45b5a04c 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
@@ -13,7 +13,8 @@
13#include <linux/pagevec.h> 13#include <linux/pagevec.h>
14 14
15#include "super.h" 15#include "super.h"
16#include "decode.h" 16#include "mds_client.h"
17#include <linux/ceph/decode.h>
17 18
18/* 19/*
19 * Ceph inode operations 20 * Ceph inode operations
@@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 385 */
385 if (ci->i_snap_realm) { 386 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 387 struct ceph_mds_client *mdsc =
387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 388 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 389 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 390
390 dout(" dropping residual ref to snap realm %p\n", realm); 391 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode,
685 } 686 }
686 687
687 /* it may be better to set st_size in getattr instead? */ 688 /* it may be better to set st_size in getattr instead? */
688 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) 689 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
689 inode->i_size = ci->i_rbytes; 690 inode->i_size = ci->i_rbytes;
690 break; 691 break;
691 default: 692 default:
@@ -901,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
901 struct inode *in = NULL; 902 struct inode *in = NULL;
902 struct ceph_mds_reply_inode *ininfo; 903 struct ceph_mds_reply_inode *ininfo;
903 struct ceph_vino vino; 904 struct ceph_vino vino;
904 struct ceph_client *client = ceph_sb_to_client(sb); 905 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
905 int i = 0; 906 int i = 0;
906 int err = 0; 907 int err = 0;
907 908
@@ -965,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
965 */ 966 */
966 if (rinfo->head->is_dentry && !req->r_aborted && 967 if (rinfo->head->is_dentry && !req->r_aborted &&
967 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, 968 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
968 client->mount_args->snapdir_name, 969 fsc->mount_options->snapdir_name,
969 req->r_dentry->d_name.len))) { 970 req->r_dentry->d_name.len))) {
970 /* 971 /*
971 * lookup link rename : null -> possibly existing inode 972 * lookup link rename : null -> possibly existing inode
@@ -1533,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1533 struct inode *parent_inode = dentry->d_parent->d_inode; 1534 struct inode *parent_inode = dentry->d_parent->d_inode;
1534 const unsigned int ia_valid = attr->ia_valid; 1535 const unsigned int ia_valid = attr->ia_valid;
1535 struct ceph_mds_request *req; 1536 struct ceph_mds_request *req;
1536 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; 1537 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
1537 int issued; 1538 int issued;
1538 int release = 0, dirtied = 0; 1539 int release = 0, dirtied = 0;
1539 int mask = 0; 1540 int mask = 0;
@@ -1728,8 +1729,8 @@ out:
1728 */ 1729 */
1729int ceph_do_getattr(struct inode *inode, int mask) 1730int ceph_do_getattr(struct inode *inode, int mask)
1730{ 1731{
1731 struct ceph_client *client = ceph_sb_to_client(inode->i_sb); 1732 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
1732 struct ceph_mds_client *mdsc = &client->mdsc; 1733 struct ceph_mds_client *mdsc = fsc->mdsc;
1733 struct ceph_mds_request *req; 1734 struct ceph_mds_request *req;
1734 int err; 1735 int err;
1735 1736
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 76e307d2aba1..8888c9ba68db 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
1#include <linux/in.h> 1#include <linux/in.h>
2 2
3#include "ioctl.h"
4#include "super.h" 3#include "super.h"
5#include "ceph_debug.h" 4#include "mds_client.h"
5#include <linux/ceph/ceph_debug.h>
6
7#include "ioctl.h"
6 8
7 9
8/* 10/*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{ 39{
38 struct inode *inode = file->f_dentry->d_inode; 40 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode; 41 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 42 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req; 43 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l; 44 struct ceph_ioctl_layout l;
43 int err, i; 45 int err, i;
@@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
90} 92}
91 93
92/* 94/*
 95 * Set a layout policy on a directory inode. All items in the tree
 96 * rooted at this inode will inherit this layout on creation
 97 * (it does not apply retroactively to existing files), unless a
 98 * subdirectory has its own layout policy.
99 */
100static long ceph_ioctl_set_layout_policy(struct file *file, void __user *arg)
101{
102 struct inode *inode = file->f_dentry->d_inode;
103 struct ceph_mds_request *req;
104 struct ceph_ioctl_layout l;
105 int err, i;
106 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
107
108 /* copy and validate */
109 if (copy_from_user(&l, arg, sizeof(l)))
110 return -EFAULT;
111
112 if ((l.object_size & ~PAGE_MASK) ||
113 (l.stripe_unit & ~PAGE_MASK) ||
114 !l.stripe_unit ||
115 (l.object_size &&
116 (unsigned)l.object_size % (unsigned)l.stripe_unit))
117 return -EINVAL;
118
119 /* make sure it's a valid data pool */
120 if (l.data_pool > 0) {
121 mutex_lock(&mdsc->mutex);
122 err = -EINVAL;
123 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
124 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
125 err = 0;
126 break;
127 }
128 mutex_unlock(&mdsc->mutex);
129 if (err)
130 return err;
131 }
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
134 USE_AUTH_MDS);
135
136 if (IS_ERR(req))
137 return PTR_ERR(req);
138 req->r_inode = igrab(inode);
139
140 req->r_args.setlayout.layout.fl_stripe_unit =
141 cpu_to_le32(l.stripe_unit);
142 req->r_args.setlayout.layout.fl_stripe_count =
143 cpu_to_le32(l.stripe_count);
144 req->r_args.setlayout.layout.fl_object_size =
145 cpu_to_le32(l.object_size);
146 req->r_args.setlayout.layout.fl_pg_pool =
147 cpu_to_le32(l.data_pool);
148 req->r_args.setlayout.layout.fl_pg_preferred =
149 cpu_to_le32(l.preferred_osd);
150
151 err = ceph_mdsc_do_request(mdsc, inode, req);
152 ceph_mdsc_put_request(req);
153 return err;
154}
155
156/*
93 * Return object name, size/offset information, and location (OSD 157 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset. 158 * number, network address) for a given file offset.
95 */ 159 */
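
Spelling out the validation above: both sizes must be page-aligned, stripe_unit must be non-zero, and a non-zero object_size must be a whole multiple of stripe_unit. A hypothetical restatement with worked values, assuming 4 KB pages:

	static bool layout_args_valid(const struct ceph_ioctl_layout *l)
	{
		if ((l->object_size & ~PAGE_MASK) ||
		    (l->stripe_unit & ~PAGE_MASK))
			return false;	/* e.g. stripe_unit 6000: not aligned */
		if (!l->stripe_unit)
			return false;
		if (l->object_size &&
		    (unsigned)l->object_size % (unsigned)l->stripe_unit)
			return false;	/* e.g. 100000 % 65536 != 0 */
		return true;		/* e.g. unit 65536, size 4194304 */
	}
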
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 162 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 163 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 164 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 165 struct ceph_osd_client *osdc =
166 &ceph_sb_to_client(inode->i_sb)->client->osdc;
102 u64 len = 1, olen; 167 u64 len = 1, olen;
103 u64 tmp; 168 u64 tmp;
104 struct ceph_object_layout ol; 169 struct ceph_object_layout ol;
@@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
174 case CEPH_IOC_SET_LAYOUT: 239 case CEPH_IOC_SET_LAYOUT:
175 return ceph_ioctl_set_layout(file, (void __user *)arg); 240 return ceph_ioctl_set_layout(file, (void __user *)arg);
176 241
242 case CEPH_IOC_SET_LAYOUT_POLICY:
243 return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
244
177 case CEPH_IOC_GET_DATALOC: 245 case CEPH_IOC_GET_DATALOC:
178 return ceph_ioctl_get_dataloc(file, (void __user *)arg); 246 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
179 247
180 case CEPH_IOC_LAZYIO: 248 case CEPH_IOC_LAZYIO:
181 return ceph_ioctl_lazyio(file); 249 return ceph_ioctl_lazyio(file);
182 } 250 }
251
183 return -ENOTTY; 252 return -ENOTTY;
184} 253}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 88451a3b6857..a6ce54e94eb5 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
4#include <linux/ioctl.h> 4#include <linux/ioctl.h>
5#include <linux/types.h> 5#include <linux/types.h>
6 6
7#define CEPH_IOCTL_MAGIC 0x97 7#define CEPH_IOCTL_MAGIC 0x98
8 8
9/* just use u64 to align sanely on all archs */ 9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout { 10struct ceph_ioctl_layout {
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
17 struct ceph_ioctl_layout) 17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ 18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout) 19 struct ceph_ioctl_layout)
20#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
21 struct ceph_ioctl_layout)
20 22
21/* 23/*
22 * Extract identity, address of the OSD and object storing a given 24 * Extract identity, address of the OSD and object storing a given
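
A hypothetical userspace caller of the new ioctl; the field values are illustrative, and a data_pool of 0 skips the pool-existence check in the handler:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include "ioctl.h"	/* CEPH_IOC_SET_LAYOUT_POLICY, struct ceph_ioctl_layout */

	int set_dir_layout_policy(const char *dir)
	{
		struct ceph_ioctl_layout l = {
			.stripe_unit  = 4194304,	/* 4 MB, page-aligned */
			.stripe_count = 1,
			.object_size  = 4194304,	/* multiple of stripe_unit */
			.data_pool    = 0,		/* 0: keep default pool */
		};
		int fd = open(dir, O_RDONLY);
		int ret;

		if (fd < 0)
			return -1;
		ret = ioctl(fd, CEPH_IOC_SET_LAYOUT_POLICY, &l);
		close(fd);
		return ret;
	}
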
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ff4e753aae92..40abde93c345 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,11 +1,11 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/namei.h> 4#include <linux/namei.h>
5 5
6#include "super.h" 6#include "super.h"
7#include "mds_client.h" 7#include "mds_client.h"
8#include "pagelist.h" 8#include <linux/ceph/pagelist.h>
9 9
10/** 10/**
11 * Implement fcntl and flock locking functions. 11 * Implement fcntl and flock locking functions.
@@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
16{ 16{
17 struct inode *inode = file->f_dentry->d_inode; 17 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc = 18 struct ceph_mds_client *mdsc =
19 &ceph_sb_to_client(inode->i_sb)->mdsc; 19 ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req; 20 struct ceph_mds_request *req;
21 int err; 21 int err;
22 22
@@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
181 * Encode the flock and fcntl locks for the given inode into the pagelist. 181 * Encode the flock and fcntl locks for the given inode into the pagelist.
182 * Format is: #fcntl locks, sequential fcntl locks, #flock locks, 182 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
183 * sequential flock locks. 183 * sequential flock locks.
184 * Must be called with BLK already held, and the lock numbers should have 184 * Must be called with lock_flocks() already held.
185 * been gathered under the same lock holding window. 185 * If we encounter more of a specific lock type than expected,
 186 * we return -ENOSPC.
186 */ 187 */
187int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, 188int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
188 int num_fcntl_locks, int num_flock_locks) 189 int num_fcntl_locks, int num_flock_locks)
@@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
190 struct file_lock *lock; 191 struct file_lock *lock;
191 struct ceph_filelock cephlock; 192 struct ceph_filelock cephlock;
192 int err = 0; 193 int err = 0;
194 int seen_fcntl = 0;
195 int seen_flock = 0;
193 196
194 dout("encoding %d flock and %d fcntl locks", num_flock_locks, 197 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
195 num_fcntl_locks); 198 num_fcntl_locks);
@@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
198 goto fail; 201 goto fail;
199 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 202 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
200 if (lock->fl_flags & FL_POSIX) { 203 if (lock->fl_flags & FL_POSIX) {
204 ++seen_fcntl;
205 if (seen_fcntl > num_fcntl_locks) {
206 err = -ENOSPC;
207 goto fail;
208 }
201 err = lock_to_ceph_filelock(lock, &cephlock); 209 err = lock_to_ceph_filelock(lock, &cephlock);
202 if (err) 210 if (err)
203 goto fail; 211 goto fail;
@@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
213 goto fail; 221 goto fail;
214 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 222 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
215 if (lock->fl_flags & FL_FLOCK) { 223 if (lock->fl_flags & FL_FLOCK) {
224 ++seen_flock;
225 if (seen_flock > num_flock_locks) {
226 err = -ENOSPC;
227 goto fail;
228 }
216 err = lock_to_ceph_filelock(lock, &cephlock); 229 err = lock_to_ceph_filelock(lock, &cephlock);
217 if (err) 230 if (err)
218 goto fail; 231 goto fail;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fad95f8f2608..3142b15940c2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,17 +1,21 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/fs.h>
3#include <linux/wait.h> 4#include <linux/wait.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
6#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
7 10
8#include "mds_client.h"
9#include "mon_client.h"
10#include "super.h" 11#include "super.h"
11#include "messenger.h" 12#include "mds_client.h"
12#include "decode.h" 13
13#include "auth.h" 14#include <linux/ceph/messenger.h>
14#include "pagelist.h" 15#include <linux/ceph/decode.h>
16#include <linux/ceph/pagelist.h>
17#include <linux/ceph/auth.h>
18#include <linux/ceph/debugfs.h>
15 19
16/* 20/*
17 * A cluster of MDS (metadata server) daemons is responsible for 21 * A cluster of MDS (metadata server) daemons is responsible for
@@ -286,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
286 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); 290 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
287 if (atomic_dec_and_test(&s->s_ref)) { 291 if (atomic_dec_and_test(&s->s_ref)) {
288 if (s->s_authorizer) 292 if (s->s_authorizer)
289 s->s_mdsc->client->monc.auth->ops->destroy_authorizer( 293 s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
290 s->s_mdsc->client->monc.auth, s->s_authorizer); 294 s->s_mdsc->fsc->client->monc.auth,
295 s->s_authorizer);
291 kfree(s); 296 kfree(s);
292 } 297 }
293} 298}
@@ -344,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
344 s->s_seq = 0; 349 s->s_seq = 0;
345 mutex_init(&s->s_mutex); 350 mutex_init(&s->s_mutex);
346 351
347 ceph_con_init(mdsc->client->msgr, &s->s_con); 352 ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
348 s->s_con.private = s; 353 s->s_con.private = s;
349 s->s_con.ops = &mds_con_ops; 354 s->s_con.ops = &mds_con_ops;
350 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; 355 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -599,7 +604,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
599 } else if (req->r_dentry) { 604 } else if (req->r_dentry) {
600 struct inode *dir = req->r_dentry->d_parent->d_inode; 605 struct inode *dir = req->r_dentry->d_parent->d_inode;
601 606
602 if (dir->i_sb != mdsc->client->sb) { 607 if (dir->i_sb != mdsc->fsc->sb) {
603 /* not this fs! */ 608 /* not this fs! */
604 inode = req->r_dentry->d_inode; 609 inode = req->r_dentry->d_inode;
605 } else if (ceph_snap(dir) != CEPH_NOSNAP) { 610 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -884,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
884 __ceph_remove_cap(cap); 889 __ceph_remove_cap(cap);
885 if (!__ceph_is_any_real_caps(ci)) { 890 if (!__ceph_is_any_real_caps(ci)) {
886 struct ceph_mds_client *mdsc = 891 struct ceph_mds_client *mdsc =
887 &ceph_sb_to_client(inode->i_sb)->mdsc; 892 ceph_sb_to_client(inode->i_sb)->mdsc;
888 893
889 spin_lock(&mdsc->cap_dirty_lock); 894 spin_lock(&mdsc->cap_dirty_lock);
890 if (!list_empty(&ci->i_dirty_item)) { 895 if (!list_empty(&ci->i_dirty_item)) {
@@ -1146,7 +1151,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1146 struct ceph_msg *msg, *partial = NULL; 1151 struct ceph_msg *msg, *partial = NULL;
1147 struct ceph_mds_cap_release *head; 1152 struct ceph_mds_cap_release *head;
1148 int err = -ENOMEM; 1153 int err = -ENOMEM;
1149 int extra = mdsc->client->mount_args->cap_release_safety; 1154 int extra = mdsc->fsc->mount_options->cap_release_safety;
1150 int num; 1155 int num;
1151 1156
1152 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, 1157 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
@@ -2085,7 +2090,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2085 2090
2086 /* insert trace into our cache */ 2091 /* insert trace into our cache */
2087 mutex_lock(&req->r_fill_mutex); 2092 mutex_lock(&req->r_fill_mutex);
2088 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 2093 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2089 if (err == 0) { 2094 if (err == 0) {
2090 if (result == 0 && rinfo->dir_nr) 2095 if (result == 0 && rinfo->dir_nr)
2091 ceph_readdir_prepopulate(req, req->r_session); 2096 ceph_readdir_prepopulate(req, req->r_session);
@@ -2361,19 +2366,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2361 2366
2362 if (recon_state->flock) { 2367 if (recon_state->flock) {
2363 int num_fcntl_locks, num_flock_locks; 2368 int num_fcntl_locks, num_flock_locks;
2364 2369 struct ceph_pagelist_cursor trunc_point;
2365 lock_kernel(); 2370
2366 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2371 ceph_pagelist_set_cursor(pagelist, &trunc_point);
2367 rec.v2.flock_len = (2*sizeof(u32) + 2372 do {
2368 (num_fcntl_locks+num_flock_locks) * 2373 lock_flocks();
2369 sizeof(struct ceph_filelock)); 2374 ceph_count_locks(inode, &num_fcntl_locks,
2370 2375 &num_flock_locks);
2371 err = ceph_pagelist_append(pagelist, &rec, reclen); 2376 rec.v2.flock_len = (2*sizeof(u32) +
2372 if (!err) 2377 (num_fcntl_locks+num_flock_locks) *
2373 err = ceph_encode_locks(inode, pagelist, 2378 sizeof(struct ceph_filelock));
2374 num_fcntl_locks, 2379 unlock_flocks();
2375 num_flock_locks); 2380
2376 unlock_kernel(); 2381 /* pre-alloc pagelist */
2382 ceph_pagelist_truncate(pagelist, &trunc_point);
2383 err = ceph_pagelist_append(pagelist, &rec, reclen);
2384 if (!err)
2385 err = ceph_pagelist_reserve(pagelist,
2386 rec.v2.flock_len);
2387
2388 /* encode locks */
2389 if (!err) {
2390 lock_flocks();
2391 err = ceph_encode_locks(inode,
2392 pagelist,
2393 num_fcntl_locks,
2394 num_flock_locks);
2395 unlock_flocks();
2396 }
2397 } while (err == -ENOSPC);
2377 } else { 2398 } else {
2378 err = ceph_pagelist_append(pagelist, &rec, reclen); 2399 err = ceph_pagelist_append(pagelist, &rec, reclen);
2379 } 2400 }
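
The do/while above closes a race: the counts are taken under lock_flocks(), the lock is dropped to reserve pagelist space, and by the time the locks are re-taken for encoding more locks may exist. ceph_encode_locks() reports that as -ENOSPC (per the updated comment in locks.c), and the saved cursor lets each retry discard the partial record first. The bare pattern, with encode_state() standing in for the count/reserve/encode sequence:

	struct ceph_pagelist_cursor trunc_point;
	int err;

	ceph_pagelist_set_cursor(pagelist, &trunc_point);
	do {
		/* discard whatever a failed attempt appended */
		ceph_pagelist_truncate(pagelist, &trunc_point);
		err = encode_state(pagelist);	/* hypothetical; may hit -ENOSPC */
	} while (err == -ENOSPC);
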
@@ -2613,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2613 struct ceph_mds_session *session, 2634 struct ceph_mds_session *session,
2614 struct ceph_msg *msg) 2635 struct ceph_msg *msg)
2615{ 2636{
2616 struct super_block *sb = mdsc->client->sb; 2637 struct super_block *sb = mdsc->fsc->sb;
2617 struct inode *inode; 2638 struct inode *inode;
2618 struct ceph_inode_info *ci; 2639 struct ceph_inode_info *ci;
2619 struct dentry *parent, *dentry; 2640 struct dentry *parent, *dentry;
@@ -2891,10 +2912,16 @@ static void delayed_work(struct work_struct *work)
2891 schedule_delayed(mdsc); 2912 schedule_delayed(mdsc);
2892} 2913}
2893 2914
2915int ceph_mdsc_init(struct ceph_fs_client *fsc)
2894 2916
2895int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2896{ 2917{
2897 mdsc->client = client; 2918 struct ceph_mds_client *mdsc;
2919
2920 mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
2921 if (!mdsc)
2922 return -ENOMEM;
2923 mdsc->fsc = fsc;
2924 fsc->mdsc = mdsc;
2898 mutex_init(&mdsc->mutex); 2925 mutex_init(&mdsc->mutex);
2899 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2926 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2900 if (mdsc->mdsmap == NULL) 2927 if (mdsc->mdsmap == NULL)
@@ -2927,7 +2954,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2927 INIT_LIST_HEAD(&mdsc->dentry_lru); 2954 INIT_LIST_HEAD(&mdsc->dentry_lru);
2928 2955
2929 ceph_caps_init(mdsc); 2956 ceph_caps_init(mdsc);
2930 ceph_adjust_min_caps(mdsc, client->min_caps); 2957 ceph_adjust_min_caps(mdsc, fsc->min_caps);
2931 2958
2932 return 0; 2959 return 0;
2933} 2960}
@@ -2939,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2939static void wait_requests(struct ceph_mds_client *mdsc) 2966static void wait_requests(struct ceph_mds_client *mdsc)
2940{ 2967{
2941 struct ceph_mds_request *req; 2968 struct ceph_mds_request *req;
2942 struct ceph_client *client = mdsc->client; 2969 struct ceph_fs_client *fsc = mdsc->fsc;
2943 2970
2944 mutex_lock(&mdsc->mutex); 2971 mutex_lock(&mdsc->mutex);
2945 if (__get_oldest_req(mdsc)) { 2972 if (__get_oldest_req(mdsc)) {
@@ -2947,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
2947 2974
2948 dout("wait_requests waiting for requests\n"); 2975 dout("wait_requests waiting for requests\n");
2949 wait_for_completion_timeout(&mdsc->safe_umount_waiters, 2976 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2950 client->mount_args->mount_timeout * HZ); 2977 fsc->client->options->mount_timeout * HZ);
2951 2978
2952 /* tear down remaining requests */ 2979 /* tear down remaining requests */
2953 mutex_lock(&mdsc->mutex); 2980 mutex_lock(&mdsc->mutex);
@@ -3030,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3030{ 3057{
3031 u64 want_tid, want_flush; 3058 u64 want_tid, want_flush;
3032 3059
3033 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) 3060 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3034 return; 3061 return;
3035 3062
3036 dout("sync\n"); 3063 dout("sync\n");
@@ -3053,7 +3080,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc)
 {
 	int i, n = 0;
 
-	if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+	if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
 		return true;
 
 	mutex_lock(&mdsc->mutex);
@@ -3071,8 +3098,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
 	struct ceph_mds_session *session;
 	int i;
-	struct ceph_client *client = mdsc->client;
-	unsigned long timeout = client->mount_args->mount_timeout * HZ;
+	struct ceph_fs_client *fsc = mdsc->fsc;
+	unsigned long timeout = fsc->client->options->mount_timeout * HZ;
 
 	dout("close_sessions\n");
 
@@ -3119,7 +3146,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	dout("stopped\n");
 }
 
-void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 {
 	dout("stop\n");
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -3129,6 +3156,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 	ceph_caps_finalize(mdsc);
 }
 
+void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
+{
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+
+	ceph_mdsc_stop(mdsc);
+	fsc->mdsc = NULL;
+	kfree(mdsc);
+}
+
 
 /*
  * handle mds map update.
@@ -3145,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 
 	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
 	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	if (ceph_check_fsid(mdsc->client, &fsid) < 0)
+	if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
 		return;
 	epoch = ceph_decode_32(&p);
 	maplen = ceph_decode_32(&p);
 	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
 
 	/* do we need it? */
-	ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
+	ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
 	mutex_lock(&mdsc->mutex);
 	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
 		dout("handle_map epoch %u <= our %u\n",
@@ -3176,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 	} else {
 		mdsc->mdsmap = newmap;  /* first mds map */
 	}
-	mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+	mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
 
 	__wake_requests(mdsc, &mdsc->waiting_for_map);
 
@@ -3277,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con,
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
-	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 	int ret = 0;
 
 	if (force_new && s->s_authorizer) {
@@ -3311,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
-	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 
 	return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
 }
@@ -3320,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
-	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 
 	if (ac->ops->invalidate_authorizer)
 		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
 
-	return ceph_monc_validate_auth(&mdsc->client->monc);
+	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
 }
 
 static const struct ceph_connection_operations mds_con_ops = {
@@ -3338,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = {
 	.peer_reset = peer_reset,
 };
 
-
-
-
 /* eof */
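
The upshot of the mds_client.c changes above is that the MDS client is no longer embedded in its owner: it is allocated by ceph_mdsc_init() and freed by ceph_mdsc_destroy(), both keyed off the ceph_fs_client. A minimal sketch of the new call pattern from a hypothetical mount/unmount path (the example_* function names are illustrative, not from the patch):

static int example_mount(struct ceph_fs_client *fsc)
{
	int err;

	err = ceph_mdsc_init(fsc);	/* kzallocs the mdsc, links fsc->mdsc */
	if (err < 0)
		return err;		/* -ENOMEM if the allocation failed */
	/* ... the mdsc is now reachable as fsc->mdsc ... */
	return 0;
}

static void example_unmount(struct ceph_fs_client *fsc)
{
	ceph_mdsc_close_sessions(fsc->mdsc);	/* tear down MDS sessions */
	ceph_mdsc_destroy(fsc);		/* stop work, clear the link, kfree */
}

Note that ceph_mdsc_stop() is now static; outside callers are expected to go through ceph_mdsc_destroy().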
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c98267ce6d2a..d66d63c72355 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
 
-#include "types.h"
-#include "messenger.h"
-#include "mdsmap.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/mdsmap.h>
 
 /*
  * Some lock dependencies:
@@ -26,7 +26,7 @@
  *
  */
 
-struct ceph_client;
+struct ceph_fs_client;
 struct ceph_cap;
 
 /*
@@ -230,7 +230,7 @@ struct ceph_mds_request {
  * mds client state
 */
 struct ceph_mds_client {
-	struct ceph_client      *client;
+	struct ceph_fs_client   *fsc;
 	struct mutex            mutex;          /* all nested structures */
 
 	struct ceph_mdsmap      *mdsmap;
@@ -289,11 +289,6 @@ struct ceph_mds_client {
 	int		caps_avail_count;    /* unused, unreserved */
 	int		caps_min_count;      /* keep at least this many
 						(unreserved) */
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry	  *debugfs_file;
-#endif
-
 	spinlock_t	  dentry_lru_lock;
 	struct list_head  dentry_lru;
 	int		  num_dentry;
@@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
 			     struct ceph_msg *msg, int mds);
 
-extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
-			  struct ceph_client *client);
+extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 
 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
 
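
For reference, the header change reduces to the following ownership shape (the two pointer fields are from the patch; every other member is elided):

struct ceph_fs_client {
	struct ceph_client	*client;	/* shared libceph core */
	struct ceph_mds_client	*mdsc;		/* owned; set by ceph_mdsc_init() */
	/* ... */
};

struct ceph_mds_client {
	struct ceph_fs_client	*fsc;		/* back-pointer to the fs client */
	/* ... */
};

This is why every mdsc->client-> dereference above becomes either mdsc->fsc-> (fs-level state such as sb and mount_state) or mdsc->fsc->client-> (shared libceph state such as monc and options).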
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 040be6d1150b..73b7d44e8a35 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/bug.h>
 #include <linux/err.h>
@@ -6,9 +6,9 @@
 #include <linux/slab.h>
 #include <linux/types.h>
 
-#include "mdsmap.h"
-#include "messenger.h"
-#include "decode.h"
+#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
 
 #include "super.h"
 
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 	}
 
 	dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
-	     i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
+	     i+1, n, global_id, mds, inc,
+	     ceph_pr_addr(&addr.in_addr),
 	     ceph_mds_state_name(state));
 	if (mds >= 0 && mds < m->m_max_mds && state > 0) {
 		m->m_info[mds].global_id = global_id;
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
deleted file mode 100644
index 4c5cb0880bba..000000000000
--- a/fs/ceph/mdsmap.h
+++ /dev/null
@@ -1,62 +0,0 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 bool laggy;
17 u32 *export_targets;
18};
19
20struct ceph_mdsmap {
21 u32 m_epoch, m_client_epoch, m_last_failure;
22 u32 m_root;
23 u32 m_session_timeout; /* seconds */
24 u32 m_session_autoclose; /* seconds */
25 u64 m_max_file_size;
26 u32 m_max_mds; /* size of m_addr, m_state arrays */
27 struct ceph_mds_info *m_info;
28
29 /* which object pools file data can be stored in */
30 int m_num_data_pg_pools;
31 u32 *m_data_pg_pools;
32 u32 m_cas_pg_pool;
33};
34
35static inline struct ceph_entity_addr *
36ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
37{
38 if (w >= m->m_max_mds)
39 return NULL;
40 return &m->m_info[w].addr;
41}
42
43static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
44{
45 BUG_ON(w < 0);
46 if (w >= m->m_max_mds)
47 return CEPH_MDS_STATE_DNE;
48 return m->m_info[w].state;
49}
50
51static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
52{
53 if (w >= 0 && w < m->m_max_mds)
54 return m->m_info[w].laggy;
55 return false;
56}
57
58extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
59extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
60extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
61
62#endif
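
The inline helpers above are the entire query API for the map. A small illustrative use, assuming this header is included (the function name is hypothetical):

/* Pick the first mds that is active (state > 0) and not laggy. */
static int pick_usable_mds(struct ceph_mdsmap *m)
{
	int w;

	for (w = 0; w < m->m_max_mds; w++)
		if (ceph_mdsmap_get_state(m, w) > 0 &&
		    !ceph_mdsmap_is_laggy(m, w))
			return w;
	return -1;	/* no usable mds in this epoch */
}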
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
deleted file mode 100644
index 2502d76fcec1..000000000000
--- a/fs/ceph/messenger.c
+++ /dev/null
@@ -1,2277 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <net/tcp.h>
13
14#include "super.h"
15#include "messenger.h"
16#include "decode.h"
17#include "pagelist.h"
18
19/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other
21 * hosts in the system. The messenger provides ordered and reliable
22 * delivery. We tolerate TCP disconnects by reconnecting (with
23 * exponential backoff) in the case of a fault (disconnection, bad
24 * crc, protocol error). Acks allow sent messages to be discarded by
25 * the sender.
26 */
27
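
The fault handling mentioned above is implemented by ceph_fault() later in this file; the backoff policy itself is just a doubling delay with a cap. A standalone sketch of that policy (the constants are illustrative, not this file's actual values):

#define EX_BASE_DELAY	1UL	/* illustrative base, in arbitrary units */
#define EX_MAX_DELAY	64UL	/* illustrative cap */

static unsigned long next_delay(unsigned long delay)
{
	if (delay == 0)
		return EX_BASE_DELAY;	/* first fault: shortest wait */
	if (delay < EX_MAX_DELAY)
		return delay * 2;	/* exponential backoff */
	return EX_MAX_DELAY;		/* then retry at a fixed rate */
}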
28/* static tag bytes (protocol control messages) */
29static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
37
38static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con);
41
42/*
43 * nicely render a sockaddr as a string.
44 */
45#define MAX_ADDR_STR 20
46#define MAX_ADDR_STR_LEN 60
47static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
48static DEFINE_SPINLOCK(addr_str_lock);
49static int last_addr_str;
50
51const char *pr_addr(const struct sockaddr_storage *ss)
52{
53 int i;
54 char *s;
55 struct sockaddr_in *in4 = (void *)ss;
56 struct sockaddr_in6 *in6 = (void *)ss;
57
58 spin_lock(&addr_str_lock);
59 i = last_addr_str++;
60 if (last_addr_str == MAX_ADDR_STR)
61 last_addr_str = 0;
62 spin_unlock(&addr_str_lock);
63 s = addr_str[i];
64
65 switch (ss->ss_family) {
66 case AF_INET:
67 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
68 (unsigned int)ntohs(in4->sin_port));
69 break;
70
71 case AF_INET6:
72 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
73 (unsigned int)ntohs(in6->sin6_port));
74 break;
75
76 default:
77 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
78 }
79
80 return s;
81}
82
83static void encode_my_addr(struct ceph_messenger *msgr)
84{
85 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
86 ceph_encode_addr(&msgr->my_enc_addr);
87}
88
89/*
90 * work queue for all reading and writing to/from the socket.
91 */
92struct workqueue_struct *ceph_msgr_wq;
93
94int __init ceph_msgr_init(void)
95{
96 ceph_msgr_wq = create_workqueue("ceph-msgr");
97 if (!ceph_msgr_wq) { /* create_workqueue() returns NULL on failure, not an ERR_PTR */
98 int ret = -ENOMEM;
99 pr_err("msgr_init failed to create workqueue: %d\n", ret);
100 ceph_msgr_wq = NULL;
101 return ret;
102 }
103 return 0;
104}
105
106void ceph_msgr_exit(void)
107{
108 destroy_workqueue(ceph_msgr_wq);
109}
110
111void ceph_msgr_flush(void)
112{
113 flush_workqueue(ceph_msgr_wq);
114}
115
116
117/*
118 * socket callback functions
119 */
120
121/* data available on socket, or listen socket received a connect */
122static void ceph_data_ready(struct sock *sk, int count_unused)
123{
124 struct ceph_connection *con =
125 (struct ceph_connection *)sk->sk_user_data;
126 if (sk->sk_state != TCP_CLOSE_WAIT) {
127 dout("ceph_data_ready on %p state = %lu, queueing work\n",
128 con, con->state);
129 queue_con(con);
130 }
131}
132
133/* socket has buffer space for writing */
134static void ceph_write_space(struct sock *sk)
135{
136 struct ceph_connection *con =
137 (struct ceph_connection *)sk->sk_user_data;
138
139 /* only queue to workqueue if there is data we want to write. */
140 if (test_bit(WRITE_PENDING, &con->state)) {
141 dout("ceph_write_space %p queueing write work\n", con);
142 queue_con(con);
143 } else {
144 dout("ceph_write_space %p nothing to write\n", con);
145 }
146
147 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
148 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
149}
150
151/* socket's state has changed */
152static void ceph_state_change(struct sock *sk)
153{
154 struct ceph_connection *con =
155 (struct ceph_connection *)sk->sk_user_data;
156
157 dout("ceph_state_change %p state = %lu sk_state = %u\n",
158 con, con->state, sk->sk_state);
159
160 if (test_bit(CLOSED, &con->state))
161 return;
162
163 switch (sk->sk_state) {
164 case TCP_CLOSE:
165 dout("ceph_state_change TCP_CLOSE\n"); /* fall through */
166 case TCP_CLOSE_WAIT:
167 dout("ceph_state_change TCP_CLOSE_WAIT\n");
168 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
169 if (test_bit(CONNECTING, &con->state))
170 con->error_msg = "connection failed";
171 else
172 con->error_msg = "socket closed";
173 queue_con(con);
174 }
175 break;
176 case TCP_ESTABLISHED:
177 dout("ceph_state_change TCP_ESTABLISHED\n");
178 queue_con(con);
179 break;
180 }
181}
182
183/*
184 * set up socket callbacks
185 */
186static void set_sock_callbacks(struct socket *sock,
187 struct ceph_connection *con)
188{
189 struct sock *sk = sock->sk;
190 sk->sk_user_data = (void *)con;
191 sk->sk_data_ready = ceph_data_ready;
192 sk->sk_write_space = ceph_write_space;
193 sk->sk_state_change = ceph_state_change;
194}
195
196
197/*
198 * socket helpers
199 */
200
201/*
202 * initiate connection to a remote socket.
203 */
204static struct socket *ceph_tcp_connect(struct ceph_connection *con)
205{
206 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
207 struct socket *sock;
208 int ret;
209
210 BUG_ON(con->sock);
211 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
212 IPPROTO_TCP, &sock);
213 if (ret)
214 return ERR_PTR(ret);
215 con->sock = sock;
216 sock->sk->sk_allocation = GFP_NOFS;
217
218#ifdef CONFIG_LOCKDEP
219 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
220#endif
221
222 set_sock_callbacks(sock, con);
223
224 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
225
226 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
227 O_NONBLOCK);
228 if (ret == -EINPROGRESS) {
229 dout("connect %s EINPROGRESS sk_state = %u\n",
230 pr_addr(&con->peer_addr.in_addr),
231 sock->sk->sk_state);
232 ret = 0;
233 }
234 if (ret < 0) {
235 pr_err("connect %s error %d\n",
236 pr_addr(&con->peer_addr.in_addr), ret);
237 sock_release(sock);
238 con->sock = NULL;
239 con->error_msg = "connect error";
240 }
241
242 if (ret < 0)
243 return ERR_PTR(ret);
244 return sock;
245}
246
247static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
248{
249 struct kvec iov = {buf, len};
250 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
251
252 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
253}
254
255/*
256 * write something. @more is true if caller will be sending more data
257 * shortly.
258 */
259static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
260 size_t kvlen, size_t len, int more)
261{
262 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
263
264 if (more)
265 msg.msg_flags |= MSG_MORE;
266 else
267 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
268
269 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
270}
271
272
273/*
274 * Shutdown/close the socket for the given connection.
275 */
276static int con_close_socket(struct ceph_connection *con)
277{
278 int rc;
279
280 dout("con_close_socket on %p sock %p\n", con, con->sock);
281 if (!con->sock)
282 return 0;
283 set_bit(SOCK_CLOSED, &con->state);
284 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
285 sock_release(con->sock);
286 con->sock = NULL;
287 clear_bit(SOCK_CLOSED, &con->state);
288 return rc;
289}
290
291/*
292 * Reset a connection. Discard all incoming and outgoing messages
293 * and clear *_seq state.
294 */
295static void ceph_msg_remove(struct ceph_msg *msg)
296{
297 list_del_init(&msg->list_head);
298 ceph_msg_put(msg);
299}
300static void ceph_msg_remove_list(struct list_head *head)
301{
302 while (!list_empty(head)) {
303 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
304 list_head);
305 ceph_msg_remove(msg);
306 }
307}
308
309static void reset_connection(struct ceph_connection *con)
310{
311 /* reset connection, out_queue, msg_ and connect_seq */
312 /* discard existing out_queue and msg_seq */
313 ceph_msg_remove_list(&con->out_queue);
314 ceph_msg_remove_list(&con->out_sent);
315
316 if (con->in_msg) {
317 ceph_msg_put(con->in_msg);
318 con->in_msg = NULL;
319 }
320
321 con->connect_seq = 0;
322 con->out_seq = 0;
323 if (con->out_msg) {
324 ceph_msg_put(con->out_msg);
325 con->out_msg = NULL;
326 }
327 con->out_keepalive_pending = false;
328 con->in_seq = 0;
329 con->in_seq_acked = 0;
330}
331
332/*
333 * mark a peer down. drop any open connections.
334 */
335void ceph_con_close(struct ceph_connection *con)
336{
337 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
338 set_bit(CLOSED, &con->state); /* in case there's queued work */
339 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
340 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
341 clear_bit(KEEPALIVE_PENDING, &con->state);
342 clear_bit(WRITE_PENDING, &con->state);
343 mutex_lock(&con->mutex);
344 reset_connection(con);
345 con->peer_global_seq = 0;
346 cancel_delayed_work(&con->work);
347 mutex_unlock(&con->mutex);
348 queue_con(con);
349}
350
351/*
352 * Reopen a closed connection, with a new peer address.
353 */
354void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
355{
356 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
357 set_bit(OPENING, &con->state);
358 clear_bit(CLOSED, &con->state);
359 memcpy(&con->peer_addr, addr, sizeof(*addr));
360 con->delay = 0; /* reset backoff memory */
361 queue_con(con);
362}
363
364/*
365 * return true if this connection ever successfully opened
366 */
367bool ceph_con_opened(struct ceph_connection *con)
368{
369 return con->connect_seq > 0;
370}
371
372/*
373 * generic get/put
374 */
375struct ceph_connection *ceph_con_get(struct ceph_connection *con)
376{
377 dout("con_get %p nref = %d -> %d\n", con,
378 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
379 if (atomic_inc_not_zero(&con->nref))
380 return con;
381 return NULL;
382}
383
384void ceph_con_put(struct ceph_connection *con)
385{
386 dout("con_put %p nref = %d -> %d\n", con,
387 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
388 BUG_ON(atomic_read(&con->nref) == 0);
389 if (atomic_dec_and_test(&con->nref)) {
390 BUG_ON(con->sock);
391 kfree(con);
392 }
393}
394
395/*
396 * initialize a new connection.
397 */
398void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
399{
400 dout("con_init %p\n", con);
401 memset(con, 0, sizeof(*con));
402 atomic_set(&con->nref, 1);
403 con->msgr = msgr;
404 mutex_init(&con->mutex);
405 INIT_LIST_HEAD(&con->out_queue);
406 INIT_LIST_HEAD(&con->out_sent);
407 INIT_DELAYED_WORK(&con->work, con_work);
408}
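
Taken together with ceph_con_open() and ceph_con_close() above, the lifecycle a messenger user drives looks roughly like this (a sketch, assuming the caller supplies 'my_ops' with dispatch/fault/peer_reset callbacks):

static void connection_example(struct ceph_messenger *msgr,
			       struct ceph_connection *con,
			       struct ceph_entity_addr *peer,
			       const struct ceph_connection_operations *my_ops)
{
	ceph_con_init(msgr, con);	/* zero state, take the initial ref */
	con->ops = my_ops;		/* hooks for dispatch/fault/peer_reset */
	ceph_con_open(con, peer);	/* async: the worker does the connect */
	/* ... queue ceph_msg messages, receive via con->ops->dispatch ... */
	ceph_con_close(con);		/* discard queues, mark the peer down */
}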
409
410
411/*
412 * We maintain a global counter to order connection attempts. Get
413 * a unique seq greater than @gt.
414 */
415static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
416{
417 u32 ret;
418
419 spin_lock(&msgr->global_seq_lock);
420 if (msgr->global_seq < gt)
421 msgr->global_seq = gt;
422 ret = ++msgr->global_seq;
423 spin_unlock(&msgr->global_seq_lock);
424 return ret;
425}
426
427
428/*
429 * Prepare footer for currently outgoing message, and finish things
430 * off. Assumes out_kvec* are already valid.. we just add on to the end.
431 */
432static void prepare_write_message_footer(struct ceph_connection *con, int v)
433{
434 struct ceph_msg *m = con->out_msg;
435
436 dout("prepare_write_message_footer %p\n", con);
437 con->out_kvec_is_msg = true;
438 con->out_kvec[v].iov_base = &m->footer;
439 con->out_kvec[v].iov_len = sizeof(m->footer);
440 con->out_kvec_bytes += sizeof(m->footer);
441 con->out_kvec_left++;
442 con->out_more = m->more_to_follow;
443 con->out_msg_done = true;
444}
445
446/*
447 * Prepare headers for the next outgoing message.
448 */
449static void prepare_write_message(struct ceph_connection *con)
450{
451 struct ceph_msg *m;
452 int v = 0;
453
454 con->out_kvec_bytes = 0;
455 con->out_kvec_is_msg = true;
456 con->out_msg_done = false;
457
458 /* Sneak an ack in there first? If we can get it into the same
459 * TCP packet that's a good thing. */
460 if (con->in_seq > con->in_seq_acked) {
461 con->in_seq_acked = con->in_seq;
462 con->out_kvec[v].iov_base = &tag_ack;
463 con->out_kvec[v++].iov_len = 1;
464 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
465 con->out_kvec[v].iov_base = &con->out_temp_ack;
466 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
467 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
468 }
469
470 m = list_first_entry(&con->out_queue,
471 struct ceph_msg, list_head);
472 con->out_msg = m;
473 if (test_bit(LOSSYTX, &con->state)) {
474 list_del_init(&m->list_head);
475 } else {
476 /* put message on sent list */
477 ceph_msg_get(m);
478 list_move_tail(&m->list_head, &con->out_sent);
479 }
480
481 /*
482 * only assign outgoing seq # if we haven't sent this message
484 * yet. if it is requeued, resend with its original seq.
484 */
485 if (m->needs_out_seq) {
486 m->hdr.seq = cpu_to_le64(++con->out_seq);
487 m->needs_out_seq = false;
488 }
489
490 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
491 m, con->out_seq, le16_to_cpu(m->hdr.type),
492 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
493 le32_to_cpu(m->hdr.data_len),
494 m->nr_pages);
495 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
496
497 /* tag + hdr + front + middle */
498 con->out_kvec[v].iov_base = &tag_msg;
499 con->out_kvec[v++].iov_len = 1;
500 con->out_kvec[v].iov_base = &m->hdr;
501 con->out_kvec[v++].iov_len = sizeof(m->hdr);
502 con->out_kvec[v++] = m->front;
503 if (m->middle)
504 con->out_kvec[v++] = m->middle->vec;
505 con->out_kvec_left = v;
506 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
507 (m->middle ? m->middle->vec.iov_len : 0);
508 con->out_kvec_cur = con->out_kvec;
509
510 /* fill in crc (except data pages), footer */
511 con->out_msg->hdr.crc =
512 cpu_to_le32(crc32c(0, (void *)&m->hdr,
513 sizeof(m->hdr) - sizeof(m->hdr.crc)));
514 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
515 con->out_msg->footer.front_crc =
516 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
517 if (m->middle)
518 con->out_msg->footer.middle_crc =
519 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
520 m->middle->vec.iov_len));
521 else
522 con->out_msg->footer.middle_crc = 0;
523 con->out_msg->footer.data_crc = 0;
524 dout("prepare_write_message front_crc %u data_crc %u\n",
525 le32_to_cpu(con->out_msg->footer.front_crc),
526 le32_to_cpu(con->out_msg->footer.middle_crc));
527
528 /* is there a data payload? */
529 if (le32_to_cpu(m->hdr.data_len) > 0) {
530 /* initialize page iterator */
531 con->out_msg_pos.page = 0;
532 con->out_msg_pos.page_pos =
533 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
534 con->out_msg_pos.data_pos = 0;
535 con->out_msg_pos.did_page_crc = 0;
536 con->out_more = 1; /* data + footer will follow */
537 } else {
538 /* no, queue up footer too and be done */
539 prepare_write_message_footer(con, v);
540 }
541
542 set_bit(WRITE_PENDING, &con->state);
543}
544
545/*
546 * Prepare an ack.
547 */
548static void prepare_write_ack(struct ceph_connection *con)
549{
550 dout("prepare_write_ack %p %llu -> %llu\n", con,
551 con->in_seq_acked, con->in_seq);
552 con->in_seq_acked = con->in_seq;
553
554 con->out_kvec[0].iov_base = &tag_ack;
555 con->out_kvec[0].iov_len = 1;
556 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
557 con->out_kvec[1].iov_base = &con->out_temp_ack;
558 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
559 con->out_kvec_left = 2;
560 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
561 con->out_kvec_cur = con->out_kvec;
562 con->out_more = 1; /* more will follow.. eventually.. */
563 set_bit(WRITE_PENDING, &con->state);
564}
565
566/*
567 * Prepare to write keepalive byte.
568 */
569static void prepare_write_keepalive(struct ceph_connection *con)
570{
571 dout("prepare_write_keepalive %p\n", con);
572 con->out_kvec[0].iov_base = &tag_keepalive;
573 con->out_kvec[0].iov_len = 1;
574 con->out_kvec_left = 1;
575 con->out_kvec_bytes = 1;
576 con->out_kvec_cur = con->out_kvec;
577 set_bit(WRITE_PENDING, &con->state);
578}
579
580/*
581 * Connection negotiation.
582 */
583
584static void prepare_connect_authorizer(struct ceph_connection *con)
585{
586 void *auth_buf;
587 int auth_len = 0;
588 int auth_protocol = 0;
589
590 mutex_unlock(&con->mutex);
591 if (con->ops->get_authorizer)
592 con->ops->get_authorizer(con, &auth_buf, &auth_len,
593 &auth_protocol, &con->auth_reply_buf,
594 &con->auth_reply_buf_len,
595 con->auth_retry);
596 mutex_lock(&con->mutex);
597
598 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
599 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
600
601 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
602 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
603 con->out_kvec_left++;
604 con->out_kvec_bytes += auth_len;
605}
606
607/*
608 * We connected to a peer and are saying hello.
609 */
610static void prepare_write_banner(struct ceph_messenger *msgr,
611 struct ceph_connection *con)
612{
613 int len = strlen(CEPH_BANNER);
614
615 con->out_kvec[0].iov_base = CEPH_BANNER;
616 con->out_kvec[0].iov_len = len;
617 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
618 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
619 con->out_kvec_left = 2;
620 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
621 con->out_kvec_cur = con->out_kvec;
622 con->out_more = 0;
623 set_bit(WRITE_PENDING, &con->state);
624}
625
626static void prepare_write_connect(struct ceph_messenger *msgr,
627 struct ceph_connection *con,
628 int after_banner)
629{
630 unsigned global_seq = get_global_seq(con->msgr, 0);
631 int proto;
632
633 switch (con->peer_name.type) {
634 case CEPH_ENTITY_TYPE_MON:
635 proto = CEPH_MONC_PROTOCOL;
636 break;
637 case CEPH_ENTITY_TYPE_OSD:
638 proto = CEPH_OSDC_PROTOCOL;
639 break;
640 case CEPH_ENTITY_TYPE_MDS:
641 proto = CEPH_MDSC_PROTOCOL;
642 break;
643 default:
644 BUG();
645 }
646
647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
648 con->connect_seq, global_seq, proto);
649
650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
653 con->out_connect.global_seq = cpu_to_le32(global_seq);
654 con->out_connect.protocol_version = cpu_to_le32(proto);
655 con->out_connect.flags = 0;
656
657 if (!after_banner) {
658 con->out_kvec_left = 0;
659 con->out_kvec_bytes = 0;
660 }
661 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
662 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
663 con->out_kvec_left++;
664 con->out_kvec_bytes += sizeof(con->out_connect);
665 con->out_kvec_cur = con->out_kvec;
666 con->out_more = 0;
667 set_bit(WRITE_PENDING, &con->state);
668
669 prepare_connect_authorizer(con);
670}
671
672
673/*
674 * write as much of pending kvecs to the socket as we can.
675 * 1 -> done
676 * 0 -> socket full, but more to do
677 * <0 -> error
678 */
679static int write_partial_kvec(struct ceph_connection *con)
680{
681 int ret;
682
683 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
684 while (con->out_kvec_bytes > 0) {
685 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
686 con->out_kvec_left, con->out_kvec_bytes,
687 con->out_more);
688 if (ret <= 0)
689 goto out;
690 con->out_kvec_bytes -= ret;
691 if (con->out_kvec_bytes == 0)
692 break; /* done */
693 while (ret > 0) {
694 if (ret >= con->out_kvec_cur->iov_len) {
695 ret -= con->out_kvec_cur->iov_len;
696 con->out_kvec_cur++;
697 con->out_kvec_left--;
698 } else {
699 con->out_kvec_cur->iov_len -= ret;
700 con->out_kvec_cur->iov_base += ret;
701 ret = 0;
702 break;
703 }
704 }
705 }
706 con->out_kvec_left = 0;
707 con->out_kvec_is_msg = false;
708 ret = 1;
709out:
710 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
711 con->out_kvec_bytes, con->out_kvec_left, ret);
712 return ret; /* done! */
713}
714
715/*
716 * Write as much message data payload as we can. If we finish, queue
717 * up the footer.
718 * 1 -> done, footer is now queued in out_kvec[].
719 * 0 -> socket full, but more to do
720 * <0 -> error
721 */
722static int write_partial_msg_pages(struct ceph_connection *con)
723{
724 struct ceph_msg *msg = con->out_msg;
725 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
726 size_t len;
727 int crc = con->msgr->nocrc;
728 int ret;
729
730 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
731 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
732 con->out_msg_pos.page_pos);
733
734 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
735 struct page *page = NULL;
736 void *kaddr = NULL;
737
738 /*
739 * if we are calculating the data crc (the default), we need
740 * to map the page. if our pages[] has been revoked, use the
741 * zero page.
742 */
743 if (msg->pages) {
744 page = msg->pages[con->out_msg_pos.page];
745 if (crc)
746 kaddr = kmap(page);
747 } else if (msg->pagelist) {
748 page = list_first_entry(&msg->pagelist->head,
749 struct page, lru);
750 if (crc)
751 kaddr = kmap(page);
752 } else {
753 page = con->msgr->zero_page;
754 if (crc)
755 kaddr = page_address(con->msgr->zero_page);
756 }
757 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
758 (int)(data_len - con->out_msg_pos.data_pos));
759 if (crc && !con->out_msg_pos.did_page_crc) {
760 void *base = kaddr + con->out_msg_pos.page_pos;
761 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
762
763 BUG_ON(kaddr == NULL);
764 con->out_msg->footer.data_crc =
765 cpu_to_le32(crc32c(tmpcrc, base, len));
766 con->out_msg_pos.did_page_crc = 1;
767 }
768
769 ret = kernel_sendpage(con->sock, page,
770 con->out_msg_pos.page_pos, len,
771 MSG_DONTWAIT | MSG_NOSIGNAL |
772 MSG_MORE);
773
774 if (crc && (msg->pages || msg->pagelist))
775 kunmap(page);
776
777 if (ret <= 0)
778 goto out;
779
780 con->out_msg_pos.data_pos += ret;
781 con->out_msg_pos.page_pos += ret;
782 if (ret == len) {
783 con->out_msg_pos.page_pos = 0;
784 con->out_msg_pos.page++;
785 con->out_msg_pos.did_page_crc = 0;
786 if (msg->pagelist)
787 list_move_tail(&page->lru,
788 &msg->pagelist->head);
789 }
790 }
791
792 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
793
794 /* prepare and queue up footer, too */
795 if (!crc)
796 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
797 con->out_kvec_bytes = 0;
798 con->out_kvec_left = 0;
799 con->out_kvec_cur = con->out_kvec;
800 prepare_write_message_footer(con, 0);
801 ret = 1;
802out:
803 return ret;
804}
805
806/*
807 * write some zeros
808 */
809static int write_partial_skip(struct ceph_connection *con)
810{
811 int ret;
812
813 while (con->out_skip > 0) {
814 struct kvec iov = {
815 .iov_base = page_address(con->msgr->zero_page),
816 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
817 };
818
819 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
820 if (ret <= 0)
821 goto out;
822 con->out_skip -= ret;
823 }
824 ret = 1;
825out:
826 return ret;
827}
828
829/*
830 * Prepare to read connection handshake, or an ack.
831 */
832static void prepare_read_banner(struct ceph_connection *con)
833{
834 dout("prepare_read_banner %p\n", con);
835 con->in_base_pos = 0;
836}
837
838static void prepare_read_connect(struct ceph_connection *con)
839{
840 dout("prepare_read_connect %p\n", con);
841 con->in_base_pos = 0;
842}
843
844static void prepare_read_ack(struct ceph_connection *con)
845{
846 dout("prepare_read_ack %p\n", con);
847 con->in_base_pos = 0;
848}
849
850static void prepare_read_tag(struct ceph_connection *con)
851{
852 dout("prepare_read_tag %p\n", con);
853 con->in_base_pos = 0;
854 con->in_tag = CEPH_MSGR_TAG_READY;
855}
856
857/*
858 * Prepare to read a message.
859 */
860static int prepare_read_message(struct ceph_connection *con)
861{
862 dout("prepare_read_message %p\n", con);
863 BUG_ON(con->in_msg != NULL);
864 con->in_base_pos = 0;
865 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
866 return 0;
867}
868
869
870static int read_partial(struct ceph_connection *con,
871 int *to, int size, void *object)
872{
873 *to += size;
874 while (con->in_base_pos < *to) {
875 int left = *to - con->in_base_pos;
876 int have = size - left;
877 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
878 if (ret <= 0)
879 return ret;
880 con->in_base_pos += ret;
881 }
882 return 1;
883}
884
885
886/*
887 * Read all or part of the connect-side handshake on a new connection
888 */
889static int read_partial_banner(struct ceph_connection *con)
890{
891 int ret, to = 0;
892
893 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
894
895 /* peer's banner */
896 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
897 if (ret <= 0)
898 goto out;
899 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
900 &con->actual_peer_addr);
901 if (ret <= 0)
902 goto out;
903 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
904 &con->peer_addr_for_me);
905 if (ret <= 0)
906 goto out;
907out:
908 return ret;
909}
910
911static int read_partial_connect(struct ceph_connection *con)
912{
913 int ret, to = 0;
914
915 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
916
917 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
918 if (ret <= 0)
919 goto out;
920 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
921 con->auth_reply_buf);
922 if (ret <= 0)
923 goto out;
924
925 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
926 con, (int)con->in_reply.tag,
927 le32_to_cpu(con->in_reply.connect_seq),
928 le32_to_cpu(con->in_reply.global_seq));
929out:
930 return ret;
931
932}
933
934/*
935 * Verify the hello banner looks okay.
936 */
937static int verify_hello(struct ceph_connection *con)
938{
939 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
940 pr_err("connect to %s got bad banner\n",
941 pr_addr(&con->peer_addr.in_addr));
942 con->error_msg = "protocol error, bad banner";
943 return -1;
944 }
945 return 0;
946}
947
948static bool addr_is_blank(struct sockaddr_storage *ss)
949{
950 switch (ss->ss_family) {
951 case AF_INET:
952 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
953 case AF_INET6:
954 return
955 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
956 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
957 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
958 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
959 }
960 return false;
961}
962
963static int addr_port(struct sockaddr_storage *ss)
964{
965 switch (ss->ss_family) {
966 case AF_INET:
967 return ntohs(((struct sockaddr_in *)ss)->sin_port);
968 case AF_INET6:
969 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
970 }
971 return 0;
972}
973
974static void addr_set_port(struct sockaddr_storage *ss, int p)
975{
976 switch (ss->ss_family) {
977 case AF_INET:
978 ((struct sockaddr_in *)ss)->sin_port = htons(p); break;
979 case AF_INET6:
980 ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); break;
981 }
982}
983
984/*
985 * Parse an ip[:port] list into an addr array. Use the default
986 * monitor port if a port isn't specified.
987 */
988int ceph_parse_ips(const char *c, const char *end,
989 struct ceph_entity_addr *addr,
990 int max_count, int *count)
991{
992 int i;
993 const char *p = c;
994
995 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
996 for (i = 0; i < max_count; i++) {
997 const char *ipend;
998 struct sockaddr_storage *ss = &addr[i].in_addr;
999 struct sockaddr_in *in4 = (void *)ss;
1000 struct sockaddr_in6 *in6 = (void *)ss;
1001 int port;
1002 char delim = ',';
1003
1004 if (*p == '[') {
1005 delim = ']';
1006 p++;
1007 }
1008
1009 memset(ss, 0, sizeof(*ss));
1010 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1011 delim, &ipend))
1012 ss->ss_family = AF_INET;
1013 else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1014 delim, &ipend))
1015 ss->ss_family = AF_INET6;
1016 else
1017 goto bad;
1018 p = ipend;
1019
1020 if (delim == ']') {
1021 if (*p != ']') {
1022 dout("missing matching ']'\n");
1023 goto bad;
1024 }
1025 p++;
1026 }
1027
1028 /* port? */
1029 if (p < end && *p == ':') {
1030 port = 0;
1031 p++;
1032 while (p < end && *p >= '0' && *p <= '9') {
1033 port = (port * 10) + (*p - '0');
1034 p++;
1035 }
1036 if (port > 65535 || port == 0)
1037 goto bad;
1038 } else {
1039 port = CEPH_MON_PORT;
1040 }
1041
1042 addr_set_port(ss, port);
1043
1044 dout("parse_ips got %s\n", pr_addr(ss));
1045
1046 if (p == end)
1047 break;
1048 if (*p != ',')
1049 goto bad;
1050 p++;
1051 }
1052
1053 if (p != end)
1054 goto bad;
1055
1056 if (count)
1057 *count = i + 1;
1058 return 0;
1059
1060bad:
1061 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1062 return -EINVAL;
1063}
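
For instance, a monitor list such as "1.2.3.4,10.0.0.1:6789" parses into two entries, the first taking CEPH_MON_PORT by default. A usage sketch (the wrapper function is hypothetical):

static int parse_example(void)
{
	const char *spec = "1.2.3.4,10.0.0.1:6789";
	struct ceph_entity_addr addr[2];
	int count = 0;
	int err;

	err = ceph_parse_ips(spec, spec + strlen(spec), addr, 2, &count);
	if (err < 0)
		return err;	/* -EINVAL on malformed input */
	/* count == 2; addr[0] was given the default monitor port */
	return 0;
}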
1064
1065static int process_banner(struct ceph_connection *con)
1066{
1067 dout("process_banner on %p\n", con);
1068
1069 if (verify_hello(con) < 0)
1070 return -1;
1071
1072 ceph_decode_addr(&con->actual_peer_addr);
1073 ceph_decode_addr(&con->peer_addr_for_me);
1074
1075 /*
1076 * Make sure the other end is who we wanted. note that the other
1077 * end may not yet know their ip address, so if it's 0.0.0.0, give
1078 * them the benefit of the doubt.
1079 */
1080 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1081 sizeof(con->peer_addr)) != 0 &&
1082 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1083 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1084 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
1085 pr_addr(&con->peer_addr.in_addr),
1086 (int)le32_to_cpu(con->peer_addr.nonce),
1087 pr_addr(&con->actual_peer_addr.in_addr),
1088 (int)le32_to_cpu(con->actual_peer_addr.nonce));
1089 con->error_msg = "wrong peer at address";
1090 return -1;
1091 }
1092
1093 /*
1094 * did we learn our address?
1095 */
1096 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1097 int port = addr_port(&con->msgr->inst.addr.in_addr);
1098
1099 memcpy(&con->msgr->inst.addr.in_addr,
1100 &con->peer_addr_for_me.in_addr,
1101 sizeof(con->peer_addr_for_me.in_addr));
1102 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1103 encode_my_addr(con->msgr);
1104 dout("process_banner learned my addr is %s\n",
1105 pr_addr(&con->msgr->inst.addr.in_addr));
1106 }
1107
1108 set_bit(NEGOTIATING, &con->state);
1109 prepare_read_connect(con);
1110 return 0;
1111}
1112
1113static void fail_protocol(struct ceph_connection *con)
1114{
1115 reset_connection(con);
1116 set_bit(CLOSED, &con->state); /* in case there's queued work */
1117
1118 mutex_unlock(&con->mutex);
1119 if (con->ops->bad_proto)
1120 con->ops->bad_proto(con);
1121 mutex_lock(&con->mutex);
1122}
1123
1124static int process_connect(struct ceph_connection *con)
1125{
1126 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1127 u64 req_feat = CEPH_FEATURE_REQUIRED;
1128 u64 server_feat = le64_to_cpu(con->in_reply.features);
1129
1130 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1131
1132 switch (con->in_reply.tag) {
1133 case CEPH_MSGR_TAG_FEATURES:
1134 pr_err("%s%lld %s feature set mismatch,"
1135 " my %llx < server's %llx, missing %llx\n",
1136 ENTITY_NAME(con->peer_name),
1137 pr_addr(&con->peer_addr.in_addr),
1138 sup_feat, server_feat, server_feat & ~sup_feat);
1139 con->error_msg = "missing required protocol features";
1140 fail_protocol(con);
1141 return -1;
1142
1143 case CEPH_MSGR_TAG_BADPROTOVER:
1144 pr_err("%s%lld %s protocol version mismatch,"
1145 " my %d != server's %d\n",
1146 ENTITY_NAME(con->peer_name),
1147 pr_addr(&con->peer_addr.in_addr),
1148 le32_to_cpu(con->out_connect.protocol_version),
1149 le32_to_cpu(con->in_reply.protocol_version));
1150 con->error_msg = "protocol version mismatch";
1151 fail_protocol(con);
1152 return -1;
1153
1154 case CEPH_MSGR_TAG_BADAUTHORIZER:
1155 con->auth_retry++;
1156 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1157 con->auth_retry);
1158 if (con->auth_retry == 2) {
1159 con->error_msg = "connect authorization failure";
1160 reset_connection(con);
1161 set_bit(CLOSED, &con->state);
1162 return -1;
1163 }
1164 con->auth_retry = 1;
1165 prepare_write_connect(con->msgr, con, 0);
1166 prepare_read_connect(con);
1167 break;
1168
1169 case CEPH_MSGR_TAG_RESETSESSION:
1170 /*
1171 * If we connected with a large connect_seq but the peer
1172 * has no record of a session with us (no connection, or
1173 * connect_seq == 0), they will send RESETSESSION to indicate
1174 * that they must have reset their session, and may have
1175 * dropped messages.
1176 */
1177 dout("process_connect got RESET peer seq %u\n",
1178 le32_to_cpu(con->in_connect.connect_seq));
1179 pr_err("%s%lld %s connection reset\n",
1180 ENTITY_NAME(con->peer_name),
1181 pr_addr(&con->peer_addr.in_addr));
1182 reset_connection(con);
1183 prepare_write_connect(con->msgr, con, 0);
1184 prepare_read_connect(con);
1185
1186 /* Tell ceph about it. */
1187 mutex_unlock(&con->mutex);
1188 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1189 if (con->ops->peer_reset)
1190 con->ops->peer_reset(con);
1191 mutex_lock(&con->mutex);
1192 break;
1193
1194 case CEPH_MSGR_TAG_RETRY_SESSION:
1195 /*
1196 * If we sent a smaller connect_seq than the peer has, try
1197 * again with a larger value.
1198 */
1199 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1200 le32_to_cpu(con->out_connect.connect_seq),
1201 le32_to_cpu(con->in_connect.connect_seq));
1202 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1203 prepare_write_connect(con->msgr, con, 0);
1204 prepare_read_connect(con);
1205 break;
1206
1207 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1208 /*
1209 * If we sent a smaller global_seq than the peer has, try
1210 * again with a larger value.
1211 */
1212 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1213 con->peer_global_seq,
1214 le32_to_cpu(con->in_connect.global_seq));
1215 get_global_seq(con->msgr,
1216 le32_to_cpu(con->in_connect.global_seq));
1217 prepare_write_connect(con->msgr, con, 0);
1218 prepare_read_connect(con);
1219 break;
1220
1221 case CEPH_MSGR_TAG_READY:
1222 if (req_feat & ~server_feat) {
1223 pr_err("%s%lld %s protocol feature mismatch,"
1224 " my required %llx > server's %llx, need %llx\n",
1225 ENTITY_NAME(con->peer_name),
1226 pr_addr(&con->peer_addr.in_addr),
1227 req_feat, server_feat, req_feat & ~server_feat);
1228 con->error_msg = "missing required protocol features";
1229 fail_protocol(con);
1230 return -1;
1231 }
1232 clear_bit(CONNECTING, &con->state);
1233 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1234 con->connect_seq++;
1235 con->peer_features = server_feat;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq),
1239 con->connect_seq);
1240 WARN_ON(con->connect_seq !=
1241 le32_to_cpu(con->in_reply.connect_seq));
1242
1243 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1244 set_bit(LOSSYTX, &con->state);
1245
1246 prepare_read_tag(con);
1247 break;
1248
1249 case CEPH_MSGR_TAG_WAIT:
1250 /*
1251 * If there is a connection race (we are opening
1252 * connections to each other), one of us may just have
1253 * to WAIT. This shouldn't happen if we are the
1254 * client.
1255 */
1256 pr_err("process_connect peer connecting WAIT\n");
1257 /* fall through */
1258 default:
1259 pr_err("connect protocol error, will retry\n");
1260 con->error_msg = "protocol error, garbage tag during connect";
1261 return -1;
1262 }
1263 return 0;
1264}
1265
1266
1267/*
1268 * read (part of) an ack
1269 */
1270static int read_partial_ack(struct ceph_connection *con)
1271{
1272 int to = 0;
1273
1274 return read_partial(con, &to, sizeof(con->in_temp_ack),
1275 &con->in_temp_ack);
1276}
1277
1278
1279/*
1280 * We can finally discard anything that's been acked.
1281 */
1282static void process_ack(struct ceph_connection *con)
1283{
1284 struct ceph_msg *m;
1285 u64 ack = le64_to_cpu(con->in_temp_ack);
1286 u64 seq;
1287
1288 while (!list_empty(&con->out_sent)) {
1289 m = list_first_entry(&con->out_sent, struct ceph_msg,
1290 list_head);
1291 seq = le64_to_cpu(m->hdr.seq);
1292 if (seq > ack)
1293 break;
1294 dout("got ack for seq %llu type %d at %p\n", seq,
1295 le16_to_cpu(m->hdr.type), m);
1296 ceph_msg_remove(m);
1297 }
1298 prepare_read_tag(con);
1299}
1300
1301
1302
1303
1304static int read_partial_message_section(struct ceph_connection *con,
1305 struct kvec *section,
1306 unsigned int sec_len, u32 *crc)
1307{
1308 int left;
1309 int ret;
1310
1311 BUG_ON(!section);
1312
1313 while (section->iov_len < sec_len) {
1314 BUG_ON(section->iov_base == NULL);
1315 left = sec_len - section->iov_len;
1316 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1317 section->iov_len, left);
1318 if (ret <= 0)
1319 return ret;
1320 section->iov_len += ret;
1321 if (section->iov_len == sec_len)
1322 *crc = crc32c(0, section->iov_base,
1323 section->iov_len);
1324 }
1325
1326 return 1;
1327}
1328
1329static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1330 struct ceph_msg_header *hdr,
1331 int *skip);
1332/*
1333 * read (part of) a message.
1334 */
1335static int read_partial_message(struct ceph_connection *con)
1336{
1337 struct ceph_msg *m = con->in_msg;
1338 void *p;
1339 int ret;
1340 int to, left;
1341 unsigned front_len, middle_len, data_len, data_off;
1342 int datacrc = con->msgr->nocrc;
1343 int skip;
1344 u64 seq;
1345
1346 dout("read_partial_message con %p msg %p\n", con, m);
1347
1348 /* header */
1349 while (con->in_base_pos < sizeof(con->in_hdr)) {
1350 left = sizeof(con->in_hdr) - con->in_base_pos;
1351 ret = ceph_tcp_recvmsg(con->sock,
1352 (char *)&con->in_hdr + con->in_base_pos,
1353 left);
1354 if (ret <= 0)
1355 return ret;
1356 con->in_base_pos += ret;
1357 if (con->in_base_pos == sizeof(con->in_hdr)) {
1358 u32 crc = crc32c(0, (void *)&con->in_hdr,
1359 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1360 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1361 pr_err("read_partial_message bad hdr "
1362 " crc %u != expected %u\n",
1363 crc, con->in_hdr.crc);
1364 return -EBADMSG;
1365 }
1366 }
1367 }
1368 front_len = le32_to_cpu(con->in_hdr.front_len);
1369 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1370 return -EIO;
1371 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1372 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1373 return -EIO;
1374 data_len = le32_to_cpu(con->in_hdr.data_len);
1375 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1376 return -EIO;
1377 data_off = le16_to_cpu(con->in_hdr.data_off);
1378
1379 /* verify seq# */
1380 seq = le64_to_cpu(con->in_hdr.seq);
1381 if ((s64)seq - (s64)con->in_seq < 1) {
1382 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1383 ENTITY_NAME(con->peer_name),
1384 pr_addr(&con->peer_addr.in_addr),
1385 seq, con->in_seq + 1);
1386 con->in_base_pos = -front_len - middle_len - data_len -
1387 sizeof(m->footer);
1388 con->in_tag = CEPH_MSGR_TAG_READY;
1389 con->in_seq++;
1390 return 0;
1391 } else if ((s64)seq - (s64)con->in_seq > 1) {
1392 pr_err("read_partial_message bad seq %lld expected %lld\n",
1393 seq, con->in_seq + 1);
1394 con->error_msg = "bad message sequence # for incoming message";
1395 return -EBADMSG;
1396 }
1397
1398 /* allocate message? */
1399 if (!con->in_msg) {
1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1401 con->in_hdr.front_len, con->in_hdr.data_len);
1402 skip = 0;
1403 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1404 if (skip) {
1405 /* skip this message */
1406 dout("alloc_msg said skip message\n");
1407 BUG_ON(con->in_msg);
1408 con->in_base_pos = -front_len - middle_len - data_len -
1409 sizeof(m->footer);
1410 con->in_tag = CEPH_MSGR_TAG_READY;
1411 con->in_seq++;
1412 return 0;
1413 }
1414 if (!con->in_msg) {
1415 con->error_msg =
1416 "error allocating memory for incoming message";
1417 return -ENOMEM;
1418 }
1419 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */
1421 if (m->middle)
1422 m->middle->vec.iov_len = 0;
1423
1424 con->in_msg_pos.page = 0;
1425 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1426 con->in_msg_pos.data_pos = 0;
1427 }
1428
1429 /* front */
1430 ret = read_partial_message_section(con, &m->front, front_len,
1431 &con->in_front_crc);
1432 if (ret <= 0)
1433 return ret;
1434
1435 /* middle */
1436 if (m->middle) {
1437 ret = read_partial_message_section(con, &m->middle->vec,
1438 middle_len,
1439 &con->in_middle_crc);
1440 if (ret <= 0)
1441 return ret;
1442 }
1443
1444 /* (page) data */
1445 while (con->in_msg_pos.data_pos < data_len) {
1446 left = min((int)(data_len - con->in_msg_pos.data_pos),
1447 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1448 BUG_ON(m->pages == NULL);
1449 p = kmap(m->pages[con->in_msg_pos.page]);
1450 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1451 left);
1452 if (ret > 0 && datacrc)
1453 con->in_data_crc =
1454 crc32c(con->in_data_crc,
1455 p + con->in_msg_pos.page_pos, ret);
1456 kunmap(m->pages[con->in_msg_pos.page]);
1457 if (ret <= 0)
1458 return ret;
1459 con->in_msg_pos.data_pos += ret;
1460 con->in_msg_pos.page_pos += ret;
1461 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1462 con->in_msg_pos.page_pos = 0;
1463 con->in_msg_pos.page++;
1464 }
1465 }
1466
1467 /* footer */
1468 to = sizeof(m->hdr) + sizeof(m->footer);
1469 while (con->in_base_pos < to) {
1470 left = to - con->in_base_pos;
1471 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1472 (con->in_base_pos - sizeof(m->hdr)),
1473 left);
1474 if (ret <= 0)
1475 return ret;
1476 con->in_base_pos += ret;
1477 }
1478 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1479 m, front_len, m->footer.front_crc, middle_len,
1480 m->footer.middle_crc, data_len, m->footer.data_crc);
1481
1482 /* crc ok? */
1483 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1484 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1485 m, con->in_front_crc, m->footer.front_crc);
1486 return -EBADMSG;
1487 }
1488 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1489 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1490 m, con->in_middle_crc, m->footer.middle_crc);
1491 return -EBADMSG;
1492 }
1493 if (datacrc &&
1494 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1495 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1496 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1497 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1498 return -EBADMSG;
1499 }
1500
1501 return 1; /* done! */
1502}
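
The three CRC comparisons at the end of read_partial_message() follow one pattern: crc32c seeded with 0 over the received section, checked against the little-endian value the peer stored in the footer. Distilled into a sketch (the real code accumulates the data CRC incrementally as pages arrive rather than in one pass):

static bool section_crc_ok(const void *buf, size_t len, __le32 wire_crc)
{
	/* crc32c() comes from <linux/crc32c.h>, included at the top */
	return crc32c(0, buf, len) == le32_to_cpu(wire_crc);
}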
1503
1504/*
1505 * Process message. This happens in the worker thread. The callback should
1506 * be careful not to do anything that waits on other incoming messages or it
1507 * may deadlock.
1508 */
1509static void process_message(struct ceph_connection *con)
1510{
1511 struct ceph_msg *msg;
1512
1513 msg = con->in_msg;
1514 con->in_msg = NULL;
1515
1516 /* if first message, set peer_name */
1517 if (con->peer_name.type == 0)
1518 con->peer_name = msg->hdr.src;
1519
1520 con->in_seq++;
1521 mutex_unlock(&con->mutex);
1522
1523 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1524 msg, le64_to_cpu(msg->hdr.seq),
1525 ENTITY_NAME(msg->hdr.src),
1526 le16_to_cpu(msg->hdr.type),
1527 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1528 le32_to_cpu(msg->hdr.front_len),
1529 le32_to_cpu(msg->hdr.data_len),
1530 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1531 con->ops->dispatch(con, msg);
1532
1533 mutex_lock(&con->mutex);
1534 prepare_read_tag(con);
1535}
1536
1537
1538/*
1539 * Write something to the socket. Called in a worker thread when the
1540 * socket appears to be writeable and we have something ready to send.
1541 */
1542static int try_write(struct ceph_connection *con)
1543{
1544 struct ceph_messenger *msgr = con->msgr;
1545 int ret = 1;
1546
1547 dout("try_write start %p state %lu nref %d\n", con, con->state,
1548 atomic_read(&con->nref));
1549
1550more:
1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1552
1553 /* open the socket first? */
1554 if (con->sock == NULL) {
1555 /*
1556 * if we were STANDBY and are reconnecting _this_
1557 * connection, bump connect_seq now. Always bump
1558 * global_seq.
1559 */
1560 if (test_and_clear_bit(STANDBY, &con->state))
1561 con->connect_seq++;
1562
1563 prepare_write_banner(msgr, con);
1564 prepare_write_connect(msgr, con, 1);
1565 prepare_read_banner(con);
1566 set_bit(CONNECTING, &con->state);
1567 clear_bit(NEGOTIATING, &con->state);
1568
1569 BUG_ON(con->in_msg);
1570 con->in_tag = CEPH_MSGR_TAG_READY;
1571 dout("try_write initiating connect on %p new state %lu\n",
1572 con, con->state);
1573 con->sock = ceph_tcp_connect(con);
1574 if (IS_ERR(con->sock)) {
1575 con->sock = NULL;
1576 con->error_msg = "connect error";
1577 ret = -1;
1578 goto out;
1579 }
1580 }
1581
1582more_kvec:
1583 /* kvec data queued? */
1584 if (con->out_skip) {
1585 ret = write_partial_skip(con);
1586 if (ret <= 0)
1587 goto done;
1588 if (ret < 0) {
1589 dout("try_write write_partial_skip err %d\n", ret);
1590 goto done;
1591 }
1592 }
1593 if (con->out_kvec_left) {
1594 ret = write_partial_kvec(con);
1595 if (ret <= 0)
1596 goto done;
1597 }
1598
1599 /* msg pages? */
1600 if (con->out_msg) {
1601 if (con->out_msg_done) {
1602 ceph_msg_put(con->out_msg);
1603 con->out_msg = NULL; /* we're done with this one */
1604 goto do_next;
1605 }
1606
1607 ret = write_partial_msg_pages(con);
1608 if (ret == 1)
1609 goto more_kvec; /* we need to send the footer, too! */
1610 if (ret == 0)
1611 goto done;
1612 if (ret < 0) {
1613 dout("try_write write_partial_msg_pages err %d\n",
1614 ret);
1615 goto done;
1616 }
1617 }
1618
1619do_next:
1620 if (!test_bit(CONNECTING, &con->state)) {
1621 /* is anything else pending? */
1622 if (!list_empty(&con->out_queue)) {
1623 prepare_write_message(con);
1624 goto more;
1625 }
1626 if (con->in_seq > con->in_seq_acked) {
1627 prepare_write_ack(con);
1628 goto more;
1629 }
1630 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1631 prepare_write_keepalive(con);
1632 goto more;
1633 }
1634 }
1635
1636 /* Nothing to do! */
1637 clear_bit(WRITE_PENDING, &con->state);
1638 dout("try_write nothing else to write.\n");
1639done:
1640 ret = 0;
1641out:
1642 dout("try_write done on %p\n", con);
1643 return ret;
1644}
1645
1646
1647
1648/*
1649 * Read what we can from the socket.
1650 */
1651static int try_read(struct ceph_connection *con)
1652{
1653 int ret = -1;
1654
1655 if (!con->sock)
1656 return 0;
1657
1658 if (test_bit(STANDBY, &con->state))
1659 return 0;
1660
1661 dout("try_read start on %p\n", con);
1662
1663more:
1664 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1665 con->in_base_pos);
1666 if (test_bit(CONNECTING, &con->state)) {
1667 if (!test_bit(NEGOTIATING, &con->state)) {
1668 dout("try_read connecting\n");
1669 ret = read_partial_banner(con);
1670 if (ret <= 0)
1671 goto done;
1672 if (process_banner(con) < 0) {
1673 ret = -1;
1674 goto out;
1675 }
1676 }
1677 ret = read_partial_connect(con);
1678 if (ret <= 0)
1679 goto done;
1680 if (process_connect(con) < 0) {
1681 ret = -1;
1682 goto out;
1683 }
1684 goto more;
1685 }
1686
1687 if (con->in_base_pos < 0) {
1688 /*
1689 * skipping + discarding content.
1690 *
1691 * FIXME: there must be a better way to do this!
1692 */
1693 static char buf[1024];
1694 int skip = min(1024, -con->in_base_pos);
1695 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1696 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1697 if (ret <= 0)
1698 goto done;
1699 con->in_base_pos += ret;
1700 if (con->in_base_pos)
1701 goto more;
1702 }
1703 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1704 /*
1705 * what's next?
1706 */
1707 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1708 if (ret <= 0)
1709 goto done;
1710 dout("try_read got tag %d\n", (int)con->in_tag);
1711 switch (con->in_tag) {
1712 case CEPH_MSGR_TAG_MSG:
1713 prepare_read_message(con);
1714 break;
1715 case CEPH_MSGR_TAG_ACK:
1716 prepare_read_ack(con);
1717 break;
1718 case CEPH_MSGR_TAG_CLOSE:
1719 set_bit(CLOSED, &con->state); /* fixme */
1720 goto done;
1721 default:
1722 goto bad_tag;
1723 }
1724 }
1725 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1726 ret = read_partial_message(con);
1727 if (ret <= 0) {
1728 switch (ret) {
1729 case -EBADMSG:
1730 con->error_msg = "bad crc";
1731 ret = -EIO;
1732 goto out;
1733 case -EIO:
1734 con->error_msg = "io error";
1735 goto out;
1736 default:
1737 goto done;
1738 }
1739 }
1740 if (con->in_tag == CEPH_MSGR_TAG_READY)
1741 goto more;
1742 process_message(con);
1743 goto more;
1744 }
1745 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1746 ret = read_partial_ack(con);
1747 if (ret <= 0)
1748 goto done;
1749 process_ack(con);
1750 goto more;
1751 }
1752
1753done:
1754 ret = 0;
1755out:
1756 dout("try_read done on %p\n", con);
1757 return ret;
1758
1759bad_tag:
1760 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1761 con->error_msg = "protocol error, garbage tag";
1762 ret = -1;
1763 goto out;
1764}
1765
1766
1767/*
1768 * Atomically queue work on a connection. Bump @con reference to
1769 * avoid races with connection teardown.
1770 *
1771 * There is some trickery going on with QUEUED and BUSY because we
1772 * only want a _single_ thread operating on each connection at any
1773 * point in time, but we want to use all available CPUs.
1774 *
1775 * The worker thread only proceeds if it can atomically set BUSY. It
1776 * clears QUEUED and does its thing. When it thinks it's done, it
1777 * clears BUSY, then rechecks QUEUED; if QUEUED is set again, it loops
1778 * (tries again to set BUSY).
1779 *
1780 * To queue work, we first set QUEUED and _then_, if BUSY isn't set,
1781 * try to queue the work. If that fails (the work is already queued),
1782 * or if BUSY was set, we give up, since the work is already queued or
1783 * being done, but we leave QUEUED set so the worker loops if necessary.
1784 */
1785static void queue_con(struct ceph_connection *con)
1786{
1787 if (test_bit(DEAD, &con->state)) {
1788 dout("queue_con %p ignoring: DEAD\n",
1789 con);
1790 return;
1791 }
1792
1793 if (!con->ops->get(con)) {
1794 dout("queue_con %p ref count 0\n", con);
1795 return;
1796 }
1797
1798 set_bit(QUEUED, &con->state);
1799 if (test_bit(BUSY, &con->state)) {
1800 dout("queue_con %p - already BUSY\n", con);
1801 con->ops->put(con);
1802 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1803 dout("queue_con %p - already queued\n", con);
1804 con->ops->put(con);
1805 } else {
1806 dout("queue_con %p\n", con);
1807 }
1808}
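/*
 * Illustrative interleaving of the QUEUED/BUSY protocol described
 * above (a sketch added for clarity, not part of the original file):
 *
 *	queue_con()                     con_work()
 *	set QUEUED
 *	queue_work() succeeds
 *	                                test_and_set(BUSY) -> was clear
 *	                                clear QUEUED, do work
 *	set QUEUED (new work arrives)
 *	BUSY is set -> give up
 *	                                clear BUSY
 *	                                QUEUED still set -> loop
 *
 * The recheck of QUEUED after clearing BUSY is what keeps the second
 * queue_con() call from being lost.
 */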
1809
1810/*
1811 * Do some work on a connection. Drop a connection ref when we're done.
1812 */
1813static void con_work(struct work_struct *work)
1814{
1815 struct ceph_connection *con = container_of(work, struct ceph_connection,
1816 work.work);
1817 int backoff = 0;
1818
1819more:
1820 if (test_and_set_bit(BUSY, &con->state) != 0) {
1821 dout("con_work %p BUSY already set\n", con);
1822 goto out;
1823 }
1824 dout("con_work %p start, clearing QUEUED\n", con);
1825 clear_bit(QUEUED, &con->state);
1826
1827 mutex_lock(&con->mutex);
1828
1829 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1830 dout("con_work CLOSED\n");
1831 con_close_socket(con);
1832 goto done;
1833 }
1834 if (test_and_clear_bit(OPENING, &con->state)) {
1835 /* reopen w/ new peer */
1836 dout("con_work OPENING\n");
1837 con_close_socket(con);
1838 }
1839
1840 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1841 try_read(con) < 0 ||
1842 try_write(con) < 0) {
1843 mutex_unlock(&con->mutex);
1844 backoff = 1;
1845 ceph_fault(con); /* error/fault path */
1846 goto done_unlocked;
1847 }
1848
1849done:
1850 mutex_unlock(&con->mutex);
1851
1852done_unlocked:
1853 clear_bit(BUSY, &con->state);
1854 dout("con->state=%lu\n", con->state);
1855 if (test_bit(QUEUED, &con->state)) {
1856 if (!backoff || test_bit(OPENING, &con->state)) {
1857 dout("con_work %p QUEUED reset, looping\n", con);
1858 goto more;
1859 }
1860 dout("con_work %p QUEUED reset, but just faulted\n", con);
1861 clear_bit(QUEUED, &con->state);
1862 }
1863 dout("con_work %p done\n", con);
1864
1865out:
1866 con->ops->put(con);
1867}
1868
1869
1870/*
1871 * Generic error/fault handler. A retry mechanism is used with
1872 * exponential backoff.
1873 */
1874static void ceph_fault(struct ceph_connection *con)
1875{
1876 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1877 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1878 dout("fault %p state %lu to peer %s\n",
1879 con, con->state, pr_addr(&con->peer_addr.in_addr));
1880
1881 if (test_bit(LOSSYTX, &con->state)) {
1882 dout("fault on LOSSYTX channel\n");
1883 goto out;
1884 }
1885
1886 mutex_lock(&con->mutex);
1887 if (test_bit(CLOSED, &con->state))
1888 goto out_unlock;
1889
1890 con_close_socket(con);
1891
1892 if (con->in_msg) {
1893 ceph_msg_put(con->in_msg);
1894 con->in_msg = NULL;
1895 }
1896
1897 /* Requeue anything that hasn't been acked */
1898 list_splice_init(&con->out_sent, &con->out_queue);
1899
1900 /* If there are no messages in the queue, place the connection
1901 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1902 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1903 dout("fault setting STANDBY\n");
1904 set_bit(STANDBY, &con->state);
1905 } else {
1906 /* retry after a delay. */
1907 if (con->delay == 0)
1908 con->delay = BASE_DELAY_INTERVAL;
1909 else if (con->delay < MAX_DELAY_INTERVAL)
1910 con->delay *= 2;
1911 dout("fault queueing %p delay %lu\n", con, con->delay);
1912 con->ops->get(con);
1913 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1914 round_jiffies_relative(con->delay)) == 0)
1915 con->ops->put(con);
1916 }
1917
1918out_unlock:
1919 mutex_unlock(&con->mutex);
1920out:
1921 /*
1922 * in case we faulted due to authentication, invalidate our
1923 * current tickets so that we can get new ones.
1924 */
1925 if (con->auth_retry && con->ops->invalidate_authorizer) {
1926 dout("calling invalidate_authorizer()\n");
1927 con->ops->invalidate_authorizer(con);
1928 }
1929
1930 if (con->ops->fault)
1931 con->ops->fault(con);
1932}
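/*
 * Worked example of the backoff above (a note added for clarity, not
 * in the original file): with BASE_DELAY_INTERVAL = HZ/2 and
 * MAX_DELAY_INTERVAL = 5*60*HZ (see messenger.h), successive faults
 * retry after roughly 0.5s, 1s, 2s, 4s, ...; the delay doubles on each
 * fault for as long as it is still below MAX_DELAY_INTERVAL, after
 * which it stops growing.
 */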
1933
1934
1935
1936/*
1937 * create a new messenger instance
1938 */
1939struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1940{
1941 struct ceph_messenger *msgr;
1942
1943 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1944 if (msgr == NULL)
1945 return ERR_PTR(-ENOMEM);
1946
1947 spin_lock_init(&msgr->global_seq_lock);
1948
1949 /* the zero page is needed if a request is "canceled" while the message
1950 * is being written over the socket */
1951 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1952 if (!msgr->zero_page) {
1953 kfree(msgr);
1954 return ERR_PTR(-ENOMEM);
1955 }
1956 kmap(msgr->zero_page);
1957
1958 if (myaddr)
1959 msgr->inst.addr = *myaddr;
1960
1961 /* select a random nonce */
1962 msgr->inst.addr.type = 0;
1963 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1964 encode_my_addr(msgr);
1965
1966 dout("messenger_create %p\n", msgr);
1967 return msgr;
1968}
1969
1970void ceph_messenger_destroy(struct ceph_messenger *msgr)
1971{
1972 dout("destroy %p\n", msgr);
1973 kunmap(msgr->zero_page);
1974 __free_page(msgr->zero_page);
1975 kfree(msgr);
1976 dout("destroyed messenger %p\n", msgr);
1977}
1978
1979/*
1980 * Queue up an outgoing message on the given connection.
1981 */
1982void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1983{
1984 if (test_bit(CLOSED, &con->state)) {
1985 dout("con_send %p closed, dropping %p\n", con, msg);
1986 ceph_msg_put(msg);
1987 return;
1988 }
1989
1990 /* set src+dst */
1991 msg->hdr.src = con->msgr->inst.name;
1992
1993 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1994
1995 msg->needs_out_seq = true;
1996
1997 /* queue */
1998 mutex_lock(&con->mutex);
1999 BUG_ON(!list_empty(&msg->list_head));
2000 list_add_tail(&msg->list_head, &con->out_queue);
2001 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
2002 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
2003 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
2004 le32_to_cpu(msg->hdr.front_len),
2005 le32_to_cpu(msg->hdr.middle_len),
2006 le32_to_cpu(msg->hdr.data_len));
2007 mutex_unlock(&con->mutex);
2008
2009 /* if there wasn't anything waiting to send before, queue
2010 * new work */
2011 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2012 queue_con(con);
2013}
2014
2015/*
2016 * Revoke a message that was previously queued for send
2017 */
2018void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2019{
2020 mutex_lock(&con->mutex);
2021 if (!list_empty(&msg->list_head)) {
2022 dout("con_revoke %p msg %p - was on queue\n", con, msg);
2023 list_del_init(&msg->list_head);
2024 ceph_msg_put(msg);
2025 msg->hdr.seq = 0;
2026 }
2027 if (con->out_msg == msg) {
2028 dout("con_revoke %p msg %p - was sending\n", con, msg);
2029 con->out_msg = NULL;
2030 if (con->out_kvec_is_msg) {
2031 con->out_skip = con->out_kvec_bytes;
2032 con->out_kvec_is_msg = false;
2033 }
2034 ceph_msg_put(msg);
2035 msg->hdr.seq = 0;
2036 }
2037 mutex_unlock(&con->mutex);
2038}
2039
2040/*
2041 * Revoke a message that we may be reading data into
2042 */
2043void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2044{
2045 mutex_lock(&con->mutex);
2046 if (con->in_msg && con->in_msg == msg) {
2047 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2048 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2049 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2050
2051 /* skip rest of message */
2052 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2053 con->in_base_pos = con->in_base_pos -
2054 sizeof(struct ceph_msg_header) -
2055 front_len -
2056 middle_len -
2057 data_len -
2058 sizeof(struct ceph_msg_footer);
2059 ceph_msg_put(con->in_msg);
2060 con->in_msg = NULL;
2061 con->in_tag = CEPH_MSGR_TAG_READY;
2062 con->in_seq++;
2063 } else {
2064 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2065 con, con->in_msg, msg);
2066 }
2067 mutex_unlock(&con->mutex);
2068}
2069
2070/*
2071 * Queue a keepalive byte to ensure the tcp connection is alive.
2072 */
2073void ceph_con_keepalive(struct ceph_connection *con)
2074{
2075 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2076 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2077 queue_con(con);
2078}
2079
2080
2081/*
2082 * Construct a new message with the given type and front (payload) length.
2083 * The new msg has a ref count of 1.
2084 */
2085struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2086{
2087 struct ceph_msg *m;
2088
2089 m = kmalloc(sizeof(*m), flags);
2090 if (m == NULL)
2091 goto out;
2092 kref_init(&m->kref);
2093 INIT_LIST_HEAD(&m->list_head);
2094
2095 m->hdr.tid = 0;
2096 m->hdr.type = cpu_to_le16(type);
2097 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2098 m->hdr.version = 0;
2099 m->hdr.front_len = cpu_to_le32(front_len);
2100 m->hdr.middle_len = 0;
2101 m->hdr.data_len = 0;
2102 m->hdr.data_off = 0;
2103 m->hdr.reserved = 0;
2104 m->footer.front_crc = 0;
2105 m->footer.middle_crc = 0;
2106 m->footer.data_crc = 0;
2107 m->footer.flags = 0;
2108 m->front_max = front_len;
2109 m->front_is_vmalloc = false;
2110 m->more_to_follow = false;
2111 m->pool = NULL;
2112
2113 /* front */
2114 if (front_len) {
2115 if (front_len > PAGE_CACHE_SIZE) {
2116 m->front.iov_base = __vmalloc(front_len, flags,
2117 PAGE_KERNEL);
2118 m->front_is_vmalloc = true;
2119 } else {
2120 m->front.iov_base = kmalloc(front_len, flags);
2121 }
2122 if (m->front.iov_base == NULL) {
2123 pr_err("msg_new can't allocate %d bytes\n",
2124 front_len);
2125 goto out2;
2126 }
2127 } else {
2128 m->front.iov_base = NULL;
2129 }
2130 m->front.iov_len = front_len;
2131
2132 /* middle */
2133 m->middle = NULL;
2134
2135 /* data */
2136 m->nr_pages = 0;
2137 m->pages = NULL;
2138 m->pagelist = NULL;
2139
2140 dout("ceph_msg_new %p front %d\n", m, front_len);
2141 return m;
2142
2143out2:
2144 ceph_msg_put(m);
2145out:
2146 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2147 return NULL;
2148}
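/*
 * Illustrative usage (a sketch, not part of the original file; "con"
 * is assumed to be an already-open connection and CEPH_MSG_PING is
 * used only as an example type):
 *
 *	struct ceph_msg *m;
 *
 *	m = ceph_msg_new(CEPH_MSG_PING, 0, GFP_NOFS);
 *	if (m)
 *		ceph_con_send(con, m);	(the send consumes our ref)
 */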
2149
2150/*
2151 * Allocate "middle" portion of a message, if it is needed and wasn't
2152 * allocated by alloc_msg. This allows us to read a small fixed-size
2153 * per-type header in the front and then gracefully fail (i.e.,
2154 * propagate the error to the caller based on info in the front) when
2155 * the middle is too large.
2156 */
2157static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2158{
2159 int type = le16_to_cpu(msg->hdr.type);
2160 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2161
2162 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2163 ceph_msg_type_name(type), middle_len);
2164 BUG_ON(!middle_len);
2165 BUG_ON(msg->middle);
2166
2167 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2168 if (!msg->middle)
2169 return -ENOMEM;
2170 return 0;
2171}
2172
2173/*
2174 * Generic message allocator, for incoming messages.
2175 */
2176static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2177 struct ceph_msg_header *hdr,
2178 int *skip)
2179{
2180 int type = le16_to_cpu(hdr->type);
2181 int front_len = le32_to_cpu(hdr->front_len);
2182 int middle_len = le32_to_cpu(hdr->middle_len);
2183 struct ceph_msg *msg = NULL;
2184 int ret;
2185
2186 if (con->ops->alloc_msg) {
2187 mutex_unlock(&con->mutex);
2188 msg = con->ops->alloc_msg(con, hdr, skip);
2189 mutex_lock(&con->mutex);
2190 if (!msg || *skip)
2191 return NULL;
2192 }
2193 if (!msg) {
2194 *skip = 0;
2195 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2196 if (!msg) {
2197 pr_err("unable to allocate msg type %d len %d\n",
2198 type, front_len);
2199 return NULL;
2200 }
2201 }
2202 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2203
2204 if (middle_len && !msg->middle) {
2205 ret = ceph_alloc_middle(con, msg);
2206 if (ret < 0) {
2207 ceph_msg_put(msg);
2208 return NULL;
2209 }
2210 }
2211
2212 return msg;
2213}
2214
2215
2216/*
2217 * Free a generically kmalloc'd message.
2218 */
2219void ceph_msg_kfree(struct ceph_msg *m)
2220{
2221 dout("msg_kfree %p\n", m);
2222 if (m->front_is_vmalloc)
2223 vfree(m->front.iov_base);
2224 else
2225 kfree(m->front.iov_base);
2226 kfree(m);
2227}
2228
2229/*
2230 * Drop a msg ref. Destroy as needed.
2231 */
2232void ceph_msg_last_put(struct kref *kref)
2233{
2234 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2235
2236 dout("ceph_msg_put last one on %p\n", m);
2237 WARN_ON(!list_empty(&m->list_head));
2238
2239 /* drop middle, data, if any */
2240 if (m->middle) {
2241 ceph_buffer_put(m->middle);
2242 m->middle = NULL;
2243 }
2244 m->nr_pages = 0;
2245 m->pages = NULL;
2246
2247 if (m->pagelist) {
2248 ceph_pagelist_release(m->pagelist);
2249 kfree(m->pagelist);
2250 m->pagelist = NULL;
2251 }
2252
2253 if (m->pool)
2254 ceph_msgpool_put(m->pool, m);
2255 else
2256 ceph_msg_kfree(m);
2257}
2258
2259void ceph_msg_dump(struct ceph_msg *msg)
2260{
2261 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2262 msg->front_max, msg->nr_pages);
2263 print_hex_dump(KERN_DEBUG, "header: ",
2264 DUMP_PREFIX_OFFSET, 16, 1,
2265 &msg->hdr, sizeof(msg->hdr), true);
2266 print_hex_dump(KERN_DEBUG, " front: ",
2267 DUMP_PREFIX_OFFSET, 16, 1,
2268 msg->front.iov_base, msg->front.iov_len, true);
2269 if (msg->middle)
2270 print_hex_dump(KERN_DEBUG, "middle: ",
2271 DUMP_PREFIX_OFFSET, 16, 1,
2272 msg->middle->vec.iov_base,
2273 msg->middle->vec.iov_len, true);
2274 print_hex_dump(KERN_DEBUG, "footer: ",
2275 DUMP_PREFIX_OFFSET, 16, 1,
2276 &msg->footer, sizeof(msg->footer), true);
2277}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
deleted file mode 100644
index 76fbc957bc13..000000000000
--- a/fs/ceph/messenger.h
+++ /dev/null
@@ -1,253 +0,0 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
43 /* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52/* use format string %s%lld */
53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
54
55struct ceph_messenger {
56 struct ceph_entity_inst inst; /* my name+address */
57 struct ceph_entity_addr my_enc_addr;
58 struct page *zero_page; /* used in certain error cases */
59
60 bool nocrc;
61
62 /*
63 * the global_seq counts connections we (attempt to) initiate
64 * in order to disambiguate certain connect race conditions.
65 */
66 u32 global_seq;
67 spinlock_t global_seq_lock;
68};
69
70/*
71 * a single message. it contains a header (src, dest, message type, etc.),
72 * footer (crc values, mainly), a "front" message body, and possibly a
73 * data payload (stored in some number of pages).
74 */
75struct ceph_msg {
76 struct ceph_msg_header hdr; /* header */
77 struct ceph_msg_footer footer; /* footer */
78 struct kvec front; /* unaligned blobs of message */
79 struct ceph_buffer *middle;
80 struct page **pages; /* data payload. NOT OWNER. */
81 unsigned nr_pages; /* size of page array */
82 struct ceph_pagelist *pagelist; /* instead of pages */
83 struct list_head list_head;
84 struct kref kref;
85 bool front_is_vmalloc;
86 bool more_to_follow;
87 bool needs_out_seq;
88 int front_max;
89
90 struct ceph_msgpool *pool;
91};
92
93struct ceph_msg_pos {
94 int page, page_pos; /* which page; offset in page */
95 int data_pos; /* offset in data payload */
96 int did_page_crc; /* true if we've calculated crc for current page */
97};
98
99/* ceph connection fault delay defaults, for exponential backoff */
100#define BASE_DELAY_INTERVAL (HZ/2)
101#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
102
103/*
104 * ceph_connection state bit flags
105 *
106 * QUEUED and BUSY are used together to ensure that only a single
107 * thread is currently opening, reading from, or writing to the socket.
108 */
109#define LOSSYTX 0 /* we can close channel or drop messages on errors */
110#define CONNECTING 1
111#define NEGOTIATING 2
112#define KEEPALIVE_PENDING 3
113#define WRITE_PENDING 4 /* we have data ready to send */
114#define QUEUED 5 /* there is work queued on this connection */
115#define BUSY 6 /* work is being done */
116#define STANDBY 8 /* no outgoing messages, socket closed. we keep
117 * the ceph_connection around to maintain shared
118 * state with the peer. */
119#define CLOSED 10 /* we've closed the connection */
120#define SOCK_CLOSED 11 /* socket state changed to closed */
121#define OPENING 13 /* open connection w/ (possibly new) peer */
122#define DEAD 14 /* dead, about to kfree */
123
124/*
125 * A single connection with another host.
126 *
127 * We maintain a queue of outgoing messages, and some session state to
128 * ensure that we can preserve the lossless, ordered delivery of
129 * messages in the case of a TCP disconnect.
130 */
131struct ceph_connection {
132 void *private;
133 atomic_t nref;
134
135 const struct ceph_connection_operations *ops;
136
137 struct ceph_messenger *msgr;
138 struct socket *sock;
139 unsigned long state; /* connection state (see flags above) */
140 const char *error_msg; /* error message, if any */
141
142 struct ceph_entity_addr peer_addr; /* peer address */
143 struct ceph_entity_name peer_name; /* peer name */
144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection, client */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 bool out_keepalive_pending;
161
162 u64 in_seq, in_seq_acked; /* last message received, acked */
163
164 /* connection negotiation temps */
165 char in_banner[CEPH_BANNER_MAX_LEN];
166 union {
167 struct { /* outgoing connection */
168 struct ceph_msg_connect out_connect;
169 struct ceph_msg_connect_reply in_reply;
170 };
171 struct { /* incoming */
172 struct ceph_msg_connect in_connect;
173 struct ceph_msg_connect_reply out_reply;
174 };
175 };
176 struct ceph_entity_addr actual_peer_addr;
177
178 /* message out temps */
179 struct ceph_msg *out_msg; /* sending message (== tail of
180 out_sent) */
181 bool out_msg_done;
182 struct ceph_msg_pos out_msg_pos;
183
184 struct kvec out_kvec[8], /* sending header/footer data */
185 *out_kvec_cur;
186 int out_kvec_left; /* kvec's left in out_kvec */
187 int out_skip; /* skip this many bytes */
188 int out_kvec_bytes; /* total bytes left */
189 bool out_kvec_is_msg; /* kvec refers to out_msg */
190 int out_more; /* there is more data after the kvecs */
191 __le64 out_temp_ack; /* for writing an ack */
192
193 /* message in temps */
194 struct ceph_msg_header in_hdr;
195 struct ceph_msg *in_msg;
196 struct ceph_msg_pos in_msg_pos;
197 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
198
199 char in_tag; /* protocol control byte */
200 int in_base_pos; /* bytes read */
201 __le64 in_temp_ack; /* for reading an ack */
202
203 struct delayed_work work; /* send|recv work */
204 unsigned long delay; /* current delay interval */
205};
206
207
208extern const char *pr_addr(const struct sockaddr_storage *ss);
209extern int ceph_parse_ips(const char *c, const char *end,
210 struct ceph_entity_addr *addr,
211 int max_count, int *count);
212
213
214extern int ceph_msgr_init(void);
215extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
227extern void ceph_con_close(struct ceph_connection *con);
228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke_message(struct ceph_connection *con,
231 struct ceph_msg *msg);
232extern void ceph_con_keepalive(struct ceph_connection *con);
233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
234extern void ceph_con_put(struct ceph_connection *con);
235
236extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
237extern void ceph_msg_kfree(struct ceph_msg *m);
238
239
240static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
241{
242 kref_get(&msg->kref);
243 return msg;
244}
245extern void ceph_msg_last_put(struct kref *kref);
246static inline void ceph_msg_put(struct ceph_msg *msg)
247{
248 kref_put(&msg->kref, ceph_msg_last_put);
249}
250
251extern void ceph_msg_dump(struct ceph_msg *msg);
252
253#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
deleted file mode 100644
index b2a5a3e4a671..000000000000
--- a/fs/ceph/mon_client.c
+++ /dev/null
@@ -1,1018 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/slab.h>
5#include <linux/random.h>
6#include <linux/sched.h>
7
8#include "mon_client.h"
9#include "super.h"
10#include "auth.h"
11#include "decode.h"
12
13/*
14 * Interact with Ceph monitor cluster. Handle requests for new map
15 * versions, and periodically resend as needed. Also implement
16 * statfs() and umount().
17 *
18 * A small cluster of Ceph "monitors" is responsible for managing critical
19 * cluster configuration and state information. An odd number (e.g., 3, 5)
20 * of cmon daemons use a modified version of the Paxos part-time parliament
21 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
22 * list of clients who have mounted the file system.
23 *
24 * We maintain an open, active session with a monitor at all times in order to
25 * receive timely MDSMap updates. We periodically send a keepalive byte on the
26 * TCP socket to ensure we detect a failure. If the connection does break, we
27 * randomly hunt for a new monitor. Once the connection is reestablished, we
28 * resend any outstanding requests.
29 */
30
31static const struct ceph_connection_operations mon_con_ops;
32
33static int __validate_auth(struct ceph_mon_client *monc);
34
35/*
36 * Decode a monmap blob (e.g., during mount).
37 */
38struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
39{
40 struct ceph_monmap *m = NULL;
41 int i, err = -EINVAL;
42 struct ceph_fsid fsid;
43 u32 epoch, num_mon;
44 u16 version;
45 u32 len;
46
47 ceph_decode_32_safe(&p, end, len, bad);
48 ceph_decode_need(&p, end, len, bad);
49
50 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
51
52 ceph_decode_16_safe(&p, end, version, bad);
53
54 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
55 ceph_decode_copy(&p, &fsid, sizeof(fsid));
56 epoch = ceph_decode_32(&p);
57
58 num_mon = ceph_decode_32(&p);
59 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
60
61 if (num_mon >= CEPH_MAX_MON)
62 goto bad;
63 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
64 if (m == NULL)
65 return ERR_PTR(-ENOMEM);
66 m->fsid = fsid;
67 m->epoch = epoch;
68 m->num_mon = num_mon;
69 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
70 for (i = 0; i < num_mon; i++)
71 ceph_decode_addr(&m->mon_inst[i].addr);
72
73 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
74 m->num_mon);
75 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m;
79
80bad:
81 dout("monmap_decode failed with %d\n", err);
82 kfree(m);
83 return ERR_PTR(err);
84}
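/*
 * For reference (reconstructed from the decoder above; this note is
 * not in the original file), the monmap blob is laid out as:
 *
 *	u32                      len      (payload length that follows)
 *	u16                      version
 *	struct ceph_fsid         fsid
 *	u32                      epoch
 *	u32                      num_mon
 *	struct ceph_entity_inst  mon_inst[num_mon]
 */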
85
86/*
87 * return true if *addr is included in the monmap.
88 */
89int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
90{
91 int i;
92
93 for (i = 0; i < m->num_mon; i++)
94 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
95 return 1;
96 return 0;
97}
98
99/*
100 * Send an auth request.
101 */
102static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103{
104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
108 ceph_msg_get(monc->m_auth); /* keep our ref */
109 ceph_con_send(monc->con, monc->m_auth);
110}
111
112/*
113 * Close monitor session, if any.
114 */
115static void __close_session(struct ceph_mon_client *monc)
116{
117 if (monc->con) {
118 dout("__close_session closing mon%d\n", monc->cur_mon);
119 ceph_con_revoke(monc->con, monc->m_auth);
120 ceph_con_close(monc->con);
121 monc->cur_mon = -1;
122 monc->pending_auth = 0;
123 ceph_auth_reset(monc->auth);
124 }
125}
126
127/*
128 * Open a session with a (new) monitor.
129 */
130static int __open_session(struct ceph_mon_client *monc)
131{
132 u8 r;
133 int ret;
134
135 if (monc->cur_mon < 0) {
136 get_random_bytes(&r, 1);
137 monc->cur_mon = r % monc->monmap->num_mon;
138 dout("open_session num=%d r=%d -> mon%d\n",
139 monc->monmap->num_mon, r, monc->cur_mon);
140 monc->sub_sent = 0;
141 monc->sub_renew_after = jiffies; /* i.e., expired */
142 monc->want_next_osdmap = !!monc->want_next_osdmap;
143
144 dout("open_session mon%d opening\n", monc->cur_mon);
145 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
146 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
147 ceph_con_open(monc->con,
148 &monc->monmap->mon_inst[monc->cur_mon].addr);
149
150 /* initiate authentication handshake */
151 ret = ceph_auth_build_hello(monc->auth,
152 monc->m_auth->front.iov_base,
153 monc->m_auth->front_max);
154 __send_prepared_auth_request(monc, ret);
155 } else {
156 dout("open_session mon%d already open\n", monc->cur_mon);
157 }
158 return 0;
159}
160
161static bool __sub_expired(struct ceph_mon_client *monc)
162{
163 return time_after_eq(jiffies, monc->sub_renew_after);
164}
165
166/*
167 * Reschedule delayed work timer.
168 */
169static void __schedule_delayed(struct ceph_mon_client *monc)
170{
171 unsigned delay;
172
173 if (monc->cur_mon < 0 || __sub_expired(monc))
174 delay = 10 * HZ;
175 else
176 delay = 20 * HZ;
177 dout("__schedule_delayed after %u\n", delay);
178 schedule_delayed_work(&monc->delayed_work, delay);
179}
180
181/*
182 * Send subscribe request for mdsmap and/or osdmap.
183 */
184static void __send_subscribe(struct ceph_mon_client *monc)
185{
186 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
187 (unsigned)monc->sub_sent, __sub_expired(monc),
188 monc->want_next_osdmap);
189 if ((__sub_expired(monc) && !monc->sub_sent) ||
190 monc->want_next_osdmap == 1) {
191 struct ceph_msg *msg = monc->m_subscribe;
192 struct ceph_mon_subscribe_item *i;
193 void *p, *end;
194
195 p = msg->front.iov_base;
196 end = p + msg->front_max;
197
198 dout("__send_subscribe to 'mdsmap' %u+\n",
199 (unsigned)monc->have_mdsmap);
200 if (monc->want_next_osdmap) {
201 dout("__send_subscribe to 'osdmap' %u\n",
202 (unsigned)monc->have_osdmap);
203 ceph_encode_32(&p, 3);
204 ceph_encode_string(&p, end, "osdmap", 6);
205 i = p;
206 i->have = cpu_to_le64(monc->have_osdmap);
207 i->onetime = 1;
208 p += sizeof(*i);
209 monc->want_next_osdmap = 2; /* requested */
210 } else {
211 ceph_encode_32(&p, 2);
212 }
213 ceph_encode_string(&p, end, "mdsmap", 6);
214 i = p;
215 i->have = cpu_to_le64(monc->have_mdsmap);
216 i->onetime = 0;
217 p += sizeof(*i);
218 ceph_encode_string(&p, end, "monmap", 6);
219 i = p;
220 i->have = 0;
221 i->onetime = 0;
222 p += sizeof(*i);
223
224 msg->front.iov_len = p - msg->front.iov_base;
225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
228
229 monc->sub_sent = jiffies | 1; /* never 0 */
230 }
231}
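/*
 * For reference (reconstructed from the encoding above; this note is
 * not in the original file), the subscribe payload is laid out as:
 *
 *	u32 num_entries    (3 if we want the next osdmap, else 2)
 *	string "osdmap" + struct ceph_mon_subscribe_item    (optional)
 *	string "mdsmap" + struct ceph_mon_subscribe_item
 *	string "monmap" + struct ceph_mon_subscribe_item
 *
 * where each item carries the epoch we already have and a "onetime"
 * flag saying whether the subscription should lapse after one update.
 */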
232
233static void handle_subscribe_ack(struct ceph_mon_client *monc,
234 struct ceph_msg *msg)
235{
236 unsigned seconds;
237 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
238
239 if (msg->front.iov_len < sizeof(*h))
240 goto bad;
241 seconds = le32_to_cpu(h->duration);
242
243 mutex_lock(&monc->mutex);
244 if (monc->hunting) {
245 pr_info("mon%d %s session established\n",
246 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
247 monc->hunting = false;
248 }
249 dout("handle_subscribe_ack after %d seconds\n", seconds);
250 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
251 monc->sub_sent = 0;
252 mutex_unlock(&monc->mutex);
253 return;
254bad:
255 pr_err("got corrupt subscribe-ack msg\n");
256 ceph_msg_dump(msg);
257}
258
259/*
260 * Keep track of which maps we have
261 */
262int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
263{
264 mutex_lock(&monc->mutex);
265 monc->have_mdsmap = got;
266 mutex_unlock(&monc->mutex);
267 return 0;
268}
269
270int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
271{
272 mutex_lock(&monc->mutex);
273 monc->have_osdmap = got;
274 monc->want_next_osdmap = 0;
275 mutex_unlock(&monc->mutex);
276 return 0;
277}
278
279/*
280 * Register interest in the next osdmap
281 */
282void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
283{
284 dout("request_next_osdmap have %u\n", monc->have_osdmap);
285 mutex_lock(&monc->mutex);
286 if (!monc->want_next_osdmap)
287 monc->want_next_osdmap = 1;
288 if (monc->want_next_osdmap < 2)
289 __send_subscribe(monc);
290 mutex_unlock(&monc->mutex);
291}
292
293/*
294 * Open a session with a monitor, allocating the connection if needed.
295 */
296int ceph_monc_open_session(struct ceph_mon_client *monc)
297{
298 if (!monc->con) {
299 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
300 if (!monc->con)
301 return -ENOMEM;
302 ceph_con_init(monc->client->msgr, monc->con);
303 monc->con->private = monc;
304 monc->con->ops = &mon_con_ops;
305 }
306
307 mutex_lock(&monc->mutex);
308 __open_session(monc);
309 __schedule_delayed(monc);
310 mutex_unlock(&monc->mutex);
311 return 0;
312}
313
314/*
315 * The monitor responds with a mount ack that indicates mount success. The
316 * included client ticket allows the client to talk to MDSs and OSDs.
317 */
318static void ceph_monc_handle_map(struct ceph_mon_client *monc,
319 struct ceph_msg *msg)
320{
321 struct ceph_client *client = monc->client;
322 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
323 void *p, *end;
324
325 mutex_lock(&monc->mutex);
326
327 dout("handle_monmap\n");
328 p = msg->front.iov_base;
329 end = p + msg->front.iov_len;
330
331 monmap = ceph_monmap_decode(p, end);
332 if (IS_ERR(monmap)) {
333 pr_err("problem decoding monmap, %d\n",
334 (int)PTR_ERR(monmap));
335 goto out;
336 }
337
338 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
339 kfree(monmap);
340 goto out;
341 }
342
343 client->monc.monmap = monmap;
344 kfree(old);
345
346out:
347 mutex_unlock(&monc->mutex);
348 wake_up_all(&client->auth_wq);
349}
350
351/*
352 * generic requests (e.g., statfs, poolop)
353 */
354static struct ceph_mon_generic_request *__lookup_generic_req(
355 struct ceph_mon_client *monc, u64 tid)
356{
357 struct ceph_mon_generic_request *req;
358 struct rb_node *n = monc->generic_request_tree.rb_node;
359
360 while (n) {
361 req = rb_entry(n, struct ceph_mon_generic_request, node);
362 if (tid < req->tid)
363 n = n->rb_left;
364 else if (tid > req->tid)
365 n = n->rb_right;
366 else
367 return req;
368 }
369 return NULL;
370}
371
372static void __insert_generic_request(struct ceph_mon_client *monc,
373 struct ceph_mon_generic_request *new)
374{
375 struct rb_node **p = &monc->generic_request_tree.rb_node;
376 struct rb_node *parent = NULL;
377 struct ceph_mon_generic_request *req = NULL;
378
379 while (*p) {
380 parent = *p;
381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
382 if (new->tid < req->tid)
383 p = &(*p)->rb_left;
384 else if (new->tid > req->tid)
385 p = &(*p)->rb_right;
386 else
387 BUG();
388 }
389
390 rb_link_node(&new->node, parent, p);
391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403
404 kfree(req);
405}
406
407static void put_generic_request(struct ceph_mon_generic_request *req)
408{
409 kref_put(&req->kref, release_generic_request);
410}
411
412static void get_generic_request(struct ceph_mon_generic_request *req)
413{
414 kref_get(&req->kref);
415}
416
417static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
418 struct ceph_msg_header *hdr,
419 int *skip)
420{
421 struct ceph_mon_client *monc = con->private;
422 struct ceph_mon_generic_request *req;
423 u64 tid = le64_to_cpu(hdr->tid);
424 struct ceph_msg *m;
425
426 mutex_lock(&monc->mutex);
427 req = __lookup_generic_req(monc, tid);
428 if (!req) {
429 dout("get_generic_reply %lld dne\n", tid);
430 *skip = 1;
431 m = NULL;
432 } else {
433 dout("get_generic_reply %lld got %p\n", tid, req->reply);
434 m = ceph_msg_get(req->reply);
435 /*
436 * we don't need to track the connection reading into
437 * this reply because we only have one open connection
438 * at a time, ever.
439 */
440 }
441 mutex_unlock(&monc->mutex);
442 return m;
443}
444
445static int do_generic_request(struct ceph_mon_client *monc,
446 struct ceph_mon_generic_request *req)
447{
448 int err;
449
450 /* register request */
451 mutex_lock(&monc->mutex);
452 req->tid = ++monc->last_tid;
453 req->request->hdr.tid = cpu_to_le64(req->tid);
454 __insert_generic_request(monc, req);
455 monc->num_generic_requests++;
456 ceph_con_send(monc->con, ceph_msg_get(req->request));
457 mutex_unlock(&monc->mutex);
458
459 err = wait_for_completion_interruptible(&req->completion);
460
461 mutex_lock(&monc->mutex);
462 rb_erase(&req->node, &monc->generic_request_tree);
463 monc->num_generic_requests--;
464 mutex_unlock(&monc->mutex);
465
466 if (!err)
467 err = req->result;
468 return err;
469}
470
471/*
472 * statfs
473 */
474static void handle_statfs_reply(struct ceph_mon_client *monc,
475 struct ceph_msg *msg)
476{
477 struct ceph_mon_generic_request *req;
478 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
479 u64 tid = le64_to_cpu(msg->hdr.tid);
480
481 if (msg->front.iov_len != sizeof(*reply))
482 goto bad;
483 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
484
485 mutex_lock(&monc->mutex);
486 req = __lookup_generic_req(monc, tid);
487 if (req) {
488 *(struct ceph_statfs *)req->buf = reply->st;
489 req->result = 0;
490 get_generic_request(req);
491 }
492 mutex_unlock(&monc->mutex);
493 if (req) {
494 complete_all(&req->completion);
495 put_generic_request(req);
496 }
497 return;
498
499bad:
500 pr_err("corrupt generic reply, tid %llu\n", tid);
501 ceph_msg_dump(msg);
502}
503
504/*
505 * Do a synchronous statfs().
506 */
507int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
508{
509 struct ceph_mon_generic_request *req;
510 struct ceph_mon_statfs *h;
511 int err;
512
513 req = kzalloc(sizeof(*req), GFP_NOFS);
514 if (!req)
515 return -ENOMEM;
516
517 kref_init(&req->kref);
518 req->buf = buf;
519 req->buf_len = sizeof(*buf);
520 init_completion(&req->completion);
521
522 err = -ENOMEM;
523 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
524 if (!req->request)
525 goto out;
526 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
527 if (!req->reply)
528 goto out;
529
530 /* fill out request */
531 h = req->request->front.iov_base;
532 h->monhdr.have_version = 0;
533 h->monhdr.session_mon = cpu_to_le16(-1);
534 h->monhdr.session_mon_tid = 0;
535 h->fsid = monc->monmap->fsid;
536
537 err = do_generic_request(monc, req);
538
539out:
540 kref_put(&req->kref, release_generic_request);
541 return err;
542}
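/*
 * Illustrative caller sketch (not part of the original file; field
 * names follow struct ceph_statfs in rados.h):
 *
 *	struct ceph_statfs st;
 *	int err;
 *
 *	err = ceph_monc_do_statfs(monc, &st);
 *	if (err == 0)
 *		report st.kb, st.kb_used, st.kb_avail, st.num_objects
 */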
543
544/*
545 * pool ops
546 */
547static int get_poolop_reply_buf(const char *src, size_t src_len,
548 char *dst, size_t dst_len)
549{
550 u32 buf_len;
551
552 if (src_len != sizeof(u32) + dst_len)
553 return -EINVAL;
554
555 buf_len = le32_to_cpu(*(u32 *)src);
556 if (buf_len != dst_len)
557 return -EINVAL;
558
559 memcpy(dst, src + sizeof(u32), dst_len);
560 return 0;
561}
562
563static void handle_poolop_reply(struct ceph_mon_client *monc,
564 struct ceph_msg *msg)
565{
566 struct ceph_mon_generic_request *req;
567 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
568 u64 tid = le64_to_cpu(msg->hdr.tid);
569
570 if (msg->front.iov_len < sizeof(*reply))
571 goto bad;
572 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
573
574 mutex_lock(&monc->mutex);
575 req = __lookup_generic_req(monc, tid);
576 if (req) {
577 if (req->buf_len &&
578 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
579 msg->front.iov_len - sizeof(*reply),
580 req->buf, req->buf_len) < 0) {
581 mutex_unlock(&monc->mutex);
582 goto bad;
583 }
584 req->result = le32_to_cpu(reply->reply_code);
585 get_generic_request(req);
586 }
587 mutex_unlock(&monc->mutex);
588 if (req) {
589 complete(&req->completion);
590 put_generic_request(req);
591 }
592 return;
593
594bad:
595 pr_err("corrupt generic reply, tid %llu\n", tid);
596 ceph_msg_dump(msg);
597}
598
599/*
600 * Do a synchronous pool op.
601 */
602int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
603 u32 pool, u64 snapid,
604 char *buf, int len)
605{
606 struct ceph_mon_generic_request *req;
607 struct ceph_mon_poolop *h;
608 int err;
609
610 req = kzalloc(sizeof(*req), GFP_NOFS);
611 if (!req)
612 return -ENOMEM;
613
614 kref_init(&req->kref);
615 req->buf = buf;
616 req->buf_len = len;
617 init_completion(&req->completion);
618
619 err = -ENOMEM;
620 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
621 if (!req->request)
622 goto out;
623 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
624 if (!req->reply)
625 goto out;
626
627 /* fill out request */
628 req->request->hdr.version = cpu_to_le16(2);
629 h = req->request->front.iov_base;
630 h->monhdr.have_version = 0;
631 h->monhdr.session_mon = cpu_to_le16(-1);
632 h->monhdr.session_mon_tid = 0;
633 h->fsid = monc->monmap->fsid;
634 h->pool = cpu_to_le32(pool);
635 h->op = cpu_to_le32(op);
636 h->auid = 0;
637 h->snapid = cpu_to_le64(snapid);
638 h->name_len = 0;
639
640 err = do_generic_request(monc, req);
641
642out:
643 kref_put(&req->kref, release_generic_request);
644 return err;
645}
646
647int ceph_monc_create_snapid(struct ceph_mon_client *monc,
648 u32 pool, u64 *snapid)
649{
650 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
651 pool, 0, (char *)snapid, sizeof(*snapid));
652
653}
654
655int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
656 u32 pool, u64 snapid)
657{
658 return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
659 pool, snapid, 0, 0);
660
661}
662
663/*
664 * Resend pending generic requests.
665 */
666static void __resend_generic_request(struct ceph_mon_client *monc)
667{
668 struct ceph_mon_generic_request *req;
669 struct rb_node *p;
670
671 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
672 req = rb_entry(p, struct ceph_mon_generic_request, node);
673 ceph_con_revoke(monc->con, req->request);
674 ceph_con_send(monc->con, ceph_msg_get(req->request));
675 }
676}
677
678/*
679 * Delayed work. If we haven't mounted yet, retry. Otherwise,
680 * renew/retry subscription as needed (in case it is timing out, or we
681 * got an ENOMEM). And keep the monitor connection alive.
682 */
683static void delayed_work(struct work_struct *work)
684{
685 struct ceph_mon_client *monc =
686 container_of(work, struct ceph_mon_client, delayed_work.work);
687
688 dout("monc delayed_work\n");
689 mutex_lock(&monc->mutex);
690 if (monc->hunting) {
691 __close_session(monc);
692 __open_session(monc); /* continue hunting */
693 } else {
694 ceph_con_keepalive(monc->con);
695
696 __validate_auth(monc);
697
698 if (monc->auth->ops->is_authenticated(monc->auth))
699 __send_subscribe(monc);
700 }
701 __schedule_delayed(monc);
702 mutex_unlock(&monc->mutex);
703}
704
705/*
706 * On startup, we build a temporary monmap populated with the IPs
707 * provided by mount(2).
708 */
709static int build_initial_monmap(struct ceph_mon_client *monc)
710{
711 struct ceph_mount_args *args = monc->client->mount_args;
712 struct ceph_entity_addr *mon_addr = args->mon_addr;
713 int num_mon = args->num_mon;
714 int i;
715
716 /* build initial monmap */
717 monc->monmap = kzalloc(sizeof(*monc->monmap) +
718 num_mon*sizeof(monc->monmap->mon_inst[0]),
719 GFP_KERNEL);
720 if (!monc->monmap)
721 return -ENOMEM;
722 for (i = 0; i < num_mon; i++) {
723 monc->monmap->mon_inst[i].addr = mon_addr[i];
724 monc->monmap->mon_inst[i].addr.nonce = 0;
725 monc->monmap->mon_inst[i].name.type =
726 CEPH_ENTITY_TYPE_MON;
727 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
728 }
729 monc->monmap->num_mon = num_mon;
730 monc->have_fsid = false;
731
732 /* release addr memory */
733 kfree(args->mon_addr);
734 args->mon_addr = NULL;
735 args->num_mon = 0;
736 return 0;
737}
738
739int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
740{
741 int err = 0;
742
743 dout("init\n");
744 memset(monc, 0, sizeof(*monc));
745 monc->client = cl;
746 monc->monmap = NULL;
747 mutex_init(&monc->mutex);
748
749 err = build_initial_monmap(monc);
750 if (err)
751 goto out;
752
753 monc->con = NULL;
754
755 /* authentication */
756 monc->auth = ceph_auth_init(cl->mount_args->name,
757 cl->mount_args->secret);
758 if (IS_ERR(monc->auth)) {
759 err = PTR_ERR(monc->auth);
760 goto out_monmap;
761 }
760 monc->auth->want_keys =
761 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
762 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
763
764 /* msgs */
765 err = -ENOMEM;
766 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
767 sizeof(struct ceph_mon_subscribe_ack),
768 GFP_NOFS);
769 if (!monc->m_subscribe_ack)
770 goto out_monmap;
771
772 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
773 if (!monc->m_subscribe)
774 goto out_subscribe_ack;
775
776 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
777 if (!monc->m_auth_reply)
778 goto out_subscribe;
779
780 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
781 monc->pending_auth = 0;
782 if (!monc->m_auth)
783 goto out_auth_reply;
784
785 monc->cur_mon = -1;
786 monc->hunting = true;
787 monc->sub_renew_after = jiffies;
788 monc->sub_sent = 0;
789
790 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
791 monc->generic_request_tree = RB_ROOT;
792 monc->num_generic_requests = 0;
793 monc->last_tid = 0;
794
795 monc->have_mdsmap = 0;
796 monc->have_osdmap = 0;
797 monc->want_next_osdmap = 1;
798 return 0;
799
800out_auth_reply:
801 ceph_msg_put(monc->m_auth_reply);
802out_subscribe:
803 ceph_msg_put(monc->m_subscribe);
804out_subscribe_ack:
805 ceph_msg_put(monc->m_subscribe_ack);
806out_monmap:
807 kfree(monc->monmap);
808out:
809 return err;
810}
811
812void ceph_monc_stop(struct ceph_mon_client *monc)
813{
814 dout("stop\n");
815 cancel_delayed_work_sync(&monc->delayed_work);
816
817 mutex_lock(&monc->mutex);
818 __close_session(monc);
819 if (monc->con) {
820 monc->con->private = NULL;
821 monc->con->ops->put(monc->con);
822 monc->con = NULL;
823 }
824 mutex_unlock(&monc->mutex);
825
826 ceph_auth_destroy(monc->auth);
827
828 ceph_msg_put(monc->m_auth);
829 ceph_msg_put(monc->m_auth_reply);
830 ceph_msg_put(monc->m_subscribe);
831 ceph_msg_put(monc->m_subscribe_ack);
832
833 kfree(monc->monmap);
834}
835
836static void handle_auth_reply(struct ceph_mon_client *monc,
837 struct ceph_msg *msg)
838{
839 int ret;
840 int was_auth = 0;
841
842 mutex_lock(&monc->mutex);
843 if (monc->auth->ops)
844 was_auth = monc->auth->ops->is_authenticated(monc->auth);
845 monc->pending_auth = 0;
846 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
847 msg->front.iov_len,
848 monc->m_auth->front.iov_base,
849 monc->m_auth->front_max);
850 if (ret < 0) {
851 monc->client->auth_err = ret;
852 wake_up_all(&monc->client->auth_wq);
853 } else if (ret > 0) {
854 __send_prepared_auth_request(monc, ret);
855 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
856 dout("authenticated, starting session\n");
857
858 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
859 monc->client->msgr->inst.name.num =
860 cpu_to_le64(monc->auth->global_id);
861
862 __send_subscribe(monc);
863 __resend_generic_request(monc);
864 }
865 mutex_unlock(&monc->mutex);
866}
867
868static int __validate_auth(struct ceph_mon_client *monc)
869{
870 int ret;
871
872 if (monc->pending_auth)
873 return 0;
874
875 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
876 monc->m_auth->front_max);
877 if (ret <= 0)
878 return ret; /* either an error, or no need to authenticate */
879 __send_prepared_auth_request(monc, ret);
880 return 0;
881}
882
883int ceph_monc_validate_auth(struct ceph_mon_client *monc)
884{
885 int ret;
886
887 mutex_lock(&monc->mutex);
888 ret = __validate_auth(monc);
889 mutex_unlock(&monc->mutex);
890 return ret;
891}
892
893/*
894 * handle incoming message
895 */
896static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
897{
898 struct ceph_mon_client *monc = con->private;
899 int type = le16_to_cpu(msg->hdr.type);
900
901 if (!monc)
902 return;
903
904 switch (type) {
905 case CEPH_MSG_AUTH_REPLY:
906 handle_auth_reply(monc, msg);
907 break;
908
909 case CEPH_MSG_MON_SUBSCRIBE_ACK:
910 handle_subscribe_ack(monc, msg);
911 break;
912
913 case CEPH_MSG_STATFS_REPLY:
914 handle_statfs_reply(monc, msg);
915 break;
916
917 case CEPH_MSG_POOLOP_REPLY:
918 handle_poolop_reply(monc, msg);
919 break;
920
921 case CEPH_MSG_MON_MAP:
922 ceph_monc_handle_map(monc, msg);
923 break;
924
925 case CEPH_MSG_MDS_MAP:
926 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
927 break;
928
929 case CEPH_MSG_OSD_MAP:
930 ceph_osdc_handle_map(&monc->client->osdc, msg);
931 break;
932
933 default:
934 pr_err("received unknown message type %d %s\n", type,
935 ceph_msg_type_name(type));
936 }
937 ceph_msg_put(msg);
938}
939
940/*
941 * Allocate memory for incoming message
942 */
943static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
944 struct ceph_msg_header *hdr,
945 int *skip)
946{
947 struct ceph_mon_client *monc = con->private;
948 int type = le16_to_cpu(hdr->type);
949 int front_len = le32_to_cpu(hdr->front_len);
950 struct ceph_msg *m = NULL;
951
952 *skip = 0;
953
954 switch (type) {
955 case CEPH_MSG_MON_SUBSCRIBE_ACK:
956 m = ceph_msg_get(monc->m_subscribe_ack);
957 break;
958 case CEPH_MSG_POOLOP_REPLY:
959 case CEPH_MSG_STATFS_REPLY:
960 return get_generic_reply(con, hdr, skip);
961 case CEPH_MSG_AUTH_REPLY:
962 m = ceph_msg_get(monc->m_auth_reply);
963 break;
964 case CEPH_MSG_MON_MAP:
965 case CEPH_MSG_MDS_MAP:
966 case CEPH_MSG_OSD_MAP:
967 m = ceph_msg_new(type, front_len, GFP_NOFS);
968 break;
969 }
970
971 if (!m) {
972 pr_info("alloc_msg unknown type %d\n", type);
973 *skip = 1;
974 }
975 return m;
976}
977
978/*
979 * If the monitor connection resets, pick a new monitor and resubmit
980 * any pending requests.
981 */
982static void mon_fault(struct ceph_connection *con)
983{
984 struct ceph_mon_client *monc = con->private;
985
986 if (!monc)
987 return;
988
989 dout("mon_fault\n");
990 mutex_lock(&monc->mutex);
991 if (!con->private)
992 goto out;
993
994 if (monc->con && !monc->hunting)
995 pr_info("mon%d %s session lost, "
996 "hunting for new mon\n", monc->cur_mon,
997 pr_addr(&monc->con->peer_addr.in_addr));
998
999 __close_session(monc);
1000 if (!monc->hunting) {
1001 /* start hunting */
1002 monc->hunting = true;
1003 __open_session(monc);
1004 } else {
1005 /* already hunting, let's wait a bit */
1006 __schedule_delayed(monc);
1007 }
1008out:
1009 mutex_unlock(&monc->mutex);
1010}
1011
1012static const struct ceph_connection_operations mon_con_ops = {
1013 .get = ceph_con_get,
1014 .put = ceph_con_put,
1015 .dispatch = dispatch,
1016 .fault = mon_fault,
1017 .alloc_msg = mon_alloc_msg,
1018};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
deleted file mode 100644
index 8e396f2c0963..000000000000
--- a/fs/ceph/mon_client.h
+++ /dev/null
@@ -1,121 +0,0 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/rbtree.h>
7
8#include "messenger.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_generic_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * ceph_mon_generic_request is used for the statfs and poolop requests,
44 * which are handled a bit differently because we need to get data back
45 * to the caller.
46 */
47struct ceph_mon_generic_request {
48 struct kref kref;
49 u64 tid;
50 struct rb_node node;
51 int result;
52 void *buf;
53 int buf_len;
54 struct completion completion;
55 struct ceph_msg *request; /* original request */
56 struct ceph_msg *reply; /* and reply */
57};
58
59struct ceph_mon_client {
60 struct ceph_client *client;
61 struct ceph_monmap *monmap;
62
63 struct mutex mutex;
64 struct delayed_work delayed_work;
65
66 struct ceph_auth_client *auth;
67 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
68 int pending_auth;
69
70 bool hunting;
71 int cur_mon; /* last monitor we contacted */
72 unsigned long sub_sent, sub_renew_after;
73 struct ceph_connection *con;
74 bool have_fsid;
75
76 /* pending generic requests */
77 struct rb_root generic_request_tree;
78 int num_generic_requests;
79 u64 last_tid;
80
81 /* mds/osd map */
82 int want_next_osdmap; /* 1 = want, 2 = want+asked */
83 u32 have_osdmap, have_mdsmap;
84
85#ifdef CONFIG_DEBUG_FS
86 struct dentry *debugfs_file;
87#endif
88};
89
90extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
91extern int ceph_monmap_contains(struct ceph_monmap *m,
92 struct ceph_entity_addr *addr);
93
94extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
95extern void ceph_monc_stop(struct ceph_mon_client *monc);
96
97/*
98 * The model here is to indicate that we need a new map of at least
99 * epoch @want, and also call in when we receive a map. We will
100 * periodically rerequest the map from the monitor cluster until we
101 * get what we want.
102 */
103extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
104extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
105
106extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
107
108extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
109 struct ceph_statfs *buf);
110
111extern int ceph_monc_open_session(struct ceph_mon_client *monc);
112
113extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
114
115extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
116 u32 pool, u64 *snapid);
117
118extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
119 u32 pool, u64 snapid);
120
121#endif
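/*
 * Illustrative sketch (not part of the kernel source): the map model
 * described in the comment above reduces to comparing the epoch we hold
 * against the epoch we want, and re-requesting until have >= want.  All
 * names below are hypothetical, userspace-only stand-ins:
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t have_osdmap, want_osdmap;

/* caller indicates it needs a map of at least epoch @want */
static void need_map(uint32_t want)
{
	if (want > want_osdmap)
		want_osdmap = want;
	if (have_osdmap < want_osdmap)
		printf("requesting osdmap >= %u (have %u)\n",
		       want_osdmap, have_osdmap);
}

/* called when a map of epoch @epoch arrives */
static void got_map(uint32_t epoch)
{
	have_osdmap = epoch;
	if (have_osdmap < want_osdmap)
		need_map(want_osdmap);	/* not there yet: rerequest */
}

int main(void)
{
	need_map(5);
	got_map(3);	/* still short; asks again */
	got_map(5);	/* satisfied */
	return 0;
}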
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
deleted file mode 100644
index dd65a6438131..000000000000
--- a/fs/ceph/msgpool.c
+++ /dev/null
@@ -1,64 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11{
12 struct ceph_msgpool *pool = arg;
13 void *p;
14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
20
21static void free_fn(void *element, void *arg)
22{
23 ceph_msg_put(element);
24}
25
26int ceph_msgpool_init(struct ceph_msgpool *pool,
27 int front_len, int size, bool blocking, const char *name)
28{
29 pool->front_len = front_len;
30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
31 if (!pool->pool)
32 return -ENOMEM;
33 pool->name = name;
34 return 0;
35}
36
37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
38{
39 mempool_destroy(pool->pool);
40}
41
42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
44{
45 if (front_len > pool->front_len) {
46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
47 pool->name, front_len, pool->front_len);
48 WARN_ON(1);
49
50 /* try to alloc a fresh message */
51 return ceph_msg_new(0, front_len, GFP_NOFS);
52 }
53
54 return mempool_alloc(pool->pool, GFP_NOFS);
55}
56
57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
58{
59 /* reset msg front_len; user may have changed it */
60 msg->front.iov_len = pool->front_len;
61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
62
63 kref_init(&msg->kref); /* retake single ref */
64}
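/*
 * Illustrative sketch (not part of the kernel source): a userspace
 * analogue of the pool-with-fallback pattern in ceph_msgpool_get() above.
 * Requests that fit are served from a preallocated buffer; oversized
 * requests warn and fall back to a fresh allocation.  The pool structure
 * and helpers here are invented for illustration only:
 */
#include <stdio.h>
#include <stdlib.h>

struct pool { size_t front_len; void *prealloc; int in_use; };

static void *pool_get(struct pool *p, size_t front_len)
{
	if (front_len > p->front_len) {
		fprintf(stderr, "need %zu, pool size is %zu\n",
			front_len, p->front_len);
		return malloc(front_len);	/* fresh allocation */
	}
	if (!p->in_use) {
		p->in_use = 1;
		return p->prealloc;		/* reuse preallocated buffer */
	}
	return malloc(front_len);
}

static void pool_put(struct pool *p, void *buf)
{
	if (buf == p->prealloc)
		p->in_use = 0;			/* back to the pool */
	else
		free(buf);
}

int main(void)
{
	struct pool p = { .front_len = 512, .prealloc = malloc(512) };
	void *a = pool_get(&p, 128);	/* served from the pool */
	void *b = pool_get(&p, 4096);	/* oversized: fallback path */
	pool_put(&p, a);
	pool_put(&p, b);
	free(p.prealloc);
	return 0;
}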
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
deleted file mode 100644
index a362605f9368..000000000000
--- a/fs/ceph/msgpool.h
+++ /dev/null
@@ -1,25 +0,0 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include <linux/mempool.h>
5#include "messenger.h"
6
7/*
8 * we use memory pools for preallocating messages we may receive, to
9 * avoid unexpected OOM conditions.
10 */
11struct ceph_msgpool {
12 const char *name;
13 mempool_t *pool;
14 int front_len; /* preallocated payload size */
15};
16
17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
18 int front_len, int size, bool blocking,
19 const char *name);
20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
22 int front_len);
23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
24
25#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
deleted file mode 100644
index 680d3d648cac..000000000000
--- a/fs/ceph/msgr.h
+++ /dev/null
@@ -1,175 +0,0 @@
1#ifndef CEPH_MSGR_H
2#define CEPH_MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
20 * tcp connection banner. include a protocol version. and adjust
21 * whenever the wire protocol changes. try to keep this string length
22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
30 * Comparator returns a negative, zero, or positive value.
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
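/*
 * Illustrative sketch (not part of the kernel source): why the signed
 * cast in ceph_seq_cmp() above makes the comparison rollover-safe -- the
 * signed difference stays small and correctly ordered even when the
 * counters wrap through 2^32.
 */
#include <assert.h>
#include <stdint.h>

static int32_t seq_cmp(uint32_t a, uint32_t b)
{
	return (int32_t)a - (int32_t)b;
}

int main(void)
{
	assert(seq_cmp(2, 1) > 0);			/* ordinary case */
	assert(seq_cmp(0x00000002, 0xfffffffe) > 0);	/* 2 is "after" a wrap */
	assert(seq_cmp(0xfffffffe, 0x00000002) < 0);
	assert(seq_cmp(7, 7) == 0);
	return 0;
}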
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_AUTH 0x20
54
55#define CEPH_ENTITY_TYPE_ANY 0xFF
56
57extern const char *ceph_entity_type_name(int type);
58
59/*
60 * entity_addr -- network address
61 */
62struct ceph_entity_addr {
63 __le32 type;
64 __le32 nonce; /* unique id for process (e.g. pid) */
65 struct sockaddr_storage in_addr;
66} __attribute__ ((packed));
67
68struct ceph_entity_inst {
69 struct ceph_entity_name name;
70 struct ceph_entity_addr addr;
71} __attribute__ ((packed));
72
73
74/* used by message exchange protocol */
75#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
76#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
77#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
78 incoming connection */
79#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
80 with higher cseq */
81#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
82 with higher gseq */
83#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
84#define CEPH_MSGR_TAG_MSG 7 /* message */
85#define CEPH_MSGR_TAG_ACK 8 /* message ack */
86#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
87#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
88#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
89#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
90
91
92/*
93 * connection negotiation
94 */
95struct ceph_msg_connect {
96 __le64 features; /* supported feature bits */
97 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
98 __le32 global_seq; /* count connections initiated by this host */
99 __le32 connect_seq; /* count connections initiated in this session */
100 __le32 protocol_version;
101 __le32 authorizer_protocol;
102 __le32 authorizer_len;
103 __u8 flags; /* CEPH_MSG_CONNECT_* */
104} __attribute__ ((packed));
105
106struct ceph_msg_connect_reply {
107 __u8 tag;
108 __le64 features; /* feature bits for this session */
109 __le32 global_seq;
110 __le32 connect_seq;
111 __le32 protocol_version;
112 __le32 authorizer_len;
113 __u8 flags;
114} __attribute__ ((packed));
115
116#define CEPH_MSG_CONNECT_LOSSY 1 /* messages I send may be safely dropped */
117
118
119/*
120 * message header
121 */
122struct ceph_msg_header_old {
123 __le64 seq; /* message seq# for this session */
124 __le64 tid; /* transaction id */
125 __le16 type; /* message type */
126 __le16 priority; /* priority. higher value == higher priority */
127 __le16 version; /* version of message encoding */
128
129 __le32 front_len; /* bytes in main payload */
130 __le32 middle_len;/* bytes in middle payload */
131 __le32 data_len; /* bytes of data payload */
132 __le16 data_off; /* sender: include full offset;
133 receiver: mask against ~PAGE_MASK */
134
135 struct ceph_entity_inst src, orig_src;
136 __le32 reserved;
137 __le32 crc; /* header crc32c */
138} __attribute__ ((packed));
139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
158#define CEPH_MSG_PRIO_LOW 64
159#define CEPH_MSG_PRIO_DEFAULT 127
160#define CEPH_MSG_PRIO_HIGH 196
161#define CEPH_MSG_PRIO_HIGHEST 255
162
163/*
164 * follows data payload
165 */
166struct ceph_msg_footer {
167 __le32 front_crc, middle_crc, data_crc;
168 __u8 flags;
169} __attribute__ ((packed));
170
171#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
172#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
173
174
175#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
deleted file mode 100644
index dfced1dacbcd..000000000000
--- a/fs/ceph/osd_client.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
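/*
 * Illustrative sketch (not part of the kernel source): the object name
 * format and the page count used by calc_layout() above.  pages_for()
 * below reproduces what calc_pages_for() is assumed to compute -- the
 * number of pages spanned by the byte range [off, off+len) -- for a
 * fixed 4K page size.
 */
#include <stdint.h>
#include <stdio.h>

#define MY_PAGE_SHIFT 12
#define MY_PAGE_SIZE  (1UL << MY_PAGE_SHIFT)

static unsigned pages_for(uint64_t off, uint64_t len)
{
	/* last page index (exclusive) minus first page index */
	return ((off + len + MY_PAGE_SIZE - 1) >> MY_PAGE_SHIFT) -
	       (off >> MY_PAGE_SHIFT);
}

int main(void)
{
	char oid[40];
	uint64_t ino = 0x10000001234ULL, bno = 7;

	/* "<inode hex>.<object/block number, zero-padded hex>" */
	snprintf(oid, sizeof(oid), "%llx.%08llx",
		 (unsigned long long)ino, (unsigned long long)bno);
	printf("oid %s, %u pages for 4096~8192\n",
	       oid, pages_for(4096, 8192));	/* 2 pages */
	return 0;
}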
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return NULL;
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (!msg) {
169 ceph_osdc_put_request(req);
170 return NULL;
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (!msg) {
183 ceph_osdc_put_request(req);
184 return NULL;
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 if (!n->rb_left)
293 return req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return NULL;
302}
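/*
 * Illustrative sketch (not part of the kernel source): the intent of
 * __lookup_request_ge() above is to find the pending request with the
 * smallest tid >= the one asked for, which is how ceph_osdc_sync() later
 * iterates the tree while dropping the mutex between steps.  The same
 * semantics over a plain sorted array:
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static const uint64_t *lookup_ge(const uint64_t *tids, size_t n,
				 uint64_t want)
{
	size_t i;

	for (i = 0; i < n; i++)		/* tids[] sorted ascending */
		if (tids[i] >= want)
			return &tids[i];
	return NULL;			/* nothing at or above want */
}

int main(void)
{
	uint64_t tids[] = { 3, 7, 10 };

	assert(*lookup_ge(tids, 3, 7) == 7);	/* exact match */
	assert(*lookup_ge(tids, 3, 8) == 10);	/* next greater */
	assert(lookup_ge(tids, 3, 11) == NULL);
	return 0;
}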
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref)) {
365 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
366
367 if (osd->o_authorizer)
368 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
369 kfree(osd);
370 }
371}
372
373/*
374 * remove an osd from our map
375 */
376static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
377{
378 dout("__remove_osd %p\n", osd);
379 BUG_ON(!list_empty(&osd->o_requests));
380 rb_erase(&osd->o_node, &osdc->osds);
381 list_del_init(&osd->o_osd_lru);
382 ceph_con_close(&osd->o_con);
383 put_osd(osd);
384}
385
386static void __move_osd_to_lru(struct ceph_osd_client *osdc,
387 struct ceph_osd *osd)
388{
389 dout("__move_osd_to_lru %p\n", osd);
390 BUG_ON(!list_empty(&osd->o_osd_lru));
391 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
392 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
393}
394
395static void __remove_osd_from_lru(struct ceph_osd *osd)
396{
397 dout("__remove_osd_from_lru %p\n", osd);
398 if (!list_empty(&osd->o_osd_lru))
399 list_del_init(&osd->o_osd_lru);
400}
401
402static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
403{
404 struct ceph_osd *osd, *nosd;
405
406 dout("__remove_old_osds %p\n", osdc);
407 mutex_lock(&osdc->request_mutex);
408 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
409 if (!remove_all && time_before(jiffies, osd->lru_ttl))
410 break;
411 __remove_osd(osdc, osd);
412 }
413 mutex_unlock(&osdc->request_mutex);
414}
415
416/*
417 * reset osd connect
418 */
419static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
420{
421 struct ceph_osd_request *req;
422 int ret = 0;
423
424 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
425 if (list_empty(&osd->o_requests)) {
426 __remove_osd(osdc, osd);
427 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
428 &osd->o_con.peer_addr,
429 sizeof(osd->o_con.peer_addr)) == 0 &&
430 !ceph_con_opened(&osd->o_con)) {
431 dout(" osd addr hasn't changed and connection never opened,"
432 " letting msgr retry");
433 /* touch each r_stamp for handle_timeout()'s benefit */
434 list_for_each_entry(req, &osd->o_requests, r_osd_item)
435 req->r_stamp = jiffies;
436 ret = -EAGAIN;
437 } else {
438 ceph_con_close(&osd->o_con);
439 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
440 osd->o_incarnation++;
441 }
442 return ret;
443}
444
445static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
446{
447 struct rb_node **p = &osdc->osds.rb_node;
448 struct rb_node *parent = NULL;
449 struct ceph_osd *osd = NULL;
450
451 while (*p) {
452 parent = *p;
453 osd = rb_entry(parent, struct ceph_osd, o_node);
454 if (new->o_osd < osd->o_osd)
455 p = &(*p)->rb_left;
456 else if (new->o_osd > osd->o_osd)
457 p = &(*p)->rb_right;
458 else
459 BUG();
460 }
461
462 rb_link_node(&new->o_node, parent, p);
463 rb_insert_color(&new->o_node, &osdc->osds);
464}
465
466static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
467{
468 struct ceph_osd *osd;
469 struct rb_node *n = osdc->osds.rb_node;
470
471 while (n) {
472 osd = rb_entry(n, struct ceph_osd, o_node);
473 if (o < osd->o_osd)
474 n = n->rb_left;
475 else if (o > osd->o_osd)
476 n = n->rb_right;
477 else
478 return osd;
479 }
480 return NULL;
481}
482
483static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
484{
485 schedule_delayed_work(&osdc->timeout_work,
486 osdc->client->mount_args->osd_keepalive_timeout * HZ);
487}
488
489static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
490{
491 cancel_delayed_work(&osdc->timeout_work);
492}
493
494/*
495 * Register request, assign tid. If this is the first request, set up
496 * the timeout event.
497 */
498static void register_request(struct ceph_osd_client *osdc,
499 struct ceph_osd_request *req)
500{
501 mutex_lock(&osdc->request_mutex);
502 req->r_tid = ++osdc->last_tid;
503 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
504 INIT_LIST_HEAD(&req->r_req_lru_item);
505
506 dout("register_request %p tid %lld\n", req, req->r_tid);
507 __insert_request(osdc, req);
508 ceph_osdc_get_request(req);
509 osdc->num_requests++;
510
511 if (osdc->num_requests == 1) {
512 dout(" first request, scheduling timeout\n");
513 __schedule_osd_timeout(osdc);
514 }
515 mutex_unlock(&osdc->request_mutex);
516}
517
518/*
519 * called under osdc->request_mutex
520 */
521static void __unregister_request(struct ceph_osd_client *osdc,
522 struct ceph_osd_request *req)
523{
524 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
525 rb_erase(&req->r_node, &osdc->requests);
526 osdc->num_requests--;
527
528 if (req->r_osd) {
529 /* make sure the original request isn't in flight. */
530 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
531
532 list_del_init(&req->r_osd_item);
533 if (list_empty(&req->r_osd->o_requests))
534 __move_osd_to_lru(osdc, req->r_osd);
535 req->r_osd = NULL;
536 }
537
538 ceph_osdc_put_request(req);
539
540 list_del_init(&req->r_req_lru_item);
541 if (osdc->num_requests == 0) {
542 dout(" no requests, canceling timeout\n");
543 __cancel_osd_timeout(osdc);
544 }
545}
546
547/*
548 * Cancel a previously queued request message
549 */
550static void __cancel_request(struct ceph_osd_request *req)
551{
552 if (req->r_sent) {
553 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
554 req->r_sent = 0;
555 }
556 list_del_init(&req->r_req_lru_item);
557}
558
559/*
560 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
561 * (as needed), and set the request r_osd appropriately. If there is
562 * no up osd, set r_osd to NULL.
563 *
564 * Return 0 if unchanged, 1 if changed, or negative on error.
565 *
566 * Caller should hold map_sem for read and request_mutex.
567 */
568static int __map_osds(struct ceph_osd_client *osdc,
569 struct ceph_osd_request *req)
570{
571 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
572 struct ceph_pg pgid;
573 int acting[CEPH_PG_MAX_SIZE];
574 int o = -1, num = 0;
575 int err;
576
577 dout("map_osds %p tid %lld\n", req, req->r_tid);
578 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
579 &req->r_file_layout, osdc->osdmap);
580 if (err)
581 return err;
582 pgid = reqhead->layout.ol_pgid;
583 req->r_pgid = pgid;
584
585 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
586 if (err > 0) {
587 o = acting[0];
588 num = err;
589 }
590
591 if ((req->r_osd && req->r_osd->o_osd == o &&
592 req->r_sent >= req->r_osd->o_incarnation &&
593 req->r_num_pg_osds == num &&
594 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
595 (req->r_osd == NULL && o == -1))
596 return 0; /* no change */
597
598 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
599 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
600 req->r_osd ? req->r_osd->o_osd : -1);
601
602 /* record full pg acting set */
603 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
604 req->r_num_pg_osds = num;
605
606 if (req->r_osd) {
607 __cancel_request(req);
608 list_del_init(&req->r_osd_item);
609 req->r_osd = NULL;
610 }
611
612 req->r_osd = __lookup_osd(osdc, o);
613 if (!req->r_osd && o >= 0) {
614 err = -ENOMEM;
615 req->r_osd = create_osd(osdc);
616 if (!req->r_osd)
617 goto out;
618
619 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
620 req->r_osd->o_osd = o;
621 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
622 __insert_osd(osdc, req->r_osd);
623
624 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
625 }
626
627 if (req->r_osd) {
628 __remove_osd_from_lru(req->r_osd);
629 list_add(&req->r_osd_item, &req->r_osd->o_requests);
630 }
631 err = 1; /* osd or pg changed */
632
633out:
634 return err;
635}
636
637/*
638 * caller should hold map_sem (for read) and request_mutex
639 */
640static int __send_request(struct ceph_osd_client *osdc,
641 struct ceph_osd_request *req)
642{
643 struct ceph_osd_request_head *reqhead;
644 int err;
645
646 err = __map_osds(osdc, req);
647 if (err < 0)
648 return err;
649 if (req->r_osd == NULL) {
650 dout("send_request %p no up osds in pg\n", req);
651 ceph_monc_request_next_osdmap(&osdc->client->monc);
652 return 0;
653 }
654
655 dout("send_request %p tid %llu to osd%d flags %d\n",
656 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
657
658 reqhead = req->r_request->front.iov_base;
659 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
660 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
661 reqhead->reassert_version = req->r_reassert_version;
662
663 req->r_stamp = jiffies;
664 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
665
666 ceph_msg_get(req->r_request); /* send consumes a ref */
667 ceph_con_send(&req->r_osd->o_con, req->r_request);
668 req->r_sent = req->r_osd->o_incarnation;
669 return 0;
670}
671
672/*
673 * Timeout callback, called every N seconds when 1 or more osd
674 * requests have been active for more than N seconds. When this
675 * happens, we ping all OSDs whose requests have timed out to
676 * ensure any communications channel reset is detected. Reset the
677 * request timeouts another N seconds in the future as we go.
678 * Reschedule the timeout event another N seconds in future (unless
679 * there are no open requests).
680 */
681static void handle_timeout(struct work_struct *work)
682{
683 struct ceph_osd_client *osdc =
684 container_of(work, struct ceph_osd_client, timeout_work.work);
685 struct ceph_osd_request *req, *last_req = NULL;
686 struct ceph_osd *osd;
687 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
688 unsigned long keepalive =
689 osdc->client->mount_args->osd_keepalive_timeout * HZ;
690 unsigned long last_stamp = 0;
691 struct rb_node *p;
692 struct list_head slow_osds;
693
694 dout("timeout\n");
695 down_read(&osdc->map_sem);
696
697 ceph_monc_request_next_osdmap(&osdc->client->monc);
698
699 mutex_lock(&osdc->request_mutex);
700 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
701 req = rb_entry(p, struct ceph_osd_request, r_node);
702
703 if (req->r_resend) {
704 int err;
705
706 dout("osdc resending prev failed %lld\n", req->r_tid);
707 err = __send_request(osdc, req);
708 if (err)
709 dout("osdc failed again on %lld\n", req->r_tid);
710 else
711 req->r_resend = false;
712 continue;
713 }
714 }
715
716 /*
717 * reset osds that appear to be _really_ unresponsive. this
718 * is a failsafe measure. we really shouldn't be getting to
719 * this point if the system is working properly. the monitors
720 * should mark the osd as failed and we should find out about
721 * it from an updated osd map.
722 */
723 while (timeout && !list_empty(&osdc->req_lru)) {
724 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
725 r_req_lru_item);
726
727 if (time_before(jiffies, req->r_stamp + timeout))
728 break;
729
730 BUG_ON(req == last_req && req->r_stamp == last_stamp);
731 last_req = req;
732 last_stamp = req->r_stamp;
733
734 osd = req->r_osd;
735 BUG_ON(!osd);
736 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
737 req->r_tid, osd->o_osd);
738 __kick_requests(osdc, osd);
739 }
740
741 /*
742 * ping osds that are a bit slow. this ensures that if there
743 * is a break in the TCP connection we will notice, and reopen
744 * a connection with that osd (from the fault callback).
745 */
746 INIT_LIST_HEAD(&slow_osds);
747 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
748 if (time_before(jiffies, req->r_stamp + keepalive))
749 break;
750
751 osd = req->r_osd;
752 BUG_ON(!osd);
753 dout(" tid %llu is slow, will send keepalive on osd%d\n",
754 req->r_tid, osd->o_osd);
755 list_move_tail(&osd->o_keepalive_item, &slow_osds);
756 }
757 while (!list_empty(&slow_osds)) {
758 osd = list_entry(slow_osds.next, struct ceph_osd,
759 o_keepalive_item);
760 list_del_init(&osd->o_keepalive_item);
761 ceph_con_keepalive(&osd->o_con);
762 }
763
764 __schedule_osd_timeout(osdc);
765 mutex_unlock(&osdc->request_mutex);
766
767 up_read(&osdc->map_sem);
768}
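/*
 * Illustrative sketch (not part of the kernel source): handle_timeout()
 * above scans the send-time-ordered request LRU against two cutoffs --
 * requests older than the hard timeout get their osd reset, while merely
 * "slow" ones just earn a keepalive.  The same two-threshold scan over
 * plain integer timestamps:
 */
#include <stdio.h>

int main(void)
{
	long now = 100, timeout = 60, keepalive = 10;
	long stamps[] = { 30, 85, 95, 99 };	/* oldest first, like req_lru */
	int i;

	for (i = 0; i < 4; i++) {
		if (now - stamps[i] > timeout)
			printf("stamp %ld: reset osd\n", stamps[i]);
		else if (now - stamps[i] > keepalive)
			printf("stamp %ld: send keepalive\n", stamps[i]);
		else
			break;	/* list is ordered; the rest are fresh */
	}
	return 0;
}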
769
770static void handle_osds_timeout(struct work_struct *work)
771{
772 struct ceph_osd_client *osdc =
773 container_of(work, struct ceph_osd_client,
774 osds_timeout_work.work);
775 unsigned long delay =
776 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
777
778 dout("osds timeout\n");
779 down_read(&osdc->map_sem);
780 remove_old_osds(osdc, 0);
781 up_read(&osdc->map_sem);
782
783 schedule_delayed_work(&osdc->osds_timeout_work,
784 round_jiffies_relative(delay));
785}
786
787/*
788 * handle osd op reply. either call the callback if it is specified,
789 * or do the completion to wake up the waiting thread.
790 */
791static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
792 struct ceph_connection *con)
793{
794 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
795 struct ceph_osd_request *req;
796 u64 tid;
797 int numops, object_len, flags;
798 s32 result;
799
800 tid = le64_to_cpu(msg->hdr.tid);
801 if (msg->front.iov_len < sizeof(*rhead))
802 goto bad;
803 numops = le32_to_cpu(rhead->num_ops);
804 object_len = le32_to_cpu(rhead->object_len);
805 result = le32_to_cpu(rhead->result);
806 if (msg->front.iov_len != sizeof(*rhead) + object_len +
807 numops * sizeof(struct ceph_osd_op))
808 goto bad;
809 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
810
811 /* lookup */
812 mutex_lock(&osdc->request_mutex);
813 req = __lookup_request(osdc, tid);
814 if (req == NULL) {
815 dout("handle_reply tid %llu dne\n", tid);
816 mutex_unlock(&osdc->request_mutex);
817 return;
818 }
819 ceph_osdc_get_request(req);
820 flags = le32_to_cpu(rhead->flags);
821
822 /*
823 * if this connection filled our message, drop our reference now, to
824 * avoid a (safe but slower) revoke later.
825 */
826 if (req->r_con_filling_msg == con && req->r_reply == msg) {
827 dout(" dropping con_filling_msg ref %p\n", con);
828 req->r_con_filling_msg = NULL;
829 ceph_con_put(con);
830 }
831
832 if (!req->r_got_reply) {
833 unsigned bytes;
834
835 req->r_result = le32_to_cpu(rhead->result);
836 bytes = le32_to_cpu(msg->hdr.data_len);
837 dout("handle_reply result %d bytes %d\n", req->r_result,
838 bytes);
839 if (req->r_result == 0)
840 req->r_result = bytes;
841
842 /* in case this is a write and we need to replay, */
843 req->r_reassert_version = rhead->reassert_version;
844
845 req->r_got_reply = 1;
846 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
847 dout("handle_reply tid %llu dup ack\n", tid);
848 mutex_unlock(&osdc->request_mutex);
849 goto done;
850 }
851
852 dout("handle_reply tid %llu flags %d\n", tid, flags);
853
854 /* either this is a read, or we got the safe response */
855 if (result < 0 ||
856 (flags & CEPH_OSD_FLAG_ONDISK) ||
857 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
858 __unregister_request(osdc, req);
859
860 mutex_unlock(&osdc->request_mutex);
861
862 if (req->r_callback)
863 req->r_callback(req, msg);
864 else
865 complete_all(&req->r_completion);
866
867 if (flags & CEPH_OSD_FLAG_ONDISK) {
868 if (req->r_safe_callback)
869 req->r_safe_callback(req, msg);
870 complete_all(&req->r_safe_completion); /* fsync waiter */
871 }
872
873done:
874 ceph_osdc_put_request(req);
875 return;
876
877bad:
878 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
879 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
880 (int)sizeof(*rhead));
881 ceph_msg_dump(msg);
882}
883
884
885static int __kick_requests(struct ceph_osd_client *osdc,
886 struct ceph_osd *kickosd)
887{
888 struct ceph_osd_request *req;
889 struct rb_node *p, *n;
890 int needmap = 0;
891 int err;
892
893 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
894 if (kickosd) {
895 err = __reset_osd(osdc, kickosd);
896 if (err == -EAGAIN)
897 return 1;
898 } else {
899 for (p = rb_first(&osdc->osds); p; p = n) {
900 struct ceph_osd *osd =
901 rb_entry(p, struct ceph_osd, o_node);
902
903 n = rb_next(p);
904 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
905 memcmp(&osd->o_con.peer_addr,
906 ceph_osd_addr(osdc->osdmap,
907 osd->o_osd),
908 sizeof(struct ceph_entity_addr)) != 0)
909 __reset_osd(osdc, osd);
910 }
911 }
912
913 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
914 req = rb_entry(p, struct ceph_osd_request, r_node);
915
916 if (req->r_resend) {
917 dout(" r_resend set on tid %llu\n", req->r_tid);
918 __cancel_request(req);
919 goto kick;
920 }
921 if (req->r_osd && kickosd == req->r_osd) {
922 __cancel_request(req);
923 goto kick;
924 }
925
926 err = __map_osds(osdc, req);
927 if (err == 0)
928 continue; /* no change */
929 if (err < 0) {
930 /*
931 * FIXME: really, we should set the request
932 * error and fail if this isn't a 'nofail'
933 * request, but that's a fair bit more
934 * complicated to do. So retry!
935 */
936 dout(" setting r_resend on %llu\n", req->r_tid);
937 req->r_resend = true;
938 continue;
939 }
940 if (req->r_osd == NULL) {
941 dout("tid %llu maps to no valid osd\n", req->r_tid);
942 needmap++; /* request a newer map */
943 continue;
944 }
945
946kick:
947 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
948 req->r_osd ? req->r_osd->o_osd : -1);
949 req->r_flags |= CEPH_OSD_FLAG_RETRY;
950 err = __send_request(osdc, req);
951 if (err) {
952 dout(" setting r_resend on %llu\n", req->r_tid);
953 req->r_resend = true;
954 }
955 }
956
957 return needmap;
958}
959
960/*
961 * Resubmit osd requests whose osd or osd address has changed. Request
962 * a new osd map if osds are down, or we are otherwise unable to determine
963 * how to direct a request.
964 *
965 * Close connections to down osds.
966 *
967 * If @who is specified, resubmit requests for that specific osd.
968 *
969 * Caller should hold map_sem for read and request_mutex.
970 */
971static void kick_requests(struct ceph_osd_client *osdc,
972 struct ceph_osd *kickosd)
973{
974 int needmap;
975
976 mutex_lock(&osdc->request_mutex);
977 needmap = __kick_requests(osdc, kickosd);
978 mutex_unlock(&osdc->request_mutex);
979
980 if (needmap) {
981 dout("%d requests for down osds, need new map\n", needmap);
982 ceph_monc_request_next_osdmap(&osdc->client->monc);
983 }
984
985}
986/*
987 * Process updated osd map.
988 *
989 * The message contains any number of incremental and full maps, normally
990 * indicating some sort of topology change in the cluster. Kick requests
991 * off to different OSDs as needed.
992 */
993void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
994{
995 void *p, *end, *next;
996 u32 nr_maps, maplen;
997 u32 epoch;
998 struct ceph_osdmap *newmap = NULL, *oldmap;
999 int err;
1000 struct ceph_fsid fsid;
1001
1002 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
1003 p = msg->front.iov_base;
1004 end = p + msg->front.iov_len;
1005
1006 /* verify fsid */
1007 ceph_decode_need(&p, end, sizeof(fsid), bad);
1008 ceph_decode_copy(&p, &fsid, sizeof(fsid));
1009 if (ceph_check_fsid(osdc->client, &fsid) < 0)
1010 return;
1011
1012 down_write(&osdc->map_sem);
1013
1014 /* incremental maps */
1015 ceph_decode_32_safe(&p, end, nr_maps, bad);
1016 dout(" %d inc maps\n", nr_maps);
1017 while (nr_maps > 0) {
1018 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1019 epoch = ceph_decode_32(&p);
1020 maplen = ceph_decode_32(&p);
1021 ceph_decode_need(&p, end, maplen, bad);
1022 next = p + maplen;
1023 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1024 dout("applying incremental map %u len %d\n",
1025 epoch, maplen);
1026 newmap = osdmap_apply_incremental(&p, next,
1027 osdc->osdmap,
1028 osdc->client->msgr);
1029 if (IS_ERR(newmap)) {
1030 err = PTR_ERR(newmap);
1031 goto bad;
1032 }
1033 BUG_ON(!newmap);
1034 if (newmap != osdc->osdmap) {
1035 ceph_osdmap_destroy(osdc->osdmap);
1036 osdc->osdmap = newmap;
1037 }
1038 } else {
1039 dout("ignoring incremental map %u len %d\n",
1040 epoch, maplen);
1041 }
1042 p = next;
1043 nr_maps--;
1044 }
1045 if (newmap)
1046 goto done;
1047
1048 /* full maps */
1049 ceph_decode_32_safe(&p, end, nr_maps, bad);
1050 dout(" %d full maps\n", nr_maps);
1051 while (nr_maps) {
1052 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1053 epoch = ceph_decode_32(&p);
1054 maplen = ceph_decode_32(&p);
1055 ceph_decode_need(&p, end, maplen, bad);
1056 if (nr_maps > 1) {
1057 dout("skipping non-latest full map %u len %d\n",
1058 epoch, maplen);
1059 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1060 dout("skipping full map %u len %d, "
1061 "older than our %u\n", epoch, maplen,
1062 osdc->osdmap->epoch);
1063 } else {
1064 dout("taking full map %u len %d\n", epoch, maplen);
1065 newmap = osdmap_decode(&p, p+maplen);
1066 if (IS_ERR(newmap)) {
1067 err = PTR_ERR(newmap);
1068 goto bad;
1069 }
1070 BUG_ON(!newmap);
1071 oldmap = osdc->osdmap;
1072 osdc->osdmap = newmap;
1073 if (oldmap)
1074 ceph_osdmap_destroy(oldmap);
1075 }
1076 p += maplen;
1077 nr_maps--;
1078 }
1079
1080done:
1081 downgrade_write(&osdc->map_sem);
1082 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1083 if (newmap)
1084 kick_requests(osdc, NULL);
1085 up_read(&osdc->map_sem);
1086 wake_up_all(&osdc->client->auth_wq);
1087 return;
1088
1089bad:
1090 pr_err("osdc handle_map corrupt msg\n");
1091 ceph_msg_dump(msg);
1092 up_write(&osdc->map_sem);
1093 return;
1094}
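/*
 * Illustrative sketch (not part of the kernel source): the ceph_decode_*
 * calls in ceph_osdc_handle_map() above follow one pattern -- verify that
 * enough bytes remain between the cursor and the end of the buffer before
 * reading, and bail to the "bad" label otherwise.  A userspace rendition
 * with a hypothetical helper name (assumes a little-endian host for
 * brevity):
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int decode_32(const uint8_t **p, const uint8_t *end, uint32_t *v)
{
	if (end - *p < 4)
		return -1;	/* truncated input: the "goto bad" case */
	memcpy(v, *p, 4);	/* wire values are little-endian */
	*p += 4;
	return 0;
}

int main(void)
{
	uint8_t buf[8] = { 42, 0, 0, 0, 7, 0, 0, 0 };
	const uint8_t *p = buf, *end = buf + sizeof(buf);
	uint32_t epoch, maplen;

	if (decode_32(&p, end, &epoch) || decode_32(&p, end, &maplen))
		return 1;
	printf("epoch %u len %u\n", epoch, maplen);	/* 42, 7 */
	return 0;
}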
1095
1096/*
1097 * Register request, send initial attempt.
1098 */
1099int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1100 struct ceph_osd_request *req,
1101 bool nofail)
1102{
1103 int rc = 0;
1104
1105 req->r_request->pages = req->r_pages;
1106 req->r_request->nr_pages = req->r_num_pages;
1107
1108 register_request(osdc, req);
1109
1110 down_read(&osdc->map_sem);
1111 mutex_lock(&osdc->request_mutex);
1112 /*
1113 * a racing kick_requests() may have sent the message for us
1114 * while we dropped request_mutex above, so only send now if
1115 * the request still hasn't been touched yet.
1116 */
1117 if (req->r_sent == 0) {
1118 rc = __send_request(osdc, req);
1119 if (rc) {
1120 if (nofail) {
1121 dout("osdc_start_request failed send, "
1122 " marking %lld\n", req->r_tid);
1123 req->r_resend = true;
1124 rc = 0;
1125 } else {
1126 __unregister_request(osdc, req);
1127 }
1128 }
1129 }
1130 mutex_unlock(&osdc->request_mutex);
1131 up_read(&osdc->map_sem);
1132 return rc;
1133}
1134
1135/*
1136 * wait for a request to complete
1137 */
1138int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1139 struct ceph_osd_request *req)
1140{
1141 int rc;
1142
1143 rc = wait_for_completion_interruptible(&req->r_completion);
1144 if (rc < 0) {
1145 mutex_lock(&osdc->request_mutex);
1146 __cancel_request(req);
1147 __unregister_request(osdc, req);
1148 mutex_unlock(&osdc->request_mutex);
1149 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1150 return rc;
1151 }
1152
1153 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1154 return req->r_result;
1155}
1156
1157/*
1158 * sync - wait for all in-flight requests to flush. avoid starvation.
1159 */
1160void ceph_osdc_sync(struct ceph_osd_client *osdc)
1161{
1162 struct ceph_osd_request *req;
1163 u64 last_tid, next_tid = 0;
1164
1165 mutex_lock(&osdc->request_mutex);
1166 last_tid = osdc->last_tid;
1167 while (1) {
1168 req = __lookup_request_ge(osdc, next_tid);
1169 if (!req)
1170 break;
1171 if (req->r_tid > last_tid)
1172 break;
1173
1174 next_tid = req->r_tid + 1;
1175 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1176 continue;
1177
1178 ceph_osdc_get_request(req);
1179 mutex_unlock(&osdc->request_mutex);
1180 dout("sync waiting on tid %llu (last is %llu)\n",
1181 req->r_tid, last_tid);
1182 wait_for_completion(&req->r_safe_completion);
1183 mutex_lock(&osdc->request_mutex);
1184 ceph_osdc_put_request(req);
1185 }
1186 mutex_unlock(&osdc->request_mutex);
1187 dout("sync done (thru tid %llu)\n", last_tid);
1188}
1189
1190/*
1191 * init, shutdown
1192 */
1193int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1194{
1195 int err;
1196
1197 dout("init\n");
1198 osdc->client = client;
1199 osdc->osdmap = NULL;
1200 init_rwsem(&osdc->map_sem);
1201 init_completion(&osdc->map_waiters);
1202 osdc->last_requested_map = 0;
1203 mutex_init(&osdc->request_mutex);
1204 osdc->last_tid = 0;
1205 osdc->osds = RB_ROOT;
1206 INIT_LIST_HEAD(&osdc->osd_lru);
1207 osdc->requests = RB_ROOT;
1208 INIT_LIST_HEAD(&osdc->req_lru);
1209 osdc->num_requests = 0;
1210 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1211 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1212
1213 schedule_delayed_work(&osdc->osds_timeout_work,
1214 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1215
1216 err = -ENOMEM;
1217 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1218 sizeof(struct ceph_osd_request));
1219 if (!osdc->req_mempool)
1220 goto out;
1221
1222 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1223 "osd_op");
1224 if (err < 0)
1225 goto out_mempool;
1226 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1227 OSD_OPREPLY_FRONT_LEN, 10, true,
1228 "osd_op_reply");
1229 if (err < 0)
1230 goto out_msgpool;
1231 return 0;
1232
1233out_msgpool:
1234 ceph_msgpool_destroy(&osdc->msgpool_op);
1235out_mempool:
1236 mempool_destroy(osdc->req_mempool);
1237out:
1238 return err;
1239}
1240
1241void ceph_osdc_stop(struct ceph_osd_client *osdc)
1242{
1243 cancel_delayed_work_sync(&osdc->timeout_work);
1244 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1245 if (osdc->osdmap) {
1246 ceph_osdmap_destroy(osdc->osdmap);
1247 osdc->osdmap = NULL;
1248 }
1249 remove_old_osds(osdc, 1);
1250 mempool_destroy(osdc->req_mempool);
1251 ceph_msgpool_destroy(&osdc->msgpool_op);
1252 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1253}
1254
1255/*
1256 * Read some contiguous pages. If we cross a stripe boundary, shorten
1257 * *plen. Return number of bytes read, or error.
1258 */
1259int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1260 struct ceph_vino vino, struct ceph_file_layout *layout,
1261 u64 off, u64 *plen,
1262 u32 truncate_seq, u64 truncate_size,
1263 struct page **pages, int num_pages)
1264{
1265 struct ceph_osd_request *req;
1266 int rc = 0;
1267
1268 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1269 vino.snap, off, *plen);
1270 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1271 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1272 NULL, 0, truncate_seq, truncate_size, NULL,
1273 false, 1);
1274 if (!req)
1275 return -ENOMEM;
1276
1277 /* it may be a short read due to an object boundary */
1278 req->r_pages = pages;
1279
1280 dout("readpages final extent is %llu~%llu (%d pages)\n",
1281 off, *plen, req->r_num_pages);
1282
1283 rc = ceph_osdc_start_request(osdc, req, false);
1284 if (!rc)
1285 rc = ceph_osdc_wait_request(osdc, req);
1286
1287 ceph_osdc_put_request(req);
1288 dout("readpages result %d\n", rc);
1289 return rc;
1290}
1291
1292/*
1293 * do a synchronous write on N pages
1294 */
1295int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1296 struct ceph_file_layout *layout,
1297 struct ceph_snap_context *snapc,
1298 u64 off, u64 len,
1299 u32 truncate_seq, u64 truncate_size,
1300 struct timespec *mtime,
1301 struct page **pages, int num_pages,
1302 int flags, int do_sync, bool nofail)
1303{
1304 struct ceph_osd_request *req;
1305 int rc = 0;
1306
1307 BUG_ON(vino.snap != CEPH_NOSNAP);
1308 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1309 CEPH_OSD_OP_WRITE,
1310 flags | CEPH_OSD_FLAG_ONDISK |
1311 CEPH_OSD_FLAG_WRITE,
1312 snapc, do_sync,
1313 truncate_seq, truncate_size, mtime,
1314 nofail, 1);
1315 if (!req)
1316 return -ENOMEM;
1317
1318 /* it may be a short write due to an object boundary */
1319 req->r_pages = pages;
1320 dout("writepages %llu~%llu (%d pages)\n", off, len,
1321 req->r_num_pages);
1322
1323 rc = ceph_osdc_start_request(osdc, req, nofail);
1324 if (!rc)
1325 rc = ceph_osdc_wait_request(osdc, req);
1326
1327 ceph_osdc_put_request(req);
1328 if (rc == 0)
1329 rc = len;
1330 dout("writepages result %d\n", rc);
1331 return rc;
1332}
1333
1334/*
1335 * handle incoming message
1336 */
1337static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1338{
1339 struct ceph_osd *osd = con->private;
1340 struct ceph_osd_client *osdc;
1341 int type = le16_to_cpu(msg->hdr.type);
1342
1343 if (!osd)
1344 goto out;
1345 osdc = osd->o_osdc;
1346
1347 switch (type) {
1348 case CEPH_MSG_OSD_MAP:
1349 ceph_osdc_handle_map(osdc, msg);
1350 break;
1351 case CEPH_MSG_OSD_OPREPLY:
1352 handle_reply(osdc, msg, con);
1353 break;
1354
1355 default:
1356 pr_err("received unknown message type %d %s\n", type,
1357 ceph_msg_type_name(type));
1358 }
1359out:
1360 ceph_msg_put(msg);
1361}
1362
1363/*
1364 * lookup and return message for incoming reply. set up reply message
1365 * pages.
1366 */
1367static struct ceph_msg *get_reply(struct ceph_connection *con,
1368 struct ceph_msg_header *hdr,
1369 int *skip)
1370{
1371 struct ceph_osd *osd = con->private;
1372 struct ceph_osd_client *osdc = osd->o_osdc;
1373 struct ceph_msg *m;
1374 struct ceph_osd_request *req;
1375 int front = le32_to_cpu(hdr->front_len);
1376 int data_len = le32_to_cpu(hdr->data_len);
1377 u64 tid;
1378
1379 tid = le64_to_cpu(hdr->tid);
1380 mutex_lock(&osdc->request_mutex);
1381 req = __lookup_request(osdc, tid);
1382 if (!req) {
1383 *skip = 1;
1384 m = NULL;
1385 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1386 osd->o_osd);
1387 goto out;
1388 }
1389
1390 if (req->r_con_filling_msg) {
1391 dout("get_reply revoking msg %p from old con %p\n",
1392 req->r_reply, req->r_con_filling_msg);
1393 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1394 ceph_con_put(req->r_con_filling_msg);
1395 req->r_con_filling_msg = NULL;
1396 }
1397
1398 if (front > req->r_reply->front.iov_len) {
1399 pr_warning("get_reply front %d > preallocated %d\n",
1400 front, (int)req->r_reply->front.iov_len);
1401 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1402 if (!m)
1403 goto out;
1404 ceph_msg_put(req->r_reply);
1405 req->r_reply = m;
1406 }
1407 m = ceph_msg_get(req->r_reply);
1408
1409 if (data_len > 0) {
1410 unsigned data_off = le16_to_cpu(hdr->data_off);
1411 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1412
1413 if (unlikely(req->r_num_pages < want)) {
1414 pr_warning("tid %lld reply %d > expected %d pages\n",
1415 tid, want, m->nr_pages);
1416 *skip = 1;
1417 ceph_msg_put(m);
1418 m = NULL;
1419 goto out;
1420 }
1421 m->pages = req->r_pages;
1422 m->nr_pages = req->r_num_pages;
1423 }
1424 *skip = 0;
1425 req->r_con_filling_msg = ceph_con_get(con);
1426 dout("get_reply tid %lld %p\n", tid, m);
1427
1428out:
1429 mutex_unlock(&osdc->request_mutex);
1430 return m;
1431
1432}
1433
1434static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1435 struct ceph_msg_header *hdr,
1436 int *skip)
1437{
1438 struct ceph_osd *osd = con->private;
1439 int type = le16_to_cpu(hdr->type);
1440 int front = le32_to_cpu(hdr->front_len);
1441
1442 switch (type) {
1443 case CEPH_MSG_OSD_MAP:
1444 return ceph_msg_new(type, front, GFP_NOFS);
1445 case CEPH_MSG_OSD_OPREPLY:
1446 return get_reply(con, hdr, skip);
1447 default:
1448 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1449 osd->o_osd);
1450 *skip = 1;
1451 return NULL;
1452 }
1453}
1454
1455/*
1456 * Wrappers to refcount containing ceph_osd struct
1457 */
1458static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1459{
1460 struct ceph_osd *osd = con->private;
1461 if (get_osd(osd))
1462 return con;
1463 return NULL;
1464}
1465
1466static void put_osd_con(struct ceph_connection *con)
1467{
1468 struct ceph_osd *osd = con->private;
1469 put_osd(osd);
1470}
1471
1472/*
1473 * authentication
1474 */
1475static int get_authorizer(struct ceph_connection *con,
1476 void **buf, int *len, int *proto,
1477 void **reply_buf, int *reply_len, int force_new)
1478{
1479 struct ceph_osd *o = con->private;
1480 struct ceph_osd_client *osdc = o->o_osdc;
1481 struct ceph_auth_client *ac = osdc->client->monc.auth;
1482 int ret = 0;
1483
1484 if (force_new && o->o_authorizer) {
1485 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1486 o->o_authorizer = NULL;
1487 }
1488 if (o->o_authorizer == NULL) {
1489 ret = ac->ops->create_authorizer(
1490 ac, CEPH_ENTITY_TYPE_OSD,
1491 &o->o_authorizer,
1492 &o->o_authorizer_buf,
1493 &o->o_authorizer_buf_len,
1494 &o->o_authorizer_reply_buf,
1495 &o->o_authorizer_reply_buf_len);
1496 if (ret)
1497 return ret;
1498 }
1499
1500 *proto = ac->protocol;
1501 *buf = o->o_authorizer_buf;
1502 *len = o->o_authorizer_buf_len;
1503 *reply_buf = o->o_authorizer_reply_buf;
1504 *reply_len = o->o_authorizer_reply_buf_len;
1505 return 0;
1506}
1507
1508
1509static int verify_authorizer_reply(struct ceph_connection *con, int len)
1510{
1511 struct ceph_osd *o = con->private;
1512 struct ceph_osd_client *osdc = o->o_osdc;
1513 struct ceph_auth_client *ac = osdc->client->monc.auth;
1514
1515 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1516}
1517
1518static int invalidate_authorizer(struct ceph_connection *con)
1519{
1520 struct ceph_osd *o = con->private;
1521 struct ceph_osd_client *osdc = o->o_osdc;
1522 struct ceph_auth_client *ac = osdc->client->monc.auth;
1523
1524 if (ac->ops->invalidate_authorizer)
1525 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1526
1527 return ceph_monc_validate_auth(&osdc->client->monc);
1528}
1529
1530static const struct ceph_connection_operations osd_con_ops = {
1531 .get = get_osd_con,
1532 .put = put_osd_con,
1533 .dispatch = dispatch,
1534 .get_authorizer = get_authorizer,
1535 .verify_authorizer_reply = verify_authorizer_reply,
1536 .invalidate_authorizer = invalidate_authorizer,
1537 .alloc_msg = alloc_msg,
1538 .fault = osd_reset,
1539};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
deleted file mode 100644
index ce776989ef6a..000000000000
--- a/fs/ceph/osd_client.h
+++ /dev/null
@@ -1,167 +0,0 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51 int r_pg_osds[CEPH_PG_MAX_SIZE];
52 int r_num_pg_osds;
53
54 struct ceph_connection *r_con_filling_msg;
55
56 struct ceph_msg *r_request, *r_reply;
57 int r_result;
58 int r_flags; /* any additional flags for the osd */
59 u32 r_sent; /* >0 if r_request is sending/sent */
60 int r_got_reply;
61
62 struct ceph_osd_client *r_osdc;
63 struct kref r_kref;
64 bool r_mempool;
65 struct completion r_completion, r_safe_completion;
66 ceph_osdc_callback_t r_callback, r_safe_callback;
67 struct ceph_eversion r_reassert_version;
68 struct list_head r_unsafe_item;
69
70 struct inode *r_inode; /* for use by callbacks */
71
72 char r_oid[40]; /* object name */
73 int r_oid_len;
74 unsigned long r_stamp; /* send OR check time */
75 bool r_resend; /* msg send failed, needs retry */
76
77 struct ceph_file_layout r_file_layout;
78 struct ceph_snap_context *r_snapc; /* snap context for writes */
79 unsigned r_num_pages; /* size of page array (follows) */
80 struct page **r_pages; /* pages for data payload */
81 int r_pages_from_pool;
82 int r_own_pages; /* if true, i own page list */
83};
84
85struct ceph_osd_client {
86 struct ceph_client *client;
87
88 struct ceph_osdmap *osdmap; /* current map */
89 struct rw_semaphore map_sem;
90 struct completion map_waiters;
91 u64 last_requested_map;
92
93 struct mutex request_mutex;
94 struct rb_root osds; /* osds */
95 struct list_head osd_lru; /* idle osds */
96 u64 timeout_tid; /* tid of timeout triggering rq */
97 u64 last_tid; /* tid of last request */
98 struct rb_root requests; /* pending requests */
99 struct list_head req_lru; /* pending requests lru */
100 int num_requests;
101 struct delayed_work timeout_work;
102 struct delayed_work osds_timeout_work;
103#ifdef CONFIG_DEBUG_FS
104 struct dentry *debugfs_file;
105#endif
106
107 mempool_t *req_mempool;
108
109 struct ceph_msgpool msgpool_op;
110 struct ceph_msgpool msgpool_op_reply;
111};
112
113extern int ceph_osdc_init(struct ceph_osd_client *osdc,
114 struct ceph_client *client);
115extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
116
117extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
118 struct ceph_msg *msg);
119extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
120 struct ceph_msg *msg);
121
122extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 offset, u64 *len, int op, int flags,
126 struct ceph_snap_context *snapc,
127 int do_sync, u32 truncate_seq,
128 u64 truncate_size,
129 struct timespec *mtime,
130 bool use_mempool, int num_reply);
131
132static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
133{
134 kref_get(&req->r_kref);
135}
136extern void ceph_osdc_release_request(struct kref *kref);
137static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
138{
139 kref_put(&req->r_kref, ceph_osdc_release_request);
140}
141
142extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
143 struct ceph_osd_request *req,
144 bool nofail);
145extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
146 struct ceph_osd_request *req);
147extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
148
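
Editor's note: as a sketch of how the declarations above fit together (this closely
mirrors what ceph_osdc_readpages() does internally; the osdc, layout, vino, off,
len, truncate_seq and truncate_size variables are assumed from the surrounding
caller, and error handling is abbreviated):

	struct ceph_osd_request *req;
	int rc;

	/* build a read request; the new request carries one kref */
	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				    NULL, 0, truncate_seq, truncate_size,
				    NULL, false, 1);
	if (!req)
		return -ENOMEM;
	rc = ceph_osdc_start_request(osdc, req, false);
	if (!rc)
		rc = ceph_osdc_wait_request(osdc, req);	/* returns r_result */
	ceph_osdc_put_request(req);	/* drop the initial reference */
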
149extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
150 struct ceph_vino vino,
151 struct ceph_file_layout *layout,
152 u64 off, u64 *plen,
153 u32 truncate_seq, u64 truncate_size,
154 struct page **pages, int nr_pages);
155
156extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
157 struct ceph_vino vino,
158 struct ceph_file_layout *layout,
159 struct ceph_snap_context *sc,
160 u64 off, u64 len,
161 u32 truncate_seq, u64 truncate_size,
162 struct timespec *mtime,
163 struct page **pages, int nr_pages,
164 int flags, int do_sync, bool nofail);
165
166#endif
167
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
deleted file mode 100644
index e31f118f1392..000000000000
--- a/fs/ceph/osdmap.c
+++ /dev/null
@@ -1,1110 +0,0 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5#include <asm/div64.h>
6
7#include "super.h"
8#include "osdmap.h"
9#include "crush/hash.h"
10#include "crush/mapper.h"
11#include "decode.h"
12
13char *ceph_osdmap_state_str(char *str, int len, int state)
14{
15 int flag = 0;
16
17 if (!len)
18 goto done;
19
20 *str = '\0';
21 if (state) {
22 if (state & CEPH_OSD_EXISTS) {
23 snprintf(str, len, "exists");
24 flag = 1;
25 }
26 if (state & CEPH_OSD_UP) {
 27			snprintf(str + strlen(str), len - strlen(str),
 28				 "%s%s", (flag ? ", " : ""), "up");
29 flag = 1;
30 }
31 } else {
32 snprintf(str, len, "doesn't exist");
33 }
34done:
35 return str;
36}
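
(Usage note: the caller supplies the buffer, e.g. char sb[64];
ceph_osdmap_state_str(sb, sizeof(sb), CEPH_OSD_EXISTS | CEPH_OSD_UP)
yields "exists, up".)
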
37
38/* maps */
39
40static int calc_bits_of(unsigned t)
41{
42 int b = 0;
43 while (t) {
44 t = t >> 1;
45 b++;
46 }
47 return b;
48}
49
50/*
 51 * the foo_mask is the smallest value 2^n-1 for which 2^n >= foo.
52 */
53static void calc_pg_masks(struct ceph_pg_pool_info *pi)
54{
55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
56 pi->pgp_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
58 pi->lpg_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
60 pi->lpgp_num_mask =
61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
62}
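
(Worked example: for pg_num = 12, calc_bits_of(11) = 4, so pg_num_mask =
(1 << 4) - 1 = 15; for pg_num = 16, calc_bits_of(15) is still 4 and the mask
is again 15. ceph_stable_mod() in rados.h only requires mask + 1 >= pg_num,
which both cases satisfy.)
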
63
64/*
65 * decode crush map
66 */
67static int crush_decode_uniform_bucket(void **p, void *end,
68 struct crush_bucket_uniform *b)
69{
70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
72 b->item_weight = ceph_decode_32(p);
73 return 0;
74bad:
75 return -EINVAL;
76}
77
78static int crush_decode_list_bucket(void **p, void *end,
79 struct crush_bucket_list *b)
80{
81 int j;
82 dout("crush_decode_list_bucket %p to %p\n", *p, end);
83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
84 if (b->item_weights == NULL)
85 return -ENOMEM;
86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
87 if (b->sum_weights == NULL)
88 return -ENOMEM;
89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
90 for (j = 0; j < b->h.size; j++) {
91 b->item_weights[j] = ceph_decode_32(p);
92 b->sum_weights[j] = ceph_decode_32(p);
93 }
94 return 0;
95bad:
96 return -EINVAL;
97}
98
99static int crush_decode_tree_bucket(void **p, void *end,
100 struct crush_bucket_tree *b)
101{
102 int j;
103 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
104 ceph_decode_32_safe(p, end, b->num_nodes, bad);
105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
106 if (b->node_weights == NULL)
107 return -ENOMEM;
108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
109 for (j = 0; j < b->num_nodes; j++)
110 b->node_weights[j] = ceph_decode_32(p);
111 return 0;
112bad:
113 return -EINVAL;
114}
115
116static int crush_decode_straw_bucket(void **p, void *end,
117 struct crush_bucket_straw *b)
118{
119 int j;
120 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
122 if (b->item_weights == NULL)
123 return -ENOMEM;
124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
125 if (b->straws == NULL)
126 return -ENOMEM;
127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
128 for (j = 0; j < b->h.size; j++) {
129 b->item_weights[j] = ceph_decode_32(p);
130 b->straws[j] = ceph_decode_32(p);
131 }
132 return 0;
133bad:
134 return -EINVAL;
135}
136
137static struct crush_map *crush_decode(void *pbyval, void *end)
138{
139 struct crush_map *c;
140 int err = -EINVAL;
141 int i, j;
142 void **p = &pbyval;
143 void *start = pbyval;
144 u32 magic;
145
146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
147
148 c = kzalloc(sizeof(*c), GFP_NOFS);
149 if (c == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 ceph_decode_need(p, end, 4*sizeof(u32), bad);
153 magic = ceph_decode_32(p);
154 if (magic != CRUSH_MAGIC) {
155 pr_err("crush_decode magic %x != current %x\n",
156 (unsigned)magic, (unsigned)CRUSH_MAGIC);
157 goto bad;
158 }
159 c->max_buckets = ceph_decode_32(p);
160 c->max_rules = ceph_decode_32(p);
161 c->max_devices = ceph_decode_32(p);
162
163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
164 if (c->device_parents == NULL)
165 goto badmem;
166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
167 if (c->bucket_parents == NULL)
168 goto badmem;
169
170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
171 if (c->buckets == NULL)
172 goto badmem;
173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
174 if (c->rules == NULL)
175 goto badmem;
176
177 /* buckets */
178 for (i = 0; i < c->max_buckets; i++) {
179 int size = 0;
180 u32 alg;
181 struct crush_bucket *b;
182
183 ceph_decode_32_safe(p, end, alg, bad);
184 if (alg == 0) {
185 c->buckets[i] = NULL;
186 continue;
187 }
188 dout("crush_decode bucket %d off %x %p to %p\n",
189 i, (int)(*p-start), *p, end);
190
191 switch (alg) {
192 case CRUSH_BUCKET_UNIFORM:
193 size = sizeof(struct crush_bucket_uniform);
194 break;
195 case CRUSH_BUCKET_LIST:
196 size = sizeof(struct crush_bucket_list);
197 break;
198 case CRUSH_BUCKET_TREE:
199 size = sizeof(struct crush_bucket_tree);
200 break;
201 case CRUSH_BUCKET_STRAW:
202 size = sizeof(struct crush_bucket_straw);
203 break;
204 default:
205 err = -EINVAL;
206 goto bad;
207 }
208 BUG_ON(size == 0);
209 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
210 if (b == NULL)
211 goto badmem;
212
213 ceph_decode_need(p, end, 4*sizeof(u32), bad);
214 b->id = ceph_decode_32(p);
215 b->type = ceph_decode_16(p);
216 b->alg = ceph_decode_8(p);
217 b->hash = ceph_decode_8(p);
218 b->weight = ceph_decode_32(p);
219 b->size = ceph_decode_32(p);
220
221 dout("crush_decode bucket size %d off %x %p to %p\n",
222 b->size, (int)(*p-start), *p, end);
223
224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
225 if (b->items == NULL)
226 goto badmem;
227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
228 if (b->perm == NULL)
229 goto badmem;
230 b->perm_n = 0;
231
232 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
233 for (j = 0; j < b->size; j++)
234 b->items[j] = ceph_decode_32(p);
235
236 switch (b->alg) {
237 case CRUSH_BUCKET_UNIFORM:
238 err = crush_decode_uniform_bucket(p, end,
239 (struct crush_bucket_uniform *)b);
240 if (err < 0)
241 goto bad;
242 break;
243 case CRUSH_BUCKET_LIST:
244 err = crush_decode_list_bucket(p, end,
245 (struct crush_bucket_list *)b);
246 if (err < 0)
247 goto bad;
248 break;
249 case CRUSH_BUCKET_TREE:
250 err = crush_decode_tree_bucket(p, end,
251 (struct crush_bucket_tree *)b);
252 if (err < 0)
253 goto bad;
254 break;
255 case CRUSH_BUCKET_STRAW:
256 err = crush_decode_straw_bucket(p, end,
257 (struct crush_bucket_straw *)b);
258 if (err < 0)
259 goto bad;
260 break;
261 }
262 }
263
264 /* rules */
265 dout("rule vec is %p\n", c->rules);
266 for (i = 0; i < c->max_rules; i++) {
267 u32 yes;
268 struct crush_rule *r;
269
270 ceph_decode_32_safe(p, end, yes, bad);
271 if (!yes) {
272 dout("crush_decode NO rule %d off %x %p to %p\n",
273 i, (int)(*p-start), *p, end);
274 c->rules[i] = NULL;
275 continue;
276 }
277
278 dout("crush_decode rule %d off %x %p to %p\n",
279 i, (int)(*p-start), *p, end);
280
281 /* len */
282 ceph_decode_32_safe(p, end, yes, bad);
283#if BITS_PER_LONG == 32
284 err = -EINVAL;
285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
286 goto bad;
287#endif
288 r = c->rules[i] = kmalloc(sizeof(*r) +
289 yes*sizeof(struct crush_rule_step),
290 GFP_NOFS);
291 if (r == NULL)
292 goto badmem;
293 dout(" rule %d is at %p\n", i, r);
294 r->len = yes;
295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
297 for (j = 0; j < r->len; j++) {
298 r->steps[j].op = ceph_decode_32(p);
299 r->steps[j].arg1 = ceph_decode_32(p);
300 r->steps[j].arg2 = ceph_decode_32(p);
301 }
302 }
303
304 /* ignore trailing name maps. */
305
306 dout("crush_decode success\n");
307 return c;
308
309badmem:
310 err = -ENOMEM;
311bad:
312 dout("crush_decode fail %d\n", err);
313 crush_destroy(c);
314 return ERR_PTR(err);
315}
316
 317/*
 318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid to a
 319 * set of osds).  ceph_pg is packed into 8 bytes, so pgid_cmp() compares raw u64s.
 320 */
321static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
322{
323 u64 a = *(u64 *)&l;
324 u64 b = *(u64 *)&r;
325
326 if (a < b)
327 return -1;
328 if (a > b)
329 return 1;
330 return 0;
331}
332
333static int __insert_pg_mapping(struct ceph_pg_mapping *new,
334 struct rb_root *root)
335{
336 struct rb_node **p = &root->rb_node;
337 struct rb_node *parent = NULL;
338 struct ceph_pg_mapping *pg = NULL;
339 int c;
340
341 while (*p) {
342 parent = *p;
343 pg = rb_entry(parent, struct ceph_pg_mapping, node);
344 c = pgid_cmp(new->pgid, pg->pgid);
345 if (c < 0)
346 p = &(*p)->rb_left;
347 else if (c > 0)
348 p = &(*p)->rb_right;
349 else
350 return -EEXIST;
351 }
352
353 rb_link_node(&new->node, parent, p);
354 rb_insert_color(&new->node, root);
355 return 0;
356}
357
358static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
359 struct ceph_pg pgid)
360{
361 struct rb_node *n = root->rb_node;
362 struct ceph_pg_mapping *pg;
363 int c;
364
365 while (n) {
366 pg = rb_entry(n, struct ceph_pg_mapping, node);
367 c = pgid_cmp(pgid, pg->pgid);
368 if (c < 0)
369 n = n->rb_left;
370 else if (c > 0)
371 n = n->rb_right;
372 else
373 return pg;
374 }
375 return NULL;
376}
377
378/*
379 * rbtree of pg pool info
380 */
381static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
382{
383 struct rb_node **p = &root->rb_node;
384 struct rb_node *parent = NULL;
385 struct ceph_pg_pool_info *pi = NULL;
386
387 while (*p) {
388 parent = *p;
389 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
390 if (new->id < pi->id)
391 p = &(*p)->rb_left;
392 else if (new->id > pi->id)
393 p = &(*p)->rb_right;
394 else
395 return -EEXIST;
396 }
397
398 rb_link_node(&new->node, parent, p);
399 rb_insert_color(&new->node, root);
400 return 0;
401}
402
403static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
404{
405 struct ceph_pg_pool_info *pi;
406 struct rb_node *n = root->rb_node;
407
408 while (n) {
409 pi = rb_entry(n, struct ceph_pg_pool_info, node);
410 if (id < pi->id)
411 n = n->rb_left;
412 else if (id > pi->id)
413 n = n->rb_right;
414 else
415 return pi;
416 }
417 return NULL;
418}
419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
427static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
428{
429 unsigned n, m;
430
431 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
432 calc_pg_masks(pi);
433
434 /* num_snaps * snap_info_t */
435 n = le32_to_cpu(pi->v.num_snaps);
436 while (n--) {
437 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
438 sizeof(struct ceph_timespec), bad);
439 *p += sizeof(u64) + /* key */
440 1 + sizeof(u64) + /* u8, snapid */
441 sizeof(struct ceph_timespec);
442 m = ceph_decode_32(p); /* snap name */
443 *p += m;
444 }
445
446 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
447 return 0;
448
449bad:
450 return -EINVAL;
451}
452
453static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
454{
455 struct ceph_pg_pool_info *pi;
456 u32 num, len, pool;
457
458 ceph_decode_32_safe(p, end, num, bad);
459 dout(" %d pool names\n", num);
460 while (num--) {
461 ceph_decode_32_safe(p, end, pool, bad);
462 ceph_decode_32_safe(p, end, len, bad);
463 dout(" pool %d len %d\n", pool, len);
464 pi = __lookup_pg_pool(&map->pg_pools, pool);
465 if (pi) {
466 kfree(pi->name);
467 pi->name = kmalloc(len + 1, GFP_NOFS);
468 if (pi->name) {
469 memcpy(pi->name, *p, len);
470 pi->name[len] = '\0';
471 dout(" name is %s\n", pi->name);
472 }
473 }
474 *p += len;
475 }
476 return 0;
477
478bad:
479 return -EINVAL;
480}
481
482/*
483 * osd map
484 */
485void ceph_osdmap_destroy(struct ceph_osdmap *map)
486{
487 dout("osdmap_destroy %p\n", map);
488 if (map->crush)
489 crush_destroy(map->crush);
490 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
491 struct ceph_pg_mapping *pg =
492 rb_entry(rb_first(&map->pg_temp),
493 struct ceph_pg_mapping, node);
494 rb_erase(&pg->node, &map->pg_temp);
495 kfree(pg);
496 }
497 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
498 struct ceph_pg_pool_info *pi =
499 rb_entry(rb_first(&map->pg_pools),
500 struct ceph_pg_pool_info, node);
501 __remove_pg_pool(&map->pg_pools, pi);
502 }
503 kfree(map->osd_state);
504 kfree(map->osd_weight);
505 kfree(map->osd_addr);
506 kfree(map);
507}
508
509/*
510 * adjust max osd value. reallocate arrays.
511 */
512static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
513{
514 u8 *state;
515 struct ceph_entity_addr *addr;
516 u32 *weight;
517
518 state = kcalloc(max, sizeof(*state), GFP_NOFS);
519 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
520 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
521 if (state == NULL || addr == NULL || weight == NULL) {
522 kfree(state);
523 kfree(addr);
524 kfree(weight);
525 return -ENOMEM;
526 }
527
528 /* copy old? */
529 if (map->osd_state) {
530 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
531 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
532 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
533 kfree(map->osd_state);
534 kfree(map->osd_addr);
535 kfree(map->osd_weight);
536 }
537
538 map->osd_state = state;
539 map->osd_weight = weight;
540 map->osd_addr = addr;
541 map->max_osd = max;
542 return 0;
543}
544
545/*
546 * decode a full map.
547 */
548struct ceph_osdmap *osdmap_decode(void **p, void *end)
549{
550 struct ceph_osdmap *map;
551 u16 version;
552 u32 len, max, i;
553 u8 ev;
554 int err = -EINVAL;
555 void *start = *p;
556 struct ceph_pg_pool_info *pi;
557
558 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
559
560 map = kzalloc(sizeof(*map), GFP_NOFS);
561 if (map == NULL)
562 return ERR_PTR(-ENOMEM);
563 map->pg_temp = RB_ROOT;
564
565 ceph_decode_16_safe(p, end, version, bad);
566 if (version > CEPH_OSDMAP_VERSION) {
567 pr_warning("got unknown v %d > %d of osdmap\n", version,
568 CEPH_OSDMAP_VERSION);
569 goto bad;
570 }
571
572 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
573 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
574 map->epoch = ceph_decode_32(p);
575 ceph_decode_copy(p, &map->created, sizeof(map->created));
576 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
577
578 ceph_decode_32_safe(p, end, max, bad);
579 while (max--) {
580 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
581 pi = kzalloc(sizeof(*pi), GFP_NOFS);
582 if (!pi)
583 goto bad;
584 pi->id = ceph_decode_32(p);
585 ev = ceph_decode_8(p); /* encoding version */
586 if (ev > CEPH_PG_POOL_VERSION) {
587 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
588 ev, CEPH_PG_POOL_VERSION);
589 kfree(pi);
590 goto bad;
591 }
592 err = __decode_pool(p, end, pi);
593 if (err < 0)
594 goto bad;
595 __insert_pg_pool(&map->pg_pools, pi);
596 }
597
598 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
599 goto bad;
600
601 ceph_decode_32_safe(p, end, map->pool_max, bad);
602
603 ceph_decode_32_safe(p, end, map->flags, bad);
604
605 max = ceph_decode_32(p);
606
607 /* (re)alloc osd arrays */
608 err = osdmap_set_max_osd(map, max);
609 if (err < 0)
610 goto bad;
611 dout("osdmap_decode max_osd = %d\n", map->max_osd);
612
613 /* osds */
614 err = -EINVAL;
615 ceph_decode_need(p, end, 3*sizeof(u32) +
616 map->max_osd*(1 + sizeof(*map->osd_weight) +
617 sizeof(*map->osd_addr)), bad);
618 *p += 4; /* skip length field (should match max) */
619 ceph_decode_copy(p, map->osd_state, map->max_osd);
620
621 *p += 4; /* skip length field (should match max) */
622 for (i = 0; i < map->max_osd; i++)
623 map->osd_weight[i] = ceph_decode_32(p);
624
625 *p += 4; /* skip length field (should match max) */
626 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
627 for (i = 0; i < map->max_osd; i++)
628 ceph_decode_addr(&map->osd_addr[i]);
629
630 /* pg_temp */
631 ceph_decode_32_safe(p, end, len, bad);
632 for (i = 0; i < len; i++) {
633 int n, j;
634 struct ceph_pg pgid;
635 struct ceph_pg_mapping *pg;
636
637 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
638 ceph_decode_copy(p, &pgid, sizeof(pgid));
639 n = ceph_decode_32(p);
640 ceph_decode_need(p, end, n * sizeof(u32), bad);
641 err = -ENOMEM;
642 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
643 if (!pg)
644 goto bad;
645 pg->pgid = pgid;
646 pg->len = n;
647 for (j = 0; j < n; j++)
648 pg->osds[j] = ceph_decode_32(p);
649
650 err = __insert_pg_mapping(pg, &map->pg_temp);
651 if (err)
652 goto bad;
 653		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, n);
654 }
655
656 /* crush */
657 ceph_decode_32_safe(p, end, len, bad);
658 dout("osdmap_decode crush len %d from off 0x%x\n", len,
659 (int)(*p - start));
660 ceph_decode_need(p, end, len, bad);
661 map->crush = crush_decode(*p, end);
662 *p += len;
663 if (IS_ERR(map->crush)) {
664 err = PTR_ERR(map->crush);
665 map->crush = NULL;
666 goto bad;
667 }
668
669 /* ignore the rest of the map */
670 *p = end;
671
672 dout("osdmap_decode done %p %p\n", *p, end);
673 return map;
674
675bad:
676 dout("osdmap_decode fail\n");
677 ceph_osdmap_destroy(map);
678 return ERR_PTR(err);
679}
680
681/*
682 * decode and apply an incremental map update.
683 */
684struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
685 struct ceph_osdmap *map,
686 struct ceph_messenger *msgr)
687{
688 struct crush_map *newcrush = NULL;
689 struct ceph_fsid fsid;
690 u32 epoch = 0;
691 struct ceph_timespec modified;
692 u32 len, pool;
693 __s32 new_pool_max, new_flags, max;
694 void *start = *p;
695 int err = -EINVAL;
696 u16 version;
697 struct rb_node *rbp;
698
699 ceph_decode_16_safe(p, end, version, bad);
700 if (version > CEPH_OSDMAP_INC_VERSION) {
701 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
702 CEPH_OSDMAP_INC_VERSION);
703 goto bad;
704 }
705
706 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
707 bad);
708 ceph_decode_copy(p, &fsid, sizeof(fsid));
709 epoch = ceph_decode_32(p);
710 BUG_ON(epoch != map->epoch+1);
711 ceph_decode_copy(p, &modified, sizeof(modified));
712 new_pool_max = ceph_decode_32(p);
713 new_flags = ceph_decode_32(p);
714
715 /* full map? */
716 ceph_decode_32_safe(p, end, len, bad);
717 if (len > 0) {
718 dout("apply_incremental full map len %d, %p to %p\n",
719 len, *p, end);
720 return osdmap_decode(p, min(*p+len, end));
721 }
722
723 /* new crush? */
724 ceph_decode_32_safe(p, end, len, bad);
725 if (len > 0) {
726 dout("apply_incremental new crush map len %d, %p to %p\n",
727 len, *p, end);
728 newcrush = crush_decode(*p, min(*p+len, end));
729 if (IS_ERR(newcrush))
730 return ERR_CAST(newcrush);
731 *p += len;
732 }
733
734 /* new flags? */
735 if (new_flags >= 0)
736 map->flags = new_flags;
737 if (new_pool_max >= 0)
738 map->pool_max = new_pool_max;
739
740 ceph_decode_need(p, end, 5*sizeof(u32), bad);
741
742 /* new max? */
743 max = ceph_decode_32(p);
744 if (max >= 0) {
745 err = osdmap_set_max_osd(map, max);
746 if (err < 0)
747 goto bad;
748 }
749
750 map->epoch++;
 751	map->modified = modified;
752 if (newcrush) {
753 if (map->crush)
754 crush_destroy(map->crush);
755 map->crush = newcrush;
756 newcrush = NULL;
757 }
758
759 /* new_pool */
760 ceph_decode_32_safe(p, end, len, bad);
761 while (len--) {
762 __u8 ev;
763 struct ceph_pg_pool_info *pi;
764
765 ceph_decode_32_safe(p, end, pool, bad);
766 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
767 ev = ceph_decode_8(p); /* encoding version */
768 if (ev > CEPH_PG_POOL_VERSION) {
769 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
770 ev, CEPH_PG_POOL_VERSION);
771 goto bad;
772 }
773 pi = __lookup_pg_pool(&map->pg_pools, pool);
774 if (!pi) {
775 pi = kzalloc(sizeof(*pi), GFP_NOFS);
776 if (!pi) {
777 err = -ENOMEM;
778 goto bad;
779 }
780 pi->id = pool;
781 __insert_pg_pool(&map->pg_pools, pi);
782 }
783 err = __decode_pool(p, end, pi);
784 if (err < 0)
785 goto bad;
786 }
787 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
788 goto bad;
789
790 /* old_pool */
791 ceph_decode_32_safe(p, end, len, bad);
792 while (len--) {
793 struct ceph_pg_pool_info *pi;
794
795 ceph_decode_32_safe(p, end, pool, bad);
796 pi = __lookup_pg_pool(&map->pg_pools, pool);
797 if (pi)
798 __remove_pg_pool(&map->pg_pools, pi);
799 }
800
801 /* new_up */
802 err = -EINVAL;
803 ceph_decode_32_safe(p, end, len, bad);
804 while (len--) {
805 u32 osd;
806 struct ceph_entity_addr addr;
807 ceph_decode_32_safe(p, end, osd, bad);
808 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
809 ceph_decode_addr(&addr);
810 pr_info("osd%d up\n", osd);
811 BUG_ON(osd >= map->max_osd);
812 map->osd_state[osd] |= CEPH_OSD_UP;
813 map->osd_addr[osd] = addr;
814 }
815
816 /* new_down */
817 ceph_decode_32_safe(p, end, len, bad);
818 while (len--) {
819 u32 osd;
820 ceph_decode_32_safe(p, end, osd, bad);
821 (*p)++; /* clean flag */
822 pr_info("osd%d down\n", osd);
823 if (osd < map->max_osd)
824 map->osd_state[osd] &= ~CEPH_OSD_UP;
825 }
826
827 /* new_weight */
828 ceph_decode_32_safe(p, end, len, bad);
829 while (len--) {
830 u32 osd, off;
831 ceph_decode_need(p, end, sizeof(u32)*2, bad);
832 osd = ceph_decode_32(p);
833 off = ceph_decode_32(p);
834 pr_info("osd%d weight 0x%x %s\n", osd, off,
835 off == CEPH_OSD_IN ? "(in)" :
836 (off == CEPH_OSD_OUT ? "(out)" : ""));
837 if (osd < map->max_osd)
838 map->osd_weight[osd] = off;
839 }
840
841 /* new_pg_temp */
842 rbp = rb_first(&map->pg_temp);
843 ceph_decode_32_safe(p, end, len, bad);
844 while (len--) {
845 struct ceph_pg_mapping *pg;
846 int j;
847 struct ceph_pg pgid;
848 u32 pglen;
849 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
850 ceph_decode_copy(p, &pgid, sizeof(pgid));
851 pglen = ceph_decode_32(p);
852
853 /* remove any? */
854 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
855 node)->pgid, pgid) <= 0) {
856 struct ceph_pg_mapping *cur =
857 rb_entry(rbp, struct ceph_pg_mapping, node);
858
859 rbp = rb_next(rbp);
860 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
861 rb_erase(&cur->node, &map->pg_temp);
862 kfree(cur);
863 }
864
865 if (pglen) {
866 /* insert */
867 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
868 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
869 if (!pg) {
870 err = -ENOMEM;
871 goto bad;
872 }
873 pg->pgid = pgid;
874 pg->len = pglen;
875 for (j = 0; j < pglen; j++)
876 pg->osds[j] = ceph_decode_32(p);
877 err = __insert_pg_mapping(pg, &map->pg_temp);
878 if (err) {
879 kfree(pg);
880 goto bad;
881 }
882 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
883 pglen);
884 }
885 }
886 while (rbp) {
887 struct ceph_pg_mapping *cur =
888 rb_entry(rbp, struct ceph_pg_mapping, node);
889
890 rbp = rb_next(rbp);
891 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
892 rb_erase(&cur->node, &map->pg_temp);
893 kfree(cur);
894 }
895
896 /* ignore the rest */
897 *p = end;
898 return map;
899
900bad:
901 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
902 epoch, (int)(*p - start), *p, start, end);
903 print_hex_dump(KERN_DEBUG, "osdmap: ",
904 DUMP_PREFIX_OFFSET, 16, 1,
905 start, end - start, true);
906 if (newcrush)
907 crush_destroy(newcrush);
908 return ERR_PTR(err);
909}
910
911
912
913
914/*
915 * calculate file layout from given offset, length.
 916 * fill in the correct object number, the logical length, and the
 917 * extent offset, length within that object.
918 *
919 * for now, we write only a single su, until we can
920 * pass a stride back to the caller.
921 */
922void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
923 u64 off, u64 *plen,
924 u64 *ono,
925 u64 *oxoff, u64 *oxlen)
926{
927 u32 osize = le32_to_cpu(layout->fl_object_size);
928 u32 su = le32_to_cpu(layout->fl_stripe_unit);
929 u32 sc = le32_to_cpu(layout->fl_stripe_count);
930 u32 bl, stripeno, stripepos, objsetno;
931 u32 su_per_object;
932 u64 t, su_offset;
933
934 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
935 osize, su);
936 su_per_object = osize / su;
937 dout("osize %u / su %u = su_per_object %u\n", osize, su,
938 su_per_object);
939
940 BUG_ON((su & ~PAGE_MASK) != 0);
 941	/* bl = off / su */
942 t = off;
943 do_div(t, su);
944 bl = t;
945 dout("off %llu / su %u = bl %u\n", off, su, bl);
946
947 stripeno = bl / sc;
948 stripepos = bl % sc;
949 objsetno = stripeno / su_per_object;
950
951 *ono = objsetno * sc + stripepos;
952 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
953
 954	/* *oxoff = off % layout->fl_stripe_unit;  # offset in su */
955 t = off;
956 su_offset = do_div(t, su);
957 *oxoff = su_offset + (stripeno % su_per_object) * su;
958
959 /*
960 * Calculate the length of the extent being written to the selected
961 * object. This is the minimum of the full length requested (plen) or
962 * the remainder of the current stripe being written to.
963 */
964 *oxlen = min_t(u64, *plen, su - su_offset);
965 *plen = *oxlen;
966
967 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
968}
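
(Worked example, with illustrative layout values: stripe_unit = 1M,
stripe_count = 3, object_size = 4M, so su_per_object = 4. For off = 10M:
bl = 10, stripeno = 3, stripepos = 1, objsetno = 0, hence *ono = 0*3 + 1 = 1;
su_offset = 0, so *oxoff = (3 % 4) * 1M = 3M, and *oxlen is capped at one
stripe unit, 1M.)
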
969
970/*
971 * calculate an object layout (i.e. pgid) from an oid,
972 * file_layout, and osdmap
973 */
974int ceph_calc_object_layout(struct ceph_object_layout *ol,
975 const char *oid,
976 struct ceph_file_layout *fl,
977 struct ceph_osdmap *osdmap)
978{
979 unsigned num, num_mask;
980 struct ceph_pg pgid;
981 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
982 int poolid = le32_to_cpu(fl->fl_pg_pool);
983 struct ceph_pg_pool_info *pool;
984 unsigned ps;
985
986 BUG_ON(!osdmap);
987
988 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
989 if (!pool)
990 return -EIO;
991 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
992 if (preferred >= 0) {
993 ps += preferred;
994 num = le32_to_cpu(pool->v.lpg_num);
995 num_mask = pool->lpg_num_mask;
996 } else {
997 num = le32_to_cpu(pool->v.pg_num);
998 num_mask = pool->pg_num_mask;
999 }
1000
1001 pgid.ps = cpu_to_le16(ps);
1002 pgid.preferred = cpu_to_le16(preferred);
1003 pgid.pool = fl->fl_pg_pool;
1004 if (preferred >= 0)
1005 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
1006 (int)preferred);
1007 else
1008 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
1009
1010 ol->ol_pgid = pgid;
1011 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
1012 return 0;
1013}
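
Editor's note: a sketch of a typical caller, mirroring how the osd client maps
a request's object name to a pg (the req and osdc variables are assumed from
the surrounding context):

	struct ceph_object_layout ol;
	int err;

	err = ceph_calc_object_layout(&ol, req->r_oid, &req->r_file_layout,
				      osdc->osdmap);
	if (err == 0)
		req->r_pgid = ol.ol_pgid;	/* pg this object maps to */
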
1014
1015/*
1016 * Calculate raw osd vector for the given pgid. Return pointer to osd
1017 * array, or NULL on failure.
1018 */
1019static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1020 int *osds, int *num)
1021{
1022 struct ceph_pg_mapping *pg;
1023 struct ceph_pg_pool_info *pool;
1024 int ruleno;
1025 unsigned poolid, ps, pps;
1026 int preferred;
1027
1028 /* pg_temp? */
1029 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1030 if (pg) {
1031 *num = pg->len;
1032 return pg->osds;
1033 }
1034
1035 /* crush */
1036 poolid = le32_to_cpu(pgid.pool);
1037 ps = le16_to_cpu(pgid.ps);
1038 preferred = (s16)le16_to_cpu(pgid.preferred);
1039
1040 /* don't forcefeed bad device ids to crush */
1041 if (preferred >= osdmap->max_osd ||
1042 preferred >= osdmap->crush->max_devices)
1043 preferred = -1;
1044
1045 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1046 if (!pool)
1047 return NULL;
1048 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1049 pool->v.type, pool->v.size);
1050 if (ruleno < 0) {
1051 pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
1052 poolid, pool->v.crush_ruleset, pool->v.type,
1053 pool->v.size);
1054 return NULL;
1055 }
1056
1057 if (preferred >= 0)
1058 pps = ceph_stable_mod(ps,
1059 le32_to_cpu(pool->v.lpgp_num),
1060 pool->lpgp_num_mask);
1061 else
1062 pps = ceph_stable_mod(ps,
1063 le32_to_cpu(pool->v.pgp_num),
1064 pool->pgp_num_mask);
1065 pps += poolid;
1066 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1067 min_t(int, pool->v.size, *num),
1068 preferred, osdmap->osd_weight);
1069 return osds;
1070}
1071
1072/*
1073 * Return acting set for given pgid.
1074 */
1075int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1076 int *acting)
1077{
1078 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1079 int i, o, num = CEPH_PG_MAX_SIZE;
1080
1081 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1082 if (!osds)
1083 return -1;
1084
1085 /* primary is first up osd */
1086 o = 0;
1087 for (i = 0; i < num; i++)
1088 if (ceph_osd_is_up(osdmap, osds[i]))
1089 acting[o++] = osds[i];
1090 return o;
1091}
1092
1093/*
1094 * Return primary osd for given pgid, or -1 if none.
1095 */
1096int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1097{
1098 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1099 int i, num = CEPH_PG_MAX_SIZE;
1100
1101 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1102 if (!osds)
1103 return -1;
1104
1105 /* primary is first up osd */
1106 for (i = 0; i < num; i++)
1107 if (ceph_osd_is_up(osdmap, osds[i]))
1108 return osds[i];
1109 return -1;
1110}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
deleted file mode 100644
index 970b547e510d..000000000000
--- a/fs/ceph/osdmap.h
+++ /dev/null
@@ -1,128 +0,0 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
27};
28
29struct ceph_pg_mapping {
30 struct rb_node node;
31 struct ceph_pg pgid;
32 int len;
33 int osds[];
34};
35
36struct ceph_osdmap {
37 struct ceph_fsid fsid;
38 u32 epoch;
39 u32 mkfs_epoch;
40 struct ceph_timespec created, modified;
41
42 u32 flags; /* CEPH_OSDMAP_* */
43
44 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
45 u8 *osd_state; /* CEPH_OSD_* */
46 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
47 struct ceph_entity_addr *osd_addr;
48
49 struct rb_root pg_temp;
50 struct rb_root pg_pools;
51 u32 pool_max;
52
53 /* the CRUSH map specifies the mapping of placement groups to
54 * the list of osds that store+replicate them. */
55 struct crush_map *crush;
56};
57
58/*
59 * file layout helpers
60 */
61#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
62#define ceph_file_layout_stripe_count(l) \
63 ((__s32)le32_to_cpu((l).fl_stripe_count))
64#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
65#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
66#define ceph_file_layout_object_su(l) \
67 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
68#define ceph_file_layout_pg_preferred(l) \
69 ((__s32)le32_to_cpu((l).fl_pg_preferred))
70#define ceph_file_layout_pg_pool(l) \
71 ((__s32)le32_to_cpu((l).fl_pg_pool))
72
73static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
74{
75 return le32_to_cpu(l->fl_stripe_unit) *
76 le32_to_cpu(l->fl_stripe_count);
77}
78
 79/* "period" == bytes before we start on a new set of objects */
80static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
81{
82 return le32_to_cpu(l->fl_object_size) *
83 le32_to_cpu(l->fl_stripe_count);
84}
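
(Worked example: with fl_stripe_unit = 1M, fl_stripe_count = 3 and
fl_object_size = 4M, the stripe width is 3M, i.e. one pass across the object
set, and the period is 12M, after which striping starts over on a fresh set
of 3 objects.)
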
85
86
87static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
88{
89 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
90}
91
92static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
93{
94 return map && (map->flags & flag);
95}
96
97extern char *ceph_osdmap_state_str(char *str, int len, int state);
98
99static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
100 int osd)
101{
102 if (osd >= map->max_osd)
103 return NULL;
104 return &map->osd_addr[osd];
105}
106
107extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
108extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
109 struct ceph_osdmap *map,
110 struct ceph_messenger *msgr);
111extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
112
113/* calculate mapping of a file extent to an object */
114extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
115 u64 off, u64 *plen,
116 u64 *bno, u64 *oxoff, u64 *oxlen);
117
118/* calculate mapping of object to a placement group */
119extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
120 const char *oid,
121 struct ceph_file_layout *fl,
122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
124 int *acting);
125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
126 struct ceph_pg pgid);
127
128#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
deleted file mode 100644
index 46a368b6dce5..000000000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,63 +0,0 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
9{
10 struct page *page = list_entry(pl->head.prev, struct page,
11 lru);
12 kunmap(page);
13}
14
15int ceph_pagelist_release(struct ceph_pagelist *pl)
16{
17 if (pl->mapped_tail)
18 ceph_pagelist_unmap_tail(pl);
19
20 while (!list_empty(&pl->head)) {
21 struct page *page = list_first_entry(&pl->head, struct page,
22 lru);
23 list_del(&page->lru);
24 __free_page(page);
25 }
26 return 0;
27}
28
29static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
30{
31 struct page *page = __page_cache_alloc(GFP_NOFS);
32 if (!page)
33 return -ENOMEM;
34 pl->room += PAGE_SIZE;
35 list_add_tail(&page->lru, &pl->head);
36 if (pl->mapped_tail)
37 ceph_pagelist_unmap_tail(pl);
38 pl->mapped_tail = kmap(page);
39 return 0;
40}
41
42int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
43{
44 while (pl->room < len) {
45 size_t bit = pl->room;
46 int ret;
47
48 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
49 buf, bit);
50 pl->length += bit;
51 pl->room -= bit;
52 buf += bit;
53 len -= bit;
54 ret = ceph_pagelist_addpage(pl);
55 if (ret)
56 return ret;
57 }
58
59 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
60 pl->length += len;
61 pl->room -= len;
62 return 0;
63}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
deleted file mode 100644
index e8a4187e1087..000000000000
--- a/fs/ceph/pagelist.h
+++ /dev/null
@@ -1,54 +0,0 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
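
Editor's note: a minimal usage sketch of the pagelist API above, roughly how
the mds client builds message payloads (error handling collapsed for brevity;
"foo" and the version value are purely illustrative):

	struct ceph_pagelist pl;
	int err;

	ceph_pagelist_init(&pl);
	err = ceph_pagelist_encode_32(&pl, 1);		/* e.g. a version */
	if (!err)
		err = ceph_pagelist_encode_string(&pl, "foo", 3);
	/* ... on success, hand pl off as an outgoing message payload ... */
	ceph_pagelist_release(&pl);
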
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
deleted file mode 100644
index 6d5247f2e81b..000000000000
--- a/fs/ceph/rados.h
+++ /dev/null
@@ -1,405 +0,0 @@
1#ifndef CEPH_RADOS_H
2#define CEPH_RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
18
19/*
20 * fs id
21 */
22struct ceph_fsid {
23 unsigned char fsid[16];
24};
25
26static inline int ceph_fsid_compare(const struct ceph_fsid *a,
27 const struct ceph_fsid *b)
28{
29 return memcmp(a, b, sizeof(*a));
30}
31
32/*
33 * ino, object, etc.
34 */
35typedef __le64 ceph_snapid_t;
36#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
37#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
38#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
39
40struct ceph_timespec {
41 __le32 tv_sec;
42 __le32 tv_nsec;
43} __attribute__ ((packed));
44
45
46/*
47 * object layout - how objects are mapped into PGs
48 */
49#define CEPH_OBJECT_LAYOUT_HASH 1
50#define CEPH_OBJECT_LAYOUT_LINEAR 2
51#define CEPH_OBJECT_LAYOUT_HASHINO 3
52
53/*
54 * pg layout -- how PGs are mapped onto (sets of) OSDs
55 */
56#define CEPH_PG_LAYOUT_CRUSH 0
57#define CEPH_PG_LAYOUT_HASH 1
58#define CEPH_PG_LAYOUT_LINEAR 2
59#define CEPH_PG_LAYOUT_HYBRID 3
60
61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
62
63/*
64 * placement group.
65 * we encode this into one __le64.
66 */
67struct ceph_pg {
68 __le16 preferred; /* preferred primary osd */
69 __le16 ps; /* placement seed */
70 __le32 pool; /* object pool */
71} __attribute__ ((packed));
72
73/*
74 * pg_pool is a set of pgs storing a pool of objects
75 *
76 * pg_num -- base number of pseudorandomly placed pgs
77 *
78 * pgp_num -- effective number when calculating pg placement. this
79 * is used for pg_num increases. new pgs result in data being "split"
 80 * into new pgs. for this to proceed smoothly, new pgs are initially
81 * colocated with their parents; that is, pgp_num doesn't increase
82 * until the new pgs have successfully split. only _then_ are the new
83 * pgs placed independently.
84 *
85 * lpg_num -- localized pg count (per device). replicas are randomly
86 * selected.
87 *
88 * lpgp_num -- as above.
89 */
90#define CEPH_PG_TYPE_REP 1
91#define CEPH_PG_TYPE_RAID4 2
92#define CEPH_PG_POOL_VERSION 2
93struct ceph_pg_pool {
94 __u8 type; /* CEPH_PG_TYPE_* */
95 __u8 size; /* number of osds in each pg */
96 __u8 crush_ruleset; /* crush placement rule */
97 __u8 object_hash; /* hash mapping object name to ps */
98 __le32 pg_num, pgp_num; /* number of pg's */
99 __le32 lpg_num, lpgp_num; /* number of localized pg's */
100 __le32 last_change; /* most recent epoch changed */
101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
105 __le64 auid; /* who owns the pg */
106} __attribute__ ((packed));
107
108/*
109 * stable_mod func is used to control number of placement groups.
110 * similar to straight-up modulo, but produces a stable mapping as b
111 * increases over time. b is the number of bins, and bmask is the
112 * containing power of 2 minus 1.
113 *
 114 * b <= bmask+1, where bmask=(2**n)-1
115 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
116 */
117static inline int ceph_stable_mod(int x, int b, int bmask)
118{
119 if ((x & bmask) < b)
120 return x & bmask;
121 else
122 return x & (bmask >> 1);
123}
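
(Worked example: with b = 12 and bmask = 15, x = 5 maps to 5, since
5 & 15 = 5 < 12, while x = 13 falls back to 13 & 7 = 5, since 13 & 15 = 13 >= 12.
As b later grows toward 16, inputs that already landed below b keep their
bins; only the fallback cases move, which is the "stable" property the name
refers to.)
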
124
125/*
126 * object layout - how a given object should be stored.
127 */
128struct ceph_object_layout {
129 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
130 __le32 ol_stripe_unit; /* for per-object parity, if any */
131} __attribute__ ((packed));
132
133/*
134 * compound epoch+version, used by storage layer to serialize mutations
135 */
136struct ceph_eversion {
137 __le32 epoch;
138 __le64 version;
139} __attribute__ ((packed));
140
141/*
142 * osd map bits
143 */
144
145/* status bits */
146#define CEPH_OSD_EXISTS 1
147#define CEPH_OSD_UP 2
148
149/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
150#define CEPH_OSD_IN 0x10000
151#define CEPH_OSD_OUT 0
152
153
154/*
155 * osd map flag bits
156 */
157#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
158#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
159#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
160#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
161#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
162
163/*
164 * osd ops
165 */
166#define CEPH_OSD_OP_MODE 0xf000
167#define CEPH_OSD_OP_MODE_RD 0x1000
168#define CEPH_OSD_OP_MODE_WR 0x2000
169#define CEPH_OSD_OP_MODE_RMW 0x3000
170#define CEPH_OSD_OP_MODE_SUB 0x4000
171
172#define CEPH_OSD_OP_TYPE 0x0f00
173#define CEPH_OSD_OP_TYPE_LOCK 0x0100
174#define CEPH_OSD_OP_TYPE_DATA 0x0200
175#define CEPH_OSD_OP_TYPE_ATTR 0x0300
176#define CEPH_OSD_OP_TYPE_EXEC 0x0400
177#define CEPH_OSD_OP_TYPE_PG 0x0500
178
179enum {
180 /** data **/
181 /* read */
182 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
183 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
184
185 /* fancy read */
186 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
187
188 /* write */
189 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
190 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
191 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
192 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
193 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
194
195 /* fancy write */
196 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
197 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
198 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
199 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
200
201 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
202 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
204
205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
206 CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
207
208 /** attrs **/
209 /* read */
210 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
211 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
213
214 /* write */
215 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
216 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
217 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
218 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
219
220 /** subop **/
221 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
222 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
223 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
224 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
225 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
226
227 /** lock **/
228 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
229 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
230 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
231 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
232 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
233 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
234
235 /** exec **/
236 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
237
238 /** pg **/
239 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
240};
241
242static inline int ceph_osd_op_type_lock(int op)
243{
244 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
245}
246static inline int ceph_osd_op_type_data(int op)
247{
248 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
249}
250static inline int ceph_osd_op_type_attr(int op)
251{
252 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
253}
254static inline int ceph_osd_op_type_exec(int op)
255{
256 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
257}
258static inline int ceph_osd_op_type_pg(int op)
259{
260 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
261}
262
263static inline int ceph_osd_op_mode_subop(int op)
264{
265 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
266}
267static inline int ceph_osd_op_mode_read(int op)
268{
269 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
270}
271static inline int ceph_osd_op_mode_modify(int op)
272{
273 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
274}
275
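Editor's note: to unpack the encoding behind these predicates, each op value
ors together a mode nibble, a type nibble and a small ordinal, so
classification is a simple mask test. A compile-time sanity check (placed
inside any function) makes this concrete:

	/* CEPH_OSD_OP_READ == 0x1000 | 0x0200 | 1 == 0x1201, so for this op
	 * ceph_osd_op_mode_read() and ceph_osd_op_type_data() both hold. */
	BUILD_BUG_ON(CEPH_OSD_OP_READ !=
		     (CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1));
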
276/*
 277 * note that the following tmap stuff is also defined in ceph's librados.h;
 278 * any modification here needs to be mirrored there.
279 */
280#define CEPH_OSD_TMAP_HDR 'h'
281#define CEPH_OSD_TMAP_SET 's'
282#define CEPH_OSD_TMAP_RM 'r'
283
284extern const char *ceph_osd_op_name(int op);
285
286
287/*
288 * osd op flags
289 *
290 * An op may be READ, WRITE, or READ|WRITE.
291 */
292enum {
293 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
294 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
295 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
296 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
297 CEPH_OSD_FLAG_READ = 16, /* op may read */
298 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
299 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
300 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
301 CEPH_OSD_FLAG_BALANCE_READS = 256,
302 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
303 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
304 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
305 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
306};
307
308enum {
309 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
310};
311
 312#define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc */
313#define EBLACKLISTED ESHUTDOWN /* blacklisted */
314
315/* xattr comparison */
316enum {
317 CEPH_OSD_CMPXATTR_OP_NOP = 0,
318 CEPH_OSD_CMPXATTR_OP_EQ = 1,
319 CEPH_OSD_CMPXATTR_OP_NE = 2,
320 CEPH_OSD_CMPXATTR_OP_GT = 3,
321 CEPH_OSD_CMPXATTR_OP_GTE = 4,
322 CEPH_OSD_CMPXATTR_OP_LT = 5,
323 CEPH_OSD_CMPXATTR_OP_LTE = 6
324};
325
326enum {
327 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
328 CEPH_OSD_CMPXATTR_MODE_U64 = 2
329};
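
Editor's note: a sketch of how the comparison op and mode combine, using the
ceph_osd_op xattr fields defined just below (the op pointer and the request
setup that appends the name/value bytes to the payload are assumed):

	/* "only proceed if xattr 'v', parsed as a u64, is > 5";
	 * the osd rejects the transaction if the comparison fails */
	op->op = cpu_to_le16(CEPH_OSD_OP_CMPXATTR);
	op->xattr.name_len = cpu_to_le32(1);	/* name "v" in the payload */
	op->xattr.value_len = cpu_to_le32(1);	/* value "5" in the payload */
	op->xattr.cmp_op = CEPH_OSD_CMPXATTR_OP_GT;
	op->xattr.cmp_mode = CEPH_OSD_CMPXATTR_MODE_U64;
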
330
331/*
332 * an individual object operation. each may be accompanied by some data
333 * payload
334 */
335struct ceph_osd_op {
336 __le16 op; /* CEPH_OSD_OP_* */
337 __le32 flags; /* CEPH_OSD_FLAG_* */
338 union {
339 struct {
340 __le64 offset, length;
341 __le64 truncate_size;
342 __le32 truncate_seq;
343 } __attribute__ ((packed)) extent;
344 struct {
345 __le32 name_len;
346 __le32 value_len;
347 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
348 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
349 } __attribute__ ((packed)) xattr;
350 struct {
351 __u8 class_len;
352 __u8 method_len;
353 __u8 argc;
354 __le32 indata_len;
355 } __attribute__ ((packed)) cls;
356 struct {
357 __le64 cookie, count;
358 } __attribute__ ((packed)) pgls;
359 struct {
360 __le64 snapid;
361 } __attribute__ ((packed)) snap;
362 };
363 __le32 payload_len;
364} __attribute__ ((packed));
365
366/*
367 * osd request message header. each request may include multiple
368 * ceph_osd_op object operations.
369 */
370struct ceph_osd_request_head {
371 __le32 client_inc; /* client incarnation */
372 struct ceph_object_layout layout; /* pgid */
373 __le32 osdmap_epoch; /* client's osdmap epoch */
374
375 __le32 flags;
376
377 struct ceph_timespec mtime; /* for mutations only */
378 struct ceph_eversion reassert_version; /* if we are replaying op */
379
380 __le32 object_len; /* length of object name */
381
382 __le64 snapid; /* snapid to read */
383 __le64 snap_seq; /* writer's snap context */
384 __le32 num_snaps;
385
386 __le16 num_ops;
387 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
388} __attribute__ ((packed));
389
390struct ceph_osd_reply_head {
391 __le32 client_inc; /* client incarnation */
392 __le32 flags;
393 struct ceph_object_layout layout;
394 __le32 osdmap_epoch;
395 struct ceph_eversion reassert_version; /* for replaying uncommitted */
396
397 __le32 result; /* result code */
398
399 __le32 object_len; /* length of object name */
400 __le32 num_ops;
 401	struct ceph_osd_op ops[];         /* ops[], object */
402} __attribute__ ((packed));
403
404
405#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 190b6c4a6f2b..39c243acd062 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/sort.h> 3#include <linux/sort.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5 5
6#include "super.h" 6#include "super.h"
7#include "decode.h" 7#include "mds_client.h"
8
9#include <linux/ceph/decode.h>
8 10
9/* 11/*
10 * Snapshots in ceph are driven in large part by cooperation from the 12 * Snapshots in ceph are driven in large part by cooperation from the
@@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 				  struct ceph_cap_snap *capsnap)
 {
 	struct inode *inode = &ci->vfs_inode;
-	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 
 	BUG_ON(capsnap->writing);
 	capsnap->size = inode->i_size;
@@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			    struct ceph_mds_session *session,
 			    struct ceph_msg *msg)
 {
-	struct super_block *sb = mdsc->client->sb;
+	struct super_block *sb = mdsc->fsc->sb;
 	int mds = session->s_mds;
 	u64 split;
 	int op;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c
index c6179d3a26a2..cd5097d7c804 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/strings.c
@@ -1,71 +1,9 @@
 /*
- * Ceph string constants
+ * Ceph fs string constants
  */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
 
-const char *ceph_entity_type_name(int type)
-{
-	switch (type) {
-	case CEPH_ENTITY_TYPE_MDS: return "mds";
-	case CEPH_ENTITY_TYPE_OSD: return "osd";
-	case CEPH_ENTITY_TYPE_MON: return "mon";
-	case CEPH_ENTITY_TYPE_CLIENT: return "client";
-	case CEPH_ENTITY_TYPE_AUTH: return "auth";
-	default: return "unknown";
-	}
-}
-
-const char *ceph_osd_op_name(int op)
-{
-	switch (op) {
-	case CEPH_OSD_OP_READ: return "read";
-	case CEPH_OSD_OP_STAT: return "stat";
-
-	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
-
-	case CEPH_OSD_OP_WRITE: return "write";
-	case CEPH_OSD_OP_DELETE: return "delete";
-	case CEPH_OSD_OP_TRUNCATE: return "truncate";
-	case CEPH_OSD_OP_ZERO: return "zero";
-	case CEPH_OSD_OP_WRITEFULL: return "writefull";
-	case CEPH_OSD_OP_ROLLBACK: return "rollback";
-
-	case CEPH_OSD_OP_APPEND: return "append";
-	case CEPH_OSD_OP_STARTSYNC: return "startsync";
-	case CEPH_OSD_OP_SETTRUNC: return "settrunc";
-	case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
-
-	case CEPH_OSD_OP_TMAPUP: return "tmapup";
-	case CEPH_OSD_OP_TMAPGET: return "tmapget";
-	case CEPH_OSD_OP_TMAPPUT: return "tmapput";
-
-	case CEPH_OSD_OP_GETXATTR: return "getxattr";
-	case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
-	case CEPH_OSD_OP_SETXATTR: return "setxattr";
-	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
-	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
-	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
-	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
-
-	case CEPH_OSD_OP_PULL: return "pull";
-	case CEPH_OSD_OP_PUSH: return "push";
-	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
-	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
-	case CEPH_OSD_OP_SCRUB: return "scrub";
-
-	case CEPH_OSD_OP_WRLOCK: return "wrlock";
-	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
-	case CEPH_OSD_OP_RDLOCK: return "rdlock";
-	case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
-	case CEPH_OSD_OP_UPLOCK: return "uplock";
-	case CEPH_OSD_OP_DNLOCK: return "dnlock";
-
-	case CEPH_OSD_OP_CALL: return "call";
-
-	case CEPH_OSD_OP_PGLS: return "pgls";
-	}
-	return "???";
-}
-
 
 const char *ceph_mds_state_name(int s)
 {
@@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o)
 	}
 	return "???";
 }
-
-const char *ceph_pool_op_name(int op)
-{
-	switch (op) {
-	case POOL_OP_CREATE: return "create";
-	case POOL_OP_DELETE: return "delete";
-	case POOL_OP_AUID_CHANGE: return "auid change";
-	case POOL_OP_CREATE_SNAP: return "create snap";
-	case POOL_OP_DELETE_SNAP: return "delete snap";
-	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
-	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
-	}
-	return "???";
-}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9922628532b2..d6e0e0421891 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,5 +1,5 @@
 
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/backing-dev.h>
 #include <linux/ctype.h>
@@ -15,10 +15,13 @@
 #include <linux/statfs.h>
 #include <linux/string.h>
 
-#include "decode.h"
 #include "super.h"
-#include "mon_client.h"
-#include "auth.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
 
 /*
  * Ceph superblock operations
@@ -26,36 +29,22 @@
  * Handle the basics of mounting, unmounting.
  */
 
-
-/*
- * find filename portion of a path (/foo/bar/baz -> baz)
- */
-const char *ceph_file_part(const char *s, int len)
-{
-	const char *e = s + len;
-
-	while (e != s && *(e-1) != '/')
-		e--;
-	return e;
-}
-
-
 /*
  * super ops
  */
 static void ceph_put_super(struct super_block *s)
 {
-	struct ceph_client *client = ceph_sb_to_client(s);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
 
 	dout("put_super\n");
-	ceph_mdsc_close_sessions(&client->mdsc);
+	ceph_mdsc_close_sessions(fsc->mdsc);
 
 	/*
 	 * ensure we release the bdi before put_anon_super releases
 	 * the device name.
 	 */
-	if (s->s_bdi == &client->backing_dev_info) {
-		bdi_unregister(&client->backing_dev_info);
+	if (s->s_bdi == &fsc->backing_dev_info) {
+		bdi_unregister(&fsc->backing_dev_info);
 		s->s_bdi = NULL;
 	}
 
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
64 53
65static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) 54static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
66{ 55{
67 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode); 56 struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
68 struct ceph_monmap *monmap = client->monc.monmap; 57 struct ceph_monmap *monmap = fsc->client->monc.monmap;
69 struct ceph_statfs st; 58 struct ceph_statfs st;
70 u64 fsid; 59 u64 fsid;
71 int err; 60 int err;
72 61
73 dout("statfs\n"); 62 dout("statfs\n");
74 err = ceph_monc_do_statfs(&client->monc, &st); 63 err = ceph_monc_do_statfs(&fsc->client->monc, &st);
75 if (err < 0) 64 if (err < 0)
76 return err; 65 return err;
77 66
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
104 93
105static int ceph_sync_fs(struct super_block *sb, int wait) 94static int ceph_sync_fs(struct super_block *sb, int wait)
106{ 95{
107 struct ceph_client *client = ceph_sb_to_client(sb); 96 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
108 97
109 if (!wait) { 98 if (!wait) {
110 dout("sync_fs (non-blocking)\n"); 99 dout("sync_fs (non-blocking)\n");
111 ceph_flush_dirty_caps(&client->mdsc); 100 ceph_flush_dirty_caps(fsc->mdsc);
112 dout("sync_fs (non-blocking) done\n"); 101 dout("sync_fs (non-blocking) done\n");
113 return 0; 102 return 0;
114 } 103 }
115 104
116 dout("sync_fs (blocking)\n"); 105 dout("sync_fs (blocking)\n");
117 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); 106 ceph_osdc_sync(&fsc->client->osdc);
118 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); 107 ceph_mdsc_sync(fsc->mdsc);
119 dout("sync_fs (blocking) done\n"); 108 dout("sync_fs (blocking) done\n");
120 return 0; 109 return 0;
121} 110}
122 111
123static int default_congestion_kb(void)
124{
125 int congestion_kb;
126
127 /*
128 * Copied from NFS
129 *
130 * congestion size, scale with available memory.
131 *
132 * 64MB: 8192k
133 * 128MB: 11585k
134 * 256MB: 16384k
135 * 512MB: 23170k
136 * 1GB: 32768k
137 * 2GB: 46340k
138 * 4GB: 65536k
139 * 8GB: 92681k
140 * 16GB: 131072k
141 *
142 * This allows larger machines to have larger/more transfers.
143 * Limit the default to 256M
144 */
145 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
146 if (congestion_kb > 256*1024)
147 congestion_kb = 256*1024;
148
149 return congestion_kb;
150}
151
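
A quick userspace check of the congestion formula above, worked for the table's 1GB row (assumes 4 KB pages; the kernel's int_sqrt is an integer square root, so floor of sqrt stands in for it here):

#include <math.h>
#include <stdio.h>

/* build with: cc congestion.c -lm */
int main(void)
{
	unsigned long totalram_pages = 262144;	/* 1 GB of 4 KB pages */
	int page_shift = 12;			/* 4 KB pages */
	long kb = (16 * (long)sqrt((double)totalram_pages))
			<< (page_shift - 10);

	if (kb > 256 * 1024)			/* cap the default at 256 MB */
		kb = 256 * 1024;
	printf("%ldk\n", kb);			/* prints 32768k, matching the table */
	return 0;
}
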
152/**
153 * ceph_show_options - Show mount options in /proc/mounts
154 * @m: seq_file to write to
155 * @mnt: mount descriptor
156 */
157static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
158{
159 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
160 struct ceph_mount_args *args = client->mount_args;
161
162 if (args->flags & CEPH_OPT_FSID)
163 seq_printf(m, ",fsid=%pU", &args->fsid);
164 if (args->flags & CEPH_OPT_NOSHARE)
165 seq_puts(m, ",noshare");
166 if (args->flags & CEPH_OPT_DIRSTAT)
167 seq_puts(m, ",dirstat");
168 if ((args->flags & CEPH_OPT_RBYTES) == 0)
169 seq_puts(m, ",norbytes");
170 if (args->flags & CEPH_OPT_NOCRC)
171 seq_puts(m, ",nocrc");
172 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
173 seq_puts(m, ",noasyncreaddir");
174
175 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
176 seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
177 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
178 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
179 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
180 seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
181 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
182 seq_printf(m, ",osdkeepalivetimeout=%d",
183 args->osd_keepalive_timeout);
184 if (args->wsize)
185 seq_printf(m, ",wsize=%d", args->wsize);
186 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
187 seq_printf(m, ",rsize=%d", args->rsize);
188 if (args->congestion_kb != default_congestion_kb())
189 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
190 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
191 seq_printf(m, ",caps_wanted_delay_min=%d",
192 args->caps_wanted_delay_min);
193 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
194 seq_printf(m, ",caps_wanted_delay_max=%d",
195 args->caps_wanted_delay_max);
196 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
197 seq_printf(m, ",cap_release_safety=%d",
198 args->cap_release_safety);
199 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
200 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
201 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
202 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
203 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
204 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
205 if (args->name)
206 seq_printf(m, ",name=%s", args->name);
207 if (args->secret)
208 seq_puts(m, ",secret=<hidden>");
209 return 0;
210}
211
212/*
213 * caches
214 */
215struct kmem_cache *ceph_inode_cachep;
216struct kmem_cache *ceph_cap_cachep;
217struct kmem_cache *ceph_dentry_cachep;
218struct kmem_cache *ceph_file_cachep;
219
220static void ceph_inode_init_once(void *foo)
221{
222 struct ceph_inode_info *ci = foo;
223 inode_init_once(&ci->vfs_inode);
224}
225
226static int __init init_caches(void)
227{
228 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
229 sizeof(struct ceph_inode_info),
230 __alignof__(struct ceph_inode_info),
231 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
232 ceph_inode_init_once);
233 if (ceph_inode_cachep == NULL)
234 return -ENOMEM;
235
236 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
237 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
238 if (ceph_cap_cachep == NULL)
239 goto bad_cap;
240
241 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
242 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
243 if (ceph_dentry_cachep == NULL)
244 goto bad_dentry;
245
246 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
247 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
248 if (ceph_file_cachep == NULL)
249 goto bad_file;
250
251 return 0;
252
253bad_file:
254 kmem_cache_destroy(ceph_dentry_cachep);
255bad_dentry:
256 kmem_cache_destroy(ceph_cap_cachep);
257bad_cap:
258 kmem_cache_destroy(ceph_inode_cachep);
259 return -ENOMEM;
260}
261
262static void destroy_caches(void)
263{
264 kmem_cache_destroy(ceph_inode_cachep);
265 kmem_cache_destroy(ceph_cap_cachep);
266 kmem_cache_destroy(ceph_dentry_cachep);
267 kmem_cache_destroy(ceph_file_cachep);
268}
269
270
271/*
 272 * ceph_umount_begin - initiate forced umount. Tear down the

273 * mount, skipping steps that may hang while waiting for server(s).
274 */
275static void ceph_umount_begin(struct super_block *sb)
276{
277 struct ceph_client *client = ceph_sb_to_client(sb);
278
279 dout("ceph_umount_begin - starting forced umount\n");
280 if (!client)
281 return;
282 client->mount_state = CEPH_MOUNT_SHUTDOWN;
283 return;
284}
285
286static const struct super_operations ceph_super_ops = {
287 .alloc_inode = ceph_alloc_inode,
288 .destroy_inode = ceph_destroy_inode,
289 .write_inode = ceph_write_inode,
290 .sync_fs = ceph_sync_fs,
291 .put_super = ceph_put_super,
292 .show_options = ceph_show_options,
293 .statfs = ceph_statfs,
294 .umount_begin = ceph_umount_begin,
295};
296
297
298const char *ceph_msg_type_name(int type)
299{
300 switch (type) {
301 case CEPH_MSG_SHUTDOWN: return "shutdown";
302 case CEPH_MSG_PING: return "ping";
303 case CEPH_MSG_AUTH: return "auth";
304 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
305 case CEPH_MSG_MON_MAP: return "mon_map";
306 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
307 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
308 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
309 case CEPH_MSG_STATFS: return "statfs";
310 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
311 case CEPH_MSG_MDS_MAP: return "mds_map";
312 case CEPH_MSG_CLIENT_SESSION: return "client_session";
313 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
314 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
315 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
316 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
317 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
318 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
319 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
320 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
321 case CEPH_MSG_OSD_MAP: return "osd_map";
322 case CEPH_MSG_OSD_OP: return "osd_op";
323 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
324 default: return "unknown";
325 }
326}
327
328
329/* 112/*
330 * mount options 113 * mount options
331 */ 114 */
332enum { 115enum {
333 Opt_wsize, 116 Opt_wsize,
334 Opt_rsize, 117 Opt_rsize,
335 Opt_osdtimeout,
336 Opt_osdkeepalivetimeout,
337 Opt_mount_timeout,
338 Opt_osd_idle_ttl,
339 Opt_caps_wanted_delay_min, 118 Opt_caps_wanted_delay_min,
340 Opt_caps_wanted_delay_max, 119 Opt_caps_wanted_delay_max,
341 Opt_cap_release_safety, 120 Opt_cap_release_safety,
@@ -344,29 +123,19 @@ enum {
344 Opt_congestion_kb, 123 Opt_congestion_kb,
345 Opt_last_int, 124 Opt_last_int,
346 /* int args above */ 125 /* int args above */
347 Opt_fsid,
348 Opt_snapdirname, 126 Opt_snapdirname,
349 Opt_name,
350 Opt_secret,
351 Opt_last_string, 127 Opt_last_string,
352 /* string args above */ 128 /* string args above */
353 Opt_ip,
354 Opt_noshare,
355 Opt_dirstat, 129 Opt_dirstat,
356 Opt_nodirstat, 130 Opt_nodirstat,
357 Opt_rbytes, 131 Opt_rbytes,
358 Opt_norbytes, 132 Opt_norbytes,
359 Opt_nocrc,
360 Opt_noasyncreaddir, 133 Opt_noasyncreaddir,
361}; 134};
362 135
363static match_table_t arg_tokens = { 136static match_table_t fsopt_tokens = {
364 {Opt_wsize, "wsize=%d"}, 137 {Opt_wsize, "wsize=%d"},
365 {Opt_rsize, "rsize=%d"}, 138 {Opt_rsize, "rsize=%d"},
366 {Opt_osdtimeout, "osdtimeout=%d"},
367 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
368 {Opt_mount_timeout, "mount_timeout=%d"},
369 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
370 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 139 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
371 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 140 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
372 {Opt_cap_release_safety, "cap_release_safety=%d"}, 141 {Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -374,403 +143,459 @@ static match_table_t arg_tokens = {
374 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, 143 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
375 {Opt_congestion_kb, "write_congestion_kb=%d"}, 144 {Opt_congestion_kb, "write_congestion_kb=%d"},
376 /* int args above */ 145 /* int args above */
377 {Opt_fsid, "fsid=%s"},
378 {Opt_snapdirname, "snapdirname=%s"}, 146 {Opt_snapdirname, "snapdirname=%s"},
379 {Opt_name, "name=%s"},
380 {Opt_secret, "secret=%s"},
381 /* string args above */ 147 /* string args above */
382 {Opt_ip, "ip=%s"},
383 {Opt_noshare, "noshare"},
384 {Opt_dirstat, "dirstat"}, 148 {Opt_dirstat, "dirstat"},
385 {Opt_nodirstat, "nodirstat"}, 149 {Opt_nodirstat, "nodirstat"},
386 {Opt_rbytes, "rbytes"}, 150 {Opt_rbytes, "rbytes"},
387 {Opt_norbytes, "norbytes"}, 151 {Opt_norbytes, "norbytes"},
388 {Opt_nocrc, "nocrc"},
389 {Opt_noasyncreaddir, "noasyncreaddir"}, 152 {Opt_noasyncreaddir, "noasyncreaddir"},
390 {-1, NULL} 153 {-1, NULL}
391}; 154};
392 155
393static int parse_fsid(const char *str, struct ceph_fsid *fsid) 156static int parse_fsopt_token(char *c, void *private)
394{ 157{
395 int i = 0; 158 struct ceph_mount_options *fsopt = private;
396 char tmp[3]; 159 substring_t argstr[MAX_OPT_ARGS];
397 int err = -EINVAL; 160 int token, intval, ret;
398 int d; 161
399 162 token = match_token((char *)c, fsopt_tokens, argstr);
400 dout("parse_fsid '%s'\n", str); 163 if (token < 0)
401 tmp[2] = 0; 164 return -EINVAL;
402 while (*str && i < 16) { 165
403 if (ispunct(*str)) { 166 if (token < Opt_last_int) {
404 str++; 167 ret = match_int(&argstr[0], &intval);
405 continue; 168 if (ret < 0) {
169 pr_err("bad mount option arg (not int) "
170 "at '%s'\n", c);
171 return ret;
406 } 172 }
407 if (!isxdigit(str[0]) || !isxdigit(str[1])) 173 dout("got int token %d val %d\n", token, intval);
408 break; 174 } else if (token > Opt_last_int && token < Opt_last_string) {
409 tmp[0] = str[0]; 175 dout("got string token %d val %s\n", token,
410 tmp[1] = str[1]; 176 argstr[0].from);
411 if (sscanf(tmp, "%x", &d) < 1) 177 } else {
412 break; 178 dout("got token %d\n", token);
413 fsid->fsid[i] = d & 0xff;
414 i++;
415 str += 2;
416 } 179 }
417 180
418 if (i == 16) 181 switch (token) {
419 err = 0; 182 case Opt_snapdirname:
420 dout("parse_fsid ret %d got fsid %pU", err, fsid); 183 kfree(fsopt->snapdir_name);
421 return err; 184 fsopt->snapdir_name = kstrndup(argstr[0].from,
185 argstr[0].to-argstr[0].from,
186 GFP_KERNEL);
187 if (!fsopt->snapdir_name)
188 return -ENOMEM;
189 break;
190
191 /* misc */
192 case Opt_wsize:
193 fsopt->wsize = intval;
194 break;
195 case Opt_rsize:
196 fsopt->rsize = intval;
197 break;
198 case Opt_caps_wanted_delay_min:
199 fsopt->caps_wanted_delay_min = intval;
200 break;
201 case Opt_caps_wanted_delay_max:
202 fsopt->caps_wanted_delay_max = intval;
203 break;
204 case Opt_readdir_max_entries:
205 fsopt->max_readdir = intval;
206 break;
207 case Opt_readdir_max_bytes:
208 fsopt->max_readdir_bytes = intval;
209 break;
210 case Opt_congestion_kb:
211 fsopt->congestion_kb = intval;
212 break;
213 case Opt_dirstat:
214 fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
215 break;
216 case Opt_nodirstat:
217 fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
218 break;
219 case Opt_rbytes:
220 fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
221 break;
222 case Opt_norbytes:
223 fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
224 break;
225 case Opt_noasyncreaddir:
226 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
227 break;
228 default:
229 BUG_ON(token);
230 }
231 return 0;
422} 232}
423 233
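
The callback above is the heart of this refactor: the cluster-wide options (fsid, name, secret, the osd timeouts) now belong to libceph's parser, which hands any token it does not recognize to parse_fsopt_token(). A standalone sketch of that split pattern (all names here are illustrative, not the libceph API):

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

typedef int (*token_cb)(char *tok, void *private);

/* generic layer: consume the options it owns, defer the rest */
static int parse_generic(char *options, token_cb cb, void *private)
{
	char *tok;

	while ((tok = strsep(&options, ",")) != NULL) {
		if (!*tok)
			continue;
		if (strncmp(tok, "name=", 5) == 0) {
			printf("generic option: %s\n", tok);
			continue;
		}
		if (cb(tok, private) < 0)	/* per-subsystem callback */
			return -1;
	}
	return 0;
}

/* per-subsystem layer: the fs-only options */
static int fs_token(char *tok, void *private)
{
	printf("fs option: %s\n", tok);
	return 0;
}

int main(void)
{
	char opts[] = "name=admin,dirstat,rsize=65536";

	return parse_generic(opts, fs_token, NULL);
}
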
424static struct ceph_mount_args *parse_mount_args(int flags, char *options, 234static void destroy_mount_options(struct ceph_mount_options *args)
425 const char *dev_name,
426 const char **path)
427{ 235{
428 struct ceph_mount_args *args; 236 dout("destroy_mount_options %p\n", args);
429 const char *c; 237 kfree(args->snapdir_name);
430 int err = -ENOMEM; 238 kfree(args);
431 substring_t argstr[MAX_OPT_ARGS]; 239}
432 240
433 args = kzalloc(sizeof(*args), GFP_KERNEL); 241static int strcmp_null(const char *s1, const char *s2)
434 if (!args) 242{
435 return ERR_PTR(-ENOMEM); 243 if (!s1 && !s2)
436 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr), 244 return 0;
437 GFP_KERNEL); 245 if (s1 && !s2)
438 if (!args->mon_addr) 246 return -1;
439 goto out; 247 if (!s1 && s2)
248 return 1;
249 return strcmp(s1, s2);
250}
440 251
441 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name); 252static int compare_mount_options(struct ceph_mount_options *new_fsopt,
442 253 struct ceph_options *new_opt,
443 /* start with defaults */ 254 struct ceph_fs_client *fsc)
444 args->sb_flags = flags; 255{
445 args->flags = CEPH_OPT_DEFAULT; 256 struct ceph_mount_options *fsopt1 = new_fsopt;
446 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; 257 struct ceph_mount_options *fsopt2 = fsc->mount_options;
447 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 258 int ofs = offsetof(struct ceph_mount_options, snapdir_name);
448 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ 259 int ret;
449 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
450 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
451 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
452 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
453 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
454 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
455 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
456 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
457 args->congestion_kb = default_congestion_kb();
458
459 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
460 err = -EINVAL;
461 if (!dev_name)
462 goto out;
463 *path = strstr(dev_name, ":/");
464 if (*path == NULL) {
465 pr_err("device name is missing path (no :/ in %s)\n",
466 dev_name);
467 goto out;
468 }
469 260
470 /* get mon ip(s) */ 261 ret = memcmp(fsopt1, fsopt2, ofs);
471 err = ceph_parse_ips(dev_name, *path, args->mon_addr, 262 if (ret)
472 CEPH_MAX_MON, &args->num_mon); 263 return ret;
473 if (err < 0) 264
474 goto out; 265 ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
266 if (ret)
267 return ret;
268
269 return ceph_compare_options(new_opt, fsc->client);
270}
271
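
compare_mount_options() leans on the struct layout promised in super.h further down: every member before snapdir_name is flat integer data, so a single memcmp() bounded by offsetof() covers them all, and only the pointer members need individual comparison (in the kernel case both structs come from zeroed allocations, so padding bytes compare equal too). A standalone sketch of the trick with a hypothetical struct:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct opts {
	int wsize;
	int rsize;
	char *snapdir_name;	/* pointers from here on: compare by hand */
};

static int opts_equal(const struct opts *a, const struct opts *b)
{
	size_t flat = offsetof(struct opts, snapdir_name);

	if (memcmp(a, b, flat))		/* all the flat members at once */
		return 0;
	return strcmp(a->snapdir_name, b->snapdir_name) == 0;
}

int main(void)
{
	struct opts a = { 4096, 8192, "snap" };
	struct opts b = { 4096, 8192, "snap" };

	printf("equal: %d\n", opts_equal(&a, &b));
	return 0;
}
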
272static int parse_mount_options(struct ceph_mount_options **pfsopt,
273 struct ceph_options **popt,
274 int flags, char *options,
275 const char *dev_name,
276 const char **path)
277{
278 struct ceph_mount_options *fsopt;
279 const char *dev_name_end;
280 int err = -ENOMEM;
281
282 fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
283 if (!fsopt)
284 return -ENOMEM;
285
286 dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
287
288 fsopt->sb_flags = flags;
289 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
290
291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
293 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
294 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
295 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
296 fsopt->congestion_kb = default_congestion_kb();
297
298 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
299 err = -EINVAL;
300 if (!dev_name)
301 goto out;
302 *path = strstr(dev_name, ":/");
303 if (*path == NULL) {
304 pr_err("device name is missing path (no :/ in %s)\n",
305 dev_name);
306 goto out;
307 }
308 dev_name_end = *path;
309 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
475 310
476 /* path on server */ 311 /* path on server */
477 *path += 2; 312 *path += 2;
478 dout("server path '%s'\n", *path); 313 dout("server path '%s'\n", *path);
479 314
480 /* parse mount options */ 315 err = ceph_parse_options(popt, options, dev_name, dev_name_end,
481 while ((c = strsep(&options, ",")) != NULL) { 316 parse_fsopt_token, (void *)fsopt);
482 int token, intval, ret; 317 if (err)
483 if (!*c) 318 goto out;
484 continue; 319
485 err = -EINVAL; 320 /* success */
486 token = match_token((char *)c, arg_tokens, argstr); 321 *pfsopt = fsopt;
487 if (token < 0) { 322 return 0;
488 pr_err("bad mount option at '%s'\n", c);
489 goto out;
490 }
491 if (token < Opt_last_int) {
492 ret = match_int(&argstr[0], &intval);
493 if (ret < 0) {
494 pr_err("bad mount option arg (not int) "
495 "at '%s'\n", c);
496 continue;
497 }
498 dout("got int token %d val %d\n", token, intval);
499 } else if (token > Opt_last_int && token < Opt_last_string) {
500 dout("got string token %d val %s\n", token,
501 argstr[0].from);
502 } else {
503 dout("got token %d\n", token);
504 }
505 switch (token) {
506 case Opt_ip:
507 err = ceph_parse_ips(argstr[0].from,
508 argstr[0].to,
509 &args->my_addr,
510 1, NULL);
511 if (err < 0)
512 goto out;
513 args->flags |= CEPH_OPT_MYIP;
514 break;
515
516 case Opt_fsid:
517 err = parse_fsid(argstr[0].from, &args->fsid);
518 if (err == 0)
519 args->flags |= CEPH_OPT_FSID;
520 break;
521 case Opt_snapdirname:
522 kfree(args->snapdir_name);
523 args->snapdir_name = kstrndup(argstr[0].from,
524 argstr[0].to-argstr[0].from,
525 GFP_KERNEL);
526 break;
527 case Opt_name:
528 args->name = kstrndup(argstr[0].from,
529 argstr[0].to-argstr[0].from,
530 GFP_KERNEL);
531 break;
532 case Opt_secret:
533 args->secret = kstrndup(argstr[0].from,
534 argstr[0].to-argstr[0].from,
535 GFP_KERNEL);
536 break;
537
538 /* misc */
539 case Opt_wsize:
540 args->wsize = intval;
541 break;
542 case Opt_rsize:
543 args->rsize = intval;
544 break;
545 case Opt_osdtimeout:
546 args->osd_timeout = intval;
547 break;
548 case Opt_osdkeepalivetimeout:
549 args->osd_keepalive_timeout = intval;
550 break;
551 case Opt_osd_idle_ttl:
552 args->osd_idle_ttl = intval;
553 break;
554 case Opt_mount_timeout:
555 args->mount_timeout = intval;
556 break;
557 case Opt_caps_wanted_delay_min:
558 args->caps_wanted_delay_min = intval;
559 break;
560 case Opt_caps_wanted_delay_max:
561 args->caps_wanted_delay_max = intval;
562 break;
563 case Opt_readdir_max_entries:
564 args->max_readdir = intval;
565 break;
566 case Opt_readdir_max_bytes:
567 args->max_readdir_bytes = intval;
568 break;
569 case Opt_congestion_kb:
570 args->congestion_kb = intval;
571 break;
572
573 case Opt_noshare:
574 args->flags |= CEPH_OPT_NOSHARE;
575 break;
576
577 case Opt_dirstat:
578 args->flags |= CEPH_OPT_DIRSTAT;
579 break;
580 case Opt_nodirstat:
581 args->flags &= ~CEPH_OPT_DIRSTAT;
582 break;
583 case Opt_rbytes:
584 args->flags |= CEPH_OPT_RBYTES;
585 break;
586 case Opt_norbytes:
587 args->flags &= ~CEPH_OPT_RBYTES;
588 break;
589 case Opt_nocrc:
590 args->flags |= CEPH_OPT_NOCRC;
591 break;
592 case Opt_noasyncreaddir:
593 args->flags |= CEPH_OPT_NOASYNCREADDIR;
594 break;
595
596 default:
597 BUG_ON(token);
598 }
599 }
600 return args;
601 323
602out: 324out:
603 kfree(args->mon_addr); 325 destroy_mount_options(fsopt);
604 kfree(args); 326 return err;
605 return ERR_PTR(err);
606} 327}
607 328
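
The dev_name grammar noted in the comment (ip1[:port1][,ip2[:port2]...]:/subdir/in/fs) is split on the first ":/": everything before it is the monitor list, everything after is the path inside the filesystem. A tiny standalone demonstration of that split:

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *dev_name = "1.2.3.4:6789,1.2.3.5:6789:/subdir/in/fs";
	const char *path = strstr(dev_name, ":/");

	if (!path)
		return 1;	/* no ":/": the mount is rejected */
	printf("monitors: '%.*s'\n", (int)(path - dev_name), dev_name);
	printf("path:     '%s'\n", path + 2);
	return 0;
}
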
608static void destroy_mount_args(struct ceph_mount_args *args) 329/**
330 * ceph_show_options - Show mount options in /proc/mounts
331 * @m: seq_file to write to
332 * @mnt: mount descriptor
333 */
334static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
609{ 335{
610 dout("destroy_mount_args %p\n", args); 336 struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
611 kfree(args->snapdir_name); 337 struct ceph_mount_options *fsopt = fsc->mount_options;
612 args->snapdir_name = NULL; 338 struct ceph_options *opt = fsc->client->options;
613 kfree(args->name); 339
614 args->name = NULL; 340 if (opt->flags & CEPH_OPT_FSID)
615 kfree(args->secret); 341 seq_printf(m, ",fsid=%pU", &opt->fsid);
616 args->secret = NULL; 342 if (opt->flags & CEPH_OPT_NOSHARE)
617 kfree(args); 343 seq_puts(m, ",noshare");
344 if (opt->flags & CEPH_OPT_NOCRC)
345 seq_puts(m, ",nocrc");
346
347 if (opt->name)
348 seq_printf(m, ",name=%s", opt->name);
349 if (opt->secret)
350 seq_puts(m, ",secret=<hidden>");
351
352 if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
353 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
354 if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
355 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
356 if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
357 seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
358 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
359 seq_printf(m, ",osdkeepalivetimeout=%d",
360 opt->osd_keepalive_timeout);
361
362 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
363 seq_puts(m, ",dirstat");
364 if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
365 seq_puts(m, ",norbytes");
366 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
367 seq_puts(m, ",noasyncreaddir");
368
369 if (fsopt->wsize)
370 seq_printf(m, ",wsize=%d", fsopt->wsize);
371 if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
372 seq_printf(m, ",rsize=%d", fsopt->rsize);
373 if (fsopt->congestion_kb != default_congestion_kb())
374 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
375 if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
376 seq_printf(m, ",caps_wanted_delay_min=%d",
377 fsopt->caps_wanted_delay_min);
378 if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
379 seq_printf(m, ",caps_wanted_delay_max=%d",
380 fsopt->caps_wanted_delay_max);
381 if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
382 seq_printf(m, ",cap_release_safety=%d",
383 fsopt->cap_release_safety);
384 if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
385 seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
386 if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
387 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
388 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
389 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
390 return 0;
618} 391}
619 392
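
For reference, what ceph_show_options() emits is the option string after the mount point in /proc/mounts; a hypothetical line with illustrative values (only non-default options are printed):

	1.2.3.4:6789:/ /mnt/ceph ceph rw,name=admin,secret=<hidden>,dirstat,rsize=1048576 0 0
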
620/* 393/*
621 * create a fresh client instance 394 * handle any mon messages the standard library doesn't understand.
 395 * return an error if we don't understand it either.
622 */ 396 */
623static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) 397static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
624{ 398{
625 struct ceph_client *client; 399 struct ceph_fs_client *fsc = client->private;
400 int type = le16_to_cpu(msg->hdr.type);
401
402 switch (type) {
403 case CEPH_MSG_MDS_MAP:
404 ceph_mdsc_handle_map(fsc->mdsc, msg);
405 return 0;
406
407 default:
408 return -1;
409 }
410}
411
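
extra_mon_dispatch() is the hook that keeps MDS-map handling in the filesystem while the monitor session itself now lives in libceph: the generic dispatcher tries its own handlers first and falls back to this per-client callback. A userspace sketch of the two-level dispatch (illustrative names and message types only):

#include <stdio.h>

struct client {
	/* returns 0 if the message was consumed, -1 otherwise */
	int (*extra_dispatch)(struct client *c, int msg_type);
};

static int fs_dispatch(struct client *c, int msg_type)
{
	if (msg_type == 21) {		/* say, an mds map */
		printf("fs layer handled type %d\n", msg_type);
		return 0;
	}
	return -1;			/* not ours either */
}

static void dispatch(struct client *c, int msg_type)
{
	if (msg_type == 4) {		/* say, a mon map: core handles it */
		printf("core handled type %d\n", msg_type);
		return;
	}
	if (c->extra_dispatch && c->extra_dispatch(c, msg_type) == 0)
		return;
	printf("unrecognized msg type %d\n", msg_type);
}

int main(void)
{
	struct client c = { .extra_dispatch = fs_dispatch };

	dispatch(&c, 4);
	dispatch(&c, 21);
	dispatch(&c, 99);
	return 0;
}
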
412/*
413 * create a new fs client
414 */
415struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
416 struct ceph_options *opt)
417{
418 struct ceph_fs_client *fsc;
626 int err = -ENOMEM; 419 int err = -ENOMEM;
627 420
628 client = kzalloc(sizeof(*client), GFP_KERNEL); 421 fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
629 if (client == NULL) 422 if (!fsc)
630 return ERR_PTR(-ENOMEM); 423 return ERR_PTR(-ENOMEM);
631 424
632 mutex_init(&client->mount_mutex); 425 fsc->client = ceph_create_client(opt, fsc);
633 426 if (IS_ERR(fsc->client)) {
634 init_waitqueue_head(&client->auth_wq); 427 err = PTR_ERR(fsc->client);
428 goto fail;
429 }
430 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
431 fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
432 fsc->client->monc.want_mdsmap = 1;
635 433
636 client->sb = NULL; 434 fsc->mount_options = fsopt;
637 client->mount_state = CEPH_MOUNT_MOUNTING;
638 client->mount_args = args;
639 435
640 client->msgr = NULL; 436 fsc->sb = NULL;
437 fsc->mount_state = CEPH_MOUNT_MOUNTING;
641 438
642 client->auth_err = 0; 439 atomic_long_set(&fsc->writeback_count, 0);
643 atomic_long_set(&client->writeback_count, 0);
644 440
645 err = bdi_init(&client->backing_dev_info); 441 err = bdi_init(&fsc->backing_dev_info);
646 if (err < 0) 442 if (err < 0)
647 goto fail; 443 goto fail_client;
648 444
649 err = -ENOMEM; 445 err = -ENOMEM;
650 client->wb_wq = create_workqueue("ceph-writeback"); 446 fsc->wb_wq = create_workqueue("ceph-writeback");
651 if (client->wb_wq == NULL) 447 if (fsc->wb_wq == NULL)
652 goto fail_bdi; 448 goto fail_bdi;
653 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); 449 fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
654 if (client->pg_inv_wq == NULL) 450 if (fsc->pg_inv_wq == NULL)
655 goto fail_wb_wq; 451 goto fail_wb_wq;
656 client->trunc_wq = create_singlethread_workqueue("ceph-trunc"); 452 fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
657 if (client->trunc_wq == NULL) 453 if (fsc->trunc_wq == NULL)
658 goto fail_pg_inv_wq; 454 goto fail_pg_inv_wq;
659 455
660 /* set up mempools */ 456 /* set up mempools */
661 err = -ENOMEM; 457 err = -ENOMEM;
662 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, 458 fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
663 client->mount_args->wsize >> PAGE_CACHE_SHIFT); 459 fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
664 if (!client->wb_pagevec_pool) 460 if (!fsc->wb_pagevec_pool)
665 goto fail_trunc_wq; 461 goto fail_trunc_wq;
666 462
667 /* caps */ 463 /* caps */
668 client->min_caps = args->max_readdir; 464 fsc->min_caps = fsopt->max_readdir;
465
466 return fsc;
669 467
670 /* subsystems */
671 err = ceph_monc_init(&client->monc, client);
672 if (err < 0)
673 goto fail_mempool;
674 err = ceph_osdc_init(&client->osdc, client);
675 if (err < 0)
676 goto fail_monc;
677 err = ceph_mdsc_init(&client->mdsc, client);
678 if (err < 0)
679 goto fail_osdc;
680 return client;
681
682fail_osdc:
683 ceph_osdc_stop(&client->osdc);
684fail_monc:
685 ceph_monc_stop(&client->monc);
686fail_mempool:
687 mempool_destroy(client->wb_pagevec_pool);
688fail_trunc_wq: 468fail_trunc_wq:
689 destroy_workqueue(client->trunc_wq); 469 destroy_workqueue(fsc->trunc_wq);
690fail_pg_inv_wq: 470fail_pg_inv_wq:
691 destroy_workqueue(client->pg_inv_wq); 471 destroy_workqueue(fsc->pg_inv_wq);
692fail_wb_wq: 472fail_wb_wq:
693 destroy_workqueue(client->wb_wq); 473 destroy_workqueue(fsc->wb_wq);
694fail_bdi: 474fail_bdi:
695 bdi_destroy(&client->backing_dev_info); 475 bdi_destroy(&fsc->backing_dev_info);
476fail_client:
477 ceph_destroy_client(fsc->client);
696fail: 478fail:
697 kfree(client); 479 kfree(fsc);
698 return ERR_PTR(err); 480 return ERR_PTR(err);
699} 481}
700 482
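
create_fs_client() is a textbook example of the kernel's goto-unwind idiom: each failure label undoes exactly the acquisitions that precede it, in reverse order, so there is one success return and no leak on any partial failure. A self-contained sketch of the shape:

#include <stdlib.h>

struct thing {
	char *buf_a, *buf_b;
};

static struct thing *thing_create(void)
{
	struct thing *t = calloc(1, sizeof(*t));

	if (!t)
		goto fail;
	t->buf_a = malloc(64);
	if (!t->buf_a)
		goto fail_t;
	t->buf_b = malloc(64);
	if (!t->buf_b)
		goto fail_a;
	return t;		/* the single success path */

fail_a:			/* labels unwind in reverse acquisition order */
	free(t->buf_a);
fail_t:
	free(t);
fail:
	return NULL;
}

int main(void)
{
	struct thing *t = thing_create();

	if (!t)
		return 1;
	free(t->buf_b);
	free(t->buf_a);
	free(t);
	return 0;
}
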
701static void ceph_destroy_client(struct ceph_client *client) 483void destroy_fs_client(struct ceph_fs_client *fsc)
702{ 484{
703 dout("destroy_client %p\n", client); 485 dout("destroy_fs_client %p\n", fsc);
704 486
705 /* unmount */ 487 destroy_workqueue(fsc->wb_wq);
706 ceph_mdsc_stop(&client->mdsc); 488 destroy_workqueue(fsc->pg_inv_wq);
707 ceph_osdc_stop(&client->osdc); 489 destroy_workqueue(fsc->trunc_wq);
708 490
709 /* 491 bdi_destroy(&fsc->backing_dev_info);
710 * make sure mds and osd connections close out before destroying
711 * the auth module, which is needed to free those connections'
712 * ceph_authorizers.
713 */
714 ceph_msgr_flush();
715
716 ceph_monc_stop(&client->monc);
717 492
718 ceph_debugfs_client_cleanup(client); 493 mempool_destroy(fsc->wb_pagevec_pool);
719 destroy_workqueue(client->wb_wq);
720 destroy_workqueue(client->pg_inv_wq);
721 destroy_workqueue(client->trunc_wq);
722 494
723 bdi_destroy(&client->backing_dev_info); 495 destroy_mount_options(fsc->mount_options);
724 496
725 if (client->msgr) 497 ceph_fs_debugfs_cleanup(fsc);
726 ceph_messenger_destroy(client->msgr);
727 mempool_destroy(client->wb_pagevec_pool);
728 498
729 destroy_mount_args(client->mount_args); 499 ceph_destroy_client(fsc->client);
730 500
731 kfree(client); 501 kfree(fsc);
732 dout("destroy_client %p done\n", client); 502 dout("destroy_fs_client %p done\n", fsc);
733} 503}
734 504
735/* 505/*
736 * Initially learn our fsid, or verify an fsid matches. 506 * caches
737 */ 507 */
738int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) 508struct kmem_cache *ceph_inode_cachep;
509struct kmem_cache *ceph_cap_cachep;
510struct kmem_cache *ceph_dentry_cachep;
511struct kmem_cache *ceph_file_cachep;
512
513static void ceph_inode_init_once(void *foo)
739{ 514{
740 if (client->have_fsid) { 515 struct ceph_inode_info *ci = foo;
741 if (ceph_fsid_compare(&client->fsid, fsid)) { 516 inode_init_once(&ci->vfs_inode);
742 pr_err("bad fsid, had %pU got %pU", 517}
743 &client->fsid, fsid); 518
744 return -1; 519static int __init init_caches(void)
745 } 520{
746 } else { 521 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
747 pr_info("client%lld fsid %pU\n", client->monc.auth->global_id, 522 sizeof(struct ceph_inode_info),
748 fsid); 523 __alignof__(struct ceph_inode_info),
749 memcpy(&client->fsid, fsid, sizeof(*fsid)); 524 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
750 ceph_debugfs_client_init(client); 525 ceph_inode_init_once);
751 client->have_fsid = true; 526 if (ceph_inode_cachep == NULL)
752 } 527 return -ENOMEM;
528
529 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
530 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
531 if (ceph_cap_cachep == NULL)
532 goto bad_cap;
533
534 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
535 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
536 if (ceph_dentry_cachep == NULL)
537 goto bad_dentry;
538
539 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
540 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
541 if (ceph_file_cachep == NULL)
542 goto bad_file;
543
753 return 0; 544 return 0;
545
546bad_file:
547 kmem_cache_destroy(ceph_dentry_cachep);
548bad_dentry:
549 kmem_cache_destroy(ceph_cap_cachep);
550bad_cap:
551 kmem_cache_destroy(ceph_inode_cachep);
552 return -ENOMEM;
754} 553}
755 554
555static void destroy_caches(void)
556{
557 kmem_cache_destroy(ceph_inode_cachep);
558 kmem_cache_destroy(ceph_cap_cachep);
559 kmem_cache_destroy(ceph_dentry_cachep);
560 kmem_cache_destroy(ceph_file_cachep);
561}
562
563
756/* 564/*
 757 * true if we have the mon map (and have thus joined the cluster) 565 * ceph_umount_begin - initiate forced umount. Tear down the
566 * mount, skipping steps that may hang while waiting for server(s).
758 */ 567 */
759static int have_mon_and_osd_map(struct ceph_client *client) 568static void ceph_umount_begin(struct super_block *sb)
760{ 569{
761 return client->monc.monmap && client->monc.monmap->epoch && 570 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
762 client->osdc.osdmap && client->osdc.osdmap->epoch; 571
572 dout("ceph_umount_begin - starting forced umount\n");
573 if (!fsc)
574 return;
575 fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
576 return;
763} 577}
764 578
579static const struct super_operations ceph_super_ops = {
580 .alloc_inode = ceph_alloc_inode,
581 .destroy_inode = ceph_destroy_inode,
582 .write_inode = ceph_write_inode,
583 .sync_fs = ceph_sync_fs,
584 .put_super = ceph_put_super,
585 .show_options = ceph_show_options,
586 .statfs = ceph_statfs,
587 .umount_begin = ceph_umount_begin,
588};
589
765/* 590/*
766 * Bootstrap mount by opening the root directory. Note the mount 591 * Bootstrap mount by opening the root directory. Note the mount
767 * @started time from caller, and time out if this takes too long. 592 * @started time from caller, and time out if this takes too long.
768 */ 593 */
769static struct dentry *open_root_dentry(struct ceph_client *client, 594static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
770 const char *path, 595 const char *path,
771 unsigned long started) 596 unsigned long started)
772{ 597{
773 struct ceph_mds_client *mdsc = &client->mdsc; 598 struct ceph_mds_client *mdsc = fsc->mdsc;
774 struct ceph_mds_request *req = NULL; 599 struct ceph_mds_request *req = NULL;
775 int err; 600 int err;
776 struct dentry *root; 601 struct dentry *root;
@@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
784 req->r_ino1.ino = CEPH_INO_ROOT; 609 req->r_ino1.ino = CEPH_INO_ROOT;
785 req->r_ino1.snap = CEPH_NOSNAP; 610 req->r_ino1.snap = CEPH_NOSNAP;
786 req->r_started = started; 611 req->r_started = started;
787 req->r_timeout = client->mount_args->mount_timeout * HZ; 612 req->r_timeout = fsc->client->options->mount_timeout * HZ;
788 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 613 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
789 req->r_num_caps = 2; 614 req->r_num_caps = 2;
790 err = ceph_mdsc_do_request(mdsc, NULL, req); 615 err = ceph_mdsc_do_request(mdsc, NULL, req);
791 if (err == 0) { 616 if (err == 0) {
792 dout("open_root_inode success\n"); 617 dout("open_root_inode success\n");
793 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && 618 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
794 client->sb->s_root == NULL) 619 fsc->sb->s_root == NULL)
795 root = d_alloc_root(req->r_target_inode); 620 root = d_alloc_root(req->r_target_inode);
796 else 621 else
797 root = d_obtain_alias(req->r_target_inode); 622 root = d_obtain_alias(req->r_target_inode);
@@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
804 return root; 629 return root;
805} 630}
806 631
632
633
634
807/* 635/*
808 * mount: join the ceph cluster, and open root directory. 636 * mount: join the ceph cluster, and open root directory.
809 */ 637 */
810static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, 638static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
811 const char *path) 639 const char *path)
812{ 640{
813 struct ceph_entity_addr *myaddr = NULL;
814 int err; 641 int err;
815 unsigned long timeout = client->mount_args->mount_timeout * HZ;
816 unsigned long started = jiffies; /* note the start time */ 642 unsigned long started = jiffies; /* note the start time */
817 struct dentry *root; 643 struct dentry *root;
644 int first = 0; /* first vfsmount for this super_block */
818 645
819 dout("mount start\n"); 646 dout("mount start\n");
820 mutex_lock(&client->mount_mutex); 647 mutex_lock(&fsc->client->mount_mutex);
821
822 /* initialize the messenger */
823 if (client->msgr == NULL) {
824 if (ceph_test_opt(client, MYIP))
825 myaddr = &client->mount_args->my_addr;
826 client->msgr = ceph_messenger_create(myaddr);
827 if (IS_ERR(client->msgr)) {
828 err = PTR_ERR(client->msgr);
829 client->msgr = NULL;
830 goto out;
831 }
832 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
833 }
834 648
835 /* open session, and wait for mon, mds, and osd maps */ 649 err = __ceph_open_session(fsc->client, started);
836 err = ceph_monc_open_session(&client->monc);
837 if (err < 0) 650 if (err < 0)
838 goto out; 651 goto out;
839 652
840 while (!have_mon_and_osd_map(client)) {
841 err = -EIO;
842 if (timeout && time_after_eq(jiffies, started + timeout))
843 goto out;
844
845 /* wait */
846 dout("mount waiting for mon_map\n");
847 err = wait_event_interruptible_timeout(client->auth_wq,
848 have_mon_and_osd_map(client) || (client->auth_err < 0),
849 timeout);
850 if (err == -EINTR || err == -ERESTARTSYS)
851 goto out;
852 if (client->auth_err < 0) {
853 err = client->auth_err;
854 goto out;
855 }
856 }
857
858 dout("mount opening root\n"); 653 dout("mount opening root\n");
859 root = open_root_dentry(client, "", started); 654 root = open_root_dentry(fsc, "", started);
860 if (IS_ERR(root)) { 655 if (IS_ERR(root)) {
861 err = PTR_ERR(root); 656 err = PTR_ERR(root);
862 goto out; 657 goto out;
863 } 658 }
864 if (client->sb->s_root) 659 if (fsc->sb->s_root) {
865 dput(root); 660 dput(root);
866 else 661 } else {
867 client->sb->s_root = root; 662 fsc->sb->s_root = root;
663 first = 1;
664
665 err = ceph_fs_debugfs_init(fsc);
666 if (err < 0)
667 goto fail;
668 }
868 669
869 if (path[0] == 0) { 670 if (path[0] == 0) {
870 dget(root); 671 dget(root);
871 } else { 672 } else {
872 dout("mount opening base mountpoint\n"); 673 dout("mount opening base mountpoint\n");
873 root = open_root_dentry(client, path, started); 674 root = open_root_dentry(fsc, path, started);
874 if (IS_ERR(root)) { 675 if (IS_ERR(root)) {
875 err = PTR_ERR(root); 676 err = PTR_ERR(root);
876 dput(client->sb->s_root); 677 goto fail;
877 client->sb->s_root = NULL;
878 goto out;
879 } 678 }
880 } 679 }
881 680
882 mnt->mnt_root = root; 681 mnt->mnt_root = root;
883 mnt->mnt_sb = client->sb; 682 mnt->mnt_sb = fsc->sb;
884 683
885 client->mount_state = CEPH_MOUNT_MOUNTED; 684 fsc->mount_state = CEPH_MOUNT_MOUNTED;
886 dout("mount success\n"); 685 dout("mount success\n");
887 err = 0; 686 err = 0;
888 687
889out: 688out:
890 mutex_unlock(&client->mount_mutex); 689 mutex_unlock(&fsc->client->mount_mutex);
891 return err; 690 return err;
691
692fail:
693 if (first) {
694 dput(fsc->sb->s_root);
695 fsc->sb->s_root = NULL;
696 }
697 goto out;
892} 698}
893 699
894static int ceph_set_super(struct super_block *s, void *data) 700static int ceph_set_super(struct super_block *s, void *data)
895{ 701{
896 struct ceph_client *client = data; 702 struct ceph_fs_client *fsc = data;
897 int ret; 703 int ret;
898 704
899 dout("set_super %p data %p\n", s, data); 705 dout("set_super %p data %p\n", s, data);
900 706
901 s->s_flags = client->mount_args->sb_flags; 707 s->s_flags = fsc->mount_options->sb_flags;
902 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ 708 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
903 709
904 s->s_fs_info = client; 710 s->s_fs_info = fsc;
905 client->sb = s; 711 fsc->sb = s;
906 712
907 s->s_op = &ceph_super_ops; 713 s->s_op = &ceph_super_ops;
908 s->s_export_op = &ceph_export_ops; 714 s->s_export_op = &ceph_export_ops;
@@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data)
917 723
918fail: 724fail:
919 s->s_fs_info = NULL; 725 s->s_fs_info = NULL;
920 client->sb = NULL; 726 fsc->sb = NULL;
921 return ret; 727 return ret;
922} 728}
923 729
@@ -926,30 +732,23 @@ fail:
926 */ 732 */
927static int ceph_compare_super(struct super_block *sb, void *data) 733static int ceph_compare_super(struct super_block *sb, void *data)
928{ 734{
929 struct ceph_client *new = data; 735 struct ceph_fs_client *new = data;
930 struct ceph_mount_args *args = new->mount_args; 736 struct ceph_mount_options *fsopt = new->mount_options;
931 struct ceph_client *other = ceph_sb_to_client(sb); 737 struct ceph_options *opt = new->client->options;
932 int i; 738 struct ceph_fs_client *other = ceph_sb_to_client(sb);
933 739
934 dout("ceph_compare_super %p\n", sb); 740 dout("ceph_compare_super %p\n", sb);
935 if (args->flags & CEPH_OPT_FSID) { 741
936 if (ceph_fsid_compare(&args->fsid, &other->fsid)) { 742 if (compare_mount_options(fsopt, opt, other)) {
937 dout("fsid doesn't match\n"); 743 dout("monitor(s)/mount options don't match\n");
938 return 0; 744 return 0;
939 }
940 } else {
941 /* do we share (a) monitor? */
942 for (i = 0; i < new->monc.monmap->num_mon; i++)
943 if (ceph_monmap_contains(other->monc.monmap,
944 &new->monc.monmap->mon_inst[i].addr))
945 break;
946 if (i == new->monc.monmap->num_mon) {
947 dout("mon ip not part of monmap\n");
948 return 0;
949 }
950 dout("mon ip matches existing sb %p\n", sb);
951 } 745 }
952 if (args->sb_flags != other->mount_args->sb_flags) { 746 if ((opt->flags & CEPH_OPT_FSID) &&
747 ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
748 dout("fsid doesn't match\n");
749 return 0;
750 }
751 if (fsopt->sb_flags != other->mount_options->sb_flags) {
953 dout("flags differ\n"); 752 dout("flags differ\n");
954 return 0; 753 return 0;
955 } 754 }
@@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data)
961 */ 760 */
962static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 761static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
963 762
964static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 763static int ceph_register_bdi(struct super_block *sb,
764 struct ceph_fs_client *fsc)
965{ 765{
966 int err; 766 int err;
967 767
968 /* set ra_pages based on rsize mount option? */ 768 /* set ra_pages based on rsize mount option? */
969 if (client->mount_args->rsize >= PAGE_CACHE_SIZE) 769 if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
970 client->backing_dev_info.ra_pages = 770 fsc->backing_dev_info.ra_pages =
971 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 771 (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
972 >> PAGE_SHIFT; 772 >> PAGE_SHIFT;
973 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", 773 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
974 atomic_long_inc_return(&bdi_seq)); 774 atomic_long_inc_return(&bdi_seq));
975 if (!err) 775 if (!err)
976 sb->s_bdi = &client->backing_dev_info; 776 sb->s_bdi = &fsc->backing_dev_info;
977 return err; 777 return err;
978} 778}
979 779
@@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type,
982 struct vfsmount *mnt) 782 struct vfsmount *mnt)
983{ 783{
984 struct super_block *sb; 784 struct super_block *sb;
985 struct ceph_client *client; 785 struct ceph_fs_client *fsc;
986 int err; 786 int err;
987 int (*compare_super)(struct super_block *, void *) = ceph_compare_super; 787 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
988 const char *path = NULL; 788 const char *path = NULL;
989 struct ceph_mount_args *args; 789 struct ceph_mount_options *fsopt = NULL;
790 struct ceph_options *opt = NULL;
990 791
991 dout("ceph_get_sb\n"); 792 dout("ceph_get_sb\n");
992 args = parse_mount_args(flags, data, dev_name, &path); 793 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
993 if (IS_ERR(args)) { 794 if (err < 0)
994 err = PTR_ERR(args);
995 goto out_final; 795 goto out_final;
996 }
997 796
998 /* create client (which we may/may not use) */ 797 /* create client (which we may/may not use) */
999 client = ceph_create_client(args); 798 fsc = create_fs_client(fsopt, opt);
1000 if (IS_ERR(client)) { 799 if (IS_ERR(fsc)) {
1001 err = PTR_ERR(client); 800 err = PTR_ERR(fsc);
801 kfree(fsopt);
802 kfree(opt);
1002 goto out_final; 803 goto out_final;
1003 } 804 }
1004 805
1005 if (client->mount_args->flags & CEPH_OPT_NOSHARE) 806 err = ceph_mdsc_init(fsc);
807 if (err < 0)
808 goto out;
809
810 if (ceph_test_opt(fsc->client, NOSHARE))
1006 compare_super = NULL; 811 compare_super = NULL;
1007 sb = sget(fs_type, compare_super, ceph_set_super, client); 812 sb = sget(fs_type, compare_super, ceph_set_super, fsc);
1008 if (IS_ERR(sb)) { 813 if (IS_ERR(sb)) {
1009 err = PTR_ERR(sb); 814 err = PTR_ERR(sb);
1010 goto out; 815 goto out;
1011 } 816 }
1012 817
1013 if (ceph_sb_to_client(sb) != client) { 818 if (ceph_sb_to_client(sb) != fsc) {
1014 ceph_destroy_client(client); 819 ceph_mdsc_destroy(fsc);
1015 client = ceph_sb_to_client(sb); 820 destroy_fs_client(fsc);
1016 dout("get_sb got existing client %p\n", client); 821 fsc = ceph_sb_to_client(sb);
822 dout("get_sb got existing client %p\n", fsc);
1017 } else { 823 } else {
1018 dout("get_sb using new client %p\n", client); 824 dout("get_sb using new client %p\n", fsc);
1019 err = ceph_register_bdi(sb, client); 825 err = ceph_register_bdi(sb, fsc);
1020 if (err < 0) 826 if (err < 0)
1021 goto out_splat; 827 goto out_splat;
1022 } 828 }
1023 829
1024 err = ceph_mount(client, mnt, path); 830 err = ceph_mount(fsc, mnt, path);
1025 if (err < 0) 831 if (err < 0)
1026 goto out_splat; 832 goto out_splat;
1027 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, 833 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
@@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type,
1029 return 0; 835 return 0;
1030 836
1031out_splat: 837out_splat:
1032 ceph_mdsc_close_sessions(&client->mdsc); 838 ceph_mdsc_close_sessions(fsc->mdsc);
1033 deactivate_locked_super(sb); 839 deactivate_locked_super(sb);
1034 goto out_final; 840 goto out_final;
1035 841
1036out: 842out:
1037 ceph_destroy_client(client); 843 ceph_mdsc_destroy(fsc);
844 destroy_fs_client(fsc);
1038out_final: 845out_final:
1039 dout("ceph_get_sb fail %d\n", err); 846 dout("ceph_get_sb fail %d\n", err);
1040 return err; 847 return err;
@@ -1042,11 +849,12 @@ out_final:
1042 849
1043static void ceph_kill_sb(struct super_block *s) 850static void ceph_kill_sb(struct super_block *s)
1044{ 851{
1045 struct ceph_client *client = ceph_sb_to_client(s); 852 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
1046 dout("kill_sb %p\n", s); 853 dout("kill_sb %p\n", s);
1047 ceph_mdsc_pre_umount(&client->mdsc); 854 ceph_mdsc_pre_umount(fsc->mdsc);
1048 kill_anon_super(s); /* will call put_super after sb is r/o */ 855 kill_anon_super(s); /* will call put_super after sb is r/o */
1049 ceph_destroy_client(client); 856 ceph_mdsc_destroy(fsc);
857 destroy_fs_client(fsc);
1050} 858}
1051 859
1052static struct file_system_type ceph_fs_type = { 860static struct file_system_type ceph_fs_type = {
@@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = {
1062 870
1063static int __init init_ceph(void) 871static int __init init_ceph(void)
1064{ 872{
1065 int ret = 0; 873 int ret = init_caches();
1066
1067 ret = ceph_debugfs_init();
1068 if (ret < 0)
1069 goto out;
1070
1071 ret = ceph_msgr_init();
1072 if (ret < 0)
1073 goto out_debugfs;
1074
1075 ret = init_caches();
1076 if (ret) 874 if (ret)
1077 goto out_msgr; 875 goto out;
1078 876
1079 ret = register_filesystem(&ceph_fs_type); 877 ret = register_filesystem(&ceph_fs_type);
1080 if (ret) 878 if (ret)
1081 goto out_icache; 879 goto out_icache;
1082 880
1083 pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", 881 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1084 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, 882
1085 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
1086 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
1087 return 0; 883 return 0;
1088 884
1089out_icache: 885out_icache:
1090 destroy_caches(); 886 destroy_caches();
1091out_msgr:
1092 ceph_msgr_exit();
1093out_debugfs:
1094 ceph_debugfs_cleanup();
1095out: 887out:
1096 return ret; 888 return ret;
1097} 889}
@@ -1101,8 +893,6 @@ static void __exit exit_ceph(void)
1101 dout("exit_ceph\n"); 893 dout("exit_ceph\n");
1102 unregister_filesystem(&ceph_fs_type); 894 unregister_filesystem(&ceph_fs_type);
1103 destroy_caches(); 895 destroy_caches();
1104 ceph_msgr_exit();
1105 ceph_debugfs_cleanup();
1106} 896}
1107 897
1108module_init(init_ceph); 898module_init(init_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b87638e84c4b..1886294e12f7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
1#ifndef _FS_CEPH_SUPER_H 1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H 2#define _FS_CEPH_SUPER_H
3 3
4#include "ceph_debug.h" 4#include <linux/ceph/ceph_debug.h>
5 5
6#include <asm/unaligned.h> 6#include <asm/unaligned.h>
7#include <linux/backing-dev.h> 7#include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include "types.h" 17#include <linux/ceph/libceph.h>
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "mds_client.h"
22#include "osd_client.h"
23#include "ceph_fs.h"
24 18
25/* f_type in struct statfs */ 19/* f_type in struct statfs */
26#define CEPH_SUPER_MAGIC 0x00c36400 20#define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,42 +24,25 @@
30#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ 24#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 25#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32 26
33/* 27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
34 * Supported features 28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
35 */ 29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
36#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
37#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR
38 30
39/* 31#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
40 * mount options
41 */
42#define CEPH_OPT_FSID (1<<0)
43#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
44#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
45#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
46#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
47#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
48#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
49 32
50#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) 33#define ceph_set_mount_opt(fsc, opt) \
34 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
 35#define ceph_test_mount_opt(fsc, opt) \
 36 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
36 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
51 37
52#define ceph_set_opt(client, opt) \ 38#define CEPH_MAX_READDIR_DEFAULT 1024
53 (client)->mount_args->flags |= CEPH_OPT_##opt; 39#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
54#define ceph_test_opt(client, opt) \ 40#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
55 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
56 41
57 42struct ceph_mount_options {
58struct ceph_mount_args {
59 int sb_flags;
60 int flags; 43 int flags;
61 struct ceph_fsid fsid; 44 int sb_flags;
62 struct ceph_entity_addr my_addr; 45
63 int num_mon;
64 struct ceph_entity_addr *mon_addr;
65 int mount_timeout;
66 int osd_idle_ttl;
67 int osd_timeout;
68 int osd_keepalive_timeout;
69 int wsize; 46 int wsize;
70 int rsize; /* max readahead */ 47 int rsize; /* max readahead */
71 int congestion_kb; /* max writeback in flight */ 48 int congestion_kb; /* max writeback in flight */
@@ -73,82 +50,25 @@ struct ceph_mount_args {
73 int cap_release_safety; 50 int cap_release_safety;
 74 int max_readdir; /* max readdir result (entries) */ 51 int max_readdir; /* max readdir result (entries) */
75 int max_readdir_bytes; /* max readdir result (bytes) */ 52 int max_readdir_bytes; /* max readdir result (bytes) */
76 char *snapdir_name; /* default ".snap" */
77 char *name;
78 char *secret;
79};
80 53
81/* 54 /*
82 * defaults 55 * everything above this point can be memcmp'd; everything below
83 */ 56 * is handled in compare_mount_options()
84#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 57 */
85#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
86#define CEPH_OSD_KEEPALIVE_DEFAULT 5
87#define CEPH_OSD_IDLE_TTL_DEFAULT 60
88#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
89#define CEPH_MAX_READDIR_DEFAULT 1024
90#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
91
92#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
93#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
94
95#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
96#define CEPH_AUTH_NAME_DEFAULT "guest"
97/*
98 * Delay telling the MDS we no longer want caps, in case we reopen
99 * the file. Delay a minimum amount of time, even if we send a cap
 100 * message for some other reason. Otherwise, take the opportunity to
101 * update the mds to avoid sending another message later.
102 */
103#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
104#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
105
106#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
107
108/* mount state */
109enum {
110 CEPH_MOUNT_MOUNTING,
111 CEPH_MOUNT_MOUNTED,
112 CEPH_MOUNT_UNMOUNTING,
113 CEPH_MOUNT_UNMOUNTED,
114 CEPH_MOUNT_SHUTDOWN,
115};
116
117/*
118 * subtract jiffies
119 */
120static inline unsigned long time_sub(unsigned long a, unsigned long b)
121{
122 BUG_ON(time_after(b, a));
123 return (long)a - (long)b;
124}
125
126/*
127 * per-filesystem client state
128 *
129 * possibly shared by multiple mount points, if they are
130 * mounting the same ceph filesystem/cluster.
131 */
132struct ceph_client {
133 struct ceph_fsid fsid;
134 bool have_fsid;
135 58
136 struct mutex mount_mutex; /* serialize mount attempts */ 59 char *snapdir_name; /* default ".snap" */
137 struct ceph_mount_args *mount_args; 60};
138 61
62struct ceph_fs_client {
139 struct super_block *sb; 63 struct super_block *sb;
140 64
141 unsigned long mount_state; 65 struct ceph_mount_options *mount_options;
142 wait_queue_head_t auth_wq; 66 struct ceph_client *client;
143
144 int auth_err;
145 67
68 unsigned long mount_state;
146 int min_caps; /* min caps i added */ 69 int min_caps; /* min caps i added */
147 70
148 struct ceph_messenger *msgr; /* messenger instance */ 71 struct ceph_mds_client *mdsc;
149 struct ceph_mon_client monc;
150 struct ceph_mds_client mdsc;
151 struct ceph_osd_client osdc;
152 72
153 /* writeback */ 73 /* writeback */
154 mempool_t *wb_pagevec_pool; 74 mempool_t *wb_pagevec_pool;
@@ -160,14 +80,14 @@ struct ceph_client {
160 struct backing_dev_info backing_dev_info; 80 struct backing_dev_info backing_dev_info;
161 81
162#ifdef CONFIG_DEBUG_FS 82#ifdef CONFIG_DEBUG_FS
163 struct dentry *debugfs_monmap; 83 struct dentry *debugfs_dentry_lru, *debugfs_caps;
164 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
165 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
166 struct dentry *debugfs_congestion_kb; 84 struct dentry *debugfs_congestion_kb;
167 struct dentry *debugfs_bdi; 85 struct dentry *debugfs_bdi;
86 struct dentry *debugfs_mdsc, *debugfs_mdsmap;
168#endif 87#endif
169}; 88};
170 89
90
171/* 91/*
172 * File i/o capability. This tracks shared state with the metadata 92 * File i/o capability. This tracks shared state with the metadata
173 * server that allows us to cache or writeback attributes or to read 93 * server that allows us to cache or writeback attributes or to read
@@ -275,6 +195,20 @@ struct ceph_inode_xattr {
275 int should_free_val; 195 int should_free_val;
276}; 196};
277 197
198/*
199 * Ceph dentry state
200 */
201struct ceph_dentry_info {
202 struct ceph_mds_session *lease_session;
203 u32 lease_gen, lease_shared_gen;
204 u32 lease_seq;
205 unsigned long lease_renew_after, lease_renew_from;
206 struct list_head lru;
207 struct dentry *dentry;
208 u64 time;
209 u64 offset;
210};
211
278struct ceph_inode_xattrs_info { 212struct ceph_inode_xattrs_info {
279 /* 213 /*
280 * (still encoded) xattr blob. we avoid the overhead of parsing 214 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info {
296/* 230/*
297 * Ceph inode. 231 * Ceph inode.
298 */ 232 */
299#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
300#define CEPH_I_NODELAY 4 /* do not delay cap release */
301#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
302#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
303
304struct ceph_inode_info { 233struct ceph_inode_info {
305 struct ceph_vino i_vino; /* ceph ino + snap */ 234 struct ceph_vino i_vino; /* ceph ino + snap */
306 235
@@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
391 return container_of(inode, struct ceph_inode_info, vfs_inode); 320 return container_of(inode, struct ceph_inode_info, vfs_inode);
392} 321}
393 322
323static inline struct ceph_vino ceph_vino(struct inode *inode)
324{
325 return ceph_inode(inode)->i_vino;
326}
327
328/*
329 * ino_t is <64 bits on many architectures, blech.
330 *
331 * don't include snap in ino hash, at least for now.
332 */
333static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
334{
335 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
336#if BITS_PER_LONG == 32
337 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
338 if (!ino)
339 ino = 1;
340#endif
341 return ino;
342}
343
344/* for printf-style formatting */
345#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
346
347static inline u64 ceph_ino(struct inode *inode)
348{
349 return ceph_inode(inode)->i_vino.ino;
350}
351static inline u64 ceph_snap(struct inode *inode)
352{
353 return ceph_inode(inode)->i_vino.snap;
354}
355
356static inline int ceph_ino_compare(struct inode *inode, void *data)
357{
358 struct ceph_vino *pvino = (struct ceph_vino *)data;
359 struct ceph_inode_info *ci = ceph_inode(inode);
360 return ci->i_vino.ino == pvino->ino &&
361 ci->i_vino.snap == pvino->snap;
362}
363
364static inline struct inode *ceph_find_inode(struct super_block *sb,
365 struct ceph_vino vino)
366{
367 ino_t t = ceph_vino_to_ino(vino);
368 return ilookup5(sb, t, ceph_ino_compare, &vino);
369}
370
371
372/*
373 * Ceph inode.
374 */
375#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
376#define CEPH_I_NODELAY 4 /* do not delay cap release */
377#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
378#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
379
394static inline void ceph_i_clear(struct inode *inode, unsigned mask) 380static inline void ceph_i_clear(struct inode *inode, unsigned mask)
395{ 381{
396 struct ceph_inode_info *ci = ceph_inode(inode); 382 struct ceph_inode_info *ci = ceph_inode(inode);
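
ceph_vino_to_ino() above is doing real work on 32-bit systems: ino_t is narrower than the 64-bit Ceph ino, so the high word is folded back in with an XOR and 0 is reserved as invalid. The folding in isolation, as a standalone sketch (ino32_t is a stand-in for a 32-bit ino_t):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t ino32_t;               /* stand-in for a 32-bit ino_t */

/* Fold a 64-bit ino into a narrower ino by XORing in the high half,
 * mirroring ceph_vino_to_ino() when BITS_PER_LONG == 32. */
static ino32_t fold_ino(uint64_t ino64)
{
        ino32_t ino = (ino32_t)ino64;

        ino ^= ino64 >> (sizeof(uint64_t) - sizeof(ino32_t)) * 8;
        if (!ino)
                ino = 1;                /* 0 is reserved */
        return ino;
}

int main(void)
{
        /* two inos sharing the low 32 bits still hash differently */
        printf("%x\n", fold_ino(0x100000001ULL));       /* 1 ^ 1 = 0 -> 1 */
        printf("%x\n", fold_ino(0x200000001ULL));       /* 1 ^ 2 = 3 */
        return 0;
}
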
@@ -414,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
414 struct ceph_inode_info *ci = ceph_inode(inode); 400 struct ceph_inode_info *ci = ceph_inode(inode);
415 bool r; 401 bool r;
416 402
417 smp_mb(); 403 spin_lock(&inode->i_lock);
418 r = (ci->i_ceph_flags & mask) == mask; 404 r = (ci->i_ceph_flags & mask) == mask;
405 spin_unlock(&inode->i_lock);
419 return r; 406 return r;
420} 407}
421 408
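
The smp_mb() removed in this hunk paired with nothing, so it could not keep a reader from seeing a half-applied multi-flag update; taking inode->i_lock, which the flag writers already hold, does. The resulting shape as a userspace sketch (a pthread spinlock stands in for i_lock):

#include <pthread.h>
#include <stdbool.h>

struct demo_inode {
        pthread_spinlock_t lock;        /* stands in for inode->i_lock */
        unsigned flags;                 /* stands in for i_ceph_flags */
};

/* Read the flags under the writers' lock, as ceph_i_test() now does. */
static bool demo_test(struct demo_inode *di, unsigned mask)
{
        bool r;

        pthread_spin_lock(&di->lock);
        r = (di->flags & mask) == mask;
        pthread_spin_unlock(&di->lock);
        return r;
}

int main(void)
{
        struct demo_inode di = { .flags = 1 | 8 };

        pthread_spin_init(&di.lock, PTHREAD_PROCESS_PRIVATE);
        return demo_test(&di, 1 | 8) ? 0 : 1;
}
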
@@ -432,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
432 struct ceph_inode_frag *pfrag, 419 struct ceph_inode_frag *pfrag,
433 int *found); 420 int *found);
434 421
435/*
436 * Ceph dentry state
437 */
438struct ceph_dentry_info {
439 struct ceph_mds_session *lease_session;
440 u32 lease_gen, lease_shared_gen;
441 u32 lease_seq;
442 unsigned long lease_renew_after, lease_renew_from;
443 struct list_head lru;
444 struct dentry *dentry;
445 u64 time;
446 u64 offset;
447};
448
449static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) 422static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
450{ 423{
451 return (struct ceph_dentry_info *)dentry->d_fsdata; 424 return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -456,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
456 return ((loff_t)frag << 32) | (loff_t)off; 429 return ((loff_t)frag << 32) | (loff_t)off;
457} 430}
458 431
459/*
460 * ino_t is <64 bits on many architectures, blech.
461 *
462 * don't include snap in ino hash, at least for now.
463 */
464static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
465{
466 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
467#if BITS_PER_LONG == 32
468 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
469 if (!ino)
470 ino = 1;
471#endif
472 return ino;
473}
474
475static inline int ceph_set_ino_cb(struct inode *inode, void *data) 432static inline int ceph_set_ino_cb(struct inode *inode, void *data)
476{ 433{
477 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; 434 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -479,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
479 return 0; 436 return 0;
480} 437}
481 438
482static inline struct ceph_vino ceph_vino(struct inode *inode)
483{
484 return ceph_inode(inode)->i_vino;
485}
486
487/* for printf-style formatting */
488#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
489
490static inline u64 ceph_ino(struct inode *inode)
491{
492 return ceph_inode(inode)->i_vino.ino;
493}
494static inline u64 ceph_snap(struct inode *inode)
495{
496 return ceph_inode(inode)->i_vino.snap;
497}
498
499static inline int ceph_ino_compare(struct inode *inode, void *data)
500{
501 struct ceph_vino *pvino = (struct ceph_vino *)data;
502 struct ceph_inode_info *ci = ceph_inode(inode);
503 return ci->i_vino.ino == pvino->ino &&
504 ci->i_vino.snap == pvino->snap;
505}
506
507static inline struct inode *ceph_find_inode(struct super_block *sb,
508 struct ceph_vino vino)
509{
510 ino_t t = ceph_vino_to_ino(vino);
511 return ilookup5(sb, t, ceph_ino_compare, &vino);
512}
513
514
515/* 439/*
516 * caps helpers 440 * caps helpers
517 */ 441 */
@@ -576,18 +500,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
576 struct ceph_cap_reservation *ctx, int need); 500 struct ceph_cap_reservation *ctx, int need);
577extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 501extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
578 struct ceph_cap_reservation *ctx); 502 struct ceph_cap_reservation *ctx);
579extern void ceph_reservation_status(struct ceph_client *client, 503extern void ceph_reservation_status(struct ceph_fs_client *client,
580 int *total, int *avail, int *used, 504 int *total, int *avail, int *used,
581 int *reserved, int *min); 505 int *reserved, int *min);
582 506
583static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) 507static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
584{ 508{
585 return (struct ceph_client *)inode->i_sb->s_fs_info; 509 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
586} 510}
587 511
588static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) 512static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
589{ 513{
590 return (struct ceph_client *)sb->s_fs_info; 514 return (struct ceph_fs_client *)sb->s_fs_info;
591} 515}
592 516
593 517
@@ -617,51 +541,6 @@ struct ceph_file_info {
617 541
618 542
619/* 543/*
620 * snapshots
621 */
622
623/*
624 * A "snap context" is the set of existing snapshots when we
625 * write data. It is used by the OSD to guide its COW behavior.
626 *
627 * The ceph_snap_context is refcounted, and attached to each dirty
 628 * page, indicating which context the dirty data belonged to when it was
629 * dirtied.
630 */
631struct ceph_snap_context {
632 atomic_t nref;
633 u64 seq;
634 int num_snaps;
635 u64 snaps[];
636};
637
638static inline struct ceph_snap_context *
639ceph_get_snap_context(struct ceph_snap_context *sc)
640{
641 /*
642 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
643 atomic_read(&sc->nref)+1);
644 */
645 if (sc)
646 atomic_inc(&sc->nref);
647 return sc;
648}
649
650static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
651{
652 if (!sc)
653 return;
654 /*
655 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
656 atomic_read(&sc->nref)-1);
657 */
658 if (atomic_dec_and_test(&sc->nref)) {
659 /*printk(" deleting snap_context %p\n", sc);*/
660 kfree(sc);
661 }
662}
663
664/*
665 * A "snap realm" describes a subset of the file hierarchy sharing 544 * A "snap realm" describes a subset of the file hierarchy sharing
666 * the same set of snapshots that apply to it. The realms themselves 545 * the same set of snapshots that apply to it. The realms themselves
667 * are organized into a hierarchy, such that children inherit (some of) 546 * are organized into a hierarchy, such that children inherit (some of)
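
The ceph_get_snap_context()/ceph_put_snap_context() pair deleted here (it moves to the shared libceph code) is a standard atomic refcount over a flexible-array struct, with one reference pinned per dirty page. The same lifecycle as a userspace sketch, using C11 atomics in place of the kernel's atomic_t (all names below are illustrative):

#include <stdatomic.h>
#include <stdlib.h>

struct snap_context {
        atomic_int nref;                /* one ref per holder */
        unsigned long long seq;
        int num_snaps;
        unsigned long long snaps[];     /* flexible array of snap ids */
};

static struct snap_context *snapc_create(int num_snaps)
{
        struct snap_context *sc =
                calloc(1, sizeof(*sc) + num_snaps * sizeof(sc->snaps[0]));

        if (sc) {
                atomic_init(&sc->nref, 1);
                sc->num_snaps = num_snaps;
        }
        return sc;
}

static struct snap_context *snapc_get(struct snap_context *sc)
{
        if (sc)
                atomic_fetch_add(&sc->nref, 1);
        return sc;
}

static void snapc_put(struct snap_context *sc)
{
        if (sc && atomic_fetch_sub(&sc->nref, 1) == 1)
                free(sc);               /* dropped the last reference */
}

int main(void)
{
        struct snap_context *sc = snapc_create(2);
        struct snap_context *page_ref = snapc_get(sc);  /* e.g. a dirty page */

        snapc_put(sc);
        snapc_put(page_ref);            /* frees here */
        return 0;
}
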
@@ -699,16 +578,33 @@ struct ceph_snap_realm {
699 spinlock_t inodes_with_caps_lock; 578 spinlock_t inodes_with_caps_lock;
700}; 579};
701 580
702 581static inline int default_congestion_kb(void)
703
704/*
705 * calculate the number of pages a given length and offset map onto,
706 * if we align the data.
707 */
708static inline int calc_pages_for(u64 off, u64 len)
709{ 582{
710 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - 583 int congestion_kb;
711 (off >> PAGE_CACHE_SHIFT); 584
585 /*
586 * Copied from NFS
587 *
588 * congestion size, scale with available memory.
589 *
590 * 64MB: 8192k
591 * 128MB: 11585k
592 * 256MB: 16384k
593 * 512MB: 23170k
594 * 1GB: 32768k
595 * 2GB: 46340k
596 * 4GB: 65536k
597 * 8GB: 92681k
598 * 16GB: 131072k
599 *
600 * This allows larger machines to have larger/more transfers.
601 * Limit the default to 256M
602 */
603 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
604 if (congestion_kb > 256*1024)
605 congestion_kb = 256*1024;
606
607 return congestion_kb;
712} 608}
713 609
714 610
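
The table above can be checked directly: with 4 KiB pages, 1 GiB of RAM is 262144 pages, int_sqrt() gives 512, and (16 * 512) << (12 - 10) = 32768k, matching the 1GB row. A standalone sketch of the computation (PAGE_SHIFT fixed at 12 here; a naive loop stands in for the kernel's int_sqrt(), so rows whose page counts are not perfect squares land a kilobyte or two below the comment's figures):

#include <stdio.h>

#define PAGE_SHIFT 12                   /* assume 4 KiB pages */

static unsigned long int_sqrt(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

static int default_congestion_kb(unsigned long totalram_pages)
{
        int congestion_kb =
                (16 * int_sqrt(totalram_pages)) << (PAGE_SHIFT - 10);

        if (congestion_kb > 256 * 1024) /* cap at 256 MiB */
                congestion_kb = 256 * 1024;
        return congestion_kb;
}

int main(void)
{
        unsigned long mb;

        for (mb = 64; mb <= 16384; mb *= 2)
                printf("%5luMB: %dk\n", mb,
                       default_congestion_kb(mb << (20 - PAGE_SHIFT)));
        return 0;
}
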
@@ -741,16 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
741 ci_item)->writing; 637 ci_item)->writing;
742} 638}
743 639
744
745/* super.c */
746extern struct kmem_cache *ceph_inode_cachep;
747extern struct kmem_cache *ceph_cap_cachep;
748extern struct kmem_cache *ceph_dentry_cachep;
749extern struct kmem_cache *ceph_file_cachep;
750
751extern const char *ceph_msg_type_name(int type);
752extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
753
754/* inode.c */ 640/* inode.c */
755extern const struct inode_operations ceph_file_iops; 641extern const struct inode_operations ceph_file_iops;
756 642
@@ -857,12 +743,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
857/* file.c */ 743/* file.c */
858extern const struct file_operations ceph_file_fops; 744extern const struct file_operations ceph_file_fops;
859extern const struct address_space_operations ceph_aops; 745extern const struct address_space_operations ceph_aops;
746extern int ceph_copy_to_page_vector(struct page **pages,
747 const char *data,
748 loff_t off, size_t len);
749extern int ceph_copy_from_page_vector(struct page **pages,
750 char *data,
751 loff_t off, size_t len);
752extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
860extern int ceph_open(struct inode *inode, struct file *file); 753extern int ceph_open(struct inode *inode, struct file *file);
861extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, 754extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
862 struct nameidata *nd, int mode, 755 struct nameidata *nd, int mode,
863 int locked_dir); 756 int locked_dir);
864extern int ceph_release(struct inode *inode, struct file *filp); 757extern int ceph_release(struct inode *inode, struct file *filp);
865extern void ceph_release_page_vector(struct page **pages, int num_pages);
866 758
867/* dir.c */ 759/* dir.c */
868extern const struct file_operations ceph_dir_fops; 760extern const struct file_operations ceph_dir_fops;
@@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
892/* export.c */ 784/* export.c */
893extern const struct export_operations ceph_export_ops; 785extern const struct export_operations ceph_export_ops;
894 786
895/* debugfs.c */
896extern int ceph_debugfs_init(void);
897extern void ceph_debugfs_cleanup(void);
898extern int ceph_debugfs_client_init(struct ceph_client *client);
899extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
900
901/* locks.c */ 787/* locks.c */
902extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); 788extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
903extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); 789extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
@@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
914 return NULL; 800 return NULL;
915} 801}
916 802
803/* debugfs.c */
804extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
805extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
806
917#endif /* _FS_CEPH_SUPER_H */ 807#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
deleted file mode 100644
index 28b35a005ec2..000000000000
--- a/fs/ceph/types.h
+++ /dev/null
@@ -1,29 +0,0 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9578af610b73..6e12a6ba5f79 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2
2#include "super.h" 3#include "super.h"
3#include "decode.h" 4#include "mds_client.h"
5
6#include <linux/ceph/decode.h>
4 7
5#include <linux/xattr.h> 8#include <linux/xattr.h>
6#include <linux/slab.h> 9#include <linux/slab.h>
@@ -620,12 +623,12 @@ out:
620static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 623static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
621 const char *value, size_t size, int flags) 624 const char *value, size_t size, int flags)
622{ 625{
623 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 626 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
624 struct inode *inode = dentry->d_inode; 627 struct inode *inode = dentry->d_inode;
625 struct ceph_inode_info *ci = ceph_inode(inode); 628 struct ceph_inode_info *ci = ceph_inode(inode);
626 struct inode *parent_inode = dentry->d_parent->d_inode; 629 struct inode *parent_inode = dentry->d_parent->d_inode;
627 struct ceph_mds_request *req; 630 struct ceph_mds_request *req;
628 struct ceph_mds_client *mdsc = &client->mdsc; 631 struct ceph_mds_client *mdsc = fsc->mdsc;
629 int err; 632 int err;
630 int i, nr_pages; 633 int i, nr_pages;
631 struct page **pages = NULL; 634 struct page **pages = NULL;
@@ -713,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
713 716
714 /* preallocate memory for xattr name, value, index node */ 717 /* preallocate memory for xattr name, value, index node */
715 err = -ENOMEM; 718 err = -ENOMEM;
716 newname = kmalloc(name_len + 1, GFP_NOFS); 719 newname = kmemdup(name, name_len + 1, GFP_NOFS);
717 if (!newname) 720 if (!newname)
718 goto out; 721 goto out;
719 memcpy(newname, name, name_len + 1);
720 722
721 if (val_len) { 723 if (val_len) {
722 newval = kmalloc(val_len + 1, GFP_NOFS); 724 newval = kmalloc(val_len + 1, GFP_NOFS);
@@ -777,8 +779,8 @@ out:
777 779
778static int ceph_send_removexattr(struct dentry *dentry, const char *name) 780static int ceph_send_removexattr(struct dentry *dentry, const char *name)
779{ 781{
780 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 782 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
781 struct ceph_mds_client *mdsc = &client->mdsc; 783 struct ceph_mds_client *mdsc = fsc->mdsc;
782 struct inode *inode = dentry->d_inode; 784 struct inode *inode = dentry->d_inode;
783 struct inode *parent_inode = dentry->d_parent->d_inode; 785 struct inode *parent_inode = dentry->d_parent->d_inode;
784 struct ceph_mds_request *req; 786 struct ceph_mds_request *req;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c65c3419dd37..7e83b356cc9e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -232,7 +232,7 @@ static int
232small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 232small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
233 void **request_buf) 233 void **request_buf)
234{ 234{
235 int rc = 0; 235 int rc;
236 236
237 rc = cifs_reconnect_tcon(tcon, smb_command); 237 rc = cifs_reconnect_tcon(tcon, smb_command);
238 if (rc) 238 if (rc)
@@ -250,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
250 if (tcon != NULL) 250 if (tcon != NULL)
251 cifs_stats_inc(&tcon->num_smbs_sent); 251 cifs_stats_inc(&tcon->num_smbs_sent);
252 252
253 return rc; 253 return 0;
254} 254}
255 255
256int 256int
@@ -281,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct,
281 281
282/* If the return code is zero, this function must fill in request_buf pointer */ 282/* If the return code is zero, this function must fill in request_buf pointer */
283static int 283static int
284smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 284__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
285 void **request_buf /* returned */ , 285 void **request_buf, void **response_buf)
286 void **response_buf /* returned */ )
287{ 286{
288 int rc = 0;
289
290 rc = cifs_reconnect_tcon(tcon, smb_command);
291 if (rc)
292 return rc;
293
294 *request_buf = cifs_buf_get(); 287 *request_buf = cifs_buf_get();
295 if (*request_buf == NULL) { 288 if (*request_buf == NULL) {
296 /* BB should we add a retry in here if not a writepage? */ 289 /* BB should we add a retry in here if not a writepage? */
@@ -309,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
309 if (tcon != NULL) 302 if (tcon != NULL)
310 cifs_stats_inc(&tcon->num_smbs_sent); 303 cifs_stats_inc(&tcon->num_smbs_sent);
311 304
312 return rc; 305 return 0;
306}
307
308/* If the return code is zero, this function must fill in request_buf pointer */
309static int
310smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
311 void **request_buf, void **response_buf)
312{
313 int rc;
314
315 rc = cifs_reconnect_tcon(tcon, smb_command);
316 if (rc)
317 return rc;
318
319 return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
320}
321
322static int
323smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
324 void **request_buf, void **response_buf)
325{
326 if (tcon->ses->need_reconnect || tcon->need_reconnect)
327 return -EHOSTDOWN;
328
329 return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
313} 330}
314 331
315static int validate_t2(struct smb_t2_rsp *pSMB) 332static int validate_t2(struct smb_t2_rsp *pSMB)
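
The point of splitting out __smb_init() shows up in the QFSUnixInfo/SETFSUnixInfo hunks below: those calls are issued while a reconnect is being negotiated, so letting them trigger another reconnect would recurse. smb_init_no_reconnect() fails fast with -EHOSTDOWN instead. The control flow, reduced to a self-contained sketch (struct conn and the helper names are illustrative, not the real CIFS types):

#include <errno.h>
#include <stdbool.h>

struct conn {
        bool need_reconnect;
};

/* Common buffer setup: the part both wrappers share (__smb_init). */
static int setup_buffers(void **req, void **rsp)
{
        static char reqbuf[64], rspbuf[64];

        *req = reqbuf;
        *rsp = rspbuf;
        return 0;
}

static int reconnect_if_needed(struct conn *c)
{
        c->need_reconnect = false;      /* pretend the reconnect worked */
        return 0;
}

/* smb_init(): normal path, transparently reconnects first. */
static int init_with_reconnect(struct conn *c, void **req, void **rsp)
{
        int rc = reconnect_if_needed(c);

        if (rc)
                return rc;
        return setup_buffers(req, rsp);
}

/* smb_init_no_reconnect(): refuse rather than recurse mid-reconnect. */
static int init_no_reconnect(struct conn *c, void **req, void **rsp)
{
        if (c->need_reconnect)
                return -EHOSTDOWN;
        return setup_buffers(req, rsp);
}

int main(void)
{
        struct conn c = { .need_reconnect = true };
        void *req, *rsp;

        if (init_no_reconnect(&c, &req, &rsp) != -EHOSTDOWN)
                return 1;
        return init_with_reconnect(&c, &req, &rsp);
}
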
@@ -4534,8 +4551,8 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
4534 4551
4535 cFYI(1, "In QFSUnixInfo"); 4552 cFYI(1, "In QFSUnixInfo");
4536QFSUnixRetry: 4553QFSUnixRetry:
4537 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4554 rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
4538 (void **) &pSMBr); 4555 (void **) &pSMB, (void **) &pSMBr);
4539 if (rc) 4556 if (rc)
4540 return rc; 4557 return rc;
4541 4558
@@ -4604,8 +4621,8 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
4604 cFYI(1, "In SETFSUnixInfo"); 4621 cFYI(1, "In SETFSUnixInfo");
4605SETFSUnixRetry: 4622SETFSUnixRetry:
4606 /* BB switch to small buf init to save memory */ 4623 /* BB switch to small buf init to save memory */
4607 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4624 rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
4608 (void **) &pSMBr); 4625 (void **) &pSMB, (void **) &pSMBr);
4609 if (rc) 4626 if (rc)
4610 return rc; 4627 return rc;
4611 4628
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 93f77d438d3c..53cce8cc2224 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -801,6 +801,8 @@ retry_iget5_locked:
801 inode->i_flags |= S_NOATIME | S_NOCMTIME; 801 inode->i_flags |= S_NOATIME | S_NOCMTIME;
802 if (inode->i_state & I_NEW) { 802 if (inode->i_state & I_NEW) {
803 inode->i_ino = hash; 803 inode->i_ino = hash;
804 if (S_ISREG(inode->i_mode))
805 inode->i_data.backing_dev_info = sb->s_bdi;
804#ifdef CONFIG_CIFS_FSCACHE 806#ifdef CONFIG_CIFS_FSCACHE
805 /* initialize per-inode cache cookie pointer */ 807 /* initialize per-inode cache cookie pointer */
806 CIFS_I(inode)->fscache = NULL; 808 CIFS_I(inode)->fscache = NULL;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 03e59aa318eb..d0ad09d57789 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -599,69 +599,6 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
599#define HIDPGETCONNLIST _IOR('H', 210, int) 599#define HIDPGETCONNLIST _IOR('H', 210, int)
600#define HIDPGETCONNINFO _IOR('H', 211, int) 600#define HIDPGETCONNINFO _IOR('H', 211, int)
601 601
602#ifdef CONFIG_BLOCK
603struct raw32_config_request
604{
605 compat_int_t raw_minor;
606 __u64 block_major;
607 __u64 block_minor;
608} __attribute__((packed));
609
610static int get_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
611{
612 int ret;
613
614 if (!access_ok(VERIFY_READ, user_req, sizeof(struct raw32_config_request)))
615 return -EFAULT;
616
617 ret = __get_user(req->raw_minor, &user_req->raw_minor);
618 ret |= __get_user(req->block_major, &user_req->block_major);
619 ret |= __get_user(req->block_minor, &user_req->block_minor);
620
621 return ret ? -EFAULT : 0;
622}
623
624static int set_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
625{
626 int ret;
627
628 if (!access_ok(VERIFY_WRITE, user_req, sizeof(struct raw32_config_request)))
629 return -EFAULT;
630
631 ret = __put_user(req->raw_minor, &user_req->raw_minor);
632 ret |= __put_user(req->block_major, &user_req->block_major);
633 ret |= __put_user(req->block_minor, &user_req->block_minor);
634
635 return ret ? -EFAULT : 0;
636}
637
638static int raw_ioctl(unsigned fd, unsigned cmd,
639 struct raw32_config_request __user *user_req)
640{
641 int ret;
642
643 switch (cmd) {
644 case RAW_SETBIND:
645 default: { /* RAW_GETBIND */
646 struct raw_config_request req;
647 mm_segment_t oldfs = get_fs();
648
649 if ((ret = get_raw32_request(&req, user_req)))
650 return ret;
651
652 set_fs(KERNEL_DS);
653 ret = sys_ioctl(fd,cmd,(unsigned long)&req);
654 set_fs(oldfs);
655
656 if ((!ret) && (cmd == RAW_GETBIND)) {
657 ret = set_raw32_request(&req, user_req);
658 }
659 break;
660 }
661 }
662 return ret;
663}
664#endif /* CONFIG_BLOCK */
665 602
666struct serial_struct32 { 603struct serial_struct32 {
667 compat_int_t type; 604 compat_int_t type;
@@ -1262,9 +1199,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
1262COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) 1199COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
1263COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) 1200COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
1264COMPATIBLE_IOCTL(OSS_GETVERSION) 1201COMPATIBLE_IOCTL(OSS_GETVERSION)
1265/* Raw devices */
1266COMPATIBLE_IOCTL(RAW_SETBIND)
1267COMPATIBLE_IOCTL(RAW_GETBIND)
1268/* SMB ioctls which do not need any translations */ 1202/* SMB ioctls which do not need any translations */
1269COMPATIBLE_IOCTL(SMB_IOC_NEWCONN) 1203COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
1270/* Watchdog */ 1204/* Watchdog */
@@ -1523,10 +1457,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
1523 case MTIOCGET32: 1457 case MTIOCGET32:
1524 case MTIOCPOS32: 1458 case MTIOCPOS32:
1525 return mt_ioctl_trans(fd, cmd, argp); 1459 return mt_ioctl_trans(fd, cmd, argp);
1526 /* Raw devices */
1527 case RAW_SETBIND:
1528 case RAW_GETBIND:
1529 return raw_ioctl(fd, cmd, argp);
1530#endif 1460#endif
1531 /* One SMB ioctl needs translations. */ 1461 /* One SMB ioctl needs translations. */
1532#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) 1462#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
diff --git a/fs/exec.c b/fs/exec.c
index 828dd2461d6b..6d2b6f936858 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2014,3 +2014,43 @@ fail_creds:
2014fail: 2014fail:
2015 return; 2015 return;
2016} 2016}
2017
2018/*
2019 * Core dumping helper functions. These are the only things you should
2020 * do on a core-file: use only these functions to write out all the
2021 * necessary info.
2022 */
2023int dump_write(struct file *file, const void *addr, int nr)
2024{
2025 return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
2026}
2027EXPORT_SYMBOL(dump_write);
2028
2029int dump_seek(struct file *file, loff_t off)
2030{
2031 int ret = 1;
2032
2033 if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
2034 if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
2035 return 0;
2036 } else {
2037 char *buf = (char *)get_zeroed_page(GFP_KERNEL);
2038
2039 if (!buf)
2040 return 0;
2041 while (off > 0) {
2042 unsigned long n = off;
2043
2044 if (n > PAGE_SIZE)
2045 n = PAGE_SIZE;
2046 if (!dump_write(file, buf, n)) {
2047 ret = 0;
2048 break;
2049 }
2050 off -= n;
2051 }
2052 free_page((unsigned long)buf);
2053 }
2054 return ret;
2055}
2056EXPORT_SYMBOL(dump_seek);
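
dump_seek() prefers a real llseek, which leaves a sparse hole in the core file, and only writes explicit zeroes a page at a time when the target cannot seek (a pipe, for instance). The same fallback pattern as a userspace sketch (skip_forward() is an analogue, not the kernel interface):

#include <sys/types.h>
#include <unistd.h>

/* Skip 'off' bytes of output: seek if the fd allows it (leaving a
 * sparse hole), otherwise write real zeroes in page-sized chunks. */
static int skip_forward(int fd, off_t off)
{
        static const char zeros[4096];

        if (lseek(fd, off, SEEK_CUR) != (off_t)-1)
                return 1;

        while (off > 0) {
                size_t n = off > (off_t)sizeof(zeros)
                                ? sizeof(zeros) : (size_t)off;

                if (write(fd, zeros, n) != (ssize_t)n)
                        return 0;
                off -= n;
        }
        return 1;
}

int main(void)
{
        return skip_forward(1, 8192) ? 0 : 1;   /* fd 1: file or pipe */
}
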
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index eb7368ebd8cd..3eadd97324b1 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -54,6 +54,9 @@ struct page_collect {
54 unsigned nr_pages; 54 unsigned nr_pages;
55 unsigned long length; 55 unsigned long length;
56 loff_t pg_first; /* keep 64bit also in 32-arches */ 56 loff_t pg_first; /* keep 64bit also in 32-arches */
57 bool read_4_write; /* This means two things: that the read is sync
 58 * and the pages should not be unlocked.
59 */
57}; 60};
58 61
59static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, 62static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -71,6 +74,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
71 pcol->nr_pages = 0; 74 pcol->nr_pages = 0;
72 pcol->length = 0; 75 pcol->length = 0;
73 pcol->pg_first = -1; 76 pcol->pg_first = -1;
77 pcol->read_4_write = false;
74} 78}
75 79
76static void _pcol_reset(struct page_collect *pcol) 80static void _pcol_reset(struct page_collect *pcol)
@@ -347,7 +351,8 @@ static int readpage_strip(void *data, struct page *page)
347 if (PageError(page)) 351 if (PageError(page))
348 ClearPageError(page); 352 ClearPageError(page);
349 353
350 unlock_page(page); 354 if (!pcol->read_4_write)
355 unlock_page(page);
351 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 356 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
352 " splitting\n", inode->i_ino, page->index); 357 " splitting\n", inode->i_ino, page->index);
353 358
@@ -428,6 +433,7 @@ static int _readpage(struct page *page, bool is_sync)
428 /* readpage_strip might call read_exec(,is_sync==false) at several 433 /* readpage_strip might call read_exec(,is_sync==false) at several
429 * places but not if we have a single page. 434 * places but not if we have a single page.
430 */ 435 */
436 pcol.read_4_write = is_sync;
431 ret = readpage_strip(&pcol, page); 437 ret = readpage_strip(&pcol, page);
432 if (ret) { 438 if (ret) {
433 EXOFS_ERR("_readpage => %d\n", ret); 439 EXOFS_ERR("_readpage => %d\n", ret);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 2e20bd771337..377768009106 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1842,8 +1842,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1842 goto failed_mount; 1842 goto failed_mount;
1843 } 1843 }
1844 1844
1845 if (le32_to_cpu(es->s_blocks_count) > 1845 if (generic_check_addressable(sb->s_blocksize_bits,
1846 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 1846 le32_to_cpu(es->s_blocks_count))) {
1847 ext3_msg(sb, KERN_ERR, 1847 ext3_msg(sb, KERN_ERR,
1848 "error: filesystem is too large to mount safely"); 1848 "error: filesystem is too large to mount safely");
1849 if (sizeof(sector_t) < 8) 1849 if (sizeof(sector_t) < 8)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 24e7699f915d..8ecc1e590303 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2826,15 +2826,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2826 * Test whether we have more sectors than will fit in sector_t, 2826 * Test whether we have more sectors than will fit in sector_t,
2827 * and whether the max offset is addressable by the page cache. 2827 * and whether the max offset is addressable by the page cache.
2828 */ 2828 */
2829 if ((ext4_blocks_count(es) > 2829 ret = generic_check_addressable(sb->s_blocksize_bits,
2830 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || 2830 ext4_blocks_count(es));
2831 (ext4_blocks_count(es) > 2831 if (ret) {
2832 (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
2833 ext4_msg(sb, KERN_ERR, "filesystem" 2832 ext4_msg(sb, KERN_ERR, "filesystem"
2834 " too large to mount safely on this system"); 2833 " too large to mount safely on this system");
2835 if (sizeof(sector_t) < 8) 2834 if (sizeof(sector_t) < 8)
2836 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 2835 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
2837 ret = -EFBIG;
2838 goto failed_mount; 2836 goto failed_mount;
2839 } 2837 }
2840 2838
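
The open-coded test these hunks replace is worth unpacking: a filesystem of N blocks of 2^bits bytes occupies N << (bits - 9) 512-byte sectors, which must fit in sector_t. With 4 KiB blocks and a 32-bit sector_t (CONFIG_LBDAF off) that caps N at (2^32 - 1) >> 3, just under 2^29 blocks, i.e. about 2 TiB. A sketch of the sector_t half of the check (generic_check_addressable() also enforces the page-cache limit, omitted here):

#include <stdint.h>
#include <stdio.h>

/* Would this many blocks, at this block size, overflow the sector
 * counter?  sector_bits stands in for sizeof(sector_t) * 8. */
static int too_large(unsigned blocksize_bits, uint64_t blocks,
                     unsigned sector_bits)
{
        uint64_t max_sectors = sector_bits == 64 ? ~0ULL
                                : (1ULL << sector_bits) - 1;

        return blocks > (max_sectors >> (blocksize_bits - 9));
}

int main(void)
{
        /* 4 KiB blocks, 32-bit sector_t: just under 2^29 blocks fit */
        printf("%d\n", too_large(12, (1ULL << 29) - 1, 32));    /* 0: fits */
        printf("%d\n", too_large(12, 1ULL << 29, 32));          /* 1: too big */
        return 0;
}
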
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5581122bd2c0..ab38fef1c9a1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -72,22 +72,11 @@ int writeback_in_progress(struct backing_dev_info *bdi)
72static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) 72static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
73{ 73{
74 struct super_block *sb = inode->i_sb; 74 struct super_block *sb = inode->i_sb;
75 struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
76 75
77 /* 76 if (strcmp(sb->s_type->name, "bdev") == 0)
78 * For inodes on standard filesystems, we use superblock's bdi. For 77 return inode->i_mapping->backing_dev_info;
79 * inodes on virtual filesystems, we want to use inode mapping's bdi 78
80 * because they can possibly point to something useful (think about 79 return sb->s_bdi;
81 * block_dev filesystem).
82 */
83 if (sb->s_bdi && sb->s_bdi != &noop_backing_dev_info) {
84 /* Some device inodes could play dirty tricks. Catch them... */
85 WARN(bdi != sb->s_bdi && bdi_cap_writeback_dirty(bdi),
86 "Dirtiable inode bdi %s != sb bdi %s\n",
87 bdi->name, sb->s_bdi->name);
88 return sb->s_bdi;
89 }
90 return bdi;
91} 80}
92 81
93static void bdi_queue_work(struct backing_dev_info *bdi, 82static void bdi_queue_work(struct backing_dev_info *bdi,
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index d367af1514ef..cde755cca564 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1354,7 +1354,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
1354 loff_t file_size; 1354 loff_t file_size;
1355 unsigned int num; 1355 unsigned int num;
1356 unsigned int offset; 1356 unsigned int offset;
1357 size_t total_len; 1357 size_t total_len = 0;
1358 1358
1359 req = fuse_get_req(fc); 1359 req = fuse_get_req(fc);
1360 if (IS_ERR(req)) 1360 if (IS_ERR(req))
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index cc9665522148..c465ae066c62 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || LBDAF) 3 depends on (64BIT || LBDAF)
4 select DLM if GFS2_FS_LOCKING_DLM 4 select DLM if GFS2_FS_LOCKING_DLM
5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM 5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
6 select SYSFS if GFS2_FS_LOCKING_DLM 6 select SYSFS if GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 194fe16d8418..6b24afb96aae 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
36#include "glops.h" 36#include "glops.h"
37 37
38 38
39static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, 39void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
40 unsigned int from, unsigned int to) 40 unsigned int from, unsigned int to)
41{ 41{
42 struct buffer_head *head = page_buffers(page); 42 struct buffer_head *head = page_buffers(page);
43 unsigned int bsize = head->b_size; 43 unsigned int bsize = head->b_size;
@@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
615 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 615 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
616 int alloc_required; 616 int alloc_required;
617 int error = 0; 617 int error = 0;
618 struct gfs2_alloc *al; 618 struct gfs2_alloc *al = NULL;
619 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 619 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
620 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 620 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
621 unsigned to = from + len; 621 unsigned to = from + len;
@@ -663,6 +663,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
663 rblocks += RES_STATFS + RES_QUOTA; 663 rblocks += RES_STATFS + RES_QUOTA;
664 if (&ip->i_inode == sdp->sd_rindex) 664 if (&ip->i_inode == sdp->sd_rindex)
665 rblocks += 2 * RES_STATFS; 665 rblocks += 2 * RES_STATFS;
666 if (alloc_required)
667 rblocks += gfs2_rg_blocks(al);
666 668
667 error = gfs2_trans_begin(sdp, rblocks, 669 error = gfs2_trans_begin(sdp, rblocks,
668 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 670 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -696,13 +698,11 @@ out:
696 698
697 page_cache_release(page); 699 page_cache_release(page);
698 700
699 /* 701 gfs2_trans_end(sdp);
700 * XXX(truncate): the call below should probably be replaced with
701 * a call to the gfs2-specific truncate blocks helper to actually
702 * release disk blocks..
703 */
704 if (pos + len > ip->i_inode.i_size) 702 if (pos + len > ip->i_inode.i_size)
705 truncate_setsize(&ip->i_inode, ip->i_inode.i_size); 703 gfs2_trim_blocks(&ip->i_inode);
704 goto out_trans_fail;
705
706out_endtrans: 706out_endtrans:
707 gfs2_trans_end(sdp); 707 gfs2_trans_end(sdp);
708out_trans_fail: 708out_trans_fail:
@@ -802,10 +802,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
802 page_cache_release(page); 802 page_cache_release(page);
803 803
804 if (copied) { 804 if (copied) {
805 if (inode->i_size < to) { 805 if (inode->i_size < to)
806 i_size_write(inode, to); 806 i_size_write(inode, to);
807 ip->i_disksize = inode->i_size;
808 }
809 gfs2_dinode_out(ip, di); 807 gfs2_dinode_out(ip, di);
810 mark_inode_dirty(inode); 808 mark_inode_dirty(inode);
811 } 809 }
@@ -876,8 +874,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
876 874
877 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 875 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
878 if (ret > 0) { 876 if (ret > 0) {
879 if (inode->i_size > ip->i_disksize)
880 ip->i_disksize = inode->i_size;
881 gfs2_dinode_out(ip, dibh->b_data); 877 gfs2_dinode_out(ip, dibh->b_data);
882 mark_inode_dirty(inode); 878 mark_inode_dirty(inode);
883 } 879 }
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6f482809d1a3..5476c066d4ee 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -50,7 +50,7 @@ struct strip_mine {
50 * @ip: the inode 50 * @ip: the inode
51 * @dibh: the dinode buffer 51 * @dibh: the dinode buffer
52 * @block: the block number that was allocated 52 * @block: the block number that was allocated
53 * @private: any locked page held by the caller process 53 * @page: The (optional) page. This is looked up if @page is NULL
54 * 54 *
55 * Returns: errno 55 * Returns: errno
56 */ 56 */
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
109/** 109/**
110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big 110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
111 * @ip: The GFS2 inode to unstuff 111 * @ip: The GFS2 inode to unstuff
112 * @unstuffer: the routine that handles unstuffing a non-zero length file 112 * @page: The (optional) page. This is looked up if the @page is NULL
113 * @private: private data for the unstuffer
114 * 113 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such 114 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way. 115 * that the height can be grown in the traditional way.
@@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
132 if (error) 131 if (error)
133 goto out; 132 goto out;
134 133
135 if (ip->i_disksize) { 134 if (i_size_read(&ip->i_inode)) {
136 /* Get a free block, fill it with the stuffed data, 135 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */ 136 and write it out to disk */
138 137
@@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
161 di = (struct gfs2_dinode *)dibh->b_data; 160 di = (struct gfs2_dinode *)dibh->b_data;
162 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 161 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
163 162
164 if (ip->i_disksize) { 163 if (i_size_read(&ip->i_inode)) {
165 *(__be64 *)(di + 1) = cpu_to_be64(block); 164 *(__be64 *)(di + 1) = cpu_to_be64(block);
166 gfs2_add_inode_blocks(&ip->i_inode, 1); 165 gfs2_add_inode_blocks(&ip->i_inode, 1);
167 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 166 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -885,83 +884,14 @@ out:
885} 884}
886 885
887/** 886/**
888 * do_grow - Make a file look bigger than it is
889 * @ip: the inode
890 * @size: the size to set the file to
891 *
892 * Called with an exclusive lock on @ip.
893 *
894 * Returns: errno
895 */
896
897static int do_grow(struct gfs2_inode *ip, u64 size)
898{
899 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
900 struct gfs2_alloc *al;
901 struct buffer_head *dibh;
902 int error;
903
904 al = gfs2_alloc_get(ip);
905 if (!al)
906 return -ENOMEM;
907
908 error = gfs2_quota_lock_check(ip);
909 if (error)
910 goto out;
911
912 al->al_requested = sdp->sd_max_height + RES_DATA;
913
914 error = gfs2_inplace_reserve(ip);
915 if (error)
916 goto out_gunlock_q;
917
918 error = gfs2_trans_begin(sdp,
919 sdp->sd_max_height + al->al_rgd->rd_length +
920 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
921 if (error)
922 goto out_ipres;
923
924 error = gfs2_meta_inode_buffer(ip, &dibh);
925 if (error)
926 goto out_end_trans;
927
928 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
929 if (gfs2_is_stuffed(ip)) {
930 error = gfs2_unstuff_dinode(ip, NULL);
931 if (error)
932 goto out_brelse;
933 }
934 }
935
936 ip->i_disksize = size;
937 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
938 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
939 gfs2_dinode_out(ip, dibh->b_data);
940
941out_brelse:
942 brelse(dibh);
943out_end_trans:
944 gfs2_trans_end(sdp);
945out_ipres:
946 gfs2_inplace_release(ip);
947out_gunlock_q:
948 gfs2_quota_unlock(ip);
949out:
950 gfs2_alloc_put(ip);
951 return error;
952}
953
954
955/**
956 * gfs2_block_truncate_page - Deal with zeroing out data for truncate 887 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
957 * 888 *
958 * This is partly borrowed from ext3. 889 * This is partly borrowed from ext3.
959 */ 890 */
960static int gfs2_block_truncate_page(struct address_space *mapping) 891static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
961{ 892{
962 struct inode *inode = mapping->host; 893 struct inode *inode = mapping->host;
963 struct gfs2_inode *ip = GFS2_I(inode); 894 struct gfs2_inode *ip = GFS2_I(inode);
964 loff_t from = inode->i_size;
965 unsigned long index = from >> PAGE_CACHE_SHIFT; 895 unsigned long index = from >> PAGE_CACHE_SHIFT;
966 unsigned offset = from & (PAGE_CACHE_SIZE-1); 896 unsigned offset = from & (PAGE_CACHE_SIZE-1);
967 unsigned blocksize, iblock, length, pos; 897 unsigned blocksize, iblock, length, pos;
@@ -1023,9 +953,11 @@ unlock:
1023 return err; 953 return err;
1024} 954}
1025 955
1026static int trunc_start(struct gfs2_inode *ip, u64 size) 956static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
1027{ 957{
1028 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 958 struct gfs2_inode *ip = GFS2_I(inode);
959 struct gfs2_sbd *sdp = GFS2_SB(inode);
960 struct address_space *mapping = inode->i_mapping;
1029 struct buffer_head *dibh; 961 struct buffer_head *dibh;
1030 int journaled = gfs2_is_jdata(ip); 962 int journaled = gfs2_is_jdata(ip);
1031 int error; 963 int error;
@@ -1039,31 +971,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1039 if (error) 971 if (error)
1040 goto out; 972 goto out;
1041 973
974 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
975
1042 if (gfs2_is_stuffed(ip)) { 976 if (gfs2_is_stuffed(ip)) {
1043 u64 dsize = size + sizeof(struct gfs2_dinode); 977 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1044 ip->i_disksize = size;
1045 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1046 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1047 gfs2_dinode_out(ip, dibh->b_data);
1048 if (dsize > dibh->b_size)
1049 dsize = dibh->b_size;
1050 gfs2_buffer_clear_tail(dibh, dsize);
1051 error = 1;
1052 } else { 978 } else {
1053 if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) 979 if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
1054 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 980 error = gfs2_block_truncate_page(mapping, newsize);
1055 981 if (error)
1056 if (!error) { 982 goto out_brelse;
1057 ip->i_disksize = size;
1058 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1059 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1060 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1061 gfs2_dinode_out(ip, dibh->b_data);
1062 } 983 }
984 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1063 } 985 }
1064 986
1065 brelse(dibh); 987 i_size_write(inode, newsize);
988 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
989 gfs2_dinode_out(ip, dibh->b_data);
1066 990
991 truncate_pagecache(inode, oldsize, newsize);
992out_brelse:
993 brelse(dibh);
1067out: 994out:
1068 gfs2_trans_end(sdp); 995 gfs2_trans_end(sdp);
1069 return error; 996 return error;
@@ -1123,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip)
1123 if (error) 1050 if (error)
1124 goto out; 1051 goto out;
1125 1052
1126 if (!ip->i_disksize) { 1053 if (!i_size_read(&ip->i_inode)) {
1127 ip->i_height = 0; 1054 ip->i_height = 0;
1128 ip->i_goal = ip->i_no_addr; 1055 ip->i_goal = ip->i_no_addr;
1129 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1056 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1143,92 +1070,154 @@ out:
1143 1070
1144/** 1071/**
1145 * do_shrink - make a file smaller 1072 * do_shrink - make a file smaller
1146 * @ip: the inode 1073 * @inode: the inode
1147 * @size: the size to make the file 1074 * @oldsize: the current inode size
1148 * @truncator: function to truncate the last partial block 1075 * @newsize: the size to make the file
1149 * 1076 *
1150 * Called with an exclusive lock on @ip. 1077 * Called with an exclusive lock on @inode. The @size must
1078 * be equal to or smaller than the current inode size.
1151 * 1079 *
1152 * Returns: errno 1080 * Returns: errno
1153 */ 1081 */
1154 1082
1155static int do_shrink(struct gfs2_inode *ip, u64 size) 1083static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
1156{ 1084{
1085 struct gfs2_inode *ip = GFS2_I(inode);
1157 int error; 1086 int error;
1158 1087
1159 error = trunc_start(ip, size); 1088 error = trunc_start(inode, oldsize, newsize);
1160 if (error < 0) 1089 if (error < 0)
1161 return error; 1090 return error;
1162 if (error > 0) 1091 if (gfs2_is_stuffed(ip))
1163 return 0; 1092 return 0;
1164 1093
1165 error = trunc_dealloc(ip, size); 1094 error = trunc_dealloc(ip, newsize);
1166 if (!error) 1095 if (error == 0)
1167 error = trunc_end(ip); 1096 error = trunc_end(ip);
1168 1097
1169 return error; 1098 return error;
1170} 1099}
1171 1100
1172static int do_touch(struct gfs2_inode *ip, u64 size) 1101void gfs2_trim_blocks(struct inode *inode)
1173{ 1102{
1174 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1103 u64 size = inode->i_size;
1104 int ret;
1105
1106 ret = do_shrink(inode, size, size);
1107 WARN_ON(ret != 0);
1108}
1109
1110/**
1111 * do_grow - Touch and update inode size
1112 * @inode: The inode
1113 * @size: The new size
1114 *
1115 * This function updates the timestamps on the inode and
1116 * may also increase the size of the inode. This function
1117 * must not be called with @size any smaller than the current
1118 * inode size.
1119 *
1120 * Although it is not strictly required to unstuff files here,
1121 * earlier versions of GFS2 have a bug in the stuffed file reading
1122 * code which will result in a buffer overrun if the size is larger
1123 * than the max stuffed file size. In order to prevent this from
1124 * occurring, such files are unstuffed, but in other cases we can
1125 * just update the inode size directly.
1126 *
1127 * Returns: 0 on success, or -ve on error
1128 */
1129
1130static int do_grow(struct inode *inode, u64 size)
1131{
1132 struct gfs2_inode *ip = GFS2_I(inode);
1133 struct gfs2_sbd *sdp = GFS2_SB(inode);
1175 struct buffer_head *dibh; 1134 struct buffer_head *dibh;
1135 struct gfs2_alloc *al = NULL;
1176 int error; 1136 int error;
1177 1137
1178 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 1138 if (gfs2_is_stuffed(ip) &&
1139 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1140 al = gfs2_alloc_get(ip);
1141 if (al == NULL)
1142 return -ENOMEM;
1143
1144 error = gfs2_quota_lock_check(ip);
1145 if (error)
1146 goto do_grow_alloc_put;
1147
1148 al->al_requested = 1;
1149 error = gfs2_inplace_reserve(ip);
1150 if (error)
1151 goto do_grow_qunlock;
1152 }
1153
1154 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
1179 if (error) 1155 if (error)
1180 return error; 1156 goto do_grow_release;
1181 1157
1182 down_write(&ip->i_rw_mutex); 1158 if (al) {
1159 error = gfs2_unstuff_dinode(ip, NULL);
1160 if (error)
1161 goto do_end_trans;
1162 }
1183 1163
1184 error = gfs2_meta_inode_buffer(ip, &dibh); 1164 error = gfs2_meta_inode_buffer(ip, &dibh);
1185 if (error) 1165 if (error)
1186 goto do_touch_out; 1166 goto do_end_trans;
1187 1167
1168 i_size_write(inode, size);
1188 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1169 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1189 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1170 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1190 gfs2_dinode_out(ip, dibh->b_data); 1171 gfs2_dinode_out(ip, dibh->b_data);
1191 brelse(dibh); 1172 brelse(dibh);
1192 1173
1193do_touch_out: 1174do_end_trans:
1194 up_write(&ip->i_rw_mutex);
1195 gfs2_trans_end(sdp); 1175 gfs2_trans_end(sdp);
1176do_grow_release:
1177 if (al) {
1178 gfs2_inplace_release(ip);
1179do_grow_qunlock:
1180 gfs2_quota_unlock(ip);
1181do_grow_alloc_put:
1182 gfs2_alloc_put(ip);
1183 }
1196 return error; 1184 return error;
1197} 1185}
1198 1186
1199/** 1187/**
1200 * gfs2_truncatei - make a file a given size 1188 * gfs2_setattr_size - make a file a given size
1201 * @ip: the inode 1189 * @inode: the inode
1202 * @size: the size to make the file 1190 * @newsize: the size to make the file
1203 * @truncator: function to truncate the last partial block
1204 * 1191 *
1205 * The file size can grow, shrink, or stay the same size. 1192 * The file size can grow, shrink, or stay the same size. This
1193 * is called holding i_mutex and an exclusive glock on the inode
1194 * in question.
1206 * 1195 *
1207 * Returns: errno 1196 * Returns: errno
1208 */ 1197 */
1209 1198
1210int gfs2_truncatei(struct gfs2_inode *ip, u64 size) 1199int gfs2_setattr_size(struct inode *inode, u64 newsize)
1211{ 1200{
1212 int error; 1201 int ret;
1202 u64 oldsize;
1213 1203
1214 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) 1204 BUG_ON(!S_ISREG(inode->i_mode));
1215 return -EINVAL;
1216 1205
1217 if (size > ip->i_disksize) 1206 ret = inode_newsize_ok(inode, newsize);
1218 error = do_grow(ip, size); 1207 if (ret)
1219 else if (size < ip->i_disksize) 1208 return ret;
1220 error = do_shrink(ip, size);
1221 else
1222 /* update time stamps */
1223 error = do_touch(ip, size);
1224 1209
1225 return error; 1210 oldsize = inode->i_size;
1211 if (newsize >= oldsize)
1212 return do_grow(inode, newsize);
1213
1214 return do_shrink(inode, oldsize, newsize);
1226} 1215}
1227 1216
1228int gfs2_truncatei_resume(struct gfs2_inode *ip) 1217int gfs2_truncatei_resume(struct gfs2_inode *ip)
1229{ 1218{
1230 int error; 1219 int error;
1231 error = trunc_dealloc(ip, ip->i_disksize); 1220 error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1232 if (!error) 1221 if (!error)
1233 error = trunc_end(ip); 1222 error = trunc_end(ip);
1234 return error; 1223 return error;
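
In the new do_grow() above, unstuffing happens only when the grown size no longer fits in the dinode block alongside the header. That threshold is easy to check in isolation; the sketch below assumes a 4096-byte block and the usual 232-byte on-disk struct gfs2_dinode, neither of which is read from a real filesystem here:

#include <stdint.h>
#include <stdio.h>

/* The unstuff threshold from do_grow(): stuffed data lives in the
 * dinode block after the header, so a stuffed file can hold at most
 * bsize - sizeof(dinode) bytes.  Both constants are assumed values. */
#define SB_BSIZE        4096u
#define DINODE_SIZE     232u

static int must_unstuff(int is_stuffed, uint64_t new_size)
{
        return is_stuffed && new_size > SB_BSIZE - DINODE_SIZE;
}

int main(void)
{
        printf("%d\n", must_unstuff(1, 3864));  /* 0: still fits */
        printf("%d\n", must_unstuff(1, 3865));  /* 1: must unstuff */
        return 0;
}
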
@@ -1269,7 +1258,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1269 1258
1270 shift = sdp->sd_sb.sb_bsize_shift; 1259 shift = sdp->sd_sb.sb_bsize_shift;
1271 BUG_ON(gfs2_is_dir(ip)); 1260 BUG_ON(gfs2_is_dir(ip));
1272 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; 1261 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1273 lblock = offset >> shift; 1262 lblock = offset >> shift;
1274 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1263 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1275 if (lblock_stop > end_of_file) 1264 if (lblock_stop > end_of_file)
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index a20a5213135a..42fea03e2bd9 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
44 } 44 }
45} 45}
46 46
47int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); 47extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
48int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); 48extern int gfs2_block_map(struct inode *inode, sector_t lblock,
49int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); 49 struct buffer_head *bh, int create);
50 50extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
51int gfs2_truncatei(struct gfs2_inode *ip, u64 size); 51 u64 *dblock, unsigned *extlen);
52int gfs2_truncatei_resume(struct gfs2_inode *ip); 52extern int gfs2_setattr_size(struct inode *inode, u64 size);
53int gfs2_file_dealloc(struct gfs2_inode *ip); 53extern void gfs2_trim_blocks(struct inode *inode);
54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 54extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
55 unsigned int len); 55extern int gfs2_file_dealloc(struct gfs2_inode *ip);
56extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
57 unsigned int len);
56 58
57#endif /* __BMAP_DOT_H__ */ 59#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index bb7907bde3d8..6798755b3858 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
49 ip = GFS2_I(inode); 49 ip = GFS2_I(inode);
50 } 50 }
51 51
52 if (sdp->sd_args.ar_localcaching) 52 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
53 goto valid; 53 goto valid;
54 54
55 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); 55 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b9dd88a78dd4..5c356d09c321 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -79,6 +79,9 @@
79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) 79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) 80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
81 81
82struct qstr gfs2_qdot __read_mostly;
83struct qstr gfs2_qdotdot __read_mostly;
84
82typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, 85typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
83 u64 leaf_no, void *data); 86 u64 leaf_no, void *data);
84typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, 87typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
@@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
127 130
128 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 131 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
129 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); 132 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
130 if (ip->i_disksize < offset + size) 133 if (ip->i_inode.i_size < offset + size)
131 ip->i_disksize = offset + size; 134 i_size_write(&ip->i_inode, offset + size);
132 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 135 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
133 gfs2_dinode_out(ip, dibh->b_data); 136 gfs2_dinode_out(ip, dibh->b_data);
134 137
@@ -225,8 +228,8 @@ out:
225 if (error) 228 if (error)
226 return error; 229 return error;
227 230
228 if (ip->i_disksize < offset + copied) 231 if (ip->i_inode.i_size < offset + copied)
229 ip->i_disksize = offset + copied; 232 i_size_write(&ip->i_inode, offset + copied);
230 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 233 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
231 234
232 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 235 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
275 unsigned int o; 278 unsigned int o;
276 int copied = 0; 279 int copied = 0;
277 int error = 0; 280 int error = 0;
281 u64 disksize = i_size_read(&ip->i_inode);
278 282
279 if (offset >= ip->i_disksize) 283 if (offset >= disksize)
280 return 0; 284 return 0;
281 285
282 if (offset + size > ip->i_disksize) 286 if (offset + size > disksize)
283 size = ip->i_disksize - offset; 287 size = disksize - offset;
284 288
285 if (!size) 289 if (!size)
286 return 0; 290 return 0;
@@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
727 unsigned hsize = 1 << ip->i_depth; 731 unsigned hsize = 1 << ip->i_depth;
728 unsigned index; 732 unsigned index;
729 u64 ln; 733 u64 ln;
730 if (hsize * sizeof(u64) != ip->i_disksize) { 734 if (hsize * sizeof(u64) != i_size_read(inode)) {
731 gfs2_consist_inode(ip); 735 gfs2_consist_inode(ip);
732 return ERR_PTR(-EIO); 736 return ERR_PTR(-EIO);
733 } 737 }
@@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode)
879 for (x = sdp->sd_hash_ptrs; x--; lp++) 883 for (x = sdp->sd_hash_ptrs; x--; lp++)
880 *lp = cpu_to_be64(bn); 884 *lp = cpu_to_be64(bn);
881 885
882 dip->i_disksize = sdp->sd_sb.sb_bsize / 2; 886 i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
883 gfs2_add_inode_blocks(&dip->i_inode, 1); 887 gfs2_add_inode_blocks(&dip->i_inode, 1);
884 dip->i_diskflags |= GFS2_DIF_EXHASH; 888 dip->i_diskflags |= GFS2_DIF_EXHASH;
885 889
@@ -1057,11 +1061,12 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1057 u64 *buf; 1061 u64 *buf;
1058 u64 *from, *to; 1062 u64 *from, *to;
1059 u64 block; 1063 u64 block;
1064 u64 disksize = i_size_read(&dip->i_inode);
1060 int x; 1065 int x;
1061 int error = 0; 1066 int error = 0;
1062 1067
1063 hsize = 1 << dip->i_depth; 1068 hsize = 1 << dip->i_depth;
1064 if (hsize * sizeof(u64) != dip->i_disksize) { 1069 if (hsize * sizeof(u64) != disksize) {
1065 gfs2_consist_inode(dip); 1070 gfs2_consist_inode(dip);
1066 return -EIO; 1071 return -EIO;
1067 } 1072 }
@@ -1072,7 +1077,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1072 if (!buf) 1077 if (!buf)
1073 return -ENOMEM; 1078 return -ENOMEM;
1074 1079
1075 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { 1080 for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) {
1076 error = gfs2_dir_read_data(dip, (char *)buf, 1081 error = gfs2_dir_read_data(dip, (char *)buf,
1077 block * sdp->sd_hash_bsize, 1082 block * sdp->sd_hash_bsize,
1078 sdp->sd_hash_bsize, 1); 1083 sdp->sd_hash_bsize, 1);
@@ -1370,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1370 unsigned depth = 0; 1375 unsigned depth = 0;
1371 1376
1372 hsize = 1 << dip->i_depth; 1377 hsize = 1 << dip->i_depth;
1373 if (hsize * sizeof(u64) != dip->i_disksize) { 1378 if (hsize * sizeof(u64) != i_size_read(inode)) {
1374 gfs2_consist_inode(dip); 1379 gfs2_consist_inode(dip);
1375 return -EIO; 1380 return -EIO;
1376 } 1381 }
@@ -1784,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1784 int error = 0; 1789 int error = 0;
1785 1790
1786 hsize = 1 << dip->i_depth; 1791 hsize = 1 << dip->i_depth;
1787 if (hsize * sizeof(u64) != dip->i_disksize) { 1792 if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
1788 gfs2_consist_inode(dip); 1793 gfs2_consist_inode(dip);
1789 return -EIO; 1794 return -EIO;
1790 } 1795 }
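
Every hunk in dir.c above replaces the GFS2-private ip->i_disksize with the
VFS size accessors. A minimal sketch of the pattern, assuming only the stock
i_size_read()/i_size_write() from include/linux/fs.h (the helper names below
are hypothetical): on 32-bit SMP, loff_t is 64 bits wide, so an unguarded
read can tear, and i_size_read() protects the load with a seqcount.

    /* Hypothetical helpers illustrating the conversion pattern. */
    static u64 dir_size(struct inode *inode)
    {
            return i_size_read(inode);      /* tear-free 64-bit load */
    }

    static void dir_grow(struct inode *inode, u64 offset, u64 written)
    {
            /* Writers are already serialized here (inode glock held). */
            if (i_size_read(inode) < offset + written)
                    i_size_write(inode, offset + written);
    }

Note also that gfs2_dir_read_data() and dir_double_exhash() now read the size
once into a local instead of re-running the accessor on every loop iteration.
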
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f919440c3be..a98f644bd3df 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -17,23 +17,24 @@ struct inode;
17struct gfs2_inode; 17struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19 19
20struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename); 20extern struct inode *gfs2_dir_search(struct inode *dir,
21int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 21 const struct qstr *filename);
22 const struct gfs2_inode *ip); 22extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
23int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 23 const struct gfs2_inode *ip);
24 const struct gfs2_inode *ip, unsigned int type); 24extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
25int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); 25 const struct gfs2_inode *ip, unsigned int type);
26int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 26extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
27 filldir_t filldir); 27extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
28int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 28 filldir_t filldir);
29 const struct gfs2_inode *nip, unsigned int new_type); 29extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
30 const struct gfs2_inode *nip, unsigned int new_type);
30 31
31int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); 32extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
32 33
33int gfs2_diradd_alloc_required(struct inode *dir, 34extern int gfs2_diradd_alloc_required(struct inode *dir,
34 const struct qstr *filename); 35 const struct qstr *filename);
35int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, 36extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
36 struct buffer_head **bhp); 37 struct buffer_head **bhp);
37 38
38static inline u32 gfs2_disk_hash(const char *data, int len) 39static inline u32 gfs2_disk_hash(const char *data, int len)
39{ 40{
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct
61 memcpy(dent + 1, name->name, name->len); 62 memcpy(dent + 1, name->name, name->len);
62} 63}
63 64
65extern struct qstr gfs2_qdot;
66extern struct qstr gfs2_qdotdot;
67
64#endif /* __DIR_DOT_H__ */ 68#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index dfe237a3f8ad..06d582732d34 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name,
126 126
127static struct dentry *gfs2_get_parent(struct dentry *child) 127static struct dentry *gfs2_get_parent(struct dentry *child)
128{ 128{
129 struct qstr dotdot;
130 struct dentry *dentry; 129 struct dentry *dentry;
131 130
132 /* 131 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
133 * XXX(hch): it would be a good idea to keep this around as a
134 * static variable.
135 */
136 gfs2_str2qstr(&dotdot, "..");
137
138 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
139 if (!IS_ERR(dentry)) 132 if (!IS_ERR(dentry))
140 dentry->d_op = &gfs2_dops; 133 dentry->d_op = &gfs2_dops;
141 return dentry; 134 return dentry;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 8fcfefb96077..a51079bd4af1 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -382,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
382 rblocks = RES_DINODE + ind_blocks; 382 rblocks = RES_DINODE + ind_blocks;
383 if (gfs2_is_jdata(ip)) 383 if (gfs2_is_jdata(ip))
384 rblocks += data_blocks ? data_blocks : 1; 384 rblocks += data_blocks ? data_blocks : 1;
385 if (ind_blocks || data_blocks) 385 if (ind_blocks || data_blocks) {
386 rblocks += RES_STATFS + RES_QUOTA; 386 rblocks += RES_STATFS + RES_QUOTA;
387 rblocks += gfs2_rg_blocks(al);
388 }
387 ret = gfs2_trans_begin(sdp, rblocks, 0); 389 ret = gfs2_trans_begin(sdp, rblocks, 0);
388 if (ret) 390 if (ret)
389 goto out_trans_fail; 391 goto out_trans_fail;
@@ -491,7 +493,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
491 goto fail; 493 goto fail;
492 494
493 if (!(file->f_flags & O_LARGEFILE) && 495 if (!(file->f_flags & O_LARGEFILE) &&
494 ip->i_disksize > MAX_NON_LFS) { 496 i_size_read(inode) > MAX_NON_LFS) {
495 error = -EOVERFLOW; 497 error = -EOVERFLOW;
496 goto fail_gunlock; 498 goto fail_gunlock;
497 } 499 }
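
gfs2_rg_blocks(al) first appears in this hunk, but its definition is not part
of the diff. Judging by the open-coded al->al_rgd->rd_length expressions it
replaces in ops_inode.c and quota.c below, it is presumably a trivial
wrapper; treat this as an assumption rather than the committed definition:

    /* Assumed definition: header and bitmap blocks of the resource group
     * that the reservation landed in. */
    static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
    {
            return al->al_rgd->rd_length;
    }

Adding it to the page_mkwrite reservation means the transaction now also
covers the rgrp bitmap blocks dirtied by the allocation.
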
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9adf8f924e08..87778857f099 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
441 else 441 else
442 gfs2_glock_put_nolock(gl); 442 gfs2_glock_put_nolock(gl);
443 } 443 }
444 if (held1 && held2 && list_empty(&gl->gl_holders))
445 clear_bit(GLF_QUEUED, &gl->gl_flags);
444 446
445 gl->gl_state = new_state; 447 gl->gl_state = new_state;
446 gl->gl_tchange = jiffies; 448 gl->gl_tchange = jiffies;
@@ -1012,6 +1014,7 @@ fail:
1012 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) 1014 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
1013 insert_pt = &gh2->gh_list; 1015 insert_pt = &gh2->gh_list;
1014 } 1016 }
1017 set_bit(GLF_QUEUED, &gl->gl_flags);
1015 if (likely(insert_pt == NULL)) { 1018 if (likely(insert_pt == NULL)) {
1016 list_add_tail(&gh->gh_list, &gl->gl_holders); 1019 list_add_tail(&gh->gh_list, &gl->gl_holders);
1017 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) 1020 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1310,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1310 1313
1311 gfs2_glock_hold(gl); 1314 gfs2_glock_hold(gl);
1312 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; 1315 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1313 if (time_before(now, holdtime)) 1316 if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
1314 delay = holdtime - now; 1317 if (time_before(now, holdtime))
1315 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) 1318 delay = holdtime - now;
1316 delay = gl->gl_ops->go_min_hold_time; 1319 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
1320 delay = gl->gl_ops->go_min_hold_time;
1321 }
1317 1322
1318 spin_lock(&gl->gl_spin); 1323 spin_lock(&gl->gl_spin);
1319 handle_callback(gl, state, delay); 1324 handle_callback(gl, state, delay);
@@ -1512,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl)
1512 spin_unlock(&lru_lock); 1517 spin_unlock(&lru_lock);
1513 1518
1514 spin_lock(&gl->gl_spin); 1519 spin_lock(&gl->gl_spin);
1515 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) 1520 if (gl->gl_state != LM_ST_UNLOCKED)
1516 handle_callback(gl, LM_ST_UNLOCKED, 0); 1521 handle_callback(gl, LM_ST_UNLOCKED, 0);
1517 spin_unlock(&gl->gl_spin); 1522 spin_unlock(&gl->gl_spin);
1518 gfs2_glock_hold(gl); 1523 gfs2_glock_hold(gl);
@@ -1660,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1660 *p++ = 'I'; 1665 *p++ = 'I';
1661 if (test_bit(GLF_FROZEN, gflags)) 1666 if (test_bit(GLF_FROZEN, gflags))
1662 *p++ = 'F'; 1667 *p++ = 'F';
1668 if (test_bit(GLF_QUEUED, gflags))
1669 *p++ = 'q';
1663 *p = 0; 1670 *p = 0;
1664 return buf; 1671 return buf;
1665} 1672}
@@ -1776,10 +1783,12 @@ int __init gfs2_glock_init(void)
1776 } 1783 }
1777#endif 1784#endif
1778 1785
1779 glock_workqueue = create_workqueue("glock_workqueue"); 1786 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
1787 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1780 if (IS_ERR(glock_workqueue)) 1788 if (IS_ERR(glock_workqueue))
1781 return PTR_ERR(glock_workqueue); 1789 return PTR_ERR(glock_workqueue);
1782 gfs2_delete_workqueue = create_workqueue("delete_workqueue"); 1790 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
1791 WQ_FREEZEABLE, 0);
1783 if (IS_ERR(gfs2_delete_workqueue)) { 1792 if (IS_ERR(gfs2_delete_workqueue)) {
1784 destroy_workqueue(glock_workqueue); 1793 destroy_workqueue(glock_workqueue);
1785 return PTR_ERR(gfs2_delete_workqueue); 1794 return PTR_ERR(gfs2_delete_workqueue);
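
In this kernel generation create_workqueue(name) was, to the best of my
knowledge, shorthand for alloc_workqueue(name, WQ_RESCUER, 1), so spelling
the flags out keeps the rescuer thread (forward progress under memory
pressure) while adding WQ_HIGHPRI for the glock queue, which everything else
waits on, and WQ_FREEZEABLE so both queues quiesce across suspend. A sketch:

    /* Flag semantics per include/linux/workqueue.h of this era (hedged). */
    static int example_init(void)
    {
            struct workqueue_struct *wq =
                    alloc_workqueue("glock_workqueue",
                                    WQ_RESCUER | WQ_HIGHPRI | WQ_FREEZEABLE, 0);
            if (!wq)        /* alloc_workqueue() returns NULL on failure */
                    return -ENOMEM;
            destroy_workqueue(wq);
            return 0;
    }
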
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2bda1911b156..db1c26d6d220 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
216 216
217/** 217/**
218 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock 218 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
219 * @gl: the glock 219 * @gl: the glock
220 * @state: the state we're requesting 220 * @state: the state we're requesting
221 * @flags: the modifier flags 221 * @flags: the modifier flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49f97d3bb690..0d149dcc04e5 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
262 const struct gfs2_inode *ip = gl->gl_object; 262 const struct gfs2_inode *ip = gl->gl_object;
263 if (ip == NULL) 263 if (ip == NULL)
264 return 0; 264 return 0;
265 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n", 265 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
266 (unsigned long long)ip->i_no_formal_ino, 266 (unsigned long long)ip->i_no_formal_ino,
267 (unsigned long long)ip->i_no_addr, 267 (unsigned long long)ip->i_no_addr,
268 IF2DT(ip->i_inode.i_mode), ip->i_flags, 268 IF2DT(ip->i_inode.i_mode), ip->i_flags,
269 (unsigned int)ip->i_diskflags, 269 (unsigned int)ip->i_diskflags,
270 (unsigned long long)ip->i_inode.i_size, 270 (unsigned long long)i_size_read(&ip->i_inode));
271 (unsigned long long)ip->i_disksize);
272 return 0; 271 return 0;
273} 272}
274 273
@@ -453,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = {
453 [LM_TYPE_META] = &gfs2_meta_glops, 452 [LM_TYPE_META] = &gfs2_meta_glops,
454 [LM_TYPE_INODE] = &gfs2_inode_glops, 453 [LM_TYPE_INODE] = &gfs2_inode_glops,
455 [LM_TYPE_RGRP] = &gfs2_rgrp_glops, 454 [LM_TYPE_RGRP] = &gfs2_rgrp_glops,
456 [LM_TYPE_NONDISK] = &gfs2_trans_glops,
457 [LM_TYPE_IOPEN] = &gfs2_iopen_glops, 455 [LM_TYPE_IOPEN] = &gfs2_iopen_glops,
458 [LM_TYPE_FLOCK] = &gfs2_flock_glops, 456 [LM_TYPE_FLOCK] = &gfs2_flock_glops,
459 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, 457 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index fdbf4b366fa5..764fbb49efc8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -196,6 +196,7 @@ enum {
196 GLF_REPLY_PENDING = 9, 196 GLF_REPLY_PENDING = 9,
197 GLF_INITIAL = 10, 197 GLF_INITIAL = 10,
198 GLF_FROZEN = 11, 198 GLF_FROZEN = 11,
199 GLF_QUEUED = 12,
199}; 200};
200 201
201struct gfs2_glock { 202struct gfs2_glock {
@@ -267,7 +268,6 @@ struct gfs2_inode {
267 u64 i_no_formal_ino; 268 u64 i_no_formal_ino;
268 u64 i_generation; 269 u64 i_generation;
269 u64 i_eattr; 270 u64 i_eattr;
270 loff_t i_disksize;
271 unsigned long i_flags; /* GIF_... */ 271 unsigned long i_flags; /* GIF_... */
272 struct gfs2_glock *i_gl; /* Move into i_gh? */ 272 struct gfs2_glock *i_gl; /* Move into i_gh? */
273 struct gfs2_holder i_iopen_gh; 273 struct gfs2_holder i_iopen_gh;
@@ -416,11 +416,8 @@ struct gfs2_args {
416 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 416 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
417 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ 417 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
418 unsigned int ar_spectator:1; /* Don't get a journal */ 418 unsigned int ar_spectator:1; /* Don't get a journal */
419 unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */
420 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ 419 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */
421 unsigned int ar_localcaching:1; /* Local caching */
422 unsigned int ar_debug:1; /* Oops on errors */ 420 unsigned int ar_debug:1; /* Oops on errors */
423 unsigned int ar_upgrade:1; /* Upgrade ondisk format */
424 unsigned int ar_posix_acl:1; /* Enable posix acls */ 421 unsigned int ar_posix_acl:1; /* Enable posix acls */
425 unsigned int ar_quota:2; /* off/account/on */ 422 unsigned int ar_quota:2; /* off/account/on */
426 unsigned int ar_suiddir:1; /* suiddir support */ 423 unsigned int ar_suiddir:1; /* suiddir support */
@@ -497,7 +494,7 @@ struct gfs2_sb_host {
497 */ 494 */
498 495
499struct lm_lockstruct { 496struct lm_lockstruct {
500 unsigned int ls_jid; 497 int ls_jid;
501 unsigned int ls_first; 498 unsigned int ls_first;
502 unsigned int ls_first_done; 499 unsigned int ls_first_done;
503 unsigned int ls_nodir; 500 unsigned int ls_nodir;
@@ -572,6 +569,7 @@ struct gfs2_sbd {
572 struct list_head sd_rindex_mru_list; 569 struct list_head sd_rindex_mru_list;
573 struct gfs2_rgrpd *sd_rindex_forward; 570 struct gfs2_rgrpd *sd_rindex_forward;
574 unsigned int sd_rgrps; 571 unsigned int sd_rgrps;
572 unsigned int sd_max_rg_data;
575 573
576 /* Journal index stuff */ 574 /* Journal index stuff */
577 575
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 08140f185a37..06370f8bd8cf 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
359 * to do that. 359 * to do that.
360 */ 360 */
361 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); 361 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
362 ip->i_disksize = be64_to_cpu(str->di_size); 362 i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
363 i_size_write(&ip->i_inode, ip->i_disksize);
364 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 363 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
365 atime.tv_sec = be64_to_cpu(str->di_atime); 364 atime.tv_sec = be64_to_cpu(str->di_atime);
366 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 365 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -1055,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1055 str->di_uid = cpu_to_be32(ip->i_inode.i_uid); 1054 str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
1056 str->di_gid = cpu_to_be32(ip->i_inode.i_gid); 1055 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
1057 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); 1056 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
1058 str->di_size = cpu_to_be64(ip->i_disksize); 1057 str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
1059 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 1058 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
1060 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); 1059 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1061 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); 1060 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1085,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
1085 (unsigned long long)ip->i_no_formal_ino); 1084 (unsigned long long)ip->i_no_formal_ino);
1086 printk(KERN_INFO " no_addr = %llu\n", 1085 printk(KERN_INFO " no_addr = %llu\n",
1087 (unsigned long long)ip->i_no_addr); 1086 (unsigned long long)ip->i_no_addr);
1088 printk(KERN_INFO " i_disksize = %llu\n", 1087 printk(KERN_INFO " i_size = %llu\n",
1089 (unsigned long long)ip->i_disksize); 1088 (unsigned long long)i_size_read(&ip->i_inode));
1090 printk(KERN_INFO " blocks = %llu\n", 1089 printk(KERN_INFO " blocks = %llu\n",
1091 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); 1090 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
1092 printk(KERN_INFO " i_goal = %llu\n", 1091 printk(KERN_INFO " i_goal = %llu\n",
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 300ada3f21de..6720d7d5fbc6 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
19extern int gfs2_internal_read(struct gfs2_inode *ip, 19extern int gfs2_internal_read(struct gfs2_inode *ip,
20 struct file_ra_state *ra_state, 20 struct file_ra_state *ra_state,
21 char *buf, loff_t *pos, unsigned size); 21 char *buf, loff_t *pos, unsigned size);
22extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
23 unsigned int from, unsigned int to);
22extern void gfs2_set_aops(struct inode *inode); 24extern void gfs2_set_aops(struct inode *inode);
23 25
24static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 26static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -80,6 +82,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
80 dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); 82 dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
81} 83}
82 84
85static inline int gfs2_check_internal_file_size(struct inode *inode,
86 u64 minsize, u64 maxsize)
87{
88 u64 size = i_size_read(inode);
89 if (size < minsize || size > maxsize)
90 goto err;
91 if (size & ((1 << inode->i_blkbits) - 1))
92 goto err;
93 return 0;
94err:
95 gfs2_consist_inode(GFS2_I(inode));
96 return -EIO;
97}
83 98
84extern void gfs2_set_iop(struct inode *inode); 99extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
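
The new gfs2_check_internal_file_size() centralizes the size sanity checks
that quota.c and super.c below previously open-coded. A usage sketch,
mirroring the gfs2_jdesc_check() hunk further down (journal files must be
8 MiB to 1 GiB and a multiple of the block size):

    /* The helper calls gfs2_consist_inode() itself on failure. */
    if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
            return -EIO;
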
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e0470ed34c2..1c09425b45fd 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg)
42 ret |= LM_OUT_CANCELED; 42 ret |= LM_OUT_CANCELED;
43 goto out; 43 goto out;
44 case -EAGAIN: /* Try lock fails */ 44 case -EAGAIN: /* Try lock fails */
45 case -EDEADLK: /* Deadlock detected */
45 goto out; 46 goto out;
46 case -EINVAL: /* Invalid */ 47 case -ETIMEDOUT: /* Canceled due to timeout */
47 case -ENOMEM: /* Out of memory */
48 ret |= LM_OUT_ERROR; 48 ret |= LM_OUT_ERROR;
49 goto out; 49 goto out;
50 case 0: /* Success */ 50 case 0: /* Success */
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index b1e9630eb46a..d7eb1e209aa8 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,6 +24,7 @@
24#include "glock.h" 24#include "glock.h"
25#include "quota.h" 25#include "quota.h"
26#include "recovery.h" 26#include "recovery.h"
27#include "dir.h"
27 28
28static struct shrinker qd_shrinker = { 29static struct shrinker qd_shrinker = {
29 .shrink = gfs2_shrink_qd_memory, 30 .shrink = gfs2_shrink_qd_memory,
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void)
78{ 79{
79 int error; 80 int error;
80 81
82 gfs2_str2qstr(&gfs2_qdot, ".");
83 gfs2_str2qstr(&gfs2_qdotdot, "..");
84
81 error = gfs2_sys_init(); 85 error = gfs2_sys_init();
82 if (error) 86 if (error)
83 return error; 87 return error;
@@ -140,7 +144,7 @@ static int __init init_gfs2_fs(void)
140 144
141 error = -ENOMEM; 145 error = -ENOMEM;
142 gfs_recovery_wq = alloc_workqueue("gfs_recovery", 146 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
143 WQ_NON_REENTRANT | WQ_RESCUER, 0); 147 WQ_RESCUER | WQ_FREEZEABLE, 0);
144 if (!gfs_recovery_wq) 148 if (!gfs_recovery_wq)
145 goto fail_wq; 149 goto fail_wq;
146 150
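
gfs2_qdot and gfs2_qdotdot are filled in once at module init, so mkdir,
rmdir, rename and get_parent no longer rebuild and re-hash "." and ".." on
every call. gfs2_str2qstr() itself is not shown in this diff; a sketch of
what it presumably does, reusing gfs2_disk_hash() from dir.h above:

    /* Hedged sketch of gfs2_str2qstr(): point the qstr at the string and
     * precompute the on-disk (CRC32-based) name hash. */
    static inline void str2qstr(struct qstr *q, const char *s)
    {
            q->name = (const unsigned char *)s;
            q->len = strlen(s);
            q->hash = gfs2_disk_hash(s, q->len);
    }
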
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4d4b1e8ac64c..aeafc233dc89 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -38,14 +38,6 @@
38#define DO 0 38#define DO 0
39#define UNDO 1 39#define UNDO 1
40 40
41static const u32 gfs2_old_fs_formats[] = {
42 0
43};
44
45static const u32 gfs2_old_multihost_formats[] = {
46 0
47};
48
49/** 41/**
50 * gfs2_tune_init - Fill a gfs2_tune structure with default values 42 * gfs2_tune_init - Fill a gfs2_tune structure with default values
51 * @gt: tune 43 * @gt: tune
@@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
135 127
136static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) 128static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
137{ 129{
138 unsigned int x;
139
140 if (sb->sb_magic != GFS2_MAGIC || 130 if (sb->sb_magic != GFS2_MAGIC ||
141 sb->sb_type != GFS2_METATYPE_SB) { 131 sb->sb_type != GFS2_METATYPE_SB) {
142 if (!silent) 132 if (!silent)
@@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile
150 sb->sb_multihost_format == GFS2_FORMAT_MULTI) 140 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
151 return 0; 141 return 0;
152 142
153 if (sb->sb_fs_format != GFS2_FORMAT_FS) { 143 fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
154 for (x = 0; gfs2_old_fs_formats[x]; x++)
155 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
156 break;
157 144
158 if (!gfs2_old_fs_formats[x]) { 145 return -EINVAL;
159 printk(KERN_WARNING
160 "GFS2: code version (%u, %u) is incompatible "
161 "with ondisk format (%u, %u)\n",
162 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
163 sb->sb_fs_format, sb->sb_multihost_format);
164 printk(KERN_WARNING
165 "GFS2: I don't know how to upgrade this FS\n");
166 return -EINVAL;
167 }
168 }
169
170 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
171 for (x = 0; gfs2_old_multihost_formats[x]; x++)
172 if (gfs2_old_multihost_formats[x] ==
173 sb->sb_multihost_format)
174 break;
175
176 if (!gfs2_old_multihost_formats[x]) {
177 printk(KERN_WARNING
178 "GFS2: code version (%u, %u) is incompatible "
179 "with ondisk format (%u, %u)\n",
180 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
181 sb->sb_fs_format, sb->sb_multihost_format);
182 printk(KERN_WARNING
183 "GFS2: I don't know how to upgrade this FS\n");
184 return -EINVAL;
185 }
186 }
187
188 if (!sdp->sd_args.ar_upgrade) {
189 printk(KERN_WARNING
190 "GFS2: code version (%u, %u) is incompatible "
191 "with ondisk format (%u, %u)\n",
192 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
193 sb->sb_fs_format, sb->sb_multihost_format);
194 printk(KERN_INFO
195 "GFS2: Use the \"upgrade\" mount option to upgrade "
196 "the FS\n");
197 printk(KERN_INFO "GFS2: See the manual for more details\n");
198 return -EINVAL;
199 }
200
201 return 0;
202} 146}
203 147
204static void end_bio_io_page(struct bio *bio, int error) 148static void end_bio_io_page(struct bio *bio, int error)
@@ -586,7 +530,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
586 530
587 prev_db = 0; 531 prev_db = 0;
588 532
589 for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) { 533 for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
590 bh.b_state = 0; 534 bh.b_state = 0;
591 bh.b_blocknr = 0; 535 bh.b_blocknr = 0;
592 bh.b_size = 1 << ip->i_inode.i_blkbits; 536 bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -1022,7 +966,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1022 if (!strcmp("lock_nolock", proto)) { 966 if (!strcmp("lock_nolock", proto)) {
1023 lm = &nolock_ops; 967 lm = &nolock_ops;
1024 sdp->sd_args.ar_localflocks = 1; 968 sdp->sd_args.ar_localflocks = 1;
1025 sdp->sd_args.ar_localcaching = 1;
1026#ifdef CONFIG_GFS2_FS_LOCKING_DLM 969#ifdef CONFIG_GFS2_FS_LOCKING_DLM
1027 } else if (!strcmp("lock_dlm", proto)) { 970 } else if (!strcmp("lock_dlm", proto)) {
1028 lm = &gfs2_dlm_ops; 971 lm = &gfs2_dlm_ops;
@@ -1113,8 +1056,6 @@ static int gfs2_journalid_wait(void *word)
1113 1056
1114static int wait_on_journal(struct gfs2_sbd *sdp) 1057static int wait_on_journal(struct gfs2_sbd *sdp)
1115{ 1058{
1116 if (sdp->sd_args.ar_spectator)
1117 return 0;
1118 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) 1059 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
1119 return 0; 1060 return 0;
1120 1061
@@ -1217,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1217 if (error) 1158 if (error)
1218 goto fail_sb; 1159 goto fail_sb;
1219 1160
1161 /*
1162 * If user space has failed to join the cluster or some similar
1163 * failure has occurred, then the journal id will contain a
1164 * negative (error) number. This will then be returned to the
1165 * caller (of the mount syscall). We do this even for spectator
1166 * mounts (which just write a jid of 0 to indicate "ok" even though
1167 * the jid is unused in the spectator case)
1168 */
1169 if (sdp->sd_lockstruct.ls_jid < 0) {
1170 error = sdp->sd_lockstruct.ls_jid;
1171 sdp->sd_lockstruct.ls_jid = 0;
1172 goto fail_sb;
1173 }
1174
1220 error = init_inodes(sdp, DO); 1175 error = init_inodes(sdp, DO);
1221 if (error) 1176 if (error)
1222 goto fail_sb; 1177 goto fail_sb;
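
The jid check added to fill_super() is why incore.h above changes ls_jid
from unsigned int to plain int: user space reports the result of joining the
cluster by writing either a journal id or a negative errno into sysfs, and a
signed field lets mount(2) fail with that errno instead of running with a
garbage jid. Condensed:

    /* jid < 0: cluster join failed; propagate the errno to mount(2).
     * Spectators write jid 0 as an "ok"; it is otherwise unused there. */
    if (sdp->sd_lockstruct.ls_jid < 0) {
            error = sdp->sd_lockstruct.ls_jid;
            sdp->sd_lockstruct.ls_jid = 0;
            goto fail_sb;
    }
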
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1009be2c9737..0534510200d5 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,6 +18,8 @@
18#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
19#include <linux/crc32.h> 19#include <linux/crc32.h>
20#include <linux/fiemap.h> 20#include <linux/fiemap.h>
21#include <linux/swap.h>
22#include <linux/falloc.h>
21#include <asm/uaccess.h> 23#include <asm/uaccess.h>
22 24
23#include "gfs2.h" 25#include "gfs2.h"
@@ -217,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
217 goto out_gunlock_q; 219 goto out_gunlock_q;
218 220
219 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 221 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
220 al->al_rgd->rd_length + 222 gfs2_rg_blocks(al) +
221 2 * RES_DINODE + RES_STATFS + 223 2 * RES_DINODE + RES_STATFS +
222 RES_QUOTA, 0); 224 RES_QUOTA, 0);
223 if (error) 225 if (error)
@@ -406,7 +408,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
406 408
407 ip = ghs[1].gh_gl->gl_object; 409 ip = ghs[1].gh_gl->gl_object;
408 410
409 ip->i_disksize = size;
410 i_size_write(inode, size); 411 i_size_write(inode, size);
411 412
412 error = gfs2_meta_inode_buffer(ip, &dibh); 413 error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -461,7 +462,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
461 ip = ghs[1].gh_gl->gl_object; 462 ip = ghs[1].gh_gl->gl_object;
462 463
463 ip->i_inode.i_nlink = 2; 464 ip->i_inode.i_nlink = 2;
464 ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); 465 i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
465 ip->i_diskflags |= GFS2_DIF_JDATA; 466 ip->i_diskflags |= GFS2_DIF_JDATA;
466 ip->i_entries = 2; 467 ip->i_entries = 2;
467 468
@@ -470,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
470 if (!gfs2_assert_withdraw(sdp, !error)) { 471 if (!gfs2_assert_withdraw(sdp, !error)) {
471 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; 472 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
472 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); 473 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
473 struct qstr str;
474 474
475 gfs2_str2qstr(&str, ".");
476 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 475 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
477 gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent); 476 gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
478 dent->de_inum = di->di_num; /* already GFS2 endian */ 477 dent->de_inum = di->di_num; /* already GFS2 endian */
479 dent->de_type = cpu_to_be16(DT_DIR); 478 dent->de_type = cpu_to_be16(DT_DIR);
480 di->di_entries = cpu_to_be32(1); 479 di->di_entries = cpu_to_be32(1);
481 480
482 gfs2_str2qstr(&str, "..");
483 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); 481 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
484 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); 482 gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
485 483
486 gfs2_inum_out(dip, dent); 484 gfs2_inum_out(dip, dent);
487 dent->de_type = cpu_to_be16(DT_DIR); 485 dent->de_type = cpu_to_be16(DT_DIR);
@@ -522,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
522static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, 520static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
523 struct gfs2_inode *ip) 521 struct gfs2_inode *ip)
524{ 522{
525 struct qstr dotname;
526 int error; 523 int error;
527 524
528 if (ip->i_entries != 2) { 525 if (ip->i_entries != 2) {
@@ -539,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
539 if (error) 536 if (error)
540 return error; 537 return error;
541 538
542 gfs2_str2qstr(&dotname, "."); 539 error = gfs2_dir_del(ip, &gfs2_qdot);
543 error = gfs2_dir_del(ip, &dotname);
544 if (error) 540 if (error)
545 return error; 541 return error;
546 542
547 gfs2_str2qstr(&dotname, ".."); 543 error = gfs2_dir_del(ip, &gfs2_qdotdot);
548 error = gfs2_dir_del(ip, &dotname);
549 if (error) 544 if (error)
550 return error; 545 return error;
551 546
@@ -694,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
694 struct inode *dir = &to->i_inode; 689 struct inode *dir = &to->i_inode;
695 struct super_block *sb = dir->i_sb; 690 struct super_block *sb = dir->i_sb;
696 struct inode *tmp; 691 struct inode *tmp;
697 struct qstr dotdot;
698 int error = 0; 692 int error = 0;
699 693
700 gfs2_str2qstr(&dotdot, "..");
701
702 igrab(dir); 694 igrab(dir);
703 695
704 for (;;) { 696 for (;;) {
@@ -711,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
711 break; 703 break;
712 } 704 }
713 705
714 tmp = gfs2_lookupi(dir, &dotdot, 1); 706 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
715 if (IS_ERR(tmp)) { 707 if (IS_ERR(tmp)) {
716 error = PTR_ERR(tmp); 708 error = PTR_ERR(tmp);
717 break; 709 break;
@@ -744,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
744 struct gfs2_inode *ip = GFS2_I(odentry->d_inode); 736 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
745 struct gfs2_inode *nip = NULL; 737 struct gfs2_inode *nip = NULL;
746 struct gfs2_sbd *sdp = GFS2_SB(odir); 738 struct gfs2_sbd *sdp = GFS2_SB(odir);
747 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; 739 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
748 struct gfs2_rgrpd *nrgd; 740 struct gfs2_rgrpd *nrgd;
749 unsigned int num_gh; 741 unsigned int num_gh;
750 int dir_rename = 0; 742 int dir_rename = 0;
@@ -758,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
758 return 0; 750 return 0;
759 } 751 }
760 752
753 error = gfs2_rindex_hold(sdp, &ri_gh);
754 if (error)
755 return error;
761 756
762 if (odip != ndip) { 757 if (odip != ndip) {
763 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 758 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
@@ -887,12 +882,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
887 882
888 al->al_requested = sdp->sd_max_dirres; 883 al->al_requested = sdp->sd_max_dirres;
889 884
890 error = gfs2_inplace_reserve(ndip); 885 error = gfs2_inplace_reserve_ri(ndip);
891 if (error) 886 if (error)
892 goto out_gunlock_q; 887 goto out_gunlock_q;
893 888
894 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 889 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
895 al->al_rgd->rd_length + 890 gfs2_rg_blocks(al) +
896 4 * RES_DINODE + 4 * RES_LEAF + 891 4 * RES_DINODE + 4 * RES_LEAF +
897 RES_STATFS + RES_QUOTA + 4, 0); 892 RES_STATFS + RES_QUOTA + 4, 0);
898 if (error) 893 if (error)
@@ -920,9 +915,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
920 } 915 }
921 916
922 if (dir_rename) { 917 if (dir_rename) {
923 struct qstr name;
924 gfs2_str2qstr(&name, "..");
925
926 error = gfs2_change_nlink(ndip, +1); 918 error = gfs2_change_nlink(ndip, +1);
927 if (error) 919 if (error)
928 goto out_end_trans; 920 goto out_end_trans;
@@ -930,7 +922,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
930 if (error) 922 if (error)
931 goto out_end_trans; 923 goto out_end_trans;
932 924
933 error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR); 925 error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
934 if (error) 926 if (error)
935 goto out_end_trans; 927 goto out_end_trans;
936 } else { 928 } else {
@@ -972,6 +964,7 @@ out_gunlock_r:
972 if (r_gh.gh_gl) 964 if (r_gh.gh_gl)
973 gfs2_glock_dq_uninit(&r_gh); 965 gfs2_glock_dq_uninit(&r_gh);
974out: 966out:
967 gfs2_glock_dq_uninit(&ri_gh);
975 return error; 968 return error;
976} 969}
977 970
@@ -990,7 +983,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
990 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 983 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
991 struct gfs2_holder i_gh; 984 struct gfs2_holder i_gh;
992 struct buffer_head *dibh; 985 struct buffer_head *dibh;
993 unsigned int x; 986 unsigned int x, size;
994 char *buf; 987 char *buf;
995 int error; 988 int error;
996 989
@@ -1002,7 +995,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1002 return NULL; 995 return NULL;
1003 } 996 }
1004 997
1005 if (!ip->i_disksize) { 998 size = (unsigned int)i_size_read(&ip->i_inode);
999 if (size == 0) {
1006 gfs2_consist_inode(ip); 1000 gfs2_consist_inode(ip);
1007 buf = ERR_PTR(-EIO); 1001 buf = ERR_PTR(-EIO);
1008 goto out; 1002 goto out;
@@ -1014,7 +1008,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1014 goto out; 1008 goto out;
1015 } 1009 }
1016 1010
1017 x = ip->i_disksize + 1; 1011 x = size + 1;
1018 buf = kmalloc(x, GFP_NOFS); 1012 buf = kmalloc(x, GFP_NOFS);
1019 if (!buf) 1013 if (!buf)
1020 buf = ERR_PTR(-ENOMEM); 1014 buf = ERR_PTR(-ENOMEM);
@@ -1071,30 +1065,6 @@ int gfs2_permission(struct inode *inode, int mask)
1071 return error; 1065 return error;
1072} 1066}
1073 1067
1074/*
1075 * XXX(truncate): the truncate_setsize calls should be moved to the end.
1076 */
1077static int setattr_size(struct inode *inode, struct iattr *attr)
1078{
1079 struct gfs2_inode *ip = GFS2_I(inode);
1080 struct gfs2_sbd *sdp = GFS2_SB(inode);
1081 int error;
1082
1083 if (attr->ia_size != ip->i_disksize) {
1084 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1085 if (error)
1086 return error;
1087 truncate_setsize(inode, attr->ia_size);
1088 gfs2_trans_end(sdp);
1089 }
1090
1091 error = gfs2_truncatei(ip, attr->ia_size);
1092 if (error && (inode->i_size != ip->i_disksize))
1093 i_size_write(inode, ip->i_disksize);
1094
1095 return error;
1096}
1097
1098static int setattr_chown(struct inode *inode, struct iattr *attr) 1068static int setattr_chown(struct inode *inode, struct iattr *attr)
1099{ 1069{
1100 struct gfs2_inode *ip = GFS2_I(inode); 1070 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1195,7 +1165,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
1195 goto out; 1165 goto out;
1196 1166
1197 if (attr->ia_valid & ATTR_SIZE) 1167 if (attr->ia_valid & ATTR_SIZE)
1198 error = setattr_size(inode, attr); 1168 error = gfs2_setattr_size(inode, attr->ia_size);
1199 else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) 1169 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1200 error = setattr_chown(inode, attr); 1170 error = setattr_chown(inode, attr);
1201 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) 1171 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
@@ -1301,6 +1271,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1301 return ret; 1271 return ret;
1302} 1272}
1303 1273
1274static void empty_write_end(struct page *page, unsigned from,
1275 unsigned to)
1276{
1277 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
1278
1279 page_zero_new_buffers(page, from, to);
1280 flush_dcache_page(page);
1281 mark_page_accessed(page);
1282
1283 if (!gfs2_is_writeback(ip))
1284 gfs2_page_add_databufs(ip, page, from, to);
1285
1286 block_commit_write(page, from, to);
1287}
1288
1289
1290static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
1291{
1292 unsigned start, end, next;
1293 struct buffer_head *bh, *head;
1294 int error;
1295
1296 if (!page_has_buffers(page)) {
1297 error = block_prepare_write(page, from, to, gfs2_block_map);
1298 if (unlikely(error))
1299 return error;
1300
1301 empty_write_end(page, from, to);
1302 return 0;
1303 }
1304
1305 bh = head = page_buffers(page);
1306 next = end = 0;
1307 while (next < from) {
1308 next += bh->b_size;
1309 bh = bh->b_this_page;
1310 }
1311 start = next;
1312 do {
1313 next += bh->b_size;
1314 if (buffer_mapped(bh)) {
1315 if (end) {
1316 error = block_prepare_write(page, start, end,
1317 gfs2_block_map);
1318 if (unlikely(error))
1319 return error;
1320 empty_write_end(page, start, end);
1321 end = 0;
1322 }
1323 start = next;
1324 }
1325 else
1326 end = next;
1327 bh = bh->b_this_page;
1328 } while (next < to);
1329
1330 if (end) {
1331 error = block_prepare_write(page, start, end, gfs2_block_map);
1332 if (unlikely(error))
1333 return error;
1334 empty_write_end(page, start, end);
1335 }
1336
1337 return 0;
1338}
1339
1340static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
1341 int mode)
1342{
1343 struct gfs2_inode *ip = GFS2_I(inode);
1344 struct buffer_head *dibh;
1345 int error;
1346 u64 start = offset >> PAGE_CACHE_SHIFT;
1347 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
1348 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
1349 pgoff_t curr;
1350 struct page *page;
1351 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
1352 unsigned int from, to;
1353
1354 if (!end_offset)
1355 end_offset = PAGE_CACHE_SIZE;
1356
1357 error = gfs2_meta_inode_buffer(ip, &dibh);
1358 if (unlikely(error))
1359 goto out;
1360
1361 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1362
1363 if (gfs2_is_stuffed(ip)) {
1364 error = gfs2_unstuff_dinode(ip, NULL);
1365 if (unlikely(error))
1366 goto out;
1367 }
1368
1369 curr = start;
1370 offset = start << PAGE_CACHE_SHIFT;
1371 from = start_offset;
1372 to = PAGE_CACHE_SIZE;
1373 while (curr <= end) {
1374 page = grab_cache_page_write_begin(inode->i_mapping, curr,
1375 AOP_FLAG_NOFS);
1376 if (unlikely(!page)) {
1377 error = -ENOMEM;
1378 goto out;
1379 }
1380
1381 if (curr == end)
1382 to = end_offset;
1383 error = write_empty_blocks(page, from, to);
1384 if (!error && offset + to > inode->i_size &&
1385 !(mode & FALLOC_FL_KEEP_SIZE)) {
1386 i_size_write(inode, offset + to);
1387 }
1388 unlock_page(page);
1389 page_cache_release(page);
1390 if (error)
1391 goto out;
1392 curr++;
1393 offset += PAGE_CACHE_SIZE;
1394 from = 0;
1395 }
1396
1397 gfs2_dinode_out(ip, dibh->b_data);
1398 mark_inode_dirty(inode);
1399
1400 brelse(dibh);
1401
1402out:
1403 return error;
1404}
1405
1406static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
1407 unsigned int *data_blocks, unsigned int *ind_blocks)
1408{
1409 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1410 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
1411 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
1412
1413 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
1414 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1415 max_data -= tmp;
1416 }
 1417 /* This calculation isn't the exact reverse of gfs2_write_calc_reserv,
 1418 so it might end up with fewer data blocks */

1419 if (max_data <= *data_blocks)
1420 return;
1421 *data_blocks = max_data;
1422 *ind_blocks = max_blocks - max_data;
1423 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
1424 if (*len > max) {
1425 *len = max;
1426 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
1427 }
1428}
1429
1430static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1431 loff_t len)
1432{
1433 struct gfs2_sbd *sdp = GFS2_SB(inode);
1434 struct gfs2_inode *ip = GFS2_I(inode);
1435 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
1436 loff_t bytes, max_bytes;
1437 struct gfs2_alloc *al;
1438 int error;
1439 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
1440 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
1441
1442 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
1443 sdp->sd_sb.sb_bsize_shift;
1444
1445 len = next - offset;
1446 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
1447 if (!bytes)
1448 bytes = UINT_MAX;
1449
1450 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
1451 error = gfs2_glock_nq(&ip->i_gh);
1452 if (unlikely(error))
1453 goto out_uninit;
1454
1455 if (!gfs2_write_alloc_required(ip, offset, len))
1456 goto out_unlock;
1457
1458 while (len > 0) {
1459 if (len < bytes)
1460 bytes = len;
1461 al = gfs2_alloc_get(ip);
1462 if (!al) {
1463 error = -ENOMEM;
1464 goto out_unlock;
1465 }
1466
1467 error = gfs2_quota_lock_check(ip);
1468 if (error)
1469 goto out_alloc_put;
1470
1471retry:
1472 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
1473
1474 al->al_requested = data_blocks + ind_blocks;
1475 error = gfs2_inplace_reserve(ip);
1476 if (error) {
1477 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
1478 bytes >>= 1;
1479 goto retry;
1480 }
1481 goto out_qunlock;
1482 }
1483 max_bytes = bytes;
1484 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
1485 al->al_requested = data_blocks + ind_blocks;
1486
1487 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
1488 RES_RG_HDR + gfs2_rg_blocks(al);
1489 if (gfs2_is_jdata(ip))
1490 rblocks += data_blocks ? data_blocks : 1;
1491
1492 error = gfs2_trans_begin(sdp, rblocks,
1493 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
1494 if (error)
1495 goto out_trans_fail;
1496
1497 error = fallocate_chunk(inode, offset, max_bytes, mode);
1498 gfs2_trans_end(sdp);
1499
1500 if (error)
1501 goto out_trans_fail;
1502
1503 len -= max_bytes;
1504 offset += max_bytes;
1505 gfs2_inplace_release(ip);
1506 gfs2_quota_unlock(ip);
1507 gfs2_alloc_put(ip);
1508 }
1509 goto out_unlock;
1510
1511out_trans_fail:
1512 gfs2_inplace_release(ip);
1513out_qunlock:
1514 gfs2_quota_unlock(ip);
1515out_alloc_put:
1516 gfs2_alloc_put(ip);
1517out_unlock:
1518 gfs2_glock_dq(&ip->i_gh);
1519out_uninit:
1520 gfs2_holder_uninit(&ip->i_gh);
1521 return error;
1522}
1523
1524
1304static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1525static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1305 u64 start, u64 len) 1526 u64 start, u64 len)
1306{ 1527{
@@ -1351,6 +1572,7 @@ const struct inode_operations gfs2_file_iops = {
1351 .getxattr = gfs2_getxattr, 1572 .getxattr = gfs2_getxattr,
1352 .listxattr = gfs2_listxattr, 1573 .listxattr = gfs2_listxattr,
1353 .removexattr = gfs2_removexattr, 1574 .removexattr = gfs2_removexattr,
1575 .fallocate = gfs2_fallocate,
1354 .fiemap = gfs2_fiemap, 1576 .fiemap = gfs2_fiemap,
1355}; 1577};
1356 1578
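
With .fallocate wired into gfs2_file_iops, user space reaches the new code
through the fallocate(2) syscall. A self-contained demo (the mount point and
file name are made up); FALLOC_FL_KEEP_SIZE corresponds to the mode check in
fallocate_chunk() above, which then skips the i_size update:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/mnt/gfs2/prealloc.dat", O_CREAT | O_WRONLY, 0644);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* Preallocate 16 MiB of blocks without growing i_size. */
            if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) != 0)
                    perror("fallocate"); /* EOPNOTSUPP before this patch */
            close(fd);
            return 0;
    }
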
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 1bc6b5695e6d..58a9b9998b42 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,10 +735,8 @@ get_a_page:
735 goto out; 735 goto out;
736 736
737 size = loc + sizeof(struct gfs2_quota); 737 size = loc + sizeof(struct gfs2_quota);
738 if (size > inode->i_size) { 738 if (size > inode->i_size)
739 ip->i_disksize = size;
740 i_size_write(inode, size); 739 i_size_write(inode, size);
741 }
742 inode->i_mtime = inode->i_atime = CURRENT_TIME; 740 inode->i_mtime = inode->i_atime = CURRENT_TIME;
743 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 741 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
744 gfs2_dinode_out(ip, dibh->b_data); 742 gfs2_dinode_out(ip, dibh->b_data);
@@ -817,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
817 goto out_alloc; 815 goto out_alloc;
818 816
819 if (nalloc) 817 if (nalloc)
820 blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS; 818 blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
821 819
822 error = gfs2_trans_begin(sdp, blocks, 0); 820 error = gfs2_trans_begin(sdp, blocks, 0);
823 if (error) 821 if (error)
@@ -1190,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
1190int gfs2_quota_init(struct gfs2_sbd *sdp) 1188int gfs2_quota_init(struct gfs2_sbd *sdp)
1191{ 1189{
1192 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1190 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1193 unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; 1191 u64 size = i_size_read(sdp->sd_qc_inode);
1192 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
1194 unsigned int x, slot = 0; 1193 unsigned int x, slot = 0;
1195 unsigned int found = 0; 1194 unsigned int found = 0;
1196 u64 dblock; 1195 u64 dblock;
1197 u32 extlen = 0; 1196 u32 extlen = 0;
1198 int error; 1197 int error;
1199 1198
1200 if (!ip->i_disksize || ip->i_disksize > (64 << 20) || 1199 if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
1201 ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
1202 gfs2_consist_inode(ip);
1203 return -EIO; 1200 return -EIO;
1204 } 1201
1205 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; 1202 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1206 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); 1203 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1207 1204
@@ -1589,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1589 error = gfs2_inplace_reserve(ip); 1586 error = gfs2_inplace_reserve(ip);
1590 if (error) 1587 if (error)
1591 goto out_alloc; 1588 goto out_alloc;
1589 blocks += gfs2_rg_blocks(al);
1592 } 1590 }
1593 1591
1594 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); 1592 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f7f89a94a5a4..f2a02edcac8f 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -455,11 +455,13 @@ void gfs2_recover_func(struct work_struct *work)
455 int ro = 0; 455 int ro = 0;
456 unsigned int pass; 456 unsigned int pass;
457 int error; 457 int error;
458 int jlocked = 0;
458 459
459 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { 460 if (sdp->sd_args.ar_spectator ||
461 (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
460 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", 462 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
461 jd->jd_jid); 463 jd->jd_jid);
462 464 jlocked = 1;
463 /* Acquire the journal lock so we can do recovery */ 465 /* Acquire the journal lock so we can do recovery */
464 466
465 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, 467 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
@@ -554,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work)
554 jd->jd_jid, t); 556 jd->jd_jid, t);
555 } 557 }
556 558
557 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
558 gfs2_glock_dq_uninit(&ji_gh);
559
560 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); 559 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
561 560
562 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) 561 if (jlocked) {
562 gfs2_glock_dq_uninit(&ji_gh);
563 gfs2_glock_dq_uninit(&j_gh); 563 gfs2_glock_dq_uninit(&j_gh);
564 }
564 565
565 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); 566 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
566 goto done; 567 goto done;
@@ -568,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work)
568fail_gunlock_tr: 569fail_gunlock_tr:
569 gfs2_glock_dq_uninit(&t_gh); 570 gfs2_glock_dq_uninit(&t_gh);
570fail_gunlock_ji: 571fail_gunlock_ji:
571 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { 572 if (jlocked) {
572 gfs2_glock_dq_uninit(&ji_gh); 573 gfs2_glock_dq_uninit(&ji_gh);
573fail_gunlock_j: 574fail_gunlock_j:
574 gfs2_glock_dq_uninit(&j_gh); 575 gfs2_glock_dq_uninit(&j_gh);
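
The jlocked flag replaces three repeated jid comparisons and, more to the
point, now also takes the journal glocks for spectator mounts, which never
own a journal of their own. Condensed, the new condition is:

    /* Lock the journal whenever it is not ours; spectators own none. */
    int jlocked = sdp->sd_args.ar_spectator ||
                  jd->jd_jid != sdp->sd_lockstruct.ls_jid;
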
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 171a744f8e45..fb67f593f408 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
500 for (rgrps = 0;; rgrps++) { 500 for (rgrps = 0;; rgrps++) {
501 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 501 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
502 502
503 if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize) 503 if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
504 break; 504 break;
505 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 505 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
506 sizeof(struct gfs2_rindex)); 506 sizeof(struct gfs2_rindex));
@@ -588,7 +588,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
589 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
590 struct file_ra_state ra_state; 590 struct file_ra_state ra_state;
591 u64 rgrp_count = ip->i_disksize; 591 u64 rgrp_count = i_size_read(inode);
592 struct gfs2_rgrpd *rgd;
593 unsigned int max_data = 0;
592 int error; 594 int error;
593 595
594 do_div(rgrp_count, sizeof(struct gfs2_rindex)); 596 do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
603 } 605 }
604 } 606 }
605 607
608 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
609 if (rgd->rd_data > max_data)
610 max_data = rgd->rd_data;
611 sdp->sd_max_rg_data = max_data;
606 sdp->sd_rindex_uptodate = 1; 612 sdp->sd_rindex_uptodate = 1;
607 return 0; 613 return 0;
608} 614}
@@ -622,13 +628,15 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
622 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 628 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
623 struct inode *inode = &ip->i_inode; 629 struct inode *inode = &ip->i_inode;
624 struct file_ra_state ra_state; 630 struct file_ra_state ra_state;
631 struct gfs2_rgrpd *rgd;
632 unsigned int max_data = 0;
625 int error; 633 int error;
626 634
627 file_ra_state_init(&ra_state, inode->i_mapping); 635 file_ra_state_init(&ra_state, inode->i_mapping);
628 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { 636 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
629 /* Ignore partials */ 637 /* Ignore partials */
630 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > 638 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
631 ip->i_disksize) 639 i_size_read(inode))
632 break; 640 break;
633 error = read_rindex_entry(ip, &ra_state); 641 error = read_rindex_entry(ip, &ra_state);
634 if (error) { 642 if (error) {
@@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
636 return error; 644 return error;
637 } 645 }
638 } 646 }
647 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
648 if (rgd->rd_data > max_data)
649 max_data = rgd->rd_data;
650 sdp->sd_max_rg_data = max_data;
639 651
640 sdp->sd_rindex_uptodate = 1; 652 sdp->sd_rindex_uptodate = 1;
641 return 0; 653 return 0;
@@ -1188,7 +1200,8 @@ out:
1188 * Returns: errno 1200 * Returns: errno
1189 */ 1201 */
1190 1202
1191int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) 1203int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
1204 char *file, unsigned int line)
1192{ 1205{
1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1206 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1194 struct gfs2_alloc *al = ip->i_alloc; 1207 struct gfs2_alloc *al = ip->i_alloc;
@@ -1199,12 +1212,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1199 return -EINVAL; 1212 return -EINVAL;
1200 1213
1201try_again: 1214try_again:
1202 /* We need to hold the rindex unless the inode we're using is 1215 if (hold_rindex) {
1203 the rindex itself, in which case it's already held. */ 1216 /* We need to hold the rindex unless the inode we're using is
1204 if (ip != GFS2_I(sdp->sd_rindex)) 1217 the rindex itself, in which case it's already held. */
1205 error = gfs2_rindex_hold(sdp, &al->al_ri_gh); 1218 if (ip != GFS2_I(sdp->sd_rindex))
1206 else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ 1219 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
1207 error = gfs2_ri_update_special(ip); 1220 else if (!sdp->sd_rgrps) /* We may not have the rindex read
1221 in, so: */
1222 error = gfs2_ri_update_special(ip);
1223 }
1208 1224
1209 if (error) 1225 if (error)
1210 return error; 1226 return error;
@@ -1215,7 +1231,7 @@ try_again:
1215 try to free it, and try the allocation again. */ 1231 try to free it, and try the allocation again. */
1216 error = get_local_rgrp(ip, &unlinked, &last_unlinked); 1232 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1217 if (error) { 1233 if (error) {
1218 if (ip != GFS2_I(sdp->sd_rindex)) 1234 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
1219 gfs2_glock_dq_uninit(&al->al_ri_gh); 1235 gfs2_glock_dq_uninit(&al->al_ri_gh);
1220 if (error != -EAGAIN) 1236 if (error != -EAGAIN)
1221 return error; 1237 return error;
@@ -1257,7 +1273,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1257 al->al_rgd = NULL; 1273 al->al_rgd = NULL;
1258 if (al->al_rgd_gh.gh_gl) 1274 if (al->al_rgd_gh.gh_gl)
1259 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1275 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1260 if (ip != GFS2_I(sdp->sd_rindex)) 1276 if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
1261 gfs2_glock_dq_uninit(&al->al_ri_gh); 1277 gfs2_glock_dq_uninit(&al->al_ri_gh);
1262} 1278}
1263 1279
@@ -1496,11 +1512,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
1496 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1512 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1497 struct buffer_head *dibh; 1513 struct buffer_head *dibh;
1498 struct gfs2_alloc *al = ip->i_alloc; 1514 struct gfs2_alloc *al = ip->i_alloc;
1499 struct gfs2_rgrpd *rgd = al->al_rgd; 1515 struct gfs2_rgrpd *rgd;
1500 u32 goal, blk; 1516 u32 goal, blk;
1501 u64 block; 1517 u64 block;
1502 int error; 1518 int error;
1503 1519
1520 /* Only happens if there is a bug in gfs2, return something distinctive
1521 * to ensure that it is noticed.
1522 */
1523 if (al == NULL)
1524 return -ECANCELED;
1525
1526 rgd = al->al_rgd;
1527
1504 if (rgrp_contains_block(rgd, ip->i_goal)) 1528 if (rgrp_contains_block(rgd, ip->i_goal))
1505 goal = ip->i_goal - rgd->rd_data0; 1529 goal = ip->i_goal - rgd->rd_data0;
1506 else 1530 else
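
sd_max_rg_data, recorded by both rindex scans above, exists for the new
fallocate path: a single gfs2_inplace_reserve() can hand out at most one
resource group's worth of blocks, so gfs2_fallocate() (ops_inode.c above)
sizes its per-iteration chunks from the largest rgrp:

    /* From the gfs2_fallocate() hunk: start at half the biggest rgrp's
     * data capacity, in bytes; halved again on -ENOSPC retries. */
    bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
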
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index f07119d89557..0e35c0466f9a 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
39 ip->i_alloc = NULL; 39 ip->i_alloc = NULL;
40} 40}
41 41
42extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, 42extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
43 unsigned int line); 43 char *file, unsigned int line);
44#define gfs2_inplace_reserve(ip) \ 44#define gfs2_inplace_reserve(ip) \
45gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) 45 gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
46#define gfs2_inplace_reserve_ri(ip) \
47 gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
46 48
47extern void gfs2_inplace_release(struct gfs2_inode *ip); 49extern void gfs2_inplace_release(struct gfs2_inode *ip);
48 50
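
The two reservation macros expand as sketched below; the _ri variant tells
gfs2_inplace_reserve_i() not to take the rindex glock, for the one caller
(gfs2_rename(), above) that now acquires it up front via gfs2_rindex_hold():

    error = gfs2_inplace_reserve(ip);
    /* -> gfs2_inplace_reserve_i(ip, 1, __FILE__, __LINE__) */

    error = gfs2_inplace_reserve_ri(ndip);
    /* -> gfs2_inplace_reserve_i(ndip, 0, __FILE__, __LINE__) */
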
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 77cb9f830ee4..047d1176096c 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -85,6 +85,7 @@ static const match_table_t tokens = {
85 {Opt_locktable, "locktable=%s"}, 85 {Opt_locktable, "locktable=%s"},
86 {Opt_hostdata, "hostdata=%s"}, 86 {Opt_hostdata, "hostdata=%s"},
87 {Opt_spectator, "spectator"}, 87 {Opt_spectator, "spectator"},
88 {Opt_spectator, "norecovery"},
88 {Opt_ignore_local_fs, "ignore_local_fs"}, 89 {Opt_ignore_local_fs, "ignore_local_fs"},
89 {Opt_localflocks, "localflocks"}, 90 {Opt_localflocks, "localflocks"},
90 {Opt_localcaching, "localcaching"}, 91 {Opt_localcaching, "localcaching"},
@@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
159 args->ar_spectator = 1; 160 args->ar_spectator = 1;
160 break; 161 break;
161 case Opt_ignore_local_fs: 162 case Opt_ignore_local_fs:
162 args->ar_ignore_local_fs = 1; 163 /* Retained for backwards compat only */
163 break; 164 break;
164 case Opt_localflocks: 165 case Opt_localflocks:
165 args->ar_localflocks = 1; 166 args->ar_localflocks = 1;
166 break; 167 break;
167 case Opt_localcaching: 168 case Opt_localcaching:
168 args->ar_localcaching = 1; 169 /* Retained for backwards compat only */
169 break; 170 break;
170 case Opt_debug: 171 case Opt_debug:
171 if (args->ar_errors == GFS2_ERRORS_PANIC) { 172 if (args->ar_errors == GFS2_ERRORS_PANIC) {
@@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
179 args->ar_debug = 0; 180 args->ar_debug = 0;
180 break; 181 break;
181 case Opt_upgrade: 182 case Opt_upgrade:
182 args->ar_upgrade = 1; 183 /* Retained for backwards compat only */
183 break; 184 break;
184 case Opt_acl: 185 case Opt_acl:
185 args->ar_posix_acl = 1; 186 args->ar_posix_acl = 1;
@@ -342,15 +343,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
342{ 343{
343 struct gfs2_inode *ip = GFS2_I(jd->jd_inode); 344 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
344 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); 345 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
346 u64 size = i_size_read(jd->jd_inode);
345 347
346 if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) || 348 if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
347 (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
348 gfs2_consist_inode(ip);
349 return -EIO; 349 return -EIO;
350 }
351 jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
352 350
353 if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) { 351 jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
352
353 if (gfs2_write_alloc_required(ip, 0, size)) {
354 gfs2_consist_inode(ip); 354 gfs2_consist_inode(ip);
355 return -EIO; 355 return -EIO;
356 } 356 }
@@ -1129,9 +1129,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1129 1129
1130 /* Some flags must not be changed */ 1130 /* Some flags must not be changed */
1131 if (args_neq(&args, &sdp->sd_args, spectator) || 1131 if (args_neq(&args, &sdp->sd_args, spectator) ||
1132 args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
1133 args_neq(&args, &sdp->sd_args, localflocks) || 1132 args_neq(&args, &sdp->sd_args, localflocks) ||
1134 args_neq(&args, &sdp->sd_args, localcaching) ||
1135 args_neq(&args, &sdp->sd_args, meta)) 1133 args_neq(&args, &sdp->sd_args, meta))
1136 return -EINVAL; 1134 return -EINVAL;
1137 1135
@@ -1234,16 +1232,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1234 seq_printf(s, ",hostdata=%s", args->ar_hostdata); 1232 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
1235 if (args->ar_spectator) 1233 if (args->ar_spectator)
1236 seq_printf(s, ",spectator"); 1234 seq_printf(s, ",spectator");
1237 if (args->ar_ignore_local_fs)
1238 seq_printf(s, ",ignore_local_fs");
1239 if (args->ar_localflocks) 1235 if (args->ar_localflocks)
1240 seq_printf(s, ",localflocks"); 1236 seq_printf(s, ",localflocks");
1241 if (args->ar_localcaching)
1242 seq_printf(s, ",localcaching");
1243 if (args->ar_debug) 1237 if (args->ar_debug)
1244 seq_printf(s, ",debug"); 1238 seq_printf(s, ",debug");
1245 if (args->ar_upgrade)
1246 seq_printf(s, ",upgrade");
1247 if (args->ar_posix_acl) 1239 if (args->ar_posix_acl)
1248 seq_printf(s, ",acl"); 1240 seq_printf(s, ",acl");
1249 if (args->ar_quota != GFS2_QUOTA_DEFAULT) { 1241 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
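Note how super.c keeps the old tokens (ignore_local_fs, localcaching, upgrade) in the match table but turns their handlers into no-ops and stops printing them in show_options, so existing fstab entries still mount. A sketch of the same accept-but-ignore parsing, with hypothetical option names:

#include <stdio.h>
#include <string.h>

struct args { int debug; };

/* Hypothetical parser: deprecated options are still accepted so existing
 * configurations keep working, but they no longer set any state. */
static int parse_opt(struct args *a, const char *opt)
{
	if (!strcmp(opt, "debug"))
		a->debug = 1;
	else if (!strcmp(opt, "upgrade") || !strcmp(opt, "localcaching"))
		;	/* retained for backwards compat only */
	else
		return -1;	/* unknown option */
	return 0;
}

int main(void)
{
	struct args a = { 0 };
	parse_opt(&a, "upgrade");	/* accepted, ignored */
	parse_opt(&a, "debug");
	printf("debug=%d\n", a.debug);
	return 0;
}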
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ccacffd2faaa..748ccb557c18 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
230 230
231 if (gltype > LM_TYPE_JOURNAL) 231 if (gltype > LM_TYPE_JOURNAL)
232 return -EINVAL; 232 return -EINVAL;
233 glops = gfs2_glops_list[gltype]; 233 if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
234 glops = &gfs2_trans_glops;
235 else
236 glops = gfs2_glops_list[gltype];
234 if (glops == NULL) 237 if (glops == NULL)
235 return -EINVAL; 238 return -EINVAL;
236 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) 239 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
@@ -399,31 +402,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
399 402
400static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) 403static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
401{ 404{
402 return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); 405 return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
403} 406}
404 407
405static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) 408static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
406{ 409{
407 unsigned jid; 410 int jid;
408 int rv; 411 int rv;
409 412
410 rv = sscanf(buf, "%u", &jid); 413 rv = sscanf(buf, "%d", &jid);
411 if (rv != 1) 414 if (rv != 1)
412 return -EINVAL; 415 return -EINVAL;
413 416
414 spin_lock(&sdp->sd_jindex_spin); 417 spin_lock(&sdp->sd_jindex_spin);
415 rv = -EINVAL; 418 rv = -EINVAL;
416 if (sdp->sd_args.ar_spectator)
417 goto out;
418 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) 419 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
419 goto out; 420 goto out;
420 rv = -EBUSY; 421 rv = -EBUSY;
421 if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) 422 if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
422 goto out; 423 goto out;
424 rv = 0;
425 if (sdp->sd_args.ar_spectator && jid > 0)
426 rv = jid = -EINVAL;
423 sdp->sd_lockstruct.ls_jid = jid; 427 sdp->sd_lockstruct.ls_jid = jid;
428 clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
424 smp_mb__after_clear_bit(); 429 smp_mb__after_clear_bit();
425 wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); 430 wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
426 rv = 0;
427out: 431out:
428 spin_unlock(&sdp->sd_jindex_spin); 432 spin_unlock(&sdp->sd_jindex_spin);
429 return rv ? rv : len; 433 return rv ? rv : len;
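jid_store follows the usual sysfs store shape: parse the buffer, take the lock, validate state, commit, then wake any waiters; the rework defers clearing SDF_NOJOURNALID until after the spectator check so a rejected write leaves the flag set. A compressed userspace sketch of that parse/validate/commit/wake ordering (pthread stand-ins for the spinlock and wake_up_bit):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int jid = -1;
static int need_jid = 1;	/* stand-in for SDF_NOJOURNALID */

/* Hypothetical store handler: parse first, then validate and commit
 * under the lock, waking waiters only once the value is published. */
static int jid_store(const char *buf)
{
	int v, rv = -1;

	if (sscanf(buf, "%d", &v) != 1)
		return -1;
	pthread_mutex_lock(&lock);
	if (need_jid) {			/* only one writer may claim it */
		jid = v;
		need_jid = 0;
		pthread_cond_broadcast(&cond);
		rv = 0;
	}
	pthread_mutex_unlock(&lock);
	return rv;
}

int main(void)
{
	printf("%d %d\n", jid_store("7"), jid);	/* 0 7 */
	printf("%d\n", jid_store("9"));		/* -1: already set */
	return 0;
}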
@@ -617,7 +621,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
617 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); 621 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
618 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); 622 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
619 if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) 623 if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
620 add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); 624 add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
621 if (gfs2_uuid_valid(uuid)) 625 if (gfs2_uuid_valid(uuid))
622 add_uevent_var(env, "UUID=%pUB", uuid); 626 add_uevent_var(env, "UUID=%pUB", uuid);
623 return 0; 627 return 0;
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 148d55c14171..cedb0bb96d96 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -39,7 +39,8 @@
39 {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ 39 {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \
40 {(1UL << GLF_REPLY_PENDING), "r" }, \ 40 {(1UL << GLF_REPLY_PENDING), "r" }, \
41 {(1UL << GLF_INITIAL), "I" }, \ 41 {(1UL << GLF_INITIAL), "I" }, \
42 {(1UL << GLF_FROZEN), "F" }) 42 {(1UL << GLF_FROZEN), "F" }, \
43 {(1UL << GLF_QUEUED), "q" })
43 44
44#ifndef NUMPTY 45#ifndef NUMPTY
45#define NUMPTY 46#define NUMPTY
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index edf9d4bd908e..fb56b783e028 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -20,11 +20,20 @@ struct gfs2_glock;
20#define RES_JDATA 1 20#define RES_JDATA 1
21#define RES_DATA 1 21#define RES_DATA 1
22#define RES_LEAF 1 22#define RES_LEAF 1
23#define RES_RG_HDR 1
23#define RES_RG_BIT 2 24#define RES_RG_BIT 2
24#define RES_EATTR 1 25#define RES_EATTR 1
25#define RES_STATFS 1 26#define RES_STATFS 1
26#define RES_QUOTA 2 27#define RES_QUOTA 2
27 28
29/* reserve either the number of blocks to be allocated plus the rg header
30 * block, or all of the blocks in the rg, whichever is smaller */
31static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
32{
33 return (al->al_requested < al->al_rgd->rd_length)?
34 al->al_requested + 1 : al->al_rgd->rd_length;
35}
36
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, 37int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes); 38 unsigned int revokes);
30 39
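The helper caps the reservation at the size of the resource group, so a large request cannot ask for more journal space than one rgrp can dirty; the xattr.c hunk below switches a caller to it. A stand-alone version of the min() logic (field names as in the patch, struct shapes assumed):

#include <stdio.h>

struct rgrpd { unsigned int rd_length; };
struct alloc { unsigned int al_requested; struct rgrpd *al_rgd; };

/* Reserve the requested blocks plus one for the rgrp header, but never
 * more than the whole resource group. */
static unsigned int rg_blocks(const struct alloc *al)
{
	return al->al_requested < al->al_rgd->rd_length ?
	       al->al_requested + 1 : al->al_rgd->rd_length;
}

int main(void)
{
	struct rgrpd rgd = { .rd_length = 100 };
	struct alloc al  = { .al_requested = 10, .al_rgd = &rgd };
	printf("%u\n", rg_blocks(&al));	/* 11: request + header block */
	al.al_requested = 500;
	printf("%u\n", rg_blocks(&al));	/* 100: capped at the rgrp size */
	return 0;
}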
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 776af6eb4bcb..30b58f07c8a6 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
734 goto out_gunlock_q; 734 goto out_gunlock_q;
735 735
736 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), 736 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
737 blks + al->al_rgd->rd_length + 737 blks + gfs2_rg_blocks(al) +
738 RES_DINODE + RES_STATFS + RES_QUOTA, 0); 738 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
739 if (error) 739 if (error)
740 goto out_ipres; 740 goto out_ipres;
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 4129cdb3f0d8..571abe97b42a 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
23 fd->search_key = ptr; 23 fd->search_key = ptr;
24 fd->key = ptr + tree->max_key_len + 2; 24 fd->key = ptr + tree->max_key_len + 2;
25 dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); 25 dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
26 down(&tree->tree_lock); 26 mutex_lock(&tree->tree_lock);
27 return 0; 27 return 0;
28} 28}
29 29
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
32 hfs_bnode_put(fd->bnode); 32 hfs_bnode_put(fd->bnode);
33 kfree(fd->search_key); 33 kfree(fd->search_key);
34 dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); 34 dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
35 up(&fd->tree->tree_lock); 35 mutex_unlock(&fd->tree->tree_lock);
36 fd->tree = NULL; 36 fd->tree = NULL;
37} 37}
38 38
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 38a0a9917d7f..3ebc437736fe 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
27 if (!tree) 27 if (!tree)
28 return NULL; 28 return NULL;
29 29
30 init_MUTEX(&tree->tree_lock); 30 mutex_init(&tree->tree_lock);
31 spin_lock_init(&tree->hash_lock); 31 spin_lock_init(&tree->hash_lock);
32 /* Set the correct compare function */ 32 /* Set the correct compare function */
33 tree->sb = sb; 33 tree->sb = sb;
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index cc51905ac21d..2a1d712f85dc 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -33,7 +33,7 @@ struct hfs_btree {
33 unsigned int depth; 33 unsigned int depth;
34 34
35 //unsigned int map1_size, map_size; 35 //unsigned int map1_size, map_size;
36 struct semaphore tree_lock; 36 struct mutex tree_lock;
37 37
38 unsigned int pages_per_bnode; 38 unsigned int pages_per_bnode;
39 spinlock_t hash_lock; 39 spinlock_t hash_lock;
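The hfs hunks replace struct semaphore/init_MUTEX/down/up with the dedicated mutex API, the standard conversion for a binary semaphore used as a lock: a mutex encodes the single-holder intent and gets debugging support. The whole pattern in miniature (pthread stand-ins for the kernel calls):

#include <pthread.h>
#include <stdio.h>

struct btree {
	pthread_mutex_t tree_lock;	/* was: struct semaphore */
	int cnid;
};

static void btree_init(struct btree *t)
{
	pthread_mutex_init(&t->tree_lock, NULL);	/* was: init_MUTEX() */
}

static void find_init(struct btree *t)
{
	pthread_mutex_lock(&t->tree_lock);		/* was: down() */
}

static void find_exit(struct btree *t)
{
	pthread_mutex_unlock(&t->tree_lock);		/* was: up() */
}

int main(void)
{
	struct btree t = { .cnid = 4 };
	btree_init(&t);
	find_init(&t);
	printf("holding tree_lock for cnid %d\n", t.cnid);
	find_exit(&t);
	return 0;
}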
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 5007a41f1be9..d182438c7ae4 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
23 fd->search_key = ptr; 23 fd->search_key = ptr;
24 fd->key = ptr + tree->max_key_len + 2; 24 fd->key = ptr + tree->max_key_len + 2;
25 dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); 25 dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
26 down(&tree->tree_lock); 26 mutex_lock(&tree->tree_lock);
27 return 0; 27 return 0;
28} 28}
29 29
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
32 hfs_bnode_put(fd->bnode); 32 hfs_bnode_put(fd->bnode);
33 kfree(fd->search_key); 33 kfree(fd->search_key);
34 dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); 34 dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
35 up(&fd->tree->tree_lock); 35 mutex_unlock(&fd->tree->tree_lock);
36 fd->tree = NULL; 36 fd->tree = NULL;
37} 37}
38 38
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
52 rec = (e + b) / 2; 52 rec = (e + b) / 2;
53 len = hfs_brec_lenoff(bnode, rec, &off); 53 len = hfs_brec_lenoff(bnode, rec, &off);
54 keylen = hfs_brec_keylen(bnode, rec); 54 keylen = hfs_brec_keylen(bnode, rec);
55 if (keylen == 0) {
56 res = -EINVAL;
57 goto fail;
58 }
55 hfs_bnode_read(bnode, fd->key, off, keylen); 59 hfs_bnode_read(bnode, fd->key, off, keylen);
56 cmpval = bnode->tree->keycmp(fd->key, fd->search_key); 60 cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
57 if (!cmpval) { 61 if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
67 if (rec != e && e >= 0) { 71 if (rec != e && e >= 0) {
68 len = hfs_brec_lenoff(bnode, e, &off); 72 len = hfs_brec_lenoff(bnode, e, &off);
69 keylen = hfs_brec_keylen(bnode, e); 73 keylen = hfs_brec_keylen(bnode, e);
74 if (keylen == 0) {
75 res = -EINVAL;
76 goto fail;
77 }
70 hfs_bnode_read(bnode, fd->key, off, keylen); 78 hfs_bnode_read(bnode, fd->key, off, keylen);
71 } 79 }
72done: 80done:
@@ -75,6 +83,7 @@ done:
75 fd->keylength = keylen; 83 fd->keylength = keylen;
76 fd->entryoffset = off + keylen; 84 fd->entryoffset = off + keylen;
77 fd->entrylength = len - keylen; 85 fd->entrylength = len - keylen;
86fail:
78 return res; 87 return res;
79} 88}
80 89
@@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
198 207
199 len = hfs_brec_lenoff(bnode, fd->record, &off); 208 len = hfs_brec_lenoff(bnode, fd->record, &off);
200 keylen = hfs_brec_keylen(bnode, fd->record); 209 keylen = hfs_brec_keylen(bnode, fd->record);
210 if (keylen == 0) {
211 res = -EINVAL;
212 goto out;
213 }
201 fd->keyoffset = off; 214 fd->keyoffset = off;
202 fd->keylength = keylen; 215 fd->keylength = keylen;
203 fd->entryoffset = off + keylen; 216 fd->entryoffset = off + keylen;
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ea30afc2a03c..ad57f5991eb1 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -17,6 +17,7 @@
17 17
18int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max) 18int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
19{ 19{
20 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
20 struct page *page; 21 struct page *page;
21 struct address_space *mapping; 22 struct address_space *mapping;
22 __be32 *pptr, *curr, *end; 23 __be32 *pptr, *curr, *end;
@@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
29 return size; 30 return size;
30 31
31 dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); 32 dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
32 mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); 33 mutex_lock(&sbi->alloc_mutex);
33 mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; 34 mapping = sbi->alloc_file->i_mapping;
34 page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); 35 page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
35 if (IS_ERR(page)) { 36 if (IS_ERR(page)) {
36 start = size; 37 start = size;
@@ -150,16 +151,17 @@ done:
150 set_page_dirty(page); 151 set_page_dirty(page);
151 kunmap(page); 152 kunmap(page);
152 *max = offset + (curr - pptr) * 32 + i - start; 153 *max = offset + (curr - pptr) * 32 + i - start;
153 HFSPLUS_SB(sb).free_blocks -= *max; 154 sbi->free_blocks -= *max;
154 sb->s_dirt = 1; 155 sb->s_dirt = 1;
155 dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); 156 dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
156out: 157out:
157 mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); 158 mutex_unlock(&sbi->alloc_mutex);
158 return start; 159 return start;
159} 160}
160 161
161int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) 162int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
162{ 163{
164 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
163 struct page *page; 165 struct page *page;
164 struct address_space *mapping; 166 struct address_space *mapping;
165 __be32 *pptr, *curr, *end; 167 __be32 *pptr, *curr, *end;
@@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
172 174
173 dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); 175 dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
174 /* are all of the bits in range? */ 176 /* are all of the bits in range? */
175 if ((offset + count) > HFSPLUS_SB(sb).total_blocks) 177 if ((offset + count) > sbi->total_blocks)
176 return -2; 178 return -2;
177 179
178 mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); 180 mutex_lock(&sbi->alloc_mutex);
179 mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; 181 mapping = sbi->alloc_file->i_mapping;
180 pnr = offset / PAGE_CACHE_BITS; 182 pnr = offset / PAGE_CACHE_BITS;
181 page = read_mapping_page(mapping, pnr, NULL); 183 page = read_mapping_page(mapping, pnr, NULL);
182 pptr = kmap(page); 184 pptr = kmap(page);
@@ -224,9 +226,9 @@ done:
224out: 226out:
225 set_page_dirty(page); 227 set_page_dirty(page);
226 kunmap(page); 228 kunmap(page);
227 HFSPLUS_SB(sb).free_blocks += len; 229 sbi->free_blocks += len;
228 sb->s_dirt = 1; 230 sb->s_dirt = 1;
229 mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); 231 mutex_unlock(&sbi->alloc_mutex);
230 232
231 return 0; 233 return 0;
232} 234}
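bitmap.c stops borrowing the allocation file's i_mutex and takes a dedicated alloc_mutex in the superblock info, also caching the HFSPLUS_SB() pointer once per function instead of re-deriving it at every use. A sketch of the cached-pointer-plus-dedicated-lock shape (pthread mutex standing in):

#include <pthread.h>
#include <stdio.h>

struct sb_info {
	pthread_mutex_t alloc_mutex;	/* serializes bitmap updates */
	unsigned int free_blocks;
};

/* Hypothetical free path: fetch the sb_info pointer once, take the
 * dedicated lock, adjust the counter, drop the lock. */
static void block_free(struct sb_info *sbi, unsigned int len)
{
	pthread_mutex_lock(&sbi->alloc_mutex);
	sbi->free_blocks += len;
	pthread_mutex_unlock(&sbi->alloc_mutex);
}

int main(void)
{
	struct sb_info sbi = { PTHREAD_MUTEX_INITIALIZER, 100 };
	block_free(&sbi, 8);
	printf("free_blocks=%u\n", sbi.free_blocks);	/* 108 */
	return 0;
}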
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index c88e5d72a402..2f39d05443e1 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -42,10 +42,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
42 recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); 42 recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
43 if (!recoff) 43 if (!recoff)
44 return 0; 44 return 0;
45 if (node->tree->attributes & HFS_TREE_BIGKEYS) 45
46 retval = hfs_bnode_read_u16(node, recoff) + 2; 46 retval = hfs_bnode_read_u16(node, recoff) + 2;
47 else 47 if (retval > node->tree->max_key_len + 2) {
48 retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1; 48 printk(KERN_ERR "hfs: keylen %d too large\n",
49 retval);
50 retval = 0;
51 }
49 } 52 }
50 return retval; 53 return retval;
51} 54}
@@ -216,7 +219,7 @@ skip:
216static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) 219static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
217{ 220{
218 struct hfs_btree *tree; 221 struct hfs_btree *tree;
219 struct hfs_bnode *node, *new_node; 222 struct hfs_bnode *node, *new_node, *next_node;
220 struct hfs_bnode_desc node_desc; 223 struct hfs_bnode_desc node_desc;
221 int num_recs, new_rec_off, new_off, old_rec_off; 224 int num_recs, new_rec_off, new_off, old_rec_off;
222 int data_start, data_end, size; 225 int data_start, data_end, size;
@@ -235,6 +238,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
235 new_node->type = node->type; 238 new_node->type = node->type;
236 new_node->height = node->height; 239 new_node->height = node->height;
237 240
241 if (node->next)
242 next_node = hfs_bnode_find(tree, node->next);
243 else
244 next_node = NULL;
245
246 if (IS_ERR(next_node)) {
247 hfs_bnode_put(node);
248 hfs_bnode_put(new_node);
249 return next_node;
250 }
251
238 size = tree->node_size / 2 - node->num_recs * 2 - 14; 252 size = tree->node_size / 2 - node->num_recs * 2 - 14;
239 old_rec_off = tree->node_size - 4; 253 old_rec_off = tree->node_size - 4;
240 num_recs = 1; 254 num_recs = 1;
@@ -248,6 +262,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
248 /* panic? */ 262 /* panic? */
249 hfs_bnode_put(node); 263 hfs_bnode_put(node);
250 hfs_bnode_put(new_node); 264 hfs_bnode_put(new_node);
265 if (next_node)
266 hfs_bnode_put(next_node);
251 return ERR_PTR(-ENOSPC); 267 return ERR_PTR(-ENOSPC);
252 } 268 }
253 269
@@ -302,8 +318,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
302 hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); 318 hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
303 319
304 /* update next bnode header */ 320 /* update next bnode header */
305 if (new_node->next) { 321 if (next_node) {
306 struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
307 next_node->prev = new_node->this; 322 next_node->prev = new_node->this;
308 hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); 323 hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
309 node_desc.prev = cpu_to_be32(next_node->prev); 324 node_desc.prev = cpu_to_be32(next_node->prev);
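The split rework fetches the successor node before any modification, so a failed hfs_bnode_find can still unwind with simple puts; previously the lookup happened after the split was underway, with no error handling at all. A sketch of the acquire-everything-then-mutate ordering with a single unwind path (all names hypothetical):

#include <stdio.h>
#include <stdlib.h>

struct node { int id; };

/* Hypothetical lookup that can fail. */
static struct node *node_find(int id)
{
	if (id > 100)		/* simulate a read failure */
		return NULL;
	struct node *n = malloc(sizeof(*n));
	if (n)
		n->id = id;
	return n;
}

static int split(int this, int next)
{
	struct node *new_node = node_find(this + 1);
	struct node *next_node = next ? node_find(next) : NULL;

	if (!new_node)
		return -1;
	if (next && !next_node) {	/* acquire failed: nothing mutated yet */
		free(new_node);
		return -1;
	}
	/* ... safe to rewrite records and relink neighbours here ... */
	free(next_node);
	free(new_node);
	return 0;
}

int main(void)
{
	printf("%d %d\n", split(3, 4), split(3, 999));	/* 0 -1 */
	return 0;
}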
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index e49fcee1e293..22e4d4e32999 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
30 if (!tree) 30 if (!tree)
31 return NULL; 31 return NULL;
32 32
33 init_MUTEX(&tree->tree_lock); 33 mutex_init(&tree->tree_lock);
34 spin_lock_init(&tree->hash_lock); 34 spin_lock_init(&tree->hash_lock);
35 tree->sb = sb; 35 tree->sb = sb;
36 tree->cnid = id; 36 tree->cnid = id;
@@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
39 goto free_tree; 39 goto free_tree;
40 tree->inode = inode; 40 tree->inode = inode;
41 41
42 if (!HFSPLUS_I(tree->inode)->first_blocks) {
43 printk(KERN_ERR
44 "hfs: invalid btree extent records (0 size).\n");
45 goto free_inode;
46 }
47
42 mapping = tree->inode->i_mapping; 48 mapping = tree->inode->i_mapping;
43 page = read_mapping_page(mapping, 0, NULL); 49 page = read_mapping_page(mapping, 0, NULL);
44 if (IS_ERR(page)) 50 if (IS_ERR(page))
45 goto free_tree; 51 goto free_inode;
46 52
47 /* Load the header */ 53 /* Load the header */
48 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); 54 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -57,27 +63,56 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
57 tree->max_key_len = be16_to_cpu(head->max_key_len); 63 tree->max_key_len = be16_to_cpu(head->max_key_len);
58 tree->depth = be16_to_cpu(head->depth); 64 tree->depth = be16_to_cpu(head->depth);
59 65
60 /* Set the correct compare function */ 66 /* Verify the tree and set the correct compare function */
61 if (id == HFSPLUS_EXT_CNID) { 67 switch (id) {
68 case HFSPLUS_EXT_CNID:
69 if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
70 printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
71 tree->max_key_len);
72 goto fail_page;
73 }
74 if (tree->attributes & HFS_TREE_VARIDXKEYS) {
75 printk(KERN_ERR "hfs: invalid extent btree flag\n");
76 goto fail_page;
77 }
78
62 tree->keycmp = hfsplus_ext_cmp_key; 79 tree->keycmp = hfsplus_ext_cmp_key;
63 } else if (id == HFSPLUS_CAT_CNID) { 80 break;
64 if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) && 81 case HFSPLUS_CAT_CNID:
82 if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
83 printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
84 tree->max_key_len);
85 goto fail_page;
86 }
87 if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
88 printk(KERN_ERR "hfs: invalid catalog btree flag\n");
89 goto fail_page;
90 }
91
92 if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
65 (head->key_type == HFSPLUS_KEY_BINARY)) 93 (head->key_type == HFSPLUS_KEY_BINARY))
66 tree->keycmp = hfsplus_cat_bin_cmp_key; 94 tree->keycmp = hfsplus_cat_bin_cmp_key;
67 else { 95 else {
68 tree->keycmp = hfsplus_cat_case_cmp_key; 96 tree->keycmp = hfsplus_cat_case_cmp_key;
69 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD; 97 set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
70 } 98 }
71 } else { 99 break;
100 default:
72 printk(KERN_ERR "hfs: unknown B*Tree requested\n"); 101 printk(KERN_ERR "hfs: unknown B*Tree requested\n");
73 goto fail_page; 102 goto fail_page;
74 } 103 }
75 104
105 if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
106 printk(KERN_ERR "hfs: invalid btree flag\n");
107 goto fail_page;
108 }
109
76 size = tree->node_size; 110 size = tree->node_size;
77 if (!is_power_of_2(size)) 111 if (!is_power_of_2(size))
78 goto fail_page; 112 goto fail_page;
79 if (!tree->node_count) 113 if (!tree->node_count)
80 goto fail_page; 114 goto fail_page;
115
81 tree->node_size_shift = ffs(size) - 1; 116 tree->node_size_shift = ffs(size) - 1;
82 117
83 tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 118 tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -87,10 +122,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
87 return tree; 122 return tree;
88 123
89 fail_page: 124 fail_page:
90 tree->inode->i_mapping->a_ops = &hfsplus_aops;
91 page_cache_release(page); 125 page_cache_release(page);
92 free_tree: 126 free_inode:
127 tree->inode->i_mapping->a_ops = &hfsplus_aops;
93 iput(tree->inode); 128 iput(tree->inode);
129 free_tree:
94 kfree(tree); 130 kfree(tree);
95 return NULL; 131 return NULL;
96} 132}
@@ -192,17 +228,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
192 228
193 while (!tree->free_nodes) { 229 while (!tree->free_nodes) {
194 struct inode *inode = tree->inode; 230 struct inode *inode = tree->inode;
231 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
195 u32 count; 232 u32 count;
196 int res; 233 int res;
197 234
198 res = hfsplus_file_extend(inode); 235 res = hfsplus_file_extend(inode);
199 if (res) 236 if (res)
200 return ERR_PTR(res); 237 return ERR_PTR(res);
201 HFSPLUS_I(inode).phys_size = inode->i_size = 238 hip->phys_size = inode->i_size =
202 (loff_t)HFSPLUS_I(inode).alloc_blocks << 239 (loff_t)hip->alloc_blocks <<
203 HFSPLUS_SB(tree->sb).alloc_blksz_shift; 240 HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
204 HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks << 241 hip->fs_blocks =
205 HFSPLUS_SB(tree->sb).fs_shift; 242 hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
206 inode_set_bytes(inode, inode->i_size); 243 inode_set_bytes(inode, inode->i_size);
207 count = inode->i_size >> tree->node_size_shift; 244 count = inode->i_size >> tree->node_size_shift;
208 tree->free_nodes = count - tree->node_count; 245 tree->free_nodes = count - tree->node_count;
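hfs_btree_open now cross-checks the on-disk header (max_key_len and flags per tree type) before trusting it, and the error labels were reordered so each one undoes exactly what succeeded: fail_page releases the page, free_inode drops the inode, free_tree frees the tree. A compact sketch of that layered goto unwind, with hypothetical names:

#include <stdio.h>
#include <stdlib.h>

struct tree { void *inode, *page; };

static int open_tree(int valid_header)
{
	struct tree *t = malloc(sizeof(*t));
	if (!t)
		return -1;
	t->inode = malloc(1);
	if (!t->inode)
		goto free_tree;
	t->page = malloc(1);
	if (!t->page)
		goto free_inode;
	if (!valid_header)	/* e.g. wrong max_key_len for this CNID */
		goto fail_page;
	free(t->page); free(t->inode); free(t);	/* success path elided */
	return 0;

fail_page:			/* undo in reverse order of acquisition */
	free(t->page);
free_inode:
	free(t->inode);
free_tree:
	free(t);
	return -1;
}

int main(void)
{
	printf("%d %d\n", open_tree(1), open_tree(0));	/* 0 -1 */
	return 0;
}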
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index f6874acb2cf2..8af45fc5b051 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
67 key->key_len = cpu_to_be16(6 + ustrlen); 67 key->key_len = cpu_to_be16(6 + ustrlen);
68} 68}
69 69
70static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) 70void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
71{ 71{
72 if (inode->i_flags & S_IMMUTABLE) 72 if (inode->i_flags & S_IMMUTABLE)
73 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; 73 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -77,15 +77,24 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
77 perms->rootflags |= HFSPLUS_FLG_APPEND; 77 perms->rootflags |= HFSPLUS_FLG_APPEND;
78 else 78 else
79 perms->rootflags &= ~HFSPLUS_FLG_APPEND; 79 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
80 HFSPLUS_I(inode).rootflags = perms->rootflags; 80
81 HFSPLUS_I(inode).userflags = perms->userflags; 81 perms->userflags = HFSPLUS_I(inode)->userflags;
82 perms->mode = cpu_to_be16(inode->i_mode); 82 perms->mode = cpu_to_be16(inode->i_mode);
83 perms->owner = cpu_to_be32(inode->i_uid); 83 perms->owner = cpu_to_be32(inode->i_uid);
84 perms->group = cpu_to_be32(inode->i_gid); 84 perms->group = cpu_to_be32(inode->i_gid);
85
86 if (S_ISREG(inode->i_mode))
87 perms->dev = cpu_to_be32(inode->i_nlink);
88 else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
89 perms->dev = cpu_to_be32(inode->i_rdev);
90 else
91 perms->dev = 0;
85} 92}
86 93
87static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode) 94static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
88{ 95{
96 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
97
89 if (S_ISDIR(inode->i_mode)) { 98 if (S_ISDIR(inode->i_mode)) {
90 struct hfsplus_cat_folder *folder; 99 struct hfsplus_cat_folder *folder;
91 100
@@ -93,13 +102,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
93 memset(folder, 0, sizeof(*folder)); 102 memset(folder, 0, sizeof(*folder));
94 folder->type = cpu_to_be16(HFSPLUS_FOLDER); 103 folder->type = cpu_to_be16(HFSPLUS_FOLDER);
95 folder->id = cpu_to_be32(inode->i_ino); 104 folder->id = cpu_to_be32(inode->i_ino);
96 HFSPLUS_I(inode).create_date = 105 HFSPLUS_I(inode)->create_date =
97 folder->create_date = 106 folder->create_date =
98 folder->content_mod_date = 107 folder->content_mod_date =
99 folder->attribute_mod_date = 108 folder->attribute_mod_date =
100 folder->access_date = hfsp_now2mt(); 109 folder->access_date = hfsp_now2mt();
101 hfsplus_set_perms(inode, &folder->permissions); 110 hfsplus_cat_set_perms(inode, &folder->permissions);
102 if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir) 111 if (inode == sbi->hidden_dir)
103 /* invisible and namelocked */ 112 /* invisible and namelocked */
104 folder->user_info.frFlags = cpu_to_be16(0x5000); 113 folder->user_info.frFlags = cpu_to_be16(0x5000);
105 return sizeof(*folder); 114 return sizeof(*folder);
@@ -111,19 +120,19 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
111 file->type = cpu_to_be16(HFSPLUS_FILE); 120 file->type = cpu_to_be16(HFSPLUS_FILE);
112 file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS); 121 file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
113 file->id = cpu_to_be32(cnid); 122 file->id = cpu_to_be32(cnid);
114 HFSPLUS_I(inode).create_date = 123 HFSPLUS_I(inode)->create_date =
115 file->create_date = 124 file->create_date =
116 file->content_mod_date = 125 file->content_mod_date =
117 file->attribute_mod_date = 126 file->attribute_mod_date =
118 file->access_date = hfsp_now2mt(); 127 file->access_date = hfsp_now2mt();
119 if (cnid == inode->i_ino) { 128 if (cnid == inode->i_ino) {
120 hfsplus_set_perms(inode, &file->permissions); 129 hfsplus_cat_set_perms(inode, &file->permissions);
121 if (S_ISLNK(inode->i_mode)) { 130 if (S_ISLNK(inode->i_mode)) {
122 file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE); 131 file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
123 file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR); 132 file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
124 } else { 133 } else {
125 file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type); 134 file->user_info.fdType = cpu_to_be32(sbi->type);
126 file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator); 135 file->user_info.fdCreator = cpu_to_be32(sbi->creator);
127 } 136 }
128 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) 137 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
129 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 138 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
@@ -131,8 +140,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
131 file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE); 140 file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
132 file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR); 141 file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
133 file->user_info.fdFlags = cpu_to_be16(0x100); 142 file->user_info.fdFlags = cpu_to_be16(0x100);
134 file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date; 143 file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
135 file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev); 144 file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
136 } 145 }
137 return sizeof(*file); 146 return sizeof(*file);
138 } 147 }
@@ -180,15 +189,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
180 189
181int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) 190int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
182{ 191{
192 struct super_block *sb = dir->i_sb;
183 struct hfs_find_data fd; 193 struct hfs_find_data fd;
184 struct super_block *sb;
185 hfsplus_cat_entry entry; 194 hfsplus_cat_entry entry;
186 int entry_size; 195 int entry_size;
187 int err; 196 int err;
188 197
189 dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); 198 dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
190 sb = dir->i_sb; 199 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
191 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
192 200
193 hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); 201 hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
194 entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? 202 entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
@@ -234,7 +242,7 @@ err2:
234 242
235int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) 243int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
236{ 244{
237 struct super_block *sb; 245 struct super_block *sb = dir->i_sb;
238 struct hfs_find_data fd; 246 struct hfs_find_data fd;
239 struct hfsplus_fork_raw fork; 247 struct hfsplus_fork_raw fork;
240 struct list_head *pos; 248 struct list_head *pos;
@@ -242,8 +250,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
242 u16 type; 250 u16 type;
243 251
244 dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); 252 dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
245 sb = dir->i_sb; 253 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
246 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
247 254
248 if (!str) { 255 if (!str) {
249 int len; 256 int len;
@@ -279,7 +286,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
279 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); 286 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
280 } 287 }
281 288
282 list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) { 289 list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
283 struct hfsplus_readdir_data *rd = 290 struct hfsplus_readdir_data *rd =
284 list_entry(pos, struct hfsplus_readdir_data, list); 291 list_entry(pos, struct hfsplus_readdir_data, list);
285 if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) 292 if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
@@ -312,7 +319,7 @@ int hfsplus_rename_cat(u32 cnid,
312 struct inode *src_dir, struct qstr *src_name, 319 struct inode *src_dir, struct qstr *src_name,
313 struct inode *dst_dir, struct qstr *dst_name) 320 struct inode *dst_dir, struct qstr *dst_name)
314{ 321{
315 struct super_block *sb; 322 struct super_block *sb = src_dir->i_sb;
316 struct hfs_find_data src_fd, dst_fd; 323 struct hfs_find_data src_fd, dst_fd;
317 hfsplus_cat_entry entry; 324 hfsplus_cat_entry entry;
318 int entry_size, type; 325 int entry_size, type;
@@ -320,8 +327,7 @@ int hfsplus_rename_cat(u32 cnid,
320 327
321 dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, 328 dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
322 dst_dir->i_ino, dst_name->name); 329 dst_dir->i_ino, dst_name->name);
323 sb = src_dir->i_sb; 330 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
324 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd);
325 dst_fd = src_fd; 331 dst_fd = src_fd;
326 332
327 /* find the old dir entry and read the data */ 333 /* find the old dir entry and read the data */
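Throughout these hunks HFSPLUS_SB(sb).field becomes HFSPLUS_SB(sb)->field: the accessor now returns a pointer into the superblock's private data rather than an embedded struct, the usual container pattern for per-superblock state. Roughly, assuming the standard s_fs_info layout (a sketch; the real accessor lives in the hfsplus headers):

#include <stdio.h>

struct super_block { void *s_fs_info; };
struct hfsplus_sb_info { unsigned int file_count; };

static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
{
	return sb->s_fs_info;	/* per-sb private data, one deref per use */
}

int main(void)
{
	struct hfsplus_sb_info sbi = { .file_count = 3 };
	struct super_block sb = { .s_fs_info = &sbi };
	HFSPLUS_SB(&sb)->file_count++;
	printf("%u\n", sbi.file_count);	/* 4 */
	return 0;
}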
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 764fd1bdca88..d236d85ec9d7 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
39 39
40 dentry->d_op = &hfsplus_dentry_operations; 40 dentry->d_op = &hfsplus_dentry_operations;
41 dentry->d_fsdata = NULL; 41 dentry->d_fsdata = NULL;
42 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 42 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); 43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
44again: 44again:
45 err = hfs_brec_read(&fd, &entry, sizeof(entry)); 45 err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -68,9 +68,9 @@ again:
68 cnid = be32_to_cpu(entry.file.id); 68 cnid = be32_to_cpu(entry.file.id);
69 if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) && 69 if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
70 entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && 70 entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
71 (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date || 71 (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
72 entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) && 72 entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
73 HFSPLUS_SB(sb).hidden_dir) { 73 HFSPLUS_SB(sb)->hidden_dir) {
74 struct qstr str; 74 struct qstr str;
75 char name[32]; 75 char name[32];
76 76
@@ -86,7 +86,8 @@ again:
86 linkid = be32_to_cpu(entry.file.permissions.dev); 86 linkid = be32_to_cpu(entry.file.permissions.dev);
87 str.len = sprintf(name, "iNode%d", linkid); 87 str.len = sprintf(name, "iNode%d", linkid);
88 str.name = name; 88 str.name = name;
89 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str); 89 hfsplus_cat_build_key(sb, fd.search_key,
90 HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
90 goto again; 91 goto again;
91 } 92 }
92 } else if (!dentry->d_fsdata) 93 } else if (!dentry->d_fsdata)
@@ -101,7 +102,7 @@ again:
101 if (IS_ERR(inode)) 102 if (IS_ERR(inode))
102 return ERR_CAST(inode); 103 return ERR_CAST(inode);
103 if (S_ISREG(inode->i_mode)) 104 if (S_ISREG(inode->i_mode))
104 HFSPLUS_I(inode).dev = linkid; 105 HFSPLUS_I(inode)->linkid = linkid;
105out: 106out:
106 d_add(dentry, inode); 107 d_add(dentry, inode);
107 return NULL; 108 return NULL;
@@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
124 if (filp->f_pos >= inode->i_size) 125 if (filp->f_pos >= inode->i_size)
125 return 0; 126 return 0;
126 127
127 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 128 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
128 hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); 129 hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
129 err = hfs_brec_find(&fd); 130 err = hfs_brec_find(&fd);
130 if (err) 131 if (err)
@@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
180 err = -EIO; 181 err = -EIO;
181 goto out; 182 goto out;
182 } 183 }
183 if (HFSPLUS_SB(sb).hidden_dir && 184 if (HFSPLUS_SB(sb)->hidden_dir &&
184 HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id)) 185 HFSPLUS_SB(sb)->hidden_dir->i_ino ==
186 be32_to_cpu(entry.folder.id))
185 goto next; 187 goto next;
186 if (filldir(dirent, strbuf, len, filp->f_pos, 188 if (filldir(dirent, strbuf, len, filp->f_pos,
187 be32_to_cpu(entry.folder.id), DT_DIR)) 189 be32_to_cpu(entry.folder.id), DT_DIR))
@@ -217,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
217 } 219 }
218 filp->private_data = rd; 220 filp->private_data = rd;
219 rd->file = filp; 221 rd->file = filp;
220 list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list); 222 list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
221 } 223 }
222 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); 224 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
223out: 225out:
@@ -229,38 +231,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
229{ 231{
230 struct hfsplus_readdir_data *rd = file->private_data; 232 struct hfsplus_readdir_data *rd = file->private_data;
231 if (rd) { 233 if (rd) {
234 mutex_lock(&inode->i_mutex);
232 list_del(&rd->list); 235 list_del(&rd->list);
236 mutex_unlock(&inode->i_mutex);
233 kfree(rd); 237 kfree(rd);
234 } 238 }
235 return 0; 239 return 0;
236} 240}
237 241
238static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
239 struct nameidata *nd)
240{
241 struct inode *inode;
242 int res;
243
244 inode = hfsplus_new_inode(dir->i_sb, mode);
245 if (!inode)
246 return -ENOSPC;
247
248 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
249 if (res) {
250 inode->i_nlink = 0;
251 hfsplus_delete_inode(inode);
252 iput(inode);
253 return res;
254 }
255 hfsplus_instantiate(dentry, inode, inode->i_ino);
256 mark_inode_dirty(inode);
257 return 0;
258}
259
260static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, 242static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
261 struct dentry *dst_dentry) 243 struct dentry *dst_dentry)
262{ 244{
263 struct super_block *sb = dst_dir->i_sb; 245 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
264 struct inode *inode = src_dentry->d_inode; 246 struct inode *inode = src_dentry->d_inode;
265 struct inode *src_dir = src_dentry->d_parent->d_inode; 247 struct inode *src_dir = src_dentry->d_parent->d_inode;
266 struct qstr str; 248 struct qstr str;
@@ -270,7 +252,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
270 252
271 if (HFSPLUS_IS_RSRC(inode)) 253 if (HFSPLUS_IS_RSRC(inode))
272 return -EPERM; 254 return -EPERM;
255 if (!S_ISREG(inode->i_mode))
256 return -EPERM;
273 257
258 mutex_lock(&sbi->vh_mutex);
274 if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) { 259 if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
275 for (;;) { 260 for (;;) {
276 get_random_bytes(&id, sizeof(cnid)); 261 get_random_bytes(&id, sizeof(cnid));
@@ -279,40 +264,41 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
279 str.len = sprintf(name, "iNode%d", id); 264 str.len = sprintf(name, "iNode%d", id);
280 res = hfsplus_rename_cat(inode->i_ino, 265 res = hfsplus_rename_cat(inode->i_ino,
281 src_dir, &src_dentry->d_name, 266 src_dir, &src_dentry->d_name,
282 HFSPLUS_SB(sb).hidden_dir, &str); 267 sbi->hidden_dir, &str);
283 if (!res) 268 if (!res)
284 break; 269 break;
285 if (res != -EEXIST) 270 if (res != -EEXIST)
286 return res; 271 goto out;
287 } 272 }
288 HFSPLUS_I(inode).dev = id; 273 HFSPLUS_I(inode)->linkid = id;
289 cnid = HFSPLUS_SB(sb).next_cnid++; 274 cnid = sbi->next_cnid++;
290 src_dentry->d_fsdata = (void *)(unsigned long)cnid; 275 src_dentry->d_fsdata = (void *)(unsigned long)cnid;
291 res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode); 276 res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
292 if (res) 277 if (res)
293 /* panic? */ 278 /* panic? */
294 return res; 279 goto out;
295 HFSPLUS_SB(sb).file_count++; 280 sbi->file_count++;
296 } 281 }
297 cnid = HFSPLUS_SB(sb).next_cnid++; 282 cnid = sbi->next_cnid++;
298 res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode); 283 res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
299 if (res) 284 if (res)
300 return res; 285 goto out;
301 286
302 inc_nlink(inode); 287 inc_nlink(inode);
303 hfsplus_instantiate(dst_dentry, inode, cnid); 288 hfsplus_instantiate(dst_dentry, inode, cnid);
304 atomic_inc(&inode->i_count); 289 atomic_inc(&inode->i_count);
305 inode->i_ctime = CURRENT_TIME_SEC; 290 inode->i_ctime = CURRENT_TIME_SEC;
306 mark_inode_dirty(inode); 291 mark_inode_dirty(inode);
307 HFSPLUS_SB(sb).file_count++; 292 sbi->file_count++;
308 sb->s_dirt = 1; 293 dst_dir->i_sb->s_dirt = 1;
309 294out:
310 return 0; 295 mutex_unlock(&sbi->vh_mutex);
296 return res;
311} 297}
312 298
313static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) 299static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
314{ 300{
315 struct super_block *sb = dir->i_sb; 301 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
316 struct inode *inode = dentry->d_inode; 302 struct inode *inode = dentry->d_inode;
317 struct qstr str; 303 struct qstr str;
318 char name[32]; 304 char name[32];
@@ -322,21 +308,22 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
322 if (HFSPLUS_IS_RSRC(inode)) 308 if (HFSPLUS_IS_RSRC(inode))
323 return -EPERM; 309 return -EPERM;
324 310
311 mutex_lock(&sbi->vh_mutex);
325 cnid = (u32)(unsigned long)dentry->d_fsdata; 312 cnid = (u32)(unsigned long)dentry->d_fsdata;
326 if (inode->i_ino == cnid && 313 if (inode->i_ino == cnid &&
327 atomic_read(&HFSPLUS_I(inode).opencnt)) { 314 atomic_read(&HFSPLUS_I(inode)->opencnt)) {
328 str.name = name; 315 str.name = name;
329 str.len = sprintf(name, "temp%lu", inode->i_ino); 316 str.len = sprintf(name, "temp%lu", inode->i_ino);
330 res = hfsplus_rename_cat(inode->i_ino, 317 res = hfsplus_rename_cat(inode->i_ino,
331 dir, &dentry->d_name, 318 dir, &dentry->d_name,
332 HFSPLUS_SB(sb).hidden_dir, &str); 319 sbi->hidden_dir, &str);
333 if (!res) 320 if (!res)
334 inode->i_flags |= S_DEAD; 321 inode->i_flags |= S_DEAD;
335 return res; 322 goto out;
336 } 323 }
337 res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); 324 res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
338 if (res) 325 if (res)
339 return res; 326 goto out;
340 327
341 if (inode->i_nlink > 0) 328 if (inode->i_nlink > 0)
342 drop_nlink(inode); 329 drop_nlink(inode);
@@ -344,10 +331,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
344 clear_nlink(inode); 331 clear_nlink(inode);
345 if (!inode->i_nlink) { 332 if (!inode->i_nlink) {
346 if (inode->i_ino != cnid) { 333 if (inode->i_ino != cnid) {
347 HFSPLUS_SB(sb).file_count--; 334 sbi->file_count--;
348 if (!atomic_read(&HFSPLUS_I(inode).opencnt)) { 335 if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
349 res = hfsplus_delete_cat(inode->i_ino, 336 res = hfsplus_delete_cat(inode->i_ino,
350 HFSPLUS_SB(sb).hidden_dir, 337 sbi->hidden_dir,
351 NULL); 338 NULL);
352 if (!res) 339 if (!res)
353 hfsplus_delete_inode(inode); 340 hfsplus_delete_inode(inode);
@@ -356,107 +343,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
356 } else 343 } else
357 hfsplus_delete_inode(inode); 344 hfsplus_delete_inode(inode);
358 } else 345 } else
359 HFSPLUS_SB(sb).file_count--; 346 sbi->file_count--;
360 inode->i_ctime = CURRENT_TIME_SEC; 347 inode->i_ctime = CURRENT_TIME_SEC;
361 mark_inode_dirty(inode); 348 mark_inode_dirty(inode);
362 349out:
350 mutex_unlock(&sbi->vh_mutex);
363 return res; 351 return res;
364} 352}
365 353
366static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
367{
368 struct inode *inode;
369 int res;
370
371 inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode);
372 if (!inode)
373 return -ENOSPC;
374
375 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
376 if (res) {
377 inode->i_nlink = 0;
378 hfsplus_delete_inode(inode);
379 iput(inode);
380 return res;
381 }
382 hfsplus_instantiate(dentry, inode, inode->i_ino);
383 mark_inode_dirty(inode);
384 return 0;
385}
386
387static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) 354static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
388{ 355{
389 struct inode *inode; 356 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
357 struct inode *inode = dentry->d_inode;
390 int res; 358 int res;
391 359
392 inode = dentry->d_inode;
393 if (inode->i_size != 2) 360 if (inode->i_size != 2)
394 return -ENOTEMPTY; 361 return -ENOTEMPTY;
362
363 mutex_lock(&sbi->vh_mutex);
395 res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); 364 res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
396 if (res) 365 if (res)
397 return res; 366 goto out;
398 clear_nlink(inode); 367 clear_nlink(inode);
399 inode->i_ctime = CURRENT_TIME_SEC; 368 inode->i_ctime = CURRENT_TIME_SEC;
400 hfsplus_delete_inode(inode); 369 hfsplus_delete_inode(inode);
401 mark_inode_dirty(inode); 370 mark_inode_dirty(inode);
402 return 0; 371out:
372 mutex_unlock(&sbi->vh_mutex);
373 return res;
403} 374}
404 375
405static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, 376static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
406 const char *symname) 377 const char *symname)
407{ 378{
408 struct super_block *sb; 379 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
409 struct inode *inode; 380 struct inode *inode;
410 int res; 381 int res = -ENOSPC;
411 382
412 sb = dir->i_sb; 383 mutex_lock(&sbi->vh_mutex);
413 inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO); 384 inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
414 if (!inode) 385 if (!inode)
415 return -ENOSPC; 386 goto out;
416 387
417 res = page_symlink(inode, symname, strlen(symname) + 1); 388 res = page_symlink(inode, symname, strlen(symname) + 1);
418 if (res) { 389 if (res)
419 inode->i_nlink = 0; 390 goto out_err;
420 hfsplus_delete_inode(inode);
421 iput(inode);
422 return res;
423 }
424 391
425 mark_inode_dirty(inode);
426 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); 392 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
393 if (res)
394 goto out_err;
427 395
428 if (!res) { 396 hfsplus_instantiate(dentry, inode, inode->i_ino);
429 hfsplus_instantiate(dentry, inode, inode->i_ino); 397 mark_inode_dirty(inode);
430 mark_inode_dirty(inode); 398 goto out;
431 }
432 399
400out_err:
401 inode->i_nlink = 0;
402 hfsplus_delete_inode(inode);
403 iput(inode);
404out:
405 mutex_unlock(&sbi->vh_mutex);
433 return res; 406 return res;
434} 407}
435 408
436static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, 409static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
437 int mode, dev_t rdev) 410 int mode, dev_t rdev)
438{ 411{
439 struct super_block *sb; 412 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
440 struct inode *inode; 413 struct inode *inode;
441 int res; 414 int res = -ENOSPC;
442 415
443 sb = dir->i_sb; 416 mutex_lock(&sbi->vh_mutex);
444 inode = hfsplus_new_inode(sb, mode); 417 inode = hfsplus_new_inode(dir->i_sb, mode);
445 if (!inode) 418 if (!inode)
446 return -ENOSPC; 419 goto out;
420
421 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
422 init_special_inode(inode, mode, rdev);
447 423
448 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); 424 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
449 if (res) { 425 if (res) {
450 inode->i_nlink = 0; 426 inode->i_nlink = 0;
451 hfsplus_delete_inode(inode); 427 hfsplus_delete_inode(inode);
452 iput(inode); 428 iput(inode);
453 return res; 429 goto out;
454 } 430 }
455 init_special_inode(inode, mode, rdev); 431
456 hfsplus_instantiate(dentry, inode, inode->i_ino); 432 hfsplus_instantiate(dentry, inode, inode->i_ino);
457 mark_inode_dirty(inode); 433 mark_inode_dirty(inode);
434out:
435 mutex_unlock(&sbi->vh_mutex);
436 return res;
437}
458 438
459 return 0; 439static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
440 struct nameidata *nd)
441{
442 return hfsplus_mknod(dir, dentry, mode, 0);
443}
444
445static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
446{
447 return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
460} 448}
461 449
462static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, 450static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -466,7 +454,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
466 454
467 /* Unlink destination if it already exists */ 455 /* Unlink destination if it already exists */
468 if (new_dentry->d_inode) { 456 if (new_dentry->d_inode) {
469 res = hfsplus_unlink(new_dir, new_dentry); 457 if (S_ISDIR(new_dentry->d_inode->i_mode))
458 res = hfsplus_rmdir(new_dir, new_dentry);
459 else
460 res = hfsplus_unlink(new_dir, new_dentry);
470 if (res) 461 if (res)
471 return res; 462 return res;
472 } 463 }
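The dir.c rework wraps every namespace-mutating operation (link, unlink, rmdir, symlink, mknod) in sbi->vh_mutex and converts early returns into goto out so the mutex is always dropped; create and mkdir then collapse into thin wrappers around mknod. The locking skeleton, reduced to a userspace sketch:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t vh_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical mutator: one lock around the whole operation, one exit. */
static int do_unlink(int fail)
{
	int res = 0;

	pthread_mutex_lock(&vh_mutex);
	if (fail) {
		res = -1;
		goto out;	/* early exits funnel through the unlock */
	}
	/* ... catalog update happens here ... */
out:
	pthread_mutex_unlock(&vh_mutex);
	return res;
}

int main(void)
{
	printf("%d %d\n", do_unlink(0), do_unlink(1));	/* 0 -1 */
	return 0;
}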
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0022eec63cda..0c9cb1820a52 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -85,35 +85,49 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
85 85
86static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) 86static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
87{ 87{
88 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
88 int res; 89 int res;
89 90
90 hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start, 91 WARN_ON(!mutex_is_locked(&hip->extents_lock));
91 HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); 92
93 hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
94 HFSPLUS_IS_RSRC(inode) ?
95 HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
96
92 res = hfs_brec_find(fd); 97 res = hfs_brec_find(fd);
93 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) { 98 if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
94 if (res != -ENOENT) 99 if (res != -ENOENT)
95 return; 100 return;
96 hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec)); 101 hfs_brec_insert(fd, hip->cached_extents,
97 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 102 sizeof(hfsplus_extent_rec));
103 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
98 } else { 104 } else {
99 if (res) 105 if (res)
100 return; 106 return;
101 hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength); 107 hfs_bnode_write(fd->bnode, hip->cached_extents,
102 HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY; 108 fd->entryoffset, fd->entrylength);
109 hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
103 } 110 }
104} 111}
105 112
106void hfsplus_ext_write_extent(struct inode *inode) 113static void hfsplus_ext_write_extent_locked(struct inode *inode)
107{ 114{
108 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) { 115 if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
109 struct hfs_find_data fd; 116 struct hfs_find_data fd;
110 117
111 hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); 118 hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
112 __hfsplus_ext_write_extent(inode, &fd); 119 __hfsplus_ext_write_extent(inode, &fd);
113 hfs_find_exit(&fd); 120 hfs_find_exit(&fd);
114 } 121 }
115} 122}
116 123
124void hfsplus_ext_write_extent(struct inode *inode)
125{
126 mutex_lock(&HFSPLUS_I(inode)->extents_lock);
127 hfsplus_ext_write_extent_locked(inode);
128 mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
129}
130
117static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, 131static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
118 struct hfsplus_extent *extent, 132 struct hfsplus_extent *extent,
119 u32 cnid, u32 block, u8 type) 133 u32 cnid, u32 block, u8 type)
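extents.c splits hfsplus_ext_write_extent into a _locked worker plus a public wrapper that takes extents_lock, and the workers assert the lock with WARN_ON(!mutex_is_locked(...)); internal callers that already hold the lock call the worker directly. A sketch of the wrapper-plus-assert split (pthreads have no mutex_is_locked, so an owner flag stands in):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t extents_lock = PTHREAD_MUTEX_INITIALIZER;
static int lock_held;		/* stand-in for mutex_is_locked() */

/* Worker: requires the caller to hold extents_lock. */
static void write_extent_locked(void)
{
	assert(lock_held);	/* kernel: WARN_ON(!mutex_is_locked(...)) */
	/* ... flush the cached extent record here ... */
}

/* Public wrapper: takes the lock, calls the worker, drops the lock. */
static void write_extent(void)
{
	pthread_mutex_lock(&extents_lock);
	lock_held = 1;
	write_extent_locked();
	lock_held = 0;
	pthread_mutex_unlock(&extents_lock);
}

int main(void)
{
	write_extent();
	puts("flushed");
	return 0;
}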
@@ -136,33 +150,39 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
136 150
137static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) 151static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
138{ 152{
153 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
139 int res; 154 int res;
140 155
141 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) 156 WARN_ON(!mutex_is_locked(&hip->extents_lock));
157
158 if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
142 __hfsplus_ext_write_extent(inode, fd); 159 __hfsplus_ext_write_extent(inode, fd);
143 160
144 res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino, 161 res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
145 block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); 162 block, HFSPLUS_IS_RSRC(inode) ?
163 HFSPLUS_TYPE_RSRC :
164 HFSPLUS_TYPE_DATA);
146 if (!res) { 165 if (!res) {
147 HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block); 166 hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
148 HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents); 167 hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
149 } else { 168 } else {
150 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; 169 hip->cached_start = hip->cached_blocks = 0;
151 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 170 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
152 } 171 }
153 return res; 172 return res;
154} 173}
155 174
156static int hfsplus_ext_read_extent(struct inode *inode, u32 block) 175static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
157{ 176{
177 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
158 struct hfs_find_data fd; 178 struct hfs_find_data fd;
159 int res; 179 int res;
160 180
161 if (block >= HFSPLUS_I(inode).cached_start && 181 if (block >= hip->cached_start &&
162 block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks) 182 block < hip->cached_start + hip->cached_blocks)
163 return 0; 183 return 0;
164 184
165 hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); 185 hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
166 res = __hfsplus_ext_cache_extent(&fd, inode, block); 186 res = __hfsplus_ext_cache_extent(&fd, inode, block);
167 hfs_find_exit(&fd); 187 hfs_find_exit(&fd);
168 return res; 188 return res;
@@ -172,21 +192,21 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
172int hfsplus_get_block(struct inode *inode, sector_t iblock, 192int hfsplus_get_block(struct inode *inode, sector_t iblock,
173 struct buffer_head *bh_result, int create) 193 struct buffer_head *bh_result, int create)
174{ 194{
175 struct super_block *sb; 195 struct super_block *sb = inode->i_sb;
196 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
197 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
176 int res = -EIO; 198 int res = -EIO;
177 u32 ablock, dblock, mask; 199 u32 ablock, dblock, mask;
178 int shift; 200 int shift;
179 201
180 sb = inode->i_sb;
181
182 /* Convert inode block to disk allocation block */ 202 /* Convert inode block to disk allocation block */
183 shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits; 203 shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
184 ablock = iblock >> HFSPLUS_SB(sb).fs_shift; 204 ablock = iblock >> sbi->fs_shift;
185 205
186 if (iblock >= HFSPLUS_I(inode).fs_blocks) { 206 if (iblock >= hip->fs_blocks) {
187 if (iblock > HFSPLUS_I(inode).fs_blocks || !create) 207 if (iblock > hip->fs_blocks || !create)
188 return -EIO; 208 return -EIO;
189 if (ablock >= HFSPLUS_I(inode).alloc_blocks) { 209 if (ablock >= hip->alloc_blocks) {
190 res = hfsplus_file_extend(inode); 210 res = hfsplus_file_extend(inode);
191 if (res) 211 if (res)
192 return res; 212 return res;
@@ -194,33 +214,33 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
194 } else 214 } else
195 create = 0; 215 create = 0;
196 216
197 if (ablock < HFSPLUS_I(inode).first_blocks) { 217 if (ablock < hip->first_blocks) {
198 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock); 218 dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
199 goto done; 219 goto done;
200 } 220 }
201 221
202 if (inode->i_ino == HFSPLUS_EXT_CNID) 222 if (inode->i_ino == HFSPLUS_EXT_CNID)
203 return -EIO; 223 return -EIO;
204 224
205 mutex_lock(&HFSPLUS_I(inode).extents_lock); 225 mutex_lock(&hip->extents_lock);
206 res = hfsplus_ext_read_extent(inode, ablock); 226 res = hfsplus_ext_read_extent(inode, ablock);
207 if (!res) { 227 if (!res) {
208 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - 228 dblock = hfsplus_ext_find_block(hip->cached_extents,
209 HFSPLUS_I(inode).cached_start); 229 ablock - hip->cached_start);
210 } else { 230 } else {
211 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 231 mutex_unlock(&hip->extents_lock);
212 return -EIO; 232 return -EIO;
213 } 233 }
214 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 234 mutex_unlock(&hip->extents_lock);
215 235
216done: 236done:
217 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); 237 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
218 mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1; 238 mask = (1 << sbi->fs_shift) - 1;
219 map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask)); 239 map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
220 if (create) { 240 if (create) {
221 set_buffer_new(bh_result); 241 set_buffer_new(bh_result);
222 HFSPLUS_I(inode).phys_size += sb->s_blocksize; 242 hip->phys_size += sb->s_blocksize;
223 HFSPLUS_I(inode).fs_blocks++; 243 hip->fs_blocks++;
224 inode_add_bytes(inode, sb->s_blocksize); 244 inode_add_bytes(inode, sb->s_blocksize);
225 mark_inode_dirty(inode); 245 mark_inode_dirty(inode);
226 } 246 }
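
For readers following the arithmetic in hfsplus_get_block: fs_shift is the log2 ratio between allocation blocks and device blocks, so iblock >> fs_shift selects the allocation block and iblock & mask the position inside it. A standalone sketch with assumed sizes (4 KiB allocation blocks over 512-byte device blocks; blockoffset and the extent-lookup result are made up for illustration):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int alloc_blksz_shift = 12;	/* 4096-byte allocation blocks */
	int s_blocksize_bits  = 9;	/* 512-byte device blocks */
	int fs_shift = alloc_blksz_shift - s_blocksize_bits;	/* 3 */
	uint32_t blockoffset = 64;	/* illustrative partition offset */

	uint64_t iblock = 21;			/* file block (device-sized) */
	uint32_t ablock = iblock >> fs_shift;	/* allocation block 2 */
	uint32_t dblock = 1000;			/* pretend extent lookup result */
	uint32_t mask   = (1U << fs_shift) - 1;	/* low 3 bits: 0..7 */

	uint64_t sector = ((uint64_t)dblock << fs_shift)
			+ blockoffset + (iblock & mask);	/* 8069 */

	printf("iblock %llu -> ablock %u, device block %llu\n",
	       (unsigned long long)iblock, ablock,
	       (unsigned long long)sector);
	return 0;
}
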
@@ -327,7 +347,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
327 if (total_blocks == blocks) 347 if (total_blocks == blocks)
328 return 0; 348 return 0;
329 349
330 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 350 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
331 do { 351 do {
332 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, 352 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
333 total_blocks, type); 353 total_blocks, type);
@@ -348,29 +368,33 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
348int hfsplus_file_extend(struct inode *inode) 368int hfsplus_file_extend(struct inode *inode)
349{ 369{
350 struct super_block *sb = inode->i_sb; 370 struct super_block *sb = inode->i_sb;
371 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
372 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
351 u32 start, len, goal; 373 u32 start, len, goal;
352 int res; 374 int res;
353 375
354 if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) { 376 if (sbi->alloc_file->i_size * 8 <
377 sbi->total_blocks - sbi->free_blocks + 8) {
355 // extend alloc file 378 // extend alloc file
356 printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8, 379 printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
357 HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks); 380 sbi->alloc_file->i_size * 8,
381 sbi->total_blocks, sbi->free_blocks);
358 return -ENOSPC; 382 return -ENOSPC;
359 } 383 }
360 384
361 mutex_lock(&HFSPLUS_I(inode).extents_lock); 385 mutex_lock(&hip->extents_lock);
362 if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) 386 if (hip->alloc_blocks == hip->first_blocks)
363 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); 387 goal = hfsplus_ext_lastblock(hip->first_extents);
364 else { 388 else {
365 res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks); 389 res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
366 if (res) 390 if (res)
367 goto out; 391 goto out;
368 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents); 392 goal = hfsplus_ext_lastblock(hip->cached_extents);
369 } 393 }
370 394
371 len = HFSPLUS_I(inode).clump_blocks; 395 len = hip->clump_blocks;
372 start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len); 396 start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
373 if (start >= HFSPLUS_SB(sb).total_blocks) { 397 if (start >= sbi->total_blocks) {
374 start = hfsplus_block_allocate(sb, goal, 0, &len); 398 start = hfsplus_block_allocate(sb, goal, 0, &len);
375 if (start >= goal) { 399 if (start >= goal) {
376 res = -ENOSPC; 400 res = -ENOSPC;
@@ -379,56 +403,56 @@ int hfsplus_file_extend(struct inode *inode)
379 } 403 }
380 404
381 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); 405 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
382 if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) { 406
383 if (!HFSPLUS_I(inode).first_blocks) { 407 if (hip->alloc_blocks <= hip->first_blocks) {
408 if (!hip->first_blocks) {
384 dprint(DBG_EXTENT, "first extents\n"); 409 dprint(DBG_EXTENT, "first extents\n");
385 /* no extents yet */ 410 /* no extents yet */
386 HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start); 411 hip->first_extents[0].start_block = cpu_to_be32(start);
387 HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len); 412 hip->first_extents[0].block_count = cpu_to_be32(len);
388 res = 0; 413 res = 0;
389 } else { 414 } else {
390 /* try to append to extents in inode */ 415 /* try to append to extents in inode */
391 res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents, 416 res = hfsplus_add_extent(hip->first_extents,
392 HFSPLUS_I(inode).alloc_blocks, 417 hip->alloc_blocks,
393 start, len); 418 start, len);
394 if (res == -ENOSPC) 419 if (res == -ENOSPC)
395 goto insert_extent; 420 goto insert_extent;
396 } 421 }
397 if (!res) { 422 if (!res) {
398 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 423 hfsplus_dump_extent(hip->first_extents);
399 HFSPLUS_I(inode).first_blocks += len; 424 hip->first_blocks += len;
400 } 425 }
401 } else { 426 } else {
402 res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents, 427 res = hfsplus_add_extent(hip->cached_extents,
403 HFSPLUS_I(inode).alloc_blocks - 428 hip->alloc_blocks - hip->cached_start,
404 HFSPLUS_I(inode).cached_start,
405 start, len); 429 start, len);
406 if (!res) { 430 if (!res) {
407 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 431 hfsplus_dump_extent(hip->cached_extents);
408 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 432 hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
409 HFSPLUS_I(inode).cached_blocks += len; 433 hip->cached_blocks += len;
410 } else if (res == -ENOSPC) 434 } else if (res == -ENOSPC)
411 goto insert_extent; 435 goto insert_extent;
412 } 436 }
413out: 437out:
414 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 438 mutex_unlock(&hip->extents_lock);
415 if (!res) { 439 if (!res) {
416 HFSPLUS_I(inode).alloc_blocks += len; 440 hip->alloc_blocks += len;
417 mark_inode_dirty(inode); 441 mark_inode_dirty(inode);
418 } 442 }
419 return res; 443 return res;
420 444
421insert_extent: 445insert_extent:
422 dprint(DBG_EXTENT, "insert new extent\n"); 446 dprint(DBG_EXTENT, "insert new extent\n");
423 hfsplus_ext_write_extent(inode); 447 hfsplus_ext_write_extent_locked(inode);
424 448
425 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 449 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
426 HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start); 450 hip->cached_extents[0].start_block = cpu_to_be32(start);
427 HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len); 451 hip->cached_extents[0].block_count = cpu_to_be32(len);
428 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 452 hfsplus_dump_extent(hip->cached_extents);
429 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; 453 hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
430 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks; 454 hip->cached_start = hip->alloc_blocks;
431 HFSPLUS_I(inode).cached_blocks = len; 455 hip->cached_blocks = len;
432 456
433 res = 0; 457 res = 0;
434 goto out; 458 goto out;
@@ -437,13 +461,15 @@ insert_extent:
437void hfsplus_file_truncate(struct inode *inode) 461void hfsplus_file_truncate(struct inode *inode)
438{ 462{
439 struct super_block *sb = inode->i_sb; 463 struct super_block *sb = inode->i_sb;
464 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
440 struct hfs_find_data fd; 465 struct hfs_find_data fd;
441 u32 alloc_cnt, blk_cnt, start; 466 u32 alloc_cnt, blk_cnt, start;
442 int res; 467 int res;
443 468
444 dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, 469 dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
445 (long long)HFSPLUS_I(inode).phys_size, inode->i_size); 470 inode->i_ino, (long long)hip->phys_size, inode->i_size);
446 if (inode->i_size > HFSPLUS_I(inode).phys_size) { 471
472 if (inode->i_size > hip->phys_size) {
447 struct address_space *mapping = inode->i_mapping; 473 struct address_space *mapping = inode->i_mapping;
448 struct page *page; 474 struct page *page;
449 void *fsdata; 475 void *fsdata;
@@ -460,47 +486,48 @@ void hfsplus_file_truncate(struct inode *inode)
460 return; 486 return;
461 mark_inode_dirty(inode); 487 mark_inode_dirty(inode);
462 return; 488 return;
463 } else if (inode->i_size == HFSPLUS_I(inode).phys_size) 489 } else if (inode->i_size == hip->phys_size)
464 return; 490 return;
465 491
466 blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift; 492 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
467 alloc_cnt = HFSPLUS_I(inode).alloc_blocks; 493 HFSPLUS_SB(sb)->alloc_blksz_shift;
494 alloc_cnt = hip->alloc_blocks;
468 if (blk_cnt == alloc_cnt) 495 if (blk_cnt == alloc_cnt)
469 goto out; 496 goto out;
470 497
471 mutex_lock(&HFSPLUS_I(inode).extents_lock); 498 mutex_lock(&hip->extents_lock);
472 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 499 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
473 while (1) { 500 while (1) {
474 if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { 501 if (alloc_cnt == hip->first_blocks) {
475 hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents, 502 hfsplus_free_extents(sb, hip->first_extents,
476 alloc_cnt, alloc_cnt - blk_cnt); 503 alloc_cnt, alloc_cnt - blk_cnt);
477 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 504 hfsplus_dump_extent(hip->first_extents);
478 HFSPLUS_I(inode).first_blocks = blk_cnt; 505 hip->first_blocks = blk_cnt;
479 break; 506 break;
480 } 507 }
481 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); 508 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
482 if (res) 509 if (res)
483 break; 510 break;
484 start = HFSPLUS_I(inode).cached_start; 511 start = hip->cached_start;
485 hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents, 512 hfsplus_free_extents(sb, hip->cached_extents,
486 alloc_cnt - start, alloc_cnt - blk_cnt); 513 alloc_cnt - start, alloc_cnt - blk_cnt);
487 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 514 hfsplus_dump_extent(hip->cached_extents);
488 if (blk_cnt > start) { 515 if (blk_cnt > start) {
489 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 516 hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
490 break; 517 break;
491 } 518 }
492 alloc_cnt = start; 519 alloc_cnt = start;
493 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; 520 hip->cached_start = hip->cached_blocks = 0;
494 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 521 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
495 hfs_brec_remove(&fd); 522 hfs_brec_remove(&fd);
496 } 523 }
497 hfs_find_exit(&fd); 524 hfs_find_exit(&fd);
498 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 525 mutex_unlock(&hip->extents_lock);
499 526
500 HFSPLUS_I(inode).alloc_blocks = blk_cnt; 527 hip->alloc_blocks = blk_cnt;
501out: 528out:
502 HFSPLUS_I(inode).phys_size = inode->i_size; 529 hip->phys_size = inode->i_size;
503 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 530 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
504 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 531 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
505 mark_inode_dirty(inode); 532 mark_inode_dirty(inode);
506} 533}
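
The two conversions at the end of hfsplus_file_truncate are plain ceiling divisions done as round-up-and-shift. A small self-contained check of the formula, with block sizes assumed rather than read from any volume:

#include <stdio.h>
#include <stdint.h>

static uint64_t blocks_for(uint64_t size, unsigned blksz_shift)
{
	uint64_t blksz = 1ULL << blksz_shift;
	return (size + blksz - 1) >> blksz_shift;	/* ceil(size / blksz) */
}

int main(void)
{
	uint64_t i_size = 10000;	/* bytes */

	printf("alloc blocks (4K): %llu\n",	/* 3 */
	       (unsigned long long)blocks_for(i_size, 12));
	printf("fs blocks (512B): %llu\n",	/* 20 */
	       (unsigned long long)blocks_for(i_size, 9));
	return 0;
}
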
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index dc856be3c2b0..cb3653efb57a 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -62,7 +62,7 @@ struct hfs_btree {
62 unsigned int depth; 62 unsigned int depth;
63 63
64 //unsigned int map1_size, map_size; 64 //unsigned int map1_size, map_size;
65 struct semaphore tree_lock; 65 struct mutex tree_lock;
66 66
67 unsigned int pages_per_bnode; 67 unsigned int pages_per_bnode;
68 spinlock_t hash_lock; 68 spinlock_t hash_lock;
@@ -121,16 +121,21 @@ struct hfsplus_sb_info {
121 u32 sect_count; 121 u32 sect_count;
122 int fs_shift; 122 int fs_shift;
123 123
124 /* Stuff in host order from Vol Header */ 124 /* immutable data from the volume header */
125 u32 alloc_blksz; 125 u32 alloc_blksz;
126 int alloc_blksz_shift; 126 int alloc_blksz_shift;
127 u32 total_blocks; 127 u32 total_blocks;
128 u32 data_clump_blocks, rsrc_clump_blocks;
129
130 /* mutable data from the volume header, protected by alloc_mutex */
128 u32 free_blocks; 131 u32 free_blocks;
129 u32 next_alloc; 132 struct mutex alloc_mutex;
133
134 /* mutable data from the volume header, protected by vh_mutex */
130 u32 next_cnid; 135 u32 next_cnid;
131 u32 file_count; 136 u32 file_count;
132 u32 folder_count; 137 u32 folder_count;
133 u32 data_clump_blocks, rsrc_clump_blocks; 138 struct mutex vh_mutex;
134 139
135 /* Config options */ 140 /* Config options */
136 u32 creator; 141 u32 creator;
@@ -143,40 +148,50 @@ struct hfsplus_sb_info {
143 int part, session; 148 int part, session;
144 149
145 unsigned long flags; 150 unsigned long flags;
146
147 struct hlist_head rsrc_inodes;
148}; 151};
149 152
150#define HFSPLUS_SB_WRITEBACKUP 0x0001 153#define HFSPLUS_SB_WRITEBACKUP 0
151#define HFSPLUS_SB_NODECOMPOSE 0x0002 154#define HFSPLUS_SB_NODECOMPOSE 1
152#define HFSPLUS_SB_FORCE 0x0004 155#define HFSPLUS_SB_FORCE 2
153#define HFSPLUS_SB_HFSX 0x0008 156#define HFSPLUS_SB_HFSX 3
154#define HFSPLUS_SB_CASEFOLD 0x0010 157#define HFSPLUS_SB_CASEFOLD 4
155 158
156 159
157struct hfsplus_inode_info { 160struct hfsplus_inode_info {
158 struct mutex extents_lock;
159 u32 clump_blocks, alloc_blocks;
160 sector_t fs_blocks;
161 /* Allocation extents from catalog record or volume header */
162 hfsplus_extent_rec first_extents;
163 u32 first_blocks;
164 hfsplus_extent_rec cached_extents;
165 u32 cached_start, cached_blocks;
166 atomic_t opencnt; 161 atomic_t opencnt;
167 162
168 struct inode *rsrc_inode; 163 /*
164 * Extent allocation information, protected by extents_lock.
165 */
166 u32 first_blocks;
167 u32 clump_blocks;
168 u32 alloc_blocks;
169 u32 cached_start;
170 u32 cached_blocks;
171 hfsplus_extent_rec first_extents;
172 hfsplus_extent_rec cached_extents;
169 unsigned long flags; 173 unsigned long flags;
174 struct mutex extents_lock;
170 175
176 /*
177 * Immutable data.
178 */
179 struct inode *rsrc_inode;
171 __be32 create_date; 180 __be32 create_date;
172 /* Device number in hfsplus_permissions in catalog */
173 u32 dev;
174 /* BSD system and user file flags */
175 u8 rootflags;
176 u8 userflags;
177 181
182 /*
183 * Protected by sbi->vh_mutex.
184 */
185 u32 linkid;
186
187 /*
188 * Protected by i_mutex.
189 */
190 sector_t fs_blocks;
191 u8 userflags; /* BSD user file flags */
178 struct list_head open_dir_list; 192 struct list_head open_dir_list;
179 loff_t phys_size; 193 loff_t phys_size;
194
180 struct inode vfs_inode; 195 struct inode vfs_inode;
181}; 196};
182 197
@@ -184,8 +199,8 @@ struct hfsplus_inode_info {
184#define HFSPLUS_FLG_EXT_DIRTY 0x0002 199#define HFSPLUS_FLG_EXT_DIRTY 0x0002
185#define HFSPLUS_FLG_EXT_NEW 0x0004 200#define HFSPLUS_FLG_EXT_NEW 0x0004
186 201
187#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)) 202#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
188#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC) 203#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
189 204
190struct hfs_find_data { 205struct hfs_find_data {
191 /* filled by caller */ 206 /* filled by caller */
@@ -311,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
311int hfsplus_delete_cat(u32, struct inode *, struct qstr *); 326int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
312int hfsplus_rename_cat(u32, struct inode *, struct qstr *, 327int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
313 struct inode *, struct qstr *); 328 struct inode *, struct qstr *);
329void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
314 330
315/* dir.c */ 331/* dir.c */
316extern const struct inode_operations hfsplus_dir_inode_operations; 332extern const struct inode_operations hfsplus_dir_inode_operations;
@@ -372,26 +388,15 @@ int hfsplus_read_wrapper(struct super_block *);
372int hfs_part_find(struct super_block *, sector_t *, sector_t *); 388int hfs_part_find(struct super_block *, sector_t *, sector_t *);
373 389
374/* access macros */ 390/* access macros */
375/*
376static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) 391static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
377{ 392{
378 return sb->s_fs_info; 393 return sb->s_fs_info;
379} 394}
395
380static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) 396static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
381{ 397{
382 return list_entry(inode, struct hfsplus_inode_info, vfs_inode); 398 return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
383} 399}
384*/
385#define HFSPLUS_SB(super) (*(struct hfsplus_sb_info *)(super)->s_fs_info)
386#define HFSPLUS_I(inode) (*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
387
388#if 1
389#define hfsplus_kmap(p) ({ struct page *__p = (p); kmap(__p); })
390#define hfsplus_kunmap(p) ({ struct page *__p = (p); kunmap(__p); __p; })
391#else
392#define hfsplus_kmap(p) kmap(p)
393#define hfsplus_kunmap(p) kunmap(p)
394#endif
395 400
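
The un-commented accessors above return pointers rather than struct values, which is what lets every HFSPLUS_SB(sb).field elsewhere in this series become HFSPLUS_SB(sb)->field. HFSPLUS_I() works because list_entry() is container_of(): subtract the member offset to get back to the enclosing private structure. A userspace sketch of that trick; the struct names here are illustrative stand-ins:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode { unsigned long i_ino; };

struct fs_inode_info {
	unsigned long flags;
	struct vfs_inode vfs_inode;	/* embedded, like in hfsplus */
};

static struct fs_inode_info *FS_I(struct vfs_inode *inode)
{
	return container_of(inode, struct fs_inode_info, vfs_inode);
}

int main(void)
{
	struct fs_inode_info info = { .flags = 42, .vfs_inode = { .i_ino = 7 } };
	struct vfs_inode *inode = &info.vfs_inode;

	printf("flags via FS_I(): %lu\n", FS_I(inode)->flags);	/* 42 */
	return 0;
}
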
396#define sb_bread512(sb, sec, data) ({ \ 401#define sb_bread512(sb, sec, data) ({ \
397 struct buffer_head *__bh; \ 402 struct buffer_head *__bh; \
@@ -419,6 +424,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
419#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) 424#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
420#define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) 425#define hfsp_now2mt() __hfsp_ut2mt(get_seconds())
421 426
422#define kdev_t_to_nr(x) (x)
423
424#endif 427#endif
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index fe99fe8db61a..6892899fd6fb 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -200,6 +200,7 @@ struct hfsplus_cat_key {
200 struct hfsplus_unistr name; 200 struct hfsplus_unistr name;
201} __packed; 201} __packed;
202 202
203#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key))
203 204
204/* Structs from hfs.h */ 205/* Structs from hfs.h */
205struct hfsp_point { 206struct hfsp_point {
@@ -323,7 +324,7 @@ struct hfsplus_ext_key {
323 __be32 start_block; 324 __be32 start_block;
324} __packed; 325} __packed;
325 326
326#define HFSPLUS_EXT_KEYLEN 12 327#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key)
327 328
328/* HFS+ generic BTree key */ 329/* HFS+ generic BTree key */
329typedef union { 330typedef union {
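
Replacing the literal 12 with sizeof(struct hfsplus_ext_key) only works because the struct is __packed and therefore carries no compiler-inserted padding between the on-disk fields. A quick compile-time check of that reasoning, using a packed stand-in whose field widths mirror the key layout (2 + 1 + 1 + 4 + 4 bytes; the field names are assumptions, not copied from hfsplus_raw.h):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct ext_key {
	uint16_t key_len;	/* 2 bytes */
	uint8_t  fork_type;	/* 1 byte  */
	uint8_t  pad;		/* 1 byte  */
	uint32_t cnid;		/* 4 bytes */
	uint32_t start_block;	/* 4 bytes */
} __attribute__((packed));

int main(void)
{
	static_assert(sizeof(struct ext_key) == 12, "on-disk key is 12 bytes");
	printf("sizeof(struct ext_key) = %zu\n", sizeof(struct ext_key));
	return 0;
}
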
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c5a979d62c65..78449280dae0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -36,7 +36,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
36 *pagep = NULL; 36 *pagep = NULL;
37 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 37 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
38 hfsplus_get_block, 38 hfsplus_get_block,
39 &HFSPLUS_I(mapping->host).phys_size); 39 &HFSPLUS_I(mapping->host)->phys_size);
40 if (unlikely(ret)) { 40 if (unlikely(ret)) {
41 loff_t isize = mapping->host->i_size; 41 loff_t isize = mapping->host->i_size;
42 if (pos + len > isize) 42 if (pos + len > isize)
@@ -62,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
62 62
63 switch (inode->i_ino) { 63 switch (inode->i_ino) {
64 case HFSPLUS_EXT_CNID: 64 case HFSPLUS_EXT_CNID:
65 tree = HFSPLUS_SB(sb).ext_tree; 65 tree = HFSPLUS_SB(sb)->ext_tree;
66 break; 66 break;
67 case HFSPLUS_CAT_CNID: 67 case HFSPLUS_CAT_CNID:
68 tree = HFSPLUS_SB(sb).cat_tree; 68 tree = HFSPLUS_SB(sb)->cat_tree;
69 break; 69 break;
70 case HFSPLUS_ATTR_CNID: 70 case HFSPLUS_ATTR_CNID:
71 tree = HFSPLUS_SB(sb).attr_tree; 71 tree = HFSPLUS_SB(sb)->attr_tree;
72 break; 72 break;
73 default: 73 default:
74 BUG(); 74 BUG();
@@ -172,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
172 struct hfs_find_data fd; 172 struct hfs_find_data fd;
173 struct super_block *sb = dir->i_sb; 173 struct super_block *sb = dir->i_sb;
174 struct inode *inode = NULL; 174 struct inode *inode = NULL;
175 struct hfsplus_inode_info *hip;
175 int err; 176 int err;
176 177
177 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) 178 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
178 goto out; 179 goto out;
179 180
180 inode = HFSPLUS_I(dir).rsrc_inode; 181 inode = HFSPLUS_I(dir)->rsrc_inode;
181 if (inode) 182 if (inode)
182 goto out; 183 goto out;
183 184
@@ -185,12 +186,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
185 if (!inode) 186 if (!inode)
186 return ERR_PTR(-ENOMEM); 187 return ERR_PTR(-ENOMEM);
187 188
189 hip = HFSPLUS_I(inode);
188 inode->i_ino = dir->i_ino; 190 inode->i_ino = dir->i_ino;
189 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 191 INIT_LIST_HEAD(&hip->open_dir_list);
190 mutex_init(&HFSPLUS_I(inode).extents_lock); 192 mutex_init(&hip->extents_lock);
191 HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; 193 hip->flags = HFSPLUS_FLG_RSRC;
192 194
193 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 195 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
194 err = hfsplus_find_cat(sb, dir->i_ino, &fd); 196 err = hfsplus_find_cat(sb, dir->i_ino, &fd);
195 if (!err) 197 if (!err)
196 err = hfsplus_cat_read_inode(inode, &fd); 198 err = hfsplus_cat_read_inode(inode, &fd);
@@ -199,10 +201,18 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
199 iput(inode); 201 iput(inode);
200 return ERR_PTR(err); 202 return ERR_PTR(err);
201 } 203 }
202 HFSPLUS_I(inode).rsrc_inode = dir; 204 hip->rsrc_inode = dir;
203 HFSPLUS_I(dir).rsrc_inode = inode; 205 HFSPLUS_I(dir)->rsrc_inode = inode;
204 igrab(dir); 206 igrab(dir);
205 hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes); 207
208 /*
209 * __mark_inode_dirty expects inodes to be hashed. Since we don't
210 * want resource fork inodes in the regular inode space, we make them
211 * appear hashed, but do not put on any lists. hlist_del()
212 * will work fine and require no locking.
213 */
214 inode->i_hash.pprev = &inode->i_hash.next;
215
206 mark_inode_dirty(inode); 216 mark_inode_dirty(inode);
207out: 217out:
208 d_add(dentry, inode); 218 d_add(dentry, inode);
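
The self-referencing assignment inode->i_hash.pprev = &inode->i_hash.next is what makes the comment above hold: hlist_unhashed() only tests pprev, and hlist_del() writes through pprev, so a node whose pprev points at its own next field deletes into itself without touching any shared list. A userspace sketch, with hlist_node and the two helpers re-implemented to keep it standalone:

#include <stdio.h>

struct hlist_node {
	struct hlist_node *next, **pprev;
};

static int hlist_unhashed(const struct hlist_node *h)
{
	return !h->pprev;
}

static void hlist_del(struct hlist_node *n)
{
	*n->pprev = n->next;
	if (n->next)
		n->next->pprev = n->pprev;
}

int main(void)
{
	struct hlist_node fake = { .next = NULL };

	fake.pprev = &fake.next;	/* pretend we are on a list */
	printf("unhashed? %d\n", hlist_unhashed(&fake));	/* 0 */

	hlist_del(&fake);	/* writes NULL into fake.next; no list touched */
	printf("survived deletion\n");
	return 0;
}
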
@@ -211,30 +221,27 @@ out:
211 221
212static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) 222static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
213{ 223{
214 struct super_block *sb = inode->i_sb; 224 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
215 u16 mode; 225 u16 mode;
216 226
217 mode = be16_to_cpu(perms->mode); 227 mode = be16_to_cpu(perms->mode);
218 228
219 inode->i_uid = be32_to_cpu(perms->owner); 229 inode->i_uid = be32_to_cpu(perms->owner);
220 if (!inode->i_uid && !mode) 230 if (!inode->i_uid && !mode)
221 inode->i_uid = HFSPLUS_SB(sb).uid; 231 inode->i_uid = sbi->uid;
222 232
223 inode->i_gid = be32_to_cpu(perms->group); 233 inode->i_gid = be32_to_cpu(perms->group);
224 if (!inode->i_gid && !mode) 234 if (!inode->i_gid && !mode)
225 inode->i_gid = HFSPLUS_SB(sb).gid; 235 inode->i_gid = sbi->gid;
226 236
227 if (dir) { 237 if (dir) {
228 mode = mode ? (mode & S_IALLUGO) : 238 mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
229 (S_IRWXUGO & ~(HFSPLUS_SB(sb).umask));
230 mode |= S_IFDIR; 239 mode |= S_IFDIR;
231 } else if (!mode) 240 } else if (!mode)
232 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & 241 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
233 ~(HFSPLUS_SB(sb).umask));
234 inode->i_mode = mode; 242 inode->i_mode = mode;
235 243
236 HFSPLUS_I(inode).rootflags = perms->rootflags; 244 HFSPLUS_I(inode)->userflags = perms->userflags;
237 HFSPLUS_I(inode).userflags = perms->userflags;
238 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) 245 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
239 inode->i_flags |= S_IMMUTABLE; 246 inode->i_flags |= S_IMMUTABLE;
240 else 247 else
@@ -245,30 +252,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
245 inode->i_flags &= ~S_APPEND; 252 inode->i_flags &= ~S_APPEND;
246} 253}
247 254
248static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
249{
250 if (inode->i_flags & S_IMMUTABLE)
251 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
252 else
253 perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
254 if (inode->i_flags & S_APPEND)
255 perms->rootflags |= HFSPLUS_FLG_APPEND;
256 else
257 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
258 perms->userflags = HFSPLUS_I(inode).userflags;
259 perms->mode = cpu_to_be16(inode->i_mode);
260 perms->owner = cpu_to_be32(inode->i_uid);
261 perms->group = cpu_to_be32(inode->i_gid);
262 perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
263}
264
265static int hfsplus_file_open(struct inode *inode, struct file *file) 255static int hfsplus_file_open(struct inode *inode, struct file *file)
266{ 256{
267 if (HFSPLUS_IS_RSRC(inode)) 257 if (HFSPLUS_IS_RSRC(inode))
268 inode = HFSPLUS_I(inode).rsrc_inode; 258 inode = HFSPLUS_I(inode)->rsrc_inode;
269 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 259 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
270 return -EOVERFLOW; 260 return -EOVERFLOW;
271 atomic_inc(&HFSPLUS_I(inode).opencnt); 261 atomic_inc(&HFSPLUS_I(inode)->opencnt);
272 return 0; 262 return 0;
273} 263}
274 264
@@ -277,12 +267,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
277 struct super_block *sb = inode->i_sb; 267 struct super_block *sb = inode->i_sb;
278 268
279 if (HFSPLUS_IS_RSRC(inode)) 269 if (HFSPLUS_IS_RSRC(inode))
280 inode = HFSPLUS_I(inode).rsrc_inode; 270 inode = HFSPLUS_I(inode)->rsrc_inode;
281 if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { 271 if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
282 mutex_lock(&inode->i_mutex); 272 mutex_lock(&inode->i_mutex);
283 hfsplus_file_truncate(inode); 273 hfsplus_file_truncate(inode);
284 if (inode->i_flags & S_DEAD) { 274 if (inode->i_flags & S_DEAD) {
285 hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); 275 hfsplus_delete_cat(inode->i_ino,
276 HFSPLUS_SB(sb)->hidden_dir, NULL);
286 hfsplus_delete_inode(inode); 277 hfsplus_delete_inode(inode);
287 } 278 }
288 mutex_unlock(&inode->i_mutex); 279 mutex_unlock(&inode->i_mutex);
@@ -361,47 +352,52 @@ static const struct file_operations hfsplus_file_operations = {
361 352
362struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 353struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
363{ 354{
355 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
364 struct inode *inode = new_inode(sb); 356 struct inode *inode = new_inode(sb);
357 struct hfsplus_inode_info *hip;
358
365 if (!inode) 359 if (!inode)
366 return NULL; 360 return NULL;
367 361
368 inode->i_ino = HFSPLUS_SB(sb).next_cnid++; 362 inode->i_ino = sbi->next_cnid++;
369 inode->i_mode = mode; 363 inode->i_mode = mode;
370 inode->i_uid = current_fsuid(); 364 inode->i_uid = current_fsuid();
371 inode->i_gid = current_fsgid(); 365 inode->i_gid = current_fsgid();
372 inode->i_nlink = 1; 366 inode->i_nlink = 1;
373 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 367 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
374 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 368
375 mutex_init(&HFSPLUS_I(inode).extents_lock); 369 hip = HFSPLUS_I(inode);
376 atomic_set(&HFSPLUS_I(inode).opencnt, 0); 370 INIT_LIST_HEAD(&hip->open_dir_list);
377 HFSPLUS_I(inode).flags = 0; 371 mutex_init(&hip->extents_lock);
378 memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); 372 atomic_set(&hip->opencnt, 0);
379 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 373 hip->flags = 0;
380 HFSPLUS_I(inode).alloc_blocks = 0; 374 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
381 HFSPLUS_I(inode).first_blocks = 0; 375 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
382 HFSPLUS_I(inode).cached_start = 0; 376 hip->alloc_blocks = 0;
383 HFSPLUS_I(inode).cached_blocks = 0; 377 hip->first_blocks = 0;
384 HFSPLUS_I(inode).phys_size = 0; 378 hip->cached_start = 0;
385 HFSPLUS_I(inode).fs_blocks = 0; 379 hip->cached_blocks = 0;
386 HFSPLUS_I(inode).rsrc_inode = NULL; 380 hip->phys_size = 0;
381 hip->fs_blocks = 0;
382 hip->rsrc_inode = NULL;
387 if (S_ISDIR(inode->i_mode)) { 383 if (S_ISDIR(inode->i_mode)) {
388 inode->i_size = 2; 384 inode->i_size = 2;
389 HFSPLUS_SB(sb).folder_count++; 385 sbi->folder_count++;
390 inode->i_op = &hfsplus_dir_inode_operations; 386 inode->i_op = &hfsplus_dir_inode_operations;
391 inode->i_fop = &hfsplus_dir_operations; 387 inode->i_fop = &hfsplus_dir_operations;
392 } else if (S_ISREG(inode->i_mode)) { 388 } else if (S_ISREG(inode->i_mode)) {
393 HFSPLUS_SB(sb).file_count++; 389 sbi->file_count++;
394 inode->i_op = &hfsplus_file_inode_operations; 390 inode->i_op = &hfsplus_file_inode_operations;
395 inode->i_fop = &hfsplus_file_operations; 391 inode->i_fop = &hfsplus_file_operations;
396 inode->i_mapping->a_ops = &hfsplus_aops; 392 inode->i_mapping->a_ops = &hfsplus_aops;
397 HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks; 393 hip->clump_blocks = sbi->data_clump_blocks;
398 } else if (S_ISLNK(inode->i_mode)) { 394 } else if (S_ISLNK(inode->i_mode)) {
399 HFSPLUS_SB(sb).file_count++; 395 sbi->file_count++;
400 inode->i_op = &page_symlink_inode_operations; 396 inode->i_op = &page_symlink_inode_operations;
401 inode->i_mapping->a_ops = &hfsplus_aops; 397 inode->i_mapping->a_ops = &hfsplus_aops;
402 HFSPLUS_I(inode).clump_blocks = 1; 398 hip->clump_blocks = 1;
403 } else 399 } else
404 HFSPLUS_SB(sb).file_count++; 400 sbi->file_count++;
405 insert_inode_hash(inode); 401 insert_inode_hash(inode);
406 mark_inode_dirty(inode); 402 mark_inode_dirty(inode);
407 sb->s_dirt = 1; 403 sb->s_dirt = 1;
@@ -414,11 +410,11 @@ void hfsplus_delete_inode(struct inode *inode)
414 struct super_block *sb = inode->i_sb; 410 struct super_block *sb = inode->i_sb;
415 411
416 if (S_ISDIR(inode->i_mode)) { 412 if (S_ISDIR(inode->i_mode)) {
417 HFSPLUS_SB(sb).folder_count--; 413 HFSPLUS_SB(sb)->folder_count--;
418 sb->s_dirt = 1; 414 sb->s_dirt = 1;
419 return; 415 return;
420 } 416 }
421 HFSPLUS_SB(sb).file_count--; 417 HFSPLUS_SB(sb)->file_count--;
422 if (S_ISREG(inode->i_mode)) { 418 if (S_ISREG(inode->i_mode)) {
423 if (!inode->i_nlink) { 419 if (!inode->i_nlink) {
424 inode->i_size = 0; 420 inode->i_size = 0;
@@ -434,34 +430,39 @@ void hfsplus_delete_inode(struct inode *inode)
434void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 430void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
435{ 431{
436 struct super_block *sb = inode->i_sb; 432 struct super_block *sb = inode->i_sb;
433 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
434 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
437 u32 count; 435 u32 count;
438 int i; 436 int i;
439 437
440 memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents, 438 memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
441 sizeof(hfsplus_extent_rec));
442 for (count = 0, i = 0; i < 8; i++) 439 for (count = 0, i = 0; i < 8; i++)
443 count += be32_to_cpu(fork->extents[i].block_count); 440 count += be32_to_cpu(fork->extents[i].block_count);
444 HFSPLUS_I(inode).first_blocks = count; 441 hip->first_blocks = count;
445 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 442 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
446 HFSPLUS_I(inode).cached_start = 0; 443 hip->cached_start = 0;
447 HFSPLUS_I(inode).cached_blocks = 0; 444 hip->cached_blocks = 0;
448 445
449 HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks); 446 hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
450 inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size); 447 hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
451 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 448 hip->fs_blocks =
452 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 449 (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
453 HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift; 450 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
454 if (!HFSPLUS_I(inode).clump_blocks) 451 hip->clump_blocks =
455 HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks : 452 be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
456 HFSPLUS_SB(sb).data_clump_blocks; 453 if (!hip->clump_blocks) {
454 hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
455 sbi->rsrc_clump_blocks :
456 sbi->data_clump_blocks;
457 }
457} 458}
458 459
459void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 460void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
460{ 461{
461 memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents, 462 memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
462 sizeof(hfsplus_extent_rec)); 463 sizeof(hfsplus_extent_rec));
463 fork->total_size = cpu_to_be64(inode->i_size); 464 fork->total_size = cpu_to_be64(inode->i_size);
464 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks); 465 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
465} 466}
466 467
467int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) 468int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
@@ -472,7 +473,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
472 473
473 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); 474 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
474 475
475 HFSPLUS_I(inode).dev = 0; 476 HFSPLUS_I(inode)->linkid = 0;
476 if (type == HFSPLUS_FOLDER) { 477 if (type == HFSPLUS_FOLDER) {
477 struct hfsplus_cat_folder *folder = &entry.folder; 478 struct hfsplus_cat_folder *folder = &entry.folder;
478 479
@@ -486,8 +487,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
486 inode->i_atime = hfsp_mt2ut(folder->access_date); 487 inode->i_atime = hfsp_mt2ut(folder->access_date);
487 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); 488 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
488 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); 489 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
489 HFSPLUS_I(inode).create_date = folder->create_date; 490 HFSPLUS_I(inode)->create_date = folder->create_date;
490 HFSPLUS_I(inode).fs_blocks = 0; 491 HFSPLUS_I(inode)->fs_blocks = 0;
491 inode->i_op = &hfsplus_dir_inode_operations; 492 inode->i_op = &hfsplus_dir_inode_operations;
492 inode->i_fop = &hfsplus_dir_operations; 493 inode->i_fop = &hfsplus_dir_operations;
493 } else if (type == HFSPLUS_FILE) { 494 } else if (type == HFSPLUS_FILE) {
@@ -518,7 +519,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
518 inode->i_atime = hfsp_mt2ut(file->access_date); 519 inode->i_atime = hfsp_mt2ut(file->access_date);
519 inode->i_mtime = hfsp_mt2ut(file->content_mod_date); 520 inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
520 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); 521 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
521 HFSPLUS_I(inode).create_date = file->create_date; 522 HFSPLUS_I(inode)->create_date = file->create_date;
522 } else { 523 } else {
523 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); 524 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
524 res = -EIO; 525 res = -EIO;
@@ -533,12 +534,12 @@ int hfsplus_cat_write_inode(struct inode *inode)
533 hfsplus_cat_entry entry; 534 hfsplus_cat_entry entry;
534 535
535 if (HFSPLUS_IS_RSRC(inode)) 536 if (HFSPLUS_IS_RSRC(inode))
536 main_inode = HFSPLUS_I(inode).rsrc_inode; 537 main_inode = HFSPLUS_I(inode)->rsrc_inode;
537 538
538 if (!main_inode->i_nlink) 539 if (!main_inode->i_nlink)
539 return 0; 540 return 0;
540 541
541 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd)) 542 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
542 /* panic? */ 543 /* panic? */
543 return -EIO; 544 return -EIO;
544 545
@@ -554,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
554 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 555 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
555 sizeof(struct hfsplus_cat_folder)); 556 sizeof(struct hfsplus_cat_folder));
556 /* simple node checks? */ 557 /* simple node checks? */
557 hfsplus_set_perms(inode, &folder->permissions); 558 hfsplus_cat_set_perms(inode, &folder->permissions);
558 folder->access_date = hfsp_ut2mt(inode->i_atime); 559 folder->access_date = hfsp_ut2mt(inode->i_atime);
559 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); 560 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
560 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); 561 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
@@ -576,11 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
576 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 577 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
577 sizeof(struct hfsplus_cat_file)); 578 sizeof(struct hfsplus_cat_file));
578 hfsplus_inode_write_fork(inode, &file->data_fork); 579 hfsplus_inode_write_fork(inode, &file->data_fork);
579 if (S_ISREG(inode->i_mode)) 580 hfsplus_cat_set_perms(inode, &file->permissions);
580 HFSPLUS_I(inode).dev = inode->i_nlink;
581 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
582 HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev);
583 hfsplus_set_perms(inode, &file->permissions);
584 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) 581 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
585 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 582 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
586 else 583 else
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index ac405f099026..5b4667e08ef7 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,83 +17,98 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
22#include "hfsplus_fs.h" 21#include "hfsplus_fs.h"
23 22
24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 23static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
25{ 24{
26 struct inode *inode = filp->f_path.dentry->d_inode; 25 struct inode *inode = file->f_path.dentry->d_inode;
26 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags = 0;
28
29 if (inode->i_flags & S_IMMUTABLE)
30 flags |= FS_IMMUTABLE_FL;
 31 if (inode->i_flags & S_APPEND)
32 flags |= FS_APPEND_FL;
33 if (hip->userflags & HFSPLUS_FLG_NODUMP)
34 flags |= FS_NODUMP_FL;
35
36 return put_user(flags, user_flags);
37}
38
39static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
40{
41 struct inode *inode = file->f_path.dentry->d_inode;
42 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags; 43 unsigned int flags;
44 int err = 0;
28 45
29 lock_kernel(); 46 err = mnt_want_write(file->f_path.mnt);
30 switch (cmd) { 47 if (err)
31 case HFSPLUS_IOC_EXT2_GETFLAGS: 48 goto out;
32 flags = 0;
33 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
34 flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
35 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
36 flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
37 if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
38 flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
39 return put_user(flags, (int __user *)arg);
40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
41 int err = 0;
42 err = mnt_want_write(filp->f_path.mnt);
43 if (err) {
44 unlock_kernel();
45 return err;
46 }
47 49
48 if (!is_owner_or_cap(inode)) { 50 if (!is_owner_or_cap(inode)) {
49 err = -EACCES; 51 err = -EACCES;
50 goto setflags_out; 52 goto out_drop_write;
51 } 53 }
52 if (get_user(flags, (int __user *)arg)) {
53 err = -EFAULT;
54 goto setflags_out;
55 }
56 if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
57 HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
58 if (!capable(CAP_LINUX_IMMUTABLE)) {
59 err = -EPERM;
60 goto setflags_out;
61 }
62 }
63 54
64 /* don't silently ignore unsupported ext2 flags */ 55 if (get_user(flags, user_flags)) {
65 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { 56 err = -EFAULT;
66 err = -EOPNOTSUPP; 57 goto out_drop_write;
67 goto setflags_out; 58 }
68 } 59
69 if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */ 60 mutex_lock(&inode->i_mutex);
70 inode->i_flags |= S_IMMUTABLE; 61
71 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE; 62 if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
72 } else { 63 inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
73 inode->i_flags &= ~S_IMMUTABLE; 64 if (!capable(CAP_LINUX_IMMUTABLE)) {
74 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE; 65 err = -EPERM;
75 } 66 goto out_unlock_inode;
76 if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
77 inode->i_flags |= S_APPEND;
78 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
79 } else {
80 inode->i_flags &= ~S_APPEND;
81 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
82 } 67 }
83 if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
84 HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
85 else
86 HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
87
88 inode->i_ctime = CURRENT_TIME_SEC;
89 mark_inode_dirty(inode);
90setflags_out:
91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
93 return err;
94 } 68 }
69
70 /* don't silently ignore unsupported ext2 flags */
71 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
72 err = -EOPNOTSUPP;
73 goto out_unlock_inode;
74 }
75
76 if (flags & FS_IMMUTABLE_FL)
77 inode->i_flags |= S_IMMUTABLE;
78 else
79 inode->i_flags &= ~S_IMMUTABLE;
80
81 if (flags & FS_APPEND_FL)
82 inode->i_flags |= S_APPEND;
83 else
84 inode->i_flags &= ~S_APPEND;
85
86 if (flags & FS_NODUMP_FL)
87 hip->userflags |= HFSPLUS_FLG_NODUMP;
88 else
89 hip->userflags &= ~HFSPLUS_FLG_NODUMP;
90
91 inode->i_ctime = CURRENT_TIME_SEC;
92 mark_inode_dirty(inode);
93
94out_unlock_inode:
 95 mutex_unlock(&inode->i_mutex);
96out_drop_write:
97 mnt_drop_write(file->f_path.mnt);
98out:
99 return err;
100}
101
102long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
103{
104 void __user *argp = (void __user *)arg;
105
106 switch (cmd) {
107 case HFSPLUS_IOC_EXT2_GETFLAGS:
108 return hfsplus_ioctl_getflags(file, argp);
109 case HFSPLUS_IOC_EXT2_SETFLAGS:
110 return hfsplus_ioctl_setflags(file, argp);
95 default: 111 default:
96 unlock_kernel();
97 return -ENOTTY; 112 return -ENOTTY;
98 } 113 }
99} 114}
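
The two commands keep their ext2-compatible values, so the refactored handlers can be exercised from userspace with the generic FS_IOC_GETFLAGS/FS_IOC_SETFLAGS pair, the same ioctls chattr(1) uses. A minimal sketch, assuming it is run against a path on an hfsplus mount:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	int fd, flags;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
		perror("FS_IOC_GETFLAGS");
		return 1;
	}
	printf("flags: %#x%s\n", flags,
	       (flags & FS_NODUMP_FL) ? " (nodump)" : "");

	flags |= FS_NODUMP_FL;	/* mark the file nodump, like chattr +d */
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
		perror("FS_IOC_SETFLAGS");
	close(fd);
	return 0;
}
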
@@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
110 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) 125 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
111 return -EOPNOTSUPP; 126 return -EOPNOTSUPP;
112 127
113 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 128 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
114 if (res) 129 if (res)
115 return res; 130 return res;
116 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 131 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -153,7 +168,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
153 return -EOPNOTSUPP; 168 return -EOPNOTSUPP;
154 169
155 if (size) { 170 if (size) {
156 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 171 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
157 if (res) 172 if (res)
158 return res; 173 return res;
159 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 174 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -177,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
177 } else 192 } else
178 res = size ? -ERANGE : 4; 193 res = size ? -ERANGE : 4;
179 } else 194 } else
180 res = -ENODATA; 195 res = -EOPNOTSUPP;
181out: 196out:
182 if (size) 197 if (size)
183 hfs_find_exit(&fd); 198 hfs_find_exit(&fd);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 572628b4b07d..f9ab276a4d8d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
143 kfree(p); 143 kfree(p);
144 break; 144 break;
145 case opt_decompose: 145 case opt_decompose:
146 sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE; 146 clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
147 break; 147 break;
148 case opt_nodecompose: 148 case opt_nodecompose:
149 sbi->flags |= HFSPLUS_SB_NODECOMPOSE; 149 set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
150 break; 150 break;
151 case opt_force: 151 case opt_force:
152 sbi->flags |= HFSPLUS_SB_FORCE; 152 set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
153 break; 153 break;
154 default: 154 default:
155 return 0; 155 return 0;
@@ -171,7 +171,7 @@ done:
171 171
172int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) 172int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
173{ 173{
174 struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb); 174 struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
175 175
176 if (sbi->creator != HFSPLUS_DEF_CR_TYPE) 176 if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
177 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); 177 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
@@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
184 seq_printf(seq, ",session=%u", sbi->session); 184 seq_printf(seq, ",session=%u", sbi->session);
185 if (sbi->nls) 185 if (sbi->nls)
186 seq_printf(seq, ",nls=%s", sbi->nls->charset); 186 seq_printf(seq, ",nls=%s", sbi->nls->charset);
187 if (sbi->flags & HFSPLUS_SB_NODECOMPOSE) 187 if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
188 seq_printf(seq, ",nodecompose"); 188 seq_printf(seq, ",nodecompose");
189 return 0; 189 return 0;
190} 190}
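
These hunks depend on the flag redefinition in hfsplus_fs.h above: set_bit(), clear_bit() and test_bit() take a bit index into an unsigned long, not a mask, which is why HFSPLUS_SB_NODECOMPOSE changed from 0x0002 to plain 1. A non-atomic userspace sketch of the semantics (the kernel versions are atomic, which is the point of the conversion):

#include <stdio.h>

#define SB_WRITEBACKUP	0	/* was 0x0001 */
#define SB_NODECOMPOSE	1	/* was 0x0002 */

static void set_bit(int nr, unsigned long *addr)
{
	*addr |= 1UL << nr;	/* kernel set_bit() does this atomically */
}

static int test_bit(int nr, const unsigned long *addr)
{
	return (*addr >> nr) & 1;
}

int main(void)
{
	unsigned long flags = 0;

	set_bit(SB_NODECOMPOSE, &flags);
	printf("nodecompose: %d, writebackup: %d\n",
	       test_bit(SB_NODECOMPOSE, &flags),
	       test_bit(SB_WRITEBACKUP, &flags));	/* 1, 0 */
	return 0;
}
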
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 1528a6fd0299..208b16c645cc 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -74,6 +74,7 @@ struct old_pmap {
74int hfs_part_find(struct super_block *sb, 74int hfs_part_find(struct super_block *sb,
75 sector_t *part_start, sector_t *part_size) 75 sector_t *part_start, sector_t *part_size)
76{ 76{
77 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
77 struct buffer_head *bh; 78 struct buffer_head *bh;
78 __be16 *data; 79 __be16 *data;
79 int i, size, res; 80 int i, size, res;
@@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb,
95 for (i = 0; i < size; p++, i++) { 96 for (i = 0; i < size; p++, i++) {
96 if (p->pdStart && p->pdSize && 97 if (p->pdStart && p->pdSize &&
97 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && 98 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
98 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { 99 (sbi->part < 0 || sbi->part == i)) {
99 *part_start += be32_to_cpu(p->pdStart); 100 *part_start += be32_to_cpu(p->pdStart);
100 *part_size = be32_to_cpu(p->pdSize); 101 *part_size = be32_to_cpu(p->pdSize);
101 res = 0; 102 res = 0;
@@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb,
111 size = be32_to_cpu(pm->pmMapBlkCnt); 112 size = be32_to_cpu(pm->pmMapBlkCnt);
112 for (i = 0; i < size;) { 113 for (i = 0; i < size;) {
113 if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && 114 if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
114 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { 115 (sbi->part < 0 || sbi->part == i)) {
115 *part_start += be32_to_cpu(pm->pmPyPartStart); 116 *part_start += be32_to_cpu(pm->pmPyPartStart);
116 *part_size = be32_to_cpu(pm->pmPartBlkCnt); 117 *part_size = be32_to_cpu(pm->pmPartBlkCnt);
117 res = 0; 118 res = 0;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 3b55c050c742..9a88d7536103 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,7 +12,6 @@
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/smp_lock.h>
16#include <linux/vfs.h> 15#include <linux/vfs.h>
17#include <linux/nls.h> 16#include <linux/nls.h>
18 17
@@ -21,40 +20,11 @@ static void hfsplus_destroy_inode(struct inode *inode);
21 20
22#include "hfsplus_fs.h" 21#include "hfsplus_fs.h"
23 22
24struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) 23static int hfsplus_system_read_inode(struct inode *inode)
25{ 24{
26 struct hfs_find_data fd; 25 struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
27 struct hfsplus_vh *vhdr;
28 struct inode *inode;
29 long err = -EIO;
30
31 inode = iget_locked(sb, ino);
32 if (!inode)
33 return ERR_PTR(-ENOMEM);
34 if (!(inode->i_state & I_NEW))
35 return inode;
36 26
37 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 27 switch (inode->i_ino) {
38 mutex_init(&HFSPLUS_I(inode).extents_lock);
39 HFSPLUS_I(inode).flags = 0;
40 HFSPLUS_I(inode).rsrc_inode = NULL;
41 atomic_set(&HFSPLUS_I(inode).opencnt, 0);
42
43 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
44 read_inode:
45 hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
46 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
47 if (!err)
48 err = hfsplus_cat_read_inode(inode, &fd);
49 hfs_find_exit(&fd);
50 if (err)
51 goto bad_inode;
52 goto done;
53 }
54 vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
55 switch(inode->i_ino) {
56 case HFSPLUS_ROOT_CNID:
57 goto read_inode;
58 case HFSPLUS_EXT_CNID: 28 case HFSPLUS_EXT_CNID:
59 hfsplus_inode_read_fork(inode, &vhdr->ext_file); 29 hfsplus_inode_read_fork(inode, &vhdr->ext_file);
60 inode->i_mapping->a_ops = &hfsplus_btree_aops; 30 inode->i_mapping->a_ops = &hfsplus_btree_aops;
@@ -75,74 +45,101 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
75 inode->i_mapping->a_ops = &hfsplus_btree_aops; 45 inode->i_mapping->a_ops = &hfsplus_btree_aops;
76 break; 46 break;
77 default: 47 default:
78 goto bad_inode; 48 return -EIO;
49 }
50
51 return 0;
52}
53
54struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
55{
56 struct hfs_find_data fd;
57 struct inode *inode;
58 int err;
59
60 inode = iget_locked(sb, ino);
61 if (!inode)
62 return ERR_PTR(-ENOMEM);
63 if (!(inode->i_state & I_NEW))
64 return inode;
65
66 INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
67 mutex_init(&HFSPLUS_I(inode)->extents_lock);
68 HFSPLUS_I(inode)->flags = 0;
69 HFSPLUS_I(inode)->rsrc_inode = NULL;
70 atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
71
72 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
73 inode->i_ino == HFSPLUS_ROOT_CNID) {
74 hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
75 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
76 if (!err)
77 err = hfsplus_cat_read_inode(inode, &fd);
78 hfs_find_exit(&fd);
79 } else {
80 err = hfsplus_system_read_inode(inode);
81 }
82
83 if (err) {
84 iget_failed(inode);
85 return ERR_PTR(err);
79 } 86 }
80 87
81done:
82 unlock_new_inode(inode); 88 unlock_new_inode(inode);
83 return inode; 89 return inode;
84
85bad_inode:
86 iget_failed(inode);
87 return ERR_PTR(err);
88} 90}
89 91
-static int hfsplus_write_inode(struct inode *inode,
-		struct writeback_control *wbc)
+static int hfsplus_system_write_inode(struct inode *inode)
 {
-	struct hfsplus_vh *vhdr;
-	int ret = 0;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+	struct hfsplus_vh *vhdr = sbi->s_vhdr;
+	struct hfsplus_fork_raw *fork;
+	struct hfs_btree *tree = NULL;
 
-	dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
-	hfsplus_ext_write_extent(inode);
-	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
-		return hfsplus_cat_write_inode(inode);
-	}
-	vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
 	switch (inode->i_ino) {
-	case HFSPLUS_ROOT_CNID:
-		ret = hfsplus_cat_write_inode(inode);
-		break;
 	case HFSPLUS_EXT_CNID:
-		if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->ext_file);
-		hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree);
+		fork = &vhdr->ext_file;
+		tree = sbi->ext_tree;
 		break;
 	case HFSPLUS_CAT_CNID:
-		if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->cat_file);
-		hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree);
+		fork = &vhdr->cat_file;
+		tree = sbi->cat_tree;
 		break;
 	case HFSPLUS_ALLOC_CNID:
-		if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
+		fork = &vhdr->alloc_file;
 		break;
 	case HFSPLUS_START_CNID:
-		if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->start_file);
+		fork = &vhdr->start_file;
 		break;
 	case HFSPLUS_ATTR_CNID:
-		if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) {
-			HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-			inode->i_sb->s_dirt = 1;
-		}
-		hfsplus_inode_write_fork(inode, &vhdr->attr_file);
-		hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree);
-		break;
+		fork = &vhdr->attr_file;
+		tree = sbi->attr_tree;
+		break;	/* the posted hunk dropped this break and fell through to -EIO */
+	default:
+		return -EIO;
 	}
-	return ret;
+
+	if (fork->total_size != cpu_to_be64(inode->i_size)) {
+		set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
+		inode->i_sb->s_dirt = 1;
+	}
+	hfsplus_inode_write_fork(inode, fork);
+	if (tree)
+		hfs_btree_write(tree);
+	return 0;
+}
+
+static int hfsplus_write_inode(struct inode *inode,
+		struct writeback_control *wbc)
+{
+	dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
+
+	hfsplus_ext_write_extent(inode);
+
+	if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+	    inode->i_ino == HFSPLUS_ROOT_CNID)
+		return hfsplus_cat_write_inode(inode);
+	else
+		return hfsplus_system_write_inode(inode);
 }
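A detail worth noting in the WRITEBACKUP test above: the comparison is performed in on-disk byte order. Converting the in-core size once with cpu_to_be64() is equivalent to converting the disk field to CPU order, and saves a conversion of the stored value. A small sketch of that detail, assuming only <linux/types.h> types:

	#include <linux/types.h>
	#include <asm/byteorder.h>

	/* Compare an on-disk big-endian size with the in-core size.
	 * Either side may be converted; the code above converts the CPU side.
	 */
	static int size_changed(__be64 disk_size, loff_t i_size)
	{
		return disk_size != cpu_to_be64(i_size);
	}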
147 144
148static void hfsplus_evict_inode(struct inode *inode) 145static void hfsplus_evict_inode(struct inode *inode)
@@ -151,51 +148,53 @@ static void hfsplus_evict_inode(struct inode *inode)
 	truncate_inode_pages(&inode->i_data, 0);
 	end_writeback(inode);
 	if (HFSPLUS_IS_RSRC(inode)) {
-		HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL;
-		iput(HFSPLUS_I(inode).rsrc_inode);
+		HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
+		iput(HFSPLUS_I(inode)->rsrc_inode);
 	}
 }
158 155
 int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
-	struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	struct hfsplus_vh *vhdr = sbi->s_vhdr;
 
-	dprint(DBG_SUPER, "hfsplus_write_super\n");
+	dprint(DBG_SUPER, "hfsplus_sync_fs\n");
 
-	lock_super(sb);
+	mutex_lock(&sbi->vh_mutex);
+	mutex_lock(&sbi->alloc_mutex);
 	sb->s_dirt = 0;
 
-	vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
-	vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
-	vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid);
-	vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count);
-	vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count);
+	vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
+	vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
+	vhdr->folder_count = cpu_to_be32(sbi->folder_count);
+	vhdr->file_count = cpu_to_be32(sbi->file_count);
 
-	mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-	if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) {
-		if (HFSPLUS_SB(sb).sect_count) {
+	mark_buffer_dirty(sbi->s_vhbh);
+	if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
+		if (sbi->sect_count) {
 			struct buffer_head *bh;
 			u32 block, offset;
 
-			block = HFSPLUS_SB(sb).blockoffset;
-			block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9);
-			offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1);
-			printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset,
-				HFSPLUS_SB(sb).sect_count, block, offset);
+			block = sbi->blockoffset;
+			block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
+			offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
+			printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
+				sbi->blockoffset, sbi->sect_count,
+				block, offset);
 			bh = sb_bread(sb, block);
 			if (bh) {
 				vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
 				if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
-					memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr));
+					memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
 					mark_buffer_dirty(bh);
 					brelse(bh);
 				} else
 					printk(KERN_WARNING "hfs: backup not found!\n");
 			}
 		}
-		HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
-	unlock_super(sb);
+	mutex_unlock(&sbi->alloc_mutex);
+	mutex_unlock(&sbi->vh_mutex);
 	return 0;
 }
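The block/offset arithmetic locates the backup volume header, which HFS+ keeps in the second-to-last 512-byte sector of the partition: (sect_count - 2) is converted from 512-byte sectors into a filesystem block number plus a byte offset within that block. A worked example, assuming a 4 KiB block size (s_blocksize_bits = 12) and blockoffset = 0:

	/* sect_count = 2048 sectors (a 1 MiB partition):
	 *   block  = 0 + (2048 - 2) >> (12 - 9)  = 2046 / 8 = 255
	 *   offset = ((2048 - 2) << 9) & (4096 - 1)         = 3072
	 * so the backup header starts 3072 bytes into block 255.
	 */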
201 200
@@ -209,48 +208,48 @@ static void hfsplus_write_super(struct super_block *sb)
 
 static void hfsplus_put_super(struct super_block *sb)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+
 	dprint(DBG_SUPER, "hfsplus_put_super\n");
+
 	if (!sb->s_fs_info)
 		return;
 
-	lock_kernel();
-
 	if (sb->s_dirt)
 		hfsplus_write_super(sb);
-	if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
-		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+	if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
+		struct hfsplus_vh *vhdr = sbi->s_vhdr;
 
 		vhdr->modify_date = hfsp_now2mt();
 		vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
 		vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
-		mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-		sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+		mark_buffer_dirty(sbi->s_vhbh);
+		sync_dirty_buffer(sbi->s_vhbh);
 	}
 
-	hfs_btree_close(HFSPLUS_SB(sb).cat_tree);
-	hfs_btree_close(HFSPLUS_SB(sb).ext_tree);
-	iput(HFSPLUS_SB(sb).alloc_file);
-	iput(HFSPLUS_SB(sb).hidden_dir);
-	brelse(HFSPLUS_SB(sb).s_vhbh);
-	unload_nls(HFSPLUS_SB(sb).nls);
+	hfs_btree_close(sbi->cat_tree);
+	hfs_btree_close(sbi->ext_tree);
+	iput(sbi->alloc_file);
+	iput(sbi->hidden_dir);
+	brelse(sbi->s_vhbh);
+	unload_nls(sbi->nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
-
-	unlock_kernel();
 }
241 239
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = HFSPLUS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
-	buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift;
-	buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift;
+	buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
+	buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0xFFFFFFFF;
-	buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+	buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = HFSPLUS_MAX_STRLEN;
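The shifts here convert between the two block sizes in play: fs_shift is alloc_blksz_shift minus s_blocksize_bits (see the wrapper.c hunk below), so total_blocks << fs_shift re-expresses the allocation-block count in sb->s_blocksize units, which is what statfs reports as f_bsize. For example, with 8 KiB allocation blocks (shift 13) over a 4 KiB s_blocksize (bits 12), fs_shift is 1 and the reported block counts double.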
@@ -263,11 +262,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
 	if (!(*flags & MS_RDONLY)) {
-		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
 		struct hfsplus_sb_info sbi;
 
 		memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
-		sbi.nls = HFSPLUS_SB(sb).nls;
+		sbi.nls = HFSPLUS_SB(sb)->nls;
 		if (!hfsplus_parse_options(data, &sbi))
 			return -EINVAL;
 
@@ -276,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
276 "running fsck.hfsplus is recommended. leaving read-only.\n"); 275 "running fsck.hfsplus is recommended. leaving read-only.\n");
277 sb->s_flags |= MS_RDONLY; 276 sb->s_flags |= MS_RDONLY;
278 *flags |= MS_RDONLY; 277 *flags |= MS_RDONLY;
279 } else if (sbi.flags & HFSPLUS_SB_FORCE) { 278 } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
280 /* nothing */ 279 /* nothing */
281 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 280 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
282 printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); 281 printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
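Throughout this patch sbi->flags becomes an unsigned long bitmap driven by the atomic bitops rather than open-coded masks; set_bit() and test_and_clear_bit() are atomic read-modify-write operations, so concurrent flag updates cannot lose each other. A minimal sketch of the pattern (write_backup_header() is a hypothetical helper, not hfsplus code):

	#include <linux/bitops.h>

	static void write_backup_header(void);		/* hypothetical */
	static unsigned long example_flags;		/* stands in for sbi->flags */

	static void example(void)
	{
		set_bit(HFSPLUS_SB_WRITEBACKUP, &example_flags);	/* atomic OR of one bit */

		if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &example_flags))
			write_backup_header();
	}

Note that the conversion changes what the constants mean: HFSPLUS_SB_WRITEBACKUP and friends must now be bit indices, not masks.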
@@ -320,7 +319,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		return -ENOMEM;
 
 	sb->s_fs_info = sbi;
-	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
+	mutex_init(&sbi->alloc_mutex);
+	mutex_init(&sbi->vh_mutex);
 	hfsplus_fill_defaults(sbi);
 	if (!hfsplus_parse_options(data, sbi)) {
 		printk(KERN_ERR "hfs: unable to parse mount options\n");
@@ -344,7 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	err = -EINVAL;
 	goto cleanup;
 	}
-	vhdr = HFSPLUS_SB(sb).s_vhdr;
+	vhdr = sbi->s_vhdr;
 
 	/* Copy parts of the volume header into the superblock */
 	sb->s_magic = HFSPLUS_VOLHEAD_SIG;
@@ -353,18 +353,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_ERR "hfs: wrong filesystem version\n");
 		goto cleanup;
 	}
-	HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks);
-	HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks);
-	HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc);
-	HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid);
-	HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count);
-	HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count);
-	HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-	if (!HFSPLUS_SB(sb).data_clump_blocks)
-		HFSPLUS_SB(sb).data_clump_blocks = 1;
-	HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-	if (!HFSPLUS_SB(sb).rsrc_clump_blocks)
-		HFSPLUS_SB(sb).rsrc_clump_blocks = 1;
+	sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
+	sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
+	sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
+	sbi->file_count = be32_to_cpu(vhdr->file_count);
+	sbi->folder_count = be32_to_cpu(vhdr->folder_count);
+	sbi->data_clump_blocks =
+		be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
+	if (!sbi->data_clump_blocks)
+		sbi->data_clump_blocks = 1;
+	sbi->rsrc_clump_blocks =
+		be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
+	if (!sbi->rsrc_clump_blocks)
+		sbi->rsrc_clump_blocks = 1;
 
 	/* Set up operations so we can load metadata */
 	sb->s_op = &hfsplus_sops;
@@ -374,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
 		       "running fsck.hfsplus is recommended. mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
-	} else if (sbi->flags & HFSPLUS_SB_FORCE) {
+	} else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
 		/* nothing */
 	} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
 		printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
@@ -384,16 +385,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
384 "use the force option at your own risk, mounting read-only.\n"); 385 "use the force option at your own risk, mounting read-only.\n");
385 sb->s_flags |= MS_RDONLY; 386 sb->s_flags |= MS_RDONLY;
386 } 387 }
387 sbi->flags &= ~HFSPLUS_SB_FORCE;
388 388
389 /* Load metadata objects (B*Trees) */ 389 /* Load metadata objects (B*Trees) */
390 HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); 390 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
391 if (!HFSPLUS_SB(sb).ext_tree) { 391 if (!sbi->ext_tree) {
392 printk(KERN_ERR "hfs: failed to load extents file\n"); 392 printk(KERN_ERR "hfs: failed to load extents file\n");
393 goto cleanup; 393 goto cleanup;
394 } 394 }
395 HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); 395 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
396 if (!HFSPLUS_SB(sb).cat_tree) { 396 if (!sbi->cat_tree) {
397 printk(KERN_ERR "hfs: failed to load catalog file\n"); 397 printk(KERN_ERR "hfs: failed to load catalog file\n");
398 goto cleanup; 398 goto cleanup;
399 } 399 }
@@ -404,7 +404,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		err = PTR_ERR(inode);
 		goto cleanup;
 	}
-	HFSPLUS_SB(sb).alloc_file = inode;
+	sbi->alloc_file = inode;
 
 	/* Load the root directory */
 	root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
@@ -423,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 
 	str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
 	str.name = HFSP_HIDDENDIR_NAME;
-	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+	hfs_find_init(sbi->cat_tree, &fd);
 	hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
 	if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
 		hfs_find_exit(&fd);
@@ -434,7 +434,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 			err = PTR_ERR(inode);
 			goto cleanup;
 		}
-		HFSPLUS_SB(sb).hidden_dir = inode;
+		sbi->hidden_dir = inode;
 	} else
 		hfs_find_exit(&fd);
 
@@ -449,15 +449,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	be32_add_cpu(&vhdr->write_count, 1);
 	vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
 	vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
-	mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-	sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+	mark_buffer_dirty(sbi->s_vhbh);
+	sync_dirty_buffer(sbi->s_vhbh);
 
-	if (!HFSPLUS_SB(sb).hidden_dir) {
+	if (!sbi->hidden_dir) {
 		printk(KERN_DEBUG "hfs: create hidden dir...\n");
-		HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
-		hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode,
-				   &str, HFSPLUS_SB(sb).hidden_dir);
-		mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir);
+
+		mutex_lock(&sbi->vh_mutex);
+		sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
+		hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
+				   &str, sbi->hidden_dir);
+		mutex_unlock(&sbi->vh_mutex);
+
+		mark_inode_dirty(sbi->hidden_dir);
 	}
 out:
 	unload_nls(sbi->nls);
@@ -486,7 +490,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
 
 static void hfsplus_destroy_inode(struct inode *inode)
 {
-	kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode));
+	kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
 }
 
 #define HFSPLUS_INODE_SIZE	sizeof(struct hfsplus_inode_info)
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 628ccf6fa402..b66d67de882c 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
 int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
 {
 	const hfsplus_unichr *ip;
-	struct nls_table *nls = HFSPLUS_SB(sb).nls;
+	struct nls_table *nls = HFSPLUS_SB(sb)->nls;
 	u8 *op;
 	u16 cc, c0, c1;
 	u16 *ce1, *ce2;
@@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
 	ustrlen = be16_to_cpu(ustr->length);
 	len = *len_p;
 	ce1 = NULL;
-	compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 
 	while (ustrlen > 0) {
 		c0 = be16_to_cpu(*ip++);
@@ -246,7 +246,7 @@ out:
 static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
 			      wchar_t *uc)
 {
-	int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc);
+	int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
 	if (size <= 0) {
 		*uc = '?';
 		size = 1;
@@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
 	u16 *dstr, outlen = 0;
 	wchar_t c;
 
-	decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 	while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
 		size = asc2unichar(sb, astr, len, &c);
 
@@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
 	wchar_t c;
 	u16 c2;
 
-	casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-	decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 	hash = init_name_hash();
 	astr = str->name;
 	len = str->len;
@@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
 	u16 c1, c2;
 	wchar_t c;
 
-	casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-	decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 	astr1 = s1->name;
 	len1 = s1->len;
 	astr2 = s2->name;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index bed78ac8f6d1..8972c20b3216 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
 	*start = 0;
 	*size = sb->s_bdev->bd_inode->i_size >> 9;
 
-	if (HFSPLUS_SB(sb).session >= 0) {
-		te.cdte_track = HFSPLUS_SB(sb).session;
+	if (HFSPLUS_SB(sb)->session >= 0) {
+		te.cdte_track = HFSPLUS_SB(sb)->session;
 		te.cdte_format = CDROM_LBA;
 		res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
 		if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
@@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
 /* Takes in super block, returns true if good data read */
 int hfsplus_read_wrapper(struct super_block *sb)
 {
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct buffer_head *bh;
 	struct hfsplus_vh *vhdr;
 	struct hfsplus_wd wd;
@@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
 		if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
 			break;
 		if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
-			HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX;
+			set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
 			break;
 		}
 		brelse(bh);
@@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb)
 	if (blocksize < HFSPLUS_SECTOR_SIZE ||
 	    ((blocksize - 1) & blocksize))
 		return -EINVAL;
-	HFSPLUS_SB(sb).alloc_blksz = blocksize;
-	HFSPLUS_SB(sb).alloc_blksz_shift = 0;
+	sbi->alloc_blksz = blocksize;
+	sbi->alloc_blksz_shift = 0;
 	while ((blocksize >>= 1) != 0)
-		HFSPLUS_SB(sb).alloc_blksz_shift++;
-	blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE);
+		sbi->alloc_blksz_shift++;
+	blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
 
 	/* align block size to block offset */
 	while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
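The shift loop computes floor(log2(alloc_blksz)), and the preceding power-of-two check guarantees the result is exact, so this is precisely what ilog2() from <linux/log2.h> would return. A self-contained equivalent of the loop:

	#include <linux/types.h>

	/* Equivalent of the shift loop above; assumes blocksize is a power of two. */
	static unsigned int shift_of(u32 blocksize)
	{
		unsigned int shift = 0;

		while ((blocksize >>= 1) != 0)
			shift++;
		return shift;		/* e.g. 4096 -> 12 */
	}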
@@ -158,23 +159,26 @@ int hfsplus_read_wrapper(struct super_block *sb)
 		return -EINVAL;
 	}
 
-	HFSPLUS_SB(sb).blockoffset = part_start >>
-			(sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
-	HFSPLUS_SB(sb).sect_count = part_size;
-	HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift -
-			sb->s_blocksize_bits;
+	sbi->blockoffset =
+		part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
+	sbi->sect_count = part_size;
+	sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
 
 	bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
 	if (!bh)
 		return -EIO;
 
 	/* should still be the same... */
-	if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ?
-				cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) :
-				cpu_to_be16(HFSPLUS_VOLHEAD_SIG)))
-		goto error;
-	HFSPLUS_SB(sb).s_vhbh = bh;
-	HFSPLUS_SB(sb).s_vhdr = vhdr;
+	if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
+		if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
+			goto error;
+	} else {
+		if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
+			goto error;
+	}
+
+	sbi->s_vhbh = bh;
+	sbi->s_vhdr = vhdr;
 
 	return 0;
 error:
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 56bd15c5bf6c..63b6f5632318 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,6 +1,7 @@
1config HPFS_FS 1config HPFS_FS
2 tristate "OS/2 HPFS file system support" 2 tristate "OS/2 HPFS file system support"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # nontrivial to fix
4 help 5 help
5 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS 6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
6 is the file system used for organizing files on OS/2 hard disk 7 is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0e8014ea6b94..262419f83d80 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1371,6 +1371,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1371 1371
1372 if (!compat && !ro && !incompat) 1372 if (!compat && !ro && !incompat)
1373 return 1; 1373 return 1;
1374 /* Load journal superblock if it is not loaded yet. */
1375 if (journal->j_format_version == 0 &&
1376 journal_get_superblock(journal) != 0)
1377 return 0;
1374 if (journal->j_format_version == 1) 1378 if (journal->j_format_version == 1)
1375 return 0; 1379 return 0;
1376 1380
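The added lines make the feature query self-sufficient: callers no longer have to guarantee the journal superblock was read first. The guard is the usual lazy-initialization shape, sketched below with illustrative names only (not the jbd2 API):

	struct example_journal {
		int format_version;	/* 0 means "superblock not read yet" */
	};

	static int example_load_superblock(struct example_journal *j);	/* assumed loader */

	static int example_check_features(struct example_journal *j)
	{
		if (j->format_version == 0 &&
		    example_load_superblock(j) != 0)
			return 0;	/* could not load: report "not supported" */
		/* ... real feature checks would follow here ... */
		return 1;
	}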
diff --git a/fs/libfs.c b/fs/libfs.c
index 0a9da95317f7..62baa0387d6e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -913,6 +913,35 @@ int generic_file_fsync(struct file *file, int datasync)
913} 913}
914EXPORT_SYMBOL(generic_file_fsync); 914EXPORT_SYMBOL(generic_file_fsync);
915 915
916/**
917 * generic_check_addressable - Check addressability of file system
918 * @blocksize_bits: log of file system block size
919 * @num_blocks: number of blocks in file system
920 *
921 * Determine whether a file system with @num_blocks blocks (and a
922 * block size of 2**@blocksize_bits) is addressable by the sector_t
923 * and page cache of the system. Return 0 if so and -EFBIG otherwise.
924 */
925int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
926{
927 u64 last_fs_block = num_blocks - 1;
928 u64 last_fs_page =
929 last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
930
931 if (unlikely(num_blocks == 0))
932 return 0;
933
934 if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
935 return -EINVAL;
936
937 if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
938 (last_fs_page > (pgoff_t)(~0ULL))) {
939 return -EFBIG;
940 }
941 return 0;
942}
943EXPORT_SYMBOL(generic_check_addressable);
944
916/* 945/*
917 * No-op implementation of ->fsync for in-memory filesystems. 946 * No-op implementation of ->fsync for in-memory filesystems.
918 */ 947 */
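To see the limit generic_check_addressable() enforces, take 4 KiB blocks (blocksize_bits = 12) on a system with a 32-bit sector_t: the last addressable 512-byte sector is 2^32 - 1, so last_fs_block may not exceed (2^32 - 1) >> (12 - 9). A worked check under those assumptions:

	/* blocksize_bits = 12, num_blocks = 1 << 29 (exactly 2 TiB of 4 KiB blocks):
	 *   last_fs_block = 2^29 - 1
	 *   limit         = (2^32 - 1) >> 3 = 2^29 - 1      -> fits, returns 0
	 * One more block (num_blocks = 2^29 + 1) pushes last_fs_block past the
	 * limit and the function returns -EFBIG.
	 */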
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index f7e13db613cb..b950415d7c43 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -1,6 +1,7 @@
1config NFS_FS 1config NFS_FS
2 tristate "NFS client support" 2 tristate "NFS client support"
3 depends on INET && FILE_LOCKING 3 depends on INET && FILE_LOCKING
4 depends on BKL # fix as soon as lockd is done
4 select LOCKD 5 select LOCKD
5 select SUNRPC 6 select SUNRPC
6 select NFS_ACL_SUPPORT if NFS_V3_ACL 7 select NFS_ACL_SUPPORT if NFS_V3_ACL
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 4264377552e2..7cf4ddafb4ab 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -2,6 +2,7 @@ config NFSD
2 tristate "NFS server support" 2 tristate "NFS server support"
3 depends on INET 3 depends on INET
4 depends on FILE_LOCKING 4 depends on FILE_LOCKING
5 depends on BKL # fix as soon as lockd is done
5 select LOCKD 6 select LOCKD
6 select SUNRPC 7 select SUNRPC
7 select EXPORTFS 8 select EXPORTFS
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index cdfb8c6a4206..c16f8d8331b5 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -196,8 +196,6 @@ fh_lock(struct svc_fh *fhp)
196static inline void 196static inline void
197fh_unlock(struct svc_fh *fhp) 197fh_unlock(struct svc_fh *fhp)
198{ 198{
199 BUG_ON(!fhp->fh_dentry);
200
201 if (fhp->fh_locked) { 199 if (fhp->fh_locked) {
202 fill_post_wcc(fhp); 200 fill_post_wcc(fhp);
203 mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); 201 mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 22c629eedd82..b388443c3a09 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,4 +3,4 @@ config FSNOTIFY
3 3
4source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
5source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
-source "fs/notify/fanotify/Kconfig"
+#source "fs/notify/fanotify/Kconfig"
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0de69c9a08be..5cfeee118158 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -883,8 +883,8 @@ struct ocfs2_write_ctxt {
883 * out in so that future reads from that region will get 883 * out in so that future reads from that region will get
884 * zero's. 884 * zero's.
885 */ 885 */
-	struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
 	unsigned int w_num_pages;
+	struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
 	struct page *w_target_page;
888 struct page *w_target_page; 888 struct page *w_target_page;
889 889
890 /* 890 /*
@@ -1642,7 +1642,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1642 return ret; 1642 return ret;
1643} 1643}
1644 1644
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+			     struct address_space *mapping,
1646 loff_t pos, unsigned len, unsigned flags, 1647 loff_t pos, unsigned len, unsigned flags,
1647 struct page **pagep, void **fsdata, 1648 struct page **pagep, void **fsdata,
1648 struct buffer_head *di_bh, struct page *mmap_page) 1649 struct buffer_head *di_bh, struct page *mmap_page)
@@ -1692,7 +1693,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1692 mlog_errno(ret); 1693 mlog_errno(ret);
1693 goto out; 1694 goto out;
1694 } else if (ret == 1) { 1695 } else if (ret == 1) {
-		ret = ocfs2_refcount_cow(inode, di_bh,
+		ret = ocfs2_refcount_cow(inode, filp, di_bh,
1696 wc->w_cpos, wc->w_clen, UINT_MAX); 1697 wc->w_cpos, wc->w_clen, UINT_MAX);
1697 if (ret) { 1698 if (ret) {
1698 mlog_errno(ret); 1699 mlog_errno(ret);
@@ -1854,7 +1855,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1854 */ 1855 */
1855 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1856 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1856 1857
-	ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+	ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
1858 fsdata, di_bh, NULL); 1859 fsdata, di_bh, NULL);
1859 if (ret) { 1860 if (ret) {
1860 mlog_errno(ret); 1861 mlog_errno(ret);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index c48e93ffc513..7606f663da6d 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -48,7 +48,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
48 loff_t pos, unsigned len, unsigned copied, 48 loff_t pos, unsigned len, unsigned copied,
49 struct page *page, void *fsdata); 49 struct page *page, void *fsdata);
50 50
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+			     struct address_space *mapping,
52 loff_t pos, unsigned len, unsigned flags, 53 loff_t pos, unsigned len, unsigned flags,
53 struct page **pagep, void **fsdata, 54 struct page **pagep, void **fsdata,
54 struct buffer_head *di_bh, struct page *mmap_page); 55 struct buffer_head *di_bh, struct page *mmap_page);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 41d5f1f92d56..52c7557f3e25 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -62,10 +62,51 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
62static LIST_HEAD(o2hb_node_events); 62static LIST_HEAD(o2hb_node_events);
63static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); 63static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
64 64
65/*
66 * In global heartbeat, we maintain a series of region bitmaps.
67 * - o2hb_region_bitmap allows us to limit the region number to max region.
68 * - o2hb_live_region_bitmap tracks live regions (seen steady iterations).
69 * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
70 * heartbeat on it.
71 * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
72 */
73static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
74static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
75static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
76static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
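A region's number moves through these bitmaps in a fixed order; the comment block below condenses the transitions that the rest of this patch implements (the real code adds locking and the exact conditions for each step):

	/*
	 * Condensed lifecycle of one region number `num`:
	 *
	 *   set_bit(num, o2hb_region_bitmap);          region created (make_item)
	 *   set_bit(num, o2hb_live_region_bitmap);     heartbeat reached steady state
	 *   set_bit(num, o2hb_quorum_region_bitmap);   all live nodes seen on region
	 *   set_bit(num, o2hb_failed_region_bitmap);   region saw an I/O timeout
	 *
	 * and clear_bit() on each as the region is torn down (drop_item).
	 */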
77
78#define O2HB_DB_TYPE_LIVENODES 0
79#define O2HB_DB_TYPE_LIVEREGIONS 1
80#define O2HB_DB_TYPE_QUORUMREGIONS 2
81#define O2HB_DB_TYPE_FAILEDREGIONS 3
82#define O2HB_DB_TYPE_REGION_LIVENODES 4
83#define O2HB_DB_TYPE_REGION_NUMBER 5
84#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
85struct o2hb_debug_buf {
86 int db_type;
87 int db_size;
88 int db_len;
89 void *db_data;
90};
91
92static struct o2hb_debug_buf *o2hb_db_livenodes;
93static struct o2hb_debug_buf *o2hb_db_liveregions;
94static struct o2hb_debug_buf *o2hb_db_quorumregions;
95static struct o2hb_debug_buf *o2hb_db_failedregions;
96
65#define O2HB_DEBUG_DIR "o2hb" 97#define O2HB_DEBUG_DIR "o2hb"
66#define O2HB_DEBUG_LIVENODES "livenodes" 98#define O2HB_DEBUG_LIVENODES "livenodes"
99#define O2HB_DEBUG_LIVEREGIONS "live_regions"
100#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions"
101#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
102#define O2HB_DEBUG_REGION_NUMBER "num"
103#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
104
67static struct dentry *o2hb_debug_dir; 105static struct dentry *o2hb_debug_dir;
68static struct dentry *o2hb_debug_livenodes; 106static struct dentry *o2hb_debug_livenodes;
107static struct dentry *o2hb_debug_liveregions;
108static struct dentry *o2hb_debug_quorumregions;
109static struct dentry *o2hb_debug_failedregions;
69 110
70static LIST_HEAD(o2hb_all_regions); 111static LIST_HEAD(o2hb_all_regions);
71 112
@@ -77,7 +118,19 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
77 118
78#define O2HB_DEFAULT_BLOCK_BITS 9 119#define O2HB_DEFAULT_BLOCK_BITS 9
79 120
121enum o2hb_heartbeat_modes {
122 O2HB_HEARTBEAT_LOCAL = 0,
123 O2HB_HEARTBEAT_GLOBAL,
124 O2HB_HEARTBEAT_NUM_MODES,
125};
126
127char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
128 "local", /* O2HB_HEARTBEAT_LOCAL */
129 "global", /* O2HB_HEARTBEAT_GLOBAL */
130};
131
80unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; 132unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
133unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
81 134
82/* Only sets a new threshold if there are no active regions. 135/* Only sets a new threshold if there are no active regions.
83 * 136 *
@@ -94,6 +147,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
94 } 147 }
95} 148}
96 149
150static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
151{
152 int ret = -1;
153
154 if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
155 spin_lock(&o2hb_live_lock);
156 if (list_empty(&o2hb_all_regions)) {
157 o2hb_heartbeat_mode = hb_mode;
158 ret = 0;
159 }
160 spin_unlock(&o2hb_live_lock);
161 }
162
163 return ret;
164}
165
97struct o2hb_node_event { 166struct o2hb_node_event {
98 struct list_head hn_item; 167 struct list_head hn_item;
99 enum o2hb_callback_type hn_event_type; 168 enum o2hb_callback_type hn_event_type;
@@ -135,6 +204,18 @@ struct o2hb_region {
135 struct block_device *hr_bdev; 204 struct block_device *hr_bdev;
136 struct o2hb_disk_slot *hr_slots; 205 struct o2hb_disk_slot *hr_slots;
137 206
207 /* live node map of this region */
208 unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
209 unsigned int hr_region_num;
210
211 struct dentry *hr_debug_dir;
212 struct dentry *hr_debug_livenodes;
213 struct dentry *hr_debug_regnum;
214 struct dentry *hr_debug_elapsed_time;
215 struct o2hb_debug_buf *hr_db_livenodes;
216 struct o2hb_debug_buf *hr_db_regnum;
217 struct o2hb_debug_buf *hr_db_elapsed_time;
218
138 /* let the person setting up hb wait for it to return until it 219 /* let the person setting up hb wait for it to return until it
139 * has reached a 'steady' state. This will be fixed when we have 220 * has reached a 'steady' state. This will be fixed when we have
140 * a more complete api that doesn't lead to this sort of fragility. */ 221 * a more complete api that doesn't lead to this sort of fragility. */
@@ -163,8 +244,19 @@ struct o2hb_bio_wait_ctxt {
163 int wc_error; 244 int wc_error;
164}; 245};
165 246
247static int o2hb_pop_count(void *map, int count)
248{
249 int i = -1, pop = 0;
250
251 while ((i = find_next_bit(map, count, i + 1)) < count)
252 pop++;
253 return pop;
254}
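o2hb_pop_count() is just a population count over the first `count` bits, which the kernel already provides as bitmap_weight() in <linux/bitmap.h>; the open-coded loop is equivalent to:

	#include <linux/bitmap.h>

	/* Drop-in equivalent of o2hb_pop_count(). */
	static int example_pop_count(void *map, int count)
	{
		return bitmap_weight(map, count);
	}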
255
166static void o2hb_write_timeout(struct work_struct *work) 256static void o2hb_write_timeout(struct work_struct *work)
167{ 257{
258 int failed, quorum;
259 unsigned long flags;
168 struct o2hb_region *reg = 260 struct o2hb_region *reg =
169 container_of(work, struct o2hb_region, 261 container_of(work, struct o2hb_region,
170 hr_write_timeout_work.work); 262 hr_write_timeout_work.work);
@@ -172,6 +264,28 @@ static void o2hb_write_timeout(struct work_struct *work)
172 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " 264 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
173 "milliseconds\n", reg->hr_dev_name, 265 "milliseconds\n", reg->hr_dev_name,
174 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 266 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
267
268 if (o2hb_global_heartbeat_active()) {
269 spin_lock_irqsave(&o2hb_live_lock, flags);
270 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
271 set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
272 failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
273 O2NM_MAX_REGIONS);
274 quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
275 O2NM_MAX_REGIONS);
276 spin_unlock_irqrestore(&o2hb_live_lock, flags);
277
278 mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
279 quorum, failed);
280
281 /*
282 * Fence if the number of failed regions >= half the number
283 * of quorum regions
284 */
285 if ((failed << 1) < quorum)
286 return;
287 }
288
175 o2quo_disk_timeout(); 289 o2quo_disk_timeout();
176} 290}
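The fencing guard reads inverted at first glance: `(failed << 1) < quorum` means strictly fewer than half the quorum regions have failed, and only then does the function return early without fencing. For example, with quorum = 4: failed = 1 gives 2 < 4, so the timeout is ignored; failed = 2 gives 4 < 4, which is false, so o2quo_disk_timeout() runs and the node fences itself.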
177 291
@@ -180,6 +294,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
180 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", 294 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
181 O2HB_MAX_WRITE_TIMEOUT_MS); 295 O2HB_MAX_WRITE_TIMEOUT_MS);
182 296
297 if (o2hb_global_heartbeat_active()) {
298 spin_lock(&o2hb_live_lock);
299 clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
300 spin_unlock(&o2hb_live_lock);
301 }
183 cancel_delayed_work(&reg->hr_write_timeout_work); 302 cancel_delayed_work(&reg->hr_write_timeout_work);
184 reg->hr_last_timeout_start = jiffies; 303 reg->hr_last_timeout_start = jiffies;
185 schedule_delayed_work(&reg->hr_write_timeout_work, 304 schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -513,6 +632,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
513{ 632{
514 assert_spin_locked(&o2hb_live_lock); 633 assert_spin_locked(&o2hb_live_lock);
515 634
635 BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
636
516 event->hn_event_type = type; 637 event->hn_event_type = type;
517 event->hn_node = node; 638 event->hn_node = node;
518 event->hn_node_num = node_num; 639 event->hn_node_num = node_num;
@@ -554,6 +675,35 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
554 o2nm_node_put(node); 675 o2nm_node_put(node);
555} 676}
556 677
678static void o2hb_set_quorum_device(struct o2hb_region *reg,
679 struct o2hb_disk_slot *slot)
680{
681 assert_spin_locked(&o2hb_live_lock);
682
683 if (!o2hb_global_heartbeat_active())
684 return;
685
686 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
687 return;
688
689 /*
690 * A region can be added to the quorum only when it sees all
691 * live nodes heartbeat on it. In other words, the region has been
692 * added to all nodes.
693 */
694 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
695 sizeof(o2hb_live_node_bitmap)))
696 return;
697
698 if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
699 return;
700
701 printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
702 config_item_name(&reg->hr_item));
703
704 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
705}
706
557static int o2hb_check_slot(struct o2hb_region *reg, 707static int o2hb_check_slot(struct o2hb_region *reg,
558 struct o2hb_disk_slot *slot) 708 struct o2hb_disk_slot *slot)
559{ 709{
@@ -565,14 +715,22 @@ static int o2hb_check_slot(struct o2hb_region *reg,
 	u64 cputime;
 	unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
 	unsigned int slot_dead_ms;
+	int tmp;
 
 	memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
 
-	/* Is this correct? Do we assume that the node doesn't exist
-	 * if we're not configured for him? */
+	/*
+	 * If a node is no longer configured but is still in the livemap, we
+	 * may need to clear that bit from the livemap.
+	 */
 	node = o2nm_get_node_by_num(slot->ds_node_num);
-	if (!node)
-		return 0;
+	if (!node) {
+		spin_lock(&o2hb_live_lock);
+		tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+		spin_unlock(&o2hb_live_lock);
+		if (!tmp)
+			return 0;
+	}
576 734
577 if (!o2hb_verify_crc(reg, hb_block)) { 735 if (!o2hb_verify_crc(reg, hb_block)) {
578 /* all paths from here will drop o2hb_live_lock for 736 /* all paths from here will drop o2hb_live_lock for
@@ -639,8 +797,12 @@ fire_callbacks:
639 mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", 797 mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
640 slot->ds_node_num, (long long)slot->ds_last_generation); 798 slot->ds_node_num, (long long)slot->ds_last_generation);
641 799
800 set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
801
642 /* first on the list generates a callback */ 802 /* first on the list generates a callback */
643 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { 803 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
804 mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
805 "bitmap\n", slot->ds_node_num);
644 set_bit(slot->ds_node_num, o2hb_live_node_bitmap); 806 set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
645 807
646 o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, 808 o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
@@ -684,13 +846,18 @@ fire_callbacks:
684 mlog(ML_HEARTBEAT, "Node %d left my region\n", 846 mlog(ML_HEARTBEAT, "Node %d left my region\n",
685 slot->ds_node_num); 847 slot->ds_node_num);
686 848
849 clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
850
687 /* last off the live_slot generates a callback */ 851 /* last off the live_slot generates a callback */
688 list_del_init(&slot->ds_live_item); 852 list_del_init(&slot->ds_live_item);
689 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { 853 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
854 mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
855 "nodes bitmap\n", slot->ds_node_num);
690 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); 856 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
691 857
-			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
-					      slot->ds_node_num);
+			/* node can be null */
+			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
+					      node, slot->ds_node_num);
694 861
695 changed = 1; 862 changed = 1;
696 } 863 }
@@ -706,11 +873,14 @@ fire_callbacks:
706 slot->ds_equal_samples = 0; 873 slot->ds_equal_samples = 0;
707 } 874 }
708out: 875out:
876 o2hb_set_quorum_device(reg, slot);
877
709 spin_unlock(&o2hb_live_lock); 878 spin_unlock(&o2hb_live_lock);
710 879
711 o2hb_run_event_list(&event); 880 o2hb_run_event_list(&event);
712 881
-	o2nm_node_put(node);
+	if (node)
+		o2nm_node_put(node);
 	return changed;
715} 885}
716 886
@@ -737,6 +907,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
737{ 907{
738 int i, ret, highest_node, change = 0; 908 int i, ret, highest_node, change = 0;
739 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 909 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
910 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
740 struct o2hb_bio_wait_ctxt write_wc; 911 struct o2hb_bio_wait_ctxt write_wc;
741 912
742 ret = o2nm_configured_node_map(configured_nodes, 913 ret = o2nm_configured_node_map(configured_nodes,
@@ -746,6 +917,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
746 return ret; 917 return ret;
747 } 918 }
748 919
920 /*
921 * If a node is not configured but is in the livemap, we still need
922 * to read the slot so as to be able to remove it from the livemap.
923 */
924 o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
925 i = -1;
926 while ((i = find_next_bit(live_node_bitmap,
927 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
928 set_bit(i, configured_nodes);
929 }
930
749 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); 931 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
750 if (highest_node >= O2NM_MAX_NODES) { 932 if (highest_node >= O2NM_MAX_NODES) {
751 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); 933 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
@@ -917,21 +1099,59 @@ static int o2hb_thread(void *data)
917#ifdef CONFIG_DEBUG_FS 1099#ifdef CONFIG_DEBUG_FS
918static int o2hb_debug_open(struct inode *inode, struct file *file) 1100static int o2hb_debug_open(struct inode *inode, struct file *file)
919{ 1101{
1102 struct o2hb_debug_buf *db = inode->i_private;
1103 struct o2hb_region *reg;
920 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 1104 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
921 char *buf = NULL; 1105 char *buf = NULL;
922 int i = -1; 1106 int i = -1;
923 int out = 0; 1107 int out = 0;
924 1108
1109 /* max_nodes should be the largest bitmap we pass here */
1110 BUG_ON(sizeof(map) < db->db_size);
1111
925 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 1112 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
926 if (!buf) 1113 if (!buf)
927 goto bail; 1114 goto bail;
928 1115
929 o2hb_fill_node_map(map, sizeof(map)); 1116 switch (db->db_type) {
1117 case O2HB_DB_TYPE_LIVENODES:
1118 case O2HB_DB_TYPE_LIVEREGIONS:
1119 case O2HB_DB_TYPE_QUORUMREGIONS:
1120 case O2HB_DB_TYPE_FAILEDREGIONS:
1121 spin_lock(&o2hb_live_lock);
1122 memcpy(map, db->db_data, db->db_size);
1123 spin_unlock(&o2hb_live_lock);
1124 break;
1125
1126 case O2HB_DB_TYPE_REGION_LIVENODES:
1127 spin_lock(&o2hb_live_lock);
1128 reg = (struct o2hb_region *)db->db_data;
1129 memcpy(map, reg->hr_live_node_bitmap, db->db_size);
1130 spin_unlock(&o2hb_live_lock);
1131 break;
1132
1133 case O2HB_DB_TYPE_REGION_NUMBER:
1134 reg = (struct o2hb_region *)db->db_data;
1135 out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
1136 reg->hr_region_num);
1137 goto done;
1138
1139 case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
1140 reg = (struct o2hb_region *)db->db_data;
1141 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
1142 jiffies_to_msecs(jiffies -
1143 reg->hr_last_timeout_start));
1144 goto done;
1145
1146 default:
1147 goto done;
1148 }
930 1149
-	while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+	while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
932 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); 1151 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
933 out += snprintf(buf + out, PAGE_SIZE - out, "\n"); 1152 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
934 1153
1154done:
935 i_size_write(inode, out); 1155 i_size_write(inode, out);
936 1156
937 file->private_data = buf; 1157 file->private_data = buf;
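The open handler renders the complete answer into a buffer once, sizes the inode to match, and parks the buffer in file->private_data, so the bitmaps are copied under the spinlock exactly once per open and reads then serve from that snapshot. A minimal sketch of the same debugfs pattern, with hypothetical names (snapshot_value() is assumed, and the buffer is freed in the matching release handler):

	#include <linux/fs.h>
	#include <linux/slab.h>

	static int snapshot_value(void);	/* hypothetical */

	static int example_debug_open(struct inode *inode, struct file *file)
	{
		char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		int out;

		if (!buf)
			return -ENOMEM;
		out = snprintf(buf, PAGE_SIZE, "%d\n", snapshot_value());
		i_size_write(inode, out);	/* read() sees a fixed-size file */
		file->private_data = buf;	/* freed in the release handler */
		return 0;
	}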
@@ -978,10 +1198,104 @@ static const struct file_operations o2hb_debug_fops = {
978 1198
979void o2hb_exit(void) 1199void o2hb_exit(void)
980{ 1200{
-	if (o2hb_debug_livenodes)
-		debugfs_remove(o2hb_debug_livenodes);
-	if (o2hb_debug_dir)
-		debugfs_remove(o2hb_debug_dir);
+	kfree(o2hb_db_livenodes);
+	kfree(o2hb_db_liveregions);
+	kfree(o2hb_db_quorumregions);
+	kfree(o2hb_db_failedregions);
+	debugfs_remove(o2hb_debug_failedregions);
+	debugfs_remove(o2hb_debug_quorumregions);
+	debugfs_remove(o2hb_debug_liveregions);
+	debugfs_remove(o2hb_debug_livenodes);
+	debugfs_remove(o2hb_debug_dir);
+}
1211
1212static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
1213 struct o2hb_debug_buf **db, int db_len,
1214 int type, int size, int len, void *data)
1215{
1216 *db = kmalloc(db_len, GFP_KERNEL);
1217 if (!*db)
1218 return NULL;
1219
1220 (*db)->db_type = type;
1221 (*db)->db_size = size;
1222 (*db)->db_len = len;
1223 (*db)->db_data = data;
1224
1225 return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
1226 &o2hb_debug_fops);
1227}
1228
1229static int o2hb_debug_init(void)
1230{
1231 int ret = -ENOMEM;
1232
1233 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
1234 if (!o2hb_debug_dir) {
1235 mlog_errno(ret);
1236 goto bail;
1237 }
1238
1239 o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
1240 o2hb_debug_dir,
1241 &o2hb_db_livenodes,
1242 sizeof(*o2hb_db_livenodes),
1243 O2HB_DB_TYPE_LIVENODES,
1244 sizeof(o2hb_live_node_bitmap),
1245 O2NM_MAX_NODES,
1246 o2hb_live_node_bitmap);
1247 if (!o2hb_debug_livenodes) {
1248 mlog_errno(ret);
1249 goto bail;
1250 }
1251
1252 o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
1253 o2hb_debug_dir,
1254 &o2hb_db_liveregions,
1255 sizeof(*o2hb_db_liveregions),
1256 O2HB_DB_TYPE_LIVEREGIONS,
1257 sizeof(o2hb_live_region_bitmap),
1258 O2NM_MAX_REGIONS,
1259 o2hb_live_region_bitmap);
1260 if (!o2hb_debug_liveregions) {
1261 mlog_errno(ret);
1262 goto bail;
1263 }
1264
1265 o2hb_debug_quorumregions =
1266 o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
1267 o2hb_debug_dir,
1268 &o2hb_db_quorumregions,
1269 sizeof(*o2hb_db_quorumregions),
1270 O2HB_DB_TYPE_QUORUMREGIONS,
1271 sizeof(o2hb_quorum_region_bitmap),
1272 O2NM_MAX_REGIONS,
1273 o2hb_quorum_region_bitmap);
1274 if (!o2hb_debug_quorumregions) {
1275 mlog_errno(ret);
1276 goto bail;
1277 }
1278
1279 o2hb_debug_failedregions =
1280 o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
1281 o2hb_debug_dir,
1282 &o2hb_db_failedregions,
1283 sizeof(*o2hb_db_failedregions),
1284 O2HB_DB_TYPE_FAILEDREGIONS,
1285 sizeof(o2hb_failed_region_bitmap),
1286 O2NM_MAX_REGIONS,
1287 o2hb_failed_region_bitmap);
1288 if (!o2hb_debug_failedregions) {
1289 mlog_errno(ret);
1290 goto bail;
1291 }
1292
1293 ret = 0;
1294bail:
1295 if (ret)
1296 o2hb_exit();
1297
1298 return ret;
985} 1299}
986 1300
987int o2hb_init(void) 1301int o2hb_init(void)
@@ -997,24 +1311,12 @@ int o2hb_init(void)
 	INIT_LIST_HEAD(&o2hb_node_events);
 
 	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+	memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
+	memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
+	memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
+	memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
 
-	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
-	if (!o2hb_debug_dir) {
-		mlog_errno(-ENOMEM);
-		return -ENOMEM;
-	}
-
-	o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
-						   S_IFREG|S_IRUSR,
-						   o2hb_debug_dir, NULL,
-						   &o2hb_debug_fops);
-	if (!o2hb_debug_livenodes) {
-		mlog_errno(-ENOMEM);
-		debugfs_remove(o2hb_debug_dir);
-		return -ENOMEM;
-	}
-
-	return 0;
+	return o2hb_debug_init();
 }
1019 1321
1020/* if we're already in a callback then we're already serialized by the sem */ 1322/* if we're already in a callback then we're already serialized by the sem */
@@ -1078,6 +1380,13 @@ static void o2hb_region_release(struct config_item *item)
1078 if (reg->hr_slots) 1380 if (reg->hr_slots)
1079 kfree(reg->hr_slots); 1381 kfree(reg->hr_slots);
1080 1382
1383 kfree(reg->hr_db_regnum);
1384 kfree(reg->hr_db_livenodes);
1385 debugfs_remove(reg->hr_debug_livenodes);
1386 debugfs_remove(reg->hr_debug_regnum);
1387 debugfs_remove(reg->hr_debug_elapsed_time);
1388 debugfs_remove(reg->hr_debug_dir);
1389
1081 spin_lock(&o2hb_live_lock); 1390 spin_lock(&o2hb_live_lock);
1082 list_del(&reg->hr_all_item); 1391 list_del(&reg->hr_all_item);
1083 spin_unlock(&o2hb_live_lock); 1392 spin_unlock(&o2hb_live_lock);
@@ -1441,6 +1750,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1441 /* Ok, we were woken. Make sure it wasn't by drop_item() */ 1750 /* Ok, we were woken. Make sure it wasn't by drop_item() */
1442 spin_lock(&o2hb_live_lock); 1751 spin_lock(&o2hb_live_lock);
1443 hb_task = reg->hr_task; 1752 hb_task = reg->hr_task;
1753 if (o2hb_global_heartbeat_active())
1754 set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
1444 spin_unlock(&o2hb_live_lock); 1755 spin_unlock(&o2hb_live_lock);
1445 1756
1446 if (hb_task) 1757 if (hb_task)
@@ -1448,6 +1759,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1448 else 1759 else
1449 ret = -EIO; 1760 ret = -EIO;
1450 1761
1762 if (hb_task && o2hb_global_heartbeat_active())
1763 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
1764 config_item_name(&reg->hr_item));
1765
1451out: 1766out:
1452 if (filp) 1767 if (filp)
1453 fput(filp); 1768 fput(filp);
@@ -1586,21 +1901,94 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
1586 : NULL; 1901 : NULL;
1587} 1902}
1588 1903
1904static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
1905{
1906 int ret = -ENOMEM;
1907
1908 reg->hr_debug_dir =
1909 debugfs_create_dir(config_item_name(&reg->hr_item), dir);
1910 if (!reg->hr_debug_dir) {
1911 mlog_errno(ret);
1912 goto bail;
1913 }
1914
1915 reg->hr_debug_livenodes =
1916 o2hb_debug_create(O2HB_DEBUG_LIVENODES,
1917 reg->hr_debug_dir,
1918 &(reg->hr_db_livenodes),
1919 sizeof(*(reg->hr_db_livenodes)),
1920 O2HB_DB_TYPE_REGION_LIVENODES,
1921 sizeof(reg->hr_live_node_bitmap),
1922 O2NM_MAX_NODES, reg);
1923 if (!reg->hr_debug_livenodes) {
1924 mlog_errno(ret);
1925 goto bail;
1926 }
1927
1928 reg->hr_debug_regnum =
1929 o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
1930 reg->hr_debug_dir,
1931 &(reg->hr_db_regnum),
1932 sizeof(*(reg->hr_db_regnum)),
1933 O2HB_DB_TYPE_REGION_NUMBER,
1934 0, O2NM_MAX_NODES, reg);
1935 if (!reg->hr_debug_regnum) {
1936 mlog_errno(ret);
1937 goto bail;
1938 }
1939
1940 reg->hr_debug_elapsed_time =
1941 o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
1942 reg->hr_debug_dir,
1943 &(reg->hr_db_elapsed_time),
1944 sizeof(*(reg->hr_db_elapsed_time)),
1945 O2HB_DB_TYPE_REGION_ELAPSED_TIME,
1946 0, 0, reg);
1947 if (!reg->hr_debug_elapsed_time) {
1948 mlog_errno(ret);
1949 goto bail;
1950 }
1951
1952 ret = 0;
1953bail:
1954 return ret;
1955}
1956
1589static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, 1957static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
1590 const char *name) 1958 const char *name)
1591{ 1959{
1592 struct o2hb_region *reg = NULL; 1960 struct o2hb_region *reg = NULL;
1961 int ret;
1593 1962
1594 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); 1963 reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
1595 if (reg == NULL) 1964 if (reg == NULL)
1596 return ERR_PTR(-ENOMEM); 1965 return ERR_PTR(-ENOMEM);
1597 1966
1598 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type); 1967 if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) {
 1968 kfree(reg);
 1969 return ERR_PTR(-ENAMETOOLONG);
 1970 }
1599 1969
1600 spin_lock(&o2hb_live_lock); 1970 spin_lock(&o2hb_live_lock);
1971 reg->hr_region_num = 0;
1972 if (o2hb_global_heartbeat_active()) {
1973 reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
1974 O2NM_MAX_REGIONS);
1975 if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
1976 spin_unlock(&o2hb_live_lock);
1977 return ERR_PTR(-EFBIG);
1978 }
1979 set_bit(reg->hr_region_num, o2hb_region_bitmap);
1980 }
1601 list_add_tail(&reg->hr_all_item, &o2hb_all_regions); 1981 list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
1602 spin_unlock(&o2hb_live_lock); 1982 spin_unlock(&o2hb_live_lock);
1603 1983
1984 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
1985
1986 ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
1987 if (ret) {
1988 config_item_put(&reg->hr_item);
1989 return ERR_PTR(ret);
1990 }
1991
1604 return &reg->hr_item; 1992 return &reg->hr_item;
1605} 1993}
1606 1994
@@ -1612,6 +2000,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1612 2000
1613 /* stop the thread when the user removes the region dir */ 2001 /* stop the thread when the user removes the region dir */
1614 spin_lock(&o2hb_live_lock); 2002 spin_lock(&o2hb_live_lock);
2003 if (o2hb_global_heartbeat_active()) {
2004 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2005 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2006 }
1615 hb_task = reg->hr_task; 2007 hb_task = reg->hr_task;
1616 reg->hr_task = NULL; 2008 reg->hr_task = NULL;
1617 spin_unlock(&o2hb_live_lock); 2009 spin_unlock(&o2hb_live_lock);
@@ -1628,6 +2020,9 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1628 wake_up(&o2hb_steady_queue); 2020 wake_up(&o2hb_steady_queue);
1629 } 2021 }
1630 2022
2023 if (o2hb_global_heartbeat_active())
2024 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2025 config_item_name(&reg->hr_item));
1631 config_item_put(item); 2026 config_item_put(item);
1632} 2027}
1633 2028
@@ -1688,6 +2083,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
1688 return count; 2083 return count;
1689} 2084}
1690 2085
2086static
2087ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
2088 char *page)
2089{
2090 return sprintf(page, "%s\n",
2091 o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
2092}
2093
2094static
2095ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
2096 const char *page, size_t count)
2097{
2098 unsigned int i;
2099 int ret;
2100 size_t len;
2101
2102 len = (page[count - 1] == '\n') ? count - 1 : count;
2103 if (!len)
2104 return -EINVAL;
2105
2106 for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
2107 if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
2108 continue;
2109
2110 ret = o2hb_global_hearbeat_mode_set(i);
2111 if (!ret)
2112 printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
2113 o2hb_heartbeat_mode_desc[i]);
2114 return count;
2115 }
2116
2117 return -EINVAL;
2118
2119}
2120
1691static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { 2121static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
1692 .attr = { .ca_owner = THIS_MODULE, 2122 .attr = { .ca_owner = THIS_MODULE,
1693 .ca_name = "dead_threshold", 2123 .ca_name = "dead_threshold",
@@ -1696,8 +2126,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold
1696 .store = o2hb_heartbeat_group_threshold_store, 2126 .store = o2hb_heartbeat_group_threshold_store,
1697}; 2127};
1698 2128
2129static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
2130 .attr = { .ca_owner = THIS_MODULE,
2131 .ca_name = "mode",
2132 .ca_mode = S_IRUGO | S_IWUSR },
2133 .show = o2hb_heartbeat_group_mode_show,
2134 .store = o2hb_heartbeat_group_mode_store,
2135};
2136
1699static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { 2137static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
1700 &o2hb_heartbeat_group_attr_threshold.attr, 2138 &o2hb_heartbeat_group_attr_threshold.attr,
2139 &o2hb_heartbeat_group_attr_mode.attr,
1701 NULL, 2140 NULL,
1702}; 2141};
1703 2142
@@ -1963,3 +2402,34 @@ void o2hb_stop_all_regions(void)
1963 spin_unlock(&o2hb_live_lock); 2402 spin_unlock(&o2hb_live_lock);
1964} 2403}
1965EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); 2404EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
2405
2406int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
2407{
2408 struct o2hb_region *reg;
2409 int numregs = 0;
2410 char *p;
2411
2412 spin_lock(&o2hb_live_lock);
2413
2414 p = region_uuids;
2415 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2416 mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
2417 if (numregs < max_regions) {
2418 memcpy(p, config_item_name(&reg->hr_item),
2419 O2HB_MAX_REGION_NAME_LEN);
2420 p += O2HB_MAX_REGION_NAME_LEN;
2421 }
2422 numregs++;
2423 }
2424
2425 spin_unlock(&o2hb_live_lock);
2426
2427 return numregs;
2428}
2429EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
2430
2431int o2hb_global_heartbeat_active(void)
2432{
2433 return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL);
2434}
2435EXPORT_SYMBOL(o2hb_global_heartbeat_active);
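The new o2hb_get_all_regions() export packs region names back-to-back in fixed-width O2HB_MAX_REGION_NAME_LEN slots with no NUL terminator when a name fills the slot, so consumers need a fixed stride and bounded prints. A minimal userspace sketch of the consumer side (walk_regions() is a hypothetical helper, not part of the patch; the constants are copied from the headers below):

#include <stdio.h>
#include <string.h>

#define O2HB_MAX_REGION_NAME_LEN 32
#define O2NM_MAX_REGIONS         32

static void walk_regions(const char *buf, int numregs)
{
	int i;

	/* o2hb_get_all_regions() returns the total region count even
	 * when it only copied max_regions names into the buffer. */
	if (numregs > O2NM_MAX_REGIONS)
		numregs = O2NM_MAX_REGIONS;

	for (i = 0; i < numregs; i++)
		printf("region %d: %.*s\n", i, O2HB_MAX_REGION_NAME_LEN,
		       buf + i * O2HB_MAX_REGION_NAME_LEN);
}

int main(void)
{
	char buf[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];

	memset(buf, 0, sizeof(buf));
	/* A full-width name has no trailing NUL, hence the "%.*s" above. */
	memcpy(buf, "0F8B6B63A1474B8CB3F4301B0A9CDEAF", 32);
	memcpy(buf + O2HB_MAX_REGION_NAME_LEN, "short-region", 12);
	walk_regions(buf, 2);
	return 0;
}
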
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 2f1649253b49..00ad8e8fea51 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -31,6 +31,8 @@
31 31
32#define O2HB_REGION_TIMEOUT_MS 2000 32#define O2HB_REGION_TIMEOUT_MS 2000
33 33
34#define O2HB_MAX_REGION_NAME_LEN 32
35
34/* number of changes to be seen as live */ 36/* number of changes to be seen as live */
35#define O2HB_LIVE_THRESHOLD 2 37#define O2HB_LIVE_THRESHOLD 2
36/* number of equal samples to be seen as dead */ 38/* number of equal samples to be seen as dead */
@@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num);
81int o2hb_check_node_heartbeating_from_callback(u8 node_num); 83int o2hb_check_node_heartbeating_from_callback(u8 node_num);
82int o2hb_check_local_node_heartbeating(void); 84int o2hb_check_local_node_heartbeating(void);
83void o2hb_stop_all_regions(void); 85void o2hb_stop_all_regions(void);
86int o2hb_get_all_regions(char *region_uuids, u8 numregions);
87int o2hb_global_heartbeat_active(void);
84 88
85#endif /* O2CLUSTER_HEARTBEAT_H */ 89#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index fd96e2a2fa56..ea2ed9f56c94 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,7 +119,8 @@
119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ 121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ 122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
123#define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */
123 124
124#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 125#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
125#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) 126#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index ed0c9f367fed..bb240647ca5f 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -711,6 +711,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group,
711 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); 711 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
712 spin_lock_init(&node->nd_lock); 712 spin_lock_init(&node->nd_lock);
713 713
714 mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
715
714 return &node->nd_item; 716 return &node->nd_item;
715} 717}
716 718
@@ -744,6 +746,9 @@ static void o2nm_node_group_drop_item(struct config_group *group,
744 } 746 }
745 write_unlock(&cluster->cl_nodes_lock); 747 write_unlock(&cluster->cl_nodes_lock);
746 748
749 mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
750 config_item_name(&node->nd_item));
751
747 config_item_put(item); 752 config_item_put(item);
748} 753}
749 754
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 5b9854bad571..49b594325bec 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -36,4 +36,10 @@
36/* host name, group name, cluster name all 64 bytes */ 36/* host name, group name, cluster name all 64 bytes */
37#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN 37#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
38 38
39/*
40 * Maximum number of global heartbeat regions allowed.
41 * **CAUTION** Changing this number will break dlm compatibility.
42 */
43#define O2NM_MAX_REGIONS 32
44
39#endif /* _OCFS2_NODEMANAGER_H */ 45#endif /* _OCFS2_NODEMANAGER_H */
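Region numbers are handed out from o2hb_region_bitmap under o2hb_live_lock (find_first_zero_bit()/set_bit() in o2hb_heartbeat_group_make_item(), clear_bit() in drop_item()), which is why O2NM_MAX_REGIONS caps the number of global heartbeat regions and, per the comment above, is baked into dlm wire compatibility. A toy single-word model of that allocator (hypothetical userspace code, valid while O2NM_MAX_REGIONS stays at most 32):

#include <stdint.h>
#include <stdio.h>

#define O2NM_MAX_REGIONS 32

static uint32_t region_bitmap;	/* 32 regions fit one 32-bit word */

static int alloc_region_num(void)
{
	int i;

	for (i = 0; i < O2NM_MAX_REGIONS; i++) {
		if (!(region_bitmap & (1u << i))) {
			region_bitmap |= 1u << i;	/* set_bit() */
			return i;
		}
	}
	return -1;	/* corresponds to the -EFBIG case in the patch */
}

static void free_region_num(int i)
{
	region_bitmap &= ~(1u << i);	/* clear_bit() on drop_item */
}

int main(void)
{
	printf("%d\n", alloc_region_num());	/* 0 */
	printf("%d\n", alloc_region_num());	/* 1 */
	free_region_num(0);
	printf("%d\n", alloc_region_num());	/* 0 again: lowest free bit */
	return 0;
}
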
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index cbe2f057cc28..9aa426e42123 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1696,6 +1696,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
1696{ 1696{
1697 o2quo_hb_down(node_num); 1697 o2quo_hb_down(node_num);
1698 1698
1699 if (!node)
1700 return;
1701
1699 if (node_num != o2nm_this_node()) 1702 if (node_num != o2nm_this_node())
1700 o2net_disconnect_node(node); 1703 o2net_disconnect_node(node);
1701 1704
@@ -1709,6 +1712,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1709 1712
1710 o2quo_hb_up(node_num); 1713 o2quo_hb_up(node_num);
1711 1714
1715 BUG_ON(!node);
1716
1712 /* ensure an immediate connect attempt */ 1717 /* ensure an immediate connect attempt */
1713 nn->nn_last_connect_attempt = jiffies - 1718 nn->nn_last_connect_attempt = jiffies -
1714 (msecs_to_jiffies(o2net_reconnect_delay()) + 1); 1719 (msecs_to_jiffies(o2net_reconnect_delay()) + 1);
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b4957c7d9fe2..edaded48e7e9 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -40,6 +40,14 @@
40#include "inode.h" 40#include "inode.h"
41#include "super.h" 41#include "super.h"
42 42
43void ocfs2_dentry_attach_gen(struct dentry *dentry)
44{
45 unsigned long gen =
46 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
47 BUG_ON(dentry->d_inode);
48 dentry->d_fsdata = (void *)gen;
49}
50
43 51
44static int ocfs2_dentry_revalidate(struct dentry *dentry, 52static int ocfs2_dentry_revalidate(struct dentry *dentry,
45 struct nameidata *nd) 53 struct nameidata *nd)
@@ -51,11 +59,20 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
51 mlog_entry("(0x%p, '%.*s')\n", dentry, 59 mlog_entry("(0x%p, '%.*s')\n", dentry,
52 dentry->d_name.len, dentry->d_name.name); 60 dentry->d_name.len, dentry->d_name.name);
53 61
 54 /* Never trust a negative dentry - force a new lookup. */ 62 /* For a negative dentry -
 63 * check the generation number of the parent and compare it with
 64 * the one cached in the dentry.
 65 */
55 if (inode == NULL) { 66 if (inode == NULL) {
56 mlog(0, "negative dentry: %.*s\n", dentry->d_name.len, 67 unsigned long gen = (unsigned long) dentry->d_fsdata;
57 dentry->d_name.name); 68 unsigned long pgen =
58 goto bail; 69 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
70 mlog(0, "negative dentry: %.*s parent gen: %lu "
71 "dentry gen: %lu\n",
72 dentry->d_name.len, dentry->d_name.name, pgen, gen);
73 if (gen != pgen)
74 goto bail;
75 goto valid;
59 } 76 }
60 77
61 BUG_ON(!osb); 78 BUG_ON(!osb);
@@ -96,6 +113,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
96 goto bail; 113 goto bail;
97 } 114 }
98 115
116valid:
99 ret = 1; 117 ret = 1;
100 118
101bail: 119bail:
@@ -227,6 +245,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
227 if (!inode) 245 if (!inode)
228 return 0; 246 return 0;
229 247
248 if (!dentry->d_inode && dentry->d_fsdata) {
 249 /* Converting a negative dentry to positive;
 250 clear dentry->d_fsdata. */
251 dentry->d_fsdata = dl = NULL;
252 }
253
230 if (dl) { 254 if (dl) {
231 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, 255 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
232 " \"%.*s\": old parent: %llu, new: %llu\n", 256 " \"%.*s\": old parent: %llu, new: %llu\n",
@@ -452,6 +476,7 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
452 476
453out: 477out:
454 iput(inode); 478 iput(inode);
479 ocfs2_dentry_attach_gen(dentry);
455} 480}
456 481
457/* 482/*
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index f5dd1789acf1..b79eff709958 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
64 struct inode *old_dir, struct inode *new_dir); 64 struct inode *old_dir, struct inode *new_dir);
65 65
66extern spinlock_t dentry_attach_lock; 66extern spinlock_t dentry_attach_lock;
67void ocfs2_dentry_attach_gen(struct dentry *dentry);
67 68
68#endif /* OCFS2_DCACHE_H */ 69#endif /* OCFS2_DCACHE_H */
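Taken together, the dcache.c/dcache.h hunks and the dlmglue.c change further down let ocfs2 cache negative dentries safely across the cluster: the parent's ip_dir_lock_gen is snapshotted into d_fsdata when a lookup misses, the downconvert worker bumps the generation whenever another node takes the directory lock, and revalidation trusts a cached miss only while the generation still matches. A compact model of that protocol (hypothetical names, userspace C):

#include <stdbool.h>
#include <stdio.h>

struct dir  { unsigned long dir_lock_gen; };		/* ip_dir_lock_gen */
struct dent { unsigned long gen; bool negative; };	/* d_fsdata, !d_inode */

/* Attach: snapshot the parent's generation when the dentry goes negative. */
static void attach_gen(struct dent *d, const struct dir *parent)
{
	d->gen = parent->dir_lock_gen;
	d->negative = true;
}

/* Revalidate: the cached miss is valid only while no other node has
 * downconverted the directory lock (which bumps dir_lock_gen). */
static bool revalidate(const struct dent *d, const struct dir *parent)
{
	return d->negative && d->gen == parent->dir_lock_gen;
}

int main(void)
{
	struct dir parent = { .dir_lock_gen = 1 };
	struct dent d;

	attach_gen(&d, &parent);
	printf("valid: %d\n", revalidate(&d, &parent)); /* 1 */
	parent.dir_lock_gen++;	/* a remote create invalidated the dir */
	printf("valid: %d\n", revalidate(&d, &parent)); /* 0: force lookup */
	return 0;
}
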
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 765298908f1d..b36d0bf77a5a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -445,7 +445,9 @@ enum {
445 DLM_LOCK_REQUEST_MSG, /* 515 */ 445 DLM_LOCK_REQUEST_MSG, /* 515 */
446 DLM_RECO_DATA_DONE_MSG, /* 516 */ 446 DLM_RECO_DATA_DONE_MSG, /* 516 */
447 DLM_BEGIN_RECO_MSG, /* 517 */ 447 DLM_BEGIN_RECO_MSG, /* 517 */
448 DLM_FINALIZE_RECO_MSG /* 518 */ 448 DLM_FINALIZE_RECO_MSG, /* 518 */
449 DLM_QUERY_REGION, /* 519 */
450 DLM_QUERY_NODEINFO, /* 520 */
449}; 451};
450 452
451struct dlm_reco_node_data 453struct dlm_reco_node_data
@@ -727,6 +729,31 @@ struct dlm_cancel_join
727 u8 domain[O2NM_MAX_NAME_LEN]; 729 u8 domain[O2NM_MAX_NAME_LEN];
728}; 730};
729 731
732struct dlm_query_region {
733 u8 qr_node;
734 u8 qr_numregions;
735 u8 qr_namelen;
736 u8 pad1;
737 u8 qr_domain[O2NM_MAX_NAME_LEN];
738 u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
739};
740
741struct dlm_node_info {
742 u8 ni_nodenum;
743 u8 pad1;
744 u16 ni_ipv4_port;
745 u32 ni_ipv4_address;
746};
747
748struct dlm_query_nodeinfo {
749 u8 qn_nodenum;
750 u8 qn_numnodes;
751 u8 qn_namelen;
752 u8 pad1;
753 u8 qn_domain[O2NM_MAX_NAME_LEN];
754 struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
755};
756
730struct dlm_exit_domain 757struct dlm_exit_domain
731{ 758{
732 u8 node_idx; 759 u8 node_idx;
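Both new join-time messages are fixed-size: dlm_send_regions() and dlm_send_nodeinfo() below always transmit sizeof(struct ...) no matter how many slots are populated. A self-contained size check; the layouts mirror the hunk above, and O2NM_MAX_NODES = 255 is an assumption made here for illustration:

#include <stdint.h>
#include <stdio.h>

#define O2NM_MAX_NAME_LEN        64
#define O2NM_MAX_NODES           255	/* assumed value */
#define O2NM_MAX_REGIONS         32
#define O2HB_MAX_REGION_NAME_LEN 32

struct dlm_query_region {
	uint8_t qr_node, qr_numregions, qr_namelen, pad1;
	uint8_t qr_domain[O2NM_MAX_NAME_LEN];
	uint8_t qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
};

struct dlm_node_info {
	uint8_t  ni_nodenum, pad1;
	uint16_t ni_ipv4_port;
	uint32_t ni_ipv4_address;
};

struct dlm_query_nodeinfo {
	uint8_t qn_nodenum, qn_numnodes, qn_namelen, pad1;
	uint8_t qn_domain[O2NM_MAX_NAME_LEN];
	struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
};

int main(void)
{
	/* Sent at full size by o2net_send_message() regardless of how
	 * many regions/nodes are actually filled in. */
	printf("dlm_query_region:   %zu bytes\n",
	       sizeof(struct dlm_query_region));	/* 68 + 1024 = 1092 */
	printf("dlm_query_nodeinfo: %zu bytes\n",
	       sizeof(struct dlm_query_nodeinfo));	/* 68 + 255*8 = 2108 */
	return 0;
}
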
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 901ca52bf86b..272ec8631a51 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -493,7 +493,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
493 struct hlist_head *bucket; 493 struct hlist_head *bucket;
494 struct hlist_node *list; 494 struct hlist_node *list;
495 int i, out = 0; 495 int i, out = 0;
496 unsigned long total = 0, longest = 0, bktcnt; 496 unsigned long total = 0, longest = 0, bucket_count = 0;
497 497
498 out += snprintf(db->buf + out, db->len - out, 498 out += snprintf(db->buf + out, db->len - out,
499 "Dumping MLEs for Domain: %s\n", dlm->name); 499 "Dumping MLEs for Domain: %s\n", dlm->name);
@@ -505,13 +505,13 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
505 mle = hlist_entry(list, struct dlm_master_list_entry, 505 mle = hlist_entry(list, struct dlm_master_list_entry,
506 master_hash_node); 506 master_hash_node);
507 ++total; 507 ++total;
508 ++bktcnt; 508 ++bucket_count;
509 if (db->len - out < 200) 509 if (db->len - out < 200)
510 continue; 510 continue;
511 out += dump_mle(mle, db->buf + out, db->len - out); 511 out += dump_mle(mle, db->buf + out, db->len - out);
512 } 512 }
513 longest = max(longest, bktcnt); 513 longest = max(longest, bucket_count);
514 bktcnt = 0; 514 bucket_count = 0;
515 } 515 }
516 spin_unlock(&dlm->master_lock); 516 spin_unlock(&dlm->master_lock);
517 517
@@ -782,7 +782,9 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
782 782
783 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ 783 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
784 out += snprintf(db->buf + out, db->len - out, 784 out += snprintf(db->buf + out, db->len - out,
785 "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key); 785 "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
786 dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
787 dlm->dlm_locking_proto.pv_minor);
786 788
787 /* Thread Pid: xxx Node: xxx State: xxxxx */ 789 /* Thread Pid: xxx Node: xxx State: xxxxx */
788 out += snprintf(db->buf + out, db->len - out, 790 out += snprintf(db->buf + out, db->len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 11a5c87fd7f7..58a93b953735 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -128,10 +128,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
128 * will have a negotiated version with the same major number and a minor 128 * will have a negotiated version with the same major number and a minor
129 * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should 129 * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should
130 * be used to determine what a running domain is actually using. 130 * be used to determine what a running domain is actually using.
131 *
132 * New in version 1.1:
133 * - Message DLM_QUERY_REGION added to support global heartbeat
 134 * - Message DLM_QUERY_NODEINFO added to allow online node removal
131 */ 135 */
132static const struct dlm_protocol_version dlm_protocol = { 136static const struct dlm_protocol_version dlm_protocol = {
133 .pv_major = 1, 137 .pv_major = 1,
134 .pv_minor = 0, 138 .pv_minor = 1,
135}; 139};
136 140
137#define DLM_DOMAIN_BACKOFF_MS 200 141#define DLM_DOMAIN_BACKOFF_MS 200
@@ -142,6 +146,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
142 void **ret_data); 146 void **ret_data);
143static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 147static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
144 void **ret_data); 148 void **ret_data);
149static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
150 void *data, void **ret_data);
145static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, 151static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
146 void **ret_data); 152 void **ret_data);
147static int dlm_protocol_compare(struct dlm_protocol_version *existing, 153static int dlm_protocol_compare(struct dlm_protocol_version *existing,
@@ -921,6 +927,370 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
921 return 0; 927 return 0;
922} 928}
923 929
930static int dlm_match_regions(struct dlm_ctxt *dlm,
931 struct dlm_query_region *qr)
932{
933 char *local = NULL, *remote = qr->qr_regions;
934 char *l, *r;
935 int localnr, i, j, foundit;
936 int status = 0;
937
938 if (!o2hb_global_heartbeat_active()) {
939 if (qr->qr_numregions) {
940 mlog(ML_ERROR, "Domain %s: Joining node %d has global "
941 "heartbeat enabled but local node %d does not\n",
942 qr->qr_domain, qr->qr_node, dlm->node_num);
943 status = -EINVAL;
944 }
945 goto bail;
946 }
947
948 if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
949 mlog(ML_ERROR, "Domain %s: Local node %d has global "
950 "heartbeat enabled but joining node %d does not\n",
951 qr->qr_domain, dlm->node_num, qr->qr_node);
952 status = -EINVAL;
953 goto bail;
954 }
955
956 r = remote;
957 for (i = 0; i < qr->qr_numregions; ++i) {
958 mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
959 r += O2HB_MAX_REGION_NAME_LEN;
960 }
961
962 local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
963 if (!local) {
964 status = -ENOMEM;
965 goto bail;
966 }
967
968 localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS);
969
970 /* compare local regions with remote */
971 l = local;
972 for (i = 0; i < localnr; ++i) {
973 foundit = 0;
974 r = remote;
 975 for (j = 0; j < qr->qr_numregions; ++j) {
976 if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
977 foundit = 1;
978 break;
979 }
980 r += O2HB_MAX_REGION_NAME_LEN;
981 }
982 if (!foundit) {
983 status = -EINVAL;
984 mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
985 "in local node %d but not in joining node %d\n",
986 qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
987 dlm->node_num, qr->qr_node);
988 goto bail;
989 }
990 l += O2HB_MAX_REGION_NAME_LEN;
991 }
992
993 /* compare remote with local regions */
994 r = remote;
995 for (i = 0; i < qr->qr_numregions; ++i) {
996 foundit = 0;
997 l = local;
998 for (j = 0; j < localnr; ++j) {
999 if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
1000 foundit = 1;
1001 break;
1002 }
1003 l += O2HB_MAX_REGION_NAME_LEN;
1004 }
1005 if (!foundit) {
1006 status = -EINVAL;
1007 mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
1008 "in joining node %d but not in local node %d\n",
1009 qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
1010 qr->qr_node, dlm->node_num);
1011 goto bail;
1012 }
1013 r += O2HB_MAX_REGION_NAME_LEN;
1014 }
1015
1016bail:
1017 kfree(local);
1018
1019 return status;
1020}
1021
1022static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
1023{
1024 struct dlm_query_region *qr = NULL;
1025 int status, ret = 0, i;
1026 char *p;
1027
1028 if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
1029 goto bail;
1030
1031 qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
1032 if (!qr) {
1033 ret = -ENOMEM;
1034 mlog_errno(ret);
1035 goto bail;
1036 }
1037
1038 qr->qr_node = dlm->node_num;
1039 qr->qr_namelen = strlen(dlm->name);
1040 memcpy(qr->qr_domain, dlm->name, qr->qr_namelen);
1041 /* if local hb, the numregions will be zero */
1042 if (o2hb_global_heartbeat_active())
1043 qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions,
1044 O2NM_MAX_REGIONS);
1045
1046 p = qr->qr_regions;
1047 for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN)
1048 mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p);
1049
1050 i = -1;
1051 while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
1052 i + 1)) < O2NM_MAX_NODES) {
1053 if (i == dlm->node_num)
1054 continue;
1055
1056 mlog(0, "Sending regions to node %d\n", i);
1057
1058 ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr,
1059 sizeof(struct dlm_query_region),
1060 i, &status);
1061 if (ret >= 0)
1062 ret = status;
1063 if (ret) {
1064 mlog(ML_ERROR, "Region mismatch %d, node %d\n",
1065 ret, i);
1066 break;
1067 }
1068 }
1069
1070bail:
1071 kfree(qr);
1072 return ret;
1073}
1074
1075static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1076 void *data, void **ret_data)
1077{
1078 struct dlm_query_region *qr;
1079 struct dlm_ctxt *dlm = NULL;
1080 int status = 0;
1081 int locked = 0;
1082
1083 qr = (struct dlm_query_region *) msg->buf;
1084
1085 mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
1086 qr->qr_domain);
1087
1088 status = -EINVAL;
1089
1090 spin_lock(&dlm_domain_lock);
1091 dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
1092 if (!dlm) {
1093 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1094 "before join domain\n", qr->qr_node, qr->qr_domain);
1095 goto bail;
1096 }
1097
1098 spin_lock(&dlm->spinlock);
1099 locked = 1;
1100 if (dlm->joining_node != qr->qr_node) {
1101 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1102 "but joining node is %d\n", qr->qr_node, qr->qr_domain,
1103 dlm->joining_node);
1104 goto bail;
1105 }
1106
1107 /* Support for global heartbeat was added in 1.1 */
1108 if (dlm->dlm_locking_proto.pv_major == 1 &&
1109 dlm->dlm_locking_proto.pv_minor == 0) {
1110 mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
1111 "but active dlm protocol is %d.%d\n", qr->qr_node,
1112 qr->qr_domain, dlm->dlm_locking_proto.pv_major,
1113 dlm->dlm_locking_proto.pv_minor);
1114 goto bail;
1115 }
1116
1117 status = dlm_match_regions(dlm, qr);
1118
1119bail:
1120 if (locked)
1121 spin_unlock(&dlm->spinlock);
1122 spin_unlock(&dlm_domain_lock);
1123
1124 return status;
1125}
1126
1127static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
1128{
1129 struct o2nm_node *local;
1130 struct dlm_node_info *remote;
1131 int i, j;
1132 int status = 0;
1133
1134 for (j = 0; j < qn->qn_numnodes; ++j)
1135 mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum,
1136 &(qn->qn_nodes[j].ni_ipv4_address),
1137 ntohs(qn->qn_nodes[j].ni_ipv4_port));
1138
1139 for (i = 0; i < O2NM_MAX_NODES && !status; ++i) {
1140 local = o2nm_get_node_by_num(i);
1141 remote = NULL;
1142 for (j = 0; j < qn->qn_numnodes; ++j) {
1143 if (qn->qn_nodes[j].ni_nodenum == i) {
1144 remote = &(qn->qn_nodes[j]);
1145 break;
1146 }
1147 }
1148
1149 if (!local && !remote)
1150 continue;
1151
1152 if ((local && !remote) || (!local && remote))
1153 status = -EINVAL;
1154
1155 if (!status &&
1156 ((remote->ni_nodenum != local->nd_num) ||
1157 (remote->ni_ipv4_port != local->nd_ipv4_port) ||
1158 (remote->ni_ipv4_address != local->nd_ipv4_address)))
1159 status = -EINVAL;
1160
1161 if (status) {
1162 if (remote && !local)
1163 mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
1164 "registered in joining node %d but not in "
1165 "local node %d\n", qn->qn_domain,
1166 remote->ni_nodenum,
1167 &(remote->ni_ipv4_address),
1168 ntohs(remote->ni_ipv4_port),
1169 qn->qn_nodenum, dlm->node_num);
1170 if (local && !remote)
1171 mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
1172 "registered in local node %d but not in "
1173 "joining node %d\n", qn->qn_domain,
1174 local->nd_num, &(local->nd_ipv4_address),
1175 ntohs(local->nd_ipv4_port),
1176 dlm->node_num, qn->qn_nodenum);
1177 BUG_ON((!local && !remote));
1178 }
1179
1180 if (local)
1181 o2nm_node_put(local);
1182 }
1183
1184 return status;
1185}
1186
1187static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
1188{
1189 struct dlm_query_nodeinfo *qn = NULL;
1190 struct o2nm_node *node;
1191 int ret = 0, status, count, i;
1192
1193 if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
1194 goto bail;
1195
1196 qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
1197 if (!qn) {
1198 ret = -ENOMEM;
1199 mlog_errno(ret);
1200 goto bail;
1201 }
1202
1203 for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) {
1204 node = o2nm_get_node_by_num(i);
1205 if (!node)
1206 continue;
1207 qn->qn_nodes[count].ni_nodenum = node->nd_num;
1208 qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port;
1209 qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address;
1210 mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
1211 &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
1212 ++count;
1213 o2nm_node_put(node);
1214 }
1215
1216 qn->qn_nodenum = dlm->node_num;
1217 qn->qn_numnodes = count;
1218 qn->qn_namelen = strlen(dlm->name);
1219 memcpy(qn->qn_domain, dlm->name, qn->qn_namelen);
1220
1221 i = -1;
1222 while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
1223 i + 1)) < O2NM_MAX_NODES) {
1224 if (i == dlm->node_num)
1225 continue;
1226
1227 mlog(0, "Sending nodeinfo to node %d\n", i);
1228
1229 ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
1230 qn, sizeof(struct dlm_query_nodeinfo),
1231 i, &status);
1232 if (ret >= 0)
1233 ret = status;
1234 if (ret) {
1235 mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i);
1236 break;
1237 }
1238 }
1239
1240bail:
1241 kfree(qn);
1242 return ret;
1243}
1244
1245static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
1246 void *data, void **ret_data)
1247{
1248 struct dlm_query_nodeinfo *qn;
1249 struct dlm_ctxt *dlm = NULL;
1250 int locked = 0, status = -EINVAL;
1251
1252 qn = (struct dlm_query_nodeinfo *) msg->buf;
1253
1254 mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
1255 qn->qn_domain);
1256
1257 spin_lock(&dlm_domain_lock);
1258 dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
1259 if (!dlm) {
1260 mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
1261 "join domain\n", qn->qn_nodenum, qn->qn_domain);
1262 goto bail;
1263 }
1264
1265 spin_lock(&dlm->spinlock);
1266 locked = 1;
1267 if (dlm->joining_node != qn->qn_nodenum) {
1268 mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
1269 "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
1270 dlm->joining_node);
1271 goto bail;
1272 }
1273
1274 /* Support for node query was added in 1.1 */
1275 if (dlm->dlm_locking_proto.pv_major == 1 &&
1276 dlm->dlm_locking_proto.pv_minor == 0) {
1277 mlog(ML_ERROR, "Node %d queried nodes on domain %s "
1278 "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
1279 qn->qn_domain, dlm->dlm_locking_proto.pv_major,
1280 dlm->dlm_locking_proto.pv_minor);
1281 goto bail;
1282 }
1283
1284 status = dlm_match_nodes(dlm, qn);
1285
1286bail:
1287 if (locked)
1288 spin_unlock(&dlm->spinlock);
1289 spin_unlock(&dlm_domain_lock);
1290
1291 return status;
1292}
1293
924static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, 1294static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
925 void **ret_data) 1295 void **ret_data)
926{ 1296{
@@ -1241,6 +1611,20 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1241 set_bit(dlm->node_num, dlm->domain_map); 1611 set_bit(dlm->node_num, dlm->domain_map);
1242 spin_unlock(&dlm->spinlock); 1612 spin_unlock(&dlm->spinlock);
1243 1613
1614 /* Support for global heartbeat and node info was added in 1.1 */
1615 if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) {
1616 status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
1617 if (status) {
1618 mlog_errno(status);
1619 goto bail;
1620 }
1621 status = dlm_send_regions(dlm, ctxt->yes_resp_map);
1622 if (status) {
1623 mlog_errno(status);
1624 goto bail;
1625 }
1626 }
1627
1244 dlm_send_join_asserts(dlm, ctxt->yes_resp_map); 1628 dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
1245 1629
1246 /* Joined state *must* be set before the joining node 1630 /* Joined state *must* be set before the joining node
@@ -1807,7 +2191,21 @@ static int dlm_register_net_handlers(void)
1807 sizeof(struct dlm_cancel_join), 2191 sizeof(struct dlm_cancel_join),
1808 dlm_cancel_join_handler, 2192 dlm_cancel_join_handler,
1809 NULL, NULL, &dlm_join_handlers); 2193 NULL, NULL, &dlm_join_handlers);
2194 if (status)
2195 goto bail;
2196
2197 status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
2198 sizeof(struct dlm_query_region),
2199 dlm_query_region_handler,
2200 NULL, NULL, &dlm_join_handlers);
1810 2201
2202 if (status)
2203 goto bail;
2204
2205 status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
2206 sizeof(struct dlm_query_nodeinfo),
2207 dlm_query_nodeinfo_handler,
2208 NULL, NULL, &dlm_join_handlers);
1811bail: 2209bail:
1812 if (status < 0) 2210 if (status < 0)
1813 dlm_unregister_net_handlers(); 2211 dlm_unregister_net_handlers();
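Both handlers above reject peers that negotiated protocol 1.0, and dlm_try_to_join_domain() only emits the new messages when the running protocol exceeds 1.0, so mixed clusters keep working. The gate reduces to a simple version predicate; a sketch (dlm_proto_has_query() is a hypothetical helper, not in the patch):

#include <stdbool.h>
#include <stdio.h>

struct dlm_protocol_version { unsigned char pv_major, pv_minor; };

/* DLM_QUERY_REGION/DLM_QUERY_NODEINFO are only valid at proto >= 1.1. */
static bool dlm_proto_has_query(struct dlm_protocol_version v)
{
	return v.pv_major > 1 || (v.pv_major == 1 && v.pv_minor >= 1);
}

int main(void)
{
	struct dlm_protocol_version v10 = { 1, 0 }, v11 = { 1, 1 };

	printf("1.0: %d\n", dlm_proto_has_query(v10)); /* 0: handler -EINVALs */
	printf("1.1: %d\n", dlm_proto_has_query(v11)); /* 1: info gets sent */
	return 0;
}
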
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 5e02a893f46e..e8d94d722ecb 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
3635{ 3635{
3636 struct inode *inode; 3636 struct inode *inode;
3637 struct address_space *mapping; 3637 struct address_space *mapping;
3638 struct ocfs2_inode_info *oi;
3638 3639
3639 inode = ocfs2_lock_res_inode(lockres); 3640 inode = ocfs2_lock_res_inode(lockres);
3640 mapping = inode->i_mapping; 3641 mapping = inode->i_mapping;
3641 3642
3643 if (S_ISDIR(inode->i_mode)) {
3644 oi = OCFS2_I(inode);
3645 oi->ip_dir_lock_gen++;
3646 mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
3647 goto out;
3648 }
3649
3642 if (!S_ISREG(inode->i_mode)) 3650 if (!S_ISREG(inode->i_mode))
3643 goto out; 3651 goto out;
3644 3652
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9a03c151b5ce..9e8cc4346b76 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -64,12 +64,6 @@
64 64
65#include "buffer_head_io.h" 65#include "buffer_head_io.h"
66 66
67static int ocfs2_sync_inode(struct inode *inode)
68{
69 filemap_fdatawrite(inode->i_mapping);
70 return sync_mapping_buffers(inode->i_mapping);
71}
72
73static int ocfs2_init_file_private(struct inode *inode, struct file *file) 67static int ocfs2_init_file_private(struct inode *inode, struct file *file)
74{ 68{
75 struct ocfs2_file_private *fp; 69 struct ocfs2_file_private *fp;
@@ -180,16 +174,12 @@ static int ocfs2_sync_file(struct file *file, int datasync)
180{ 174{
181 int err = 0; 175 int err = 0;
182 journal_t *journal; 176 journal_t *journal;
183 struct dentry *dentry = file->f_path.dentry;
184 struct inode *inode = file->f_mapping->host; 177 struct inode *inode = file->f_mapping->host;
185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 178 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
186 179
187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 180 mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
188 dentry->d_name.len, dentry->d_name.name); 181 file->f_path.dentry, file->f_path.dentry->d_name.len,
189 182 file->f_path.dentry->d_name.name);
190 err = ocfs2_sync_inode(dentry->d_inode);
191 if (err)
192 goto bail;
193 183
194 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 184 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
195 /* 185 /*
@@ -370,7 +360,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
370 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 360 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
371 goto out; 361 goto out;
372 362
373 return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); 363 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
374 364
375out: 365out:
376 return status; 366 return status;
@@ -913,8 +903,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
913 zero_clusters = last_cpos - zero_cpos; 903 zero_clusters = last_cpos - zero_cpos;
914 904
915 if (needs_cow) { 905 if (needs_cow) {
916 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, 906 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
917 UINT_MAX); 907 zero_clusters, UINT_MAX);
918 if (rc) { 908 if (rc) {
919 mlog_errno(rc); 909 mlog_errno(rc);
920 goto out; 910 goto out;
@@ -2062,6 +2052,7 @@ out:
2062} 2052}
2063 2053
2064static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2054static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2055 struct file *file,
2065 loff_t pos, size_t count, 2056 loff_t pos, size_t count,
2066 int *meta_level) 2057 int *meta_level)
2067{ 2058{
@@ -2079,7 +2070,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2079 2070
2080 *meta_level = 1; 2071 *meta_level = 1;
2081 2072
2082 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); 2073 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
2083 if (ret) 2074 if (ret)
2084 mlog_errno(ret); 2075 mlog_errno(ret);
2085out: 2076out:
@@ -2087,7 +2078,7 @@ out:
2087 return ret; 2078 return ret;
2088} 2079}
2089 2080
2090static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 2081static int ocfs2_prepare_inode_for_write(struct file *file,
2091 loff_t *ppos, 2082 loff_t *ppos,
2092 size_t count, 2083 size_t count,
2093 int appending, 2084 int appending,
@@ -2095,6 +2086,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
2095 int *has_refcount) 2086 int *has_refcount)
2096{ 2087{
2097 int ret = 0, meta_level = 0; 2088 int ret = 0, meta_level = 0;
2089 struct dentry *dentry = file->f_path.dentry;
2098 struct inode *inode = dentry->d_inode; 2090 struct inode *inode = dentry->d_inode;
2099 loff_t saved_pos, end; 2091 loff_t saved_pos, end;
2100 2092
@@ -2150,6 +2142,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
2150 meta_level = -1; 2142 meta_level = -1;
2151 2143
2152 ret = ocfs2_prepare_inode_for_refcount(inode, 2144 ret = ocfs2_prepare_inode_for_refcount(inode,
2145 file,
2153 saved_pos, 2146 saved_pos,
2154 count, 2147 count,
2155 &meta_level); 2148 &meta_level);
@@ -2232,6 +2225,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2232 struct file *file = iocb->ki_filp; 2225 struct file *file = iocb->ki_filp;
2233 struct inode *inode = file->f_path.dentry->d_inode; 2226 struct inode *inode = file->f_path.dentry->d_inode;
2234 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2227 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2228 int full_coherency = !(osb->s_mount_opt &
2229 OCFS2_MOUNT_COHERENCY_BUFFERED);
2235 2230
2236 mlog_entry("(0x%p, %u, '%.*s')\n", file, 2231 mlog_entry("(0x%p, %u, '%.*s')\n", file,
2237 (unsigned int)nr_segs, 2232 (unsigned int)nr_segs,
@@ -2255,16 +2250,39 @@ relock:
2255 have_alloc_sem = 1; 2250 have_alloc_sem = 1;
2256 } 2251 }
2257 2252
2258 /* concurrent O_DIRECT writes are allowed */ 2253 /*
2259 rw_level = !direct_io; 2254 * Concurrent O_DIRECT writes are allowed with
 2255 * mount option "coherency=buffered".
2256 */
2257 rw_level = (!direct_io || full_coherency);
2258
2260 ret = ocfs2_rw_lock(inode, rw_level); 2259 ret = ocfs2_rw_lock(inode, rw_level);
2261 if (ret < 0) { 2260 if (ret < 0) {
2262 mlog_errno(ret); 2261 mlog_errno(ret);
2263 goto out_sems; 2262 goto out_sems;
2264 } 2263 }
2265 2264
2265 /*
2266 * O_DIRECT writes with "coherency=full" need to take EX cluster
2267 * inode_lock to guarantee coherency.
2268 */
2269 if (direct_io && full_coherency) {
2270 /*
2271 * We need to take and drop the inode lock to force
2272 * other nodes to drop their caches. Buffered I/O
2273 * already does this in write_begin().
2274 */
2275 ret = ocfs2_inode_lock(inode, NULL, 1);
2276 if (ret < 0) {
2277 mlog_errno(ret);
2278 goto out_sems;
2279 }
2280
2281 ocfs2_inode_unlock(inode, 1);
2282 }
2283
2266 can_do_direct = direct_io; 2284 can_do_direct = direct_io;
2267 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 2285 ret = ocfs2_prepare_inode_for_write(file, ppos,
2268 iocb->ki_left, appending, 2286 iocb->ki_left, appending,
2269 &can_do_direct, &has_refcount); 2287 &can_do_direct, &has_refcount);
2270 if (ret < 0) { 2288 if (ret < 0) {
@@ -2312,17 +2330,6 @@ relock:
2312 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2330 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
2313 ppos, count, ocount); 2331 ppos, count, ocount);
2314 if (written < 0) { 2332 if (written < 0) {
2315 /*
2316 * direct write may have instantiated a few
2317 * blocks outside i_size. Trim these off again.
2318 * Don't need i_size_read because we hold i_mutex.
2319 *
2320 * XXX(truncate): this looks buggy because ocfs2 did not
2321 * actually implement ->truncate. Take a look at
2322 * the new truncate sequence and update this accordingly
2323 */
2324 if (*ppos + count > inode->i_size)
2325 truncate_setsize(inode, inode->i_size);
2326 ret = written; 2333 ret = written;
2327 goto out_dio; 2334 goto out_dio;
2328 } 2335 }
@@ -2394,7 +2401,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2394{ 2401{
2395 int ret; 2402 int ret;
2396 2403
2397 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 2404 ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
2398 sd->total_len, 0, NULL, NULL); 2405 sd->total_len, 0, NULL, NULL);
2399 if (ret < 0) { 2406 if (ret < 0) {
2400 mlog_errno(ret); 2407 mlog_errno(ret);
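The coherency logic above boils down to one decision: buffered writers and O_DIRECT writers under the default coherency=full take the EX rw lock (rw_level 1), while only coherency=buffered O_DIRECT writers keep the shared level that lets them run concurrently; full-coherency O_DIRECT additionally cycles the EX inode lock once so other nodes drop their caches. A toy table of the rw_level expression (hypothetical userspace code mirroring the line in ocfs2_file_aio_write()):

#include <stdbool.h>
#include <stdio.h>

/* Mirrors "rw_level = (!direct_io || full_coherency);" above:
 * 1 = EX rw lock (serialized), 0 = shared (concurrent O_DIRECT). */
static int rw_lock_level(bool direct_io, bool full_coherency)
{
	return !direct_io || full_coherency;
}

int main(void)
{
	printf("buffered, coherency=full:     %d\n", rw_lock_level(false, true));
	printf("O_DIRECT, coherency=full:     %d\n", rw_lock_level(true, true));
	printf("O_DIRECT, coherency=buffered: %d\n", rw_lock_level(true, false));
	return 0;
}
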
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index eece3e05d9d0..f935fd6600dd 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
335 else 335 else
336 inode->i_fop = &ocfs2_dops_no_plocks; 336 inode->i_fop = &ocfs2_dops_no_plocks;
337 i_size_write(inode, le64_to_cpu(fe->i_size)); 337 i_size_write(inode, le64_to_cpu(fe->i_size));
338 OCFS2_I(inode)->ip_dir_lock_gen = 1;
338 break; 339 break;
339 case S_IFLNK: 340 case S_IFLNK:
340 if (ocfs2_inode_is_fast_symlink(inode)) 341 if (ocfs2_inode_is_fast_symlink(inode))
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 6de5a869db30..1c508b149b3a 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,30 +46,28 @@ struct ocfs2_inode_info
46 /* These fields are protected by ip_lock */ 46 /* These fields are protected by ip_lock */
47 spinlock_t ip_lock; 47 spinlock_t ip_lock;
48 u32 ip_open_count; 48 u32 ip_open_count;
49 u32 ip_clusters;
50 struct list_head ip_io_markers; 49 struct list_head ip_io_markers;
50 u32 ip_clusters;
51 51
52 u16 ip_dyn_features;
52 struct mutex ip_io_mutex; 53 struct mutex ip_io_mutex;
53
54 u32 ip_flags; /* see below */ 54 u32 ip_flags; /* see below */
55 u32 ip_attr; /* inode attributes */ 55 u32 ip_attr; /* inode attributes */
56 u16 ip_dyn_features;
57 56
58 /* protected by recovery_lock. */ 57 /* protected by recovery_lock. */
59 struct inode *ip_next_orphan; 58 struct inode *ip_next_orphan;
60 59
61 u32 ip_dir_start_lookup;
62
63 struct ocfs2_caching_info ip_metadata_cache; 60 struct ocfs2_caching_info ip_metadata_cache;
64
65 struct ocfs2_extent_map ip_extent_map; 61 struct ocfs2_extent_map ip_extent_map;
66
67 struct inode vfs_inode; 62 struct inode vfs_inode;
68 struct jbd2_inode ip_jinode; 63 struct jbd2_inode ip_jinode;
69 64
65 u32 ip_dir_start_lookup;
66
70 /* Only valid if the inode is the dir. */ 67 /* Only valid if the inode is the dir. */
71 u32 ip_last_used_slot; 68 u32 ip_last_used_slot;
72 u64 ip_last_used_group; 69 u64 ip_last_used_group;
70 u32 ip_dir_lock_gen;
73 71
74 struct ocfs2_alloc_reservation ip_la_data_resv; 72 struct ocfs2_alloc_reservation ip_la_data_resv;
75}; 73};
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7d9d9c132cef..7a4868196152 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -26,6 +26,26 @@
26 26
27#include <linux/ext2_fs.h> 27#include <linux/ext2_fs.h>
28 28
29#define o2info_from_user(a, b) \
30 copy_from_user(&(a), (b), sizeof(a))
31#define o2info_to_user(a, b) \
32 copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
33
34/*
35 * This call is void because we are already reporting an error that may
36 * be -EFAULT. The error will be returned from the ioctl(2) call. It's
37 * just a best-effort to tell userspace that this request caused the error.
38 */
39static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
40 struct ocfs2_info_request __user *req)
41{
42 kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
43 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
44}
45
46#define o2info_set_request_error(a, b) \
47 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
48
29static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) 49static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
30{ 50{
31 int status; 51 int status;
@@ -109,6 +129,328 @@ bail:
109 return status; 129 return status;
110} 130}
111 131
132int ocfs2_info_handle_blocksize(struct inode *inode,
133 struct ocfs2_info_request __user *req)
134{
135 int status = -EFAULT;
136 struct ocfs2_info_blocksize oib;
137
138 if (o2info_from_user(oib, req))
139 goto bail;
140
141 oib.ib_blocksize = inode->i_sb->s_blocksize;
142 oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
143
144 if (o2info_to_user(oib, req))
145 goto bail;
146
147 status = 0;
148bail:
149 if (status)
150 o2info_set_request_error(oib, req);
151
152 return status;
153}
154
155int ocfs2_info_handle_clustersize(struct inode *inode,
156 struct ocfs2_info_request __user *req)
157{
158 int status = -EFAULT;
159 struct ocfs2_info_clustersize oic;
160 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
161
162 if (o2info_from_user(oic, req))
163 goto bail;
164
165 oic.ic_clustersize = osb->s_clustersize;
166 oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
167
168 if (o2info_to_user(oic, req))
169 goto bail;
170
171 status = 0;
172bail:
173 if (status)
174 o2info_set_request_error(oic, req);
175
176 return status;
177}
178
179int ocfs2_info_handle_maxslots(struct inode *inode,
180 struct ocfs2_info_request __user *req)
181{
182 int status = -EFAULT;
183 struct ocfs2_info_maxslots oim;
184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
185
186 if (o2info_from_user(oim, req))
187 goto bail;
188
189 oim.im_max_slots = osb->max_slots;
190 oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
191
192 if (o2info_to_user(oim, req))
193 goto bail;
194
195 status = 0;
196bail:
197 if (status)
198 o2info_set_request_error(oim, req);
199
200 return status;
201}
202
203int ocfs2_info_handle_label(struct inode *inode,
204 struct ocfs2_info_request __user *req)
205{
206 int status = -EFAULT;
207 struct ocfs2_info_label oil;
208 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
209
210 if (o2info_from_user(oil, req))
211 goto bail;
212
213 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
214 oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
215
216 if (o2info_to_user(oil, req))
217 goto bail;
218
219 status = 0;
220bail:
221 if (status)
222 o2info_set_request_error(oil, req);
223
224 return status;
225}
226
227int ocfs2_info_handle_uuid(struct inode *inode,
228 struct ocfs2_info_request __user *req)
229{
230 int status = -EFAULT;
231 struct ocfs2_info_uuid oiu;
232 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
233
234 if (o2info_from_user(oiu, req))
235 goto bail;
236
237 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
238 oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
239
240 if (o2info_to_user(oiu, req))
241 goto bail;
242
243 status = 0;
244bail:
245 if (status)
246 o2info_set_request_error(oiu, req);
247
248 return status;
249}
250
251int ocfs2_info_handle_fs_features(struct inode *inode,
252 struct ocfs2_info_request __user *req)
253{
254 int status = -EFAULT;
255 struct ocfs2_info_fs_features oif;
256 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
257
258 if (o2info_from_user(oif, req))
259 goto bail;
260
261 oif.if_compat_features = osb->s_feature_compat;
262 oif.if_incompat_features = osb->s_feature_incompat;
263 oif.if_ro_compat_features = osb->s_feature_ro_compat;
264 oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
265
266 if (o2info_to_user(oif, req))
267 goto bail;
268
269 status = 0;
270bail:
271 if (status)
272 o2info_set_request_error(oif, req);
273
274 return status;
275}
276
277int ocfs2_info_handle_journal_size(struct inode *inode,
278 struct ocfs2_info_request __user *req)
279{
280 int status = -EFAULT;
281 struct ocfs2_info_journal_size oij;
282 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
283
284 if (o2info_from_user(oij, req))
285 goto bail;
286
287 oij.ij_journal_size = osb->journal->j_inode->i_size;
288
289 oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
290
291 if (o2info_to_user(oij, req))
292 goto bail;
293
294 status = 0;
295bail:
296 if (status)
297 o2info_set_request_error(oij, req);
298
299 return status;
300}
301
302int ocfs2_info_handle_unknown(struct inode *inode,
303 struct ocfs2_info_request __user *req)
304{
305 int status = -EFAULT;
306 struct ocfs2_info_request oir;
307
308 if (o2info_from_user(oir, req))
309 goto bail;
310
311 oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
312
313 if (o2info_to_user(oir, req))
314 goto bail;
315
316 status = 0;
317bail:
318 if (status)
319 o2info_set_request_error(oir, req);
320
321 return status;
322}
323
324/*
325 * Validate and distinguish OCFS2_IOC_INFO requests.
326 *
327 * - validate the magic number.
328 * - distinguish different requests.
329 * - validate size of different requests.
330 */
331int ocfs2_info_handle_request(struct inode *inode,
332 struct ocfs2_info_request __user *req)
333{
334 int status = -EFAULT;
335 struct ocfs2_info_request oir;
336
337 if (o2info_from_user(oir, req))
338 goto bail;
339
340 status = -EINVAL;
341 if (oir.ir_magic != OCFS2_INFO_MAGIC)
342 goto bail;
343
344 switch (oir.ir_code) {
345 case OCFS2_INFO_BLOCKSIZE:
346 if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
347 status = ocfs2_info_handle_blocksize(inode, req);
348 break;
349 case OCFS2_INFO_CLUSTERSIZE:
350 if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
351 status = ocfs2_info_handle_clustersize(inode, req);
352 break;
353 case OCFS2_INFO_MAXSLOTS:
354 if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
355 status = ocfs2_info_handle_maxslots(inode, req);
356 break;
357 case OCFS2_INFO_LABEL:
358 if (oir.ir_size == sizeof(struct ocfs2_info_label))
359 status = ocfs2_info_handle_label(inode, req);
360 break;
361 case OCFS2_INFO_UUID:
362 if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
363 status = ocfs2_info_handle_uuid(inode, req);
364 break;
365 case OCFS2_INFO_FS_FEATURES:
366 if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
367 status = ocfs2_info_handle_fs_features(inode, req);
368 break;
369 case OCFS2_INFO_JOURNAL_SIZE:
370 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
371 status = ocfs2_info_handle_journal_size(inode, req);
372 break;
373 default:
374 status = ocfs2_info_handle_unknown(inode, req);
375 break;
376 }
377
378bail:
379 return status;
380}
381
382int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
383 u64 *req_addr, int compat_flag)
384{
385 int status = -EFAULT;
386 u64 __user *bp = NULL;
387
388 if (compat_flag) {
389#ifdef CONFIG_COMPAT
390 /*
 391 * bp stores the base address of an array of pointers, each
 392 * holding the address of a separate request.
393 */
394 bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
395#else
396 BUG();
397#endif
398 } else
399 bp = (u64 __user *)(unsigned long)(info->oi_requests);
400
401 if (o2info_from_user(*req_addr, bp + idx))
402 goto bail;
403
404 status = 0;
405bail:
406 return status;
407}
408
409/*
410 * OCFS2_IOC_INFO handles an array of requests passed from userspace.
411 *
 412 * ocfs2_info_handle() receives a large info aggregation, grabs and
 413 * validates the request count from the header, then breaks it into
 414 * small pieces that the specific handlers can process one by one.
 415 *
 416 * The idea is to keep each separate request small enough to ensure
 417 * better backward and forward compatibility, since a small request
 418 * is less likely to break if the disk layout changes.
419 */
420int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
421 int compat_flag)
422{
423 int i, status = 0;
424 u64 req_addr;
425 struct ocfs2_info_request __user *reqp;
426
427 if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
428 (!info->oi_requests)) {
429 status = -EINVAL;
430 goto bail;
431 }
432
433 for (i = 0; i < info->oi_count; i++) {
434
435 status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
436 if (status)
437 break;
438
 439 reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr;
440 if (!reqp) {
441 status = -EINVAL;
442 goto bail;
443 }
444
445 status = ocfs2_info_handle_request(inode, reqp);
446 if (status)
447 break;
448 }
449
450bail:
451 return status;
452}
453
112long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 454long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
113{ 455{
114 struct inode *inode = filp->f_path.dentry->d_inode; 456 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
120 struct reflink_arguments args; 462 struct reflink_arguments args;
121 const char *old_path, *new_path; 463 const char *old_path, *new_path;
122 bool preserve; 464 bool preserve;
465 struct ocfs2_info info;
123 466
124 switch (cmd) { 467 switch (cmd) {
125 case OCFS2_IOC_GETFLAGS: 468 case OCFS2_IOC_GETFLAGS:
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
174 preserve = (args.preserve != 0); 517 preserve = (args.preserve != 0);
175 518
176 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); 519 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
520 case OCFS2_IOC_INFO:
521 if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
522 sizeof(struct ocfs2_info)))
523 return -EFAULT;
524
525 return ocfs2_info_handle(inode, &info, 0);
177 default: 526 default:
178 return -ENOTTY; 527 return -ENOTTY;
179 } 528 }
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
185 bool preserve; 534 bool preserve;
186 struct reflink_arguments args; 535 struct reflink_arguments args;
187 struct inode *inode = file->f_path.dentry->d_inode; 536 struct inode *inode = file->f_path.dentry->d_inode;
537 struct ocfs2_info info;
188 538
189 switch (cmd) { 539 switch (cmd) {
190 case OCFS2_IOC32_GETFLAGS: 540 case OCFS2_IOC32_GETFLAGS:
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
209 559
210 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), 560 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
211 compat_ptr(args.new_path), preserve); 561 compat_ptr(args.new_path), preserve);
562 case OCFS2_IOC_INFO:
563 if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
564 sizeof(struct ocfs2_info)))
565 return -EFAULT;
566
567 return ocfs2_info_handle(inode, &info, 1);
212 default: 568 default:
213 return -ENOIOCTLCMD; 569 return -ENOIOCTLCMD;
214 } 570 }
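From userspace, OCFS2_IOC_INFO takes a struct ocfs2_info whose oi_requests field is a u64 pointing at an array of u64 request addresses, each of which begins with a struct ocfs2_info_request header. A hedged sketch of a caller querying the blocksize; the struct layouts follow the handlers above, but the OCFS2_INFO_* constant values and the ioctl number here are placeholders, so real programs must take them from ocfs2_ioctl.h:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

/* Placeholder mirrors of the ocfs2_ioctl.h definitions: the layouts
 * follow the kernel handlers above, but the constant values are
 * assumptions; use the real header in actual code. */
#define OCFS2_INFO_MAGIC     0x4F32494E
#define OCFS2_INFO_BLOCKSIZE 1
#define OCFS2_INFO_FL_FILLED 0x40000000

struct ocfs2_info_request {
	uint32_t ir_magic;	/* OCFS2_INFO_MAGIC */
	uint32_t ir_code;	/* which OCFS2_INFO_* request this is */
	uint32_t ir_size;	/* size of the enclosing request struct */
	uint32_t ir_flags;	/* FILLED/ERROR reported back by the kernel */
};

struct ocfs2_info_blocksize {
	struct ocfs2_info_request ib_req;
	uint32_t ib_blocksize;
	uint32_t ib_reserved;
};

struct ocfs2_info {
	uint64_t oi_requests;	/* pointer to an array of u64 addresses */
	uint32_t oi_count;	/* number of entries in that array */
	uint32_t oi_pad;
};

#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)

int main(int argc, char **argv)
{
	struct ocfs2_info_blocksize bs = {
		.ib_req = { OCFS2_INFO_MAGIC, OCFS2_INFO_BLOCKSIZE,
			    sizeof(struct ocfs2_info_blocksize), 0 },
	};
	uint64_t reqs[1] = { (uint64_t)(uintptr_t)&bs };
	struct ocfs2_info info = {
		.oi_requests = (uint64_t)(uintptr_t)reqs,
		.oi_count = 1,
	};
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

	if (fd < 0 || ioctl(fd, OCFS2_IOC_INFO, &info) < 0) {
		perror("OCFS2_IOC_INFO");
		return 1;
	}
	if (bs.ib_req.ir_flags & OCFS2_INFO_FL_FILLED)
		printf("blocksize: %u\n", bs.ib_blocksize);
	close(fd);
	return 0;
}
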
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9b57c0350ff9..faa2303dbf0a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
301{ 301{
302 int status = 0; 302 int status = 0;
303 unsigned int flushed; 303 unsigned int flushed;
304 unsigned long old_id;
305 struct ocfs2_journal *journal = NULL; 304 struct ocfs2_journal *journal = NULL;
306 305
307 mlog_entry_void(); 306 mlog_entry_void();
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
326 goto finally; 325 goto finally;
327 } 326 }
328 327
329 old_id = ocfs2_inc_trans_id(journal); 328 ocfs2_inc_trans_id(journal);
330 329
331 flushed = atomic_read(&journal->j_num_trans); 330 flushed = atomic_read(&journal->j_num_trans);
332 atomic_set(&journal->j_num_trans, 0); 331 atomic_set(&journal->j_num_trans, 0);
@@ -342,9 +341,6 @@ finally:
342 return status; 341 return status;
343} 342}
344 343
345/* pass it NULL and it will allocate a new handle object for you. If
346 * you pass it a handle however, it may still return error, in which
347 * case it has free'd the passed handle for you. */
348handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) 344handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
349{ 345{
350 journal_t *journal = osb->journal->j_journal; 346 journal_t *journal = osb->journal->j_journal;
@@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1888 1884
1889 os = &osb->osb_orphan_scan; 1885 os = &osb->osb_orphan_scan;
1890 1886
1887 mlog(0, "Begin orphan scan\n");
1888
1891 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) 1889 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
1892 goto out; 1890 goto out;
1893 1891
@@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1920unlock: 1918unlock:
1921 ocfs2_orphan_scan_unlock(osb, seqno); 1919 ocfs2_orphan_scan_unlock(osb, seqno);
1922out: 1920out:
1921 mlog(0, "Orphan scan completed\n");
1923 return; 1922 return;
1924} 1923}
1925 1924
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index b5baaa8e710f..43e56b97f9c0 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
67 struct buffer_head *j_bh; /* Journal disk inode block */ 67 struct buffer_head *j_bh; /* Journal disk inode block */
68 atomic_t j_num_trans; /* Number of transactions 68 atomic_t j_num_trans; /* Number of transactions
69 * currently in the system. */ 69 * currently in the system. */
70 spinlock_t j_lock;
70 unsigned long j_trans_id; 71 unsigned long j_trans_id;
71 struct rw_semaphore j_trans_barrier; 72 struct rw_semaphore j_trans_barrier;
72 wait_queue_head_t j_checkpointed; 73 wait_queue_head_t j_checkpointed;
73 74
 74 spinlock_t j_lock; 75 /* Both fields protected by j_lock. */
75 struct list_head j_la_cleanups; 76 struct list_head j_la_cleanups;
76 struct work_struct j_recovery_work; 77 struct work_struct j_recovery_work;
77}; 78};
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 4c18f4ad93b4..7e32db9c2c99 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
59 return ret; 59 return ret;
60} 60}
61 61
62static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, 62static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
63 struct page *page) 63 struct page *page)
64{ 64{
65 int ret; 65 int ret;
66 struct inode *inode = file->f_path.dentry->d_inode;
66 struct address_space *mapping = inode->i_mapping; 67 struct address_space *mapping = inode->i_mapping;
67 loff_t pos = page_offset(page); 68 loff_t pos = page_offset(page);
68 unsigned int len = PAGE_CACHE_SIZE; 69 unsigned int len = PAGE_CACHE_SIZE;
@@ -111,7 +112,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
111 if (page->index == last_index) 112 if (page->index == last_index)
112 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; 113 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
113 114
114 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, 115 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
115 &fsdata, di_bh, page); 116 &fsdata, di_bh, page);
116 if (ret) { 117 if (ret) {
117 if (ret != -ENOSPC) 118 if (ret != -ENOSPC)
@@ -159,7 +160,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
159 */ 160 */
160 down_write(&OCFS2_I(inode)->ip_alloc_sem); 161 down_write(&OCFS2_I(inode)->ip_alloc_sem);
161 162
162 ret = __ocfs2_page_mkwrite(inode, di_bh, page); 163 ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
163 164
164 up_write(&OCFS2_I(inode)->ip_alloc_sem); 165 up_write(&OCFS2_I(inode)->ip_alloc_sem);
165 166
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index a00dda2e4f16..e7bde21149ae 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -171,7 +171,8 @@ bail_add:
171 ret = ERR_PTR(status); 171 ret = ERR_PTR(status);
172 goto bail_unlock; 172 goto bail_unlock;
173 } 173 }
174 } 174 } else
175 ocfs2_dentry_attach_gen(dentry);
175 176
176bail_unlock: 177bail_unlock:
177 /* Don't drop the cluster lock until *after* the d_add -- 178 /* Don't drop the cluster lock until *after* the d_add --
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c67003b6b5a2..d8408217e3bd 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
150struct ocfs2_lock_res { 150struct ocfs2_lock_res {
151 void *l_priv; 151 void *l_priv;
152 struct ocfs2_lock_res_ops *l_ops; 152 struct ocfs2_lock_res_ops *l_ops;
153 spinlock_t l_lock; 153
154 154
155 struct list_head l_blocked_list; 155 struct list_head l_blocked_list;
156 struct list_head l_mask_waiters; 156 struct list_head l_mask_waiters;
157 157
158 enum ocfs2_lock_type l_type;
159 unsigned long l_flags; 158 unsigned long l_flags;
160 char l_name[OCFS2_LOCK_ID_MAX_LEN]; 159 char l_name[OCFS2_LOCK_ID_MAX_LEN];
161 int l_level;
162 unsigned int l_ro_holders; 160 unsigned int l_ro_holders;
163 unsigned int l_ex_holders; 161 unsigned int l_ex_holders;
164 struct ocfs2_dlm_lksb l_lksb; 162 unsigned char l_level;
163
                                                                        164 /* Data packed - enum type ocfs2_lock_type */
165 unsigned char l_type;
165 166
166 /* used from AST/BAST funcs. */ 167 /* used from AST/BAST funcs. */
167 enum ocfs2_ast_action l_action; 168 /* Data packed - enum type ocfs2_ast_action */
168 enum ocfs2_unlock_action l_unlock_action; 169 unsigned char l_action;
169 int l_requested; 170 /* Data packed - enum type ocfs2_unlock_action */
170 int l_blocking; 171 unsigned char l_unlock_action;
172 unsigned char l_requested;
173 unsigned char l_blocking;
171 unsigned int l_pending_gen; 174 unsigned int l_pending_gen;
172 175
176 spinlock_t l_lock;
177
178 struct ocfs2_dlm_lksb l_lksb;
179
173 wait_queue_head_t l_event; 180 wait_queue_head_t l_event;
174 181
175 struct list_head l_debug_list; 182 struct list_head l_debug_list;
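
The repacking above works because an enum-typed field occupies a full int; demoting the five state fields to unsigned char and grouping them side by side lets the compiler close the padding holes. A hedged way to observe the effect (hypothetical debug snippet; exact sizes vary by arch and config):

pr_info("ocfs2_lock_res: %zu bytes\n",
	sizeof(struct ocfs2_lock_res));	/* compare before/after the patch */
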
@@ -243,7 +250,7 @@ enum ocfs2_local_alloc_state
243 250
244enum ocfs2_mount_options 251enum ocfs2_mount_options
245{ 252{
246 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */ 253 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
247 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ 254 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
248 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ 255 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
249 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 256 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
@@ -256,6 +263,10 @@ enum ocfs2_mount_options
256 control lists */ 263 control lists */
257 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ 264 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
258 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ 265 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
266 OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
267 writes */
268 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
269 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
259}; 270};
260 271
261#define OCFS2_OSB_SOFT_RO 0x0001 272#define OCFS2_OSB_SOFT_RO 0x0001
@@ -277,7 +288,8 @@ struct ocfs2_super
277 struct super_block *sb; 288 struct super_block *sb;
278 struct inode *root_inode; 289 struct inode *root_inode;
279 struct inode *sys_root_inode; 290 struct inode *sys_root_inode;
280 struct inode *system_inodes[NUM_SYSTEM_INODES]; 291 struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
292 struct inode **local_system_inodes;
281 293
282 struct ocfs2_slot_info *slot_info; 294 struct ocfs2_slot_info *slot_info;
283 295
@@ -368,6 +380,8 @@ struct ocfs2_super
368 struct ocfs2_alloc_stats alloc_stats; 380 struct ocfs2_alloc_stats alloc_stats;
369 char dev_str[20]; /* "major,minor" of the device */ 381 char dev_str[20]; /* "major,minor" of the device */
370 382
383 u8 osb_stackflags;
384
371 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 385 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
372 struct ocfs2_cluster_connection *cconn; 386 struct ocfs2_cluster_connection *cconn;
373 struct ocfs2_lock_res osb_super_lockres; 387 struct ocfs2_lock_res osb_super_lockres;
@@ -601,10 +615,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
601 return ret; 615 return ret;
602} 616}
603 617
604static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) 618static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
605{ 619{
606 return (osb->s_feature_incompat & 620 return (osb->s_feature_incompat &
607 OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK); 621 (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
622 OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
623}
624
625static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
626{
627 if (ocfs2_clusterinfo_valid(osb) &&
628 memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
629 OCFS2_STACK_LABEL_LEN))
630 return 1;
631 return 0;
632}
633
634static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
635{
636 if (ocfs2_clusterinfo_valid(osb) &&
637 !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
638 OCFS2_STACK_LABEL_LEN))
639 return 1;
640 return 0;
641}
642
643static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
644{
645 return ocfs2_o2cb_stack(osb) &&
646 (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
608} 647}
609 648
610static inline int ocfs2_mount_local(struct ocfs2_super *osb) 649static inline int ocfs2_mount_local(struct ocfs2_super *osb)
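
Taken together, the three new predicates split volumes cleanly by stack configuration. A minimal sketch of a hypothetical caller (illustration only, not part of the patch; assumes just the helpers defined above):

static const char *ocfs2_stack_desc(struct ocfs2_super *osb)
{
	if (!ocfs2_clusterinfo_valid(osb))
		return "o2cb, legacy (no cluster info on disk)";
	if (ocfs2_userspace_stack(osb))
		return "userspace stack";	/* stack label != "o2cb" */
	if (ocfs2_cluster_o2cb_global_heartbeat(osb))
		return "o2cb, global heartbeat";
	return "o2cb, local heartbeat";
}
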
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index fa31d05e41b7..c2e4f8222e2f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -101,7 +101,8 @@
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) 104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -170,6 +171,13 @@
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 171#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171 172
172/* 173/*
                                 174 * Incompat bit to indicate usable clusterinfo with stackflags for all
                                 175 * cluster stacks (userspace and o2cb). If this bit is set,
176 * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
177 */
178#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
179
180/*
173 * backup superblock flag is used to indicate that this volume 181 * backup superblock flag is used to indicate that this volume
174 * has backup superblocks. 182 * has backup superblocks.
175 */ 183 */
@@ -292,10 +300,13 @@
292#define OCFS2_VOL_UUID_LEN 16 300#define OCFS2_VOL_UUID_LEN 16
293#define OCFS2_MAX_VOL_LABEL_LEN 64 301#define OCFS2_MAX_VOL_LABEL_LEN 64
294 302
295/* The alternate, userspace stack fields */ 303/* The cluster stack fields */
296#define OCFS2_STACK_LABEL_LEN 4 304#define OCFS2_STACK_LABEL_LEN 4
297#define OCFS2_CLUSTER_NAME_LEN 16 305#define OCFS2_CLUSTER_NAME_LEN 16
298 306
307/* Classic (historically speaking) cluster stack */
308#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb"
309
299/* Journal limits (in bytes) */ 310/* Journal limits (in bytes) */
300#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 311#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
301 312
@@ -305,6 +316,11 @@
305 */ 316 */
306#define OCFS2_MIN_XATTR_INLINE_SIZE 256 317#define OCFS2_MIN_XATTR_INLINE_SIZE 256
307 318
319/*
320 * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
321 */
322#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01)
323
308struct ocfs2_system_inode_info { 324struct ocfs2_system_inode_info {
309 char *si_name; 325 char *si_name;
310 int si_iflags; 326 int si_iflags;
@@ -322,6 +338,7 @@ enum {
322 USER_QUOTA_SYSTEM_INODE, 338 USER_QUOTA_SYSTEM_INODE,
323 GROUP_QUOTA_SYSTEM_INODE, 339 GROUP_QUOTA_SYSTEM_INODE,
324#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE 340#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
341#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
325 ORPHAN_DIR_SYSTEM_INODE, 342 ORPHAN_DIR_SYSTEM_INODE,
326 EXTENT_ALLOC_SYSTEM_INODE, 343 EXTENT_ALLOC_SYSTEM_INODE,
327 INODE_ALLOC_SYSTEM_INODE, 344 INODE_ALLOC_SYSTEM_INODE,
@@ -330,8 +347,12 @@ enum {
330 TRUNCATE_LOG_SYSTEM_INODE, 347 TRUNCATE_LOG_SYSTEM_INODE,
331 LOCAL_USER_QUOTA_SYSTEM_INODE, 348 LOCAL_USER_QUOTA_SYSTEM_INODE,
332 LOCAL_GROUP_QUOTA_SYSTEM_INODE, 349 LOCAL_GROUP_QUOTA_SYSTEM_INODE,
350#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
333 NUM_SYSTEM_INODES 351 NUM_SYSTEM_INODES
334}; 352};
353#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE
354#define NUM_LOCAL_SYSTEM_INODES \
355 (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
335 356
336static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { 357static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
337 /* Global system inodes (single copy) */ 358 /* Global system inodes (single copy) */
@@ -360,6 +381,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
360/* Parameter passed from mount.ocfs2 to module */ 381/* Parameter passed from mount.ocfs2 to module */
361#define OCFS2_HB_NONE "heartbeat=none" 382#define OCFS2_HB_NONE "heartbeat=none"
362#define OCFS2_HB_LOCAL "heartbeat=local" 383#define OCFS2_HB_LOCAL "heartbeat=local"
384#define OCFS2_HB_GLOBAL "heartbeat=global"
363 385
364/* 386/*
365 * OCFS2 directory file types. Only the low 3 bits are used. The 387 * OCFS2 directory file types. Only the low 3 bits are used. The
@@ -566,9 +588,21 @@ struct ocfs2_slot_map_extended {
566 */ 588 */
567}; 589};
568 590
591/*
592 * ci_stackflags is only valid if the incompat bit
593 * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
594 */
569struct ocfs2_cluster_info { 595struct ocfs2_cluster_info {
570/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; 596/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
571 __le32 ci_reserved; 597 union {
598 __le32 ci_reserved;
599 struct {
600 __u8 ci_stackflags;
601 __u8 ci_reserved1;
602 __u8 ci_reserved2;
603 __u8 ci_reserved3;
604 };
605 };
572/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; 606/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
573/*18*/ 607/*18*/
574}; 608};
@@ -605,9 +639,9 @@ struct ocfs2_super_block {
605 * group header */ 639 * group header */
606/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 640/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
607/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ 641/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
608/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace 642/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
609 stack. Only valid 643 userspace or clusterinfo
610 with INCOMPAT flag. */ 644 INCOMPAT flag set. */
611/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 645/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
612 for this fs*/ 646 for this fs*/
613 __le16 s_reserved0; 647 __le16 s_reserved0;
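
Splitting the system-inode enum into a global range and a per-slot (local) range is what allows the flat per-slot cache introduced in super.c and sysfile.c below; the indexing arithmetic, as a sketch (it mirrors the computation in sysfile.c):

static inline int local_sysfile_index(int type, u32 slot)
{
	/* one row of NUM_LOCAL_SYSTEM_INODES entries per slot */
	return slot * NUM_LOCAL_SYSTEM_INODES +
	       (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
}
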
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 5d241505690b..b46f39bf7438 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -76,4 +76,99 @@ struct reflink_arguments {
76}; 76};
77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) 77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
78 78
                                 79/* The following definitions are for ocfs2_info_request ioctls. */
80#define OCFS2_INFO_MAX_REQUEST (50)
81#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2)
82
83/* Magic number of all requests */
84#define OCFS2_INFO_MAGIC (0x4F32494E)
85
86/*
                                 87 * Always try to separate an info request into small pieces to
                                 88 * guarantee backward and forward compatibility.
89 */
90struct ocfs2_info {
91 __u64 oi_requests; /* Array of __u64 pointers to requests */
92 __u32 oi_count; /* Number of requests in info_requests */
93 __u32 oi_pad;
94};
95
96struct ocfs2_info_request {
97/*00*/ __u32 ir_magic; /* Magic number */
98 __u32 ir_code; /* Info request code */
99 __u32 ir_size; /* Size of request */
100 __u32 ir_flags; /* Request flags */
101/*10*/ /* Request specific fields */
102};
103
104struct ocfs2_info_clustersize {
105 struct ocfs2_info_request ic_req;
106 __u32 ic_clustersize;
107 __u32 ic_pad;
108};
109
110struct ocfs2_info_blocksize {
111 struct ocfs2_info_request ib_req;
112 __u32 ib_blocksize;
113 __u32 ib_pad;
114};
115
116struct ocfs2_info_maxslots {
117 struct ocfs2_info_request im_req;
118 __u32 im_max_slots;
119 __u32 im_pad;
120};
121
122struct ocfs2_info_label {
123 struct ocfs2_info_request il_req;
124 __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN];
125} __attribute__ ((packed));
126
127struct ocfs2_info_uuid {
128 struct ocfs2_info_request iu_req;
129 __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
130} __attribute__ ((packed));
131
132struct ocfs2_info_fs_features {
133 struct ocfs2_info_request if_req;
134 __u32 if_compat_features;
135 __u32 if_incompat_features;
136 __u32 if_ro_compat_features;
137 __u32 if_pad;
138};
139
140struct ocfs2_info_journal_size {
141 struct ocfs2_info_request ij_req;
142 __u64 ij_journal_size;
143};
144
145/* Codes for ocfs2_info_request */
146enum ocfs2_info_type {
147 OCFS2_INFO_CLUSTERSIZE = 1,
148 OCFS2_INFO_BLOCKSIZE,
149 OCFS2_INFO_MAXSLOTS,
150 OCFS2_INFO_LABEL,
151 OCFS2_INFO_UUID,
152 OCFS2_INFO_FS_FEATURES,
153 OCFS2_INFO_JOURNAL_SIZE,
154 OCFS2_INFO_NUM_TYPES
155};
156
157/* Flags for struct ocfs2_info_request */
158/* Filled by the caller */
159#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not
160 required. This is a hint.
161 It is up to ocfs2 whether
162 the request can be fulfilled
163 without locking. */
164/* Filled by ocfs2 */
165#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood
166 this request and
167 filled in the answer */
168
169#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during
170 request handling. */
171
172#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
173
79#endif /* OCFS2_IOCTL_H */ 174#endif /* OCFS2_IOCTL_H */
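
For reference, a hedged userspace sketch of driving OCFS2_IOC_INFO (mount point hypothetical, error handling minimal; assumes the definitions above are visible). Each request is an independent struct, and oi_requests carries an array of their addresses cast to __u64; a kernel that does not understand a request simply returns it without OCFS2_INFO_FL_FILLED, which is what makes the piecewise format forward compatible:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

int main(void)
{
	struct ocfs2_info_blocksize bs;
	struct ocfs2_info_clustersize cs;
	__u64 reqs[2];
	struct ocfs2_info info;
	int fd = open("/mnt/ocfs2", O_RDONLY);	/* hypothetical mount */

	if (fd < 0)
		return 1;

	memset(&bs, 0, sizeof(bs));
	bs.ib_req.ir_magic = OCFS2_INFO_MAGIC;
	bs.ib_req.ir_code = OCFS2_INFO_BLOCKSIZE;
	bs.ib_req.ir_size = sizeof(bs);

	memset(&cs, 0, sizeof(cs));
	cs.ic_req.ir_magic = OCFS2_INFO_MAGIC;
	cs.ic_req.ir_code = OCFS2_INFO_CLUSTERSIZE;
	cs.ic_req.ir_size = sizeof(cs);
	cs.ic_req.ir_flags = OCFS2_INFO_FL_NON_COHERENT;	/* hint only */

	reqs[0] = (__u64)(unsigned long)&bs;
	reqs[1] = (__u64)(unsigned long)&cs;
	info.oi_requests = (__u64)(unsigned long)reqs;
	info.oi_count = 2;
	info.oi_pad = 0;

	if (ioctl(fd, OCFS2_IOC_INFO, &info) == 0 &&
	    (bs.ib_req.ir_flags & OCFS2_INFO_FL_FILLED) &&
	    (cs.ic_req.ir_flags & OCFS2_INFO_FL_FILLED))
		printf("block size %u, cluster size %u\n",
		       bs.ib_blocksize, cs.ic_clustersize);
	close(fd);
	return 0;
}
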
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index efdd75607406..b5f9160e93e9 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,6 +49,7 @@
49 49
50struct ocfs2_cow_context { 50struct ocfs2_cow_context {
51 struct inode *inode; 51 struct inode *inode;
52 struct file *file;
52 u32 cow_start; 53 u32 cow_start;
53 u32 cow_len; 54 u32 cow_len;
54 struct ocfs2_extent_tree data_et; 55 struct ocfs2_extent_tree data_et;
@@ -2932,13 +2933,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2932 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2933 struct page *page; 2934 struct page *page;
2934 pgoff_t page_index; 2935 pgoff_t page_index;
2935 unsigned int from, to; 2936 unsigned int from, to, readahead_pages;
2936 loff_t offset, end, map_end; 2937 loff_t offset, end, map_end;
2937 struct address_space *mapping = context->inode->i_mapping; 2938 struct address_space *mapping = context->inode->i_mapping;
2938 2939
2939 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, 2940 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
2940 new_cluster, new_len, cpos); 2941 new_cluster, new_len, cpos);
2941 2942
2943 readahead_pages =
2944 (ocfs2_cow_contig_clusters(sb) <<
2945 OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
2942 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2946 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2943 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); 2947 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2944 /* 2948 /*
@@ -2969,6 +2973,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2969 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2970 BUG_ON(PageDirty(page)); 2974 BUG_ON(PageDirty(page));
2971 2975
2976 if (PageReadahead(page) && context->file) {
2977 page_cache_async_readahead(mapping,
2978 &context->file->f_ra,
2979 context->file,
2980 page, page_index,
2981 readahead_pages);
2982 }
2983
2972 if (!PageUptodate(page)) { 2984 if (!PageUptodate(page)) {
2973 ret = block_read_full_page(page, ocfs2_get_block); 2985 ret = block_read_full_page(page, ocfs2_get_block);
2974 if (ret) { 2986 if (ret) {
@@ -3409,12 +3421,35 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3409 return ret; 3421 return ret;
3410} 3422}
3411 3423
3424static void ocfs2_readahead_for_cow(struct inode *inode,
3425 struct file *file,
3426 u32 start, u32 len)
3427{
3428 struct address_space *mapping;
3429 pgoff_t index;
3430 unsigned long num_pages;
3431 int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
3432
3433 if (!file)
3434 return;
3435
3436 mapping = file->f_mapping;
3437 num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
3438 if (!num_pages)
3439 num_pages = 1;
3440
3441 index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
3442 page_cache_sync_readahead(mapping, &file->f_ra, file,
3443 index, num_pages);
3444}
3445
3412/* 3446/*
3413 * Starting at cpos, try to CoW write_len clusters. Don't CoW 3447 * Starting at cpos, try to CoW write_len clusters. Don't CoW
3414 * past max_cpos. This will stop when it runs into a hole or an 3448 * past max_cpos. This will stop when it runs into a hole or an
3415 * unrefcounted extent. 3449 * unrefcounted extent.
3416 */ 3450 */
3417static int ocfs2_refcount_cow_hunk(struct inode *inode, 3451static int ocfs2_refcount_cow_hunk(struct inode *inode,
3452 struct file *file,
3418 struct buffer_head *di_bh, 3453 struct buffer_head *di_bh,
3419 u32 cpos, u32 write_len, u32 max_cpos) 3454 u32 cpos, u32 write_len, u32 max_cpos)
3420{ 3455{
@@ -3443,6 +3478,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3443 3478
3444 BUG_ON(cow_len == 0); 3479 BUG_ON(cow_len == 0);
3445 3480
3481 ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
3482
3446 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); 3483 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3447 if (!context) { 3484 if (!context) {
3448 ret = -ENOMEM; 3485 ret = -ENOMEM;
@@ -3464,6 +3501,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3464 context->ref_root_bh = ref_root_bh; 3501 context->ref_root_bh = ref_root_bh;
3465 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; 3502 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3466 context->get_clusters = ocfs2_di_get_clusters; 3503 context->get_clusters = ocfs2_di_get_clusters;
3504 context->file = file;
3467 3505
3468 ocfs2_init_dinode_extent_tree(&context->data_et, 3506 ocfs2_init_dinode_extent_tree(&context->data_et,
3469 INODE_CACHE(inode), di_bh); 3507 INODE_CACHE(inode), di_bh);
@@ -3492,6 +3530,7 @@ out:
3492 * clusters between cpos and cpos+write_len are safe to modify. 3530 * clusters between cpos and cpos+write_len are safe to modify.
3493 */ 3531 */
3494int ocfs2_refcount_cow(struct inode *inode, 3532int ocfs2_refcount_cow(struct inode *inode,
3533 struct file *file,
3495 struct buffer_head *di_bh, 3534 struct buffer_head *di_bh,
3496 u32 cpos, u32 write_len, u32 max_cpos) 3535 u32 cpos, u32 write_len, u32 max_cpos)
3497{ 3536{
@@ -3511,7 +3550,7 @@ int ocfs2_refcount_cow(struct inode *inode,
3511 num_clusters = write_len; 3550 num_clusters = write_len;
3512 3551
3513 if (ext_flags & OCFS2_EXT_REFCOUNTED) { 3552 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3514 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, 3553 ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
3515 num_clusters, max_cpos); 3554 num_clusters, max_cpos);
3516 if (ret) { 3555 if (ret) {
3517 mlog_errno(ret); 3556 mlog_errno(ret);
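
The cluster-to-page conversions above are easy to sanity-check with concrete numbers. A worked sketch with hypothetical geometry (4 KB pages, so a shift of 12, and 64 KB clusters, s_clustersize_bits = 16):

static unsigned long cow_readahead_window(u32 start, u32 len,
					  int cs_bits, pgoff_t *index)
{
	/* 12 == PAGE_CACHE_SHIFT for 4 KB pages */
	unsigned long num_pages = ((unsigned long)len << cs_bits) >> 12;

	*index = ((loff_t)start << cs_bits) >> 12;
	/* start = 10, len = 3  ->  *index = 160, num_pages = 48 */
	return num_pages ? num_pages : 1;	/* sub-page cluster: still 1 */
}
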
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9983ba1570e2..c8ce46f7d8e3 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
21 struct rb_node rf_node; 21 struct rb_node rf_node;
22 u64 rf_blkno; 22 u64 rf_blkno;
23 u32 rf_generation; 23 u32 rf_generation;
24 struct kref rf_getcnt;
24 struct rw_semaphore rf_sem; 25 struct rw_semaphore rf_sem;
25 struct ocfs2_lock_res rf_lockres; 26 struct ocfs2_lock_res rf_lockres;
26 struct kref rf_getcnt;
27 int rf_removed; 27 int rf_removed;
28 28
29 /* the following 4 fields are used by caching_info. */ 29 /* the following 4 fields are used by caching_info. */
30 struct ocfs2_caching_info rf_ci;
31 spinlock_t rf_lock; 30 spinlock_t rf_lock;
31 struct ocfs2_caching_info rf_ci;
32 struct mutex rf_io_mutex; 32 struct mutex rf_io_mutex;
33 struct super_block *rf_sb; 33 struct super_block *rf_sb;
34}; 34};
@@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 int *ref_blocks); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode,
56 struct file *filep, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 57 u32 cpos, u32 write_len, u32 max_cpos);
57 58
58typedef int (ocfs2_post_refcount_func)(struct inode *inode, 59typedef int (ocfs2_post_refcount_func)(struct inode *inode,
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bfbd7e9e949f..ab4e0172cc1d 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
357{ 357{
358 int status = 0; 358 int status = 0;
359 u64 blkno; 359 u64 blkno;
360 unsigned long long blocks, bytes; 360 unsigned long long blocks, bytes = 0;
361 unsigned int i; 361 unsigned int i;
362 struct buffer_head *bh; 362 struct buffer_head *bh;
363 363
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 0d3049f696c5..19965b00c43c 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -283,6 +283,8 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
283 /* for now we only have one cluster/node, make sure we see it 283 /* for now we only have one cluster/node, make sure we see it
284 * in the heartbeat universe */ 284 * in the heartbeat universe */
285 if (!o2hb_check_local_node_heartbeating()) { 285 if (!o2hb_check_local_node_heartbeating()) {
286 if (o2hb_global_heartbeat_active())
287 mlog(ML_ERROR, "Global heartbeat not started\n");
286 rc = -EINVAL; 288 rc = -EINVAL;
287 goto out; 289 goto out;
288 } 290 }
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 849c2f0e0a0e..5fed60de7630 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1380,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1380 } 1380 }
1381 1381
1382 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1382 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1383 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1384 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
1385 " count %u but claims %u are freed. num_bits %d",
1386 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1387 le16_to_cpu(bg->bg_bits),
1388 le16_to_cpu(bg->bg_free_bits_count), num_bits);
1389 return -EROFS;
1390 }
1383 while(num_bits--) 1391 while(num_bits--)
1384 ocfs2_set_bit(bit_off++, bitmap); 1392 ocfs2_set_bit(bit_off++, bitmap);
1385 1393
@@ -2419,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2419 (unsigned long *) undo_bg->bg_bitmap); 2427 (unsigned long *) undo_bg->bg_bitmap);
2420 } 2428 }
2421 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2429 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2430 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2431 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
2432 " count %u but claims %u are freed. num_bits %d",
2433 (unsigned long long)le64_to_cpu(bg->bg_blkno),
2434 le16_to_cpu(bg->bg_bits),
2435 le16_to_cpu(bg->bg_free_bits_count), num_bits);
2436 return -EROFS;
2437 }
2422 2438
2423 if (undo_fn) 2439 if (undo_fn)
2424 jbd_unlock_bh_state(group_bh); 2440 jbd_unlock_bh_state(group_bh);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b7d724393b5a..56f0cb395820 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -162,6 +162,7 @@ enum {
162 Opt_nointr, 162 Opt_nointr,
163 Opt_hb_none, 163 Opt_hb_none,
164 Opt_hb_local, 164 Opt_hb_local,
165 Opt_hb_global,
165 Opt_data_ordered, 166 Opt_data_ordered,
166 Opt_data_writeback, 167 Opt_data_writeback,
167 Opt_atime_quantum, 168 Opt_atime_quantum,
@@ -177,6 +178,8 @@ enum {
177 Opt_noacl, 178 Opt_noacl,
178 Opt_usrquota, 179 Opt_usrquota,
179 Opt_grpquota, 180 Opt_grpquota,
181 Opt_coherency_buffered,
182 Opt_coherency_full,
180 Opt_resv_level, 183 Opt_resv_level,
181 Opt_dir_resv_level, 184 Opt_dir_resv_level,
182 Opt_err, 185 Opt_err,
@@ -190,6 +193,7 @@ static const match_table_t tokens = {
190 {Opt_nointr, "nointr"}, 193 {Opt_nointr, "nointr"},
191 {Opt_hb_none, OCFS2_HB_NONE}, 194 {Opt_hb_none, OCFS2_HB_NONE},
192 {Opt_hb_local, OCFS2_HB_LOCAL}, 195 {Opt_hb_local, OCFS2_HB_LOCAL},
196 {Opt_hb_global, OCFS2_HB_GLOBAL},
193 {Opt_data_ordered, "data=ordered"}, 197 {Opt_data_ordered, "data=ordered"},
194 {Opt_data_writeback, "data=writeback"}, 198 {Opt_data_writeback, "data=writeback"},
195 {Opt_atime_quantum, "atime_quantum=%u"}, 199 {Opt_atime_quantum, "atime_quantum=%u"},
@@ -205,6 +209,8 @@ static const match_table_t tokens = {
205 {Opt_noacl, "noacl"}, 209 {Opt_noacl, "noacl"},
206 {Opt_usrquota, "usrquota"}, 210 {Opt_usrquota, "usrquota"},
207 {Opt_grpquota, "grpquota"}, 211 {Opt_grpquota, "grpquota"},
212 {Opt_coherency_buffered, "coherency=buffered"},
213 {Opt_coherency_full, "coherency=full"},
208 {Opt_resv_level, "resv_level=%u"}, 214 {Opt_resv_level, "resv_level=%u"},
209 {Opt_dir_resv_level, "dir_resv_level=%u"}, 215 {Opt_dir_resv_level, "dir_resv_level=%u"},
210 {Opt_err, NULL} 216 {Opt_err, NULL}
@@ -514,11 +520,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
514 520
515 mlog_entry_void(); 521 mlog_entry_void();
516 522
517 for (i = 0; i < NUM_SYSTEM_INODES; i++) { 523 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
518 inode = osb->system_inodes[i]; 524 inode = osb->global_system_inodes[i];
519 if (inode) { 525 if (inode) {
520 iput(inode); 526 iput(inode);
521 osb->system_inodes[i] = NULL; 527 osb->global_system_inodes[i] = NULL;
522 } 528 }
523 } 529 }
524 530
@@ -534,6 +540,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
534 osb->root_inode = NULL; 540 osb->root_inode = NULL;
535 } 541 }
536 542
543 if (!osb->local_system_inodes)
544 goto out;
545
546 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
547 if (osb->local_system_inodes[i]) {
548 iput(osb->local_system_inodes[i]);
549 osb->local_system_inodes[i] = NULL;
550 }
551 }
552
553 kfree(osb->local_system_inodes);
554 osb->local_system_inodes = NULL;
555
556out:
537 mlog_exit(0); 557 mlog_exit(0);
538} 558}
539 559
@@ -608,6 +628,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
608 int ret = 0; 628 int ret = 0;
609 struct mount_options parsed_options; 629 struct mount_options parsed_options;
610 struct ocfs2_super *osb = OCFS2_SB(sb); 630 struct ocfs2_super *osb = OCFS2_SB(sb);
631 u32 tmp;
611 632
612 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 633 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
613 !ocfs2_check_set_options(sb, &parsed_options)) { 634 !ocfs2_check_set_options(sb, &parsed_options)) {
@@ -615,8 +636,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
615 goto out; 636 goto out;
616 } 637 }
617 638
618 if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != 639 tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
619 (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 640 OCFS2_MOUNT_HB_NONE;
641 if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
620 ret = -EINVAL; 642 ret = -EINVAL;
621 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); 643 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
622 goto out; 644 goto out;
@@ -806,23 +828,29 @@ bail:
806 828
807static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) 829static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
808{ 830{
809 if (ocfs2_mount_local(osb)) { 831 u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
810 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { 832
833 if (osb->s_mount_opt & hb_enabled) {
834 if (ocfs2_mount_local(osb)) {
811 mlog(ML_ERROR, "Cannot heartbeat on a locally " 835 mlog(ML_ERROR, "Cannot heartbeat on a locally "
812 "mounted device.\n"); 836 "mounted device.\n");
813 return -EINVAL; 837 return -EINVAL;
814 } 838 }
815 } 839 if (ocfs2_userspace_stack(osb)) {
816
817 if (ocfs2_userspace_stack(osb)) {
818 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
819 mlog(ML_ERROR, "Userspace stack expected, but " 840 mlog(ML_ERROR, "Userspace stack expected, but "
820 "o2cb heartbeat arguments passed to mount\n"); 841 "o2cb heartbeat arguments passed to mount\n");
821 return -EINVAL; 842 return -EINVAL;
822 } 843 }
844 if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
845 !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
846 ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
847 ocfs2_cluster_o2cb_global_heartbeat(osb))) {
848 mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
849 return -EINVAL;
850 }
823 } 851 }
824 852
825 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 853 if (!(osb->s_mount_opt & hb_enabled)) {
826 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && 854 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
827 !ocfs2_userspace_stack(osb)) { 855 !ocfs2_userspace_stack(osb)) {
828 mlog(ML_ERROR, "Heartbeat has to be started to mount " 856 mlog(ML_ERROR, "Heartbeat has to be started to mount "
@@ -1288,6 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1288{ 1316{
1289 int status; 1317 int status;
1290 char *p; 1318 char *p;
1319 u32 tmp;
1291 1320
1292 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 1321 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
1293 options ? options : "(none)"); 1322 options ? options : "(none)");
@@ -1319,7 +1348,10 @@ static int ocfs2_parse_options(struct super_block *sb,
1319 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; 1348 mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
1320 break; 1349 break;
1321 case Opt_hb_none: 1350 case Opt_hb_none:
1322 mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; 1351 mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
1352 break;
1353 case Opt_hb_global:
1354 mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
1323 break; 1355 break;
1324 case Opt_barrier: 1356 case Opt_barrier:
1325 if (match_int(&args[0], &option)) { 1357 if (match_int(&args[0], &option)) {
@@ -1435,6 +1467,12 @@ static int ocfs2_parse_options(struct super_block *sb,
1435 case Opt_grpquota: 1467 case Opt_grpquota:
1436 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1468 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1437 break; 1469 break;
1470 case Opt_coherency_buffered:
1471 mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
1472 break;
1473 case Opt_coherency_full:
1474 mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
1475 break;
1438 case Opt_acl: 1476 case Opt_acl:
1439 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1477 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1440 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; 1478 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1474,6 +1512,15 @@ static int ocfs2_parse_options(struct super_block *sb,
1474 } 1512 }
1475 } 1513 }
1476 1514
1515 /* Ensure only one heartbeat mode */
1516 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
1517 OCFS2_MOUNT_HB_NONE);
1518 if (hweight32(tmp) != 1) {
1519 mlog(ML_ERROR, "Invalid heartbeat mount options\n");
1520 status = 0;
1521 goto bail;
1522 }
1523
1477 status = 1; 1524 status = 1;
1478 1525
1479bail: 1526bail:
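
The hweight32() test is a compact way of demanding exactly one of N mode bits, and it is also why Opt_hb_none now sets an explicit flag instead of clearing OCFS2_MOUNT_HB_LOCAL. Restated in isolation (a sketch, not additional kernel code):

static bool one_heartbeat_mode(u32 mount_opt)
{
	u32 modes = mount_opt & (OCFS2_MOUNT_HB_LOCAL |
				 OCFS2_MOUNT_HB_GLOBAL |
				 OCFS2_MOUNT_HB_NONE);

	/* popcount: 0 bits = no mode chosen, >1 bits = conflicting modes */
	return hweight32(modes) == 1;
}
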
@@ -1487,10 +1534,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1487 unsigned long opts = osb->s_mount_opt; 1534 unsigned long opts = osb->s_mount_opt;
1488 unsigned int local_alloc_megs; 1535 unsigned int local_alloc_megs;
1489 1536
1490 if (opts & OCFS2_MOUNT_HB_LOCAL) 1537 if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
1491 seq_printf(s, ",_netdev,heartbeat=local"); 1538 seq_printf(s, ",_netdev");
1492 else 1539 if (opts & OCFS2_MOUNT_HB_LOCAL)
1493 seq_printf(s, ",heartbeat=none"); 1540 seq_printf(s, ",%s", OCFS2_HB_LOCAL);
1541 else
1542 seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
1543 } else
1544 seq_printf(s, ",%s", OCFS2_HB_NONE);
1494 1545
1495 if (opts & OCFS2_MOUNT_NOINTR) 1546 if (opts & OCFS2_MOUNT_NOINTR)
1496 seq_printf(s, ",nointr"); 1547 seq_printf(s, ",nointr");
@@ -1533,6 +1584,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1533 if (opts & OCFS2_MOUNT_GRPQUOTA) 1584 if (opts & OCFS2_MOUNT_GRPQUOTA)
1534 seq_printf(s, ",grpquota"); 1585 seq_printf(s, ",grpquota");
1535 1586
1587 if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
1588 seq_printf(s, ",coherency=buffered");
1589 else
1590 seq_printf(s, ",coherency=full");
1591
1536 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1592 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1537 seq_printf(s, ",nouser_xattr"); 1593 seq_printf(s, ",nouser_xattr");
1538 else 1594 else
@@ -1983,6 +2039,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
1983 return 0; 2039 return 0;
1984} 2040}
1985 2041
2042/* Make sure entire volume is addressable by our journal. Requires
2043 osb_clusters_at_boot to be valid and for the journal to have been
2044 initialized by ocfs2_journal_init(). */
2045static int ocfs2_journal_addressable(struct ocfs2_super *osb)
2046{
2047 int status = 0;
2048 u64 max_block =
2049 ocfs2_clusters_to_blocks(osb->sb,
2050 osb->osb_clusters_at_boot) - 1;
2051
2052 /* 32-bit block number is always OK. */
2053 if (max_block <= (u32)~0ULL)
2054 goto out;
2055
2056 /* Volume is "huge", so see if our journal is new enough to
2057 support it. */
2058 if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
2059 OCFS2_FEATURE_COMPAT_JBD2_SB) &&
2060 jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
2061 JBD2_FEATURE_INCOMPAT_64BIT))) {
2062 mlog(ML_ERROR, "The journal cannot address the entire volume. "
2063 "Enable the 'block64' journal option with tunefs.ocfs2");
2064 status = -EFBIG;
2065 goto out;
2066 }
2067
2068 out:
2069 return status;
2070}
2071
1986static int ocfs2_initialize_super(struct super_block *sb, 2072static int ocfs2_initialize_super(struct super_block *sb,
1987 struct buffer_head *bh, 2073 struct buffer_head *bh,
1988 int sector_size, 2074 int sector_size,
@@ -1995,6 +2081,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1995 struct ocfs2_journal *journal; 2081 struct ocfs2_journal *journal;
1996 __le32 uuid_net_key; 2082 __le32 uuid_net_key;
1997 struct ocfs2_super *osb; 2083 struct ocfs2_super *osb;
2084 u64 total_blocks;
1998 2085
1999 mlog_entry_void(); 2086 mlog_entry_void();
2000 2087
@@ -2053,6 +2140,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
2053 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 2140 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
2054 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 2141 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
2055 2142
2143 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2144 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2145 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2146 osb->max_slots);
2147 status = -EINVAL;
2148 goto bail;
2149 }
2150 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2151
2056 ocfs2_orphan_scan_init(osb); 2152 ocfs2_orphan_scan_init(osb);
2057 2153
2058 status = ocfs2_recovery_init(osb); 2154 status = ocfs2_recovery_init(osb);
@@ -2091,15 +2187,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2091 goto bail; 2187 goto bail;
2092 } 2188 }
2093 2189
2094 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
2095 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
2096 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
2097 osb->max_slots);
2098 status = -EINVAL;
2099 goto bail;
2100 }
2101 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2102
2103 osb->slot_recovery_generations = 2190 osb->slot_recovery_generations =
2104 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), 2191 kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
2105 GFP_KERNEL); 2192 GFP_KERNEL);
@@ -2142,7 +2229,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2142 goto bail; 2229 goto bail;
2143 } 2230 }
2144 2231
2145 if (ocfs2_userspace_stack(osb)) { 2232 if (ocfs2_clusterinfo_valid(osb)) {
2233 osb->osb_stackflags =
2234 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
2146 memcpy(osb->osb_cluster_stack, 2235 memcpy(osb->osb_cluster_stack,
2147 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2236 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
2148 OCFS2_STACK_LABEL_LEN); 2237 OCFS2_STACK_LABEL_LEN);
@@ -2207,11 +2296,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
2207 goto bail; 2296 goto bail;
2208 } 2297 }
2209 2298
2210 if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) 2299 total_blocks = ocfs2_clusters_to_blocks(osb->sb,
2211 > (u32)~0UL) { 2300 le32_to_cpu(di->i_clusters));
2212 mlog(ML_ERROR, "Volume might try to write to blocks beyond " 2301
2213 "what jbd can address in 32 bits.\n"); 2302 status = generic_check_addressable(osb->sb->s_blocksize_bits,
2214 status = -EINVAL; 2303 total_blocks);
2304 if (status) {
2305 mlog(ML_ERROR, "Volume too large "
2306 "to mount safely on this system");
2307 status = -EFBIG;
2215 goto bail; 2308 goto bail;
2216 } 2309 }
2217 2310
@@ -2373,6 +2466,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2373 goto finally; 2466 goto finally;
2374 } 2467 }
2375 2468
2469 /* Now that journal has been initialized, check to make sure
2470 entire volume is addressable. */
2471 status = ocfs2_journal_addressable(osb);
2472 if (status)
2473 goto finally;
2474
2376 /* If the journal was unmounted cleanly then we don't want to 2475 /* If the journal was unmounted cleanly then we don't want to
2377 * recover anything. Otherwise, journal_load will do that 2476 * recover anything. Otherwise, journal_load will do that
2378 * dirty work for us :) */ 2477 * dirty work for us :) */
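
ocfs2_journal_addressable() is easiest to see with numbers: with 4 KB blocks, a 32-bit block number covers 2^32 * 4 KB = 16 TB, so only volumes past that need the jbd2 64-bit feature (the 'block64' option from tunefs.ocfs2). A back-of-envelope sketch (the shift parameter is hypothetical):

static bool volume_needs_block64(u64 clusters, int clusters_to_blocks_shift)
{
	u64 max_block = (clusters << clusters_to_blocks_shift) - 1;

	return max_block > (u64)(u32)~0U;	/* beyond 0xffffffff */
}
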
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 32499d213fc4..9975457c981f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
128 } 128 }
129 129
130 /* Fast symlinks can't be large */ 130 /* Fast symlinks can't be large */
131 len = strlen(target); 131 len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
132 link = kzalloc(len + 1, GFP_NOFS); 132 link = kzalloc(len + 1, GFP_NOFS);
133 if (!link) { 133 if (!link) {
134 status = -ENOMEM; 134 status = -ENOMEM;
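
The strlen() to strnlen() change bounds the scan by the fast-symlink capacity, so a corrupted inline target that lacks a terminating NUL can no longer walk past the inode's inline data. The same pattern in isolation (capacity value hypothetical):

#include <string.h>

/* cap would be ocfs2_fast_symlink_chars(sb) in the real call site */
static size_t bounded_link_len(const char *target, size_t cap)
{
	return strnlen(target, cap);	/* reads at most cap bytes */
}
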
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index bfe7190cdbf1..902efb23b6a6 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
44 int type, 44 int type,
45 u32 slot); 45 u32 slot);
46 46
47static inline int is_global_system_inode(int type);
48static inline int is_in_system_inode_array(struct ocfs2_super *osb,
49 int type,
50 u32 slot);
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC 47#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; 48static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
54#endif 49#endif
@@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type)
59 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; 54 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
60} 55}
61 56
62static inline int is_in_system_inode_array(struct ocfs2_super *osb, 57static struct inode **get_local_system_inode(struct ocfs2_super *osb,
63 int type, 58 int type,
64 u32 slot) 59 u32 slot)
65{ 60{
66 return slot == osb->slot_num || is_global_system_inode(type); 61 int index;
62 struct inode **local_system_inodes, **free = NULL;
63
64 BUG_ON(slot == OCFS2_INVALID_SLOT);
65 BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
66 type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
67
68 spin_lock(&osb->osb_lock);
69 local_system_inodes = osb->local_system_inodes;
70 spin_unlock(&osb->osb_lock);
71
72 if (unlikely(!local_system_inodes)) {
73 local_system_inodes = kzalloc(sizeof(struct inode *) *
74 NUM_LOCAL_SYSTEM_INODES *
75 osb->max_slots,
76 GFP_NOFS);
77 if (!local_system_inodes) {
78 mlog_errno(-ENOMEM);
79 /*
                                 80 * return NULL here so that ocfs2_get_system_file_inode
81 * will try to create an inode and use it. We will try
82 * to initialize local_system_inodes next time.
83 */
84 return NULL;
85 }
86
87 spin_lock(&osb->osb_lock);
88 if (osb->local_system_inodes) {
89 /* Someone has initialized it for us. */
90 free = local_system_inodes;
91 local_system_inodes = osb->local_system_inodes;
92 } else
93 osb->local_system_inodes = local_system_inodes;
94 spin_unlock(&osb->osb_lock);
95 if (unlikely(free))
96 kfree(free);
97 }
98
99 index = (slot * NUM_LOCAL_SYSTEM_INODES) +
100 (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
101
102 return &local_system_inodes[index];
67} 103}
68 104
69struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, 105struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
74 struct inode **arr = NULL; 110 struct inode **arr = NULL;
75 111
76 /* avoid the lookup if cached in local system file array */ 112 /* avoid the lookup if cached in local system file array */
77 if (is_in_system_inode_array(osb, type, slot)) 113 if (is_global_system_inode(type)) {
78 arr = &(osb->system_inodes[type]); 114 arr = &(osb->global_system_inodes[type]);
115 } else
116 arr = get_local_system_inode(osb, type, slot);
79 117
80 if (arr && ((inode = *arr) != NULL)) { 118 if (arr && ((inode = *arr) != NULL)) {
81 /* get a ref in addition to the array ref */ 119 /* get a ref in addition to the array ref */
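
get_local_system_inode() uses a classic allocate-then-recheck scheme so that the (potentially sleeping) kzalloc() happens outside osb_lock. Reduced to its essentials (a sketch; field names as above, everything else illustrative):

static struct inode **lazy_get_cache(struct ocfs2_super *osb, size_t size)
{
	struct inode **arr, **new, **free = NULL;

	spin_lock(&osb->osb_lock);
	arr = osb->local_system_inodes;
	spin_unlock(&osb->osb_lock);
	if (arr)
		return arr;

	new = kzalloc(size, GFP_NOFS);	/* may sleep: allocate unlocked */
	if (!new)
		return NULL;

	spin_lock(&osb->osb_lock);
	if (osb->local_system_inodes) {	/* another CPU won the race */
		free = new;
		arr = osb->local_system_inodes;
	} else {
		osb->local_system_inodes = arr = new;
	}
	spin_unlock(&osb->osb_lock);

	kfree(free);			/* kfree(NULL) is a no-op */
	return arr;
}
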
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 06fa5e77c40e..67cd43914641 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7081,7 +7081,7 @@ static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
7081 goto out; 7081 goto out;
7082 } 7082 }
7083 7083
7084 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) 7084 if (!indexed)
7085 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); 7085 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
7086 else 7086 else
7087 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); 7087 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a1c43e7c8a7b..8e4addaa5424 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2675,7 +2675,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2675 INF("auxv", S_IRUSR, proc_pid_auxv), 2675 INF("auxv", S_IRUSR, proc_pid_auxv),
2676 ONE("status", S_IRUGO, proc_pid_status), 2676 ONE("status", S_IRUGO, proc_pid_status),
2677 ONE("personality", S_IRUSR, proc_pid_personality), 2677 ONE("personality", S_IRUSR, proc_pid_personality),
2678 INF("limits", S_IRUSR, proc_pid_limits), 2678 INF("limits", S_IRUGO, proc_pid_limits),
2679#ifdef CONFIG_SCHED_DEBUG 2679#ifdef CONFIG_SCHED_DEBUG
2680 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2680 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2681#endif 2681#endif
@@ -3011,7 +3011,7 @@ static const struct pid_entry tid_base_stuff[] = {
3011 INF("auxv", S_IRUSR, proc_pid_auxv), 3011 INF("auxv", S_IRUSR, proc_pid_auxv),
3012 ONE("status", S_IRUGO, proc_pid_status), 3012 ONE("status", S_IRUGO, proc_pid_status),
3013 ONE("personality", S_IRUSR, proc_pid_personality), 3013 ONE("personality", S_IRUSR, proc_pid_personality),
3014 INF("limits", S_IRUSR, proc_pid_limits), 3014 INF("limits", S_IRUGO, proc_pid_limits),
3015#ifdef CONFIG_SCHED_DEBUG 3015#ifdef CONFIG_SCHED_DEBUG
3016 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 3016 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
3017#endif 3017#endif
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index f53505de0712..5cbb81e134ac 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -170,6 +170,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
170int reiserfs_unpack(struct inode *inode, struct file *filp) 170int reiserfs_unpack(struct inode *inode, struct file *filp)
171{ 171{
172 int retval = 0; 172 int retval = 0;
173 int depth;
173 int index; 174 int index;
174 struct page *page; 175 struct page *page;
175 struct address_space *mapping; 176 struct address_space *mapping;
@@ -188,8 +189,8 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
188 /* we need to make sure nobody is changing the file size beneath 189 /* we need to make sure nobody is changing the file size beneath
189 ** us 190 ** us
190 */ 191 */
191 mutex_lock(&inode->i_mutex); 192 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
192 reiserfs_write_lock(inode->i_sb); 193 depth = reiserfs_write_lock_once(inode->i_sb);
193 194
194 write_from = inode->i_size & (blocksize - 1); 195 write_from = inode->i_size & (blocksize - 1);
195 /* if we are on a block boundary, we are already unpacked. */ 196 /* if we are on a block boundary, we are already unpacked. */
@@ -224,6 +225,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
224 225
225 out: 226 out:
226 mutex_unlock(&inode->i_mutex); 227 mutex_unlock(&inode->i_mutex);
227 reiserfs_write_unlock(inode->i_sb); 228 reiserfs_write_unlock_once(inode->i_sb, depth);
228 return retval; 229 return retval;
229} 230}
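
reiserfs_write_lock_once()/_unlock_once() exist because the BKL-replacement write lock is not recursive and reiserfs_unpack() may be entered with it already held; the returned depth records whether this call actually took the lock. The idea as a generic, self-contained sketch (hypothetical type, not the reiserfs internals); the companion reiserfs_mutex_lock_safe() takes i_mutex with the write lock dropped, avoiding an inversion between the two locks:

struct once_lock {
	struct mutex m;			/* underlying non-recursive lock */
	struct task_struct *owner;	/* current holder, NULL if free */
};

static int write_lock_once(struct once_lock *l)
{
	if (l->owner == current)	/* already ours: recurse as a no-op */
		return 0;
	mutex_lock(&l->m);
	l->owner = current;
	return 1;			/* we took it here */
}

static void write_unlock_once(struct once_lock *l, int took_it)
{
	if (took_it) {			/* only release what we acquired */
		l->owner = NULL;
		mutex_unlock(&l->m);
	}
}
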
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
index e668127c8b2e..2bc24a8c4039 100644
--- a/fs/smbfs/Kconfig
+++ b/fs/smbfs/Kconfig
@@ -1,5 +1,6 @@
1config SMB_FS 1config SMB_FS
2 tristate "SMB file system support (OBSOLETE, please use CIFS)" 2 tristate "SMB file system support (OBSOLETE, please use CIFS)"
3 depends on BKL # probably unfixable
3 depends on INET 4 depends on INET
4 select NLS 5 select NLS
5 help 6 help
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 23c1e598792a..442f34ff1af8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -148,6 +148,65 @@ void sysfs_remove_group(struct kobject * kobj,
148 sysfs_put(sd); 148 sysfs_put(sd);
149} 149}
150 150
151/**
152 * sysfs_merge_group - merge files into a pre-existing attribute group.
153 * @kobj: The kobject containing the group.
154 * @grp: The files to create and the attribute group they belong to.
155 *
156 * This function returns an error if the group doesn't exist or any of the
157 * files already exist in that group, in which case none of the new files
158 * are created.
159 */
160int sysfs_merge_group(struct kobject *kobj,
161 const struct attribute_group *grp)
162{
163 struct sysfs_dirent *dir_sd;
164 int error = 0;
165 struct attribute *const *attr;
166 int i;
167
168 if (grp)
169 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
170 else
171 dir_sd = sysfs_get(kobj->sd);
172 if (!dir_sd)
173 return -ENOENT;
174
175 for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
176 error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
177 if (error) {
178 while (--i >= 0)
179 sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name);
180 }
181 sysfs_put(dir_sd);
182
183 return error;
184}
185EXPORT_SYMBOL_GPL(sysfs_merge_group);
186
187/**
188 * sysfs_unmerge_group - remove files from a pre-existing attribute group.
189 * @kobj: The kobject containing the group.
190 * @grp: The files to remove and the attribute group they belong to.
191 */
192void sysfs_unmerge_group(struct kobject *kobj,
193 const struct attribute_group *grp)
194{
195 struct sysfs_dirent *dir_sd;
196 struct attribute *const *attr;
197
198 if (grp)
199 dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
200 else
201 dir_sd = sysfs_get(kobj->sd);
202 if (dir_sd) {
203 for (attr = grp->attrs; *attr; ++attr)
204 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
205 sysfs_put(dir_sd);
206 }
207}
208EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
209
151 210
152EXPORT_SYMBOL_GPL(sysfs_create_group); 211EXPORT_SYMBOL_GPL(sysfs_create_group);
153EXPORT_SYMBOL_GPL(sysfs_update_group); 212EXPORT_SYMBOL_GPL(sysfs_update_group);
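
A hedged sketch of driver-side use of the two new helpers (the attribute and the "power" group name are hypothetical):

static struct attribute *extra_attrs[] = {
	&dev_attr_wakeup_delay.attr,	/* hypothetical attribute */
	NULL,				/* array must be NULL-terminated */
};

static const struct attribute_group extra_group = {
	.name	= "power",		/* merge into this existing group */
	.attrs	= extra_attrs,
};

static int add_extras(struct device *dev)
{
	/* -ENOENT if the group is missing; on any file collision,
	 * none of the new files are left behind */
	return sysfs_merge_group(&dev->kobj, &extra_group);
}

static void remove_extras(struct device *dev)
{
	sysfs_unmerge_group(&dev->kobj, &extra_group);
}
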
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bce..f8def3c8ea4c 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,5 +1,6 @@
1config UDF_FS 1config UDF_FS
2 tristate "UDF file system support" 2 tristate "UDF file system support"
3 depends on BKL # needs serious work to remove
3 select CRC_ITU_T 4 select CRC_ITU_T
4 help 5 help
5 This is the new file system used on some CD-ROMs and DVDs. Say Y if 6 This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index e4f10a40768a..30c8f223253d 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,6 +1,7 @@
1config UFS_FS 1config UFS_FS
2 tristate "UFS file system support (read only)" 2 tristate "UFS file system support (read only)"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # probably fixable
4 help 5 help
 5 BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD, 6
6 OpenBSD and NeXTstep) use a file system called UFS. Some System V 7 OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index d59c4a65d492..81976ffed7d6 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -668,14 +668,11 @@ xfs_inode_set_reclaim_tag(
668 xfs_perag_put(pag); 668 xfs_perag_put(pag);
669} 669}
670 670
671void 671STATIC void
672__xfs_inode_clear_reclaim_tag( 672__xfs_inode_clear_reclaim(
673 xfs_mount_t *mp,
674 xfs_perag_t *pag, 673 xfs_perag_t *pag,
675 xfs_inode_t *ip) 674 xfs_inode_t *ip)
676{ 675{
677 radix_tree_tag_clear(&pag->pag_ici_root,
678 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
679 pag->pag_ici_reclaimable--; 676 pag->pag_ici_reclaimable--;
680 if (!pag->pag_ici_reclaimable) { 677 if (!pag->pag_ici_reclaimable) {
681 /* clear the reclaim tag from the perag radix tree */ 678 /* clear the reclaim tag from the perag radix tree */
@@ -689,6 +686,17 @@ __xfs_inode_clear_reclaim_tag(
689 } 686 }
690} 687}
691 688
689void
690__xfs_inode_clear_reclaim_tag(
691 xfs_mount_t *mp,
692 xfs_perag_t *pag,
693 xfs_inode_t *ip)
694{
695 radix_tree_tag_clear(&pag->pag_ici_root,
696 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
697 __xfs_inode_clear_reclaim(pag, ip);
698}
699
692/* 700/*
693 * Inodes in different states need to be treated differently, and the return 701 * Inodes in different states need to be treated differently, and the return
694 * value of xfs_iflush is not sufficient to get this right. The following table 702 * value of xfs_iflush is not sufficient to get this right. The following table
@@ -838,6 +846,7 @@ reclaim:
838 if (!radix_tree_delete(&pag->pag_ici_root, 846 if (!radix_tree_delete(&pag->pag_ici_root,
839 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 847 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
840 ASSERT(0); 848 ASSERT(0);
849 __xfs_inode_clear_reclaim(pag, ip);
841 write_unlock(&pag->pag_ici_lock); 850 write_unlock(&pag->pag_ici_lock);
842 851
843 /* 852 /*
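
The split matters on the reclaim path just above: radix_tree_delete() removes the reclaim tag along with the node itself, so only the per-AG accounting half still needs to run, and before this change pag_ici_reclaimable was never decremented here. In outline (context elided):

/* the tag is gone with the node ... */
if (!radix_tree_delete(&pag->pag_ici_root,
		       XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
	ASSERT(0);
/* ... so only the counter bookkeeping is still required */
__xfs_inode_clear_reclaim(pag, ip);
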
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ed575fb4b495..7e206fc1fa36 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -405,9 +405,15 @@ xlog_cil_push(
405 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 405 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
406 new_ctx->ticket = xlog_cil_ticket_alloc(log); 406 new_ctx->ticket = xlog_cil_ticket_alloc(log);
407 407
408 /* lock out transaction commit, but don't block on background push */ 408 /*
409 * Lock out transaction commit, but don't block for background pushes
410 * unless we are well over the CIL space limit. See the definition of
411 * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
412 * used here.
413 */
409 if (!down_write_trylock(&cil->xc_ctx_lock)) { 414 if (!down_write_trylock(&cil->xc_ctx_lock)) {
410 if (!push_seq) 415 if (!push_seq &&
416 cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
411 goto out_free_ticket; 417 goto out_free_ticket;
412 down_write(&cil->xc_ctx_lock); 418 down_write(&cil->xc_ctx_lock);
413 } 419 }
@@ -422,7 +428,7 @@ xlog_cil_push(
422 goto out_skip; 428 goto out_skip;
423 429
 424 /* check for a previously pushed sequence */ 430 /* check for a previously pushed sequence */
425 if (push_seq < cil->xc_ctx->sequence) 431 if (push_seq && push_seq < cil->xc_ctx->sequence)
426 goto out_skip; 432 goto out_skip;
427 433
428 /* 434 /*
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index ced52b98b322..edcdfe01617f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -426,13 +426,13 @@ struct xfs_cil {
426}; 426};
427 427
428/* 428/*
429 * The amount of log space we should the CIL to aggregate is difficult to size. 429 * The amount of log space we allow the CIL to aggregate is difficult to size.
430 * Whatever we chose we have to make we can get a reservation for the log space 430 * Whatever we choose, we have to make sure we can get a reservation for the
431 * effectively, that it is large enough to capture sufficient relogging to 431 * log space effectively, that it is large enough to capture sufficient
432 * reduce log buffer IO significantly, but it is not too large for the log or 432 * relogging to reduce log buffer IO significantly, but it is not too large for
433 * induces too much latency when writing out through the iclogs. We track both 433 * the log or induces too much latency when writing out through the iclogs. We
434 * space consumed and the number of vectors in the checkpoint context, so we 434 * track both space consumed and the number of vectors in the checkpoint
435 * need to decide which to use for limiting. 435 * context, so we need to decide which to use for limiting.
436 * 436 *
437 * Every log buffer we write out during a push needs a header reserved, which 437 * Every log buffer we write out during a push needs a header reserved, which
438 * is at least one sector and more for v2 logs. Hence we need a reservation of 438 * is at least one sector and more for v2 logs. Hence we need a reservation of
@@ -459,16 +459,21 @@ struct xfs_cil {
459 * checkpoint transaction ticket is specific to the checkpoint context, rather 459 * checkpoint transaction ticket is specific to the checkpoint context, rather
460 * than the CIL itself. 460 * than the CIL itself.
461 * 461 *
462 * With dynamic reservations, we can basically make up arbitrary limits for the 462 * With dynamic reservations, we can effectively make up arbitrary limits for
463 * checkpoint size so long as they don't violate any other size rules. Hence 463 * the checkpoint size so long as they don't violate any other size rules.
464 * the initial maximum size for the checkpoint transaction will be set to a 464 * Recovery imposes a rule that no transaction exceed half the log, so we are
465 * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit 465 * limited by that. Furthermore, the log transaction reservation subsystem
466 * right now based on the latency of writing out a large amount of data through 466 * tries to keep 25% of the log free, so we need to keep below that limit or we
467 * the circular iclog buffers. 467 * risk running out of free log space to start any new transactions.
468 *
469 * In order to keep background CIL push efficient, we will set a lower
470 * threshold at which background pushing is attempted without blocking current
471 * transaction commits. A separate, higher bound defines when CIL pushes are
                                 472 * enforced to ensure we stay within our maximum checkpoint size bounds,
                                 473 * yet give us plenty of space for aggregation on large logs.
468 */ 474 */
469 475#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
470#define XLOG_CIL_SPACE_LIMIT(log) \ 476#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
471 (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
472 477
473/* 478/*
474 * The reservation head lsn is not made up of a cycle number and block number. 479 * The reservation head lsn is not made up of a cycle number and block number.