-rw-r--r--  MAINTAINERS | 11
-rw-r--r--  drivers/block/Kconfig | 17
-rw-r--r--  drivers/block/Makefile | 1
-rw-r--r--  drivers/block/rbd.c | 1841
-rw-r--r--  drivers/block/rbd_types.h | 73
-rw-r--r--  drivers/block/virtio_blk.c | 17
-rw-r--r--  drivers/char/virtio_console.c | 240
-rw-r--r--  fs/ceph/Kconfig | 14
-rw-r--r--  fs/ceph/Makefile | 11
-rw-r--r--  fs/ceph/README | 20
-rw-r--r--  fs/ceph/addr.c | 65
-rw-r--r--  fs/ceph/caps.c | 50
-rw-r--r--  fs/ceph/ceph_frag.c | 3
-rw-r--r--  fs/ceph/debugfs.c | 406
-rw-r--r--  fs/ceph/dir.c | 97
-rw-r--r--  fs/ceph/export.c | 5
-rw-r--r--  fs/ceph/file.c | 207
-rw-r--r--  fs/ceph/inode.c | 19
-rw-r--r--  fs/ceph/ioctl.c | 77
-rw-r--r--  fs/ceph/ioctl.h | 4
-rw-r--r--  fs/ceph/locks.c | 23
-rw-r--r--  fs/ceph/mds_client.c | 129
-rw-r--r--  fs/ceph/mds_client.h | 20
-rw-r--r--  fs/ceph/mdsmap.c | 11
-rw-r--r--  fs/ceph/pagelist.c | 63
-rw-r--r--  fs/ceph/snap.c | 10
-rw-r--r--  fs/ceph/strings.c (renamed from fs/ceph/ceph_strings.c) | 82
-rw-r--r--  fs/ceph/super.c | 1154
-rw-r--r--  fs/ceph/super.h | 400
-rw-r--r--  fs/ceph/xattr.c | 18
-rw-r--r--  fs/gfs2/Kconfig | 2
-rw-r--r--  fs/gfs2/aops.c | 24
-rw-r--r--  fs/gfs2/bmap.c | 255
-rw-r--r--  fs/gfs2/bmap.h | 20
-rw-r--r--  fs/gfs2/dentry.c | 2
-rw-r--r--  fs/gfs2/dir.c | 31
-rw-r--r--  fs/gfs2/dir.h | 34
-rw-r--r--  fs/gfs2/export.c | 9
-rw-r--r--  fs/gfs2/file.c | 6
-rw-r--r--  fs/gfs2/glock.c | 23
-rw-r--r--  fs/gfs2/glock.h | 2
-rw-r--r--  fs/gfs2/glops.c | 6
-rw-r--r--  fs/gfs2/incore.h | 8
-rw-r--r--  fs/gfs2/inode.c | 9
-rw-r--r--  fs/gfs2/inode.h | 15
-rw-r--r--  fs/gfs2/lock_dlm.c | 4
-rw-r--r--  fs/gfs2/main.c | 6
-rw-r--r--  fs/gfs2/ops_fstype.c | 79
-rw-r--r--  fs/gfs2/ops_inode.c | 326
-rw-r--r--  fs/gfs2/quota.c | 16
-rw-r--r--  fs/gfs2/recovery.c | 15
-rw-r--r--  fs/gfs2/rgrp.c | 50
-rw-r--r--  fs/gfs2/rgrp.h | 8
-rw-r--r--  fs/gfs2/super.c | 26
-rw-r--r--  fs/gfs2/sys.c | 22
-rw-r--r--  fs/gfs2/trace_gfs2.h | 3
-rw-r--r--  fs/gfs2/trans.h | 9
-rw-r--r--  fs/gfs2/xattr.c | 2
-rw-r--r--  fs/hfsplus/bfind.c | 17
-rw-r--r--  fs/hfsplus/bitmap.c | 20
-rw-r--r--  fs/hfsplus/brec.c | 29
-rw-r--r--  fs/hfsplus/btree.c | 67
-rw-r--r--  fs/hfsplus/catalog.c | 50
-rw-r--r--  fs/hfsplus/dir.c | 201
-rw-r--r--  fs/hfsplus/extents.c | 223
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 85
-rw-r--r--  fs/hfsplus/hfsplus_raw.h | 3
-rw-r--r--  fs/hfsplus/inode.c | 185
-rw-r--r--  fs/hfsplus/ioctl.c | 153
-rw-r--r--  fs/hfsplus/options.c | 10
-rw-r--r--  fs/hfsplus/part_tbl.c | 5
-rw-r--r--  fs/hfsplus/super.c | 310
-rw-r--r--  fs/hfsplus/unicode.c | 16
-rw-r--r--  fs/hfsplus/wrapper.c | 40
-rw-r--r--  include/linux/ceph/auth.h (renamed from fs/ceph/auth.h) | 4
-rw-r--r--  include/linux/ceph/buffer.h (renamed from fs/ceph/buffer.h) | 0
-rw-r--r--  include/linux/ceph/ceph_debug.h (renamed from fs/ceph/ceph_debug.h) | 5
-rw-r--r--  include/linux/ceph/ceph_frag.h (renamed from fs/ceph/ceph_frag.h) | 0
-rw-r--r--  include/linux/ceph/ceph_fs.h (renamed from fs/ceph/ceph_fs.h) | 1
-rw-r--r--  include/linux/ceph/ceph_hash.h (renamed from fs/ceph/ceph_hash.h) | 0
-rw-r--r--  include/linux/ceph/debugfs.h | 33
-rw-r--r--  include/linux/ceph/decode.h (renamed from fs/ceph/decode.h) | 5
-rw-r--r--  include/linux/ceph/libceph.h | 249
-rw-r--r--  include/linux/ceph/mdsmap.h (renamed from fs/ceph/mdsmap.h) | 0
-rw-r--r--  include/linux/ceph/messenger.h (renamed from fs/ceph/messenger.h) | 12
-rw-r--r--  include/linux/ceph/mon_client.h (renamed from fs/ceph/mon_client.h) | 1
-rw-r--r--  include/linux/ceph/msgpool.h (renamed from fs/ceph/msgpool.h) | 0
-rw-r--r--  include/linux/ceph/msgr.h (renamed from fs/ceph/msgr.h) | 0
-rw-r--r--  include/linux/ceph/osd_client.h (renamed from fs/ceph/osd_client.h) | 67
-rw-r--r--  include/linux/ceph/osdmap.h (renamed from fs/ceph/osdmap.h) | 4
-rw-r--r--  include/linux/ceph/pagelist.h (renamed from fs/ceph/pagelist.h) | 23
-rw-r--r--  include/linux/ceph/rados.h (renamed from fs/ceph/rados.h) | 0
-rw-r--r--  include/linux/ceph/types.h (renamed from fs/ceph/types.h) | 0
-rw-r--r--  include/linux/crush/crush.h (renamed from fs/ceph/crush/crush.h) | 0
-rw-r--r--  include/linux/crush/hash.h (renamed from fs/ceph/crush/hash.h) | 0
-rw-r--r--  include/linux/crush/mapper.h (renamed from fs/ceph/crush/mapper.h) | 0
-rw-r--r--  net/Kconfig | 1
-rw-r--r--  net/Makefile | 1
-rw-r--r--  net/ceph/Kconfig | 28
-rw-r--r--  net/ceph/Makefile | 37
-rw-r--r--  net/ceph/armor.c (renamed from fs/ceph/armor.c) | 0
-rw-r--r--  net/ceph/auth.c (renamed from fs/ceph/auth.c) | 10
-rw-r--r--  net/ceph/auth_none.c (renamed from fs/ceph/auth_none.c) | 7
-rw-r--r--  net/ceph/auth_none.h (renamed from fs/ceph/auth_none.h) | 3
-rw-r--r--  net/ceph/auth_x.c (renamed from fs/ceph/auth_x.c) | 9
-rw-r--r--  net/ceph/auth_x.h (renamed from fs/ceph/auth_x.h) | 3
-rw-r--r--  net/ceph/auth_x_protocol.h (renamed from fs/ceph/auth_x_protocol.h) | 0
-rw-r--r--  net/ceph/buffer.c (renamed from fs/ceph/buffer.c) | 9
-rw-r--r--  net/ceph/ceph_common.c | 529
-rw-r--r--  net/ceph/ceph_fs.c (renamed from fs/ceph/ceph_fs.c) | 5
-rw-r--r--  net/ceph/ceph_hash.c (renamed from fs/ceph/ceph_hash.c) | 2
-rw-r--r--  net/ceph/ceph_strings.c | 84
-rw-r--r--  net/ceph/crush/crush.c (renamed from fs/ceph/crush/crush.c) | 2
-rw-r--r--  net/ceph/crush/hash.c (renamed from fs/ceph/crush/hash.c) | 2
-rw-r--r--  net/ceph/crush/mapper.c (renamed from fs/ceph/crush/mapper.c) | 4
-rw-r--r--  net/ceph/crypto.c (renamed from fs/ceph/crypto.c) | 4
-rw-r--r--  net/ceph/crypto.h (renamed from fs/ceph/crypto.h) | 4
-rw-r--r--  net/ceph/debugfs.c | 267
-rw-r--r--  net/ceph/messenger.c (renamed from fs/ceph/messenger.c) | 296
-rw-r--r--  net/ceph/mon_client.c (renamed from fs/ceph/mon_client.c) | 73
-rw-r--r--  net/ceph/msgpool.c (renamed from fs/ceph/msgpool.c) | 4
-rw-r--r--  net/ceph/osd_client.c (renamed from fs/ceph/osd_client.c) | 400
-rw-r--r--  net/ceph/osdmap.c (renamed from fs/ceph/osdmap.c) | 30
-rw-r--r--  net/ceph/pagelist.c | 154
-rw-r--r--  net/ceph/pagevec.c | 223
125 files changed, 6971 insertions, 3138 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index f2a2b8e647c5..3d4179fbc526 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1527,6 +1527,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
 S: Supported
 F: Documentation/filesystems/ceph.txt
 F: fs/ceph
+F: net/ceph
+F: include/linux/ceph
 
 CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
 M: David Vrabel <david.vrabel@csr.com>
@@ -4805,6 +4807,15 @@ F: fs/qnx4/
 F: include/linux/qnx4_fs.h
 F: include/linux/qnxtypes.h
 
+RADOS BLOCK DEVICE (RBD)
+F: include/linux/qnxtypes.h
+M: Yehuda Sadeh <yehuda@hq.newdream.net>
+M: Sage Weil <sage@newdream.net>
+M: ceph-devel@vger.kernel.org
+S: Supported
+F: drivers/block/rbd.c
+F: drivers/block/rbd_types.h
+
 RADEON FRAMEBUFFER DISPLAY DRIVER
 M: Benjamin Herrenschmidt <benh@kernel.crashing.org>
 L: linux-fbdev@vger.kernel.org
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index de277689da61..4b9359a6f6ca 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -488,4 +488,21 @@ config BLK_DEV_HD
 
 	  If unsure, say N.
 
+config BLK_DEV_RBD
+	tristate "Rados block device (RBD)"
+	depends on INET && EXPERIMENTAL && BLOCK
+	select CEPH_LIB
+	select LIBCRC32C
+	select CRYPTO_AES
+	select CRYPTO
+	default n
+	help
+	  Say Y here if you want to include the Rados block device, which
+	  stripes a block device over objects stored in the Ceph distributed
+	  object store.
+
+	  More information at http://ceph.newdream.net/.
+
+	  If unsure, say N.
+
 endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index aff5ac925c34..d7f463d6312d 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o
 
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
 obj-$(CONFIG_BLK_DEV_DRBD)		+= drbd/
+obj-$(CONFIG_BLK_DEV_RBD)		+= rbd.o
 
 swim_mod-objs	:= swim.o swim_asm.o
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
new file mode 100644
index 000000000000..6ec9d53806c5
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,1841 @@
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
24 Instructions for use
25 --------------------
26
27 1) Map a Linux block device to an existing rbd image.
28
29 Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name]
30
31 $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add
32
33 The snapshot name can be "-" or omitted to map the image read/write.
34
35 2) List all active blkdev<->object mappings.
36
 37 In this example, we have performed step #1 once, creating a single blkdev
 38 mapped to a rados object in the rados rbd pool.
39
40 $ cat /sys/class/rbd/list
41 #id major client_name pool name snap KB
42 0 254 client4143 rbd foo - 1024000
43
44 The columns, in order, are:
45 - blkdev unique id
46 - blkdev assigned major
47 - rados client id
48 - rados pool name
49 - rados block device name
50 - mapped snapshot ("-" if none)
51 - device size in KB
52
53
54 3) Create a snapshot.
55
56 Usage: <blkdev id> <snapname>
57
58 $ echo "0 mysnap" > /sys/class/rbd/snap_create
59
60
 61 4) List snapshots.
62
63 $ cat /sys/class/rbd/snaps_list
64 #id snap KB
65 0 - 1024000 (*)
66 0 foo 1024000
67
68 The columns, in order, are:
69 - blkdev unique id
70 - snapshot name, '-' means none (active read/write version)
71 - size of device at time of snapshot
72 - the (*) indicates this is the active version
73
 74 5) Roll back to a snapshot.
75
76 Usage: <blkdev id> <snapname>
77
78 $ echo "0 mysnap" > /sys/class/rbd/snap_rollback
79
80
 81 6) Map an image using a snapshot.
 82
 83 A snapshot mapping is read-only. This is done by passing
 84 snap=<snapname> to the options when adding a device.
85
86 $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add
87
88
89 7) Remove an active blkdev<->rbd image mapping.
90
91 In this example, we remove the mapping with blkdev unique id 1.
92
93 $ echo 1 > /sys/class/rbd/remove
94
95
96 NOTE: The actual creation and deletion of rados objects is outside the scope
97 of this driver.
98
99 */
100
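/*
 * A minimal end-to-end session, for illustration only (monitor address,
 * pool, image and snapshot names are examples; the image must already
 * exist, e.g. created with the userspace rbd tool):
 *
 *   $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add
 *   $ cat /sys/class/rbd/list           # note the assigned id, e.g. 0
 *   $ echo "0 mysnap" > /sys/class/rbd/snap_create
 *   $ cat /sys/class/rbd/snaps_list
 *   $ echo 0 > /sys/class/rbd/remove
 */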
101#include <linux/ceph/libceph.h>
102#include <linux/ceph/osd_client.h>
103#include <linux/ceph/mon_client.h>
104#include <linux/ceph/decode.h>
105
106#include <linux/kernel.h>
107#include <linux/device.h>
108#include <linux/module.h>
109#include <linux/fs.h>
110#include <linux/blkdev.h>
111
112#include "rbd_types.h"
113
114#define DRV_NAME "rbd"
115#define DRV_NAME_LONG "rbd (rados block device)"
116
117#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
118
119#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX))
120#define RBD_MAX_POOL_NAME_LEN 64
121#define RBD_MAX_SNAP_NAME_LEN 32
122#define RBD_MAX_OPT_LEN 1024
123
124#define RBD_SNAP_HEAD_NAME "-"
125
126#define DEV_NAME_LEN 32
127
128/*
129 * block device image metadata (in-memory version)
130 */
131struct rbd_image_header {
132 u64 image_size;
133 char block_name[32];
134 __u8 obj_order;
135 __u8 crypt_type;
136 __u8 comp_type;
137 struct rw_semaphore snap_rwsem;
138 struct ceph_snap_context *snapc;
139 size_t snap_names_len;
140 u64 snap_seq;
141 u32 total_snaps;
142
143 char *snap_names;
144 u64 *snap_sizes;
145};
146
147/*
148 * an instance of the client. multiple devices may share a client.
149 */
150struct rbd_client {
151 struct ceph_client *client;
152 struct kref kref;
153 struct list_head node;
154};
155
156/*
157 * a single io request
158 */
159struct rbd_request {
160 struct request *rq; /* blk layer request */
161 struct bio *bio; /* cloned bio */
162 struct page **pages; /* list of used pages */
163 u64 len;
164};
165
166/*
167 * a single device
168 */
169struct rbd_device {
170 int id; /* blkdev unique id */
171
172 int major; /* blkdev assigned major */
173 struct gendisk *disk; /* blkdev's gendisk and rq */
174 struct request_queue *q;
175
176 struct ceph_client *client;
177 struct rbd_client *rbd_client;
178
179 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
180
181 spinlock_t lock; /* queue lock */
182
183 struct rbd_image_header header;
184 char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
185 int obj_len;
186 char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
187 char pool_name[RBD_MAX_POOL_NAME_LEN];
188 int poolid;
189
190 char snap_name[RBD_MAX_SNAP_NAME_LEN];
191 u32 cur_snap; /* index+1 of current snapshot within snap context
192 0 - for the head */
193 int read_only;
194
195 struct list_head node;
196};
197
198static spinlock_t node_lock; /* protects client get/put */
199
200static struct class *class_rbd; /* /sys/class/rbd */
201static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
202static LIST_HEAD(rbd_dev_list); /* devices */
203static LIST_HEAD(rbd_client_list); /* clients */
204
205
206static int rbd_open(struct block_device *bdev, fmode_t mode)
207{
208 struct gendisk *disk = bdev->bd_disk;
209 struct rbd_device *rbd_dev = disk->private_data;
210
211 set_device_ro(bdev, rbd_dev->read_only);
212
213 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
214 return -EROFS;
215
216 return 0;
217}
218
219static const struct block_device_operations rbd_bd_ops = {
220 .owner = THIS_MODULE,
221 .open = rbd_open,
222};
223
224/*
225 * Initialize an rbd client instance.
226 * We own *opt.
227 */
228static struct rbd_client *rbd_client_create(struct ceph_options *opt)
229{
230 struct rbd_client *rbdc;
231 int ret = -ENOMEM;
232
233 dout("rbd_client_create\n");
234 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
235 if (!rbdc)
236 goto out_opt;
237
238 kref_init(&rbdc->kref);
239 INIT_LIST_HEAD(&rbdc->node);
240
241 rbdc->client = ceph_create_client(opt, rbdc);
242 if (IS_ERR(rbdc->client))
243 goto out_rbdc;
244 opt = NULL; /* Now rbdc->client is responsible for opt */
245
246 ret = ceph_open_session(rbdc->client);
247 if (ret < 0)
248 goto out_err;
249
250 spin_lock(&node_lock);
251 list_add_tail(&rbdc->node, &rbd_client_list);
252 spin_unlock(&node_lock);
253
254 dout("rbd_client_create created %p\n", rbdc);
255 return rbdc;
256
257out_err:
258 ceph_destroy_client(rbdc->client);
259out_rbdc:
260 kfree(rbdc);
261out_opt:
262 if (opt)
263 ceph_destroy_options(opt);
264 return ERR_PTR(ret);
265}
266
267/*
268 * Find a ceph client with specific addr and configuration.
269 */
270static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
271{
272 struct rbd_client *client_node;
273
274 if (opt->flags & CEPH_OPT_NOSHARE)
275 return NULL;
276
277 list_for_each_entry(client_node, &rbd_client_list, node)
278 if (ceph_compare_options(opt, client_node->client) == 0)
279 return client_node;
280 return NULL;
281}
282
283/*
284 * Get a ceph client with specific addr and configuration, if one does
285 * not exist create it.
286 */
287static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
288 char *options)
289{
290 struct rbd_client *rbdc;
291 struct ceph_options *opt;
292 int ret;
293
294 ret = ceph_parse_options(&opt, options, mon_addr,
295 mon_addr + strlen(mon_addr), NULL, NULL);
296 if (ret < 0)
297 return ret;
298
299 spin_lock(&node_lock);
300 rbdc = __rbd_client_find(opt);
301 if (rbdc) {
302 ceph_destroy_options(opt);
303
304 /* using an existing client */
305 kref_get(&rbdc->kref);
306 rbd_dev->rbd_client = rbdc;
307 rbd_dev->client = rbdc->client;
308 spin_unlock(&node_lock);
309 return 0;
310 }
311 spin_unlock(&node_lock);
312
313 rbdc = rbd_client_create(opt);
314 if (IS_ERR(rbdc))
315 return PTR_ERR(rbdc);
316
317 rbd_dev->rbd_client = rbdc;
318 rbd_dev->client = rbdc->client;
319 return 0;
320}
321
322/*
323 * Destroy ceph client
324 */
325static void rbd_client_release(struct kref *kref)
326{
327 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
328
329 dout("rbd_release_client %p\n", rbdc);
330 spin_lock(&node_lock);
331 list_del(&rbdc->node);
332 spin_unlock(&node_lock);
333
334 ceph_destroy_client(rbdc->client);
335 kfree(rbdc);
336}
337
338/*
339 * Drop reference to ceph client node. If it's not referenced anymore, release
340 * it.
341 */
342static void rbd_put_client(struct rbd_device *rbd_dev)
343{
344 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
345 rbd_dev->rbd_client = NULL;
346 rbd_dev->client = NULL;
347}
348
349
350/*
351 * Create a new header structure, translate header format from the on-disk
352 * header.
353 */
354static int rbd_header_from_disk(struct rbd_image_header *header,
355 struct rbd_image_header_ondisk *ondisk,
356 int allocated_snaps,
357 gfp_t gfp_flags)
358{
359 int i;
360 u32 snap_count = le32_to_cpu(ondisk->snap_count);
361 int ret = -ENOMEM;
362
363 init_rwsem(&header->snap_rwsem);
364
365 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
366 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
367 snap_count *
368 sizeof(struct rbd_image_snap_ondisk),
369 gfp_flags);
370 if (!header->snapc)
371 return -ENOMEM;
372 if (snap_count) {
373 header->snap_names = kmalloc(header->snap_names_len,
374 GFP_KERNEL);
375 if (!header->snap_names)
376 goto err_snapc;
377 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
378 GFP_KERNEL);
379 if (!header->snap_sizes)
380 goto err_names;
381 } else {
382 header->snap_names = NULL;
383 header->snap_sizes = NULL;
384 }
385 memcpy(header->block_name, ondisk->block_name,
386 sizeof(ondisk->block_name));
387
388 header->image_size = le64_to_cpu(ondisk->image_size);
389 header->obj_order = ondisk->options.order;
390 header->crypt_type = ondisk->options.crypt_type;
391 header->comp_type = ondisk->options.comp_type;
392
393 atomic_set(&header->snapc->nref, 1);
394 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
395 header->snapc->num_snaps = snap_count;
396 header->total_snaps = snap_count;
397
398 if (snap_count &&
399 allocated_snaps == snap_count) {
400 for (i = 0; i < snap_count; i++) {
401 header->snapc->snaps[i] =
402 le64_to_cpu(ondisk->snaps[i].id);
403 header->snap_sizes[i] =
404 le64_to_cpu(ondisk->snaps[i].image_size);
405 }
406
407 /* copy snapshot names */
408 memcpy(header->snap_names, &ondisk->snaps[i],
409 header->snap_names_len);
410 }
411
412 return 0;
413
414err_names:
415 kfree(header->snap_names);
416err_snapc:
417 kfree(header->snapc);
418 return ret;
419}
420
421static int snap_index(struct rbd_image_header *header, int snap_num)
422{
423 return header->total_snaps - snap_num;
424}
425
426static u64 cur_snap_id(struct rbd_device *rbd_dev)
427{
428 struct rbd_image_header *header = &rbd_dev->header;
429
430 if (!rbd_dev->cur_snap)
431 return 0;
432
433 return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
434}
435
436static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
437 u64 *seq, u64 *size)
438{
439 int i;
440 char *p = header->snap_names;
441
442 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
443 if (strcmp(snap_name, p) == 0)
444 break;
445 }
446 if (i == header->total_snaps)
447 return -ENOENT;
448 if (seq)
449 *seq = header->snapc->snaps[i];
450
451 if (size)
452 *size = header->snap_sizes[i];
453
454 return i;
455}
456
457static int rbd_header_set_snap(struct rbd_device *dev,
458 const char *snap_name,
459 u64 *size)
460{
461 struct rbd_image_header *header = &dev->header;
462 struct ceph_snap_context *snapc = header->snapc;
463 int ret = -ENOENT;
464
465 down_write(&header->snap_rwsem);
466
467 if (!snap_name ||
468 !*snap_name ||
469 strcmp(snap_name, "-") == 0 ||
470 strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
471 if (header->total_snaps)
472 snapc->seq = header->snap_seq;
473 else
474 snapc->seq = 0;
475 dev->cur_snap = 0;
476 dev->read_only = 0;
477 if (size)
478 *size = header->image_size;
479 } else {
480 ret = snap_by_name(header, snap_name, &snapc->seq, size);
481 if (ret < 0)
482 goto done;
483
484 dev->cur_snap = header->total_snaps - ret;
485 dev->read_only = 1;
486 }
487
488 ret = 0;
489done:
490 up_write(&header->snap_rwsem);
491 return ret;
492}
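/*
 * Example of the cur_snap convention used above (illustrative numbers):
 * with total_snaps = 3, mapping the snapshot stored at array index 1
 * sets cur_snap = 3 - 1 = 2, and snap_index() later recovers the array
 * index as 3 - 2 = 1; cur_snap = 0 always denotes the writable head.
 */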
493
494static void rbd_header_free(struct rbd_image_header *header)
495{
496 kfree(header->snapc);
497 kfree(header->snap_names);
498 kfree(header->snap_sizes);
499}
500
501/*
502 * get the actual striped segment name, offset and length
503 */
504static u64 rbd_get_segment(struct rbd_image_header *header,
505 const char *block_name,
506 u64 ofs, u64 len,
507 char *seg_name, u64 *segofs)
508{
509 u64 seg = ofs >> header->obj_order;
510
511 if (seg_name)
512 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
513 "%s.%012llx", block_name, seg);
514
515 ofs = ofs & ((1 << header->obj_order) - 1);
516 len = min_t(u64, len, (1 << header->obj_order) - ofs);
517
518 if (segofs)
519 *segofs = ofs;
520
521 return len;
522}
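/*
 * Worked example of the striping arithmetic above (numbers are
 * illustrative): with obj_order = 22 (4 MB objects, the RBD default),
 * a request at ofs = 0x5ffffe00 with len = 0x1000 gives
 * seg = ofs >> 22 = 0x17f, so the segment object is
 * "<block_name>.00000000017f"; the in-object offset is
 * ofs & (4 MB - 1) = 0x3ffe00, and the returned length is capped at
 * 4 MB - 0x3ffe00 = 0x200 bytes, leaving 0xe00 bytes for the next
 * segment.
 */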
523
524/*
525 * bio helpers
526 */
527
528static void bio_chain_put(struct bio *chain)
529{
530 struct bio *tmp;
531
532 while (chain) {
533 tmp = chain;
534 chain = chain->bi_next;
535 bio_put(tmp);
536 }
537}
538
539/*
540 * zeros a bio chain, starting at specific offset
541 */
542static void zero_bio_chain(struct bio *chain, int start_ofs)
543{
544 struct bio_vec *bv;
545 unsigned long flags;
546 void *buf;
547 int i;
548 int pos = 0;
549
550 while (chain) {
551 bio_for_each_segment(bv, chain, i) {
552 if (pos + bv->bv_len > start_ofs) {
553 int remainder = max(start_ofs - pos, 0);
554 buf = bvec_kmap_irq(bv, &flags);
555 memset(buf + remainder, 0,
556 bv->bv_len - remainder);
557 bvec_kunmap_irq(buf, &flags);
558 }
559 pos += bv->bv_len;
560 }
561
562 chain = chain->bi_next;
563 }
564}
565
566/*
567 * bio_chain_clone - clone a chain of bios up to a certain length.
568 * might return a bio_pair that will need to be released.
569 */
570static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
571 struct bio_pair **bp,
572 int len, gfp_t gfpmask)
573{
574 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
575 int total = 0;
576
577 if (*bp) {
578 bio_pair_release(*bp);
579 *bp = NULL;
580 }
581
582 while (old_chain && (total < len)) {
583 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
584 if (!tmp)
585 goto err_out;
586
587 if (total + old_chain->bi_size > len) {
588 struct bio_pair *bp;
589
590 /*
591 * this split can only happen with a single paged bio,
592 * split_bio will BUG_ON if this is not the case
593 */
594 dout("bio_chain_clone split! total=%d remaining=%d"
595 "bi_size=%d\n",
596 (int)total, (int)len-total,
597 (int)old_chain->bi_size);
598
599 /* split the bio. We'll release it either in the next
600 call, or it will have to be released outside */
601 bp = bio_split(old_chain, (len - total) / 512ULL);
602 if (!bp)
603 goto err_out;
604
605 __bio_clone(tmp, &bp->bio1);
606
607 *next = &bp->bio2;
608 } else {
609 __bio_clone(tmp, old_chain);
610 *next = old_chain->bi_next;
611 }
612
613 tmp->bi_bdev = NULL;
614 gfpmask &= ~__GFP_WAIT;
615 tmp->bi_next = NULL;
616
617 if (!new_chain) {
618 new_chain = tail = tmp;
619 } else {
620 tail->bi_next = tmp;
621 tail = tmp;
622 }
623 old_chain = old_chain->bi_next;
624
625 total += tmp->bi_size;
626 }
627
628 BUG_ON(total < len);
629
630 if (tail)
631 tail->bi_next = NULL;
632
633 *old = old_chain;
634
635 return new_chain;
636
637err_out:
638 dout("bio_chain_clone with err\n");
639 bio_chain_put(new_chain);
640 return NULL;
641}
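/*
 * Behaviour sketch for bio_chain_clone() (example figures only): when a
 * 12 KB chain is cloned with len = 8 KB, the returned chain covers the
 * first 8 KB and *next points at the first unconsumed bio; if the 8 KB
 * boundary falls inside a single-page bio, that bio is split, *next
 * points at the second half, and *bp holds the bio_pair that the caller
 * releases later.
 */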
642
643/*
644 * helpers for osd request op vectors.
645 */
646static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
647 int num_ops,
648 int opcode,
649 u32 payload_len)
650{
651 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
652 GFP_NOIO);
653 if (!*ops)
654 return -ENOMEM;
655 (*ops)[0].op = opcode;
656 /*
657 * op extent offset and length will be set later on
658 * in calc_raw_layout()
659 */
660 (*ops)[0].payload_len = payload_len;
661 return 0;
662}
663
664static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
665{
666 kfree(ops);
667}
668
669/*
670 * Send ceph osd request
671 */
672static int rbd_do_request(struct request *rq,
673 struct rbd_device *dev,
674 struct ceph_snap_context *snapc,
675 u64 snapid,
676 const char *obj, u64 ofs, u64 len,
677 struct bio *bio,
678 struct page **pages,
679 int num_pages,
680 int flags,
681 struct ceph_osd_req_op *ops,
682 int num_reply,
683 void (*rbd_cb)(struct ceph_osd_request *req,
684 struct ceph_msg *msg))
685{
686 struct ceph_osd_request *req;
687 struct ceph_file_layout *layout;
688 int ret;
689 u64 bno;
690 struct timespec mtime = CURRENT_TIME;
691 struct rbd_request *req_data;
692 struct ceph_osd_request_head *reqhead;
693 struct rbd_image_header *header = &dev->header;
694
695 ret = -ENOMEM;
696 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
697 if (!req_data)
698 goto done;
699
700 dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs);
701
702 down_read(&header->snap_rwsem);
703
704 req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
705 snapc,
706 ops,
707 false,
708 GFP_NOIO, pages, bio);
709 if (IS_ERR(req)) {
710 up_read(&header->snap_rwsem);
711 ret = PTR_ERR(req);
712 goto done_pages;
713 }
714
715 req->r_callback = rbd_cb;
716
717 req_data->rq = rq;
718 req_data->bio = bio;
719 req_data->pages = pages;
720 req_data->len = len;
721
722 req->r_priv = req_data;
723
724 reqhead = req->r_request->front.iov_base;
725 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
726
727 strncpy(req->r_oid, obj, sizeof(req->r_oid));
728 req->r_oid_len = strlen(req->r_oid);
729
730 layout = &req->r_file_layout;
731 memset(layout, 0, sizeof(*layout));
732 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
733 layout->fl_stripe_count = cpu_to_le32(1);
734 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
735 layout->fl_pg_preferred = cpu_to_le32(-1);
736 layout->fl_pg_pool = cpu_to_le32(dev->poolid);
737 ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
738 ofs, &len, &bno, req, ops);
739
740 ceph_osdc_build_request(req, ofs, &len,
741 ops,
742 snapc,
743 &mtime,
744 req->r_oid, req->r_oid_len);
745 up_read(&header->snap_rwsem);
746
747 ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
748 if (ret < 0)
749 goto done_err;
750
751 if (!rbd_cb) {
752 ret = ceph_osdc_wait_request(&dev->client->osdc, req);
753 ceph_osdc_put_request(req);
754 }
755 return ret;
756
757done_err:
758 bio_chain_put(req_data->bio);
759 ceph_osdc_put_request(req);
760done_pages:
761 kfree(req_data);
762done:
763 if (rq)
764 blk_end_request(rq, ret, len);
765 return ret;
766}
767
768/*
769 * Ceph osd op callback
770 */
771static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
772{
773 struct rbd_request *req_data = req->r_priv;
774 struct ceph_osd_reply_head *replyhead;
775 struct ceph_osd_op *op;
776 __s32 rc;
777 u64 bytes;
778 int read_op;
779
780 /* parse reply */
781 replyhead = msg->front.iov_base;
782 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
783 op = (void *)(replyhead + 1);
784 rc = le32_to_cpu(replyhead->result);
785 bytes = le64_to_cpu(op->extent.length);
786 read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
787
788 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
789
790 if (rc == -ENOENT && read_op) {
791 zero_bio_chain(req_data->bio, 0);
792 rc = 0;
793 } else if (rc == 0 && read_op && bytes < req_data->len) {
794 zero_bio_chain(req_data->bio, bytes);
795 bytes = req_data->len;
796 }
797
798 blk_end_request(req_data->rq, rc, bytes);
799
800 if (req_data->bio)
801 bio_chain_put(req_data->bio);
802
803 ceph_osdc_put_request(req);
804 kfree(req_data);
805}
806
807/*
808 * Do a synchronous ceph osd operation
809 */
810static int rbd_req_sync_op(struct rbd_device *dev,
811 struct ceph_snap_context *snapc,
812 u64 snapid,
813 int opcode,
814 int flags,
815 struct ceph_osd_req_op *orig_ops,
816 int num_reply,
817 const char *obj,
818 u64 ofs, u64 len,
819 char *buf)
820{
821 int ret;
822 struct page **pages;
823 int num_pages;
824 struct ceph_osd_req_op *ops = orig_ops;
825 u32 payload_len;
826
827 num_pages = calc_pages_for(ofs , len);
828 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
829 if (IS_ERR(pages))
830 return PTR_ERR(pages);
831
832 if (!orig_ops) {
833 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
834 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
835 if (ret < 0)
836 goto done;
837
838 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
839 ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
840 if (ret < 0)
841 goto done_ops;
842 }
843 }
844
845 ret = rbd_do_request(NULL, dev, snapc, snapid,
846 obj, ofs, len, NULL,
847 pages, num_pages,
848 flags,
849 ops,
850 2,
851 NULL);
852 if (ret < 0)
853 goto done_ops;
854
855 if ((flags & CEPH_OSD_FLAG_READ) && buf)
856 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
857
858done_ops:
859 if (!orig_ops)
860 rbd_destroy_ops(ops);
861done:
862 ceph_release_page_vector(pages, num_pages);
863 return ret;
864}
865
866/*
867 * Do an asynchronous ceph osd operation
868 */
869static int rbd_do_op(struct request *rq,
870 struct rbd_device *rbd_dev ,
871 struct ceph_snap_context *snapc,
872 u64 snapid,
873 int opcode, int flags, int num_reply,
874 u64 ofs, u64 len,
875 struct bio *bio)
876{
877 char *seg_name;
878 u64 seg_ofs;
879 u64 seg_len;
880 int ret;
881 struct ceph_osd_req_op *ops;
882 u32 payload_len;
883
884 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
885 if (!seg_name)
886 return -ENOMEM;
887
888 seg_len = rbd_get_segment(&rbd_dev->header,
889 rbd_dev->header.block_name,
890 ofs, len,
891 seg_name, &seg_ofs);
892
893 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
894
895 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
896 if (ret < 0)
897 goto done;
898
899 /* we've taken care of segment sizes earlier when we
900 cloned the bios. We should never have a segment
901 truncated at this point */
902 BUG_ON(seg_len < len);
903
904 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
905 seg_name, seg_ofs, seg_len,
906 bio,
907 NULL, 0,
908 flags,
909 ops,
910 num_reply,
911 rbd_req_cb);
912done:
913 kfree(seg_name);
914 return ret;
915}
916
917/*
918 * Request async osd write
919 */
920static int rbd_req_write(struct request *rq,
921 struct rbd_device *rbd_dev,
922 struct ceph_snap_context *snapc,
923 u64 ofs, u64 len,
924 struct bio *bio)
925{
926 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
927 CEPH_OSD_OP_WRITE,
928 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
929 2,
930 ofs, len, bio);
931}
932
933/*
934 * Request async osd read
935 */
936static int rbd_req_read(struct request *rq,
937 struct rbd_device *rbd_dev,
938 u64 snapid,
939 u64 ofs, u64 len,
940 struct bio *bio)
941{
942 return rbd_do_op(rq, rbd_dev, NULL,
943 (snapid ? snapid : CEPH_NOSNAP),
944 CEPH_OSD_OP_READ,
945 CEPH_OSD_FLAG_READ,
946 2,
947 ofs, len, bio);
948}
949
950/*
951 * Request sync osd read
952 */
953static int rbd_req_sync_read(struct rbd_device *dev,
954 struct ceph_snap_context *snapc,
955 u64 snapid,
956 const char *obj,
957 u64 ofs, u64 len,
958 char *buf)
959{
960 return rbd_req_sync_op(dev, NULL,
961 (snapid ? snapid : CEPH_NOSNAP),
962 CEPH_OSD_OP_READ,
963 CEPH_OSD_FLAG_READ,
964 NULL,
965 1, obj, ofs, len, buf);
966}
967
968/*
 969 * Request sync osd rollback of an object to a snapshot
970 */
971static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
972 u64 snapid,
973 const char *obj)
974{
975 struct ceph_osd_req_op *ops;
976 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
977 if (ret < 0)
978 return ret;
979
980 ops[0].snap.snapid = snapid;
981
982 ret = rbd_req_sync_op(dev, NULL,
983 CEPH_NOSNAP,
984 0,
985 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
986 ops,
987 1, obj, 0, 0, NULL);
988
989 rbd_destroy_ops(ops);
990
991 if (ret < 0)
992 return ret;
993
994 return ret;
995}
996
997/*
 998 * Request sync osd exec (invoke a class method on an object)
999 */
1000static int rbd_req_sync_exec(struct rbd_device *dev,
1001 const char *obj,
1002 const char *cls,
1003 const char *method,
1004 const char *data,
1005 int len)
1006{
1007 struct ceph_osd_req_op *ops;
1008 int cls_len = strlen(cls);
1009 int method_len = strlen(method);
1010 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
1011 cls_len + method_len + len);
1012 if (ret < 0)
1013 return ret;
1014
1015 ops[0].cls.class_name = cls;
1016 ops[0].cls.class_len = (__u8)cls_len;
1017 ops[0].cls.method_name = method;
1018 ops[0].cls.method_len = (__u8)method_len;
1019 ops[0].cls.argc = 0;
1020 ops[0].cls.indata = data;
1021 ops[0].cls.indata_len = len;
1022
1023 ret = rbd_req_sync_op(dev, NULL,
1024 CEPH_NOSNAP,
1025 0,
1026 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1027 ops,
1028 1, obj, 0, 0, NULL);
1029
1030 rbd_destroy_ops(ops);
1031
1032 dout("cls_exec returned %d\n", ret);
1033 return ret;
1034}
1035
1036/*
1037 * block device queue callback
1038 */
1039static void rbd_rq_fn(struct request_queue *q)
1040{
1041 struct rbd_device *rbd_dev = q->queuedata;
1042 struct request *rq;
1043 struct bio_pair *bp = NULL;
1044
1045 rq = blk_fetch_request(q);
1046
1047 while (1) {
1048 struct bio *bio;
1049 struct bio *rq_bio, *next_bio = NULL;
1050 bool do_write;
1051 int size, op_size = 0;
1052 u64 ofs;
1053
1054 /* peek at request from block layer */
1055 if (!rq)
1056 break;
1057
1058 dout("fetched request\n");
1059
1060 /* filter out block requests we don't understand */
1061 if ((rq->cmd_type != REQ_TYPE_FS)) {
1062 __blk_end_request_all(rq, 0);
1063 goto next;
1064 }
1065
1066 /* deduce our operation (read, write) */
1067 do_write = (rq_data_dir(rq) == WRITE);
1068
1069 size = blk_rq_bytes(rq);
1070 ofs = blk_rq_pos(rq) * 512ULL;
1071 rq_bio = rq->bio;
1072 if (do_write && rbd_dev->read_only) {
1073 __blk_end_request_all(rq, -EROFS);
1074 goto next;
1075 }
1076
1077 spin_unlock_irq(q->queue_lock);
1078
1079 dout("%s 0x%x bytes at 0x%llx\n",
1080 do_write ? "write" : "read",
1081 size, blk_rq_pos(rq) * 512ULL);
1082
1083 do {
1084 /* a bio clone to be passed down to OSD req */
1085 dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1086 op_size = rbd_get_segment(&rbd_dev->header,
1087 rbd_dev->header.block_name,
1088 ofs, size,
1089 NULL, NULL);
1090 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1091 op_size, GFP_ATOMIC);
1092 if (!bio) {
1093 spin_lock_irq(q->queue_lock);
1094 __blk_end_request_all(rq, -ENOMEM);
1095 goto next;
1096 }
1097
1098 /* init OSD command: write or read */
1099 if (do_write)
1100 rbd_req_write(rq, rbd_dev,
1101 rbd_dev->header.snapc,
1102 ofs,
1103 op_size, bio);
1104 else
1105 rbd_req_read(rq, rbd_dev,
1106 cur_snap_id(rbd_dev),
1107 ofs,
1108 op_size, bio);
1109
1110 size -= op_size;
1111 ofs += op_size;
1112
1113 rq_bio = next_bio;
1114 } while (size > 0);
1115
1116 if (bp)
1117 bio_pair_release(bp);
1118
1119 spin_lock_irq(q->queue_lock);
1120next:
1121 rq = blk_fetch_request(q);
1122 }
1123}
1124
1125/*
1126 * a queue callback. Makes sure that we don't create a bio that spans across
 1127 * multiple osd objects. One exception would be with single-page bios,
1128 * which we handle later at bio_chain_clone
1129 */
1130static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1131 struct bio_vec *bvec)
1132{
1133 struct rbd_device *rbd_dev = q->queuedata;
1134 unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
1135 sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1136 unsigned int bio_sectors = bmd->bi_size >> 9;
1137 int max;
1138
1139 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
1140 + bio_sectors)) << 9;
1141 if (max < 0)
1142 max = 0; /* bio_add cannot handle a negative return */
1143 if (max <= bvec->bv_len && bio_sectors == 0)
1144 return bvec->bv_len;
1145 return max;
1146}
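/*
 * Worked example for the merge calculation above (illustrative numbers,
 * partition offset assumed to be 0): with obj_order = 22 the chunk is
 * 8192 sectors; for a bio starting at sector 8000 that already holds
 * 100 sectors, max = (8192 - (8000 + 100)) << 9 = 92 sectors worth of
 * bytes, so no bvec may push the bio past the 4 MB object boundary.
 */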
1147
1148static void rbd_free_disk(struct rbd_device *rbd_dev)
1149{
1150 struct gendisk *disk = rbd_dev->disk;
1151
1152 if (!disk)
1153 return;
1154
1155 rbd_header_free(&rbd_dev->header);
1156
1157 if (disk->flags & GENHD_FL_UP)
1158 del_gendisk(disk);
1159 if (disk->queue)
1160 blk_cleanup_queue(disk->queue);
1161 put_disk(disk);
1162}
1163
1164/*
 1165 * reload the header from the ondisk metadata object
1166 */
1167static int rbd_read_header(struct rbd_device *rbd_dev,
1168 struct rbd_image_header *header)
1169{
1170 ssize_t rc;
1171 struct rbd_image_header_ondisk *dh;
1172 int snap_count = 0;
1173 u64 snap_names_len = 0;
1174
1175 while (1) {
1176 int len = sizeof(*dh) +
1177 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1178 snap_names_len;
1179
1180 rc = -ENOMEM;
1181 dh = kmalloc(len, GFP_KERNEL);
1182 if (!dh)
1183 return -ENOMEM;
1184
1185 rc = rbd_req_sync_read(rbd_dev,
1186 NULL, CEPH_NOSNAP,
1187 rbd_dev->obj_md_name,
1188 0, len,
1189 (char *)dh);
1190 if (rc < 0)
1191 goto out_dh;
1192
1193 rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
1194 if (rc < 0)
1195 goto out_dh;
1196
1197 if (snap_count != header->total_snaps) {
1198 snap_count = header->total_snaps;
1199 snap_names_len = header->snap_names_len;
1200 rbd_header_free(header);
1201 kfree(dh);
1202 continue;
1203 }
1204 break;
1205 }
1206
1207out_dh:
1208 kfree(dh);
1209 return rc;
1210}
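/*
 * Note on the retry loop above: the first pass reads the header with
 * snap_count == 0; if the on-disk header reports more snapshots than
 * were allocated for, the buffer size is recomputed from the reported
 * counts and the read is repeated until the two agree.
 */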
1211
1212/*
1213 * create a snapshot
1214 */
1215static int rbd_header_add_snap(struct rbd_device *dev,
1216 const char *snap_name,
1217 gfp_t gfp_flags)
1218{
1219 int name_len = strlen(snap_name);
1220 u64 new_snapid;
1221 int ret;
1222 void *data, *data_start, *data_end;
1223
1224 /* we should create a snapshot only if we're pointing at the head */
1225 if (dev->cur_snap)
1226 return -EINVAL;
1227
1228 ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
1229 &new_snapid);
1230 dout("created snapid=%lld\n", new_snapid);
1231 if (ret < 0)
1232 return ret;
1233
1234 data = kmalloc(name_len + 16, gfp_flags);
1235 if (!data)
1236 return -ENOMEM;
1237
1238 data_start = data;
1239 data_end = data + name_len + 16;
1240
1241 ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad);
1242 ceph_encode_64_safe(&data, data_end, new_snapid, bad);
1243
1244 ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
1245 data_start, data - data_start);
1246
1247 kfree(data_start);
1248
1249 if (ret < 0)
1250 return ret;
1251
1252 dev->header.snapc->seq = new_snapid;
1253
1254 return 0;
1255bad:
1256 return -ERANGE;
1257}
1258
1259/*
 1260 * re-read the ondisk header and refresh the in-memory snapshot context
1261 */
1262static int rbd_update_snaps(struct rbd_device *rbd_dev)
1263{
1264 int ret;
1265 struct rbd_image_header h;
1266 u64 snap_seq;
1267
1268 ret = rbd_read_header(rbd_dev, &h);
1269 if (ret < 0)
1270 return ret;
1271
1272 down_write(&rbd_dev->header.snap_rwsem);
1273
1274 snap_seq = rbd_dev->header.snapc->seq;
1275
1276 kfree(rbd_dev->header.snapc);
1277 kfree(rbd_dev->header.snap_names);
1278 kfree(rbd_dev->header.snap_sizes);
1279
1280 rbd_dev->header.total_snaps = h.total_snaps;
1281 rbd_dev->header.snapc = h.snapc;
1282 rbd_dev->header.snap_names = h.snap_names;
1283 rbd_dev->header.snap_sizes = h.snap_sizes;
1284 rbd_dev->header.snapc->seq = snap_seq;
1285
1286 up_write(&rbd_dev->header.snap_rwsem);
1287
1288 return 0;
1289}
1290
1291static int rbd_init_disk(struct rbd_device *rbd_dev)
1292{
1293 struct gendisk *disk;
1294 struct request_queue *q;
1295 int rc;
1296 u64 total_size = 0;
1297
1298 /* contact OSD, request size info about the object being mapped */
1299 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1300 if (rc)
1301 return rc;
1302
1303 rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
1304 if (rc)
1305 return rc;
1306
1307 /* create gendisk info */
1308 rc = -ENOMEM;
1309 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1310 if (!disk)
1311 goto out;
1312
1313 sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id);
1314 disk->major = rbd_dev->major;
1315 disk->first_minor = 0;
1316 disk->fops = &rbd_bd_ops;
1317 disk->private_data = rbd_dev;
1318
1319 /* init rq */
1320 rc = -ENOMEM;
1321 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1322 if (!q)
1323 goto out_disk;
1324 blk_queue_merge_bvec(q, rbd_merge_bvec);
1325 disk->queue = q;
1326
1327 q->queuedata = rbd_dev;
1328
1329 rbd_dev->disk = disk;
1330 rbd_dev->q = q;
1331
1332 /* finally, announce the disk to the world */
1333 set_capacity(disk, total_size / 512ULL);
1334 add_disk(disk);
1335
1336 pr_info("%s: added with size 0x%llx\n",
1337 disk->disk_name, (unsigned long long)total_size);
1338 return 0;
1339
1340out_disk:
1341 put_disk(disk);
1342out:
1343 return rc;
1344}
1345
1346/********************************************************************
1347 * /sys/class/rbd/
 1348 *   add            map rados objects to blkdev
 1349 *   remove         unmap rados objects
 1350 *   list           show mappings
1351 *******************************************************************/
1352
1353static void class_rbd_release(struct class *cls)
1354{
1355 kfree(cls);
1356}
1357
1358static ssize_t class_rbd_list(struct class *c,
1359 struct class_attribute *attr,
1360 char *data)
1361{
1362 int n = 0;
1363 struct list_head *tmp;
1364 int max = PAGE_SIZE;
1365
1366 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1367
1368 n += snprintf(data, max,
1369 "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n");
1370
1371 list_for_each(tmp, &rbd_dev_list) {
1372 struct rbd_device *rbd_dev;
1373
1374 rbd_dev = list_entry(tmp, struct rbd_device, node);
1375 n += snprintf(data+n, max-n,
1376 "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n",
1377 rbd_dev->id,
1378 rbd_dev->major,
1379 ceph_client_id(rbd_dev->client),
1380 rbd_dev->pool_name,
1381 rbd_dev->obj, rbd_dev->snap_name,
1382 rbd_dev->header.image_size >> 10);
1383 if (n == max)
1384 break;
1385 }
1386
1387 mutex_unlock(&ctl_mutex);
1388 return n;
1389}
1390
1391static ssize_t class_rbd_add(struct class *c,
1392 struct class_attribute *attr,
1393 const char *buf, size_t count)
1394{
1395 struct ceph_osd_client *osdc;
1396 struct rbd_device *rbd_dev;
1397 ssize_t rc = -ENOMEM;
1398 int irc, new_id = 0;
1399 struct list_head *tmp;
1400 char *mon_dev_name;
1401 char *options;
1402
1403 if (!try_module_get(THIS_MODULE))
1404 return -ENODEV;
1405
1406 mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
1407 if (!mon_dev_name)
1408 goto err_out_mod;
1409
1410 options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
1411 if (!options)
1412 goto err_mon_dev;
1413
1414 /* new rbd_device object */
1415 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
1416 if (!rbd_dev)
1417 goto err_out_opt;
1418
1419 /* static rbd_device initialization */
1420 spin_lock_init(&rbd_dev->lock);
1421 INIT_LIST_HEAD(&rbd_dev->node);
1422
1423 /* generate unique id: find highest unique id, add one */
1424 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1425
1426 list_for_each(tmp, &rbd_dev_list) {
1427 struct rbd_device *rbd_dev;
1428
1429 rbd_dev = list_entry(tmp, struct rbd_device, node);
1430 if (rbd_dev->id >= new_id)
1431 new_id = rbd_dev->id + 1;
1432 }
1433
1434 rbd_dev->id = new_id;
1435
1436 /* add to global list */
1437 list_add_tail(&rbd_dev->node, &rbd_dev_list);
1438
1439 /* parse add command */
1440 if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
1441 "%" __stringify(RBD_MAX_OPT_LEN) "s "
1442 "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
1443 "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
1444 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
1445 mon_dev_name, options, rbd_dev->pool_name,
1446 rbd_dev->obj, rbd_dev->snap_name) < 4) {
1447 rc = -EINVAL;
1448 goto err_out_slot;
1449 }
1450
1451 if (rbd_dev->snap_name[0] == 0)
1452 rbd_dev->snap_name[0] = '-';
1453
1454 rbd_dev->obj_len = strlen(rbd_dev->obj);
1455 snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
1456 rbd_dev->obj, RBD_SUFFIX);
1457
1458 /* initialize rest of new object */
1459 snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
1460 rc = rbd_get_client(rbd_dev, mon_dev_name, options);
1461 if (rc < 0)
1462 goto err_out_slot;
1463
1464 mutex_unlock(&ctl_mutex);
1465
1466 /* pick the pool */
1467 osdc = &rbd_dev->client->osdc;
1468 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
1469 if (rc < 0)
1470 goto err_out_client;
1471 rbd_dev->poolid = rc;
1472
1473 /* register our block device */
1474 irc = register_blkdev(0, rbd_dev->name);
1475 if (irc < 0) {
1476 rc = irc;
1477 goto err_out_client;
1478 }
1479 rbd_dev->major = irc;
1480
1481 /* set up and announce blkdev mapping */
1482 rc = rbd_init_disk(rbd_dev);
1483 if (rc)
1484 goto err_out_blkdev;
1485
1486 return count;
1487
1488err_out_blkdev:
1489 unregister_blkdev(rbd_dev->major, rbd_dev->name);
1490err_out_client:
1491 rbd_put_client(rbd_dev);
1492 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1493err_out_slot:
1494 list_del_init(&rbd_dev->node);
1495 mutex_unlock(&ctl_mutex);
1496
1497 kfree(rbd_dev);
1498err_out_opt:
1499 kfree(options);
1500err_mon_dev:
1501 kfree(mon_dev_name);
1502err_out_mod:
1503 dout("Error adding device %s\n", buf);
1504 module_put(THIS_MODULE);
1505 return rc;
1506}
1507
1508static struct rbd_device *__rbd_get_dev(unsigned long id)
1509{
1510 struct list_head *tmp;
1511 struct rbd_device *rbd_dev;
1512
1513 list_for_each(tmp, &rbd_dev_list) {
1514 rbd_dev = list_entry(tmp, struct rbd_device, node);
1515 if (rbd_dev->id == id)
1516 return rbd_dev;
1517 }
1518 return NULL;
1519}
1520
1521static ssize_t class_rbd_remove(struct class *c,
1522 struct class_attribute *attr,
1523 const char *buf,
1524 size_t count)
1525{
1526 struct rbd_device *rbd_dev = NULL;
1527 int target_id, rc;
1528 unsigned long ul;
1529
1530 rc = strict_strtoul(buf, 10, &ul);
1531 if (rc)
1532 return rc;
1533
1534 /* convert to int; abort if we lost anything in the conversion */
1535 target_id = (int) ul;
1536 if (target_id != ul)
1537 return -EINVAL;
1538
1539 /* remove object from list immediately */
1540 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1541
1542 rbd_dev = __rbd_get_dev(target_id);
1543 if (rbd_dev)
1544 list_del_init(&rbd_dev->node);
1545
1546 mutex_unlock(&ctl_mutex);
1547
1548 if (!rbd_dev)
1549 return -ENOENT;
1550
1551 rbd_put_client(rbd_dev);
1552
1553 /* clean up and free blkdev */
1554 rbd_free_disk(rbd_dev);
1555 unregister_blkdev(rbd_dev->major, rbd_dev->name);
1556 kfree(rbd_dev);
1557
1558 /* release module ref */
1559 module_put(THIS_MODULE);
1560
1561 return count;
1562}
1563
1564static ssize_t class_rbd_snaps_list(struct class *c,
1565 struct class_attribute *attr,
1566 char *data)
1567{
1568 struct rbd_device *rbd_dev = NULL;
1569 struct list_head *tmp;
1570 struct rbd_image_header *header;
1571 int i, n = 0, max = PAGE_SIZE;
1572 int ret;
1573
1574 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1575
1576 n += snprintf(data, max, "#id\tsnap\tKB\n");
1577
1578 list_for_each(tmp, &rbd_dev_list) {
1579 char *names, *p;
1580 struct ceph_snap_context *snapc;
1581
1582 rbd_dev = list_entry(tmp, struct rbd_device, node);
1583 header = &rbd_dev->header;
1584
1585 down_read(&header->snap_rwsem);
1586
1587 names = header->snap_names;
1588 snapc = header->snapc;
1589
1590 n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
1591 rbd_dev->id, RBD_SNAP_HEAD_NAME,
1592 header->image_size >> 10,
1593 (!rbd_dev->cur_snap ? " (*)" : ""));
1594 if (n == max)
1595 break;
1596
1597 p = names;
1598 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
1599 n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
1600 rbd_dev->id, p, header->snap_sizes[i] >> 10,
1601 (rbd_dev->cur_snap &&
1602 (snap_index(header, i) == rbd_dev->cur_snap) ?
1603 " (*)" : ""));
1604 if (n == max)
1605 break;
1606 }
1607
1608 up_read(&header->snap_rwsem);
1609 }
1610
1611
1612 ret = n;
1613 mutex_unlock(&ctl_mutex);
1614 return ret;
1615}
1616
1617static ssize_t class_rbd_snaps_refresh(struct class *c,
1618 struct class_attribute *attr,
1619 const char *buf,
1620 size_t count)
1621{
1622 struct rbd_device *rbd_dev = NULL;
1623 int target_id, rc;
1624 unsigned long ul;
1625 int ret = count;
1626
1627 rc = strict_strtoul(buf, 10, &ul);
1628 if (rc)
1629 return rc;
1630
1631 /* convert to int; abort if we lost anything in the conversion */
1632 target_id = (int) ul;
1633 if (target_id != ul)
1634 return -EINVAL;
1635
1636 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1637
1638 rbd_dev = __rbd_get_dev(target_id);
1639 if (!rbd_dev) {
1640 ret = -ENOENT;
1641 goto done;
1642 }
1643
1644 rc = rbd_update_snaps(rbd_dev);
1645 if (rc < 0)
1646 ret = rc;
1647
1648done:
1649 mutex_unlock(&ctl_mutex);
1650 return ret;
1651}
1652
1653static ssize_t class_rbd_snap_create(struct class *c,
1654 struct class_attribute *attr,
1655 const char *buf,
1656 size_t count)
1657{
1658 struct rbd_device *rbd_dev = NULL;
1659 int target_id, ret;
1660 char *name;
1661
1662 name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL);
1663 if (!name)
1664 return -ENOMEM;
1665
1666 /* parse snaps add command */
1667 if (sscanf(buf, "%d "
1668 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
1669 &target_id,
1670 name) != 2) {
1671 ret = -EINVAL;
1672 goto done;
1673 }
1674
1675 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1676
1677 rbd_dev = __rbd_get_dev(target_id);
1678 if (!rbd_dev) {
1679 ret = -ENOENT;
1680 goto done_unlock;
1681 }
1682
1683 ret = rbd_header_add_snap(rbd_dev,
1684 name, GFP_KERNEL);
1685 if (ret < 0)
1686 goto done_unlock;
1687
1688 ret = rbd_update_snaps(rbd_dev);
1689 if (ret < 0)
1690 goto done_unlock;
1691
1692 ret = count;
1693done_unlock:
1694 mutex_unlock(&ctl_mutex);
1695done:
1696 kfree(name);
1697 return ret;
1698}
1699
1700static ssize_t class_rbd_rollback(struct class *c,
1701 struct class_attribute *attr,
1702 const char *buf,
1703 size_t count)
1704{
1705 struct rbd_device *rbd_dev = NULL;
1706 int target_id, ret;
1707 u64 snapid;
1708 char snap_name[RBD_MAX_SNAP_NAME_LEN];
1709 u64 cur_ofs;
1710 char *seg_name;
1711
 1712 /* parse snap rollback command */
1713 if (sscanf(buf, "%d "
1714 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
1715 &target_id,
1716 snap_name) != 2) {
1717 return -EINVAL;
1718 }
1719
1720 ret = -ENOMEM;
1721 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1722 if (!seg_name)
1723 return ret;
1724
1725 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1726
1727 rbd_dev = __rbd_get_dev(target_id);
1728 if (!rbd_dev) {
1729 ret = -ENOENT;
1730 goto done_unlock;
1731 }
1732
1733 ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
1734 if (ret < 0)
1735 goto done_unlock;
1736
1737 dout("snapid=%lld\n", snapid);
1738
1739 cur_ofs = 0;
1740 while (cur_ofs < rbd_dev->header.image_size) {
1741 cur_ofs += rbd_get_segment(&rbd_dev->header,
1742 rbd_dev->obj,
1743 cur_ofs, (u64)-1,
1744 seg_name, NULL);
1745 dout("seg_name=%s\n", seg_name);
1746
1747 ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
1748 if (ret < 0)
1749 pr_warning("could not roll back obj %s err=%d\n",
1750 seg_name, ret);
1751 }
1752
1753 ret = rbd_update_snaps(rbd_dev);
1754 if (ret < 0)
1755 goto done_unlock;
1756
1757 ret = count;
1758
1759done_unlock:
1760 mutex_unlock(&ctl_mutex);
1761 kfree(seg_name);
1762
1763 return ret;
1764}
1765
1766static struct class_attribute class_rbd_attrs[] = {
1767 __ATTR(add, 0200, NULL, class_rbd_add),
1768 __ATTR(remove, 0200, NULL, class_rbd_remove),
1769 __ATTR(list, 0444, class_rbd_list, NULL),
1770 __ATTR(snaps_refresh, 0200, NULL, class_rbd_snaps_refresh),
1771 __ATTR(snap_create, 0200, NULL, class_rbd_snap_create),
1772 __ATTR(snaps_list, 0444, class_rbd_snaps_list, NULL),
1773 __ATTR(snap_rollback, 0200, NULL, class_rbd_rollback),
1774 __ATTR_NULL
1775};
1776
1777/*
1778 * create control files in sysfs
1779 * /sys/class/rbd/...
1780 */
1781static int rbd_sysfs_init(void)
1782{
1783 int ret = -ENOMEM;
1784
1785 class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL);
1786 if (!class_rbd)
1787 goto out;
1788
1789 class_rbd->name = DRV_NAME;
1790 class_rbd->owner = THIS_MODULE;
1791 class_rbd->class_release = class_rbd_release;
1792 class_rbd->class_attrs = class_rbd_attrs;
1793
1794 ret = class_register(class_rbd);
1795 if (ret)
1796 goto out_class;
1797 return 0;
1798
1799out_class:
1800 kfree(class_rbd);
1801 class_rbd = NULL;
1802 pr_err(DRV_NAME ": failed to create class rbd\n");
1803out:
1804 return ret;
1805}
1806
1807static void rbd_sysfs_cleanup(void)
1808{
1809 if (class_rbd)
1810 class_destroy(class_rbd);
1811 class_rbd = NULL;
1812}
1813
1814int __init rbd_init(void)
1815{
1816 int rc;
1817
1818 rc = rbd_sysfs_init();
1819 if (rc)
1820 return rc;
1821 spin_lock_init(&node_lock);
1822 pr_info("loaded " DRV_NAME_LONG "\n");
1823 return 0;
1824}
1825
1826void __exit rbd_exit(void)
1827{
1828 rbd_sysfs_cleanup();
1829}
1830
1831module_init(rbd_init);
1832module_exit(rbd_exit);
1833
1834MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1835MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1836MODULE_DESCRIPTION("rados block device");
1837
1838/* following authorship retained from original osdblk.c */
1839MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
1840
1841MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
new file mode 100644
index 000000000000..fc6c678aa2cb
--- /dev/null
+++ b/drivers/block/rbd_types.h
@@ -0,0 +1,73 @@
1/*
2 * Ceph - scalable distributed file system
3 *
4 * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
5 *
6 * This is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License version 2.1, as published by the Free Software
9 * Foundation. See file COPYING.
10 *
11 */
12
13#ifndef CEPH_RBD_TYPES_H
14#define CEPH_RBD_TYPES_H
15
16#include <linux/types.h>
17
18/*
19 * rbd image 'foo' consists of objects
20 * foo.rbd - image metadata
21 * foo.00000000
22 * foo.00000001
23 * ... - data
24 */
25
26#define RBD_SUFFIX ".rbd"
27#define RBD_DIRECTORY "rbd_directory"
28#define RBD_INFO "rbd_info"
29
30#define RBD_DEFAULT_OBJ_ORDER 22 /* 4MB */
31#define RBD_MIN_OBJ_ORDER 16
32#define RBD_MAX_OBJ_ORDER 30
33
34#define RBD_MAX_OBJ_NAME_LEN 96
35#define RBD_MAX_SEG_NAME_LEN 128
36
37#define RBD_COMP_NONE 0
38#define RBD_CRYPT_NONE 0
39
40#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
41#define RBD_HEADER_SIGNATURE "RBD"
42#define RBD_HEADER_VERSION "001.005"
43
44struct rbd_info {
45 __le64 max_id;
46} __attribute__ ((packed));
47
48struct rbd_image_snap_ondisk {
49 __le64 id;
50 __le64 image_size;
51} __attribute__((packed));
52
53struct rbd_image_header_ondisk {
54 char text[40];
55 char block_name[24];
56 char signature[4];
57 char version[8];
58 struct {
59 __u8 order;
60 __u8 crypt_type;
61 __u8 comp_type;
62 __u8 unused;
63 } __attribute__((packed)) options;
64 __le64 image_size;
65 __le64 snap_seq;
66 __le32 snap_count;
67 __le32 reserved;
68 __le64 snap_names_len;
69 struct rbd_image_snap_ondisk snaps[0];
70} __attribute__((packed));
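/*
 * Implied on-disk layout (as consumed by rbd_header_from_disk() in
 * rbd.c): the fixed-size header above is followed immediately by
 * snap_count rbd_image_snap_ondisk entries, which are in turn followed
 * by snap_names_len bytes of NUL-separated snapshot names.
 */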
71
72
73#endif
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1101e251a629..8320490226b7 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -2,7 +2,6 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
-#include <linux/smp_lock.h>
 #include <linux/hdreg.h>
 #include <linux/virtio.h>
 #include <linux/virtio_blk.h>
@@ -222,8 +221,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 	return err;
 }
 
-static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
-			 unsigned cmd, unsigned long data)
+static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long data)
 {
 	struct gendisk *disk = bdev->bd_disk;
 	struct virtio_blk *vblk = disk->private_data;
@@ -238,18 +237,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
238 (void __user *)data); 237 (void __user *)data);
239} 238}
240 239
241static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
242 unsigned int cmd, unsigned long param)
243{
244 int ret;
245
246 lock_kernel();
247 ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
248 unlock_kernel();
249
250 return ret;
251}
252
253/* We provide getgeo only to please some old bootloader/partitioning tools */ 240/* We provide getgeo only to please some old bootloader/partitioning tools */
254static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) 241static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
255{ 242{
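
This hunk is part of the big-kernel-lock pushdown: the lock_kernel()/unlock_kernel() shim around the ioctl handler is deleted and the handler is renamed back to virtblk_ioctl, since the block layer no longer serializes ->ioctl under the BKL. The sketch below shows how such an unlocked handler is typically wired up through block_device_operations; the stub handlers and the initializer are illustrative stand-ins, as the real initializer is not shown in this hunk.

#include <linux/blkdev.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
#include <linux/module.h>

/* Stub handlers standing in for virtblk_ioctl()/virtblk_getgeo() above. */
static int demo_ioctl(struct block_device *bdev, fmode_t mode,
                      unsigned int cmd, unsigned long arg)
{
        return -ENOTTY;         /* called without the BKL */
}

static int demo_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
        geo->heads = 1 << 6;
        geo->sectors = 1 << 5;
        geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
        return 0;
}

/* With the lock_kernel() wrapper gone, .ioctl points at the handler directly. */
static const struct block_device_operations demo_fops = {
        .ioctl  = demo_ioctl,
        .getgeo = demo_getgeo,
        .owner  = THIS_MODULE,
};
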
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 0f69c5ec0ecd..6c1b676643a9 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -48,6 +48,9 @@ struct ports_driver_data {
48 /* Used for exporting per-port information to debugfs */ 48 /* Used for exporting per-port information to debugfs */
49 struct dentry *debugfs_dir; 49 struct dentry *debugfs_dir;
50 50
51 /* List of all the devices we're handling */
52 struct list_head portdevs;
53
51 /* Number of devices this driver is handling */ 54 /* Number of devices this driver is handling */
52 unsigned int index; 55 unsigned int index;
53 56
@@ -108,6 +111,9 @@ struct port_buffer {
108 * ports for that device (vdev->priv). 111 * ports for that device (vdev->priv).
109 */ 112 */
110struct ports_device { 113struct ports_device {
114 /* Next portdev in the list, head is in the pdrvdata struct */
115 struct list_head list;
116
111 /* 117 /*
112 * Workqueue handlers where we process deferred work after 118 * Workqueue handlers where we process deferred work after
113 * notification 119 * notification
@@ -178,15 +184,21 @@ struct port {
178 struct console cons; 184 struct console cons;
179 185
180 /* Each port associates with a separate char device */ 186 /* Each port associates with a separate char device */
181 struct cdev cdev; 187 struct cdev *cdev;
182 struct device *dev; 188 struct device *dev;
183 189
190 /* Reference-counting to handle port hot-unplugs and file operations */
191 struct kref kref;
192
184 /* A waitqueue for poll() or blocking read operations */ 193 /* A waitqueue for poll() or blocking read operations */
185 wait_queue_head_t waitqueue; 194 wait_queue_head_t waitqueue;
186 195
187 /* The 'name' of the port that we expose via sysfs properties */ 196 /* The 'name' of the port that we expose via sysfs properties */
188 char *name; 197 char *name;
189 198
199 /* We can notify apps of host connect / disconnect events via SIGIO */
200 struct fasync_struct *async_queue;
201
190 /* The 'id' to identify the port with the Host */ 202 /* The 'id' to identify the port with the Host */
191 u32 id; 203 u32 id;
192 204
@@ -221,6 +233,41 @@ out:
221 return port; 233 return port;
222} 234}
223 235
236static struct port *find_port_by_devt_in_portdev(struct ports_device *portdev,
237 dev_t dev)
238{
239 struct port *port;
240 unsigned long flags;
241
242 spin_lock_irqsave(&portdev->ports_lock, flags);
243 list_for_each_entry(port, &portdev->ports, list)
244 if (port->cdev->dev == dev)
245 goto out;
246 port = NULL;
247out:
248 spin_unlock_irqrestore(&portdev->ports_lock, flags);
249
250 return port;
251}
252
253static struct port *find_port_by_devt(dev_t dev)
254{
255 struct ports_device *portdev;
256 struct port *port;
257 unsigned long flags;
258
259 spin_lock_irqsave(&pdrvdata_lock, flags);
260 list_for_each_entry(portdev, &pdrvdata.portdevs, list) {
261 port = find_port_by_devt_in_portdev(portdev, dev);
262 if (port)
263 goto out;
264 }
265 port = NULL;
266out:
267 spin_unlock_irqrestore(&pdrvdata_lock, flags);
268 return port;
269}
270
224static struct port *find_port_by_id(struct ports_device *portdev, u32 id) 271static struct port *find_port_by_id(struct ports_device *portdev, u32 id)
225{ 272{
226 struct port *port; 273 struct port *port;
@@ -410,7 +457,10 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
410static ssize_t send_control_msg(struct port *port, unsigned int event, 457static ssize_t send_control_msg(struct port *port, unsigned int event,
411 unsigned int value) 458 unsigned int value)
412{ 459{
413 return __send_control_msg(port->portdev, port->id, event, value); 460 /* Did the port get unplugged before userspace closed it? */
461 if (port->portdev)
462 return __send_control_msg(port->portdev, port->id, event, value);
463 return 0;
414} 464}
415 465
416/* Callers must take the port->outvq_lock */ 466/* Callers must take the port->outvq_lock */
@@ -525,6 +575,10 @@ static ssize_t fill_readbuf(struct port *port, char *out_buf, size_t out_count,
525/* The condition that must be true for polling to end */ 575/* The condition that must be true for polling to end */
526static bool will_read_block(struct port *port) 576static bool will_read_block(struct port *port)
527{ 577{
578 if (!port->guest_connected) {
579 /* Port got hot-unplugged. Let's exit. */
580 return false;
581 }
528 return !port_has_data(port) && port->host_connected; 582 return !port_has_data(port) && port->host_connected;
529} 583}
530 584
@@ -575,6 +629,9 @@ static ssize_t port_fops_read(struct file *filp, char __user *ubuf,
575 if (ret < 0) 629 if (ret < 0)
576 return ret; 630 return ret;
577 } 631 }
632 /* Port got hot-unplugged. */
633 if (!port->guest_connected)
634 return -ENODEV;
578 /* 635 /*
579 * We could've received a disconnection message while we were 636 * We could've received a disconnection message while we were
580 * waiting for more data. 637 * waiting for more data.
@@ -616,6 +673,9 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
616 if (ret < 0) 673 if (ret < 0)
617 return ret; 674 return ret;
618 } 675 }
676 /* Port got hot-unplugged. */
677 if (!port->guest_connected)
678 return -ENODEV;
619 679
620 count = min((size_t)(32 * 1024), count); 680 count = min((size_t)(32 * 1024), count);
621 681
@@ -656,6 +716,10 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
656 port = filp->private_data; 716 port = filp->private_data;
657 poll_wait(filp, &port->waitqueue, wait); 717 poll_wait(filp, &port->waitqueue, wait);
658 718
719 if (!port->guest_connected) {
720 /* Port got unplugged */
721 return POLLHUP;
722 }
659 ret = 0; 723 ret = 0;
660 if (!will_read_block(port)) 724 if (!will_read_block(port))
661 ret |= POLLIN | POLLRDNORM; 725 ret |= POLLIN | POLLRDNORM;
@@ -667,6 +731,8 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
667 return ret; 731 return ret;
668} 732}
669 733
734static void remove_port(struct kref *kref);
735
670static int port_fops_release(struct inode *inode, struct file *filp) 736static int port_fops_release(struct inode *inode, struct file *filp)
671{ 737{
672 struct port *port; 738 struct port *port;
@@ -687,6 +753,16 @@ static int port_fops_release(struct inode *inode, struct file *filp)
687 reclaim_consumed_buffers(port); 753 reclaim_consumed_buffers(port);
688 spin_unlock_irq(&port->outvq_lock); 754 spin_unlock_irq(&port->outvq_lock);
689 755
756 /*
757 * Locks aren't necessary here as a port can't be opened after
758 * unplug, and if a port isn't unplugged, a kref would already
759 * exist for the port. Plus, taking ports_lock here would
760 * create a dependency on other locks taken by functions
761 * inside remove_port if we're the last holder of the port,
762 * creating many problems.
763 */
764 kref_put(&port->kref, remove_port);
765
690 return 0; 766 return 0;
691} 767}
692 768
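
port_fops_release() now drops a kref instead of tearing the port down itself, so whichever of close() and hot-unplug runs last is the one that actually frees the port. A minimal, self-contained sketch of that kref pattern with a stand-in structure; in the driver, remove_port() plays the role of the release callback.

#include <linux/kref.h>
#include <linux/slab.h>

struct demo_port {
        struct kref kref;
        /* ... per-port state ... */
};

static void demo_release(struct kref *kref)
{
        struct demo_port *p = container_of(kref, struct demo_port, kref);

        /* Only reached when the last reference is dropped. */
        kfree(p);
}

static struct demo_port *demo_alloc(void)
{
        struct demo_port *p = kzalloc(sizeof(*p), GFP_KERNEL);

        if (p)
                kref_init(&p->kref);    /* refcount = 1: the "plugged in" reference */
        return p;
}

/* open(): take a reference so unplug can't free the port under us */
static void demo_open(struct demo_port *p)
{
        kref_get(&p->kref);
}

/* close() and unplug each drop one reference; the last drop frees the port */
static void demo_put(struct demo_port *p)
{
        kref_put(&p->kref, demo_release);
}
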
@@ -694,22 +770,31 @@ static int port_fops_open(struct inode *inode, struct file *filp)
694{ 770{
695 struct cdev *cdev = inode->i_cdev; 771 struct cdev *cdev = inode->i_cdev;
696 struct port *port; 772 struct port *port;
773 int ret;
697 774
698 port = container_of(cdev, struct port, cdev); 775 port = find_port_by_devt(cdev->dev);
699 filp->private_data = port; 776 filp->private_data = port;
700 777
778 /* Prevent against a port getting hot-unplugged at the same time */
779 spin_lock_irq(&port->portdev->ports_lock);
780 kref_get(&port->kref);
781 spin_unlock_irq(&port->portdev->ports_lock);
782
701 /* 783 /*
702 * Don't allow opening of console port devices -- that's done 784 * Don't allow opening of console port devices -- that's done
703 * via /dev/hvc 785 * via /dev/hvc
704 */ 786 */
705 if (is_console_port(port)) 787 if (is_console_port(port)) {
706 return -ENXIO; 788 ret = -ENXIO;
789 goto out;
790 }
707 791
708 /* Allow only one process to open a particular port at a time */ 792 /* Allow only one process to open a particular port at a time */
709 spin_lock_irq(&port->inbuf_lock); 793 spin_lock_irq(&port->inbuf_lock);
710 if (port->guest_connected) { 794 if (port->guest_connected) {
711 spin_unlock_irq(&port->inbuf_lock); 795 spin_unlock_irq(&port->inbuf_lock);
712 return -EMFILE; 796 ret = -EMFILE;
797 goto out;
713 } 798 }
714 799
715 port->guest_connected = true; 800 port->guest_connected = true;
@@ -724,10 +809,23 @@ static int port_fops_open(struct inode *inode, struct file *filp)
724 reclaim_consumed_buffers(port); 809 reclaim_consumed_buffers(port);
725 spin_unlock_irq(&port->outvq_lock); 810 spin_unlock_irq(&port->outvq_lock);
726 811
812 nonseekable_open(inode, filp);
813
727 /* Notify host of port being opened */ 814 /* Notify host of port being opened */
728 send_control_msg(filp->private_data, VIRTIO_CONSOLE_PORT_OPEN, 1); 815 send_control_msg(filp->private_data, VIRTIO_CONSOLE_PORT_OPEN, 1);
729 816
730 return 0; 817 return 0;
818out:
819 kref_put(&port->kref, remove_port);
820 return ret;
821}
822
823static int port_fops_fasync(int fd, struct file *filp, int mode)
824{
825 struct port *port;
826
827 port = filp->private_data;
828 return fasync_helper(fd, filp, mode, &port->async_queue);
731} 829}
732 830
733/* 831/*
@@ -743,6 +841,8 @@ static const struct file_operations port_fops = {
743 .write = port_fops_write, 841 .write = port_fops_write,
744 .poll = port_fops_poll, 842 .poll = port_fops_poll,
745 .release = port_fops_release, 843 .release = port_fops_release,
844 .fasync = port_fops_fasync,
845 .llseek = no_llseek,
746}; 846};
747 847
748/* 848/*
@@ -1001,6 +1101,12 @@ static unsigned int fill_queue(struct virtqueue *vq, spinlock_t *lock)
1001 return nr_added_bufs; 1101 return nr_added_bufs;
1002} 1102}
1003 1103
1104static void send_sigio_to_port(struct port *port)
1105{
1106 if (port->async_queue && port->guest_connected)
1107 kill_fasync(&port->async_queue, SIGIO, POLL_OUT);
1108}
1109
1004static int add_port(struct ports_device *portdev, u32 id) 1110static int add_port(struct ports_device *portdev, u32 id)
1005{ 1111{
1006 char debugfs_name[16]; 1112 char debugfs_name[16];
@@ -1015,6 +1121,7 @@ static int add_port(struct ports_device *portdev, u32 id)
1015 err = -ENOMEM; 1121 err = -ENOMEM;
1016 goto fail; 1122 goto fail;
1017 } 1123 }
1124 kref_init(&port->kref);
1018 1125
1019 port->portdev = portdev; 1126 port->portdev = portdev;
1020 port->id = id; 1127 port->id = id;
@@ -1022,6 +1129,7 @@ static int add_port(struct ports_device *portdev, u32 id)
1022 port->name = NULL; 1129 port->name = NULL;
1023 port->inbuf = NULL; 1130 port->inbuf = NULL;
1024 port->cons.hvc = NULL; 1131 port->cons.hvc = NULL;
1132 port->async_queue = NULL;
1025 1133
1026 port->cons.ws.ws_row = port->cons.ws.ws_col = 0; 1134 port->cons.ws.ws_row = port->cons.ws.ws_col = 0;
1027 1135
@@ -1032,14 +1140,20 @@ static int add_port(struct ports_device *portdev, u32 id)
1032 port->in_vq = portdev->in_vqs[port->id]; 1140 port->in_vq = portdev->in_vqs[port->id];
1033 port->out_vq = portdev->out_vqs[port->id]; 1141 port->out_vq = portdev->out_vqs[port->id];
1034 1142
1035 cdev_init(&port->cdev, &port_fops); 1143 port->cdev = cdev_alloc();
1144 if (!port->cdev) {
1145 dev_err(&port->portdev->vdev->dev, "Error allocating cdev\n");
1146 err = -ENOMEM;
1147 goto free_port;
1148 }
1149 port->cdev->ops = &port_fops;
1036 1150
1037 devt = MKDEV(portdev->chr_major, id); 1151 devt = MKDEV(portdev->chr_major, id);
1038 err = cdev_add(&port->cdev, devt, 1); 1152 err = cdev_add(port->cdev, devt, 1);
1039 if (err < 0) { 1153 if (err < 0) {
1040 dev_err(&port->portdev->vdev->dev, 1154 dev_err(&port->portdev->vdev->dev,
1041 "Error %d adding cdev for port %u\n", err, id); 1155 "Error %d adding cdev for port %u\n", err, id);
1042 goto free_port; 1156 goto free_cdev;
1043 } 1157 }
1044 port->dev = device_create(pdrvdata.class, &port->portdev->vdev->dev, 1158 port->dev = device_create(pdrvdata.class, &port->portdev->vdev->dev,
1045 devt, port, "vport%up%u", 1159 devt, port, "vport%up%u",
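
The cdev is switched from an embedded member initialised with cdev_init() to one obtained from cdev_alloc(), giving the char device its own reference-counted lifetime instead of tying it to the port allocation. A short sketch of the cdev_alloc()/cdev_add() pattern with assumed names; the kobject_put() in the error path is one accepted way to dispose of a cdev that was never successfully added.

#include <linux/cdev.h>
#include <linux/fs.h>
#include <linux/kobject.h>

/*
 * Illustrative only: register a dynamically allocated cdev.  Unlike an
 * embedded cdev set up with cdev_init(), a cdev from cdev_alloc() is
 * freed via its own kobject, independent of the containing structure.
 */
static struct cdev *demo_register_cdev(dev_t devt,
                                       const struct file_operations *fops)
{
        struct cdev *cdev = cdev_alloc();
        int err;

        if (!cdev)
                return NULL;
        cdev->ops = fops;
        err = cdev_add(cdev, devt, 1);
        if (err) {
                kobject_put(&cdev->kobj);       /* frees the unadded cdev */
                return NULL;
        }
        return cdev;            /* later torn down with cdev_del() */
}
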
@@ -1104,7 +1218,7 @@ free_inbufs:
1104free_device: 1218free_device:
1105 device_destroy(pdrvdata.class, port->dev->devt); 1219 device_destroy(pdrvdata.class, port->dev->devt);
1106free_cdev: 1220free_cdev:
1107 cdev_del(&port->cdev); 1221 cdev_del(port->cdev);
1108free_port: 1222free_port:
1109 kfree(port); 1223 kfree(port);
1110fail: 1224fail:
@@ -1113,21 +1227,45 @@ fail:
1113 return err; 1227 return err;
1114} 1228}
1115 1229
1116/* Remove all port-specific data. */ 1230/* No users remain, remove all port-specific data. */
1117static int remove_port(struct port *port) 1231static void remove_port(struct kref *kref)
1232{
1233 struct port *port;
1234
1235 port = container_of(kref, struct port, kref);
1236
1237 sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
1238 device_destroy(pdrvdata.class, port->dev->devt);
1239 cdev_del(port->cdev);
1240
1241 kfree(port->name);
1242
1243 debugfs_remove(port->debugfs_file);
1244
1245 kfree(port);
1246}
1247
1248/*
1249 * Port got unplugged. Remove port from portdev's list and drop the
1250 * kref reference. If no userspace has this port opened, it will
1251 * result in the immediate removal of the port.
1252 */
1253static void unplug_port(struct port *port)
1118{ 1254{
1119 struct port_buffer *buf; 1255 struct port_buffer *buf;
1120 1256
1257 spin_lock_irq(&port->portdev->ports_lock);
1258 list_del(&port->list);
1259 spin_unlock_irq(&port->portdev->ports_lock);
1260
1121 if (port->guest_connected) { 1261 if (port->guest_connected) {
1122 port->guest_connected = false; 1262 port->guest_connected = false;
1123 port->host_connected = false; 1263 port->host_connected = false;
1124 wake_up_interruptible(&port->waitqueue); 1264 wake_up_interruptible(&port->waitqueue);
1125 send_control_msg(port, VIRTIO_CONSOLE_PORT_OPEN, 0);
1126 }
1127 1265
1128 spin_lock_irq(&port->portdev->ports_lock); 1266 /* Let the app know the port is going down. */
1129 list_del(&port->list); 1267 send_sigio_to_port(port);
1130 spin_unlock_irq(&port->portdev->ports_lock); 1268 }
1131 1269
1132 if (is_console_port(port)) { 1270 if (is_console_port(port)) {
1133 spin_lock_irq(&pdrvdata_lock); 1271 spin_lock_irq(&pdrvdata_lock);
@@ -1146,9 +1284,6 @@ static int remove_port(struct port *port)
1146 hvc_remove(port->cons.hvc); 1284 hvc_remove(port->cons.hvc);
1147#endif 1285#endif
1148 } 1286 }
1149 sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
1150 device_destroy(pdrvdata.class, port->dev->devt);
1151 cdev_del(&port->cdev);
1152 1287
1153 /* Remove unused data this port might have received. */ 1288 /* Remove unused data this port might have received. */
1154 discard_port_data(port); 1289 discard_port_data(port);
@@ -1159,12 +1294,19 @@ static int remove_port(struct port *port)
1159 while ((buf = virtqueue_detach_unused_buf(port->in_vq))) 1294 while ((buf = virtqueue_detach_unused_buf(port->in_vq)))
1160 free_buf(buf); 1295 free_buf(buf);
1161 1296
1162 kfree(port->name); 1297 /*
1163 1298 * We should just assume the device itself has gone off --
1164 debugfs_remove(port->debugfs_file); 1299 * else a close on an open port later will try to send out a
1300 * control message.
1301 */
1302 port->portdev = NULL;
1165 1303
1166 kfree(port); 1304 /*
1167 return 0; 1305 * Locks around here are not necessary - a port can't be
1306 * opened after we removed the port struct from ports_list
1307 * above.
1308 */
1309 kref_put(&port->kref, remove_port);
1168} 1310}
1169 1311
1170/* Any private messages that the Host and Guest want to share */ 1312/* Any private messages that the Host and Guest want to share */
@@ -1203,7 +1345,7 @@ static void handle_control_message(struct ports_device *portdev,
1203 add_port(portdev, cpkt->id); 1345 add_port(portdev, cpkt->id);
1204 break; 1346 break;
1205 case VIRTIO_CONSOLE_PORT_REMOVE: 1347 case VIRTIO_CONSOLE_PORT_REMOVE:
1206 remove_port(port); 1348 unplug_port(port);
1207 break; 1349 break;
1208 case VIRTIO_CONSOLE_CONSOLE_PORT: 1350 case VIRTIO_CONSOLE_CONSOLE_PORT:
1209 if (!cpkt->value) 1351 if (!cpkt->value)
@@ -1245,6 +1387,12 @@ static void handle_control_message(struct ports_device *portdev,
1245 spin_lock_irq(&port->outvq_lock); 1387 spin_lock_irq(&port->outvq_lock);
1246 reclaim_consumed_buffers(port); 1388 reclaim_consumed_buffers(port);
1247 spin_unlock_irq(&port->outvq_lock); 1389 spin_unlock_irq(&port->outvq_lock);
1390
1391 /*
1392 * If the guest is connected, it'll be interested in
1393 * knowing the host connection state changed.
1394 */
1395 send_sigio_to_port(port);
1248 break; 1396 break;
1249 case VIRTIO_CONSOLE_PORT_NAME: 1397 case VIRTIO_CONSOLE_PORT_NAME:
1250 /* 1398 /*
@@ -1341,6 +1489,9 @@ static void in_intr(struct virtqueue *vq)
1341 1489
1342 wake_up_interruptible(&port->waitqueue); 1490 wake_up_interruptible(&port->waitqueue);
1343 1491
1492 /* Send a SIGIO indicating new data in case the process asked for it */
1493 send_sigio_to_port(port);
1494
1344 if (is_console_port(port) && hvc_poll(port->cons.hvc)) 1495 if (is_console_port(port) && hvc_poll(port->cons.hvc))
1345 hvc_kick(); 1496 hvc_kick();
1346} 1497}
@@ -1577,6 +1728,10 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
1577 add_port(portdev, 0); 1728 add_port(portdev, 0);
1578 } 1729 }
1579 1730
1731 spin_lock_irq(&pdrvdata_lock);
1732 list_add_tail(&portdev->list, &pdrvdata.portdevs);
1733 spin_unlock_irq(&pdrvdata_lock);
1734
1580 __send_control_msg(portdev, VIRTIO_CONSOLE_BAD_ID, 1735 __send_control_msg(portdev, VIRTIO_CONSOLE_BAD_ID,
1581 VIRTIO_CONSOLE_DEVICE_READY, 1); 1736 VIRTIO_CONSOLE_DEVICE_READY, 1);
1582 return 0; 1737 return 0;
@@ -1600,23 +1755,41 @@ static void virtcons_remove(struct virtio_device *vdev)
1600{ 1755{
1601 struct ports_device *portdev; 1756 struct ports_device *portdev;
1602 struct port *port, *port2; 1757 struct port *port, *port2;
1603 struct port_buffer *buf;
1604 unsigned int len;
1605 1758
1606 portdev = vdev->priv; 1759 portdev = vdev->priv;
1607 1760
1761 spin_lock_irq(&pdrvdata_lock);
1762 list_del(&portdev->list);
1763 spin_unlock_irq(&pdrvdata_lock);
1764
1765 /* Disable interrupts for vqs */
1766 vdev->config->reset(vdev);
1767 /* Finish up work that's lined up */
1608 cancel_work_sync(&portdev->control_work); 1768 cancel_work_sync(&portdev->control_work);
1609 1769
1610 list_for_each_entry_safe(port, port2, &portdev->ports, list) 1770 list_for_each_entry_safe(port, port2, &portdev->ports, list)
1611 remove_port(port); 1771 unplug_port(port);
1612 1772
1613 unregister_chrdev(portdev->chr_major, "virtio-portsdev"); 1773 unregister_chrdev(portdev->chr_major, "virtio-portsdev");
1614 1774
1615 while ((buf = virtqueue_get_buf(portdev->c_ivq, &len))) 1775 /*
1616 free_buf(buf); 1776 * When yanking out a device, we immediately lose the
1777 * (device-side) queues. So there's no point in keeping the
1778 * guest side around till we drop our final reference. This
1779 * also means that any ports which are in an open state will
1780 * have to just stop using the port, as the vqs are going
1781 * away.
1782 */
1783 if (use_multiport(portdev)) {
1784 struct port_buffer *buf;
1785 unsigned int len;
1617 1786
1618 while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq))) 1787 while ((buf = virtqueue_get_buf(portdev->c_ivq, &len)))
1619 free_buf(buf); 1788 free_buf(buf);
1789
1790 while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq)))
1791 free_buf(buf);
1792 }
1620 1793
1621 vdev->config->del_vqs(vdev); 1794 vdev->config->del_vqs(vdev);
1622 kfree(portdev->in_vqs); 1795 kfree(portdev->in_vqs);
@@ -1663,6 +1836,7 @@ static int __init init(void)
1663 PTR_ERR(pdrvdata.debugfs_dir)); 1836 PTR_ERR(pdrvdata.debugfs_dir));
1664 } 1837 }
1665 INIT_LIST_HEAD(&pdrvdata.consoles); 1838 INIT_LIST_HEAD(&pdrvdata.consoles);
1839 INIT_LIST_HEAD(&pdrvdata.portdevs);
1666 1840
1667 return register_virtio_driver(&virtio_console); 1841 return register_virtio_driver(&virtio_console);
1668} 1842}
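
Beyond the reference counting, the port gains fasync support: port_fops_fasync() registers with the fasync helper and send_sigio_to_port() raises SIGIO on new data and on host connect/disconnect events. A small userspace sketch of how a client might arm that notification, assuming one of the /dev/vportNpM nodes this driver creates; everything here is plain fcntl()/signal() usage, nothing virtio-specific.

#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_sigio;

static void on_sigio(int sig)
{
        (void)sig;
        got_sigio = 1;          /* new data or a host connect/disconnect event */
}

int main(void)
{
        /* Hypothetical port node; the real name depends on the device/port ids. */
        int fd = open("/dev/vport0p1", O_RDWR);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        signal(SIGIO, on_sigio);
        fcntl(fd, F_SETOWN, getpid());                    /* route SIGIO to us */
        fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC); /* invokes ->fasync */

        pause();                                          /* wait for SIGIO */
        if (got_sigio)
                printf("port signalled us\n");
        close(fd);
        return 0;
}
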
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 0fcd2640c23f..9eb134ea6eb2 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,9 +1,11 @@
1config CEPH_FS 1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)" 2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select CEPH_LIB
4 select LIBCRC32C 5 select LIBCRC32C
5 select CRYPTO_AES 6 select CRYPTO_AES
6 select CRYPTO 7 select CRYPTO
8 default n
7 help 9 help
8 Choose Y or M here to include support for mounting the 10 Choose Y or M here to include support for mounting the
9 experimental Ceph distributed file system. Ceph is an extremely 11 experimental Ceph distributed file system. Ceph is an extremely
@@ -14,15 +16,3 @@ config CEPH_FS
14 16
15 If unsure, say N. 17 If unsure, say N.
16 18
17config CEPH_FS_PRETTYDEBUG
18 bool "Include file:line in ceph debug output"
19 depends on CEPH_FS
20 default n
21 help
22 If you say Y here, debug output will include a filename and
23 line to aid debugging. This increases kernel size and slows
24 execution slightly when debug call sites are enabled (e.g.,
25 via CONFIG_DYNAMIC_DEBUG).
26
27 If unsure, say N.
28
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 278e1172600d..9e6c4f2e8ff1 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
8 8
9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \ 11 mds_client.o mdsmap.o strings.o ceph_frag.o \
12 mds_client.o mdsmap.o \ 12 debugfs.o
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20 13
21else 14else
22#Otherwise we were called directly from the command 15#Otherwise we were called directly from the command
diff --git a/fs/ceph/README b/fs/ceph/README
deleted file mode 100644
index 18352fab37c0..000000000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index efbc604001c8..51bcc5ce3230 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
@@ -10,7 +10,8 @@
10#include <linux/task_io_accounting_ops.h> 10#include <linux/task_io_accounting_ops.h>
11 11
12#include "super.h" 12#include "super.h"
13#include "osd_client.h" 13#include "mds_client.h"
14#include <linux/ceph/osd_client.h>
14 15
15/* 16/*
16 * Ceph address space ops. 17 * Ceph address space ops.
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
193{ 194{
194 struct inode *inode = filp->f_dentry->d_inode; 195 struct inode *inode = filp->f_dentry->d_inode;
195 struct ceph_inode_info *ci = ceph_inode(inode); 196 struct ceph_inode_info *ci = ceph_inode(inode);
196 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 197 struct ceph_osd_client *osdc =
198 &ceph_inode_to_client(inode)->client->osdc;
197 int err = 0; 199 int err = 0;
198 u64 len = PAGE_CACHE_SIZE; 200 u64 len = PAGE_CACHE_SIZE;
199 201
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
265{ 267{
266 struct inode *inode = file->f_dentry->d_inode; 268 struct inode *inode = file->f_dentry->d_inode;
267 struct ceph_inode_info *ci = ceph_inode(inode); 269 struct ceph_inode_info *ci = ceph_inode(inode);
268 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 270 struct ceph_osd_client *osdc =
271 &ceph_inode_to_client(inode)->client->osdc;
269 int rc = 0; 272 int rc = 0;
270 struct page **pages; 273 struct page **pages;
271 loff_t offset; 274 loff_t offset;
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
365{ 368{
366 struct inode *inode; 369 struct inode *inode;
367 struct ceph_inode_info *ci; 370 struct ceph_inode_info *ci;
368 struct ceph_client *client; 371 struct ceph_fs_client *fsc;
369 struct ceph_osd_client *osdc; 372 struct ceph_osd_client *osdc;
370 loff_t page_off = page->index << PAGE_CACHE_SHIFT; 373 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
371 int len = PAGE_CACHE_SIZE; 374 int len = PAGE_CACHE_SIZE;
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
383 } 386 }
384 inode = page->mapping->host; 387 inode = page->mapping->host;
385 ci = ceph_inode(inode); 388 ci = ceph_inode(inode);
386 client = ceph_inode_to_client(inode); 389 fsc = ceph_inode_to_client(inode);
387 osdc = &client->osdc; 390 osdc = &fsc->client->osdc;
388 391
389 /* verify this is a writeable snap context */ 392 /* verify this is a writeable snap context */
390 snapc = (void *)page->private; 393 snapc = (void *)page->private;
@@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
414 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", 417 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
415 inode, page, page->index, page_off, len, snapc); 418 inode, page, page->index, page_off, len, snapc);
416 419
417 writeback_stat = atomic_long_inc_return(&client->writeback_count); 420 writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
418 if (writeback_stat > 421 if (writeback_stat >
419 CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) 422 CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
420 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); 423 set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
421 424
422 set_page_writeback(page); 425 set_page_writeback(page);
423 err = ceph_osdc_writepages(osdc, ceph_vino(inode), 426 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
496 struct address_space *mapping = inode->i_mapping; 499 struct address_space *mapping = inode->i_mapping;
497 __s32 rc = -EIO; 500 __s32 rc = -EIO;
498 u64 bytes = 0; 501 u64 bytes = 0;
499 struct ceph_client *client = ceph_inode_to_client(inode); 502 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
500 long writeback_stat; 503 long writeback_stat;
501 unsigned issued = ceph_caps_issued(ci); 504 unsigned issued = ceph_caps_issued(ci);
502 505
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
529 WARN_ON(!PageUptodate(page)); 532 WARN_ON(!PageUptodate(page));
530 533
531 writeback_stat = 534 writeback_stat =
532 atomic_long_dec_return(&client->writeback_count); 535 atomic_long_dec_return(&fsc->writeback_count);
533 if (writeback_stat < 536 if (writeback_stat <
534 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) 537 CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
535 clear_bdi_congested(&client->backing_dev_info, 538 clear_bdi_congested(&fsc->backing_dev_info,
536 BLK_RW_ASYNC); 539 BLK_RW_ASYNC);
537 540
538 ceph_put_snap_context((void *)page->private); 541 ceph_put_snap_context((void *)page->private);
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
569 * mempool. we avoid the mempool if we can because req->r_num_pages 572 * mempool. we avoid the mempool if we can because req->r_num_pages
570 * may be less than the maximum write size. 573 * may be less than the maximum write size.
571 */ 574 */
572static void alloc_page_vec(struct ceph_client *client, 575static void alloc_page_vec(struct ceph_fs_client *fsc,
573 struct ceph_osd_request *req) 576 struct ceph_osd_request *req)
574{ 577{
575 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, 578 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
576 GFP_NOFS); 579 GFP_NOFS);
577 if (!req->r_pages) { 580 if (!req->r_pages) {
578 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS); 581 req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
579 req->r_pages_from_pool = 1; 582 req->r_pages_from_pool = 1;
580 WARN_ON(!req->r_pages); 583 WARN_ON(!req->r_pages);
581 } 584 }
@@ -590,7 +593,7 @@ static int ceph_writepages_start(struct address_space *mapping,
590 struct inode *inode = mapping->host; 593 struct inode *inode = mapping->host;
591 struct backing_dev_info *bdi = mapping->backing_dev_info; 594 struct backing_dev_info *bdi = mapping->backing_dev_info;
592 struct ceph_inode_info *ci = ceph_inode(inode); 595 struct ceph_inode_info *ci = ceph_inode(inode);
593 struct ceph_client *client; 596 struct ceph_fs_client *fsc;
594 pgoff_t index, start, end; 597 pgoff_t index, start, end;
595 int range_whole = 0; 598 int range_whole = 0;
596 int should_loop = 1; 599 int should_loop = 1;
@@ -617,13 +620,13 @@ static int ceph_writepages_start(struct address_space *mapping,
617 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 620 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
618 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 621 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
619 622
620 client = ceph_inode_to_client(inode); 623 fsc = ceph_inode_to_client(inode);
621 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) { 624 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
622 pr_warning("writepage_start %p on forced umount\n", inode); 625 pr_warning("writepage_start %p on forced umount\n", inode);
623 return -EIO; /* we're in a forced umount, don't write! */ 626 return -EIO; /* we're in a forced umount, don't write! */
624 } 627 }
625 if (client->mount_args->wsize && client->mount_args->wsize < wsize) 628 if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
626 wsize = client->mount_args->wsize; 629 wsize = fsc->mount_options->wsize;
627 if (wsize < PAGE_CACHE_SIZE) 630 if (wsize < PAGE_CACHE_SIZE)
628 wsize = PAGE_CACHE_SIZE; 631 wsize = PAGE_CACHE_SIZE;
629 max_pages_ever = wsize >> PAGE_CACHE_SHIFT; 632 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
@@ -769,7 +772,7 @@ get_more_pages:
769 offset = (unsigned long long)page->index 772 offset = (unsigned long long)page->index
770 << PAGE_CACHE_SHIFT; 773 << PAGE_CACHE_SHIFT;
771 len = wsize; 774 len = wsize;
772 req = ceph_osdc_new_request(&client->osdc, 775 req = ceph_osdc_new_request(&fsc->client->osdc,
773 &ci->i_layout, 776 &ci->i_layout,
774 ceph_vino(inode), 777 ceph_vino(inode),
775 offset, &len, 778 offset, &len,
@@ -782,7 +785,7 @@ get_more_pages:
782 &inode->i_mtime, true, 1); 785 &inode->i_mtime, true, 1);
783 max_pages = req->r_num_pages; 786 max_pages = req->r_num_pages;
784 787
785 alloc_page_vec(client, req); 788 alloc_page_vec(fsc, req);
786 req->r_callback = writepages_finish; 789 req->r_callback = writepages_finish;
787 req->r_inode = inode; 790 req->r_inode = inode;
788 } 791 }
@@ -794,10 +797,10 @@ get_more_pages:
794 inode, page, page->index); 797 inode, page, page->index);
795 798
796 writeback_stat = 799 writeback_stat =
797 atomic_long_inc_return(&client->writeback_count); 800 atomic_long_inc_return(&fsc->writeback_count);
798 if (writeback_stat > CONGESTION_ON_THRESH( 801 if (writeback_stat > CONGESTION_ON_THRESH(
799 client->mount_args->congestion_kb)) { 802 fsc->mount_options->congestion_kb)) {
800 set_bdi_congested(&client->backing_dev_info, 803 set_bdi_congested(&fsc->backing_dev_info,
801 BLK_RW_ASYNC); 804 BLK_RW_ASYNC);
802 } 805 }
803 806
@@ -846,7 +849,7 @@ get_more_pages:
846 op->payload_len = cpu_to_le32(len); 849 op->payload_len = cpu_to_le32(len);
847 req->r_request->hdr.data_len = cpu_to_le32(len); 850 req->r_request->hdr.data_len = cpu_to_le32(len);
848 851
849 ceph_osdc_start_request(&client->osdc, req, true); 852 ceph_osdc_start_request(&fsc->client->osdc, req, true);
850 req = NULL; 853 req = NULL;
851 854
852 /* continue? */ 855 /* continue? */
@@ -915,7 +918,7 @@ static int ceph_update_writeable_page(struct file *file,
915{ 918{
916 struct inode *inode = file->f_dentry->d_inode; 919 struct inode *inode = file->f_dentry->d_inode;
917 struct ceph_inode_info *ci = ceph_inode(inode); 920 struct ceph_inode_info *ci = ceph_inode(inode);
918 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 921 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
919 loff_t page_off = pos & PAGE_CACHE_MASK; 922 loff_t page_off = pos & PAGE_CACHE_MASK;
920 int pos_in_page = pos & ~PAGE_CACHE_MASK; 923 int pos_in_page = pos & ~PAGE_CACHE_MASK;
921 int end_in_page = pos_in_page + len; 924 int end_in_page = pos_in_page + len;
@@ -1053,8 +1056,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1053 struct page *page, void *fsdata) 1056 struct page *page, void *fsdata)
1054{ 1057{
1055 struct inode *inode = file->f_dentry->d_inode; 1058 struct inode *inode = file->f_dentry->d_inode;
1056 struct ceph_client *client = ceph_inode_to_client(inode); 1059 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1057 struct ceph_mds_client *mdsc = &client->mdsc; 1060 struct ceph_mds_client *mdsc = fsc->mdsc;
1058 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1061 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1059 int check_cap = 0; 1062 int check_cap = 0;
1060 1063
@@ -1123,7 +1126,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1123{ 1126{
1124 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1127 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1125 struct page *page = vmf->page; 1128 struct page *page = vmf->page;
1126 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 1129 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1127 loff_t off = page->index << PAGE_CACHE_SHIFT; 1130 loff_t off = page->index << PAGE_CACHE_SHIFT;
1128 loff_t size, len; 1131 loff_t size, len;
1129 int ret; 1132 int ret;
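
The addr.c hunks all follow the same substitution: the old monolithic ceph_client becomes a ceph_fs_client that wraps the shared libceph client, so OSD state is reached via fsc->client->osdc and the MDS client via the fsc->mdsc pointer. A condensed sketch of that access pattern as it appears after this patch; the function itself is illustrative.

#include "super.h"
#include "mds_client.h"
#include <linux/ceph/osd_client.h>

/*
 * After the libceph split: per-filesystem state lives in ceph_fs_client,
 * the shared RADOS machinery in fsc->client (a struct ceph_client).
 */
static void demo_handles(struct inode *inode)
{
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_osd_client *osdc = &fsc->client->osdc; /* shared libceph */
        struct ceph_mds_client *mdsc = fsc->mdsc;          /* fs-only, now a pointer */

        (void)osdc;
        (void)mdsc;
}
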
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 5e9da996a151..98ab13e2b71d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
@@ -9,8 +9,9 @@
9#include <linux/writeback.h> 9#include <linux/writeback.h>
10 10
11#include "super.h" 11#include "super.h"
12#include "decode.h" 12#include "mds_client.h"
13#include "messenger.h" 13#include <linux/ceph/decode.h>
14#include <linux/ceph/messenger.h>
14 15
15/* 16/*
16 * Capability management 17 * Capability management
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
287 spin_unlock(&mdsc->caps_list_lock); 288 spin_unlock(&mdsc->caps_list_lock);
288} 289}
289 290
290void ceph_reservation_status(struct ceph_client *client, 291void ceph_reservation_status(struct ceph_fs_client *fsc,
291 int *total, int *avail, int *used, int *reserved, 292 int *total, int *avail, int *used, int *reserved,
292 int *min) 293 int *min)
293{ 294{
294 struct ceph_mds_client *mdsc = &client->mdsc; 295 struct ceph_mds_client *mdsc = fsc->mdsc;
295 296
296 if (total) 297 if (total)
297 *total = mdsc->caps_total_count; 298 *total = mdsc->caps_total_count;
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
399static void __cap_set_timeouts(struct ceph_mds_client *mdsc, 400static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
400 struct ceph_inode_info *ci) 401 struct ceph_inode_info *ci)
401{ 402{
402 struct ceph_mount_args *ma = mdsc->client->mount_args; 403 struct ceph_mount_options *ma = mdsc->fsc->mount_options;
403 404
404 ci->i_hold_caps_min = round_jiffies(jiffies + 405 ci->i_hold_caps_min = round_jiffies(jiffies +
405 ma->caps_wanted_delay_min * HZ); 406 ma->caps_wanted_delay_min * HZ);
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
515 unsigned seq, unsigned mseq, u64 realmino, int flags, 516 unsigned seq, unsigned mseq, u64 realmino, int flags,
516 struct ceph_cap_reservation *caps_reservation) 517 struct ceph_cap_reservation *caps_reservation)
517{ 518{
518 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 519 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
519 struct ceph_inode_info *ci = ceph_inode(inode); 520 struct ceph_inode_info *ci = ceph_inode(inode);
520 struct ceph_cap *new_cap = NULL; 521 struct ceph_cap *new_cap = NULL;
521 struct ceph_cap *cap; 522 struct ceph_cap *cap;
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
873 struct ceph_mds_session *session = cap->session; 874 struct ceph_mds_session *session = cap->session;
874 struct ceph_inode_info *ci = cap->ci; 875 struct ceph_inode_info *ci = cap->ci;
875 struct ceph_mds_client *mdsc = 876 struct ceph_mds_client *mdsc =
876 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 877 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
877 int removed = 0; 878 int removed = 0;
878 879
879 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 880 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
1210 int mds; 1211 int mds;
1211 struct ceph_cap_snap *capsnap; 1212 struct ceph_cap_snap *capsnap;
1212 u32 mseq; 1213 u32 mseq;
1213 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 1214 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1214 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold 1215 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1215 session->s_mutex */ 1216 session->s_mutex */
1216 u64 next_follows = 0; /* keep track of how far we've gotten through the 1217 u64 next_follows = 0; /* keep track of how far we've gotten through the
@@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1336void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) 1337void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1337{ 1338{
1338 struct ceph_mds_client *mdsc = 1339 struct ceph_mds_client *mdsc =
1339 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 1340 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1340 struct inode *inode = &ci->vfs_inode; 1341 struct inode *inode = &ci->vfs_inode;
1341 int was = ci->i_dirty_caps; 1342 int was = ci->i_dirty_caps;
1342 int dirty = 0; 1343 int dirty = 0;
@@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1378static int __mark_caps_flushing(struct inode *inode, 1379static int __mark_caps_flushing(struct inode *inode,
1379 struct ceph_mds_session *session) 1380 struct ceph_mds_session *session)
1380{ 1381{
1381 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 1382 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1382 struct ceph_inode_info *ci = ceph_inode(inode); 1383 struct ceph_inode_info *ci = ceph_inode(inode);
1383 int flushing; 1384 int flushing;
1384 1385
@@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
1416/* 1417/*
1417 * try to invalidate mapping pages without blocking. 1418 * try to invalidate mapping pages without blocking.
1418 */ 1419 */
1419static int mapping_is_empty(struct address_space *mapping)
1420{
1421 struct page *page = find_get_page(mapping, 0);
1422
1423 if (!page)
1424 return 1;
1425
1426 put_page(page);
1427 return 0;
1428}
1429
1430static int try_nonblocking_invalidate(struct inode *inode) 1420static int try_nonblocking_invalidate(struct inode *inode)
1431{ 1421{
1432 struct ceph_inode_info *ci = ceph_inode(inode); 1422 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1436,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
1436 invalidate_mapping_pages(&inode->i_data, 0, -1); 1426 invalidate_mapping_pages(&inode->i_data, 0, -1);
1437 spin_lock(&inode->i_lock); 1427 spin_lock(&inode->i_lock);
1438 1428
1439 if (mapping_is_empty(&inode->i_data) && 1429 if (inode->i_data.nrpages == 0 &&
1440 invalidating_gen == ci->i_rdcache_gen) { 1430 invalidating_gen == ci->i_rdcache_gen) {
1441 /* success. */ 1431 /* success. */
1442 dout("try_nonblocking_invalidate %p success\n", inode); 1432 dout("try_nonblocking_invalidate %p success\n", inode);
@@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
1462void ceph_check_caps(struct ceph_inode_info *ci, int flags, 1452void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1463 struct ceph_mds_session *session) 1453 struct ceph_mds_session *session)
1464{ 1454{
1465 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); 1455 struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
1466 struct ceph_mds_client *mdsc = &client->mdsc; 1456 struct ceph_mds_client *mdsc = fsc->mdsc;
1467 struct inode *inode = &ci->vfs_inode; 1457 struct inode *inode = &ci->vfs_inode;
1468 struct ceph_cap *cap; 1458 struct ceph_cap *cap;
1469 int file_wanted, used; 1459 int file_wanted, used;
@@ -1533,7 +1523,7 @@ retry_locked:
1533 */ 1523 */
1534 if ((!is_delayed || mdsc->stopping) && 1524 if ((!is_delayed || mdsc->stopping) &&
1535 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ 1525 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1536 ci->i_rdcache_gen && /* may have cached pages */ 1526 inode->i_data.nrpages && /* have cached pages */
1537 (file_wanted == 0 || /* no open files */ 1527 (file_wanted == 0 || /* no open files */
1538 (revoking & (CEPH_CAP_FILE_CACHE| 1528 (revoking & (CEPH_CAP_FILE_CACHE|
1539 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ 1529 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
@@ -1706,7 +1696,7 @@ ack:
1706static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1696static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1707 unsigned *flush_tid) 1697 unsigned *flush_tid)
1708{ 1698{
1709 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 1699 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1710 struct ceph_inode_info *ci = ceph_inode(inode); 1700 struct ceph_inode_info *ci = ceph_inode(inode);
1711 int unlock_session = session ? 0 : 1; 1701 int unlock_session = session ? 0 : 1;
1712 int flushing = 0; 1702 int flushing = 0;
@@ -1872,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1872 caps_are_flushed(inode, flush_tid)); 1862 caps_are_flushed(inode, flush_tid));
1873 } else { 1863 } else {
1874 struct ceph_mds_client *mdsc = 1864 struct ceph_mds_client *mdsc =
1875 &ceph_sb_to_client(inode->i_sb)->mdsc; 1865 ceph_sb_to_client(inode->i_sb)->mdsc;
1876 1866
1877 spin_lock(&inode->i_lock); 1867 spin_lock(&inode->i_lock);
1878 if (__ceph_caps_dirty(ci)) 1868 if (__ceph_caps_dirty(ci))
@@ -2465,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2465 __releases(inode->i_lock) 2455 __releases(inode->i_lock)
2466{ 2456{
2467 struct ceph_inode_info *ci = ceph_inode(inode); 2457 struct ceph_inode_info *ci = ceph_inode(inode);
2468 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 2458 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
2469 unsigned seq = le32_to_cpu(m->seq); 2459 unsigned seq = le32_to_cpu(m->seq);
2470 int dirty = le32_to_cpu(m->dirty); 2460 int dirty = le32_to_cpu(m->dirty);
2471 int cleaned = 0; 2461 int cleaned = 0;
@@ -2713,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2713 struct ceph_msg *msg) 2703 struct ceph_msg *msg)
2714{ 2704{
2715 struct ceph_mds_client *mdsc = session->s_mdsc; 2705 struct ceph_mds_client *mdsc = session->s_mdsc;
2716 struct super_block *sb = mdsc->client->sb; 2706 struct super_block *sb = mdsc->fsc->sb;
2717 struct inode *inode; 2707 struct inode *inode;
2718 struct ceph_cap *cap; 2708 struct ceph_cap *cap;
2719 struct ceph_mds_caps *h; 2709 struct ceph_mds_caps *h;
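
Besides the fs-client rename, this file replaces the mapping_is_empty() helper (probe for page 0 via find_get_page()) with a direct mapping->nrpages check, which is cheaper and also correct when cached pages exist at non-zero indices. A simplified, self-contained sketch of the resulting non-blocking invalidate pattern; the generation counter stands in for ci->i_rdcache_gen and the locking is reduced to the essentials.

#include <linux/fs.h>
#include <linux/pagemap.h>

/*
 * Illustrative only: try to drop clean cached pages without blocking and
 * report whether the mapping really ended up empty.  Called with
 * inode->i_lock held, mirroring the ceph caller; the generation counter
 * (bumped whenever new pages are cached) detects readers that repopulate
 * the cache while the lock was dropped.
 */
static int demo_try_nonblocking_invalidate(struct inode *inode, u32 *gen)
{
        u32 invalidating_gen = *gen;

        spin_unlock(&inode->i_lock);
        invalidate_mapping_pages(&inode->i_data, 0, -1);
        spin_lock(&inode->i_lock);

        if (inode->i_data.nrpages == 0 && invalidating_gen == *gen)
                return 0;       /* success: cache is empty and stayed empty */
        return -1;              /* raced with new pages; caller must retry later */
}
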
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index ab6cf35c4091..bdce8b1fbd06 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
1/* 1/*
2 * Ceph 'frag' type 2 * Ceph 'frag' type
3 */ 3 */
4#include "types.h" 4#include <linux/module.h>
5#include <linux/ceph/types.h>
5 6
6int ceph_frag_compare(__u32 a, __u32 b) 7int ceph_frag_compare(__u32 a, __u32 b)
7{ 8{
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6fd8b20a8611..7ae1b3d55b58 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/device.h> 3#include <linux/device.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
@@ -7,143 +7,49 @@
7#include <linux/debugfs.h> 7#include <linux/debugfs.h>
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9 9
10#include <linux/ceph/libceph.h>
11#include <linux/ceph/mon_client.h>
12#include <linux/ceph/auth.h>
13#include <linux/ceph/debugfs.h>
14
10#include "super.h" 15#include "super.h"
11#include "mds_client.h"
12#include "mon_client.h"
13#include "auth.h"
14 16
15#ifdef CONFIG_DEBUG_FS 17#ifdef CONFIG_DEBUG_FS
16 18
17/* 19#include "mds_client.h"
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../mdsmap - current mdsmap
23 * .../monmap - current monmap
24 * .../osdc - active osd requests
25 * .../mdsc - active mds requests
26 * .../monc - mon client state
27 * .../dentry_lru - dump contents of dentry lru
28 * .../caps - expose cap (reservation) stats
29 * .../bdi - symlink to ../../bdi/something
30 */
31
32static struct dentry *ceph_debugfs_dir;
33
34static int monmap_show(struct seq_file *s, void *p)
35{
36 int i;
37 struct ceph_client *client = s->private;
38
39 if (client->monc.monmap == NULL)
40 return 0;
41
42 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
43 for (i = 0; i < client->monc.monmap->num_mon; i++) {
44 struct ceph_entity_inst *inst =
45 &client->monc.monmap->mon_inst[i];
46
47 seq_printf(s, "\t%s%lld\t%s\n",
48 ENTITY_NAME(inst->name),
49 pr_addr(&inst->addr.in_addr));
50 }
51 return 0;
52}
53 20
54static int mdsmap_show(struct seq_file *s, void *p) 21static int mdsmap_show(struct seq_file *s, void *p)
55{ 22{
56 int i; 23 int i;
57 struct ceph_client *client = s->private; 24 struct ceph_fs_client *fsc = s->private;
58 25
59 if (client->mdsc.mdsmap == NULL) 26 if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
60 return 0; 27 return 0;
61 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch); 28 seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
62 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root); 29 seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
63 seq_printf(s, "session_timeout %d\n", 30 seq_printf(s, "session_timeout %d\n",
64 client->mdsc.mdsmap->m_session_timeout); 31 fsc->mdsc->mdsmap->m_session_timeout);
65 seq_printf(s, "session_autoclose %d\n", 32 seq_printf(s, "session_autoclose %d\n",
66 client->mdsc.mdsmap->m_session_autoclose); 33 fsc->mdsc->mdsmap->m_session_autoclose);
67 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) { 34 for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
68 struct ceph_entity_addr *addr = 35 struct ceph_entity_addr *addr =
69 &client->mdsc.mdsmap->m_info[i].addr; 36 &fsc->mdsc->mdsmap->m_info[i].addr;
70 int state = client->mdsc.mdsmap->m_info[i].state; 37 int state = fsc->mdsc->mdsmap->m_info[i].state;
71 38
72 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr), 39 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
40 ceph_pr_addr(&addr->in_addr),
73 ceph_mds_state_name(state)); 41 ceph_mds_state_name(state));
74 } 42 }
75 return 0; 43 return 0;
76} 44}
77 45
78static int osdmap_show(struct seq_file *s, void *p) 46/*
79{ 47 * mdsc debugfs
80 int i; 48 */
81 struct ceph_client *client = s->private;
82 struct rb_node *n;
83
84 if (client->osdc.osdmap == NULL)
85 return 0;
86 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
87 seq_printf(s, "flags%s%s\n",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
89 " NEARFULL" : "",
90 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
91 " FULL" : "");
92 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
93 struct ceph_pg_pool_info *pool =
94 rb_entry(n, struct ceph_pg_pool_info, node);
95 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
96 pool->id, pool->v.pg_num, pool->pg_num_mask,
97 pool->v.lpg_num, pool->lpg_num_mask);
98 }
99 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
100 struct ceph_entity_addr *addr =
101 &client->osdc.osdmap->osd_addr[i];
102 int state = client->osdc.osdmap->osd_state[i];
103 char sb[64];
104
105 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
106 i, pr_addr(&addr->in_addr),
107 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
108 ceph_osdmap_state_str(sb, sizeof(sb), state));
109 }
110 return 0;
111}
112
113static int monc_show(struct seq_file *s, void *p)
114{
115 struct ceph_client *client = s->private;
116 struct ceph_mon_generic_request *req;
117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp;
119
120 mutex_lock(&monc->mutex);
121
122 if (monc->have_mdsmap)
123 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
124 if (monc->have_osdmap)
125 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n");
128
129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
130 __u16 op;
131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
137 }
138
139 mutex_unlock(&monc->mutex);
140 return 0;
141}
142
143static int mdsc_show(struct seq_file *s, void *p) 49static int mdsc_show(struct seq_file *s, void *p)
144{ 50{
145 struct ceph_client *client = s->private; 51 struct ceph_fs_client *fsc = s->private;
146 struct ceph_mds_client *mdsc = &client->mdsc; 52 struct ceph_mds_client *mdsc = fsc->mdsc;
147 struct ceph_mds_request *req; 53 struct ceph_mds_request *req;
148 struct rb_node *rp; 54 struct rb_node *rp;
149 int pathlen; 55 int pathlen;
@@ -214,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p)
214 return 0; 120 return 0;
215} 121}
216 122
217static int osdc_show(struct seq_file *s, void *pp)
218{
219 struct ceph_client *client = s->private;
220 struct ceph_osd_client *osdc = &client->osdc;
221 struct rb_node *p;
222
223 mutex_lock(&osdc->request_mutex);
224 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
225 struct ceph_osd_request *req;
226 struct ceph_osd_request_head *head;
227 struct ceph_osd_op *op;
228 int num_ops;
229 int opcode, olen;
230 int i;
231
232 req = rb_entry(p, struct ceph_osd_request, r_node);
233
234 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
235 req->r_osd ? req->r_osd->o_osd : -1,
236 le32_to_cpu(req->r_pgid.pool),
237 le16_to_cpu(req->r_pgid.ps));
238
239 head = req->r_request->front.iov_base;
240 op = (void *)(head + 1);
241
242 num_ops = le16_to_cpu(head->num_ops);
243 olen = le32_to_cpu(head->object_len);
244 seq_printf(s, "%.*s", olen,
245 (const char *)(head->ops + num_ops));
246
247 if (req->r_reassert_version.epoch)
248 seq_printf(s, "\t%u'%llu",
249 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
250 le64_to_cpu(req->r_reassert_version.version));
251 else
252 seq_printf(s, "\t");
253
254 for (i = 0; i < num_ops; i++) {
255 opcode = le16_to_cpu(op->op);
256 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
257 op++;
258 }
259
260 seq_printf(s, "\n");
261 }
262 mutex_unlock(&osdc->request_mutex);
263 return 0;
264}
265
266static int caps_show(struct seq_file *s, void *p) 123static int caps_show(struct seq_file *s, void *p)
267{ 124{
268 struct ceph_client *client = s->private; 125 struct ceph_fs_client *fsc = s->private;
269 int total, avail, used, reserved, min; 126 int total, avail, used, reserved, min;
270 127
271 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); 128 ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
272 seq_printf(s, "total\t\t%d\n" 129 seq_printf(s, "total\t\t%d\n"
273 "avail\t\t%d\n" 130 "avail\t\t%d\n"
274 "used\t\t%d\n" 131 "used\t\t%d\n"
@@ -280,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p)
280 137
281static int dentry_lru_show(struct seq_file *s, void *ptr) 138static int dentry_lru_show(struct seq_file *s, void *ptr)
282{ 139{
283 struct ceph_client *client = s->private; 140 struct ceph_fs_client *fsc = s->private;
284 struct ceph_mds_client *mdsc = &client->mdsc; 141 struct ceph_mds_client *mdsc = fsc->mdsc;
285 struct ceph_dentry_info *di; 142 struct ceph_dentry_info *di;
286 143
287 spin_lock(&mdsc->dentry_lru_lock); 144 spin_lock(&mdsc->dentry_lru_lock);
@@ -295,199 +152,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
295 return 0; 152 return 0;
296} 153}
297 154
298#define DEFINE_SHOW_FUNC(name) \ 155CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
299static int name##_open(struct inode *inode, struct file *file) \ 156CEPH_DEFINE_SHOW_FUNC(mdsc_show)
300{ \ 157CEPH_DEFINE_SHOW_FUNC(caps_show)
301 struct seq_file *sf; \ 158CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
302 int ret; \ 159
303 \
304 ret = single_open(file, name, NULL); \
305 sf = file->private_data; \
306 sf->private = inode->i_private; \
307 return ret; \
308} \
309 \
310static const struct file_operations name##_fops = { \
311 .open = name##_open, \
312 .read = seq_read, \
313 .llseek = seq_lseek, \
314 .release = single_release, \
315};
316
317DEFINE_SHOW_FUNC(monmap_show)
318DEFINE_SHOW_FUNC(mdsmap_show)
319DEFINE_SHOW_FUNC(osdmap_show)
320DEFINE_SHOW_FUNC(monc_show)
321DEFINE_SHOW_FUNC(mdsc_show)
322DEFINE_SHOW_FUNC(osdc_show)
323DEFINE_SHOW_FUNC(dentry_lru_show)
324DEFINE_SHOW_FUNC(caps_show)
325 160
161/*
162 * debugfs
163 */
326static int congestion_kb_set(void *data, u64 val) 164static int congestion_kb_set(void *data, u64 val)
327{ 165{
328 struct ceph_client *client = (struct ceph_client *)data; 166 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
329
330 if (client)
331 client->mount_args->congestion_kb = (int)val;
332 167
168 fsc->mount_options->congestion_kb = (int)val;
333 return 0; 169 return 0;
334} 170}
335 171
336static int congestion_kb_get(void *data, u64 *val) 172static int congestion_kb_get(void *data, u64 *val)
337{ 173{
338 struct ceph_client *client = (struct ceph_client *)data; 174 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
339
340 if (client)
341 *val = (u64)client->mount_args->congestion_kb;
342 175
176 *val = (u64)fsc->mount_options->congestion_kb;
343 return 0; 177 return 0;
344} 178}
345 179
346
347DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, 180DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
348 congestion_kb_set, "%llu\n"); 181 congestion_kb_set, "%llu\n");
349 182
350int __init ceph_debugfs_init(void)
351{
352 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
353 if (!ceph_debugfs_dir)
354 return -ENOMEM;
355 return 0;
356}
357 183
358void ceph_debugfs_cleanup(void) 184void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
359{ 185{
360 debugfs_remove(ceph_debugfs_dir); 186 dout("ceph_fs_debugfs_cleanup\n");
187 debugfs_remove(fsc->debugfs_bdi);
188 debugfs_remove(fsc->debugfs_congestion_kb);
189 debugfs_remove(fsc->debugfs_mdsmap);
190 debugfs_remove(fsc->debugfs_caps);
191 debugfs_remove(fsc->debugfs_mdsc);
192 debugfs_remove(fsc->debugfs_dentry_lru);
361} 193}
362 194
363int ceph_debugfs_client_init(struct ceph_client *client) 195int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
364{ 196{
365 int ret = 0; 197 char name[100];
366 char name[80]; 198 int err = -ENOMEM;
367
368 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
369 client->monc.auth->global_id);
370 199
371 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); 200 dout("ceph_fs_debugfs_init\n");
372 if (!client->debugfs_dir) 201 fsc->debugfs_congestion_kb =
373 goto out; 202 debugfs_create_file("writeback_congestion_kb",
374 203 0600,
375 client->monc.debugfs_file = debugfs_create_file("monc", 204 fsc->client->debugfs_dir,
376 0600, 205 fsc,
377 client->debugfs_dir, 206 &congestion_kb_fops);
378 client, 207 if (!fsc->debugfs_congestion_kb)
379 &monc_show_fops);
380 if (!client->monc.debugfs_file)
381 goto out; 208 goto out;
382 209
383 client->mdsc.debugfs_file = debugfs_create_file("mdsc", 210 dout("a\n");
384 0600,
385 client->debugfs_dir,
386 client,
387 &mdsc_show_fops);
388 if (!client->mdsc.debugfs_file)
389 goto out;
390 211
391 client->osdc.debugfs_file = debugfs_create_file("osdc", 212 snprintf(name, sizeof(name), "../../bdi/%s",
392 0600, 213 dev_name(fsc->backing_dev_info.dev));
393 client->debugfs_dir, 214 fsc->debugfs_bdi =
394 client, 215 debugfs_create_symlink("bdi",
395 &osdc_show_fops); 216 fsc->client->debugfs_dir,
396 if (!client->osdc.debugfs_file) 217 name);
218 if (!fsc->debugfs_bdi)
397 goto out; 219 goto out;
398 220
399 client->debugfs_monmap = debugfs_create_file("monmap", 221 dout("b\n");
222 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
400 0600, 223 0600,
401 client->debugfs_dir, 224 fsc->client->debugfs_dir,
402 client, 225 fsc,
403 &monmap_show_fops);
404 if (!client->debugfs_monmap)
405 goto out;
406
407 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
408 0600,
409 client->debugfs_dir,
410 client,
411 &mdsmap_show_fops); 226 &mdsmap_show_fops);
412 if (!client->debugfs_mdsmap) 227 if (!fsc->debugfs_mdsmap)
413 goto out;
414
415 client->debugfs_osdmap = debugfs_create_file("osdmap",
416 0600,
417 client->debugfs_dir,
418 client,
419 &osdmap_show_fops);
420 if (!client->debugfs_osdmap)
421 goto out; 228 goto out;
422 229
423 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru", 230 dout("ca\n");
424 0600, 231 fsc->debugfs_mdsc = debugfs_create_file("mdsc",
425 client->debugfs_dir, 232 0600,
426 client, 233 fsc->client->debugfs_dir,
427 &dentry_lru_show_fops); 234 fsc,
428 if (!client->debugfs_dentry_lru) 235 &mdsc_show_fops);
236 if (!fsc->debugfs_mdsc)
429 goto out; 237 goto out;
430 238
431 client->debugfs_caps = debugfs_create_file("caps", 239 dout("da\n");
240 fsc->debugfs_caps = debugfs_create_file("caps",
432 0400, 241 0400,
433 client->debugfs_dir, 242 fsc->client->debugfs_dir,
434 client, 243 fsc,
435 &caps_show_fops); 244 &caps_show_fops);
436 if (!client->debugfs_caps) 245 if (!fsc->debugfs_caps)
437 goto out; 246 goto out;
438 247
439 client->debugfs_congestion_kb = 248 dout("ea\n");
440 debugfs_create_file("writeback_congestion_kb", 249 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
441 0600, 250 0600,
442 client->debugfs_dir, 251 fsc->client->debugfs_dir,
443 client, 252 fsc,
444 &congestion_kb_fops); 253 &dentry_lru_show_fops);
445 if (!client->debugfs_congestion_kb) 254 if (!fsc->debugfs_dentry_lru)
446 goto out; 255 goto out;
447 256
448 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
449 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
450 name);
451
452 return 0; 257 return 0;
453 258
454out: 259out:
455 ceph_debugfs_client_cleanup(client); 260 ceph_fs_debugfs_cleanup(fsc);
456 return ret; 261 return err;
457} 262}
458 263
459void ceph_debugfs_client_cleanup(struct ceph_client *client)
460{
461 debugfs_remove(client->debugfs_bdi);
462 debugfs_remove(client->debugfs_caps);
463 debugfs_remove(client->debugfs_dentry_lru);
464 debugfs_remove(client->debugfs_osdmap);
465 debugfs_remove(client->debugfs_mdsmap);
466 debugfs_remove(client->debugfs_monmap);
467 debugfs_remove(client->osdc.debugfs_file);
468 debugfs_remove(client->mdsc.debugfs_file);
469 debugfs_remove(client->monc.debugfs_file);
470 debugfs_remove(client->debugfs_congestion_kb);
471 debugfs_remove(client->debugfs_dir);
472}
473 264
474#else /* CONFIG_DEBUG_FS */ 265#else /* CONFIG_DEBUG_FS */
475 266
476int __init ceph_debugfs_init(void) 267int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
477{
478 return 0;
479}
480
481void ceph_debugfs_cleanup(void)
482{
483}
484
485int ceph_debugfs_client_init(struct ceph_client *client)
486{ 268{
487 return 0; 269 return 0;
488} 270}
489 271
490void ceph_debugfs_client_cleanup(struct ceph_client *client) 272void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
491{ 273{
492} 274}
493 275
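
The show functions above are now wired up through CEPH_DEFINE_SHOW_FUNC from the shared <linux/ceph/debugfs.h> header rather than the local DEFINE_SHOW_FUNC macro this hunk deletes. As a minimal sketch, assuming the shared macro simply mirrors the one being removed (building a name##_open() wrapper and a matching name##_fops), it would expand along these lines:

    /* Sketch only: assumes the shared macro mirrors the deleted local one. */
    #define CEPH_DEFINE_SHOW_FUNC(name) \
    static int name##_open(struct inode *inode, struct file *file) \
    { \
            struct seq_file *sf; \
            int ret; \
     \
            ret = single_open(file, name, NULL); \
            sf = file->private_data; \
            sf->private = inode->i_private; \
            return ret; \
    } \
     \
    static const struct file_operations name##_fops = { \
            .open           = name##_open, \
            .read           = seq_read, \
            .llseek         = seq_lseek, \
            .release        = single_release, \
    };

debugfs_create_file() stashes the ceph_fs_client pointer in i_private, and single_open() hands it back to each show function through seq_file->private, which is how dentry_lru_show() above recovers the fsc from s->private.
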
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a1986eb52045..e0a2dc6fcafc 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/spinlock.h> 3#include <linux/spinlock.h>
4#include <linux/fs_struct.h> 4#include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8 8
9#include "super.h" 9#include "super.h"
10#include "mds_client.h"
10 11
11/* 12/*
12 * Directory operations: readdir, lookup, create, link, unlink, 13 * Directory operations: readdir, lookup, create, link, unlink,
@@ -94,10 +95,7 @@ static unsigned fpos_off(loff_t p)
94 */ 95 */
95static int __dcache_readdir(struct file *filp, 96static int __dcache_readdir(struct file *filp,
96 void *dirent, filldir_t filldir) 97 void *dirent, filldir_t filldir)
97 __releases(inode->i_lock)
98 __acquires(inode->i_lock)
99{ 98{
100 struct inode *inode = filp->f_dentry->d_inode;
101 struct ceph_file_info *fi = filp->private_data; 99 struct ceph_file_info *fi = filp->private_data;
102 struct dentry *parent = filp->f_dentry; 100 struct dentry *parent = filp->f_dentry;
103 struct inode *dir = parent->d_inode; 101 struct inode *dir = parent->d_inode;
@@ -153,7 +151,6 @@ more:
153 151
154 atomic_inc(&dentry->d_count); 152 atomic_inc(&dentry->d_count);
155 spin_unlock(&dcache_lock); 153 spin_unlock(&dcache_lock);
156 spin_unlock(&inode->i_lock);
157 154
158 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 155 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
159 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 156 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -171,35 +168,30 @@ more:
171 } else { 168 } else {
172 dput(last); 169 dput(last);
173 } 170 }
174 last = NULL;
175 } 171 }
176
177 spin_lock(&inode->i_lock);
178 spin_lock(&dcache_lock);
179
180 last = dentry; 172 last = dentry;
181 173
182 if (err < 0) 174 if (err < 0)
183 goto out_unlock; 175 goto out;
184 176
185 p = p->prev;
186 filp->f_pos++; 177 filp->f_pos++;
187 178
188 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ 179 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
189 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE)) 180 if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
190 goto more; 181 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
191 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); 182 err = -EAGAIN;
192 err = -EAGAIN; 183 goto out;
184 }
185
186 spin_lock(&dcache_lock);
187 p = p->prev; /* advance to next dentry */
188 goto more;
193 189
194out_unlock: 190out_unlock:
195 spin_unlock(&dcache_lock); 191 spin_unlock(&dcache_lock);
196 192out:
197 if (last) { 193 if (last)
198 spin_unlock(&inode->i_lock);
199 dput(last); 194 dput(last);
200 spin_lock(&inode->i_lock);
201 }
202
203 return err; 195 return err;
204} 196}
205 197
@@ -227,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
227 struct ceph_file_info *fi = filp->private_data; 219 struct ceph_file_info *fi = filp->private_data;
228 struct inode *inode = filp->f_dentry->d_inode; 220 struct inode *inode = filp->f_dentry->d_inode;
229 struct ceph_inode_info *ci = ceph_inode(inode); 221 struct ceph_inode_info *ci = ceph_inode(inode);
230 struct ceph_client *client = ceph_inode_to_client(inode); 222 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
231 struct ceph_mds_client *mdsc = &client->mdsc; 223 struct ceph_mds_client *mdsc = fsc->mdsc;
232 unsigned frag = fpos_frag(filp->f_pos); 224 unsigned frag = fpos_frag(filp->f_pos);
233 int off = fpos_off(filp->f_pos); 225 int off = fpos_off(filp->f_pos);
234 int err; 226 int err;
235 u32 ftype; 227 u32 ftype;
236 struct ceph_mds_reply_info_parsed *rinfo; 228 struct ceph_mds_reply_info_parsed *rinfo;
237 const int max_entries = client->mount_args->max_readdir; 229 const int max_entries = fsc->mount_options->max_readdir;
238 const int max_bytes = client->mount_args->max_readdir_bytes; 230 const int max_bytes = fsc->mount_options->max_readdir_bytes;
239 231
240 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 232 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
241 if (fi->at_end) 233 if (fi->at_end)
@@ -267,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
267 /* can we use the dcache? */ 259 /* can we use the dcache? */
268 spin_lock(&inode->i_lock); 260 spin_lock(&inode->i_lock);
269 if ((filp->f_pos == 2 || fi->dentry) && 261 if ((filp->f_pos == 2 || fi->dentry) &&
270 !ceph_test_opt(client, NOASYNCREADDIR) && 262 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
271 ceph_snap(inode) != CEPH_SNAPDIR && 263 ceph_snap(inode) != CEPH_SNAPDIR &&
272 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 264 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
273 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 spin_unlock(&inode->i_lock);
274 err = __dcache_readdir(filp, dirent, filldir); 267 err = __dcache_readdir(filp, dirent, filldir);
275 if (err != -EAGAIN) { 268 if (err != -EAGAIN)
276 spin_unlock(&inode->i_lock);
277 return err; 269 return err;
278 } 270 } else {
271 spin_unlock(&inode->i_lock);
279 } 272 }
280 spin_unlock(&inode->i_lock);
281 if (fi->dentry) { 273 if (fi->dentry) {
282 err = note_last_dentry(fi, fi->dentry->d_name.name, 274 err = note_last_dentry(fi, fi->dentry->d_name.name,
283 fi->dentry->d_name.len); 275 fi->dentry->d_name.len);
@@ -487,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
487struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 479struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
488 struct dentry *dentry, int err) 480 struct dentry *dentry, int err)
489{ 481{
490 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 482 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
491 struct inode *parent = dentry->d_parent->d_inode; 483 struct inode *parent = dentry->d_parent->d_inode;
492 484
493 /* .snap dir? */ 485 /* .snap dir? */
494 if (err == -ENOENT && 486 if (err == -ENOENT &&
495 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
496 strcmp(dentry->d_name.name, 487 strcmp(dentry->d_name.name,
497 client->mount_args->snapdir_name) == 0) { 488 fsc->mount_options->snapdir_name) == 0) {
498 struct inode *inode = ceph_get_snapdir(parent); 489 struct inode *inode = ceph_get_snapdir(parent);
499 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", 490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
500 dentry, dentry->d_name.len, dentry->d_name.name, inode); 491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -539,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
539static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, 530static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
540 struct nameidata *nd) 531 struct nameidata *nd)
541{ 532{
542 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 533 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
543 struct ceph_mds_client *mdsc = &client->mdsc; 534 struct ceph_mds_client *mdsc = fsc->mdsc;
544 struct ceph_mds_request *req; 535 struct ceph_mds_request *req;
545 int op; 536 int op;
546 int err; 537 int err;
@@ -572,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
572 spin_lock(&dir->i_lock); 563 spin_lock(&dir->i_lock);
573 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); 564 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
574 if (strncmp(dentry->d_name.name, 565 if (strncmp(dentry->d_name.name,
575 client->mount_args->snapdir_name, 566 fsc->mount_options->snapdir_name,
576 dentry->d_name.len) && 567 dentry->d_name.len) &&
577 !is_root_ceph_dentry(dir, dentry) && 568 !is_root_ceph_dentry(dir, dentry) &&
578 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 569 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -629,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
629static int ceph_mknod(struct inode *dir, struct dentry *dentry, 620static int ceph_mknod(struct inode *dir, struct dentry *dentry,
630 int mode, dev_t rdev) 621 int mode, dev_t rdev)
631{ 622{
632 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 623 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
633 struct ceph_mds_client *mdsc = &client->mdsc; 624 struct ceph_mds_client *mdsc = fsc->mdsc;
634 struct ceph_mds_request *req; 625 struct ceph_mds_request *req;
635 int err; 626 int err;
636 627
@@ -685,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
685static int ceph_symlink(struct inode *dir, struct dentry *dentry, 676static int ceph_symlink(struct inode *dir, struct dentry *dentry,
686 const char *dest) 677 const char *dest)
687{ 678{
688 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 679 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
689 struct ceph_mds_client *mdsc = &client->mdsc; 680 struct ceph_mds_client *mdsc = fsc->mdsc;
690 struct ceph_mds_request *req; 681 struct ceph_mds_request *req;
691 int err; 682 int err;
692 683
@@ -716,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
716 707
717static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) 708static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
718{ 709{
719 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 710 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
720 struct ceph_mds_client *mdsc = &client->mdsc; 711 struct ceph_mds_client *mdsc = fsc->mdsc;
721 struct ceph_mds_request *req; 712 struct ceph_mds_request *req;
722 int err = -EROFS; 713 int err = -EROFS;
723 int op; 714 int op;
@@ -758,8 +749,8 @@ out:
758static int ceph_link(struct dentry *old_dentry, struct inode *dir, 749static int ceph_link(struct dentry *old_dentry, struct inode *dir,
759 struct dentry *dentry) 750 struct dentry *dentry)
760{ 751{
761 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 752 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
762 struct ceph_mds_client *mdsc = &client->mdsc; 753 struct ceph_mds_client *mdsc = fsc->mdsc;
763 struct ceph_mds_request *req; 754 struct ceph_mds_request *req;
764 int err; 755 int err;
765 756
@@ -813,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode)
813 */ 804 */
814static int ceph_unlink(struct inode *dir, struct dentry *dentry) 805static int ceph_unlink(struct inode *dir, struct dentry *dentry)
815{ 806{
816 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 807 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
817 struct ceph_mds_client *mdsc = &client->mdsc; 808 struct ceph_mds_client *mdsc = fsc->mdsc;
818 struct inode *inode = dentry->d_inode; 809 struct inode *inode = dentry->d_inode;
819 struct ceph_mds_request *req; 810 struct ceph_mds_request *req;
820 int err = -EROFS; 811 int err = -EROFS;
@@ -854,8 +845,8 @@ out:
854static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, 845static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
855 struct inode *new_dir, struct dentry *new_dentry) 846 struct inode *new_dir, struct dentry *new_dentry)
856{ 847{
857 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb); 848 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
858 struct ceph_mds_client *mdsc = &client->mdsc; 849 struct ceph_mds_client *mdsc = fsc->mdsc;
859 struct ceph_mds_request *req; 850 struct ceph_mds_request *req;
860 int err; 851 int err;
861 852
@@ -1076,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1076 struct ceph_inode_info *ci = ceph_inode(inode); 1067 struct ceph_inode_info *ci = ceph_inode(inode);
1077 int left; 1068 int left;
1078 1069
1079 if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) 1070 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1080 return -EISDIR; 1071 return -EISDIR;
1081 1072
1082 if (!cf->dir_info) { 1073 if (!cf->dir_info) {
@@ -1177,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
1177 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1168 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1178 dn->d_name.len, dn->d_name.name); 1169 dn->d_name.len, dn->d_name.name);
1179 if (di) { 1170 if (di) {
1180 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1171 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1181 spin_lock(&mdsc->dentry_lru_lock); 1172 spin_lock(&mdsc->dentry_lru_lock);
1182 list_add_tail(&di->lru, &mdsc->dentry_lru); 1173 list_add_tail(&di->lru, &mdsc->dentry_lru);
1183 mdsc->num_dentry++; 1174 mdsc->num_dentry++;
@@ -1193,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
1193 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, 1184 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1194 dn->d_name.len, dn->d_name.name, di->offset); 1185 dn->d_name.len, dn->d_name.name, di->offset);
1195 if (di) { 1186 if (di) {
1196 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1187 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1197 spin_lock(&mdsc->dentry_lru_lock); 1188 spin_lock(&mdsc->dentry_lru_lock);
1198 list_move_tail(&di->lru, &mdsc->dentry_lru); 1189 list_move_tail(&di->lru, &mdsc->dentry_lru);
1199 spin_unlock(&mdsc->dentry_lru_lock); 1190 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1208,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
1208 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1199 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1209 dn->d_name.len, dn->d_name.name); 1200 dn->d_name.len, dn->d_name.name);
1210 if (di) { 1201 if (di) {
1211 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1202 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1212 spin_lock(&mdsc->dentry_lru_lock); 1203 spin_lock(&mdsc->dentry_lru_lock);
1213 list_del_init(&di->lru); 1204 list_del_init(&di->lru);
1214 mdsc->num_dentry--; 1205 mdsc->num_dentry--;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e38423e82f2e..2297d9426992 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/exportfs.h> 3#include <linux/exportfs.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <asm/unaligned.h> 5#include <asm/unaligned.h>
6 6
7#include "super.h" 7#include "super.h"
8#include "mds_client.h"
8 9
9/* 10/*
10 * NFS export support 11 * NFS export support
@@ -120,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
120static struct dentry *__cfh_to_dentry(struct super_block *sb, 121static struct dentry *__cfh_to_dentry(struct super_block *sb,
121 struct ceph_nfs_confh *cfh) 122 struct ceph_nfs_confh *cfh)
122{ 123{
123 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; 124 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
124 struct inode *inode; 125 struct inode *inode;
125 struct dentry *dentry; 126 struct dentry *dentry;
126 struct ceph_vino vino; 127 struct ceph_vino vino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 66e4da6dba22..e77c28cf3690 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h>
3#include <linux/sched.h> 4#include <linux/sched.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/file.h> 6#include <linux/file.h>
@@ -38,8 +39,8 @@
38static struct ceph_mds_request * 39static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode) 40prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{ 41{
41 struct ceph_client *client = ceph_sb_to_client(sb); 42 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc; 43 struct ceph_mds_client *mdsc = fsc->mdsc;
43 struct ceph_mds_request *req; 44 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS; 45 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; 46 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
117int ceph_open(struct inode *inode, struct file *file) 118int ceph_open(struct inode *inode, struct file *file)
118{ 119{
119 struct ceph_inode_info *ci = ceph_inode(inode); 120 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb); 121 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc; 122 struct ceph_mds_client *mdsc = fsc->mdsc;
122 struct ceph_mds_request *req; 123 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data; 124 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode; 125 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode, 217 struct nameidata *nd, int mode,
217 int locked_dir) 218 int locked_dir)
218{ 219{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 220 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc; 221 struct ceph_mds_client *mdsc = fsc->mdsc;
221 struct file *file = nd->intent.open.file; 222 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); 223 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req; 224 struct ceph_mds_request *req;
@@ -270,163 +271,6 @@ int ceph_release(struct inode *inode, struct file *file)
270} 271}
271 272
272/* 273/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
298
299static void put_page_vector(struct page **pages, int num_pages)
300{
301 int i;
302
303 for (i = 0; i < num_pages; i++)
304 put_page(pages[i]);
305 kfree(pages);
306}
307
308void ceph_release_page_vector(struct page **pages, int num_pages)
309{
310 int i;
311
312 for (i = 0; i < num_pages; i++)
313 __free_pages(pages[i], 0);
314 kfree(pages);
315}
316
317/*
318 * allocate a vector of new pages
319 */
320static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = __page_cache_alloc(flags);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
365
366/*
367 * copy user data from a page vector into a user pointer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
427
428
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over 274 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.) 275 * objects we stripe over. (That's not atomic, but good enough for now.)
432 * 276 *
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode,
438 struct page **pages, int num_pages, 282 struct page **pages, int num_pages,
439 int *checkeof) 283 int *checkeof)
440{ 284{
441 struct ceph_client *client = ceph_inode_to_client(inode); 285 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode); 286 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len; 287 u64 pos, this_len;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ 288 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
@@ -459,7 +303,7 @@ static int striped_read(struct inode *inode,
459 303
460more: 304more:
461 this_len = left; 305 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), 306 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len, 307 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq, 308 ci->i_truncate_seq,
465 ci->i_truncate_size, 309 ci->i_truncate_size,
@@ -477,8 +321,8 @@ more:
477 321
478 if (read < pos - off) { 322 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos); 323 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read, 324 ceph_zero_page_vector_range(page_off + read,
481 pos - off - read, pages); 325 pos - off - read, pages);
482 } 326 }
483 pos += ret; 327 pos += ret;
484 read = pos - off; 328 read = pos - off;
@@ -495,8 +339,8 @@ more:
495 /* was original extent fully inside i_size? */ 339 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) { 340 if (pos + left <= inode->i_size) {
497 dout("zero tail\n"); 341 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read, 342 ceph_zero_page_vector_range(page_off + read, len - read,
499 pages); 343 pages);
500 read = len; 344 read = len;
501 goto out; 345 goto out;
502 } 346 }
@@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 375 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532 376
533 if (file->f_flags & O_DIRECT) { 377 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len); 378 pages = ceph_get_direct_page_vector(data, num_pages, off, len);
535 379
536 /* 380 /*
537 * flush any page cache pages in this range. this 381 * flush any page cache pages in this range. this
@@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof); 396 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
553 397
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 398 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret); 399 ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0) 400 if (ret >= 0)
557 *poff = off + ret; 401 *poff = off + ret;
558 402
559done: 403done:
560 if (file->f_flags & O_DIRECT) 404 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages); 405 ceph_put_page_vector(pages, num_pages);
562 else 406 else
563 ceph_release_page_vector(pages, num_pages); 407 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret); 408 dout("sync_read result %d\n", ret);
@@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
594{ 438{
595 struct inode *inode = file->f_dentry->d_inode; 439 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode); 440 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode); 441 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req; 442 struct ceph_osd_request *req;
599 struct page **pages; 443 struct page **pages;
600 int num_pages; 444 int num_pages;
@@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
642 */ 486 */
643more: 487more:
644 len = left; 488 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, 489 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len, 490 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags, 491 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context, 492 ci->i_snap_realm->cached_context,
@@ -655,7 +499,7 @@ more:
655 num_pages = calc_pages_for(pos, len); 499 num_pages = calc_pages_for(pos, len);
656 500
657 if (file->f_flags & O_DIRECT) { 501 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len); 502 pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
659 if (IS_ERR(pages)) { 503 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages); 504 ret = PTR_ERR(pages);
661 goto out; 505 goto out;
@@ -673,7 +517,7 @@ more:
673 ret = PTR_ERR(pages); 517 ret = PTR_ERR(pages);
674 goto out; 518 goto out;
675 } 519 }
676 ret = copy_user_to_page_vector(pages, data, pos, len); 520 ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
677 if (ret < 0) { 521 if (ret < 0) {
678 ceph_release_page_vector(pages, num_pages); 522 ceph_release_page_vector(pages, num_pages);
679 goto out; 523 goto out;
@@ -689,7 +533,7 @@ more:
689 req->r_num_pages = num_pages; 533 req->r_num_pages = num_pages;
690 req->r_inode = inode; 534 req->r_inode = inode;
691 535
692 ret = ceph_osdc_start_request(&client->osdc, req, false); 536 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
693 if (!ret) { 537 if (!ret) {
694 if (req->r_safe_callback) { 538 if (req->r_safe_callback) {
695 /* 539 /*
@@ -701,11 +545,11 @@ more:
701 spin_unlock(&ci->i_unsafe_lock); 545 spin_unlock(&ci->i_unsafe_lock);
702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 546 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
703 } 547 }
704 ret = ceph_osdc_wait_request(&client->osdc, req); 548 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
705 } 549 }
706 550
707 if (file->f_flags & O_DIRECT) 551 if (file->f_flags & O_DIRECT)
708 put_page_vector(pages, num_pages); 552 ceph_put_page_vector(pages, num_pages);
709 else if (file->f_flags & O_SYNC) 553 else if (file->f_flags & O_SYNC)
710 ceph_release_page_vector(pages, num_pages); 554 ceph_release_page_vector(pages, num_pages);
711 555
@@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
814 struct ceph_file_info *fi = file->private_data; 658 struct ceph_file_info *fi = file->private_data;
815 struct inode *inode = file->f_dentry->d_inode; 659 struct inode *inode = file->f_dentry->d_inode;
816 struct ceph_inode_info *ci = ceph_inode(inode); 660 struct ceph_inode_info *ci = ceph_inode(inode);
817 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 661 struct ceph_osd_client *osdc =
662 &ceph_sb_to_client(inode->i_sb)->client->osdc;
818 loff_t endoff = pos + iov->iov_len; 663 loff_t endoff = pos + iov->iov_len;
819 int want, got = 0; 664 int want, got = 0;
820 int ret, err; 665 int ret, err;
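
The page-vector helpers removed from file.c above are not gone: the sync read and write paths now call ceph_ prefixed equivalents (ceph_get_direct_page_vector, ceph_copy_page_vector_to_user, ceph_copy_user_to_page_vector, ceph_zero_page_vector_range, ceph_put_page_vector, ceph_release_page_vector) that live in the shared ceph library. A rough sketch of the buffered (non O_DIRECT) write pattern that ceph_sync_write() follows with these helpers, with error paths trimmed and the allocator assumed to keep the ceph_alloc_page_vector spelling, is:

    /* Sketch only: the non-O_DIRECT branch of a sync write, using the
     * page-vector helpers now provided by the shared ceph library.
     */
    static int sync_write_sketch(struct ceph_osd_client *osdc,
                                 struct ceph_osd_request *req,
                                 const char __user *data, loff_t pos, size_t len)
    {
            int num_pages = calc_pages_for(pos, len);  /* pages spanned by pos..pos+len */
            struct page **pages;
            int ret;

            pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
            if (IS_ERR(pages))
                    return PTR_ERR(pages);

            ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
            if (ret < 0)
                    goto out_release;

            req->r_pages = pages;
            req->r_num_pages = num_pages;

            ret = ceph_osdc_start_request(osdc, req, false);
            if (!ret)
                    ret = ceph_osdc_wait_request(osdc, req);

    out_release:
            ceph_release_page_vector(pages, num_pages);
            return ret;
    }

The O_DIRECT branch instead pins the caller's pages with ceph_get_direct_page_vector() and drops them with ceph_put_page_vector() once the request completes.
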
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 62377ec37edf..1d6a45b5a04c 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
@@ -13,7 +13,8 @@
13#include <linux/pagevec.h> 13#include <linux/pagevec.h>
14 14
15#include "super.h" 15#include "super.h"
16#include "decode.h" 16#include "mds_client.h"
17#include <linux/ceph/decode.h>
17 18
18/* 19/*
19 * Ceph inode operations 20 * Ceph inode operations
@@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 385 */
385 if (ci->i_snap_realm) { 386 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 387 struct ceph_mds_client *mdsc =
387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 388 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 389 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 390
390 dout(" dropping residual ref to snap realm %p\n", realm); 391 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode,
685 } 686 }
686 687
687 /* it may be better to set st_size in getattr instead? */ 688 /* it may be better to set st_size in getattr instead? */
688 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) 689 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
689 inode->i_size = ci->i_rbytes; 690 inode->i_size = ci->i_rbytes;
690 break; 691 break;
691 default: 692 default:
@@ -901,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
901 struct inode *in = NULL; 902 struct inode *in = NULL;
902 struct ceph_mds_reply_inode *ininfo; 903 struct ceph_mds_reply_inode *ininfo;
903 struct ceph_vino vino; 904 struct ceph_vino vino;
904 struct ceph_client *client = ceph_sb_to_client(sb); 905 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
905 int i = 0; 906 int i = 0;
906 int err = 0; 907 int err = 0;
907 908
@@ -965,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
965 */ 966 */
966 if (rinfo->head->is_dentry && !req->r_aborted && 967 if (rinfo->head->is_dentry && !req->r_aborted &&
967 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, 968 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
968 client->mount_args->snapdir_name, 969 fsc->mount_options->snapdir_name,
969 req->r_dentry->d_name.len))) { 970 req->r_dentry->d_name.len))) {
970 /* 971 /*
971 * lookup link rename : null -> possibly existing inode 972 * lookup link rename : null -> possibly existing inode
@@ -1533,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1533 struct inode *parent_inode = dentry->d_parent->d_inode; 1534 struct inode *parent_inode = dentry->d_parent->d_inode;
1534 const unsigned int ia_valid = attr->ia_valid; 1535 const unsigned int ia_valid = attr->ia_valid;
1535 struct ceph_mds_request *req; 1536 struct ceph_mds_request *req;
1536 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; 1537 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
1537 int issued; 1538 int issued;
1538 int release = 0, dirtied = 0; 1539 int release = 0, dirtied = 0;
1539 int mask = 0; 1540 int mask = 0;
@@ -1728,8 +1729,8 @@ out:
1728 */ 1729 */
1729int ceph_do_getattr(struct inode *inode, int mask) 1730int ceph_do_getattr(struct inode *inode, int mask)
1730{ 1731{
1731 struct ceph_client *client = ceph_sb_to_client(inode->i_sb); 1732 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
1732 struct ceph_mds_client *mdsc = &client->mdsc; 1733 struct ceph_mds_client *mdsc = fsc->mdsc;
1733 struct ceph_mds_request *req; 1734 struct ceph_mds_request *req;
1734 int err; 1735 int err;
1735 1736
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 76e307d2aba1..8888c9ba68db 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
1#include <linux/in.h> 1#include <linux/in.h>
2 2
3#include "ioctl.h"
4#include "super.h" 3#include "super.h"
5#include "ceph_debug.h" 4#include "mds_client.h"
5#include <linux/ceph/ceph_debug.h>
6
7#include "ioctl.h"
6 8
7 9
8/* 10/*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{ 39{
38 struct inode *inode = file->f_dentry->d_inode; 40 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode; 41 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 42 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req; 43 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l; 44 struct ceph_ioctl_layout l;
43 int err, i; 45 int err, i;
@@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
90} 92}
91 93
92/* 94/*
 95 * Set a layout policy on a directory inode. All items in the tree
 96 * rooted at this inode will inherit this layout on creation
 97 * (it does not apply retroactively), unless a subdirectory
 98 * has its own layout policy.
99 */
100static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
101{
102 struct inode *inode = file->f_dentry->d_inode;
103 struct ceph_mds_request *req;
104 struct ceph_ioctl_layout l;
105 int err, i;
106 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
107
108 /* copy and validate */
109 if (copy_from_user(&l, arg, sizeof(l)))
110 return -EFAULT;
111
112 if ((l.object_size & ~PAGE_MASK) ||
113 (l.stripe_unit & ~PAGE_MASK) ||
114 !l.stripe_unit ||
115 (l.object_size &&
116 (unsigned)l.object_size % (unsigned)l.stripe_unit))
117 return -EINVAL;
118
119 /* make sure it's a valid data pool */
120 if (l.data_pool > 0) {
121 mutex_lock(&mdsc->mutex);
122 err = -EINVAL;
123 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
124 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
125 err = 0;
126 break;
127 }
128 mutex_unlock(&mdsc->mutex);
129 if (err)
130 return err;
131 }
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
134 USE_AUTH_MDS);
135
136 if (IS_ERR(req))
137 return PTR_ERR(req);
138 req->r_inode = igrab(inode);
139
140 req->r_args.setlayout.layout.fl_stripe_unit =
141 cpu_to_le32(l.stripe_unit);
142 req->r_args.setlayout.layout.fl_stripe_count =
143 cpu_to_le32(l.stripe_count);
144 req->r_args.setlayout.layout.fl_object_size =
145 cpu_to_le32(l.object_size);
146 req->r_args.setlayout.layout.fl_pg_pool =
147 cpu_to_le32(l.data_pool);
148 req->r_args.setlayout.layout.fl_pg_preferred =
149 cpu_to_le32(l.preferred_osd);
150
151 err = ceph_mdsc_do_request(mdsc, inode, req);
152 ceph_mdsc_put_request(req);
153 return err;
154}
155
156/*
93 * Return object name, size/offset information, and location (OSD 157 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset. 158 * number, network address) for a given file offset.
95 */ 159 */
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 162 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 163 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 164 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 165 struct ceph_osd_client *osdc =
166 &ceph_sb_to_client(inode->i_sb)->client->osdc;
102 u64 len = 1, olen; 167 u64 len = 1, olen;
103 u64 tmp; 168 u64 tmp;
104 struct ceph_object_layout ol; 169 struct ceph_object_layout ol;
@@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
174 case CEPH_IOC_SET_LAYOUT: 239 case CEPH_IOC_SET_LAYOUT:
175 return ceph_ioctl_set_layout(file, (void __user *)arg); 240 return ceph_ioctl_set_layout(file, (void __user *)arg);
176 241
242 case CEPH_IOC_SET_LAYOUT_POLICY:
243 return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
244
177 case CEPH_IOC_GET_DATALOC: 245 case CEPH_IOC_GET_DATALOC:
178 return ceph_ioctl_get_dataloc(file, (void __user *)arg); 246 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
179 247
180 case CEPH_IOC_LAZYIO: 248 case CEPH_IOC_LAZYIO:
181 return ceph_ioctl_lazyio(file); 249 return ceph_ioctl_lazyio(file);
182 } 250 }
251
183 return -ENOTTY; 252 return -ENOTTY;
184} 253}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 88451a3b6857..a6ce54e94eb5 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
4#include <linux/ioctl.h> 4#include <linux/ioctl.h>
5#include <linux/types.h> 5#include <linux/types.h>
6 6
7#define CEPH_IOCTL_MAGIC 0x97 7#define CEPH_IOCTL_MAGIC 0x98
8 8
9/* just use u64 to align sanely on all archs */ 9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout { 10struct ceph_ioctl_layout {
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
17 struct ceph_ioctl_layout) 17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ 18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout) 19 struct ceph_ioctl_layout)
20#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
21 struct ceph_ioctl_layout)
20 22
21/* 23/*
22 * Extract identity, address of the OSD and object storing a given 24 * Extract identity, address of the OSD and object storing a given
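
The new CEPH_IOC_SET_LAYOUT_POLICY ioctl takes the same ceph_ioctl_layout argument as CEPH_IOC_SET_LAYOUT but is issued on a directory, and the MDS applies it as the default layout for files created underneath it. A hypothetical userspace sketch (the mount path is made up, and the struct definition is assumed to come from a copy of the kernel's fs/ceph/ioctl.h) could look like:

    /* Hypothetical example: set a default 4 MB layout policy on a directory.
     * Assumes struct ceph_ioctl_layout and CEPH_IOC_SET_LAYOUT_POLICY are
     * visible via a copy of fs/ceph/ioctl.h; the mount path is illustrative.
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include "ioctl.h"

    int main(void)
    {
            struct ceph_ioctl_layout l = {
                    .stripe_unit  = 4 * 1024 * 1024,  /* must be non-zero and page-aligned */
                    .stripe_count = 1,
                    .object_size  = 4 * 1024 * 1024,  /* must be a multiple of stripe_unit */
                    .data_pool    = 0,                /* values > 0 are checked against the mdsmap */
            };
            int fd = open("/mnt/ceph/mydir", O_RDONLY);

            if (fd < 0) {
                    perror("open");
                    return EXIT_FAILURE;
            }
            if (ioctl(fd, CEPH_IOC_SET_LAYOUT_POLICY, &l) < 0) {
                    perror("CEPH_IOC_SET_LAYOUT_POLICY");
                    return EXIT_FAILURE;
            }
            return 0;
    }

The request number comes from the _IOW(CEPH_IOCTL_MAGIC, 5, struct ceph_ioctl_layout) definition this hunk adds.
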
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ff4e753aae92..40abde93c345 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,11 +1,11 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/namei.h> 4#include <linux/namei.h>
5 5
6#include "super.h" 6#include "super.h"
7#include "mds_client.h" 7#include "mds_client.h"
8#include "pagelist.h" 8#include <linux/ceph/pagelist.h>
9 9
10/** 10/**
11 * Implement fcntl and flock locking functions. 11 * Implement fcntl and flock locking functions.
@@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
16{ 16{
17 struct inode *inode = file->f_dentry->d_inode; 17 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc = 18 struct ceph_mds_client *mdsc =
19 &ceph_sb_to_client(inode->i_sb)->mdsc; 19 ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req; 20 struct ceph_mds_request *req;
21 int err; 21 int err;
22 22
@@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
181 * Encode the flock and fcntl locks for the given inode into the pagelist. 181 * Encode the flock and fcntl locks for the given inode into the pagelist.
182 * Format is: #fcntl locks, sequential fcntl locks, #flock locks, 182 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
183 * sequential flock locks. 183 * sequential flock locks.
184 * Must be called with BLK already held, and the lock numbers should have 184 * Must be called with lock_flocks() already held.
185 * been gathered under the same lock holding window. 185 * If we encounter more of a specific lock type than expected,
 186 * we return -ENOSPC.
186 */ 187 */
187int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, 188int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
188 int num_fcntl_locks, int num_flock_locks) 189 int num_fcntl_locks, int num_flock_locks)
@@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
190 struct file_lock *lock; 191 struct file_lock *lock;
191 struct ceph_filelock cephlock; 192 struct ceph_filelock cephlock;
192 int err = 0; 193 int err = 0;
194 int seen_fcntl = 0;
195 int seen_flock = 0;
193 196
194 dout("encoding %d flock and %d fcntl locks", num_flock_locks, 197 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
195 num_fcntl_locks); 198 num_fcntl_locks);
@@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
198 goto fail; 201 goto fail;
199 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 202 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
200 if (lock->fl_flags & FL_POSIX) { 203 if (lock->fl_flags & FL_POSIX) {
204 ++seen_fcntl;
205 if (seen_fcntl > num_fcntl_locks) {
206 err = -ENOSPC;
207 goto fail;
208 }
201 err = lock_to_ceph_filelock(lock, &cephlock); 209 err = lock_to_ceph_filelock(lock, &cephlock);
202 if (err) 210 if (err)
203 goto fail; 211 goto fail;
@@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
213 goto fail; 221 goto fail;
214 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 222 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
215 if (lock->fl_flags & FL_FLOCK) { 223 if (lock->fl_flags & FL_FLOCK) {
224 ++seen_flock;
225 if (seen_flock > num_flock_locks) {
226 err = -ENOSPC;
227 goto fail;
228 }
216 err = lock_to_ceph_filelock(lock, &cephlock); 229 err = lock_to_ceph_filelock(lock, &cephlock);
217 if (err) 230 if (err)
218 goto fail; 231 goto fail;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fad95f8f2608..3142b15940c2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,17 +1,21 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/fs.h>
3#include <linux/wait.h> 4#include <linux/wait.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
6#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
7 10
8#include "mds_client.h"
9#include "mon_client.h"
10#include "super.h" 11#include "super.h"
11#include "messenger.h" 12#include "mds_client.h"
12#include "decode.h" 13
13#include "auth.h" 14#include <linux/ceph/messenger.h>
14#include "pagelist.h" 15#include <linux/ceph/decode.h>
16#include <linux/ceph/pagelist.h>
17#include <linux/ceph/auth.h>
18#include <linux/ceph/debugfs.h>
15 19
16/* 20/*
17 * A cluster of MDS (metadata server) daemons is responsible for 21 * A cluster of MDS (metadata server) daemons is responsible for
@@ -286,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
286 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); 290 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
287 if (atomic_dec_and_test(&s->s_ref)) { 291 if (atomic_dec_and_test(&s->s_ref)) {
288 if (s->s_authorizer) 292 if (s->s_authorizer)
289 s->s_mdsc->client->monc.auth->ops->destroy_authorizer( 293 s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
290 s->s_mdsc->client->monc.auth, s->s_authorizer); 294 s->s_mdsc->fsc->client->monc.auth,
295 s->s_authorizer);
291 kfree(s); 296 kfree(s);
292 } 297 }
293} 298}
@@ -344,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
344 s->s_seq = 0; 349 s->s_seq = 0;
345 mutex_init(&s->s_mutex); 350 mutex_init(&s->s_mutex);
346 351
347 ceph_con_init(mdsc->client->msgr, &s->s_con); 352 ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
348 s->s_con.private = s; 353 s->s_con.private = s;
349 s->s_con.ops = &mds_con_ops; 354 s->s_con.ops = &mds_con_ops;
350 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; 355 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -599,7 +604,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
599 } else if (req->r_dentry) { 604 } else if (req->r_dentry) {
600 struct inode *dir = req->r_dentry->d_parent->d_inode; 605 struct inode *dir = req->r_dentry->d_parent->d_inode;
601 606
602 if (dir->i_sb != mdsc->client->sb) { 607 if (dir->i_sb != mdsc->fsc->sb) {
603 /* not this fs! */ 608 /* not this fs! */
604 inode = req->r_dentry->d_inode; 609 inode = req->r_dentry->d_inode;
605 } else if (ceph_snap(dir) != CEPH_NOSNAP) { 610 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -884,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
884 __ceph_remove_cap(cap); 889 __ceph_remove_cap(cap);
885 if (!__ceph_is_any_real_caps(ci)) { 890 if (!__ceph_is_any_real_caps(ci)) {
886 struct ceph_mds_client *mdsc = 891 struct ceph_mds_client *mdsc =
887 &ceph_sb_to_client(inode->i_sb)->mdsc; 892 ceph_sb_to_client(inode->i_sb)->mdsc;
888 893
889 spin_lock(&mdsc->cap_dirty_lock); 894 spin_lock(&mdsc->cap_dirty_lock);
890 if (!list_empty(&ci->i_dirty_item)) { 895 if (!list_empty(&ci->i_dirty_item)) {
@@ -1146,7 +1151,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1146 struct ceph_msg *msg, *partial = NULL; 1151 struct ceph_msg *msg, *partial = NULL;
1147 struct ceph_mds_cap_release *head; 1152 struct ceph_mds_cap_release *head;
1148 int err = -ENOMEM; 1153 int err = -ENOMEM;
1149 int extra = mdsc->client->mount_args->cap_release_safety; 1154 int extra = mdsc->fsc->mount_options->cap_release_safety;
1150 int num; 1155 int num;
1151 1156
1152 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, 1157 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
@@ -2085,7 +2090,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2085 2090
2086 /* insert trace into our cache */ 2091 /* insert trace into our cache */
2087 mutex_lock(&req->r_fill_mutex); 2092 mutex_lock(&req->r_fill_mutex);
2088 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 2093 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2089 if (err == 0) { 2094 if (err == 0) {
2090 if (result == 0 && rinfo->dir_nr) 2095 if (result == 0 && rinfo->dir_nr)
2091 ceph_readdir_prepopulate(req, req->r_session); 2096 ceph_readdir_prepopulate(req, req->r_session);
@@ -2361,19 +2366,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2361 2366
2362 if (recon_state->flock) { 2367 if (recon_state->flock) {
2363 int num_fcntl_locks, num_flock_locks; 2368 int num_fcntl_locks, num_flock_locks;
2364 2369 struct ceph_pagelist_cursor trunc_point;
2365 lock_kernel(); 2370
2366 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2371 ceph_pagelist_set_cursor(pagelist, &trunc_point);
2367 rec.v2.flock_len = (2*sizeof(u32) + 2372 do {
2368 (num_fcntl_locks+num_flock_locks) * 2373 lock_flocks();
2369 sizeof(struct ceph_filelock)); 2374 ceph_count_locks(inode, &num_fcntl_locks,
2370 2375 &num_flock_locks);
2371 err = ceph_pagelist_append(pagelist, &rec, reclen); 2376 rec.v2.flock_len = (2*sizeof(u32) +
2372 if (!err) 2377 (num_fcntl_locks+num_flock_locks) *
2373 err = ceph_encode_locks(inode, pagelist, 2378 sizeof(struct ceph_filelock));
2374 num_fcntl_locks, 2379 unlock_flocks();
2375 num_flock_locks); 2380
2376 unlock_kernel(); 2381 /* pre-alloc pagelist */
2382 ceph_pagelist_truncate(pagelist, &trunc_point);
2383 err = ceph_pagelist_append(pagelist, &rec, reclen);
2384 if (!err)
2385 err = ceph_pagelist_reserve(pagelist,
2386 rec.v2.flock_len);
2387
2388 /* encode locks */
2389 if (!err) {
2390 lock_flocks();
2391 err = ceph_encode_locks(inode,
2392 pagelist,
2393 num_fcntl_locks,
2394 num_flock_locks);
2395 unlock_flocks();
2396 }
2397 } while (err == -ENOSPC);
2377 } else { 2398 } else {
2378 err = ceph_pagelist_append(pagelist, &rec, reclen); 2399 err = ceph_pagelist_append(pagelist, &rec, reclen);
2379 } 2400 }
@@ -2613,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2613 struct ceph_mds_session *session, 2634 struct ceph_mds_session *session,
2614 struct ceph_msg *msg) 2635 struct ceph_msg *msg)
2615{ 2636{
2616 struct super_block *sb = mdsc->client->sb; 2637 struct super_block *sb = mdsc->fsc->sb;
2617 struct inode *inode; 2638 struct inode *inode;
2618 struct ceph_inode_info *ci; 2639 struct ceph_inode_info *ci;
2619 struct dentry *parent, *dentry; 2640 struct dentry *parent, *dentry;
@@ -2891,10 +2912,16 @@ static void delayed_work(struct work_struct *work)
2891 schedule_delayed(mdsc); 2912 schedule_delayed(mdsc);
2892} 2913}
2893 2914
2915int ceph_mdsc_init(struct ceph_fs_client *fsc)
2894 2916
2895int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2896{ 2917{
2897 mdsc->client = client; 2918 struct ceph_mds_client *mdsc;
2919
2920 mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
2921 if (!mdsc)
2922 return -ENOMEM;
2923 mdsc->fsc = fsc;
2924 fsc->mdsc = mdsc;
2898 mutex_init(&mdsc->mutex); 2925 mutex_init(&mdsc->mutex);
2899 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2926 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2900 if (mdsc->mdsmap == NULL) 2927 if (mdsc->mdsmap == NULL)
@@ -2927,7 +2954,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2927 INIT_LIST_HEAD(&mdsc->dentry_lru); 2954 INIT_LIST_HEAD(&mdsc->dentry_lru);
2928 2955
2929 ceph_caps_init(mdsc); 2956 ceph_caps_init(mdsc);
2930 ceph_adjust_min_caps(mdsc, client->min_caps); 2957 ceph_adjust_min_caps(mdsc, fsc->min_caps);
2931 2958
2932 return 0; 2959 return 0;
2933} 2960}
@@ -2939,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2939static void wait_requests(struct ceph_mds_client *mdsc) 2966static void wait_requests(struct ceph_mds_client *mdsc)
2940{ 2967{
2941 struct ceph_mds_request *req; 2968 struct ceph_mds_request *req;
2942 struct ceph_client *client = mdsc->client; 2969 struct ceph_fs_client *fsc = mdsc->fsc;
2943 2970
2944 mutex_lock(&mdsc->mutex); 2971 mutex_lock(&mdsc->mutex);
2945 if (__get_oldest_req(mdsc)) { 2972 if (__get_oldest_req(mdsc)) {
@@ -2947,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
2947 2974
2948 dout("wait_requests waiting for requests\n"); 2975 dout("wait_requests waiting for requests\n");
2949 wait_for_completion_timeout(&mdsc->safe_umount_waiters, 2976 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2950 client->mount_args->mount_timeout * HZ); 2977 fsc->client->options->mount_timeout * HZ);
2951 2978
2952 /* tear down remaining requests */ 2979 /* tear down remaining requests */
2953 mutex_lock(&mdsc->mutex); 2980 mutex_lock(&mdsc->mutex);
@@ -3030,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3030{ 3057{
3031 u64 want_tid, want_flush; 3058 u64 want_tid, want_flush;
3032 3059
3033 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) 3060 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3034 return; 3061 return;
3035 3062
3036 dout("sync\n"); 3063 dout("sync\n");
@@ -3053,7 +3080,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc)
3053{ 3080{
3054 int i, n = 0; 3081 int i, n = 0;
3055 3082
3056 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) 3083 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3057 return true; 3084 return true;
3058 3085
3059 mutex_lock(&mdsc->mutex); 3086 mutex_lock(&mdsc->mutex);
@@ -3071,8 +3098,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3071{ 3098{
3072 struct ceph_mds_session *session; 3099 struct ceph_mds_session *session;
3073 int i; 3100 int i;
3074 struct ceph_client *client = mdsc->client; 3101 struct ceph_fs_client *fsc = mdsc->fsc;
3075 unsigned long timeout = client->mount_args->mount_timeout * HZ; 3102 unsigned long timeout = fsc->client->options->mount_timeout * HZ;
3076 3103
3077 dout("close_sessions\n"); 3104 dout("close_sessions\n");
3078 3105
@@ -3119,7 +3146,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3119 dout("stopped\n"); 3146 dout("stopped\n");
3120} 3147}
3121 3148
3122void ceph_mdsc_stop(struct ceph_mds_client *mdsc) 3149static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3123{ 3150{
3124 dout("stop\n"); 3151 dout("stop\n");
3125 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ 3152 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -3129,6 +3156,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3129 ceph_caps_finalize(mdsc); 3156 ceph_caps_finalize(mdsc);
3130} 3157}
3131 3158
3159void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
3160{
3161 struct ceph_mds_client *mdsc = fsc->mdsc;
3162
3163 ceph_mdsc_stop(mdsc);
3164 fsc->mdsc = NULL;
3165 kfree(mdsc);
3166}
3167
3132 3168
3133/* 3169/*
3134 * handle mds map update. 3170 * handle mds map update.
@@ -3145,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3145 3181
3146 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); 3182 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
3147 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 3183 ceph_decode_copy(&p, &fsid, sizeof(fsid));
3148 if (ceph_check_fsid(mdsc->client, &fsid) < 0) 3184 if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
3149 return; 3185 return;
3150 epoch = ceph_decode_32(&p); 3186 epoch = ceph_decode_32(&p);
3151 maplen = ceph_decode_32(&p); 3187 maplen = ceph_decode_32(&p);
3152 dout("handle_map epoch %u len %d\n", epoch, (int)maplen); 3188 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
3153 3189
3154 /* do we need it? */ 3190 /* do we need it? */
3155 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch); 3191 ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
3156 mutex_lock(&mdsc->mutex); 3192 mutex_lock(&mdsc->mutex);
3157 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { 3193 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
3158 dout("handle_map epoch %u <= our %u\n", 3194 dout("handle_map epoch %u <= our %u\n",
@@ -3176,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3176 } else { 3212 } else {
3177 mdsc->mdsmap = newmap; /* first mds map */ 3213 mdsc->mdsmap = newmap; /* first mds map */
3178 } 3214 }
3179 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; 3215 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
3180 3216
3181 __wake_requests(mdsc, &mdsc->waiting_for_map); 3217 __wake_requests(mdsc, &mdsc->waiting_for_map);
3182 3218
@@ -3277,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con,
3277{ 3313{
3278 struct ceph_mds_session *s = con->private; 3314 struct ceph_mds_session *s = con->private;
3279 struct ceph_mds_client *mdsc = s->s_mdsc; 3315 struct ceph_mds_client *mdsc = s->s_mdsc;
3280 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3316 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3281 int ret = 0; 3317 int ret = 0;
3282 3318
3283 if (force_new && s->s_authorizer) { 3319 if (force_new && s->s_authorizer) {
@@ -3311,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
3311{ 3347{
3312 struct ceph_mds_session *s = con->private; 3348 struct ceph_mds_session *s = con->private;
3313 struct ceph_mds_client *mdsc = s->s_mdsc; 3349 struct ceph_mds_client *mdsc = s->s_mdsc;
3314 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3350 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3315 3351
3316 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); 3352 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3317} 3353}
@@ -3320,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
3320{ 3356{
3321 struct ceph_mds_session *s = con->private; 3357 struct ceph_mds_session *s = con->private;
3322 struct ceph_mds_client *mdsc = s->s_mdsc; 3358 struct ceph_mds_client *mdsc = s->s_mdsc;
3323 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3359 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3324 3360
3325 if (ac->ops->invalidate_authorizer) 3361 if (ac->ops->invalidate_authorizer)
3326 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); 3362 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3327 3363
3328 return ceph_monc_validate_auth(&mdsc->client->monc); 3364 return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
3329} 3365}
3330 3366
3331static const struct ceph_connection_operations mds_con_ops = { 3367static const struct ceph_connection_operations mds_con_ops = {
@@ -3338,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = {
3338 .peer_reset = peer_reset, 3374 .peer_reset = peer_reset,
3339}; 3375};
3340 3376
3341
3342
3343
3344/* eof */ 3377/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c98267ce6d2a..d66d63c72355 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
8#include <linux/rbtree.h> 8#include <linux/rbtree.h>
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10 10
11#include "types.h" 11#include <linux/ceph/types.h>
12#include "messenger.h" 12#include <linux/ceph/messenger.h>
13#include "mdsmap.h" 13#include <linux/ceph/mdsmap.h>
14 14
15/* 15/*
16 * Some lock dependencies: 16 * Some lock dependencies:
@@ -26,7 +26,7 @@
26 * 26 *
27 */ 27 */
28 28
29struct ceph_client; 29struct ceph_fs_client;
30struct ceph_cap; 30struct ceph_cap;
31 31
32/* 32/*
@@ -230,7 +230,7 @@ struct ceph_mds_request {
230 * mds client state 230 * mds client state
231 */ 231 */
232struct ceph_mds_client { 232struct ceph_mds_client {
233 struct ceph_client *client; 233 struct ceph_fs_client *fsc;
234 struct mutex mutex; /* all nested structures */ 234 struct mutex mutex; /* all nested structures */
235 235
236 struct ceph_mdsmap *mdsmap; 236 struct ceph_mdsmap *mdsmap;
@@ -289,11 +289,6 @@ struct ceph_mds_client {
289 int caps_avail_count; /* unused, unreserved */ 289 int caps_avail_count; /* unused, unreserved */
290 int caps_min_count; /* keep at least this many 290 int caps_min_count; /* keep at least this many
291 (unreserved) */ 291 (unreserved) */
292
293#ifdef CONFIG_DEBUG_FS
294 struct dentry *debugfs_file;
295#endif
296
297 spinlock_t dentry_lru_lock; 292 spinlock_t dentry_lru_lock;
298 struct list_head dentry_lru; 293 struct list_head dentry_lru;
299 int num_dentry; 294 int num_dentry;
@@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
316extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, 311extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
317 struct ceph_msg *msg, int mds); 312 struct ceph_msg *msg, int mds);
318 313
319extern int ceph_mdsc_init(struct ceph_mds_client *mdsc, 314extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
320 struct ceph_client *client);
321extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); 315extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
322extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc); 316extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
323 317
324extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); 318extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
325 319
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 040be6d1150b..73b7d44e8a35 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/bug.h> 3#include <linux/bug.h>
4#include <linux/err.h> 4#include <linux/err.h>
@@ -6,9 +6,9 @@
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/types.h> 7#include <linux/types.h>
8 8
9#include "mdsmap.h" 9#include <linux/ceph/mdsmap.h>
10#include "messenger.h" 10#include <linux/ceph/messenger.h>
11#include "decode.h" 11#include <linux/ceph/decode.h>
12 12
13#include "super.h" 13#include "super.h"
14 14
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
117 } 117 }
118 118
119 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", 119 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
120 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr), 120 i+1, n, global_id, mds, inc,
121 ceph_pr_addr(&addr.in_addr),
121 ceph_mds_state_name(state)); 122 ceph_mds_state_name(state));
122 if (mds >= 0 && mds < m->m_max_mds && state > 0) { 123 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
123 m->m_info[mds].global_id = global_id; 124 m->m_info[mds].global_id = global_id;
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
deleted file mode 100644
index 46a368b6dce5..000000000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,63 +0,0 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
9{
10 struct page *page = list_entry(pl->head.prev, struct page,
11 lru);
12 kunmap(page);
13}
14
15int ceph_pagelist_release(struct ceph_pagelist *pl)
16{
17 if (pl->mapped_tail)
18 ceph_pagelist_unmap_tail(pl);
19
20 while (!list_empty(&pl->head)) {
21 struct page *page = list_first_entry(&pl->head, struct page,
22 lru);
23 list_del(&page->lru);
24 __free_page(page);
25 }
26 return 0;
27}
28
29static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
30{
31 struct page *page = __page_cache_alloc(GFP_NOFS);
32 if (!page)
33 return -ENOMEM;
34 pl->room += PAGE_SIZE;
35 list_add_tail(&page->lru, &pl->head);
36 if (pl->mapped_tail)
37 ceph_pagelist_unmap_tail(pl);
38 pl->mapped_tail = kmap(page);
39 return 0;
40}
41
42int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
43{
44 while (pl->room < len) {
45 size_t bit = pl->room;
46 int ret;
47
48 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
49 buf, bit);
50 pl->length += bit;
51 pl->room -= bit;
52 buf += bit;
53 len -= bit;
54 ret = ceph_pagelist_addpage(pl);
55 if (ret)
56 return ret;
57 }
58
59 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
60 pl->length += len;
61 pl->room -= len;
62 return 0;
63}
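
fs/ceph/pagelist.c is removed here; since the shared Ceph headers in this patch now live under include/linux/ceph/, the implementation presumably moves into that common library code rather than going away. The part worth following is the bookkeeping in ceph_pagelist_append(): 'room' tracks the free bytes left in the kmapped tail page, and a write that does not fit is split, filling the tail page, allocating a fresh one, and continuing, with length & ~PAGE_CACHE_MASK giving the offset inside the current page. A tiny userspace sketch of the same bookkeeping with an artificially small 8-byte "page"; all names are invented and this is not the real API:

#include <stdio.h>
#include <string.h>

#define PAGE_SZ 8                       /* artificially small "page" */

static size_t length, room;             /* total appended / free in tail page */
static char pages[4][PAGE_SZ];          /* pretend page allocator: 4 pages */
static int tail = -1;

static int addpage(void)
{
        if (tail + 1 >= 4)
                return -1;              /* out of demo pages */
        tail++;
        room += PAGE_SZ;
        return 0;
}

static int append(const char *buf, size_t len)
{
        while (room < len) {
                size_t bit = room;

                /* length % PAGE_SZ plays the role of length & ~PAGE_CACHE_MASK */
                memcpy(&pages[tail][length % PAGE_SZ], buf, bit);
                length += bit;
                room -= bit;
                buf += bit;
                len -= bit;
                if (addpage())
                        return -1;
        }
        memcpy(&pages[tail][length % PAGE_SZ], buf, len);
        length += len;
        room -= len;
        return 0;
}

int main(void)
{
        addpage();
        append("0123456789abcdef", 16); /* 16 bytes span two 8-byte pages */
        printf("page0='%.8s' page1='%.8s' length=%zu room=%zu\n",
               pages[0], pages[1], length, room);
        return 0;
}
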
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 190b6c4a6f2b..39c243acd062 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/sort.h> 3#include <linux/sort.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5 5
6#include "super.h" 6#include "super.h"
7#include "decode.h" 7#include "mds_client.h"
8
9#include <linux/ceph/decode.h>
8 10
9/* 11/*
10 * Snapshots in ceph are driven in large part by cooperation from the 12 * Snapshots in ceph are driven in large part by cooperation from the
@@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
526 struct ceph_cap_snap *capsnap) 528 struct ceph_cap_snap *capsnap)
527{ 529{
528 struct inode *inode = &ci->vfs_inode; 530 struct inode *inode = &ci->vfs_inode;
529 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 531 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
530 532
531 BUG_ON(capsnap->writing); 533 BUG_ON(capsnap->writing);
532 capsnap->size = inode->i_size; 534 capsnap->size = inode->i_size;
@@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
747 struct ceph_mds_session *session, 749 struct ceph_mds_session *session,
748 struct ceph_msg *msg) 750 struct ceph_msg *msg)
749{ 751{
750 struct super_block *sb = mdsc->client->sb; 752 struct super_block *sb = mdsc->fsc->sb;
751 int mds = session->s_mds; 753 int mds = session->s_mds;
752 u64 split; 754 u64 split;
753 int op; 755 int op;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c
index c6179d3a26a2..cd5097d7c804 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/strings.c
@@ -1,71 +1,9 @@
1/* 1/*
2 * Ceph string constants 2 * Ceph fs string constants
3 */ 3 */
4#include "types.h" 4#include <linux/module.h>
5#include <linux/ceph/types.h>
5 6
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_AUTH: return "auth";
14 default: return "unknown";
15 }
16}
17
18const char *ceph_osd_op_name(int op)
19{
20 switch (op) {
21 case CEPH_OSD_OP_READ: return "read";
22 case CEPH_OSD_OP_STAT: return "stat";
23
24 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
25
26 case CEPH_OSD_OP_WRITE: return "write";
27 case CEPH_OSD_OP_DELETE: return "delete";
28 case CEPH_OSD_OP_TRUNCATE: return "truncate";
29 case CEPH_OSD_OP_ZERO: return "zero";
30 case CEPH_OSD_OP_WRITEFULL: return "writefull";
31 case CEPH_OSD_OP_ROLLBACK: return "rollback";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
49
50 case CEPH_OSD_OP_PULL: return "pull";
51 case CEPH_OSD_OP_PUSH: return "push";
52 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
53 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
54 case CEPH_OSD_OP_SCRUB: return "scrub";
55
56 case CEPH_OSD_OP_WRLOCK: return "wrlock";
57 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
58 case CEPH_OSD_OP_RDLOCK: return "rdlock";
59 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
60 case CEPH_OSD_OP_UPLOCK: return "uplock";
61 case CEPH_OSD_OP_DNLOCK: return "dnlock";
62
63 case CEPH_OSD_OP_CALL: return "call";
64
65 case CEPH_OSD_OP_PGLS: return "pgls";
66 }
67 return "???";
68}
69 7
70const char *ceph_mds_state_name(int s) 8const char *ceph_mds_state_name(int s)
71{ 9{
@@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o)
177 } 115 }
178 return "???"; 116 return "???";
179} 117}
180
181const char *ceph_pool_op_name(int op)
182{
183 switch (op) {
184 case POOL_OP_CREATE: return "create";
185 case POOL_OP_DELETE: return "delete";
186 case POOL_OP_AUID_CHANGE: return "auid change";
187 case POOL_OP_CREATE_SNAP: return "create snap";
188 case POOL_OP_DELETE_SNAP: return "delete snap";
189 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
190 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
191 }
192 return "???";
193}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9922628532b2..d6e0e0421891 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,5 +1,5 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/backing-dev.h> 4#include <linux/backing-dev.h>
5#include <linux/ctype.h> 5#include <linux/ctype.h>
@@ -15,10 +15,13 @@
15#include <linux/statfs.h> 15#include <linux/statfs.h>
16#include <linux/string.h> 16#include <linux/string.h>
17 17
18#include "decode.h"
19#include "super.h" 18#include "super.h"
20#include "mon_client.h" 19#include "mds_client.h"
21#include "auth.h" 20
21#include <linux/ceph/decode.h>
22#include <linux/ceph/mon_client.h>
23#include <linux/ceph/auth.h>
24#include <linux/ceph/debugfs.h>
22 25
23/* 26/*
24 * Ceph superblock operations 27 * Ceph superblock operations
@@ -26,36 +29,22 @@
26 * Handle the basics of mounting, unmounting. 29 * Handle the basics of mounting, unmounting.
27 */ 30 */
28 31
29
30/*
31 * find filename portion of a path (/foo/bar/baz -> baz)
32 */
33const char *ceph_file_part(const char *s, int len)
34{
35 const char *e = s + len;
36
37 while (e != s && *(e-1) != '/')
38 e--;
39 return e;
40}
41
42
43/* 32/*
44 * super ops 33 * super ops
45 */ 34 */
46static void ceph_put_super(struct super_block *s) 35static void ceph_put_super(struct super_block *s)
47{ 36{
48 struct ceph_client *client = ceph_sb_to_client(s); 37 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
49 38
50 dout("put_super\n"); 39 dout("put_super\n");
51 ceph_mdsc_close_sessions(&client->mdsc); 40 ceph_mdsc_close_sessions(fsc->mdsc);
52 41
53 /* 42 /*
54 * ensure we release the bdi before put_anon_super releases 43 * ensure we release the bdi before put_anon_super releases
55 * the device name. 44 * the device name.
56 */ 45 */
57 if (s->s_bdi == &client->backing_dev_info) { 46 if (s->s_bdi == &fsc->backing_dev_info) {
58 bdi_unregister(&client->backing_dev_info); 47 bdi_unregister(&fsc->backing_dev_info);
59 s->s_bdi = NULL; 48 s->s_bdi = NULL;
60 } 49 }
61 50
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
64 53
65static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) 54static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
66{ 55{
67 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode); 56 struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
68 struct ceph_monmap *monmap = client->monc.monmap; 57 struct ceph_monmap *monmap = fsc->client->monc.monmap;
69 struct ceph_statfs st; 58 struct ceph_statfs st;
70 u64 fsid; 59 u64 fsid;
71 int err; 60 int err;
72 61
73 dout("statfs\n"); 62 dout("statfs\n");
74 err = ceph_monc_do_statfs(&client->monc, &st); 63 err = ceph_monc_do_statfs(&fsc->client->monc, &st);
75 if (err < 0) 64 if (err < 0)
76 return err; 65 return err;
77 66
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
104 93
105static int ceph_sync_fs(struct super_block *sb, int wait) 94static int ceph_sync_fs(struct super_block *sb, int wait)
106{ 95{
107 struct ceph_client *client = ceph_sb_to_client(sb); 96 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
108 97
109 if (!wait) { 98 if (!wait) {
110 dout("sync_fs (non-blocking)\n"); 99 dout("sync_fs (non-blocking)\n");
111 ceph_flush_dirty_caps(&client->mdsc); 100 ceph_flush_dirty_caps(fsc->mdsc);
112 dout("sync_fs (non-blocking) done\n"); 101 dout("sync_fs (non-blocking) done\n");
113 return 0; 102 return 0;
114 } 103 }
115 104
116 dout("sync_fs (blocking)\n"); 105 dout("sync_fs (blocking)\n");
117 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); 106 ceph_osdc_sync(&fsc->client->osdc);
118 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); 107 ceph_mdsc_sync(fsc->mdsc);
119 dout("sync_fs (blocking) done\n"); 108 dout("sync_fs (blocking) done\n");
120 return 0; 109 return 0;
121} 110}
122 111
123static int default_congestion_kb(void)
124{
125 int congestion_kb;
126
127 /*
128 * Copied from NFS
129 *
130 * congestion size, scale with available memory.
131 *
132 * 64MB: 8192k
133 * 128MB: 11585k
134 * 256MB: 16384k
135 * 512MB: 23170k
136 * 1GB: 32768k
137 * 2GB: 46340k
138 * 4GB: 65536k
139 * 8GB: 92681k
140 * 16GB: 131072k
141 *
142 * This allows larger machines to have larger/more transfers.
143 * Limit the default to 256M
144 */
145 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
146 if (congestion_kb > 256*1024)
147 congestion_kb = 256*1024;
148
149 return congestion_kb;
150}
151
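
default_congestion_kb() is deleted from this spot but, judging by the calls that remain in the new code (the option defaults at new line 296 and ceph_show_options at new line 373), it is presumably only reordered within the file, so the NFS-derived heuristic is unchanged: 16 * int_sqrt(totalram_pages), converted from pages to kilobytes, capped at 256 MB. For the rows of the table whose page counts are perfect squares the arithmetic reproduces exactly; assuming 4 KB pages, 1 GB is 262144 pages, int_sqrt gives 512, 16 * 512 = 8192, and shifting by PAGE_SHIFT - 10 = 2 yields 32768k, matching the "1GB: 32768k" row. A small userspace check (4 KB pages assumed, integer square root approximated crudely):

#include <stdio.h>

#define PAGE_SHIFT 12                   /* assume 4 KB pages */

static unsigned long isqrt(unsigned long x)     /* crude stand-in for int_sqrt() */
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

/* userspace rendition of default_congestion_kb() */
static unsigned long congestion_kb(unsigned long totalram_pages)
{
        unsigned long kb = (16 * isqrt(totalram_pages)) << (PAGE_SHIFT - 10);

        return kb > 256 * 1024 ? 256 * 1024 : kb;
}

int main(void)
{
        unsigned long mb[] = { 64, 256, 1024, 4096, 16384 };

        for (int i = 0; i < 5; i++) {
                unsigned long pages = mb[i] * 1024 / 4; /* MB -> 4 KB pages */

                printf("%6luMB: %luk\n", mb[i], congestion_kb(pages));
        }
        return 0;
}
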
152/**
153 * ceph_show_options - Show mount options in /proc/mounts
154 * @m: seq_file to write to
155 * @mnt: mount descriptor
156 */
157static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
158{
159 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
160 struct ceph_mount_args *args = client->mount_args;
161
162 if (args->flags & CEPH_OPT_FSID)
163 seq_printf(m, ",fsid=%pU", &args->fsid);
164 if (args->flags & CEPH_OPT_NOSHARE)
165 seq_puts(m, ",noshare");
166 if (args->flags & CEPH_OPT_DIRSTAT)
167 seq_puts(m, ",dirstat");
168 if ((args->flags & CEPH_OPT_RBYTES) == 0)
169 seq_puts(m, ",norbytes");
170 if (args->flags & CEPH_OPT_NOCRC)
171 seq_puts(m, ",nocrc");
172 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
173 seq_puts(m, ",noasyncreaddir");
174
175 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
176 seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
177 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
178 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
179 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
180 seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
181 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
182 seq_printf(m, ",osdkeepalivetimeout=%d",
183 args->osd_keepalive_timeout);
184 if (args->wsize)
185 seq_printf(m, ",wsize=%d", args->wsize);
186 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
187 seq_printf(m, ",rsize=%d", args->rsize);
188 if (args->congestion_kb != default_congestion_kb())
189 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
190 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
191 seq_printf(m, ",caps_wanted_delay_min=%d",
192 args->caps_wanted_delay_min);
193 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
194 seq_printf(m, ",caps_wanted_delay_max=%d",
195 args->caps_wanted_delay_max);
196 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
197 seq_printf(m, ",cap_release_safety=%d",
198 args->cap_release_safety);
199 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
200 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
201 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
202 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
203 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
204 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
205 if (args->name)
206 seq_printf(m, ",name=%s", args->name);
207 if (args->secret)
208 seq_puts(m, ",secret=<hidden>");
209 return 0;
210}
211
212/*
213 * caches
214 */
215struct kmem_cache *ceph_inode_cachep;
216struct kmem_cache *ceph_cap_cachep;
217struct kmem_cache *ceph_dentry_cachep;
218struct kmem_cache *ceph_file_cachep;
219
220static void ceph_inode_init_once(void *foo)
221{
222 struct ceph_inode_info *ci = foo;
223 inode_init_once(&ci->vfs_inode);
224}
225
226static int __init init_caches(void)
227{
228 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
229 sizeof(struct ceph_inode_info),
230 __alignof__(struct ceph_inode_info),
231 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
232 ceph_inode_init_once);
233 if (ceph_inode_cachep == NULL)
234 return -ENOMEM;
235
236 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
237 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
238 if (ceph_cap_cachep == NULL)
239 goto bad_cap;
240
241 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
242 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
243 if (ceph_dentry_cachep == NULL)
244 goto bad_dentry;
245
246 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
247 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
248 if (ceph_file_cachep == NULL)
249 goto bad_file;
250
251 return 0;
252
253bad_file:
254 kmem_cache_destroy(ceph_dentry_cachep);
255bad_dentry:
256 kmem_cache_destroy(ceph_cap_cachep);
257bad_cap:
258 kmem_cache_destroy(ceph_inode_cachep);
259 return -ENOMEM;
260}
261
262static void destroy_caches(void)
263{
264 kmem_cache_destroy(ceph_inode_cachep);
265 kmem_cache_destroy(ceph_cap_cachep);
266 kmem_cache_destroy(ceph_dentry_cachep);
267 kmem_cache_destroy(ceph_file_cachep);
268}
269
270
271/*
272 * ceph_umount_begin - initiate forced umount. Tear down the
273 * mount, skipping steps that may hang while waiting for server(s).
274 */
275static void ceph_umount_begin(struct super_block *sb)
276{
277 struct ceph_client *client = ceph_sb_to_client(sb);
278
279 dout("ceph_umount_begin - starting forced umount\n");
280 if (!client)
281 return;
282 client->mount_state = CEPH_MOUNT_SHUTDOWN;
283 return;
284}
285
286static const struct super_operations ceph_super_ops = {
287 .alloc_inode = ceph_alloc_inode,
288 .destroy_inode = ceph_destroy_inode,
289 .write_inode = ceph_write_inode,
290 .sync_fs = ceph_sync_fs,
291 .put_super = ceph_put_super,
292 .show_options = ceph_show_options,
293 .statfs = ceph_statfs,
294 .umount_begin = ceph_umount_begin,
295};
296
297
298const char *ceph_msg_type_name(int type)
299{
300 switch (type) {
301 case CEPH_MSG_SHUTDOWN: return "shutdown";
302 case CEPH_MSG_PING: return "ping";
303 case CEPH_MSG_AUTH: return "auth";
304 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
305 case CEPH_MSG_MON_MAP: return "mon_map";
306 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
307 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
308 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
309 case CEPH_MSG_STATFS: return "statfs";
310 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
311 case CEPH_MSG_MDS_MAP: return "mds_map";
312 case CEPH_MSG_CLIENT_SESSION: return "client_session";
313 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
314 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
315 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
316 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
317 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
318 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
319 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
320 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
321 case CEPH_MSG_OSD_MAP: return "osd_map";
322 case CEPH_MSG_OSD_OP: return "osd_op";
323 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
324 default: return "unknown";
325 }
326}
327
328
329/* 112/*
330 * mount options 113 * mount options
331 */ 114 */
332enum { 115enum {
333 Opt_wsize, 116 Opt_wsize,
334 Opt_rsize, 117 Opt_rsize,
335 Opt_osdtimeout,
336 Opt_osdkeepalivetimeout,
337 Opt_mount_timeout,
338 Opt_osd_idle_ttl,
339 Opt_caps_wanted_delay_min, 118 Opt_caps_wanted_delay_min,
340 Opt_caps_wanted_delay_max, 119 Opt_caps_wanted_delay_max,
341 Opt_cap_release_safety, 120 Opt_cap_release_safety,
@@ -344,29 +123,19 @@ enum {
344 Opt_congestion_kb, 123 Opt_congestion_kb,
345 Opt_last_int, 124 Opt_last_int,
346 /* int args above */ 125 /* int args above */
347 Opt_fsid,
348 Opt_snapdirname, 126 Opt_snapdirname,
349 Opt_name,
350 Opt_secret,
351 Opt_last_string, 127 Opt_last_string,
352 /* string args above */ 128 /* string args above */
353 Opt_ip,
354 Opt_noshare,
355 Opt_dirstat, 129 Opt_dirstat,
356 Opt_nodirstat, 130 Opt_nodirstat,
357 Opt_rbytes, 131 Opt_rbytes,
358 Opt_norbytes, 132 Opt_norbytes,
359 Opt_nocrc,
360 Opt_noasyncreaddir, 133 Opt_noasyncreaddir,
361}; 134};
362 135
363static match_table_t arg_tokens = { 136static match_table_t fsopt_tokens = {
364 {Opt_wsize, "wsize=%d"}, 137 {Opt_wsize, "wsize=%d"},
365 {Opt_rsize, "rsize=%d"}, 138 {Opt_rsize, "rsize=%d"},
366 {Opt_osdtimeout, "osdtimeout=%d"},
367 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
368 {Opt_mount_timeout, "mount_timeout=%d"},
369 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
370 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 139 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
371 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 140 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
372 {Opt_cap_release_safety, "cap_release_safety=%d"}, 141 {Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -374,403 +143,459 @@ static match_table_t arg_tokens = {
374 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, 143 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
375 {Opt_congestion_kb, "write_congestion_kb=%d"}, 144 {Opt_congestion_kb, "write_congestion_kb=%d"},
376 /* int args above */ 145 /* int args above */
377 {Opt_fsid, "fsid=%s"},
378 {Opt_snapdirname, "snapdirname=%s"}, 146 {Opt_snapdirname, "snapdirname=%s"},
379 {Opt_name, "name=%s"},
380 {Opt_secret, "secret=%s"},
381 /* string args above */ 147 /* string args above */
382 {Opt_ip, "ip=%s"},
383 {Opt_noshare, "noshare"},
384 {Opt_dirstat, "dirstat"}, 148 {Opt_dirstat, "dirstat"},
385 {Opt_nodirstat, "nodirstat"}, 149 {Opt_nodirstat, "nodirstat"},
386 {Opt_rbytes, "rbytes"}, 150 {Opt_rbytes, "rbytes"},
387 {Opt_norbytes, "norbytes"}, 151 {Opt_norbytes, "norbytes"},
388 {Opt_nocrc, "nocrc"},
389 {Opt_noasyncreaddir, "noasyncreaddir"}, 152 {Opt_noasyncreaddir, "noasyncreaddir"},
390 {-1, NULL} 153 {-1, NULL}
391}; 154};
392 155
393static int parse_fsid(const char *str, struct ceph_fsid *fsid) 156static int parse_fsopt_token(char *c, void *private)
394{ 157{
395 int i = 0; 158 struct ceph_mount_options *fsopt = private;
396 char tmp[3]; 159 substring_t argstr[MAX_OPT_ARGS];
397 int err = -EINVAL; 160 int token, intval, ret;
398 int d; 161
399 162 token = match_token((char *)c, fsopt_tokens, argstr);
400 dout("parse_fsid '%s'\n", str); 163 if (token < 0)
401 tmp[2] = 0; 164 return -EINVAL;
402 while (*str && i < 16) { 165
403 if (ispunct(*str)) { 166 if (token < Opt_last_int) {
404 str++; 167 ret = match_int(&argstr[0], &intval);
405 continue; 168 if (ret < 0) {
169 pr_err("bad mount option arg (not int) "
170 "at '%s'\n", c);
171 return ret;
406 } 172 }
407 if (!isxdigit(str[0]) || !isxdigit(str[1])) 173 dout("got int token %d val %d\n", token, intval);
408 break; 174 } else if (token > Opt_last_int && token < Opt_last_string) {
409 tmp[0] = str[0]; 175 dout("got string token %d val %s\n", token,
410 tmp[1] = str[1]; 176 argstr[0].from);
411 if (sscanf(tmp, "%x", &d) < 1) 177 } else {
412 break; 178 dout("got token %d\n", token);
413 fsid->fsid[i] = d & 0xff;
414 i++;
415 str += 2;
416 } 179 }
417 180
418 if (i == 16) 181 switch (token) {
419 err = 0; 182 case Opt_snapdirname:
420 dout("parse_fsid ret %d got fsid %pU", err, fsid); 183 kfree(fsopt->snapdir_name);
421 return err; 184 fsopt->snapdir_name = kstrndup(argstr[0].from,
185 argstr[0].to-argstr[0].from,
186 GFP_KERNEL);
187 if (!fsopt->snapdir_name)
188 return -ENOMEM;
189 break;
190
191 /* misc */
192 case Opt_wsize:
193 fsopt->wsize = intval;
194 break;
195 case Opt_rsize:
196 fsopt->rsize = intval;
197 break;
198 case Opt_caps_wanted_delay_min:
199 fsopt->caps_wanted_delay_min = intval;
200 break;
201 case Opt_caps_wanted_delay_max:
202 fsopt->caps_wanted_delay_max = intval;
203 break;
204 case Opt_readdir_max_entries:
205 fsopt->max_readdir = intval;
206 break;
207 case Opt_readdir_max_bytes:
208 fsopt->max_readdir_bytes = intval;
209 break;
210 case Opt_congestion_kb:
211 fsopt->congestion_kb = intval;
212 break;
213 case Opt_dirstat:
214 fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
215 break;
216 case Opt_nodirstat:
217 fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
218 break;
219 case Opt_rbytes:
220 fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
221 break;
222 case Opt_norbytes:
223 fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
224 break;
225 case Opt_noasyncreaddir:
226 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
227 break;
228 default:
229 BUG_ON(token);
230 }
231 return 0;
422} 232}
423 233
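
Note how the option handling is split by this patch: fsopt_tokens now lists only filesystem-level options, while fsid, name, secret, ip, osdtimeout and the rest disappear from this table because the generic parser in the shared code (ceph_parse_options(), called from parse_mount_options() below) consumes them and hands any token it does not recognize to this per-fs callback. A minimal userspace sketch of that "generic parser with an unknown-token hook" shape, using made-up option names and plain sscanf/strtok rather than the kernel's match_token machinery:

#include <stdio.h>
#include <string.h>

/* options the "library" layer understands on its own */
static int parse_generic(const char *tok, int *timeout)
{
        if (sscanf(tok, "mount_timeout=%d", timeout) == 1)
                return 0;
        return -1;                      /* not ours; defer to the fs hook */
}

/* per-filesystem hook, analogous to parse_fsopt_token() */
static int parse_fs(const char *tok, int *rsize)
{
        if (sscanf(tok, "rsize=%d", rsize) == 1)
                return 0;
        fprintf(stderr, "unknown option '%s'\n", tok);
        return -1;
}

/* analogue of ceph_parse_options(): generic options first, then the hook */
static int parse_options(char *opts, int *timeout, int *rsize)
{
        for (char *tok = strtok(opts, ","); tok; tok = strtok(NULL, ",")) {
                if (parse_generic(tok, timeout) == 0)
                        continue;
                if (parse_fs(tok, rsize) < 0)
                        return -1;
        }
        return 0;
}

int main(void)
{
        char opts[] = "mount_timeout=30,rsize=524288";
        int timeout = 60, rsize = 0;

        if (parse_options(opts, &timeout, &rsize))
                return 1;
        printf("timeout=%d rsize=%d\n", timeout, rsize);
        return 0;
}
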
424static struct ceph_mount_args *parse_mount_args(int flags, char *options, 234static void destroy_mount_options(struct ceph_mount_options *args)
425 const char *dev_name,
426 const char **path)
427{ 235{
428 struct ceph_mount_args *args; 236 dout("destroy_mount_options %p\n", args);
429 const char *c; 237 kfree(args->snapdir_name);
430 int err = -ENOMEM; 238 kfree(args);
431 substring_t argstr[MAX_OPT_ARGS]; 239}
432 240
433 args = kzalloc(sizeof(*args), GFP_KERNEL); 241static int strcmp_null(const char *s1, const char *s2)
434 if (!args) 242{
435 return ERR_PTR(-ENOMEM); 243 if (!s1 && !s2)
436 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr), 244 return 0;
437 GFP_KERNEL); 245 if (s1 && !s2)
438 if (!args->mon_addr) 246 return -1;
439 goto out; 247 if (!s1 && s2)
248 return 1;
249 return strcmp(s1, s2);
250}
440 251
441 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name); 252static int compare_mount_options(struct ceph_mount_options *new_fsopt,
442 253 struct ceph_options *new_opt,
443 /* start with defaults */ 254 struct ceph_fs_client *fsc)
444 args->sb_flags = flags; 255{
445 args->flags = CEPH_OPT_DEFAULT; 256 struct ceph_mount_options *fsopt1 = new_fsopt;
446 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; 257 struct ceph_mount_options *fsopt2 = fsc->mount_options;
447 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 258 int ofs = offsetof(struct ceph_mount_options, snapdir_name);
448 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ 259 int ret;
449 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
450 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
451 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
452 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
453 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
454 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
455 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
456 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
457 args->congestion_kb = default_congestion_kb();
458
459 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
460 err = -EINVAL;
461 if (!dev_name)
462 goto out;
463 *path = strstr(dev_name, ":/");
464 if (*path == NULL) {
465 pr_err("device name is missing path (no :/ in %s)\n",
466 dev_name);
467 goto out;
468 }
469 260
470 /* get mon ip(s) */ 261 ret = memcmp(fsopt1, fsopt2, ofs);
471 err = ceph_parse_ips(dev_name, *path, args->mon_addr, 262 if (ret)
472 CEPH_MAX_MON, &args->num_mon); 263 return ret;
473 if (err < 0) 264
474 goto out; 265 ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
266 if (ret)
267 return ret;
268
269 return ceph_compare_options(new_opt, fsc->client);
270}
271
272static int parse_mount_options(struct ceph_mount_options **pfsopt,
273 struct ceph_options **popt,
274 int flags, char *options,
275 const char *dev_name,
276 const char **path)
277{
278 struct ceph_mount_options *fsopt;
279 const char *dev_name_end;
280 int err = -ENOMEM;
281
282 fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
283 if (!fsopt)
284 return -ENOMEM;
285
286 dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
287
288 fsopt->sb_flags = flags;
289 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
290
291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
293 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
294 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
295 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
296 fsopt->congestion_kb = default_congestion_kb();
297
298 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
299 err = -EINVAL;
300 if (!dev_name)
301 goto out;
302 *path = strstr(dev_name, ":/");
303 if (*path == NULL) {
304 pr_err("device name is missing path (no :/ in %s)\n",
305 dev_name);
306 goto out;
307 }
308 dev_name_end = *path;
309 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
475 310
476 /* path on server */ 311 /* path on server */
477 *path += 2; 312 *path += 2;
478 dout("server path '%s'\n", *path); 313 dout("server path '%s'\n", *path);
479 314
480 /* parse mount options */ 315 err = ceph_parse_options(popt, options, dev_name, dev_name_end,
481 while ((c = strsep(&options, ",")) != NULL) { 316 parse_fsopt_token, (void *)fsopt);
482 int token, intval, ret; 317 if (err)
483 if (!*c) 318 goto out;
484 continue; 319
485 err = -EINVAL; 320 /* success */
486 token = match_token((char *)c, arg_tokens, argstr); 321 *pfsopt = fsopt;
487 if (token < 0) { 322 return 0;
488 pr_err("bad mount option at '%s'\n", c);
489 goto out;
490 }
491 if (token < Opt_last_int) {
492 ret = match_int(&argstr[0], &intval);
493 if (ret < 0) {
494 pr_err("bad mount option arg (not int) "
495 "at '%s'\n", c);
496 continue;
497 }
498 dout("got int token %d val %d\n", token, intval);
499 } else if (token > Opt_last_int && token < Opt_last_string) {
500 dout("got string token %d val %s\n", token,
501 argstr[0].from);
502 } else {
503 dout("got token %d\n", token);
504 }
505 switch (token) {
506 case Opt_ip:
507 err = ceph_parse_ips(argstr[0].from,
508 argstr[0].to,
509 &args->my_addr,
510 1, NULL);
511 if (err < 0)
512 goto out;
513 args->flags |= CEPH_OPT_MYIP;
514 break;
515
516 case Opt_fsid:
517 err = parse_fsid(argstr[0].from, &args->fsid);
518 if (err == 0)
519 args->flags |= CEPH_OPT_FSID;
520 break;
521 case Opt_snapdirname:
522 kfree(args->snapdir_name);
523 args->snapdir_name = kstrndup(argstr[0].from,
524 argstr[0].to-argstr[0].from,
525 GFP_KERNEL);
526 break;
527 case Opt_name:
528 args->name = kstrndup(argstr[0].from,
529 argstr[0].to-argstr[0].from,
530 GFP_KERNEL);
531 break;
532 case Opt_secret:
533 args->secret = kstrndup(argstr[0].from,
534 argstr[0].to-argstr[0].from,
535 GFP_KERNEL);
536 break;
537
538 /* misc */
539 case Opt_wsize:
540 args->wsize = intval;
541 break;
542 case Opt_rsize:
543 args->rsize = intval;
544 break;
545 case Opt_osdtimeout:
546 args->osd_timeout = intval;
547 break;
548 case Opt_osdkeepalivetimeout:
549 args->osd_keepalive_timeout = intval;
550 break;
551 case Opt_osd_idle_ttl:
552 args->osd_idle_ttl = intval;
553 break;
554 case Opt_mount_timeout:
555 args->mount_timeout = intval;
556 break;
557 case Opt_caps_wanted_delay_min:
558 args->caps_wanted_delay_min = intval;
559 break;
560 case Opt_caps_wanted_delay_max:
561 args->caps_wanted_delay_max = intval;
562 break;
563 case Opt_readdir_max_entries:
564 args->max_readdir = intval;
565 break;
566 case Opt_readdir_max_bytes:
567 args->max_readdir_bytes = intval;
568 break;
569 case Opt_congestion_kb:
570 args->congestion_kb = intval;
571 break;
572
573 case Opt_noshare:
574 args->flags |= CEPH_OPT_NOSHARE;
575 break;
576
577 case Opt_dirstat:
578 args->flags |= CEPH_OPT_DIRSTAT;
579 break;
580 case Opt_nodirstat:
581 args->flags &= ~CEPH_OPT_DIRSTAT;
582 break;
583 case Opt_rbytes:
584 args->flags |= CEPH_OPT_RBYTES;
585 break;
586 case Opt_norbytes:
587 args->flags &= ~CEPH_OPT_RBYTES;
588 break;
589 case Opt_nocrc:
590 args->flags |= CEPH_OPT_NOCRC;
591 break;
592 case Opt_noasyncreaddir:
593 args->flags |= CEPH_OPT_NOASYNCREADDIR;
594 break;
595
596 default:
597 BUG_ON(token);
598 }
599 }
600 return args;
601 323
602out: 324out:
603 kfree(args->mon_addr); 325 destroy_mount_options(fsopt);
604 kfree(args); 326 return err;
605 return ERR_PTR(err);
606} 327}
607 328
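
The device-name handling keeps the old "ip1[:port1][,ip2[:port2]...]:/subdir/in/fs" convention: the string is split at the first ":/", the monitor list before it now goes to ceph_parse_options() instead of a local ceph_parse_ips() call, and the remainder is the path on the server. A runnable userspace sketch of just the split, using strstr the same way the kernel code does:

#include <stdio.h>
#include <string.h>

/* split "monlist:/path" into its two halves, as parse_mount_options() does */
static int split_dev_name(const char *dev_name,
                          const char **dev_name_end, const char **path)
{
        *path = strstr(dev_name, ":/");
        if (*path == NULL) {
                fprintf(stderr, "device name is missing path (no :/ in %s)\n",
                        dev_name);
                return -1;
        }
        *dev_name_end = *path;          /* monitor list is everything before ":/" */
        *path += 2;                     /* server-side path starts after ":/" */
        return 0;
}

int main(void)
{
        const char *dev = "1.2.3.4:6789,1.2.3.5:6789:/some/subdir";
        const char *end, *path;

        if (split_dev_name(dev, &end, &path))
                return 1;
        printf("monitors: '%.*s'\n", (int)(end - dev), dev);
        printf("path:     '%s'\n", path);
        return 0;
}
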
608static void destroy_mount_args(struct ceph_mount_args *args) 329/**
330 * ceph_show_options - Show mount options in /proc/mounts
331 * @m: seq_file to write to
332 * @mnt: mount descriptor
333 */
334static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
609{ 335{
610 dout("destroy_mount_args %p\n", args); 336 struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
611 kfree(args->snapdir_name); 337 struct ceph_mount_options *fsopt = fsc->mount_options;
612 args->snapdir_name = NULL; 338 struct ceph_options *opt = fsc->client->options;
613 kfree(args->name); 339
614 args->name = NULL; 340 if (opt->flags & CEPH_OPT_FSID)
615 kfree(args->secret); 341 seq_printf(m, ",fsid=%pU", &opt->fsid);
616 args->secret = NULL; 342 if (opt->flags & CEPH_OPT_NOSHARE)
617 kfree(args); 343 seq_puts(m, ",noshare");
344 if (opt->flags & CEPH_OPT_NOCRC)
345 seq_puts(m, ",nocrc");
346
347 if (opt->name)
348 seq_printf(m, ",name=%s", opt->name);
349 if (opt->secret)
350 seq_puts(m, ",secret=<hidden>");
351
352 if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
353 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
354 if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
355 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
356 if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
357 seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
358 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
359 seq_printf(m, ",osdkeepalivetimeout=%d",
360 opt->osd_keepalive_timeout);
361
362 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
363 seq_puts(m, ",dirstat");
364 if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
365 seq_puts(m, ",norbytes");
366 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
367 seq_puts(m, ",noasyncreaddir");
368
369 if (fsopt->wsize)
370 seq_printf(m, ",wsize=%d", fsopt->wsize);
371 if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
372 seq_printf(m, ",rsize=%d", fsopt->rsize);
373 if (fsopt->congestion_kb != default_congestion_kb())
374 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
375 if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
376 seq_printf(m, ",caps_wanted_delay_min=%d",
377 fsopt->caps_wanted_delay_min);
378 if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
379 seq_printf(m, ",caps_wanted_delay_max=%d",
380 fsopt->caps_wanted_delay_max);
381 if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
382 seq_printf(m, ",cap_release_safety=%d",
383 fsopt->cap_release_safety);
384 if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
385 seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
386 if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
387 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
388 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
389 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
390 return 0;
618} 391}
619 392
620/* 393/*
621 * create a fresh client instance 394 * handle any mon messages the standard library doesn't understand.
395 * return error if we don't either.
622 */ 396 */
623static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) 397static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
624{ 398{
625 struct ceph_client *client; 399 struct ceph_fs_client *fsc = client->private;
400 int type = le16_to_cpu(msg->hdr.type);
401
402 switch (type) {
403 case CEPH_MSG_MDS_MAP:
404 ceph_mdsc_handle_map(fsc->mdsc, msg);
405 return 0;
406
407 default:
408 return -1;
409 }
410}
411
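
extra_mon_dispatch() is the other half of the library split: the fs client registers itself as the ceph_client's private data (see create_fs_client() below), the shared monitor code dispatches the message types it knows about, and anything else, here CEPH_MSG_MDS_MAP, is offered to this hook, with -1 meaning "not mine either". A small userspace sketch of that opaque-private-pointer-plus-fallback-hook shape, with invented type numbers:

#include <stdio.h>

/* "library" side: carries only an opaque pointer back to its user */
struct lib_client {
        void *private;
        int (*extra_dispatch)(struct lib_client *c, int type);
};

/* "fs" side */
struct fs_client {
        struct lib_client *client;
        int mds_maps_seen;
};

/* analogue of extra_mon_dispatch(): -1 means "not mine either" */
static int fs_extra_dispatch(struct lib_client *c, int type)
{
        struct fs_client *fsc = c->private;     /* recover the fs client */

        if (type != 21)                 /* pretend 21 is CEPH_MSG_MDS_MAP */
                return -1;
        fsc->mds_maps_seen++;
        return 0;
}

/* analogue of the core dispatcher: known types first, then the hook */
static void dispatch(struct lib_client *c, int type)
{
        if (type == 4) {                /* pretend 4 is CEPH_MSG_MON_MAP */
                printf("core handled mon map\n");
                return;
        }
        if (!c->extra_dispatch || c->extra_dispatch(c, type) < 0)
                printf("dropping unknown msg type %d\n", type);
}

int main(void)
{
        struct fs_client fsc = { .mds_maps_seen = 0 };
        struct lib_client client = { .private = &fsc,
                                     .extra_dispatch = fs_extra_dispatch };

        fsc.client = &client;
        dispatch(&client, 4);
        dispatch(&client, 21);
        dispatch(&client, 99);
        printf("mds maps seen by fs layer: %d\n", fsc.mds_maps_seen);
        return 0;
}
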
412/*
413 * create a new fs client
414 */
415struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
416 struct ceph_options *opt)
417{
418 struct ceph_fs_client *fsc;
626 int err = -ENOMEM; 419 int err = -ENOMEM;
627 420
628 client = kzalloc(sizeof(*client), GFP_KERNEL); 421 fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
629 if (client == NULL) 422 if (!fsc)
630 return ERR_PTR(-ENOMEM); 423 return ERR_PTR(-ENOMEM);
631 424
632 mutex_init(&client->mount_mutex); 425 fsc->client = ceph_create_client(opt, fsc);
633 426 if (IS_ERR(fsc->client)) {
634 init_waitqueue_head(&client->auth_wq); 427 err = PTR_ERR(fsc->client);
428 goto fail;
429 }
430 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
431 fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
432 fsc->client->monc.want_mdsmap = 1;
635 433
636 client->sb = NULL; 434 fsc->mount_options = fsopt;
637 client->mount_state = CEPH_MOUNT_MOUNTING;
638 client->mount_args = args;
639 435
640 client->msgr = NULL; 436 fsc->sb = NULL;
437 fsc->mount_state = CEPH_MOUNT_MOUNTING;
641 438
642 client->auth_err = 0; 439 atomic_long_set(&fsc->writeback_count, 0);
643 atomic_long_set(&client->writeback_count, 0);
644 440
645 err = bdi_init(&client->backing_dev_info); 441 err = bdi_init(&fsc->backing_dev_info);
646 if (err < 0) 442 if (err < 0)
647 goto fail; 443 goto fail_client;
648 444
649 err = -ENOMEM; 445 err = -ENOMEM;
650 client->wb_wq = create_workqueue("ceph-writeback"); 446 fsc->wb_wq = create_workqueue("ceph-writeback");
651 if (client->wb_wq == NULL) 447 if (fsc->wb_wq == NULL)
652 goto fail_bdi; 448 goto fail_bdi;
653 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); 449 fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
654 if (client->pg_inv_wq == NULL) 450 if (fsc->pg_inv_wq == NULL)
655 goto fail_wb_wq; 451 goto fail_wb_wq;
656 client->trunc_wq = create_singlethread_workqueue("ceph-trunc"); 452 fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
657 if (client->trunc_wq == NULL) 453 if (fsc->trunc_wq == NULL)
658 goto fail_pg_inv_wq; 454 goto fail_pg_inv_wq;
659 455
660 /* set up mempools */ 456 /* set up mempools */
661 err = -ENOMEM; 457 err = -ENOMEM;
662 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, 458 fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
663 client->mount_args->wsize >> PAGE_CACHE_SHIFT); 459 fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
664 if (!client->wb_pagevec_pool) 460 if (!fsc->wb_pagevec_pool)
665 goto fail_trunc_wq; 461 goto fail_trunc_wq;
666 462
667 /* caps */ 463 /* caps */
668 client->min_caps = args->max_readdir; 464 fsc->min_caps = fsopt->max_readdir;
465
466 return fsc;
669 467
670 /* subsystems */
671 err = ceph_monc_init(&client->monc, client);
672 if (err < 0)
673 goto fail_mempool;
674 err = ceph_osdc_init(&client->osdc, client);
675 if (err < 0)
676 goto fail_monc;
677 err = ceph_mdsc_init(&client->mdsc, client);
678 if (err < 0)
679 goto fail_osdc;
680 return client;
681
682fail_osdc:
683 ceph_osdc_stop(&client->osdc);
684fail_monc:
685 ceph_monc_stop(&client->monc);
686fail_mempool:
687 mempool_destroy(client->wb_pagevec_pool);
688fail_trunc_wq: 468fail_trunc_wq:
689 destroy_workqueue(client->trunc_wq); 469 destroy_workqueue(fsc->trunc_wq);
690fail_pg_inv_wq: 470fail_pg_inv_wq:
691 destroy_workqueue(client->pg_inv_wq); 471 destroy_workqueue(fsc->pg_inv_wq);
692fail_wb_wq: 472fail_wb_wq:
693 destroy_workqueue(client->wb_wq); 473 destroy_workqueue(fsc->wb_wq);
694fail_bdi: 474fail_bdi:
695 bdi_destroy(&client->backing_dev_info); 475 bdi_destroy(&fsc->backing_dev_info);
476fail_client:
477 ceph_destroy_client(fsc->client);
696fail: 478fail:
697 kfree(client); 479 kfree(fsc);
698 return ERR_PTR(err); 480 return ERR_PTR(err);
699} 481}
700 482
701static void ceph_destroy_client(struct ceph_client *client) 483void destroy_fs_client(struct ceph_fs_client *fsc)
702{ 484{
703 dout("destroy_client %p\n", client); 485 dout("destroy_fs_client %p\n", fsc);
704 486
705 /* unmount */ 487 destroy_workqueue(fsc->wb_wq);
706 ceph_mdsc_stop(&client->mdsc); 488 destroy_workqueue(fsc->pg_inv_wq);
707 ceph_osdc_stop(&client->osdc); 489 destroy_workqueue(fsc->trunc_wq);
708 490
709 /* 491 bdi_destroy(&fsc->backing_dev_info);
710 * make sure mds and osd connections close out before destroying
711 * the auth module, which is needed to free those connections'
712 * ceph_authorizers.
713 */
714 ceph_msgr_flush();
715
716 ceph_monc_stop(&client->monc);
717 492
718 ceph_debugfs_client_cleanup(client); 493 mempool_destroy(fsc->wb_pagevec_pool);
719 destroy_workqueue(client->wb_wq);
720 destroy_workqueue(client->pg_inv_wq);
721 destroy_workqueue(client->trunc_wq);
722 494
723 bdi_destroy(&client->backing_dev_info); 495 destroy_mount_options(fsc->mount_options);
724 496
725 if (client->msgr) 497 ceph_fs_debugfs_cleanup(fsc);
726 ceph_messenger_destroy(client->msgr);
727 mempool_destroy(client->wb_pagevec_pool);
728 498
729 destroy_mount_args(client->mount_args); 499 ceph_destroy_client(fsc->client);
730 500
731 kfree(client); 501 kfree(fsc);
732 dout("destroy_client %p done\n", client); 502 dout("destroy_fs_client %p done\n", fsc);
733} 503}
734 504
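
create_fs_client() follows the usual acquire-in-order, unwind-in-reverse error handling, with the new fail_client label making sure the freshly created ceph_client is destroyed if any later step fails, and destroy_fs_client() releases the same resources in roughly the reverse order of creation. A generic userspace sketch of that goto-unwinding shape, with resource names invented for the demo:

#include <stdio.h>
#include <stdlib.h>

/* simulated resource acquisition that can be forced to fail */
static void *acquire(const char *name, int ok)
{
        if (!ok) {
                printf("  %s: failed\n", name);
                return NULL;
        }
        printf("  %s: ok\n", name);
        return malloc(1);
}

/*
 * Acquire in order; on failure jump to a label that releases only what was
 * already acquired, newest first.
 */
static int setup(int fail_step)
{
        void *client, *wq, *pool;

        client = acquire("client", fail_step != 1);
        if (!client)
                goto fail;
        wq = acquire("workqueue", fail_step != 2);
        if (!wq)
                goto fail_client;
        pool = acquire("mempool", fail_step != 3);
        if (!pool)
                goto fail_wq;

        /* success: release in reverse order, mirroring destroy_fs_client() */
        free(pool);
        free(wq);
        free(client);
        return 0;

fail_wq:
        free(wq);
fail_client:
        free(client);
fail:
        return -1;
}

int main(void)
{
        for (int step = 0; step <= 3; step++) {
                printf("fail_step=%d:\n", step);
                setup(step);
        }
        return 0;
}
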
735/* 505/*
736 * Initially learn our fsid, or verify an fsid matches. 506 * caches
737 */ 507 */
738int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) 508struct kmem_cache *ceph_inode_cachep;
509struct kmem_cache *ceph_cap_cachep;
510struct kmem_cache *ceph_dentry_cachep;
511struct kmem_cache *ceph_file_cachep;
512
513static void ceph_inode_init_once(void *foo)
739{ 514{
740 if (client->have_fsid) { 515 struct ceph_inode_info *ci = foo;
741 if (ceph_fsid_compare(&client->fsid, fsid)) { 516 inode_init_once(&ci->vfs_inode);
742 pr_err("bad fsid, had %pU got %pU", 517}
743 &client->fsid, fsid); 518
744 return -1; 519static int __init init_caches(void)
745 } 520{
746 } else { 521 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
747 pr_info("client%lld fsid %pU\n", client->monc.auth->global_id, 522 sizeof(struct ceph_inode_info),
748 fsid); 523 __alignof__(struct ceph_inode_info),
749 memcpy(&client->fsid, fsid, sizeof(*fsid)); 524 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
750 ceph_debugfs_client_init(client); 525 ceph_inode_init_once);
751 client->have_fsid = true; 526 if (ceph_inode_cachep == NULL)
752 } 527 return -ENOMEM;
528
529 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
530 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
531 if (ceph_cap_cachep == NULL)
532 goto bad_cap;
533
534 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
535 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
536 if (ceph_dentry_cachep == NULL)
537 goto bad_dentry;
538
539 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
540 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
541 if (ceph_file_cachep == NULL)
542 goto bad_file;
543
753 return 0; 544 return 0;
545
546bad_file:
547 kmem_cache_destroy(ceph_dentry_cachep);
548bad_dentry:
549 kmem_cache_destroy(ceph_cap_cachep);
550bad_cap:
551 kmem_cache_destroy(ceph_inode_cachep);
552 return -ENOMEM;
754} 553}
755 554
555static void destroy_caches(void)
556{
557 kmem_cache_destroy(ceph_inode_cachep);
558 kmem_cache_destroy(ceph_cap_cachep);
559 kmem_cache_destroy(ceph_dentry_cachep);
560 kmem_cache_destroy(ceph_file_cachep);
561}
562
563
756/* 564/*
757 * true if we have the mon map (and have thus joined the cluster) 565 * ceph_umount_begin - initiate forced umount. Tear down down the
566 * mount, skipping steps that may hang while waiting for server(s).
758 */ 567 */
759static int have_mon_and_osd_map(struct ceph_client *client) 568static void ceph_umount_begin(struct super_block *sb)
760{ 569{
761 return client->monc.monmap && client->monc.monmap->epoch && 570 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
762 client->osdc.osdmap && client->osdc.osdmap->epoch; 571
572 dout("ceph_umount_begin - starting forced umount\n");
573 if (!fsc)
574 return;
575 fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
576 return;
763} 577}
764 578
579static const struct super_operations ceph_super_ops = {
580 .alloc_inode = ceph_alloc_inode,
581 .destroy_inode = ceph_destroy_inode,
582 .write_inode = ceph_write_inode,
583 .sync_fs = ceph_sync_fs,
584 .put_super = ceph_put_super,
585 .show_options = ceph_show_options,
586 .statfs = ceph_statfs,
587 .umount_begin = ceph_umount_begin,
588};
589
765/* 590/*
766 * Bootstrap mount by opening the root directory. Note the mount 591 * Bootstrap mount by opening the root directory. Note the mount
767 * @started time from caller, and time out if this takes too long. 592 * @started time from caller, and time out if this takes too long.
768 */ 593 */
769static struct dentry *open_root_dentry(struct ceph_client *client, 594static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
770 const char *path, 595 const char *path,
771 unsigned long started) 596 unsigned long started)
772{ 597{
773 struct ceph_mds_client *mdsc = &client->mdsc; 598 struct ceph_mds_client *mdsc = fsc->mdsc;
774 struct ceph_mds_request *req = NULL; 599 struct ceph_mds_request *req = NULL;
775 int err; 600 int err;
776 struct dentry *root; 601 struct dentry *root;
@@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
784 req->r_ino1.ino = CEPH_INO_ROOT; 609 req->r_ino1.ino = CEPH_INO_ROOT;
785 req->r_ino1.snap = CEPH_NOSNAP; 610 req->r_ino1.snap = CEPH_NOSNAP;
786 req->r_started = started; 611 req->r_started = started;
787 req->r_timeout = client->mount_args->mount_timeout * HZ; 612 req->r_timeout = fsc->client->options->mount_timeout * HZ;
788 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 613 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
789 req->r_num_caps = 2; 614 req->r_num_caps = 2;
790 err = ceph_mdsc_do_request(mdsc, NULL, req); 615 err = ceph_mdsc_do_request(mdsc, NULL, req);
791 if (err == 0) { 616 if (err == 0) {
792 dout("open_root_inode success\n"); 617 dout("open_root_inode success\n");
793 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && 618 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
794 client->sb->s_root == NULL) 619 fsc->sb->s_root == NULL)
795 root = d_alloc_root(req->r_target_inode); 620 root = d_alloc_root(req->r_target_inode);
796 else 621 else
797 root = d_obtain_alias(req->r_target_inode); 622 root = d_obtain_alias(req->r_target_inode);
@@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
804 return root; 629 return root;
805} 630}
806 631
632
633
634
807/* 635/*
808 * mount: join the ceph cluster, and open root directory. 636 * mount: join the ceph cluster, and open root directory.
809 */ 637 */
810static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, 638static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
811 const char *path) 639 const char *path)
812{ 640{
813 struct ceph_entity_addr *myaddr = NULL;
814 int err; 641 int err;
815 unsigned long timeout = client->mount_args->mount_timeout * HZ;
816 unsigned long started = jiffies; /* note the start time */ 642 unsigned long started = jiffies; /* note the start time */
817 struct dentry *root; 643 struct dentry *root;
644 int first = 0; /* first vfsmount for this super_block */
818 645
819 dout("mount start\n"); 646 dout("mount start\n");
820 mutex_lock(&client->mount_mutex); 647 mutex_lock(&fsc->client->mount_mutex);
821
822 /* initialize the messenger */
823 if (client->msgr == NULL) {
824 if (ceph_test_opt(client, MYIP))
825 myaddr = &client->mount_args->my_addr;
826 client->msgr = ceph_messenger_create(myaddr);
827 if (IS_ERR(client->msgr)) {
828 err = PTR_ERR(client->msgr);
829 client->msgr = NULL;
830 goto out;
831 }
832 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
833 }
834 648
835 /* open session, and wait for mon, mds, and osd maps */ 649 err = __ceph_open_session(fsc->client, started);
836 err = ceph_monc_open_session(&client->monc);
837 if (err < 0) 650 if (err < 0)
838 goto out; 651 goto out;
839 652
840 while (!have_mon_and_osd_map(client)) {
841 err = -EIO;
842 if (timeout && time_after_eq(jiffies, started + timeout))
843 goto out;
844
845 /* wait */
846 dout("mount waiting for mon_map\n");
847 err = wait_event_interruptible_timeout(client->auth_wq,
848 have_mon_and_osd_map(client) || (client->auth_err < 0),
849 timeout);
850 if (err == -EINTR || err == -ERESTARTSYS)
851 goto out;
852 if (client->auth_err < 0) {
853 err = client->auth_err;
854 goto out;
855 }
856 }
857
858 dout("mount opening root\n"); 653 dout("mount opening root\n");
859 root = open_root_dentry(client, "", started); 654 root = open_root_dentry(fsc, "", started);
860 if (IS_ERR(root)) { 655 if (IS_ERR(root)) {
861 err = PTR_ERR(root); 656 err = PTR_ERR(root);
862 goto out; 657 goto out;
863 } 658 }
864 if (client->sb->s_root) 659 if (fsc->sb->s_root) {
865 dput(root); 660 dput(root);
866 else 661 } else {
867 client->sb->s_root = root; 662 fsc->sb->s_root = root;
663 first = 1;
664
665 err = ceph_fs_debugfs_init(fsc);
666 if (err < 0)
667 goto fail;
668 }
868 669
869 if (path[0] == 0) { 670 if (path[0] == 0) {
870 dget(root); 671 dget(root);
871 } else { 672 } else {
872 dout("mount opening base mountpoint\n"); 673 dout("mount opening base mountpoint\n");
873 root = open_root_dentry(client, path, started); 674 root = open_root_dentry(fsc, path, started);
874 if (IS_ERR(root)) { 675 if (IS_ERR(root)) {
875 err = PTR_ERR(root); 676 err = PTR_ERR(root);
876 dput(client->sb->s_root); 677 goto fail;
877 client->sb->s_root = NULL;
878 goto out;
879 } 678 }
880 } 679 }
881 680
882 mnt->mnt_root = root; 681 mnt->mnt_root = root;
883 mnt->mnt_sb = client->sb; 682 mnt->mnt_sb = fsc->sb;
884 683
885 client->mount_state = CEPH_MOUNT_MOUNTED; 684 fsc->mount_state = CEPH_MOUNT_MOUNTED;
886 dout("mount success\n"); 685 dout("mount success\n");
887 err = 0; 686 err = 0;
888 687
889out: 688out:
890 mutex_unlock(&client->mount_mutex); 689 mutex_unlock(&fsc->client->mount_mutex);
891 return err; 690 return err;
691
692fail:
693 if (first) {
694 dput(fsc->sb->s_root);
695 fsc->sb->s_root = NULL;
696 }
697 goto out;
892} 698}
893 699
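
The rewritten ceph_mount() replaces the old hand-rolled wait for the mon and osd maps with a single __ceph_open_session() call, and introduces a 'first' flag so that the error path only drops sb->s_root when this particular mount created it (per-fs debugfs is likewise only initialized on the first mount of the super_block). A userspace sketch of that "undo only what you created" pattern, with a made-up shared resource:

#include <stdio.h>
#include <stdlib.h>

static char *shared_root;               /* stands in for fsc->sb->s_root */

static int do_mount(const char *path, int simulate_failure)
{
        int first = 0;

        if (!shared_root) {
                shared_root = malloc(16);
                if (!shared_root)
                        return -1;
                first = 1;              /* we created it; we own error cleanup */
        }

        if (simulate_failure)
                goto fail;

        printf("mounted '%s' (first=%d)\n", path, first);
        return 0;

fail:
        if (first) {
                free(shared_root);
                shared_root = NULL;
        }
        return -1;
}

int main(void)
{
        do_mount("/a", 0);              /* creates the shared root */
        do_mount("/b", 1);              /* fails, must not free someone else's root */
        printf("shared_root still present: %s\n", shared_root ? "yes" : "no");
        free(shared_root);
        return 0;
}
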
894static int ceph_set_super(struct super_block *s, void *data) 700static int ceph_set_super(struct super_block *s, void *data)
895{ 701{
896 struct ceph_client *client = data; 702 struct ceph_fs_client *fsc = data;
897 int ret; 703 int ret;
898 704
899 dout("set_super %p data %p\n", s, data); 705 dout("set_super %p data %p\n", s, data);
900 706
901 s->s_flags = client->mount_args->sb_flags; 707 s->s_flags = fsc->mount_options->sb_flags;
902 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ 708 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
903 709
904 s->s_fs_info = client; 710 s->s_fs_info = fsc;
905 client->sb = s; 711 fsc->sb = s;
906 712
907 s->s_op = &ceph_super_ops; 713 s->s_op = &ceph_super_ops;
908 s->s_export_op = &ceph_export_ops; 714 s->s_export_op = &ceph_export_ops;
@@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data)
917 723
918fail: 724fail:
919 s->s_fs_info = NULL; 725 s->s_fs_info = NULL;
920 client->sb = NULL; 726 fsc->sb = NULL;
921 return ret; 727 return ret;
922} 728}
923 729
@@ -926,30 +732,23 @@ fail:
926 */ 732 */
927static int ceph_compare_super(struct super_block *sb, void *data) 733static int ceph_compare_super(struct super_block *sb, void *data)
928{ 734{
929 struct ceph_client *new = data; 735 struct ceph_fs_client *new = data;
930 struct ceph_mount_args *args = new->mount_args; 736 struct ceph_mount_options *fsopt = new->mount_options;
931 struct ceph_client *other = ceph_sb_to_client(sb); 737 struct ceph_options *opt = new->client->options;
932 int i; 738 struct ceph_fs_client *other = ceph_sb_to_client(sb);
933 739
934 dout("ceph_compare_super %p\n", sb); 740 dout("ceph_compare_super %p\n", sb);
935 if (args->flags & CEPH_OPT_FSID) { 741
936 if (ceph_fsid_compare(&args->fsid, &other->fsid)) { 742 if (compare_mount_options(fsopt, opt, other)) {
937 dout("fsid doesn't match\n"); 743 dout("monitor(s)/mount options don't match\n");
938 return 0; 744 return 0;
939 }
940 } else {
941 /* do we share (a) monitor? */
942 for (i = 0; i < new->monc.monmap->num_mon; i++)
943 if (ceph_monmap_contains(other->monc.monmap,
944 &new->monc.monmap->mon_inst[i].addr))
945 break;
946 if (i == new->monc.monmap->num_mon) {
947 dout("mon ip not part of monmap\n");
948 return 0;
949 }
950 dout("mon ip matches existing sb %p\n", sb);
951 } 745 }
952 if (args->sb_flags != other->mount_args->sb_flags) { 746 if ((opt->flags & CEPH_OPT_FSID) &&
747 ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
748 dout("fsid doesn't match\n");
749 return 0;
750 }
751 if (fsopt->sb_flags != other->mount_options->sb_flags) {
953 dout("flags differ\n"); 752 dout("flags differ\n");
954 return 0; 753 return 0;
955 } 754 }
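
ceph_compare_super() now leans on compare_mount_options(), which is not part of this hunk; judging by the dout() text it appears to cover the monitor list as well as the filesystem options, and by the struct layout later in this patch the option fields split into a memcmp-able prefix followed by strings. A hedged, hypothetical sketch of just the option half, using a made-up struct rather than the real one:

    #include <stddef.h>
    #include <string.h>

    struct toy_mount_options {
        int flags;
        int sb_flags;
        int wsize;
        int rsize;
        /* memcmp-able region ends here; strings below need strcmp */
        char *snapdir_name;
    };

    /* Returns non-zero on mismatch, matching how ceph_compare_super() uses it. */
    static int toy_compare_mount_options(const struct toy_mount_options *a,
                                         const struct toy_mount_options *b)
    {
        size_t n = offsetof(struct toy_mount_options, snapdir_name);

        if (memcmp(a, b, n))
            return 1;
        return strcmp(a->snapdir_name, b->snapdir_name) != 0;
    }

    int main(void)
    {
        struct toy_mount_options a = { 0, 0, 65536, 524288, ".snap" };
        struct toy_mount_options b = a;

        return toy_compare_mount_options(&a, &b);   /* 0: they match */
    }
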
@@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data)
961 */ 760 */
962static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 761static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
963 762
964static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 763static int ceph_register_bdi(struct super_block *sb,
764 struct ceph_fs_client *fsc)
965{ 765{
966 int err; 766 int err;
967 767
968 /* set ra_pages based on rsize mount option? */ 768 /* set ra_pages based on rsize mount option? */
969 if (client->mount_args->rsize >= PAGE_CACHE_SIZE) 769 if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
970 client->backing_dev_info.ra_pages = 770 fsc->backing_dev_info.ra_pages =
971 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 771 (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
972 >> PAGE_SHIFT; 772 >> PAGE_SHIFT;
973 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", 773 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
974 atomic_long_inc_return(&bdi_seq)); 774 atomic_long_inc_return(&bdi_seq));
975 if (!err) 775 if (!err)
976 sb->s_bdi = &client->backing_dev_info; 776 sb->s_bdi = &fsc->backing_dev_info;
977 return err; 777 return err;
978} 778}
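
A quick worked example of the rsize-to-ra_pages conversion above, assuming 4 KB pages (PAGE_CACHE_SIZE == 4096, PAGE_SHIFT == 12) and the 512 KB readahead default that the old super.h used:

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_size = 4096, page_shift = 12;    /* assumed 4 KB pages */
        unsigned long rsize = 512 * 1024;                   /* old CEPH_MOUNT_RSIZE_DEFAULT */
        unsigned long ra_pages = (rsize + page_size - 1) >> page_shift;

        printf("rsize=%lu -> ra_pages=%lu\n", rsize, ra_pages);   /* prints 128 */
        return 0;
    }
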
979 779
@@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type,
982 struct vfsmount *mnt) 782 struct vfsmount *mnt)
983{ 783{
984 struct super_block *sb; 784 struct super_block *sb;
985 struct ceph_client *client; 785 struct ceph_fs_client *fsc;
986 int err; 786 int err;
987 int (*compare_super)(struct super_block *, void *) = ceph_compare_super; 787 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
988 const char *path = NULL; 788 const char *path = NULL;
989 struct ceph_mount_args *args; 789 struct ceph_mount_options *fsopt = NULL;
790 struct ceph_options *opt = NULL;
990 791
991 dout("ceph_get_sb\n"); 792 dout("ceph_get_sb\n");
992 args = parse_mount_args(flags, data, dev_name, &path); 793 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
993 if (IS_ERR(args)) { 794 if (err < 0)
994 err = PTR_ERR(args);
995 goto out_final; 795 goto out_final;
996 }
997 796
998 /* create client (which we may/may not use) */ 797 /* create client (which we may/may not use) */
999 client = ceph_create_client(args); 798 fsc = create_fs_client(fsopt, opt);
1000 if (IS_ERR(client)) { 799 if (IS_ERR(fsc)) {
1001 err = PTR_ERR(client); 800 err = PTR_ERR(fsc);
801 kfree(fsopt);
802 kfree(opt);
1002 goto out_final; 803 goto out_final;
1003 } 804 }
1004 805
1005 if (client->mount_args->flags & CEPH_OPT_NOSHARE) 806 err = ceph_mdsc_init(fsc);
807 if (err < 0)
808 goto out;
809
810 if (ceph_test_opt(fsc->client, NOSHARE))
1006 compare_super = NULL; 811 compare_super = NULL;
1007 sb = sget(fs_type, compare_super, ceph_set_super, client); 812 sb = sget(fs_type, compare_super, ceph_set_super, fsc);
1008 if (IS_ERR(sb)) { 813 if (IS_ERR(sb)) {
1009 err = PTR_ERR(sb); 814 err = PTR_ERR(sb);
1010 goto out; 815 goto out;
1011 } 816 }
1012 817
1013 if (ceph_sb_to_client(sb) != client) { 818 if (ceph_sb_to_client(sb) != fsc) {
1014 ceph_destroy_client(client); 819 ceph_mdsc_destroy(fsc);
1015 client = ceph_sb_to_client(sb); 820 destroy_fs_client(fsc);
1016 dout("get_sb got existing client %p\n", client); 821 fsc = ceph_sb_to_client(sb);
822 dout("get_sb got existing client %p\n", fsc);
1017 } else { 823 } else {
1018 dout("get_sb using new client %p\n", client); 824 dout("get_sb using new client %p\n", fsc);
1019 err = ceph_register_bdi(sb, client); 825 err = ceph_register_bdi(sb, fsc);
1020 if (err < 0) 826 if (err < 0)
1021 goto out_splat; 827 goto out_splat;
1022 } 828 }
1023 829
1024 err = ceph_mount(client, mnt, path); 830 err = ceph_mount(fsc, mnt, path);
1025 if (err < 0) 831 if (err < 0)
1026 goto out_splat; 832 goto out_splat;
1027 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, 833 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
@@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type,
1029 return 0; 835 return 0;
1030 836
1031out_splat: 837out_splat:
1032 ceph_mdsc_close_sessions(&client->mdsc); 838 ceph_mdsc_close_sessions(fsc->mdsc);
1033 deactivate_locked_super(sb); 839 deactivate_locked_super(sb);
1034 goto out_final; 840 goto out_final;
1035 841
1036out: 842out:
1037 ceph_destroy_client(client); 843 ceph_mdsc_destroy(fsc);
844 destroy_fs_client(fsc);
1038out_final: 845out_final:
1039 dout("ceph_get_sb fail %d\n", err); 846 dout("ceph_get_sb fail %d\n", err);
1040 return err; 847 return err;
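
The sget() handling above is speculate-then-reconcile: a ceph_fs_client is created up front, and if sget() returns an already-mounted superblock the freshly created client is destroyed in favour of the one attached to that sb. A toy userspace model of just that decision (types and names invented, not the kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_client { int id; };
    struct toy_sb { struct toy_client *client; };

    static struct toy_client *toy_attach(struct toy_sb *sb, struct toy_client *newc)
    {
        if (sb->client) {           /* an existing sb matched: reuse its client */
            free(newc);             /* destroy_fs_client() equivalent */
            return sb->client;
        }
        sb->client = newc;          /* fresh sb: keep the client we created */
        return newc;
    }

    int main(void)
    {
        struct toy_sb sb = { NULL };
        struct toy_client *a = calloc(1, sizeof(*a));
        struct toy_client *b = calloc(1, sizeof(*b));

        printf("first mount keeps its client:  %d\n", toy_attach(&sb, a) == a);
        printf("second mount reuses the first: %d\n", toy_attach(&sb, b) == a);
        free(a);
        return 0;
    }
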
@@ -1042,11 +849,12 @@ out_final:
1042 849
1043static void ceph_kill_sb(struct super_block *s) 850static void ceph_kill_sb(struct super_block *s)
1044{ 851{
1045 struct ceph_client *client = ceph_sb_to_client(s); 852 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
1046 dout("kill_sb %p\n", s); 853 dout("kill_sb %p\n", s);
1047 ceph_mdsc_pre_umount(&client->mdsc); 854 ceph_mdsc_pre_umount(fsc->mdsc);
1048 kill_anon_super(s); /* will call put_super after sb is r/o */ 855 kill_anon_super(s); /* will call put_super after sb is r/o */
1049 ceph_destroy_client(client); 856 ceph_mdsc_destroy(fsc);
857 destroy_fs_client(fsc);
1050} 858}
1051 859
1052static struct file_system_type ceph_fs_type = { 860static struct file_system_type ceph_fs_type = {
@@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = {
1062 870
1063static int __init init_ceph(void) 871static int __init init_ceph(void)
1064{ 872{
1065 int ret = 0; 873 int ret = init_caches();
1066
1067 ret = ceph_debugfs_init();
1068 if (ret < 0)
1069 goto out;
1070
1071 ret = ceph_msgr_init();
1072 if (ret < 0)
1073 goto out_debugfs;
1074
1075 ret = init_caches();
1076 if (ret) 874 if (ret)
1077 goto out_msgr; 875 goto out;
1078 876
1079 ret = register_filesystem(&ceph_fs_type); 877 ret = register_filesystem(&ceph_fs_type);
1080 if (ret) 878 if (ret)
1081 goto out_icache; 879 goto out_icache;
1082 880
1083 pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", 881 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1084 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, 882
1085 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
1086 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
1087 return 0; 883 return 0;
1088 884
1089out_icache: 885out_icache:
1090 destroy_caches(); 886 destroy_caches();
1091out_msgr:
1092 ceph_msgr_exit();
1093out_debugfs:
1094 ceph_debugfs_cleanup();
1095out: 887out:
1096 return ret; 888 return ret;
1097} 889}
@@ -1101,8 +893,6 @@ static void __exit exit_ceph(void)
1101 dout("exit_ceph\n"); 893 dout("exit_ceph\n");
1102 unregister_filesystem(&ceph_fs_type); 894 unregister_filesystem(&ceph_fs_type);
1103 destroy_caches(); 895 destroy_caches();
1104 ceph_msgr_exit();
1105 ceph_debugfs_cleanup();
1106} 896}
1107 897
1108module_init(init_ceph); 898module_init(init_ceph);
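
With the messenger and debugfs bring-up now owned by libceph, init_ceph() reduces to a two-step init with the usual goto unwinding in reverse order. The same shape as a self-contained sketch, with stubbed steps and a simulated failure:

    #include <stdio.h>

    static int init_caches_stub(void)     { puts("caches up");   return 0; }
    static void destroy_caches_stub(void) { puts("caches down"); }
    static int register_fs_stub(void)     { puts("register_filesystem failed (simulated)"); return -1; }

    static int toy_init(void)
    {
        int ret = init_caches_stub();

        if (ret)
            goto out;

        ret = register_fs_stub();
        if (ret)
            goto out_caches;        /* unwind in reverse order of setup */

        return 0;

    out_caches:
        destroy_caches_stub();
    out:
        return ret;
    }

    int main(void)
    {
        return toy_init() ? 1 : 0;
    }
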
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b87638e84c4b..1886294e12f7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
1#ifndef _FS_CEPH_SUPER_H 1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H 2#define _FS_CEPH_SUPER_H
3 3
4#include "ceph_debug.h" 4#include <linux/ceph/ceph_debug.h>
5 5
6#include <asm/unaligned.h> 6#include <asm/unaligned.h>
7#include <linux/backing-dev.h> 7#include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include "types.h" 17#include <linux/ceph/libceph.h>
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "mds_client.h"
22#include "osd_client.h"
23#include "ceph_fs.h"
24 18
25/* f_type in struct statfs */ 19/* f_type in struct statfs */
26#define CEPH_SUPER_MAGIC 0x00c36400 20#define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,42 +24,25 @@
30#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ 24#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 25#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32 26
33/* 27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
34 * Supported features 28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
35 */ 29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
36#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
37#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR
38 30
39/* 31#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
40 * mount options
41 */
42#define CEPH_OPT_FSID (1<<0)
43#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
44#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
45#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
46#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
47#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
48#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
49 32
50#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) 33#define ceph_set_mount_opt(fsc, opt) \
34 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
35#define ceph_test_mount_opt(fsc, opt) \
36 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
51 37
52#define ceph_set_opt(client, opt) \ 38#define CEPH_MAX_READDIR_DEFAULT 1024
53 (client)->mount_args->flags |= CEPH_OPT_##opt; 39#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
54#define ceph_test_opt(client, opt) \ 40#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
55 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
56 41
57 42struct ceph_mount_options {
58struct ceph_mount_args {
59 int sb_flags;
60 int flags; 43 int flags;
61 struct ceph_fsid fsid; 44 int sb_flags;
62 struct ceph_entity_addr my_addr; 45
63 int num_mon;
64 struct ceph_entity_addr *mon_addr;
65 int mount_timeout;
66 int osd_idle_ttl;
67 int osd_timeout;
68 int osd_keepalive_timeout;
69 int wsize; 46 int wsize;
70 int rsize; /* max readahead */ 47 int rsize; /* max readahead */
71 int congestion_kb; /* max writeback in flight */ 48 int congestion_kb; /* max writeback in flight */
@@ -73,82 +50,25 @@ struct ceph_mount_args {
73 int cap_release_safety; 50 int cap_release_safety;
74 int max_readdir; /* max readdir result (entries) */ 51 int max_readdir; /* max readdir result (entries) */
75 int max_readdir_bytes; /* max readdir result (bytes) */ 52 int max_readdir_bytes; /* max readdir result (bytes) */
76 char *snapdir_name; /* default ".snap" */
77 char *name;
78 char *secret;
79};
80 53
81/* 54 /*
82 * defaults 55 * everything above this point can be memcmp'd; everything below
83 */ 56 * is handled in compare_mount_options()
84#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 57 */
85#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
86#define CEPH_OSD_KEEPALIVE_DEFAULT 5
87#define CEPH_OSD_IDLE_TTL_DEFAULT 60
88#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
89#define CEPH_MAX_READDIR_DEFAULT 1024
90#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
91
92#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
93#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
94
95#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
96#define CEPH_AUTH_NAME_DEFAULT "guest"
97/*
98 * Delay telling the MDS we no longer want caps, in case we reopen
99 * the file. Delay a minimum amount of time, even if we send a cap
100 * message for some other reason. Otherwise, take the oppotunity to
101 * update the mds to avoid sending another message later.
102 */
103#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
104#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
105
106#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
107
108/* mount state */
109enum {
110 CEPH_MOUNT_MOUNTING,
111 CEPH_MOUNT_MOUNTED,
112 CEPH_MOUNT_UNMOUNTING,
113 CEPH_MOUNT_UNMOUNTED,
114 CEPH_MOUNT_SHUTDOWN,
115};
116
117/*
118 * subtract jiffies
119 */
120static inline unsigned long time_sub(unsigned long a, unsigned long b)
121{
122 BUG_ON(time_after(b, a));
123 return (long)a - (long)b;
124}
125
126/*
127 * per-filesystem client state
128 *
129 * possibly shared by multiple mount points, if they are
130 * mounting the same ceph filesystem/cluster.
131 */
132struct ceph_client {
133 struct ceph_fsid fsid;
134 bool have_fsid;
135 58
136 struct mutex mount_mutex; /* serialize mount attempts */ 59 char *snapdir_name; /* default ".snap" */
137 struct ceph_mount_args *mount_args; 60};
138 61
62struct ceph_fs_client {
139 struct super_block *sb; 63 struct super_block *sb;
140 64
141 unsigned long mount_state; 65 struct ceph_mount_options *mount_options;
142 wait_queue_head_t auth_wq; 66 struct ceph_client *client;
143
144 int auth_err;
145 67
68 unsigned long mount_state;
146 int min_caps; /* min caps i added */ 69 int min_caps; /* min caps i added */
147 70
148 struct ceph_messenger *msgr; /* messenger instance */ 71 struct ceph_mds_client *mdsc;
149 struct ceph_mon_client monc;
150 struct ceph_mds_client mdsc;
151 struct ceph_osd_client osdc;
152 72
153 /* writeback */ 73 /* writeback */
154 mempool_t *wb_pagevec_pool; 74 mempool_t *wb_pagevec_pool;
@@ -160,14 +80,14 @@ struct ceph_client {
160 struct backing_dev_info backing_dev_info; 80 struct backing_dev_info backing_dev_info;
161 81
162#ifdef CONFIG_DEBUG_FS 82#ifdef CONFIG_DEBUG_FS
163 struct dentry *debugfs_monmap; 83 struct dentry *debugfs_dentry_lru, *debugfs_caps;
164 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
165 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
166 struct dentry *debugfs_congestion_kb; 84 struct dentry *debugfs_congestion_kb;
167 struct dentry *debugfs_bdi; 85 struct dentry *debugfs_bdi;
86 struct dentry *debugfs_mdsc, *debugfs_mdsmap;
168#endif 87#endif
169}; 88};
170 89
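
The old monolithic ceph_client is split here: cluster-wide state (monitors, OSDs, messenger) lives in the libceph ceph_client, per-superblock state moves into ceph_fs_client, and filesystem-only flags become CEPH_MOUNT_OPT_* with matching ceph_set_mount_opt()/ceph_test_mount_opt() accessors. A small standalone illustration of how the new test/set macros paste the flag name at a call site, using toy duplicates of the definitions above:

    #include <stdio.h>

    #define TOY_MOUNT_OPT_RBYTES    (1<<5)

    struct toy_mount_options { int flags; };
    struct toy_fs_client { struct toy_mount_options *mount_options; };

    /* Same shape as ceph_test_mount_opt()/ceph_set_mount_opt() above. */
    #define toy_test_mount_opt(fsc, opt) \
        (!!((fsc)->mount_options->flags & TOY_MOUNT_OPT_##opt))
    #define toy_set_mount_opt(fsc, opt) \
        ((fsc)->mount_options->flags |= TOY_MOUNT_OPT_##opt)

    int main(void)
    {
        struct toy_mount_options opts = { 0 };
        struct toy_fs_client fsc = { &opts };

        printf("rbytes before: %d\n", toy_test_mount_opt(&fsc, RBYTES));
        toy_set_mount_opt(&fsc, RBYTES);
        printf("rbytes after:  %d\n", toy_test_mount_opt(&fsc, RBYTES));
        return 0;
    }
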
90
171/* 91/*
172 * File i/o capability. This tracks shared state with the metadata 92 * File i/o capability. This tracks shared state with the metadata
173 * server that allows us to cache or writeback attributes or to read 93 * server that allows us to cache or writeback attributes or to read
@@ -275,6 +195,20 @@ struct ceph_inode_xattr {
275 int should_free_val; 195 int should_free_val;
276}; 196};
277 197
198/*
199 * Ceph dentry state
200 */
201struct ceph_dentry_info {
202 struct ceph_mds_session *lease_session;
203 u32 lease_gen, lease_shared_gen;
204 u32 lease_seq;
205 unsigned long lease_renew_after, lease_renew_from;
206 struct list_head lru;
207 struct dentry *dentry;
208 u64 time;
209 u64 offset;
210};
211
278struct ceph_inode_xattrs_info { 212struct ceph_inode_xattrs_info {
279 /* 213 /*
280 * (still encoded) xattr blob. we avoid the overhead of parsing 214 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info {
296/* 230/*
297 * Ceph inode. 231 * Ceph inode.
298 */ 232 */
299#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
300#define CEPH_I_NODELAY 4 /* do not delay cap release */
301#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
302#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
303
304struct ceph_inode_info { 233struct ceph_inode_info {
305 struct ceph_vino i_vino; /* ceph ino + snap */ 234 struct ceph_vino i_vino; /* ceph ino + snap */
306 235
@@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
391 return container_of(inode, struct ceph_inode_info, vfs_inode); 320 return container_of(inode, struct ceph_inode_info, vfs_inode);
392} 321}
393 322
323static inline struct ceph_vino ceph_vino(struct inode *inode)
324{
325 return ceph_inode(inode)->i_vino;
326}
327
328/*
329 * ino_t is <64 bits on many architectures, blech.
330 *
331 * don't include snap in ino hash, at least for now.
332 */
333static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
334{
335 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
336#if BITS_PER_LONG == 32
337 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
338 if (!ino)
339 ino = 1;
340#endif
341 return ino;
342}
343
344/* for printf-style formatting */
345#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
346
347static inline u64 ceph_ino(struct inode *inode)
348{
349 return ceph_inode(inode)->i_vino.ino;
350}
351static inline u64 ceph_snap(struct inode *inode)
352{
353 return ceph_inode(inode)->i_vino.snap;
354}
355
356static inline int ceph_ino_compare(struct inode *inode, void *data)
357{
358 struct ceph_vino *pvino = (struct ceph_vino *)data;
359 struct ceph_inode_info *ci = ceph_inode(inode);
360 return ci->i_vino.ino == pvino->ino &&
361 ci->i_vino.snap == pvino->snap;
362}
363
364static inline struct inode *ceph_find_inode(struct super_block *sb,
365 struct ceph_vino vino)
366{
367 ino_t t = ceph_vino_to_ino(vino);
368 return ilookup5(sb, t, ceph_ino_compare, &vino);
369}
370
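
A worked example of the 32-bit fold in ceph_vino_to_ino() above: when ino_t is 4 bytes, the top 32 bits of the 64-bit Ceph ino are XORed into the low 32, and an all-zero result is bumped to 1. The same arithmetic in userspace, with fixed-width types standing in for ino_t:

    #include <stdio.h>
    #include <stdint.h>

    /* Mirrors ceph_vino_to_ino() on a 32-bit box: ino_t assumed to be 32 bits. */
    static uint32_t fold_ino(uint64_t ino64)
    {
        uint32_t ino = (uint32_t)ino64;

        ino ^= ino64 >> (sizeof(uint64_t) - sizeof(uint32_t)) * 8;   /* >> 32 */
        if (!ino)
            ino = 1;
        return ino;
    }

    int main(void)
    {
        printf("0x%08x\n", fold_ino(0x100000002ULL));   /* 1 ^ 2 = 0x00000003 */
        printf("0x%08x\n", fold_ino(0));                /* folds to 0, bumped to 1 */
        return 0;
    }
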
371
372/*
373 * Ceph inode.
374 */
375#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
376#define CEPH_I_NODELAY 4 /* do not delay cap release */
377#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
378#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
379
394static inline void ceph_i_clear(struct inode *inode, unsigned mask) 380static inline void ceph_i_clear(struct inode *inode, unsigned mask)
395{ 381{
396 struct ceph_inode_info *ci = ceph_inode(inode); 382 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -414,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
414 struct ceph_inode_info *ci = ceph_inode(inode); 400 struct ceph_inode_info *ci = ceph_inode(inode);
415 bool r; 401 bool r;
416 402
417 smp_mb(); 403 spin_lock(&inode->i_lock);
418 r = (ci->i_ceph_flags & mask) == mask; 404 r = (ci->i_ceph_flags & mask) == mask;
405 spin_unlock(&inode->i_lock);
419 return r; 406 return r;
420} 407}
421 408
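
ceph_i_test() above trades a bare smp_mb() for taking inode->i_lock, so flag readers now synchronize with the set/clear helpers through the same lock. The pattern reduced to a runnable userspace fragment, with a pthread mutex standing in for i_lock:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;    /* i_lock stand-in */
    static unsigned int flags;

    static void flag_set(unsigned int mask)
    {
        pthread_mutex_lock(&lock);
        flags |= mask;
        pthread_mutex_unlock(&lock);
    }

    static bool flag_test(unsigned int mask)
    {
        bool r;

        pthread_mutex_lock(&lock);
        r = (flags & mask) == mask;     /* read under the same lock as updates */
        pthread_mutex_unlock(&lock);
        return r;
    }

    int main(void)
    {
        flag_set(1 << 0);
        printf("%d %d\n", flag_test(1 << 0), flag_test(1 << 1));    /* 1 0 */
        return 0;
    }
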
@@ -432,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
432 struct ceph_inode_frag *pfrag, 419 struct ceph_inode_frag *pfrag,
433 int *found); 420 int *found);
434 421
435/*
436 * Ceph dentry state
437 */
438struct ceph_dentry_info {
439 struct ceph_mds_session *lease_session;
440 u32 lease_gen, lease_shared_gen;
441 u32 lease_seq;
442 unsigned long lease_renew_after, lease_renew_from;
443 struct list_head lru;
444 struct dentry *dentry;
445 u64 time;
446 u64 offset;
447};
448
449static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) 422static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
450{ 423{
451 return (struct ceph_dentry_info *)dentry->d_fsdata; 424 return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -456,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
456 return ((loff_t)frag << 32) | (loff_t)off; 429 return ((loff_t)frag << 32) | (loff_t)off;
457} 430}
458 431
459/*
460 * ino_t is <64 bits on many architectures, blech.
461 *
462 * don't include snap in ino hash, at least for now.
463 */
464static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
465{
466 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
467#if BITS_PER_LONG == 32
468 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
469 if (!ino)
470 ino = 1;
471#endif
472 return ino;
473}
474
475static inline int ceph_set_ino_cb(struct inode *inode, void *data) 432static inline int ceph_set_ino_cb(struct inode *inode, void *data)
476{ 433{
477 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; 434 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -479,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
479 return 0; 436 return 0;
480} 437}
481 438
482static inline struct ceph_vino ceph_vino(struct inode *inode)
483{
484 return ceph_inode(inode)->i_vino;
485}
486
487/* for printf-style formatting */
488#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
489
490static inline u64 ceph_ino(struct inode *inode)
491{
492 return ceph_inode(inode)->i_vino.ino;
493}
494static inline u64 ceph_snap(struct inode *inode)
495{
496 return ceph_inode(inode)->i_vino.snap;
497}
498
499static inline int ceph_ino_compare(struct inode *inode, void *data)
500{
501 struct ceph_vino *pvino = (struct ceph_vino *)data;
502 struct ceph_inode_info *ci = ceph_inode(inode);
503 return ci->i_vino.ino == pvino->ino &&
504 ci->i_vino.snap == pvino->snap;
505}
506
507static inline struct inode *ceph_find_inode(struct super_block *sb,
508 struct ceph_vino vino)
509{
510 ino_t t = ceph_vino_to_ino(vino);
511 return ilookup5(sb, t, ceph_ino_compare, &vino);
512}
513
514
515/* 439/*
516 * caps helpers 440 * caps helpers
517 */ 441 */
@@ -576,18 +500,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
576 struct ceph_cap_reservation *ctx, int need); 500 struct ceph_cap_reservation *ctx, int need);
577extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 501extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
578 struct ceph_cap_reservation *ctx); 502 struct ceph_cap_reservation *ctx);
579extern void ceph_reservation_status(struct ceph_client *client, 503extern void ceph_reservation_status(struct ceph_fs_client *client,
580 int *total, int *avail, int *used, 504 int *total, int *avail, int *used,
581 int *reserved, int *min); 505 int *reserved, int *min);
582 506
583static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) 507static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
584{ 508{
585 return (struct ceph_client *)inode->i_sb->s_fs_info; 509 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
586} 510}
587 511
588static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) 512static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
589{ 513{
590 return (struct ceph_client *)sb->s_fs_info; 514 return (struct ceph_fs_client *)sb->s_fs_info;
591} 515}
592 516
593 517
@@ -617,51 +541,6 @@ struct ceph_file_info {
617 541
618 542
619/* 543/*
620 * snapshots
621 */
622
623/*
624 * A "snap context" is the set of existing snapshots when we
625 * write data. It is used by the OSD to guide its COW behavior.
626 *
627 * The ceph_snap_context is refcounted, and attached to each dirty
628 * page, indicating which context the dirty data belonged when it was
629 * dirtied.
630 */
631struct ceph_snap_context {
632 atomic_t nref;
633 u64 seq;
634 int num_snaps;
635 u64 snaps[];
636};
637
638static inline struct ceph_snap_context *
639ceph_get_snap_context(struct ceph_snap_context *sc)
640{
641 /*
642 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
643 atomic_read(&sc->nref)+1);
644 */
645 if (sc)
646 atomic_inc(&sc->nref);
647 return sc;
648}
649
650static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
651{
652 if (!sc)
653 return;
654 /*
655 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
656 atomic_read(&sc->nref)-1);
657 */
658 if (atomic_dec_and_test(&sc->nref)) {
659 /*printk(" deleting snap_context %p\n", sc);*/
660 kfree(sc);
661 }
662}
663
664/*
665 * A "snap realm" describes a subset of the file hierarchy sharing 544 * A "snap realm" describes a subset of the file hierarchy sharing
666 * the same set of snapshots that apply to it. The realms themselves 545 * the same set of snapshots that apply to it. The realms themselves
667 * are organized into a hierarchy, such that children inherit (some of) 546 * are organized into a hierarchy, such that children inherit (some of)
@@ -699,16 +578,33 @@ struct ceph_snap_realm {
699 spinlock_t inodes_with_caps_lock; 578 spinlock_t inodes_with_caps_lock;
700}; 579};
701 580
702 581static inline int default_congestion_kb(void)
703
704/*
705 * calculate the number of pages a given length and offset map onto,
706 * if we align the data.
707 */
708static inline int calc_pages_for(u64 off, u64 len)
709{ 582{
710 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - 583 int congestion_kb;
711 (off >> PAGE_CACHE_SHIFT); 584
585 /*
586 * Copied from NFS
587 *
588 * congestion size, scale with available memory.
589 *
590 * 64MB: 8192k
591 * 128MB: 11585k
592 * 256MB: 16384k
593 * 512MB: 23170k
594 * 1GB: 32768k
595 * 2GB: 46340k
596 * 4GB: 65536k
597 * 8GB: 92681k
598 * 16GB: 131072k
599 *
600 * This allows larger machines to have larger/more transfers.
601 * Limit the default to 256M
602 */
603 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
604 if (congestion_kb > 256*1024)
605 congestion_kb = 256*1024;
606
607 return congestion_kb;
712} 608}
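
Plugging numbers into default_congestion_kb() reproduces the power-of-two rows of the table in the comment, assuming 4 KB pages: 1 GB of RAM gives totalram_pages == 262144, int_sqrt() == 512, and (16 * 512) << 2 == 32768k; the 256 MB cap is only reached at roughly 64 GB of RAM. The same arithmetic in plain C, with a simple integer square root standing in for the kernel's int_sqrt():

    #include <stdio.h>

    /* Integer square root stand-in for the kernel's int_sqrt(). */
    static unsigned long isqrt(unsigned long x)
    {
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
            r++;
        return r;
    }

    int main(void)
    {
        unsigned long page_shift = 12;      /* assumed 4 KB pages */
        unsigned long ram_mb[] = { 64, 256, 1024, 16384, 262144 };

        for (unsigned int i = 0; i < sizeof(ram_mb) / sizeof(ram_mb[0]); i++) {
            unsigned long pages = ram_mb[i] * 1024 / 4;     /* totalram_pages */
            unsigned long kb = (16 * isqrt(pages)) << (page_shift - 10);

            if (kb > 256 * 1024)
                kb = 256 * 1024;            /* 256 MB cap, hit on the last row */
            printf("%8luMB RAM -> %luk\n", ram_mb[i], kb);
        }
        return 0;
    }
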
713 609
714 610
@@ -741,16 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
741 ci_item)->writing; 637 ci_item)->writing;
742} 638}
743 639
744
745/* super.c */
746extern struct kmem_cache *ceph_inode_cachep;
747extern struct kmem_cache *ceph_cap_cachep;
748extern struct kmem_cache *ceph_dentry_cachep;
749extern struct kmem_cache *ceph_file_cachep;
750
751extern const char *ceph_msg_type_name(int type);
752extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
753
754/* inode.c */ 640/* inode.c */
755extern const struct inode_operations ceph_file_iops; 641extern const struct inode_operations ceph_file_iops;
756 642
@@ -857,12 +743,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
857/* file.c */ 743/* file.c */
858extern const struct file_operations ceph_file_fops; 744extern const struct file_operations ceph_file_fops;
859extern const struct address_space_operations ceph_aops; 745extern const struct address_space_operations ceph_aops;
746extern int ceph_copy_to_page_vector(struct page **pages,
747 const char *data,
748 loff_t off, size_t len);
749extern int ceph_copy_from_page_vector(struct page **pages,
750 char *data,
751 loff_t off, size_t len);
752extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
860extern int ceph_open(struct inode *inode, struct file *file); 753extern int ceph_open(struct inode *inode, struct file *file);
861extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, 754extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
862 struct nameidata *nd, int mode, 755 struct nameidata *nd, int mode,
863 int locked_dir); 756 int locked_dir);
864extern int ceph_release(struct inode *inode, struct file *filp); 757extern int ceph_release(struct inode *inode, struct file *filp);
865extern void ceph_release_page_vector(struct page **pages, int num_pages);
866 758
867/* dir.c */ 759/* dir.c */
868extern const struct file_operations ceph_dir_fops; 760extern const struct file_operations ceph_dir_fops;
@@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
892/* export.c */ 784/* export.c */
893extern const struct export_operations ceph_export_ops; 785extern const struct export_operations ceph_export_ops;
894 786
895/* debugfs.c */
896extern int ceph_debugfs_init(void);
897extern void ceph_debugfs_cleanup(void);
898extern int ceph_debugfs_client_init(struct ceph_client *client);
899extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
900
901/* locks.c */ 787/* locks.c */
902extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); 788extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
903extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); 789extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
@@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
914 return NULL; 800 return NULL;
915} 801}
916 802
803/* debugfs.c */
804extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
805extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
806
917#endif /* _FS_CEPH_SUPER_H */ 807#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9578af610b73..6e12a6ba5f79 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2
2#include "super.h" 3#include "super.h"
3#include "decode.h" 4#include "mds_client.h"
5
6#include <linux/ceph/decode.h>
4 7
5#include <linux/xattr.h> 8#include <linux/xattr.h>
6#include <linux/slab.h> 9#include <linux/slab.h>
@@ -620,12 +623,12 @@ out:
620static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 623static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
621 const char *value, size_t size, int flags) 624 const char *value, size_t size, int flags)
622{ 625{
623 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 626 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
624 struct inode *inode = dentry->d_inode; 627 struct inode *inode = dentry->d_inode;
625 struct ceph_inode_info *ci = ceph_inode(inode); 628 struct ceph_inode_info *ci = ceph_inode(inode);
626 struct inode *parent_inode = dentry->d_parent->d_inode; 629 struct inode *parent_inode = dentry->d_parent->d_inode;
627 struct ceph_mds_request *req; 630 struct ceph_mds_request *req;
628 struct ceph_mds_client *mdsc = &client->mdsc; 631 struct ceph_mds_client *mdsc = fsc->mdsc;
629 int err; 632 int err;
630 int i, nr_pages; 633 int i, nr_pages;
631 struct page **pages = NULL; 634 struct page **pages = NULL;
@@ -713,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
713 716
714 /* preallocate memory for xattr name, value, index node */ 717 /* preallocate memory for xattr name, value, index node */
715 err = -ENOMEM; 718 err = -ENOMEM;
716 newname = kmalloc(name_len + 1, GFP_NOFS); 719 newname = kmemdup(name, name_len + 1, GFP_NOFS);
717 if (!newname) 720 if (!newname)
718 goto out; 721 goto out;
719 memcpy(newname, name, name_len + 1);
720 722
721 if (val_len) { 723 if (val_len) {
722 newval = kmalloc(val_len + 1, GFP_NOFS); 724 newval = kmalloc(val_len + 1, GFP_NOFS);
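
The kmemdup() conversion above is purely a simplification: allocate and copy in one call instead of kmalloc() plus memcpy(). A userspace equivalent of the before/after (libc has no kmemdup, so a stand-in is sketched; the xattr name is an arbitrary example):

    #include <stdlib.h>
    #include <string.h>
    #include <stdio.h>

    /* Userspace stand-in for kmemdup(): allocate len bytes and copy src into them. */
    static void *memdup(const void *src, size_t len)
    {
        void *p = malloc(len);

        if (p)
            memcpy(p, src, len);
        return p;
    }

    int main(void)
    {
        const char *name = "user.example";
        size_t name_len = strlen(name);

        /* before: kmalloc + explicit memcpy; after: one kmemdup()-style call */
        char *newname = memdup(name, name_len + 1);

        if (!newname)
            return 1;
        puts(newname);
        free(newname);
        return 0;
    }
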
@@ -777,8 +779,8 @@ out:
777 779
778static int ceph_send_removexattr(struct dentry *dentry, const char *name) 780static int ceph_send_removexattr(struct dentry *dentry, const char *name)
779{ 781{
780 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 782 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
781 struct ceph_mds_client *mdsc = &client->mdsc; 783 struct ceph_mds_client *mdsc = fsc->mdsc;
782 struct inode *inode = dentry->d_inode; 784 struct inode *inode = dentry->d_inode;
783 struct inode *parent_inode = dentry->d_parent->d_inode; 785 struct inode *parent_inode = dentry->d_parent->d_inode;
784 struct ceph_mds_request *req; 786 struct ceph_mds_request *req;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index cc9665522148..c465ae066c62 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || LBDAF) 3 depends on (64BIT || LBDAF)
4 select DLM if GFS2_FS_LOCKING_DLM 4 select DLM if GFS2_FS_LOCKING_DLM
5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM 5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
6 select SYSFS if GFS2_FS_LOCKING_DLM 6 select SYSFS if GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 194fe16d8418..6b24afb96aae 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
36#include "glops.h" 36#include "glops.h"
37 37
38 38
39static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, 39void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
40 unsigned int from, unsigned int to) 40 unsigned int from, unsigned int to)
41{ 41{
42 struct buffer_head *head = page_buffers(page); 42 struct buffer_head *head = page_buffers(page);
43 unsigned int bsize = head->b_size; 43 unsigned int bsize = head->b_size;
@@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
615 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 615 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
616 int alloc_required; 616 int alloc_required;
617 int error = 0; 617 int error = 0;
618 struct gfs2_alloc *al; 618 struct gfs2_alloc *al = NULL;
619 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 619 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
620 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 620 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
621 unsigned to = from + len; 621 unsigned to = from + len;
@@ -663,6 +663,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
663 rblocks += RES_STATFS + RES_QUOTA; 663 rblocks += RES_STATFS + RES_QUOTA;
664 if (&ip->i_inode == sdp->sd_rindex) 664 if (&ip->i_inode == sdp->sd_rindex)
665 rblocks += 2 * RES_STATFS; 665 rblocks += 2 * RES_STATFS;
666 if (alloc_required)
667 rblocks += gfs2_rg_blocks(al);
666 668
667 error = gfs2_trans_begin(sdp, rblocks, 669 error = gfs2_trans_begin(sdp, rblocks,
668 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 670 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -696,13 +698,11 @@ out:
696 698
697 page_cache_release(page); 699 page_cache_release(page);
698 700
699 /* 701 gfs2_trans_end(sdp);
700 * XXX(truncate): the call below should probably be replaced with
701 * a call to the gfs2-specific truncate blocks helper to actually
702 * release disk blocks..
703 */
704 if (pos + len > ip->i_inode.i_size) 702 if (pos + len > ip->i_inode.i_size)
705 truncate_setsize(&ip->i_inode, ip->i_inode.i_size); 703 gfs2_trim_blocks(&ip->i_inode);
704 goto out_trans_fail;
705
706out_endtrans: 706out_endtrans:
707 gfs2_trans_end(sdp); 707 gfs2_trans_end(sdp);
708out_trans_fail: 708out_trans_fail:
@@ -802,10 +802,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
802 page_cache_release(page); 802 page_cache_release(page);
803 803
804 if (copied) { 804 if (copied) {
805 if (inode->i_size < to) { 805 if (inode->i_size < to)
806 i_size_write(inode, to); 806 i_size_write(inode, to);
807 ip->i_disksize = inode->i_size;
808 }
809 gfs2_dinode_out(ip, di); 807 gfs2_dinode_out(ip, di);
810 mark_inode_dirty(inode); 808 mark_inode_dirty(inode);
811 } 809 }
@@ -876,8 +874,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
876 874
877 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 875 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
878 if (ret > 0) { 876 if (ret > 0) {
879 if (inode->i_size > ip->i_disksize)
880 ip->i_disksize = inode->i_size;
881 gfs2_dinode_out(ip, dibh->b_data); 877 gfs2_dinode_out(ip, dibh->b_data);
882 mark_inode_dirty(inode); 878 mark_inode_dirty(inode);
883 } 879 }
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6f482809d1a3..5476c066d4ee 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -50,7 +50,7 @@ struct strip_mine {
50 * @ip: the inode 50 * @ip: the inode
51 * @dibh: the dinode buffer 51 * @dibh: the dinode buffer
52 * @block: the block number that was allocated 52 * @block: the block number that was allocated
53 * @private: any locked page held by the caller process 53 * @page: The (optional) page. This is looked up if @page is NULL
54 * 54 *
55 * Returns: errno 55 * Returns: errno
56 */ 56 */
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
109/** 109/**
110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big 110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
111 * @ip: The GFS2 inode to unstuff 111 * @ip: The GFS2 inode to unstuff
112 * @unstuffer: the routine that handles unstuffing a non-zero length file 112 * @page: The (optional) page. This is looked up if the @page is NULL
113 * @private: private data for the unstuffer
114 * 113 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such 114 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way. 115 * that the height can be grown in the traditional way.
@@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
132 if (error) 131 if (error)
133 goto out; 132 goto out;
134 133
135 if (ip->i_disksize) { 134 if (i_size_read(&ip->i_inode)) {
136 /* Get a free block, fill it with the stuffed data, 135 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */ 136 and write it out to disk */
138 137
@@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
161 di = (struct gfs2_dinode *)dibh->b_data; 160 di = (struct gfs2_dinode *)dibh->b_data;
162 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 161 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
163 162
164 if (ip->i_disksize) { 163 if (i_size_read(&ip->i_inode)) {
165 *(__be64 *)(di + 1) = cpu_to_be64(block); 164 *(__be64 *)(di + 1) = cpu_to_be64(block);
166 gfs2_add_inode_blocks(&ip->i_inode, 1); 165 gfs2_add_inode_blocks(&ip->i_inode, 1);
167 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 166 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -885,83 +884,14 @@ out:
885} 884}
886 885
887/** 886/**
888 * do_grow - Make a file look bigger than it is
889 * @ip: the inode
890 * @size: the size to set the file to
891 *
892 * Called with an exclusive lock on @ip.
893 *
894 * Returns: errno
895 */
896
897static int do_grow(struct gfs2_inode *ip, u64 size)
898{
899 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
900 struct gfs2_alloc *al;
901 struct buffer_head *dibh;
902 int error;
903
904 al = gfs2_alloc_get(ip);
905 if (!al)
906 return -ENOMEM;
907
908 error = gfs2_quota_lock_check(ip);
909 if (error)
910 goto out;
911
912 al->al_requested = sdp->sd_max_height + RES_DATA;
913
914 error = gfs2_inplace_reserve(ip);
915 if (error)
916 goto out_gunlock_q;
917
918 error = gfs2_trans_begin(sdp,
919 sdp->sd_max_height + al->al_rgd->rd_length +
920 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
921 if (error)
922 goto out_ipres;
923
924 error = gfs2_meta_inode_buffer(ip, &dibh);
925 if (error)
926 goto out_end_trans;
927
928 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
929 if (gfs2_is_stuffed(ip)) {
930 error = gfs2_unstuff_dinode(ip, NULL);
931 if (error)
932 goto out_brelse;
933 }
934 }
935
936 ip->i_disksize = size;
937 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
938 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
939 gfs2_dinode_out(ip, dibh->b_data);
940
941out_brelse:
942 brelse(dibh);
943out_end_trans:
944 gfs2_trans_end(sdp);
945out_ipres:
946 gfs2_inplace_release(ip);
947out_gunlock_q:
948 gfs2_quota_unlock(ip);
949out:
950 gfs2_alloc_put(ip);
951 return error;
952}
953
954
955/**
956 * gfs2_block_truncate_page - Deal with zeroing out data for truncate 887 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
957 * 888 *
958 * This is partly borrowed from ext3. 889 * This is partly borrowed from ext3.
959 */ 890 */
960static int gfs2_block_truncate_page(struct address_space *mapping) 891static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
961{ 892{
962 struct inode *inode = mapping->host; 893 struct inode *inode = mapping->host;
963 struct gfs2_inode *ip = GFS2_I(inode); 894 struct gfs2_inode *ip = GFS2_I(inode);
964 loff_t from = inode->i_size;
965 unsigned long index = from >> PAGE_CACHE_SHIFT; 895 unsigned long index = from >> PAGE_CACHE_SHIFT;
966 unsigned offset = from & (PAGE_CACHE_SIZE-1); 896 unsigned offset = from & (PAGE_CACHE_SIZE-1);
967 unsigned blocksize, iblock, length, pos; 897 unsigned blocksize, iblock, length, pos;
@@ -1023,9 +953,11 @@ unlock:
1023 return err; 953 return err;
1024} 954}
1025 955
1026static int trunc_start(struct gfs2_inode *ip, u64 size) 956static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
1027{ 957{
1028 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 958 struct gfs2_inode *ip = GFS2_I(inode);
959 struct gfs2_sbd *sdp = GFS2_SB(inode);
960 struct address_space *mapping = inode->i_mapping;
1029 struct buffer_head *dibh; 961 struct buffer_head *dibh;
1030 int journaled = gfs2_is_jdata(ip); 962 int journaled = gfs2_is_jdata(ip);
1031 int error; 963 int error;
@@ -1039,31 +971,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1039 if (error) 971 if (error)
1040 goto out; 972 goto out;
1041 973
974 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
975
1042 if (gfs2_is_stuffed(ip)) { 976 if (gfs2_is_stuffed(ip)) {
1043 u64 dsize = size + sizeof(struct gfs2_dinode); 977 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1044 ip->i_disksize = size;
1045 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1046 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1047 gfs2_dinode_out(ip, dibh->b_data);
1048 if (dsize > dibh->b_size)
1049 dsize = dibh->b_size;
1050 gfs2_buffer_clear_tail(dibh, dsize);
1051 error = 1;
1052 } else { 978 } else {
1053 if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) 979 if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
1054 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 980 error = gfs2_block_truncate_page(mapping, newsize);
1055 981 if (error)
1056 if (!error) { 982 goto out_brelse;
1057 ip->i_disksize = size;
1058 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1059 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1060 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1061 gfs2_dinode_out(ip, dibh->b_data);
1062 } 983 }
984 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1063 } 985 }
1064 986
1065 brelse(dibh); 987 i_size_write(inode, newsize);
988 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
989 gfs2_dinode_out(ip, dibh->b_data);
1066 990
991 truncate_pagecache(inode, oldsize, newsize);
992out_brelse:
993 brelse(dibh);
1067out: 994out:
1068 gfs2_trans_end(sdp); 995 gfs2_trans_end(sdp);
1069 return error; 996 return error;
@@ -1123,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip)
1123 if (error) 1050 if (error)
1124 goto out; 1051 goto out;
1125 1052
1126 if (!ip->i_disksize) { 1053 if (!i_size_read(&ip->i_inode)) {
1127 ip->i_height = 0; 1054 ip->i_height = 0;
1128 ip->i_goal = ip->i_no_addr; 1055 ip->i_goal = ip->i_no_addr;
1129 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1056 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1143,92 +1070,154 @@ out:
1143 1070
1144/** 1071/**
1145 * do_shrink - make a file smaller 1072 * do_shrink - make a file smaller
1146 * @ip: the inode 1073 * @inode: the inode
1147 * @size: the size to make the file 1074 * @oldsize: the current inode size
1148 * @truncator: function to truncate the last partial block 1075 * @newsize: the size to make the file
1149 * 1076 *
1150 * Called with an exclusive lock on @ip. 1077 * Called with an exclusive lock on @inode. The @newsize must
1078 * be equal to or smaller than the current inode size.
1151 * 1079 *
1152 * Returns: errno 1080 * Returns: errno
1153 */ 1081 */
1154 1082
1155static int do_shrink(struct gfs2_inode *ip, u64 size) 1083static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
1156{ 1084{
1085 struct gfs2_inode *ip = GFS2_I(inode);
1157 int error; 1086 int error;
1158 1087
1159 error = trunc_start(ip, size); 1088 error = trunc_start(inode, oldsize, newsize);
1160 if (error < 0) 1089 if (error < 0)
1161 return error; 1090 return error;
1162 if (error > 0) 1091 if (gfs2_is_stuffed(ip))
1163 return 0; 1092 return 0;
1164 1093
1165 error = trunc_dealloc(ip, size); 1094 error = trunc_dealloc(ip, newsize);
1166 if (!error) 1095 if (error == 0)
1167 error = trunc_end(ip); 1096 error = trunc_end(ip);
1168 1097
1169 return error; 1098 return error;
1170} 1099}
1171 1100
1172static int do_touch(struct gfs2_inode *ip, u64 size) 1101void gfs2_trim_blocks(struct inode *inode)
1173{ 1102{
1174 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1103 u64 size = inode->i_size;
1104 int ret;
1105
1106 ret = do_shrink(inode, size, size);
1107 WARN_ON(ret != 0);
1108}
1109
1110/**
1111 * do_grow - Touch and update inode size
1112 * @inode: The inode
1113 * @size: The new size
1114 *
1115 * This function updates the timestamps on the inode and
1116 * may also increase the size of the inode. This function
1117 * must not be called with @size any smaller than the current
1118 * inode size.
1119 *
1120 * Although it is not strictly required to unstuff files here,
1121 * earlier versions of GFS2 have a bug in the stuffed file reading
1122 * code which will result in a buffer overrun if the size is larger
1123 * than the max stuffed file size. In order to prevent this from
1124 * occurring, such files are unstuffed, but in other cases we can
1125 * just update the inode size directly.
1126 *
1127 * Returns: 0 on success, or -ve on error
1128 */
1129
1130static int do_grow(struct inode *inode, u64 size)
1131{
1132 struct gfs2_inode *ip = GFS2_I(inode);
1133 struct gfs2_sbd *sdp = GFS2_SB(inode);
1175 struct buffer_head *dibh; 1134 struct buffer_head *dibh;
1135 struct gfs2_alloc *al = NULL;
1176 int error; 1136 int error;
1177 1137
1178 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 1138 if (gfs2_is_stuffed(ip) &&
1139 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1140 al = gfs2_alloc_get(ip);
1141 if (al == NULL)
1142 return -ENOMEM;
1143
1144 error = gfs2_quota_lock_check(ip);
1145 if (error)
1146 goto do_grow_alloc_put;
1147
1148 al->al_requested = 1;
1149 error = gfs2_inplace_reserve(ip);
1150 if (error)
1151 goto do_grow_qunlock;
1152 }
1153
1154 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
1179 if (error) 1155 if (error)
1180 return error; 1156 goto do_grow_release;
1181 1157
1182 down_write(&ip->i_rw_mutex); 1158 if (al) {
1159 error = gfs2_unstuff_dinode(ip, NULL);
1160 if (error)
1161 goto do_end_trans;
1162 }
1183 1163
1184 error = gfs2_meta_inode_buffer(ip, &dibh); 1164 error = gfs2_meta_inode_buffer(ip, &dibh);
1185 if (error) 1165 if (error)
1186 goto do_touch_out; 1166 goto do_end_trans;
1187 1167
1168 i_size_write(inode, size);
1188 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1169 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1189 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1170 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1190 gfs2_dinode_out(ip, dibh->b_data); 1171 gfs2_dinode_out(ip, dibh->b_data);
1191 brelse(dibh); 1172 brelse(dibh);
1192 1173
1193do_touch_out: 1174do_end_trans:
1194 up_write(&ip->i_rw_mutex);
1195 gfs2_trans_end(sdp); 1175 gfs2_trans_end(sdp);
1176do_grow_release:
1177 if (al) {
1178 gfs2_inplace_release(ip);
1179do_grow_qunlock:
1180 gfs2_quota_unlock(ip);
1181do_grow_alloc_put:
1182 gfs2_alloc_put(ip);
1183 }
1196 return error; 1184 return error;
1197} 1185}
1198 1186
1199/** 1187/**
1200 * gfs2_truncatei - make a file a given size 1188 * gfs2_setattr_size - make a file a given size
1201 * @ip: the inode 1189 * @inode: the inode
1202 * @size: the size to make the file 1190 * @newsize: the size to make the file
1203 * @truncator: function to truncate the last partial block
1204 * 1191 *
1205 * The file size can grow, shrink, or stay the same size. 1192 * The file size can grow, shrink, or stay the same size. This
1193 * is called holding i_mutex and an exclusive glock on the inode
1194 * in question.
1206 * 1195 *
1207 * Returns: errno 1196 * Returns: errno
1208 */ 1197 */
1209 1198
1210int gfs2_truncatei(struct gfs2_inode *ip, u64 size) 1199int gfs2_setattr_size(struct inode *inode, u64 newsize)
1211{ 1200{
1212 int error; 1201 int ret;
1202 u64 oldsize;
1213 1203
1214 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) 1204 BUG_ON(!S_ISREG(inode->i_mode));
1215 return -EINVAL;
1216 1205
1217 if (size > ip->i_disksize) 1206 ret = inode_newsize_ok(inode, newsize);
1218 error = do_grow(ip, size); 1207 if (ret)
1219 else if (size < ip->i_disksize) 1208 return ret;
1220 error = do_shrink(ip, size);
1221 else
1222 /* update time stamps */
1223 error = do_touch(ip, size);
1224 1209
1225 return error; 1210 oldsize = inode->i_size;
1211 if (newsize >= oldsize)
1212 return do_grow(inode, newsize);
1213
1214 return do_shrink(inode, oldsize, newsize);
1226} 1215}
1227 1216
1228int gfs2_truncatei_resume(struct gfs2_inode *ip) 1217int gfs2_truncatei_resume(struct gfs2_inode *ip)
1229{ 1218{
1230 int error; 1219 int error;
1231 error = trunc_dealloc(ip, ip->i_disksize); 1220 error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1232 if (!error) 1221 if (!error)
1233 error = trunc_end(ip); 1222 error = trunc_end(ip);
1234 return error; 1223 return error;
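
gfs2_setattr_size() above now funnels everything through one decision: after inode_newsize_ok(), a size that grows or stays the same goes via do_grow() (which also covers the old do_touch() timestamp-only case), and only a genuine shrink pays for trunc_start()/trunc_dealloc()/trunc_end(). A minimal sketch of that dispatch with the GFS2 internals stubbed out; nothing here is the real helper code:

    #include <stdio.h>
    #include <stdint.h>

    /* Stubs standing in for do_grow()/do_shrink() in the patch above. */
    static int stub_do_grow(uint64_t size)
    {
        printf("grow/touch to %llu\n", (unsigned long long)size);
        return 0;
    }

    static int stub_do_shrink(uint64_t oldsize, uint64_t newsize)
    {
        printf("shrink %llu -> %llu\n", (unsigned long long)oldsize,
               (unsigned long long)newsize);
        return 0;
    }

    static int toy_setattr_size(uint64_t oldsize, uint64_t newsize)
    {
        /* inode_newsize_ok() and locking elided */
        if (newsize >= oldsize)
            return stub_do_grow(newsize);
        return stub_do_shrink(oldsize, newsize);
    }

    int main(void)
    {
        toy_setattr_size(4096, 8192);   /* grows */
        toy_setattr_size(4096, 4096);   /* same size: still do_grow(), which just touches timestamps */
        toy_setattr_size(4096, 0);      /* shrinks */
        return 0;
    }
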
@@ -1269,7 +1258,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1269 1258
1270 shift = sdp->sd_sb.sb_bsize_shift; 1259 shift = sdp->sd_sb.sb_bsize_shift;
1271 BUG_ON(gfs2_is_dir(ip)); 1260 BUG_ON(gfs2_is_dir(ip));
1272 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; 1261 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1273 lblock = offset >> shift; 1262 lblock = offset >> shift;
1274 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1263 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1275 if (lblock_stop > end_of_file) 1264 if (lblock_stop > end_of_file)
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index a20a5213135a..42fea03e2bd9 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
44 } 44 }
45} 45}
46 46
47int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); 47extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
48int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); 48extern int gfs2_block_map(struct inode *inode, sector_t lblock,
49int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); 49 struct buffer_head *bh, int create);
50 50extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
51int gfs2_truncatei(struct gfs2_inode *ip, u64 size); 51 u64 *dblock, unsigned *extlen);
52int gfs2_truncatei_resume(struct gfs2_inode *ip); 52extern int gfs2_setattr_size(struct inode *inode, u64 size);
53int gfs2_file_dealloc(struct gfs2_inode *ip); 53extern void gfs2_trim_blocks(struct inode *inode);
54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 54extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
55 unsigned int len); 55extern int gfs2_file_dealloc(struct gfs2_inode *ip);
56extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
57 unsigned int len);
56 58
57#endif /* __BMAP_DOT_H__ */ 59#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index bb7907bde3d8..6798755b3858 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
49 ip = GFS2_I(inode); 49 ip = GFS2_I(inode);
50 } 50 }
51 51
52 if (sdp->sd_args.ar_localcaching) 52 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
53 goto valid; 53 goto valid;
54 54
55 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); 55 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b9dd88a78dd4..5c356d09c321 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -79,6 +79,9 @@
79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) 79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) 80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
81 81
82struct qstr gfs2_qdot __read_mostly;
83struct qstr gfs2_qdotdot __read_mostly;
84
82typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, 85typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
83 u64 leaf_no, void *data); 86 u64 leaf_no, void *data);
84typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, 87typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
@@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
127 130
128 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 131 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
129 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); 132 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
130 if (ip->i_disksize < offset + size) 133 if (ip->i_inode.i_size < offset + size)
131 ip->i_disksize = offset + size; 134 i_size_write(&ip->i_inode, offset + size);
132 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 135 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
133 gfs2_dinode_out(ip, dibh->b_data); 136 gfs2_dinode_out(ip, dibh->b_data);
134 137
@@ -225,8 +228,8 @@ out:
225 if (error) 228 if (error)
226 return error; 229 return error;
227 230
228 if (ip->i_disksize < offset + copied) 231 if (ip->i_inode.i_size < offset + copied)
229 ip->i_disksize = offset + copied; 232 i_size_write(&ip->i_inode, offset + copied);
230 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 233 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
231 234
232 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 235 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
275 unsigned int o; 278 unsigned int o;
276 int copied = 0; 279 int copied = 0;
277 int error = 0; 280 int error = 0;
281 u64 disksize = i_size_read(&ip->i_inode);
278 282
279 if (offset >= ip->i_disksize) 283 if (offset >= disksize)
280 return 0; 284 return 0;
281 285
282 if (offset + size > ip->i_disksize) 286 if (offset + size > disksize)
283 size = ip->i_disksize - offset; 287 size = disksize - offset;
284 288
285 if (!size) 289 if (!size)
286 return 0; 290 return 0;
@@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
727 unsigned hsize = 1 << ip->i_depth; 731 unsigned hsize = 1 << ip->i_depth;
728 unsigned index; 732 unsigned index;
729 u64 ln; 733 u64 ln;
730 if (hsize * sizeof(u64) != ip->i_disksize) { 734 if (hsize * sizeof(u64) != i_size_read(inode)) {
731 gfs2_consist_inode(ip); 735 gfs2_consist_inode(ip);
732 return ERR_PTR(-EIO); 736 return ERR_PTR(-EIO);
733 } 737 }
@@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode)
879 for (x = sdp->sd_hash_ptrs; x--; lp++) 883 for (x = sdp->sd_hash_ptrs; x--; lp++)
880 *lp = cpu_to_be64(bn); 884 *lp = cpu_to_be64(bn);
881 885
882 dip->i_disksize = sdp->sd_sb.sb_bsize / 2; 886 i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
883 gfs2_add_inode_blocks(&dip->i_inode, 1); 887 gfs2_add_inode_blocks(&dip->i_inode, 1);
884 dip->i_diskflags |= GFS2_DIF_EXHASH; 888 dip->i_diskflags |= GFS2_DIF_EXHASH;
885 889
@@ -1057,11 +1061,12 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1057 u64 *buf; 1061 u64 *buf;
1058 u64 *from, *to; 1062 u64 *from, *to;
1059 u64 block; 1063 u64 block;
1064 u64 disksize = i_size_read(&dip->i_inode);
1060 int x; 1065 int x;
1061 int error = 0; 1066 int error = 0;
1062 1067
1063 hsize = 1 << dip->i_depth; 1068 hsize = 1 << dip->i_depth;
1064 if (hsize * sizeof(u64) != dip->i_disksize) { 1069 if (hsize * sizeof(u64) != disksize) {
1065 gfs2_consist_inode(dip); 1070 gfs2_consist_inode(dip);
1066 return -EIO; 1071 return -EIO;
1067 } 1072 }
@@ -1072,7 +1077,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1072 if (!buf) 1077 if (!buf)
1073 return -ENOMEM; 1078 return -ENOMEM;
1074 1079
1075 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { 1080 for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) {
1076 error = gfs2_dir_read_data(dip, (char *)buf, 1081 error = gfs2_dir_read_data(dip, (char *)buf,
1077 block * sdp->sd_hash_bsize, 1082 block * sdp->sd_hash_bsize,
1078 sdp->sd_hash_bsize, 1); 1083 sdp->sd_hash_bsize, 1);
@@ -1370,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1370 unsigned depth = 0; 1375 unsigned depth = 0;
1371 1376
1372 hsize = 1 << dip->i_depth; 1377 hsize = 1 << dip->i_depth;
1373 if (hsize * sizeof(u64) != dip->i_disksize) { 1378 if (hsize * sizeof(u64) != i_size_read(inode)) {
1374 gfs2_consist_inode(dip); 1379 gfs2_consist_inode(dip);
1375 return -EIO; 1380 return -EIO;
1376 } 1381 }
@@ -1784,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1784 int error = 0; 1789 int error = 0;
1785 1790
1786 hsize = 1 << dip->i_depth; 1791 hsize = 1 << dip->i_depth;
1787 if (hsize * sizeof(u64) != dip->i_disksize) { 1792 if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
1788 gfs2_consist_inode(dip); 1793 gfs2_consist_inode(dip);
1789 return -EIO; 1794 return -EIO;
1790 } 1795 }
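The dir.c hunks above drop the cached ip->i_disksize in favour of the VFS inode size. A minimal sketch of the exhash consistency check they repeat, using the standard i_size_read() helper; the function name here is illustrative only, not part of the patch:

static int gfs2_dir_check_hash_size(struct gfs2_inode *dip)
{
	unsigned int hsize = 1 << dip->i_depth;

	/* the hash table must be exactly hsize 64-bit leaf pointers long */
	if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
		gfs2_consist_inode(dip);
		return -EIO;
	}
	return 0;
}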
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f919440c3be..a98f644bd3df 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -17,23 +17,24 @@ struct inode;
17struct gfs2_inode; 17struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19 19
20struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename); 20extern struct inode *gfs2_dir_search(struct inode *dir,
21int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 21 const struct qstr *filename);
22 const struct gfs2_inode *ip); 22extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
23int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 23 const struct gfs2_inode *ip);
24 const struct gfs2_inode *ip, unsigned int type); 24extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
25int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); 25 const struct gfs2_inode *ip, unsigned int type);
26int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 26extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
27 filldir_t filldir); 27extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
28int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 28 filldir_t filldir);
29 const struct gfs2_inode *nip, unsigned int new_type); 29extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
30 const struct gfs2_inode *nip, unsigned int new_type);
30 31
31int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); 32extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
32 33
33int gfs2_diradd_alloc_required(struct inode *dir, 34extern int gfs2_diradd_alloc_required(struct inode *dir,
34 const struct qstr *filename); 35 const struct qstr *filename);
35int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, 36extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
36 struct buffer_head **bhp); 37 struct buffer_head **bhp);
37 38
38static inline u32 gfs2_disk_hash(const char *data, int len) 39static inline u32 gfs2_disk_hash(const char *data, int len)
39{ 40{
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct
61 memcpy(dent + 1, name->name, name->len); 62 memcpy(dent + 1, name->name, name->len);
62} 63}
63 64
65extern struct qstr gfs2_qdot;
66extern struct qstr gfs2_qdotdot;
67
64#endif /* __DIR_DOT_H__ */ 68#endif /* __DIR_DOT_H__ */
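gfs2_qdot and gfs2_qdotdot replace the per-call gfs2_str2qstr(&str, ".") pattern removed from export.c and ops_inode.c below; they are filled in once at module init (see the main.c hunk). The definitions themselves are not visible in these hunks, so the following is only a plausible sketch of how they fit together:

struct qstr gfs2_qdot;		/* "."  */
struct qstr gfs2_qdotdot;	/* ".." */

static int __init gfs2_dir_names_init(void)	/* illustrative helper only */
{
	gfs2_str2qstr(&gfs2_qdot, ".");
	gfs2_str2qstr(&gfs2_qdotdot, "..");
	return 0;
}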
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index dfe237a3f8ad..06d582732d34 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name,
126 126
127static struct dentry *gfs2_get_parent(struct dentry *child) 127static struct dentry *gfs2_get_parent(struct dentry *child)
128{ 128{
129 struct qstr dotdot;
130 struct dentry *dentry; 129 struct dentry *dentry;
131 130
132 /* 131 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
133 * XXX(hch): it would be a good idea to keep this around as a
134 * static variable.
135 */
136 gfs2_str2qstr(&dotdot, "..");
137
138 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
139 if (!IS_ERR(dentry)) 132 if (!IS_ERR(dentry))
140 dentry->d_op = &gfs2_dops; 133 dentry->d_op = &gfs2_dops;
141 return dentry; 134 return dentry;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4edd662c8232..237ee6a940df 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -382,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
382 rblocks = RES_DINODE + ind_blocks; 382 rblocks = RES_DINODE + ind_blocks;
383 if (gfs2_is_jdata(ip)) 383 if (gfs2_is_jdata(ip))
384 rblocks += data_blocks ? data_blocks : 1; 384 rblocks += data_blocks ? data_blocks : 1;
385 if (ind_blocks || data_blocks) 385 if (ind_blocks || data_blocks) {
386 rblocks += RES_STATFS + RES_QUOTA; 386 rblocks += RES_STATFS + RES_QUOTA;
387 rblocks += gfs2_rg_blocks(al);
388 }
387 ret = gfs2_trans_begin(sdp, rblocks, 0); 389 ret = gfs2_trans_begin(sdp, rblocks, 0);
388 if (ret) 390 if (ret)
389 goto out_trans_fail; 391 goto out_trans_fail;
@@ -491,7 +493,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
491 goto fail; 493 goto fail;
492 494
493 if (!(file->f_flags & O_LARGEFILE) && 495 if (!(file->f_flags & O_LARGEFILE) &&
494 ip->i_disksize > MAX_NON_LFS) { 496 i_size_read(inode) > MAX_NON_LFS) {
495 error = -EOVERFLOW; 497 error = -EOVERFLOW;
496 goto fail_gunlock; 498 goto fail_gunlock;
497 } 499 }
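gfs2_rg_blocks(al), added to the reservation here and in the quota.c and ops_inode.c hunks below, stands in for the former al->al_rgd->rd_length term. Its definition is not part of the hunks shown; judging from the one-for-one substitutions it is presumably along these lines (an inference, not text from the patch):

static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
{
	/* bitmap blocks of the reserved resource group */
	return al->al_rgd->rd_length;
}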
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9adf8f924e08..87778857f099 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
441 else 441 else
442 gfs2_glock_put_nolock(gl); 442 gfs2_glock_put_nolock(gl);
443 } 443 }
444 if (held1 && held2 && list_empty(&gl->gl_holders))
445 clear_bit(GLF_QUEUED, &gl->gl_flags);
444 446
445 gl->gl_state = new_state; 447 gl->gl_state = new_state;
446 gl->gl_tchange = jiffies; 448 gl->gl_tchange = jiffies;
@@ -1012,6 +1014,7 @@ fail:
1012 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) 1014 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
1013 insert_pt = &gh2->gh_list; 1015 insert_pt = &gh2->gh_list;
1014 } 1016 }
1017 set_bit(GLF_QUEUED, &gl->gl_flags);
1015 if (likely(insert_pt == NULL)) { 1018 if (likely(insert_pt == NULL)) {
1016 list_add_tail(&gh->gh_list, &gl->gl_holders); 1019 list_add_tail(&gh->gh_list, &gl->gl_holders);
1017 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) 1020 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1310,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1310 1313
1311 gfs2_glock_hold(gl); 1314 gfs2_glock_hold(gl);
1312 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; 1315 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1313 if (time_before(now, holdtime)) 1316 if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
1314 delay = holdtime - now; 1317 if (time_before(now, holdtime))
1315 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) 1318 delay = holdtime - now;
1316 delay = gl->gl_ops->go_min_hold_time; 1319 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
1320 delay = gl->gl_ops->go_min_hold_time;
1321 }
1317 1322
1318 spin_lock(&gl->gl_spin); 1323 spin_lock(&gl->gl_spin);
1319 handle_callback(gl, state, delay); 1324 handle_callback(gl, state, delay);
@@ -1512,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl)
1512 spin_unlock(&lru_lock); 1517 spin_unlock(&lru_lock);
1513 1518
1514 spin_lock(&gl->gl_spin); 1519 spin_lock(&gl->gl_spin);
1515 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) 1520 if (gl->gl_state != LM_ST_UNLOCKED)
1516 handle_callback(gl, LM_ST_UNLOCKED, 0); 1521 handle_callback(gl, LM_ST_UNLOCKED, 0);
1517 spin_unlock(&gl->gl_spin); 1522 spin_unlock(&gl->gl_spin);
1518 gfs2_glock_hold(gl); 1523 gfs2_glock_hold(gl);
@@ -1660,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1660 *p++ = 'I'; 1665 *p++ = 'I';
1661 if (test_bit(GLF_FROZEN, gflags)) 1666 if (test_bit(GLF_FROZEN, gflags))
1662 *p++ = 'F'; 1667 *p++ = 'F';
1668 if (test_bit(GLF_QUEUED, gflags))
1669 *p++ = 'q';
1663 *p = 0; 1670 *p = 0;
1664 return buf; 1671 return buf;
1665} 1672}
@@ -1776,10 +1783,12 @@ int __init gfs2_glock_init(void)
1776 } 1783 }
1777#endif 1784#endif
1778 1785
1779 glock_workqueue = create_workqueue("glock_workqueue"); 1786 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
1787 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1780 if (IS_ERR(glock_workqueue)) 1788 if (IS_ERR(glock_workqueue))
1781 return PTR_ERR(glock_workqueue); 1789 return PTR_ERR(glock_workqueue);
1782 gfs2_delete_workqueue = create_workqueue("delete_workqueue"); 1790 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
1791 WQ_FREEZEABLE, 0);
1783 if (IS_ERR(gfs2_delete_workqueue)) { 1792 if (IS_ERR(gfs2_delete_workqueue)) {
1784 destroy_workqueue(glock_workqueue); 1793 destroy_workqueue(glock_workqueue);
1785 return PTR_ERR(gfs2_delete_workqueue); 1794 return PTR_ERR(gfs2_delete_workqueue);
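create_workqueue() is replaced above by alloc_workqueue(name, flags, max_active), where max_active == 0 selects the default concurrency and the flags match those used by the patch. A hedged sketch of the same pattern outside the glock code; the queue name and wrapper are made up:

static struct workqueue_struct *example_wq;

static int example_wq_init(void)
{
	example_wq = alloc_workqueue("example_wq",
				     WQ_RESCUER | WQ_FREEZEABLE, 0);
	if (!example_wq)	/* alloc_workqueue returns NULL on failure */
		return -ENOMEM;
	return 0;
}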
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2bda1911b156..db1c26d6d220 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
216 216
217/** 217/**
218 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock 218 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
219 * @gl: the glock 219 * @gl: the glock
220 * @state: the state we're requesting 220 * @state: the state we're requesting
221 * @flags: the modifier flags 221 * @flags: the modifier flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49f97d3bb690..0d149dcc04e5 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
262 const struct gfs2_inode *ip = gl->gl_object; 262 const struct gfs2_inode *ip = gl->gl_object;
263 if (ip == NULL) 263 if (ip == NULL)
264 return 0; 264 return 0;
265 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n", 265 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
266 (unsigned long long)ip->i_no_formal_ino, 266 (unsigned long long)ip->i_no_formal_ino,
267 (unsigned long long)ip->i_no_addr, 267 (unsigned long long)ip->i_no_addr,
268 IF2DT(ip->i_inode.i_mode), ip->i_flags, 268 IF2DT(ip->i_inode.i_mode), ip->i_flags,
269 (unsigned int)ip->i_diskflags, 269 (unsigned int)ip->i_diskflags,
270 (unsigned long long)ip->i_inode.i_size, 270 (unsigned long long)i_size_read(&ip->i_inode));
271 (unsigned long long)ip->i_disksize);
272 return 0; 271 return 0;
273} 272}
274 273
@@ -453,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = {
453 [LM_TYPE_META] = &gfs2_meta_glops, 452 [LM_TYPE_META] = &gfs2_meta_glops,
454 [LM_TYPE_INODE] = &gfs2_inode_glops, 453 [LM_TYPE_INODE] = &gfs2_inode_glops,
455 [LM_TYPE_RGRP] = &gfs2_rgrp_glops, 454 [LM_TYPE_RGRP] = &gfs2_rgrp_glops,
456 [LM_TYPE_NONDISK] = &gfs2_trans_glops,
457 [LM_TYPE_IOPEN] = &gfs2_iopen_glops, 455 [LM_TYPE_IOPEN] = &gfs2_iopen_glops,
458 [LM_TYPE_FLOCK] = &gfs2_flock_glops, 456 [LM_TYPE_FLOCK] = &gfs2_flock_glops,
459 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, 457 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index fdbf4b366fa5..764fbb49efc8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -196,6 +196,7 @@ enum {
196 GLF_REPLY_PENDING = 9, 196 GLF_REPLY_PENDING = 9,
197 GLF_INITIAL = 10, 197 GLF_INITIAL = 10,
198 GLF_FROZEN = 11, 198 GLF_FROZEN = 11,
199 GLF_QUEUED = 12,
199}; 200};
200 201
201struct gfs2_glock { 202struct gfs2_glock {
@@ -267,7 +268,6 @@ struct gfs2_inode {
267 u64 i_no_formal_ino; 268 u64 i_no_formal_ino;
268 u64 i_generation; 269 u64 i_generation;
269 u64 i_eattr; 270 u64 i_eattr;
270 loff_t i_disksize;
271 unsigned long i_flags; /* GIF_... */ 271 unsigned long i_flags; /* GIF_... */
272 struct gfs2_glock *i_gl; /* Move into i_gh? */ 272 struct gfs2_glock *i_gl; /* Move into i_gh? */
273 struct gfs2_holder i_iopen_gh; 273 struct gfs2_holder i_iopen_gh;
@@ -416,11 +416,8 @@ struct gfs2_args {
416 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 416 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
417 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ 417 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
418 unsigned int ar_spectator:1; /* Don't get a journal */ 418 unsigned int ar_spectator:1; /* Don't get a journal */
419 unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */
420 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ 419 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */
421 unsigned int ar_localcaching:1; /* Local caching */
422 unsigned int ar_debug:1; /* Oops on errors */ 420 unsigned int ar_debug:1; /* Oops on errors */
423 unsigned int ar_upgrade:1; /* Upgrade ondisk format */
424 unsigned int ar_posix_acl:1; /* Enable posix acls */ 421 unsigned int ar_posix_acl:1; /* Enable posix acls */
425 unsigned int ar_quota:2; /* off/account/on */ 422 unsigned int ar_quota:2; /* off/account/on */
426 unsigned int ar_suiddir:1; /* suiddir support */ 423 unsigned int ar_suiddir:1; /* suiddir support */
@@ -497,7 +494,7 @@ struct gfs2_sb_host {
497 */ 494 */
498 495
499struct lm_lockstruct { 496struct lm_lockstruct {
500 unsigned int ls_jid; 497 int ls_jid;
501 unsigned int ls_first; 498 unsigned int ls_first;
502 unsigned int ls_first_done; 499 unsigned int ls_first_done;
503 unsigned int ls_nodir; 500 unsigned int ls_nodir;
@@ -572,6 +569,7 @@ struct gfs2_sbd {
572 struct list_head sd_rindex_mru_list; 569 struct list_head sd_rindex_mru_list;
573 struct gfs2_rgrpd *sd_rindex_forward; 570 struct gfs2_rgrpd *sd_rindex_forward;
574 unsigned int sd_rgrps; 571 unsigned int sd_rgrps;
572 unsigned int sd_max_rg_data;
575 573
576 /* Journal index stuff */ 574 /* Journal index stuff */
577 575
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 08140f185a37..06370f8bd8cf 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
359 * to do that. 359 * to do that.
360 */ 360 */
361 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); 361 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
362 ip->i_disksize = be64_to_cpu(str->di_size); 362 i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
363 i_size_write(&ip->i_inode, ip->i_disksize);
364 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 363 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
365 atime.tv_sec = be64_to_cpu(str->di_atime); 364 atime.tv_sec = be64_to_cpu(str->di_atime);
366 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 365 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -1055,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1055 str->di_uid = cpu_to_be32(ip->i_inode.i_uid); 1054 str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
1056 str->di_gid = cpu_to_be32(ip->i_inode.i_gid); 1055 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
1057 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); 1056 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
1058 str->di_size = cpu_to_be64(ip->i_disksize); 1057 str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
1059 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 1058 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
1060 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); 1059 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1061 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); 1060 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1085,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
1085 (unsigned long long)ip->i_no_formal_ino); 1084 (unsigned long long)ip->i_no_formal_ino);
1086 printk(KERN_INFO " no_addr = %llu\n", 1085 printk(KERN_INFO " no_addr = %llu\n",
1087 (unsigned long long)ip->i_no_addr); 1086 (unsigned long long)ip->i_no_addr);
1088 printk(KERN_INFO " i_disksize = %llu\n", 1087 printk(KERN_INFO " i_size = %llu\n",
1089 (unsigned long long)ip->i_disksize); 1088 (unsigned long long)i_size_read(&ip->i_inode));
1090 printk(KERN_INFO " blocks = %llu\n", 1089 printk(KERN_INFO " blocks = %llu\n",
1091 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); 1090 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
1092 printk(KERN_INFO " i_goal = %llu\n", 1091 printk(KERN_INFO " i_goal = %llu\n",
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 300ada3f21de..6720d7d5fbc6 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
19extern int gfs2_internal_read(struct gfs2_inode *ip, 19extern int gfs2_internal_read(struct gfs2_inode *ip,
20 struct file_ra_state *ra_state, 20 struct file_ra_state *ra_state,
21 char *buf, loff_t *pos, unsigned size); 21 char *buf, loff_t *pos, unsigned size);
22extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
23 unsigned int from, unsigned int to);
22extern void gfs2_set_aops(struct inode *inode); 24extern void gfs2_set_aops(struct inode *inode);
23 25
24static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 26static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -80,6 +82,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
80 dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); 82 dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
81} 83}
82 84
85static inline int gfs2_check_internal_file_size(struct inode *inode,
86 u64 minsize, u64 maxsize)
87{
88 u64 size = i_size_read(inode);
89 if (size < minsize || size > maxsize)
90 goto err;
91 if (size & ((1 << inode->i_blkbits) - 1))
92 goto err;
93 return 0;
94err:
95 gfs2_consist_inode(GFS2_I(inode));
96 return -EIO;
97}
83 98
84extern void gfs2_set_iop(struct inode *inode); 99extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
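gfs2_check_internal_file_size() centralises the open-coded size sanity checks that the quota.c and super.c hunks below remove. A usage sketch matching the quota case, with the limits taken from that hunk; the wrapper function is illustrative only:

static int example_check_qc(struct gfs2_sbd *sdp)
{
	/* quota-change file must be 1 byte .. 64MB and block aligned */
	if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
		return -EIO;
	return 0;
}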
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e0470ed34c2..1c09425b45fd 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg)
42 ret |= LM_OUT_CANCELED; 42 ret |= LM_OUT_CANCELED;
43 goto out; 43 goto out;
44 case -EAGAIN: /* Try lock fails */ 44 case -EAGAIN: /* Try lock fails */
45 case -EDEADLK: /* Deadlock detected */
45 goto out; 46 goto out;
46 case -EINVAL: /* Invalid */ 47 case -ETIMEDOUT: /* Canceled due to timeout */
47 case -ENOMEM: /* Out of memory */
48 ret |= LM_OUT_ERROR; 48 ret |= LM_OUT_ERROR;
49 goto out; 49 goto out;
50 case 0: /* Success */ 50 case 0: /* Success */
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index b1e9630eb46a..d7eb1e209aa8 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,6 +24,7 @@
24#include "glock.h" 24#include "glock.h"
25#include "quota.h" 25#include "quota.h"
26#include "recovery.h" 26#include "recovery.h"
27#include "dir.h"
27 28
28static struct shrinker qd_shrinker = { 29static struct shrinker qd_shrinker = {
29 .shrink = gfs2_shrink_qd_memory, 30 .shrink = gfs2_shrink_qd_memory,
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void)
78{ 79{
79 int error; 80 int error;
80 81
82 gfs2_str2qstr(&gfs2_qdot, ".");
83 gfs2_str2qstr(&gfs2_qdotdot, "..");
84
81 error = gfs2_sys_init(); 85 error = gfs2_sys_init();
82 if (error) 86 if (error)
83 return error; 87 return error;
@@ -140,7 +144,7 @@ static int __init init_gfs2_fs(void)
140 144
141 error = -ENOMEM; 145 error = -ENOMEM;
142 gfs_recovery_wq = alloc_workqueue("gfs_recovery", 146 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
143 WQ_NON_REENTRANT | WQ_RESCUER, 0); 147 WQ_RESCUER | WQ_FREEZEABLE, 0);
144 if (!gfs_recovery_wq) 148 if (!gfs_recovery_wq)
145 goto fail_wq; 149 goto fail_wq;
146 150
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4d4b1e8ac64c..aeafc233dc89 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -38,14 +38,6 @@
38#define DO 0 38#define DO 0
39#define UNDO 1 39#define UNDO 1
40 40
41static const u32 gfs2_old_fs_formats[] = {
42 0
43};
44
45static const u32 gfs2_old_multihost_formats[] = {
46 0
47};
48
49/** 41/**
50 * gfs2_tune_init - Fill a gfs2_tune structure with default values 42 * gfs2_tune_init - Fill a gfs2_tune structure with default values
51 * @gt: tune 43 * @gt: tune
@@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
135 127
136static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) 128static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
137{ 129{
138 unsigned int x;
139
140 if (sb->sb_magic != GFS2_MAGIC || 130 if (sb->sb_magic != GFS2_MAGIC ||
141 sb->sb_type != GFS2_METATYPE_SB) { 131 sb->sb_type != GFS2_METATYPE_SB) {
142 if (!silent) 132 if (!silent)
@@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile
150 sb->sb_multihost_format == GFS2_FORMAT_MULTI) 140 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
151 return 0; 141 return 0;
152 142
153 if (sb->sb_fs_format != GFS2_FORMAT_FS) { 143 fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
154 for (x = 0; gfs2_old_fs_formats[x]; x++)
155 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
156 break;
157 144
158 if (!gfs2_old_fs_formats[x]) { 145 return -EINVAL;
159 printk(KERN_WARNING
160 "GFS2: code version (%u, %u) is incompatible "
161 "with ondisk format (%u, %u)\n",
162 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
163 sb->sb_fs_format, sb->sb_multihost_format);
164 printk(KERN_WARNING
165 "GFS2: I don't know how to upgrade this FS\n");
166 return -EINVAL;
167 }
168 }
169
170 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
171 for (x = 0; gfs2_old_multihost_formats[x]; x++)
172 if (gfs2_old_multihost_formats[x] ==
173 sb->sb_multihost_format)
174 break;
175
176 if (!gfs2_old_multihost_formats[x]) {
177 printk(KERN_WARNING
178 "GFS2: code version (%u, %u) is incompatible "
179 "with ondisk format (%u, %u)\n",
180 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
181 sb->sb_fs_format, sb->sb_multihost_format);
182 printk(KERN_WARNING
183 "GFS2: I don't know how to upgrade this FS\n");
184 return -EINVAL;
185 }
186 }
187
188 if (!sdp->sd_args.ar_upgrade) {
189 printk(KERN_WARNING
190 "GFS2: code version (%u, %u) is incompatible "
191 "with ondisk format (%u, %u)\n",
192 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
193 sb->sb_fs_format, sb->sb_multihost_format);
194 printk(KERN_INFO
195 "GFS2: Use the \"upgrade\" mount option to upgrade "
196 "the FS\n");
197 printk(KERN_INFO "GFS2: See the manual for more details\n");
198 return -EINVAL;
199 }
200
201 return 0;
202} 146}
203 147
204static void end_bio_io_page(struct bio *bio, int error) 148static void end_bio_io_page(struct bio *bio, int error)
@@ -586,7 +530,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
586 530
587 prev_db = 0; 531 prev_db = 0;
588 532
589 for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) { 533 for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
590 bh.b_state = 0; 534 bh.b_state = 0;
591 bh.b_blocknr = 0; 535 bh.b_blocknr = 0;
592 bh.b_size = 1 << ip->i_inode.i_blkbits; 536 bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -1022,7 +966,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1022 if (!strcmp("lock_nolock", proto)) { 966 if (!strcmp("lock_nolock", proto)) {
1023 lm = &nolock_ops; 967 lm = &nolock_ops;
1024 sdp->sd_args.ar_localflocks = 1; 968 sdp->sd_args.ar_localflocks = 1;
1025 sdp->sd_args.ar_localcaching = 1;
1026#ifdef CONFIG_GFS2_FS_LOCKING_DLM 969#ifdef CONFIG_GFS2_FS_LOCKING_DLM
1027 } else if (!strcmp("lock_dlm", proto)) { 970 } else if (!strcmp("lock_dlm", proto)) {
1028 lm = &gfs2_dlm_ops; 971 lm = &gfs2_dlm_ops;
@@ -1113,8 +1056,6 @@ static int gfs2_journalid_wait(void *word)
1113 1056
1114static int wait_on_journal(struct gfs2_sbd *sdp) 1057static int wait_on_journal(struct gfs2_sbd *sdp)
1115{ 1058{
1116 if (sdp->sd_args.ar_spectator)
1117 return 0;
1118 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) 1059 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
1119 return 0; 1060 return 0;
1120 1061
@@ -1217,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1217 if (error) 1158 if (error)
1218 goto fail_sb; 1159 goto fail_sb;
1219 1160
1161 /*
1162 * If user space has failed to join the cluster or some similar
1163 * failure has occurred, then the journal id will contain a
1164 * negative (error) number. This will then be returned to the
1165 * caller (of the mount syscall). We do this even for spectator
1166 * mounts (which just write a jid of 0 to indicate "ok" even though
1167 * the jid is unused in the spectator case)
1168 */
1169 if (sdp->sd_lockstruct.ls_jid < 0) {
1170 error = sdp->sd_lockstruct.ls_jid;
1171 sdp->sd_lockstruct.ls_jid = 0;
1172 goto fail_sb;
1173 }
1174
1220 error = init_inodes(sdp, DO); 1175 error = init_inodes(sdp, DO);
1221 if (error) 1176 if (error)
1222 goto fail_sb; 1177 goto fail_sb;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1009be2c9737..0534510200d5 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,6 +18,8 @@
18#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
19#include <linux/crc32.h> 19#include <linux/crc32.h>
20#include <linux/fiemap.h> 20#include <linux/fiemap.h>
21#include <linux/swap.h>
22#include <linux/falloc.h>
21#include <asm/uaccess.h> 23#include <asm/uaccess.h>
22 24
23#include "gfs2.h" 25#include "gfs2.h"
@@ -217,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
217 goto out_gunlock_q; 219 goto out_gunlock_q;
218 220
219 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 221 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
220 al->al_rgd->rd_length + 222 gfs2_rg_blocks(al) +
221 2 * RES_DINODE + RES_STATFS + 223 2 * RES_DINODE + RES_STATFS +
222 RES_QUOTA, 0); 224 RES_QUOTA, 0);
223 if (error) 225 if (error)
@@ -406,7 +408,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
406 408
407 ip = ghs[1].gh_gl->gl_object; 409 ip = ghs[1].gh_gl->gl_object;
408 410
409 ip->i_disksize = size;
410 i_size_write(inode, size); 411 i_size_write(inode, size);
411 412
412 error = gfs2_meta_inode_buffer(ip, &dibh); 413 error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -461,7 +462,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
461 ip = ghs[1].gh_gl->gl_object; 462 ip = ghs[1].gh_gl->gl_object;
462 463
463 ip->i_inode.i_nlink = 2; 464 ip->i_inode.i_nlink = 2;
464 ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); 465 i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
465 ip->i_diskflags |= GFS2_DIF_JDATA; 466 ip->i_diskflags |= GFS2_DIF_JDATA;
466 ip->i_entries = 2; 467 ip->i_entries = 2;
467 468
@@ -470,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
470 if (!gfs2_assert_withdraw(sdp, !error)) { 471 if (!gfs2_assert_withdraw(sdp, !error)) {
471 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; 472 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
472 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); 473 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
473 struct qstr str;
474 474
475 gfs2_str2qstr(&str, ".");
476 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 475 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
477 gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent); 476 gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
478 dent->de_inum = di->di_num; /* already GFS2 endian */ 477 dent->de_inum = di->di_num; /* already GFS2 endian */
479 dent->de_type = cpu_to_be16(DT_DIR); 478 dent->de_type = cpu_to_be16(DT_DIR);
480 di->di_entries = cpu_to_be32(1); 479 di->di_entries = cpu_to_be32(1);
481 480
482 gfs2_str2qstr(&str, "..");
483 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); 481 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
484 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); 482 gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
485 483
486 gfs2_inum_out(dip, dent); 484 gfs2_inum_out(dip, dent);
487 dent->de_type = cpu_to_be16(DT_DIR); 485 dent->de_type = cpu_to_be16(DT_DIR);
@@ -522,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
522static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, 520static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
523 struct gfs2_inode *ip) 521 struct gfs2_inode *ip)
524{ 522{
525 struct qstr dotname;
526 int error; 523 int error;
527 524
528 if (ip->i_entries != 2) { 525 if (ip->i_entries != 2) {
@@ -539,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
539 if (error) 536 if (error)
540 return error; 537 return error;
541 538
542 gfs2_str2qstr(&dotname, "."); 539 error = gfs2_dir_del(ip, &gfs2_qdot);
543 error = gfs2_dir_del(ip, &dotname);
544 if (error) 540 if (error)
545 return error; 541 return error;
546 542
547 gfs2_str2qstr(&dotname, ".."); 543 error = gfs2_dir_del(ip, &gfs2_qdotdot);
548 error = gfs2_dir_del(ip, &dotname);
549 if (error) 544 if (error)
550 return error; 545 return error;
551 546
@@ -694,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
694 struct inode *dir = &to->i_inode; 689 struct inode *dir = &to->i_inode;
695 struct super_block *sb = dir->i_sb; 690 struct super_block *sb = dir->i_sb;
696 struct inode *tmp; 691 struct inode *tmp;
697 struct qstr dotdot;
698 int error = 0; 692 int error = 0;
699 693
700 gfs2_str2qstr(&dotdot, "..");
701
702 igrab(dir); 694 igrab(dir);
703 695
704 for (;;) { 696 for (;;) {
@@ -711,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
711 break; 703 break;
712 } 704 }
713 705
714 tmp = gfs2_lookupi(dir, &dotdot, 1); 706 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
715 if (IS_ERR(tmp)) { 707 if (IS_ERR(tmp)) {
716 error = PTR_ERR(tmp); 708 error = PTR_ERR(tmp);
717 break; 709 break;
@@ -744,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
744 struct gfs2_inode *ip = GFS2_I(odentry->d_inode); 736 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
745 struct gfs2_inode *nip = NULL; 737 struct gfs2_inode *nip = NULL;
746 struct gfs2_sbd *sdp = GFS2_SB(odir); 738 struct gfs2_sbd *sdp = GFS2_SB(odir);
747 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; 739 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
748 struct gfs2_rgrpd *nrgd; 740 struct gfs2_rgrpd *nrgd;
749 unsigned int num_gh; 741 unsigned int num_gh;
750 int dir_rename = 0; 742 int dir_rename = 0;
@@ -758,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
758 return 0; 750 return 0;
759 } 751 }
760 752
753 error = gfs2_rindex_hold(sdp, &ri_gh);
754 if (error)
755 return error;
761 756
762 if (odip != ndip) { 757 if (odip != ndip) {
763 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 758 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
@@ -887,12 +882,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
887 882
888 al->al_requested = sdp->sd_max_dirres; 883 al->al_requested = sdp->sd_max_dirres;
889 884
890 error = gfs2_inplace_reserve(ndip); 885 error = gfs2_inplace_reserve_ri(ndip);
891 if (error) 886 if (error)
892 goto out_gunlock_q; 887 goto out_gunlock_q;
893 888
894 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 889 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
895 al->al_rgd->rd_length + 890 gfs2_rg_blocks(al) +
896 4 * RES_DINODE + 4 * RES_LEAF + 891 4 * RES_DINODE + 4 * RES_LEAF +
897 RES_STATFS + RES_QUOTA + 4, 0); 892 RES_STATFS + RES_QUOTA + 4, 0);
898 if (error) 893 if (error)
@@ -920,9 +915,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
920 } 915 }
921 916
922 if (dir_rename) { 917 if (dir_rename) {
923 struct qstr name;
924 gfs2_str2qstr(&name, "..");
925
926 error = gfs2_change_nlink(ndip, +1); 918 error = gfs2_change_nlink(ndip, +1);
927 if (error) 919 if (error)
928 goto out_end_trans; 920 goto out_end_trans;
@@ -930,7 +922,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
930 if (error) 922 if (error)
931 goto out_end_trans; 923 goto out_end_trans;
932 924
933 error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR); 925 error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
934 if (error) 926 if (error)
935 goto out_end_trans; 927 goto out_end_trans;
936 } else { 928 } else {
@@ -972,6 +964,7 @@ out_gunlock_r:
972 if (r_gh.gh_gl) 964 if (r_gh.gh_gl)
973 gfs2_glock_dq_uninit(&r_gh); 965 gfs2_glock_dq_uninit(&r_gh);
974out: 966out:
967 gfs2_glock_dq_uninit(&ri_gh);
975 return error; 968 return error;
976} 969}
977 970
@@ -990,7 +983,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
990 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 983 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
991 struct gfs2_holder i_gh; 984 struct gfs2_holder i_gh;
992 struct buffer_head *dibh; 985 struct buffer_head *dibh;
993 unsigned int x; 986 unsigned int x, size;
994 char *buf; 987 char *buf;
995 int error; 988 int error;
996 989
@@ -1002,7 +995,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1002 return NULL; 995 return NULL;
1003 } 996 }
1004 997
1005 if (!ip->i_disksize) { 998 size = (unsigned int)i_size_read(&ip->i_inode);
999 if (size == 0) {
1006 gfs2_consist_inode(ip); 1000 gfs2_consist_inode(ip);
1007 buf = ERR_PTR(-EIO); 1001 buf = ERR_PTR(-EIO);
1008 goto out; 1002 goto out;
@@ -1014,7 +1008,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1014 goto out; 1008 goto out;
1015 } 1009 }
1016 1010
1017 x = ip->i_disksize + 1; 1011 x = size + 1;
1018 buf = kmalloc(x, GFP_NOFS); 1012 buf = kmalloc(x, GFP_NOFS);
1019 if (!buf) 1013 if (!buf)
1020 buf = ERR_PTR(-ENOMEM); 1014 buf = ERR_PTR(-ENOMEM);
@@ -1071,30 +1065,6 @@ int gfs2_permission(struct inode *inode, int mask)
1071 return error; 1065 return error;
1072} 1066}
1073 1067
1074/*
1075 * XXX(truncate): the truncate_setsize calls should be moved to the end.
1076 */
1077static int setattr_size(struct inode *inode, struct iattr *attr)
1078{
1079 struct gfs2_inode *ip = GFS2_I(inode);
1080 struct gfs2_sbd *sdp = GFS2_SB(inode);
1081 int error;
1082
1083 if (attr->ia_size != ip->i_disksize) {
1084 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1085 if (error)
1086 return error;
1087 truncate_setsize(inode, attr->ia_size);
1088 gfs2_trans_end(sdp);
1089 }
1090
1091 error = gfs2_truncatei(ip, attr->ia_size);
1092 if (error && (inode->i_size != ip->i_disksize))
1093 i_size_write(inode, ip->i_disksize);
1094
1095 return error;
1096}
1097
1098static int setattr_chown(struct inode *inode, struct iattr *attr) 1068static int setattr_chown(struct inode *inode, struct iattr *attr)
1099{ 1069{
1100 struct gfs2_inode *ip = GFS2_I(inode); 1070 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1195,7 +1165,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
1195 goto out; 1165 goto out;
1196 1166
1197 if (attr->ia_valid & ATTR_SIZE) 1167 if (attr->ia_valid & ATTR_SIZE)
1198 error = setattr_size(inode, attr); 1168 error = gfs2_setattr_size(inode, attr->ia_size);
1199 else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) 1169 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1200 error = setattr_chown(inode, attr); 1170 error = setattr_chown(inode, attr);
1201 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) 1171 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
@@ -1301,6 +1271,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1301 return ret; 1271 return ret;
1302} 1272}
1303 1273
1274static void empty_write_end(struct page *page, unsigned from,
1275 unsigned to)
1276{
1277 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
1278
1279 page_zero_new_buffers(page, from, to);
1280 flush_dcache_page(page);
1281 mark_page_accessed(page);
1282
1283 if (!gfs2_is_writeback(ip))
1284 gfs2_page_add_databufs(ip, page, from, to);
1285
1286 block_commit_write(page, from, to);
1287}
1288
1289
1290static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
1291{
1292 unsigned start, end, next;
1293 struct buffer_head *bh, *head;
1294 int error;
1295
1296 if (!page_has_buffers(page)) {
1297 error = block_prepare_write(page, from, to, gfs2_block_map);
1298 if (unlikely(error))
1299 return error;
1300
1301 empty_write_end(page, from, to);
1302 return 0;
1303 }
1304
1305 bh = head = page_buffers(page);
1306 next = end = 0;
1307 while (next < from) {
1308 next += bh->b_size;
1309 bh = bh->b_this_page;
1310 }
1311 start = next;
1312 do {
1313 next += bh->b_size;
1314 if (buffer_mapped(bh)) {
1315 if (end) {
1316 error = block_prepare_write(page, start, end,
1317 gfs2_block_map);
1318 if (unlikely(error))
1319 return error;
1320 empty_write_end(page, start, end);
1321 end = 0;
1322 }
1323 start = next;
1324 }
1325 else
1326 end = next;
1327 bh = bh->b_this_page;
1328 } while (next < to);
1329
1330 if (end) {
1331 error = block_prepare_write(page, start, end, gfs2_block_map);
1332 if (unlikely(error))
1333 return error;
1334 empty_write_end(page, start, end);
1335 }
1336
1337 return 0;
1338}
1339
1340static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
1341 int mode)
1342{
1343 struct gfs2_inode *ip = GFS2_I(inode);
1344 struct buffer_head *dibh;
1345 int error;
1346 u64 start = offset >> PAGE_CACHE_SHIFT;
1347 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
1348 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
1349 pgoff_t curr;
1350 struct page *page;
1351 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
1352 unsigned int from, to;
1353
1354 if (!end_offset)
1355 end_offset = PAGE_CACHE_SIZE;
1356
1357 error = gfs2_meta_inode_buffer(ip, &dibh);
1358 if (unlikely(error))
1359 goto out;
1360
1361 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1362
1363 if (gfs2_is_stuffed(ip)) {
1364 error = gfs2_unstuff_dinode(ip, NULL);
1365 if (unlikely(error))
1366 goto out;
1367 }
1368
1369 curr = start;
1370 offset = start << PAGE_CACHE_SHIFT;
1371 from = start_offset;
1372 to = PAGE_CACHE_SIZE;
1373 while (curr <= end) {
1374 page = grab_cache_page_write_begin(inode->i_mapping, curr,
1375 AOP_FLAG_NOFS);
1376 if (unlikely(!page)) {
1377 error = -ENOMEM;
1378 goto out;
1379 }
1380
1381 if (curr == end)
1382 to = end_offset;
1383 error = write_empty_blocks(page, from, to);
1384 if (!error && offset + to > inode->i_size &&
1385 !(mode & FALLOC_FL_KEEP_SIZE)) {
1386 i_size_write(inode, offset + to);
1387 }
1388 unlock_page(page);
1389 page_cache_release(page);
1390 if (error)
1391 goto out;
1392 curr++;
1393 offset += PAGE_CACHE_SIZE;
1394 from = 0;
1395 }
1396
1397 gfs2_dinode_out(ip, dibh->b_data);
1398 mark_inode_dirty(inode);
1399
1400 brelse(dibh);
1401
1402out:
1403 return error;
1404}
1405
1406static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
1407 unsigned int *data_blocks, unsigned int *ind_blocks)
1408{
1409 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1410 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
1411 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
1412
1413 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
1414 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1415 max_data -= tmp;
1416 }
1417 /* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
1418 so it might end up with fewer data blocks */
1419 if (max_data <= *data_blocks)
1420 return;
1421 *data_blocks = max_data;
1422 *ind_blocks = max_blocks - max_data;
1423 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
1424 if (*len > max) {
1425 *len = max;
1426 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
1427 }
1428}
1429
1430static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1431 loff_t len)
1432{
1433 struct gfs2_sbd *sdp = GFS2_SB(inode);
1434 struct gfs2_inode *ip = GFS2_I(inode);
1435 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
1436 loff_t bytes, max_bytes;
1437 struct gfs2_alloc *al;
1438 int error;
1439 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
1440 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
1441
1442 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
1443 sdp->sd_sb.sb_bsize_shift;
1444
1445 len = next - offset;
1446 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
1447 if (!bytes)
1448 bytes = UINT_MAX;
1449
1450 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
1451 error = gfs2_glock_nq(&ip->i_gh);
1452 if (unlikely(error))
1453 goto out_uninit;
1454
1455 if (!gfs2_write_alloc_required(ip, offset, len))
1456 goto out_unlock;
1457
1458 while (len > 0) {
1459 if (len < bytes)
1460 bytes = len;
1461 al = gfs2_alloc_get(ip);
1462 if (!al) {
1463 error = -ENOMEM;
1464 goto out_unlock;
1465 }
1466
1467 error = gfs2_quota_lock_check(ip);
1468 if (error)
1469 goto out_alloc_put;
1470
1471retry:
1472 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
1473
1474 al->al_requested = data_blocks + ind_blocks;
1475 error = gfs2_inplace_reserve(ip);
1476 if (error) {
1477 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
1478 bytes >>= 1;
1479 goto retry;
1480 }
1481 goto out_qunlock;
1482 }
1483 max_bytes = bytes;
1484 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
1485 al->al_requested = data_blocks + ind_blocks;
1486
1487 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
1488 RES_RG_HDR + gfs2_rg_blocks(al);
1489 if (gfs2_is_jdata(ip))
1490 rblocks += data_blocks ? data_blocks : 1;
1491
1492 error = gfs2_trans_begin(sdp, rblocks,
1493 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
1494 if (error)
1495 goto out_trans_fail;
1496
1497 error = fallocate_chunk(inode, offset, max_bytes, mode);
1498 gfs2_trans_end(sdp);
1499
1500 if (error)
1501 goto out_trans_fail;
1502
1503 len -= max_bytes;
1504 offset += max_bytes;
1505 gfs2_inplace_release(ip);
1506 gfs2_quota_unlock(ip);
1507 gfs2_alloc_put(ip);
1508 }
1509 goto out_unlock;
1510
1511out_trans_fail:
1512 gfs2_inplace_release(ip);
1513out_qunlock:
1514 gfs2_quota_unlock(ip);
1515out_alloc_put:
1516 gfs2_alloc_put(ip);
1517out_unlock:
1518 gfs2_glock_dq(&ip->i_gh);
1519out_uninit:
1520 gfs2_holder_uninit(&ip->i_gh);
1521 return error;
1522}
1523
1524
1304static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1525static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1305 u64 start, u64 len) 1526 u64 start, u64 len)
1306{ 1527{
@@ -1351,6 +1572,7 @@ const struct inode_operations gfs2_file_iops = {
1351 .getxattr = gfs2_getxattr, 1572 .getxattr = gfs2_getxattr,
1352 .listxattr = gfs2_listxattr, 1573 .listxattr = gfs2_listxattr,
1353 .removexattr = gfs2_removexattr, 1574 .removexattr = gfs2_removexattr,
1575 .fallocate = gfs2_fallocate,
1354 .fiemap = gfs2_fiemap, 1576 .fiemap = gfs2_fiemap,
1355}; 1577};
1356 1578
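The new gfs2_fallocate() preallocates in chunks, halving the request on -ENOSPC and growing i_size only when FALLOC_FL_KEEP_SIZE is not set. From user space the operation is reached through the fallocate(2) system call; a minimal sketch, with an arbitrary path and size, not part of the patch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	int fd = open("/mnt/gfs2/prealloc.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	/* reserve 16MB of blocks without changing the visible file size */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) != 0)
		perror("fallocate");
	return 0;
}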
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 1bc6b5695e6d..58a9b9998b42 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,10 +735,8 @@ get_a_page:
735 goto out; 735 goto out;
736 736
737 size = loc + sizeof(struct gfs2_quota); 737 size = loc + sizeof(struct gfs2_quota);
738 if (size > inode->i_size) { 738 if (size > inode->i_size)
739 ip->i_disksize = size;
740 i_size_write(inode, size); 739 i_size_write(inode, size);
741 }
742 inode->i_mtime = inode->i_atime = CURRENT_TIME; 740 inode->i_mtime = inode->i_atime = CURRENT_TIME;
743 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 741 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
744 gfs2_dinode_out(ip, dibh->b_data); 742 gfs2_dinode_out(ip, dibh->b_data);
@@ -817,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
817 goto out_alloc; 815 goto out_alloc;
818 816
819 if (nalloc) 817 if (nalloc)
820 blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS; 818 blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
821 819
822 error = gfs2_trans_begin(sdp, blocks, 0); 820 error = gfs2_trans_begin(sdp, blocks, 0);
823 if (error) 821 if (error)
@@ -1190,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
1190int gfs2_quota_init(struct gfs2_sbd *sdp) 1188int gfs2_quota_init(struct gfs2_sbd *sdp)
1191{ 1189{
1192 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1190 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1193 unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; 1191 u64 size = i_size_read(sdp->sd_qc_inode);
1192 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
1194 unsigned int x, slot = 0; 1193 unsigned int x, slot = 0;
1195 unsigned int found = 0; 1194 unsigned int found = 0;
1196 u64 dblock; 1195 u64 dblock;
1197 u32 extlen = 0; 1196 u32 extlen = 0;
1198 int error; 1197 int error;
1199 1198
1200 if (!ip->i_disksize || ip->i_disksize > (64 << 20) || 1199 if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
1201 ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
1202 gfs2_consist_inode(ip);
1203 return -EIO; 1200 return -EIO;
1204 } 1201
1205 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; 1202 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1206 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); 1203 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1207 1204
@@ -1589,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1589 error = gfs2_inplace_reserve(ip); 1586 error = gfs2_inplace_reserve(ip);
1590 if (error) 1587 if (error)
1591 goto out_alloc; 1588 goto out_alloc;
1589 blocks += gfs2_rg_blocks(al);
1592 } 1590 }
1593 1591
1594 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); 1592 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f7f89a94a5a4..f2a02edcac8f 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -455,11 +455,13 @@ void gfs2_recover_func(struct work_struct *work)
455 int ro = 0; 455 int ro = 0;
456 unsigned int pass; 456 unsigned int pass;
457 int error; 457 int error;
458 int jlocked = 0;
458 459
459 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { 460 if (sdp->sd_args.ar_spectator ||
461 (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
460 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", 462 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
461 jd->jd_jid); 463 jd->jd_jid);
462 464 jlocked = 1;
463 /* Acquire the journal lock so we can do recovery */ 465 /* Acquire the journal lock so we can do recovery */
464 466
465 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, 467 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
@@ -554,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work)
554 jd->jd_jid, t); 556 jd->jd_jid, t);
555 } 557 }
556 558
557 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
558 gfs2_glock_dq_uninit(&ji_gh);
559
560 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); 559 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
561 560
562 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) 561 if (jlocked) {
562 gfs2_glock_dq_uninit(&ji_gh);
563 gfs2_glock_dq_uninit(&j_gh); 563 gfs2_glock_dq_uninit(&j_gh);
564 }
564 565
565 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); 566 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
566 goto done; 567 goto done;
@@ -568,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work)
568fail_gunlock_tr: 569fail_gunlock_tr:
569 gfs2_glock_dq_uninit(&t_gh); 570 gfs2_glock_dq_uninit(&t_gh);
570fail_gunlock_ji: 571fail_gunlock_ji:
571 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { 572 if (jlocked) {
572 gfs2_glock_dq_uninit(&ji_gh); 573 gfs2_glock_dq_uninit(&ji_gh);
573fail_gunlock_j: 574fail_gunlock_j:
574 gfs2_glock_dq_uninit(&j_gh); 575 gfs2_glock_dq_uninit(&j_gh);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 171a744f8e45..fb67f593f408 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
500 for (rgrps = 0;; rgrps++) { 500 for (rgrps = 0;; rgrps++) {
501 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 501 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
502 502
503 if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize) 503 if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
504 break; 504 break;
505 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 505 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
506 sizeof(struct gfs2_rindex)); 506 sizeof(struct gfs2_rindex));
@@ -588,7 +588,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
589 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
590 struct file_ra_state ra_state; 590 struct file_ra_state ra_state;
591 u64 rgrp_count = ip->i_disksize; 591 u64 rgrp_count = i_size_read(inode);
592 struct gfs2_rgrpd *rgd;
593 unsigned int max_data = 0;
592 int error; 594 int error;
593 595
594 do_div(rgrp_count, sizeof(struct gfs2_rindex)); 596 do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
603 } 605 }
604 } 606 }
605 607
608 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
609 if (rgd->rd_data > max_data)
610 max_data = rgd->rd_data;
611 sdp->sd_max_rg_data = max_data;
606 sdp->sd_rindex_uptodate = 1; 612 sdp->sd_rindex_uptodate = 1;
607 return 0; 613 return 0;
608} 614}
@@ -622,13 +628,15 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
622 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 628 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
623 struct inode *inode = &ip->i_inode; 629 struct inode *inode = &ip->i_inode;
624 struct file_ra_state ra_state; 630 struct file_ra_state ra_state;
631 struct gfs2_rgrpd *rgd;
632 unsigned int max_data = 0;
625 int error; 633 int error;
626 634
627 file_ra_state_init(&ra_state, inode->i_mapping); 635 file_ra_state_init(&ra_state, inode->i_mapping);
628 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { 636 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
629 /* Ignore partials */ 637 /* Ignore partials */
630 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > 638 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
631 ip->i_disksize) 639 i_size_read(inode))
632 break; 640 break;
633 error = read_rindex_entry(ip, &ra_state); 641 error = read_rindex_entry(ip, &ra_state);
634 if (error) { 642 if (error) {
@@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
636 return error; 644 return error;
637 } 645 }
638 } 646 }
647 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
648 if (rgd->rd_data > max_data)
649 max_data = rgd->rd_data;
650 sdp->sd_max_rg_data = max_data;
639 651
640 sdp->sd_rindex_uptodate = 1; 652 sdp->sd_rindex_uptodate = 1;
641 return 0; 653 return 0;
@@ -1188,7 +1200,8 @@ out:
1188 * Returns: errno 1200 * Returns: errno
1189 */ 1201 */
1190 1202
1191int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) 1203int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
1204 char *file, unsigned int line)
1192{ 1205{
1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1206 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1194 struct gfs2_alloc *al = ip->i_alloc; 1207 struct gfs2_alloc *al = ip->i_alloc;
@@ -1199,12 +1212,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1199 return -EINVAL; 1212 return -EINVAL;
1200 1213
1201try_again: 1214try_again:
1202 /* We need to hold the rindex unless the inode we're using is 1215 if (hold_rindex) {
1203 the rindex itself, in which case it's already held. */ 1216 /* We need to hold the rindex unless the inode we're using is
1204 if (ip != GFS2_I(sdp->sd_rindex)) 1217 the rindex itself, in which case it's already held. */
1205 error = gfs2_rindex_hold(sdp, &al->al_ri_gh); 1218 if (ip != GFS2_I(sdp->sd_rindex))
1206 else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ 1219 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
1207 error = gfs2_ri_update_special(ip); 1220 else if (!sdp->sd_rgrps) /* We may not have the rindex read
1221 in, so: */
1222 error = gfs2_ri_update_special(ip);
1223 }
1208 1224
1209 if (error) 1225 if (error)
1210 return error; 1226 return error;
@@ -1215,7 +1231,7 @@ try_again:
1215 try to free it, and try the allocation again. */ 1231 try to free it, and try the allocation again. */
1216 error = get_local_rgrp(ip, &unlinked, &last_unlinked); 1232 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1217 if (error) { 1233 if (error) {
1218 if (ip != GFS2_I(sdp->sd_rindex)) 1234 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
1219 gfs2_glock_dq_uninit(&al->al_ri_gh); 1235 gfs2_glock_dq_uninit(&al->al_ri_gh);
1220 if (error != -EAGAIN) 1236 if (error != -EAGAIN)
1221 return error; 1237 return error;
@@ -1257,7 +1273,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1257 al->al_rgd = NULL; 1273 al->al_rgd = NULL;
1258 if (al->al_rgd_gh.gh_gl) 1274 if (al->al_rgd_gh.gh_gl)
1259 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1275 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1260 if (ip != GFS2_I(sdp->sd_rindex)) 1276 if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
1261 gfs2_glock_dq_uninit(&al->al_ri_gh); 1277 gfs2_glock_dq_uninit(&al->al_ri_gh);
1262} 1278}
1263 1279
@@ -1496,11 +1512,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
1496 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1512 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1497 struct buffer_head *dibh; 1513 struct buffer_head *dibh;
1498 struct gfs2_alloc *al = ip->i_alloc; 1514 struct gfs2_alloc *al = ip->i_alloc;
1499 struct gfs2_rgrpd *rgd = al->al_rgd; 1515 struct gfs2_rgrpd *rgd;
1500 u32 goal, blk; 1516 u32 goal, blk;
1501 u64 block; 1517 u64 block;
1502 int error; 1518 int error;
1503 1519
1520 /* Only happens if there is a bug in gfs2, return something distinctive
1521 * to ensure that it is noticed.
1522 */
1523 if (al == NULL)
1524 return -ECANCELED;
1525
1526 rgd = al->al_rgd;
1527
1504 if (rgrp_contains_block(rgd, ip->i_goal)) 1528 if (rgrp_contains_block(rgd, ip->i_goal))
1505 goal = ip->i_goal - rgd->rd_data0; 1529 goal = ip->i_goal - rgd->rd_data0;
1506 else 1530 else
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index f07119d89557..0e35c0466f9a 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
39 ip->i_alloc = NULL; 39 ip->i_alloc = NULL;
40} 40}
41 41
42extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, 42extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
43 unsigned int line); 43 char *file, unsigned int line);
44#define gfs2_inplace_reserve(ip) \ 44#define gfs2_inplace_reserve(ip) \
45gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) 45 gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
46#define gfs2_inplace_reserve_ri(ip) \
47 gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
46 48
47extern void gfs2_inplace_release(struct gfs2_inode *ip); 49extern void gfs2_inplace_release(struct gfs2_inode *ip);
48 50
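gfs2_inplace_reserve_ri() is for callers that already hold the rindex glock, as gfs2_rename() now does via gfs2_rindex_hold() in the ops_inode.c hunk above; the plain gfs2_inplace_reserve() macro keeps the old behaviour of taking the rindex itself. A sketch of the calling convention, assuming ip->i_alloc has already been set up with gfs2_alloc_get(); the wrapper function is illustrative only:

static int example_reserve(struct gfs2_sbd *sdp, struct gfs2_inode *ip,
			   struct gfs2_holder *ri_gh)
{
	int error;

	error = gfs2_rindex_hold(sdp, ri_gh);	/* rindex taken here ... */
	if (error)
		return error;

	error = gfs2_inplace_reserve_ri(ip);	/* ... so do not re-take it */
	if (error)
		gfs2_glock_dq_uninit(ri_gh);
	return error;
}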
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 77cb9f830ee4..047d1176096c 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -85,6 +85,7 @@ static const match_table_t tokens = {
85 {Opt_locktable, "locktable=%s"}, 85 {Opt_locktable, "locktable=%s"},
86 {Opt_hostdata, "hostdata=%s"}, 86 {Opt_hostdata, "hostdata=%s"},
87 {Opt_spectator, "spectator"}, 87 {Opt_spectator, "spectator"},
88 {Opt_spectator, "norecovery"},
88 {Opt_ignore_local_fs, "ignore_local_fs"}, 89 {Opt_ignore_local_fs, "ignore_local_fs"},
89 {Opt_localflocks, "localflocks"}, 90 {Opt_localflocks, "localflocks"},
90 {Opt_localcaching, "localcaching"}, 91 {Opt_localcaching, "localcaching"},
@@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
159 args->ar_spectator = 1; 160 args->ar_spectator = 1;
160 break; 161 break;
161 case Opt_ignore_local_fs: 162 case Opt_ignore_local_fs:
162 args->ar_ignore_local_fs = 1; 163 /* Retained for backwards compat only */
163 break; 164 break;
164 case Opt_localflocks: 165 case Opt_localflocks:
165 args->ar_localflocks = 1; 166 args->ar_localflocks = 1;
166 break; 167 break;
167 case Opt_localcaching: 168 case Opt_localcaching:
168 args->ar_localcaching = 1; 169 /* Retained for backwards compat only */
169 break; 170 break;
170 case Opt_debug: 171 case Opt_debug:
171 if (args->ar_errors == GFS2_ERRORS_PANIC) { 172 if (args->ar_errors == GFS2_ERRORS_PANIC) {
@@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
179 args->ar_debug = 0; 180 args->ar_debug = 0;
180 break; 181 break;
181 case Opt_upgrade: 182 case Opt_upgrade:
182 args->ar_upgrade = 1; 183 /* Retained for backwards compat only */
183 break; 184 break;
184 case Opt_acl: 185 case Opt_acl:
185 args->ar_posix_acl = 1; 186 args->ar_posix_acl = 1;
@@ -342,15 +343,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
342{ 343{
343 struct gfs2_inode *ip = GFS2_I(jd->jd_inode); 344 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
344 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); 345 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
346 u64 size = i_size_read(jd->jd_inode);
345 347
346 if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) || 348 if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
347 (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
348 gfs2_consist_inode(ip);
349 return -EIO; 349 return -EIO;
350 }
351 jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
352 350
353 if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) { 351 jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
352
353 if (gfs2_write_alloc_required(ip, 0, size)) {
354 gfs2_consist_inode(ip); 354 gfs2_consist_inode(ip);
355 return -EIO; 355 return -EIO;
356 } 356 }
@@ -1129,9 +1129,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1129 1129
1130 /* Some flags must not be changed */ 1130 /* Some flags must not be changed */
1131 if (args_neq(&args, &sdp->sd_args, spectator) || 1131 if (args_neq(&args, &sdp->sd_args, spectator) ||
1132 args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
1133 args_neq(&args, &sdp->sd_args, localflocks) || 1132 args_neq(&args, &sdp->sd_args, localflocks) ||
1134 args_neq(&args, &sdp->sd_args, localcaching) ||
1135 args_neq(&args, &sdp->sd_args, meta)) 1133 args_neq(&args, &sdp->sd_args, meta))
1136 return -EINVAL; 1134 return -EINVAL;
1137 1135
@@ -1234,16 +1232,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1234 seq_printf(s, ",hostdata=%s", args->ar_hostdata); 1232 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
1235 if (args->ar_spectator) 1233 if (args->ar_spectator)
1236 seq_printf(s, ",spectator"); 1234 seq_printf(s, ",spectator");
1237 if (args->ar_ignore_local_fs)
1238 seq_printf(s, ",ignore_local_fs");
1239 if (args->ar_localflocks) 1235 if (args->ar_localflocks)
1240 seq_printf(s, ",localflocks"); 1236 seq_printf(s, ",localflocks");
1241 if (args->ar_localcaching)
1242 seq_printf(s, ",localcaching");
1243 if (args->ar_debug) 1237 if (args->ar_debug)
1244 seq_printf(s, ",debug"); 1238 seq_printf(s, ",debug");
1245 if (args->ar_upgrade)
1246 seq_printf(s, ",upgrade");
1247 if (args->ar_posix_acl) 1239 if (args->ar_posix_acl)
1248 seq_printf(s, ",acl"); 1240 seq_printf(s, ",acl");
1249 if (args->ar_quota != GFS2_QUOTA_DEFAULT) { 1241 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
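
gfs2_jdesc_check() now reads the size once with i_size_read() and delegates the bounds/alignment test to a shared helper. A standalone sketch of what such a check amounts to, based on the open-coded test being removed here (the helper's exact kernel semantics are defined elsewhere in the patch):

#include <stdint.h>
#include <stdio.h>

/* Journal files must be between a minimum and maximum size and a whole
 * number of filesystem blocks, mirroring the open-coded test removed above. */
static int check_internal_file_size(uint64_t size, uint64_t minsize,
                                    uint64_t maxsize, uint32_t bsize)
{
        if (size < minsize || size > maxsize)
                return -1;
        if (size & (bsize - 1))         /* bsize is a power of two */
                return -1;
        return 0;
}

int main(void)
{
        printf("%d\n", check_internal_file_size(128 << 20, 8 << 20, 1 << 30, 4096)); /*  0 */
        printf("%d\n", check_internal_file_size(4 << 20,   8 << 20, 1 << 30, 4096)); /* -1 */
        return 0;
}
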
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ccacffd2faaa..748ccb557c18 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
230 230
231 if (gltype > LM_TYPE_JOURNAL) 231 if (gltype > LM_TYPE_JOURNAL)
232 return -EINVAL; 232 return -EINVAL;
233 glops = gfs2_glops_list[gltype]; 233 if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
234 glops = &gfs2_trans_glops;
235 else
236 glops = gfs2_glops_list[gltype];
234 if (glops == NULL) 237 if (glops == NULL)
235 return -EINVAL; 238 return -EINVAL;
236 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) 239 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
@@ -399,31 +402,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
399 402
400static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) 403static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
401{ 404{
402 return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); 405 return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
403} 406}
404 407
405static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) 408static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
406{ 409{
407 unsigned jid; 410 int jid;
408 int rv; 411 int rv;
409 412
410 rv = sscanf(buf, "%u", &jid); 413 rv = sscanf(buf, "%d", &jid);
411 if (rv != 1) 414 if (rv != 1)
412 return -EINVAL; 415 return -EINVAL;
413 416
414 spin_lock(&sdp->sd_jindex_spin); 417 spin_lock(&sdp->sd_jindex_spin);
415 rv = -EINVAL; 418 rv = -EINVAL;
416 if (sdp->sd_args.ar_spectator)
417 goto out;
418 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) 419 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
419 goto out; 420 goto out;
420 rv = -EBUSY; 421 rv = -EBUSY;
421 if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) 422 if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
422 goto out; 423 goto out;
424 rv = 0;
425 if (sdp->sd_args.ar_spectator && jid > 0)
426 rv = jid = -EINVAL;
423 sdp->sd_lockstruct.ls_jid = jid; 427 sdp->sd_lockstruct.ls_jid = jid;
428 clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
424 smp_mb__after_clear_bit(); 429 smp_mb__after_clear_bit();
425 wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); 430 wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
426 rv = 0;
427out: 431out:
428 spin_unlock(&sdp->sd_jindex_spin); 432 spin_unlock(&sdp->sd_jindex_spin);
429 return rv ? rv : len; 433 return rv ? rv : len;
@@ -617,7 +621,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
617 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); 621 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
618 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); 622 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
619 if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) 623 if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
620 add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); 624 add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
621 if (gfs2_uuid_valid(uuid)) 625 if (gfs2_uuid_valid(uuid))
622 add_uevent_var(env, "UUID=%pUB", uuid); 626 add_uevent_var(env, "UUID=%pUB", uuid);
623 return 0; 627 return 0;
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 148d55c14171..cedb0bb96d96 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -39,7 +39,8 @@
39 {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ 39 {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \
40 {(1UL << GLF_REPLY_PENDING), "r" }, \ 40 {(1UL << GLF_REPLY_PENDING), "r" }, \
41 {(1UL << GLF_INITIAL), "I" }, \ 41 {(1UL << GLF_INITIAL), "I" }, \
42 {(1UL << GLF_FROZEN), "F" }) 42 {(1UL << GLF_FROZEN), "F" }, \
43 {(1UL << GLF_QUEUED), "q" })
43 44
44#ifndef NUMPTY 45#ifndef NUMPTY
45#define NUMPTY 46#define NUMPTY
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index edf9d4bd908e..fb56b783e028 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -20,11 +20,20 @@ struct gfs2_glock;
20#define RES_JDATA 1 20#define RES_JDATA 1
21#define RES_DATA 1 21#define RES_DATA 1
22#define RES_LEAF 1 22#define RES_LEAF 1
23#define RES_RG_HDR 1
23#define RES_RG_BIT 2 24#define RES_RG_BIT 2
24#define RES_EATTR 1 25#define RES_EATTR 1
25#define RES_STATFS 1 26#define RES_STATFS 1
26#define RES_QUOTA 2 27#define RES_QUOTA 2
27 28
29/* reserve either the number of blocks to be allocated plus the rg header
30 * block, or all of the blocks in the rg, whichever is smaller */
31static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
32{
33 return (al->al_requested < al->al_rgd->rd_length)?
34 al->al_requested + 1 : al->al_rgd->rd_length;
35}
36
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, 37int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes); 38 unsigned int revokes);
30 39
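
gfs2_rg_blocks() caps the per-transaction reservation at either the requested blocks plus one resource-group header block or the whole resource group, whichever is smaller. A tiny standalone check of that arithmetic, mirroring the inline helper added above (stand-in structures):

#include <stdio.h>

struct rgrpd { unsigned int rd_length; };   /* blocks in the resource group */
struct alloc { unsigned int al_requested; struct rgrpd *al_rgd; };

static unsigned int rg_blocks(const struct alloc *al)
{
        return (al->al_requested < al->al_rgd->rd_length) ?
                al->al_requested + 1 : al->al_rgd->rd_length;
}

int main(void)
{
        struct rgrpd rgd   = { .rd_length = 100 };
        struct alloc small = { .al_requested = 10,  .al_rgd = &rgd };
        struct alloc big   = { .al_requested = 200, .al_rgd = &rgd };

        printf("%u\n", rg_blocks(&small)); /* 11: requested blocks + rg header */
        printf("%u\n", rg_blocks(&big));   /* 100: capped at the whole rg */
        return 0;
}
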
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 776af6eb4bcb..30b58f07c8a6 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
734 goto out_gunlock_q; 734 goto out_gunlock_q;
735 735
736 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), 736 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
737 blks + al->al_rgd->rd_length + 737 blks + gfs2_rg_blocks(al) +
738 RES_DINODE + RES_STATFS + RES_QUOTA, 0); 738 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
739 if (error) 739 if (error)
740 goto out_ipres; 740 goto out_ipres;
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 5007a41f1be9..d182438c7ae4 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
23 fd->search_key = ptr; 23 fd->search_key = ptr;
24 fd->key = ptr + tree->max_key_len + 2; 24 fd->key = ptr + tree->max_key_len + 2;
25 dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); 25 dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
26 down(&tree->tree_lock); 26 mutex_lock(&tree->tree_lock);
27 return 0; 27 return 0;
28} 28}
29 29
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
32 hfs_bnode_put(fd->bnode); 32 hfs_bnode_put(fd->bnode);
33 kfree(fd->search_key); 33 kfree(fd->search_key);
34 dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); 34 dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
35 up(&fd->tree->tree_lock); 35 mutex_unlock(&fd->tree->tree_lock);
36 fd->tree = NULL; 36 fd->tree = NULL;
37} 37}
38 38
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
52 rec = (e + b) / 2; 52 rec = (e + b) / 2;
53 len = hfs_brec_lenoff(bnode, rec, &off); 53 len = hfs_brec_lenoff(bnode, rec, &off);
54 keylen = hfs_brec_keylen(bnode, rec); 54 keylen = hfs_brec_keylen(bnode, rec);
55 if (keylen == 0) {
56 res = -EINVAL;
57 goto fail;
58 }
55 hfs_bnode_read(bnode, fd->key, off, keylen); 59 hfs_bnode_read(bnode, fd->key, off, keylen);
56 cmpval = bnode->tree->keycmp(fd->key, fd->search_key); 60 cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
57 if (!cmpval) { 61 if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
67 if (rec != e && e >= 0) { 71 if (rec != e && e >= 0) {
68 len = hfs_brec_lenoff(bnode, e, &off); 72 len = hfs_brec_lenoff(bnode, e, &off);
69 keylen = hfs_brec_keylen(bnode, e); 73 keylen = hfs_brec_keylen(bnode, e);
74 if (keylen == 0) {
75 res = -EINVAL;
76 goto fail;
77 }
70 hfs_bnode_read(bnode, fd->key, off, keylen); 78 hfs_bnode_read(bnode, fd->key, off, keylen);
71 } 79 }
72done: 80done:
@@ -75,6 +83,7 @@ done:
75 fd->keylength = keylen; 83 fd->keylength = keylen;
76 fd->entryoffset = off + keylen; 84 fd->entryoffset = off + keylen;
77 fd->entrylength = len - keylen; 85 fd->entrylength = len - keylen;
86fail:
78 return res; 87 return res;
79} 88}
80 89
@@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
198 207
199 len = hfs_brec_lenoff(bnode, fd->record, &off); 208 len = hfs_brec_lenoff(bnode, fd->record, &off);
200 keylen = hfs_brec_keylen(bnode, fd->record); 209 keylen = hfs_brec_keylen(bnode, fd->record);
210 if (keylen == 0) {
211 res = -EINVAL;
212 goto out;
213 }
201 fd->keyoffset = off; 214 fd->keyoffset = off;
202 fd->keylength = keylen; 215 fd->keylength = keylen;
203 fd->entryoffset = off + keylen; 216 fd->entryoffset = off + keylen;
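
Every caller of hfs_brec_keylen() now treats a zero key length as corruption and bails out with -EINVAL instead of reading a garbage-length key. A minimal standalone sketch of that validate-before-read pattern (hypothetical helpers, not the hfsplus API):

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define MAX_KEYLEN 32

/* Pretend on-disk record: first byte is the key length; 0 means invalid. */
static unsigned int record_keylen(const unsigned char *rec)
{
        unsigned int len = rec[0];
        return (len == 0 || len > MAX_KEYLEN) ? 0 : len;
}

static int read_key(const unsigned char *rec, unsigned char *out)
{
        unsigned int keylen = record_keylen(rec);

        if (keylen == 0)
                return -EINVAL;         /* corrupt record, refuse to read */
        memcpy(out, rec + 1, keylen);
        return (int)keylen;
}

int main(void)
{
        unsigned char good[] = { 3, 'a', 'b', 'c' }, bad[] = { 0 }, key[MAX_KEYLEN];

        printf("%d\n", read_key(good, key));  /* 3 */
        printf("%d\n", read_key(bad, key));   /* -EINVAL */
        return 0;
}
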
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ea30afc2a03c..ad57f5991eb1 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -17,6 +17,7 @@
17 17
18int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max) 18int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
19{ 19{
20 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
20 struct page *page; 21 struct page *page;
21 struct address_space *mapping; 22 struct address_space *mapping;
22 __be32 *pptr, *curr, *end; 23 __be32 *pptr, *curr, *end;
@@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
29 return size; 30 return size;
30 31
31 dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); 32 dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
32 mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); 33 mutex_lock(&sbi->alloc_mutex);
33 mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; 34 mapping = sbi->alloc_file->i_mapping;
34 page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); 35 page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
35 if (IS_ERR(page)) { 36 if (IS_ERR(page)) {
36 start = size; 37 start = size;
@@ -150,16 +151,17 @@ done:
150 set_page_dirty(page); 151 set_page_dirty(page);
151 kunmap(page); 152 kunmap(page);
152 *max = offset + (curr - pptr) * 32 + i - start; 153 *max = offset + (curr - pptr) * 32 + i - start;
153 HFSPLUS_SB(sb).free_blocks -= *max; 154 sbi->free_blocks -= *max;
154 sb->s_dirt = 1; 155 sb->s_dirt = 1;
155 dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); 156 dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
156out: 157out:
157 mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); 158 mutex_unlock(&sbi->alloc_mutex);
158 return start; 159 return start;
159} 160}
160 161
161int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) 162int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
162{ 163{
164 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
163 struct page *page; 165 struct page *page;
164 struct address_space *mapping; 166 struct address_space *mapping;
165 __be32 *pptr, *curr, *end; 167 __be32 *pptr, *curr, *end;
@@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
172 174
173 dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); 175 dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
174 /* are all of the bits in range? */ 176 /* are all of the bits in range? */
175 if ((offset + count) > HFSPLUS_SB(sb).total_blocks) 177 if ((offset + count) > sbi->total_blocks)
176 return -2; 178 return -2;
177 179
178 mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); 180 mutex_lock(&sbi->alloc_mutex);
179 mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; 181 mapping = sbi->alloc_file->i_mapping;
180 pnr = offset / PAGE_CACHE_BITS; 182 pnr = offset / PAGE_CACHE_BITS;
181 page = read_mapping_page(mapping, pnr, NULL); 183 page = read_mapping_page(mapping, pnr, NULL);
182 pptr = kmap(page); 184 pptr = kmap(page);
@@ -224,9 +226,9 @@ done:
224out: 226out:
225 set_page_dirty(page); 227 set_page_dirty(page);
226 kunmap(page); 228 kunmap(page);
227 HFSPLUS_SB(sb).free_blocks += len; 229 sbi->free_blocks += len;
228 sb->s_dirt = 1; 230 sb->s_dirt = 1;
229 mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); 231 mutex_unlock(&sbi->alloc_mutex);
230 232
231 return 0; 233 return 0;
232} 234}
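
Serialization of the allocation bitmap moves from the allocation file's i_mutex to a dedicated alloc_mutex in the per-superblock info, which also covers the free_blocks counter. A self-contained illustration of that pattern using POSIX mutexes (stand-in types; not the kernel locking API):

#include <pthread.h>
#include <stdio.h>

struct sb_info {
        pthread_mutex_t alloc_mutex;    /* one lock for bitmap + counters */
        unsigned long   bitmap;         /* toy 32-block allocation bitmap */
        unsigned int    free_blocks;
};

static int block_allocate(struct sb_info *sbi)
{
        int blk = -1;

        pthread_mutex_lock(&sbi->alloc_mutex);
        for (int i = 0; i < 32; i++) {
                if (!(sbi->bitmap & (1UL << i))) {
                        sbi->bitmap |= 1UL << i;
                        sbi->free_blocks--;     /* protected by the same lock */
                        blk = i;
                        break;
                }
        }
        pthread_mutex_unlock(&sbi->alloc_mutex);
        return blk;
}

int main(void)
{
        struct sb_info sbi = { PTHREAD_MUTEX_INITIALIZER, 0, 32 };
        printf("allocated block %d, %u free\n", block_allocate(&sbi), sbi.free_blocks);
        return 0;
}
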
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index c88e5d72a402..2f39d05443e1 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -42,10 +42,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
42 recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); 42 recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
43 if (!recoff) 43 if (!recoff)
44 return 0; 44 return 0;
45 if (node->tree->attributes & HFS_TREE_BIGKEYS) 45
46 retval = hfs_bnode_read_u16(node, recoff) + 2; 46 retval = hfs_bnode_read_u16(node, recoff) + 2;
47 else 47 if (retval > node->tree->max_key_len + 2) {
48 retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1; 48 printk(KERN_ERR "hfs: keylen %d too large\n",
49 retval);
50 retval = 0;
51 }
49 } 52 }
50 return retval; 53 return retval;
51} 54}
@@ -216,7 +219,7 @@ skip:
216static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) 219static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
217{ 220{
218 struct hfs_btree *tree; 221 struct hfs_btree *tree;
219 struct hfs_bnode *node, *new_node; 222 struct hfs_bnode *node, *new_node, *next_node;
220 struct hfs_bnode_desc node_desc; 223 struct hfs_bnode_desc node_desc;
221 int num_recs, new_rec_off, new_off, old_rec_off; 224 int num_recs, new_rec_off, new_off, old_rec_off;
222 int data_start, data_end, size; 225 int data_start, data_end, size;
@@ -235,6 +238,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
235 new_node->type = node->type; 238 new_node->type = node->type;
236 new_node->height = node->height; 239 new_node->height = node->height;
237 240
241 if (node->next)
242 next_node = hfs_bnode_find(tree, node->next);
243 else
244 next_node = NULL;
245
246 if (IS_ERR(next_node)) {
247 hfs_bnode_put(node);
248 hfs_bnode_put(new_node);
249 return next_node;
250 }
251
238 size = tree->node_size / 2 - node->num_recs * 2 - 14; 252 size = tree->node_size / 2 - node->num_recs * 2 - 14;
239 old_rec_off = tree->node_size - 4; 253 old_rec_off = tree->node_size - 4;
240 num_recs = 1; 254 num_recs = 1;
@@ -248,6 +262,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
248 /* panic? */ 262 /* panic? */
249 hfs_bnode_put(node); 263 hfs_bnode_put(node);
250 hfs_bnode_put(new_node); 264 hfs_bnode_put(new_node);
265 if (next_node)
266 hfs_bnode_put(next_node);
251 return ERR_PTR(-ENOSPC); 267 return ERR_PTR(-ENOSPC);
252 } 268 }
253 269
@@ -302,8 +318,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
302 hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); 318 hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
303 319
304 /* update next bnode header */ 320 /* update next bnode header */
305 if (new_node->next) { 321 if (next_node) {
306 struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
307 next_node->prev = new_node->this; 322 next_node->prev = new_node->this;
308 hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); 323 hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
309 node_desc.prev = cpu_to_be32(next_node->prev); 324 node_desc.prev = cpu_to_be32(next_node->prev);
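
The node-split fix looks up the sibling node, and checks that lookup for errors, before the split mutates anything, so a failure can unwind without leaving a half-linked tree. A compact standalone sketch of that acquire-then-commit ordering (hypothetical helpers):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

static struct node *node_find(int id)   /* may fail, like hfs_bnode_find() */
{
        if (id == 0)                    /* simulate a lookup failure */
                return NULL;
        struct node *n = malloc(sizeof(*n));
        if (n) { n->id = id; n->next = NULL; }
        return n;
}

static int split(struct node *n, int next_id)
{
        struct node *next = NULL;

        /* Acquire every resource the commit step will need ... */
        if (next_id != -1) {
                next = node_find(next_id);
                if (!next)
                        return -EIO;    /* nothing has been modified yet */
        }

        /* ... and only then start modifying the chain. */
        n->next = next;
        return 0;
}

int main(void)
{
        struct node n = { 1, NULL };
        printf("%d\n", split(&n, 2));   /* 0: sibling found, chain updated */
        printf("%d\n", split(&n, 0));   /* -EIO: failed early, n unchanged */
        return 0;
}
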
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index e49fcee1e293..22e4d4e32999 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
30 if (!tree) 30 if (!tree)
31 return NULL; 31 return NULL;
32 32
33 init_MUTEX(&tree->tree_lock); 33 mutex_init(&tree->tree_lock);
34 spin_lock_init(&tree->hash_lock); 34 spin_lock_init(&tree->hash_lock);
35 tree->sb = sb; 35 tree->sb = sb;
36 tree->cnid = id; 36 tree->cnid = id;
@@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
39 goto free_tree; 39 goto free_tree;
40 tree->inode = inode; 40 tree->inode = inode;
41 41
42 if (!HFSPLUS_I(tree->inode)->first_blocks) {
43 printk(KERN_ERR
44 "hfs: invalid btree extent records (0 size).\n");
45 goto free_inode;
46 }
47
42 mapping = tree->inode->i_mapping; 48 mapping = tree->inode->i_mapping;
43 page = read_mapping_page(mapping, 0, NULL); 49 page = read_mapping_page(mapping, 0, NULL);
44 if (IS_ERR(page)) 50 if (IS_ERR(page))
45 goto free_tree; 51 goto free_inode;
46 52
47 /* Load the header */ 53 /* Load the header */
48 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); 54 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -57,27 +63,56 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
57 tree->max_key_len = be16_to_cpu(head->max_key_len); 63 tree->max_key_len = be16_to_cpu(head->max_key_len);
58 tree->depth = be16_to_cpu(head->depth); 64 tree->depth = be16_to_cpu(head->depth);
59 65
60 /* Set the correct compare function */ 66 /* Verify the tree and set the correct compare function */
61 if (id == HFSPLUS_EXT_CNID) { 67 switch (id) {
68 case HFSPLUS_EXT_CNID:
69 if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
70 printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
71 tree->max_key_len);
72 goto fail_page;
73 }
74 if (tree->attributes & HFS_TREE_VARIDXKEYS) {
75 printk(KERN_ERR "hfs: invalid extent btree flag\n");
76 goto fail_page;
77 }
78
62 tree->keycmp = hfsplus_ext_cmp_key; 79 tree->keycmp = hfsplus_ext_cmp_key;
63 } else if (id == HFSPLUS_CAT_CNID) { 80 break;
64 if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) && 81 case HFSPLUS_CAT_CNID:
82 if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
83 printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
84 tree->max_key_len);
85 goto fail_page;
86 }
87 if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
88 printk(KERN_ERR "hfs: invalid catalog btree flag\n");
89 goto fail_page;
90 }
91
92 if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
65 (head->key_type == HFSPLUS_KEY_BINARY)) 93 (head->key_type == HFSPLUS_KEY_BINARY))
66 tree->keycmp = hfsplus_cat_bin_cmp_key; 94 tree->keycmp = hfsplus_cat_bin_cmp_key;
67 else { 95 else {
68 tree->keycmp = hfsplus_cat_case_cmp_key; 96 tree->keycmp = hfsplus_cat_case_cmp_key;
69 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD; 97 set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
70 } 98 }
71 } else { 99 break;
100 default:
72 printk(KERN_ERR "hfs: unknown B*Tree requested\n"); 101 printk(KERN_ERR "hfs: unknown B*Tree requested\n");
73 goto fail_page; 102 goto fail_page;
74 } 103 }
75 104
105 if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
106 printk(KERN_ERR "hfs: invalid btree flag\n");
107 goto fail_page;
108 }
109
76 size = tree->node_size; 110 size = tree->node_size;
77 if (!is_power_of_2(size)) 111 if (!is_power_of_2(size))
78 goto fail_page; 112 goto fail_page;
79 if (!tree->node_count) 113 if (!tree->node_count)
80 goto fail_page; 114 goto fail_page;
115
81 tree->node_size_shift = ffs(size) - 1; 116 tree->node_size_shift = ffs(size) - 1;
82 117
83 tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 118 tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -87,10 +122,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
87 return tree; 122 return tree;
88 123
89 fail_page: 124 fail_page:
90 tree->inode->i_mapping->a_ops = &hfsplus_aops;
91 page_cache_release(page); 125 page_cache_release(page);
92 free_tree: 126 free_inode:
127 tree->inode->i_mapping->a_ops = &hfsplus_aops;
93 iput(tree->inode); 128 iput(tree->inode);
129 free_tree:
94 kfree(tree); 130 kfree(tree);
95 return NULL; 131 return NULL;
96} 132}
@@ -192,17 +228,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
192 228
193 while (!tree->free_nodes) { 229 while (!tree->free_nodes) {
194 struct inode *inode = tree->inode; 230 struct inode *inode = tree->inode;
231 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
195 u32 count; 232 u32 count;
196 int res; 233 int res;
197 234
198 res = hfsplus_file_extend(inode); 235 res = hfsplus_file_extend(inode);
199 if (res) 236 if (res)
200 return ERR_PTR(res); 237 return ERR_PTR(res);
201 HFSPLUS_I(inode).phys_size = inode->i_size = 238 hip->phys_size = inode->i_size =
202 (loff_t)HFSPLUS_I(inode).alloc_blocks << 239 (loff_t)hip->alloc_blocks <<
203 HFSPLUS_SB(tree->sb).alloc_blksz_shift; 240 HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
204 HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks << 241 hip->fs_blocks =
205 HFSPLUS_SB(tree->sb).fs_shift; 242 hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
206 inode_set_bytes(inode, inode->i_size); 243 inode_set_bytes(inode, inode->i_size);
207 count = inode->i_size >> tree->node_size_shift; 244 count = inode->i_size >> tree->node_size_shift;
208 tree->free_nodes = count - tree->node_count; 245 tree->free_nodes = count - tree->node_count;
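
hfs_btree_open() now refuses to use a header whose key-length limit or flags do not match the tree type, instead of trusting on-disk metadata. A standalone sketch of that kind of up-front header validation (placeholder limits and simplified types, not the real on-disk format):

#include <stdbool.h>
#include <stdio.h>

enum tree_type { TREE_EXTENT, TREE_CATALOG };

struct btree_header {
        unsigned int max_key_len;
        bool         variable_index_keys;
        bool         big_keys;
};

/* Reject headers whose limits/flags contradict the tree type, mirroring
 * the checks added to hfs_btree_open(); the limits here are placeholders. */
static bool header_valid(enum tree_type type, const struct btree_header *h)
{
        if (!h->big_keys)
                return false;
        switch (type) {
        case TREE_EXTENT:
                return h->max_key_len == 16 && !h->variable_index_keys;
        case TREE_CATALOG:
                return h->max_key_len == 512 && h->variable_index_keys;
        }
        return false;
}

int main(void)
{
        struct btree_header cat = { 512, true, true };
        struct btree_header bad = { 512, false, true };

        printf("%d %d\n", header_valid(TREE_CATALOG, &cat),   /* 1 */
                          header_valid(TREE_CATALOG, &bad));  /* 0 */
        return 0;
}
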
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index f6874acb2cf2..8af45fc5b051 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
67 key->key_len = cpu_to_be16(6 + ustrlen); 67 key->key_len = cpu_to_be16(6 + ustrlen);
68} 68}
69 69
70static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) 70void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
71{ 71{
72 if (inode->i_flags & S_IMMUTABLE) 72 if (inode->i_flags & S_IMMUTABLE)
73 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; 73 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -77,15 +77,24 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
77 perms->rootflags |= HFSPLUS_FLG_APPEND; 77 perms->rootflags |= HFSPLUS_FLG_APPEND;
78 else 78 else
79 perms->rootflags &= ~HFSPLUS_FLG_APPEND; 79 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
80 HFSPLUS_I(inode).rootflags = perms->rootflags; 80
81 HFSPLUS_I(inode).userflags = perms->userflags; 81 perms->userflags = HFSPLUS_I(inode)->userflags;
82 perms->mode = cpu_to_be16(inode->i_mode); 82 perms->mode = cpu_to_be16(inode->i_mode);
83 perms->owner = cpu_to_be32(inode->i_uid); 83 perms->owner = cpu_to_be32(inode->i_uid);
84 perms->group = cpu_to_be32(inode->i_gid); 84 perms->group = cpu_to_be32(inode->i_gid);
85
86 if (S_ISREG(inode->i_mode))
87 perms->dev = cpu_to_be32(inode->i_nlink);
88 else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
89 perms->dev = cpu_to_be32(inode->i_rdev);
90 else
91 perms->dev = 0;
85} 92}
86 93
87static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode) 94static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
88{ 95{
96 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
97
89 if (S_ISDIR(inode->i_mode)) { 98 if (S_ISDIR(inode->i_mode)) {
90 struct hfsplus_cat_folder *folder; 99 struct hfsplus_cat_folder *folder;
91 100
@@ -93,13 +102,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
93 memset(folder, 0, sizeof(*folder)); 102 memset(folder, 0, sizeof(*folder));
94 folder->type = cpu_to_be16(HFSPLUS_FOLDER); 103 folder->type = cpu_to_be16(HFSPLUS_FOLDER);
95 folder->id = cpu_to_be32(inode->i_ino); 104 folder->id = cpu_to_be32(inode->i_ino);
96 HFSPLUS_I(inode).create_date = 105 HFSPLUS_I(inode)->create_date =
97 folder->create_date = 106 folder->create_date =
98 folder->content_mod_date = 107 folder->content_mod_date =
99 folder->attribute_mod_date = 108 folder->attribute_mod_date =
100 folder->access_date = hfsp_now2mt(); 109 folder->access_date = hfsp_now2mt();
101 hfsplus_set_perms(inode, &folder->permissions); 110 hfsplus_cat_set_perms(inode, &folder->permissions);
102 if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir) 111 if (inode == sbi->hidden_dir)
103 /* invisible and namelocked */ 112 /* invisible and namelocked */
104 folder->user_info.frFlags = cpu_to_be16(0x5000); 113 folder->user_info.frFlags = cpu_to_be16(0x5000);
105 return sizeof(*folder); 114 return sizeof(*folder);
@@ -111,19 +120,19 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
111 file->type = cpu_to_be16(HFSPLUS_FILE); 120 file->type = cpu_to_be16(HFSPLUS_FILE);
112 file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS); 121 file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
113 file->id = cpu_to_be32(cnid); 122 file->id = cpu_to_be32(cnid);
114 HFSPLUS_I(inode).create_date = 123 HFSPLUS_I(inode)->create_date =
115 file->create_date = 124 file->create_date =
116 file->content_mod_date = 125 file->content_mod_date =
117 file->attribute_mod_date = 126 file->attribute_mod_date =
118 file->access_date = hfsp_now2mt(); 127 file->access_date = hfsp_now2mt();
119 if (cnid == inode->i_ino) { 128 if (cnid == inode->i_ino) {
120 hfsplus_set_perms(inode, &file->permissions); 129 hfsplus_cat_set_perms(inode, &file->permissions);
121 if (S_ISLNK(inode->i_mode)) { 130 if (S_ISLNK(inode->i_mode)) {
122 file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE); 131 file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
123 file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR); 132 file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
124 } else { 133 } else {
125 file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type); 134 file->user_info.fdType = cpu_to_be32(sbi->type);
126 file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator); 135 file->user_info.fdCreator = cpu_to_be32(sbi->creator);
127 } 136 }
128 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) 137 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
129 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 138 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
@@ -131,8 +140,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
131 file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE); 140 file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
132 file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR); 141 file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
133 file->user_info.fdFlags = cpu_to_be16(0x100); 142 file->user_info.fdFlags = cpu_to_be16(0x100);
134 file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date; 143 file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
135 file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev); 144 file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
136 } 145 }
137 return sizeof(*file); 146 return sizeof(*file);
138 } 147 }
@@ -180,15 +189,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
180 189
181int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) 190int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
182{ 191{
192 struct super_block *sb = dir->i_sb;
183 struct hfs_find_data fd; 193 struct hfs_find_data fd;
184 struct super_block *sb;
185 hfsplus_cat_entry entry; 194 hfsplus_cat_entry entry;
186 int entry_size; 195 int entry_size;
187 int err; 196 int err;
188 197
189 dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); 198 dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
190 sb = dir->i_sb; 199 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
191 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
192 200
193 hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); 201 hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
194 entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? 202 entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
@@ -234,7 +242,7 @@ err2:
234 242
235int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) 243int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
236{ 244{
237 struct super_block *sb; 245 struct super_block *sb = dir->i_sb;
238 struct hfs_find_data fd; 246 struct hfs_find_data fd;
239 struct hfsplus_fork_raw fork; 247 struct hfsplus_fork_raw fork;
240 struct list_head *pos; 248 struct list_head *pos;
@@ -242,8 +250,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
242 u16 type; 250 u16 type;
243 251
244 dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); 252 dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
245 sb = dir->i_sb; 253 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
246 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
247 254
248 if (!str) { 255 if (!str) {
249 int len; 256 int len;
@@ -279,7 +286,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
279 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); 286 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
280 } 287 }
281 288
282 list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) { 289 list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
283 struct hfsplus_readdir_data *rd = 290 struct hfsplus_readdir_data *rd =
284 list_entry(pos, struct hfsplus_readdir_data, list); 291 list_entry(pos, struct hfsplus_readdir_data, list);
285 if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) 292 if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
@@ -312,7 +319,7 @@ int hfsplus_rename_cat(u32 cnid,
312 struct inode *src_dir, struct qstr *src_name, 319 struct inode *src_dir, struct qstr *src_name,
313 struct inode *dst_dir, struct qstr *dst_name) 320 struct inode *dst_dir, struct qstr *dst_name)
314{ 321{
315 struct super_block *sb; 322 struct super_block *sb = src_dir->i_sb;
316 struct hfs_find_data src_fd, dst_fd; 323 struct hfs_find_data src_fd, dst_fd;
317 hfsplus_cat_entry entry; 324 hfsplus_cat_entry entry;
318 int entry_size, type; 325 int entry_size, type;
@@ -320,8 +327,7 @@ int hfsplus_rename_cat(u32 cnid,
320 327
321 dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, 328 dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
322 dst_dir->i_ino, dst_name->name); 329 dst_dir->i_ino, dst_name->name);
323 sb = src_dir->i_sb; 330 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
324 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd);
325 dst_fd = src_fd; 331 dst_fd = src_fd;
326 332
327 /* find the old dir entry and read the data */ 333 /* find the old dir entry and read the data */
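
Most of the hfsplus churn in this series follows one mechanical change: HFSPLUS_SB() and HFSPLUS_I() now hand back pointers to the private info rather than the embedded structure, so field accesses switch from '.' to '->' and callers can cache the pointer in a local. A minimal standalone sketch of that accessor style (mock types; the kernel versions derive the pointers from the superblock and inode private data):

#include <stdio.h>

struct sb_info  { unsigned int free_blocks; };
struct superblk { void *fs_private; };          /* stands in for s_fs_info */

/* Accessor returns a pointer, so callers write sbi->field (not sbi.field)
 * and fetch it once instead of re-deriving it at every use. */
static inline struct sb_info *SB_INFO(struct superblk *sb)
{
        return sb->fs_private;
}

int main(void)
{
        struct sb_info info = { .free_blocks = 42 };
        struct superblk sb  = { .fs_private = &info };

        struct sb_info *sbi = SB_INFO(&sb);     /* fetch once, reuse */
        printf("free blocks: %u\n", sbi->free_blocks);
        return 0;
}
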
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 764fd1bdca88..d236d85ec9d7 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
39 39
40 dentry->d_op = &hfsplus_dentry_operations; 40 dentry->d_op = &hfsplus_dentry_operations;
41 dentry->d_fsdata = NULL; 41 dentry->d_fsdata = NULL;
42 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 42 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); 43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
44again: 44again:
45 err = hfs_brec_read(&fd, &entry, sizeof(entry)); 45 err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -68,9 +68,9 @@ again:
68 cnid = be32_to_cpu(entry.file.id); 68 cnid = be32_to_cpu(entry.file.id);
69 if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) && 69 if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
70 entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && 70 entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
71 (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date || 71 (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
72 entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) && 72 entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
73 HFSPLUS_SB(sb).hidden_dir) { 73 HFSPLUS_SB(sb)->hidden_dir) {
74 struct qstr str; 74 struct qstr str;
75 char name[32]; 75 char name[32];
76 76
@@ -86,7 +86,8 @@ again:
86 linkid = be32_to_cpu(entry.file.permissions.dev); 86 linkid = be32_to_cpu(entry.file.permissions.dev);
87 str.len = sprintf(name, "iNode%d", linkid); 87 str.len = sprintf(name, "iNode%d", linkid);
88 str.name = name; 88 str.name = name;
89 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str); 89 hfsplus_cat_build_key(sb, fd.search_key,
90 HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
90 goto again; 91 goto again;
91 } 92 }
92 } else if (!dentry->d_fsdata) 93 } else if (!dentry->d_fsdata)
@@ -101,7 +102,7 @@ again:
101 if (IS_ERR(inode)) 102 if (IS_ERR(inode))
102 return ERR_CAST(inode); 103 return ERR_CAST(inode);
103 if (S_ISREG(inode->i_mode)) 104 if (S_ISREG(inode->i_mode))
104 HFSPLUS_I(inode).dev = linkid; 105 HFSPLUS_I(inode)->linkid = linkid;
105out: 106out:
106 d_add(dentry, inode); 107 d_add(dentry, inode);
107 return NULL; 108 return NULL;
@@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
124 if (filp->f_pos >= inode->i_size) 125 if (filp->f_pos >= inode->i_size)
125 return 0; 126 return 0;
126 127
127 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 128 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
128 hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); 129 hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
129 err = hfs_brec_find(&fd); 130 err = hfs_brec_find(&fd);
130 if (err) 131 if (err)
@@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
180 err = -EIO; 181 err = -EIO;
181 goto out; 182 goto out;
182 } 183 }
183 if (HFSPLUS_SB(sb).hidden_dir && 184 if (HFSPLUS_SB(sb)->hidden_dir &&
184 HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id)) 185 HFSPLUS_SB(sb)->hidden_dir->i_ino ==
186 be32_to_cpu(entry.folder.id))
185 goto next; 187 goto next;
186 if (filldir(dirent, strbuf, len, filp->f_pos, 188 if (filldir(dirent, strbuf, len, filp->f_pos,
187 be32_to_cpu(entry.folder.id), DT_DIR)) 189 be32_to_cpu(entry.folder.id), DT_DIR))
@@ -217,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
217 } 219 }
218 filp->private_data = rd; 220 filp->private_data = rd;
219 rd->file = filp; 221 rd->file = filp;
220 list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list); 222 list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
221 } 223 }
222 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); 224 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
223out: 225out:
@@ -229,38 +231,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
229{ 231{
230 struct hfsplus_readdir_data *rd = file->private_data; 232 struct hfsplus_readdir_data *rd = file->private_data;
231 if (rd) { 233 if (rd) {
234 mutex_lock(&inode->i_mutex);
232 list_del(&rd->list); 235 list_del(&rd->list);
236 mutex_unlock(&inode->i_mutex);
233 kfree(rd); 237 kfree(rd);
234 } 238 }
235 return 0; 239 return 0;
236} 240}
237 241
238static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
239 struct nameidata *nd)
240{
241 struct inode *inode;
242 int res;
243
244 inode = hfsplus_new_inode(dir->i_sb, mode);
245 if (!inode)
246 return -ENOSPC;
247
248 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
249 if (res) {
250 inode->i_nlink = 0;
251 hfsplus_delete_inode(inode);
252 iput(inode);
253 return res;
254 }
255 hfsplus_instantiate(dentry, inode, inode->i_ino);
256 mark_inode_dirty(inode);
257 return 0;
258}
259
260static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, 242static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
261 struct dentry *dst_dentry) 243 struct dentry *dst_dentry)
262{ 244{
263 struct super_block *sb = dst_dir->i_sb; 245 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
264 struct inode *inode = src_dentry->d_inode; 246 struct inode *inode = src_dentry->d_inode;
265 struct inode *src_dir = src_dentry->d_parent->d_inode; 247 struct inode *src_dir = src_dentry->d_parent->d_inode;
266 struct qstr str; 248 struct qstr str;
@@ -270,7 +252,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
270 252
271 if (HFSPLUS_IS_RSRC(inode)) 253 if (HFSPLUS_IS_RSRC(inode))
272 return -EPERM; 254 return -EPERM;
255 if (!S_ISREG(inode->i_mode))
256 return -EPERM;
273 257
258 mutex_lock(&sbi->vh_mutex);
274 if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) { 259 if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
275 for (;;) { 260 for (;;) {
276 get_random_bytes(&id, sizeof(cnid)); 261 get_random_bytes(&id, sizeof(cnid));
@@ -279,40 +264,41 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
279 str.len = sprintf(name, "iNode%d", id); 264 str.len = sprintf(name, "iNode%d", id);
280 res = hfsplus_rename_cat(inode->i_ino, 265 res = hfsplus_rename_cat(inode->i_ino,
281 src_dir, &src_dentry->d_name, 266 src_dir, &src_dentry->d_name,
282 HFSPLUS_SB(sb).hidden_dir, &str); 267 sbi->hidden_dir, &str);
283 if (!res) 268 if (!res)
284 break; 269 break;
285 if (res != -EEXIST) 270 if (res != -EEXIST)
286 return res; 271 goto out;
287 } 272 }
288 HFSPLUS_I(inode).dev = id; 273 HFSPLUS_I(inode)->linkid = id;
289 cnid = HFSPLUS_SB(sb).next_cnid++; 274 cnid = sbi->next_cnid++;
290 src_dentry->d_fsdata = (void *)(unsigned long)cnid; 275 src_dentry->d_fsdata = (void *)(unsigned long)cnid;
291 res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode); 276 res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
292 if (res) 277 if (res)
293 /* panic? */ 278 /* panic? */
294 return res; 279 goto out;
295 HFSPLUS_SB(sb).file_count++; 280 sbi->file_count++;
296 } 281 }
297 cnid = HFSPLUS_SB(sb).next_cnid++; 282 cnid = sbi->next_cnid++;
298 res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode); 283 res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
299 if (res) 284 if (res)
300 return res; 285 goto out;
301 286
302 inc_nlink(inode); 287 inc_nlink(inode);
303 hfsplus_instantiate(dst_dentry, inode, cnid); 288 hfsplus_instantiate(dst_dentry, inode, cnid);
304 atomic_inc(&inode->i_count); 289 atomic_inc(&inode->i_count);
305 inode->i_ctime = CURRENT_TIME_SEC; 290 inode->i_ctime = CURRENT_TIME_SEC;
306 mark_inode_dirty(inode); 291 mark_inode_dirty(inode);
307 HFSPLUS_SB(sb).file_count++; 292 sbi->file_count++;
308 sb->s_dirt = 1; 293 dst_dir->i_sb->s_dirt = 1;
309 294out:
310 return 0; 295 mutex_unlock(&sbi->vh_mutex);
296 return res;
311} 297}
312 298
313static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) 299static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
314{ 300{
315 struct super_block *sb = dir->i_sb; 301 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
316 struct inode *inode = dentry->d_inode; 302 struct inode *inode = dentry->d_inode;
317 struct qstr str; 303 struct qstr str;
318 char name[32]; 304 char name[32];
@@ -322,21 +308,22 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
322 if (HFSPLUS_IS_RSRC(inode)) 308 if (HFSPLUS_IS_RSRC(inode))
323 return -EPERM; 309 return -EPERM;
324 310
311 mutex_lock(&sbi->vh_mutex);
325 cnid = (u32)(unsigned long)dentry->d_fsdata; 312 cnid = (u32)(unsigned long)dentry->d_fsdata;
326 if (inode->i_ino == cnid && 313 if (inode->i_ino == cnid &&
327 atomic_read(&HFSPLUS_I(inode).opencnt)) { 314 atomic_read(&HFSPLUS_I(inode)->opencnt)) {
328 str.name = name; 315 str.name = name;
329 str.len = sprintf(name, "temp%lu", inode->i_ino); 316 str.len = sprintf(name, "temp%lu", inode->i_ino);
330 res = hfsplus_rename_cat(inode->i_ino, 317 res = hfsplus_rename_cat(inode->i_ino,
331 dir, &dentry->d_name, 318 dir, &dentry->d_name,
332 HFSPLUS_SB(sb).hidden_dir, &str); 319 sbi->hidden_dir, &str);
333 if (!res) 320 if (!res)
334 inode->i_flags |= S_DEAD; 321 inode->i_flags |= S_DEAD;
335 return res; 322 goto out;
336 } 323 }
337 res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); 324 res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
338 if (res) 325 if (res)
339 return res; 326 goto out;
340 327
341 if (inode->i_nlink > 0) 328 if (inode->i_nlink > 0)
342 drop_nlink(inode); 329 drop_nlink(inode);
@@ -344,10 +331,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
344 clear_nlink(inode); 331 clear_nlink(inode);
345 if (!inode->i_nlink) { 332 if (!inode->i_nlink) {
346 if (inode->i_ino != cnid) { 333 if (inode->i_ino != cnid) {
347 HFSPLUS_SB(sb).file_count--; 334 sbi->file_count--;
348 if (!atomic_read(&HFSPLUS_I(inode).opencnt)) { 335 if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
349 res = hfsplus_delete_cat(inode->i_ino, 336 res = hfsplus_delete_cat(inode->i_ino,
350 HFSPLUS_SB(sb).hidden_dir, 337 sbi->hidden_dir,
351 NULL); 338 NULL);
352 if (!res) 339 if (!res)
353 hfsplus_delete_inode(inode); 340 hfsplus_delete_inode(inode);
@@ -356,107 +343,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
356 } else 343 } else
357 hfsplus_delete_inode(inode); 344 hfsplus_delete_inode(inode);
358 } else 345 } else
359 HFSPLUS_SB(sb).file_count--; 346 sbi->file_count--;
360 inode->i_ctime = CURRENT_TIME_SEC; 347 inode->i_ctime = CURRENT_TIME_SEC;
361 mark_inode_dirty(inode); 348 mark_inode_dirty(inode);
362 349out:
350 mutex_unlock(&sbi->vh_mutex);
363 return res; 351 return res;
364} 352}
365 353
366static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
367{
368 struct inode *inode;
369 int res;
370
371 inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode);
372 if (!inode)
373 return -ENOSPC;
374
375 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
376 if (res) {
377 inode->i_nlink = 0;
378 hfsplus_delete_inode(inode);
379 iput(inode);
380 return res;
381 }
382 hfsplus_instantiate(dentry, inode, inode->i_ino);
383 mark_inode_dirty(inode);
384 return 0;
385}
386
387static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) 354static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
388{ 355{
389 struct inode *inode; 356 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
357 struct inode *inode = dentry->d_inode;
390 int res; 358 int res;
391 359
392 inode = dentry->d_inode;
393 if (inode->i_size != 2) 360 if (inode->i_size != 2)
394 return -ENOTEMPTY; 361 return -ENOTEMPTY;
362
363 mutex_lock(&sbi->vh_mutex);
395 res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); 364 res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
396 if (res) 365 if (res)
397 return res; 366 goto out;
398 clear_nlink(inode); 367 clear_nlink(inode);
399 inode->i_ctime = CURRENT_TIME_SEC; 368 inode->i_ctime = CURRENT_TIME_SEC;
400 hfsplus_delete_inode(inode); 369 hfsplus_delete_inode(inode);
401 mark_inode_dirty(inode); 370 mark_inode_dirty(inode);
402 return 0; 371out:
372 mutex_unlock(&sbi->vh_mutex);
373 return res;
403} 374}
404 375
405static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, 376static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
406 const char *symname) 377 const char *symname)
407{ 378{
408 struct super_block *sb; 379 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
409 struct inode *inode; 380 struct inode *inode;
410 int res; 381 int res = -ENOSPC;
411 382
412 sb = dir->i_sb; 383 mutex_lock(&sbi->vh_mutex);
413 inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO); 384 inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
414 if (!inode) 385 if (!inode)
415 return -ENOSPC; 386 goto out;
416 387
417 res = page_symlink(inode, symname, strlen(symname) + 1); 388 res = page_symlink(inode, symname, strlen(symname) + 1);
418 if (res) { 389 if (res)
419 inode->i_nlink = 0; 390 goto out_err;
420 hfsplus_delete_inode(inode);
421 iput(inode);
422 return res;
423 }
424 391
425 mark_inode_dirty(inode);
426 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); 392 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
393 if (res)
394 goto out_err;
427 395
428 if (!res) { 396 hfsplus_instantiate(dentry, inode, inode->i_ino);
429 hfsplus_instantiate(dentry, inode, inode->i_ino); 397 mark_inode_dirty(inode);
430 mark_inode_dirty(inode); 398 goto out;
431 }
432 399
400out_err:
401 inode->i_nlink = 0;
402 hfsplus_delete_inode(inode);
403 iput(inode);
404out:
405 mutex_unlock(&sbi->vh_mutex);
433 return res; 406 return res;
434} 407}
435 408
436static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, 409static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
437 int mode, dev_t rdev) 410 int mode, dev_t rdev)
438{ 411{
439 struct super_block *sb; 412 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
440 struct inode *inode; 413 struct inode *inode;
441 int res; 414 int res = -ENOSPC;
442 415
443 sb = dir->i_sb; 416 mutex_lock(&sbi->vh_mutex);
444 inode = hfsplus_new_inode(sb, mode); 417 inode = hfsplus_new_inode(dir->i_sb, mode);
445 if (!inode) 418 if (!inode)
446 return -ENOSPC; 419 goto out;
420
421 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
422 init_special_inode(inode, mode, rdev);
447 423
448 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); 424 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
449 if (res) { 425 if (res) {
450 inode->i_nlink = 0; 426 inode->i_nlink = 0;
451 hfsplus_delete_inode(inode); 427 hfsplus_delete_inode(inode);
452 iput(inode); 428 iput(inode);
453 return res; 429 goto out;
454 } 430 }
455 init_special_inode(inode, mode, rdev); 431
456 hfsplus_instantiate(dentry, inode, inode->i_ino); 432 hfsplus_instantiate(dentry, inode, inode->i_ino);
457 mark_inode_dirty(inode); 433 mark_inode_dirty(inode);
434out:
435 mutex_unlock(&sbi->vh_mutex);
436 return res;
437}
458 438
459 return 0; 439static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
440 struct nameidata *nd)
441{
442 return hfsplus_mknod(dir, dentry, mode, 0);
443}
444
445static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
446{
447 return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
460} 448}
461 449
462static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, 450static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -466,7 +454,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
466 454
467 /* Unlink destination if it already exists */ 455 /* Unlink destination if it already exists */
468 if (new_dentry->d_inode) { 456 if (new_dentry->d_inode) {
469 res = hfsplus_unlink(new_dir, new_dentry); 457 if (S_ISDIR(new_dentry->d_inode->i_mode))
458 res = hfsplus_rmdir(new_dir, new_dentry);
459 else
460 res = hfsplus_unlink(new_dir, new_dentry);
470 if (res) 461 if (res)
471 return res; 462 return res;
472 } 463 }
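
hfsplus_create() and hfsplus_mkdir() collapse into one-line wrappers around hfsplus_mknod(), which now owns the locking and error unwinding for the whole create path. A standalone sketch of that consolidation (hypothetical names, no real VFS types):

#include <stdio.h>

#define MODE_DIR 0040000
#define MODE_REG 0100000

/* One worker owns the create path: allocate, register, clean up on failure. */
static int do_mknod(const char *name, unsigned int mode, unsigned int dev)
{
        printf("mknod %s mode=%o dev=%u\n", name, mode, dev);
        /* ... allocate inode, create catalog entry, unwind on error ... */
        return 0;
}

/* The former separate implementations become trivial wrappers. */
static int do_create(const char *name, unsigned int mode)
{
        return do_mknod(name, mode | MODE_REG, 0);
}

static int do_mkdir(const char *name, unsigned int mode)
{
        return do_mknod(name, mode | MODE_DIR, 0);
}

int main(void)
{
        do_create("file", 0644);
        do_mkdir("dir", 0755);
        return 0;
}
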
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0022eec63cda..0c9cb1820a52 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -85,35 +85,49 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
85 85
86static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) 86static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
87{ 87{
88 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
88 int res; 89 int res;
89 90
90 hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start, 91 WARN_ON(!mutex_is_locked(&hip->extents_lock));
91 HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); 92
93 hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
94 HFSPLUS_IS_RSRC(inode) ?
95 HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
96
92 res = hfs_brec_find(fd); 97 res = hfs_brec_find(fd);
93 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) { 98 if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
94 if (res != -ENOENT) 99 if (res != -ENOENT)
95 return; 100 return;
96 hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec)); 101 hfs_brec_insert(fd, hip->cached_extents,
97 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 102 sizeof(hfsplus_extent_rec));
103 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
98 } else { 104 } else {
99 if (res) 105 if (res)
100 return; 106 return;
101 hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength); 107 hfs_bnode_write(fd->bnode, hip->cached_extents,
102 HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY; 108 fd->entryoffset, fd->entrylength);
109 hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
103 } 110 }
104} 111}
105 112
106void hfsplus_ext_write_extent(struct inode *inode) 113static void hfsplus_ext_write_extent_locked(struct inode *inode)
107{ 114{
108 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) { 115 if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
109 struct hfs_find_data fd; 116 struct hfs_find_data fd;
110 117
111 hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); 118 hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
112 __hfsplus_ext_write_extent(inode, &fd); 119 __hfsplus_ext_write_extent(inode, &fd);
113 hfs_find_exit(&fd); 120 hfs_find_exit(&fd);
114 } 121 }
115} 122}
116 123
124void hfsplus_ext_write_extent(struct inode *inode)
125{
126 mutex_lock(&HFSPLUS_I(inode)->extents_lock);
127 hfsplus_ext_write_extent_locked(inode);
128 mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
129}
130
117static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, 131static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
118 struct hfsplus_extent *extent, 132 struct hfsplus_extent *extent,
119 u32 cnid, u32 block, u8 type) 133 u32 cnid, u32 block, u8 type)
@@ -136,33 +150,39 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
136 150
137static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) 151static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
138{ 152{
153 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
139 int res; 154 int res;
140 155
141 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) 156 WARN_ON(!mutex_is_locked(&hip->extents_lock));
157
158 if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
142 __hfsplus_ext_write_extent(inode, fd); 159 __hfsplus_ext_write_extent(inode, fd);
143 160
144 res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino, 161 res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
145 block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); 162 block, HFSPLUS_IS_RSRC(inode) ?
163 HFSPLUS_TYPE_RSRC :
164 HFSPLUS_TYPE_DATA);
146 if (!res) { 165 if (!res) {
147 HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block); 166 hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
148 HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents); 167 hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
149 } else { 168 } else {
150 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; 169 hip->cached_start = hip->cached_blocks = 0;
151 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 170 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
152 } 171 }
153 return res; 172 return res;
154} 173}
155 174
156static int hfsplus_ext_read_extent(struct inode *inode, u32 block) 175static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
157{ 176{
177 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
158 struct hfs_find_data fd; 178 struct hfs_find_data fd;
159 int res; 179 int res;
160 180
161 if (block >= HFSPLUS_I(inode).cached_start && 181 if (block >= hip->cached_start &&
162 block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks) 182 block < hip->cached_start + hip->cached_blocks)
163 return 0; 183 return 0;
164 184
165 hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); 185 hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
166 res = __hfsplus_ext_cache_extent(&fd, inode, block); 186 res = __hfsplus_ext_cache_extent(&fd, inode, block);
167 hfs_find_exit(&fd); 187 hfs_find_exit(&fd);
168 return res; 188 return res;
@@ -172,21 +192,21 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
172int hfsplus_get_block(struct inode *inode, sector_t iblock, 192int hfsplus_get_block(struct inode *inode, sector_t iblock,
173 struct buffer_head *bh_result, int create) 193 struct buffer_head *bh_result, int create)
174{ 194{
175 struct super_block *sb; 195 struct super_block *sb = inode->i_sb;
196 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
197 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
176 int res = -EIO; 198 int res = -EIO;
177 u32 ablock, dblock, mask; 199 u32 ablock, dblock, mask;
178 int shift; 200 int shift;
179 201
180 sb = inode->i_sb;
181
182 /* Convert inode block to disk allocation block */ 202 /* Convert inode block to disk allocation block */
183 shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits; 203 shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
184 ablock = iblock >> HFSPLUS_SB(sb).fs_shift; 204 ablock = iblock >> sbi->fs_shift;
185 205
186 if (iblock >= HFSPLUS_I(inode).fs_blocks) { 206 if (iblock >= hip->fs_blocks) {
187 if (iblock > HFSPLUS_I(inode).fs_blocks || !create) 207 if (iblock > hip->fs_blocks || !create)
188 return -EIO; 208 return -EIO;
189 if (ablock >= HFSPLUS_I(inode).alloc_blocks) { 209 if (ablock >= hip->alloc_blocks) {
190 res = hfsplus_file_extend(inode); 210 res = hfsplus_file_extend(inode);
191 if (res) 211 if (res)
192 return res; 212 return res;
@@ -194,33 +214,33 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
194 } else 214 } else
195 create = 0; 215 create = 0;
196 216
197 if (ablock < HFSPLUS_I(inode).first_blocks) { 217 if (ablock < hip->first_blocks) {
198 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock); 218 dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
199 goto done; 219 goto done;
200 } 220 }
201 221
202 if (inode->i_ino == HFSPLUS_EXT_CNID) 222 if (inode->i_ino == HFSPLUS_EXT_CNID)
203 return -EIO; 223 return -EIO;
204 224
205 mutex_lock(&HFSPLUS_I(inode).extents_lock); 225 mutex_lock(&hip->extents_lock);
206 res = hfsplus_ext_read_extent(inode, ablock); 226 res = hfsplus_ext_read_extent(inode, ablock);
207 if (!res) { 227 if (!res) {
208 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - 228 dblock = hfsplus_ext_find_block(hip->cached_extents,
209 HFSPLUS_I(inode).cached_start); 229 ablock - hip->cached_start);
210 } else { 230 } else {
211 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 231 mutex_unlock(&hip->extents_lock);
212 return -EIO; 232 return -EIO;
213 } 233 }
214 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 234 mutex_unlock(&hip->extents_lock);
215 235
216done: 236done:
217 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); 237 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
218 mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1; 238 mask = (1 << sbi->fs_shift) - 1;
219 map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask)); 239 map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
220 if (create) { 240 if (create) {
221 set_buffer_new(bh_result); 241 set_buffer_new(bh_result);
222 HFSPLUS_I(inode).phys_size += sb->s_blocksize; 242 hip->phys_size += sb->s_blocksize;
223 HFSPLUS_I(inode).fs_blocks++; 243 hip->fs_blocks++;
224 inode_add_bytes(inode, sb->s_blocksize); 244 inode_add_bytes(inode, sb->s_blocksize);
225 mark_inode_dirty(inode); 245 mark_inode_dirty(inode);
226 } 246 }
@@ -327,7 +347,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
327 if (total_blocks == blocks) 347 if (total_blocks == blocks)
328 return 0; 348 return 0;
329 349
330 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 350 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
331 do { 351 do {
332 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, 352 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
333 total_blocks, type); 353 total_blocks, type);
@@ -348,29 +368,33 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
348int hfsplus_file_extend(struct inode *inode) 368int hfsplus_file_extend(struct inode *inode)
349{ 369{
350 struct super_block *sb = inode->i_sb; 370 struct super_block *sb = inode->i_sb;
371 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
372 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
351 u32 start, len, goal; 373 u32 start, len, goal;
352 int res; 374 int res;
353 375
354 if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) { 376 if (sbi->alloc_file->i_size * 8 <
377 sbi->total_blocks - sbi->free_blocks + 8) {
355 // extend alloc file 378 // extend alloc file
356 printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8, 379 printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
357 HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks); 380 sbi->alloc_file->i_size * 8,
381 sbi->total_blocks, sbi->free_blocks);
358 return -ENOSPC; 382 return -ENOSPC;
359 } 383 }
360 384
361 mutex_lock(&HFSPLUS_I(inode).extents_lock); 385 mutex_lock(&hip->extents_lock);
362 if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) 386 if (hip->alloc_blocks == hip->first_blocks)
363 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); 387 goal = hfsplus_ext_lastblock(hip->first_extents);
364 else { 388 else {
365 res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks); 389 res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
366 if (res) 390 if (res)
367 goto out; 391 goto out;
368 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents); 392 goal = hfsplus_ext_lastblock(hip->cached_extents);
369 } 393 }
370 394
371 len = HFSPLUS_I(inode).clump_blocks; 395 len = hip->clump_blocks;
372 start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len); 396 start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
373 if (start >= HFSPLUS_SB(sb).total_blocks) { 397 if (start >= sbi->total_blocks) {
374 start = hfsplus_block_allocate(sb, goal, 0, &len); 398 start = hfsplus_block_allocate(sb, goal, 0, &len);
375 if (start >= goal) { 399 if (start >= goal) {
376 res = -ENOSPC; 400 res = -ENOSPC;
@@ -379,56 +403,56 @@ int hfsplus_file_extend(struct inode *inode)
379 } 403 }
380 404
381 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); 405 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
382 if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) { 406
383 if (!HFSPLUS_I(inode).first_blocks) { 407 if (hip->alloc_blocks <= hip->first_blocks) {
408 if (!hip->first_blocks) {
384 dprint(DBG_EXTENT, "first extents\n"); 409 dprint(DBG_EXTENT, "first extents\n");
385 /* no extents yet */ 410 /* no extents yet */
386 HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start); 411 hip->first_extents[0].start_block = cpu_to_be32(start);
387 HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len); 412 hip->first_extents[0].block_count = cpu_to_be32(len);
388 res = 0; 413 res = 0;
389 } else { 414 } else {
390 /* try to append to extents in inode */ 415 /* try to append to extents in inode */
391 res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents, 416 res = hfsplus_add_extent(hip->first_extents,
392 HFSPLUS_I(inode).alloc_blocks, 417 hip->alloc_blocks,
393 start, len); 418 start, len);
394 if (res == -ENOSPC) 419 if (res == -ENOSPC)
395 goto insert_extent; 420 goto insert_extent;
396 } 421 }
397 if (!res) { 422 if (!res) {
398 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 423 hfsplus_dump_extent(hip->first_extents);
399 HFSPLUS_I(inode).first_blocks += len; 424 hip->first_blocks += len;
400 } 425 }
401 } else { 426 } else {
402 res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents, 427 res = hfsplus_add_extent(hip->cached_extents,
403 HFSPLUS_I(inode).alloc_blocks - 428 hip->alloc_blocks - hip->cached_start,
404 HFSPLUS_I(inode).cached_start,
405 start, len); 429 start, len);
406 if (!res) { 430 if (!res) {
407 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 431 hfsplus_dump_extent(hip->cached_extents);
408 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 432 hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
409 HFSPLUS_I(inode).cached_blocks += len; 433 hip->cached_blocks += len;
410 } else if (res == -ENOSPC) 434 } else if (res == -ENOSPC)
411 goto insert_extent; 435 goto insert_extent;
412 } 436 }
413out: 437out:
414 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 438 mutex_unlock(&hip->extents_lock);
415 if (!res) { 439 if (!res) {
416 HFSPLUS_I(inode).alloc_blocks += len; 440 hip->alloc_blocks += len;
417 mark_inode_dirty(inode); 441 mark_inode_dirty(inode);
418 } 442 }
419 return res; 443 return res;
420 444
421insert_extent: 445insert_extent:
422 dprint(DBG_EXTENT, "insert new extent\n"); 446 dprint(DBG_EXTENT, "insert new extent\n");
423 hfsplus_ext_write_extent(inode); 447 hfsplus_ext_write_extent_locked(inode);
424 448
425 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 449 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
426 HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start); 450 hip->cached_extents[0].start_block = cpu_to_be32(start);
427 HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len); 451 hip->cached_extents[0].block_count = cpu_to_be32(len);
428 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 452 hfsplus_dump_extent(hip->cached_extents);
429 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; 453 hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
430 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks; 454 hip->cached_start = hip->alloc_blocks;
431 HFSPLUS_I(inode).cached_blocks = len; 455 hip->cached_blocks = len;
432 456
433 res = 0; 457 res = 0;
434 goto out; 458 goto out;
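
The append path above leans on hfsplus_add_extent(), which is not part of this hunk. Below is a minimal userspace model of what it does, assuming the usual HFS+ layout of eight runs per extent record (the 8-run loop is visible later in this patch, in hfsplus_inode_read_fork): a new allocation either grows the run it is contiguous with or takes the next free slot, and a full record reports -ENOSPC so the caller can jump to insert_extent and start a fresh record in the extents B-tree. The names (struct run, add_extent) are illustrative only, not the kernel function.

#include <stdio.h>
#include <errno.h>
#include <stdint.h>

struct run { uint32_t start_block, block_count; };
#define RUNS_PER_RECORD 8

/* Append a freshly allocated run [start, start+len) at file offset
 * 'offset' (in allocation blocks, relative to this record). */
static int add_extent(struct run *rec, uint32_t offset,
		      uint32_t start, uint32_t len)
{
	int i;

	for (i = 0; i < RUNS_PER_RECORD; i++) {
		uint32_t count = rec[i].block_count;

		if (offset == count) {
			/* 'start' continues this run: just grow it */
			if (rec[i].start_block + count == start) {
				rec[i].block_count += len;
				return 0;
			}
			/* otherwise the new run needs the next slot;
			 * a full record tells the caller to insert a
			 * new record (the insert_extent path above) */
			if (i + 1 >= RUNS_PER_RECORD)
				return -ENOSPC;
			rec[i + 1].start_block = start;
			rec[i + 1].block_count = len;
			return 0;
		}
		if (offset < count)
			break;
		offset -= count;
	}
	return -EIO;	/* offset does not land at the end of a run */
}

int main(void)
{
	/* the caller fills the very first run directly, as in the hunk above */
	struct run rec[RUNS_PER_RECORD] = { { 100, 10 } };

	add_extent(rec, 10, 110, 5);	/* contiguous: run grows to 15 */
	add_extent(rec, 15, 200, 4);	/* not contiguous: next slot */
	printf("run0=%u+%u run1=%u+%u\n",
	       rec[0].start_block, rec[0].block_count,
	       rec[1].start_block, rec[1].block_count);
	return 0;
}
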
@@ -437,13 +461,15 @@ insert_extent:
437void hfsplus_file_truncate(struct inode *inode) 461void hfsplus_file_truncate(struct inode *inode)
438{ 462{
439 struct super_block *sb = inode->i_sb; 463 struct super_block *sb = inode->i_sb;
464 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
440 struct hfs_find_data fd; 465 struct hfs_find_data fd;
441 u32 alloc_cnt, blk_cnt, start; 466 u32 alloc_cnt, blk_cnt, start;
442 int res; 467 int res;
443 468
444 dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, 469 dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
445 (long long)HFSPLUS_I(inode).phys_size, inode->i_size); 470 inode->i_ino, (long long)hip->phys_size, inode->i_size);
446 if (inode->i_size > HFSPLUS_I(inode).phys_size) { 471
472 if (inode->i_size > hip->phys_size) {
447 struct address_space *mapping = inode->i_mapping; 473 struct address_space *mapping = inode->i_mapping;
448 struct page *page; 474 struct page *page;
449 void *fsdata; 475 void *fsdata;
@@ -460,47 +486,48 @@ void hfsplus_file_truncate(struct inode *inode)
460 return; 486 return;
461 mark_inode_dirty(inode); 487 mark_inode_dirty(inode);
462 return; 488 return;
463 } else if (inode->i_size == HFSPLUS_I(inode).phys_size) 489 } else if (inode->i_size == hip->phys_size)
464 return; 490 return;
465 491
466 blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift; 492 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
467 alloc_cnt = HFSPLUS_I(inode).alloc_blocks; 493 HFSPLUS_SB(sb)->alloc_blksz_shift;
494 alloc_cnt = hip->alloc_blocks;
468 if (blk_cnt == alloc_cnt) 495 if (blk_cnt == alloc_cnt)
469 goto out; 496 goto out;
470 497
471 mutex_lock(&HFSPLUS_I(inode).extents_lock); 498 mutex_lock(&hip->extents_lock);
472 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 499 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
473 while (1) { 500 while (1) {
474 if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { 501 if (alloc_cnt == hip->first_blocks) {
475 hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents, 502 hfsplus_free_extents(sb, hip->first_extents,
476 alloc_cnt, alloc_cnt - blk_cnt); 503 alloc_cnt, alloc_cnt - blk_cnt);
477 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 504 hfsplus_dump_extent(hip->first_extents);
478 HFSPLUS_I(inode).first_blocks = blk_cnt; 505 hip->first_blocks = blk_cnt;
479 break; 506 break;
480 } 507 }
481 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); 508 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
482 if (res) 509 if (res)
483 break; 510 break;
484 start = HFSPLUS_I(inode).cached_start; 511 start = hip->cached_start;
485 hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents, 512 hfsplus_free_extents(sb, hip->cached_extents,
486 alloc_cnt - start, alloc_cnt - blk_cnt); 513 alloc_cnt - start, alloc_cnt - blk_cnt);
487 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 514 hfsplus_dump_extent(hip->cached_extents);
488 if (blk_cnt > start) { 515 if (blk_cnt > start) {
489 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 516 hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
490 break; 517 break;
491 } 518 }
492 alloc_cnt = start; 519 alloc_cnt = start;
493 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; 520 hip->cached_start = hip->cached_blocks = 0;
494 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 521 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
495 hfs_brec_remove(&fd); 522 hfs_brec_remove(&fd);
496 } 523 }
497 hfs_find_exit(&fd); 524 hfs_find_exit(&fd);
498 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 525 mutex_unlock(&hip->extents_lock);
499 526
500 HFSPLUS_I(inode).alloc_blocks = blk_cnt; 527 hip->alloc_blocks = blk_cnt;
501out: 528out:
502 HFSPLUS_I(inode).phys_size = inode->i_size; 529 hip->phys_size = inode->i_size;
503 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 530 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
504 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 531 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
505 mark_inode_dirty(inode); 532 mark_inode_dirty(inode);
506} 533}
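
For reference, the arithmetic hfsplus_get_block() performs is unchanged by the sbi conversion: iblock (a VFS-sized file block) is scaled down by fs_shift to find the allocation block, the extent map resolves that to a disk allocation block, and the result is scaled back up and offset into the volume. A small standalone sketch of that mapping; the helper names (example_sbi, map_block) and the values in main() are made up for illustration.

#include <stdio.h>
#include <stdint.h>

/* fs_shift is alloc_blksz_shift - sb->s_blocksize_bits in the real code */
struct example_sbi {
	int fs_shift;		/* log2(allocation block / VFS block) */
	uint32_t blockoffset;	/* volume start, in VFS-sized blocks */
};

/* Map a file-relative VFS block (iblock) whose allocation block the
 * extent map resolved to dblock, back to an absolute VFS block. */
static uint64_t map_block(const struct example_sbi *sbi,
			  uint64_t iblock, uint32_t dblock)
{
	uint32_t mask = (1U << sbi->fs_shift) - 1;

	return ((uint64_t)dblock << sbi->fs_shift) +
	       sbi->blockoffset + (iblock & mask);
}

int main(void)
{
	/* 4k allocation blocks on 512-byte VFS blocks: fs_shift = 3 */
	struct example_sbi sbi = { 3, 64 };
	uint64_t iblock = 10;			  /* file block to map */
	uint32_t ablock = iblock >> sbi.fs_shift; /* allocation block 1 */
	uint32_t dblock = 1000;	/* pretend the extent record said so */

	printf("ablock=%u maps to disk block %llu\n", ablock,
	       (unsigned long long)map_block(&sbi, iblock, dblock));
	return 0;
}
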
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index dc856be3c2b0..cb3653efb57a 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -62,7 +62,7 @@ struct hfs_btree {
62 unsigned int depth; 62 unsigned int depth;
63 63
64 //unsigned int map1_size, map_size; 64 //unsigned int map1_size, map_size;
65 struct semaphore tree_lock; 65 struct mutex tree_lock;
66 66
67 unsigned int pages_per_bnode; 67 unsigned int pages_per_bnode;
68 spinlock_t hash_lock; 68 spinlock_t hash_lock;
@@ -121,16 +121,21 @@ struct hfsplus_sb_info {
121 u32 sect_count; 121 u32 sect_count;
122 int fs_shift; 122 int fs_shift;
123 123
124 /* Stuff in host order from Vol Header */ 124 /* immutable data from the volume header */
125 u32 alloc_blksz; 125 u32 alloc_blksz;
126 int alloc_blksz_shift; 126 int alloc_blksz_shift;
127 u32 total_blocks; 127 u32 total_blocks;
128 u32 data_clump_blocks, rsrc_clump_blocks;
129
130 /* mutable data from the volume header, protected by alloc_mutex */
128 u32 free_blocks; 131 u32 free_blocks;
129 u32 next_alloc; 132 struct mutex alloc_mutex;
133
134 /* mutable data from the volume header, protected by vh_mutex */
130 u32 next_cnid; 135 u32 next_cnid;
131 u32 file_count; 136 u32 file_count;
132 u32 folder_count; 137 u32 folder_count;
133 u32 data_clump_blocks, rsrc_clump_blocks; 138 struct mutex vh_mutex;
134 139
135 /* Config options */ 140 /* Config options */
136 u32 creator; 141 u32 creator;
@@ -143,40 +148,50 @@ struct hfsplus_sb_info {
143 int part, session; 148 int part, session;
144 149
145 unsigned long flags; 150 unsigned long flags;
146
147 struct hlist_head rsrc_inodes;
148}; 151};
149 152
150#define HFSPLUS_SB_WRITEBACKUP 0x0001 153#define HFSPLUS_SB_WRITEBACKUP 0
151#define HFSPLUS_SB_NODECOMPOSE 0x0002 154#define HFSPLUS_SB_NODECOMPOSE 1
152#define HFSPLUS_SB_FORCE 0x0004 155#define HFSPLUS_SB_FORCE 2
153#define HFSPLUS_SB_HFSX 0x0008 156#define HFSPLUS_SB_HFSX 3
154#define HFSPLUS_SB_CASEFOLD 0x0010 157#define HFSPLUS_SB_CASEFOLD 4
155 158
156 159
157struct hfsplus_inode_info { 160struct hfsplus_inode_info {
158 struct mutex extents_lock;
159 u32 clump_blocks, alloc_blocks;
160 sector_t fs_blocks;
161 /* Allocation extents from catalog record or volume header */
162 hfsplus_extent_rec first_extents;
163 u32 first_blocks;
164 hfsplus_extent_rec cached_extents;
165 u32 cached_start, cached_blocks;
166 atomic_t opencnt; 161 atomic_t opencnt;
167 162
168 struct inode *rsrc_inode; 163 /*
164 * Extent allocation information, protected by extents_lock.
165 */
166 u32 first_blocks;
167 u32 clump_blocks;
168 u32 alloc_blocks;
169 u32 cached_start;
170 u32 cached_blocks;
171 hfsplus_extent_rec first_extents;
172 hfsplus_extent_rec cached_extents;
169 unsigned long flags; 173 unsigned long flags;
174 struct mutex extents_lock;
170 175
176 /*
177 * Immutable data.
178 */
179 struct inode *rsrc_inode;
171 __be32 create_date; 180 __be32 create_date;
172 /* Device number in hfsplus_permissions in catalog */
173 u32 dev;
174 /* BSD system and user file flags */
175 u8 rootflags;
176 u8 userflags;
177 181
182 /*
183 * Protected by sbi->vh_mutex.
184 */
185 u32 linkid;
186
187 /*
188 * Protected by i_mutex.
189 */
190 sector_t fs_blocks;
191 u8 userflags; /* BSD user file flags */
178 struct list_head open_dir_list; 192 struct list_head open_dir_list;
179 loff_t phys_size; 193 loff_t phys_size;
194
180 struct inode vfs_inode; 195 struct inode vfs_inode;
181}; 196};
182 197
@@ -184,8 +199,8 @@ struct hfsplus_inode_info {
184#define HFSPLUS_FLG_EXT_DIRTY 0x0002 199#define HFSPLUS_FLG_EXT_DIRTY 0x0002
185#define HFSPLUS_FLG_EXT_NEW 0x0004 200#define HFSPLUS_FLG_EXT_NEW 0x0004
186 201
187#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)) 202#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
188#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC) 203#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
189 204
190struct hfs_find_data { 205struct hfs_find_data {
191 /* filled by caller */ 206 /* filled by caller */
@@ -311,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
311int hfsplus_delete_cat(u32, struct inode *, struct qstr *); 326int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
312int hfsplus_rename_cat(u32, struct inode *, struct qstr *, 327int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
313 struct inode *, struct qstr *); 328 struct inode *, struct qstr *);
329void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
314 330
315/* dir.c */ 331/* dir.c */
316extern const struct inode_operations hfsplus_dir_inode_operations; 332extern const struct inode_operations hfsplus_dir_inode_operations;
@@ -372,26 +388,15 @@ int hfsplus_read_wrapper(struct super_block *);
372int hfs_part_find(struct super_block *, sector_t *, sector_t *); 388int hfs_part_find(struct super_block *, sector_t *, sector_t *);
373 389
374/* access macros */ 390/* access macros */
375/*
376static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) 391static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
377{ 392{
378 return sb->s_fs_info; 393 return sb->s_fs_info;
379} 394}
395
380static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) 396static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
381{ 397{
382 return list_entry(inode, struct hfsplus_inode_info, vfs_inode); 398 return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
383} 399}
384*/
385#define HFSPLUS_SB(super) (*(struct hfsplus_sb_info *)(super)->s_fs_info)
386#define HFSPLUS_I(inode) (*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
387
388#if 1
389#define hfsplus_kmap(p) ({ struct page *__p = (p); kmap(__p); })
390#define hfsplus_kunmap(p) ({ struct page *__p = (p); kunmap(__p); __p; })
391#else
392#define hfsplus_kmap(p) kmap(p)
393#define hfsplus_kunmap(p) kunmap(p)
394#endif
395 400
396#define sb_bread512(sb, sec, data) ({ \ 401#define sb_bread512(sb, sec, data) ({ \
397 struct buffer_head *__bh; \ 402 struct buffer_head *__bh; \
@@ -419,6 +424,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
419#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) 424#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
420#define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) 425#define hfsp_now2mt() __hfsp_ut2mt(get_seconds())
421 426
422#define kdev_t_to_nr(x) (x)
423
424#endif 427#endif
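
The accessor change above drives most of the mechanical churn in this patch: HFSPLUS_SB() and HFSPLUS_I() stop being macros that dereference the private data and become inline functions returning pointers, so every HFSPLUS_SB(sb).field call site turns into HFSPLUS_SB(sb)->field and gains real type checking. A toy standalone version of the same pattern; all toy_* names are stand-ins, not the kernel structures.

#include <stdio.h>
#include <stddef.h>

struct toy_super_block {
	void *s_fs_info;		/* filesystem-private data */
};

struct toy_sb_info {
	unsigned int total_blocks;
	unsigned int free_blocks;
};

struct toy_inode {
	unsigned long i_ino;
};

struct toy_inode_info {
	unsigned int first_blocks;
	struct toy_inode vfs_inode;	/* embedded VFS inode, as in hfsplus */
};

/* pointer-returning accessors, mirroring the new inline helpers */
static inline struct toy_sb_info *TOY_SB(struct toy_super_block *sb)
{
	return sb->s_fs_info;
}

static inline struct toy_inode_info *TOY_I(struct toy_inode *inode)
{
	/* same container_of/list_entry idea as HFSPLUS_I() */
	return (struct toy_inode_info *)
		((char *)inode - offsetof(struct toy_inode_info, vfs_inode));
}

int main(void)
{
	struct toy_sb_info sbi = { .total_blocks = 1024, .free_blocks = 512 };
	struct toy_super_block sb = { .s_fs_info = &sbi };
	struct toy_inode_info ip = {
		.first_blocks = 8,
		.vfs_inode = { .i_ino = 16 },
	};

	/* the old macro style would have been TOY_SB(sb).free_blocks */
	printf("free=%u first=%u ino=%lu\n",
	       TOY_SB(&sb)->free_blocks,
	       TOY_I(&ip.vfs_inode)->first_blocks,
	       ip.vfs_inode.i_ino);
	return 0;
}
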
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index fe99fe8db61a..6892899fd6fb 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -200,6 +200,7 @@ struct hfsplus_cat_key {
200 struct hfsplus_unistr name; 200 struct hfsplus_unistr name;
201} __packed; 201} __packed;
202 202
203#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key))
203 204
204/* Structs from hfs.h */ 205/* Structs from hfs.h */
205struct hfsp_point { 206struct hfsp_point {
@@ -323,7 +324,7 @@ struct hfsplus_ext_key {
323 __be32 start_block; 324 __be32 start_block;
324} __packed; 325} __packed;
325 326
326#define HFSPLUS_EXT_KEYLEN 12 327#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key)
327 328
328/* HFS+ generic BTree key */ 329/* HFS+ generic BTree key */
329typedef union { 330typedef union {
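
Both key-length changes above swap a hard-coded length for sizeof() on the packed on-disk key. A quick standalone check for the extents key; the field layout is copied from hfsplus_raw.h for illustration, and packed it is exactly the 12 bytes the old literal spelled out.

#include <stdio.h>
#include <stdint.h>

struct toy_ext_key {
	uint16_t key_len;	/* __be16 in hfsplus_raw.h */
	uint8_t  fork_type;
	uint8_t  pad;
	uint32_t cnid;		/* __be32 */
	uint32_t start_block;	/* __be32, visible in the hunk above */
} __attribute__((packed));

int main(void)
{
	/* prints 12, the value the old HFSPLUS_EXT_KEYLEN literal hard-coded */
	printf("sizeof(toy_ext_key) = %zu\n", sizeof(struct toy_ext_key));
	return 0;
}
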
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c5a979d62c65..78449280dae0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -36,7 +36,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
36 *pagep = NULL; 36 *pagep = NULL;
37 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 37 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
38 hfsplus_get_block, 38 hfsplus_get_block,
39 &HFSPLUS_I(mapping->host).phys_size); 39 &HFSPLUS_I(mapping->host)->phys_size);
40 if (unlikely(ret)) { 40 if (unlikely(ret)) {
41 loff_t isize = mapping->host->i_size; 41 loff_t isize = mapping->host->i_size;
42 if (pos + len > isize) 42 if (pos + len > isize)
@@ -62,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
62 62
63 switch (inode->i_ino) { 63 switch (inode->i_ino) {
64 case HFSPLUS_EXT_CNID: 64 case HFSPLUS_EXT_CNID:
65 tree = HFSPLUS_SB(sb).ext_tree; 65 tree = HFSPLUS_SB(sb)->ext_tree;
66 break; 66 break;
67 case HFSPLUS_CAT_CNID: 67 case HFSPLUS_CAT_CNID:
68 tree = HFSPLUS_SB(sb).cat_tree; 68 tree = HFSPLUS_SB(sb)->cat_tree;
69 break; 69 break;
70 case HFSPLUS_ATTR_CNID: 70 case HFSPLUS_ATTR_CNID:
71 tree = HFSPLUS_SB(sb).attr_tree; 71 tree = HFSPLUS_SB(sb)->attr_tree;
72 break; 72 break;
73 default: 73 default:
74 BUG(); 74 BUG();
@@ -172,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
172 struct hfs_find_data fd; 172 struct hfs_find_data fd;
173 struct super_block *sb = dir->i_sb; 173 struct super_block *sb = dir->i_sb;
174 struct inode *inode = NULL; 174 struct inode *inode = NULL;
175 struct hfsplus_inode_info *hip;
175 int err; 176 int err;
176 177
177 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) 178 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
178 goto out; 179 goto out;
179 180
180 inode = HFSPLUS_I(dir).rsrc_inode; 181 inode = HFSPLUS_I(dir)->rsrc_inode;
181 if (inode) 182 if (inode)
182 goto out; 183 goto out;
183 184
@@ -185,12 +186,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
185 if (!inode) 186 if (!inode)
186 return ERR_PTR(-ENOMEM); 187 return ERR_PTR(-ENOMEM);
187 188
189 hip = HFSPLUS_I(inode);
188 inode->i_ino = dir->i_ino; 190 inode->i_ino = dir->i_ino;
189 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 191 INIT_LIST_HEAD(&hip->open_dir_list);
190 mutex_init(&HFSPLUS_I(inode).extents_lock); 192 mutex_init(&hip->extents_lock);
191 HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; 193 hip->flags = HFSPLUS_FLG_RSRC;
192 194
193 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 195 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
194 err = hfsplus_find_cat(sb, dir->i_ino, &fd); 196 err = hfsplus_find_cat(sb, dir->i_ino, &fd);
195 if (!err) 197 if (!err)
196 err = hfsplus_cat_read_inode(inode, &fd); 198 err = hfsplus_cat_read_inode(inode, &fd);
@@ -199,10 +201,18 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
199 iput(inode); 201 iput(inode);
200 return ERR_PTR(err); 202 return ERR_PTR(err);
201 } 203 }
202 HFSPLUS_I(inode).rsrc_inode = dir; 204 hip->rsrc_inode = dir;
203 HFSPLUS_I(dir).rsrc_inode = inode; 205 HFSPLUS_I(dir)->rsrc_inode = inode;
204 igrab(dir); 206 igrab(dir);
205 hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes); 207
208 /*
209 * __mark_inode_dirty expects inodes to be hashed. Since we don't
210 * want resource fork inodes in the regular inode space, we make them
211 * appear hashed, but do not put on any lists. hlist_del()
212 * will work fine and require no locking.
213 */
214 inode->i_hash.pprev = &inode->i_hash.next;
215
206 mark_inode_dirty(inode); 216 mark_inode_dirty(inode);
207out: 217out:
208 d_add(dentry, inode); 218 d_add(dentry, inode);
@@ -211,30 +221,27 @@ out:
211 221
212static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) 222static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
213{ 223{
214 struct super_block *sb = inode->i_sb; 224 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
215 u16 mode; 225 u16 mode;
216 226
217 mode = be16_to_cpu(perms->mode); 227 mode = be16_to_cpu(perms->mode);
218 228
219 inode->i_uid = be32_to_cpu(perms->owner); 229 inode->i_uid = be32_to_cpu(perms->owner);
220 if (!inode->i_uid && !mode) 230 if (!inode->i_uid && !mode)
221 inode->i_uid = HFSPLUS_SB(sb).uid; 231 inode->i_uid = sbi->uid;
222 232
223 inode->i_gid = be32_to_cpu(perms->group); 233 inode->i_gid = be32_to_cpu(perms->group);
224 if (!inode->i_gid && !mode) 234 if (!inode->i_gid && !mode)
225 inode->i_gid = HFSPLUS_SB(sb).gid; 235 inode->i_gid = sbi->gid;
226 236
227 if (dir) { 237 if (dir) {
228 mode = mode ? (mode & S_IALLUGO) : 238 mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
229 (S_IRWXUGO & ~(HFSPLUS_SB(sb).umask));
230 mode |= S_IFDIR; 239 mode |= S_IFDIR;
231 } else if (!mode) 240 } else if (!mode)
232 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & 241 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
233 ~(HFSPLUS_SB(sb).umask));
234 inode->i_mode = mode; 242 inode->i_mode = mode;
235 243
236 HFSPLUS_I(inode).rootflags = perms->rootflags; 244 HFSPLUS_I(inode)->userflags = perms->userflags;
237 HFSPLUS_I(inode).userflags = perms->userflags;
238 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) 245 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
239 inode->i_flags |= S_IMMUTABLE; 246 inode->i_flags |= S_IMMUTABLE;
240 else 247 else
@@ -245,30 +252,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
245 inode->i_flags &= ~S_APPEND; 252 inode->i_flags &= ~S_APPEND;
246} 253}
247 254
248static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
249{
250 if (inode->i_flags & S_IMMUTABLE)
251 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
252 else
253 perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
254 if (inode->i_flags & S_APPEND)
255 perms->rootflags |= HFSPLUS_FLG_APPEND;
256 else
257 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
258 perms->userflags = HFSPLUS_I(inode).userflags;
259 perms->mode = cpu_to_be16(inode->i_mode);
260 perms->owner = cpu_to_be32(inode->i_uid);
261 perms->group = cpu_to_be32(inode->i_gid);
262 perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
263}
264
265static int hfsplus_file_open(struct inode *inode, struct file *file) 255static int hfsplus_file_open(struct inode *inode, struct file *file)
266{ 256{
267 if (HFSPLUS_IS_RSRC(inode)) 257 if (HFSPLUS_IS_RSRC(inode))
268 inode = HFSPLUS_I(inode).rsrc_inode; 258 inode = HFSPLUS_I(inode)->rsrc_inode;
269 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 259 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
270 return -EOVERFLOW; 260 return -EOVERFLOW;
271 atomic_inc(&HFSPLUS_I(inode).opencnt); 261 atomic_inc(&HFSPLUS_I(inode)->opencnt);
272 return 0; 262 return 0;
273} 263}
274 264
@@ -277,12 +267,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
277 struct super_block *sb = inode->i_sb; 267 struct super_block *sb = inode->i_sb;
278 268
279 if (HFSPLUS_IS_RSRC(inode)) 269 if (HFSPLUS_IS_RSRC(inode))
280 inode = HFSPLUS_I(inode).rsrc_inode; 270 inode = HFSPLUS_I(inode)->rsrc_inode;
281 if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { 271 if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
282 mutex_lock(&inode->i_mutex); 272 mutex_lock(&inode->i_mutex);
283 hfsplus_file_truncate(inode); 273 hfsplus_file_truncate(inode);
284 if (inode->i_flags & S_DEAD) { 274 if (inode->i_flags & S_DEAD) {
285 hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); 275 hfsplus_delete_cat(inode->i_ino,
276 HFSPLUS_SB(sb)->hidden_dir, NULL);
286 hfsplus_delete_inode(inode); 277 hfsplus_delete_inode(inode);
287 } 278 }
288 mutex_unlock(&inode->i_mutex); 279 mutex_unlock(&inode->i_mutex);
@@ -361,47 +352,52 @@ static const struct file_operations hfsplus_file_operations = {
361 352
362struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 353struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
363{ 354{
355 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
364 struct inode *inode = new_inode(sb); 356 struct inode *inode = new_inode(sb);
357 struct hfsplus_inode_info *hip;
358
365 if (!inode) 359 if (!inode)
366 return NULL; 360 return NULL;
367 361
368 inode->i_ino = HFSPLUS_SB(sb).next_cnid++; 362 inode->i_ino = sbi->next_cnid++;
369 inode->i_mode = mode; 363 inode->i_mode = mode;
370 inode->i_uid = current_fsuid(); 364 inode->i_uid = current_fsuid();
371 inode->i_gid = current_fsgid(); 365 inode->i_gid = current_fsgid();
372 inode->i_nlink = 1; 366 inode->i_nlink = 1;
373 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 367 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
374 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 368
375 mutex_init(&HFSPLUS_I(inode).extents_lock); 369 hip = HFSPLUS_I(inode);
376 atomic_set(&HFSPLUS_I(inode).opencnt, 0); 370 INIT_LIST_HEAD(&hip->open_dir_list);
377 HFSPLUS_I(inode).flags = 0; 371 mutex_init(&hip->extents_lock);
378 memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); 372 atomic_set(&hip->opencnt, 0);
379 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 373 hip->flags = 0;
380 HFSPLUS_I(inode).alloc_blocks = 0; 374 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
381 HFSPLUS_I(inode).first_blocks = 0; 375 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
382 HFSPLUS_I(inode).cached_start = 0; 376 hip->alloc_blocks = 0;
383 HFSPLUS_I(inode).cached_blocks = 0; 377 hip->first_blocks = 0;
384 HFSPLUS_I(inode).phys_size = 0; 378 hip->cached_start = 0;
385 HFSPLUS_I(inode).fs_blocks = 0; 379 hip->cached_blocks = 0;
386 HFSPLUS_I(inode).rsrc_inode = NULL; 380 hip->phys_size = 0;
381 hip->fs_blocks = 0;
382 hip->rsrc_inode = NULL;
387 if (S_ISDIR(inode->i_mode)) { 383 if (S_ISDIR(inode->i_mode)) {
388 inode->i_size = 2; 384 inode->i_size = 2;
389 HFSPLUS_SB(sb).folder_count++; 385 sbi->folder_count++;
390 inode->i_op = &hfsplus_dir_inode_operations; 386 inode->i_op = &hfsplus_dir_inode_operations;
391 inode->i_fop = &hfsplus_dir_operations; 387 inode->i_fop = &hfsplus_dir_operations;
392 } else if (S_ISREG(inode->i_mode)) { 388 } else if (S_ISREG(inode->i_mode)) {
393 HFSPLUS_SB(sb).file_count++; 389 sbi->file_count++;
394 inode->i_op = &hfsplus_file_inode_operations; 390 inode->i_op = &hfsplus_file_inode_operations;
395 inode->i_fop = &hfsplus_file_operations; 391 inode->i_fop = &hfsplus_file_operations;
396 inode->i_mapping->a_ops = &hfsplus_aops; 392 inode->i_mapping->a_ops = &hfsplus_aops;
397 HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks; 393 hip->clump_blocks = sbi->data_clump_blocks;
398 } else if (S_ISLNK(inode->i_mode)) { 394 } else if (S_ISLNK(inode->i_mode)) {
399 HFSPLUS_SB(sb).file_count++; 395 sbi->file_count++;
400 inode->i_op = &page_symlink_inode_operations; 396 inode->i_op = &page_symlink_inode_operations;
401 inode->i_mapping->a_ops = &hfsplus_aops; 397 inode->i_mapping->a_ops = &hfsplus_aops;
402 HFSPLUS_I(inode).clump_blocks = 1; 398 hip->clump_blocks = 1;
403 } else 399 } else
404 HFSPLUS_SB(sb).file_count++; 400 sbi->file_count++;
405 insert_inode_hash(inode); 401 insert_inode_hash(inode);
406 mark_inode_dirty(inode); 402 mark_inode_dirty(inode);
407 sb->s_dirt = 1; 403 sb->s_dirt = 1;
@@ -414,11 +410,11 @@ void hfsplus_delete_inode(struct inode *inode)
414 struct super_block *sb = inode->i_sb; 410 struct super_block *sb = inode->i_sb;
415 411
416 if (S_ISDIR(inode->i_mode)) { 412 if (S_ISDIR(inode->i_mode)) {
417 HFSPLUS_SB(sb).folder_count--; 413 HFSPLUS_SB(sb)->folder_count--;
418 sb->s_dirt = 1; 414 sb->s_dirt = 1;
419 return; 415 return;
420 } 416 }
421 HFSPLUS_SB(sb).file_count--; 417 HFSPLUS_SB(sb)->file_count--;
422 if (S_ISREG(inode->i_mode)) { 418 if (S_ISREG(inode->i_mode)) {
423 if (!inode->i_nlink) { 419 if (!inode->i_nlink) {
424 inode->i_size = 0; 420 inode->i_size = 0;
@@ -434,34 +430,39 @@ void hfsplus_delete_inode(struct inode *inode)
434void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 430void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
435{ 431{
436 struct super_block *sb = inode->i_sb; 432 struct super_block *sb = inode->i_sb;
433 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
434 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
437 u32 count; 435 u32 count;
438 int i; 436 int i;
439 437
440 memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents, 438 memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
441 sizeof(hfsplus_extent_rec));
442 for (count = 0, i = 0; i < 8; i++) 439 for (count = 0, i = 0; i < 8; i++)
443 count += be32_to_cpu(fork->extents[i].block_count); 440 count += be32_to_cpu(fork->extents[i].block_count);
444 HFSPLUS_I(inode).first_blocks = count; 441 hip->first_blocks = count;
445 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 442 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
446 HFSPLUS_I(inode).cached_start = 0; 443 hip->cached_start = 0;
447 HFSPLUS_I(inode).cached_blocks = 0; 444 hip->cached_blocks = 0;
448 445
449 HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks); 446 hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
450 inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size); 447 hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
451 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 448 hip->fs_blocks =
452 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 449 (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
453 HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift; 450 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
454 if (!HFSPLUS_I(inode).clump_blocks) 451 hip->clump_blocks =
455 HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks : 452 be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
456 HFSPLUS_SB(sb).data_clump_blocks; 453 if (!hip->clump_blocks) {
454 hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
455 sbi->rsrc_clump_blocks :
456 sbi->data_clump_blocks;
457 }
457} 458}
458 459
459void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 460void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
460{ 461{
461 memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents, 462 memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
462 sizeof(hfsplus_extent_rec)); 463 sizeof(hfsplus_extent_rec));
463 fork->total_size = cpu_to_be64(inode->i_size); 464 fork->total_size = cpu_to_be64(inode->i_size);
464 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks); 465 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
465} 466}
466 467
467int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) 468int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
@@ -472,7 +473,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
472 473
473 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); 474 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
474 475
475 HFSPLUS_I(inode).dev = 0; 476 HFSPLUS_I(inode)->linkid = 0;
476 if (type == HFSPLUS_FOLDER) { 477 if (type == HFSPLUS_FOLDER) {
477 struct hfsplus_cat_folder *folder = &entry.folder; 478 struct hfsplus_cat_folder *folder = &entry.folder;
478 479
@@ -486,8 +487,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
486 inode->i_atime = hfsp_mt2ut(folder->access_date); 487 inode->i_atime = hfsp_mt2ut(folder->access_date);
487 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); 488 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
488 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); 489 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
489 HFSPLUS_I(inode).create_date = folder->create_date; 490 HFSPLUS_I(inode)->create_date = folder->create_date;
490 HFSPLUS_I(inode).fs_blocks = 0; 491 HFSPLUS_I(inode)->fs_blocks = 0;
491 inode->i_op = &hfsplus_dir_inode_operations; 492 inode->i_op = &hfsplus_dir_inode_operations;
492 inode->i_fop = &hfsplus_dir_operations; 493 inode->i_fop = &hfsplus_dir_operations;
493 } else if (type == HFSPLUS_FILE) { 494 } else if (type == HFSPLUS_FILE) {
@@ -518,7 +519,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
518 inode->i_atime = hfsp_mt2ut(file->access_date); 519 inode->i_atime = hfsp_mt2ut(file->access_date);
519 inode->i_mtime = hfsp_mt2ut(file->content_mod_date); 520 inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
520 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); 521 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
521 HFSPLUS_I(inode).create_date = file->create_date; 522 HFSPLUS_I(inode)->create_date = file->create_date;
522 } else { 523 } else {
523 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); 524 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
524 res = -EIO; 525 res = -EIO;
@@ -533,12 +534,12 @@ int hfsplus_cat_write_inode(struct inode *inode)
533 hfsplus_cat_entry entry; 534 hfsplus_cat_entry entry;
534 535
535 if (HFSPLUS_IS_RSRC(inode)) 536 if (HFSPLUS_IS_RSRC(inode))
536 main_inode = HFSPLUS_I(inode).rsrc_inode; 537 main_inode = HFSPLUS_I(inode)->rsrc_inode;
537 538
538 if (!main_inode->i_nlink) 539 if (!main_inode->i_nlink)
539 return 0; 540 return 0;
540 541
541 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd)) 542 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
542 /* panic? */ 543 /* panic? */
543 return -EIO; 544 return -EIO;
544 545
@@ -554,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
554 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 555 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
555 sizeof(struct hfsplus_cat_folder)); 556 sizeof(struct hfsplus_cat_folder));
556 /* simple node checks? */ 557 /* simple node checks? */
557 hfsplus_set_perms(inode, &folder->permissions); 558 hfsplus_cat_set_perms(inode, &folder->permissions);
558 folder->access_date = hfsp_ut2mt(inode->i_atime); 559 folder->access_date = hfsp_ut2mt(inode->i_atime);
559 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); 560 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
560 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); 561 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
@@ -576,11 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
576 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 577 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
577 sizeof(struct hfsplus_cat_file)); 578 sizeof(struct hfsplus_cat_file));
578 hfsplus_inode_write_fork(inode, &file->data_fork); 579 hfsplus_inode_write_fork(inode, &file->data_fork);
579 if (S_ISREG(inode->i_mode)) 580 hfsplus_cat_set_perms(inode, &file->permissions);
580 HFSPLUS_I(inode).dev = inode->i_nlink;
581 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
582 HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev);
583 hfsplus_set_perms(inode, &file->permissions);
584 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) 581 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
585 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 582 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
586 else 583 else
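
The most subtle change in this file is the resource-fork lookup: instead of hashing the resource inode into the old sbi rsrc_inodes list, it only points i_hash.pprev at &i_hash.next so the inode merely looks hashed. A trimmed-down model of the kernel's hlist primitives (simplified copies, not the kernel versions) shows why that satisfies the "is this inode hashed" check while hlist_del() on such an inode is a harmless self-update that needs no locking.

#include <stdio.h>

struct hlist_node {
	struct hlist_node *next, **pprev;
};

static int hlist_unhashed(const struct hlist_node *h)
{
	return !h->pprev;	/* a NULL pprev is what marks a node unhashed */
}

static void hlist_del(struct hlist_node *n)
{
	struct hlist_node *next = n->next;
	struct hlist_node **pprev = n->pprev;

	*pprev = next;		/* with pprev == &n->next this only touches n */
	if (next)
		next->pprev = pprev;
	n->next = NULL;
	n->pprev = NULL;
}

int main(void)
{
	struct hlist_node fake = { .next = NULL, .pprev = NULL };

	printf("before: unhashed=%d\n", hlist_unhashed(&fake));

	/* the trick from hfsplus_file_lookup(): appear hashed, join no list */
	fake.pprev = &fake.next;
	printf("faked:  unhashed=%d\n", hlist_unhashed(&fake));

	hlist_del(&fake);	/* safe: no shared list, no locking needed */
	printf("after:  unhashed=%d\n", hlist_unhashed(&fake));
	return 0;
}
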
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index ac405f099026..5b4667e08ef7 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,83 +17,98 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
22#include "hfsplus_fs.h" 21#include "hfsplus_fs.h"
23 22
24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 23static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
25{ 24{
26 struct inode *inode = filp->f_path.dentry->d_inode; 25 struct inode *inode = file->f_path.dentry->d_inode;
26 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags = 0;
28
29 if (inode->i_flags & S_IMMUTABLE)
30 flags |= FS_IMMUTABLE_FL;
31 if (inode->i_flags |= S_APPEND)
32 flags |= FS_APPEND_FL;
33 if (hip->userflags & HFSPLUS_FLG_NODUMP)
34 flags |= FS_NODUMP_FL;
35
36 return put_user(flags, user_flags);
37}
38
39static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
40{
41 struct inode *inode = file->f_path.dentry->d_inode;
42 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags; 43 unsigned int flags;
44 int err = 0;
28 45
29 lock_kernel(); 46 err = mnt_want_write(file->f_path.mnt);
30 switch (cmd) { 47 if (err)
31 case HFSPLUS_IOC_EXT2_GETFLAGS: 48 goto out;
32 flags = 0;
33 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
34 flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
35 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
36 flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
37 if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
38 flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
39 return put_user(flags, (int __user *)arg);
40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
41 int err = 0;
42 err = mnt_want_write(filp->f_path.mnt);
43 if (err) {
44 unlock_kernel();
45 return err;
46 }
47 49
48 if (!is_owner_or_cap(inode)) { 50 if (!is_owner_or_cap(inode)) {
49 err = -EACCES; 51 err = -EACCES;
50 goto setflags_out; 52 goto out_drop_write;
51 } 53 }
52 if (get_user(flags, (int __user *)arg)) {
53 err = -EFAULT;
54 goto setflags_out;
55 }
56 if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
57 HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
58 if (!capable(CAP_LINUX_IMMUTABLE)) {
59 err = -EPERM;
60 goto setflags_out;
61 }
62 }
63 54
64 /* don't silently ignore unsupported ext2 flags */ 55 if (get_user(flags, user_flags)) {
65 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { 56 err = -EFAULT;
66 err = -EOPNOTSUPP; 57 goto out_drop_write;
67 goto setflags_out; 58 }
68 } 59
69 if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */ 60 mutex_lock(&inode->i_mutex);
70 inode->i_flags |= S_IMMUTABLE; 61
71 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE; 62 if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
72 } else { 63 inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
73 inode->i_flags &= ~S_IMMUTABLE; 64 if (!capable(CAP_LINUX_IMMUTABLE)) {
74 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE; 65 err = -EPERM;
75 } 66 goto out_unlock_inode;
76 if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
77 inode->i_flags |= S_APPEND;
78 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
79 } else {
80 inode->i_flags &= ~S_APPEND;
81 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
82 } 67 }
83 if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
84 HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
85 else
86 HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
87
88 inode->i_ctime = CURRENT_TIME_SEC;
89 mark_inode_dirty(inode);
90setflags_out:
91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
93 return err;
94 } 68 }
69
70 /* don't silently ignore unsupported ext2 flags */
71 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
72 err = -EOPNOTSUPP;
73 goto out_unlock_inode;
74 }
75
76 if (flags & FS_IMMUTABLE_FL)
77 inode->i_flags |= S_IMMUTABLE;
78 else
79 inode->i_flags &= ~S_IMMUTABLE;
80
81 if (flags & FS_APPEND_FL)
82 inode->i_flags |= S_APPEND;
83 else
84 inode->i_flags &= ~S_APPEND;
85
86 if (flags & FS_NODUMP_FL)
87 hip->userflags |= HFSPLUS_FLG_NODUMP;
88 else
89 hip->userflags &= ~HFSPLUS_FLG_NODUMP;
90
91 inode->i_ctime = CURRENT_TIME_SEC;
92 mark_inode_dirty(inode);
93
94out_unlock_inode:
95 mutex_unlock(&inode->i_mutex);
96out_drop_write:
97 mnt_drop_write(file->f_path.mnt);
98out:
99 return err;
100}
101
102long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
103{
104 void __user *argp = (void __user *)arg;
105
106 switch (cmd) {
107 case HFSPLUS_IOC_EXT2_GETFLAGS:
108 return hfsplus_ioctl_getflags(file, argp);
109 case HFSPLUS_IOC_EXT2_SETFLAGS:
110 return hfsplus_ioctl_setflags(file, argp);
95 default: 111 default:
96 unlock_kernel();
97 return -ENOTTY; 112 return -ENOTTY;
98 } 113 }
99} 114}
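
The rewritten ioctl splits the old switch into getflags/setflags helpers that translate between three flag spaces: the VFS inode flags (S_IMMUTABLE, S_APPEND), the HFS+ BSD user flags (HFSPLUS_FLG_NODUMP), and the ext2-style FS_*_FL values the ioctl ABI speaks. A standalone sketch of that translation, using stand-in bit values since the real constants live in kernel headers; note that the getflags side must test i_flags with &, never assign to it.

#include <stdio.h>

#define X_IMMUTABLE	0x01	/* stands in for S_IMMUTABLE */
#define X_APPEND	0x02	/* stands in for S_APPEND */
#define X_FL_IMMUTABLE	0x10	/* stands in for FS_IMMUTABLE_FL */
#define X_FL_APPEND	0x20	/* stands in for FS_APPEND_FL */
#define X_FL_NODUMP	0x40	/* stands in for FS_NODUMP_FL */
#define X_HFS_NODUMP	0x01	/* stands in for HFSPLUS_FLG_NODUMP */

struct toy_inode {
	unsigned int i_flags;	/* VFS-level flags */
	unsigned int userflags;	/* HFS+ BSD user flags */
};

static unsigned int toy_getflags(const struct toy_inode *inode)
{
	unsigned int flags = 0;

	if (inode->i_flags & X_IMMUTABLE)
		flags |= X_FL_IMMUTABLE;
	if (inode->i_flags & X_APPEND)	/* test, don't assign */
		flags |= X_FL_APPEND;
	if (inode->userflags & X_HFS_NODUMP)
		flags |= X_FL_NODUMP;
	return flags;
}

static void toy_setflags(struct toy_inode *inode, unsigned int flags)
{
	/* each ext2-style flag is folded back into its native home */
	if (flags & X_FL_IMMUTABLE)
		inode->i_flags |= X_IMMUTABLE;
	else
		inode->i_flags &= ~X_IMMUTABLE;

	if (flags & X_FL_APPEND)
		inode->i_flags |= X_APPEND;
	else
		inode->i_flags &= ~X_APPEND;

	if (flags & X_FL_NODUMP)
		inode->userflags |= X_HFS_NODUMP;
	else
		inode->userflags &= ~X_HFS_NODUMP;
}

int main(void)
{
	struct toy_inode inode = { 0, 0 };

	toy_setflags(&inode, X_FL_APPEND | X_FL_NODUMP);
	printf("i_flags=%#x userflags=%#x roundtrip=%#x\n",
	       inode.i_flags, inode.userflags, toy_getflags(&inode));
	return 0;
}
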
@@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
110 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) 125 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
111 return -EOPNOTSUPP; 126 return -EOPNOTSUPP;
112 127
113 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 128 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
114 if (res) 129 if (res)
115 return res; 130 return res;
116 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 131 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -153,7 +168,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
153 return -EOPNOTSUPP; 168 return -EOPNOTSUPP;
154 169
155 if (size) { 170 if (size) {
156 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 171 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
157 if (res) 172 if (res)
158 return res; 173 return res;
159 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 174 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -177,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
177 } else 192 } else
178 res = size ? -ERANGE : 4; 193 res = size ? -ERANGE : 4;
179 } else 194 } else
180 res = -ENODATA; 195 res = -EOPNOTSUPP;
181out: 196out:
182 if (size) 197 if (size)
183 hfs_find_exit(&fd); 198 hfs_find_exit(&fd);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 572628b4b07d..f9ab276a4d8d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
143 kfree(p); 143 kfree(p);
144 break; 144 break;
145 case opt_decompose: 145 case opt_decompose:
146 sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE; 146 clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
147 break; 147 break;
148 case opt_nodecompose: 148 case opt_nodecompose:
149 sbi->flags |= HFSPLUS_SB_NODECOMPOSE; 149 set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
150 break; 150 break;
151 case opt_force: 151 case opt_force:
152 sbi->flags |= HFSPLUS_SB_FORCE; 152 set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
153 break; 153 break;
154 default: 154 default:
155 return 0; 155 return 0;
@@ -171,7 +171,7 @@ done:
171 171
172int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) 172int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
173{ 173{
174 struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb); 174 struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
175 175
176 if (sbi->creator != HFSPLUS_DEF_CR_TYPE) 176 if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
177 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); 177 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
@@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
184 seq_printf(seq, ",session=%u", sbi->session); 184 seq_printf(seq, ",session=%u", sbi->session);
185 if (sbi->nls) 185 if (sbi->nls)
186 seq_printf(seq, ",nls=%s", sbi->nls->charset); 186 seq_printf(seq, ",nls=%s", sbi->nls->charset);
187 if (sbi->flags & HFSPLUS_SB_NODECOMPOSE) 187 if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
188 seq_printf(seq, ",nodecompose"); 188 seq_printf(seq, ",nodecompose");
189 return 0; 189 return 0;
190} 190}
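
With HFSPLUS_SB_* redefined as bit numbers, the option parser above switches from OR-ing masks into sbi->flags to the kernel's atomic set_bit()/clear_bit()/test_bit() on an unsigned long. The userspace stand-ins below are not atomic; they only show the calling convention.

#include <stdio.h>

#define TOY_SB_WRITEBACKUP	0
#define TOY_SB_NODECOMPOSE	1
#define TOY_SB_FORCE		2

static void set_bit(int nr, unsigned long *addr)   { *addr |=  (1UL << nr); }
static void clear_bit(int nr, unsigned long *addr) { *addr &= ~(1UL << nr); }
static int  test_bit(int nr, const unsigned long *addr)
{
	return (*addr >> nr) & 1UL;
}

int main(void)
{
	unsigned long flags = 0;

	set_bit(TOY_SB_NODECOMPOSE, &flags);	/* "nodecompose" mount option */
	set_bit(TOY_SB_FORCE, &flags);		/* "force" mount option */
	clear_bit(TOY_SB_FORCE, &flags);	/* consumed after mount */

	printf("nodecompose=%d force=%d raw=%#lx\n",
	       test_bit(TOY_SB_NODECOMPOSE, &flags),
	       test_bit(TOY_SB_FORCE, &flags), flags);
	return 0;
}
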
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 1528a6fd0299..208b16c645cc 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -74,6 +74,7 @@ struct old_pmap {
74int hfs_part_find(struct super_block *sb, 74int hfs_part_find(struct super_block *sb,
75 sector_t *part_start, sector_t *part_size) 75 sector_t *part_start, sector_t *part_size)
76{ 76{
77 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
77 struct buffer_head *bh; 78 struct buffer_head *bh;
78 __be16 *data; 79 __be16 *data;
79 int i, size, res; 80 int i, size, res;
@@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb,
95 for (i = 0; i < size; p++, i++) { 96 for (i = 0; i < size; p++, i++) {
96 if (p->pdStart && p->pdSize && 97 if (p->pdStart && p->pdSize &&
97 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && 98 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
98 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { 99 (sbi->part < 0 || sbi->part == i)) {
99 *part_start += be32_to_cpu(p->pdStart); 100 *part_start += be32_to_cpu(p->pdStart);
100 *part_size = be32_to_cpu(p->pdSize); 101 *part_size = be32_to_cpu(p->pdSize);
101 res = 0; 102 res = 0;
@@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb,
111 size = be32_to_cpu(pm->pmMapBlkCnt); 112 size = be32_to_cpu(pm->pmMapBlkCnt);
112 for (i = 0; i < size;) { 113 for (i = 0; i < size;) {
113 if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && 114 if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
114 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { 115 (sbi->part < 0 || sbi->part == i)) {
115 *part_start += be32_to_cpu(pm->pmPyPartStart); 116 *part_start += be32_to_cpu(pm->pmPyPartStart);
116 *part_size = be32_to_cpu(pm->pmPartBlkCnt); 117 *part_size = be32_to_cpu(pm->pmPartBlkCnt);
117 res = 0; 118 res = 0;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 3b55c050c742..9a88d7536103 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,7 +12,6 @@
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/smp_lock.h>
16#include <linux/vfs.h> 15#include <linux/vfs.h>
17#include <linux/nls.h> 16#include <linux/nls.h>
18 17
@@ -21,40 +20,11 @@ static void hfsplus_destroy_inode(struct inode *inode);
21 20
22#include "hfsplus_fs.h" 21#include "hfsplus_fs.h"
23 22
24struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) 23static int hfsplus_system_read_inode(struct inode *inode)
25{ 24{
26 struct hfs_find_data fd; 25 struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
27 struct hfsplus_vh *vhdr;
28 struct inode *inode;
29 long err = -EIO;
30
31 inode = iget_locked(sb, ino);
32 if (!inode)
33 return ERR_PTR(-ENOMEM);
34 if (!(inode->i_state & I_NEW))
35 return inode;
36 26
37 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 27 switch (inode->i_ino) {
38 mutex_init(&HFSPLUS_I(inode).extents_lock);
39 HFSPLUS_I(inode).flags = 0;
40 HFSPLUS_I(inode).rsrc_inode = NULL;
41 atomic_set(&HFSPLUS_I(inode).opencnt, 0);
42
43 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
44 read_inode:
45 hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
46 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
47 if (!err)
48 err = hfsplus_cat_read_inode(inode, &fd);
49 hfs_find_exit(&fd);
50 if (err)
51 goto bad_inode;
52 goto done;
53 }
54 vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
55 switch(inode->i_ino) {
56 case HFSPLUS_ROOT_CNID:
57 goto read_inode;
58 case HFSPLUS_EXT_CNID: 28 case HFSPLUS_EXT_CNID:
59 hfsplus_inode_read_fork(inode, &vhdr->ext_file); 29 hfsplus_inode_read_fork(inode, &vhdr->ext_file);
60 inode->i_mapping->a_ops = &hfsplus_btree_aops; 30 inode->i_mapping->a_ops = &hfsplus_btree_aops;
@@ -75,74 +45,101 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
75 inode->i_mapping->a_ops = &hfsplus_btree_aops; 45 inode->i_mapping->a_ops = &hfsplus_btree_aops;
76 break; 46 break;
77 default: 47 default:
78 goto bad_inode; 48 return -EIO;
49 }
50
51 return 0;
52}
53
54struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
55{
56 struct hfs_find_data fd;
57 struct inode *inode;
58 int err;
59
60 inode = iget_locked(sb, ino);
61 if (!inode)
62 return ERR_PTR(-ENOMEM);
63 if (!(inode->i_state & I_NEW))
64 return inode;
65
66 INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
67 mutex_init(&HFSPLUS_I(inode)->extents_lock);
68 HFSPLUS_I(inode)->flags = 0;
69 HFSPLUS_I(inode)->rsrc_inode = NULL;
70 atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
71
72 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
73 inode->i_ino == HFSPLUS_ROOT_CNID) {
74 hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
75 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
76 if (!err)
77 err = hfsplus_cat_read_inode(inode, &fd);
78 hfs_find_exit(&fd);
79 } else {
80 err = hfsplus_system_read_inode(inode);
81 }
82
83 if (err) {
84 iget_failed(inode);
85 return ERR_PTR(err);
79 } 86 }
80 87
81done:
82 unlock_new_inode(inode); 88 unlock_new_inode(inode);
83 return inode; 89 return inode;
84
85bad_inode:
86 iget_failed(inode);
87 return ERR_PTR(err);
88} 90}
89 91
90static int hfsplus_write_inode(struct inode *inode, 92static int hfsplus_system_write_inode(struct inode *inode)
91 struct writeback_control *wbc)
92{ 93{
93 struct hfsplus_vh *vhdr; 94 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
94 int ret = 0; 95 struct hfsplus_vh *vhdr = sbi->s_vhdr;
96 struct hfsplus_fork_raw *fork;
97 struct hfs_btree *tree = NULL;
95 98
96 dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
97 hfsplus_ext_write_extent(inode);
98 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
99 return hfsplus_cat_write_inode(inode);
100 }
101 vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
102 switch (inode->i_ino) { 99 switch (inode->i_ino) {
103 case HFSPLUS_ROOT_CNID:
104 ret = hfsplus_cat_write_inode(inode);
105 break;
106 case HFSPLUS_EXT_CNID: 100 case HFSPLUS_EXT_CNID:
107 if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) { 101 fork = &vhdr->ext_file;
108 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 102 tree = sbi->ext_tree;
109 inode->i_sb->s_dirt = 1;
110 }
111 hfsplus_inode_write_fork(inode, &vhdr->ext_file);
112 hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree);
113 break; 103 break;
114 case HFSPLUS_CAT_CNID: 104 case HFSPLUS_CAT_CNID:
115 if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) { 105 fork = &vhdr->cat_file;
116 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 106 tree = sbi->cat_tree;
117 inode->i_sb->s_dirt = 1;
118 }
119 hfsplus_inode_write_fork(inode, &vhdr->cat_file);
120 hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree);
121 break; 107 break;
122 case HFSPLUS_ALLOC_CNID: 108 case HFSPLUS_ALLOC_CNID:
123 if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) { 109 fork = &vhdr->alloc_file;
124 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
125 inode->i_sb->s_dirt = 1;
126 }
127 hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
128 break; 110 break;
129 case HFSPLUS_START_CNID: 111 case HFSPLUS_START_CNID:
130 if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) { 112 fork = &vhdr->start_file;
131 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
132 inode->i_sb->s_dirt = 1;
133 }
134 hfsplus_inode_write_fork(inode, &vhdr->start_file);
135 break; 113 break;
136 case HFSPLUS_ATTR_CNID: 114 case HFSPLUS_ATTR_CNID:
137 if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) { 115 fork = &vhdr->attr_file;
138 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 116 tree = sbi->attr_tree;
139 inode->i_sb->s_dirt = 1; 117 default:
140 } 118 return -EIO;
141 hfsplus_inode_write_fork(inode, &vhdr->attr_file); 119 }
142 hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree); 120
143 break; 121 if (fork->total_size != cpu_to_be64(inode->i_size)) {
122 set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
123 inode->i_sb->s_dirt = 1;
144 } 124 }
145 return ret; 125 hfsplus_inode_write_fork(inode, fork);
126 if (tree)
127 hfs_btree_write(tree);
128 return 0;
129}
130
131static int hfsplus_write_inode(struct inode *inode,
132 struct writeback_control *wbc)
133{
134 dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
135
136 hfsplus_ext_write_extent(inode);
137
138 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
139 inode->i_ino == HFSPLUS_ROOT_CNID)
140 return hfsplus_cat_write_inode(inode);
141 else
142 return hfsplus_system_write_inode(inode);
146} 143}
147 144
148static void hfsplus_evict_inode(struct inode *inode) 145static void hfsplus_evict_inode(struct inode *inode)
@@ -151,51 +148,53 @@ static void hfsplus_evict_inode(struct inode *inode)
151 truncate_inode_pages(&inode->i_data, 0); 148 truncate_inode_pages(&inode->i_data, 0);
152 end_writeback(inode); 149 end_writeback(inode);
153 if (HFSPLUS_IS_RSRC(inode)) { 150 if (HFSPLUS_IS_RSRC(inode)) {
154 HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL; 151 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
155 iput(HFSPLUS_I(inode).rsrc_inode); 152 iput(HFSPLUS_I(inode)->rsrc_inode);
156 } 153 }
157} 154}
158 155
159int hfsplus_sync_fs(struct super_block *sb, int wait) 156int hfsplus_sync_fs(struct super_block *sb, int wait)
160{ 157{
161 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 158 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
159 struct hfsplus_vh *vhdr = sbi->s_vhdr;
162 160
163 dprint(DBG_SUPER, "hfsplus_write_super\n"); 161 dprint(DBG_SUPER, "hfsplus_write_super\n");
164 162
165 lock_super(sb); 163 mutex_lock(&sbi->vh_mutex);
164 mutex_lock(&sbi->alloc_mutex);
166 sb->s_dirt = 0; 165 sb->s_dirt = 0;
167 166
168 vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks); 167 vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
169 vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc); 168 vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
170 vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid); 169 vhdr->folder_count = cpu_to_be32(sbi->folder_count);
171 vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count); 170 vhdr->file_count = cpu_to_be32(sbi->file_count);
172 vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count);
173 171
174 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 172 mark_buffer_dirty(sbi->s_vhbh);
175 if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) { 173 if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
176 if (HFSPLUS_SB(sb).sect_count) { 174 if (sbi->sect_count) {
177 struct buffer_head *bh; 175 struct buffer_head *bh;
178 u32 block, offset; 176 u32 block, offset;
179 177
180 block = HFSPLUS_SB(sb).blockoffset; 178 block = sbi->blockoffset;
181 block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9); 179 block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
182 offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1); 180 offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
183 printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset, 181 printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
184 HFSPLUS_SB(sb).sect_count, block, offset); 182 sbi->blockoffset, sbi->sect_count,
183 block, offset);
185 bh = sb_bread(sb, block); 184 bh = sb_bread(sb, block);
186 if (bh) { 185 if (bh) {
187 vhdr = (struct hfsplus_vh *)(bh->b_data + offset); 186 vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
188 if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) { 187 if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
189 memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr)); 188 memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
190 mark_buffer_dirty(bh); 189 mark_buffer_dirty(bh);
191 brelse(bh); 190 brelse(bh);
192 } else 191 } else
193 printk(KERN_WARNING "hfs: backup not found!\n"); 192 printk(KERN_WARNING "hfs: backup not found!\n");
194 } 193 }
195 } 194 }
196 HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
197 } 195 }
198 unlock_super(sb); 196 mutex_unlock(&sbi->alloc_mutex);
197 mutex_unlock(&sbi->vh_mutex);
199 return 0; 198 return 0;
200} 199}
201 200
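
The backup volume header lives in the next-to-last 512-byte sector of the partition, which is why the block/offset pair above is derived from sect_count - 2. A stand-alone sketch of the same arithmetic with made-up values (4 KiB filesystem blocks, a one-million-sector partition):

    #include <stdio.h>

    int main(void)
    {
            unsigned int blockoffset = 0;        /* partition start, in fs blocks */
            unsigned int sect_count = 1000000;   /* partition size in 512-byte sectors */
            unsigned int blocksize_bits = 12;    /* log2 of the 4096-byte fs block size */

            unsigned int block = blockoffset +
                    ((sect_count - 2) >> (blocksize_bits - 9));
            unsigned int offset = ((sect_count - 2) << 9) & ((1u << blocksize_bits) - 1);

            /* prints "backup header: block 124999 offset 3072" */
            printf("backup header: block %u offset %u\n", block, offset);
            return 0;
    }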
@@ -209,48 +208,48 @@ static void hfsplus_write_super(struct super_block *sb)
209 208
210static void hfsplus_put_super(struct super_block *sb) 209static void hfsplus_put_super(struct super_block *sb)
211{ 210{
211 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
212
212 dprint(DBG_SUPER, "hfsplus_put_super\n"); 213 dprint(DBG_SUPER, "hfsplus_put_super\n");
214
213 if (!sb->s_fs_info) 215 if (!sb->s_fs_info)
214 return; 216 return;
215 217
216 lock_kernel();
217
218 if (sb->s_dirt) 218 if (sb->s_dirt)
219 hfsplus_write_super(sb); 219 hfsplus_write_super(sb);
220 if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) { 220 if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
221 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 221 struct hfsplus_vh *vhdr = sbi->s_vhdr;
222 222
223 vhdr->modify_date = hfsp_now2mt(); 223 vhdr->modify_date = hfsp_now2mt();
224 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); 224 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
225 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); 225 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
226 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 226 mark_buffer_dirty(sbi->s_vhbh);
227 sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); 227 sync_dirty_buffer(sbi->s_vhbh);
228 } 228 }
229 229
230 hfs_btree_close(HFSPLUS_SB(sb).cat_tree); 230 hfs_btree_close(sbi->cat_tree);
231 hfs_btree_close(HFSPLUS_SB(sb).ext_tree); 231 hfs_btree_close(sbi->ext_tree);
232 iput(HFSPLUS_SB(sb).alloc_file); 232 iput(sbi->alloc_file);
233 iput(HFSPLUS_SB(sb).hidden_dir); 233 iput(sbi->hidden_dir);
234 brelse(HFSPLUS_SB(sb).s_vhbh); 234 brelse(sbi->s_vhbh);
235 unload_nls(HFSPLUS_SB(sb).nls); 235 unload_nls(sbi->nls);
236 kfree(sb->s_fs_info); 236 kfree(sb->s_fs_info);
237 sb->s_fs_info = NULL; 237 sb->s_fs_info = NULL;
238
239 unlock_kernel();
240} 238}
241 239
242static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) 240static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
243{ 241{
244 struct super_block *sb = dentry->d_sb; 242 struct super_block *sb = dentry->d_sb;
243 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
245 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 244 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
246 245
247 buf->f_type = HFSPLUS_SUPER_MAGIC; 246 buf->f_type = HFSPLUS_SUPER_MAGIC;
248 buf->f_bsize = sb->s_blocksize; 247 buf->f_bsize = sb->s_blocksize;
249 buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift; 248 buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
250 buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift; 249 buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
251 buf->f_bavail = buf->f_bfree; 250 buf->f_bavail = buf->f_bfree;
252 buf->f_files = 0xFFFFFFFF; 251 buf->f_files = 0xFFFFFFFF;
253 buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid; 252 buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
254 buf->f_fsid.val[0] = (u32)id; 253 buf->f_fsid.val[0] = (u32)id;
255 buf->f_fsid.val[1] = (u32)(id >> 32); 254 buf->f_fsid.val[1] = (u32)(id >> 32);
256 buf->f_namelen = HFSPLUS_MAX_STRLEN; 255 buf->f_namelen = HFSPLUS_MAX_STRLEN;
@@ -263,11 +262,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
263 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 262 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
264 return 0; 263 return 0;
265 if (!(*flags & MS_RDONLY)) { 264 if (!(*flags & MS_RDONLY)) {
266 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 265 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
267 struct hfsplus_sb_info sbi; 266 struct hfsplus_sb_info sbi;
268 267
269 memset(&sbi, 0, sizeof(struct hfsplus_sb_info)); 268 memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
270 sbi.nls = HFSPLUS_SB(sb).nls; 269 sbi.nls = HFSPLUS_SB(sb)->nls;
271 if (!hfsplus_parse_options(data, &sbi)) 270 if (!hfsplus_parse_options(data, &sbi))
272 return -EINVAL; 271 return -EINVAL;
273 272
@@ -276,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
276 "running fsck.hfsplus is recommended. leaving read-only.\n"); 275 "running fsck.hfsplus is recommended. leaving read-only.\n");
277 sb->s_flags |= MS_RDONLY; 276 sb->s_flags |= MS_RDONLY;
278 *flags |= MS_RDONLY; 277 *flags |= MS_RDONLY;
279 } else if (sbi.flags & HFSPLUS_SB_FORCE) { 278 } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
280 /* nothing */ 279 /* nothing */
281 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 280 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
282 printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); 281 printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
@@ -320,7 +319,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
320 return -ENOMEM; 319 return -ENOMEM;
321 320
322 sb->s_fs_info = sbi; 321 sb->s_fs_info = sbi;
323 INIT_HLIST_HEAD(&sbi->rsrc_inodes); 322 mutex_init(&sbi->alloc_mutex);
323 mutex_init(&sbi->vh_mutex);
324 hfsplus_fill_defaults(sbi); 324 hfsplus_fill_defaults(sbi);
325 if (!hfsplus_parse_options(data, sbi)) { 325 if (!hfsplus_parse_options(data, sbi)) {
326 printk(KERN_ERR "hfs: unable to parse mount options\n"); 326 printk(KERN_ERR "hfs: unable to parse mount options\n");
@@ -344,7 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
344 err = -EINVAL; 344 err = -EINVAL;
345 goto cleanup; 345 goto cleanup;
346 } 346 }
347 vhdr = HFSPLUS_SB(sb).s_vhdr; 347 vhdr = sbi->s_vhdr;
348 348
349 /* Copy parts of the volume header into the superblock */ 349 /* Copy parts of the volume header into the superblock */
350 sb->s_magic = HFSPLUS_VOLHEAD_SIG; 350 sb->s_magic = HFSPLUS_VOLHEAD_SIG;
@@ -353,18 +353,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
353 printk(KERN_ERR "hfs: wrong filesystem version\n"); 353 printk(KERN_ERR "hfs: wrong filesystem version\n");
354 goto cleanup; 354 goto cleanup;
355 } 355 }
356 HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks); 356 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
357 HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks); 357 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
358 HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc); 358 sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
359 HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid); 359 sbi->file_count = be32_to_cpu(vhdr->file_count);
360 HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count); 360 sbi->folder_count = be32_to_cpu(vhdr->folder_count);
361 HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count); 361 sbi->data_clump_blocks =
362 HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; 362 be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
363 if (!HFSPLUS_SB(sb).data_clump_blocks) 363 if (!sbi->data_clump_blocks)
364 HFSPLUS_SB(sb).data_clump_blocks = 1; 364 sbi->data_clump_blocks = 1;
365 HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; 365 sbi->rsrc_clump_blocks =
366 if (!HFSPLUS_SB(sb).rsrc_clump_blocks) 366 be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
367 HFSPLUS_SB(sb).rsrc_clump_blocks = 1; 367 if (!sbi->rsrc_clump_blocks)
368 sbi->rsrc_clump_blocks = 1;
368 369
369 /* Set up operations so we can load metadata */ 370 /* Set up operations so we can load metadata */
370 sb->s_op = &hfsplus_sops; 371 sb->s_op = &hfsplus_sops;
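
The clump sizes in the volume header are byte counts; shifting right by alloc_blksz_shift converts them to allocation blocks, with a floor of one block. With illustrative numbers (4 KiB allocation blocks, so alloc_blksz_shift = 12):

    data_clump_sz = 65536 bytes  ->  data_clump_blocks = 65536 >> 12 = 16
    rsrc_clump_sz =  1024 bytes  ->  1024 >> 12 = 0, bumped to the 1-block minimum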
@@ -374,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
374 printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, " 375 printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
375 "running fsck.hfsplus is recommended. mounting read-only.\n"); 376 "running fsck.hfsplus is recommended. mounting read-only.\n");
376 sb->s_flags |= MS_RDONLY; 377 sb->s_flags |= MS_RDONLY;
377 } else if (sbi->flags & HFSPLUS_SB_FORCE) { 378 } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
378 /* nothing */ 379 /* nothing */
379 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 380 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
380 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); 381 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
@@ -384,16 +385,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
384 "use the force option at your own risk, mounting read-only.\n"); 385 "use the force option at your own risk, mounting read-only.\n");
385 sb->s_flags |= MS_RDONLY; 386 sb->s_flags |= MS_RDONLY;
386 } 387 }
387 sbi->flags &= ~HFSPLUS_SB_FORCE;
388 388
389 /* Load metadata objects (B*Trees) */ 389 /* Load metadata objects (B*Trees) */
390 HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); 390 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
391 if (!HFSPLUS_SB(sb).ext_tree) { 391 if (!sbi->ext_tree) {
392 printk(KERN_ERR "hfs: failed to load extents file\n"); 392 printk(KERN_ERR "hfs: failed to load extents file\n");
393 goto cleanup; 393 goto cleanup;
394 } 394 }
395 HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); 395 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
396 if (!HFSPLUS_SB(sb).cat_tree) { 396 if (!sbi->cat_tree) {
397 printk(KERN_ERR "hfs: failed to load catalog file\n"); 397 printk(KERN_ERR "hfs: failed to load catalog file\n");
398 goto cleanup; 398 goto cleanup;
399 } 399 }
@@ -404,7 +404,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
404 err = PTR_ERR(inode); 404 err = PTR_ERR(inode);
405 goto cleanup; 405 goto cleanup;
406 } 406 }
407 HFSPLUS_SB(sb).alloc_file = inode; 407 sbi->alloc_file = inode;
408 408
409 /* Load the root directory */ 409 /* Load the root directory */
410 root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); 410 root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
@@ -423,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
423 423
424 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 424 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
425 str.name = HFSP_HIDDENDIR_NAME; 425 str.name = HFSP_HIDDENDIR_NAME;
426 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 426 hfs_find_init(sbi->cat_tree, &fd);
427 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); 427 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
428 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { 428 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
429 hfs_find_exit(&fd); 429 hfs_find_exit(&fd);
@@ -434,7 +434,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
434 err = PTR_ERR(inode); 434 err = PTR_ERR(inode);
435 goto cleanup; 435 goto cleanup;
436 } 436 }
437 HFSPLUS_SB(sb).hidden_dir = inode; 437 sbi->hidden_dir = inode;
438 } else 438 } else
439 hfs_find_exit(&fd); 439 hfs_find_exit(&fd);
440 440
@@ -449,15 +449,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
449 be32_add_cpu(&vhdr->write_count, 1); 449 be32_add_cpu(&vhdr->write_count, 1);
450 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); 450 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
451 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); 451 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
452 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 452 mark_buffer_dirty(sbi->s_vhbh);
453 sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); 453 sync_dirty_buffer(sbi->s_vhbh);
454 454
455 if (!HFSPLUS_SB(sb).hidden_dir) { 455 if (!sbi->hidden_dir) {
456 printk(KERN_DEBUG "hfs: create hidden dir...\n"); 456 printk(KERN_DEBUG "hfs: create hidden dir...\n");
457 HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR); 457
458 hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode, 458 mutex_lock(&sbi->vh_mutex);
459 &str, HFSPLUS_SB(sb).hidden_dir); 459 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
460 mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir); 460 hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
461 &str, sbi->hidden_dir);
462 mutex_unlock(&sbi->vh_mutex);
463
464 mark_inode_dirty(sbi->hidden_dir);
461 } 465 }
462out: 466out:
463 unload_nls(sbi->nls); 467 unload_nls(sbi->nls);
@@ -486,7 +490,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
486 490
487static void hfsplus_destroy_inode(struct inode *inode) 491static void hfsplus_destroy_inode(struct inode *inode)
488{ 492{
489 kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode)); 493 kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
490} 494}
491 495
492#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) 496#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info)
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 628ccf6fa402..b66d67de882c 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
121int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p) 121int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
122{ 122{
123 const hfsplus_unichr *ip; 123 const hfsplus_unichr *ip;
124 struct nls_table *nls = HFSPLUS_SB(sb).nls; 124 struct nls_table *nls = HFSPLUS_SB(sb)->nls;
125 u8 *op; 125 u8 *op;
126 u16 cc, c0, c1; 126 u16 cc, c0, c1;
127 u16 *ce1, *ce2; 127 u16 *ce1, *ce2;
@@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
132 ustrlen = be16_to_cpu(ustr->length); 132 ustrlen = be16_to_cpu(ustr->length);
133 len = *len_p; 133 len = *len_p;
134 ce1 = NULL; 134 ce1 = NULL;
135 compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 135 compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
136 136
137 while (ustrlen > 0) { 137 while (ustrlen > 0) {
138 c0 = be16_to_cpu(*ip++); 138 c0 = be16_to_cpu(*ip++);
@@ -246,7 +246,7 @@ out:
246static inline int asc2unichar(struct super_block *sb, const char *astr, int len, 246static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
247 wchar_t *uc) 247 wchar_t *uc)
248{ 248{
249 int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc); 249 int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
250 if (size <= 0) { 250 if (size <= 0) {
251 *uc = '?'; 251 *uc = '?';
252 size = 1; 252 size = 1;
@@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
293 u16 *dstr, outlen = 0; 293 u16 *dstr, outlen = 0;
294 wchar_t c; 294 wchar_t c;
295 295
296 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 296 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
297 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { 297 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
298 size = asc2unichar(sb, astr, len, &c); 298 size = asc2unichar(sb, astr, len, &c);
299 299
@@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
330 wchar_t c; 330 wchar_t c;
331 u16 c2; 331 u16 c2;
332 332
333 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); 333 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
334 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 334 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
335 hash = init_name_hash(); 335 hash = init_name_hash();
336 astr = str->name; 336 astr = str->name;
337 len = str->len; 337 len = str->len;
@@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
373 u16 c1, c2; 373 u16 c1, c2;
374 wchar_t c; 374 wchar_t c;
375 375
376 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); 376 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
377 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 377 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
378 astr1 = s1->name; 378 astr1 = s1->name;
379 len1 = s1->len; 379 len1 = s1->len;
380 astr2 = s2->name; 380 astr2 = s2->name;
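
Throughout this series the hfsplus superblock flags move from plain bitmask arithmetic to the atomic bitops test_bit/set_bit/test_and_clear_bit on an unsigned long, which implies the HFSPLUS_SB_* constants are redefined as bit numbers rather than masks in hfsplus_fs.h (that hunk is not shown here); HFSPLUS_SB() itself also now returns a pointer, hence the '.' to '->' change. Schematically:

    /* old style: constants are masks, flags is a plain integer */
    casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
    HFSPLUS_SB(sb).flags |= HFSPLUS_SB_WRITEBACKUP;

    /* new style: constants are bit numbers, flags is an unsigned long */
    casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
    set_bit(HFSPLUS_SB_WRITEBACKUP, &HFSPLUS_SB(sb)->flags);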
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index bed78ac8f6d1..8972c20b3216 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
65 *start = 0; 65 *start = 0;
66 *size = sb->s_bdev->bd_inode->i_size >> 9; 66 *size = sb->s_bdev->bd_inode->i_size >> 9;
67 67
68 if (HFSPLUS_SB(sb).session >= 0) { 68 if (HFSPLUS_SB(sb)->session >= 0) {
69 te.cdte_track = HFSPLUS_SB(sb).session; 69 te.cdte_track = HFSPLUS_SB(sb)->session;
70 te.cdte_format = CDROM_LBA; 70 te.cdte_format = CDROM_LBA;
71 res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); 71 res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
72 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { 72 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
@@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
87/* Takes in super block, returns true if good data read */ 87/* Takes in super block, returns true if good data read */
88int hfsplus_read_wrapper(struct super_block *sb) 88int hfsplus_read_wrapper(struct super_block *sb)
89{ 89{
90 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
90 struct buffer_head *bh; 91 struct buffer_head *bh;
91 struct hfsplus_vh *vhdr; 92 struct hfsplus_vh *vhdr;
92 struct hfsplus_wd wd; 93 struct hfsplus_wd wd;
@@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
122 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG)) 123 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
123 break; 124 break;
124 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) { 125 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
125 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX; 126 set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
126 break; 127 break;
127 } 128 }
128 brelse(bh); 129 brelse(bh);
@@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb)
143 if (blocksize < HFSPLUS_SECTOR_SIZE || 144 if (blocksize < HFSPLUS_SECTOR_SIZE ||
144 ((blocksize - 1) & blocksize)) 145 ((blocksize - 1) & blocksize))
145 return -EINVAL; 146 return -EINVAL;
146 HFSPLUS_SB(sb).alloc_blksz = blocksize; 147 sbi->alloc_blksz = blocksize;
147 HFSPLUS_SB(sb).alloc_blksz_shift = 0; 148 sbi->alloc_blksz_shift = 0;
148 while ((blocksize >>= 1) != 0) 149 while ((blocksize >>= 1) != 0)
149 HFSPLUS_SB(sb).alloc_blksz_shift++; 150 sbi->alloc_blksz_shift++;
150 blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE); 151 blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
151 152
152 /* align block size to block offset */ 153 /* align block size to block offset */
153 while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) 154 while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
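
The loop above is an integer log2: the earlier check ((blocksize - 1) & blocksize) has already rejected values that are not a power of two, so shifting right until the value reaches zero counts the exponent. A stand-alone sketch with illustrative block sizes:

    #include <assert.h>

    static unsigned int blksz_shift(unsigned int blocksize)
    {
            unsigned int shift = 0;

            while ((blocksize >>= 1) != 0)
                    shift++;
            return shift;
    }

    int main(void)
    {
            assert(blksz_shift(4096) == 12);    /* 4 KiB allocation blocks */
            assert(blksz_shift(512) == 9);      /* one sector per block */
            return 0;
    }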
@@ -158,23 +159,26 @@ int hfsplus_read_wrapper(struct super_block *sb)
158 return -EINVAL; 159 return -EINVAL;
159 } 160 }
160 161
161 HFSPLUS_SB(sb).blockoffset = part_start >> 162 sbi->blockoffset =
162 (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); 163 part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
163 HFSPLUS_SB(sb).sect_count = part_size; 164 sbi->sect_count = part_size;
164 HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift - 165 sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
165 sb->s_blocksize_bits;
166 166
167 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); 167 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
168 if (!bh) 168 if (!bh)
169 return -EIO; 169 return -EIO;
170 170
171 /* should still be the same... */ 171 /* should still be the same... */
172 if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ? 172 if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
173 cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) : 173 if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
174 cpu_to_be16(HFSPLUS_VOLHEAD_SIG))) 174 goto error;
175 goto error; 175 } else {
176 HFSPLUS_SB(sb).s_vhbh = bh; 176 if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
177 HFSPLUS_SB(sb).s_vhdr = vhdr; 177 goto error;
178 }
179
180 sbi->s_vhbh = bh;
181 sbi->s_vhdr = vhdr;
178 182
179 return 0; 183 return 0;
180 error: 184 error:
diff --git a/fs/ceph/auth.h b/include/linux/ceph/auth.h
index d38a2fb4a137..7fff521d7eb5 100644
--- a/fs/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -1,8 +1,8 @@
1#ifndef _FS_CEPH_AUTH_H 1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H 2#define _FS_CEPH_AUTH_H
3 3
4#include "types.h" 4#include <linux/ceph/types.h>
5#include "buffer.h" 5#include <linux/ceph/buffer.h>
6 6
7/* 7/*
8 * Abstract interface for communicating with the authenticate module. 8 * Abstract interface for communicating with the authenticate module.
diff --git a/fs/ceph/buffer.h b/include/linux/ceph/buffer.h
index 58d19014068f..58d19014068f 100644
--- a/fs/ceph/buffer.h
+++ b/include/linux/ceph/buffer.h
diff --git a/fs/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h
index 1818c2305610..aa2e19182d99 100644
--- a/fs/ceph/ceph_debug.h
+++ b/include/linux/ceph/ceph_debug.h
@@ -3,7 +3,7 @@
3 3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5 5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG 6#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG
7 7
8/* 8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line. 9 * wrap pr_debug to include a filename:lineno prefix on each line.
@@ -14,7 +14,8 @@
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) 14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len); 15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \ 16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \ 17 pr_debug("%.*s %12.12s:%-4d : " fmt, \
18 8 - (int)sizeof(KBUILD_MODNAME), " ", \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \ 19 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__) 20 __LINE__, ##__VA_ARGS__)
20# else 21# else
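
The added %.*s directive pads the debug prefix so that modules whose KBUILD_MODNAME is shorter than eight bytes still line up in the same column; for "libceph" sizeof() is 8 and the pad width is zero. A stand-alone illustration of the idiom (module names, file names and the pad string are made up):

    #include <stdio.h>
    #include <string.h>

    static void demo(const char *modname, const char *file, int line)
    {
            /* mimics 8 - (int)sizeof(KBUILD_MODNAME) from the macro above */
            int pad = 8 - (int)(strlen(modname) + 1);

            printf("%s: %.*s %12.12s:%-4d : hello\n",
                   modname, pad > 0 ? pad : 0, "        ", file, line);
    }

    int main(void)
    {
            demo("libceph", "messenger.c", 42);     /* pad width 0 */
            demo("ceph", "inode.c", 7);             /* pad width 3 */
            return 0;
    }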
diff --git a/fs/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h
index 5babb8e95352..5babb8e95352 100644
--- a/fs/ceph/ceph_frag.h
+++ b/include/linux/ceph/ceph_frag.h
diff --git a/fs/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index d5619ac86711..c3c74aef289d 100644
--- a/fs/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -299,6 +299,7 @@ enum {
299 CEPH_MDS_OP_SETATTR = 0x01108, 299 CEPH_MDS_OP_SETATTR = 0x01108,
300 CEPH_MDS_OP_SETFILELOCK= 0x01109, 300 CEPH_MDS_OP_SETFILELOCK= 0x01109,
301 CEPH_MDS_OP_GETFILELOCK= 0x00110, 301 CEPH_MDS_OP_GETFILELOCK= 0x00110,
302 CEPH_MDS_OP_SETDIRLAYOUT=0x0110a,
302 303
303 CEPH_MDS_OP_MKNOD = 0x01201, 304 CEPH_MDS_OP_MKNOD = 0x01201,
304 CEPH_MDS_OP_LINK = 0x01202, 305 CEPH_MDS_OP_LINK = 0x01202,
diff --git a/fs/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h
index d099c3f90236..d099c3f90236 100644
--- a/fs/ceph/ceph_hash.h
+++ b/include/linux/ceph/ceph_hash.h
diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h
new file mode 100644
index 000000000000..2a79702e092b
--- /dev/null
+++ b/include/linux/ceph/debugfs.h
@@ -0,0 +1,33 @@
1#ifndef _FS_CEPH_DEBUGFS_H
2#define _FS_CEPH_DEBUGFS_H
3
4#include "ceph_debug.h"
5#include "types.h"
6
7#define CEPH_DEFINE_SHOW_FUNC(name) \
8static int name##_open(struct inode *inode, struct file *file) \
9{ \
10 struct seq_file *sf; \
11 int ret; \
12 \
13 ret = single_open(file, name, NULL); \
14 sf = file->private_data; \
15 sf->private = inode->i_private; \
16 return ret; \
17} \
18 \
19static const struct file_operations name##_fops = { \
20 .open = name##_open, \
21 .read = seq_read, \
22 .llseek = seq_lseek, \
23 .release = single_release, \
24};
25
26/* debugfs.c */
27extern int ceph_debugfs_init(void);
28extern void ceph_debugfs_cleanup(void);
29extern int ceph_debugfs_client_init(struct ceph_client *client);
30extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
31
32#endif
33
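
CEPH_DEFINE_SHOW_FUNC generates the single_open() boilerplate, so a caller only writes the seq_file show function and then registers the generated fops with debugfs. A hypothetical use (monc_show and the "monc" file name are illustrative, not taken from this patch):

    static int monc_show(struct seq_file *s, void *p)
    {
            struct ceph_client *client = s->private;   /* set from inode->i_private */

            seq_printf(s, "fsid %pU\n", &client->fsid);
            return 0;
    }

    CEPH_DEFINE_SHOW_FUNC(monc_show)    /* emits monc_show_open and monc_show_fops */

    /* later, typically from ceph_debugfs_client_init(): */
    debugfs_create_file("monc", 0600, client->debugfs_dir, client, &monc_show_fops);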
diff --git a/fs/ceph/decode.h b/include/linux/ceph/decode.h
index 3d25415afe63..c5b6939fb32a 100644
--- a/fs/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -191,6 +191,11 @@ static inline void ceph_encode_string(void **p, void *end,
191 ceph_encode_need(p, end, n, bad); \ 191 ceph_encode_need(p, end, n, bad); \
192 ceph_encode_copy(p, pv, n); \ 192 ceph_encode_copy(p, pv, n); \
193 } while (0) 193 } while (0)
194#define ceph_encode_string_safe(p, end, s, n, bad) \
195 do { \
196 ceph_encode_need(p, end, n, bad); \
197 ceph_encode_string(p, end, s, n); \
198 } while (0)
194 199
195 200
196#endif 201#endif
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
new file mode 100644
index 000000000000..f22b2e941686
--- /dev/null
+++ b/include/linux/ceph/libceph.h
@@ -0,0 +1,249 @@
1#ifndef _FS_CEPH_LIBCEPH_H
2#define _FS_CEPH_LIBCEPH_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/wait.h>
14#include <linux/writeback.h>
15#include <linux/slab.h>
16
17#include "types.h"
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "osd_client.h"
22#include "ceph_fs.h"
23
24/*
25 * Supported features
26 */
27#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
28#define CEPH_FEATURE_REQUIRED_DEFAULT CEPH_FEATURE_NOSRCADDR
29
30/*
31 * mount options
32 */
33#define CEPH_OPT_FSID (1<<0)
34#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
35#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
36#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */
37
38#define CEPH_OPT_DEFAULT (0);
39
40#define ceph_set_opt(client, opt) \
41 (client)->options->flags |= CEPH_OPT_##opt;
42#define ceph_test_opt(client, opt) \
43 (!!((client)->options->flags & CEPH_OPT_##opt))
44
45struct ceph_options {
46 int flags;
47 struct ceph_fsid fsid;
48 struct ceph_entity_addr my_addr;
49 int mount_timeout;
50 int osd_idle_ttl;
51 int osd_timeout;
52 int osd_keepalive_timeout;
53
54 /*
55 * any type that can't be simply compared or doesn't need
56 * to be compared should go beyond this point,
57 * ceph_compare_options() should be updated accordingly
58 */
59
60 struct ceph_entity_addr *mon_addr; /* should be the first
61 pointer type of args */
62 int num_mon;
63 char *name;
64 char *secret;
65};
66
67/*
68 * defaults
69 */
70#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
71#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
72#define CEPH_OSD_KEEPALIVE_DEFAULT 5
73#define CEPH_OSD_IDLE_TTL_DEFAULT 60
74#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
75
76#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
77#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
78
79#define CEPH_AUTH_NAME_DEFAULT "guest"
80
81/*
82 * Delay telling the MDS we no longer want caps, in case we reopen
83 * the file. Delay a minimum amount of time, even if we send a cap
84 * message for some other reason. Otherwise, take the opportunity to
85 * update the mds to avoid sending another message later.
86 */
87#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
88#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
89
90#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
91
92/* mount state */
93enum {
94 CEPH_MOUNT_MOUNTING,
95 CEPH_MOUNT_MOUNTED,
96 CEPH_MOUNT_UNMOUNTING,
97 CEPH_MOUNT_UNMOUNTED,
98 CEPH_MOUNT_SHUTDOWN,
99};
100
101/*
102 * subtract jiffies
103 */
104static inline unsigned long time_sub(unsigned long a, unsigned long b)
105{
106 BUG_ON(time_after(b, a));
107 return (long)a - (long)b;
108}
109
110struct ceph_mds_client;
111
112/*
113 * per client state
114 *
115 * possibly shared by multiple mount points, if they are
116 * mounting the same ceph filesystem/cluster.
117 */
118struct ceph_client {
119 struct ceph_fsid fsid;
120 bool have_fsid;
121
122 void *private;
123
124 struct ceph_options *options;
125
126 struct mutex mount_mutex; /* serialize mount attempts */
127 wait_queue_head_t auth_wq;
128 int auth_err;
129
130 int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
131
132 u32 supported_features;
133 u32 required_features;
134
135 struct ceph_messenger *msgr; /* messenger instance */
136 struct ceph_mon_client monc;
137 struct ceph_osd_client osdc;
138
139#ifdef CONFIG_DEBUG_FS
140 struct dentry *debugfs_dir;
141 struct dentry *debugfs_monmap;
142 struct dentry *debugfs_osdmap;
143#endif
144};
145
146
147
148/*
149 * snapshots
150 */
151
152/*
153 * A "snap context" is the set of existing snapshots when we
154 * write data. It is used by the OSD to guide its COW behavior.
155 *
156 * The ceph_snap_context is refcounted, and attached to each dirty
157 * page, indicating which context the dirty data belonged to when it was
158 * dirtied.
159 */
160struct ceph_snap_context {
161 atomic_t nref;
162 u64 seq;
163 int num_snaps;
164 u64 snaps[];
165};
166
167static inline struct ceph_snap_context *
168ceph_get_snap_context(struct ceph_snap_context *sc)
169{
170 /*
171 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
172 atomic_read(&sc->nref)+1);
173 */
174 if (sc)
175 atomic_inc(&sc->nref);
176 return sc;
177}
178
179static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
180{
181 if (!sc)
182 return;
183 /*
184 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
185 atomic_read(&sc->nref)-1);
186 */
187 if (atomic_dec_and_test(&sc->nref)) {
188 /*printk(" deleting snap_context %p\n", sc);*/
189 kfree(sc);
190 }
191}
192
193/*
194 * calculate the number of pages a given length and offset map onto,
195 * if we align the data.
196 */
197static inline int calc_pages_for(u64 off, u64 len)
198{
199 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
200 (off >> PAGE_CACHE_SHIFT);
201}
202
203/* ceph_common.c */
204extern const char *ceph_msg_type_name(int type);
205extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
206extern struct kmem_cache *ceph_inode_cachep;
207extern struct kmem_cache *ceph_cap_cachep;
208extern struct kmem_cache *ceph_dentry_cachep;
209extern struct kmem_cache *ceph_file_cachep;
210
211extern int ceph_parse_options(struct ceph_options **popt, char *options,
212 const char *dev_name, const char *dev_name_end,
213 int (*parse_extra_token)(char *c, void *private),
214 void *private);
215extern void ceph_destroy_options(struct ceph_options *opt);
216extern int ceph_compare_options(struct ceph_options *new_opt,
217 struct ceph_client *client);
218extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
219 void *private);
220extern u64 ceph_client_id(struct ceph_client *client);
221extern void ceph_destroy_client(struct ceph_client *client);
222extern int __ceph_open_session(struct ceph_client *client,
223 unsigned long started);
224extern int ceph_open_session(struct ceph_client *client);
225
226/* pagevec.c */
227extern void ceph_release_page_vector(struct page **pages, int num_pages);
228
229extern struct page **ceph_get_direct_page_vector(const char __user *data,
230 int num_pages,
231 loff_t off, size_t len);
232extern void ceph_put_page_vector(struct page **pages, int num_pages);
233extern void ceph_release_page_vector(struct page **pages, int num_pages);
234extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
235extern int ceph_copy_user_to_page_vector(struct page **pages,
236 const char __user *data,
237 loff_t off, size_t len);
238extern int ceph_copy_to_page_vector(struct page **pages,
239 const char *data,
240 loff_t off, size_t len);
241extern int ceph_copy_from_page_vector(struct page **pages,
242 char *data,
243 loff_t off, size_t len);
244extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data,
245 loff_t off, size_t len);
246extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
247
248
249#endif /* _FS_CEPH_LIBCEPH_H */
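
A quick worked example of calc_pages_for() above, assuming 4 KiB pages (PAGE_CACHE_SHIFT = 12): an 8192-byte write starting at offset 1536 touches bytes 1536..9727, which spans pages 0, 1 and 2.

    calc_pages_for(1536, 8192) = ((1536 + 8192 + 4095) >> 12) - (1536 >> 12)
                               = (13823 >> 12) - 0 = 3 pages

    calc_pages_for(4096, 4096) = ((4096 + 4096 + 4095) >> 12) - (4096 >> 12)
                               = 2 - 1 = 1 page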
diff --git a/fs/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 4c5cb0880bba..4c5cb0880bba 100644
--- a/fs/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
diff --git a/fs/ceph/messenger.h b/include/linux/ceph/messenger.h
index 76fbc957bc13..5956d62c3057 100644
--- a/fs/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -65,6 +65,9 @@ struct ceph_messenger {
65 */ 65 */
66 u32 global_seq; 66 u32 global_seq;
67 spinlock_t global_seq_lock; 67 spinlock_t global_seq_lock;
68
69 u32 supported_features;
70 u32 required_features;
68}; 71};
69 72
70/* 73/*
@@ -82,6 +85,10 @@ struct ceph_msg {
82 struct ceph_pagelist *pagelist; /* instead of pages */ 85 struct ceph_pagelist *pagelist; /* instead of pages */
83 struct list_head list_head; 86 struct list_head list_head;
84 struct kref kref; 87 struct kref kref;
88 struct bio *bio; /* instead of pages/pagelist */
89 struct bio *bio_iter; /* bio iterator */
90 int bio_seg; /* current bio segment */
91 struct ceph_pagelist *trail; /* the trailing part of the data */
85 bool front_is_vmalloc; 92 bool front_is_vmalloc;
86 bool more_to_follow; 93 bool more_to_follow;
87 bool needs_out_seq; 94 bool needs_out_seq;
@@ -205,7 +212,7 @@ struct ceph_connection {
205}; 212};
206 213
207 214
208extern const char *pr_addr(const struct sockaddr_storage *ss); 215extern const char *ceph_pr_addr(const struct sockaddr_storage *ss);
209extern int ceph_parse_ips(const char *c, const char *end, 216extern int ceph_parse_ips(const char *c, const char *end,
210 struct ceph_entity_addr *addr, 217 struct ceph_entity_addr *addr,
211 int max_count, int *count); 218 int max_count, int *count);
@@ -216,7 +223,8 @@ extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void); 223extern void ceph_msgr_flush(void);
217 224
218extern struct ceph_messenger *ceph_messenger_create( 225extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr); 226 struct ceph_entity_addr *myaddr,
227 u32 features, u32 required);
220extern void ceph_messenger_destroy(struct ceph_messenger *); 228extern void ceph_messenger_destroy(struct ceph_messenger *);
221 229
222extern void ceph_con_init(struct ceph_messenger *msgr, 230extern void ceph_con_init(struct ceph_messenger *msgr,
diff --git a/fs/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index 8e396f2c0963..545f85917780 100644
--- a/fs/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -79,6 +79,7 @@ struct ceph_mon_client {
79 u64 last_tid; 79 u64 last_tid;
80 80
81 /* mds/osd map */ 81 /* mds/osd map */
82 int want_mdsmap;
82 int want_next_osdmap; /* 1 = want, 2 = want+asked */ 83 int want_next_osdmap; /* 1 = want, 2 = want+asked */
83 u32 have_osdmap, have_mdsmap; 84 u32 have_osdmap, have_mdsmap;
84 85
diff --git a/fs/ceph/msgpool.h b/include/linux/ceph/msgpool.h
index a362605f9368..a362605f9368 100644
--- a/fs/ceph/msgpool.h
+++ b/include/linux/ceph/msgpool.h
diff --git a/fs/ceph/msgr.h b/include/linux/ceph/msgr.h
index 680d3d648cac..680d3d648cac 100644
--- a/fs/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
diff --git a/fs/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index ce776989ef6a..6c91fb032c39 100644
--- a/fs/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -15,6 +15,7 @@ struct ceph_snap_context;
15struct ceph_osd_request; 15struct ceph_osd_request;
16struct ceph_osd_client; 16struct ceph_osd_client;
17struct ceph_authorizer; 17struct ceph_authorizer;
18struct ceph_pagelist;
18 19
19/* 20/*
20 * completion callback for async writepages 21 * completion callback for async writepages
@@ -68,6 +69,7 @@ struct ceph_osd_request {
68 struct list_head r_unsafe_item; 69 struct list_head r_unsafe_item;
69 70
70 struct inode *r_inode; /* for use by callbacks */ 71 struct inode *r_inode; /* for use by callbacks */
72 void *r_priv; /* ditto */
71 73
72 char r_oid[40]; /* object name */ 74 char r_oid[40]; /* object name */
73 int r_oid_len; 75 int r_oid_len;
@@ -80,6 +82,11 @@ struct ceph_osd_request {
80 struct page **r_pages; /* pages for data payload */ 82 struct page **r_pages; /* pages for data payload */
81 int r_pages_from_pool; 83 int r_pages_from_pool;
82 int r_own_pages; /* if true, i own page list */ 84 int r_own_pages; /* if true, i own page list */
85#ifdef CONFIG_BLOCK
86 struct bio *r_bio; /* instead of pages */
87#endif
88
89 struct ceph_pagelist *r_trail; /* trailing part of the data */
83}; 90};
84 91
85struct ceph_osd_client { 92struct ceph_osd_client {
@@ -110,6 +117,42 @@ struct ceph_osd_client {
110 struct ceph_msgpool msgpool_op_reply; 117 struct ceph_msgpool msgpool_op_reply;
111}; 118};
112 119
120struct ceph_osd_req_op {
121 u16 op; /* CEPH_OSD_OP_* */
122 u32 flags; /* CEPH_OSD_FLAG_* */
123 union {
124 struct {
125 u64 offset, length;
126 u64 truncate_size;
127 u32 truncate_seq;
128 } extent;
129 struct {
130 const char *name;
131 u32 name_len;
132 const char *val;
133 u32 value_len;
134 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
135 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
136 } xattr;
137 struct {
138 const char *class_name;
139 __u8 class_len;
140 const char *method_name;
141 __u8 method_len;
142 __u8 argc;
143 const char *indata;
144 u32 indata_len;
145 } cls;
146 struct {
147 u64 cookie, count;
148 } pgls;
149 struct {
150 u64 snapid;
151 } snap;
152 };
153 u32 payload_len;
154};
155
113extern int ceph_osdc_init(struct ceph_osd_client *osdc, 156extern int ceph_osdc_init(struct ceph_osd_client *osdc,
114 struct ceph_client *client); 157 struct ceph_client *client);
115extern void ceph_osdc_stop(struct ceph_osd_client *osdc); 158extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
@@ -119,6 +162,30 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
119extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, 162extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
120 struct ceph_msg *msg); 163 struct ceph_msg *msg);
121 164
165extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
166 struct ceph_file_layout *layout,
167 u64 snapid,
168 u64 off, u64 *plen, u64 *bno,
169 struct ceph_osd_request *req,
170 struct ceph_osd_req_op *op);
171
172extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
173 int flags,
174 struct ceph_snap_context *snapc,
175 struct ceph_osd_req_op *ops,
176 bool use_mempool,
177 gfp_t gfp_flags,
178 struct page **pages,
179 struct bio *bio);
180
181extern void ceph_osdc_build_request(struct ceph_osd_request *req,
182 u64 off, u64 *plen,
183 struct ceph_osd_req_op *src_ops,
184 struct ceph_snap_context *snapc,
185 struct timespec *mtime,
186 const char *oid,
187 int oid_len);
188
122extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, 189extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
123 struct ceph_file_layout *layout, 190 struct ceph_file_layout *layout,
124 struct ceph_vino vino, 191 struct ceph_vino vino,
diff --git a/fs/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 970b547e510d..ba4c205cbb01 100644
--- a/fs/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -4,7 +4,7 @@
4#include <linux/rbtree.h> 4#include <linux/rbtree.h>
5#include "types.h" 5#include "types.h"
6#include "ceph_fs.h" 6#include "ceph_fs.h"
7#include "crush/crush.h" 7#include <linux/crush/crush.h>
8 8
9/* 9/*
10 * The osd map describes the current membership of the osd cluster and 10 * The osd map describes the current membership of the osd cluster and
@@ -125,4 +125,6 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
126 struct ceph_pg pgid); 126 struct ceph_pg pgid);
127 127
128extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
129
128#endif 130#endif
diff --git a/fs/ceph/pagelist.h b/include/linux/ceph/pagelist.h
index e8a4187e1087..9660d6b0a35d 100644
--- a/fs/ceph/pagelist.h
+++ b/include/linux/ceph/pagelist.h
@@ -8,6 +8,14 @@ struct ceph_pagelist {
8 void *mapped_tail; 8 void *mapped_tail;
9 size_t length; 9 size_t length;
10 size_t room; 10 size_t room;
11 struct list_head free_list;
12 size_t num_pages_free;
13};
14
15struct ceph_pagelist_cursor {
16 struct ceph_pagelist *pl; /* pagelist, for error checking */
17 struct list_head *page_lru; /* page in list */
18 size_t room; /* room remaining to reset to */
11}; 19};
12 20
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl) 21static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
@@ -16,10 +24,23 @@ static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
16 pl->mapped_tail = NULL; 24 pl->mapped_tail = NULL;
17 pl->length = 0; 25 pl->length = 0;
18 pl->room = 0; 26 pl->room = 0;
27 INIT_LIST_HEAD(&pl->free_list);
28 pl->num_pages_free = 0;
19} 29}
30
20extern int ceph_pagelist_release(struct ceph_pagelist *pl); 31extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21 32
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l); 33extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
34
35extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space);
36
37extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl);
38
39extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
40 struct ceph_pagelist_cursor *c);
41
42extern int ceph_pagelist_truncate(struct ceph_pagelist *pl,
43 struct ceph_pagelist_cursor *c);
23 44
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) 45static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{ 46{
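
A hypothetical caller of the reserve/cursor interface declared above (locking and the buffer being appended are omitted): pages are reserved while sleeping is still allowed, a cursor marks the current end of the pagelist, and a failed append can be rolled back to that mark.

    struct ceph_pagelist pl;
    struct ceph_pagelist_cursor mark;

    ceph_pagelist_init(&pl);
    if (ceph_pagelist_reserve(&pl, PAGE_SIZE))          /* preallocate pages */
            return -ENOMEM;

    ceph_pagelist_set_cursor(&pl, &mark);               /* remember this position */
    if (ceph_pagelist_append(&pl, data, len))
            ceph_pagelist_truncate(&pl, &mark);         /* undo the partial append */

    ceph_pagelist_free_reserve(&pl);                    /* drop unused reserved pages */
    ceph_pagelist_release(&pl);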
diff --git a/fs/ceph/rados.h b/include/linux/ceph/rados.h
index 6d5247f2e81b..6d5247f2e81b 100644
--- a/fs/ceph/rados.h
+++ b/include/linux/ceph/rados.h
diff --git a/fs/ceph/types.h b/include/linux/ceph/types.h
index 28b35a005ec2..28b35a005ec2 100644
--- a/fs/ceph/types.h
+++ b/include/linux/ceph/types.h
diff --git a/fs/ceph/crush/crush.h b/include/linux/crush/crush.h
index 97e435b191f4..97e435b191f4 100644
--- a/fs/ceph/crush/crush.h
+++ b/include/linux/crush/crush.h
diff --git a/fs/ceph/crush/hash.h b/include/linux/crush/hash.h
index 91e884230d5d..91e884230d5d 100644
--- a/fs/ceph/crush/hash.h
+++ b/include/linux/crush/hash.h
diff --git a/fs/ceph/crush/mapper.h b/include/linux/crush/mapper.h
index c46b99c18bb0..c46b99c18bb0 100644
--- a/fs/ceph/crush/mapper.h
+++ b/include/linux/crush/mapper.h
diff --git a/net/Kconfig b/net/Kconfig
index e926884c1675..55fd82e9ffd9 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -293,6 +293,7 @@ source "net/wimax/Kconfig"
293source "net/rfkill/Kconfig" 293source "net/rfkill/Kconfig"
294source "net/9p/Kconfig" 294source "net/9p/Kconfig"
295source "net/caif/Kconfig" 295source "net/caif/Kconfig"
296source "net/ceph/Kconfig"
296 297
297 298
298endif # if NET 299endif # if NET
diff --git a/net/Makefile b/net/Makefile
index ea60fbce9b1b..6b7bfd7f1416 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -68,3 +68,4 @@ obj-$(CONFIG_SYSCTL) += sysctl_net.o
68endif 68endif
69obj-$(CONFIG_WIMAX) += wimax/ 69obj-$(CONFIG_WIMAX) += wimax/
70obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ 70obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
71obj-$(CONFIG_CEPH_LIB) += ceph/
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
new file mode 100644
index 000000000000..ad424049b0cf
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,28 @@
1config CEPH_LIB
2 tristate "Ceph core library (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
5 select CRYPTO_AES
6 select CRYPTO
7 default n
8 help
9 Choose Y or M here to include libceph, which provides the
10 common functionality to both the Ceph filesystem and
11 the rados block device (rbd).
12
13 More information at http://ceph.newdream.net/.
14
15 If unsure, say N.
16
17config CEPH_LIB_PRETTYDEBUG
18 bool "Include file:line in ceph debug output"
19 depends on CEPH_LIB
20 default n
21 help
22 If you say Y here, debug output will include a filename and
23 line to aid debugging. This increases kernel size and slows
24 execution slightly when debug call sites are enabled (e.g.,
25 via CONFIG_DYNAMIC_DEBUG).
26
27 If unsure, say N.
28
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
new file mode 100644
index 000000000000..aab1cabb8035
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,37 @@
1#
2# Makefile for the Ceph core library (libceph).
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_LIB) += libceph.o
8
9libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
10 mon_client.o \
11 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
12 debugfs.o \
13 auth.o auth_none.o \
14 crypto.o armor.o \
15 auth_x.o \
16 ceph_fs.o ceph_strings.o ceph_hash.o \
17 pagevec.o
18
19else
20# Otherwise we were called directly from the command
21# line; invoke the kernel build system.
22
23KERNELDIR ?= /lib/modules/$(shell uname -r)/build
24PWD := $(shell pwd)
25
26default: all
27
28all:
29 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules
30
31modules_install:
32 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install
33
34clean:
35 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
36
37endif
diff --git a/fs/ceph/armor.c b/net/ceph/armor.c
index eb2a666b0be7..eb2a666b0be7 100644
--- a/fs/ceph/armor.c
+++ b/net/ceph/armor.c
diff --git a/fs/ceph/auth.c b/net/ceph/auth.c
index 6d2e30600627..549c1f43e1d5 100644
--- a/fs/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -1,16 +1,16 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6 6
7#include "types.h" 7#include <linux/ceph/types.h>
8#include <linux/ceph/decode.h>
9#include <linux/ceph/libceph.h>
10#include <linux/ceph/messenger.h>
8#include "auth_none.h" 11#include "auth_none.h"
9#include "auth_x.h" 12#include "auth_x.h"
10#include "decode.h"
11#include "super.h"
12 13
13#include "messenger.h"
14 14
15/* 15/*
16 * get protocol handler 16 * get protocol handler
diff --git a/fs/ceph/auth_none.c b/net/ceph/auth_none.c
index ad1dc21286c7..214c2bb43d62 100644
--- a/fs/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -1,14 +1,15 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/random.h> 6#include <linux/random.h>
7#include <linux/slab.h> 7#include <linux/slab.h>
8 8
9#include <linux/ceph/decode.h>
10#include <linux/ceph/auth.h>
11
9#include "auth_none.h" 12#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12 13
13static void reset(struct ceph_auth_client *ac) 14static void reset(struct ceph_auth_client *ac)
14{ 15{
diff --git a/fs/ceph/auth_none.h b/net/ceph/auth_none.h
index 8164df1a08be..ed7d088b1bc9 100644
--- a/fs/ceph/auth_none.h
+++ b/net/ceph/auth_none.h
@@ -2,8 +2,7 @@
2#define _FS_CEPH_AUTH_NONE_H 2#define _FS_CEPH_AUTH_NONE_H
3 3
4#include <linux/slab.h> 4#include <linux/slab.h>
5 5#include <linux/ceph/auth.h>
6#include "auth.h"
7 6
8/* 7/*
9 * null security mode. 8 * null security mode.
diff --git a/fs/ceph/auth_x.c b/net/ceph/auth_x.c
index a2d002cbdec2..7fd5dfcf6e18 100644
--- a/fs/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -1,16 +1,17 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/random.h> 6#include <linux/random.h>
7#include <linux/slab.h> 7#include <linux/slab.h>
8 8
9#include <linux/ceph/decode.h>
10#include <linux/ceph/auth.h>
11
12#include "crypto.h"
9#include "auth_x.h" 13#include "auth_x.h"
10#include "auth_x_protocol.h" 14#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14 15
15#define TEMP_TICKET_BUF_LEN 256 16#define TEMP_TICKET_BUF_LEN 256
16 17
diff --git a/fs/ceph/auth_x.h b/net/ceph/auth_x.h
index ff6f8180e681..e02da7a5c5a1 100644
--- a/fs/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -3,8 +3,9 @@
3 3
4#include <linux/rbtree.h> 4#include <linux/rbtree.h>
5 5
6#include <linux/ceph/auth.h>
7
6#include "crypto.h" 8#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h" 9#include "auth_x_protocol.h"
9 10
10/* 11/*
diff --git a/fs/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
index 671d30576c4f..671d30576c4f 100644
--- a/fs/ceph/auth_x_protocol.h
+++ b/net/ceph/auth_x_protocol.h
diff --git a/fs/ceph/buffer.c b/net/ceph/buffer.c
index cd39f17021de..53d8abfa25d5 100644
--- a/fs/ceph/buffer.c
+++ b/net/ceph/buffer.c
@@ -1,10 +1,11 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/module.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5 6
6#include "buffer.h" 7#include <linux/ceph/buffer.h>
7#include "decode.h" 8#include <linux/ceph/decode.h>
8 9
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) 10struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{ 11{
@@ -32,6 +33,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
32 dout("buffer_new %p\n", b); 33 dout("buffer_new %p\n", b);
33 return b; 34 return b;
34} 35}
36EXPORT_SYMBOL(ceph_buffer_new);
35 37
36void ceph_buffer_release(struct kref *kref) 38void ceph_buffer_release(struct kref *kref)
37{ 39{
@@ -46,6 +48,7 @@ void ceph_buffer_release(struct kref *kref)
46 } 48 }
47 kfree(b); 49 kfree(b);
48} 50}
51EXPORT_SYMBOL(ceph_buffer_release);
49 52
50int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) 53int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
51{ 54{
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
new file mode 100644
index 000000000000..f3e4a13fea0c
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,529 @@
1
2#include <linux/ceph/ceph_debug.h>
3#include <linux/backing-dev.h>
4#include <linux/ctype.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/sched.h>
12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/statfs.h>
15#include <linux/string.h>
16
17
18#include <linux/ceph/libceph.h>
19#include <linux/ceph/debugfs.h>
20#include <linux/ceph/decode.h>
21#include <linux/ceph/mon_client.h>
22#include <linux/ceph/auth.h>
23
24
25
26/*
27 * find filename portion of a path (/foo/bar/baz -> baz)
28 */
29const char *ceph_file_part(const char *s, int len)
30{
31 const char *e = s + len;
32
33 while (e != s && *(e-1) != '/')
34 e--;
35 return e;
36}
37EXPORT_SYMBOL(ceph_file_part);
38
39const char *ceph_msg_type_name(int type)
40{
41 switch (type) {
42 case CEPH_MSG_SHUTDOWN: return "shutdown";
43 case CEPH_MSG_PING: return "ping";
44 case CEPH_MSG_AUTH: return "auth";
45 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
46 case CEPH_MSG_MON_MAP: return "mon_map";
47 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
48 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
49 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
50 case CEPH_MSG_STATFS: return "statfs";
51 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
52 case CEPH_MSG_MDS_MAP: return "mds_map";
53 case CEPH_MSG_CLIENT_SESSION: return "client_session";
54 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
55 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
56 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
57 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
58 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
59 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
60 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
61 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
62 case CEPH_MSG_OSD_MAP: return "osd_map";
63 case CEPH_MSG_OSD_OP: return "osd_op";
64 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
65 default: return "unknown";
66 }
67}
68EXPORT_SYMBOL(ceph_msg_type_name);
69
70/*
71 * Initially learn our fsid, or verify an fsid matches.
72 */
73int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
74{
75 if (client->have_fsid) {
76 if (ceph_fsid_compare(&client->fsid, fsid)) {
77 pr_err("bad fsid, had %pU got %pU",
78 &client->fsid, fsid);
79 return -1;
80 }
81 } else {
82 pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
83 memcpy(&client->fsid, fsid, sizeof(*fsid));
84 ceph_debugfs_client_init(client);
85 client->have_fsid = true;
86 }
87 return 0;
88}
89EXPORT_SYMBOL(ceph_check_fsid);
90
91static int strcmp_null(const char *s1, const char *s2)
92{
93 if (!s1 && !s2)
94 return 0;
95 if (s1 && !s2)
96 return -1;
97 if (!s1 && s2)
98 return 1;
99 return strcmp(s1, s2);
100}
101
102int ceph_compare_options(struct ceph_options *new_opt,
103 struct ceph_client *client)
104{
105 struct ceph_options *opt1 = new_opt;
106 struct ceph_options *opt2 = client->options;
107 int ofs = offsetof(struct ceph_options, mon_addr);
108 int i;
109 int ret;
110
111 ret = memcmp(opt1, opt2, ofs);
112 if (ret)
113 return ret;
114
115 ret = strcmp_null(opt1->name, opt2->name);
116 if (ret)
117 return ret;
118
119 ret = strcmp_null(opt1->secret, opt2->secret);
120 if (ret)
121 return ret;
122
123 /* any matching mon ip implies a match */
124 for (i = 0; i < opt1->num_mon; i++) {
125 if (ceph_monmap_contains(client->monc.monmap,
126 &opt1->mon_addr[i]))
127 return 0;
128 }
129 return -1;
130}
131EXPORT_SYMBOL(ceph_compare_options);
132
133
134static int parse_fsid(const char *str, struct ceph_fsid *fsid)
135{
136 int i = 0;
137 char tmp[3];
138 int err = -EINVAL;
139 int d;
140
141 dout("parse_fsid '%s'\n", str);
142 tmp[2] = 0;
143 while (*str && i < 16) {
144 if (ispunct(*str)) {
145 str++;
146 continue;
147 }
148 if (!isxdigit(str[0]) || !isxdigit(str[1]))
149 break;
150 tmp[0] = str[0];
151 tmp[1] = str[1];
152 if (sscanf(tmp, "%x", &d) < 1)
153 break;
154 fsid->fsid[i] = d & 0xff;
155 i++;
156 str += 2;
157 }
158
159 if (i == 16)
160 err = 0;
161 dout("parse_fsid ret %d got fsid %pU", err, fsid);
162 return err;
163}
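
parse_fsid() accepts the usual textual UUID form: punctuation such as '-' is simply skipped and every pair of hex digits becomes one of the sixteen fsid bytes, so dashed and undashed spellings of the same UUID decode identically. A userspace restatement of the loop, illustrative only (the kernel version above goes through a small tmp buffer instead of a scanf field width):

#include <ctype.h>
#include <stdio.h>

static int parse_fsid_example(const char *str, unsigned char fsid[16])
{
	int i = 0;

	while (*str && i < 16) {
		unsigned int byte;

		if (ispunct((unsigned char)*str)) {	/* skip '-' and friends */
			str++;
			continue;
		}
		if (!isxdigit((unsigned char)str[0]) ||
		    !isxdigit((unsigned char)str[1]))
			break;
		if (sscanf(str, "%2x", &byte) != 1)
			break;
		fsid[i++] = (unsigned char)byte;
		str += 2;
	}
	return i == 16 ? 0 : -1;	/* 0 only if all 16 bytes were found */
}
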
164
165/*
166 * ceph options
167 */
168enum {
169 Opt_osdtimeout,
170 Opt_osdkeepalivetimeout,
171 Opt_mount_timeout,
172 Opt_osd_idle_ttl,
173 Opt_last_int,
174 /* int args above */
175 Opt_fsid,
176 Opt_name,
177 Opt_secret,
178 Opt_ip,
179 Opt_last_string,
180 /* string args above */
181 Opt_noshare,
182 Opt_nocrc,
183};
184
185static match_table_t opt_tokens = {
186 {Opt_osdtimeout, "osdtimeout=%d"},
187 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
188 {Opt_mount_timeout, "mount_timeout=%d"},
189 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
190 /* int args above */
191 {Opt_fsid, "fsid=%s"},
192 {Opt_name, "name=%s"},
193 {Opt_secret, "secret=%s"},
194 {Opt_ip, "ip=%s"},
195 /* string args above */
196 {Opt_noshare, "noshare"},
197 {Opt_nocrc, "nocrc"},
198 {-1, NULL}
199};
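
The ordering of the token enum and this table is load-bearing: integer-valued options sit before Opt_last_int, string-valued ones between the two sentinels, and bare flags after Opt_last_string, which is what lets the parser below choose match_int() versus string handling purely from the token's numeric range. A compact userspace restatement of the sentinel idea (token list trimmed, names otherwise as above):

#include <stdio.h>

enum {
	Opt_osdtimeout,
	Opt_mount_timeout,
	Opt_last_int,		/* int args above */
	Opt_name,
	Opt_secret,
	Opt_last_string,	/* string args above */
	Opt_noshare,		/* bare flags below */
};

static const char *token_kind(int token)
{
	if (token < Opt_last_int)
		return "int";
	if (token > Opt_last_int && token < Opt_last_string)
		return "string";
	return "flag";
}

int main(void)
{
	/* prints: int string flag */
	printf("%s %s %s\n", token_kind(Opt_osdtimeout),
	       token_kind(Opt_name), token_kind(Opt_noshare));
	return 0;
}
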
200
201void ceph_destroy_options(struct ceph_options *opt)
202{
203 dout("destroy_options %p\n", opt);
204 kfree(opt->name);
205 kfree(opt->secret);
206 kfree(opt);
207}
208EXPORT_SYMBOL(ceph_destroy_options);
209
210int ceph_parse_options(struct ceph_options **popt, char *options,
211 const char *dev_name, const char *dev_name_end,
212 int (*parse_extra_token)(char *c, void *private),
213 void *private)
214{
215 struct ceph_options *opt;
216 const char *c;
217 int err = -ENOMEM;
218 substring_t argstr[MAX_OPT_ARGS];
219
220 opt = kzalloc(sizeof(*opt), GFP_KERNEL);
221 if (!opt)
222 return err;
223 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
224 GFP_KERNEL);
225 if (!opt->mon_addr)
226 goto out;
227
228 dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
229 dev_name);
230
231 /* start with defaults */
232 opt->flags = CEPH_OPT_DEFAULT;
233 opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
234 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
235 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
236 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
237
238 /* get mon ip(s) */
239 /* ip1[:port1][,ip2[:port2]...] */
240 err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
241 CEPH_MAX_MON, &opt->num_mon);
242 if (err < 0)
243 goto out;
244
245 /* parse mount options */
246 while ((c = strsep(&options, ",")) != NULL) {
247 int token, intval, ret;
248 if (!*c)
249 continue;
250 err = -EINVAL;
251 token = match_token((char *)c, opt_tokens, argstr);
252 if (token < 0 && parse_extra_token) {
253 /* extra? */
254 err = parse_extra_token((char *)c, private);
255 if (err < 0) {
256 pr_err("bad option at '%s'\n", c);
257 goto out;
258 }
259 continue;
260 }
261 if (token < Opt_last_int) {
262 ret = match_int(&argstr[0], &intval);
263 if (ret < 0) {
264 pr_err("bad mount option arg (not int) "
265 "at '%s'\n", c);
266 continue;
267 }
268 dout("got int token %d val %d\n", token, intval);
269 } else if (token > Opt_last_int && token < Opt_last_string) {
270 dout("got string token %d val %s\n", token,
271 argstr[0].from);
272 } else {
273 dout("got token %d\n", token);
274 }
275 switch (token) {
276 case Opt_ip:
277 err = ceph_parse_ips(argstr[0].from,
278 argstr[0].to,
279 &opt->my_addr,
280 1, NULL);
281 if (err < 0)
282 goto out;
283 opt->flags |= CEPH_OPT_MYIP;
284 break;
285
286 case Opt_fsid:
287 err = parse_fsid(argstr[0].from, &opt->fsid);
288 if (err == 0)
289 opt->flags |= CEPH_OPT_FSID;
290 break;
291 case Opt_name:
292 opt->name = kstrndup(argstr[0].from,
293 argstr[0].to-argstr[0].from,
294 GFP_KERNEL);
295 break;
296 case Opt_secret:
297 opt->secret = kstrndup(argstr[0].from,
298 argstr[0].to-argstr[0].from,
299 GFP_KERNEL);
300 break;
301
302 /* misc */
303 case Opt_osdtimeout:
304 opt->osd_timeout = intval;
305 break;
306 case Opt_osdkeepalivetimeout:
307 opt->osd_keepalive_timeout = intval;
308 break;
309 case Opt_osd_idle_ttl:
310 opt->osd_idle_ttl = intval;
311 break;
312 case Opt_mount_timeout:
313 opt->mount_timeout = intval;
314 break;
315
316 case Opt_noshare:
317 opt->flags |= CEPH_OPT_NOSHARE;
318 break;
319
320 case Opt_nocrc:
321 opt->flags |= CEPH_OPT_NOCRC;
322 break;
323
324 default:
325 BUG_ON(token);
326 }
327 }
328
329 /* success */
330 *popt = opt;
331 return 0;
332
333out:
334 ceph_destroy_options(opt);
335 return err;
336}
337EXPORT_SYMBOL(ceph_parse_options);
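
In short, the caller hands the monitor addresses in through the device-name string and everything else through the comma-separated options string, and owns the resulting struct ceph_options until it is handed to ceph_create_client(). A hedged caller sketch (the addresses, option string and surrounding function are illustrative, not taken from an in-tree user; note that strsep() in the parser modifies the options buffer, so it must be writable):

/* assumes <linux/ceph/libceph.h> and <linux/string.h> */
static int example_parse(struct ceph_options **popt)
{
	char options[] = "name=admin,osdtimeout=60,noshare";
	const char *dev = "192.168.0.1:6789,192.168.0.2:6789";

	return ceph_parse_options(popt, options, dev, dev + strlen(dev),
				  NULL /* no parse_extra_token callback */,
				  NULL);
}

A failed parse cleans up after itself; a successful one is released either with an explicit ceph_destroy_options() or, once attached to a client, by ceph_destroy_client().
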
338
339u64 ceph_client_id(struct ceph_client *client)
340{
341 return client->monc.auth->global_id;
342}
343EXPORT_SYMBOL(ceph_client_id);
344
345/*
346 * create a fresh client instance
347 */
348struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
349{
350 struct ceph_client *client;
351 int err = -ENOMEM;
352
353 client = kzalloc(sizeof(*client), GFP_KERNEL);
354 if (client == NULL)
355 return ERR_PTR(-ENOMEM);
356
357 client->private = private;
358 client->options = opt;
359
360 mutex_init(&client->mount_mutex);
361 init_waitqueue_head(&client->auth_wq);
362 client->auth_err = 0;
363
364 client->extra_mon_dispatch = NULL;
365 client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
366 client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
367
368 client->msgr = NULL;
369
370 /* subsystems */
371 err = ceph_monc_init(&client->monc, client);
372 if (err < 0)
373 goto fail;
374 err = ceph_osdc_init(&client->osdc, client);
375 if (err < 0)
376 goto fail_monc;
377
378 return client;
379
380fail_monc:
381 ceph_monc_stop(&client->monc);
382fail:
383 kfree(client);
384 return ERR_PTR(err);
385}
386EXPORT_SYMBOL(ceph_create_client);
387
388void ceph_destroy_client(struct ceph_client *client)
389{
390 dout("destroy_client %p\n", client);
391
392 /* unmount */
393 ceph_osdc_stop(&client->osdc);
394
395 /*
396 * make sure mds and osd connections close out before destroying
397 * the auth module, which is needed to free those connections'
398 * ceph_authorizers.
399 */
400 ceph_msgr_flush();
401
402 ceph_monc_stop(&client->monc);
403
404 ceph_debugfs_client_cleanup(client);
405
406 if (client->msgr)
407 ceph_messenger_destroy(client->msgr);
408
409 ceph_destroy_options(client->options);
410
411 kfree(client);
412 dout("destroy_client %p done\n", client);
413}
414EXPORT_SYMBOL(ceph_destroy_client);
415
416/*
417 * true if we have both the mon and osd maps, and have thus joined the cluster
418 */
419static int have_mon_and_osd_map(struct ceph_client *client)
420{
421 return client->monc.monmap && client->monc.monmap->epoch &&
422 client->osdc.osdmap && client->osdc.osdmap->epoch;
423}
424
425/*
426 * mount: join the ceph cluster and wait for the mon and osd maps.
427 */
428int __ceph_open_session(struct ceph_client *client, unsigned long started)
429{
430 struct ceph_entity_addr *myaddr = NULL;
431 int err;
432 unsigned long timeout = client->options->mount_timeout * HZ;
433
434 /* initialize the messenger */
435 if (client->msgr == NULL) {
436 if (ceph_test_opt(client, MYIP))
437 myaddr = &client->options->my_addr;
438 client->msgr = ceph_messenger_create(myaddr,
439 client->supported_features, client->required_features);
440 if (IS_ERR(client->msgr)) {
441 err = PTR_ERR(client->msgr); /* grab the error before clearing */
442 client->msgr = NULL;
443 return err;
444 }
445 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
446 }
447
448 /* open session, and wait for mon and osd maps */
449 err = ceph_monc_open_session(&client->monc);
450 if (err < 0)
451 return err;
452
453 while (!have_mon_and_osd_map(client)) {
454 err = -EIO;
455 if (timeout && time_after_eq(jiffies, started + timeout))
456 return err;
457
458 /* wait */
459 dout("mount waiting for mon_map\n");
460 err = wait_event_interruptible_timeout(client->auth_wq,
461 have_mon_and_osd_map(client) || (client->auth_err < 0),
462 timeout);
463 if (err == -EINTR || err == -ERESTARTSYS)
464 return err;
465 if (client->auth_err < 0)
466 return client->auth_err;
467 }
468
469 return 0;
470}
471EXPORT_SYMBOL(__ceph_open_session);
472
473
474int ceph_open_session(struct ceph_client *client)
475{
476 int ret;
477 unsigned long started = jiffies; /* note the start time */
478
479 dout("open_session start\n");
480 mutex_lock(&client->mount_mutex);
481
482 ret = __ceph_open_session(client, started);
483
484 mutex_unlock(&client->mount_mutex);
485 return ret;
486}
487EXPORT_SYMBOL(ceph_open_session);
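
Putting the pieces of this file together, a libceph consumer goes through create, open-session, destroy. A minimal sketch, assuming opt was produced as in the earlier parsing example, and noting that after a successful ceph_create_client() the options belong to the client (ceph_destroy_client() frees them), while on failure they remain the caller's to release:

static int example_client(struct ceph_options *opt)
{
	struct ceph_client *client;
	int err;

	client = ceph_create_client(opt, NULL /* no private data */);
	if (IS_ERR(client))
		return PTR_ERR(client);		/* caller still owns opt here */

	err = ceph_open_session(client);	/* join cluster, wait for maps */
	if (err < 0) {
		ceph_destroy_client(client);	/* also destroys opt */
		return err;
	}

	/* requests would now go through client->osdc and client->monc */

	ceph_destroy_client(client);
	return 0;
}
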
488
489
490static int __init init_ceph_lib(void)
491{
492 int ret = 0;
493
494 ret = ceph_debugfs_init();
495 if (ret < 0)
496 goto out;
497
498 ret = ceph_msgr_init();
499 if (ret < 0)
500 goto out_debugfs;
501
502 pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
503 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
504 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
505 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
506
507 return 0;
508
509out_debugfs:
510 ceph_debugfs_cleanup();
511out:
512 return ret;
513}
514
515static void __exit exit_ceph_lib(void)
516{
517 dout("exit_ceph_lib\n");
518 ceph_msgr_exit();
519 ceph_debugfs_cleanup();
520}
521
522module_init(init_ceph_lib);
523module_exit(exit_ceph_lib);
524
525MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
526MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
527MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
528MODULE_DESCRIPTION("Ceph core library");
529MODULE_LICENSE("GPL");
diff --git a/fs/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
index 3ac6cc7c1156..a3a3a31d3c37 100644
--- a/fs/ceph/ceph_fs.c
+++ b/net/ceph/ceph_fs.c
@@ -1,7 +1,8 @@
1/* 1/*
2 * Some non-inline ceph helpers 2 * Some non-inline ceph helpers
3 */ 3 */
4#include "types.h" 4#include <linux/module.h>
5#include <linux/ceph/types.h>
5 6
6/* 7/*
7 * return true if @layout appears to be valid 8 * return true if @layout appears to be valid
@@ -52,6 +53,7 @@ int ceph_flags_to_mode(int flags)
52 53
53 return mode; 54 return mode;
54} 55}
56EXPORT_SYMBOL(ceph_flags_to_mode);
55 57
56int ceph_caps_for_mode(int mode) 58int ceph_caps_for_mode(int mode)
57{ 59{
@@ -70,3 +72,4 @@ int ceph_caps_for_mode(int mode)
70 72
71 return caps; 73 return caps;
72} 74}
75EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/fs/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
index bd570015d147..815ef8826796 100644
--- a/fs/ceph/ceph_hash.c
+++ b/net/ceph/ceph_hash.c
@@ -1,5 +1,5 @@
1 1
2#include "types.h" 2#include <linux/ceph/types.h>
3 3
4/* 4/*
5 * Robert Jenkin's hash function. 5 * Robert Jenkin's hash function.
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
new file mode 100644
index 000000000000..3fbda04de29c
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,84 @@
1/*
2 * Ceph string constants
3 */
4#include <linux/module.h>
5#include <linux/ceph/types.h>
6
7const char *ceph_entity_type_name(int type)
8{
9 switch (type) {
10 case CEPH_ENTITY_TYPE_MDS: return "mds";
11 case CEPH_ENTITY_TYPE_OSD: return "osd";
12 case CEPH_ENTITY_TYPE_MON: return "mon";
13 case CEPH_ENTITY_TYPE_CLIENT: return "client";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32 case CEPH_OSD_OP_ROLLBACK: return "rollback";
33
34 case CEPH_OSD_OP_APPEND: return "append";
35 case CEPH_OSD_OP_STARTSYNC: return "startsync";
36 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
37 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
38
39 case CEPH_OSD_OP_TMAPUP: return "tmapup";
40 case CEPH_OSD_OP_TMAPGET: return "tmapget";
41 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
42
43 case CEPH_OSD_OP_GETXATTR: return "getxattr";
44 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
45 case CEPH_OSD_OP_SETXATTR: return "setxattr";
46 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
47 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
48 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
49 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
50
51 case CEPH_OSD_OP_PULL: return "pull";
52 case CEPH_OSD_OP_PUSH: return "push";
53 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
54 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
55 case CEPH_OSD_OP_SCRUB: return "scrub";
56
57 case CEPH_OSD_OP_WRLOCK: return "wrlock";
58 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
59 case CEPH_OSD_OP_RDLOCK: return "rdlock";
60 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
61 case CEPH_OSD_OP_UPLOCK: return "uplock";
62 case CEPH_OSD_OP_DNLOCK: return "dnlock";
63
64 case CEPH_OSD_OP_CALL: return "call";
65
66 case CEPH_OSD_OP_PGLS: return "pgls";
67 }
68 return "???";
69}
70
71
72const char *ceph_pool_op_name(int op)
73{
74 switch (op) {
75 case POOL_OP_CREATE: return "create";
76 case POOL_OP_DELETE: return "delete";
77 case POOL_OP_AUID_CHANGE: return "auid change";
78 case POOL_OP_CREATE_SNAP: return "create snap";
79 case POOL_OP_DELETE_SNAP: return "delete snap";
80 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
81 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
82 }
83 return "???";
84}
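
These helpers exist purely for log and debugfs output: callers decode the wire value and drop the returned string straight into a format string, as osdc_show() in net/ceph/debugfs.c does further down in this series. A two-line hedged sketch, where s and opcode stand in for a seq_file and an already-decoded op code:

	seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
	dout("peer is a %s\n", ceph_entity_type_name(CEPH_ENTITY_TYPE_OSD));
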
diff --git a/fs/ceph/crush/crush.c b/net/ceph/crush/crush.c
index fabd302e5779..d6ebb13a18a4 100644
--- a/fs/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -8,7 +8,7 @@
8# define BUG_ON(x) assert(!(x)) 8# define BUG_ON(x) assert(!(x))
9#endif 9#endif
10 10
11#include "crush.h" 11#include <linux/crush/crush.h>
12 12
13const char *crush_bucket_alg_name(int alg) 13const char *crush_bucket_alg_name(int alg)
14{ 14{
diff --git a/fs/ceph/crush/hash.c b/net/ceph/crush/hash.c
index 5873aed694bf..5bb63e37a8a1 100644
--- a/fs/ceph/crush/hash.c
+++ b/net/ceph/crush/hash.c
@@ -1,6 +1,6 @@
1 1
2#include <linux/types.h> 2#include <linux/types.h>
3#include "hash.h" 3#include <linux/crush/hash.h>
4 4
5/* 5/*
6 * Robert Jenkins' function for mixing 32-bit values 6 * Robert Jenkins' function for mixing 32-bit values
diff --git a/fs/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index a4eec133258e..42599e31dcad 100644
--- a/fs/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -18,8 +18,8 @@
18# define kfree(x) free(x) 18# define kfree(x) free(x)
19#endif 19#endif
20 20
21#include "crush.h" 21#include <linux/crush/crush.h>
22#include "hash.h" 22#include <linux/crush/hash.h>
23 23
24/* 24/*
25 * Implement the core CRUSH mapping algorithm. 25 * Implement the core CRUSH mapping algorithm.
diff --git a/fs/ceph/crypto.c b/net/ceph/crypto.c
index a3e627f63293..7b505b0c983f 100644
--- a/fs/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -1,13 +1,13 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/scatterlist.h> 5#include <linux/scatterlist.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <crypto/hash.h> 7#include <crypto/hash.h>
8 8
9#include <linux/ceph/decode.h>
9#include "crypto.h" 10#include "crypto.h"
10#include "decode.h"
11 11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end) 12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{ 13{
diff --git a/fs/ceph/crypto.h b/net/ceph/crypto.h
index bdf38607323c..f9eccace592b 100644
--- a/fs/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -1,8 +1,8 @@
1#ifndef _FS_CEPH_CRYPTO_H 1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H 2#define _FS_CEPH_CRYPTO_H
3 3
4#include "types.h" 4#include <linux/ceph/types.h>
5#include "buffer.h" 5#include <linux/ceph/buffer.h>
6 6
7/* 7/*
8 * cryptographic secret 8 * cryptographic secret
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
new file mode 100644
index 000000000000..27d4ea315d12
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,267 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/device.h>
4#include <linux/slab.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9
10#include <linux/ceph/libceph.h>
11#include <linux/ceph/mon_client.h>
12#include <linux/ceph/auth.h>
13#include <linux/ceph/debugfs.h>
14
15#ifdef CONFIG_DEBUG_FS
16
17/*
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../monmap - current monmap
23 * .../osdc - active osd requests
24 * .../monc - mon client state
25 * .../dentry_lru - dump contents of dentry lru
26 * .../caps - expose cap (reservation) stats
27 * .../bdi - symlink to ../../bdi/something
28 */
29
30static struct dentry *ceph_debugfs_dir;
31
32static int monmap_show(struct seq_file *s, void *p)
33{
34 int i;
35 struct ceph_client *client = s->private;
36
37 if (client->monc.monmap == NULL)
38 return 0;
39
40 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
41 for (i = 0; i < client->monc.monmap->num_mon; i++) {
42 struct ceph_entity_inst *inst =
43 &client->monc.monmap->mon_inst[i];
44
45 seq_printf(s, "\t%s%lld\t%s\n",
46 ENTITY_NAME(inst->name),
47 ceph_pr_addr(&inst->addr.in_addr));
48 }
49 return 0;
50}
51
52static int osdmap_show(struct seq_file *s, void *p)
53{
54 int i;
55 struct ceph_client *client = s->private;
56 struct rb_node *n;
57
58 if (client->osdc.osdmap == NULL)
59 return 0;
60 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
61 seq_printf(s, "flags%s%s\n",
62 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
63 " NEARFULL" : "",
64 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
65 " FULL" : "");
66 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
67 struct ceph_pg_pool_info *pool =
68 rb_entry(n, struct ceph_pg_pool_info, node);
69 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
70 pool->id, pool->v.pg_num, pool->pg_num_mask,
71 pool->v.lpg_num, pool->lpg_num_mask);
72 }
73 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
74 struct ceph_entity_addr *addr =
75 &client->osdc.osdmap->osd_addr[i];
76 int state = client->osdc.osdmap->osd_state[i];
77 char sb[64];
78
79 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
80 i, ceph_pr_addr(&addr->in_addr),
81 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
82 ceph_osdmap_state_str(sb, sizeof(sb), state));
83 }
84 return 0;
85}
86
87static int monc_show(struct seq_file *s, void *p)
88{
89 struct ceph_client *client = s->private;
90 struct ceph_mon_generic_request *req;
91 struct ceph_mon_client *monc = &client->monc;
92 struct rb_node *rp;
93
94 mutex_lock(&monc->mutex);
95
96 if (monc->have_mdsmap)
97 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
98 if (monc->have_osdmap)
99 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
100 if (monc->want_next_osdmap)
101 seq_printf(s, "want next osdmap\n");
102
103 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
104 __u16 op;
105 req = rb_entry(rp, struct ceph_mon_generic_request, node);
106 op = le16_to_cpu(req->request->hdr.type);
107 if (op == CEPH_MSG_STATFS)
108 seq_printf(s, "%lld statfs\n", req->tid);
109 else
110 seq_printf(s, "%lld unknown\n", req->tid);
111 }
112
113 mutex_unlock(&monc->mutex);
114 return 0;
115}
116
117static int osdc_show(struct seq_file *s, void *pp)
118{
119 struct ceph_client *client = s->private;
120 struct ceph_osd_client *osdc = &client->osdc;
121 struct rb_node *p;
122
123 mutex_lock(&osdc->request_mutex);
124 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
125 struct ceph_osd_request *req;
126 struct ceph_osd_request_head *head;
127 struct ceph_osd_op *op;
128 int num_ops;
129 int opcode, olen;
130 int i;
131
132 req = rb_entry(p, struct ceph_osd_request, r_node);
133
134 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
135 req->r_osd ? req->r_osd->o_osd : -1,
136 le32_to_cpu(req->r_pgid.pool),
137 le16_to_cpu(req->r_pgid.ps));
138
139 head = req->r_request->front.iov_base;
140 op = (void *)(head + 1);
141
142 num_ops = le16_to_cpu(head->num_ops);
143 olen = le32_to_cpu(head->object_len);
144 seq_printf(s, "%.*s", olen,
145 (const char *)(head->ops + num_ops));
146
147 if (req->r_reassert_version.epoch)
148 seq_printf(s, "\t%u'%llu",
149 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
150 le64_to_cpu(req->r_reassert_version.version));
151 else
152 seq_printf(s, "\t");
153
154 for (i = 0; i < num_ops; i++) {
155 opcode = le16_to_cpu(op->op);
156 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
157 op++;
158 }
159
160 seq_printf(s, "\n");
161 }
162 mutex_unlock(&osdc->request_mutex);
163 return 0;
164}
165
166CEPH_DEFINE_SHOW_FUNC(monmap_show)
167CEPH_DEFINE_SHOW_FUNC(osdmap_show)
168CEPH_DEFINE_SHOW_FUNC(monc_show)
169CEPH_DEFINE_SHOW_FUNC(osdc_show)
170
171int ceph_debugfs_init(void)
172{
173 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
174 if (!ceph_debugfs_dir)
175 return -ENOMEM;
176 return 0;
177}
178
179void ceph_debugfs_cleanup(void)
180{
181 debugfs_remove(ceph_debugfs_dir);
182}
183
184int ceph_debugfs_client_init(struct ceph_client *client)
185{
186 int ret = -ENOMEM;
187 char name[80];
188
189 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
190 client->monc.auth->global_id);
191
192 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
193 if (!client->debugfs_dir)
194 goto out;
195
196 client->monc.debugfs_file = debugfs_create_file("monc",
197 0600,
198 client->debugfs_dir,
199 client,
200 &monc_show_fops);
201 if (!client->monc.debugfs_file)
202 goto out;
203
204 client->osdc.debugfs_file = debugfs_create_file("osdc",
205 0600,
206 client->debugfs_dir,
207 client,
208 &osdc_show_fops);
209 if (!client->osdc.debugfs_file)
210 goto out;
211
212 client->debugfs_monmap = debugfs_create_file("monmap",
213 0600,
214 client->debugfs_dir,
215 client,
216 &monmap_show_fops);
217 if (!client->debugfs_monmap)
218 goto out;
219
220 client->debugfs_osdmap = debugfs_create_file("osdmap",
221 0600,
222 client->debugfs_dir,
223 client,
224 &osdmap_show_fops);
225 if (!client->debugfs_osdmap)
226 goto out;
227
228 return 0;
229
230out:
231 ceph_debugfs_client_cleanup(client);
232 return ret;
233}
234
235void ceph_debugfs_client_cleanup(struct ceph_client *client)
236{
237 debugfs_remove(client->debugfs_osdmap);
238 debugfs_remove(client->debugfs_monmap);
239 debugfs_remove(client->osdc.debugfs_file);
240 debugfs_remove(client->monc.debugfs_file);
241 debugfs_remove(client->debugfs_dir);
242}
243
244#else /* CONFIG_DEBUG_FS */
245
246int ceph_debugfs_init(void)
247{
248 return 0;
249}
250
251void ceph_debugfs_cleanup(void)
252{
253}
254
255int ceph_debugfs_client_init(struct ceph_client *client)
256{
257 return 0;
258}
259
260void ceph_debugfs_client_cleanup(struct ceph_client *client)
261{
262}
263
264#endif /* CONFIG_DEBUG_FS */
265
266EXPORT_SYMBOL(ceph_debugfs_init);
267EXPORT_SYMBOL(ceph_debugfs_cleanup);
diff --git a/fs/ceph/messenger.c b/net/ceph/messenger.c
index 2502d76fcec1..0e8157ee5d43 100644
--- a/fs/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/crc32c.h> 3#include <linux/crc32c.h>
4#include <linux/ctype.h> 4#include <linux/ctype.h>
@@ -9,12 +9,14 @@
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/socket.h> 10#include <linux/socket.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/bio.h>
13#include <linux/blkdev.h>
12#include <net/tcp.h> 14#include <net/tcp.h>
13 15
14#include "super.h" 16#include <linux/ceph/libceph.h>
15#include "messenger.h" 17#include <linux/ceph/messenger.h>
16#include "decode.h" 18#include <linux/ceph/decode.h>
17#include "pagelist.h" 19#include <linux/ceph/pagelist.h>
18 20
19/* 21/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other 22 * Ceph uses the messenger to exchange ceph_msg messages with other
@@ -48,7 +50,7 @@ static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
48static DEFINE_SPINLOCK(addr_str_lock); 50static DEFINE_SPINLOCK(addr_str_lock);
49static int last_addr_str; 51static int last_addr_str;
50 52
51const char *pr_addr(const struct sockaddr_storage *ss) 53const char *ceph_pr_addr(const struct sockaddr_storage *ss)
52{ 54{
53 int i; 55 int i;
54 char *s; 56 char *s;
@@ -79,6 +81,7 @@ const char *pr_addr(const struct sockaddr_storage *ss)
79 81
80 return s; 82 return s;
81} 83}
84EXPORT_SYMBOL(ceph_pr_addr);
82 85
83static void encode_my_addr(struct ceph_messenger *msgr) 86static void encode_my_addr(struct ceph_messenger *msgr)
84{ 87{
@@ -91,7 +94,7 @@ static void encode_my_addr(struct ceph_messenger *msgr)
91 */ 94 */
92struct workqueue_struct *ceph_msgr_wq; 95struct workqueue_struct *ceph_msgr_wq;
93 96
94int __init ceph_msgr_init(void) 97int ceph_msgr_init(void)
95{ 98{
96 ceph_msgr_wq = create_workqueue("ceph-msgr"); 99 ceph_msgr_wq = create_workqueue("ceph-msgr");
97 if (IS_ERR(ceph_msgr_wq)) { 100 if (IS_ERR(ceph_msgr_wq)) {
@@ -102,16 +105,19 @@ int __init ceph_msgr_init(void)
102 } 105 }
103 return 0; 106 return 0;
104} 107}
108EXPORT_SYMBOL(ceph_msgr_init);
105 109
106void ceph_msgr_exit(void) 110void ceph_msgr_exit(void)
107{ 111{
108 destroy_workqueue(ceph_msgr_wq); 112 destroy_workqueue(ceph_msgr_wq);
109} 113}
114EXPORT_SYMBOL(ceph_msgr_exit);
110 115
111void ceph_msgr_flush(void) 116void ceph_msgr_flush(void)
112{ 117{
113 flush_workqueue(ceph_msgr_wq); 118 flush_workqueue(ceph_msgr_wq);
114} 119}
120EXPORT_SYMBOL(ceph_msgr_flush);
115 121
116 122
117/* 123/*
@@ -221,19 +227,19 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
221 227
222 set_sock_callbacks(sock, con); 228 set_sock_callbacks(sock, con);
223 229
224 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); 230 dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
225 231
226 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), 232 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
227 O_NONBLOCK); 233 O_NONBLOCK);
228 if (ret == -EINPROGRESS) { 234 if (ret == -EINPROGRESS) {
229 dout("connect %s EINPROGRESS sk_state = %u\n", 235 dout("connect %s EINPROGRESS sk_state = %u\n",
230 pr_addr(&con->peer_addr.in_addr), 236 ceph_pr_addr(&con->peer_addr.in_addr),
231 sock->sk->sk_state); 237 sock->sk->sk_state);
232 ret = 0; 238 ret = 0;
233 } 239 }
234 if (ret < 0) { 240 if (ret < 0) {
235 pr_err("connect %s error %d\n", 241 pr_err("connect %s error %d\n",
236 pr_addr(&con->peer_addr.in_addr), ret); 242 ceph_pr_addr(&con->peer_addr.in_addr), ret);
237 sock_release(sock); 243 sock_release(sock);
238 con->sock = NULL; 244 con->sock = NULL;
239 con->error_msg = "connect error"; 245 con->error_msg = "connect error";
@@ -334,7 +340,8 @@ static void reset_connection(struct ceph_connection *con)
334 */ 340 */
335void ceph_con_close(struct ceph_connection *con) 341void ceph_con_close(struct ceph_connection *con)
336{ 342{
337 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr)); 343 dout("con_close %p peer %s\n", con,
344 ceph_pr_addr(&con->peer_addr.in_addr));
338 set_bit(CLOSED, &con->state); /* in case there's queued work */ 345 set_bit(CLOSED, &con->state); /* in case there's queued work */
339 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ 346 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
340 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ 347 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
@@ -347,19 +354,21 @@ void ceph_con_close(struct ceph_connection *con)
347 mutex_unlock(&con->mutex); 354 mutex_unlock(&con->mutex);
348 queue_con(con); 355 queue_con(con);
349} 356}
357EXPORT_SYMBOL(ceph_con_close);
350 358
351/* 359/*
352 * Reopen a closed connection, with a new peer address. 360 * Reopen a closed connection, with a new peer address.
353 */ 361 */
354void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) 362void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
355{ 363{
356 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr)); 364 dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
357 set_bit(OPENING, &con->state); 365 set_bit(OPENING, &con->state);
358 clear_bit(CLOSED, &con->state); 366 clear_bit(CLOSED, &con->state);
359 memcpy(&con->peer_addr, addr, sizeof(*addr)); 367 memcpy(&con->peer_addr, addr, sizeof(*addr));
360 con->delay = 0; /* reset backoff memory */ 368 con->delay = 0; /* reset backoff memory */
361 queue_con(con); 369 queue_con(con);
362} 370}
371EXPORT_SYMBOL(ceph_con_open);
363 372
364/* 373/*
365 * return true if this connection ever successfully opened 374 * return true if this connection ever successfully opened
@@ -406,6 +415,7 @@ void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
406 INIT_LIST_HEAD(&con->out_sent); 415 INIT_LIST_HEAD(&con->out_sent);
407 INIT_DELAYED_WORK(&con->work, con_work); 416 INIT_DELAYED_WORK(&con->work, con_work);
408} 417}
418EXPORT_SYMBOL(ceph_con_init);
409 419
410 420
411/* 421/*
@@ -529,8 +539,11 @@ static void prepare_write_message(struct ceph_connection *con)
529 if (le32_to_cpu(m->hdr.data_len) > 0) { 539 if (le32_to_cpu(m->hdr.data_len) > 0) {
530 /* initialize page iterator */ 540 /* initialize page iterator */
531 con->out_msg_pos.page = 0; 541 con->out_msg_pos.page = 0;
532 con->out_msg_pos.page_pos = 542 if (m->pages)
533 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; 543 con->out_msg_pos.page_pos =
544 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
545 else
546 con->out_msg_pos.page_pos = 0;
534 con->out_msg_pos.data_pos = 0; 547 con->out_msg_pos.data_pos = 0;
535 con->out_msg_pos.did_page_crc = 0; 548 con->out_msg_pos.did_page_crc = 0;
536 con->out_more = 1; /* data + footer will follow */ 549 con->out_more = 1; /* data + footer will follow */
@@ -647,7 +660,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 660 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
648 con->connect_seq, global_seq, proto); 661 con->connect_seq, global_seq, proto);
649 662
650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED); 663 con->out_connect.features = cpu_to_le64(msgr->supported_features);
651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 664 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 665 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
653 con->out_connect.global_seq = cpu_to_le32(global_seq); 666 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -712,6 +725,31 @@ out:
712 return ret; /* done! */ 725 return ret; /* done! */
713} 726}
714 727
728#ifdef CONFIG_BLOCK
729static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
730{
731 if (!bio) {
732 *iter = NULL;
733 *seg = 0;
734 return;
735 }
736 *iter = bio;
737 *seg = bio->bi_idx;
738}
739
740static void iter_bio_next(struct bio **bio_iter, int *seg)
741{
742 if (*bio_iter == NULL)
743 return;
744
745 BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
746
747 (*seg)++;
748 if (*seg == (*bio_iter)->bi_vcnt)
749 init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
750}
751#endif
752
715/* 753/*
716 * Write as much message data payload as we can. If we finish, queue 754 * Write as much message data payload as we can. If we finish, queue
717 * up the footer. 755 * up the footer.
@@ -726,21 +764,46 @@ static int write_partial_msg_pages(struct ceph_connection *con)
726 size_t len; 764 size_t len;
727 int crc = con->msgr->nocrc; 765 int crc = con->msgr->nocrc;
728 int ret; 766 int ret;
767 int total_max_write;
768 int in_trail = 0;
769 size_t trail_len = (msg->trail ? msg->trail->length : 0);
729 770
730 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", 771 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
731 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, 772 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
732 con->out_msg_pos.page_pos); 773 con->out_msg_pos.page_pos);
733 774
734 while (con->out_msg_pos.page < con->out_msg->nr_pages) { 775#ifdef CONFIG_BLOCK
776 if (msg->bio && !msg->bio_iter)
777 init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
778#endif
779
780 while (data_len > con->out_msg_pos.data_pos) {
735 struct page *page = NULL; 781 struct page *page = NULL;
736 void *kaddr = NULL; 782 void *kaddr = NULL;
783 int max_write = PAGE_SIZE;
784 int page_shift = 0;
785
786 total_max_write = data_len - trail_len -
787 con->out_msg_pos.data_pos;
737 788
738 /* 789 /*
739 * if we are calculating the data crc (the default), we need 790 * if we are calculating the data crc (the default), we need
740 * to map the page. if our pages[] has been revoked, use the 791 * to map the page. if our pages[] has been revoked, use the
741 * zero page. 792 * zero page.
742 */ 793 */
743 if (msg->pages) { 794
795 /* have we reached the trail part of the data? */
796 if (con->out_msg_pos.data_pos >= data_len - trail_len) {
797 in_trail = 1;
798
799 total_max_write = data_len - con->out_msg_pos.data_pos;
800
801 page = list_first_entry(&msg->trail->head,
802 struct page, lru);
803 if (crc)
804 kaddr = kmap(page);
805 max_write = PAGE_SIZE;
806 } else if (msg->pages) {
744 page = msg->pages[con->out_msg_pos.page]; 807 page = msg->pages[con->out_msg_pos.page];
745 if (crc) 808 if (crc)
746 kaddr = kmap(page); 809 kaddr = kmap(page);
@@ -749,13 +812,25 @@ static int write_partial_msg_pages(struct ceph_connection *con)
749 struct page, lru); 812 struct page, lru);
750 if (crc) 813 if (crc)
751 kaddr = kmap(page); 814 kaddr = kmap(page);
815#ifdef CONFIG_BLOCK
816 } else if (msg->bio) {
817 struct bio_vec *bv;
818
819 bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
820 page = bv->bv_page;
821 page_shift = bv->bv_offset;
822 if (crc)
823 kaddr = kmap(page) + page_shift;
824 max_write = bv->bv_len;
825#endif
752 } else { 826 } else {
753 page = con->msgr->zero_page; 827 page = con->msgr->zero_page;
754 if (crc) 828 if (crc)
755 kaddr = page_address(con->msgr->zero_page); 829 kaddr = page_address(con->msgr->zero_page);
756 } 830 }
757 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos), 831 len = min_t(int, max_write - con->out_msg_pos.page_pos,
758 (int)(data_len - con->out_msg_pos.data_pos)); 832 total_max_write);
833
759 if (crc && !con->out_msg_pos.did_page_crc) { 834 if (crc && !con->out_msg_pos.did_page_crc) {
760 void *base = kaddr + con->out_msg_pos.page_pos; 835 void *base = kaddr + con->out_msg_pos.page_pos;
761 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); 836 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
@@ -765,13 +840,14 @@ static int write_partial_msg_pages(struct ceph_connection *con)
765 cpu_to_le32(crc32c(tmpcrc, base, len)); 840 cpu_to_le32(crc32c(tmpcrc, base, len));
766 con->out_msg_pos.did_page_crc = 1; 841 con->out_msg_pos.did_page_crc = 1;
767 } 842 }
768
769 ret = kernel_sendpage(con->sock, page, 843 ret = kernel_sendpage(con->sock, page,
770 con->out_msg_pos.page_pos, len, 844 con->out_msg_pos.page_pos + page_shift,
845 len,
771 MSG_DONTWAIT | MSG_NOSIGNAL | 846 MSG_DONTWAIT | MSG_NOSIGNAL |
772 MSG_MORE); 847 MSG_MORE);
773 848
774 if (crc && (msg->pages || msg->pagelist)) 849 if (crc &&
850 (msg->pages || msg->pagelist || msg->bio || in_trail))
775 kunmap(page); 851 kunmap(page);
776 852
777 if (ret <= 0) 853 if (ret <= 0)
@@ -783,9 +859,16 @@ static int write_partial_msg_pages(struct ceph_connection *con)
783 con->out_msg_pos.page_pos = 0; 859 con->out_msg_pos.page_pos = 0;
784 con->out_msg_pos.page++; 860 con->out_msg_pos.page++;
785 con->out_msg_pos.did_page_crc = 0; 861 con->out_msg_pos.did_page_crc = 0;
786 if (msg->pagelist) 862 if (in_trail)
863 list_move_tail(&page->lru,
864 &msg->trail->head);
865 else if (msg->pagelist)
787 list_move_tail(&page->lru, 866 list_move_tail(&page->lru,
788 &msg->pagelist->head); 867 &msg->pagelist->head);
868#ifdef CONFIG_BLOCK
869 else if (msg->bio)
870 iter_bio_next(&msg->bio_iter, &msg->bio_seg);
871#endif
789 } 872 }
790 } 873 }
791 874
@@ -938,7 +1021,7 @@ static int verify_hello(struct ceph_connection *con)
938{ 1021{
939 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { 1022 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
940 pr_err("connect to %s got bad banner\n", 1023 pr_err("connect to %s got bad banner\n",
941 pr_addr(&con->peer_addr.in_addr)); 1024 ceph_pr_addr(&con->peer_addr.in_addr));
942 con->error_msg = "protocol error, bad banner"; 1025 con->error_msg = "protocol error, bad banner";
943 return -1; 1026 return -1;
944 } 1027 }
@@ -1041,7 +1124,7 @@ int ceph_parse_ips(const char *c, const char *end,
1041 1124
1042 addr_set_port(ss, port); 1125 addr_set_port(ss, port);
1043 1126
1044 dout("parse_ips got %s\n", pr_addr(ss)); 1127 dout("parse_ips got %s\n", ceph_pr_addr(ss));
1045 1128
1046 if (p == end) 1129 if (p == end)
1047 break; 1130 break;
@@ -1061,6 +1144,7 @@ bad:
1061 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); 1144 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1062 return -EINVAL; 1145 return -EINVAL;
1063} 1146}
1147EXPORT_SYMBOL(ceph_parse_ips);
1064 1148
1065static int process_banner(struct ceph_connection *con) 1149static int process_banner(struct ceph_connection *con)
1066{ 1150{
@@ -1082,9 +1166,9 @@ static int process_banner(struct ceph_connection *con)
1082 !(addr_is_blank(&con->actual_peer_addr.in_addr) && 1166 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1083 con->actual_peer_addr.nonce == con->peer_addr.nonce)) { 1167 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1084 pr_warning("wrong peer, want %s/%d, got %s/%d\n", 1168 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
1085 pr_addr(&con->peer_addr.in_addr), 1169 ceph_pr_addr(&con->peer_addr.in_addr),
1086 (int)le32_to_cpu(con->peer_addr.nonce), 1170 (int)le32_to_cpu(con->peer_addr.nonce),
1087 pr_addr(&con->actual_peer_addr.in_addr), 1171 ceph_pr_addr(&con->actual_peer_addr.in_addr),
1088 (int)le32_to_cpu(con->actual_peer_addr.nonce)); 1172 (int)le32_to_cpu(con->actual_peer_addr.nonce));
1089 con->error_msg = "wrong peer at address"; 1173 con->error_msg = "wrong peer at address";
1090 return -1; 1174 return -1;
@@ -1102,7 +1186,7 @@ static int process_banner(struct ceph_connection *con)
1102 addr_set_port(&con->msgr->inst.addr.in_addr, port); 1186 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1103 encode_my_addr(con->msgr); 1187 encode_my_addr(con->msgr);
1104 dout("process_banner learned my addr is %s\n", 1188 dout("process_banner learned my addr is %s\n",
1105 pr_addr(&con->msgr->inst.addr.in_addr)); 1189 ceph_pr_addr(&con->msgr->inst.addr.in_addr));
1106 } 1190 }
1107 1191
1108 set_bit(NEGOTIATING, &con->state); 1192 set_bit(NEGOTIATING, &con->state);
@@ -1123,8 +1207,8 @@ static void fail_protocol(struct ceph_connection *con)
1123 1207
1124static int process_connect(struct ceph_connection *con) 1208static int process_connect(struct ceph_connection *con)
1125{ 1209{
1126 u64 sup_feat = CEPH_FEATURE_SUPPORTED; 1210 u64 sup_feat = con->msgr->supported_features;
1127 u64 req_feat = CEPH_FEATURE_REQUIRED; 1211 u64 req_feat = con->msgr->required_features;
1128 u64 server_feat = le64_to_cpu(con->in_reply.features); 1212 u64 server_feat = le64_to_cpu(con->in_reply.features);
1129 1213
1130 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); 1214 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1134,7 +1218,7 @@ static int process_connect(struct ceph_connection *con)
1134 pr_err("%s%lld %s feature set mismatch," 1218 pr_err("%s%lld %s feature set mismatch,"
1135 " my %llx < server's %llx, missing %llx\n", 1219 " my %llx < server's %llx, missing %llx\n",
1136 ENTITY_NAME(con->peer_name), 1220 ENTITY_NAME(con->peer_name),
1137 pr_addr(&con->peer_addr.in_addr), 1221 ceph_pr_addr(&con->peer_addr.in_addr),
1138 sup_feat, server_feat, server_feat & ~sup_feat); 1222 sup_feat, server_feat, server_feat & ~sup_feat);
1139 con->error_msg = "missing required protocol features"; 1223 con->error_msg = "missing required protocol features";
1140 fail_protocol(con); 1224 fail_protocol(con);
@@ -1144,7 +1228,7 @@ static int process_connect(struct ceph_connection *con)
1144 pr_err("%s%lld %s protocol version mismatch," 1228 pr_err("%s%lld %s protocol version mismatch,"
1145 " my %d != server's %d\n", 1229 " my %d != server's %d\n",
1146 ENTITY_NAME(con->peer_name), 1230 ENTITY_NAME(con->peer_name),
1147 pr_addr(&con->peer_addr.in_addr), 1231 ceph_pr_addr(&con->peer_addr.in_addr),
1148 le32_to_cpu(con->out_connect.protocol_version), 1232 le32_to_cpu(con->out_connect.protocol_version),
1149 le32_to_cpu(con->in_reply.protocol_version)); 1233 le32_to_cpu(con->in_reply.protocol_version));
1150 con->error_msg = "protocol version mismatch"; 1234 con->error_msg = "protocol version mismatch";
@@ -1178,7 +1262,7 @@ static int process_connect(struct ceph_connection *con)
1178 le32_to_cpu(con->in_connect.connect_seq)); 1262 le32_to_cpu(con->in_connect.connect_seq));
1179 pr_err("%s%lld %s connection reset\n", 1263 pr_err("%s%lld %s connection reset\n",
1180 ENTITY_NAME(con->peer_name), 1264 ENTITY_NAME(con->peer_name),
1181 pr_addr(&con->peer_addr.in_addr)); 1265 ceph_pr_addr(&con->peer_addr.in_addr));
1182 reset_connection(con); 1266 reset_connection(con);
1183 prepare_write_connect(con->msgr, con, 0); 1267 prepare_write_connect(con->msgr, con, 0);
1184 prepare_read_connect(con); 1268 prepare_read_connect(con);
@@ -1223,7 +1307,7 @@ static int process_connect(struct ceph_connection *con)
1223 pr_err("%s%lld %s protocol feature mismatch," 1307 pr_err("%s%lld %s protocol feature mismatch,"
1224 " my required %llx > server's %llx, need %llx\n", 1308 " my required %llx > server's %llx, need %llx\n",
1225 ENTITY_NAME(con->peer_name), 1309 ENTITY_NAME(con->peer_name),
1226 pr_addr(&con->peer_addr.in_addr), 1310 ceph_pr_addr(&con->peer_addr.in_addr),
1227 req_feat, server_feat, req_feat & ~server_feat); 1311 req_feat, server_feat, req_feat & ~server_feat);
1228 con->error_msg = "missing required protocol features"; 1312 con->error_msg = "missing required protocol features";
1229 fail_protocol(con); 1313 fail_protocol(con);
@@ -1305,8 +1389,7 @@ static int read_partial_message_section(struct ceph_connection *con,
1305 struct kvec *section, 1389 struct kvec *section,
1306 unsigned int sec_len, u32 *crc) 1390 unsigned int sec_len, u32 *crc)
1307{ 1391{
1308 int left; 1392 int ret, left;
1309 int ret;
1310 1393
1311 BUG_ON(!section); 1394 BUG_ON(!section);
1312 1395
@@ -1329,13 +1412,83 @@ static int read_partial_message_section(struct ceph_connection *con,
1329static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, 1412static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1330 struct ceph_msg_header *hdr, 1413 struct ceph_msg_header *hdr,
1331 int *skip); 1414 int *skip);
1415
1416
1417static int read_partial_message_pages(struct ceph_connection *con,
1418 struct page **pages,
1419 unsigned data_len, int datacrc)
1420{
1421 void *p;
1422 int ret;
1423 int left;
1424
1425 left = min((int)(data_len - con->in_msg_pos.data_pos),
1426 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1427 /* (page) data */
1428 BUG_ON(pages == NULL);
1429 p = kmap(pages[con->in_msg_pos.page]);
1430 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1431 left);
1432 if (ret > 0 && datacrc)
1433 con->in_data_crc =
1434 crc32c(con->in_data_crc,
1435 p + con->in_msg_pos.page_pos, ret);
1436 kunmap(pages[con->in_msg_pos.page]);
1437 if (ret <= 0)
1438 return ret;
1439 con->in_msg_pos.data_pos += ret;
1440 con->in_msg_pos.page_pos += ret;
1441 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1442 con->in_msg_pos.page_pos = 0;
1443 con->in_msg_pos.page++;
1444 }
1445
1446 return ret;
1447}
1448
1449#ifdef CONFIG_BLOCK
1450static int read_partial_message_bio(struct ceph_connection *con,
1451 struct bio **bio_iter, int *bio_seg,
1452 unsigned data_len, int datacrc)
1453{
1454 struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
1455 void *p;
1456 int ret, left;
1457
1458 if (IS_ERR(bv))
1459 return PTR_ERR(bv);
1460
1461 left = min((int)(data_len - con->in_msg_pos.data_pos),
1462 (int)(bv->bv_len - con->in_msg_pos.page_pos));
1463
1464 p = kmap(bv->bv_page) + bv->bv_offset;
1465
1466 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1467 left);
1468 if (ret > 0 && datacrc)
1469 con->in_data_crc =
1470 crc32c(con->in_data_crc,
1471 p + con->in_msg_pos.page_pos, ret);
1472 kunmap(bv->bv_page);
1473 if (ret <= 0)
1474 return ret;
1475 con->in_msg_pos.data_pos += ret;
1476 con->in_msg_pos.page_pos += ret;
1477 if (con->in_msg_pos.page_pos == bv->bv_len) {
1478 con->in_msg_pos.page_pos = 0;
1479 iter_bio_next(bio_iter, bio_seg);
1480 }
1481
1482 return ret;
1483}
1484#endif
1485
1332/* 1486/*
1333 * read (part of) a message. 1487 * read (part of) a message.
1334 */ 1488 */
1335static int read_partial_message(struct ceph_connection *con) 1489static int read_partial_message(struct ceph_connection *con)
1336{ 1490{
1337 struct ceph_msg *m = con->in_msg; 1491 struct ceph_msg *m = con->in_msg;
1338 void *p;
1339 int ret; 1492 int ret;
1340 int to, left; 1493 int to, left;
1341 unsigned front_len, middle_len, data_len, data_off; 1494 unsigned front_len, middle_len, data_len, data_off;
@@ -1381,7 +1534,7 @@ static int read_partial_message(struct ceph_connection *con)
1381 if ((s64)seq - (s64)con->in_seq < 1) { 1534 if ((s64)seq - (s64)con->in_seq < 1) {
1382 pr_info("skipping %s%lld %s seq %lld, expected %lld\n", 1535 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1383 ENTITY_NAME(con->peer_name), 1536 ENTITY_NAME(con->peer_name),
1384 pr_addr(&con->peer_addr.in_addr), 1537 ceph_pr_addr(&con->peer_addr.in_addr),
1385 seq, con->in_seq + 1); 1538 seq, con->in_seq + 1);
1386 con->in_base_pos = -front_len - middle_len - data_len - 1539 con->in_base_pos = -front_len - middle_len - data_len -
1387 sizeof(m->footer); 1540 sizeof(m->footer);
@@ -1422,7 +1575,10 @@ static int read_partial_message(struct ceph_connection *con)
1422 m->middle->vec.iov_len = 0; 1575 m->middle->vec.iov_len = 0;
1423 1576
1424 con->in_msg_pos.page = 0; 1577 con->in_msg_pos.page = 0;
1425 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; 1578 if (m->pages)
1579 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1580 else
1581 con->in_msg_pos.page_pos = 0;
1426 con->in_msg_pos.data_pos = 0; 1582 con->in_msg_pos.data_pos = 0;
1427 } 1583 }
1428 1584
@@ -1440,27 +1596,29 @@ static int read_partial_message(struct ceph_connection *con)
1440 if (ret <= 0) 1596 if (ret <= 0)
1441 return ret; 1597 return ret;
1442 } 1598 }
1599#ifdef CONFIG_BLOCK
1600 if (m->bio && !m->bio_iter)
1601 init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
1602#endif
1443 1603
1444 /* (page) data */ 1604 /* (page) data */
1445 while (con->in_msg_pos.data_pos < data_len) { 1605 while (con->in_msg_pos.data_pos < data_len) {
1446 left = min((int)(data_len - con->in_msg_pos.data_pos), 1606 if (m->pages) {
1447 (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); 1607 ret = read_partial_message_pages(con, m->pages,
1448 BUG_ON(m->pages == NULL); 1608 data_len, datacrc);
1449 p = kmap(m->pages[con->in_msg_pos.page]); 1609 if (ret <= 0)
1450 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, 1610 return ret;
1451 left); 1611#ifdef CONFIG_BLOCK
1452 if (ret > 0 && datacrc) 1612 } else if (m->bio) {
1453 con->in_data_crc = 1613
1454 crc32c(con->in_data_crc, 1614 ret = read_partial_message_bio(con,
1455 p + con->in_msg_pos.page_pos, ret); 1615 &m->bio_iter, &m->bio_seg,
1456 kunmap(m->pages[con->in_msg_pos.page]); 1616 data_len, datacrc);
1457 if (ret <= 0) 1617 if (ret <= 0)
1458 return ret; 1618 return ret;
1459 con->in_msg_pos.data_pos += ret; 1619#endif
1460 con->in_msg_pos.page_pos += ret; 1620 } else {
1461 if (con->in_msg_pos.page_pos == PAGE_SIZE) { 1621 BUG_ON(1);
1462 con->in_msg_pos.page_pos = 0;
1463 con->in_msg_pos.page++;
1464 } 1622 }
1465 } 1623 }
1466 1624
@@ -1874,9 +2032,9 @@ out:
1874static void ceph_fault(struct ceph_connection *con) 2032static void ceph_fault(struct ceph_connection *con)
1875{ 2033{
1876 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), 2034 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1877 pr_addr(&con->peer_addr.in_addr), con->error_msg); 2035 ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
1878 dout("fault %p state %lu to peer %s\n", 2036 dout("fault %p state %lu to peer %s\n",
1879 con, con->state, pr_addr(&con->peer_addr.in_addr)); 2037 con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
1880 2038
1881 if (test_bit(LOSSYTX, &con->state)) { 2039 if (test_bit(LOSSYTX, &con->state)) {
1882 dout("fault on LOSSYTX channel\n"); 2040 dout("fault on LOSSYTX channel\n");
@@ -1936,7 +2094,9 @@ out:
1936/* 2094/*
1937 * create a new messenger instance 2095 * create a new messenger instance
1938 */ 2096 */
1939struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) 2097struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
2098 u32 supported_features,
2099 u32 required_features)
1940{ 2100{
1941 struct ceph_messenger *msgr; 2101 struct ceph_messenger *msgr;
1942 2102
@@ -1944,6 +2104,9 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1944 if (msgr == NULL) 2104 if (msgr == NULL)
1945 return ERR_PTR(-ENOMEM); 2105 return ERR_PTR(-ENOMEM);
1946 2106
2107 msgr->supported_features = supported_features;
2108 msgr->required_features = required_features;
2109
1947 spin_lock_init(&msgr->global_seq_lock); 2110 spin_lock_init(&msgr->global_seq_lock);
1948 2111
1949 /* the zero page is needed if a request is "canceled" while the message 2112 /* the zero page is needed if a request is "canceled" while the message
@@ -1966,6 +2129,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1966 dout("messenger_create %p\n", msgr); 2129 dout("messenger_create %p\n", msgr);
1967 return msgr; 2130 return msgr;
1968} 2131}
2132EXPORT_SYMBOL(ceph_messenger_create);
1969 2133
1970void ceph_messenger_destroy(struct ceph_messenger *msgr) 2134void ceph_messenger_destroy(struct ceph_messenger *msgr)
1971{ 2135{
@@ -1975,6 +2139,7 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr)
1975 kfree(msgr); 2139 kfree(msgr);
1976 dout("destroyed messenger %p\n", msgr); 2140 dout("destroyed messenger %p\n", msgr);
1977} 2141}
2142EXPORT_SYMBOL(ceph_messenger_destroy);
1978 2143
1979/* 2144/*
1980 * Queue up an outgoing message on the given connection. 2145 * Queue up an outgoing message on the given connection.
@@ -2011,6 +2176,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
2011 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) 2176 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2012 queue_con(con); 2177 queue_con(con);
2013} 2178}
2179EXPORT_SYMBOL(ceph_con_send);
2014 2180
2015/* 2181/*
2016 * Revoke a message that was previously queued for send 2182 * Revoke a message that was previously queued for send
@@ -2076,6 +2242,7 @@ void ceph_con_keepalive(struct ceph_connection *con)
2076 test_and_set_bit(WRITE_PENDING, &con->state) == 0) 2242 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2077 queue_con(con); 2243 queue_con(con);
2078} 2244}
2245EXPORT_SYMBOL(ceph_con_keepalive);
2079 2246
2080 2247
2081/* 2248/*
@@ -2136,6 +2303,10 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2136 m->nr_pages = 0; 2303 m->nr_pages = 0;
2137 m->pages = NULL; 2304 m->pages = NULL;
2138 m->pagelist = NULL; 2305 m->pagelist = NULL;
2306 m->bio = NULL;
2307 m->bio_iter = NULL;
2308 m->bio_seg = 0;
2309 m->trail = NULL;
2139 2310
2140 dout("ceph_msg_new %p front %d\n", m, front_len); 2311 dout("ceph_msg_new %p front %d\n", m, front_len);
2141 return m; 2312 return m;
@@ -2146,6 +2317,7 @@ out:
2146 pr_err("msg_new can't create type %d front %d\n", type, front_len); 2317 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2147 return NULL; 2318 return NULL;
2148} 2319}
2320EXPORT_SYMBOL(ceph_msg_new);
2149 2321
2150/* 2322/*
2151 * Allocate "middle" portion of a message, if it is needed and wasn't 2323 * Allocate "middle" portion of a message, if it is needed and wasn't
@@ -2250,11 +2422,14 @@ void ceph_msg_last_put(struct kref *kref)
2250 m->pagelist = NULL; 2422 m->pagelist = NULL;
2251 } 2423 }
2252 2424
2425 m->trail = NULL;
2426
2253 if (m->pool) 2427 if (m->pool)
2254 ceph_msgpool_put(m->pool, m); 2428 ceph_msgpool_put(m->pool, m);
2255 else 2429 else
2256 ceph_msg_kfree(m); 2430 ceph_msg_kfree(m);
2257} 2431}
2432EXPORT_SYMBOL(ceph_msg_last_put);
2258 2433
2259void ceph_msg_dump(struct ceph_msg *msg) 2434void ceph_msg_dump(struct ceph_msg *msg)
2260{ 2435{
@@ -2275,3 +2450,4 @@ void ceph_msg_dump(struct ceph_msg *msg)
2275 DUMP_PREFIX_OFFSET, 16, 1, 2450 DUMP_PREFIX_OFFSET, 16, 1,
2276 &msg->footer, sizeof(msg->footer), true); 2451 &msg->footer, sizeof(msg->footer), true);
2277} 2452}
2453EXPORT_SYMBOL(ceph_msg_dump);
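
The messenger now takes its feature bits from each caller instead of the old compile-time CEPH_FEATURE_SUPPORTED/CEPH_FEATURE_REQUIRED constants; ceph_create_client() earlier in this patch passes CEPH_FEATURE_SUPPORTED_DEFAULT and CEPH_FEATURE_REQUIRED_DEFAULT. A hedged sketch of a standalone caller of the widened constructor (error handling trimmed to the essentials, surrounding code illustrative):

	struct ceph_messenger *msgr;

	msgr = ceph_messenger_create(NULL,	/* bind to any local address */
				     CEPH_FEATURE_SUPPORTED_DEFAULT,
				     CEPH_FEATURE_REQUIRED_DEFAULT);
	if (IS_ERR(msgr))
		return PTR_ERR(msgr);

	msgr->nocrc = false;			/* keep data CRCs enabled */
	/* connections are then tied to it with ceph_con_init(msgr, con) */
	ceph_messenger_destroy(msgr);
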
diff --git a/fs/ceph/mon_client.c b/net/ceph/mon_client.c
index b2a5a3e4a671..8a079399174a 100644
--- a/fs/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1,14 +1,16 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h>
3#include <linux/types.h> 4#include <linux/types.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/random.h> 6#include <linux/random.h>
6#include <linux/sched.h> 7#include <linux/sched.h>
7 8
8#include "mon_client.h" 9#include <linux/ceph/mon_client.h>
9#include "super.h" 10#include <linux/ceph/libceph.h>
10#include "auth.h" 11#include <linux/ceph/decode.h>
11#include "decode.h" 12
13#include <linux/ceph/auth.h>
12 14
13/* 15/*
14 * Interact with Ceph monitor cluster. Handle requests for new map 16 * Interact with Ceph monitor cluster. Handle requests for new map
@@ -74,7 +76,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
74 m->num_mon); 76 m->num_mon);
75 for (i = 0; i < m->num_mon; i++) 77 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i, 78 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr)); 79 ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m; 80 return m;
79 81
80bad: 82bad:
@@ -191,30 +193,33 @@ static void __send_subscribe(struct ceph_mon_client *monc)
191 struct ceph_msg *msg = monc->m_subscribe; 193 struct ceph_msg *msg = monc->m_subscribe;
192 struct ceph_mon_subscribe_item *i; 194 struct ceph_mon_subscribe_item *i;
193 void *p, *end; 195 void *p, *end;
196 int num;
194 197
195 p = msg->front.iov_base; 198 p = msg->front.iov_base;
196 end = p + msg->front_max; 199 end = p + msg->front_max;
197 200
198 dout("__send_subscribe to 'mdsmap' %u+\n", 201 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
199 (unsigned)monc->have_mdsmap); 202 ceph_encode_32(&p, num);
203
200 if (monc->want_next_osdmap) { 204 if (monc->want_next_osdmap) {
201 dout("__send_subscribe to 'osdmap' %u\n", 205 dout("__send_subscribe to 'osdmap' %u\n",
202 (unsigned)monc->have_osdmap); 206 (unsigned)monc->have_osdmap);
203 ceph_encode_32(&p, 3);
204 ceph_encode_string(&p, end, "osdmap", 6); 207 ceph_encode_string(&p, end, "osdmap", 6);
205 i = p; 208 i = p;
206 i->have = cpu_to_le64(monc->have_osdmap); 209 i->have = cpu_to_le64(monc->have_osdmap);
207 i->onetime = 1; 210 i->onetime = 1;
208 p += sizeof(*i); 211 p += sizeof(*i);
209 monc->want_next_osdmap = 2; /* requested */ 212 monc->want_next_osdmap = 2; /* requested */
210 } else {
211 ceph_encode_32(&p, 2);
212 } 213 }
213 ceph_encode_string(&p, end, "mdsmap", 6); 214 if (monc->want_mdsmap) {
214 i = p; 215 dout("__send_subscribe to 'mdsmap' %u+\n",
215 i->have = cpu_to_le64(monc->have_mdsmap); 216 (unsigned)monc->have_mdsmap);
216 i->onetime = 0; 217 ceph_encode_string(&p, end, "mdsmap", 6);
217 p += sizeof(*i); 218 i = p;
219 i->have = cpu_to_le64(monc->have_mdsmap);
220 i->onetime = 0;
221 p += sizeof(*i);
222 }
218 ceph_encode_string(&p, end, "monmap", 6); 223 ceph_encode_string(&p, end, "monmap", 6);
219 i = p; 224 i = p;
220 i->have = 0; 225 i->have = 0;
@@ -243,7 +248,8 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
243 mutex_lock(&monc->mutex); 248 mutex_lock(&monc->mutex);
244 if (monc->hunting) { 249 if (monc->hunting) {
245 pr_info("mon%d %s session established\n", 250 pr_info("mon%d %s session established\n",
246 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr)); 251 monc->cur_mon,
252 ceph_pr_addr(&monc->con->peer_addr.in_addr));
247 monc->hunting = false; 253 monc->hunting = false;
248 } 254 }
249 dout("handle_subscribe_ack after %d seconds\n", seconds); 255 dout("handle_subscribe_ack after %d seconds\n", seconds);
@@ -266,6 +272,7 @@ int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
266 mutex_unlock(&monc->mutex); 272 mutex_unlock(&monc->mutex);
267 return 0; 273 return 0;
268} 274}
275EXPORT_SYMBOL(ceph_monc_got_mdsmap);
269 276
270int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) 277int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
271{ 278{
@@ -310,6 +317,7 @@ int ceph_monc_open_session(struct ceph_mon_client *monc)
310 mutex_unlock(&monc->mutex); 317 mutex_unlock(&monc->mutex);
311 return 0; 318 return 0;
312} 319}
320EXPORT_SYMBOL(ceph_monc_open_session);
313 321
314/* 322/*
 315 * The monitor responds with a mount ack to indicate mount success. The 323 * The monitor responds with a mount ack to indicate mount success. The
@@ -540,6 +548,7 @@ out:
540 kref_put(&req->kref, release_generic_request); 548 kref_put(&req->kref, release_generic_request);
541 return err; 549 return err;
542} 550}
551EXPORT_SYMBOL(ceph_monc_do_statfs);
543 552
544/* 553/*
545 * pool ops 554 * pool ops
@@ -651,6 +660,7 @@ int ceph_monc_create_snapid(struct ceph_mon_client *monc,
651 pool, 0, (char *)snapid, sizeof(*snapid)); 660 pool, 0, (char *)snapid, sizeof(*snapid));
652 661
653} 662}
663EXPORT_SYMBOL(ceph_monc_create_snapid);
654 664
655int ceph_monc_delete_snapid(struct ceph_mon_client *monc, 665int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
656 u32 pool, u64 snapid) 666 u32 pool, u64 snapid)
@@ -708,9 +718,9 @@ static void delayed_work(struct work_struct *work)
708 */ 718 */
709static int build_initial_monmap(struct ceph_mon_client *monc) 719static int build_initial_monmap(struct ceph_mon_client *monc)
710{ 720{
711 struct ceph_mount_args *args = monc->client->mount_args; 721 struct ceph_options *opt = monc->client->options;
712 struct ceph_entity_addr *mon_addr = args->mon_addr; 722 struct ceph_entity_addr *mon_addr = opt->mon_addr;
713 int num_mon = args->num_mon; 723 int num_mon = opt->num_mon;
714 int i; 724 int i;
715 725
716 /* build initial monmap */ 726 /* build initial monmap */
@@ -728,11 +738,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
728 } 738 }
729 monc->monmap->num_mon = num_mon; 739 monc->monmap->num_mon = num_mon;
730 monc->have_fsid = false; 740 monc->have_fsid = false;
731
732 /* release addr memory */
733 kfree(args->mon_addr);
734 args->mon_addr = NULL;
735 args->num_mon = 0;
736 return 0; 741 return 0;
737} 742}
738 743
@@ -753,8 +758,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
753 monc->con = NULL; 758 monc->con = NULL;
754 759
755 /* authentication */ 760 /* authentication */
756 monc->auth = ceph_auth_init(cl->mount_args->name, 761 monc->auth = ceph_auth_init(cl->options->name,
757 cl->mount_args->secret); 762 cl->options->secret);
758 if (IS_ERR(monc->auth)) 763 if (IS_ERR(monc->auth))
759 return PTR_ERR(monc->auth); 764 return PTR_ERR(monc->auth);
760 monc->auth->want_keys = 765 monc->auth->want_keys =
@@ -808,6 +813,7 @@ out_monmap:
808out: 813out:
809 return err; 814 return err;
810} 815}
816EXPORT_SYMBOL(ceph_monc_init);
811 817
812void ceph_monc_stop(struct ceph_mon_client *monc) 818void ceph_monc_stop(struct ceph_mon_client *monc)
813{ 819{
@@ -832,6 +838,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
832 838
833 kfree(monc->monmap); 839 kfree(monc->monmap);
834} 840}
841EXPORT_SYMBOL(ceph_monc_stop);
835 842
836static void handle_auth_reply(struct ceph_mon_client *monc, 843static void handle_auth_reply(struct ceph_mon_client *monc,
837 struct ceph_msg *msg) 844 struct ceph_msg *msg)
@@ -889,6 +896,7 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc)
889 mutex_unlock(&monc->mutex); 896 mutex_unlock(&monc->mutex);
890 return ret; 897 return ret;
891} 898}
899EXPORT_SYMBOL(ceph_monc_validate_auth);
892 900
893/* 901/*
894 * handle incoming message 902 * handle incoming message
@@ -922,15 +930,16 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
922 ceph_monc_handle_map(monc, msg); 930 ceph_monc_handle_map(monc, msg);
923 break; 931 break;
924 932
925 case CEPH_MSG_MDS_MAP:
926 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
927 break;
928
929 case CEPH_MSG_OSD_MAP: 933 case CEPH_MSG_OSD_MAP:
930 ceph_osdc_handle_map(&monc->client->osdc, msg); 934 ceph_osdc_handle_map(&monc->client->osdc, msg);
931 break; 935 break;
932 936
933 default: 937 default:
938 /* can the chained handler handle it? */
939 if (monc->client->extra_mon_dispatch &&
940 monc->client->extra_mon_dispatch(monc->client, msg) == 0)
941 break;
942
934 pr_err("received unknown message type %d %s\n", type, 943 pr_err("received unknown message type %d %s\n", type,
935 ceph_msg_type_name(type)); 944 ceph_msg_type_name(type));
936 } 945 }
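With CEPH_MSG_MDS_MAP handling dropped from the generic dispatcher, unrecognized messages are first offered to the client's optional extra_mon_dispatch hook; returning 0 marks the message as consumed, anything else falls through to the pr_err() above. A minimal sketch of a filesystem-side hook (fs_handle_mdsmap() is a hypothetical helper standing in for the fs client's real handler):

    static int fs_extra_mon_dispatch(struct ceph_client *client,
                                     struct ceph_msg *msg)
    {
            int type = le16_to_cpu(msg->hdr.type);

            switch (type) {
            case CEPH_MSG_MDS_MAP:
                    fs_handle_mdsmap(client, msg);  /* hypothetical fs handler */
                    return 0;                       /* handled */
            default:
                    return -1;                      /* not ours, let net/ceph report it */
            }
    }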
@@ -994,7 +1003,7 @@ static void mon_fault(struct ceph_connection *con)
994 if (monc->con && !monc->hunting) 1003 if (monc->con && !monc->hunting)
995 pr_info("mon%d %s session lost, " 1004 pr_info("mon%d %s session lost, "
996 "hunting for new mon\n", monc->cur_mon, 1005 "hunting for new mon\n", monc->cur_mon,
997 pr_addr(&monc->con->peer_addr.in_addr)); 1006 ceph_pr_addr(&monc->con->peer_addr.in_addr));
998 1007
999 __close_session(monc); 1008 __close_session(monc);
1000 if (!monc->hunting) { 1009 if (!monc->hunting) {
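The EXPORT_SYMBOLs added throughout this file make the monitor client usable from any module that owns a ceph_client. A minimal bring-up sketch using only the calls shown here, assuming cl has already been set up by the caller:

    static int monc_bringup_example(struct ceph_client *cl,
                                    struct ceph_mon_client *monc)
    {
            int err;

            err = ceph_monc_init(monc, cl);
            if (err)
                    return err;

            err = ceph_monc_open_session(monc);
            if (err)
                    ceph_monc_stop(monc);
            return err;
    }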
diff --git a/fs/ceph/msgpool.c b/net/ceph/msgpool.c
index dd65a6438131..d5f2d97ac05c 100644
--- a/fs/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -1,11 +1,11 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/err.h> 3#include <linux/err.h>
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/vmalloc.h> 6#include <linux/vmalloc.h>
7 7
8#include "msgpool.h" 8#include <linux/ceph/msgpool.h>
9 9
10static void *alloc_fn(gfp_t gfp_mask, void *arg) 10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11{ 11{
diff --git a/fs/ceph/osd_client.c b/net/ceph/osd_client.c
index 3b5571b8ce22..79391994b3ed 100644
--- a/fs/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1,17 +1,22 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h>
3#include <linux/err.h> 4#include <linux/err.h>
4#include <linux/highmem.h> 5#include <linux/highmem.h>
5#include <linux/mm.h> 6#include <linux/mm.h>
6#include <linux/pagemap.h> 7#include <linux/pagemap.h>
7#include <linux/slab.h> 8#include <linux/slab.h>
8#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#ifdef CONFIG_BLOCK
11#include <linux/bio.h>
12#endif
9 13
10#include "super.h" 14#include <linux/ceph/libceph.h>
11#include "osd_client.h" 15#include <linux/ceph/osd_client.h>
12#include "messenger.h" 16#include <linux/ceph/messenger.h>
13#include "decode.h" 17#include <linux/ceph/decode.h>
14#include "auth.h" 18#include <linux/ceph/auth.h>
19#include <linux/ceph/pagelist.h>
15 20
16#define OSD_OP_FRONT_LEN 4096 21#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512 22#define OSD_OPREPLY_FRONT_LEN 512
@@ -22,6 +27,59 @@ static int __kick_requests(struct ceph_osd_client *osdc,
22 27
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); 28static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24 29
30static int op_needs_trail(int op)
31{
32 switch (op) {
33 case CEPH_OSD_OP_GETXATTR:
34 case CEPH_OSD_OP_SETXATTR:
35 case CEPH_OSD_OP_CMPXATTR:
36 case CEPH_OSD_OP_CALL:
37 return 1;
38 default:
39 return 0;
40 }
41}
42
43static int op_has_extent(int op)
44{
45 return (op == CEPH_OSD_OP_READ ||
46 op == CEPH_OSD_OP_WRITE);
47}
48
49void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
50 struct ceph_file_layout *layout,
51 u64 snapid,
52 u64 off, u64 *plen, u64 *bno,
53 struct ceph_osd_request *req,
54 struct ceph_osd_req_op *op)
55{
56 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59
60 reqhead->snapid = cpu_to_le64(snapid);
61
62 /* object extent? */
63 ceph_calc_file_object_mapping(layout, off, plen, bno,
64 &objoff, &objlen);
65 if (*plen < orig_len)
66 dout(" skipping last %llu, final file extent %llu~%llu\n",
67 orig_len - *plen, off, *plen);
68
69 if (op_has_extent(op->op)) {
70 op->extent.offset = objoff;
71 op->extent.length = objlen;
72 }
73 req->r_num_pages = calc_pages_for(off, *plen);
74 if (op->op == CEPH_OSD_OP_WRITE)
75 op->payload_len = *plen;
76
77 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
78 *bno, objoff, objlen, req->r_num_pages);
79
80}
81EXPORT_SYMBOL(ceph_calc_raw_layout);
82
25/* 83/*
26 * Implement client access to distributed object storage cluster. 84 * Implement client access to distributed object storage cluster.
27 * 85 *
@@ -48,34 +106,19 @@ static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
48 * fill osd op in request message. 106 * fill osd op in request message.
49 */ 107 */
50static void calc_layout(struct ceph_osd_client *osdc, 108static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout, 109 struct ceph_vino vino,
110 struct ceph_file_layout *layout,
52 u64 off, u64 *plen, 111 u64 off, u64 *plen,
53 struct ceph_osd_request *req) 112 struct ceph_osd_request *req,
113 struct ceph_osd_req_op *op)
54{ 114{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno; 115 u64 bno;
60 116
61 reqhead->snapid = cpu_to_le64(vino.snap); 117 ceph_calc_raw_layout(osdc, layout, vino.snap, off,
62 118 plen, &bno, req, op);
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69 119
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno); 120 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid); 121 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79} 122}
80 123
81/* 124/*
@@ -101,56 +144,66 @@ void ceph_osdc_release_request(struct kref *kref)
101 if (req->r_own_pages) 144 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages, 145 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages); 146 req->r_num_pages);
147#ifdef CONFIG_BLOCK
148 if (req->r_bio)
149 bio_put(req->r_bio);
150#endif
104 ceph_put_snap_context(req->r_snapc); 151 ceph_put_snap_context(req->r_snapc);
152 if (req->r_trail) {
153 ceph_pagelist_release(req->r_trail);
154 kfree(req->r_trail);
155 }
105 if (req->r_mempool) 156 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool); 157 mempool_free(req, req->r_osdc->req_mempool);
107 else 158 else
108 kfree(req); 159 kfree(req);
109} 160}
161EXPORT_SYMBOL(ceph_osdc_release_request);
110 162
111/* 163static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
112 * build new request AND message, calculate layout, and adjust file 164{
113 * extent as needed. 165 int i = 0;
114 * 166
115 * if the file was recently truncated, we include information about its 167 if (needs_trail)
116 * old and new size so that the object can be updated appropriately. (we 168 *needs_trail = 0;
117 * avoid synchronously deleting truncated objects because it's slow.) 169 while (ops[i].op) {
118 * 170 if (needs_trail && op_needs_trail(ops[i].op))
119 * if @do_sync, include a 'startsync' command so that the osd will flush 171 *needs_trail = 1;
120 * data quickly. 172 i++;
121 */ 173 }
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, 174
123 struct ceph_file_layout *layout, 175 return i;
124 struct ceph_vino vino, 176}
125 u64 off, u64 *plen, 177
126 int opcode, int flags, 178struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
179 int flags,
127 struct ceph_snap_context *snapc, 180 struct ceph_snap_context *snapc,
128 int do_sync, 181 struct ceph_osd_req_op *ops,
129 u32 truncate_seq, 182 bool use_mempool,
130 u64 truncate_size, 183 gfp_t gfp_flags,
131 struct timespec *mtime, 184 struct page **pages,
132 bool use_mempool, int num_reply) 185 struct bio *bio)
133{ 186{
134 struct ceph_osd_request *req; 187 struct ceph_osd_request *req;
135 struct ceph_msg *msg; 188 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head; 189 int needs_trail;
137 struct ceph_osd_op *op; 190 int num_op = get_num_ops(ops, &needs_trail);
138 void *p; 191 size_t msg_size = sizeof(struct ceph_osd_request_head);
139 int num_op = 1 + do_sync; 192
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op); 193 msg_size += num_op*sizeof(struct ceph_osd_op);
141 int i;
142 194
143 if (use_mempool) { 195 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS); 196 req = mempool_alloc(osdc->req_mempool, gfp_flags);
145 memset(req, 0, sizeof(*req)); 197 memset(req, 0, sizeof(*req));
146 } else { 198 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS); 199 req = kzalloc(sizeof(*req), gfp_flags);
148 } 200 }
149 if (req == NULL) 201 if (req == NULL)
150 return NULL; 202 return NULL;
151 203
152 req->r_osdc = osdc; 204 req->r_osdc = osdc;
153 req->r_mempool = use_mempool; 205 req->r_mempool = use_mempool;
206
154 kref_init(&req->r_kref); 207 kref_init(&req->r_kref);
155 init_completion(&req->r_completion); 208 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion); 209 init_completion(&req->r_safe_completion);
@@ -164,13 +217,22 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 217 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else 218 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 219 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS); 220 OSD_OPREPLY_FRONT_LEN, gfp_flags);
168 if (!msg) { 221 if (!msg) {
169 ceph_osdc_put_request(req); 222 ceph_osdc_put_request(req);
170 return NULL; 223 return NULL;
171 } 224 }
172 req->r_reply = msg; 225 req->r_reply = msg;
173 226
227 /* allocate space for the trailing data */
228 if (needs_trail) {
229 req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
230 if (!req->r_trail) {
231 ceph_osdc_put_request(req);
232 return NULL;
233 }
234 ceph_pagelist_init(req->r_trail);
235 }
174 /* create request message; allow space for oid */ 236 /* create request message; allow space for oid */
175 msg_size += 40; 237 msg_size += 40;
176 if (snapc) 238 if (snapc)
@@ -178,18 +240,115 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
178 if (use_mempool) 240 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 241 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else 242 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS); 243 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
182 if (!msg) { 244 if (!msg) {
183 ceph_osdc_put_request(req); 245 ceph_osdc_put_request(req);
184 return NULL; 246 return NULL;
185 } 247 }
248
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); 249 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len); 250 memset(msg->front.iov_base, 0, msg->front.iov_len);
251
252 req->r_request = msg;
253 req->r_pages = pages;
254#ifdef CONFIG_BLOCK
255 if (bio) {
256 req->r_bio = bio;
257 bio_get(req->r_bio);
258 }
259#endif
260
261 return req;
262}
263EXPORT_SYMBOL(ceph_osdc_alloc_request);
264
265static void osd_req_encode_op(struct ceph_osd_request *req,
266 struct ceph_osd_op *dst,
267 struct ceph_osd_req_op *src)
268{
269 dst->op = cpu_to_le16(src->op);
270
271 switch (dst->op) {
272 case CEPH_OSD_OP_READ:
273 case CEPH_OSD_OP_WRITE:
274 dst->extent.offset =
275 cpu_to_le64(src->extent.offset);
276 dst->extent.length =
277 cpu_to_le64(src->extent.length);
278 dst->extent.truncate_size =
279 cpu_to_le64(src->extent.truncate_size);
280 dst->extent.truncate_seq =
281 cpu_to_le32(src->extent.truncate_seq);
282 break;
283
284 case CEPH_OSD_OP_GETXATTR:
285 case CEPH_OSD_OP_SETXATTR:
286 case CEPH_OSD_OP_CMPXATTR:
287 BUG_ON(!req->r_trail);
288
289 dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
290 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
291 dst->xattr.cmp_op = src->xattr.cmp_op;
292 dst->xattr.cmp_mode = src->xattr.cmp_mode;
293 ceph_pagelist_append(req->r_trail, src->xattr.name,
294 src->xattr.name_len);
295 ceph_pagelist_append(req->r_trail, src->xattr.val,
296 src->xattr.value_len);
297 break;
298 case CEPH_OSD_OP_CALL:
299 BUG_ON(!req->r_trail);
300
301 dst->cls.class_len = src->cls.class_len;
302 dst->cls.method_len = src->cls.method_len;
303 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
304
305 ceph_pagelist_append(req->r_trail, src->cls.class_name,
306 src->cls.class_len);
307 ceph_pagelist_append(req->r_trail, src->cls.method_name,
308 src->cls.method_len);
309 ceph_pagelist_append(req->r_trail, src->cls.indata,
310 src->cls.indata_len);
311 break;
312 case CEPH_OSD_OP_ROLLBACK:
313 dst->snap.snapid = cpu_to_le64(src->snap.snapid);
314 break;
315 case CEPH_OSD_OP_STARTSYNC:
316 break;
317 default:
318 pr_err("unrecognized osd opcode %d\n", dst->op);
319 WARN_ON(1);
320 break;
321 }
322 dst->payload_len = cpu_to_le32(src->payload_len);
323}
324
325/*
326 * build new request AND message
327 *
328 */
329void ceph_osdc_build_request(struct ceph_osd_request *req,
330 u64 off, u64 *plen,
331 struct ceph_osd_req_op *src_ops,
332 struct ceph_snap_context *snapc,
333 struct timespec *mtime,
334 const char *oid,
335 int oid_len)
336{
337 struct ceph_msg *msg = req->r_request;
338 struct ceph_osd_request_head *head;
339 struct ceph_osd_req_op *src_op;
340 struct ceph_osd_op *op;
341 void *p;
342 int num_op = get_num_ops(src_ops, NULL);
343 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
344 int flags = req->r_flags;
345 u64 data_len = 0;
346 int i;
347
188 head = msg->front.iov_base; 348 head = msg->front.iov_base;
189 op = (void *)(head + 1); 349 op = (void *)(head + 1);
190 p = (void *)(op + num_op); 350 p = (void *)(op + num_op);
191 351
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc); 352 req->r_snapc = ceph_get_snap_context(snapc);
194 353
195 head->client_inc = cpu_to_le32(1); /* always, for now. */ 354 head->client_inc = cpu_to_le32(1); /* always, for now. */
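osd_req_encode_op() above copies xattr names/values and class-call arguments into the request trail pagelist rather than the fixed-size message front. A sketch of describing a class-method call with the fields the encoder consumes; the helper itself is illustrative, and payload accounting is left to the caller (trail bytes are added to hdr.data_len by ceph_osdc_build_request() below):

    static void fill_call_op(struct ceph_osd_req_op *ops,  /* >= 2 entries */
                             char *cls, char *method,
                             void *indata, size_t indata_len)
    {
            memset(ops, 0, 2 * sizeof(*ops));   /* ops[1].op == 0 ends the array */

            ops[0].op = CEPH_OSD_OP_CALL;
            ops[0].cls.class_name = cls;
            ops[0].cls.class_len = strlen(cls);
            ops[0].cls.method_name = method;
            ops[0].cls.method_len = strlen(method);
            ops[0].cls.indata = indata;
            ops[0].cls.indata_len = indata_len;
    }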
@@ -197,29 +356,23 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
197 if (flags & CEPH_OSD_FLAG_WRITE) 356 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime); 357 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op); 358 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201 359
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213 360
214 /* fill in oid */ 361 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len); 362 head->object_len = cpu_to_le32(oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len); 363 memcpy(p, oid, oid_len);
217 p += req->r_oid_len; 364 p += oid_len;
218 365
219 if (do_sync) { 366 src_op = src_ops;
367 while (src_op->op) {
368 osd_req_encode_op(req, op, src_op);
369 src_op++;
220 op++; 370 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 } 371 }
372
373 if (req->r_trail)
374 data_len += req->r_trail->length;
375
223 if (snapc) { 376 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq); 377 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps); 378 head->num_snaps = cpu_to_le32(snapc->num_snaps);
@@ -229,12 +382,79 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
229 } 382 }
230 } 383 }
231 384
385 if (flags & CEPH_OSD_FLAG_WRITE) {
386 req->r_request->hdr.data_off = cpu_to_le16(off);
387 req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
388 } else if (data_len) {
389 req->r_request->hdr.data_off = 0;
390 req->r_request->hdr.data_len = cpu_to_le32(data_len);
391 }
392
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len); 393 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base; 394 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size; 395 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size); 396 msg->hdr.front_len = cpu_to_le32(msg_size);
397 return;
398}
399EXPORT_SYMBOL(ceph_osdc_build_request);
400
401/*
402 * build new request AND message, calculate layout, and adjust file
403 * extent as needed.
404 *
405 * if the file was recently truncated, we include information about its
406 * old and new size so that the object can be updated appropriately. (we
407 * avoid synchronously deleting truncated objects because it's slow.)
408 *
409 * if @do_sync, include a 'startsync' command so that the osd will flush
410 * data quickly.
411 */
412struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
413 struct ceph_file_layout *layout,
414 struct ceph_vino vino,
415 u64 off, u64 *plen,
416 int opcode, int flags,
417 struct ceph_snap_context *snapc,
418 int do_sync,
419 u32 truncate_seq,
420 u64 truncate_size,
421 struct timespec *mtime,
422 bool use_mempool, int num_reply)
423{
424 struct ceph_osd_req_op ops[3];
425 struct ceph_osd_request *req;
426
427 ops[0].op = opcode;
428 ops[0].extent.truncate_seq = truncate_seq;
429 ops[0].extent.truncate_size = truncate_size;
430 ops[0].payload_len = 0;
431
432 if (do_sync) {
433 ops[1].op = CEPH_OSD_OP_STARTSYNC;
434 ops[1].payload_len = 0;
435 ops[2].op = 0;
436 } else
437 ops[1].op = 0;
438
439 req = ceph_osdc_alloc_request(osdc, flags,
440 snapc, ops,
441 use_mempool,
442 GFP_NOFS, NULL, NULL);
 443 if (!req)
 444 return NULL;
445
446 /* calculate max write size */
447 calc_layout(osdc, vino, layout, off, plen, req, ops);
448 req->r_file_layout = *layout; /* keep a copy */
449
450 ceph_osdc_build_request(req, off, plen, ops,
451 snapc,
452 mtime,
453 req->r_oid, req->r_oid_len);
454
236 return req; 455 return req;
237} 456}
457EXPORT_SYMBOL(ceph_osdc_new_request);
238 458
239/* 459/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid. 460 * We keep osd requests in an rbtree, sorted by ->r_tid.
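ceph_osdc_new_request() is now just a file-oriented wrapper over the split alloc/build API, which lets other callers supply their own op table, object name, pages or bio. A hedged sketch of that raw path (flags and naming are the caller's choice; mtime is only needed for writes, and the op array must be zero-terminated):

    static struct ceph_osd_request *raw_request_example(
                    struct ceph_osd_client *osdc,
                    struct ceph_osd_req_op *ops,    /* zero-terminated */
                    const char *oid, int flags,
                    u64 off, u64 len)
    {
            struct ceph_osd_request *req;

            req = ceph_osdc_alloc_request(osdc, flags, NULL, ops,
                                          false, GFP_NOFS, NULL, NULL);
            if (!req)
                    return NULL;

            ceph_osdc_build_request(req, off, &len, ops, NULL,
                                    NULL /* mtime: writes only */,
                                    oid, strlen(oid));
            return req;
    }

The result can then be submitted with ceph_osdc_start_request() and awaited with ceph_osdc_wait_request(), both exported further down.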
@@ -389,7 +609,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc,
389 dout("__move_osd_to_lru %p\n", osd); 609 dout("__move_osd_to_lru %p\n", osd);
390 BUG_ON(!list_empty(&osd->o_osd_lru)); 610 BUG_ON(!list_empty(&osd->o_osd_lru));
391 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); 611 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
392 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ; 612 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
393} 613}
394 614
395static void __remove_osd_from_lru(struct ceph_osd *osd) 615static void __remove_osd_from_lru(struct ceph_osd *osd)
@@ -483,7 +703,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
483static void __schedule_osd_timeout(struct ceph_osd_client *osdc) 703static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
484{ 704{
485 schedule_delayed_work(&osdc->timeout_work, 705 schedule_delayed_work(&osdc->timeout_work,
486 osdc->client->mount_args->osd_keepalive_timeout * HZ); 706 osdc->client->options->osd_keepalive_timeout * HZ);
487} 707}
488 708
489static void __cancel_osd_timeout(struct ceph_osd_client *osdc) 709static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
@@ -684,9 +904,9 @@ static void handle_timeout(struct work_struct *work)
684 container_of(work, struct ceph_osd_client, timeout_work.work); 904 container_of(work, struct ceph_osd_client, timeout_work.work);
685 struct ceph_osd_request *req, *last_req = NULL; 905 struct ceph_osd_request *req, *last_req = NULL;
686 struct ceph_osd *osd; 906 struct ceph_osd *osd;
687 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ; 907 unsigned long timeout = osdc->client->options->osd_timeout * HZ;
688 unsigned long keepalive = 908 unsigned long keepalive =
689 osdc->client->mount_args->osd_keepalive_timeout * HZ; 909 osdc->client->options->osd_keepalive_timeout * HZ;
690 unsigned long last_stamp = 0; 910 unsigned long last_stamp = 0;
691 struct rb_node *p; 911 struct rb_node *p;
692 struct list_head slow_osds; 912 struct list_head slow_osds;
@@ -773,7 +993,7 @@ static void handle_osds_timeout(struct work_struct *work)
773 container_of(work, struct ceph_osd_client, 993 container_of(work, struct ceph_osd_client,
774 osds_timeout_work.work); 994 osds_timeout_work.work);
775 unsigned long delay = 995 unsigned long delay =
776 osdc->client->mount_args->osd_idle_ttl * HZ >> 2; 996 osdc->client->options->osd_idle_ttl * HZ >> 2;
777 997
778 dout("osds timeout\n"); 998 dout("osds timeout\n");
779 down_read(&osdc->map_sem); 999 down_read(&osdc->map_sem);
@@ -1104,6 +1324,10 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1104 1324
1105 req->r_request->pages = req->r_pages; 1325 req->r_request->pages = req->r_pages;
1106 req->r_request->nr_pages = req->r_num_pages; 1326 req->r_request->nr_pages = req->r_num_pages;
1327#ifdef CONFIG_BLOCK
1328 req->r_request->bio = req->r_bio;
1329#endif
1330 req->r_request->trail = req->r_trail;
1107 1331
1108 register_request(osdc, req); 1332 register_request(osdc, req);
1109 1333
@@ -1131,6 +1355,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1131 up_read(&osdc->map_sem); 1355 up_read(&osdc->map_sem);
1132 return rc; 1356 return rc;
1133} 1357}
1358EXPORT_SYMBOL(ceph_osdc_start_request);
1134 1359
1135/* 1360/*
1136 * wait for a request to complete 1361 * wait for a request to complete
@@ -1153,6 +1378,7 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1153 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); 1378 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1154 return req->r_result; 1379 return req->r_result;
1155} 1380}
1381EXPORT_SYMBOL(ceph_osdc_wait_request);
1156 1382
1157/* 1383/*
1158 * sync - wait for all in-flight requests to flush. avoid starvation. 1384 * sync - wait for all in-flight requests to flush. avoid starvation.
@@ -1186,6 +1412,7 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
1186 mutex_unlock(&osdc->request_mutex); 1412 mutex_unlock(&osdc->request_mutex);
1187 dout("sync done (thru tid %llu)\n", last_tid); 1413 dout("sync done (thru tid %llu)\n", last_tid);
1188} 1414}
1415EXPORT_SYMBOL(ceph_osdc_sync);
1189 1416
1190/* 1417/*
1191 * init, shutdown 1418 * init, shutdown
@@ -1211,7 +1438,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1211 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); 1438 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1212 1439
1213 schedule_delayed_work(&osdc->osds_timeout_work, 1440 schedule_delayed_work(&osdc->osds_timeout_work,
1214 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ)); 1441 round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
1215 1442
1216 err = -ENOMEM; 1443 err = -ENOMEM;
1217 osdc->req_mempool = mempool_create_kmalloc_pool(10, 1444 osdc->req_mempool = mempool_create_kmalloc_pool(10,
@@ -1237,6 +1464,7 @@ out_mempool:
1237out: 1464out:
1238 return err; 1465 return err;
1239} 1466}
1467EXPORT_SYMBOL(ceph_osdc_init);
1240 1468
1241void ceph_osdc_stop(struct ceph_osd_client *osdc) 1469void ceph_osdc_stop(struct ceph_osd_client *osdc)
1242{ 1470{
@@ -1251,6 +1479,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
1251 ceph_msgpool_destroy(&osdc->msgpool_op); 1479 ceph_msgpool_destroy(&osdc->msgpool_op);
1252 ceph_msgpool_destroy(&osdc->msgpool_op_reply); 1480 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1253} 1481}
1482EXPORT_SYMBOL(ceph_osdc_stop);
1254 1483
1255/* 1484/*
1256 * Read some contiguous pages. If we cross a stripe boundary, shorten 1485 * Read some contiguous pages. If we cross a stripe boundary, shorten
@@ -1288,6 +1517,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1288 dout("readpages result %d\n", rc); 1517 dout("readpages result %d\n", rc);
1289 return rc; 1518 return rc;
1290} 1519}
1520EXPORT_SYMBOL(ceph_osdc_readpages);
1291 1521
1292/* 1522/*
1293 * do a synchronous write on N pages 1523 * do a synchronous write on N pages
@@ -1330,6 +1560,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1330 dout("writepages result %d\n", rc); 1560 dout("writepages result %d\n", rc);
1331 return rc; 1561 return rc;
1332} 1562}
1563EXPORT_SYMBOL(ceph_osdc_writepages);
1333 1564
1334/* 1565/*
1335 * handle incoming message 1566 * handle incoming message
@@ -1420,6 +1651,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1420 } 1651 }
1421 m->pages = req->r_pages; 1652 m->pages = req->r_pages;
1422 m->nr_pages = req->r_num_pages; 1653 m->nr_pages = req->r_num_pages;
1654#ifdef CONFIG_BLOCK
1655 m->bio = req->r_bio;
1656#endif
1423 } 1657 }
1424 *skip = 0; 1658 *skip = 0;
1425 req->r_con_filling_msg = ceph_con_get(con); 1659 req->r_con_filling_msg = ceph_con_get(con);
diff --git a/fs/ceph/osdmap.c b/net/ceph/osdmap.c
index e31f118f1392..d73f3f6efa36 100644
--- a/fs/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1,14 +1,15 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/module.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <asm/div64.h> 6#include <asm/div64.h>
6 7
7#include "super.h" 8#include <linux/ceph/libceph.h>
8#include "osdmap.h" 9#include <linux/ceph/osdmap.h>
9#include "crush/hash.h" 10#include <linux/ceph/decode.h>
10#include "crush/mapper.h" 11#include <linux/crush/hash.h>
11#include "decode.h" 12#include <linux/crush/mapper.h>
12 13
13char *ceph_osdmap_state_str(char *str, int len, int state) 14char *ceph_osdmap_state_str(char *str, int len, int state)
14{ 15{
@@ -417,6 +418,20 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
417 return NULL; 418 return NULL;
418} 419}
419 420
421int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
422{
423 struct rb_node *rbp;
424
425 for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
426 struct ceph_pg_pool_info *pi =
427 rb_entry(rbp, struct ceph_pg_pool_info, node);
428 if (pi->name && strcmp(pi->name, name) == 0)
429 return pi->id;
430 }
431 return -ENOENT;
432}
433EXPORT_SYMBOL(ceph_pg_poolid_by_name);
434
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) 435static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{ 436{
422 rb_erase(&pi->node, root); 437 rb_erase(&pi->node, root);
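ceph_pg_poolid_by_name() lets a module resolve a configured pool name into the numeric id the rest of the osdmap code expects. A minimal sketch (the pool name "data" is just an example value):

    static int lookup_pool_example(struct ceph_osdmap *map)
    {
            int poolid = ceph_pg_poolid_by_name(map, "data");

            if (poolid < 0)                 /* -ENOENT if no such pool */
                    pr_err("pool 'data' not found: %d\n", poolid);
            return poolid;
    }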
@@ -966,6 +981,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
966 981
967 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); 982 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
968} 983}
984EXPORT_SYMBOL(ceph_calc_file_object_mapping);
969 985
970/* 986/*
971 * calculate an object layout (i.e. pgid) from an oid, 987 * calculate an object layout (i.e. pgid) from an oid,
@@ -1011,6 +1027,7 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
1011 ol->ol_stripe_unit = fl->fl_object_stripe_unit; 1027 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
1012 return 0; 1028 return 0;
1013} 1029}
1030EXPORT_SYMBOL(ceph_calc_object_layout);
1014 1031
1015/* 1032/*
1016 * Calculate raw osd vector for the given pgid. Return pointer to osd 1033 * Calculate raw osd vector for the given pgid. Return pointer to osd
@@ -1108,3 +1125,4 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1108 return osds[i]; 1125 return osds[i];
1109 return -1; 1126 return -1;
1110} 1127}
1128EXPORT_SYMBOL(ceph_calc_pg_primary);
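With the layout helpers exported, modules outside fs/ceph can do the file-extent-to-object math themselves. A sketch of mapping a file extent onto a single object, assuming the caller already has a populated ceph_file_layout:

    static void map_extent_example(struct ceph_file_layout *layout,
                                   u64 off, u64 len)
    {
            u64 bno, objoff, objlen;

            /* len may be shortened so the extent stays inside one object */
            ceph_calc_file_object_mapping(layout, off, &len, &bno,
                                          &objoff, &objlen);
            pr_info("file %llu~%llu -> object %llx, extent %llu~%llu\n",
                    off, len, bno, objoff, objlen);
    }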
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
new file mode 100644
index 000000000000..13cb409a7bba
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,154 @@
1
2#include <linux/module.h>
3#include <linux/gfp.h>
4#include <linux/pagemap.h>
5#include <linux/highmem.h>
6#include <linux/ceph/pagelist.h>
7
8static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail) {
11 struct page *page = list_entry(pl->head.prev, struct page, lru);
12 kunmap(page);
13 pl->mapped_tail = NULL;
14 }
15}
16
17int ceph_pagelist_release(struct ceph_pagelist *pl)
18{
19 ceph_pagelist_unmap_tail(pl);
20 while (!list_empty(&pl->head)) {
21 struct page *page = list_first_entry(&pl->head, struct page,
22 lru);
23 list_del(&page->lru);
24 __free_page(page);
25 }
26 ceph_pagelist_free_reserve(pl);
27 return 0;
28}
29EXPORT_SYMBOL(ceph_pagelist_release);
30
31static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
32{
33 struct page *page;
34
35 if (!pl->num_pages_free) {
36 page = __page_cache_alloc(GFP_NOFS);
37 } else {
38 page = list_first_entry(&pl->free_list, struct page, lru);
39 list_del(&page->lru);
40 --pl->num_pages_free;
41 }
42 if (!page)
43 return -ENOMEM;
44 pl->room += PAGE_SIZE;
45 ceph_pagelist_unmap_tail(pl);
46 list_add_tail(&page->lru, &pl->head);
47 pl->mapped_tail = kmap(page);
48 return 0;
49}
50
51int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
52{
53 while (pl->room < len) {
54 size_t bit = pl->room;
55 int ret;
56
57 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
58 buf, bit);
59 pl->length += bit;
60 pl->room -= bit;
61 buf += bit;
62 len -= bit;
63 ret = ceph_pagelist_addpage(pl);
64 if (ret)
65 return ret;
66 }
67
68 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
69 pl->length += len;
70 pl->room -= len;
71 return 0;
72}
73EXPORT_SYMBOL(ceph_pagelist_append);
74
75/**
76 * Allocate enough pages for a pagelist to append the given amount
 77 * of data without allocating.
78 * Returns: 0 on success, -ENOMEM on error.
79 */
80int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
81{
82 if (space <= pl->room)
83 return 0;
84 space -= pl->room;
85 space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */
86
87 while (space > pl->num_pages_free) {
88 struct page *page = __page_cache_alloc(GFP_NOFS);
89 if (!page)
90 return -ENOMEM;
91 list_add_tail(&page->lru, &pl->free_list);
92 ++pl->num_pages_free;
93 }
94 return 0;
95}
96EXPORT_SYMBOL(ceph_pagelist_reserve);
97
98/**
99 * Free any pages that have been preallocated.
100 */
101int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
102{
103 while (!list_empty(&pl->free_list)) {
104 struct page *page = list_first_entry(&pl->free_list,
105 struct page, lru);
106 list_del(&page->lru);
107 __free_page(page);
108 --pl->num_pages_free;
109 }
110 BUG_ON(pl->num_pages_free);
111 return 0;
112}
113EXPORT_SYMBOL(ceph_pagelist_free_reserve);
114
115/**
116 * Create a truncation point.
117 */
118void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
119 struct ceph_pagelist_cursor *c)
120{
121 c->pl = pl;
122 c->page_lru = pl->head.prev;
123 c->room = pl->room;
124}
125EXPORT_SYMBOL(ceph_pagelist_set_cursor);
126
127/**
128 * Truncate a pagelist to the given point. Move extra pages to reserve.
129 * This won't sleep.
130 * Returns: 0 on success,
131 * -EINVAL if the pagelist doesn't match the trunc point pagelist
132 */
133int ceph_pagelist_truncate(struct ceph_pagelist *pl,
134 struct ceph_pagelist_cursor *c)
135{
136 struct page *page;
137
138 if (pl != c->pl)
139 return -EINVAL;
140 ceph_pagelist_unmap_tail(pl);
141 while (pl->head.prev != c->page_lru) {
142 page = list_entry(pl->head.prev, struct page, lru);
143 list_del(&page->lru); /* remove from pagelist */
144 list_add_tail(&page->lru, &pl->free_list); /* add to reserve */
145 ++pl->num_pages_free;
146 }
147 pl->room = c->room;
148 if (!list_empty(&pl->head)) {
149 page = list_entry(pl->head.prev, struct page, lru);
150 pl->mapped_tail = kmap(page);
151 }
152 return 0;
153}
154EXPORT_SYMBOL(ceph_pagelist_truncate);
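A typical pagelist user reserves room up front with ceph_pagelist_reserve(), marks a truncation point, and rolls back if a multi-part append fails midway. A minimal sketch, assuming the pagelist was already set up with ceph_pagelist_init():

    static int pagelist_example(struct ceph_pagelist *pl,
                                const void *hdr, size_t hdr_len,
                                const void *body, size_t body_len)
    {
            struct ceph_pagelist_cursor c;
            int ret;

            ceph_pagelist_set_cursor(pl, &c);           /* truncation point */

            ret = ceph_pagelist_append(pl, hdr, hdr_len);
            if (!ret)
                    ret = ceph_pagelist_append(pl, body, body_len);
            if (ret)
                    ceph_pagelist_truncate(pl, &c);     /* drop the partial record */
            return ret;
    }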
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
new file mode 100644
index 000000000000..54caf0687155
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,223 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/sched.h>
5#include <linux/slab.h>
6#include <linux/file.h>
7#include <linux/namei.h>
8#include <linux/writeback.h>
9
10#include <linux/ceph/libceph.h>
11
12/*
13 * build a vector of user pages
14 */
15struct page **ceph_get_direct_page_vector(const char __user *data,
16 int num_pages,
17 loff_t off, size_t len)
18{
19 struct page **pages;
20 int rc;
21
22 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
23 if (!pages)
24 return ERR_PTR(-ENOMEM);
25
26 down_read(&current->mm->mmap_sem);
27 rc = get_user_pages(current, current->mm, (unsigned long)data,
28 num_pages, 0, 0, pages, NULL);
29 up_read(&current->mm->mmap_sem);
30 if (rc < 0)
31 goto fail;
32 return pages;
33
34fail:
35 kfree(pages);
36 return ERR_PTR(rc);
37}
38EXPORT_SYMBOL(ceph_get_direct_page_vector);
39
40void ceph_put_page_vector(struct page **pages, int num_pages)
41{
42 int i;
43
44 for (i = 0; i < num_pages; i++)
45 put_page(pages[i]);
46 kfree(pages);
47}
48EXPORT_SYMBOL(ceph_put_page_vector);
49
50void ceph_release_page_vector(struct page **pages, int num_pages)
51{
52 int i;
53
54 for (i = 0; i < num_pages; i++)
55 __free_pages(pages[i], 0);
56 kfree(pages);
57}
58EXPORT_SYMBOL(ceph_release_page_vector);
59
60/*
 61 * allocate a vector of new pages
62 */
63struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
64{
65 struct page **pages;
66 int i;
67
68 pages = kmalloc(sizeof(*pages) * num_pages, flags);
69 if (!pages)
70 return ERR_PTR(-ENOMEM);
71 for (i = 0; i < num_pages; i++) {
72 pages[i] = __page_cache_alloc(flags);
73 if (pages[i] == NULL) {
74 ceph_release_page_vector(pages, i);
75 return ERR_PTR(-ENOMEM);
76 }
77 }
78 return pages;
79}
80EXPORT_SYMBOL(ceph_alloc_page_vector);
81
82/*
83 * copy user data into a page vector
84 */
85int ceph_copy_user_to_page_vector(struct page **pages,
86 const char __user *data,
87 loff_t off, size_t len)
88{
89 int i = 0;
90 int po = off & ~PAGE_CACHE_MASK;
91 int left = len;
92 int l, bad;
93
94 while (left > 0) {
95 l = min_t(int, PAGE_CACHE_SIZE-po, left);
96 bad = copy_from_user(page_address(pages[i]) + po, data, l);
97 if (bad == l)
98 return -EFAULT;
99 data += l - bad;
100 left -= l - bad;
101 po += l - bad;
102 if (po == PAGE_CACHE_SIZE) {
103 po = 0;
104 i++;
105 }
106 }
107 return len;
108}
109EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
110
111int ceph_copy_to_page_vector(struct page **pages,
112 const char *data,
113 loff_t off, size_t len)
114{
115 int i = 0;
116 size_t po = off & ~PAGE_CACHE_MASK;
117 size_t left = len;
118 size_t l;
119
120 while (left > 0) {
121 l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
122 memcpy(page_address(pages[i]) + po, data, l);
123 data += l;
124 left -= l;
125 po += l;
126 if (po == PAGE_CACHE_SIZE) {
127 po = 0;
128 i++;
129 }
130 }
131 return len;
132}
133EXPORT_SYMBOL(ceph_copy_to_page_vector);
134
135int ceph_copy_from_page_vector(struct page **pages,
136 char *data,
137 loff_t off, size_t len)
138{
139 int i = 0;
140 size_t po = off & ~PAGE_CACHE_MASK;
141 size_t left = len;
142 size_t l;
143
144 while (left > 0) {
145 l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
146 memcpy(data, page_address(pages[i]) + po, l);
147 data += l;
148 left -= l;
149 po += l;
150 if (po == PAGE_CACHE_SIZE) {
151 po = 0;
152 i++;
153 }
154 }
155 return len;
156}
157EXPORT_SYMBOL(ceph_copy_from_page_vector);
158
159/*
160 * copy user data from a page vector into a user pointer
161 */
162int ceph_copy_page_vector_to_user(struct page **pages,
163 char __user *data,
164 loff_t off, size_t len)
165{
166 int i = 0;
167 int po = off & ~PAGE_CACHE_MASK;
168 int left = len;
169 int l, bad;
170
171 while (left > 0) {
172 l = min_t(int, left, PAGE_CACHE_SIZE-po);
173 bad = copy_to_user(data, page_address(pages[i]) + po, l);
174 if (bad == l)
175 return -EFAULT;
176 data += l - bad;
177 left -= l - bad;
178 if (po) {
179 po += l - bad;
180 if (po == PAGE_CACHE_SIZE)
181 po = 0;
182 }
183 i++;
184 }
185 return len;
186}
187EXPORT_SYMBOL(ceph_copy_page_vector_to_user);
188
189/*
190 * Zero an extent within a page vector. Offset is relative to the
191 * start of the first page.
192 */
193void ceph_zero_page_vector_range(int off, int len, struct page **pages)
194{
195 int i = off >> PAGE_CACHE_SHIFT;
196
197 off &= ~PAGE_CACHE_MASK;
198
199 dout("zero_page_vector_page %u~%u\n", off, len);
200
201 /* leading partial page? */
202 if (off) {
203 int end = min((int)PAGE_CACHE_SIZE, off + len);
204 dout("zeroing %d %p head from %d\n", i, pages[i],
205 (int)off);
206 zero_user_segment(pages[i], off, end);
207 len -= (end - off);
208 i++;
209 }
210 while (len >= PAGE_CACHE_SIZE) {
211 dout("zeroing %d %p len=%d\n", i, pages[i], len);
212 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
213 len -= PAGE_CACHE_SIZE;
214 i++;
215 }
216 /* trailing partial page? */
217 if (len) {
218 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
219 zero_user_segment(pages[i], 0, len);
220 }
221}
222EXPORT_SYMBOL(ceph_zero_page_vector_range);
223
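The pagevec helpers cover the usual allocate/copy/release pattern for request data. A short sketch, where num_pages is assumed to come from calc_pages_for() or an equivalent calculation by the caller:

    static int copy_user_example(const char __user *data, size_t len,
                                 int num_pages)
    {
            struct page **pages;
            int ret;

            pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
            if (IS_ERR(pages))
                    return PTR_ERR(pages);

            ret = ceph_copy_user_to_page_vector(pages, data, 0, len);
            ceph_release_page_vector(pages, num_pages);
            return ret < 0 ? ret : 0;
    }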