aboutsummaryrefslogtreecommitdiffstats
path: root/fs/exofs
diff options
context:
space:
mode:
authorBoaz Harrosh <bharrosh@panasas.com>2009-11-16 09:03:05 -0500
committerBoaz Harrosh <bharrosh@panasas.com>2009-12-10 02:59:23 -0500
commit04dc1e88ad9c9f9639019e9646a89ce0ebf706bb (patch)
tree403206d1e85e9e487d847694cbe0ecf111b3f02b /fs/exofs
parent06886a5a3dc5a5abe0a4d257c26317bde7047be8 (diff)
exofs: Multi-device mirror support
This patch changes the on-disk format; it is accompanied by a parallel patch to mkfs.exofs that enables multi-device capabilities. After this patch, an old exofs will refuse to mount a newly formatted FS and a new exofs will refuse an old format. This is done by moving the magic field offset inside the FSCB. A new FSCB *version* field was added. In the future, exofs will refuse to mount an unmatched FSCB version. To upgrade or downgrade an exofs one must use the mkfs.exofs --upgrade option before mounting. Introduced a new object that contains a *device-table*. This object contains the default *data-map* and a linear array of device information, which identifies the devices used in the filesystem. This object is only written to offline by mkfs.exofs. This is why it is kept separate from the FSCB, since the latter is written to while mounted. The same partition number and the same object number are used on all devices; only the device varies. * Define the new format, then load the device table at mount time and make sure everything is supported. * Change the I/O engine to now support mirror IO, i.e. write the same data to multiple devices and read from a random device to spread the read load from multiple clients (TODO: stripe read). Implementation notes: A few points introduced in the previous patch should be mentioned here: * Special care was taken so that absolutely all operations that have any chance of failing are done before any osd-request is executed. This is to limit the need for data-consistency recovery to real IO errors only. * Each IO state has a kref. It starts at 1; any osd-request executed will increment the kref, and finally, when all are executed, the first ref is dropped. At IO-done, each request completion decrements the kref; the last one to return executes the internal _last_io() routine. _last_io() will call the registered io_state_done.
In sync mode the caller does not supply a done method, indicating a synchronous request; the caller is put to sleep and a special io_state_done is registered that will awaken the caller. Even in sync mode, though, all operations are executed in parallel. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Diffstat (limited to 'fs/exofs')
-rw-r--r--fs/exofs/common.h63
-rw-r--r--fs/exofs/exofs.h12
-rw-r--r--fs/exofs/inode.c5
-rw-r--r--fs/exofs/ios.c38
-rw-r--r--fs/exofs/pnfs.h51
-rw-r--r--fs/exofs/super.c220
6 files changed, 361 insertions, 28 deletions
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index ce1c71692599..b1b178e61718 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -49,6 +49,7 @@
49#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */ 49#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
50#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */ 50#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
51#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */ 51#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
52#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
52#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ 53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
53 54
54/* exofs Application specific page/attribute */ 55/* exofs Application specific page/attribute */
@@ -78,17 +79,67 @@ enum {
78#define EXOFS_SUPER_MAGIC 0x5DF5 79#define EXOFS_SUPER_MAGIC 0x5DF5
79 80
80/* 81/*
81 * The file system control block - stored in an object's data (mainly, the one 82 * The file system control block - stored in object EXOFS_SUPER_ID's data.
82 * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored 83 * This is where the in-memory superblock is stored on disk.
83 * on disk. Right now it just has a magic value, which is basically a sanity
84 * check on our ability to communicate with the object store.
85 */ 84 */
85enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
86struct exofs_fscb { 86struct exofs_fscb {
87 __le64 s_nextid; /* Highest object ID used */ 87 __le64 s_nextid; /* Highest object ID used */
88 __le32 s_numfiles; /* Number of files on fs */ 88 __le64 s_numfiles; /* Number of files on fs */
89 __le32 s_version; /* == EXOFS_FSCB_VER */
89 __le16 s_magic; /* Magic signature */ 90 __le16 s_magic; /* Magic signature */
90 __le16 s_newfs; /* Non-zero if this is a new fs */ 91 __le16 s_newfs; /* Non-zero if this is a new fs */
91}; 92
93 /* From here on it's a static part, only written by mkexofs */
94 __le64 s_dev_table_oid; /* Resurved, not used */
95 __le64 s_dev_table_count; /* == 0 means no dev_table */
96} __packed;
97
98/*
99 * Describes the raid used in the FS. It is part of the device table.
100 * This here is taken from the pNFS-objects definition. In exofs we
101 * use one raid policy through-out the filesystem. (NOTE: the funny
102 * alignment at begining. We take care of it at exofs_device_table.
103 */
104struct exofs_dt_data_map {
105 __le32 cb_num_comps;
106 __le64 cb_stripe_unit;
107 __le32 cb_group_width;
108 __le32 cb_group_depth;
109 __le32 cb_mirror_cnt;
110 __le32 cb_raid_algorithm;
111} __packed;
112
113/*
114 * This is an osd device information descriptor. It is a single entry in
115 * the exofs device table. It describes an osd target lun which
116 * contains data belonging to this FS. (Same partition_id on all devices)
117 */
118struct exofs_dt_device_info {
119 __le32 systemid_len;
120 u8 systemid[OSD_SYSTEMID_LEN];
121 __le64 long_name_offset; /* If !0 then offset-in-file */
122 __le32 osdname_len; /* */
123 u8 osdname[44]; /* Embbeded, Ususally an asci uuid */
124} __packed;
125
126/*
127 * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
128 * It contains the raid used for this multy-device FS and an array of
129 * participating devices.
130 */
131struct exofs_device_table {
132 __le32 dt_version; /* == EXOFS_DT_VER */
133 struct exofs_dt_data_map dt_data_map; /* Raid policy to use */
134
135 /* Reserved space for future use. Total including this:
136 * (8 * sizeof(le64))
137 */
138 __le64 __Resurved[4];
139
140 __le64 dt_num_devices; /* Array size */
141 struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */
142} __packed;
92 143
93/**************************************************************************** 144/****************************************************************************
94 * inode-related things 145 * inode-related things
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 2e08859a89e8..c35fd4623986 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -37,6 +37,11 @@
37#include <linux/time.h> 37#include <linux/time.h>
38#include "common.h" 38#include "common.h"
39 39
40/* FIXME: Remove once pnfs hits mainline
41 * #include <linux/exportfs/pnfs_osd_xdr.h>
42 */
43#include "pnfs.h"
44
40#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) 45#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
41 46
42#ifdef CONFIG_EXOFS_DEBUG 47#ifdef CONFIG_EXOFS_DEBUG
@@ -54,7 +59,6 @@
54 * our extension to the in-memory superblock 59 * our extension to the in-memory superblock
55 */ 60 */
56struct exofs_sb_info { 61struct exofs_sb_info {
57 struct osd_dev *s_dev; /* returned by get_osd_dev */
58 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ 62 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/
59 osd_id s_pid; /* partition ID of file system*/ 63 osd_id s_pid; /* partition ID of file system*/
60 int s_timeout; /* timeout for OSD operations */ 64 int s_timeout; /* timeout for OSD operations */
@@ -63,7 +67,11 @@ struct exofs_sb_info {
63 spinlock_t s_next_gen_lock; /* spinlock for gen # update */ 67 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
64 u32 s_next_generation; /* next gen # to use */ 68 u32 s_next_generation; /* next gen # to use */
65 atomic_t s_curr_pending; /* number of pending commands */ 69 atomic_t s_curr_pending; /* number of pending commands */
66 uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */ 70 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
71
72 struct pnfs_osd_data_map data_map; /* Default raid to use */
73 unsigned s_numdevs; /* Num of devices in array */
74 struct osd_dev *s_ods[1]; /* Variable length, minimum 1 */
67}; 75};
68 76
69/* 77/*
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 7578950fd135..698a8636d39c 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -62,7 +62,10 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
62 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 62 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
63 63
64 pcol->sbi = sbi; 64 pcol->sbi = sbi;
65 pcol->req_q = osd_request_queue(sbi->s_dev); 65 /* Create master bios on first Q, later on cloning, each clone will be
66 * allocated on its destination Q
67 */
68 pcol->req_q = osd_request_queue(sbi->s_ods[0]);
66 pcol->inode = inode; 69 pcol->inode = inode;
67 pcol->expected_pages = expected_pages; 70 pcol->expected_pages = expected_pages;
68 71
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index bb2f9d341fdf..5bad01fa1f9f 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -71,7 +71,7 @@ int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
71 /*TODO: Maybe use kmem_cach per sbi of size 71 /*TODO: Maybe use kmem_cach per sbi of size
72 * exofs_io_state_size(sbi->s_numdevs) 72 * exofs_io_state_size(sbi->s_numdevs)
73 */ 73 */
74 ios = kzalloc(exofs_io_state_size(1), GFP_KERNEL); 74 ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
75 if (unlikely(!ios)) { 75 if (unlikely(!ios)) {
76 *pios = NULL; 76 *pios = NULL;
77 return -ENOMEM; 77 return -ENOMEM;
@@ -209,10 +209,10 @@ int exofs_sbi_create(struct exofs_io_state *ios)
209{ 209{
210 int i, ret; 210 int i, ret;
211 211
212 for (i = 0; i < 1; i++) { 212 for (i = 0; i < ios->sbi->s_numdevs; i++) {
213 struct osd_request *or; 213 struct osd_request *or;
214 214
215 or = osd_start_request(ios->sbi->s_dev, GFP_KERNEL); 215 or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
216 if (unlikely(!or)) { 216 if (unlikely(!or)) {
217 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 217 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
218 ret = -ENOMEM; 218 ret = -ENOMEM;
@@ -233,10 +233,10 @@ int exofs_sbi_remove(struct exofs_io_state *ios)
233{ 233{
234 int i, ret; 234 int i, ret;
235 235
236 for (i = 0; i < 1; i++) { 236 for (i = 0; i < ios->sbi->s_numdevs; i++) {
237 struct osd_request *or; 237 struct osd_request *or;
238 238
239 or = osd_start_request(ios->sbi->s_dev, GFP_KERNEL); 239 or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
240 if (unlikely(!or)) { 240 if (unlikely(!or)) {
241 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 241 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
242 ret = -ENOMEM; 242 ret = -ENOMEM;
@@ -257,10 +257,10 @@ int exofs_sbi_write(struct exofs_io_state *ios)
257{ 257{
258 int i, ret; 258 int i, ret;
259 259
260 for (i = 0; i < 1; i++) { 260 for (i = 0; i < ios->sbi->s_numdevs; i++) {
261 struct osd_request *or; 261 struct osd_request *or;
262 262
263 or = osd_start_request(ios->sbi->s_dev, GFP_KERNEL); 263 or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
264 if (unlikely(!or)) { 264 if (unlikely(!or)) {
265 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 265 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
266 ret = -ENOMEM; 266 ret = -ENOMEM;
@@ -272,7 +272,21 @@ int exofs_sbi_write(struct exofs_io_state *ios)
272 if (ios->bio) { 272 if (ios->bio) {
273 struct bio *bio; 273 struct bio *bio;
274 274
275 bio = ios->bio; 275 if (i != 0) {
276 bio = bio_kmalloc(GFP_KERNEL,
277 ios->bio->bi_max_vecs);
278 if (unlikely(!bio)) {
279 ret = -ENOMEM;
280 goto out;
281 }
282
283 __bio_clone(bio, ios->bio);
284 bio->bi_bdev = NULL;
285 bio->bi_next = NULL;
286 ios->per_dev[i].bio = bio;
287 } else {
288 bio = ios->bio;
289 }
276 290
277 osd_req_write(or, &ios->obj, ios->offset, bio, 291 osd_req_write(or, &ios->obj, ios->offset, bio,
278 ios->length); 292 ios->length);
@@ -306,8 +320,10 @@ int exofs_sbi_read(struct exofs_io_state *ios)
306 320
307 for (i = 0; i < 1; i++) { 321 for (i = 0; i < 1; i++) {
308 struct osd_request *or; 322 struct osd_request *or;
323 unsigned first_dev = (unsigned)ios->obj.id;
309 324
310 or = osd_start_request(ios->sbi->s_dev, GFP_KERNEL); 325 first_dev %= ios->sbi->s_numdevs;
326 or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL);
311 if (unlikely(!or)) { 327 if (unlikely(!or)) {
312 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 328 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
313 ret = -ENOMEM; 329 ret = -ENOMEM;
@@ -382,10 +398,10 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
382 attr = g_attr_logical_length; 398 attr = g_attr_logical_length;
383 attr.val_ptr = &newsize; 399 attr.val_ptr = &newsize;
384 400
385 for (i = 0; i < 1; i++) { 401 for (i = 0; i < sbi->s_numdevs; i++) {
386 struct osd_request *or; 402 struct osd_request *or;
387 403
388 or = osd_start_request(sbi->s_dev, GFP_KERNEL); 404 or = osd_start_request(sbi->s_ods[i], GFP_KERNEL);
389 if (unlikely(!or)) { 405 if (unlikely(!or)) {
390 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 406 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
391 ret = -ENOMEM; 407 ret = -ENOMEM;
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
new file mode 100644
index 000000000000..423033addd1f
--- /dev/null
+++ b/fs/exofs/pnfs.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright (C) 2008, 2009
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of exofs.
6 *
7 * exofs is free software; you can redistribute it and/or modify it under the
8 * terms of the GNU General Public License version 2 as published by the Free
9 * Software Foundation.
10 *
11 */
12
13/* FIXME: Remove this file once pnfs hits mainline */
14
15#ifndef __EXOFS_PNFS_H__
16#define __EXOFS_PNFS_H__
17
18#if defined(CONFIG_PNFS)
19
20
21/* FIXME: move this file to: linux/exportfs/pnfs_osd_xdr.h */
22#include "../nfs/objlayout/pnfs_osd_xdr.h"
23
24#else /* defined(CONFIG_PNFS) */
25
26enum pnfs_iomode {
27 IOMODE_READ = 1,
28 IOMODE_RW = 2,
29 IOMODE_ANY = 3,
30};
31
32/* Layout Structure */
33enum pnfs_osd_raid_algorithm4 {
34 PNFS_OSD_RAID_0 = 1,
35 PNFS_OSD_RAID_4 = 2,
36 PNFS_OSD_RAID_5 = 3,
37 PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
38};
39
40struct pnfs_osd_data_map {
41 u32 odm_num_comps;
42 u64 odm_stripe_unit;
43 u32 odm_group_width;
44 u32 odm_group_depth;
45 u32 odm_mirror_cnt;
46 u32 odm_raid_algorithm;
47};
48
49#endif /* else defined(CONFIG_PNFS) */
50
51#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 4cd97f526d49..a1d1e77b12eb 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -214,12 +214,17 @@ int exofs_sync_fs(struct super_block *sb, int wait)
214 if (ret) 214 if (ret)
215 goto out; 215 goto out;
216 216
217 ios->length = sizeof(*fscb); 217 /* Note: We only write the changing part of the fscb. .i.e upto the
218 * fscb->s_dev_table_oid member. There is no read-modify-write
219 * here.
220 */
221 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
218 memset(fscb, 0, ios->length); 222 memset(fscb, 0, ios->length);
219 fscb->s_nextid = cpu_to_le64(sbi->s_nextid); 223 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
220 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); 224 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
221 fscb->s_magic = cpu_to_le16(sb->s_magic); 225 fscb->s_magic = cpu_to_le16(sb->s_magic);
222 fscb->s_newfs = 0; 226 fscb->s_newfs = 0;
227 fscb->s_version = EXOFS_FSCB_VER;
223 228
224 ios->obj.id = EXOFS_SUPER_ID; 229 ios->obj.id = EXOFS_SUPER_ID;
225 ios->offset = 0; 230 ios->offset = 0;
@@ -257,6 +262,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
257 msg, dev_path ?: "", odi->osdname, _LLU(pid)); 262 msg, dev_path ?: "", odi->osdname, _LLU(pid));
258} 263}
259 264
265void exofs_free_sbi(struct exofs_sb_info *sbi)
266{
267 while (sbi->s_numdevs) {
268 int i = --sbi->s_numdevs;
269 struct osd_dev *od = sbi->s_ods[i];
270
271 if (od) {
272 sbi->s_ods[i] = NULL;
273 osduld_put_device(od);
274 }
275 }
276 kfree(sbi);
277}
278
260/* 279/*
261 * This function is called when the vfs is freeing the superblock. We just 280 * This function is called when the vfs is freeing the superblock. We just
262 * need to free our own part. 281 * need to free our own part.
@@ -279,12 +298,182 @@ static void exofs_put_super(struct super_block *sb)
279 msecs_to_jiffies(100)); 298 msecs_to_jiffies(100));
280 } 299 }
281 300
282 _exofs_print_device("Unmounting", NULL, sbi->s_dev, sbi->s_pid); 301 _exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid);
283 osduld_put_device(sbi->s_dev); 302
284 kfree(sb->s_fs_info); 303 exofs_free_sbi(sbi);
285 sb->s_fs_info = NULL; 304 sb->s_fs_info = NULL;
286} 305}
287 306
307static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
308 struct exofs_device_table *dt)
309{
310 sbi->data_map.odm_num_comps =
311 le32_to_cpu(dt->dt_data_map.cb_num_comps);
312 sbi->data_map.odm_stripe_unit =
313 le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
314 sbi->data_map.odm_group_width =
315 le32_to_cpu(dt->dt_data_map.cb_group_width);
316 sbi->data_map.odm_group_depth =
317 le32_to_cpu(dt->dt_data_map.cb_group_depth);
318 sbi->data_map.odm_mirror_cnt =
319 le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
320 sbi->data_map.odm_raid_algorithm =
321 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
322
323/* FIXME: Hard coded mirror only for now. if not so do not mount */
324 if ((sbi->data_map.odm_num_comps != numdevs) ||
325 (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) ||
326 (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) ||
327 (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
328 return -EINVAL;
329 else
330 return 0;
331}
332
333/* @odi is valid only as long as @dt_dev is valid */
334static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
335 struct osd_dev_info *odi)
336{
337 odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
338 memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
339
340 odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
341 odi->osdname = dt_dev->osdname;
342
343 /* FIXME support long names. Will need a _put function */
344 if (dt_dev->long_name_offset)
345 return -EINVAL;
346
347 /* Make sure osdname is printable!
348 * mkexofs should give us space for a null-terminator else the
349 * device-table is invalid.
350 */
351 if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
352 odi->osdname_len = sizeof(dt_dev->osdname) - 1;
353 dt_dev->osdname[odi->osdname_len] = 0;
354
355 /* If it's all zeros something is bad we read past end-of-obj */
356 return !(odi->systemid_len || odi->osdname_len);
357}
358
359static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
360 unsigned table_count)
361{
362 struct exofs_sb_info *sbi = *psbi;
363 struct osd_dev *fscb_od;
364 struct osd_obj_id obj = {.partition = sbi->s_pid,
365 .id = EXOFS_DEVTABLE_ID};
366 struct exofs_device_table *dt;
367 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
368 sizeof(*dt);
369 unsigned numdevs, i;
370 int ret;
371
372 dt = kmalloc(table_bytes, GFP_KERNEL);
373 if (unlikely(!dt)) {
374 EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
375 table_bytes);
376 return -ENOMEM;
377 }
378
379 fscb_od = sbi->s_ods[0];
380 sbi->s_ods[0] = NULL;
381 sbi->s_numdevs = 0;
382 ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
383 if (unlikely(ret)) {
384 EXOFS_ERR("ERROR: reading device table\n");
385 goto out;
386 }
387
388 numdevs = le64_to_cpu(dt->dt_num_devices);
389 if (unlikely(!numdevs)) {
390 ret = -EINVAL;
391 goto out;
392 }
393 WARN_ON(table_count != numdevs);
394
395 ret = _read_and_match_data_map(sbi, numdevs, dt);
396 if (unlikely(ret))
397 goto out;
398
399 if (likely(numdevs > 1)) {
400 unsigned size = numdevs * sizeof(sbi->s_ods[0]);
401
402 sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
403 if (unlikely(!sbi)) {
404 ret = -ENOMEM;
405 goto out;
406 }
407 memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0]));
408 *psbi = sbi;
409 }
410
411 for (i = 0; i < numdevs; i++) {
412 struct exofs_fscb fscb;
413 struct osd_dev_info odi;
414 struct osd_dev *od;
415
416 if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
417 EXOFS_ERR("ERROR: Read all-zeros device entry\n");
418 ret = -EINVAL;
419 goto out;
420 }
421
422 printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
423 i, odi.osdname);
424
425 /* On all devices the device table is identical. The user can
426 * specify any one of the participating devices on the command
427 * line. We always keep them in device-table order.
428 */
429 if (fscb_od && osduld_device_same(fscb_od, &odi)) {
430 sbi->s_ods[i] = fscb_od;
431 ++sbi->s_numdevs;
432 fscb_od = NULL;
433 continue;
434 }
435
436 od = osduld_info_lookup(&odi);
437 if (unlikely(IS_ERR(od))) {
438 ret = PTR_ERR(od);
439 EXOFS_ERR("ERROR: device requested is not found "
440 "osd_name-%s =>%d\n", odi.osdname, ret);
441 goto out;
442 }
443
444 sbi->s_ods[i] = od;
445 ++sbi->s_numdevs;
446
447 /* Read the fscb of the other devices to make sure the FS
448 * partition is there.
449 */
450 ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb,
451 sizeof(fscb));
452 if (unlikely(ret)) {
453 EXOFS_ERR("ERROR: Malformed participating device "
454 "error reading fscb osd_name-%s\n",
455 odi.osdname);
456 goto out;
457 }
458
459 /* TODO: verify other information is correct and FS-uuid
460 * matches. Benny what did you say about device table
461 * generation and old devices?
462 */
463 }
464
465out:
466 kfree(dt);
467 if (unlikely(!ret && fscb_od)) {
468 EXOFS_ERR(
469 "ERROR: Bad device-table container device not present\n");
470 osduld_put_device(fscb_od);
471 ret = -EINVAL;
472 }
473
474 return ret;
475}
476
288/* 477/*
289 * Read the superblock from the OSD and fill in the fields 478 * Read the superblock from the OSD and fill in the fields
290 */ 479 */
@@ -296,6 +485,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
296 struct osd_dev *od; /* Master device */ 485 struct osd_dev *od; /* Master device */
297 struct exofs_fscb fscb; /*on-disk superblock info */ 486 struct exofs_fscb fscb; /*on-disk superblock info */
298 struct osd_obj_id obj; 487 struct osd_obj_id obj;
488 unsigned table_count;
299 int ret; 489 int ret;
300 490
301 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 491 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
@@ -309,7 +499,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
309 goto free_sbi; 499 goto free_sbi;
310 } 500 }
311 501
312 sbi->s_dev = od; 502 sbi->s_ods[0] = od;
503 sbi->s_numdevs = 1;
313 sbi->s_pid = opts->pid; 504 sbi->s_pid = opts->pid;
314 sbi->s_timeout = opts->timeout; 505 sbi->s_timeout = opts->timeout;
315 506
@@ -342,11 +533,24 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
342 ret = -EINVAL; 533 ret = -EINVAL;
343 goto free_sbi; 534 goto free_sbi;
344 } 535 }
536 if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
537 EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
538 EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
539 ret = -EINVAL;
540 goto free_sbi;
541 }
345 542
346 /* start generation numbers from a random point */ 543 /* start generation numbers from a random point */
347 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 544 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
348 spin_lock_init(&sbi->s_next_gen_lock); 545 spin_lock_init(&sbi->s_next_gen_lock);
349 546
547 table_count = le64_to_cpu(fscb.s_dev_table_count);
548 if (table_count) {
549 ret = exofs_read_lookup_dev_table(&sbi, table_count);
550 if (unlikely(ret))
551 goto free_sbi;
552 }
553
350 /* set up operation vectors */ 554 /* set up operation vectors */
351 sb->s_fs_info = sbi; 555 sb->s_fs_info = sbi;
352 sb->s_op = &exofs_sops; 556 sb->s_op = &exofs_sops;
@@ -374,14 +578,14 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
374 goto free_sbi; 578 goto free_sbi;
375 } 579 }
376 580
377 _exofs_print_device("Mounting", opts->dev_name, sbi->s_dev, sbi->s_pid); 581 _exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0],
582 sbi->s_pid);
378 return 0; 583 return 0;
379 584
380free_sbi: 585free_sbi:
381 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 586 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
382 opts->dev_name, sbi->s_pid, ret); 587 opts->dev_name, sbi->s_pid, ret);
383 osduld_put_device(sbi->s_dev); /* NULL safe */ 588 exofs_free_sbi(sbi);
384 kfree(sbi);
385 return ret; 589 return ret;
386} 590}
387 591