diff options
Diffstat (limited to 'fs/exofs')
-rw-r--r-- | fs/exofs/Kbuild | 2 | ||||
-rw-r--r-- | fs/exofs/common.h | 112 | ||||
-rw-r--r-- | fs/exofs/exofs.h | 140 | ||||
-rw-r--r-- | fs/exofs/inode.c | 547 | ||||
-rw-r--r-- | fs/exofs/ios.c | 823 | ||||
-rw-r--r-- | fs/exofs/osd.c | 125 | ||||
-rw-r--r-- | fs/exofs/pnfs.h | 45 | ||||
-rw-r--r-- | fs/exofs/super.c | 417 |
8 files changed, 1723 insertions, 488 deletions
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index cc2d22db119c..2d0f757fda3e 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild | |||
@@ -12,5 +12,5 @@ | |||
12 | # Kbuild - Gets included from the Kernels Makefile and build system | 12 | # Kbuild - Gets included from the Kernels Makefile and build system |
13 | # | 13 | # |
14 | 14 | ||
15 | exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o | 15 | exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o |
16 | obj-$(CONFIG_EXOFS_FS) += exofs.o | 16 | obj-$(CONFIG_EXOFS_FS) += exofs.o |
diff --git a/fs/exofs/common.h b/fs/exofs/common.h index c6718e4817fe..f0d520312d8b 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h | |||
@@ -49,11 +49,14 @@ | |||
49 | #define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */ | 49 | #define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */ |
50 | #define EXOFS_OBJ_OFF 0x10000 /* offset for objects */ | 50 | #define EXOFS_OBJ_OFF 0x10000 /* offset for objects */ |
51 | #define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */ | 51 | #define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */ |
52 | #define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */ | ||
52 | #define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ | 53 | #define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ |
53 | 54 | ||
54 | /* exofs Application specific page/attribute */ | 55 | /* exofs Application specific page/attribute */ |
55 | # define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) | 56 | # define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) |
56 | # define EXOFS_ATTR_INODE_DATA 1 | 57 | # define EXOFS_ATTR_INODE_DATA 1 |
58 | # define EXOFS_ATTR_INODE_FILE_LAYOUT 2 | ||
59 | # define EXOFS_ATTR_INODE_DIR_LAYOUT 3 | ||
57 | 60 | ||
58 | /* | 61 | /* |
59 | * The maximum number of files we can have is limited by the size of the | 62 | * The maximum number of files we can have is limited by the size of the |
@@ -78,17 +81,67 @@ enum { | |||
78 | #define EXOFS_SUPER_MAGIC 0x5DF5 | 81 | #define EXOFS_SUPER_MAGIC 0x5DF5 |
79 | 82 | ||
80 | /* | 83 | /* |
81 | * The file system control block - stored in an object's data (mainly, the one | 84 | * The file system control block - stored in object EXOFS_SUPER_ID's data. |
82 | * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored | 85 | * This is where the in-memory superblock is stored on disk. |
83 | * on disk. Right now it just has a magic value, which is basically a sanity | ||
84 | * check on our ability to communicate with the object store. | ||
85 | */ | 86 | */ |
87 | enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; | ||
86 | struct exofs_fscb { | 88 | struct exofs_fscb { |
87 | __le64 s_nextid; /* Highest object ID used */ | 89 | __le64 s_nextid; /* Highest object ID used */ |
88 | __le32 s_numfiles; /* Number of files on fs */ | 90 | __le64 s_numfiles; /* Number of files on fs */ |
91 | __le32 s_version; /* == EXOFS_FSCB_VER */ | ||
89 | __le16 s_magic; /* Magic signature */ | 92 | __le16 s_magic; /* Magic signature */ |
90 | __le16 s_newfs; /* Non-zero if this is a new fs */ | 93 | __le16 s_newfs; /* Non-zero if this is a new fs */ |
91 | }; | 94 | |
95 | /* From here on it's a static part, only written by mkexofs */ | ||
96 | __le64 s_dev_table_oid; /* Resurved, not used */ | ||
97 | __le64 s_dev_table_count; /* == 0 means no dev_table */ | ||
98 | } __packed; | ||
99 | |||
100 | /* | ||
101 | * Describes the raid used in the FS. It is part of the device table. | ||
102 | * This here is taken from the pNFS-objects definition. In exofs we | ||
103 | * use one raid policy through-out the filesystem. (NOTE: the funny | ||
104 | * alignment at begining. We take care of it at exofs_device_table. | ||
105 | */ | ||
106 | struct exofs_dt_data_map { | ||
107 | __le32 cb_num_comps; | ||
108 | __le64 cb_stripe_unit; | ||
109 | __le32 cb_group_width; | ||
110 | __le32 cb_group_depth; | ||
111 | __le32 cb_mirror_cnt; | ||
112 | __le32 cb_raid_algorithm; | ||
113 | } __packed; | ||
114 | |||
115 | /* | ||
116 | * This is an osd device information descriptor. It is a single entry in | ||
117 | * the exofs device table. It describes an osd target lun which | ||
118 | * contains data belonging to this FS. (Same partition_id on all devices) | ||
119 | */ | ||
120 | struct exofs_dt_device_info { | ||
121 | __le32 systemid_len; | ||
122 | u8 systemid[OSD_SYSTEMID_LEN]; | ||
123 | __le64 long_name_offset; /* If !0 then offset-in-file */ | ||
124 | __le32 osdname_len; /* */ | ||
125 | u8 osdname[44]; /* Embbeded, Ususally an asci uuid */ | ||
126 | } __packed; | ||
127 | |||
128 | /* | ||
129 | * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data. | ||
130 | * It contains the raid used for this multy-device FS and an array of | ||
131 | * participating devices. | ||
132 | */ | ||
133 | struct exofs_device_table { | ||
134 | __le32 dt_version; /* == EXOFS_DT_VER */ | ||
135 | struct exofs_dt_data_map dt_data_map; /* Raid policy to use */ | ||
136 | |||
137 | /* Resurved space For future use. Total includeing this: | ||
138 | * (8 * sizeof(le64)) | ||
139 | */ | ||
140 | __le64 __Resurved[4]; | ||
141 | |||
142 | __le64 dt_num_devices; /* Array size */ | ||
143 | struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */ | ||
144 | } __packed; | ||
92 | 145 | ||
93 | /**************************************************************************** | 146 | /**************************************************************************** |
94 | * inode-related things | 147 | * inode-related things |
@@ -155,22 +208,41 @@ enum { | |||
155 | (((name_len) + offsetof(struct exofs_dir_entry, name) + \ | 208 | (((name_len) + offsetof(struct exofs_dir_entry, name) + \ |
156 | EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) | 209 | EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) |
157 | 210 | ||
158 | /************************* | 211 | /* |
159 | * function declarations * | 212 | * The on-disk (optional) layout structure. |
160 | *************************/ | 213 | * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT |
161 | /* osd.c */ | 214 | * attribute, attached to any inode, usually to a directory. |
162 | void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], | 215 | */ |
163 | const struct osd_obj_id *obj); | 216 | |
217 | enum exofs_inode_layout_gen_functions { | ||
218 | LAYOUT_MOVING_WINDOW = 0, | ||
219 | LAYOUT_IMPLICT = 1, | ||
220 | }; | ||
164 | 221 | ||
165 | int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid); | 222 | struct exofs_on_disk_inode_layout { |
166 | static inline int exofs_check_ok(struct osd_request *or) | 223 | __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */ |
224 | __le16 pad; | ||
225 | union { | ||
226 | /* gen_func == LAYOUT_MOVING_WINDOW (default) */ | ||
227 | struct exofs_layout_sliding_window { | ||
228 | __le32 num_devices; /* first n devices in global-table*/ | ||
229 | } sliding_window __packed; | ||
230 | |||
231 | /* gen_func == LAYOUT_IMPLICT */ | ||
232 | struct exofs_layout_implict_list { | ||
233 | struct exofs_dt_data_map data_map; | ||
234 | /* Variable array of size data_map.cb_num_comps. These | ||
235 | * are device indexes of the devices in the global table | ||
236 | */ | ||
237 | __le32 dev_indexes[]; | ||
238 | } implict __packed; | ||
239 | }; | ||
240 | } __packed; | ||
241 | |||
242 | static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs) | ||
167 | { | 243 | { |
168 | return exofs_check_ok_resid(or, NULL, NULL); | 244 | return sizeof(struct exofs_on_disk_inode_layout) + |
245 | max_devs * sizeof(__le32); | ||
169 | } | 246 | } |
170 | int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred); | ||
171 | int exofs_async_op(struct osd_request *or, | ||
172 | osd_req_done_fn *async_done, void *caller_context, u8 *cred); | ||
173 | |||
174 | int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr); | ||
175 | 247 | ||
176 | #endif /*ifndef __EXOFS_COM_H__*/ | 248 | #endif /*ifndef __EXOFS_COM_H__*/ |
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 5ec72e020b22..8442e353309f 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h | |||
@@ -30,13 +30,17 @@ | |||
30 | * along with exofs; if not, write to the Free Software | 30 | * along with exofs; if not, write to the Free Software |
31 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | 31 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
32 | */ | 32 | */ |
33 | #ifndef __EXOFS_H__ | ||
34 | #define __EXOFS_H__ | ||
33 | 35 | ||
34 | #include <linux/fs.h> | 36 | #include <linux/fs.h> |
35 | #include <linux/time.h> | 37 | #include <linux/time.h> |
36 | #include "common.h" | 38 | #include "common.h" |
37 | 39 | ||
38 | #ifndef __EXOFS_H__ | 40 | /* FIXME: Remove once pnfs hits mainline |
39 | #define __EXOFS_H__ | 41 | * #include <linux/exportfs/pnfs_osd_xdr.h> |
42 | */ | ||
43 | #include "pnfs.h" | ||
40 | 44 | ||
41 | #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) | 45 | #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) |
42 | 46 | ||
@@ -51,34 +55,110 @@ | |||
51 | /* u64 has problems with printk this will cast it to unsigned long long */ | 55 | /* u64 has problems with printk this will cast it to unsigned long long */ |
52 | #define _LLU(x) (unsigned long long)(x) | 56 | #define _LLU(x) (unsigned long long)(x) |
53 | 57 | ||
58 | struct exofs_layout { | ||
59 | osd_id s_pid; /* partition ID of file system*/ | ||
60 | |||
61 | /* Our way of looking at the data_map */ | ||
62 | unsigned stripe_unit; | ||
63 | unsigned mirrors_p1; | ||
64 | |||
65 | unsigned group_width; | ||
66 | u64 group_depth; | ||
67 | unsigned group_count; | ||
68 | |||
69 | enum exofs_inode_layout_gen_functions lay_func; | ||
70 | |||
71 | unsigned s_numdevs; /* Num of devices in array */ | ||
72 | struct osd_dev *s_ods[0]; /* Variable length */ | ||
73 | }; | ||
74 | |||
54 | /* | 75 | /* |
55 | * our extension to the in-memory superblock | 76 | * our extension to the in-memory superblock |
56 | */ | 77 | */ |
57 | struct exofs_sb_info { | 78 | struct exofs_sb_info { |
58 | struct osd_dev *s_dev; /* returned by get_osd_dev */ | 79 | struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ |
59 | osd_id s_pid; /* partition ID of file system*/ | ||
60 | int s_timeout; /* timeout for OSD operations */ | 80 | int s_timeout; /* timeout for OSD operations */ |
61 | uint64_t s_nextid; /* highest object ID used */ | 81 | uint64_t s_nextid; /* highest object ID used */ |
62 | uint32_t s_numfiles; /* number of files on fs */ | 82 | uint32_t s_numfiles; /* number of files on fs */ |
63 | spinlock_t s_next_gen_lock; /* spinlock for gen # update */ | 83 | spinlock_t s_next_gen_lock; /* spinlock for gen # update */ |
64 | u32 s_next_generation; /* next gen # to use */ | 84 | u32 s_next_generation; /* next gen # to use */ |
65 | atomic_t s_curr_pending; /* number of pending commands */ | 85 | atomic_t s_curr_pending; /* number of pending commands */ |
66 | uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */ | 86 | uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ |
87 | |||
88 | struct pnfs_osd_data_map data_map; /* Default raid to use | ||
89 | * FIXME: Needed ? | ||
90 | */ | ||
91 | /* struct exofs_layout dir_layout;*/ /* Default dir layout */ | ||
92 | struct exofs_layout layout; /* Default files layout, | ||
93 | * contains the variable osd_dev | ||
94 | * array. Keep last */ | ||
95 | struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ | ||
67 | }; | 96 | }; |
68 | 97 | ||
69 | /* | 98 | /* |
70 | * our extension to the in-memory inode | 99 | * our extension to the in-memory inode |
71 | */ | 100 | */ |
72 | struct exofs_i_info { | 101 | struct exofs_i_info { |
102 | struct inode vfs_inode; /* normal in-memory inode */ | ||
103 | wait_queue_head_t i_wq; /* wait queue for inode */ | ||
73 | unsigned long i_flags; /* various atomic flags */ | 104 | unsigned long i_flags; /* various atomic flags */ |
74 | uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ | 105 | uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ |
75 | uint32_t i_dir_start_lookup; /* which page to start lookup */ | 106 | uint32_t i_dir_start_lookup; /* which page to start lookup */ |
76 | wait_queue_head_t i_wq; /* wait queue for inode */ | ||
77 | uint64_t i_commit_size; /* the object's written length */ | 107 | uint64_t i_commit_size; /* the object's written length */ |
78 | uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ | 108 | uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ |
79 | struct inode vfs_inode; /* normal in-memory inode */ | ||
80 | }; | 109 | }; |
81 | 110 | ||
111 | static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) | ||
112 | { | ||
113 | return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; | ||
114 | } | ||
115 | |||
116 | struct exofs_io_state; | ||
117 | typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private); | ||
118 | |||
119 | struct exofs_io_state { | ||
120 | struct kref kref; | ||
121 | |||
122 | void *private; | ||
123 | exofs_io_done_fn done; | ||
124 | |||
125 | struct exofs_layout *layout; | ||
126 | struct osd_obj_id obj; | ||
127 | u8 *cred; | ||
128 | |||
129 | /* Global read/write IO*/ | ||
130 | loff_t offset; | ||
131 | unsigned long length; | ||
132 | void *kern_buff; | ||
133 | |||
134 | struct page **pages; | ||
135 | unsigned nr_pages; | ||
136 | unsigned pgbase; | ||
137 | unsigned pages_consumed; | ||
138 | |||
139 | /* Attributes */ | ||
140 | unsigned in_attr_len; | ||
141 | struct osd_attr *in_attr; | ||
142 | unsigned out_attr_len; | ||
143 | struct osd_attr *out_attr; | ||
144 | |||
145 | /* Variable array of size numdevs */ | ||
146 | unsigned numdevs; | ||
147 | struct exofs_per_dev_state { | ||
148 | struct osd_request *or; | ||
149 | struct bio *bio; | ||
150 | loff_t offset; | ||
151 | unsigned length; | ||
152 | unsigned dev; | ||
153 | } per_dev[]; | ||
154 | }; | ||
155 | |||
156 | static inline unsigned exofs_io_state_size(unsigned numdevs) | ||
157 | { | ||
158 | return sizeof(struct exofs_io_state) + | ||
159 | sizeof(struct exofs_per_dev_state) * numdevs; | ||
160 | } | ||
161 | |||
82 | /* | 162 | /* |
83 | * our inode flags | 163 | * our inode flags |
84 | */ | 164 | */ |
@@ -123,6 +203,12 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode) | |||
123 | } | 203 | } |
124 | 204 | ||
125 | /* | 205 | /* |
206 | * Given a layout, object_number and stripe_index return the associated global | ||
207 | * dev_index | ||
208 | */ | ||
209 | unsigned exofs_layout_od_id(struct exofs_layout *layout, | ||
210 | osd_id obj_no, unsigned layout_index); | ||
211 | /* | ||
126 | * Maximum count of links to a file | 212 | * Maximum count of links to a file |
127 | */ | 213 | */ |
128 | #define EXOFS_LINK_MAX 32000 | 214 | #define EXOFS_LINK_MAX 32000 |
@@ -130,6 +216,43 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode) | |||
130 | /************************* | 216 | /************************* |
131 | * function declarations * | 217 | * function declarations * |
132 | *************************/ | 218 | *************************/ |
219 | |||
220 | /* ios.c */ | ||
221 | void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], | ||
222 | const struct osd_obj_id *obj); | ||
223 | int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, | ||
224 | u64 offset, void *p, unsigned length); | ||
225 | |||
226 | int exofs_get_io_state(struct exofs_layout *layout, | ||
227 | struct exofs_io_state **ios); | ||
228 | void exofs_put_io_state(struct exofs_io_state *ios); | ||
229 | |||
230 | int exofs_check_io(struct exofs_io_state *ios, u64 *resid); | ||
231 | |||
232 | int exofs_sbi_create(struct exofs_io_state *ios); | ||
233 | int exofs_sbi_remove(struct exofs_io_state *ios); | ||
234 | int exofs_sbi_write(struct exofs_io_state *ios); | ||
235 | int exofs_sbi_read(struct exofs_io_state *ios); | ||
236 | |||
237 | int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr); | ||
238 | |||
239 | int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len); | ||
240 | static inline int exofs_oi_write(struct exofs_i_info *oi, | ||
241 | struct exofs_io_state *ios) | ||
242 | { | ||
243 | ios->obj.id = exofs_oi_objno(oi); | ||
244 | ios->cred = oi->i_cred; | ||
245 | return exofs_sbi_write(ios); | ||
246 | } | ||
247 | |||
248 | static inline int exofs_oi_read(struct exofs_i_info *oi, | ||
249 | struct exofs_io_state *ios) | ||
250 | { | ||
251 | ios->obj.id = exofs_oi_objno(oi); | ||
252 | ios->cred = oi->i_cred; | ||
253 | return exofs_sbi_read(ios); | ||
254 | } | ||
255 | |||
133 | /* inode.c */ | 256 | /* inode.c */ |
134 | void exofs_truncate(struct inode *inode); | 257 | void exofs_truncate(struct inode *inode); |
135 | int exofs_setattr(struct dentry *, struct iattr *); | 258 | int exofs_setattr(struct dentry *, struct iattr *); |
@@ -138,7 +261,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, | |||
138 | struct page **pagep, void **fsdata); | 261 | struct page **pagep, void **fsdata); |
139 | extern struct inode *exofs_iget(struct super_block *, unsigned long); | 262 | extern struct inode *exofs_iget(struct super_block *, unsigned long); |
140 | struct inode *exofs_new_inode(struct inode *, int); | 263 | struct inode *exofs_new_inode(struct inode *, int); |
141 | extern int exofs_write_inode(struct inode *, int); | 264 | extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); |
142 | extern void exofs_delete_inode(struct inode *); | 265 | extern void exofs_delete_inode(struct inode *); |
143 | 266 | ||
144 | /* dir.c: */ | 267 | /* dir.c: */ |
@@ -169,6 +292,7 @@ extern const struct file_operations exofs_file_operations; | |||
169 | 292 | ||
170 | /* inode.c */ | 293 | /* inode.c */ |
171 | extern const struct address_space_operations exofs_aops; | 294 | extern const struct address_space_operations exofs_aops; |
295 | extern const struct osd_attr g_attr_logical_length; | ||
172 | 296 | ||
173 | /* namei.c */ | 297 | /* namei.c */ |
174 | extern const struct inode_operations exofs_dir_inode_operations; | 298 | extern const struct inode_operations exofs_dir_inode_operations; |
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 6c10f7476699..76d2a79ef93e 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c | |||
@@ -31,94 +31,117 @@ | |||
31 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | 31 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
32 | */ | 32 | */ |
33 | 33 | ||
34 | #include <linux/slab.h> | ||
34 | #include <linux/writeback.h> | 35 | #include <linux/writeback.h> |
35 | #include <linux/buffer_head.h> | 36 | #include <linux/buffer_head.h> |
36 | #include <scsi/scsi_device.h> | 37 | #include <scsi/scsi_device.h> |
37 | 38 | ||
38 | #include "exofs.h" | 39 | #include "exofs.h" |
39 | 40 | ||
40 | #ifdef CONFIG_EXOFS_DEBUG | 41 | #define EXOFS_DBGMSG2(M...) do {} while (0) |
41 | # define EXOFS_DEBUG_OBJ_ISIZE 1 | 42 | |
42 | #endif | 43 | enum { BIO_MAX_PAGES_KMALLOC = |
44 | (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), | ||
45 | MAX_PAGES_KMALLOC = | ||
46 | PAGE_SIZE / sizeof(struct page *), | ||
47 | }; | ||
43 | 48 | ||
44 | struct page_collect { | 49 | struct page_collect { |
45 | struct exofs_sb_info *sbi; | 50 | struct exofs_sb_info *sbi; |
46 | struct request_queue *req_q; | ||
47 | struct inode *inode; | 51 | struct inode *inode; |
48 | unsigned expected_pages; | 52 | unsigned expected_pages; |
53 | struct exofs_io_state *ios; | ||
49 | 54 | ||
50 | struct bio *bio; | 55 | struct page **pages; |
56 | unsigned alloc_pages; | ||
51 | unsigned nr_pages; | 57 | unsigned nr_pages; |
52 | unsigned long length; | 58 | unsigned long length; |
53 | loff_t pg_first; /* keep 64bit also in 32-arches */ | 59 | loff_t pg_first; /* keep 64bit also in 32-arches */ |
54 | }; | 60 | }; |
55 | 61 | ||
56 | static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, | 62 | static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, |
57 | struct inode *inode) | 63 | struct inode *inode) |
58 | { | 64 | { |
59 | struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; | 65 | struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; |
60 | 66 | ||
61 | pcol->sbi = sbi; | 67 | pcol->sbi = sbi; |
62 | pcol->req_q = osd_request_queue(sbi->s_dev); | ||
63 | pcol->inode = inode; | 68 | pcol->inode = inode; |
64 | pcol->expected_pages = expected_pages; | 69 | pcol->expected_pages = expected_pages; |
65 | 70 | ||
66 | pcol->bio = NULL; | 71 | pcol->ios = NULL; |
72 | pcol->pages = NULL; | ||
73 | pcol->alloc_pages = 0; | ||
67 | pcol->nr_pages = 0; | 74 | pcol->nr_pages = 0; |
68 | pcol->length = 0; | 75 | pcol->length = 0; |
69 | pcol->pg_first = -1; | 76 | pcol->pg_first = -1; |
70 | |||
71 | EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino, | ||
72 | expected_pages); | ||
73 | } | 77 | } |
74 | 78 | ||
75 | static void _pcol_reset(struct page_collect *pcol) | 79 | static void _pcol_reset(struct page_collect *pcol) |
76 | { | 80 | { |
77 | pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); | 81 | pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); |
78 | 82 | ||
79 | pcol->bio = NULL; | 83 | pcol->pages = NULL; |
84 | pcol->alloc_pages = 0; | ||
80 | pcol->nr_pages = 0; | 85 | pcol->nr_pages = 0; |
81 | pcol->length = 0; | 86 | pcol->length = 0; |
82 | pcol->pg_first = -1; | 87 | pcol->pg_first = -1; |
83 | EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n", | 88 | pcol->ios = NULL; |
84 | pcol->inode->i_ino, pcol->expected_pages); | ||
85 | 89 | ||
86 | /* this is probably the end of the loop but in writes | 90 | /* this is probably the end of the loop but in writes |
87 | * it might not end here. don't be left with nothing | 91 | * it might not end here. don't be left with nothing |
88 | */ | 92 | */ |
89 | if (!pcol->expected_pages) | 93 | if (!pcol->expected_pages) |
90 | pcol->expected_pages = 128; | 94 | pcol->expected_pages = MAX_PAGES_KMALLOC; |
91 | } | 95 | } |
92 | 96 | ||
93 | static int pcol_try_alloc(struct page_collect *pcol) | 97 | static int pcol_try_alloc(struct page_collect *pcol) |
94 | { | 98 | { |
95 | int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES); | 99 | unsigned pages = min_t(unsigned, pcol->expected_pages, |
100 | MAX_PAGES_KMALLOC); | ||
101 | |||
102 | if (!pcol->ios) { /* First time allocate io_state */ | ||
103 | int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); | ||
104 | |||
105 | if (ret) | ||
106 | return ret; | ||
107 | } | ||
108 | |||
109 | /* TODO: easily support bio chaining */ | ||
110 | pages = min_t(unsigned, pages, | ||
111 | pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC); | ||
96 | 112 | ||
97 | for (; pages; pages >>= 1) { | 113 | for (; pages; pages >>= 1) { |
98 | pcol->bio = bio_alloc(GFP_KERNEL, pages); | 114 | pcol->pages = kmalloc(pages * sizeof(struct page *), |
99 | if (likely(pcol->bio)) | 115 | GFP_KERNEL); |
116 | if (likely(pcol->pages)) { | ||
117 | pcol->alloc_pages = pages; | ||
100 | return 0; | 118 | return 0; |
119 | } | ||
101 | } | 120 | } |
102 | 121 | ||
103 | EXOFS_ERR("Failed to kcalloc expected_pages=%u\n", | 122 | EXOFS_ERR("Failed to kmalloc expected_pages=%u\n", |
104 | pcol->expected_pages); | 123 | pcol->expected_pages); |
105 | return -ENOMEM; | 124 | return -ENOMEM; |
106 | } | 125 | } |
107 | 126 | ||
108 | static void pcol_free(struct page_collect *pcol) | 127 | static void pcol_free(struct page_collect *pcol) |
109 | { | 128 | { |
110 | bio_put(pcol->bio); | 129 | kfree(pcol->pages); |
111 | pcol->bio = NULL; | 130 | pcol->pages = NULL; |
131 | |||
132 | if (pcol->ios) { | ||
133 | exofs_put_io_state(pcol->ios); | ||
134 | pcol->ios = NULL; | ||
135 | } | ||
112 | } | 136 | } |
113 | 137 | ||
114 | static int pcol_add_page(struct page_collect *pcol, struct page *page, | 138 | static int pcol_add_page(struct page_collect *pcol, struct page *page, |
115 | unsigned len) | 139 | unsigned len) |
116 | { | 140 | { |
117 | int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0); | 141 | if (unlikely(pcol->nr_pages >= pcol->alloc_pages)) |
118 | if (unlikely(len != added_len)) | ||
119 | return -ENOMEM; | 142 | return -ENOMEM; |
120 | 143 | ||
121 | ++pcol->nr_pages; | 144 | pcol->pages[pcol->nr_pages++] = page; |
122 | pcol->length += len; | 145 | pcol->length += len; |
123 | return 0; | 146 | return 0; |
124 | } | 147 | } |
@@ -161,32 +184,26 @@ static void update_write_page(struct page *page, int ret) | |||
161 | /* Called at the end of reads, to optionally unlock pages and update their | 184 | /* Called at the end of reads, to optionally unlock pages and update their |
162 | * status. | 185 | * status. |
163 | */ | 186 | */ |
164 | static int __readpages_done(struct osd_request *or, struct page_collect *pcol, | 187 | static int __readpages_done(struct page_collect *pcol, bool do_unlock) |
165 | bool do_unlock) | ||
166 | { | 188 | { |
167 | struct bio_vec *bvec; | ||
168 | int i; | 189 | int i; |
169 | u64 resid; | 190 | u64 resid; |
170 | u64 good_bytes; | 191 | u64 good_bytes; |
171 | u64 length = 0; | 192 | u64 length = 0; |
172 | int ret = exofs_check_ok_resid(or, &resid, NULL); | 193 | int ret = exofs_check_io(pcol->ios, &resid); |
173 | |||
174 | osd_end_request(or); | ||
175 | 194 | ||
176 | if (likely(!ret)) | 195 | if (likely(!ret)) |
177 | good_bytes = pcol->length; | 196 | good_bytes = pcol->length; |
178 | else if (!resid) | ||
179 | good_bytes = 0; | ||
180 | else | 197 | else |
181 | good_bytes = pcol->length - resid; | 198 | good_bytes = pcol->length - resid; |
182 | 199 | ||
183 | EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx" | 200 | EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" |
184 | " length=0x%lx nr_pages=%u\n", | 201 | " length=0x%lx nr_pages=%u\n", |
185 | pcol->inode->i_ino, _LLU(good_bytes), pcol->length, | 202 | pcol->inode->i_ino, _LLU(good_bytes), pcol->length, |
186 | pcol->nr_pages); | 203 | pcol->nr_pages); |
187 | 204 | ||
188 | __bio_for_each_segment(bvec, pcol->bio, i, 0) { | 205 | for (i = 0; i < pcol->nr_pages; i++) { |
189 | struct page *page = bvec->bv_page; | 206 | struct page *page = pcol->pages[i]; |
190 | struct inode *inode = page->mapping->host; | 207 | struct inode *inode = page->mapping->host; |
191 | int page_stat; | 208 | int page_stat; |
192 | 209 | ||
@@ -198,38 +215,37 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol, | |||
198 | else | 215 | else |
199 | page_stat = ret; | 216 | page_stat = ret; |
200 | 217 | ||
201 | EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n", | 218 | EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n", |
202 | inode->i_ino, page->index, | 219 | inode->i_ino, page->index, |
203 | page_stat ? "bad_bytes" : "good_bytes"); | 220 | page_stat ? "bad_bytes" : "good_bytes"); |
204 | 221 | ||
205 | ret = update_read_page(page, page_stat); | 222 | ret = update_read_page(page, page_stat); |
206 | if (do_unlock) | 223 | if (do_unlock) |
207 | unlock_page(page); | 224 | unlock_page(page); |
208 | length += bvec->bv_len; | 225 | length += PAGE_SIZE; |
209 | } | 226 | } |
210 | 227 | ||
211 | pcol_free(pcol); | 228 | pcol_free(pcol); |
212 | EXOFS_DBGMSG("readpages_done END\n"); | 229 | EXOFS_DBGMSG2("readpages_done END\n"); |
213 | return ret; | 230 | return ret; |
214 | } | 231 | } |
215 | 232 | ||
216 | /* callback of async reads */ | 233 | /* callback of async reads */ |
217 | static void readpages_done(struct osd_request *or, void *p) | 234 | static void readpages_done(struct exofs_io_state *ios, void *p) |
218 | { | 235 | { |
219 | struct page_collect *pcol = p; | 236 | struct page_collect *pcol = p; |
220 | 237 | ||
221 | __readpages_done(or, pcol, true); | 238 | __readpages_done(pcol, true); |
222 | atomic_dec(&pcol->sbi->s_curr_pending); | 239 | atomic_dec(&pcol->sbi->s_curr_pending); |
223 | kfree(p); | 240 | kfree(pcol); |
224 | } | 241 | } |
225 | 242 | ||
226 | static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) | 243 | static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) |
227 | { | 244 | { |
228 | struct bio_vec *bvec; | ||
229 | int i; | 245 | int i; |
230 | 246 | ||
231 | __bio_for_each_segment(bvec, pcol->bio, i, 0) { | 247 | for (i = 0; i < pcol->nr_pages; i++) { |
232 | struct page *page = bvec->bv_page; | 248 | struct page *page = pcol->pages[i]; |
233 | 249 | ||
234 | if (rw == READ) | 250 | if (rw == READ) |
235 | update_read_page(page, ret); | 251 | update_read_page(page, ret); |
@@ -238,36 +254,29 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) | |||
238 | 254 | ||
239 | unlock_page(page); | 255 | unlock_page(page); |
240 | } | 256 | } |
241 | pcol_free(pcol); | ||
242 | } | 257 | } |
243 | 258 | ||
244 | static int read_exec(struct page_collect *pcol, bool is_sync) | 259 | static int read_exec(struct page_collect *pcol, bool is_sync) |
245 | { | 260 | { |
246 | struct exofs_i_info *oi = exofs_i(pcol->inode); | 261 | struct exofs_i_info *oi = exofs_i(pcol->inode); |
247 | struct osd_obj_id obj = {pcol->sbi->s_pid, | 262 | struct exofs_io_state *ios = pcol->ios; |
248 | pcol->inode->i_ino + EXOFS_OBJ_OFF}; | ||
249 | struct osd_request *or = NULL; | ||
250 | struct page_collect *pcol_copy = NULL; | 263 | struct page_collect *pcol_copy = NULL; |
251 | loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT; | ||
252 | int ret; | 264 | int ret; |
253 | 265 | ||
254 | if (!pcol->bio) | 266 | if (!pcol->pages) |
255 | return 0; | 267 | return 0; |
256 | 268 | ||
257 | /* see comment in _readpage() about sync reads */ | 269 | /* see comment in _readpage() about sync reads */ |
258 | WARN_ON(is_sync && (pcol->nr_pages != 1)); | 270 | WARN_ON(is_sync && (pcol->nr_pages != 1)); |
259 | 271 | ||
260 | or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL); | 272 | ios->pages = pcol->pages; |
261 | if (unlikely(!or)) { | 273 | ios->nr_pages = pcol->nr_pages; |
262 | ret = -ENOMEM; | 274 | ios->length = pcol->length; |
263 | goto err; | 275 | ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; |
264 | } | ||
265 | |||
266 | osd_req_read(or, &obj, i_start, pcol->bio, pcol->length); | ||
267 | 276 | ||
268 | if (is_sync) { | 277 | if (is_sync) { |
269 | exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred); | 278 | exofs_oi_read(oi, pcol->ios); |
270 | return __readpages_done(or, pcol, false); | 279 | return __readpages_done(pcol, false); |
271 | } | 280 | } |
272 | 281 | ||
273 | pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); | 282 | pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); |
@@ -277,14 +286,16 @@ static int read_exec(struct page_collect *pcol, bool is_sync) | |||
277 | } | 286 | } |
278 | 287 | ||
279 | *pcol_copy = *pcol; | 288 | *pcol_copy = *pcol; |
280 | ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred); | 289 | ios->done = readpages_done; |
290 | ios->private = pcol_copy; | ||
291 | ret = exofs_oi_read(oi, ios); | ||
281 | if (unlikely(ret)) | 292 | if (unlikely(ret)) |
282 | goto err; | 293 | goto err; |
283 | 294 | ||
284 | atomic_inc(&pcol->sbi->s_curr_pending); | 295 | atomic_inc(&pcol->sbi->s_curr_pending); |
285 | 296 | ||
286 | EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", | 297 | EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", |
287 | obj.id, _LLU(i_start), pcol->length); | 298 | ios->obj.id, _LLU(ios->offset), pcol->length); |
288 | 299 | ||
289 | /* pages ownership was passed to pcol_copy */ | 300 | /* pages ownership was passed to pcol_copy */ |
290 | _pcol_reset(pcol); | 301 | _pcol_reset(pcol); |
@@ -293,12 +304,10 @@ static int read_exec(struct page_collect *pcol, bool is_sync) | |||
293 | err: | 304 | err: |
294 | if (!is_sync) | 305 | if (!is_sync) |
295 | _unlock_pcol_pages(pcol, ret, READ); | 306 | _unlock_pcol_pages(pcol, ret, READ); |
296 | else /* Pages unlocked by caller in sync mode only free bio */ | 307 | |
297 | pcol_free(pcol); | 308 | pcol_free(pcol); |
298 | 309 | ||
299 | kfree(pcol_copy); | 310 | kfree(pcol_copy); |
300 | if (or) | ||
301 | osd_end_request(or); | ||
302 | return ret; | 311 | return ret; |
303 | } | 312 | } |
304 | 313 | ||
@@ -361,7 +370,7 @@ try_again: | |||
361 | goto try_again; | 370 | goto try_again; |
362 | } | 371 | } |
363 | 372 | ||
364 | if (!pcol->bio) { | 373 | if (!pcol->pages) { |
365 | ret = pcol_try_alloc(pcol); | 374 | ret = pcol_try_alloc(pcol); |
366 | if (unlikely(ret)) | 375 | if (unlikely(ret)) |
367 | goto fail; | 376 | goto fail; |
@@ -370,12 +379,12 @@ try_again: | |||
370 | if (len != PAGE_CACHE_SIZE) | 379 | if (len != PAGE_CACHE_SIZE) |
371 | zero_user(page, len, PAGE_CACHE_SIZE - len); | 380 | zero_user(page, len, PAGE_CACHE_SIZE - len); |
372 | 381 | ||
373 | EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", | 382 | EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", |
374 | inode->i_ino, page->index, len); | 383 | inode->i_ino, page->index, len); |
375 | 384 | ||
376 | ret = pcol_add_page(pcol, page, len); | 385 | ret = pcol_add_page(pcol, page, len); |
377 | if (ret) { | 386 | if (ret) { |
378 | EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p " | 387 | EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p " |
379 | "this_len=0x%zx nr_pages=%u length=0x%lx\n", | 388 | "this_len=0x%zx nr_pages=%u length=0x%lx\n", |
380 | page, len, pcol->nr_pages, pcol->length); | 389 | page, len, pcol->nr_pages, pcol->length); |
381 | 390 | ||
@@ -419,9 +428,8 @@ static int _readpage(struct page *page, bool is_sync) | |||
419 | 428 | ||
420 | _pcol_init(&pcol, 1, page->mapping->host); | 429 | _pcol_init(&pcol, 1, page->mapping->host); |
421 | 430 | ||
422 | /* readpage_strip might call read_exec(,async) inside at several places | 431 | /* readpage_strip might call read_exec(,is_sync==false) at several |
423 | * but this is safe for is_async=0 since read_exec will not do anything | 432 | * places but not if we have a single page. |
424 | * when we have a single page. | ||
425 | */ | 433 | */ |
426 | ret = readpage_strip(&pcol, page); | 434 | ret = readpage_strip(&pcol, page); |
427 | if (ret) { | 435 | if (ret) { |
@@ -440,35 +448,30 @@ static int exofs_readpage(struct file *file, struct page *page) | |||
440 | return _readpage(page, false); | 448 | return _readpage(page, false); |
441 | } | 449 | } |
442 | 450 | ||
443 | /* Callback for osd_write. All writes are asynchronouse */ | 451 | /* Callback for osd_write. All writes are asynchronous */ |
444 | static void writepages_done(struct osd_request *or, void *p) | 452 | static void writepages_done(struct exofs_io_state *ios, void *p) |
445 | { | 453 | { |
446 | struct page_collect *pcol = p; | 454 | struct page_collect *pcol = p; |
447 | struct bio_vec *bvec; | ||
448 | int i; | 455 | int i; |
449 | u64 resid; | 456 | u64 resid; |
450 | u64 good_bytes; | 457 | u64 good_bytes; |
451 | u64 length = 0; | 458 | u64 length = 0; |
459 | int ret = exofs_check_io(ios, &resid); | ||
452 | 460 | ||
453 | int ret = exofs_check_ok_resid(or, NULL, &resid); | ||
454 | |||
455 | osd_end_request(or); | ||
456 | atomic_dec(&pcol->sbi->s_curr_pending); | 461 | atomic_dec(&pcol->sbi->s_curr_pending); |
457 | 462 | ||
458 | if (likely(!ret)) | 463 | if (likely(!ret)) |
459 | good_bytes = pcol->length; | 464 | good_bytes = pcol->length; |
460 | else if (!resid) | ||
461 | good_bytes = 0; | ||
462 | else | 465 | else |
463 | good_bytes = pcol->length - resid; | 466 | good_bytes = pcol->length - resid; |
464 | 467 | ||
465 | EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx" | 468 | EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" |
466 | " length=0x%lx nr_pages=%u\n", | 469 | " length=0x%lx nr_pages=%u\n", |
467 | pcol->inode->i_ino, _LLU(good_bytes), pcol->length, | 470 | pcol->inode->i_ino, _LLU(good_bytes), pcol->length, |
468 | pcol->nr_pages); | 471 | pcol->nr_pages); |
469 | 472 | ||
470 | __bio_for_each_segment(bvec, pcol->bio, i, 0) { | 473 | for (i = 0; i < pcol->nr_pages; i++) { |
471 | struct page *page = bvec->bv_page; | 474 | struct page *page = pcol->pages[i]; |
472 | struct inode *inode = page->mapping->host; | 475 | struct inode *inode = page->mapping->host; |
473 | int page_stat; | 476 | int page_stat; |
474 | 477 | ||
@@ -482,37 +485,27 @@ static void writepages_done(struct osd_request *or, void *p) | |||
482 | 485 | ||
483 | update_write_page(page, page_stat); | 486 | update_write_page(page, page_stat); |
484 | unlock_page(page); | 487 | unlock_page(page); |
485 | EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n", | 488 | EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", |
486 | inode->i_ino, page->index, page_stat); | 489 | inode->i_ino, page->index, page_stat); |
487 | 490 | ||
488 | length += bvec->bv_len; | 491 | length += PAGE_SIZE; |
489 | } | 492 | } |
490 | 493 | ||
491 | pcol_free(pcol); | 494 | pcol_free(pcol); |
492 | kfree(pcol); | 495 | kfree(pcol); |
493 | EXOFS_DBGMSG("writepages_done END\n"); | 496 | EXOFS_DBGMSG2("writepages_done END\n"); |
494 | } | 497 | } |
495 | 498 | ||
496 | static int write_exec(struct page_collect *pcol) | 499 | static int write_exec(struct page_collect *pcol) |
497 | { | 500 | { |
498 | struct exofs_i_info *oi = exofs_i(pcol->inode); | 501 | struct exofs_i_info *oi = exofs_i(pcol->inode); |
499 | struct osd_obj_id obj = {pcol->sbi->s_pid, | 502 | struct exofs_io_state *ios = pcol->ios; |
500 | pcol->inode->i_ino + EXOFS_OBJ_OFF}; | ||
501 | struct osd_request *or = NULL; | ||
502 | struct page_collect *pcol_copy = NULL; | 503 | struct page_collect *pcol_copy = NULL; |
503 | loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT; | ||
504 | int ret; | 504 | int ret; |
505 | 505 | ||
506 | if (!pcol->bio) | 506 | if (!pcol->pages) |
507 | return 0; | 507 | return 0; |
508 | 508 | ||
509 | or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL); | ||
510 | if (unlikely(!or)) { | ||
511 | EXOFS_ERR("write_exec: Faild to osd_start_request()\n"); | ||
512 | ret = -ENOMEM; | ||
513 | goto err; | ||
514 | } | ||
515 | |||
516 | pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); | 509 | pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); |
517 | if (!pcol_copy) { | 510 | if (!pcol_copy) { |
518 | EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n"); | 511 | EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n"); |
@@ -522,17 +515,22 @@ static int write_exec(struct page_collect *pcol) | |||
522 | 515 | ||
523 | *pcol_copy = *pcol; | 516 | *pcol_copy = *pcol; |
524 | 517 | ||
525 | pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */ | 518 | ios->pages = pcol_copy->pages; |
526 | osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length); | 519 | ios->nr_pages = pcol_copy->nr_pages; |
527 | ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred); | 520 | ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; |
521 | ios->length = pcol_copy->length; | ||
522 | ios->done = writepages_done; | ||
523 | ios->private = pcol_copy; | ||
524 | |||
525 | ret = exofs_oi_write(oi, ios); | ||
528 | if (unlikely(ret)) { | 526 | if (unlikely(ret)) { |
529 | EXOFS_ERR("write_exec: exofs_async_op() Faild\n"); | 527 | EXOFS_ERR("write_exec: exofs_oi_write() Faild\n"); |
530 | goto err; | 528 | goto err; |
531 | } | 529 | } |
532 | 530 | ||
533 | atomic_inc(&pcol->sbi->s_curr_pending); | 531 | atomic_inc(&pcol->sbi->s_curr_pending); |
534 | EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", | 532 | EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", |
535 | pcol->inode->i_ino, pcol->pg_first, _LLU(i_start), | 533 | pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), |
536 | pcol->length); | 534 | pcol->length); |
537 | /* pages ownership was passed to pcol_copy */ | 535 | /* pages ownership was passed to pcol_copy */ |
538 | _pcol_reset(pcol); | 536 | _pcol_reset(pcol); |
@@ -540,9 +538,9 @@ static int write_exec(struct page_collect *pcol) | |||
540 | 538 | ||
541 | err: | 539 | err: |
542 | _unlock_pcol_pages(pcol, ret, WRITE); | 540 | _unlock_pcol_pages(pcol, ret, WRITE); |
541 | pcol_free(pcol); | ||
543 | kfree(pcol_copy); | 542 | kfree(pcol_copy); |
544 | if (or) | 543 | |
545 | osd_end_request(or); | ||
546 | return ret; | 544 | return ret; |
547 | } | 545 | } |
548 | 546 | ||
@@ -586,6 +584,9 @@ static int writepage_strip(struct page *page, | |||
586 | if (PageError(page)) | 584 | if (PageError(page)) |
587 | ClearPageError(page); | 585 | ClearPageError(page); |
588 | unlock_page(page); | 586 | unlock_page(page); |
587 | EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) " | ||
588 | "outside the limits\n", | ||
589 | inode->i_ino, page->index); | ||
589 | return 0; | 590 | return 0; |
590 | } | 591 | } |
591 | } | 592 | } |
@@ -600,21 +601,24 @@ try_again: | |||
600 | ret = write_exec(pcol); | 601 | ret = write_exec(pcol); |
601 | if (unlikely(ret)) | 602 | if (unlikely(ret)) |
602 | goto fail; | 603 | goto fail; |
604 | |||
605 | EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n", | ||
606 | inode->i_ino, page->index); | ||
603 | goto try_again; | 607 | goto try_again; |
604 | } | 608 | } |
605 | 609 | ||
606 | if (!pcol->bio) { | 610 | if (!pcol->pages) { |
607 | ret = pcol_try_alloc(pcol); | 611 | ret = pcol_try_alloc(pcol); |
608 | if (unlikely(ret)) | 612 | if (unlikely(ret)) |
609 | goto fail; | 613 | goto fail; |
610 | } | 614 | } |
611 | 615 | ||
612 | EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", | 616 | EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", |
613 | inode->i_ino, page->index, len); | 617 | inode->i_ino, page->index, len); |
614 | 618 | ||
615 | ret = pcol_add_page(pcol, page, len); | 619 | ret = pcol_add_page(pcol, page, len); |
616 | if (unlikely(ret)) { | 620 | if (unlikely(ret)) { |
617 | EXOFS_DBGMSG("Failed pcol_add_page " | 621 | EXOFS_DBGMSG2("Failed pcol_add_page " |
618 | "nr_pages=%u total_length=0x%lx\n", | 622 | "nr_pages=%u total_length=0x%lx\n", |
619 | pcol->nr_pages, pcol->length); | 623 | pcol->nr_pages, pcol->length); |
620 | 624 | ||
@@ -634,6 +638,8 @@ try_again: | |||
634 | return 0; | 638 | return 0; |
635 | 639 | ||
636 | fail: | 640 | fail: |
641 | EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n", | ||
642 | inode->i_ino, page->index, ret); | ||
637 | set_bit(AS_EIO, &page->mapping->flags); | 643 | set_bit(AS_EIO, &page->mapping->flags); |
638 | unlock_page(page); | 644 | unlock_page(page); |
639 | return ret; | 645 | return ret; |
@@ -652,14 +658,17 @@ static int exofs_writepages(struct address_space *mapping, | |||
652 | wbc->range_end >> PAGE_CACHE_SHIFT; | 658 | wbc->range_end >> PAGE_CACHE_SHIFT; |
653 | 659 | ||
654 | if (start || end) | 660 | if (start || end) |
655 | expected_pages = min(end - start + 1, 32L); | 661 | expected_pages = end - start + 1; |
656 | else | 662 | else |
657 | expected_pages = mapping->nrpages; | 663 | expected_pages = mapping->nrpages; |
658 | 664 | ||
659 | EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx" | 665 | if (expected_pages < 32L) |
660 | " m->nrpages=%lu start=0x%lx end=0x%lx\n", | 666 | expected_pages = 32L; |
667 | |||
668 | EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " | ||
669 | "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", | ||
661 | mapping->host->i_ino, wbc->range_start, wbc->range_end, | 670 | mapping->host->i_ino, wbc->range_start, wbc->range_end, |
662 | mapping->nrpages, start, end); | 671 | mapping->nrpages, start, end, expected_pages); |
663 | 672 | ||
664 | _pcol_init(&pcol, expected_pages, mapping->host); | 673 | _pcol_init(&pcol, expected_pages, mapping->host); |
665 | 674 | ||
@@ -731,13 +740,28 @@ static int exofs_write_begin_export(struct file *file, | |||
731 | fsdata); | 740 | fsdata); |
732 | } | 741 | } |
733 | 742 | ||
743 | static int exofs_write_end(struct file *file, struct address_space *mapping, | ||
744 | loff_t pos, unsigned len, unsigned copied, | ||
745 | struct page *page, void *fsdata) | ||
746 | { | ||
747 | struct inode *inode = mapping->host; | ||
748 | /* According to comment in simple_write_end i_mutex is held */ | ||
749 | loff_t i_size = inode->i_size; | ||
750 | int ret; | ||
751 | |||
752 | ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); | ||
753 | if (i_size != inode->i_size) | ||
754 | mark_inode_dirty(inode); | ||
755 | return ret; | ||
756 | } | ||
757 | |||
734 | const struct address_space_operations exofs_aops = { | 758 | const struct address_space_operations exofs_aops = { |
735 | .readpage = exofs_readpage, | 759 | .readpage = exofs_readpage, |
736 | .readpages = exofs_readpages, | 760 | .readpages = exofs_readpages, |
737 | .writepage = exofs_writepage, | 761 | .writepage = exofs_writepage, |
738 | .writepages = exofs_writepages, | 762 | .writepages = exofs_writepages, |
739 | .write_begin = exofs_write_begin_export, | 763 | .write_begin = exofs_write_begin_export, |
740 | .write_end = simple_write_end, | 764 | .write_end = exofs_write_end, |
741 | }; | 765 | }; |
742 | 766 | ||
743 | /****************************************************************************** | 767 | /****************************************************************************** |
@@ -771,19 +795,28 @@ static int exofs_get_block(struct inode *inode, sector_t iblock, | |||
771 | const struct osd_attr g_attr_logical_length = ATTR_DEF( | 795 | const struct osd_attr g_attr_logical_length = ATTR_DEF( |
772 | OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); | 796 | OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); |
773 | 797 | ||
798 | static int _do_truncate(struct inode *inode) | ||
799 | { | ||
800 | struct exofs_i_info *oi = exofs_i(inode); | ||
801 | loff_t isize = i_size_read(inode); | ||
802 | int ret; | ||
803 | |||
804 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
805 | |||
806 | nobh_truncate_page(inode->i_mapping, isize, exofs_get_block); | ||
807 | |||
808 | ret = exofs_oi_truncate(oi, (u64)isize); | ||
809 | EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize); | ||
810 | return ret; | ||
811 | } | ||
812 | |||
774 | /* | 813 | /* |
775 | * Truncate a file to the specified size - all we have to do is set the size | 814 | * Truncate a file to the specified size - all we have to do is set the size |
776 | * attribute. We make sure the object exists first. | 815 | * attribute. We make sure the object exists first. |
777 | */ | 816 | */ |
778 | void exofs_truncate(struct inode *inode) | 817 | void exofs_truncate(struct inode *inode) |
779 | { | 818 | { |
780 | struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; | ||
781 | struct exofs_i_info *oi = exofs_i(inode); | 819 | struct exofs_i_info *oi = exofs_i(inode); |
782 | struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; | ||
783 | struct osd_request *or; | ||
784 | struct osd_attr attr; | ||
785 | loff_t isize = i_size_read(inode); | ||
786 | __be64 newsize; | ||
787 | int ret; | 820 | int ret; |
788 | 821 | ||
789 | if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) | 822 | if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) |
@@ -793,22 +826,6 @@ void exofs_truncate(struct inode *inode) | |||
793 | return; | 826 | return; |
794 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | 827 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) |
795 | return; | 828 | return; |
796 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
797 | |||
798 | nobh_truncate_page(inode->i_mapping, isize, exofs_get_block); | ||
799 | |||
800 | or = osd_start_request(sbi->s_dev, GFP_KERNEL); | ||
801 | if (unlikely(!or)) { | ||
802 | EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n"); | ||
803 | goto fail; | ||
804 | } | ||
805 | |||
806 | osd_req_set_attributes(or, &obj); | ||
807 | |||
808 | newsize = cpu_to_be64((u64)isize); | ||
809 | attr = g_attr_logical_length; | ||
810 | attr.val_ptr = &newsize; | ||
811 | osd_req_add_set_attr_list(or, &attr, 1); | ||
812 | 829 | ||
813 | /* if we are about to truncate an object, and it hasn't been | 830 | /* if we are about to truncate an object, and it hasn't been |
814 | * created yet, wait | 831 | * created yet, wait |
@@ -816,8 +833,7 @@ void exofs_truncate(struct inode *inode) | |||
816 | if (unlikely(wait_obj_created(oi))) | 833 | if (unlikely(wait_obj_created(oi))) |
817 | goto fail; | 834 | goto fail; |
818 | 835 | ||
819 | ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); | 836 | ret = _do_truncate(inode); |
820 | osd_end_request(or); | ||
821 | if (ret) | 837 | if (ret) |
822 | goto fail; | 838 | goto fail; |
823 | 839 | ||
@@ -845,67 +861,110 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr) | |||
845 | return error; | 861 | return error; |
846 | } | 862 | } |
847 | 863 | ||
864 | static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( | ||
865 | EXOFS_APAGE_FS_DATA, | ||
866 | EXOFS_ATTR_INODE_FILE_LAYOUT, | ||
867 | 0); | ||
868 | static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( | ||
869 | EXOFS_APAGE_FS_DATA, | ||
870 | EXOFS_ATTR_INODE_DIR_LAYOUT, | ||
871 | 0); | ||
872 | |||
848 | /* | 873 | /* |
849 | * Read an inode from the OSD, and return it as is. We also return the size | 874 | * Read the Linux inode info from the OSD, and return it as is. In exofs the |
850 | * attribute in the 'sanity' argument if we got compiled with debugging turned | 875 | * inode info is in an application specific page/attribute of the osd-object. |
851 | * on. | ||
852 | */ | 876 | */ |
853 | static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, | 877 | static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, |
854 | struct exofs_fcb *inode, uint64_t *sanity) | 878 | struct exofs_fcb *inode) |
855 | { | 879 | { |
856 | struct exofs_sb_info *sbi = sb->s_fs_info; | 880 | struct exofs_sb_info *sbi = sb->s_fs_info; |
857 | struct osd_request *or; | 881 | struct osd_attr attrs[] = { |
858 | struct osd_attr attr; | 882 | [0] = g_attr_inode_data, |
859 | struct osd_obj_id obj = {sbi->s_pid, | 883 | [1] = g_attr_inode_file_layout, |
860 | oi->vfs_inode.i_ino + EXOFS_OBJ_OFF}; | 884 | [2] = g_attr_inode_dir_layout, |
885 | }; | ||
886 | struct exofs_io_state *ios; | ||
887 | struct exofs_on_disk_inode_layout *layout; | ||
861 | int ret; | 888 | int ret; |
862 | 889 | ||
863 | exofs_make_credential(oi->i_cred, &obj); | 890 | ret = exofs_get_io_state(&sbi->layout, &ios); |
864 | 891 | if (unlikely(ret)) { | |
865 | or = osd_start_request(sbi->s_dev, GFP_KERNEL); | 892 | EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); |
866 | if (unlikely(!or)) { | 893 | return ret; |
867 | EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n"); | ||
868 | return -ENOMEM; | ||
869 | } | 894 | } |
870 | osd_req_get_attributes(or, &obj); | ||
871 | 895 | ||
872 | /* we need the inode attribute */ | 896 | ios->obj.id = exofs_oi_objno(oi); |
873 | osd_req_add_get_attr_list(or, &g_attr_inode_data, 1); | 897 | exofs_make_credential(oi->i_cred, &ios->obj); |
898 | ios->cred = oi->i_cred; | ||
874 | 899 | ||
875 | #ifdef EXOFS_DEBUG_OBJ_ISIZE | 900 | attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); |
876 | /* we get the size attributes to do a sanity check */ | 901 | attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); |
877 | osd_req_add_get_attr_list(or, &g_attr_logical_length, 1); | ||
878 | #endif | ||
879 | 902 | ||
880 | ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); | 903 | ios->in_attr = attrs; |
881 | if (ret) | 904 | ios->in_attr_len = ARRAY_SIZE(attrs); |
905 | |||
906 | ret = exofs_sbi_read(ios); | ||
907 | if (unlikely(ret)) { | ||
908 | EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", | ||
909 | _LLU(ios->obj.id), ret); | ||
910 | memset(inode, 0, sizeof(*inode)); | ||
911 | inode->i_mode = 0040000 | (0777 & ~022); | ||
912 | /* If object is lost on target we might as well enable it's | ||
913 | * delete. | ||
914 | */ | ||
915 | if ((ret == -ENOENT) || (ret == -EINVAL)) | ||
916 | ret = 0; | ||
882 | goto out; | 917 | goto out; |
918 | } | ||
883 | 919 | ||
884 | attr = g_attr_inode_data; | 920 | ret = extract_attr_from_ios(ios, &attrs[0]); |
885 | ret = extract_attr_from_req(or, &attr); | ||
886 | if (ret) { | 921 | if (ret) { |
887 | EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n"); | 922 | EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); |
888 | goto out; | 923 | goto out; |
889 | } | 924 | } |
925 | WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); | ||
926 | memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE); | ||
890 | 927 | ||
891 | WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE); | 928 | ret = extract_attr_from_ios(ios, &attrs[1]); |
892 | memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE); | 929 | if (ret) { |
930 | EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); | ||
931 | goto out; | ||
932 | } | ||
933 | if (attrs[1].len) { | ||
934 | layout = attrs[1].val_ptr; | ||
935 | if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { | ||
936 | EXOFS_ERR("%s: unsupported files layout %d\n", | ||
937 | __func__, layout->gen_func); | ||
938 | ret = -ENOTSUPP; | ||
939 | goto out; | ||
940 | } | ||
941 | } | ||
893 | 942 | ||
894 | #ifdef EXOFS_DEBUG_OBJ_ISIZE | 943 | ret = extract_attr_from_ios(ios, &attrs[2]); |
895 | attr = g_attr_logical_length; | ||
896 | ret = extract_attr_from_req(or, &attr); | ||
897 | if (ret) { | 944 | if (ret) { |
898 | EXOFS_ERR("ERROR: extract attr from or failed\n"); | 945 | EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); |
899 | goto out; | 946 | goto out; |
900 | } | 947 | } |
901 | *sanity = get_unaligned_be64(attr.val_ptr); | 948 | if (attrs[2].len) { |
902 | #endif | 949 | layout = attrs[2].val_ptr; |
950 | if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { | ||
951 | EXOFS_ERR("%s: unsupported meta-data layout %d\n", | ||
952 | __func__, layout->gen_func); | ||
953 | ret = -ENOTSUPP; | ||
954 | goto out; | ||
955 | } | ||
956 | } | ||
903 | 957 | ||
904 | out: | 958 | out: |
905 | osd_end_request(or); | 959 | exofs_put_io_state(ios); |
906 | return ret; | 960 | return ret; |
907 | } | 961 | } |
908 | 962 | ||
963 | static void __oi_init(struct exofs_i_info *oi) | ||
964 | { | ||
965 | init_waitqueue_head(&oi->i_wq); | ||
966 | oi->i_flags = 0; | ||
967 | } | ||
909 | /* | 968 | /* |
910 | * Fill in an inode read from the OSD and set it up for use | 969 | * Fill in an inode read from the OSD and set it up for use |
911 | */ | 970 | */ |
@@ -914,7 +973,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) | |||
914 | struct exofs_i_info *oi; | 973 | struct exofs_i_info *oi; |
915 | struct exofs_fcb fcb; | 974 | struct exofs_fcb fcb; |
916 | struct inode *inode; | 975 | struct inode *inode; |
917 | uint64_t uninitialized_var(sanity); | ||
918 | int ret; | 976 | int ret; |
919 | 977 | ||
920 | inode = iget_locked(sb, ino); | 978 | inode = iget_locked(sb, ino); |
@@ -923,13 +981,13 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) | |||
923 | if (!(inode->i_state & I_NEW)) | 981 | if (!(inode->i_state & I_NEW)) |
924 | return inode; | 982 | return inode; |
925 | oi = exofs_i(inode); | 983 | oi = exofs_i(inode); |
984 | __oi_init(oi); | ||
926 | 985 | ||
927 | /* read the inode from the osd */ | 986 | /* read the inode from the osd */ |
928 | ret = exofs_get_inode(sb, oi, &fcb, &sanity); | 987 | ret = exofs_get_inode(sb, oi, &fcb); |
929 | if (ret) | 988 | if (ret) |
930 | goto bad_inode; | 989 | goto bad_inode; |
931 | 990 | ||
932 | init_waitqueue_head(&oi->i_wq); | ||
933 | set_obj_created(oi); | 991 | set_obj_created(oi); |
934 | 992 | ||
935 | /* copy stuff from on-disk struct to in-memory struct */ | 993 | /* copy stuff from on-disk struct to in-memory struct */ |
@@ -947,15 +1005,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) | |||
947 | inode->i_blkbits = EXOFS_BLKSHIFT; | 1005 | inode->i_blkbits = EXOFS_BLKSHIFT; |
948 | inode->i_generation = le32_to_cpu(fcb.i_generation); | 1006 | inode->i_generation = le32_to_cpu(fcb.i_generation); |
949 | 1007 | ||
950 | #ifdef EXOFS_DEBUG_OBJ_ISIZE | ||
951 | if ((inode->i_size != sanity) && | ||
952 | (!exofs_inode_is_fast_symlink(inode))) { | ||
953 | EXOFS_ERR("WARNING: Size of object from inode and " | ||
954 | "attributes differ (%lld != %llu)\n", | ||
955 | inode->i_size, _LLU(sanity)); | ||
956 | } | ||
957 | #endif | ||
958 | |||
959 | oi->i_dir_start_lookup = 0; | 1008 | oi->i_dir_start_lookup = 0; |
960 | 1009 | ||
961 | if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { | 1010 | if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { |
@@ -1020,23 +1069,30 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi) | |||
1020 | * set the obj_created flag so that other methods know that the object exists on | 1069 | * set the obj_created flag so that other methods know that the object exists on |
1021 | * the OSD. | 1070 | * the OSD. |
1022 | */ | 1071 | */ |
1023 | static void create_done(struct osd_request *or, void *p) | 1072 | static void create_done(struct exofs_io_state *ios, void *p) |
1024 | { | 1073 | { |
1025 | struct inode *inode = p; | 1074 | struct inode *inode = p; |
1026 | struct exofs_i_info *oi = exofs_i(inode); | 1075 | struct exofs_i_info *oi = exofs_i(inode); |
1027 | struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; | 1076 | struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; |
1028 | int ret; | 1077 | int ret; |
1029 | 1078 | ||
1030 | ret = exofs_check_ok(or); | 1079 | ret = exofs_check_io(ios, NULL); |
1031 | osd_end_request(or); | 1080 | exofs_put_io_state(ios); |
1081 | |||
1032 | atomic_dec(&sbi->s_curr_pending); | 1082 | atomic_dec(&sbi->s_curr_pending); |
1033 | 1083 | ||
1034 | if (unlikely(ret)) { | 1084 | if (unlikely(ret)) { |
1035 | EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", | 1085 | EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", |
1036 | _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF)); | 1086 | _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); |
1037 | make_bad_inode(inode); | 1087 | /*TODO: When FS is corrupted creation can fail, object already |
1038 | } else | 1088 | * exist. Get rid of this asynchronous creation, if exist |
1039 | set_obj_created(oi); | 1089 | * increment the obj counter and try the next object. Until we |
1090 | * succeed. All these dangling objects will be made into lost | ||
1091 | * files by chkfs.exofs | ||
1092 | */ | ||
1093 | } | ||
1094 | |||
1095 | set_obj_created(oi); | ||
1040 | 1096 | ||
1041 | atomic_dec(&inode->i_count); | 1097 | atomic_dec(&inode->i_count); |
1042 | wake_up(&oi->i_wq); | 1098 | wake_up(&oi->i_wq); |
@@ -1051,8 +1107,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) | |||
1051 | struct inode *inode; | 1107 | struct inode *inode; |
1052 | struct exofs_i_info *oi; | 1108 | struct exofs_i_info *oi; |
1053 | struct exofs_sb_info *sbi; | 1109 | struct exofs_sb_info *sbi; |
1054 | struct osd_request *or; | 1110 | struct exofs_io_state *ios; |
1055 | struct osd_obj_id obj; | ||
1056 | int ret; | 1111 | int ret; |
1057 | 1112 | ||
1058 | sb = dir->i_sb; | 1113 | sb = dir->i_sb; |
@@ -1061,8 +1116,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) | |||
1061 | return ERR_PTR(-ENOMEM); | 1116 | return ERR_PTR(-ENOMEM); |
1062 | 1117 | ||
1063 | oi = exofs_i(inode); | 1118 | oi = exofs_i(inode); |
1119 | __oi_init(oi); | ||
1064 | 1120 | ||
1065 | init_waitqueue_head(&oi->i_wq); | ||
1066 | set_obj_2bcreated(oi); | 1121 | set_obj_2bcreated(oi); |
1067 | 1122 | ||
1068 | sbi = sb->s_fs_info; | 1123 | sbi = sb->s_fs_info; |
@@ -1089,28 +1144,28 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) | |||
1089 | 1144 | ||
1090 | mark_inode_dirty(inode); | 1145 | mark_inode_dirty(inode); |
1091 | 1146 | ||
1092 | obj.partition = sbi->s_pid; | 1147 | ret = exofs_get_io_state(&sbi->layout, &ios); |
1093 | obj.id = inode->i_ino + EXOFS_OBJ_OFF; | 1148 | if (unlikely(ret)) { |
1094 | exofs_make_credential(oi->i_cred, &obj); | 1149 | EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); |
1095 | 1150 | return ERR_PTR(ret); | |
1096 | or = osd_start_request(sbi->s_dev, GFP_KERNEL); | ||
1097 | if (unlikely(!or)) { | ||
1098 | EXOFS_ERR("exofs_new_inode: osd_start_request failed\n"); | ||
1099 | return ERR_PTR(-ENOMEM); | ||
1100 | } | 1151 | } |
1101 | 1152 | ||
1102 | osd_req_create_object(or, &obj); | 1153 | ios->obj.id = exofs_oi_objno(oi); |
1154 | exofs_make_credential(oi->i_cred, &ios->obj); | ||
1103 | 1155 | ||
1104 | /* increment the refcount so that the inode will still be around when we | 1156 | /* increment the refcount so that the inode will still be around when we |
1105 | * reach the callback | 1157 | * reach the callback |
1106 | */ | 1158 | */ |
1107 | atomic_inc(&inode->i_count); | 1159 | atomic_inc(&inode->i_count); |
1108 | 1160 | ||
1109 | ret = exofs_async_op(or, create_done, inode, oi->i_cred); | 1161 | ios->done = create_done; |
1162 | ios->private = inode; | ||
1163 | ios->cred = oi->i_cred; | ||
1164 | ret = exofs_sbi_create(ios); | ||
1110 | if (ret) { | 1165 | if (ret) { |
1111 | atomic_dec(&inode->i_count); | 1166 | atomic_dec(&inode->i_count); |
1112 | osd_end_request(or); | 1167 | exofs_put_io_state(ios); |
1113 | return ERR_PTR(-EIO); | 1168 | return ERR_PTR(ret); |
1114 | } | 1169 | } |
1115 | atomic_inc(&sbi->s_curr_pending); | 1170 | atomic_inc(&sbi->s_curr_pending); |
1116 | 1171 | ||
@@ -1128,11 +1183,11 @@ struct updatei_args { | |||
1128 | /* | 1183 | /* |
1129 | * Callback function from exofs_update_inode(). | 1184 | * Callback function from exofs_update_inode(). |
1130 | */ | 1185 | */ |
1131 | static void updatei_done(struct osd_request *or, void *p) | 1186 | static void updatei_done(struct exofs_io_state *ios, void *p) |
1132 | { | 1187 | { |
1133 | struct updatei_args *args = p; | 1188 | struct updatei_args *args = p; |
1134 | 1189 | ||
1135 | osd_end_request(or); | 1190 | exofs_put_io_state(ios); |
1136 | 1191 | ||
1137 | atomic_dec(&args->sbi->s_curr_pending); | 1192 | atomic_dec(&args->sbi->s_curr_pending); |
1138 | 1193 | ||
@@ -1148,16 +1203,17 @@ static int exofs_update_inode(struct inode *inode, int do_sync) | |||
1148 | struct exofs_i_info *oi = exofs_i(inode); | 1203 | struct exofs_i_info *oi = exofs_i(inode); |
1149 | struct super_block *sb = inode->i_sb; | 1204 | struct super_block *sb = inode->i_sb; |
1150 | struct exofs_sb_info *sbi = sb->s_fs_info; | 1205 | struct exofs_sb_info *sbi = sb->s_fs_info; |
1151 | struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; | 1206 | struct exofs_io_state *ios; |
1152 | struct osd_request *or; | ||
1153 | struct osd_attr attr; | 1207 | struct osd_attr attr; |
1154 | struct exofs_fcb *fcb; | 1208 | struct exofs_fcb *fcb; |
1155 | struct updatei_args *args; | 1209 | struct updatei_args *args; |
1156 | int ret; | 1210 | int ret; |
1157 | 1211 | ||
1158 | args = kzalloc(sizeof(*args), GFP_KERNEL); | 1212 | args = kzalloc(sizeof(*args), GFP_KERNEL); |
1159 | if (!args) | 1213 | if (!args) { |
1214 | EXOFS_DBGMSG("Faild kzalloc of args\n"); | ||
1160 | return -ENOMEM; | 1215 | return -ENOMEM; |
1216 | } | ||
1161 | 1217 | ||
1162 | fcb = &args->fcb; | 1218 | fcb = &args->fcb; |
1163 | 1219 | ||
@@ -1186,18 +1242,16 @@ static int exofs_update_inode(struct inode *inode, int do_sync) | |||
1186 | } else | 1242 | } else |
1187 | memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); | 1243 | memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); |
1188 | 1244 | ||
1189 | or = osd_start_request(sbi->s_dev, GFP_KERNEL); | 1245 | ret = exofs_get_io_state(&sbi->layout, &ios); |
1190 | if (unlikely(!or)) { | 1246 | if (unlikely(ret)) { |
1191 | EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n"); | 1247 | EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); |
1192 | ret = -ENOMEM; | ||
1193 | goto free_args; | 1248 | goto free_args; |
1194 | } | 1249 | } |
1195 | 1250 | ||
1196 | osd_req_set_attributes(or, &obj); | ||
1197 | |||
1198 | attr = g_attr_inode_data; | 1251 | attr = g_attr_inode_data; |
1199 | attr.val_ptr = fcb; | 1252 | attr.val_ptr = fcb; |
1200 | osd_req_add_set_attr_list(or, &attr, 1); | 1253 | ios->out_attr_len = 1; |
1254 | ios->out_attr = &attr; | ||
1201 | 1255 | ||
1202 | if (!obj_created(oi)) { | 1256 | if (!obj_created(oi)) { |
1203 | EXOFS_DBGMSG("!obj_created\n"); | 1257 | EXOFS_DBGMSG("!obj_created\n"); |
@@ -1206,43 +1260,42 @@ static int exofs_update_inode(struct inode *inode, int do_sync) | |||
1206 | EXOFS_DBGMSG("wait_event done\n"); | 1260 | EXOFS_DBGMSG("wait_event done\n"); |
1207 | } | 1261 | } |
1208 | 1262 | ||
1209 | if (do_sync) { | 1263 | if (!do_sync) { |
1210 | ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); | ||
1211 | osd_end_request(or); | ||
1212 | goto free_args; | ||
1213 | } else { | ||
1214 | args->sbi = sbi; | 1264 | args->sbi = sbi; |
1265 | ios->done = updatei_done; | ||
1266 | ios->private = args; | ||
1267 | } | ||
1215 | 1268 | ||
1216 | ret = exofs_async_op(or, updatei_done, args, oi->i_cred); | 1269 | ret = exofs_oi_write(oi, ios); |
1217 | if (ret) { | 1270 | if (!do_sync && !ret) { |
1218 | osd_end_request(or); | ||
1219 | goto free_args; | ||
1220 | } | ||
1221 | atomic_inc(&sbi->s_curr_pending); | 1271 | atomic_inc(&sbi->s_curr_pending); |
1222 | goto out; /* deallocation in updatei_done */ | 1272 | goto out; /* deallocation in updatei_done */ |
1223 | } | 1273 | } |
1224 | 1274 | ||
1275 | exofs_put_io_state(ios); | ||
1225 | free_args: | 1276 | free_args: |
1226 | kfree(args); | 1277 | kfree(args); |
1227 | out: | 1278 | out: |
1228 | EXOFS_DBGMSG("ret=>%d\n", ret); | 1279 | EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n", |
1280 | inode->i_ino, do_sync, ret); | ||
1229 | return ret; | 1281 | return ret; |
1230 | } | 1282 | } |
1231 | 1283 | ||
1232 | int exofs_write_inode(struct inode *inode, int wait) | 1284 | int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) |
1233 | { | 1285 | { |
1234 | return exofs_update_inode(inode, wait); | 1286 | return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); |
1235 | } | 1287 | } |
1236 | 1288 | ||
1237 | /* | 1289 | /* |
1238 | * Callback function from exofs_delete_inode() - don't have much cleaning up to | 1290 | * Callback function from exofs_delete_inode() - don't have much cleaning up to |
1239 | * do. | 1291 | * do. |
1240 | */ | 1292 | */ |
1241 | static void delete_done(struct osd_request *or, void *p) | 1293 | static void delete_done(struct exofs_io_state *ios, void *p) |
1242 | { | 1294 | { |
1243 | struct exofs_sb_info *sbi; | 1295 | struct exofs_sb_info *sbi = p; |
1244 | osd_end_request(or); | 1296 | |
1245 | sbi = p; | 1297 | exofs_put_io_state(ios); |
1298 | |||
1246 | atomic_dec(&sbi->s_curr_pending); | 1299 | atomic_dec(&sbi->s_curr_pending); |
1247 | } | 1300 | } |
1248 | 1301 | ||
@@ -1256,8 +1309,7 @@ void exofs_delete_inode(struct inode *inode) | |||
1256 | struct exofs_i_info *oi = exofs_i(inode); | 1309 | struct exofs_i_info *oi = exofs_i(inode); |
1257 | struct super_block *sb = inode->i_sb; | 1310 | struct super_block *sb = inode->i_sb; |
1258 | struct exofs_sb_info *sbi = sb->s_fs_info; | 1311 | struct exofs_sb_info *sbi = sb->s_fs_info; |
1259 | struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; | 1312 | struct exofs_io_state *ios; |
1260 | struct osd_request *or; | ||
1261 | int ret; | 1313 | int ret; |
1262 | 1314 | ||
1263 | truncate_inode_pages(&inode->i_data, 0); | 1315 | truncate_inode_pages(&inode->i_data, 0); |
@@ -1274,25 +1326,26 @@ void exofs_delete_inode(struct inode *inode) | |||
1274 | 1326 | ||
1275 | clear_inode(inode); | 1327 | clear_inode(inode); |
1276 | 1328 | ||
1277 | or = osd_start_request(sbi->s_dev, GFP_KERNEL); | 1329 | ret = exofs_get_io_state(&sbi->layout, &ios); |
1278 | if (unlikely(!or)) { | 1330 | if (unlikely(ret)) { |
1279 | EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n"); | 1331 | EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); |
1280 | return; | 1332 | return; |
1281 | } | 1333 | } |
1282 | 1334 | ||
1283 | osd_req_remove_object(or, &obj); | ||
1284 | |||
1285 | /* if we are deleting an obj that hasn't been created yet, wait */ | 1335 | /* if we are deleting an obj that hasn't been created yet, wait */ |
1286 | if (!obj_created(oi)) { | 1336 | if (!obj_created(oi)) { |
1287 | BUG_ON(!obj_2bcreated(oi)); | 1337 | BUG_ON(!obj_2bcreated(oi)); |
1288 | wait_event(oi->i_wq, obj_created(oi)); | 1338 | wait_event(oi->i_wq, obj_created(oi)); |
1289 | } | 1339 | } |
1290 | 1340 | ||
1291 | ret = exofs_async_op(or, delete_done, sbi, oi->i_cred); | 1341 | ios->obj.id = exofs_oi_objno(oi); |
1342 | ios->done = delete_done; | ||
1343 | ios->private = sbi; | ||
1344 | ios->cred = oi->i_cred; | ||
1345 | ret = exofs_sbi_remove(ios); | ||
1292 | if (ret) { | 1346 | if (ret) { |
1293 | EXOFS_ERR( | 1347 | EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); |
1294 | "ERROR: @exofs_delete_inode exofs_async_op failed\n"); | 1348 | exofs_put_io_state(ios); |
1295 | osd_end_request(or); | ||
1296 | return; | 1349 | return; |
1297 | } | 1350 | } |
1298 | atomic_inc(&sbi->s_curr_pending); | 1351 | atomic_inc(&sbi->s_curr_pending); |
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c new file mode 100644 index 000000000000..4337cad7777b --- /dev/null +++ b/fs/exofs/ios.c | |||
@@ -0,0 +1,823 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2005, 2006 | ||
3 | * Avishay Traeger (avishay@gmail.com) | ||
4 | * Copyright (C) 2008, 2009 | ||
5 | * Boaz Harrosh <bharrosh@panasas.com> | ||
6 | * | ||
7 | * This file is part of exofs. | ||
8 | * | ||
9 | * exofs is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation. Since it is based on ext2, and the only | ||
12 | * valid version of GPL for the Linux kernel is version 2, the only valid | ||
13 | * version of GPL for exofs is version 2. | ||
14 | * | ||
15 | * exofs is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | * GNU General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with exofs; if not, write to the Free Software | ||
22 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
23 | */ | ||
24 | |||
25 | #include <linux/slab.h> | ||
26 | #include <scsi/scsi_device.h> | ||
27 | #include <asm/div64.h> | ||
28 | |||
29 | #include "exofs.h" | ||
30 | |||
31 | #define EXOFS_DBGMSG2(M...) do {} while (0) | ||
32 | /* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */ | ||
33 | |||
34 | void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) | ||
35 | { | ||
36 | osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); | ||
37 | } | ||
38 | |||
39 | int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, | ||
40 | u64 offset, void *p, unsigned length) | ||
41 | { | ||
42 | struct osd_request *or = osd_start_request(od, GFP_KERNEL); | ||
43 | /* struct osd_sense_info osi = {.key = 0};*/ | ||
44 | int ret; | ||
45 | |||
46 | if (unlikely(!or)) { | ||
47 | EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); | ||
48 | return -ENOMEM; | ||
49 | } | ||
50 | ret = osd_req_read_kern(or, obj, offset, p, length); | ||
51 | if (unlikely(ret)) { | ||
52 | EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); | ||
53 | goto out; | ||
54 | } | ||
55 | |||
56 | ret = osd_finalize_request(or, 0, cred, NULL); | ||
57 | if (unlikely(ret)) { | ||
58 | EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); | ||
59 | goto out; | ||
60 | } | ||
61 | |||
62 | ret = osd_execute_request(or); | ||
63 | if (unlikely(ret)) | ||
64 | EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); | ||
65 | /* osd_req_decode_sense(or, ret); */ | ||
66 | |||
67 | out: | ||
68 | osd_end_request(or); | ||
69 | return ret; | ||
70 | } | ||
71 | |||
72 | int exofs_get_io_state(struct exofs_layout *layout, | ||
73 | struct exofs_io_state **pios) | ||
74 | { | ||
75 | struct exofs_io_state *ios; | ||
76 | |||
77 | /*TODO: Maybe use kmem_cache per sbi of size | |
78 | * exofs_io_state_size(layout->s_numdevs) | ||
79 | */ | ||
80 | ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); | ||
81 | if (unlikely(!ios)) { | ||
82 | EXOFS_DBGMSG("Faild kzalloc bytes=%d\n", | ||
83 | exofs_io_state_size(layout->s_numdevs)); | ||
84 | *pios = NULL; | ||
85 | return -ENOMEM; | ||
86 | } | ||
87 | |||
88 | ios->layout = layout; | ||
89 | ios->obj.partition = layout->s_pid; | ||
90 | *pios = ios; | ||
91 | return 0; | ||
92 | } | ||
93 | |||
94 | void exofs_put_io_state(struct exofs_io_state *ios) | ||
95 | { | ||
96 | if (ios) { | ||
97 | unsigned i; | ||
98 | |||
99 | for (i = 0; i < ios->numdevs; i++) { | ||
100 | struct exofs_per_dev_state *per_dev = &ios->per_dev[i]; | ||
101 | |||
102 | if (per_dev->or) | ||
103 | osd_end_request(per_dev->or); | ||
104 | if (per_dev->bio) | ||
105 | bio_put(per_dev->bio); | ||
106 | } | ||
107 | |||
108 | kfree(ios); | ||
109 | } | ||
110 | } | ||
111 | |||
112 | unsigned exofs_layout_od_id(struct exofs_layout *layout, | ||
113 | osd_id obj_no, unsigned layout_index) | ||
114 | { | ||
115 | /* switch (layout->lay_func) { | ||
116 | case LAYOUT_MOVING_WINDOW: | ||
117 | {*/ | ||
118 | unsigned dev_mod = obj_no; | ||
119 | |||
120 | return (layout_index + dev_mod * layout->mirrors_p1) % | ||
121 | layout->s_numdevs; | ||
122 | /* } | ||
123 | case LAYOUT_FUNC_IMPLICT: | ||
124 | return layout->devs[layout_index]; | ||
125 | }*/ | ||
126 | } | ||
127 | |||
128 | static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, | ||
129 | unsigned layout_index) | ||
130 | { | ||
131 | return ios->layout->s_ods[ | ||
132 | exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)]; | ||
133 | } | ||
134 | |||
135 | static void _sync_done(struct exofs_io_state *ios, void *p) | ||
136 | { | ||
137 | struct completion *waiting = p; | ||
138 | |||
139 | complete(waiting); | ||
140 | } | ||
141 | |||
142 | static void _last_io(struct kref *kref) | ||
143 | { | ||
144 | struct exofs_io_state *ios = container_of( | ||
145 | kref, struct exofs_io_state, kref); | ||
146 | |||
147 | ios->done(ios, ios->private); | ||
148 | } | ||
149 | |||
150 | static void _done_io(struct osd_request *or, void *p) | ||
151 | { | ||
152 | struct exofs_io_state *ios = p; | ||
153 | |||
154 | kref_put(&ios->kref, _last_io); | ||
155 | } | ||
156 | |||
157 | static int exofs_io_execute(struct exofs_io_state *ios) | ||
158 | { | ||
159 | DECLARE_COMPLETION_ONSTACK(wait); | ||
160 | bool sync = (ios->done == NULL); | ||
161 | int i, ret; | ||
162 | |||
163 | if (sync) { | ||
164 | ios->done = _sync_done; | ||
165 | ios->private = &wait; | ||
166 | } | ||
167 | |||
168 | for (i = 0; i < ios->numdevs; i++) { | ||
169 | struct osd_request *or = ios->per_dev[i].or; | ||
170 | if (unlikely(!or)) | ||
171 | continue; | ||
172 | |||
173 | ret = osd_finalize_request(or, 0, ios->cred, NULL); | ||
174 | if (unlikely(ret)) { | ||
175 | EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", | ||
176 | ret); | ||
177 | return ret; | ||
178 | } | ||
179 | } | ||
180 | |||
181 | kref_init(&ios->kref); | ||
182 | |||
183 | for (i = 0; i < ios->numdevs; i++) { | ||
184 | struct osd_request *or = ios->per_dev[i].or; | ||
185 | if (unlikely(!or)) | ||
186 | continue; | ||
187 | |||
188 | kref_get(&ios->kref); | ||
189 | osd_execute_request_async(or, _done_io, ios); | ||
190 | } | ||
191 | |||
192 | kref_put(&ios->kref, _last_io); | ||
193 | ret = 0; | ||
194 | |||
195 | if (sync) { | ||
196 | wait_for_completion(&wait); | ||
197 | ret = exofs_check_io(ios, NULL); | ||
198 | } | ||
199 | return ret; | ||
200 | } | ||
201 | |||
202 | static void _clear_bio(struct bio *bio) | ||
203 | { | ||
204 | struct bio_vec *bv; | ||
205 | unsigned i; | ||
206 | |||
207 | __bio_for_each_segment(bv, bio, i, 0) { | ||
208 | unsigned this_count = bv->bv_len; | ||
209 | |||
210 | if (likely(PAGE_SIZE == this_count)) | ||
211 | clear_highpage(bv->bv_page); | ||
212 | else | ||
213 | zero_user(bv->bv_page, bv->bv_offset, this_count); | ||
214 | } | ||
215 | } | ||
216 | |||
217 | int exofs_check_io(struct exofs_io_state *ios, u64 *resid) | ||
218 | { | ||
219 | enum osd_err_priority acumulated_osd_err = 0; | ||
220 | int acumulated_lin_err = 0; | ||
221 | int i; | ||
222 | |||
223 | for (i = 0; i < ios->numdevs; i++) { | ||
224 | struct osd_sense_info osi; | ||
225 | struct osd_request *or = ios->per_dev[i].or; | ||
226 | int ret; | ||
227 | |||
228 | if (unlikely(!or)) | ||
229 | continue; | ||
230 | |||
231 | ret = osd_req_decode_sense(or, &osi); | ||
232 | if (likely(!ret)) | ||
233 | continue; | ||
234 | |||
235 | if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { | ||
236 | /* start read offset past end of file */ | |
237 | _clear_bio(ios->per_dev[i].bio); | ||
238 | EXOFS_DBGMSG("start read offset passed end of file " | ||
239 | "offset=0x%llx, length=0x%llx\n", | ||
240 | _LLU(ios->per_dev[i].offset), | ||
241 | _LLU(ios->per_dev[i].length)); | ||
242 | |||
243 | continue; /* we recovered */ | ||
244 | } | ||
245 | |||
246 | if (osi.osd_err_pri >= acumulated_osd_err) { | ||
247 | acumulated_osd_err = osi.osd_err_pri; | ||
248 | acumulated_lin_err = ret; | ||
249 | } | ||
250 | } | ||
251 | |||
252 | /* TODO: raid specific residual calculations */ | ||
253 | if (resid) { | ||
254 | if (likely(!acumulated_lin_err)) | ||
255 | *resid = 0; | ||
256 | else | ||
257 | *resid = ios->length; | ||
258 | } | ||
259 | |||
260 | return acumulated_lin_err; | ||
261 | } | ||
262 | |||
263 | /* | ||
264 | * L - logical offset into the file | ||
265 | * | ||
266 | * U - The number of bytes in a stripe within a group | ||
267 | * | ||
268 | * U = stripe_unit * group_width | ||
269 | * | ||
270 | * T - The number of bytes striped within a group of component objects | ||
271 | * (before advancing to the next group) | ||
272 | * | ||
273 | * T = stripe_unit * group_width * group_depth | ||
274 | * | ||
275 | * S - The number of bytes striped across all component objects | ||
276 | * before the pattern repeats | ||
277 | * | ||
278 | * S = stripe_unit * group_width * group_depth * group_count | ||
279 | * | ||
280 | * M - The "major" (i.e., across all components) stripe number | ||
281 | * | ||
282 | * M = L / S | ||
283 | * | ||
284 | * G - Counts the groups from the beginning of the major stripe | ||
285 | * | ||
286 | * G = (L - (M * S)) / T [or (L % S) / T] | ||
287 | * | ||
288 | * H - The byte offset within the group | ||
289 | * | ||
290 | * H = (L - (M * S)) % T [or (L % S) % T] | ||
291 | * | ||
292 | * N - The "minor" (i.e., across the group) stripe number | ||
293 | * | ||
294 | * N = H / U | ||
295 | * | ||
296 | * C - The component index corresponding to L | |
297 | * | ||
298 | * C = (H - (N * U)) / stripe_unit + G * group_width | ||
299 | * [or (L % U) / stripe_unit + G * group_width] | ||
300 | * | ||
301 | * O - The component offset corresponding to L | |
302 | * | ||
303 | * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit | ||
304 | */ | ||
305 | struct _striping_info { | ||
306 | u64 obj_offset; | ||
307 | u64 group_length; | ||
308 | u64 total_group_length; | ||
309 | u64 Major; | ||
310 | unsigned dev; | ||
311 | unsigned unit_off; | ||
312 | }; | ||
313 | |||
314 | static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, | ||
315 | struct _striping_info *si) | ||
316 | { | ||
317 | u32 stripe_unit = ios->layout->stripe_unit; | ||
318 | u32 group_width = ios->layout->group_width; | ||
319 | u64 group_depth = ios->layout->group_depth; | ||
320 | |||
321 | u32 U = stripe_unit * group_width; | ||
322 | u64 T = U * group_depth; | ||
323 | u64 S = T * ios->layout->group_count; | ||
324 | u64 M = div64_u64(file_offset, S); | ||
325 | |||
326 | /* | ||
327 | G = (L - (M * S)) / T | ||
328 | H = (L - (M * S)) % T | ||
329 | */ | ||
330 | u64 LmodS = file_offset - M * S; | ||
331 | u32 G = div64_u64(LmodS, T); | ||
332 | u64 H = LmodS - G * T; | ||
333 | |||
334 | u32 N = div_u64(H, U); | ||
335 | |||
336 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ | ||
337 | si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; | ||
338 | si->dev *= ios->layout->mirrors_p1; | ||
339 | |||
340 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); | ||
341 | |||
342 | si->obj_offset = si->unit_off + (N * stripe_unit) + | ||
343 | (M * group_depth * stripe_unit); | ||
344 | |||
345 | si->group_length = T - H; | ||
346 | si->total_group_length = T; | ||
347 | si->Major = M; | ||
348 | } | ||
349 | |||
350 | static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, | ||
351 | unsigned pgbase, struct exofs_per_dev_state *per_dev, | ||
352 | int cur_len) | ||
353 | { | ||
354 | unsigned pg = *cur_pg; | ||
355 | struct request_queue *q = | ||
356 | osd_request_queue(exofs_ios_od(ios, per_dev->dev)); | ||
357 | |||
358 | per_dev->length += cur_len; | ||
359 | |||
360 | if (per_dev->bio == NULL) { | ||
361 | unsigned pages_in_stripe = ios->layout->group_width * | ||
362 | (ios->layout->stripe_unit / PAGE_SIZE); | ||
363 | unsigned bio_size = (ios->nr_pages + pages_in_stripe) / | ||
364 | ios->layout->group_width; | ||
365 | |||
366 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); | ||
367 | if (unlikely(!per_dev->bio)) { | ||
368 | EXOFS_DBGMSG("Faild to allocate BIO size=%u\n", | ||
369 | bio_size); | ||
370 | return -ENOMEM; | ||
371 | } | ||
372 | } | ||
373 | |||
374 | while (cur_len > 0) { | ||
375 | unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); | ||
376 | unsigned added_len; | ||
377 | |||
378 | BUG_ON(ios->nr_pages <= pg); | ||
379 | cur_len -= pglen; | ||
380 | |||
381 | added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], | ||
382 | pglen, pgbase); | ||
383 | if (unlikely(pglen != added_len)) | ||
384 | return -ENOMEM; | ||
385 | pgbase = 0; | ||
386 | ++pg; | ||
387 | } | ||
388 | BUG_ON(cur_len); | ||
389 | |||
390 | *cur_pg = pg; | ||
391 | return 0; | ||
392 | } | ||
393 | |||
394 | static int _prepare_one_group(struct exofs_io_state *ios, u64 length, | ||
395 | struct _striping_info *si, unsigned first_comp) | ||
396 | { | ||
397 | unsigned stripe_unit = ios->layout->stripe_unit; | ||
398 | unsigned mirrors_p1 = ios->layout->mirrors_p1; | ||
399 | unsigned devs_in_group = ios->layout->group_width * mirrors_p1; | ||
400 | unsigned dev = si->dev; | ||
401 | unsigned first_dev = dev - (dev % devs_in_group); | ||
402 | unsigned comp = first_comp + (dev - first_dev); | ||
403 | unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; | ||
404 | unsigned cur_pg = ios->pages_consumed; | ||
405 | int ret = 0; | ||
406 | |||
407 | while (length) { | ||
408 | struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; | ||
409 | unsigned cur_len, page_off = 0; | ||
410 | |||
411 | if (!per_dev->length) { | ||
412 | per_dev->dev = dev; | ||
413 | if (dev < si->dev) { | ||
414 | per_dev->offset = si->obj_offset + stripe_unit - | ||
415 | si->unit_off; | ||
416 | cur_len = stripe_unit; | ||
417 | } else if (dev == si->dev) { | ||
418 | per_dev->offset = si->obj_offset; | ||
419 | cur_len = stripe_unit - si->unit_off; | ||
420 | page_off = si->unit_off & ~PAGE_MASK; | ||
421 | BUG_ON(page_off && (page_off != ios->pgbase)); | ||
422 | } else { /* dev > si->dev */ | ||
423 | per_dev->offset = si->obj_offset - si->unit_off; | ||
424 | cur_len = stripe_unit; | ||
425 | } | ||
426 | |||
427 | if (max_comp < comp) | ||
428 | max_comp = comp; | ||
429 | |||
430 | dev += mirrors_p1; | ||
431 | dev = (dev % devs_in_group) + first_dev; | ||
432 | } else { | ||
433 | cur_len = stripe_unit; | ||
434 | } | ||
435 | if (cur_len >= length) | ||
436 | cur_len = length; | ||
437 | |||
438 | ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, | ||
439 | cur_len); | ||
440 | if (unlikely(ret)) | ||
441 | goto out; | ||
442 | |||
443 | comp += mirrors_p1; | ||
444 | comp = (comp % devs_in_group) + first_comp; | ||
445 | |||
446 | length -= cur_len; | ||
447 | } | ||
448 | out: | ||
449 | ios->numdevs = max_comp + mirrors_p1; | ||
450 | ios->pages_consumed = cur_pg; | ||
451 | return ret; | ||
452 | } | ||
453 | |||
454 | static int _prepare_for_striping(struct exofs_io_state *ios) | ||
455 | { | ||
456 | u64 length = ios->length; | ||
457 | struct _striping_info si; | ||
458 | unsigned devs_in_group = ios->layout->group_width * | ||
459 | ios->layout->mirrors_p1; | ||
460 | unsigned first_comp = 0; | ||
461 | int ret = 0; | ||
462 | |||
463 | _calc_stripe_info(ios, ios->offset, &si); | ||
464 | |||
465 | if (!ios->pages) { | ||
466 | if (ios->kern_buff) { | ||
467 | struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; | ||
468 | |||
469 | per_dev->offset = si.obj_offset; | ||
470 | per_dev->dev = si.dev; | ||
471 | |||
472 | /* no cross device without page array */ | ||
473 | BUG_ON((ios->layout->group_width > 1) && | ||
474 | (si.unit_off + ios->length > | ||
475 | ios->layout->stripe_unit)); | ||
476 | } | ||
477 | ios->numdevs = ios->layout->mirrors_p1; | ||
478 | return 0; | ||
479 | } | ||
480 | |||
481 | while (length) { | ||
482 | if (length < si.group_length) | ||
483 | si.group_length = length; | ||
484 | |||
485 | ret = _prepare_one_group(ios, si.group_length, &si, first_comp); | ||
486 | if (unlikely(ret)) | ||
487 | goto out; | ||
488 | |||
489 | length -= si.group_length; | ||
490 | |||
491 | si.group_length = si.total_group_length; | ||
492 | si.unit_off = 0; | ||
493 | ++si.Major; | ||
494 | si.obj_offset = si.Major * ios->layout->stripe_unit * | ||
495 | ios->layout->group_depth; | ||
496 | |||
497 | si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; | ||
498 | si.dev %= ios->layout->s_numdevs; | ||
499 | |||
500 | first_comp += devs_in_group; | ||
501 | first_comp %= ios->layout->s_numdevs; | ||
502 | } | ||
503 | |||
504 | out: | ||
505 | return ret; | ||
506 | } | ||
507 | |||
508 | int exofs_sbi_create(struct exofs_io_state *ios) | ||
509 | { | ||
510 | int i, ret; | ||
511 | |||
512 | for (i = 0; i < ios->layout->s_numdevs; i++) { | ||
513 | struct osd_request *or; | ||
514 | |||
515 | or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); | ||
516 | if (unlikely(!or)) { | ||
517 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | ||
518 | ret = -ENOMEM; | ||
519 | goto out; | ||
520 | } | ||
521 | ios->per_dev[i].or = or; | ||
522 | ios->numdevs++; | ||
523 | |||
524 | osd_req_create_object(or, &ios->obj); | ||
525 | } | ||
526 | ret = exofs_io_execute(ios); | ||
527 | |||
528 | out: | ||
529 | return ret; | ||
530 | } | ||
531 | |||
532 | int exofs_sbi_remove(struct exofs_io_state *ios) | ||
533 | { | ||
534 | int i, ret; | ||
535 | |||
536 | for (i = 0; i < ios->layout->s_numdevs; i++) { | ||
537 | struct osd_request *or; | ||
538 | |||
539 | or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); | ||
540 | if (unlikely(!or)) { | ||
541 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | ||
542 | ret = -ENOMEM; | ||
543 | goto out; | ||
544 | } | ||
545 | ios->per_dev[i].or = or; | ||
546 | ios->numdevs++; | ||
547 | |||
548 | osd_req_remove_object(or, &ios->obj); | ||
549 | } | ||
550 | ret = exofs_io_execute(ios); | ||
551 | |||
552 | out: | ||
553 | return ret; | ||
554 | } | ||
555 | |||
556 | static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) | ||
557 | { | ||
558 | struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; | ||
559 | unsigned dev = ios->per_dev[cur_comp].dev; | ||
560 | unsigned last_comp = cur_comp + ios->layout->mirrors_p1; | ||
561 | int ret = 0; | ||
562 | |||
563 | if (ios->pages && !master_dev->length) | ||
564 | return 0; /* Just an empty slot */ | ||
565 | |||
566 | for (; cur_comp < last_comp; ++cur_comp, ++dev) { | ||
567 | struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; | ||
568 | struct osd_request *or; | ||
569 | |||
570 | or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); | ||
571 | if (unlikely(!or)) { | ||
572 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | ||
573 | ret = -ENOMEM; | ||
574 | goto out; | ||
575 | } | ||
576 | per_dev->or = or; | ||
577 | per_dev->offset = master_dev->offset; | ||
578 | |||
579 | if (ios->pages) { | ||
580 | struct bio *bio; | ||
581 | |||
582 | if (per_dev != master_dev) { | ||
583 | bio = bio_kmalloc(GFP_KERNEL, | ||
584 | master_dev->bio->bi_max_vecs); | ||
585 | if (unlikely(!bio)) { | ||
586 | EXOFS_DBGMSG( | ||
587 | "Faild to allocate BIO size=%u\n", | ||
588 | master_dev->bio->bi_max_vecs); | ||
589 | ret = -ENOMEM; | ||
590 | goto out; | ||
591 | } | ||
592 | |||
593 | __bio_clone(bio, master_dev->bio); | ||
594 | bio->bi_bdev = NULL; | ||
595 | bio->bi_next = NULL; | ||
596 | per_dev->length = master_dev->length; | ||
597 | per_dev->bio = bio; | ||
598 | per_dev->dev = dev; | ||
599 | } else { | ||
600 | bio = master_dev->bio; | ||
601 | /* FIXME: bio_set_dir() */ | ||
602 | bio->bi_rw |= (1 << BIO_RW); | ||
603 | } | ||
604 | |||
605 | osd_req_write(or, &ios->obj, per_dev->offset, bio, | ||
606 | per_dev->length); | ||
607 | EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " | ||
608 | "length=0x%llx dev=%d\n", | ||
609 | _LLU(ios->obj.id), _LLU(per_dev->offset), | ||
610 | _LLU(per_dev->length), dev); | ||
611 | } else if (ios->kern_buff) { | ||
612 | ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, | ||
613 | ios->kern_buff, ios->length); | ||
614 | if (unlikely(ret)) | ||
615 | goto out; | ||
616 | EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " | ||
617 | "length=0x%llx dev=%d\n", | ||
618 | _LLU(ios->obj.id), _LLU(per_dev->offset), | ||
619 | _LLU(ios->length), dev); | ||
620 | } else { | ||
621 | osd_req_set_attributes(or, &ios->obj); | ||
622 | EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", | ||
623 | _LLU(ios->obj.id), ios->out_attr_len, dev); | ||
624 | } | ||
625 | |||
626 | if (ios->out_attr) | ||
627 | osd_req_add_set_attr_list(or, ios->out_attr, | ||
628 | ios->out_attr_len); | ||
629 | |||
630 | if (ios->in_attr) | ||
631 | osd_req_add_get_attr_list(or, ios->in_attr, | ||
632 | ios->in_attr_len); | ||
633 | } | ||
634 | |||
635 | out: | ||
636 | return ret; | ||
637 | } | ||
638 | |||
639 | int exofs_sbi_write(struct exofs_io_state *ios) | ||
640 | { | ||
641 | int i; | ||
642 | int ret; | ||
643 | |||
644 | ret = _prepare_for_striping(ios); | ||
645 | if (unlikely(ret)) | ||
646 | return ret; | ||
647 | |||
648 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | ||
649 | ret = _sbi_write_mirror(ios, i); | ||
650 | if (unlikely(ret)) | ||
651 | return ret; | ||
652 | } | ||
653 | |||
654 | ret = exofs_io_execute(ios); | ||
655 | return ret; | ||
656 | } | ||
657 | |||
658 | static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) | ||
659 | { | ||
660 | struct osd_request *or; | ||
661 | struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; | ||
662 | unsigned first_dev = (unsigned)ios->obj.id; | ||
663 | |||
664 | if (ios->pages && !per_dev->length) | ||
665 | return 0; /* Just an empty slot */ | ||
666 | |||
667 | first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; | ||
668 | or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); | ||
669 | if (unlikely(!or)) { | ||
670 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | ||
671 | return -ENOMEM; | ||
672 | } | ||
673 | per_dev->or = or; | ||
674 | |||
675 | if (ios->pages) { | ||
676 | osd_req_read(or, &ios->obj, per_dev->offset, | ||
677 | per_dev->bio, per_dev->length); | ||
678 | EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" | ||
679 | " dev=%d\n", _LLU(ios->obj.id), | ||
680 | _LLU(per_dev->offset), _LLU(per_dev->length), | ||
681 | first_dev); | ||
682 | } else if (ios->kern_buff) { | ||
683 | int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, | ||
684 | ios->kern_buff, ios->length); | ||
685 | EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " | ||
686 | "length=0x%llx dev=%d ret=>%d\n", | ||
687 | _LLU(ios->obj.id), _LLU(per_dev->offset), | ||
688 | _LLU(ios->length), first_dev, ret); | ||
689 | if (unlikely(ret)) | ||
690 | return ret; | ||
691 | } else { | ||
692 | osd_req_get_attributes(or, &ios->obj); | ||
693 | EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", | ||
694 | _LLU(ios->obj.id), ios->in_attr_len, first_dev); | ||
695 | } | ||
696 | if (ios->out_attr) | ||
697 | osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); | ||
698 | |||
699 | if (ios->in_attr) | ||
700 | osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); | ||
701 | |||
702 | return 0; | ||
703 | } | ||
704 | |||
705 | int exofs_sbi_read(struct exofs_io_state *ios) | ||
706 | { | ||
707 | int i; | ||
708 | int ret; | ||
709 | |||
710 | ret = _prepare_for_striping(ios); | ||
711 | if (unlikely(ret)) | ||
712 | return ret; | ||
713 | |||
714 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | ||
715 | ret = _sbi_read_mirror(ios, i); | ||
716 | if (unlikely(ret)) | ||
717 | return ret; | ||
718 | } | ||
719 | |||
720 | ret = exofs_io_execute(ios); | ||
721 | return ret; | ||
722 | } | ||
723 | |||
724 | int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) | ||
725 | { | ||
726 | struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ | ||
727 | void *iter = NULL; | ||
728 | int nelem; | ||
729 | |||
730 | do { | ||
731 | nelem = 1; | ||
732 | osd_req_decode_get_attr_list(ios->per_dev[0].or, | ||
733 | &cur_attr, &nelem, &iter); | ||
734 | if ((cur_attr.attr_page == attr->attr_page) && | ||
735 | (cur_attr.attr_id == attr->attr_id)) { | ||
736 | attr->len = cur_attr.len; | ||
737 | attr->val_ptr = cur_attr.val_ptr; | ||
738 | return 0; | ||
739 | } | ||
740 | } while (iter); | ||
741 | |||
742 | return -EIO; | ||
743 | } | ||
744 | |||
745 | static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, | ||
746 | struct osd_attr *attr) | ||
747 | { | ||
748 | int last_comp = cur_comp + ios->layout->mirrors_p1; | ||
749 | |||
750 | for (; cur_comp < last_comp; ++cur_comp) { | ||
751 | struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; | ||
752 | struct osd_request *or; | ||
753 | |||
754 | or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); | ||
755 | if (unlikely(!or)) { | ||
756 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | ||
757 | return -ENOMEM; | ||
758 | } | ||
759 | per_dev->or = or; | ||
760 | |||
761 | osd_req_set_attributes(or, &ios->obj); | ||
762 | osd_req_add_set_attr_list(or, attr, 1); | ||
763 | } | ||
764 | |||
765 | return 0; | ||
766 | } | ||
767 | |||
768 | int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) | ||
769 | { | ||
770 | struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; | ||
771 | struct exofs_io_state *ios; | ||
772 | struct exofs_trunc_attr { | ||
773 | struct osd_attr attr; | ||
774 | __be64 newsize; | ||
775 | } *size_attrs; | ||
776 | struct _striping_info si; | ||
777 | int i, ret; | ||
778 | |||
779 | ret = exofs_get_io_state(&sbi->layout, &ios); | ||
780 | if (unlikely(ret)) | ||
781 | return ret; | ||
782 | |||
783 | size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), | ||
784 | GFP_KERNEL); | ||
785 | if (unlikely(!size_attrs)) { | ||
786 | ret = -ENOMEM; | ||
787 | goto out; | ||
788 | } | ||
789 | |||
790 | ios->obj.id = exofs_oi_objno(oi); | ||
791 | ios->cred = oi->i_cred; | ||
792 | |||
793 | ios->numdevs = ios->layout->s_numdevs; | ||
794 | _calc_stripe_info(ios, size, &si); | ||
795 | |||
796 | for (i = 0; i < ios->layout->group_width; ++i) { | ||
797 | struct exofs_trunc_attr *size_attr = &size_attrs[i]; | ||
798 | u64 obj_size; | ||
799 | |||
800 | if (i < si.dev) | ||
801 | obj_size = si.obj_offset + | ||
802 | ios->layout->stripe_unit - si.unit_off; | ||
803 | else if (i == si.dev) | ||
804 | obj_size = si.obj_offset; | ||
805 | else /* i > si.dev */ | ||
806 | obj_size = si.obj_offset - si.unit_off; | ||
807 | |||
808 | size_attr->newsize = cpu_to_be64(obj_size); | ||
809 | size_attr->attr = g_attr_logical_length; | ||
810 | size_attr->attr.val_ptr = &size_attr->newsize; | ||
811 | |||
812 | ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, | ||
813 | &size_attr->attr); | ||
814 | if (unlikely(ret)) | ||
815 | goto out; | ||
816 | } | ||
817 | ret = exofs_io_execute(ios); | ||
818 | |||
819 | out: | ||
820 | kfree(size_attrs); | ||
821 | exofs_put_io_state(ios); | ||
822 | return ret; | ||
823 | } | ||
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c deleted file mode 100644 index 4372542df284..000000000000 --- a/fs/exofs/osd.c +++ /dev/null | |||
@@ -1,125 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2005, 2006 | ||
3 | * Avishay Traeger (avishay@gmail.com) | ||
4 | * Copyright (C) 2008, 2009 | ||
5 | * Boaz Harrosh <bharrosh@panasas.com> | ||
6 | * | ||
7 | * This file is part of exofs. | ||
8 | * | ||
9 | * exofs is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation. Since it is based on ext2, and the only | ||
12 | * valid version of GPL for the Linux kernel is version 2, the only valid | ||
13 | * version of GPL for exofs is version 2. | ||
14 | * | ||
15 | * exofs is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | * GNU General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with exofs; if not, write to the Free Software | ||
22 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
23 | */ | ||
24 | |||
25 | #include <scsi/scsi_device.h> | ||
26 | #include <scsi/osd_sense.h> | ||
27 | |||
28 | #include "exofs.h" | ||
29 | |||
30 | int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid) | ||
31 | { | ||
32 | struct osd_sense_info osi; | ||
33 | int ret = osd_req_decode_sense(or, &osi); | ||
34 | |||
35 | if (ret) { /* translate to Linux codes */ | ||
36 | if (osi.additional_code == scsi_invalid_field_in_cdb) { | ||
37 | if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE) | ||
38 | ret = -EFAULT; | ||
39 | if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID) | ||
40 | ret = -ENOENT; | ||
41 | else | ||
42 | ret = -EINVAL; | ||
43 | } else if (osi.additional_code == osd_quota_error) | ||
44 | ret = -ENOSPC; | ||
45 | else | ||
46 | ret = -EIO; | ||
47 | } | ||
48 | |||
49 | /* FIXME: should be include in osd_sense_info */ | ||
50 | if (in_resid) | ||
51 | *in_resid = or->in.req ? or->in.req->resid_len : 0; | ||
52 | |||
53 | if (out_resid) | ||
54 | *out_resid = or->out.req ? or->out.req->resid_len : 0; | ||
55 | |||
56 | return ret; | ||
57 | } | ||
58 | |||
59 | void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) | ||
60 | { | ||
61 | osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * Perform a synchronous OSD operation. | ||
66 | */ | ||
67 | int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential) | ||
68 | { | ||
69 | int ret; | ||
70 | |||
71 | or->timeout = timeout; | ||
72 | ret = osd_finalize_request(or, 0, credential, NULL); | ||
73 | if (ret) { | ||
74 | EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); | ||
75 | return ret; | ||
76 | } | ||
77 | |||
78 | ret = osd_execute_request(or); | ||
79 | |||
80 | if (ret) | ||
81 | EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); | ||
82 | /* osd_req_decode_sense(or, ret); */ | ||
83 | return ret; | ||
84 | } | ||
85 | |||
86 | /* | ||
87 | * Perform an asynchronous OSD operation. | ||
88 | */ | ||
89 | int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done, | ||
90 | void *caller_context, u8 *cred) | ||
91 | { | ||
92 | int ret; | ||
93 | |||
94 | ret = osd_finalize_request(or, 0, cred, NULL); | ||
95 | if (ret) { | ||
96 | EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); | ||
97 | return ret; | ||
98 | } | ||
99 | |||
100 | ret = osd_execute_request_async(or, async_done, caller_context); | ||
101 | |||
102 | if (ret) | ||
103 | EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret); | ||
104 | return ret; | ||
105 | } | ||
106 | |||
107 | int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr) | ||
108 | { | ||
109 | struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ | ||
110 | void *iter = NULL; | ||
111 | int nelem; | ||
112 | |||
113 | do { | ||
114 | nelem = 1; | ||
115 | osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter); | ||
116 | if ((cur_attr.attr_page == attr->attr_page) && | ||
117 | (cur_attr.attr_id == attr->attr_id)) { | ||
118 | attr->len = cur_attr.len; | ||
119 | attr->val_ptr = cur_attr.val_ptr; | ||
120 | return 0; | ||
121 | } | ||
122 | } while (iter); | ||
123 | |||
124 | return -EIO; | ||
125 | } | ||
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h new file mode 100644 index 000000000000..c52e9888b8ab --- /dev/null +++ b/fs/exofs/pnfs.h | |||
@@ -0,0 +1,45 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008, 2009 | ||
3 | * Boaz Harrosh <bharrosh@panasas.com> | ||
4 | * | ||
5 | * This file is part of exofs. | ||
6 | * | ||
7 | * exofs is free software; you can redistribute it and/or modify it under the | ||
8 | * terms of the GNU General Public License version 2 as published by the Free | ||
9 | * Software Foundation. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | /* FIXME: Remove this file once pnfs hits mainline */ | ||
14 | |||
15 | #ifndef __EXOFS_PNFS_H__ | ||
16 | #define __EXOFS_PNFS_H__ | ||
17 | |||
18 | #if ! defined(__PNFS_OSD_XDR_H__) | ||
19 | |||
/* I/O mode values; local stand-in used while __PNFS_OSD_XDR_H__ is undefined */
enum pnfs_iomode {
	IOMODE_READ	= 1,
	IOMODE_RW	= 2,
	IOMODE_ANY	= 3,
};
25 | |||
/* Layout Structure */

/* RAID algorithm identifiers stored in pnfs_osd_data_map.odm_raid_algorithm */
enum pnfs_osd_raid_algorithm4 {
	PNFS_OSD_RAID_0		= 1,
	PNFS_OSD_RAID_4		= 2,
	PNFS_OSD_RAID_5		= 3,
	PNFS_OSD_RAID_PQ	= 4	/* Reed-Solomon P+Q */
};
33 | |||
34 | struct pnfs_osd_data_map { | ||
35 | u32 odm_num_comps; | ||
36 | u64 odm_stripe_unit; | ||
37 | u32 odm_group_width; | ||
38 | u32 odm_group_depth; | ||
39 | u32 odm_mirror_cnt; | ||
40 | u32 odm_raid_algorithm; | ||
41 | }; | ||
42 | |||
43 | #endif /* ! defined(__PNFS_OSD_XDR_H__) */ | ||
44 | |||
45 | #endif /* __EXOFS_PNFS_H__ */ | ||
diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 9f500dec3b59..18e57ea1e5b4 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/vfs.h> | 37 | #include <linux/vfs.h> |
38 | #include <linux/random.h> | 38 | #include <linux/random.h> |
39 | #include <linux/exportfs.h> | 39 | #include <linux/exportfs.h> |
40 | #include <linux/slab.h> | ||
40 | 41 | ||
41 | #include "exofs.h" | 42 | #include "exofs.h" |
42 | 43 | ||
@@ -203,49 +204,45 @@ int exofs_sync_fs(struct super_block *sb, int wait) | |||
203 | { | 204 | { |
204 | struct exofs_sb_info *sbi; | 205 | struct exofs_sb_info *sbi; |
205 | struct exofs_fscb *fscb; | 206 | struct exofs_fscb *fscb; |
206 | struct osd_request *or; | 207 | struct exofs_io_state *ios; |
207 | struct osd_obj_id obj; | ||
208 | int ret = -ENOMEM; | 208 | int ret = -ENOMEM; |
209 | 209 | ||
210 | fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL); | ||
211 | if (!fscb) { | ||
212 | EXOFS_ERR("exofs_write_super: memory allocation failed.\n"); | ||
213 | return -ENOMEM; | ||
214 | } | ||
215 | |||
216 | lock_super(sb); | 210 | lock_super(sb); |
217 | sbi = sb->s_fs_info; | 211 | sbi = sb->s_fs_info; |
212 | fscb = &sbi->s_fscb; | ||
213 | |||
214 | ret = exofs_get_io_state(&sbi->layout, &ios); | ||
215 | if (ret) | ||
216 | goto out; | ||
217 | |||
218 | /* Note: We only write the changing part of the fscb, i.e. up to the | ||
219 | * fscb->s_dev_table_oid member. There is no read-modify-write | ||
220 | * here. | ||
221 | */ | ||
222 | ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); | ||
223 | memset(fscb, 0, ios->length); | ||
218 | fscb->s_nextid = cpu_to_le64(sbi->s_nextid); | 224 | fscb->s_nextid = cpu_to_le64(sbi->s_nextid); |
219 | fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); | 225 | fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); |
220 | fscb->s_magic = cpu_to_le16(sb->s_magic); | 226 | fscb->s_magic = cpu_to_le16(sb->s_magic); |
221 | fscb->s_newfs = 0; | 227 | fscb->s_newfs = 0; |
228 | fscb->s_version = EXOFS_FSCB_VER; | ||
222 | 229 | ||
223 | or = osd_start_request(sbi->s_dev, GFP_KERNEL); | 230 | ios->obj.id = EXOFS_SUPER_ID; |
224 | if (unlikely(!or)) { | 231 | ios->offset = 0; |
225 | EXOFS_ERR("exofs_write_super: osd_start_request failed.\n"); | 232 | ios->kern_buff = fscb; |
226 | goto out; | 233 | ios->cred = sbi->s_cred; |
227 | } | ||
228 | |||
229 | obj.partition = sbi->s_pid; | ||
230 | obj.id = EXOFS_SUPER_ID; | ||
231 | ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb)); | ||
232 | if (unlikely(ret)) { | ||
233 | EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n"); | ||
234 | goto out; | ||
235 | } | ||
236 | 234 | ||
237 | ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred); | 235 | ret = exofs_sbi_write(ios); |
238 | if (unlikely(ret)) { | 236 | if (unlikely(ret)) { |
239 | EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n"); | 237 | EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); |
240 | goto out; | 238 | goto out; |
241 | } | 239 | } |
242 | sb->s_dirt = 0; | 240 | sb->s_dirt = 0; |
243 | 241 | ||
244 | out: | 242 | out: |
245 | if (or) | 243 | EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); |
246 | osd_end_request(or); | 244 | exofs_put_io_state(ios); |
247 | unlock_super(sb); | 245 | unlock_super(sb); |
248 | kfree(fscb); | ||
249 | return ret; | 246 | return ret; |
250 | } | 247 | } |
251 | 248 | ||
@@ -257,6 +254,29 @@ static void exofs_write_super(struct super_block *sb) | |||
257 | sb->s_dirt = 0; | 254 | sb->s_dirt = 0; |
258 | } | 255 | } |
259 | 256 | ||
257 | static void _exofs_print_device(const char *msg, const char *dev_path, | ||
258 | struct osd_dev *od, u64 pid) | ||
259 | { | ||
260 | const struct osd_dev_info *odi = osduld_device_info(od); | ||
261 | |||
262 | printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n", | ||
263 | msg, dev_path ?: "", odi->osdname, _LLU(pid)); | ||
264 | } | ||
265 | |||
266 | void exofs_free_sbi(struct exofs_sb_info *sbi) | ||
267 | { | ||
268 | while (sbi->layout.s_numdevs) { | ||
269 | int i = --sbi->layout.s_numdevs; | ||
270 | struct osd_dev *od = sbi->layout.s_ods[i]; | ||
271 | |||
272 | if (od) { | ||
273 | sbi->layout.s_ods[i] = NULL; | ||
274 | osduld_put_device(od); | ||
275 | } | ||
276 | } | ||
277 | kfree(sbi); | ||
278 | } | ||
279 | |||
260 | /* | 280 | /* |
261 | * This function is called when the vfs is freeing the superblock. We just | 281 | * This function is called when the vfs is freeing the superblock. We just |
262 | * need to free our own part. | 282 | * need to free our own part. |
@@ -279,11 +299,235 @@ static void exofs_put_super(struct super_block *sb) | |||
279 | msecs_to_jiffies(100)); | 299 | msecs_to_jiffies(100)); |
280 | } | 300 | } |
281 | 301 | ||
282 | osduld_put_device(sbi->s_dev); | 302 | _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], |
283 | kfree(sb->s_fs_info); | 303 | sbi->layout.s_pid); |
304 | |||
305 | exofs_free_sbi(sbi); | ||
284 | sb->s_fs_info = NULL; | 306 | sb->s_fs_info = NULL; |
285 | } | 307 | } |
286 | 308 | ||
309 | static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, | ||
310 | struct exofs_device_table *dt) | ||
311 | { | ||
312 | u64 stripe_length; | ||
313 | |||
314 | sbi->data_map.odm_num_comps = | ||
315 | le32_to_cpu(dt->dt_data_map.cb_num_comps); | ||
316 | sbi->data_map.odm_stripe_unit = | ||
317 | le64_to_cpu(dt->dt_data_map.cb_stripe_unit); | ||
318 | sbi->data_map.odm_group_width = | ||
319 | le32_to_cpu(dt->dt_data_map.cb_group_width); | ||
320 | sbi->data_map.odm_group_depth = | ||
321 | le32_to_cpu(dt->dt_data_map.cb_group_depth); | ||
322 | sbi->data_map.odm_mirror_cnt = | ||
323 | le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); | ||
324 | sbi->data_map.odm_raid_algorithm = | ||
325 | le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); | ||
326 | |||
327 | /* FIXME: Only raid0 for now. if not so, do not mount */ | ||
328 | if (sbi->data_map.odm_num_comps != numdevs) { | ||
329 | EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", | ||
330 | sbi->data_map.odm_num_comps, numdevs); | ||
331 | return -EINVAL; | ||
332 | } | ||
333 | if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { | ||
334 | EXOFS_ERR("Only RAID_0 for now\n"); | ||
335 | return -EINVAL; | ||
336 | } | ||
337 | if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { | ||
338 | EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", | ||
339 | numdevs, sbi->data_map.odm_mirror_cnt); | ||
340 | return -EINVAL; | ||
341 | } | ||
342 | |||
343 | if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { | ||
344 | EXOFS_ERR("Stripe Unit(0x%llx)" | ||
345 | " must be Multples of PAGE_SIZE(0x%lx)\n", | ||
346 | _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); | ||
347 | return -EINVAL; | ||
348 | } | ||
349 | |||
350 | sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; | ||
351 | sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; | ||
352 | |||
353 | if (sbi->data_map.odm_group_width) { | ||
354 | sbi->layout.group_width = sbi->data_map.odm_group_width; | ||
355 | sbi->layout.group_depth = sbi->data_map.odm_group_depth; | ||
356 | if (!sbi->layout.group_depth) { | ||
357 | EXOFS_ERR("group_depth == 0 && group_width != 0\n"); | ||
358 | return -EINVAL; | ||
359 | } | ||
360 | sbi->layout.group_count = sbi->data_map.odm_num_comps / | ||
361 | sbi->layout.mirrors_p1 / | ||
362 | sbi->data_map.odm_group_width; | ||
363 | } else { | ||
364 | if (sbi->data_map.odm_group_depth) { | ||
365 | printk(KERN_NOTICE "Warning: group_depth ignored " | ||
366 | "group_width == 0 && group_depth == %d\n", | ||
367 | sbi->data_map.odm_group_depth); | ||
368 | sbi->data_map.odm_group_depth = 0; | ||
369 | } | ||
370 | sbi->layout.group_width = sbi->data_map.odm_num_comps / | ||
371 | sbi->layout.mirrors_p1; | ||
372 | sbi->layout.group_depth = -1; | ||
373 | sbi->layout.group_count = 1; | ||
374 | } | ||
375 | |||
376 | stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; | ||
377 | if (stripe_length >= (1ULL << 32)) { | ||
378 | EXOFS_ERR("Total Stripe length(0x%llx)" | ||
379 | " >= 32bit is not supported\n", _LLU(stripe_length)); | ||
380 | return -EINVAL; | ||
381 | } | ||
382 | |||
383 | return 0; | ||
384 | } | ||
385 | |||
386 | /* @odi is valid only as long as @fscb_dev is valid */ | ||
387 | static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, | ||
388 | struct osd_dev_info *odi) | ||
389 | { | ||
390 | odi->systemid_len = le32_to_cpu(dt_dev->systemid_len); | ||
391 | memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len); | ||
392 | |||
393 | odi->osdname_len = le32_to_cpu(dt_dev->osdname_len); | ||
394 | odi->osdname = dt_dev->osdname; | ||
395 | |||
396 | /* FIXME support long names. Will need a _put function */ | ||
397 | if (dt_dev->long_name_offset) | ||
398 | return -EINVAL; | ||
399 | |||
400 | /* Make sure osdname is printable! | ||
401 | * mkexofs should give us space for a null-terminator else the | ||
402 | * device-table is invalid. | ||
403 | */ | ||
404 | if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname))) | ||
405 | odi->osdname_len = sizeof(dt_dev->osdname) - 1; | ||
406 | dt_dev->osdname[odi->osdname_len] = 0; | ||
407 | |||
408 | /* If it's all zeros something is bad we read past end-of-obj */ | ||
409 | return !(odi->systemid_len || odi->osdname_len); | ||
410 | } | ||
411 | |||
412 | static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, | ||
413 | unsigned table_count) | ||
414 | { | ||
415 | struct exofs_sb_info *sbi = *psbi; | ||
416 | struct osd_dev *fscb_od; | ||
417 | struct osd_obj_id obj = {.partition = sbi->layout.s_pid, | ||
418 | .id = EXOFS_DEVTABLE_ID}; | ||
419 | struct exofs_device_table *dt; | ||
420 | unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + | ||
421 | sizeof(*dt); | ||
422 | unsigned numdevs, i; | ||
423 | int ret; | ||
424 | |||
425 | dt = kmalloc(table_bytes, GFP_KERNEL); | ||
426 | if (unlikely(!dt)) { | ||
427 | EXOFS_ERR("ERROR: allocating %x bytes for device table\n", | ||
428 | table_bytes); | ||
429 | return -ENOMEM; | ||
430 | } | ||
431 | |||
432 | fscb_od = sbi->layout.s_ods[0]; | ||
433 | sbi->layout.s_ods[0] = NULL; | ||
434 | sbi->layout.s_numdevs = 0; | ||
435 | ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); | ||
436 | if (unlikely(ret)) { | ||
437 | EXOFS_ERR("ERROR: reading device table\n"); | ||
438 | goto out; | ||
439 | } | ||
440 | |||
441 | numdevs = le64_to_cpu(dt->dt_num_devices); | ||
442 | if (unlikely(!numdevs)) { | ||
443 | ret = -EINVAL; | ||
444 | goto out; | ||
445 | } | ||
446 | WARN_ON(table_count != numdevs); | ||
447 | |||
448 | ret = _read_and_match_data_map(sbi, numdevs, dt); | ||
449 | if (unlikely(ret)) | ||
450 | goto out; | ||
451 | |||
452 | if (likely(numdevs > 1)) { | ||
453 | unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]); | ||
454 | |||
455 | sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); | ||
456 | if (unlikely(!sbi)) { | ||
457 | ret = -ENOMEM; | ||
458 | goto out; | ||
459 | } | ||
460 | memset(&sbi->layout.s_ods[1], 0, | ||
461 | size - sizeof(sbi->layout.s_ods[0])); | ||
462 | *psbi = sbi; | ||
463 | } | ||
464 | |||
465 | for (i = 0; i < numdevs; i++) { | ||
466 | struct exofs_fscb fscb; | ||
467 | struct osd_dev_info odi; | ||
468 | struct osd_dev *od; | ||
469 | |||
470 | if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) { | ||
471 | EXOFS_ERR("ERROR: Read all-zeros device entry\n"); | ||
472 | ret = -EINVAL; | ||
473 | goto out; | ||
474 | } | ||
475 | |||
476 | printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", | ||
477 | i, odi.osdname); | ||
478 | |||
479 | /* On all devices the device table is identical. The user can | ||
480 | * specify any one of the participating devices on the command | ||
481 | * line. We always keep them in device-table order. | ||
482 | */ | ||
483 | if (fscb_od && osduld_device_same(fscb_od, &odi)) { | ||
484 | sbi->layout.s_ods[i] = fscb_od; | ||
485 | ++sbi->layout.s_numdevs; | ||
486 | fscb_od = NULL; | ||
487 | continue; | ||
488 | } | ||
489 | |||
490 | od = osduld_info_lookup(&odi); | ||
491 | if (unlikely(IS_ERR(od))) { | ||
492 | ret = PTR_ERR(od); | ||
493 | EXOFS_ERR("ERROR: device requested is not found " | ||
494 | "osd_name-%s =>%d\n", odi.osdname, ret); | ||
495 | goto out; | ||
496 | } | ||
497 | |||
498 | sbi->layout.s_ods[i] = od; | ||
499 | ++sbi->layout.s_numdevs; | ||
500 | |||
501 | /* Read the fscb of the other devices to make sure the FS | ||
502 | * partition is there. | ||
503 | */ | ||
504 | ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, | ||
505 | sizeof(fscb)); | ||
506 | if (unlikely(ret)) { | ||
507 | EXOFS_ERR("ERROR: Malformed participating device " | ||
508 | "error reading fscb osd_name-%s\n", | ||
509 | odi.osdname); | ||
510 | goto out; | ||
511 | } | ||
512 | |||
513 | /* TODO: verify other information is correct and FS-uuid | ||
514 | * matches. Benny what did you say about device table | ||
515 | * generation and old devices? | ||
516 | */ | ||
517 | } | ||
518 | |||
519 | out: | ||
520 | kfree(dt); | ||
521 | if (unlikely(!ret && fscb_od)) { | ||
522 | EXOFS_ERR( | ||
523 | "ERROR: Bad device-table container device not present\n"); | ||
524 | osduld_put_device(fscb_od); | ||
525 | ret = -EINVAL; | ||
526 | } | ||
527 | |||
528 | return ret; | ||
529 | } | ||
530 | |||
287 | /* | 531 | /* |
288 | * Read the superblock from the OSD and fill in the fields | 532 | * Read the superblock from the OSD and fill in the fields |
289 | */ | 533 | */ |
@@ -292,25 +536,32 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
292 | struct inode *root; | 536 | struct inode *root; |
293 | struct exofs_mountopt *opts = data; | 537 | struct exofs_mountopt *opts = data; |
294 | struct exofs_sb_info *sbi; /*extended info */ | 538 | struct exofs_sb_info *sbi; /*extended info */ |
539 | struct osd_dev *od; /* Master device */ | ||
295 | struct exofs_fscb fscb; /*on-disk superblock info */ | 540 | struct exofs_fscb fscb; /*on-disk superblock info */ |
296 | struct osd_request *or = NULL; | ||
297 | struct osd_obj_id obj; | 541 | struct osd_obj_id obj; |
542 | unsigned table_count; | ||
298 | int ret; | 543 | int ret; |
299 | 544 | ||
300 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); | 545 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); |
301 | if (!sbi) | 546 | if (!sbi) |
302 | return -ENOMEM; | 547 | return -ENOMEM; |
303 | sb->s_fs_info = sbi; | ||
304 | 548 | ||
305 | /* use mount options to fill superblock */ | 549 | /* use mount options to fill superblock */ |
306 | sbi->s_dev = osduld_path_lookup(opts->dev_name); | 550 | od = osduld_path_lookup(opts->dev_name); |
307 | if (IS_ERR(sbi->s_dev)) { | 551 | if (IS_ERR(od)) { |
308 | ret = PTR_ERR(sbi->s_dev); | 552 | ret = PTR_ERR(od); |
309 | sbi->s_dev = NULL; | ||
310 | goto free_sbi; | 553 | goto free_sbi; |
311 | } | 554 | } |
312 | 555 | ||
313 | sbi->s_pid = opts->pid; | 556 | /* Default layout in case we do not have a device-table */ |
557 | sbi->layout.stripe_unit = PAGE_SIZE; | ||
558 | sbi->layout.mirrors_p1 = 1; | ||
559 | sbi->layout.group_width = 1; | ||
560 | sbi->layout.group_depth = -1; | ||
561 | sbi->layout.group_count = 1; | ||
562 | sbi->layout.s_ods[0] = od; | ||
563 | sbi->layout.s_numdevs = 1; | ||
564 | sbi->layout.s_pid = opts->pid; | ||
314 | sbi->s_timeout = opts->timeout; | 565 | sbi->s_timeout = opts->timeout; |
315 | 566 | ||
316 | /* fill in some other data by hand */ | 567 | /* fill in some other data by hand */ |
@@ -323,35 +574,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
323 | sb->s_bdev = NULL; | 574 | sb->s_bdev = NULL; |
324 | sb->s_dev = 0; | 575 | sb->s_dev = 0; |
325 | 576 | ||
326 | /* read data from on-disk superblock object */ | 577 | obj.partition = sbi->layout.s_pid; |
327 | obj.partition = sbi->s_pid; | ||
328 | obj.id = EXOFS_SUPER_ID; | 578 | obj.id = EXOFS_SUPER_ID; |
329 | exofs_make_credential(sbi->s_cred, &obj); | 579 | exofs_make_credential(sbi->s_cred, &obj); |
330 | 580 | ||
331 | or = osd_start_request(sbi->s_dev, GFP_KERNEL); | 581 | ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb)); |
332 | if (unlikely(!or)) { | 582 | if (unlikely(ret)) |
333 | if (!silent) | ||
334 | EXOFS_ERR( | ||
335 | "exofs_fill_super: osd_start_request failed.\n"); | ||
336 | ret = -ENOMEM; | ||
337 | goto free_sbi; | ||
338 | } | ||
339 | ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb)); | ||
340 | if (unlikely(ret)) { | ||
341 | if (!silent) | ||
342 | EXOFS_ERR( | ||
343 | "exofs_fill_super: osd_req_read_kern failed.\n"); | ||
344 | ret = -ENOMEM; | ||
345 | goto free_sbi; | ||
346 | } | ||
347 | |||
348 | ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred); | ||
349 | if (unlikely(ret)) { | ||
350 | if (!silent) | ||
351 | EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n"); | ||
352 | ret = -EIO; | ||
353 | goto free_sbi; | 583 | goto free_sbi; |
354 | } | ||
355 | 584 | ||
356 | sb->s_magic = le16_to_cpu(fscb.s_magic); | 585 | sb->s_magic = le16_to_cpu(fscb.s_magic); |
357 | sbi->s_nextid = le64_to_cpu(fscb.s_nextid); | 586 | sbi->s_nextid = le64_to_cpu(fscb.s_nextid); |
@@ -364,12 +593,26 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
364 | ret = -EINVAL; | 593 | ret = -EINVAL; |
365 | goto free_sbi; | 594 | goto free_sbi; |
366 | } | 595 | } |
596 | if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) { | ||
597 | EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", | ||
598 | EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); | ||
599 | ret = -EINVAL; | ||
600 | goto free_sbi; | ||
601 | } | ||
367 | 602 | ||
368 | /* start generation numbers from a random point */ | 603 | /* start generation numbers from a random point */ |
369 | get_random_bytes(&sbi->s_next_generation, sizeof(u32)); | 604 | get_random_bytes(&sbi->s_next_generation, sizeof(u32)); |
370 | spin_lock_init(&sbi->s_next_gen_lock); | 605 | spin_lock_init(&sbi->s_next_gen_lock); |
371 | 606 | ||
607 | table_count = le64_to_cpu(fscb.s_dev_table_count); | ||
608 | if (table_count) { | ||
609 | ret = exofs_read_lookup_dev_table(&sbi, table_count); | ||
610 | if (unlikely(ret)) | ||
611 | goto free_sbi; | ||
612 | } | ||
613 | |||
372 | /* set up operation vectors */ | 614 | /* set up operation vectors */ |
615 | sb->s_fs_info = sbi; | ||
373 | sb->s_op = &exofs_sops; | 616 | sb->s_op = &exofs_sops; |
374 | sb->s_export_op = &exofs_export_ops; | 617 | sb->s_export_op = &exofs_export_ops; |
375 | root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); | 618 | root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); |
@@ -395,16 +638,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
395 | goto free_sbi; | 638 | goto free_sbi; |
396 | } | 639 | } |
397 | 640 | ||
398 | ret = 0; | 641 | _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], |
399 | out: | 642 | sbi->layout.s_pid); |
400 | if (or) | 643 | return 0; |
401 | osd_end_request(or); | ||
402 | return ret; | ||
403 | 644 | ||
404 | free_sbi: | 645 | free_sbi: |
405 | osduld_put_device(sbi->s_dev); /* NULL safe */ | 646 | EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", |
406 | kfree(sbi); | 647 | opts->dev_name, sbi->layout.s_pid, ret); |
407 | goto out; | 648 | exofs_free_sbi(sbi); |
649 | return ret; | ||
408 | } | 650 | } |
409 | 651 | ||
410 | /* | 652 | /* |
@@ -433,7 +675,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
433 | { | 675 | { |
434 | struct super_block *sb = dentry->d_sb; | 676 | struct super_block *sb = dentry->d_sb; |
435 | struct exofs_sb_info *sbi = sb->s_fs_info; | 677 | struct exofs_sb_info *sbi = sb->s_fs_info; |
436 | struct osd_obj_id obj = {sbi->s_pid, 0}; | 678 | struct exofs_io_state *ios; |
437 | struct osd_attr attrs[] = { | 679 | struct osd_attr attrs[] = { |
438 | ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, | 680 | ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, |
439 | OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), | 681 | OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), |
@@ -442,32 +684,33 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
442 | }; | 684 | }; |
443 | uint64_t capacity = ULLONG_MAX; | 685 | uint64_t capacity = ULLONG_MAX; |
444 | uint64_t used = ULLONG_MAX; | 686 | uint64_t used = ULLONG_MAX; |
445 | struct osd_request *or; | ||
446 | uint8_t cred_a[OSD_CAP_LEN]; | 687 | uint8_t cred_a[OSD_CAP_LEN]; |
447 | int ret; | 688 | int ret; |
448 | 689 | ||
449 | /* get used/capacity attributes */ | 690 | ret = exofs_get_io_state(&sbi->layout, &ios); |
450 | exofs_make_credential(cred_a, &obj); | 691 | if (ret) { |
451 | 692 | EXOFS_DBGMSG("exofs_get_io_state failed.\n"); | |
452 | or = osd_start_request(sbi->s_dev, GFP_KERNEL); | 693 | return ret; |
453 | if (unlikely(!or)) { | ||
454 | EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n"); | ||
455 | return -ENOMEM; | ||
456 | } | 694 | } |
457 | 695 | ||
458 | osd_req_get_attributes(or, &obj); | 696 | exofs_make_credential(cred_a, &ios->obj); |
459 | osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs)); | 697 | ios->cred = sbi->s_cred; |
460 | ret = exofs_sync_op(or, sbi->s_timeout, cred_a); | 698 | ios->in_attr = attrs; |
699 | ios->in_attr_len = ARRAY_SIZE(attrs); | ||
700 | |||
701 | ret = exofs_sbi_read(ios); | ||
461 | if (unlikely(ret)) | 702 | if (unlikely(ret)) |
462 | goto out; | 703 | goto out; |
463 | 704 | ||
464 | ret = extract_attr_from_req(or, &attrs[0]); | 705 | ret = extract_attr_from_ios(ios, &attrs[0]); |
465 | if (likely(!ret)) | 706 | if (likely(!ret)) { |
466 | capacity = get_unaligned_be64(attrs[0].val_ptr); | 707 | capacity = get_unaligned_be64(attrs[0].val_ptr); |
467 | else | 708 | if (unlikely(!capacity)) |
709 | capacity = ULLONG_MAX; | ||
710 | } else | ||
468 | EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n"); | 711 | EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n"); |
469 | 712 | ||
470 | ret = extract_attr_from_req(or, &attrs[1]); | 713 | ret = extract_attr_from_ios(ios, &attrs[1]); |
471 | if (likely(!ret)) | 714 | if (likely(!ret)) |
472 | used = get_unaligned_be64(attrs[1].val_ptr); | 715 | used = get_unaligned_be64(attrs[1].val_ptr); |
473 | else | 716 | else |
@@ -476,15 +719,15 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
476 | /* fill in the stats buffer */ | 719 | /* fill in the stats buffer */ |
477 | buf->f_type = EXOFS_SUPER_MAGIC; | 720 | buf->f_type = EXOFS_SUPER_MAGIC; |
478 | buf->f_bsize = EXOFS_BLKSIZE; | 721 | buf->f_bsize = EXOFS_BLKSIZE; |
479 | buf->f_blocks = (capacity >> EXOFS_BLKSHIFT); | 722 | buf->f_blocks = capacity >> 9; |
480 | buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT); | 723 | buf->f_bfree = (capacity - used) >> 9; |
481 | buf->f_bavail = buf->f_bfree; | 724 | buf->f_bavail = buf->f_bfree; |
482 | buf->f_files = sbi->s_numfiles; | 725 | buf->f_files = sbi->s_numfiles; |
483 | buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; | 726 | buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; |
484 | buf->f_namelen = EXOFS_NAME_LEN; | 727 | buf->f_namelen = EXOFS_NAME_LEN; |
485 | 728 | ||
486 | out: | 729 | out: |
487 | osd_end_request(or); | 730 | exofs_put_io_state(ios); |
488 | return ret; | 731 | return ret; |
489 | } | 732 | } |
490 | 733 | ||