diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-06-10 18:02:42 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-06-10 18:02:42 -0400 |
commit | d1e1cda862c16252087374ac75949b0e89a5717e (patch) | |
tree | 544ce467bed23638949a1991b4f7b00e7472baa4 /fs/nfs/filelayout | |
parent | 07888238f55056605cd23aa4ea3ca97d5e15938f (diff) | |
parent | a914722f333b3359d2f4f12919380a334176bb89 (diff) |
Merge tag 'nfs-for-3.16-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
"Highlights include:
- massive cleanup of the NFS read/write code by Anna and Dros
- support multiple NFS read/write requests per page in order to deal
with non-page aligned pNFS striping. Also cleans up the r/wsize <
page size code nicely.
- stable fix for ensuring inode is declared uptodate only after all
the attributes have been checked.
- stable fix for a kernel Oops when remounting
- NFS over RDMA client fixes
- move the pNFS files layout driver into its own subdirectory"
* tag 'nfs-for-3.16-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (79 commits)
NFS: populate ->net in mount data when remounting
pnfs: fix lockup caused by pnfs_generic_pg_test
NFSv4.1: Fix typo in dprintk
NFSv4.1: Comment is now wrong and redundant to code
NFS: Use raw_write_seqcount_begin/end int nfs4_reclaim_open_state
xprtrdma: Disconnect on registration failure
xprtrdma: Remove BUG_ON() call sites
xprtrdma: Avoid deadlock when credit window is reset
SUNRPC: Move congestion window constants to header file
xprtrdma: Reset connection timeout after successful reconnect
xprtrdma: Use macros for reconnection timeout constants
xprtrdma: Allocate missing pagelist
xprtrdma: Remove Tavor MTU setting
xprtrdma: Ensure ia->ri_id->qp is not NULL when reconnecting
xprtrdma: Reduce the number of hardway buffer allocations
xprtrdma: Limit work done by completion handler
xprtrmda: Reduce calls to ib_poll_cq() in completion handlers
xprtrmda: Reduce lock contention in completion handlers
xprtrdma: Split the completion queue
xprtrdma: Make rpcrdma_ep_destroy() return void
...
Diffstat (limited to 'fs/nfs/filelayout')
-rw-r--r-- | fs/nfs/filelayout/Makefile | 5 | ||||
-rw-r--r-- | fs/nfs/filelayout/filelayout.c | 1406 | ||||
-rw-r--r-- | fs/nfs/filelayout/filelayout.h | 156 | ||||
-rw-r--r-- | fs/nfs/filelayout/filelayoutdev.c | 843 |
4 files changed, 2410 insertions, 0 deletions
diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile new file mode 100644 index 000000000000..8516cdffb9e9 --- /dev/null +++ b/fs/nfs/filelayout/Makefile | |||
@@ -0,0 +1,5 @@ | |||
1 | # | ||
2 | # Makefile for the pNFS Files Layout Driver kernel module | ||
3 | # | ||
4 | obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o | ||
5 | nfs_layout_nfsv41_files-y := filelayout.o filelayoutdev.o | ||
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c new file mode 100644 index 000000000000..d2eba1c13b7e --- /dev/null +++ b/fs/nfs/filelayout/filelayout.c | |||
@@ -0,0 +1,1406 @@ | |||
1 | /* | ||
2 | * Module for the pnfs nfs4 file layout driver. | ||
3 | * Defines all I/O and Policy interface operations, plus code | ||
4 | * to register itself with the pNFS client. | ||
5 | * | ||
6 | * Copyright (c) 2002 | ||
7 | * The Regents of the University of Michigan | ||
8 | * All Rights Reserved | ||
9 | * | ||
10 | * Dean Hildebrand <dhildebz@umich.edu> | ||
11 | * | ||
12 | * Permission is granted to use, copy, create derivative works, and | ||
13 | * redistribute this software and such derivative works for any purpose, | ||
14 | * so long as the name of the University of Michigan is not used in | ||
15 | * any advertising or publicity pertaining to the use or distribution | ||
16 | * of this software without specific, written prior authorization. If | ||
17 | * the above copyright notice or any other identification of the | ||
18 | * University of Michigan is included in any copy of any portion of | ||
19 | * this software, then the disclaimer below must also be included. | ||
20 | * | ||
21 | * This software is provided as is, without representation or warranty | ||
22 | * of any kind either express or implied, including without limitation | ||
23 | * the implied warranties of merchantability, fitness for a particular | ||
24 | * purpose, or noninfringement. The Regents of the University of | ||
25 | * Michigan shall not be liable for any damages, including special, | ||
26 | * indirect, incidental, or consequential damages, with respect to any | ||
27 | * claim arising out of or in connection with the use of the software, | ||
28 | * even if it has been or is hereafter advised of the possibility of | ||
29 | * such damages. | ||
30 | */ | ||
31 | |||
32 | #include <linux/nfs_fs.h> | ||
33 | #include <linux/nfs_page.h> | ||
34 | #include <linux/module.h> | ||
35 | |||
36 | #include <linux/sunrpc/metrics.h> | ||
37 | |||
38 | #include "../nfs4session.h" | ||
39 | #include "../internal.h" | ||
40 | #include "../delegation.h" | ||
41 | #include "filelayout.h" | ||
42 | #include "../nfs4trace.h" | ||
43 | |||
44 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | ||
45 | |||
46 | MODULE_LICENSE("GPL"); | ||
47 | MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); | ||
48 | MODULE_DESCRIPTION("The NFSv4 file layout driver"); | ||
49 | |||
50 | #define FILELAYOUT_POLL_RETRY_MAX (15*HZ) | ||
51 | |||
52 | static loff_t | ||
53 | filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, | ||
54 | loff_t offset) | ||
55 | { | ||
56 | u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; | ||
57 | u64 stripe_no; | ||
58 | u32 rem; | ||
59 | |||
60 | offset -= flseg->pattern_offset; | ||
61 | stripe_no = div_u64(offset, stripe_width); | ||
62 | div_u64_rem(offset, flseg->stripe_unit, &rem); | ||
63 | |||
64 | return stripe_no * flseg->stripe_unit + rem; | ||
65 | } | ||
66 | |||
67 | /* This function is used by the layout driver to calculate the | ||
68 | * offset of the file on the dserver based on whether the | ||
69 | * layout type is STRIPE_DENSE or STRIPE_SPARSE | ||
70 | */ | ||
71 | static loff_t | ||
72 | filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) | ||
73 | { | ||
74 | struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); | ||
75 | |||
76 | switch (flseg->stripe_type) { | ||
77 | case STRIPE_SPARSE: | ||
78 | return offset; | ||
79 | |||
80 | case STRIPE_DENSE: | ||
81 | return filelayout_get_dense_offset(flseg, offset); | ||
82 | } | ||
83 | |||
84 | BUG(); | ||
85 | } | ||
86 | |||
87 | static void filelayout_reset_write(struct nfs_pgio_data *data) | ||
88 | { | ||
89 | struct nfs_pgio_header *hdr = data->header; | ||
90 | struct rpc_task *task = &data->task; | ||
91 | |||
92 | if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { | ||
93 | dprintk("%s Reset task %5u for i/o through MDS " | ||
94 | "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, | ||
95 | data->task.tk_pid, | ||
96 | hdr->inode->i_sb->s_id, | ||
97 | (unsigned long long)NFS_FILEID(hdr->inode), | ||
98 | data->args.count, | ||
99 | (unsigned long long)data->args.offset); | ||
100 | |||
101 | task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode, | ||
102 | &hdr->pages, | ||
103 | hdr->completion_ops, | ||
104 | hdr->dreq); | ||
105 | } | ||
106 | } | ||
107 | |||
108 | static void filelayout_reset_read(struct nfs_pgio_data *data) | ||
109 | { | ||
110 | struct nfs_pgio_header *hdr = data->header; | ||
111 | struct rpc_task *task = &data->task; | ||
112 | |||
113 | if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { | ||
114 | dprintk("%s Reset task %5u for i/o through MDS " | ||
115 | "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, | ||
116 | data->task.tk_pid, | ||
117 | hdr->inode->i_sb->s_id, | ||
118 | (unsigned long long)NFS_FILEID(hdr->inode), | ||
119 | data->args.count, | ||
120 | (unsigned long long)data->args.offset); | ||
121 | |||
122 | task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode, | ||
123 | &hdr->pages, | ||
124 | hdr->completion_ops, | ||
125 | hdr->dreq); | ||
126 | } | ||
127 | } | ||
128 | |||
129 | static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo) | ||
130 | { | ||
131 | if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) | ||
132 | return; | ||
133 | pnfs_return_layout(inode); | ||
134 | } | ||
135 | |||
/*
 * Classify the result of an RPC sent to a data server and start recovery.
 *
 * @task:  the completed RPC task; task->tk_status holds the result
 * @state: open state of the I/O, or NULL (the commit path passes NULL)
 * @clp:   client of the data server the RPC was sent to
 * @lseg:  layout segment the I/O was issued under
 *
 * Returns:
 *   0                      - nothing to retry: the task succeeded, or
 *                            stateid recovery could not be scheduled, in
 *                            which case task->tk_status is set to -EIO
 *   -EAGAIN                - the caller should restart the RPC;
 *                            task->tk_status has been reset to 0
 *   -NFS4ERR_RESET_TO_MDS  - the caller should redo the I/O through the MDS
 */
static int filelayout_async_handle_error(struct rpc_task *task,
					 struct nfs4_state *state,
					 struct nfs_client *clp,
					 struct pnfs_layout_segment *lseg)
{
	struct pnfs_layout_hdr *lo = lseg->pls_layout;
	struct inode *inode = lo->plh_inode;
	struct nfs_server *mds_server = NFS_SERVER(inode);
	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
	struct nfs_client *mds_client = mds_server->nfs_client;
	struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;

	if (task->tk_status >= 0)
		return 0;

	switch (task->tk_status) {
	/* MDS state errors */
	case -NFS4ERR_DELEG_REVOKED:
	case -NFS4ERR_ADMIN_REVOKED:
	case -NFS4ERR_BAD_STATEID:
		if (state == NULL)
			break;
		nfs_remove_bad_delegation(state->inode);
		/* fall through: also schedule stateid recovery */
	case -NFS4ERR_OPENMODE:
		if (state == NULL)
			break;
		if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
			goto out_bad_stateid;
		goto wait_on_recovery;
	case -NFS4ERR_EXPIRED:
		if (state != NULL) {
			if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
				goto out_bad_stateid;
		}
		nfs4_schedule_lease_recovery(mds_client);
		goto wait_on_recovery;
	/* DS session errors */
	case -NFS4ERR_BADSESSION:
	case -NFS4ERR_BADSLOT:
	case -NFS4ERR_BAD_HIGH_SLOT:
	case -NFS4ERR_DEADSESSION:
	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
	case -NFS4ERR_SEQ_FALSE_RETRY:
	case -NFS4ERR_SEQ_MISORDERED:
		dprintk("%s ERROR %d, Reset session. Exchangeid "
			"flags 0x%x\n", __func__, task->tk_status,
			clp->cl_exchange_flags);
		nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
		break;
	case -NFS4ERR_DELAY:
	case -NFS4ERR_GRACE:
		/* Back off before the retry requested at "out". */
		rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
		break;
	case -NFS4ERR_RETRY_UNCACHED_REP:
		break;
	/* Invalidate Layout errors */
	case -NFS4ERR_PNFS_NO_LAYOUT:
	case -ESTALE:           /* mapped NFS4ERR_STALE */
	case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
	case -EISDIR:           /* mapped NFS4ERR_ISDIR */
	case -NFS4ERR_FHEXPIRED:
	case -NFS4ERR_WRONG_TYPE:
		dprintk("%s Invalid layout error %d\n", __func__,
			task->tk_status);
		/*
		 * Destroy layout so new i/o will get a new layout.
		 * Layout will not be destroyed until all current lseg
		 * references are put. Mark layout as invalid to resend failed
		 * i/o and all i/o waiting on the slot table to the MDS until
		 * layout is destroyed and a new valid layout is obtained.
		 */
		pnfs_destroy_layout(NFS_I(inode));
		rpc_wake_up(&tbl->slot_tbl_waitq);
		goto reset;
	/* RPC connection errors */
	case -ECONNREFUSED:
	case -EHOSTDOWN:
	case -EHOSTUNREACH:
	case -ENETUNREACH:
	case -EIO:
	case -ETIMEDOUT:
	case -EPIPE:
		dprintk("%s DS connection error %d\n", __func__,
			task->tk_status);
		/*
		 * Mark the device unavailable and flag the layout for return;
		 * the flag is consumed by filelayout_fenceme().
		 */
		nfs4_mark_deviceid_unavailable(devid);
		set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
		rpc_wake_up(&tbl->slot_tbl_waitq);
		/* fall through */
	default:
reset:
		dprintk("%s Retry through MDS. Error %d\n", __func__,
			task->tk_status);
		return -NFS4ERR_RESET_TO_MDS;
	}
out:
	/* Every "break" above lands here: clear the status and ask for a retry. */
	task->tk_status = 0;
	return -EAGAIN;
out_bad_stateid:
	task->tk_status = -EIO;
	return 0;
wait_on_recovery:
	/*
	 * Queue the task on the MDS client's wait queue; if the state manager
	 * is not running, wake the task again immediately.
	 */
	rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
		rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
	goto out;
}
242 | |||
243 | /* NFS_PROTO call done callback routines */ | ||
244 | |||
245 | static int filelayout_read_done_cb(struct rpc_task *task, | ||
246 | struct nfs_pgio_data *data) | ||
247 | { | ||
248 | struct nfs_pgio_header *hdr = data->header; | ||
249 | int err; | ||
250 | |||
251 | trace_nfs4_pnfs_read(data, task->tk_status); | ||
252 | err = filelayout_async_handle_error(task, data->args.context->state, | ||
253 | data->ds_clp, hdr->lseg); | ||
254 | |||
255 | switch (err) { | ||
256 | case -NFS4ERR_RESET_TO_MDS: | ||
257 | filelayout_reset_read(data); | ||
258 | return task->tk_status; | ||
259 | case -EAGAIN: | ||
260 | rpc_restart_call_prepare(task); | ||
261 | return -EAGAIN; | ||
262 | } | ||
263 | |||
264 | return 0; | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * We reference the rpc_cred of the first WRITE that triggers the need for | ||
269 | * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. | ||
270 | * rfc5661 is not clear about which credential should be used. | ||
271 | */ | ||
272 | static void | ||
273 | filelayout_set_layoutcommit(struct nfs_pgio_data *wdata) | ||
274 | { | ||
275 | struct nfs_pgio_header *hdr = wdata->header; | ||
276 | |||
277 | if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || | ||
278 | wdata->res.verf->committed == NFS_FILE_SYNC) | ||
279 | return; | ||
280 | |||
281 | pnfs_set_layoutcommit(wdata); | ||
282 | dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, | ||
283 | (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); | ||
284 | } | ||
285 | |||
286 | bool | ||
287 | filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node) | ||
288 | { | ||
289 | return filelayout_test_devid_invalid(node) || | ||
290 | nfs4_test_deviceid_unavailable(node); | ||
291 | } | ||
292 | |||
293 | static bool | ||
294 | filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) | ||
295 | { | ||
296 | struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg); | ||
297 | |||
298 | return filelayout_test_devid_unavailable(node); | ||
299 | } | ||
300 | |||
301 | /* | ||
302 | * Call ops for the async read/write cases | ||
303 | * In the case of dense layouts, the offset needs to be reset to its | ||
304 | * original value. | ||
305 | */ | ||
306 | static void filelayout_read_prepare(struct rpc_task *task, void *data) | ||
307 | { | ||
308 | struct nfs_pgio_data *rdata = data; | ||
309 | |||
310 | if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { | ||
311 | rpc_exit(task, -EIO); | ||
312 | return; | ||
313 | } | ||
314 | if (filelayout_reset_to_mds(rdata->header->lseg)) { | ||
315 | dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); | ||
316 | filelayout_reset_read(rdata); | ||
317 | rpc_exit(task, 0); | ||
318 | return; | ||
319 | } | ||
320 | rdata->pgio_done_cb = filelayout_read_done_cb; | ||
321 | |||
322 | if (nfs41_setup_sequence(rdata->ds_clp->cl_session, | ||
323 | &rdata->args.seq_args, | ||
324 | &rdata->res.seq_res, | ||
325 | task)) | ||
326 | return; | ||
327 | if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, | ||
328 | rdata->args.lock_context, FMODE_READ) == -EIO) | ||
329 | rpc_exit(task, -EIO); /* lost lock, terminate I/O */ | ||
330 | } | ||
331 | |||
332 | static void filelayout_read_call_done(struct rpc_task *task, void *data) | ||
333 | { | ||
334 | struct nfs_pgio_data *rdata = data; | ||
335 | |||
336 | dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); | ||
337 | |||
338 | if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && | ||
339 | task->tk_status == 0) { | ||
340 | nfs41_sequence_done(task, &rdata->res.seq_res); | ||
341 | return; | ||
342 | } | ||
343 | |||
344 | /* Note this may cause RPC to be resent */ | ||
345 | rdata->header->mds_ops->rpc_call_done(task, data); | ||
346 | } | ||
347 | |||
348 | static void filelayout_read_count_stats(struct rpc_task *task, void *data) | ||
349 | { | ||
350 | struct nfs_pgio_data *rdata = data; | ||
351 | |||
352 | rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics); | ||
353 | } | ||
354 | |||
355 | static void filelayout_read_release(void *data) | ||
356 | { | ||
357 | struct nfs_pgio_data *rdata = data; | ||
358 | struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout; | ||
359 | |||
360 | filelayout_fenceme(lo->plh_inode, lo); | ||
361 | nfs_put_client(rdata->ds_clp); | ||
362 | rdata->header->mds_ops->rpc_release(data); | ||
363 | } | ||
364 | |||
365 | static int filelayout_write_done_cb(struct rpc_task *task, | ||
366 | struct nfs_pgio_data *data) | ||
367 | { | ||
368 | struct nfs_pgio_header *hdr = data->header; | ||
369 | int err; | ||
370 | |||
371 | trace_nfs4_pnfs_write(data, task->tk_status); | ||
372 | err = filelayout_async_handle_error(task, data->args.context->state, | ||
373 | data->ds_clp, hdr->lseg); | ||
374 | |||
375 | switch (err) { | ||
376 | case -NFS4ERR_RESET_TO_MDS: | ||
377 | filelayout_reset_write(data); | ||
378 | return task->tk_status; | ||
379 | case -EAGAIN: | ||
380 | rpc_restart_call_prepare(task); | ||
381 | return -EAGAIN; | ||
382 | } | ||
383 | |||
384 | filelayout_set_layoutcommit(data); | ||
385 | return 0; | ||
386 | } | ||
387 | |||
388 | /* Fake up some data that will cause nfs_commit_release to retry the writes. */ | ||
389 | static void prepare_to_resend_writes(struct nfs_commit_data *data) | ||
390 | { | ||
391 | struct nfs_page *first = nfs_list_entry(data->pages.next); | ||
392 | |||
393 | data->task.tk_status = 0; | ||
394 | memcpy(&data->verf.verifier, &first->wb_verf, | ||
395 | sizeof(data->verf.verifier)); | ||
396 | data->verf.verifier.data[0]++; /* ensure verifier mismatch */ | ||
397 | } | ||
398 | |||
399 | static int filelayout_commit_done_cb(struct rpc_task *task, | ||
400 | struct nfs_commit_data *data) | ||
401 | { | ||
402 | int err; | ||
403 | |||
404 | trace_nfs4_pnfs_commit_ds(data, task->tk_status); | ||
405 | err = filelayout_async_handle_error(task, NULL, data->ds_clp, | ||
406 | data->lseg); | ||
407 | |||
408 | switch (err) { | ||
409 | case -NFS4ERR_RESET_TO_MDS: | ||
410 | prepare_to_resend_writes(data); | ||
411 | return -EAGAIN; | ||
412 | case -EAGAIN: | ||
413 | rpc_restart_call_prepare(task); | ||
414 | return -EAGAIN; | ||
415 | } | ||
416 | |||
417 | return 0; | ||
418 | } | ||
419 | |||
420 | static void filelayout_write_prepare(struct rpc_task *task, void *data) | ||
421 | { | ||
422 | struct nfs_pgio_data *wdata = data; | ||
423 | |||
424 | if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { | ||
425 | rpc_exit(task, -EIO); | ||
426 | return; | ||
427 | } | ||
428 | if (filelayout_reset_to_mds(wdata->header->lseg)) { | ||
429 | dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); | ||
430 | filelayout_reset_write(wdata); | ||
431 | rpc_exit(task, 0); | ||
432 | return; | ||
433 | } | ||
434 | if (nfs41_setup_sequence(wdata->ds_clp->cl_session, | ||
435 | &wdata->args.seq_args, | ||
436 | &wdata->res.seq_res, | ||
437 | task)) | ||
438 | return; | ||
439 | if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, | ||
440 | wdata->args.lock_context, FMODE_WRITE) == -EIO) | ||
441 | rpc_exit(task, -EIO); /* lost lock, terminate I/O */ | ||
442 | } | ||
443 | |||
444 | static void filelayout_write_call_done(struct rpc_task *task, void *data) | ||
445 | { | ||
446 | struct nfs_pgio_data *wdata = data; | ||
447 | |||
448 | if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && | ||
449 | task->tk_status == 0) { | ||
450 | nfs41_sequence_done(task, &wdata->res.seq_res); | ||
451 | return; | ||
452 | } | ||
453 | |||
454 | /* Note this may cause RPC to be resent */ | ||
455 | wdata->header->mds_ops->rpc_call_done(task, data); | ||
456 | } | ||
457 | |||
458 | static void filelayout_write_count_stats(struct rpc_task *task, void *data) | ||
459 | { | ||
460 | struct nfs_pgio_data *wdata = data; | ||
461 | |||
462 | rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics); | ||
463 | } | ||
464 | |||
465 | static void filelayout_write_release(void *data) | ||
466 | { | ||
467 | struct nfs_pgio_data *wdata = data; | ||
468 | struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout; | ||
469 | |||
470 | filelayout_fenceme(lo->plh_inode, lo); | ||
471 | nfs_put_client(wdata->ds_clp); | ||
472 | wdata->header->mds_ops->rpc_release(data); | ||
473 | } | ||
474 | |||
475 | static void filelayout_commit_prepare(struct rpc_task *task, void *data) | ||
476 | { | ||
477 | struct nfs_commit_data *wdata = data; | ||
478 | |||
479 | nfs41_setup_sequence(wdata->ds_clp->cl_session, | ||
480 | &wdata->args.seq_args, | ||
481 | &wdata->res.seq_res, | ||
482 | task); | ||
483 | } | ||
484 | |||
485 | static void filelayout_write_commit_done(struct rpc_task *task, void *data) | ||
486 | { | ||
487 | struct nfs_commit_data *wdata = data; | ||
488 | |||
489 | /* Note this may cause RPC to be resent */ | ||
490 | wdata->mds_ops->rpc_call_done(task, data); | ||
491 | } | ||
492 | |||
493 | static void filelayout_commit_count_stats(struct rpc_task *task, void *data) | ||
494 | { | ||
495 | struct nfs_commit_data *cdata = data; | ||
496 | |||
497 | rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics); | ||
498 | } | ||
499 | |||
500 | static void filelayout_commit_release(void *calldata) | ||
501 | { | ||
502 | struct nfs_commit_data *data = calldata; | ||
503 | |||
504 | data->completion_ops->completion(data); | ||
505 | pnfs_put_lseg(data->lseg); | ||
506 | nfs_put_client(data->ds_clp); | ||
507 | nfs_commitdata_release(data); | ||
508 | } | ||
509 | |||
510 | static const struct rpc_call_ops filelayout_read_call_ops = { | ||
511 | .rpc_call_prepare = filelayout_read_prepare, | ||
512 | .rpc_call_done = filelayout_read_call_done, | ||
513 | .rpc_count_stats = filelayout_read_count_stats, | ||
514 | .rpc_release = filelayout_read_release, | ||
515 | }; | ||
516 | |||
517 | static const struct rpc_call_ops filelayout_write_call_ops = { | ||
518 | .rpc_call_prepare = filelayout_write_prepare, | ||
519 | .rpc_call_done = filelayout_write_call_done, | ||
520 | .rpc_count_stats = filelayout_write_count_stats, | ||
521 | .rpc_release = filelayout_write_release, | ||
522 | }; | ||
523 | |||
524 | static const struct rpc_call_ops filelayout_commit_call_ops = { | ||
525 | .rpc_call_prepare = filelayout_commit_prepare, | ||
526 | .rpc_call_done = filelayout_write_commit_done, | ||
527 | .rpc_count_stats = filelayout_commit_count_stats, | ||
528 | .rpc_release = filelayout_commit_release, | ||
529 | }; | ||
530 | |||
531 | static enum pnfs_try_status | ||
532 | filelayout_read_pagelist(struct nfs_pgio_data *data) | ||
533 | { | ||
534 | struct nfs_pgio_header *hdr = data->header; | ||
535 | struct pnfs_layout_segment *lseg = hdr->lseg; | ||
536 | struct nfs4_pnfs_ds *ds; | ||
537 | struct rpc_clnt *ds_clnt; | ||
538 | loff_t offset = data->args.offset; | ||
539 | u32 j, idx; | ||
540 | struct nfs_fh *fh; | ||
541 | |||
542 | dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", | ||
543 | __func__, hdr->inode->i_ino, | ||
544 | data->args.pgbase, (size_t)data->args.count, offset); | ||
545 | |||
546 | /* Retrieve the correct rpc_client for the byte range */ | ||
547 | j = nfs4_fl_calc_j_index(lseg, offset); | ||
548 | idx = nfs4_fl_calc_ds_index(lseg, j); | ||
549 | ds = nfs4_fl_prepare_ds(lseg, idx); | ||
550 | if (!ds) | ||
551 | return PNFS_NOT_ATTEMPTED; | ||
552 | |||
553 | ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode); | ||
554 | if (IS_ERR(ds_clnt)) | ||
555 | return PNFS_NOT_ATTEMPTED; | ||
556 | |||
557 | dprintk("%s USE DS: %s cl_count %d\n", __func__, | ||
558 | ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); | ||
559 | |||
560 | /* No multipath support. Use first DS */ | ||
561 | atomic_inc(&ds->ds_clp->cl_count); | ||
562 | data->ds_clp = ds->ds_clp; | ||
563 | data->ds_idx = idx; | ||
564 | fh = nfs4_fl_select_ds_fh(lseg, j); | ||
565 | if (fh) | ||
566 | data->args.fh = fh; | ||
567 | |||
568 | data->args.offset = filelayout_get_dserver_offset(lseg, offset); | ||
569 | data->mds_offset = offset; | ||
570 | |||
571 | /* Perform an asynchronous read to ds */ | ||
572 | nfs_initiate_pgio(ds_clnt, data, | ||
573 | &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); | ||
574 | return PNFS_ATTEMPTED; | ||
575 | } | ||
576 | |||
577 | /* Perform async writes. */ | ||
578 | static enum pnfs_try_status | ||
579 | filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) | ||
580 | { | ||
581 | struct nfs_pgio_header *hdr = data->header; | ||
582 | struct pnfs_layout_segment *lseg = hdr->lseg; | ||
583 | struct nfs4_pnfs_ds *ds; | ||
584 | struct rpc_clnt *ds_clnt; | ||
585 | loff_t offset = data->args.offset; | ||
586 | u32 j, idx; | ||
587 | struct nfs_fh *fh; | ||
588 | |||
589 | /* Retrieve the correct rpc_client for the byte range */ | ||
590 | j = nfs4_fl_calc_j_index(lseg, offset); | ||
591 | idx = nfs4_fl_calc_ds_index(lseg, j); | ||
592 | ds = nfs4_fl_prepare_ds(lseg, idx); | ||
593 | if (!ds) | ||
594 | return PNFS_NOT_ATTEMPTED; | ||
595 | |||
596 | ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode); | ||
597 | if (IS_ERR(ds_clnt)) | ||
598 | return PNFS_NOT_ATTEMPTED; | ||
599 | |||
600 | dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n", | ||
601 | __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, | ||
602 | offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); | ||
603 | |||
604 | data->pgio_done_cb = filelayout_write_done_cb; | ||
605 | atomic_inc(&ds->ds_clp->cl_count); | ||
606 | data->ds_clp = ds->ds_clp; | ||
607 | data->ds_idx = idx; | ||
608 | fh = nfs4_fl_select_ds_fh(lseg, j); | ||
609 | if (fh) | ||
610 | data->args.fh = fh; | ||
611 | |||
612 | data->args.offset = filelayout_get_dserver_offset(lseg, offset); | ||
613 | |||
614 | /* Perform an asynchronous write */ | ||
615 | nfs_initiate_pgio(ds_clnt, data, | ||
616 | &filelayout_write_call_ops, sync, | ||
617 | RPC_TASK_SOFTCONN); | ||
618 | return PNFS_ATTEMPTED; | ||
619 | } | ||
620 | |||
/*
 * filelayout_check_layout()
 *
 * Make sure layout segment parameters are sane WRT the device.
 * At this point no generic layer initialization of the lseg has occurred,
 * and nothing has been added to the layout_hdr cache.
 *
 * @lo:        layout header the segment belongs to
 * @fl:        decoded file layout segment to validate
 * @lgr:       LAYOUTGET result carrying the granted byte range
 * @id:        device id decoded from the layout
 * @gfp_flags: allocation flags passed on when device info must be fetched
 *
 * Returns 0 on success, with fl->dsaddr set and a device id reference
 * held.  Returns -EINVAL on any failed check; a device id reference
 * obtained here is dropped again on that path.
 */
static int
filelayout_check_layout(struct pnfs_layout_hdr *lo,
			struct nfs4_filelayout_segment *fl,
			struct nfs4_layoutget_res *lgr,
			struct nfs4_deviceid *id,
			gfp_t gfp_flags)
{
	struct nfs4_deviceid_node *d;
	struct nfs4_file_layout_dsaddr *dsaddr;
	int status = -EINVAL;

	dprintk("--> %s\n", __func__);

	/* FIXME: remove this check when layout segment support is added */
	if (lgr->range.offset != 0 ||
	    lgr->range.length != NFS4_MAX_UINT64) {
		dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
			__func__);
		goto out;
	}

	if (fl->pattern_offset > lgr->range.offset) {
		dprintk("%s pattern_offset %lld too large\n",
				__func__, fl->pattern_offset);
		goto out;
	}

	/* A zero stripe unit would be used as a divisor in the dense offset math. */
	if (!fl->stripe_unit) {
		dprintk("%s Invalid stripe unit (%u)\n",
			__func__, fl->stripe_unit);
		goto out;
	}

	/* find and reference the deviceid */
	d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
				   NFS_SERVER(lo->plh_inode)->nfs_client, id);
	if (d == NULL) {
		/* Not found by the lookup: fetch the device info instead. */
		dsaddr = filelayout_get_device_info(lo->plh_inode, id,
				lo->plh_lc_cred, gfp_flags);
		if (dsaddr == NULL)
			goto out;
	} else
		dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
	/* Found deviceid is unavailable */
	if (filelayout_test_devid_unavailable(&dsaddr->id_node))
		goto out_put;

	fl->dsaddr = dsaddr;

	if (fl->first_stripe_index >= dsaddr->stripe_count) {
		dprintk("%s Bad first_stripe_index %u\n",
				__func__, fl->first_stripe_index);
		goto out_put;
	}

	/*
	 * Sparse: more than one file handle requires one per data server.
	 * Dense: requires exactly one file handle per stripe.
	 */
	if ((fl->stripe_type == STRIPE_SPARSE &&
	    fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
	    (fl->stripe_type == STRIPE_DENSE &&
	    fl->num_fh != dsaddr->stripe_count)) {
		dprintk("%s num_fh %u not valid for given packing\n",
			__func__, fl->num_fh);
		goto out_put;
	}

	status = 0;
out:
	dprintk("--> %s returns %d\n", __func__, status);
	return status;
out_put:
	/* Drop the device id reference taken above, then report the failure. */
	nfs4_fl_put_deviceid(dsaddr);
	goto out;
}
701 | |||
702 | static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) | ||
703 | { | ||
704 | int i; | ||
705 | |||
706 | for (i = 0; i < fl->num_fh; i++) { | ||
707 | if (!fl->fh_array[i]) | ||
708 | break; | ||
709 | kfree(fl->fh_array[i]); | ||
710 | } | ||
711 | kfree(fl->fh_array); | ||
712 | fl->fh_array = NULL; | ||
713 | } | ||
714 | |||
/* Release a file layout segment: first its file handles, then @fl itself. */
static void
_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
{
	filelayout_free_fh_array(fl);

	kfree(fl);
}
721 | |||
722 | static int | ||
723 | filelayout_decode_layout(struct pnfs_layout_hdr *flo, | ||
724 | struct nfs4_filelayout_segment *fl, | ||
725 | struct nfs4_layoutget_res *lgr, | ||
726 | struct nfs4_deviceid *id, | ||
727 | gfp_t gfp_flags) | ||
728 | { | ||
729 | struct xdr_stream stream; | ||
730 | struct xdr_buf buf; | ||
731 | struct page *scratch; | ||
732 | __be32 *p; | ||
733 | uint32_t nfl_util; | ||
734 | int i; | ||
735 | |||
736 | dprintk("%s: set_layout_map Begin\n", __func__); | ||
737 | |||
738 | scratch = alloc_page(gfp_flags); | ||
739 | if (!scratch) | ||
740 | return -ENOMEM; | ||
741 | |||
742 | xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); | ||
743 | xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); | ||
744 | |||
745 | /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), | ||
746 | * num_fh (4) */ | ||
747 | p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20); | ||
748 | if (unlikely(!p)) | ||
749 | goto out_err; | ||
750 | |||
751 | memcpy(id, p, sizeof(*id)); | ||
752 | p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); | ||
753 | nfs4_print_deviceid(id); | ||
754 | |||
755 | nfl_util = be32_to_cpup(p++); | ||
756 | if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) | ||
757 | fl->commit_through_mds = 1; | ||
758 | if (nfl_util & NFL4_UFLG_DENSE) | ||
759 | fl->stripe_type = STRIPE_DENSE; | ||
760 | else | ||
761 | fl->stripe_type = STRIPE_SPARSE; | ||
762 | fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; | ||
763 | |||
764 | fl->first_stripe_index = be32_to_cpup(p++); | ||
765 | p = xdr_decode_hyper(p, &fl->pattern_offset); | ||
766 | fl->num_fh = be32_to_cpup(p++); | ||
767 | |||
768 | dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n", | ||
769 | __func__, nfl_util, fl->num_fh, fl->first_stripe_index, | ||
770 | fl->pattern_offset); | ||
771 | |||
772 | /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. | ||
773 | * Futher checking is done in filelayout_check_layout */ | ||
774 | if (fl->num_fh > | ||
775 | max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) | ||
776 | goto out_err; | ||
777 | |||
778 | if (fl->num_fh > 0) { | ||
779 | fl->fh_array = kcalloc(fl->num_fh, sizeof(fl->fh_array[0]), | ||
780 | gfp_flags); | ||
781 | if (!fl->fh_array) | ||
782 | goto out_err; | ||
783 | } | ||
784 | |||
785 | for (i = 0; i < fl->num_fh; i++) { | ||
786 | /* Do we want to use a mempool here? */ | ||
787 | fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags); | ||
788 | if (!fl->fh_array[i]) | ||
789 | goto out_err_free; | ||
790 | |||
791 | p = xdr_inline_decode(&stream, 4); | ||
792 | if (unlikely(!p)) | ||
793 | goto out_err_free; | ||
794 | fl->fh_array[i]->size = be32_to_cpup(p++); | ||
795 | if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { | ||
796 | printk(KERN_ERR "NFS: Too big fh %d received %d\n", | ||
797 | i, fl->fh_array[i]->size); | ||
798 | goto out_err_free; | ||
799 | } | ||
800 | |||
801 | p = xdr_inline_decode(&stream, fl->fh_array[i]->size); | ||
802 | if (unlikely(!p)) | ||
803 | goto out_err_free; | ||
804 | memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); | ||
805 | dprintk("DEBUG: %s: fh len %d\n", __func__, | ||
806 | fl->fh_array[i]->size); | ||
807 | } | ||
808 | |||
809 | __free_page(scratch); | ||
810 | return 0; | ||
811 | |||
812 | out_err_free: | ||
813 | filelayout_free_fh_array(fl); | ||
814 | out_err: | ||
815 | __free_page(scratch); | ||
816 | return -EIO; | ||
817 | } | ||
818 | |||
819 | static void | ||
820 | filelayout_free_lseg(struct pnfs_layout_segment *lseg) | ||
821 | { | ||
822 | struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); | ||
823 | |||
824 | dprintk("--> %s\n", __func__); | ||
825 | nfs4_fl_put_deviceid(fl->dsaddr); | ||
826 | /* This assumes a single RW lseg */ | ||
827 | if (lseg->pls_range.iomode == IOMODE_RW) { | ||
828 | struct nfs4_filelayout *flo; | ||
829 | |||
830 | flo = FILELAYOUT_FROM_HDR(lseg->pls_layout); | ||
831 | flo->commit_info.nbuckets = 0; | ||
832 | kfree(flo->commit_info.buckets); | ||
833 | flo->commit_info.buckets = NULL; | ||
834 | } | ||
835 | _filelayout_free_lseg(fl); | ||
836 | } | ||
837 | |||
/*
 * Make sure cinfo->ds has at least as many commit buckets as @lseg needs.
 *
 * Nothing is done when the segment commits through the MDS, or when the
 * existing bucket array is already large enough. Otherwise a larger array
 * is allocated, the contents of the existing buckets are moved into it
 * under cinfo->lock, and the arrays are swapped.
 *
 * Returns 0 on success (including the nothing-to-do cases) and -ENOMEM if
 * the new bucket array cannot be allocated.
 */
static int
filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
			     struct nfs_commit_info *cinfo,
			     gfp_t gfp_flags)
{
	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
	struct pnfs_commit_bucket *buckets;
	int size, i;

	if (fl->commit_through_mds)
		return 0;

	/* sparse layouts get one bucket per data server, dense ones per stripe */
	size = (fl->stripe_type == STRIPE_SPARSE) ?
		fl->dsaddr->ds_num : fl->dsaddr->stripe_count;

	/* unlocked fast path; the test is repeated under the lock below */
	if (cinfo->ds->nbuckets >= size) {
		/* This assumes there is only one IOMODE_RW lseg.  What
		 * we really want to do is have a layout_hdr level
		 * dictionary of <multipath_list4, fh> keys, each
		 * associated with a struct list_head, populated by calls
		 * to filelayout_write_pagelist().
		 * */
		return 0;
	}

	buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
			  gfp_flags);
	if (!buckets)
		return -ENOMEM;
	for (i = 0; i < size; i++) {
		INIT_LIST_HEAD(&buckets[i].written);
		INIT_LIST_HEAD(&buckets[i].committing);
		/* mark direct verifier as unset */
		buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
	}

	spin_lock(cinfo->lock);
	/* someone else may have grown the array while we were allocating */
	if (cinfo->ds->nbuckets >= size)
		goto out;
	/* carry over the lists, verifier state and lseg refs of old buckets */
	for (i = 0; i < cinfo->ds->nbuckets; i++) {
		list_splice(&cinfo->ds->buckets[i].written,
			    &buckets[i].written);
		list_splice(&cinfo->ds->buckets[i].committing,
			    &buckets[i].committing);
		buckets[i].direct_verf.committed =
			cinfo->ds->buckets[i].direct_verf.committed;
		buckets[i].wlseg = cinfo->ds->buckets[i].wlseg;
		buckets[i].clseg = cinfo->ds->buckets[i].clseg;
	}
	/* after the swap, 'buckets' refers to the old (now unused) array */
	swap(cinfo->ds->buckets, buckets);
	cinfo->ds->nbuckets = size;
out:
	spin_unlock(cinfo->lock);
	/* frees the old array after a swap, or the unused new one otherwise */
	kfree(buckets);
	return 0;
}
894 | |||
895 | static struct pnfs_layout_segment * | ||
896 | filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, | ||
897 | struct nfs4_layoutget_res *lgr, | ||
898 | gfp_t gfp_flags) | ||
899 | { | ||
900 | struct nfs4_filelayout_segment *fl; | ||
901 | int rc; | ||
902 | struct nfs4_deviceid id; | ||
903 | |||
904 | dprintk("--> %s\n", __func__); | ||
905 | fl = kzalloc(sizeof(*fl), gfp_flags); | ||
906 | if (!fl) | ||
907 | return NULL; | ||
908 | |||
909 | rc = filelayout_decode_layout(layoutid, fl, lgr, &id, gfp_flags); | ||
910 | if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id, gfp_flags)) { | ||
911 | _filelayout_free_lseg(fl); | ||
912 | return NULL; | ||
913 | } | ||
914 | return &fl->generic_hdr; | ||
915 | } | ||
916 | |||
917 | /* | ||
918 | * filelayout_pg_test(). Called by nfs_can_coalesce_requests() | ||
919 | * | ||
920 | * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number | ||
921 | * of bytes (maximum @req->wb_bytes) that can be coalesced. | ||
922 | */ | ||
923 | static size_t | ||
924 | filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, | ||
925 | struct nfs_page *req) | ||
926 | { | ||
927 | unsigned int size; | ||
928 | u64 p_stripe, r_stripe; | ||
929 | u32 stripe_offset; | ||
930 | u64 segment_offset = pgio->pg_lseg->pls_range.offset; | ||
931 | u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; | ||
932 | |||
933 | /* calls nfs_generic_pg_test */ | ||
934 | size = pnfs_generic_pg_test(pgio, prev, req); | ||
935 | if (!size) | ||
936 | return 0; | ||
937 | |||
938 | /* see if req and prev are in the same stripe */ | ||
939 | if (prev) { | ||
940 | p_stripe = (u64)req_offset(prev) - segment_offset; | ||
941 | r_stripe = (u64)req_offset(req) - segment_offset; | ||
942 | do_div(p_stripe, stripe_unit); | ||
943 | do_div(r_stripe, stripe_unit); | ||
944 | |||
945 | if (p_stripe != r_stripe) | ||
946 | return 0; | ||
947 | } | ||
948 | |||
949 | /* calculate remaining bytes in the current stripe */ | ||
950 | div_u64_rem((u64)req_offset(req) - segment_offset, | ||
951 | stripe_unit, | ||
952 | &stripe_offset); | ||
953 | WARN_ON_ONCE(stripe_offset > stripe_unit); | ||
954 | if (stripe_offset >= stripe_unit) | ||
955 | return 0; | ||
956 | return min(stripe_unit - (unsigned int)stripe_offset, size); | ||
957 | } | ||
958 | |||
959 | static void | ||
960 | filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, | ||
961 | struct nfs_page *req) | ||
962 | { | ||
963 | if (!pgio->pg_lseg) | ||
964 | pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, | ||
965 | req->wb_context, | ||
966 | 0, | ||
967 | NFS4_MAX_UINT64, | ||
968 | IOMODE_READ, | ||
969 | GFP_KERNEL); | ||
970 | /* If no lseg, fall back to read through mds */ | ||
971 | if (pgio->pg_lseg == NULL) | ||
972 | nfs_pageio_reset_read_mds(pgio); | ||
973 | } | ||
974 | |||
975 | static void | ||
976 | filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, | ||
977 | struct nfs_page *req) | ||
978 | { | ||
979 | struct nfs_commit_info cinfo; | ||
980 | int status; | ||
981 | |||
982 | if (!pgio->pg_lseg) | ||
983 | pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, | ||
984 | req->wb_context, | ||
985 | 0, | ||
986 | NFS4_MAX_UINT64, | ||
987 | IOMODE_RW, | ||
988 | GFP_NOFS); | ||
989 | /* If no lseg, fall back to write through mds */ | ||
990 | if (pgio->pg_lseg == NULL) | ||
991 | goto out_mds; | ||
992 | nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); | ||
993 | status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); | ||
994 | if (status < 0) { | ||
995 | pnfs_put_lseg(pgio->pg_lseg); | ||
996 | pgio->pg_lseg = NULL; | ||
997 | goto out_mds; | ||
998 | } | ||
999 | return; | ||
1000 | out_mds: | ||
1001 | nfs_pageio_reset_write_mds(pgio); | ||
1002 | } | ||
1003 | |||
/* Page I/O descriptor operations used for reads through this layout driver. */
static const struct nfs_pageio_ops filelayout_pg_read_ops = {
	.pg_init = filelayout_pg_init_read,
	.pg_test = filelayout_pg_test,
	.pg_doio = pnfs_generic_pg_readpages,
};
1009 | |||
/* Page I/O descriptor operations used for writes through this layout driver. */
static const struct nfs_pageio_ops filelayout_pg_write_ops = {
	.pg_init = filelayout_pg_init_write,
	.pg_test = filelayout_pg_test,
	.pg_doio = pnfs_generic_pg_writepages,
};
1015 | |||
1016 | static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) | ||
1017 | { | ||
1018 | if (fl->stripe_type == STRIPE_SPARSE) | ||
1019 | return nfs4_fl_calc_ds_index(&fl->generic_hdr, j); | ||
1020 | else | ||
1021 | return j; | ||
1022 | } | ||
1023 | |||
/* The generic layer is about to remove the req from the commit list.
 * If this will make the bucket empty, it will need to put the lseg reference.
 *
 * Requests not flagged PG_COMMIT_TO_DS are simply removed from the commit
 * list. The lseg reference, if any, is dropped after cinfo->lock has been
 * released.
 */
static void
filelayout_clear_request_commit(struct nfs_page *req,
				struct nfs_commit_info *cinfo)
{
	struct pnfs_layout_segment *freeme = NULL;

	spin_lock(cinfo->lock);
	if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
		goto out;
	cinfo->ds->nwritten--;
	/*
	 * When req is the only request on its list, the only other node
	 * linked to req->wb_list is the list head itself, which is treated
	 * here as the 'written' member of a commit bucket.
	 */
	if (list_is_singular(&req->wb_list)) {
		struct pnfs_commit_bucket *bucket;

		bucket = list_first_entry(&req->wb_list,
					  struct pnfs_commit_bucket,
					  written);
		freeme = bucket->wlseg;
		bucket->wlseg = NULL;
	}
out:
	nfs_request_remove_commit_list(req, cinfo);
	spin_unlock(cinfo->lock);
	pnfs_put_lseg(freeme);
}
1051 | |||
/*
 * Pick the list that @req should be queued on for a later COMMIT.
 *
 * Returns the MDS commit list when the segment commits through the MDS.
 * Otherwise returns the 'written' list of the bucket selected from the
 * request's offset, after flagging the request PG_COMMIT_TO_DS, bumping
 * cinfo->ds->nwritten and, if the bucket was empty, taking an lseg
 * reference for it.
 */
static struct list_head *
filelayout_choose_commit_list(struct nfs_page *req,
			      struct pnfs_layout_segment *lseg,
			      struct nfs_commit_info *cinfo)
{
	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
	u32 i, j;
	struct list_head *list;
	struct pnfs_commit_bucket *buckets;

	if (fl->commit_through_mds)
		return &cinfo->mds->list;

	/* Note that we are calling nfs4_fl_calc_j_index on each page
	 * that ends up being committed to a data server.  An attractive
	 * alternative is to add a field to nfs_write_data and nfs_page
	 * to store the value calculated in filelayout_write_pagelist
	 * and just use that here.
	 */
	j = nfs4_fl_calc_j_index(lseg, req_offset(req));
	i = select_bucket_index(fl, j);
	spin_lock(cinfo->lock);
	/* read the bucket array under the lock */
	buckets = cinfo->ds->buckets;
	list = &buckets[i].written;
	if (list_empty(list)) {
		/* Non-empty buckets hold a reference on the lseg.  That ref
		 * is normally transferred to the COMMIT call and released
		 * there.  It could also be released if the last req is pulled
		 * off due to a rewrite, in which case it will be done in
		 * filelayout_clear_request_commit
		 */
		buckets[i].wlseg = pnfs_get_lseg(lseg);
	}
	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
	cinfo->ds->nwritten++;
	spin_unlock(cinfo->lock);
	return list;
}
1090 | |||
/*
 * Layout-driver ->mark_request_commit callback: queue @req on the commit
 * list chosen for it (MDS list or a data-server bucket).
 */
static void
filelayout_mark_request_commit(struct nfs_page *req,
			       struct pnfs_layout_segment *lseg,
			       struct nfs_commit_info *cinfo)
{
	nfs_request_add_commit_list(req,
				    filelayout_choose_commit_list(req, lseg,
								  cinfo),
				    cinfo);
}
1101 | |||
1102 | static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) | ||
1103 | { | ||
1104 | struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); | ||
1105 | |||
1106 | if (flseg->stripe_type == STRIPE_SPARSE) | ||
1107 | return i; | ||
1108 | else | ||
1109 | return nfs4_fl_calc_ds_index(lseg, i); | ||
1110 | } | ||
1111 | |||
1112 | static struct nfs_fh * | ||
1113 | select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) | ||
1114 | { | ||
1115 | struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); | ||
1116 | |||
1117 | if (flseg->stripe_type == STRIPE_SPARSE) { | ||
1118 | if (flseg->num_fh == 1) | ||
1119 | i = 0; | ||
1120 | else if (flseg->num_fh == 0) | ||
1121 | /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ | ||
1122 | return NULL; | ||
1123 | } | ||
1124 | return flseg->fh_array[i]; | ||
1125 | } | ||
1126 | |||
/*
 * Send the COMMIT described by @data to the data server that backs its
 * commit bucket.
 *
 * Returns the result of nfs_initiate_commit(), or -EAGAIN when the data
 * server or its RPC client cannot be obtained; in that case the requests
 * are prepared for resending and @data is released.
 */
static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
{
	struct pnfs_layout_segment *lseg = data->lseg;
	struct nfs4_pnfs_ds *ds;
	struct rpc_clnt *ds_clnt;
	u32 idx;
	struct nfs_fh *fh;

	idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
	ds = nfs4_fl_prepare_ds(lseg, idx);
	if (!ds)
		goto out_err;

	ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, data->inode);
	if (IS_ERR(ds_clnt))
		goto out_err;

	dprintk("%s ino %lu, how %d cl_count %d\n", __func__,
		data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count));
	data->commit_done_cb = filelayout_commit_done_cb;
	/* take a reference on the DS client and record it in the commit data */
	atomic_inc(&ds->ds_clp->cl_count);
	data->ds_clp = ds->ds_clp;
	/* a NULL fh means: keep the filehandle already in data->args */
	fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
	if (fh)
		data->args.fh = fh;
	return nfs_initiate_commit(ds_clnt, data,
				   &filelayout_commit_call_ops, how,
				   RPC_TASK_SOFTCONN);
out_err:
	prepare_to_resend_writes(data);
	filelayout_commit_release(data);
	return -EAGAIN;
}
1160 | |||
/*
 * Move lockable requests from @src to @dst and return how many were moved.
 *
 * Requests that cannot be locked are left on @src. Each moved request gets
 * an extra kref, is removed from the commit list and has PG_COMMIT_TO_DS
 * cleared. The loop stops once @max requests have been moved, unless
 * cinfo->dreq is set. Since the count is at least 1 when it is compared,
 * a @max of 0 never stops the loop early.
 *
 * cond_resched_lock() is called on cinfo->lock, so callers are expected to
 * hold that lock.
 */
static int
transfer_commit_list(struct list_head *src, struct list_head *dst,
		     struct nfs_commit_info *cinfo, int max)
{
	struct nfs_page *req, *tmp;
	int ret = 0;

	list_for_each_entry_safe(req, tmp, src, wb_list) {
		if (!nfs_lock_request(req))
			continue;
		kref_get(&req->wb_kref);
		/* the lock may have been dropped: re-read the next entry */
		if (cond_resched_lock(cinfo->lock))
			list_safe_reset_next(req, tmp, wb_list);
		nfs_request_remove_commit_list(req, cinfo);
		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
		nfs_list_add_request(req, dst);
		ret++;
		if ((ret == max) && !cinfo->dreq)
			break;
	}
	return ret;
}
1183 | |||
1184 | /* Note called with cinfo->lock held. */ | ||
1185 | static int | ||
1186 | filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, | ||
1187 | struct nfs_commit_info *cinfo, | ||
1188 | int max) | ||
1189 | { | ||
1190 | struct list_head *src = &bucket->written; | ||
1191 | struct list_head *dst = &bucket->committing; | ||
1192 | int ret; | ||
1193 | |||
1194 | ret = transfer_commit_list(src, dst, cinfo, max); | ||
1195 | if (ret) { | ||
1196 | cinfo->ds->nwritten -= ret; | ||
1197 | cinfo->ds->ncommitting += ret; | ||
1198 | bucket->clseg = bucket->wlseg; | ||
1199 | if (list_empty(src)) | ||
1200 | bucket->wlseg = NULL; | ||
1201 | else | ||
1202 | pnfs_get_lseg(bucket->clseg); | ||
1203 | } | ||
1204 | return ret; | ||
1205 | } | ||
1206 | |||
1207 | /* Move reqs from written to committing lists, returning count of number moved. | ||
1208 | * Note called with cinfo->lock held. | ||
1209 | */ | ||
1210 | static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo, | ||
1211 | int max) | ||
1212 | { | ||
1213 | int i, rv = 0, cnt; | ||
1214 | |||
1215 | for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { | ||
1216 | cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i], | ||
1217 | cinfo, max); | ||
1218 | max -= cnt; | ||
1219 | rv += cnt; | ||
1220 | } | ||
1221 | return rv; | ||
1222 | } | ||
1223 | |||
/*
 * Pull the requests off every bucket's written list and dump them into @dst.
 *
 * Whenever a bucket yields requests, its wlseg reference is dropped with
 * cinfo->lock released, and the scan restarts from the first bucket. Once
 * a full pass moves nothing, cinfo->ds->nwritten is reset to 0.
 */
static void filelayout_recover_commit_reqs(struct list_head *dst,
					   struct nfs_commit_info *cinfo)
{
	struct pnfs_commit_bucket *b;
	struct pnfs_layout_segment *freeme;
	int i;

restart:
	spin_lock(cinfo->lock);
	for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
		if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
			freeme = b->wlseg;
			b->wlseg = NULL;
			/* put the lseg outside the lock, then rescan */
			spin_unlock(cinfo->lock);
			pnfs_put_lseg(freeme);
			goto restart;
		}
	}
	cinfo->ds->nwritten = 0;
	spin_unlock(cinfo->lock);
}
1246 | |||
/*
 * Allocate one nfs_commit_data for every bucket with a non-empty committing
 * list and add it to @list. Each commit data records its bucket index and
 * takes over the bucket's clseg.
 *
 * If an allocation fails, the requests of that bucket and of all remaining
 * buckets are handed to nfs_retry_commit() and their clseg references are
 * dropped.
 *
 * Returns the number of commit data entries added to @list.
 */
static unsigned int
alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
{
	struct pnfs_ds_commit_info *fl_cinfo;
	struct pnfs_commit_bucket *bucket;
	struct nfs_commit_data *data;
	int i, j;
	unsigned int nreq = 0;
	struct pnfs_layout_segment *freeme;

	fl_cinfo = cinfo->ds;
	bucket = fl_cinfo->buckets;
	for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
		if (list_empty(&bucket->committing))
			continue;
		data = nfs_commitdata_alloc();
		if (!data)
			break;
		data->ds_commit_index = i;
		/* move the bucket's committing lseg reference into data */
		spin_lock(cinfo->lock);
		data->lseg = bucket->clseg;
		bucket->clseg = NULL;
		spin_unlock(cinfo->lock);
		list_add(&data->pages, list);
		nreq++;
	}

	/* Clean up on error */
	/* 'bucket' still points at bucket i here; a no-op after a full pass */
	for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) {
		if (list_empty(&bucket->committing))
			continue;
		nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
		spin_lock(cinfo->lock);
		freeme = bucket->clseg;
		bucket->clseg = NULL;
		spin_unlock(cinfo->lock);
		pnfs_put_lseg(freeme);
	}
	/* Caller will clean up entries put on list */
	return nreq;
}
1288 | |||
/* This follows nfs_commit_list pretty closely
 *
 * Builds one commit for @mds_pages (if non-empty) plus one per data-server
 * bucket with requests to commit, then initiates them. A commit data entry
 * with a NULL lseg is the MDS one. When no commit could be set up at all,
 * the completion ops' error_cleanup is called instead.
 *
 * Always returns PNFS_ATTEMPTED.
 */
static int
filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
			   int how, struct nfs_commit_info *cinfo)
{
	struct nfs_commit_data *data, *tmp;
	LIST_HEAD(list);
	unsigned int nreq = 0;

	if (!list_empty(mds_pages)) {
		data = nfs_commitdata_alloc();
		if (data != NULL) {
			data->lseg = NULL;
			list_add(&data->pages, &list);
			nreq++;
		} else
			nfs_retry_commit(mds_pages, NULL, cinfo);
	}

	nreq += alloc_ds_commits(cinfo, &list);

	if (nreq == 0) {
		cinfo->completion_ops->error_cleanup(NFS_I(inode));
		goto out;
	}

	/* account for all commits before any of them is started */
	atomic_add(nreq, &cinfo->mds->rpcs_out);

	list_for_each_entry_safe(data, tmp, &list, pages) {
		list_del_init(&data->pages);
		if (!data->lseg) {
			/* MDS commit */
			nfs_init_commit(data, mds_pages, NULL, cinfo);
			nfs_initiate_commit(NFS_CLIENT(inode), data,
					    data->mds_ops, how, 0);
		} else {
			/* data-server commit for bucket ds_commit_index */
			struct pnfs_commit_bucket *buckets;

			buckets = cinfo->ds->buckets;
			nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo);
			filelayout_initiate_commit(data, how);
		}
	}
out:
	cinfo->ds->ncommitting = 0;
	return PNFS_ATTEMPTED;
}
1335 | |||
1336 | static void | ||
1337 | filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) | ||
1338 | { | ||
1339 | nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); | ||
1340 | } | ||
1341 | |||
1342 | static struct pnfs_layout_hdr * | ||
1343 | filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) | ||
1344 | { | ||
1345 | struct nfs4_filelayout *flo; | ||
1346 | |||
1347 | flo = kzalloc(sizeof(*flo), gfp_flags); | ||
1348 | return flo != NULL ? &flo->generic_hdr : NULL; | ||
1349 | } | ||
1350 | |||
/*
 * Layout-driver ->free_layout_hdr callback: free the files-layout header
 * that embeds @lo.
 */
static void
filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct nfs4_filelayout *flo = FILELAYOUT_FROM_HDR(lo);

	kfree(flo);
}
1356 | |||
1357 | static struct pnfs_ds_commit_info * | ||
1358 | filelayout_get_ds_info(struct inode *inode) | ||
1359 | { | ||
1360 | struct pnfs_layout_hdr *layout = NFS_I(inode)->layout; | ||
1361 | |||
1362 | if (layout == NULL) | ||
1363 | return NULL; | ||
1364 | else | ||
1365 | return &FILELAYOUT_FROM_HDR(layout)->commit_info; | ||
1366 | } | ||
1367 | |||
/*
 * The files-layout driver description that is registered with and
 * unregistered from the pNFS core at module init/exit.
 */
static struct pnfs_layoutdriver_type filelayout_type = {
	.id			= LAYOUT_NFSV4_1_FILES,
	.name			= "LAYOUT_NFSV4_1_FILES",
	.owner			= THIS_MODULE,
	/* layout header and segment lifetime */
	.alloc_layout_hdr	= filelayout_alloc_layout_hdr,
	.free_layout_hdr	= filelayout_free_layout_hdr,
	.alloc_lseg		= filelayout_alloc_lseg,
	.free_lseg		= filelayout_free_lseg,
	/* page I/O coalescing */
	.pg_read_ops		= &filelayout_pg_read_ops,
	.pg_write_ops		= &filelayout_pg_write_ops,
	/* commit handling */
	.get_ds_info		= &filelayout_get_ds_info,
	.mark_request_commit	= filelayout_mark_request_commit,
	.clear_request_commit	= filelayout_clear_request_commit,
	.scan_commit_lists	= filelayout_scan_commit_lists,
	.recover_commit_reqs	= filelayout_recover_commit_reqs,
	.commit_pagelist	= filelayout_commit_pagelist,
	/* data-server I/O */
	.read_pagelist		= filelayout_read_pagelist,
	.write_pagelist		= filelayout_write_pagelist,
	.free_deviceid_node	= filelayout_free_deveiceid_node,
};
1388 | |||
1389 | static int __init nfs4filelayout_init(void) | ||
1390 | { | ||
1391 | printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", | ||
1392 | __func__); | ||
1393 | return pnfs_register_layoutdriver(&filelayout_type); | ||
1394 | } | ||
1395 | |||
/* Module exit: log a message and unregister the files-layout driver. */
static void __exit nfs4filelayout_exit(void)
{
	printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
	       __func__);
	pnfs_unregister_layoutdriver(&filelayout_type);
}
1402 | |||
/* NOTE(review): alias presumably lets the module be loaded by layout type - confirm */
MODULE_ALIAS("nfs-layouttype4-1");

/* Register/unregister the layout driver on module load/unload. */
module_init(nfs4filelayout_init);
module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h new file mode 100644 index 000000000000..ffbddf2219ea --- /dev/null +++ b/fs/nfs/filelayout/filelayout.h | |||
@@ -0,0 +1,156 @@ | |||
1 | /* | ||
2 | * NFSv4 file layout driver data structures. | ||
3 | * | ||
4 | * Copyright (c) 2002 | ||
5 | * The Regents of the University of Michigan | ||
6 | * All Rights Reserved | ||
7 | * | ||
8 | * Dean Hildebrand <dhildebz@umich.edu> | ||
9 | * | ||
10 | * Permission is granted to use, copy, create derivative works, and | ||
11 | * redistribute this software and such derivative works for any purpose, | ||
12 | * so long as the name of the University of Michigan is not used in | ||
13 | * any advertising or publicity pertaining to the use or distribution | ||
14 | * of this software without specific, written prior authorization. If | ||
15 | * the above copyright notice or any other identification of the | ||
16 | * University of Michigan is included in any copy of any portion of | ||
17 | * this software, then the disclaimer below must also be included. | ||
18 | * | ||
19 | * This software is provided as is, without representation or warranty | ||
20 | * of any kind either express or implied, including without limitation | ||
21 | * the implied warranties of merchantability, fitness for a particular | ||
22 | * purpose, or noninfringement. The Regents of the University of | ||
23 | * Michigan shall not be liable for any damages, including special, | ||
24 | * indirect, incidental, or consequential damages, with respect to any | ||
25 | * claim arising out of or in connection with the use of the software, | ||
26 | * even if it has been or is hereafter advised of the possibility of | ||
27 | * such damages. | ||
28 | */ | ||
29 | |||
30 | #ifndef FS_NFS_NFS4FILELAYOUT_H | ||
31 | #define FS_NFS_NFS4FILELAYOUT_H | ||
32 | |||
33 | #include "../pnfs.h" | ||
34 | |||
/*
 * Default data server connection timeout and retrans values.
 * Set by module parameters dataserver_timeo and dataserver_retrans.
 */
#define NFS4_DEF_DS_TIMEO   600 /* in tenths of a second */
#define NFS4_DEF_DS_RETRANS 5
41 | |||
/*
 * Field testing shows we need to support up to 4096 stripe indices.
 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
 * reasonable. This in turn means we support a maximum of 256
 * RFC 5661 multipath_list4 structures.
 *
 * The larger of the two limits also bounds the number of filehandles
 * accepted when a layout is decoded.
 */
#define NFS4_PNFS_MAX_STRIPE_CNT 4096
#define NFS4_PNFS_MAX_MULTI_CNT  256 /* 256 fit into a u8 stripe_index */
50 | |||
/* error codes for internal use */
/* NOTE(review): presumably signals that I/O should be redone via the MDS - confirm */
#define NFS4ERR_RESET_TO_MDS   12001
53 | |||
/*
 * How a layout segment stripes data across data servers. The decoder
 * selects STRIPE_DENSE when the NFL4_UFLG_DENSE flag is set in nfl_util
 * and STRIPE_SPARSE otherwise.
 */
enum stripetype4 {
	STRIPE_SPARSE = 1,
	STRIPE_DENSE = 2
};
58 | |||
/* Individual ip address */
struct nfs4_pnfs_ds_addr {
	struct sockaddr_storage	da_addr;	/* socket address */
	size_t			da_addrlen;	/* length of da_addr in use */
	struct list_head	da_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
	char			*da_remotestr;	/* human readable addr+port */
};
66 | |||
/* A data server, reachable through one or more addresses. */
struct nfs4_pnfs_ds {
	struct list_head	ds_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
	char			*ds_remotestr;	/* comma sep list of addrs */
	struct list_head	ds_addrs;	/* presumably nfs4_pnfs_ds_addr entries - confirm */
	struct nfs_client	*ds_clp;	/* NFS client used to talk to this DS */
	atomic_t		ds_count;	/* reference count */
	unsigned long		ds_state;	/* bit flags, see below */
#define NFS4DS_CONNECTING	0	/* ds is establishing connection */
};
76 | |||
/*
 * Device address information for a files layout ("GETDEVINFO data"):
 * the stripe index table and the list of data servers.
 */
struct nfs4_file_layout_dsaddr {
	struct nfs4_deviceid_node	id_node;	/* generic device id node */
	u32				stripe_count;	/* bucket count for dense layouts */
	u8				*stripe_indices;	/* one u8 index per stripe */
	u32				ds_num;		/* bucket count for sparse layouts */
	/* NOTE(review): looks like a variable-length trailing array - confirm */
	struct nfs4_pnfs_ds		*ds_list[1];
};
84 | |||
/*
 * A files-layout segment: the generic pNFS segment plus the fields decoded
 * from the LAYOUTGET reply.
 */
struct nfs4_filelayout_segment {
	struct pnfs_layout_segment	generic_hdr;
	u32				stripe_type;	/* enum stripetype4 */
	u32				commit_through_mds; /* NFL4_UFLG_COMMIT_THRU_MDS set */
	u32				stripe_unit;	/* nfl_util without the flag bits */
	u32				first_stripe_index;
	u64				pattern_offset;
	struct nfs4_file_layout_dsaddr	*dsaddr; /* Point to GETDEVINFO data */
	unsigned int			num_fh;	/* entries in fh_array */
	struct nfs_fh			**fh_array; /* individually allocated filehandles */
};
96 | |||
/* Per-inode files layout: the generic header plus the DS commit buckets. */
struct nfs4_filelayout {
	struct pnfs_layout_hdr		generic_hdr;
	struct pnfs_ds_commit_info	commit_info;
};
101 | |||
102 | static inline struct nfs4_filelayout * | ||
103 | FILELAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo) | ||
104 | { | ||
105 | return container_of(lo, struct nfs4_filelayout, generic_hdr); | ||
106 | } | ||
107 | |||
108 | static inline struct nfs4_filelayout_segment * | ||
109 | FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) | ||
110 | { | ||
111 | return container_of(lseg, | ||
112 | struct nfs4_filelayout_segment, | ||
113 | generic_hdr); | ||
114 | } | ||
115 | |||
116 | static inline struct nfs4_deviceid_node * | ||
117 | FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg) | ||
118 | { | ||
119 | return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; | ||
120 | } | ||
121 | |||
122 | static inline void | ||
123 | filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node) | ||
124 | { | ||
125 | u32 *p = (u32 *)&node->deviceid; | ||
126 | |||
127 | printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n", | ||
128 | p[0], p[1], p[2], p[3]); | ||
129 | |||
130 | set_bit(NFS_DEVICEID_INVALID, &node->flags); | ||
131 | } | ||
132 | |||
133 | static inline bool | ||
134 | filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) | ||
135 | { | ||
136 | return test_bit(NFS_DEVICEID_INVALID, &node->flags); | ||
137 | } | ||
138 | |||
/* Functions shared between the layout driver and its device code. */
extern bool
filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);

extern struct nfs_fh *
nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);

extern void print_ds(struct nfs4_pnfs_ds *ds);
/* index helpers: file offset -> stripe index j -> data server index */
u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
					u32 ds_idx);
extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
struct nfs4_file_layout_dsaddr *
filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
			   struct rpc_cred *cred, gfp_t gfp_flags);
155 | |||
156 | #endif /* FS_NFS_NFS4FILELAYOUT_H */ | ||
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c new file mode 100644 index 000000000000..44bf0140a4c7 --- /dev/null +++ b/fs/nfs/filelayout/filelayoutdev.c | |||
@@ -0,0 +1,843 @@ | |||
1 | /* | ||
2 | * Device operations for the pnfs nfs4 file layout driver. | ||
3 | * | ||
4 | * Copyright (c) 2002 | ||
5 | * The Regents of the University of Michigan | ||
6 | * All Rights Reserved | ||
7 | * | ||
8 | * Dean Hildebrand <dhildebz@umich.edu> | ||
9 | * Garth Goodson <Garth.Goodson@netapp.com> | ||
10 | * | ||
11 | * Permission is granted to use, copy, create derivative works, and | ||
12 | * redistribute this software and such derivative works for any purpose, | ||
13 | * so long as the name of the University of Michigan is not used in | ||
14 | * any advertising or publicity pertaining to the use or distribution | ||
15 | * of this software without specific, written prior authorization. If | ||
16 | * the above copyright notice or any other identification of the | ||
17 | * University of Michigan is included in any copy of any portion of | ||
18 | * this software, then the disclaimer below must also be included. | ||
19 | * | ||
20 | * This software is provided as is, without representation or warranty | ||
21 | * of any kind either express or implied, including without limitation | ||
22 | * the implied warranties of merchantability, fitness for a particular | ||
23 | * purpose, or noninfringement. The Regents of the University of | ||
24 | * Michigan shall not be liable for any damages, including special, | ||
25 | * indirect, incidental, or consequential damages, with respect to any | ||
26 | * claim arising out of or in connection with the use of the software, | ||
27 | * even if it has been or is hereafter advised of the possibility of | ||
28 | * such damages. | ||
29 | */ | ||
30 | |||
31 | #include <linux/nfs_fs.h> | ||
32 | #include <linux/vmalloc.h> | ||
33 | #include <linux/module.h> | ||
34 | #include <linux/sunrpc/addr.h> | ||
35 | |||
36 | #include "../internal.h" | ||
37 | #include "../nfs4session.h" | ||
38 | #include "filelayout.h" | ||
39 | |||
40 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | ||
41 | |||
42 | static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; | ||
43 | static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; | ||
44 | |||
/*
 * Data server cache
 *
 * Several device ids may map to the same data server, so data servers are
 * kept on a global list and reference counted (nfs4_pnfs_ds.ds_count):
 *   - set to 1 on allocation
 *   - incremented when a device id maps a data server already in the cache
 *   - decremented when a device id is removed from the cache
 *
 * nfs4_ds_cache_lock protects the list.
 */
static LIST_HEAD(nfs4_data_server_cache);
static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
56 | |||
57 | /* Debug routines */ | ||
58 | void | ||
59 | print_ds(struct nfs4_pnfs_ds *ds) | ||
60 | { | ||
61 | if (ds == NULL) { | ||
62 | printk("%s NULL device\n", __func__); | ||
63 | return; | ||
64 | } | ||
65 | printk(" ds %s\n" | ||
66 | " ref count %d\n" | ||
67 | " client %p\n" | ||
68 | " cl_exchange_flags %x\n", | ||
69 | ds->ds_remotestr, | ||
70 | atomic_read(&ds->ds_count), ds->ds_clp, | ||
71 | ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); | ||
72 | } | ||
73 | |||
74 | static bool | ||
75 | same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) | ||
76 | { | ||
77 | struct sockaddr_in *a, *b; | ||
78 | struct sockaddr_in6 *a6, *b6; | ||
79 | |||
80 | if (addr1->sa_family != addr2->sa_family) | ||
81 | return false; | ||
82 | |||
83 | switch (addr1->sa_family) { | ||
84 | case AF_INET: | ||
85 | a = (struct sockaddr_in *)addr1; | ||
86 | b = (struct sockaddr_in *)addr2; | ||
87 | |||
88 | if (a->sin_addr.s_addr == b->sin_addr.s_addr && | ||
89 | a->sin_port == b->sin_port) | ||
90 | return true; | ||
91 | break; | ||
92 | |||
93 | case AF_INET6: | ||
94 | a6 = (struct sockaddr_in6 *)addr1; | ||
95 | b6 = (struct sockaddr_in6 *)addr2; | ||
96 | |||
97 | /* LINKLOCAL addresses must have matching scope_id */ | ||
98 | if (ipv6_addr_src_scope(&a6->sin6_addr) == | ||
99 | IPV6_ADDR_SCOPE_LINKLOCAL && | ||
100 | a6->sin6_scope_id != b6->sin6_scope_id) | ||
101 | return false; | ||
102 | |||
103 | if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) && | ||
104 | a6->sin6_port == b6->sin6_port) | ||
105 | return true; | ||
106 | break; | ||
107 | |||
108 | default: | ||
109 | dprintk("%s: unhandled address family: %u\n", | ||
110 | __func__, addr1->sa_family); | ||
111 | return false; | ||
112 | } | ||
113 | |||
114 | return false; | ||
115 | } | ||
116 | |||
117 | static bool | ||
118 | _same_data_server_addrs_locked(const struct list_head *dsaddrs1, | ||
119 | const struct list_head *dsaddrs2) | ||
120 | { | ||
121 | struct nfs4_pnfs_ds_addr *da1, *da2; | ||
122 | |||
123 | /* step through both lists, comparing as we go */ | ||
124 | for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node), | ||
125 | da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node); | ||
126 | da1 != NULL && da2 != NULL; | ||
127 | da1 = list_entry(da1->da_node.next, typeof(*da1), da_node), | ||
128 | da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) { | ||
129 | if (!same_sockaddr((struct sockaddr *)&da1->da_addr, | ||
130 | (struct sockaddr *)&da2->da_addr)) | ||
131 | return false; | ||
132 | } | ||
133 | if (da1 == NULL && da2 == NULL) | ||
134 | return true; | ||
135 | |||
136 | return false; | ||
137 | } | ||
138 | |||
139 | /* | ||
140 | * Lookup DS by addresses. nfs4_ds_cache_lock is held | ||
141 | */ | ||
142 | static struct nfs4_pnfs_ds * | ||
143 | _data_server_lookup_locked(const struct list_head *dsaddrs) | ||
144 | { | ||
145 | struct nfs4_pnfs_ds *ds; | ||
146 | |||
147 | list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) | ||
148 | if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) | ||
149 | return ds; | ||
150 | return NULL; | ||
151 | } | ||
152 | |||
153 | /* | ||
154 | * Create an rpc connection to the nfs4_pnfs_ds data server | ||
155 | * Currently only supports IPv4 and IPv6 addresses | ||
156 | */ | ||
157 | static int | ||
158 | nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) | ||
159 | { | ||
160 | struct nfs_client *clp = ERR_PTR(-EIO); | ||
161 | struct nfs4_pnfs_ds_addr *da; | ||
162 | int status = 0; | ||
163 | |||
164 | dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, | ||
165 | mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); | ||
166 | |||
167 | list_for_each_entry(da, &ds->ds_addrs, da_node) { | ||
168 | dprintk("%s: DS %s: trying address %s\n", | ||
169 | __func__, ds->ds_remotestr, da->da_remotestr); | ||
170 | |||
171 | clp = nfs4_set_ds_client(mds_srv->nfs_client, | ||
172 | (struct sockaddr *)&da->da_addr, | ||
173 | da->da_addrlen, IPPROTO_TCP, | ||
174 | dataserver_timeo, dataserver_retrans); | ||
175 | if (!IS_ERR(clp)) | ||
176 | break; | ||
177 | } | ||
178 | |||
179 | if (IS_ERR(clp)) { | ||
180 | status = PTR_ERR(clp); | ||
181 | goto out; | ||
182 | } | ||
183 | |||
184 | status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time); | ||
185 | if (status) | ||
186 | goto out_put; | ||
187 | |||
188 | smp_wmb(); | ||
189 | ds->ds_clp = clp; | ||
190 | dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); | ||
191 | out: | ||
192 | return status; | ||
193 | out_put: | ||
194 | nfs_put_client(clp); | ||
195 | goto out; | ||
196 | } | ||
197 | |||
198 | static void | ||
199 | destroy_ds(struct nfs4_pnfs_ds *ds) | ||
200 | { | ||
201 | struct nfs4_pnfs_ds_addr *da; | ||
202 | |||
203 | dprintk("--> %s\n", __func__); | ||
204 | ifdebug(FACILITY) | ||
205 | print_ds(ds); | ||
206 | |||
207 | if (ds->ds_clp) | ||
208 | nfs_put_client(ds->ds_clp); | ||
209 | |||
210 | while (!list_empty(&ds->ds_addrs)) { | ||
211 | da = list_first_entry(&ds->ds_addrs, | ||
212 | struct nfs4_pnfs_ds_addr, | ||
213 | da_node); | ||
214 | list_del_init(&da->da_node); | ||
215 | kfree(da->da_remotestr); | ||
216 | kfree(da); | ||
217 | } | ||
218 | |||
219 | kfree(ds->ds_remotestr); | ||
220 | kfree(ds); | ||
221 | } | ||
222 | |||
223 | void | ||
224 | nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) | ||
225 | { | ||
226 | struct nfs4_pnfs_ds *ds; | ||
227 | int i; | ||
228 | |||
229 | nfs4_print_deviceid(&dsaddr->id_node.deviceid); | ||
230 | |||
231 | for (i = 0; i < dsaddr->ds_num; i++) { | ||
232 | ds = dsaddr->ds_list[i]; | ||
233 | if (ds != NULL) { | ||
234 | if (atomic_dec_and_lock(&ds->ds_count, | ||
235 | &nfs4_ds_cache_lock)) { | ||
236 | list_del_init(&ds->ds_node); | ||
237 | spin_unlock(&nfs4_ds_cache_lock); | ||
238 | destroy_ds(ds); | ||
239 | } | ||
240 | } | ||
241 | } | ||
242 | kfree(dsaddr->stripe_indices); | ||
243 | kfree(dsaddr); | ||
244 | } | ||
245 | |||
246 | /* | ||
247 | * Create a string with a human readable address and port to avoid | ||
248 | * complicated setup around many dprinks. | ||
249 | */ | ||
250 | static char * | ||
251 | nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags) | ||
252 | { | ||
253 | struct nfs4_pnfs_ds_addr *da; | ||
254 | char *remotestr; | ||
255 | size_t len; | ||
256 | char *p; | ||
257 | |||
258 | len = 3; /* '{', '}' and eol */ | ||
259 | list_for_each_entry(da, dsaddrs, da_node) { | ||
260 | len += strlen(da->da_remotestr) + 1; /* string plus comma */ | ||
261 | } | ||
262 | |||
263 | remotestr = kzalloc(len, gfp_flags); | ||
264 | if (!remotestr) | ||
265 | return NULL; | ||
266 | |||
267 | p = remotestr; | ||
268 | *(p++) = '{'; | ||
269 | len--; | ||
270 | list_for_each_entry(da, dsaddrs, da_node) { | ||
271 | size_t ll = strlen(da->da_remotestr); | ||
272 | |||
273 | if (ll > len) | ||
274 | goto out_err; | ||
275 | |||
276 | memcpy(p, da->da_remotestr, ll); | ||
277 | p += ll; | ||
278 | len -= ll; | ||
279 | |||
280 | if (len < 1) | ||
281 | goto out_err; | ||
282 | (*p++) = ','; | ||
283 | len--; | ||
284 | } | ||
285 | if (len < 2) | ||
286 | goto out_err; | ||
287 | *(p++) = '}'; | ||
288 | *p = '\0'; | ||
289 | return remotestr; | ||
290 | out_err: | ||
291 | kfree(remotestr); | ||
292 | return NULL; | ||
293 | } | ||
294 | |||
295 | static struct nfs4_pnfs_ds * | ||
296 | nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) | ||
297 | { | ||
298 | struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; | ||
299 | char *remotestr; | ||
300 | |||
301 | if (list_empty(dsaddrs)) { | ||
302 | dprintk("%s: no addresses defined\n", __func__); | ||
303 | goto out; | ||
304 | } | ||
305 | |||
306 | ds = kzalloc(sizeof(*ds), gfp_flags); | ||
307 | if (!ds) | ||
308 | goto out; | ||
309 | |||
310 | /* this is only used for debugging, so it's ok if its NULL */ | ||
311 | remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); | ||
312 | |||
313 | spin_lock(&nfs4_ds_cache_lock); | ||
314 | tmp_ds = _data_server_lookup_locked(dsaddrs); | ||
315 | if (tmp_ds == NULL) { | ||
316 | INIT_LIST_HEAD(&ds->ds_addrs); | ||
317 | list_splice_init(dsaddrs, &ds->ds_addrs); | ||
318 | ds->ds_remotestr = remotestr; | ||
319 | atomic_set(&ds->ds_count, 1); | ||
320 | INIT_LIST_HEAD(&ds->ds_node); | ||
321 | ds->ds_clp = NULL; | ||
322 | list_add(&ds->ds_node, &nfs4_data_server_cache); | ||
323 | dprintk("%s add new data server %s\n", __func__, | ||
324 | ds->ds_remotestr); | ||
325 | } else { | ||
326 | kfree(remotestr); | ||
327 | kfree(ds); | ||
328 | atomic_inc(&tmp_ds->ds_count); | ||
329 | dprintk("%s data server %s found, inc'ed ds_count to %d\n", | ||
330 | __func__, tmp_ds->ds_remotestr, | ||
331 | atomic_read(&tmp_ds->ds_count)); | ||
332 | ds = tmp_ds; | ||
333 | } | ||
334 | spin_unlock(&nfs4_ds_cache_lock); | ||
335 | out: | ||
336 | return ds; | ||
337 | } | ||
338 | |||
339 | /* | ||
340 | * Currently only supports ipv4, ipv6 and one multi-path address. | ||
341 | */ | ||
342 | static struct nfs4_pnfs_ds_addr * | ||
343 | decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags) | ||
344 | { | ||
345 | struct nfs4_pnfs_ds_addr *da = NULL; | ||
346 | char *buf, *portstr; | ||
347 | __be16 port; | ||
348 | int nlen, rlen; | ||
349 | int tmp[2]; | ||
350 | __be32 *p; | ||
351 | char *netid, *match_netid; | ||
352 | size_t len, match_netid_len; | ||
353 | char *startsep = ""; | ||
354 | char *endsep = ""; | ||
355 | |||
356 | |||
357 | /* r_netid */ | ||
358 | p = xdr_inline_decode(streamp, 4); | ||
359 | if (unlikely(!p)) | ||
360 | goto out_err; | ||
361 | nlen = be32_to_cpup(p++); | ||
362 | |||
363 | p = xdr_inline_decode(streamp, nlen); | ||
364 | if (unlikely(!p)) | ||
365 | goto out_err; | ||
366 | |||
367 | netid = kmalloc(nlen+1, gfp_flags); | ||
368 | if (unlikely(!netid)) | ||
369 | goto out_err; | ||
370 | |||
371 | netid[nlen] = '\0'; | ||
372 | memcpy(netid, p, nlen); | ||
373 | |||
374 | /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */ | ||
375 | p = xdr_inline_decode(streamp, 4); | ||
376 | if (unlikely(!p)) | ||
377 | goto out_free_netid; | ||
378 | rlen = be32_to_cpup(p); | ||
379 | |||
380 | p = xdr_inline_decode(streamp, rlen); | ||
381 | if (unlikely(!p)) | ||
382 | goto out_free_netid; | ||
383 | |||
384 | /* port is ".ABC.DEF", 8 chars max */ | ||
385 | if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) { | ||
386 | dprintk("%s: Invalid address, length %d\n", __func__, | ||
387 | rlen); | ||
388 | goto out_free_netid; | ||
389 | } | ||
390 | buf = kmalloc(rlen + 1, gfp_flags); | ||
391 | if (!buf) { | ||
392 | dprintk("%s: Not enough memory\n", __func__); | ||
393 | goto out_free_netid; | ||
394 | } | ||
395 | buf[rlen] = '\0'; | ||
396 | memcpy(buf, p, rlen); | ||
397 | |||
398 | /* replace port '.' with '-' */ | ||
399 | portstr = strrchr(buf, '.'); | ||
400 | if (!portstr) { | ||
401 | dprintk("%s: Failed finding expected dot in port\n", | ||
402 | __func__); | ||
403 | goto out_free_buf; | ||
404 | } | ||
405 | *portstr = '-'; | ||
406 | |||
407 | /* find '.' between address and port */ | ||
408 | portstr = strrchr(buf, '.'); | ||
409 | if (!portstr) { | ||
410 | dprintk("%s: Failed finding expected dot between address and " | ||
411 | "port\n", __func__); | ||
412 | goto out_free_buf; | ||
413 | } | ||
414 | *portstr = '\0'; | ||
415 | |||
416 | da = kzalloc(sizeof(*da), gfp_flags); | ||
417 | if (unlikely(!da)) | ||
418 | goto out_free_buf; | ||
419 | |||
420 | INIT_LIST_HEAD(&da->da_node); | ||
421 | |||
422 | if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr, | ||
423 | sizeof(da->da_addr))) { | ||
424 | dprintk("%s: error parsing address %s\n", __func__, buf); | ||
425 | goto out_free_da; | ||
426 | } | ||
427 | |||
428 | portstr++; | ||
429 | sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]); | ||
430 | port = htons((tmp[0] << 8) | (tmp[1])); | ||
431 | |||
432 | switch (da->da_addr.ss_family) { | ||
433 | case AF_INET: | ||
434 | ((struct sockaddr_in *)&da->da_addr)->sin_port = port; | ||
435 | da->da_addrlen = sizeof(struct sockaddr_in); | ||
436 | match_netid = "tcp"; | ||
437 | match_netid_len = 3; | ||
438 | break; | ||
439 | |||
440 | case AF_INET6: | ||
441 | ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port; | ||
442 | da->da_addrlen = sizeof(struct sockaddr_in6); | ||
443 | match_netid = "tcp6"; | ||
444 | match_netid_len = 4; | ||
445 | startsep = "["; | ||
446 | endsep = "]"; | ||
447 | break; | ||
448 | |||
449 | default: | ||
450 | dprintk("%s: unsupported address family: %u\n", | ||
451 | __func__, da->da_addr.ss_family); | ||
452 | goto out_free_da; | ||
453 | } | ||
454 | |||
455 | if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { | ||
456 | dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", | ||
457 | __func__, netid, match_netid); | ||
458 | goto out_free_da; | ||
459 | } | ||
460 | |||
461 | /* save human readable address */ | ||
462 | len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7; | ||
463 | da->da_remotestr = kzalloc(len, gfp_flags); | ||
464 | |||
465 | /* NULL is ok, only used for dprintk */ | ||
466 | if (da->da_remotestr) | ||
467 | snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep, | ||
468 | buf, endsep, ntohs(port)); | ||
469 | |||
470 | dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr); | ||
471 | kfree(buf); | ||
472 | kfree(netid); | ||
473 | return da; | ||
474 | |||
475 | out_free_da: | ||
476 | kfree(da); | ||
477 | out_free_buf: | ||
478 | dprintk("%s: Error parsing DS addr: %s\n", __func__, buf); | ||
479 | kfree(buf); | ||
480 | out_free_netid: | ||
481 | kfree(netid); | ||
482 | out_err: | ||
483 | return NULL; | ||
484 | } | ||
485 | |||
486 | /* Decode opaque device data and return the result */ | ||
487 | static struct nfs4_file_layout_dsaddr* | ||
488 | decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) | ||
489 | { | ||
490 | int i; | ||
491 | u32 cnt, num; | ||
492 | u8 *indexp; | ||
493 | __be32 *p; | ||
494 | u8 *stripe_indices; | ||
495 | u8 max_stripe_index; | ||
496 | struct nfs4_file_layout_dsaddr *dsaddr = NULL; | ||
497 | struct xdr_stream stream; | ||
498 | struct xdr_buf buf; | ||
499 | struct page *scratch; | ||
500 | struct list_head dsaddrs; | ||
501 | struct nfs4_pnfs_ds_addr *da; | ||
502 | |||
503 | /* set up xdr stream */ | ||
504 | scratch = alloc_page(gfp_flags); | ||
505 | if (!scratch) | ||
506 | goto out_err; | ||
507 | |||
508 | xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); | ||
509 | xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); | ||
510 | |||
511 | /* Get the stripe count (number of stripe index) */ | ||
512 | p = xdr_inline_decode(&stream, 4); | ||
513 | if (unlikely(!p)) | ||
514 | goto out_err_free_scratch; | ||
515 | |||
516 | cnt = be32_to_cpup(p); | ||
517 | dprintk("%s stripe count %d\n", __func__, cnt); | ||
518 | if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { | ||
519 | printk(KERN_WARNING "NFS: %s: stripe count %d greater than " | ||
520 | "supported maximum %d\n", __func__, | ||
521 | cnt, NFS4_PNFS_MAX_STRIPE_CNT); | ||
522 | goto out_err_free_scratch; | ||
523 | } | ||
524 | |||
525 | /* read stripe indices */ | ||
526 | stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags); | ||
527 | if (!stripe_indices) | ||
528 | goto out_err_free_scratch; | ||
529 | |||
530 | p = xdr_inline_decode(&stream, cnt << 2); | ||
531 | if (unlikely(!p)) | ||
532 | goto out_err_free_stripe_indices; | ||
533 | |||
534 | indexp = &stripe_indices[0]; | ||
535 | max_stripe_index = 0; | ||
536 | for (i = 0; i < cnt; i++) { | ||
537 | *indexp = be32_to_cpup(p++); | ||
538 | max_stripe_index = max(max_stripe_index, *indexp); | ||
539 | indexp++; | ||
540 | } | ||
541 | |||
542 | /* Check the multipath list count */ | ||
543 | p = xdr_inline_decode(&stream, 4); | ||
544 | if (unlikely(!p)) | ||
545 | goto out_err_free_stripe_indices; | ||
546 | |||
547 | num = be32_to_cpup(p); | ||
548 | dprintk("%s ds_num %u\n", __func__, num); | ||
549 | if (num > NFS4_PNFS_MAX_MULTI_CNT) { | ||
550 | printk(KERN_WARNING "NFS: %s: multipath count %d greater than " | ||
551 | "supported maximum %d\n", __func__, | ||
552 | num, NFS4_PNFS_MAX_MULTI_CNT); | ||
553 | goto out_err_free_stripe_indices; | ||
554 | } | ||
555 | |||
556 | /* validate stripe indices are all < num */ | ||
557 | if (max_stripe_index >= num) { | ||
558 | printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n", | ||
559 | __func__, max_stripe_index, num); | ||
560 | goto out_err_free_stripe_indices; | ||
561 | } | ||
562 | |||
563 | dsaddr = kzalloc(sizeof(*dsaddr) + | ||
564 | (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), | ||
565 | gfp_flags); | ||
566 | if (!dsaddr) | ||
567 | goto out_err_free_stripe_indices; | ||
568 | |||
569 | dsaddr->stripe_count = cnt; | ||
570 | dsaddr->stripe_indices = stripe_indices; | ||
571 | stripe_indices = NULL; | ||
572 | dsaddr->ds_num = num; | ||
573 | nfs4_init_deviceid_node(&dsaddr->id_node, | ||
574 | NFS_SERVER(ino)->pnfs_curr_ld, | ||
575 | NFS_SERVER(ino)->nfs_client, | ||
576 | &pdev->dev_id); | ||
577 | |||
578 | INIT_LIST_HEAD(&dsaddrs); | ||
579 | |||
580 | for (i = 0; i < dsaddr->ds_num; i++) { | ||
581 | int j; | ||
582 | u32 mp_count; | ||
583 | |||
584 | p = xdr_inline_decode(&stream, 4); | ||
585 | if (unlikely(!p)) | ||
586 | goto out_err_free_deviceid; | ||
587 | |||
588 | mp_count = be32_to_cpup(p); /* multipath count */ | ||
589 | for (j = 0; j < mp_count; j++) { | ||
590 | da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net, | ||
591 | &stream, gfp_flags); | ||
592 | if (da) | ||
593 | list_add_tail(&da->da_node, &dsaddrs); | ||
594 | } | ||
595 | if (list_empty(&dsaddrs)) { | ||
596 | dprintk("%s: no suitable DS addresses found\n", | ||
597 | __func__); | ||
598 | goto out_err_free_deviceid; | ||
599 | } | ||
600 | |||
601 | dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); | ||
602 | if (!dsaddr->ds_list[i]) | ||
603 | goto out_err_drain_dsaddrs; | ||
604 | |||
605 | /* If DS was already in cache, free ds addrs */ | ||
606 | while (!list_empty(&dsaddrs)) { | ||
607 | da = list_first_entry(&dsaddrs, | ||
608 | struct nfs4_pnfs_ds_addr, | ||
609 | da_node); | ||
610 | list_del_init(&da->da_node); | ||
611 | kfree(da->da_remotestr); | ||
612 | kfree(da); | ||
613 | } | ||
614 | } | ||
615 | |||
616 | __free_page(scratch); | ||
617 | return dsaddr; | ||
618 | |||
619 | out_err_drain_dsaddrs: | ||
620 | while (!list_empty(&dsaddrs)) { | ||
621 | da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr, | ||
622 | da_node); | ||
623 | list_del_init(&da->da_node); | ||
624 | kfree(da->da_remotestr); | ||
625 | kfree(da); | ||
626 | } | ||
627 | out_err_free_deviceid: | ||
628 | nfs4_fl_free_deviceid(dsaddr); | ||
629 | /* stripe_indicies was part of dsaddr */ | ||
630 | goto out_err_free_scratch; | ||
631 | out_err_free_stripe_indices: | ||
632 | kfree(stripe_indices); | ||
633 | out_err_free_scratch: | ||
634 | __free_page(scratch); | ||
635 | out_err: | ||
636 | dprintk("%s ERROR: returning NULL\n", __func__); | ||
637 | return NULL; | ||
638 | } | ||
639 | |||
640 | /* | ||
641 | * Decode the opaque device specified in 'dev' and add it to the cache of | ||
642 | * available devices. | ||
643 | */ | ||
644 | static struct nfs4_file_layout_dsaddr * | ||
645 | decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) | ||
646 | { | ||
647 | struct nfs4_deviceid_node *d; | ||
648 | struct nfs4_file_layout_dsaddr *n, *new; | ||
649 | |||
650 | new = decode_device(inode, dev, gfp_flags); | ||
651 | if (!new) { | ||
652 | printk(KERN_WARNING "NFS: %s: Could not decode or add device\n", | ||
653 | __func__); | ||
654 | return NULL; | ||
655 | } | ||
656 | |||
657 | d = nfs4_insert_deviceid_node(&new->id_node); | ||
658 | n = container_of(d, struct nfs4_file_layout_dsaddr, id_node); | ||
659 | if (n != new) { | ||
660 | nfs4_fl_free_deviceid(new); | ||
661 | return n; | ||
662 | } | ||
663 | |||
664 | return new; | ||
665 | } | ||
666 | |||
667 | /* | ||
668 | * Retrieve the information for dev_id, add it to the list | ||
669 | * of available devices, and return it. | ||
670 | */ | ||
671 | struct nfs4_file_layout_dsaddr * | ||
672 | filelayout_get_device_info(struct inode *inode, | ||
673 | struct nfs4_deviceid *dev_id, | ||
674 | struct rpc_cred *cred, | ||
675 | gfp_t gfp_flags) | ||
676 | { | ||
677 | struct pnfs_device *pdev = NULL; | ||
678 | u32 max_resp_sz; | ||
679 | int max_pages; | ||
680 | struct page **pages = NULL; | ||
681 | struct nfs4_file_layout_dsaddr *dsaddr = NULL; | ||
682 | int rc, i; | ||
683 | struct nfs_server *server = NFS_SERVER(inode); | ||
684 | |||
685 | /* | ||
686 | * Use the session max response size as the basis for setting | ||
687 | * GETDEVICEINFO's maxcount | ||
688 | */ | ||
689 | max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; | ||
690 | max_pages = nfs_page_array_len(0, max_resp_sz); | ||
691 | dprintk("%s inode %p max_resp_sz %u max_pages %d\n", | ||
692 | __func__, inode, max_resp_sz, max_pages); | ||
693 | |||
694 | pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags); | ||
695 | if (pdev == NULL) | ||
696 | return NULL; | ||
697 | |||
698 | pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); | ||
699 | if (pages == NULL) { | ||
700 | kfree(pdev); | ||
701 | return NULL; | ||
702 | } | ||
703 | for (i = 0; i < max_pages; i++) { | ||
704 | pages[i] = alloc_page(gfp_flags); | ||
705 | if (!pages[i]) | ||
706 | goto out_free; | ||
707 | } | ||
708 | |||
709 | memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); | ||
710 | pdev->layout_type = LAYOUT_NFSV4_1_FILES; | ||
711 | pdev->pages = pages; | ||
712 | pdev->pgbase = 0; | ||
713 | pdev->pglen = max_resp_sz; | ||
714 | pdev->mincount = 0; | ||
715 | pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; | ||
716 | |||
717 | rc = nfs4_proc_getdeviceinfo(server, pdev, cred); | ||
718 | dprintk("%s getdevice info returns %d\n", __func__, rc); | ||
719 | if (rc) | ||
720 | goto out_free; | ||
721 | |||
722 | /* | ||
723 | * Found new device, need to decode it and then add it to the | ||
724 | * list of known devices for this mountpoint. | ||
725 | */ | ||
726 | dsaddr = decode_and_add_device(inode, pdev, gfp_flags); | ||
727 | out_free: | ||
728 | for (i = 0; i < max_pages; i++) | ||
729 | __free_page(pages[i]); | ||
730 | kfree(pages); | ||
731 | kfree(pdev); | ||
732 | dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); | ||
733 | return dsaddr; | ||
734 | } | ||
735 | |||
736 | void | ||
737 | nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) | ||
738 | { | ||
739 | nfs4_put_deviceid_node(&dsaddr->id_node); | ||
740 | } | ||
741 | |||
742 | /* | ||
743 | * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit | ||
744 | * Then: ((res + fsi) % dsaddr->stripe_count) | ||
745 | */ | ||
746 | u32 | ||
747 | nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) | ||
748 | { | ||
749 | struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); | ||
750 | u64 tmp; | ||
751 | |||
752 | tmp = offset - flseg->pattern_offset; | ||
753 | do_div(tmp, flseg->stripe_unit); | ||
754 | tmp += flseg->first_stripe_index; | ||
755 | return do_div(tmp, flseg->dsaddr->stripe_count); | ||
756 | } | ||
757 | |||
758 | u32 | ||
759 | nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j) | ||
760 | { | ||
761 | return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j]; | ||
762 | } | ||
763 | |||
764 | struct nfs_fh * | ||
765 | nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) | ||
766 | { | ||
767 | struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); | ||
768 | u32 i; | ||
769 | |||
770 | if (flseg->stripe_type == STRIPE_SPARSE) { | ||
771 | if (flseg->num_fh == 1) | ||
772 | i = 0; | ||
773 | else if (flseg->num_fh == 0) | ||
774 | /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ | ||
775 | return NULL; | ||
776 | else | ||
777 | i = nfs4_fl_calc_ds_index(lseg, j); | ||
778 | } else | ||
779 | i = j; | ||
780 | return flseg->fh_array[i]; | ||
781 | } | ||
782 | |||
783 | static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) | ||
784 | { | ||
785 | might_sleep(); | ||
786 | wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, | ||
787 | nfs_wait_bit_killable, TASK_KILLABLE); | ||
788 | } | ||
789 | |||
790 | static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) | ||
791 | { | ||
792 | smp_mb__before_atomic(); | ||
793 | clear_bit(NFS4DS_CONNECTING, &ds->ds_state); | ||
794 | smp_mb__after_atomic(); | ||
795 | wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); | ||
796 | } | ||
797 | |||
798 | |||
799 | struct nfs4_pnfs_ds * | ||
800 | nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) | ||
801 | { | ||
802 | struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; | ||
803 | struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; | ||
804 | struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); | ||
805 | struct nfs4_pnfs_ds *ret = ds; | ||
806 | |||
807 | if (ds == NULL) { | ||
808 | printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", | ||
809 | __func__, ds_idx); | ||
810 | filelayout_mark_devid_invalid(devid); | ||
811 | goto out; | ||
812 | } | ||
813 | smp_rmb(); | ||
814 | if (ds->ds_clp) | ||
815 | goto out_test_devid; | ||
816 | |||
817 | if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { | ||
818 | struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); | ||
819 | int err; | ||
820 | |||
821 | err = nfs4_ds_connect(s, ds); | ||
822 | if (err) | ||
823 | nfs4_mark_deviceid_unavailable(devid); | ||
824 | nfs4_clear_ds_conn_bit(ds); | ||
825 | } else { | ||
826 | /* Either ds is connected, or ds is NULL */ | ||
827 | nfs4_wait_ds_connect(ds); | ||
828 | } | ||
829 | out_test_devid: | ||
830 | if (filelayout_test_devid_unavailable(devid)) | ||
831 | ret = NULL; | ||
832 | out: | ||
833 | return ret; | ||
834 | } | ||
835 | |||
836 | module_param(dataserver_retrans, uint, 0644); | ||
837 | MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " | ||
838 | "retries a request before it attempts further " | ||
839 | " recovery action."); | ||
840 | module_param(dataserver_timeo, uint, 0644); | ||
841 | MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the " | ||
842 | "NFSv4.1 client waits for a response from a " | ||
843 | " data server before it retries an NFS request."); | ||