aboutsummaryrefslogtreecommitdiffstats
path: root/fs/nfs/filelayout
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-06-10 18:02:42 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-06-10 18:02:42 -0400
commitd1e1cda862c16252087374ac75949b0e89a5717e (patch)
tree544ce467bed23638949a1991b4f7b00e7472baa4 /fs/nfs/filelayout
parent07888238f55056605cd23aa4ea3ca97d5e15938f (diff)
parenta914722f333b3359d2f4f12919380a334176bb89 (diff)
Merge tag 'nfs-for-3.16-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust: "Highlights include: - massive cleanup of the NFS read/write code by Anna and Dros - support multiple NFS read/write requests per page in order to deal with non-page aligned pNFS striping. Also cleans up the r/wsize < page size code nicely. - stable fix for ensuring inode is declared uptodate only after all the attributes have been checked. - stable fix for a kernel Oops when remounting - NFS over RDMA client fixes - move the pNFS files layout driver into its own subdirectory" * tag 'nfs-for-3.16-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (79 commits) NFS: populate ->net in mount data when remounting pnfs: fix lockup caused by pnfs_generic_pg_test NFSv4.1: Fix typo in dprintk NFSv4.1: Comment is now wrong and redundant to code NFS: Use raw_write_seqcount_begin/end int nfs4_reclaim_open_state xprtrdma: Disconnect on registration failure xprtrdma: Remove BUG_ON() call sites xprtrdma: Avoid deadlock when credit window is reset SUNRPC: Move congestion window constants to header file xprtrdma: Reset connection timeout after successful reconnect xprtrdma: Use macros for reconnection timeout constants xprtrdma: Allocate missing pagelist xprtrdma: Remove Tavor MTU setting xprtrdma: Ensure ia->ri_id->qp is not NULL when reconnecting xprtrdma: Reduce the number of hardway buffer allocations xprtrdma: Limit work done by completion handler xprtrmda: Reduce calls to ib_poll_cq() in completion handlers xprtrmda: Reduce lock contention in completion handlers xprtrdma: Split the completion queue xprtrdma: Make rpcrdma_ep_destroy() return void ...
Diffstat (limited to 'fs/nfs/filelayout')
-rw-r--r--fs/nfs/filelayout/Makefile5
-rw-r--r--fs/nfs/filelayout/filelayout.c1406
-rw-r--r--fs/nfs/filelayout/filelayout.h156
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c843
4 files changed, 2410 insertions, 0 deletions
diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile
new file mode 100644
index 000000000000..8516cdffb9e9
--- /dev/null
+++ b/fs/nfs/filelayout/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS Files Layout Driver kernel module
3#
4obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
5nfs_layout_nfsv41_files-y := filelayout.o filelayoutdev.o
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
new file mode 100644
index 000000000000..d2eba1c13b7e
--- /dev/null
+++ b/fs/nfs/filelayout/filelayout.c
@@ -0,0 +1,1406 @@
1/*
2 * Module for the pnfs nfs4 file layout driver.
3 * Defines all I/O and Policy interface operations, plus code
4 * to register itself with the pNFS client.
5 *
6 * Copyright (c) 2002
7 * The Regents of the University of Michigan
8 * All Rights Reserved
9 *
10 * Dean Hildebrand <dhildebz@umich.edu>
11 *
12 * Permission is granted to use, copy, create derivative works, and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the University of Michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. If
17 * the above copyright notice or any other identification of the
18 * University of Michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * This software is provided as is, without representation or warranty
22 * of any kind either express or implied, including without limitation
23 * the implied warranties of merchantability, fitness for a particular
24 * purpose, or noninfringement. The Regents of the University of
25 * Michigan shall not be liable for any damages, including special,
26 * indirect, incidental, or consequential damages, with respect to any
27 * claim arising out of or in connection with the use of the software,
28 * even if it has been or is hereafter advised of the possibility of
29 * such damages.
30 */
31
32#include <linux/nfs_fs.h>
33#include <linux/nfs_page.h>
34#include <linux/module.h>
35
36#include <linux/sunrpc/metrics.h>
37
38#include "../nfs4session.h"
39#include "../internal.h"
40#include "../delegation.h"
41#include "filelayout.h"
42#include "../nfs4trace.h"
43
44#define NFSDBG_FACILITY NFSDBG_PNFS_LD
45
46MODULE_LICENSE("GPL");
47MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
48MODULE_DESCRIPTION("The NFSv4 file layout driver");
49
50#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
51
52static loff_t
53filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
54 loff_t offset)
55{
56 u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
57 u64 stripe_no;
58 u32 rem;
59
60 offset -= flseg->pattern_offset;
61 stripe_no = div_u64(offset, stripe_width);
62 div_u64_rem(offset, flseg->stripe_unit, &rem);
63
64 return stripe_no * flseg->stripe_unit + rem;
65}
66
67/* This function is used by the layout driver to calculate the
68 * offset of the file on the dserver based on whether the
69 * layout type is STRIPE_DENSE or STRIPE_SPARSE
70 */
71static loff_t
72filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
73{
74 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
75
76 switch (flseg->stripe_type) {
77 case STRIPE_SPARSE:
78 return offset;
79
80 case STRIPE_DENSE:
81 return filelayout_get_dense_offset(flseg, offset);
82 }
83
84 BUG();
85}
86
87static void filelayout_reset_write(struct nfs_pgio_data *data)
88{
89 struct nfs_pgio_header *hdr = data->header;
90 struct rpc_task *task = &data->task;
91
92 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
93 dprintk("%s Reset task %5u for i/o through MDS "
94 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
95 data->task.tk_pid,
96 hdr->inode->i_sb->s_id,
97 (unsigned long long)NFS_FILEID(hdr->inode),
98 data->args.count,
99 (unsigned long long)data->args.offset);
100
101 task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
102 &hdr->pages,
103 hdr->completion_ops,
104 hdr->dreq);
105 }
106}
107
108static void filelayout_reset_read(struct nfs_pgio_data *data)
109{
110 struct nfs_pgio_header *hdr = data->header;
111 struct rpc_task *task = &data->task;
112
113 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
114 dprintk("%s Reset task %5u for i/o through MDS "
115 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
116 data->task.tk_pid,
117 hdr->inode->i_sb->s_id,
118 (unsigned long long)NFS_FILEID(hdr->inode),
119 data->args.count,
120 (unsigned long long)data->args.offset);
121
122 task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
123 &hdr->pages,
124 hdr->completion_ops,
125 hdr->dreq);
126 }
127}
128
129static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo)
130{
131 if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
132 return;
133 pnfs_return_layout(inode);
134}
135
136static int filelayout_async_handle_error(struct rpc_task *task,
137 struct nfs4_state *state,
138 struct nfs_client *clp,
139 struct pnfs_layout_segment *lseg)
140{
141 struct pnfs_layout_hdr *lo = lseg->pls_layout;
142 struct inode *inode = lo->plh_inode;
143 struct nfs_server *mds_server = NFS_SERVER(inode);
144 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
145 struct nfs_client *mds_client = mds_server->nfs_client;
146 struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
147
148 if (task->tk_status >= 0)
149 return 0;
150
151 switch (task->tk_status) {
152 /* MDS state errors */
153 case -NFS4ERR_DELEG_REVOKED:
154 case -NFS4ERR_ADMIN_REVOKED:
155 case -NFS4ERR_BAD_STATEID:
156 if (state == NULL)
157 break;
158 nfs_remove_bad_delegation(state->inode);
159 case -NFS4ERR_OPENMODE:
160 if (state == NULL)
161 break;
162 if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
163 goto out_bad_stateid;
164 goto wait_on_recovery;
165 case -NFS4ERR_EXPIRED:
166 if (state != NULL) {
167 if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
168 goto out_bad_stateid;
169 }
170 nfs4_schedule_lease_recovery(mds_client);
171 goto wait_on_recovery;
172 /* DS session errors */
173 case -NFS4ERR_BADSESSION:
174 case -NFS4ERR_BADSLOT:
175 case -NFS4ERR_BAD_HIGH_SLOT:
176 case -NFS4ERR_DEADSESSION:
177 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
178 case -NFS4ERR_SEQ_FALSE_RETRY:
179 case -NFS4ERR_SEQ_MISORDERED:
180 dprintk("%s ERROR %d, Reset session. Exchangeid "
181 "flags 0x%x\n", __func__, task->tk_status,
182 clp->cl_exchange_flags);
183 nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
184 break;
185 case -NFS4ERR_DELAY:
186 case -NFS4ERR_GRACE:
187 rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
188 break;
189 case -NFS4ERR_RETRY_UNCACHED_REP:
190 break;
191 /* Invalidate Layout errors */
192 case -NFS4ERR_PNFS_NO_LAYOUT:
193 case -ESTALE: /* mapped NFS4ERR_STALE */
194 case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
195 case -EISDIR: /* mapped NFS4ERR_ISDIR */
196 case -NFS4ERR_FHEXPIRED:
197 case -NFS4ERR_WRONG_TYPE:
198 dprintk("%s Invalid layout error %d\n", __func__,
199 task->tk_status);
200 /*
201 * Destroy layout so new i/o will get a new layout.
202 * Layout will not be destroyed until all current lseg
203 * references are put. Mark layout as invalid to resend failed
204 * i/o and all i/o waiting on the slot table to the MDS until
205 * layout is destroyed and a new valid layout is obtained.
206 */
207 pnfs_destroy_layout(NFS_I(inode));
208 rpc_wake_up(&tbl->slot_tbl_waitq);
209 goto reset;
210 /* RPC connection errors */
211 case -ECONNREFUSED:
212 case -EHOSTDOWN:
213 case -EHOSTUNREACH:
214 case -ENETUNREACH:
215 case -EIO:
216 case -ETIMEDOUT:
217 case -EPIPE:
218 dprintk("%s DS connection error %d\n", __func__,
219 task->tk_status);
220 nfs4_mark_deviceid_unavailable(devid);
221 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
222 rpc_wake_up(&tbl->slot_tbl_waitq);
223 /* fall through */
224 default:
225reset:
226 dprintk("%s Retry through MDS. Error %d\n", __func__,
227 task->tk_status);
228 return -NFS4ERR_RESET_TO_MDS;
229 }
230out:
231 task->tk_status = 0;
232 return -EAGAIN;
233out_bad_stateid:
234 task->tk_status = -EIO;
235 return 0;
236wait_on_recovery:
237 rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
238 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
239 rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
240 goto out;
241}
242
243/* NFS_PROTO call done callback routines */
244
245static int filelayout_read_done_cb(struct rpc_task *task,
246 struct nfs_pgio_data *data)
247{
248 struct nfs_pgio_header *hdr = data->header;
249 int err;
250
251 trace_nfs4_pnfs_read(data, task->tk_status);
252 err = filelayout_async_handle_error(task, data->args.context->state,
253 data->ds_clp, hdr->lseg);
254
255 switch (err) {
256 case -NFS4ERR_RESET_TO_MDS:
257 filelayout_reset_read(data);
258 return task->tk_status;
259 case -EAGAIN:
260 rpc_restart_call_prepare(task);
261 return -EAGAIN;
262 }
263
264 return 0;
265}
266
267/*
268 * We reference the rpc_cred of the first WRITE that triggers the need for
269 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
270 * rfc5661 is not clear about which credential should be used.
271 */
272static void
273filelayout_set_layoutcommit(struct nfs_pgio_data *wdata)
274{
275 struct nfs_pgio_header *hdr = wdata->header;
276
277 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
278 wdata->res.verf->committed == NFS_FILE_SYNC)
279 return;
280
281 pnfs_set_layoutcommit(wdata);
282 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
283 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
284}
285
286bool
287filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node)
288{
289 return filelayout_test_devid_invalid(node) ||
290 nfs4_test_deviceid_unavailable(node);
291}
292
293static bool
294filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
295{
296 struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg);
297
298 return filelayout_test_devid_unavailable(node);
299}
300
301/*
302 * Call ops for the async read/write cases
303 * In the case of dense layouts, the offset needs to be reset to its
304 * original value.
305 */
306static void filelayout_read_prepare(struct rpc_task *task, void *data)
307{
308 struct nfs_pgio_data *rdata = data;
309
310 if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) {
311 rpc_exit(task, -EIO);
312 return;
313 }
314 if (filelayout_reset_to_mds(rdata->header->lseg)) {
315 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
316 filelayout_reset_read(rdata);
317 rpc_exit(task, 0);
318 return;
319 }
320 rdata->pgio_done_cb = filelayout_read_done_cb;
321
322 if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
323 &rdata->args.seq_args,
324 &rdata->res.seq_res,
325 task))
326 return;
327 if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context,
328 rdata->args.lock_context, FMODE_READ) == -EIO)
329 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
330}
331
332static void filelayout_read_call_done(struct rpc_task *task, void *data)
333{
334 struct nfs_pgio_data *rdata = data;
335
336 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
337
338 if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) &&
339 task->tk_status == 0) {
340 nfs41_sequence_done(task, &rdata->res.seq_res);
341 return;
342 }
343
344 /* Note this may cause RPC to be resent */
345 rdata->header->mds_ops->rpc_call_done(task, data);
346}
347
348static void filelayout_read_count_stats(struct rpc_task *task, void *data)
349{
350 struct nfs_pgio_data *rdata = data;
351
352 rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics);
353}
354
355static void filelayout_read_release(void *data)
356{
357 struct nfs_pgio_data *rdata = data;
358 struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout;
359
360 filelayout_fenceme(lo->plh_inode, lo);
361 nfs_put_client(rdata->ds_clp);
362 rdata->header->mds_ops->rpc_release(data);
363}
364
365static int filelayout_write_done_cb(struct rpc_task *task,
366 struct nfs_pgio_data *data)
367{
368 struct nfs_pgio_header *hdr = data->header;
369 int err;
370
371 trace_nfs4_pnfs_write(data, task->tk_status);
372 err = filelayout_async_handle_error(task, data->args.context->state,
373 data->ds_clp, hdr->lseg);
374
375 switch (err) {
376 case -NFS4ERR_RESET_TO_MDS:
377 filelayout_reset_write(data);
378 return task->tk_status;
379 case -EAGAIN:
380 rpc_restart_call_prepare(task);
381 return -EAGAIN;
382 }
383
384 filelayout_set_layoutcommit(data);
385 return 0;
386}
387
388/* Fake up some data that will cause nfs_commit_release to retry the writes. */
389static void prepare_to_resend_writes(struct nfs_commit_data *data)
390{
391 struct nfs_page *first = nfs_list_entry(data->pages.next);
392
393 data->task.tk_status = 0;
394 memcpy(&data->verf.verifier, &first->wb_verf,
395 sizeof(data->verf.verifier));
396 data->verf.verifier.data[0]++; /* ensure verifier mismatch */
397}
398
399static int filelayout_commit_done_cb(struct rpc_task *task,
400 struct nfs_commit_data *data)
401{
402 int err;
403
404 trace_nfs4_pnfs_commit_ds(data, task->tk_status);
405 err = filelayout_async_handle_error(task, NULL, data->ds_clp,
406 data->lseg);
407
408 switch (err) {
409 case -NFS4ERR_RESET_TO_MDS:
410 prepare_to_resend_writes(data);
411 return -EAGAIN;
412 case -EAGAIN:
413 rpc_restart_call_prepare(task);
414 return -EAGAIN;
415 }
416
417 return 0;
418}
419
420static void filelayout_write_prepare(struct rpc_task *task, void *data)
421{
422 struct nfs_pgio_data *wdata = data;
423
424 if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) {
425 rpc_exit(task, -EIO);
426 return;
427 }
428 if (filelayout_reset_to_mds(wdata->header->lseg)) {
429 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
430 filelayout_reset_write(wdata);
431 rpc_exit(task, 0);
432 return;
433 }
434 if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
435 &wdata->args.seq_args,
436 &wdata->res.seq_res,
437 task))
438 return;
439 if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context,
440 wdata->args.lock_context, FMODE_WRITE) == -EIO)
441 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
442}
443
444static void filelayout_write_call_done(struct rpc_task *task, void *data)
445{
446 struct nfs_pgio_data *wdata = data;
447
448 if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) &&
449 task->tk_status == 0) {
450 nfs41_sequence_done(task, &wdata->res.seq_res);
451 return;
452 }
453
454 /* Note this may cause RPC to be resent */
455 wdata->header->mds_ops->rpc_call_done(task, data);
456}
457
458static void filelayout_write_count_stats(struct rpc_task *task, void *data)
459{
460 struct nfs_pgio_data *wdata = data;
461
462 rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics);
463}
464
465static void filelayout_write_release(void *data)
466{
467 struct nfs_pgio_data *wdata = data;
468 struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout;
469
470 filelayout_fenceme(lo->plh_inode, lo);
471 nfs_put_client(wdata->ds_clp);
472 wdata->header->mds_ops->rpc_release(data);
473}
474
475static void filelayout_commit_prepare(struct rpc_task *task, void *data)
476{
477 struct nfs_commit_data *wdata = data;
478
479 nfs41_setup_sequence(wdata->ds_clp->cl_session,
480 &wdata->args.seq_args,
481 &wdata->res.seq_res,
482 task);
483}
484
485static void filelayout_write_commit_done(struct rpc_task *task, void *data)
486{
487 struct nfs_commit_data *wdata = data;
488
489 /* Note this may cause RPC to be resent */
490 wdata->mds_ops->rpc_call_done(task, data);
491}
492
493static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
494{
495 struct nfs_commit_data *cdata = data;
496
497 rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics);
498}
499
500static void filelayout_commit_release(void *calldata)
501{
502 struct nfs_commit_data *data = calldata;
503
504 data->completion_ops->completion(data);
505 pnfs_put_lseg(data->lseg);
506 nfs_put_client(data->ds_clp);
507 nfs_commitdata_release(data);
508}
509
510static const struct rpc_call_ops filelayout_read_call_ops = {
511 .rpc_call_prepare = filelayout_read_prepare,
512 .rpc_call_done = filelayout_read_call_done,
513 .rpc_count_stats = filelayout_read_count_stats,
514 .rpc_release = filelayout_read_release,
515};
516
517static const struct rpc_call_ops filelayout_write_call_ops = {
518 .rpc_call_prepare = filelayout_write_prepare,
519 .rpc_call_done = filelayout_write_call_done,
520 .rpc_count_stats = filelayout_write_count_stats,
521 .rpc_release = filelayout_write_release,
522};
523
524static const struct rpc_call_ops filelayout_commit_call_ops = {
525 .rpc_call_prepare = filelayout_commit_prepare,
526 .rpc_call_done = filelayout_write_commit_done,
527 .rpc_count_stats = filelayout_commit_count_stats,
528 .rpc_release = filelayout_commit_release,
529};
530
531static enum pnfs_try_status
532filelayout_read_pagelist(struct nfs_pgio_data *data)
533{
534 struct nfs_pgio_header *hdr = data->header;
535 struct pnfs_layout_segment *lseg = hdr->lseg;
536 struct nfs4_pnfs_ds *ds;
537 struct rpc_clnt *ds_clnt;
538 loff_t offset = data->args.offset;
539 u32 j, idx;
540 struct nfs_fh *fh;
541
542 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
543 __func__, hdr->inode->i_ino,
544 data->args.pgbase, (size_t)data->args.count, offset);
545
546 /* Retrieve the correct rpc_client for the byte range */
547 j = nfs4_fl_calc_j_index(lseg, offset);
548 idx = nfs4_fl_calc_ds_index(lseg, j);
549 ds = nfs4_fl_prepare_ds(lseg, idx);
550 if (!ds)
551 return PNFS_NOT_ATTEMPTED;
552
553 ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode);
554 if (IS_ERR(ds_clnt))
555 return PNFS_NOT_ATTEMPTED;
556
557 dprintk("%s USE DS: %s cl_count %d\n", __func__,
558 ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
559
560 /* No multipath support. Use first DS */
561 atomic_inc(&ds->ds_clp->cl_count);
562 data->ds_clp = ds->ds_clp;
563 data->ds_idx = idx;
564 fh = nfs4_fl_select_ds_fh(lseg, j);
565 if (fh)
566 data->args.fh = fh;
567
568 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
569 data->mds_offset = offset;
570
571 /* Perform an asynchronous read to ds */
572 nfs_initiate_pgio(ds_clnt, data,
573 &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
574 return PNFS_ATTEMPTED;
575}
576
577/* Perform async writes. */
578static enum pnfs_try_status
579filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
580{
581 struct nfs_pgio_header *hdr = data->header;
582 struct pnfs_layout_segment *lseg = hdr->lseg;
583 struct nfs4_pnfs_ds *ds;
584 struct rpc_clnt *ds_clnt;
585 loff_t offset = data->args.offset;
586 u32 j, idx;
587 struct nfs_fh *fh;
588
589 /* Retrieve the correct rpc_client for the byte range */
590 j = nfs4_fl_calc_j_index(lseg, offset);
591 idx = nfs4_fl_calc_ds_index(lseg, j);
592 ds = nfs4_fl_prepare_ds(lseg, idx);
593 if (!ds)
594 return PNFS_NOT_ATTEMPTED;
595
596 ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode);
597 if (IS_ERR(ds_clnt))
598 return PNFS_NOT_ATTEMPTED;
599
600 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
601 __func__, hdr->inode->i_ino, sync, (size_t) data->args.count,
602 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
603
604 data->pgio_done_cb = filelayout_write_done_cb;
605 atomic_inc(&ds->ds_clp->cl_count);
606 data->ds_clp = ds->ds_clp;
607 data->ds_idx = idx;
608 fh = nfs4_fl_select_ds_fh(lseg, j);
609 if (fh)
610 data->args.fh = fh;
611
612 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
613
614 /* Perform an asynchronous write */
615 nfs_initiate_pgio(ds_clnt, data,
616 &filelayout_write_call_ops, sync,
617 RPC_TASK_SOFTCONN);
618 return PNFS_ATTEMPTED;
619}
620
621/*
622 * filelayout_check_layout()
623 *
624 * Make sure layout segment parameters are sane WRT the device.
625 * At this point no generic layer initialization of the lseg has occurred,
626 * and nothing has been added to the layout_hdr cache.
627 *
628 */
629static int
630filelayout_check_layout(struct pnfs_layout_hdr *lo,
631 struct nfs4_filelayout_segment *fl,
632 struct nfs4_layoutget_res *lgr,
633 struct nfs4_deviceid *id,
634 gfp_t gfp_flags)
635{
636 struct nfs4_deviceid_node *d;
637 struct nfs4_file_layout_dsaddr *dsaddr;
638 int status = -EINVAL;
639
640 dprintk("--> %s\n", __func__);
641
642 /* FIXME: remove this check when layout segment support is added */
643 if (lgr->range.offset != 0 ||
644 lgr->range.length != NFS4_MAX_UINT64) {
645 dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
646 __func__);
647 goto out;
648 }
649
650 if (fl->pattern_offset > lgr->range.offset) {
651 dprintk("%s pattern_offset %lld too large\n",
652 __func__, fl->pattern_offset);
653 goto out;
654 }
655
656 if (!fl->stripe_unit) {
657 dprintk("%s Invalid stripe unit (%u)\n",
658 __func__, fl->stripe_unit);
659 goto out;
660 }
661
662 /* find and reference the deviceid */
663 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
664 NFS_SERVER(lo->plh_inode)->nfs_client, id);
665 if (d == NULL) {
666 dsaddr = filelayout_get_device_info(lo->plh_inode, id,
667 lo->plh_lc_cred, gfp_flags);
668 if (dsaddr == NULL)
669 goto out;
670 } else
671 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
672 /* Found deviceid is unavailable */
673 if (filelayout_test_devid_unavailable(&dsaddr->id_node))
674 goto out_put;
675
676 fl->dsaddr = dsaddr;
677
678 if (fl->first_stripe_index >= dsaddr->stripe_count) {
679 dprintk("%s Bad first_stripe_index %u\n",
680 __func__, fl->first_stripe_index);
681 goto out_put;
682 }
683
684 if ((fl->stripe_type == STRIPE_SPARSE &&
685 fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
686 (fl->stripe_type == STRIPE_DENSE &&
687 fl->num_fh != dsaddr->stripe_count)) {
688 dprintk("%s num_fh %u not valid for given packing\n",
689 __func__, fl->num_fh);
690 goto out_put;
691 }
692
693 status = 0;
694out:
695 dprintk("--> %s returns %d\n", __func__, status);
696 return status;
697out_put:
698 nfs4_fl_put_deviceid(dsaddr);
699 goto out;
700}
701
702static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
703{
704 int i;
705
706 for (i = 0; i < fl->num_fh; i++) {
707 if (!fl->fh_array[i])
708 break;
709 kfree(fl->fh_array[i]);
710 }
711 kfree(fl->fh_array);
712 fl->fh_array = NULL;
713}
714
/*
 * Release a file layout segment: first its file handle array, then the
 * segment structure itself.
 */
static void
_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
{
	filelayout_free_fh_array(fl);
	kfree(fl);
}
721
722static int
723filelayout_decode_layout(struct pnfs_layout_hdr *flo,
724 struct nfs4_filelayout_segment *fl,
725 struct nfs4_layoutget_res *lgr,
726 struct nfs4_deviceid *id,
727 gfp_t gfp_flags)
728{
729 struct xdr_stream stream;
730 struct xdr_buf buf;
731 struct page *scratch;
732 __be32 *p;
733 uint32_t nfl_util;
734 int i;
735
736 dprintk("%s: set_layout_map Begin\n", __func__);
737
738 scratch = alloc_page(gfp_flags);
739 if (!scratch)
740 return -ENOMEM;
741
742 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
743 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
744
745 /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
746 * num_fh (4) */
747 p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20);
748 if (unlikely(!p))
749 goto out_err;
750
751 memcpy(id, p, sizeof(*id));
752 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
753 nfs4_print_deviceid(id);
754
755 nfl_util = be32_to_cpup(p++);
756 if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
757 fl->commit_through_mds = 1;
758 if (nfl_util & NFL4_UFLG_DENSE)
759 fl->stripe_type = STRIPE_DENSE;
760 else
761 fl->stripe_type = STRIPE_SPARSE;
762 fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
763
764 fl->first_stripe_index = be32_to_cpup(p++);
765 p = xdr_decode_hyper(p, &fl->pattern_offset);
766 fl->num_fh = be32_to_cpup(p++);
767
768 dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
769 __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
770 fl->pattern_offset);
771
772 /* Note that a zero value for num_fh is legal for STRIPE_SPARSE.
773 * Futher checking is done in filelayout_check_layout */
774 if (fl->num_fh >
775 max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT))
776 goto out_err;
777
778 if (fl->num_fh > 0) {
779 fl->fh_array = kcalloc(fl->num_fh, sizeof(fl->fh_array[0]),
780 gfp_flags);
781 if (!fl->fh_array)
782 goto out_err;
783 }
784
785 for (i = 0; i < fl->num_fh; i++) {
786 /* Do we want to use a mempool here? */
787 fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags);
788 if (!fl->fh_array[i])
789 goto out_err_free;
790
791 p = xdr_inline_decode(&stream, 4);
792 if (unlikely(!p))
793 goto out_err_free;
794 fl->fh_array[i]->size = be32_to_cpup(p++);
795 if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
796 printk(KERN_ERR "NFS: Too big fh %d received %d\n",
797 i, fl->fh_array[i]->size);
798 goto out_err_free;
799 }
800
801 p = xdr_inline_decode(&stream, fl->fh_array[i]->size);
802 if (unlikely(!p))
803 goto out_err_free;
804 memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
805 dprintk("DEBUG: %s: fh len %d\n", __func__,
806 fl->fh_array[i]->size);
807 }
808
809 __free_page(scratch);
810 return 0;
811
812out_err_free:
813 filelayout_free_fh_array(fl);
814out_err:
815 __free_page(scratch);
816 return -EIO;
817}
818
819static void
820filelayout_free_lseg(struct pnfs_layout_segment *lseg)
821{
822 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
823
824 dprintk("--> %s\n", __func__);
825 nfs4_fl_put_deviceid(fl->dsaddr);
826 /* This assumes a single RW lseg */
827 if (lseg->pls_range.iomode == IOMODE_RW) {
828 struct nfs4_filelayout *flo;
829
830 flo = FILELAYOUT_FROM_HDR(lseg->pls_layout);
831 flo->commit_info.nbuckets = 0;
832 kfree(flo->commit_info.buckets);
833 flo->commit_info.buckets = NULL;
834 }
835 _filelayout_free_lseg(fl);
836}
837
838static int
839filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
840 struct nfs_commit_info *cinfo,
841 gfp_t gfp_flags)
842{
843 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
844 struct pnfs_commit_bucket *buckets;
845 int size, i;
846
847 if (fl->commit_through_mds)
848 return 0;
849
850 size = (fl->stripe_type == STRIPE_SPARSE) ?
851 fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
852
853 if (cinfo->ds->nbuckets >= size) {
854 /* This assumes there is only one IOMODE_RW lseg. What
855 * we really want to do is have a layout_hdr level
856 * dictionary of <multipath_list4, fh> keys, each
857 * associated with a struct list_head, populated by calls
858 * to filelayout_write_pagelist().
859 * */
860 return 0;
861 }
862
863 buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
864 gfp_flags);
865 if (!buckets)
866 return -ENOMEM;
867 for (i = 0; i < size; i++) {
868 INIT_LIST_HEAD(&buckets[i].written);
869 INIT_LIST_HEAD(&buckets[i].committing);
870 /* mark direct verifier as unset */
871 buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
872 }
873
874 spin_lock(cinfo->lock);
875 if (cinfo->ds->nbuckets >= size)
876 goto out;
877 for (i = 0; i < cinfo->ds->nbuckets; i++) {
878 list_splice(&cinfo->ds->buckets[i].written,
879 &buckets[i].written);
880 list_splice(&cinfo->ds->buckets[i].committing,
881 &buckets[i].committing);
882 buckets[i].direct_verf.committed =
883 cinfo->ds->buckets[i].direct_verf.committed;
884 buckets[i].wlseg = cinfo->ds->buckets[i].wlseg;
885 buckets[i].clseg = cinfo->ds->buckets[i].clseg;
886 }
887 swap(cinfo->ds->buckets, buckets);
888 cinfo->ds->nbuckets = size;
889out:
890 spin_unlock(cinfo->lock);
891 kfree(buckets);
892 return 0;
893}
894
895static struct pnfs_layout_segment *
896filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
897 struct nfs4_layoutget_res *lgr,
898 gfp_t gfp_flags)
899{
900 struct nfs4_filelayout_segment *fl;
901 int rc;
902 struct nfs4_deviceid id;
903
904 dprintk("--> %s\n", __func__);
905 fl = kzalloc(sizeof(*fl), gfp_flags);
906 if (!fl)
907 return NULL;
908
909 rc = filelayout_decode_layout(layoutid, fl, lgr, &id, gfp_flags);
910 if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id, gfp_flags)) {
911 _filelayout_free_lseg(fl);
912 return NULL;
913 }
914 return &fl->generic_hdr;
915}
916
917/*
918 * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
919 *
920 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
921 * of bytes (maximum @req->wb_bytes) that can be coalesced.
922 */
923static size_t
924filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
925 struct nfs_page *req)
926{
927 unsigned int size;
928 u64 p_stripe, r_stripe;
929 u32 stripe_offset;
930 u64 segment_offset = pgio->pg_lseg->pls_range.offset;
931 u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
932
933 /* calls nfs_generic_pg_test */
934 size = pnfs_generic_pg_test(pgio, prev, req);
935 if (!size)
936 return 0;
937
938 /* see if req and prev are in the same stripe */
939 if (prev) {
940 p_stripe = (u64)req_offset(prev) - segment_offset;
941 r_stripe = (u64)req_offset(req) - segment_offset;
942 do_div(p_stripe, stripe_unit);
943 do_div(r_stripe, stripe_unit);
944
945 if (p_stripe != r_stripe)
946 return 0;
947 }
948
949 /* calculate remaining bytes in the current stripe */
950 div_u64_rem((u64)req_offset(req) - segment_offset,
951 stripe_unit,
952 &stripe_offset);
953 WARN_ON_ONCE(stripe_offset > stripe_unit);
954 if (stripe_offset >= stripe_unit)
955 return 0;
956 return min(stripe_unit - (unsigned int)stripe_offset, size);
957}
958
959static void
960filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
961 struct nfs_page *req)
962{
963 if (!pgio->pg_lseg)
964 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
965 req->wb_context,
966 0,
967 NFS4_MAX_UINT64,
968 IOMODE_READ,
969 GFP_KERNEL);
970 /* If no lseg, fall back to read through mds */
971 if (pgio->pg_lseg == NULL)
972 nfs_pageio_reset_read_mds(pgio);
973}
974
975static void
976filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
977 struct nfs_page *req)
978{
979 struct nfs_commit_info cinfo;
980 int status;
981
982 if (!pgio->pg_lseg)
983 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
984 req->wb_context,
985 0,
986 NFS4_MAX_UINT64,
987 IOMODE_RW,
988 GFP_NOFS);
989 /* If no lseg, fall back to write through mds */
990 if (pgio->pg_lseg == NULL)
991 goto out_mds;
992 nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
993 status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
994 if (status < 0) {
995 pnfs_put_lseg(pgio->pg_lseg);
996 pgio->pg_lseg = NULL;
997 goto out_mds;
998 }
999 return;
1000out_mds:
1001 nfs_pageio_reset_write_mds(pgio);
1002}
1003
1004static const struct nfs_pageio_ops filelayout_pg_read_ops = {
1005 .pg_init = filelayout_pg_init_read,
1006 .pg_test = filelayout_pg_test,
1007 .pg_doio = pnfs_generic_pg_readpages,
1008};
1009
1010static const struct nfs_pageio_ops filelayout_pg_write_ops = {
1011 .pg_init = filelayout_pg_init_write,
1012 .pg_test = filelayout_pg_test,
1013 .pg_doio = pnfs_generic_pg_writepages,
1014};
1015
1016static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
1017{
1018 if (fl->stripe_type == STRIPE_SPARSE)
1019 return nfs4_fl_calc_ds_index(&fl->generic_hdr, j);
1020 else
1021 return j;
1022}
1023
1024/* The generic layer is about to remove the req from the commit list.
1025 * If this will make the bucket empty, it will need to put the lseg reference.
1026 */
1027static void
1028filelayout_clear_request_commit(struct nfs_page *req,
1029 struct nfs_commit_info *cinfo)
1030{
1031 struct pnfs_layout_segment *freeme = NULL;
1032
1033 spin_lock(cinfo->lock);
1034 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
1035 goto out;
1036 cinfo->ds->nwritten--;
1037 if (list_is_singular(&req->wb_list)) {
1038 struct pnfs_commit_bucket *bucket;
1039
1040 bucket = list_first_entry(&req->wb_list,
1041 struct pnfs_commit_bucket,
1042 written);
1043 freeme = bucket->wlseg;
1044 bucket->wlseg = NULL;
1045 }
1046out:
1047 nfs_request_remove_commit_list(req, cinfo);
1048 spin_unlock(cinfo->lock);
1049 pnfs_put_lseg(freeme);
1050}
1051
1052static struct list_head *
1053filelayout_choose_commit_list(struct nfs_page *req,
1054 struct pnfs_layout_segment *lseg,
1055 struct nfs_commit_info *cinfo)
1056{
1057 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
1058 u32 i, j;
1059 struct list_head *list;
1060 struct pnfs_commit_bucket *buckets;
1061
1062 if (fl->commit_through_mds)
1063 return &cinfo->mds->list;
1064
1065 /* Note that we are calling nfs4_fl_calc_j_index on each page
1066 * that ends up being committed to a data server. An attractive
1067 * alternative is to add a field to nfs_write_data and nfs_page
1068 * to store the value calculated in filelayout_write_pagelist
1069 * and just use that here.
1070 */
1071 j = nfs4_fl_calc_j_index(lseg, req_offset(req));
1072 i = select_bucket_index(fl, j);
1073 spin_lock(cinfo->lock);
1074 buckets = cinfo->ds->buckets;
1075 list = &buckets[i].written;
1076 if (list_empty(list)) {
1077 /* Non-empty buckets hold a reference on the lseg. That ref
1078 * is normally transferred to the COMMIT call and released
1079 * there. It could also be released if the last req is pulled
1080 * off due to a rewrite, in which case it will be done in
1081 * filelayout_clear_request_commit
1082 */
1083 buckets[i].wlseg = pnfs_get_lseg(lseg);
1084 }
1085 set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1086 cinfo->ds->nwritten++;
1087 spin_unlock(cinfo->lock);
1088 return list;
1089}
1090
/*
 * Queue @req for commit on whichever list filelayout_choose_commit_list()
 * selects for it (MDS list or a data-server bucket).
 */
static void
filelayout_mark_request_commit(struct nfs_page *req,
			       struct pnfs_layout_segment *lseg,
			       struct nfs_commit_info *cinfo)
{
	struct list_head *target = filelayout_choose_commit_list(req, lseg,
								  cinfo);

	nfs_request_add_commit_list(req, target, cinfo);
}
1101
1102static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1103{
1104 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
1105
1106 if (flseg->stripe_type == STRIPE_SPARSE)
1107 return i;
1108 else
1109 return nfs4_fl_calc_ds_index(lseg, i);
1110}
1111
1112static struct nfs_fh *
1113select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1114{
1115 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
1116
1117 if (flseg->stripe_type == STRIPE_SPARSE) {
1118 if (flseg->num_fh == 1)
1119 i = 0;
1120 else if (flseg->num_fh == 0)
1121 /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
1122 return NULL;
1123 }
1124 return flseg->fh_array[i];
1125}
1126
1127static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
1128{
1129 struct pnfs_layout_segment *lseg = data->lseg;
1130 struct nfs4_pnfs_ds *ds;
1131 struct rpc_clnt *ds_clnt;
1132 u32 idx;
1133 struct nfs_fh *fh;
1134
1135 idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
1136 ds = nfs4_fl_prepare_ds(lseg, idx);
1137 if (!ds)
1138 goto out_err;
1139
1140 ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, data->inode);
1141 if (IS_ERR(ds_clnt))
1142 goto out_err;
1143
1144 dprintk("%s ino %lu, how %d cl_count %d\n", __func__,
1145 data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count));
1146 data->commit_done_cb = filelayout_commit_done_cb;
1147 atomic_inc(&ds->ds_clp->cl_count);
1148 data->ds_clp = ds->ds_clp;
1149 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1150 if (fh)
1151 data->args.fh = fh;
1152 return nfs_initiate_commit(ds_clnt, data,
1153 &filelayout_commit_call_ops, how,
1154 RPC_TASK_SOFTCONN);
1155out_err:
1156 prepare_to_resend_writes(data);
1157 filelayout_commit_release(data);
1158 return -EAGAIN;
1159}
1160
1161static int
1162transfer_commit_list(struct list_head *src, struct list_head *dst,
1163 struct nfs_commit_info *cinfo, int max)
1164{
1165 struct nfs_page *req, *tmp;
1166 int ret = 0;
1167
1168 list_for_each_entry_safe(req, tmp, src, wb_list) {
1169 if (!nfs_lock_request(req))
1170 continue;
1171 kref_get(&req->wb_kref);
1172 if (cond_resched_lock(cinfo->lock))
1173 list_safe_reset_next(req, tmp, wb_list);
1174 nfs_request_remove_commit_list(req, cinfo);
1175 clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1176 nfs_list_add_request(req, dst);
1177 ret++;
1178 if ((ret == max) && !cinfo->dreq)
1179 break;
1180 }
1181 return ret;
1182}
1183
1184/* Note called with cinfo->lock held. */
1185static int
1186filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
1187 struct nfs_commit_info *cinfo,
1188 int max)
1189{
1190 struct list_head *src = &bucket->written;
1191 struct list_head *dst = &bucket->committing;
1192 int ret;
1193
1194 ret = transfer_commit_list(src, dst, cinfo, max);
1195 if (ret) {
1196 cinfo->ds->nwritten -= ret;
1197 cinfo->ds->ncommitting += ret;
1198 bucket->clseg = bucket->wlseg;
1199 if (list_empty(src))
1200 bucket->wlseg = NULL;
1201 else
1202 pnfs_get_lseg(bucket->clseg);
1203 }
1204 return ret;
1205}
1206
1207/* Move reqs from written to committing lists, returning count of number moved.
1208 * Note called with cinfo->lock held.
1209 */
1210static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo,
1211 int max)
1212{
1213 int i, rv = 0, cnt;
1214
1215 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
1216 cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i],
1217 cinfo, max);
1218 max -= cnt;
1219 rv += cnt;
1220 }
1221 return rv;
1222}
1223
1224/* Pull everything off the committing lists and dump into @dst */
1225static void filelayout_recover_commit_reqs(struct list_head *dst,
1226 struct nfs_commit_info *cinfo)
1227{
1228 struct pnfs_commit_bucket *b;
1229 struct pnfs_layout_segment *freeme;
1230 int i;
1231
1232restart:
1233 spin_lock(cinfo->lock);
1234 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1235 if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
1236 freeme = b->wlseg;
1237 b->wlseg = NULL;
1238 spin_unlock(cinfo->lock);
1239 pnfs_put_lseg(freeme);
1240 goto restart;
1241 }
1242 }
1243 cinfo->ds->nwritten = 0;
1244 spin_unlock(cinfo->lock);
1245}
1246
1247static unsigned int
1248alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1249{
1250 struct pnfs_ds_commit_info *fl_cinfo;
1251 struct pnfs_commit_bucket *bucket;
1252 struct nfs_commit_data *data;
1253 int i, j;
1254 unsigned int nreq = 0;
1255 struct pnfs_layout_segment *freeme;
1256
1257 fl_cinfo = cinfo->ds;
1258 bucket = fl_cinfo->buckets;
1259 for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
1260 if (list_empty(&bucket->committing))
1261 continue;
1262 data = nfs_commitdata_alloc();
1263 if (!data)
1264 break;
1265 data->ds_commit_index = i;
1266 spin_lock(cinfo->lock);
1267 data->lseg = bucket->clseg;
1268 bucket->clseg = NULL;
1269 spin_unlock(cinfo->lock);
1270 list_add(&data->pages, list);
1271 nreq++;
1272 }
1273
1274 /* Clean up on error */
1275 for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) {
1276 if (list_empty(&bucket->committing))
1277 continue;
1278 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1279 spin_lock(cinfo->lock);
1280 freeme = bucket->clseg;
1281 bucket->clseg = NULL;
1282 spin_unlock(cinfo->lock);
1283 pnfs_put_lseg(freeme);
1284 }
1285 /* Caller will clean up entries put on list */
1286 return nreq;
1287}
1288
1289/* This follows nfs_commit_list pretty closely */
1290static int
1291filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1292 int how, struct nfs_commit_info *cinfo)
1293{
1294 struct nfs_commit_data *data, *tmp;
1295 LIST_HEAD(list);
1296 unsigned int nreq = 0;
1297
1298 if (!list_empty(mds_pages)) {
1299 data = nfs_commitdata_alloc();
1300 if (data != NULL) {
1301 data->lseg = NULL;
1302 list_add(&data->pages, &list);
1303 nreq++;
1304 } else
1305 nfs_retry_commit(mds_pages, NULL, cinfo);
1306 }
1307
1308 nreq += alloc_ds_commits(cinfo, &list);
1309
1310 if (nreq == 0) {
1311 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1312 goto out;
1313 }
1314
1315 atomic_add(nreq, &cinfo->mds->rpcs_out);
1316
1317 list_for_each_entry_safe(data, tmp, &list, pages) {
1318 list_del_init(&data->pages);
1319 if (!data->lseg) {
1320 nfs_init_commit(data, mds_pages, NULL, cinfo);
1321 nfs_initiate_commit(NFS_CLIENT(inode), data,
1322 data->mds_ops, how, 0);
1323 } else {
1324 struct pnfs_commit_bucket *buckets;
1325
1326 buckets = cinfo->ds->buckets;
1327 nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo);
1328 filelayout_initiate_commit(data, how);
1329 }
1330 }
1331out:
1332 cinfo->ds->ncommitting = 0;
1333 return PNFS_ATTEMPTED;
1334}
1335
1336static void
1337filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
1338{
1339 nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
1340}
1341
1342static struct pnfs_layout_hdr *
1343filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
1344{
1345 struct nfs4_filelayout *flo;
1346
1347 flo = kzalloc(sizeof(*flo), gfp_flags);
1348 return flo != NULL ? &flo->generic_hdr : NULL;
1349}
1350
/* Free the file-layout header that embeds the generic header @lo. */
static void
filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct nfs4_filelayout *flo = FILELAYOUT_FROM_HDR(lo);

	kfree(flo);
}
1356
1357static struct pnfs_ds_commit_info *
1358filelayout_get_ds_info(struct inode *inode)
1359{
1360 struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
1361
1362 if (layout == NULL)
1363 return NULL;
1364 else
1365 return &FILELAYOUT_FROM_HDR(layout)->commit_info;
1366}
1367
1368static struct pnfs_layoutdriver_type filelayout_type = {
1369 .id = LAYOUT_NFSV4_1_FILES,
1370 .name = "LAYOUT_NFSV4_1_FILES",
1371 .owner = THIS_MODULE,
1372 .alloc_layout_hdr = filelayout_alloc_layout_hdr,
1373 .free_layout_hdr = filelayout_free_layout_hdr,
1374 .alloc_lseg = filelayout_alloc_lseg,
1375 .free_lseg = filelayout_free_lseg,
1376 .pg_read_ops = &filelayout_pg_read_ops,
1377 .pg_write_ops = &filelayout_pg_write_ops,
1378 .get_ds_info = &filelayout_get_ds_info,
1379 .mark_request_commit = filelayout_mark_request_commit,
1380 .clear_request_commit = filelayout_clear_request_commit,
1381 .scan_commit_lists = filelayout_scan_commit_lists,
1382 .recover_commit_reqs = filelayout_recover_commit_reqs,
1383 .commit_pagelist = filelayout_commit_pagelist,
1384 .read_pagelist = filelayout_read_pagelist,
1385 .write_pagelist = filelayout_write_pagelist,
1386 .free_deviceid_node = filelayout_free_deveiceid_node,
1387};
1388
1389static int __init nfs4filelayout_init(void)
1390{
1391 printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
1392 __func__);
1393 return pnfs_register_layoutdriver(&filelayout_type);
1394}
1395
1396static void __exit nfs4filelayout_exit(void)
1397{
1398 printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
1399 __func__);
1400 pnfs_unregister_layoutdriver(&filelayout_type);
1401}
1402
1403MODULE_ALIAS("nfs-layouttype4-1");
1404
1405module_init(nfs4filelayout_init);
1406module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
new file mode 100644
index 000000000000..ffbddf2219ea
--- /dev/null
+++ b/fs/nfs/filelayout/filelayout.h
@@ -0,0 +1,156 @@
1/*
2 * NFSv4 file layout driver data structures.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#ifndef FS_NFS_NFS4FILELAYOUT_H
31#define FS_NFS_NFS4FILELAYOUT_H
32
33#include "../pnfs.h"
34
/*
 * Default data server connection timeout and retrans values.
 * Set by module parameters dataserver_timeo and dataserver_retrans.
 */
39#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
40#define NFS4_DEF_DS_RETRANS 5
41
42/*
43 * Field testing shows we need to support up to 4096 stripe indices.
44 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
45 * reasonable. This in turn means we support a maximum of 256
46 * RFC 5661 multipath_list4 structures.
47 */
48#define NFS4_PNFS_MAX_STRIPE_CNT 4096
49#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
50
51/* error codes for internal use */
52#define NFS4ERR_RESET_TO_MDS 12001
53
/* Stripe layout variants, stored in nfs4_filelayout_segment.stripe_type. */
enum stripetype4 {
	STRIPE_SPARSE	= 1,
	STRIPE_DENSE	= 2,
};
58
59/* Individual ip address */
60struct nfs4_pnfs_ds_addr {
61 struct sockaddr_storage da_addr;
62 size_t da_addrlen;
63 struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
64 char *da_remotestr; /* human readable addr+port */
65};
66
67struct nfs4_pnfs_ds {
68 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
69 char *ds_remotestr; /* comma sep list of addrs */
70 struct list_head ds_addrs;
71 struct nfs_client *ds_clp;
72 atomic_t ds_count;
73 unsigned long ds_state;
74#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
75};
76
77struct nfs4_file_layout_dsaddr {
78 struct nfs4_deviceid_node id_node;
79 u32 stripe_count;
80 u8 *stripe_indices;
81 u32 ds_num;
82 struct nfs4_pnfs_ds *ds_list[1];
83};
84
85struct nfs4_filelayout_segment {
86 struct pnfs_layout_segment generic_hdr;
87 u32 stripe_type;
88 u32 commit_through_mds;
89 u32 stripe_unit;
90 u32 first_stripe_index;
91 u64 pattern_offset;
92 struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
93 unsigned int num_fh;
94 struct nfs_fh **fh_array;
95};
96
97struct nfs4_filelayout {
98 struct pnfs_layout_hdr generic_hdr;
99 struct pnfs_ds_commit_info commit_info;
100};
101
102static inline struct nfs4_filelayout *
103FILELAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
104{
105 return container_of(lo, struct nfs4_filelayout, generic_hdr);
106}
107
108static inline struct nfs4_filelayout_segment *
109FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
110{
111 return container_of(lseg,
112 struct nfs4_filelayout_segment,
113 generic_hdr);
114}
115
116static inline struct nfs4_deviceid_node *
117FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
118{
119 return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
120}
121
122static inline void
123filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node)
124{
125 u32 *p = (u32 *)&node->deviceid;
126
127 printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n",
128 p[0], p[1], p[2], p[3]);
129
130 set_bit(NFS_DEVICEID_INVALID, &node->flags);
131}
132
133static inline bool
134filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
135{
136 return test_bit(NFS_DEVICEID_INVALID, &node->flags);
137}
138
139extern bool
140filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);
141
142extern struct nfs_fh *
143nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
144
145extern void print_ds(struct nfs4_pnfs_ds *ds);
146u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
149 u32 ds_idx);
150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
152struct nfs4_file_layout_dsaddr *
153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
154 struct rpc_cred *cred, gfp_t gfp_flags);
155
156#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
new file mode 100644
index 000000000000..44bf0140a4c7
--- /dev/null
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -0,0 +1,843 @@
1/*
2 * Device operations for the pnfs nfs4 file layout driver.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 * Garth Goodson <Garth.Goodson@netapp.com>
10 *
11 * Permission is granted to use, copy, create derivative works, and
12 * redistribute this software and such derivative works for any purpose,
13 * so long as the name of the University of Michigan is not used in
14 * any advertising or publicity pertaining to the use or distribution
15 * of this software without specific, written prior authorization. If
16 * the above copyright notice or any other identification of the
17 * University of Michigan is included in any copy of any portion of
18 * this software, then the disclaimer below must also be included.
19 *
20 * This software is provided as is, without representation or warranty
21 * of any kind either express or implied, including without limitation
22 * the implied warranties of merchantability, fitness for a particular
23 * purpose, or noninfringement. The Regents of the University of
24 * Michigan shall not be liable for any damages, including special,
25 * indirect, incidental, or consequential damages, with respect to any
26 * claim arising out of or in connection with the use of the software,
27 * even if it has been or is hereafter advised of the possibility of
28 * such damages.
29 */
30
31#include <linux/nfs_fs.h>
32#include <linux/vmalloc.h>
33#include <linux/module.h>
34#include <linux/sunrpc/addr.h>
35
36#include "../internal.h"
37#include "../nfs4session.h"
38#include "filelayout.h"
39
40#define NFSDBG_FACILITY NFSDBG_PNFS_LD
41
42static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
43static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
44
45/*
46 * Data server cache
47 *
48 * Data servers can be mapped to different device ids.
49 * nfs4_pnfs_ds reference counting
50 * - set to 1 on allocation
51 * - incremented when a device id maps a data server already in the cache.
52 * - decremented when deviceid is removed from the cache.
53 */
54static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
55static LIST_HEAD(nfs4_data_server_cache);
56
57/* Debug routines */
58void
59print_ds(struct nfs4_pnfs_ds *ds)
60{
61 if (ds == NULL) {
62 printk("%s NULL device\n", __func__);
63 return;
64 }
65 printk(" ds %s\n"
66 " ref count %d\n"
67 " client %p\n"
68 " cl_exchange_flags %x\n",
69 ds->ds_remotestr,
70 atomic_read(&ds->ds_count), ds->ds_clp,
71 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
72}
73
74static bool
75same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
76{
77 struct sockaddr_in *a, *b;
78 struct sockaddr_in6 *a6, *b6;
79
80 if (addr1->sa_family != addr2->sa_family)
81 return false;
82
83 switch (addr1->sa_family) {
84 case AF_INET:
85 a = (struct sockaddr_in *)addr1;
86 b = (struct sockaddr_in *)addr2;
87
88 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
89 a->sin_port == b->sin_port)
90 return true;
91 break;
92
93 case AF_INET6:
94 a6 = (struct sockaddr_in6 *)addr1;
95 b6 = (struct sockaddr_in6 *)addr2;
96
97 /* LINKLOCAL addresses must have matching scope_id */
98 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
99 IPV6_ADDR_SCOPE_LINKLOCAL &&
100 a6->sin6_scope_id != b6->sin6_scope_id)
101 return false;
102
103 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
104 a6->sin6_port == b6->sin6_port)
105 return true;
106 break;
107
108 default:
109 dprintk("%s: unhandled address family: %u\n",
110 __func__, addr1->sa_family);
111 return false;
112 }
113
114 return false;
115}
116
117static bool
118_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
119 const struct list_head *dsaddrs2)
120{
121 struct nfs4_pnfs_ds_addr *da1, *da2;
122
123 /* step through both lists, comparing as we go */
124 for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
125 da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
126 da1 != NULL && da2 != NULL;
127 da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
128 da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
129 if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
130 (struct sockaddr *)&da2->da_addr))
131 return false;
132 }
133 if (da1 == NULL && da2 == NULL)
134 return true;
135
136 return false;
137}
138
139/*
140 * Lookup DS by addresses. nfs4_ds_cache_lock is held
141 */
142static struct nfs4_pnfs_ds *
143_data_server_lookup_locked(const struct list_head *dsaddrs)
144{
145 struct nfs4_pnfs_ds *ds;
146
147 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
148 if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
149 return ds;
150 return NULL;
151}
152
153/*
154 * Create an rpc connection to the nfs4_pnfs_ds data server
155 * Currently only supports IPv4 and IPv6 addresses
156 */
157static int
158nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
159{
160 struct nfs_client *clp = ERR_PTR(-EIO);
161 struct nfs4_pnfs_ds_addr *da;
162 int status = 0;
163
164 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
165 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
166
167 list_for_each_entry(da, &ds->ds_addrs, da_node) {
168 dprintk("%s: DS %s: trying address %s\n",
169 __func__, ds->ds_remotestr, da->da_remotestr);
170
171 clp = nfs4_set_ds_client(mds_srv->nfs_client,
172 (struct sockaddr *)&da->da_addr,
173 da->da_addrlen, IPPROTO_TCP,
174 dataserver_timeo, dataserver_retrans);
175 if (!IS_ERR(clp))
176 break;
177 }
178
179 if (IS_ERR(clp)) {
180 status = PTR_ERR(clp);
181 goto out;
182 }
183
184 status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
185 if (status)
186 goto out_put;
187
188 smp_wmb();
189 ds->ds_clp = clp;
190 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
191out:
192 return status;
193out_put:
194 nfs_put_client(clp);
195 goto out;
196}
197
198static void
199destroy_ds(struct nfs4_pnfs_ds *ds)
200{
201 struct nfs4_pnfs_ds_addr *da;
202
203 dprintk("--> %s\n", __func__);
204 ifdebug(FACILITY)
205 print_ds(ds);
206
207 if (ds->ds_clp)
208 nfs_put_client(ds->ds_clp);
209
210 while (!list_empty(&ds->ds_addrs)) {
211 da = list_first_entry(&ds->ds_addrs,
212 struct nfs4_pnfs_ds_addr,
213 da_node);
214 list_del_init(&da->da_node);
215 kfree(da->da_remotestr);
216 kfree(da);
217 }
218
219 kfree(ds->ds_remotestr);
220 kfree(ds);
221}
222
223void
224nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
225{
226 struct nfs4_pnfs_ds *ds;
227 int i;
228
229 nfs4_print_deviceid(&dsaddr->id_node.deviceid);
230
231 for (i = 0; i < dsaddr->ds_num; i++) {
232 ds = dsaddr->ds_list[i];
233 if (ds != NULL) {
234 if (atomic_dec_and_lock(&ds->ds_count,
235 &nfs4_ds_cache_lock)) {
236 list_del_init(&ds->ds_node);
237 spin_unlock(&nfs4_ds_cache_lock);
238 destroy_ds(ds);
239 }
240 }
241 }
242 kfree(dsaddr->stripe_indices);
243 kfree(dsaddr);
244}
245
246/*
247 * Create a string with a human readable address and port to avoid
248 * complicated setup around many dprinks.
249 */
250static char *
251nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
252{
253 struct nfs4_pnfs_ds_addr *da;
254 char *remotestr;
255 size_t len;
256 char *p;
257
258 len = 3; /* '{', '}' and eol */
259 list_for_each_entry(da, dsaddrs, da_node) {
260 len += strlen(da->da_remotestr) + 1; /* string plus comma */
261 }
262
263 remotestr = kzalloc(len, gfp_flags);
264 if (!remotestr)
265 return NULL;
266
267 p = remotestr;
268 *(p++) = '{';
269 len--;
270 list_for_each_entry(da, dsaddrs, da_node) {
271 size_t ll = strlen(da->da_remotestr);
272
273 if (ll > len)
274 goto out_err;
275
276 memcpy(p, da->da_remotestr, ll);
277 p += ll;
278 len -= ll;
279
280 if (len < 1)
281 goto out_err;
282 (*p++) = ',';
283 len--;
284 }
285 if (len < 2)
286 goto out_err;
287 *(p++) = '}';
288 *p = '\0';
289 return remotestr;
290out_err:
291 kfree(remotestr);
292 return NULL;
293}
294
295static struct nfs4_pnfs_ds *
296nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
297{
298 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
299 char *remotestr;
300
301 if (list_empty(dsaddrs)) {
302 dprintk("%s: no addresses defined\n", __func__);
303 goto out;
304 }
305
306 ds = kzalloc(sizeof(*ds), gfp_flags);
307 if (!ds)
308 goto out;
309
310 /* this is only used for debugging, so it's ok if its NULL */
311 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
312
313 spin_lock(&nfs4_ds_cache_lock);
314 tmp_ds = _data_server_lookup_locked(dsaddrs);
315 if (tmp_ds == NULL) {
316 INIT_LIST_HEAD(&ds->ds_addrs);
317 list_splice_init(dsaddrs, &ds->ds_addrs);
318 ds->ds_remotestr = remotestr;
319 atomic_set(&ds->ds_count, 1);
320 INIT_LIST_HEAD(&ds->ds_node);
321 ds->ds_clp = NULL;
322 list_add(&ds->ds_node, &nfs4_data_server_cache);
323 dprintk("%s add new data server %s\n", __func__,
324 ds->ds_remotestr);
325 } else {
326 kfree(remotestr);
327 kfree(ds);
328 atomic_inc(&tmp_ds->ds_count);
329 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
330 __func__, tmp_ds->ds_remotestr,
331 atomic_read(&tmp_ds->ds_count));
332 ds = tmp_ds;
333 }
334 spin_unlock(&nfs4_ds_cache_lock);
335out:
336 return ds;
337}
338
339/*
340 * Currently only supports ipv4, ipv6 and one multi-path address.
341 */
342static struct nfs4_pnfs_ds_addr *
343decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
344{
345 struct nfs4_pnfs_ds_addr *da = NULL;
346 char *buf, *portstr;
347 __be16 port;
348 int nlen, rlen;
349 int tmp[2];
350 __be32 *p;
351 char *netid, *match_netid;
352 size_t len, match_netid_len;
353 char *startsep = "";
354 char *endsep = "";
355
356
357 /* r_netid */
358 p = xdr_inline_decode(streamp, 4);
359 if (unlikely(!p))
360 goto out_err;
361 nlen = be32_to_cpup(p++);
362
363 p = xdr_inline_decode(streamp, nlen);
364 if (unlikely(!p))
365 goto out_err;
366
367 netid = kmalloc(nlen+1, gfp_flags);
368 if (unlikely(!netid))
369 goto out_err;
370
371 netid[nlen] = '\0';
372 memcpy(netid, p, nlen);
373
374 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
375 p = xdr_inline_decode(streamp, 4);
376 if (unlikely(!p))
377 goto out_free_netid;
378 rlen = be32_to_cpup(p);
379
380 p = xdr_inline_decode(streamp, rlen);
381 if (unlikely(!p))
382 goto out_free_netid;
383
384 /* port is ".ABC.DEF", 8 chars max */
385 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
386 dprintk("%s: Invalid address, length %d\n", __func__,
387 rlen);
388 goto out_free_netid;
389 }
390 buf = kmalloc(rlen + 1, gfp_flags);
391 if (!buf) {
392 dprintk("%s: Not enough memory\n", __func__);
393 goto out_free_netid;
394 }
395 buf[rlen] = '\0';
396 memcpy(buf, p, rlen);
397
398 /* replace port '.' with '-' */
399 portstr = strrchr(buf, '.');
400 if (!portstr) {
401 dprintk("%s: Failed finding expected dot in port\n",
402 __func__);
403 goto out_free_buf;
404 }
405 *portstr = '-';
406
407 /* find '.' between address and port */
408 portstr = strrchr(buf, '.');
409 if (!portstr) {
410 dprintk("%s: Failed finding expected dot between address and "
411 "port\n", __func__);
412 goto out_free_buf;
413 }
414 *portstr = '\0';
415
416 da = kzalloc(sizeof(*da), gfp_flags);
417 if (unlikely(!da))
418 goto out_free_buf;
419
420 INIT_LIST_HEAD(&da->da_node);
421
422 if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
423 sizeof(da->da_addr))) {
424 dprintk("%s: error parsing address %s\n", __func__, buf);
425 goto out_free_da;
426 }
427
428 portstr++;
429 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
430 port = htons((tmp[0] << 8) | (tmp[1]));
431
432 switch (da->da_addr.ss_family) {
433 case AF_INET:
434 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
435 da->da_addrlen = sizeof(struct sockaddr_in);
436 match_netid = "tcp";
437 match_netid_len = 3;
438 break;
439
440 case AF_INET6:
441 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
442 da->da_addrlen = sizeof(struct sockaddr_in6);
443 match_netid = "tcp6";
444 match_netid_len = 4;
445 startsep = "[";
446 endsep = "]";
447 break;
448
449 default:
450 dprintk("%s: unsupported address family: %u\n",
451 __func__, da->da_addr.ss_family);
452 goto out_free_da;
453 }
454
455 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
456 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
457 __func__, netid, match_netid);
458 goto out_free_da;
459 }
460
461 /* save human readable address */
462 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
463 da->da_remotestr = kzalloc(len, gfp_flags);
464
465 /* NULL is ok, only used for dprintk */
466 if (da->da_remotestr)
467 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
468 buf, endsep, ntohs(port));
469
470 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
471 kfree(buf);
472 kfree(netid);
473 return da;
474
475out_free_da:
476 kfree(da);
477out_free_buf:
478 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
479 kfree(buf);
480out_free_netid:
481 kfree(netid);
482out_err:
483 return NULL;
484}
485
486/* Decode opaque device data and return the result */
487static struct nfs4_file_layout_dsaddr*
488decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
489{
490 int i;
491 u32 cnt, num;
492 u8 *indexp;
493 __be32 *p;
494 u8 *stripe_indices;
495 u8 max_stripe_index;
496 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
497 struct xdr_stream stream;
498 struct xdr_buf buf;
499 struct page *scratch;
500 struct list_head dsaddrs;
501 struct nfs4_pnfs_ds_addr *da;
502
503 /* set up xdr stream */
504 scratch = alloc_page(gfp_flags);
505 if (!scratch)
506 goto out_err;
507
508 xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
509 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
510
511 /* Get the stripe count (number of stripe index) */
512 p = xdr_inline_decode(&stream, 4);
513 if (unlikely(!p))
514 goto out_err_free_scratch;
515
516 cnt = be32_to_cpup(p);
517 dprintk("%s stripe count %d\n", __func__, cnt);
518 if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
519 printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
520 "supported maximum %d\n", __func__,
521 cnt, NFS4_PNFS_MAX_STRIPE_CNT);
522 goto out_err_free_scratch;
523 }
524
525 /* read stripe indices */
526 stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags);
527 if (!stripe_indices)
528 goto out_err_free_scratch;
529
530 p = xdr_inline_decode(&stream, cnt << 2);
531 if (unlikely(!p))
532 goto out_err_free_stripe_indices;
533
534 indexp = &stripe_indices[0];
535 max_stripe_index = 0;
536 for (i = 0; i < cnt; i++) {
537 *indexp = be32_to_cpup(p++);
538 max_stripe_index = max(max_stripe_index, *indexp);
539 indexp++;
540 }
541
542 /* Check the multipath list count */
543 p = xdr_inline_decode(&stream, 4);
544 if (unlikely(!p))
545 goto out_err_free_stripe_indices;
546
547 num = be32_to_cpup(p);
548 dprintk("%s ds_num %u\n", __func__, num);
549 if (num > NFS4_PNFS_MAX_MULTI_CNT) {
550 printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
551 "supported maximum %d\n", __func__,
552 num, NFS4_PNFS_MAX_MULTI_CNT);
553 goto out_err_free_stripe_indices;
554 }
555
556 /* validate stripe indices are all < num */
557 if (max_stripe_index >= num) {
558 printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
559 __func__, max_stripe_index, num);
560 goto out_err_free_stripe_indices;
561 }
562
563 dsaddr = kzalloc(sizeof(*dsaddr) +
564 (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
565 gfp_flags);
566 if (!dsaddr)
567 goto out_err_free_stripe_indices;
568
569 dsaddr->stripe_count = cnt;
570 dsaddr->stripe_indices = stripe_indices;
571 stripe_indices = NULL;
572 dsaddr->ds_num = num;
573 nfs4_init_deviceid_node(&dsaddr->id_node,
574 NFS_SERVER(ino)->pnfs_curr_ld,
575 NFS_SERVER(ino)->nfs_client,
576 &pdev->dev_id);
577
578 INIT_LIST_HEAD(&dsaddrs);
579
580 for (i = 0; i < dsaddr->ds_num; i++) {
581 int j;
582 u32 mp_count;
583
584 p = xdr_inline_decode(&stream, 4);
585 if (unlikely(!p))
586 goto out_err_free_deviceid;
587
588 mp_count = be32_to_cpup(p); /* multipath count */
589 for (j = 0; j < mp_count; j++) {
590 da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
591 &stream, gfp_flags);
592 if (da)
593 list_add_tail(&da->da_node, &dsaddrs);
594 }
595 if (list_empty(&dsaddrs)) {
596 dprintk("%s: no suitable DS addresses found\n",
597 __func__);
598 goto out_err_free_deviceid;
599 }
600
601 dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
602 if (!dsaddr->ds_list[i])
603 goto out_err_drain_dsaddrs;
604
605 /* If DS was already in cache, free ds addrs */
606 while (!list_empty(&dsaddrs)) {
607 da = list_first_entry(&dsaddrs,
608 struct nfs4_pnfs_ds_addr,
609 da_node);
610 list_del_init(&da->da_node);
611 kfree(da->da_remotestr);
612 kfree(da);
613 }
614 }
615
616 __free_page(scratch);
617 return dsaddr;
618
619out_err_drain_dsaddrs:
620 while (!list_empty(&dsaddrs)) {
621 da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
622 da_node);
623 list_del_init(&da->da_node);
624 kfree(da->da_remotestr);
625 kfree(da);
626 }
627out_err_free_deviceid:
628 nfs4_fl_free_deviceid(dsaddr);
629 /* stripe_indicies was part of dsaddr */
630 goto out_err_free_scratch;
631out_err_free_stripe_indices:
632 kfree(stripe_indices);
633out_err_free_scratch:
634 __free_page(scratch);
635out_err:
636 dprintk("%s ERROR: returning NULL\n", __func__);
637 return NULL;
638}
639
640/*
641 * Decode the opaque device specified in 'dev' and add it to the cache of
642 * available devices.
643 */
644static struct nfs4_file_layout_dsaddr *
645decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
646{
647 struct nfs4_deviceid_node *d;
648 struct nfs4_file_layout_dsaddr *n, *new;
649
650 new = decode_device(inode, dev, gfp_flags);
651 if (!new) {
652 printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
653 __func__);
654 return NULL;
655 }
656
657 d = nfs4_insert_deviceid_node(&new->id_node);
658 n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
659 if (n != new) {
660 nfs4_fl_free_deviceid(new);
661 return n;
662 }
663
664 return new;
665}
666
667/*
668 * Retrieve the information for dev_id, add it to the list
669 * of available devices, and return it.
670 */
671struct nfs4_file_layout_dsaddr *
672filelayout_get_device_info(struct inode *inode,
673 struct nfs4_deviceid *dev_id,
674 struct rpc_cred *cred,
675 gfp_t gfp_flags)
676{
677 struct pnfs_device *pdev = NULL;
678 u32 max_resp_sz;
679 int max_pages;
680 struct page **pages = NULL;
681 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
682 int rc, i;
683 struct nfs_server *server = NFS_SERVER(inode);
684
685 /*
686 * Use the session max response size as the basis for setting
687 * GETDEVICEINFO's maxcount
688 */
689 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
690 max_pages = nfs_page_array_len(0, max_resp_sz);
691 dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
692 __func__, inode, max_resp_sz, max_pages);
693
694 pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
695 if (pdev == NULL)
696 return NULL;
697
698 pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags);
699 if (pages == NULL) {
700 kfree(pdev);
701 return NULL;
702 }
703 for (i = 0; i < max_pages; i++) {
704 pages[i] = alloc_page(gfp_flags);
705 if (!pages[i])
706 goto out_free;
707 }
708
709 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
710 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
711 pdev->pages = pages;
712 pdev->pgbase = 0;
713 pdev->pglen = max_resp_sz;
714 pdev->mincount = 0;
715 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
716
717 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
718 dprintk("%s getdevice info returns %d\n", __func__, rc);
719 if (rc)
720 goto out_free;
721
722 /*
723 * Found new device, need to decode it and then add it to the
724 * list of known devices for this mountpoint.
725 */
726 dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
727out_free:
728 for (i = 0; i < max_pages; i++)
729 __free_page(pages[i]);
730 kfree(pages);
731 kfree(pdev);
732 dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
733 return dsaddr;
734}
735
736void
737nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
738{
739 nfs4_put_deviceid_node(&dsaddr->id_node);
740}
741
742/*
743 * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
744 * Then: ((res + fsi) % dsaddr->stripe_count)
745 */
746u32
747nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
748{
749 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
750 u64 tmp;
751
752 tmp = offset - flseg->pattern_offset;
753 do_div(tmp, flseg->stripe_unit);
754 tmp += flseg->first_stripe_index;
755 return do_div(tmp, flseg->dsaddr->stripe_count);
756}
757
758u32
759nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
760{
761 return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
762}
763
764struct nfs_fh *
765nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
766{
767 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
768 u32 i;
769
770 if (flseg->stripe_type == STRIPE_SPARSE) {
771 if (flseg->num_fh == 1)
772 i = 0;
773 else if (flseg->num_fh == 0)
774 /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
775 return NULL;
776 else
777 i = nfs4_fl_calc_ds_index(lseg, j);
778 } else
779 i = j;
780 return flseg->fh_array[i];
781}
782
783static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
784{
785 might_sleep();
786 wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
787 nfs_wait_bit_killable, TASK_KILLABLE);
788}
789
790static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
791{
792 smp_mb__before_atomic();
793 clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
794 smp_mb__after_atomic();
795 wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
796}
797
798
799struct nfs4_pnfs_ds *
800nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
801{
802 struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
803 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
804 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
805 struct nfs4_pnfs_ds *ret = ds;
806
807 if (ds == NULL) {
808 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
809 __func__, ds_idx);
810 filelayout_mark_devid_invalid(devid);
811 goto out;
812 }
813 smp_rmb();
814 if (ds->ds_clp)
815 goto out_test_devid;
816
817 if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
818 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
819 int err;
820
821 err = nfs4_ds_connect(s, ds);
822 if (err)
823 nfs4_mark_deviceid_unavailable(devid);
824 nfs4_clear_ds_conn_bit(ds);
825 } else {
826 /* Either ds is connected, or ds is NULL */
827 nfs4_wait_ds_connect(ds);
828 }
829out_test_devid:
830 if (filelayout_test_devid_unavailable(devid))
831 ret = NULL;
832out:
833 return ret;
834}
835
836module_param(dataserver_retrans, uint, 0644);
837MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
838 "retries a request before it attempts further "
839 " recovery action.");
840module_param(dataserver_timeo, uint, 0644);
841MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
842 "NFSv4.1 client waits for a response from a "
843 " data server before it retries an NFS request.");