diff options
author | David Howells <dhowells@redhat.com> | 2018-10-19 19:57:59 -0400 |
---|---|---|
committer | David Howells <dhowells@redhat.com> | 2018-10-23 19:41:09 -0400 |
commit | 3bf0fb6f33dd545693da5e65f5b1b9b9f0bfc35e (patch) | |
tree | df215e6a6ad11b6ac8158461144667e168591d28 /fs/afs/rotate.c | |
parent | 18ac61853cc4e44eb30e125fc8344a3b25c7b6fe (diff) |
afs: Probe multiple fileservers simultaneously
Send probes to all the unprobed fileservers in a fileserver list on all
addresses simultaneously in an attempt to find out the fastest route whilst
not getting stuck for 20s on any server or address that we don't get a
reply from.
This alleviates the problem whereby attempting to access a new server can
take a long time because the rotation algorithm ends up rotating through
all servers and addresses until it finds one that responds.
Signed-off-by: David Howells <dhowells@redhat.com>
Diffstat (limited to 'fs/afs/rotate.c')
-rw-r--r-- | fs/afs/rotate.c | 174 |
1 file changed, 114 insertions, 60 deletions
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 7c4487781637..00504254c1c2 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c | |||
@@ -19,14 +19,6 @@ | |||
19 | #include "afs_fs.h" | 19 | #include "afs_fs.h" |
20 | 20 | ||
21 | /* | 21 | /* |
22 | * Initialise a filesystem server cursor for iterating over FS servers. | ||
23 | */ | ||
24 | static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) | ||
25 | { | ||
26 | memset(fc, 0, sizeof(*fc)); | ||
27 | } | ||
28 | |||
29 | /* | ||
30 | * Begin an operation on the fileserver. | 22 | * Begin an operation on the fileserver. |
31 | * | 23 | * |
32 | * Fileserver operations are serialised on the server by vnode, so we serialise | 24 | * Fileserver operations are serialised on the server by vnode, so we serialise |
@@ -35,7 +27,7 @@ static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode | |||
35 | bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode, | 27 | bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode, |
36 | struct key *key) | 28 | struct key *key) |
37 | { | 29 | { |
38 | afs_init_fs_cursor(fc, vnode); | 30 | memset(fc, 0, sizeof(*fc)); |
39 | fc->vnode = vnode; | 31 | fc->vnode = vnode; |
40 | fc->key = key; | 32 | fc->key = key; |
41 | fc->ac.error = SHRT_MAX; | 33 | fc->ac.error = SHRT_MAX; |
@@ -66,12 +58,15 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, | |||
66 | fc->server_list = afs_get_serverlist(vnode->volume->servers); | 58 | fc->server_list = afs_get_serverlist(vnode->volume->servers); |
67 | read_unlock(&vnode->volume->servers_lock); | 59 | read_unlock(&vnode->volume->servers_lock); |
68 | 60 | ||
61 | fc->untried = (1UL << fc->server_list->nr_servers) - 1; | ||
62 | fc->index = READ_ONCE(fc->server_list->preferred); | ||
63 | |||
69 | cbi = vnode->cb_interest; | 64 | cbi = vnode->cb_interest; |
70 | if (cbi) { | 65 | if (cbi) { |
71 | /* See if the vnode's preferred record is still available */ | 66 | /* See if the vnode's preferred record is still available */ |
72 | for (i = 0; i < fc->server_list->nr_servers; i++) { | 67 | for (i = 0; i < fc->server_list->nr_servers; i++) { |
73 | if (fc->server_list->servers[i].cb_interest == cbi) { | 68 | if (fc->server_list->servers[i].cb_interest == cbi) { |
74 | fc->start = i; | 69 | fc->index = i; |
75 | goto found_interest; | 70 | goto found_interest; |
76 | } | 71 | } |
77 | } | 72 | } |
@@ -95,12 +90,9 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, | |||
95 | 90 | ||
96 | afs_put_cb_interest(afs_v2net(vnode), cbi); | 91 | afs_put_cb_interest(afs_v2net(vnode), cbi); |
97 | cbi = NULL; | 92 | cbi = NULL; |
98 | } else { | ||
99 | fc->start = READ_ONCE(fc->server_list->index); | ||
100 | } | 93 | } |
101 | 94 | ||
102 | found_interest: | 95 | found_interest: |
103 | fc->index = fc->start; | ||
104 | return true; | 96 | return true; |
105 | } | 97 | } |
106 | 98 | ||
@@ -144,11 +136,12 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) | |||
144 | struct afs_addr_list *alist; | 136 | struct afs_addr_list *alist; |
145 | struct afs_server *server; | 137 | struct afs_server *server; |
146 | struct afs_vnode *vnode = fc->vnode; | 138 | struct afs_vnode *vnode = fc->vnode; |
147 | int error = fc->ac.error; | 139 | u32 rtt, abort_code; |
140 | int error = fc->ac.error, i; | ||
148 | 141 | ||
149 | _enter("%u/%u,%u/%u,%d,%d", | 142 | _enter("%lx[%d],%lx[%d],%d,%d", |
150 | fc->index, fc->start, | 143 | fc->untried, fc->index, |
151 | fc->ac.index, fc->ac.start, | 144 | fc->ac.tried, fc->ac.index, |
152 | error, fc->ac.abort_code); | 145 | error, fc->ac.abort_code); |
153 | 146 | ||
154 | if (fc->flags & AFS_FS_CURSOR_STOP) { | 147 | if (fc->flags & AFS_FS_CURSOR_STOP) { |
@@ -345,8 +338,50 @@ start: | |||
345 | if (!afs_start_fs_iteration(fc, vnode)) | 338 | if (!afs_start_fs_iteration(fc, vnode)) |
346 | goto failed; | 339 | goto failed; |
347 | 340 | ||
348 | use_server: | 341 | _debug("__ VOL %llx __", vnode->volume->vid); |
349 | _debug("use"); | 342 | error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list); |
343 | if (error < 0) | ||
344 | goto failed_set_error; | ||
345 | |||
346 | pick_server: | ||
347 | _debug("pick [%lx]", fc->untried); | ||
348 | |||
349 | error = afs_wait_for_fs_probes(fc->server_list, fc->untried); | ||
350 | if (error < 0) | ||
351 | goto failed_set_error; | ||
352 | |||
353 | /* Pick the untried server with the lowest RTT. If we have outstanding | ||
354 | * callbacks, we stick with the server we're already using if we can. | ||
355 | */ | ||
356 | if (fc->cbi) { | ||
357 | _debug("cbi %u", fc->index); | ||
358 | if (test_bit(fc->index, &fc->untried)) | ||
359 | goto selected_server; | ||
360 | afs_put_cb_interest(afs_v2net(vnode), fc->cbi); | ||
361 | fc->cbi = NULL; | ||
362 | _debug("nocbi"); | ||
363 | } | ||
364 | |||
365 | fc->index = -1; | ||
366 | rtt = U32_MAX; | ||
367 | for (i = 0; i < fc->server_list->nr_servers; i++) { | ||
368 | struct afs_server *s = fc->server_list->servers[i].server; | ||
369 | |||
370 | if (!test_bit(i, &fc->untried) || !s->probe.responded) | ||
371 | continue; | ||
372 | if (s->probe.rtt < rtt) { | ||
373 | fc->index = i; | ||
374 | rtt = s->probe.rtt; | ||
375 | } | ||
376 | } | ||
377 | |||
378 | if (fc->index == -1) | ||
379 | goto no_more_servers; | ||
380 | |||
381 | selected_server: | ||
382 | _debug("use %d", fc->index); | ||
383 | __clear_bit(fc->index, &fc->untried); | ||
384 | |||
350 | /* We're starting on a different fileserver from the list. We need to | 385 | /* We're starting on a different fileserver from the list. We need to |
351 | * check it, create a callback intercept, find its address list and | 386 | * check it, create a callback intercept, find its address list and |
352 | * probe its capabilities before we use it. | 387 | * probe its capabilities before we use it. |
@@ -379,60 +414,81 @@ use_server: | |||
379 | 414 | ||
380 | memset(&fc->ac, 0, sizeof(fc->ac)); | 415 | memset(&fc->ac, 0, sizeof(fc->ac)); |
381 | 416 | ||
382 | /* Probe the current fileserver if we haven't done so yet. */ | ||
383 | if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) { | ||
384 | fc->ac.alist = afs_get_addrlist(alist); | ||
385 | |||
386 | if (!afs_probe_fileserver(fc)) { | ||
387 | switch (fc->ac.error) { | ||
388 | case -ENOMEM: | ||
389 | case -ERESTARTSYS: | ||
390 | case -EINTR: | ||
391 | goto failed; | ||
392 | default: | ||
393 | goto next_server; | ||
394 | } | ||
395 | } | ||
396 | } | ||
397 | |||
398 | if (!fc->ac.alist) | 417 | if (!fc->ac.alist) |
399 | fc->ac.alist = alist; | 418 | fc->ac.alist = alist; |
400 | else | 419 | else |
401 | afs_put_addrlist(alist); | 420 | afs_put_addrlist(alist); |
402 | 421 | ||
403 | fc->ac.start = READ_ONCE(alist->index); | 422 | fc->ac.index = -1; |
404 | fc->ac.index = fc->ac.start; | ||
405 | 423 | ||
406 | iterate_address: | 424 | iterate_address: |
407 | ASSERT(fc->ac.alist); | 425 | ASSERT(fc->ac.alist); |
408 | _debug("iterate %d/%d", fc->ac.index, fc->ac.alist->nr_addrs); | ||
409 | /* Iterate over the current server's address list to try and find an | 426 | /* Iterate over the current server's address list to try and find an |
410 | * address on which it will respond to us. | 427 | * address on which it will respond to us. |
411 | */ | 428 | */ |
412 | if (!afs_iterate_addresses(&fc->ac)) | 429 | if (!afs_iterate_addresses(&fc->ac)) |
413 | goto next_server; | 430 | goto next_server; |
414 | 431 | ||
432 | _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs); | ||
433 | |||
415 | _leave(" = t"); | 434 | _leave(" = t"); |
416 | return true; | 435 | return true; |
417 | 436 | ||
418 | next_server: | 437 | next_server: |
419 | _debug("next"); | 438 | _debug("next"); |
420 | afs_end_cursor(&fc->ac); | 439 | afs_end_cursor(&fc->ac); |
421 | afs_put_cb_interest(afs_v2net(vnode), fc->cbi); | 440 | goto pick_server; |
422 | fc->cbi = NULL; | ||
423 | fc->index++; | ||
424 | if (fc->index >= fc->server_list->nr_servers) | ||
425 | fc->index = 0; | ||
426 | if (fc->index != fc->start) | ||
427 | goto use_server; | ||
428 | 441 | ||
442 | no_more_servers: | ||
429 | /* That's all the servers poked to no good effect. Try again if some | 443 | /* That's all the servers poked to no good effect. Try again if some |
430 | * of them were busy. | 444 | * of them were busy. |
431 | */ | 445 | */ |
432 | if (fc->flags & AFS_FS_CURSOR_VBUSY) | 446 | if (fc->flags & AFS_FS_CURSOR_VBUSY) |
433 | goto restart_from_beginning; | 447 | goto restart_from_beginning; |
434 | 448 | ||
435 | goto failed; | 449 | abort_code = 0; |
450 | error = -EDESTADDRREQ; | ||
451 | for (i = 0; i < fc->server_list->nr_servers; i++) { | ||
452 | struct afs_server *s = fc->server_list->servers[i].server; | ||
453 | int probe_error = READ_ONCE(s->probe.error); | ||
454 | |||
455 | switch (probe_error) { | ||
456 | case 0: | ||
457 | continue; | ||
458 | default: | ||
459 | if (error == -ETIMEDOUT || | ||
460 | error == -ETIME) | ||
461 | continue; | ||
462 | case -ETIMEDOUT: | ||
463 | case -ETIME: | ||
464 | if (error == -ENOMEM || | ||
465 | error == -ENONET) | ||
466 | continue; | ||
467 | case -ENOMEM: | ||
468 | case -ENONET: | ||
469 | if (error == -ENETUNREACH) | ||
470 | continue; | ||
471 | case -ENETUNREACH: | ||
472 | if (error == -EHOSTUNREACH) | ||
473 | continue; | ||
474 | case -EHOSTUNREACH: | ||
475 | if (error == -ECONNREFUSED) | ||
476 | continue; | ||
477 | case -ECONNREFUSED: | ||
478 | if (error == -ECONNRESET) | ||
479 | continue; | ||
480 | case -ECONNRESET: /* Responded, but call expired. */ | ||
481 | if (error == -ECONNABORTED) | ||
482 | continue; | ||
483 | case -ECONNABORTED: | ||
484 | abort_code = s->probe.abort_code; | ||
485 | error = probe_error; | ||
486 | continue; | ||
487 | } | ||
488 | } | ||
489 | |||
490 | if (error == -ECONNABORTED) | ||
491 | error = afs_abort_to_error(abort_code); | ||
436 | 492 | ||
437 | failed_set_error: | 493 | failed_set_error: |
438 | fc->error = error; | 494 | fc->error = error; |
@@ -480,8 +536,7 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) | |||
480 | 536 | ||
481 | memset(&fc->ac, 0, sizeof(fc->ac)); | 537 | memset(&fc->ac, 0, sizeof(fc->ac)); |
482 | fc->ac.alist = alist; | 538 | fc->ac.alist = alist; |
483 | fc->ac.start = READ_ONCE(alist->index); | 539 | fc->ac.index = -1; |
484 | fc->ac.index = fc->ac.start; | ||
485 | goto iterate_address; | 540 | goto iterate_address; |
486 | 541 | ||
487 | case 0: | 542 | case 0: |
@@ -538,13 +593,13 @@ static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc) | |||
538 | pr_notice("EDESTADDR occurred\n"); | 593 | pr_notice("EDESTADDR occurred\n"); |
539 | pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n", | 594 | pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n", |
540 | fc->cb_break, fc->cb_break_2, fc->flags, fc->error); | 595 | fc->cb_break, fc->cb_break_2, fc->flags, fc->error); |
541 | pr_notice("FC: st=%u ix=%u ni=%u\n", | 596 | pr_notice("FC: ut=%lx ix=%d ni=%u\n", |
542 | fc->start, fc->index, fc->nr_iterations); | 597 | fc->untried, fc->index, fc->nr_iterations); |
543 | 598 | ||
544 | if (fc->server_list) { | 599 | if (fc->server_list) { |
545 | const struct afs_server_list *sl = fc->server_list; | 600 | const struct afs_server_list *sl = fc->server_list; |
546 | pr_notice("FC: SL nr=%u ix=%u vnov=%hx\n", | 601 | pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n", |
547 | sl->nr_servers, sl->index, sl->vnovol_mask); | 602 | sl->nr_servers, sl->preferred, sl->vnovol_mask); |
548 | for (i = 0; i < sl->nr_servers; i++) { | 603 | for (i = 0; i < sl->nr_servers; i++) { |
549 | const struct afs_server *s = sl->servers[i].server; | 604 | const struct afs_server *s = sl->servers[i].server; |
550 | pr_notice("FC: server fl=%lx av=%u %pU\n", | 605 | pr_notice("FC: server fl=%lx av=%u %pU\n", |
@@ -552,22 +607,21 @@ static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc) | |||
552 | if (s->addresses) { | 607 | if (s->addresses) { |
553 | const struct afs_addr_list *a = | 608 | const struct afs_addr_list *a = |
554 | rcu_dereference(s->addresses); | 609 | rcu_dereference(s->addresses); |
555 | pr_notice("FC: - av=%u nr=%u/%u/%u ax=%u\n", | 610 | pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n", |
556 | a->version, | 611 | a->version, |
557 | a->nr_ipv4, a->nr_addrs, a->max_addrs, | 612 | a->nr_ipv4, a->nr_addrs, a->max_addrs, |
558 | a->index); | 613 | a->preferred); |
559 | pr_notice("FC: - pr=%lx yf=%lx\n", | 614 | pr_notice("FC: - pr=%lx R=%lx F=%lx\n", |
560 | a->probed, a->yfs); | 615 | a->probed, a->responded, a->failed); |
561 | if (a == fc->ac.alist) | 616 | if (a == fc->ac.alist) |
562 | pr_notice("FC: - current\n"); | 617 | pr_notice("FC: - current\n"); |
563 | } | 618 | } |
564 | } | 619 | } |
565 | } | 620 | } |
566 | 621 | ||
567 | pr_notice("AC: as=%u ax=%u ac=%d er=%d b=%u r=%u ni=%u\n", | 622 | pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", |
568 | fc->ac.start, fc->ac.index, fc->ac.abort_code, fc->ac.error, | 623 | fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error, |
569 | fc->ac.begun, fc->ac.responded, fc->ac.nr_iterations); | 624 | fc->ac.responded, fc->ac.nr_iterations); |
570 | |||
571 | rcu_read_unlock(); | 625 | rcu_read_unlock(); |
572 | } | 626 | } |
573 | 627 | ||