author	Linus Torvalds <torvalds@linux-foundation.org>	2010-06-04 18:37:44 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-06-04 18:37:44 -0400
commit	d2dd328b7f7bc6cebe167648289337755944ad2a (patch)
tree	5d664a2db1ac209f7537452ddc02597972f7aa37
parent	c1518f12bab97a6d409a25aaccb02dc8895800f3 (diff)
parent	1abec4fdbb142e3ccb6ce99832fae42129134a96 (diff)
Merge branch 'for-linus' of git://git.kernel.dk/linux-2.6-block
* 'for-linus' of git://git.kernel.dk/linux-2.6-block: (27 commits)
  block: make blk_init_free_list and elevator_init idempotent
  block: avoid unconditionally freeing previously allocated request_queue
  pipe: change /proc/sys/fs/pipe-max-pages to byte sized interface
  pipe: change the privilege required for growing a pipe beyond system max
  pipe: adjust minimum pipe size to 1 page
  block: disable preemption before using sched_clock()
  cciss: call BUG() earlier
  Preparing 8.3.8rc2
  drbd: Reduce verbosity
  drbd: use drbd specific ratelimit instead of global printk_ratelimit
  drbd: fix hang on local read errors while disconnected
  drbd: Removed the now empty w_io_error() function
  drbd: removed duplicated #includes
  drbd: improve usage of MSG_MORE
  drbd: need to set socket bufsize early to take effect
  drbd: improve network latency, TCP_QUICKACK
  drbd: Revert "drbd: Create new current UUID as late as possible"
  brd: support discard
  Revert "writeback: fix WB_SYNC_NONE writeback from umount"
  Revert "writeback: ensure that WB_SYNC_NONE writeback with sb pinned is sync"
  ...
-rw-r--r--	block/blk-core.c	20
-rw-r--r--	block/cfq-iosched.c	101
-rw-r--r--	block/elevator.c	8
-rw-r--r--	drivers/block/brd.c	53
-rw-r--r--	drivers/block/cciss_scsi.c	2
-rw-r--r--	drivers/block/drbd/drbd_int.h	14
-rw-r--r--	drivers/block/drbd/drbd_main.c	68
-rw-r--r--	drivers/block/drbd/drbd_receiver.c	45
-rw-r--r--	drivers/block/drbd/drbd_req.c	54
-rw-r--r--	drivers/block/drbd/drbd_req.h	1
-rw-r--r--	drivers/block/drbd/drbd_worker.c	24
-rw-r--r--	fs/fs-writeback.c	64
-rw-r--r--	fs/pipe.c	77
-rw-r--r--	fs/splice.c	2
-rw-r--r--	fs/sync.c	2
-rw-r--r--	include/linux/backing-dev.h	2
-rw-r--r--	include/linux/blkdev.h	9
-rw-r--r--	include/linux/drbd.h	2
-rw-r--r--	include/linux/iocontext.h	1
-rw-r--r--	include/linux/pipe_fs_i.h	4
-rw-r--r--	include/linux/writeback.h	10
-rw-r--r--	kernel/sysctl.c	8
-rw-r--r--	mm/page-writeback.c	4
23 files changed, 311 insertions(+), 264 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 3bc5579d6f54..f84cce42fc58 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -467,6 +467,9 @@ static int blk_init_free_list(struct request_queue *q)
 {
 	struct request_list *rl = &q->rq;
 
+	if (unlikely(rl->rq_pool))
+		return 0;
+
 	rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
 	rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
 	rl->elvpriv = 0;
@@ -570,9 +573,17 @@ EXPORT_SYMBOL(blk_init_queue);
 struct request_queue *
 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 {
-	struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+	struct request_queue *uninit_q, *q;
+
+	uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+	if (!uninit_q)
+		return NULL;
+
+	q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
+	if (!q)
+		blk_cleanup_queue(uninit_q);
 
-	return blk_init_allocated_queue_node(q, rfn, lock, node_id);
+	return q;
 }
 EXPORT_SYMBOL(blk_init_queue_node);
 
@@ -592,10 +603,8 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
 		return NULL;
 
 	q->node = node_id;
-	if (blk_init_free_list(q)) {
-		kmem_cache_free(blk_requestq_cachep, q);
+	if (blk_init_free_list(q))
 		return NULL;
-	}
 
 	q->request_fn		= rfn;
 	q->prep_rq_fn		= NULL;
@@ -618,7 +627,6 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
 		return q;
 	}
 
-	blk_put_queue(q);
 	return NULL;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue_node);
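The two blk-core hunks above implement "avoid unconditionally freeing previously allocated request_queue": blk_init_allocated_queue_node() no longer frees a queue it did not allocate, while blk_init_queue_node() releases the queue it allocated itself if initialization fails. A minimal standalone C sketch of that ownership rule, with hypothetical names (queue_init, queue_create) rather than the kernel API:

#include <stdlib.h>

struct queue {
	void *pool;	/* set once by queue_init(), like q->rq.rq_pool */
};

/* idempotent init, mirroring the new rq_pool guard in blk_init_free_list() */
static int queue_init(struct queue *q)
{
	if (q->pool)		/* already initialized: succeed without side effects */
		return 0;
	q->pool = malloc(64);
	return q->pool ? 0 : -1;
}

/* whoever allocates the object is the one who frees it on failure */
static struct queue *queue_create(void)
{
	struct queue *uninit_q = calloc(1, sizeof(*uninit_q));

	if (!uninit_q)
		return NULL;
	if (queue_init(uninit_q)) {
		free(uninit_q);	/* analogous to blk_cleanup_queue(uninit_q) */
		return NULL;
	}
	return uninit_q;
}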
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ed897b5ef315..5ff4f4850e71 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -64,6 +64,9 @@ static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
 static struct completion *ioc_gone;
 static DEFINE_SPINLOCK(ioc_gone_lock);
 
+static DEFINE_SPINLOCK(cic_index_lock);
+static DEFINE_IDA(cic_index_ida);
+
 #define CFQ_PRIO_LISTS		IOPRIO_BE_NR
 #define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
 #define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
@@ -271,6 +274,7 @@ struct cfq_data {
 	unsigned int cfq_latency;
 	unsigned int cfq_group_isolation;
 
+	unsigned int cic_index;
 	struct list_head cic_list;
 
 	/*
@@ -430,6 +434,24 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
 	cic->cfqq[is_sync] = cfqq;
 }
 
+#define CIC_DEAD_KEY	1ul
+#define CIC_DEAD_INDEX_SHIFT	1
+
+static inline void *cfqd_dead_key(struct cfq_data *cfqd)
+{
+	return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
+}
+
+static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
+{
+	struct cfq_data *cfqd = cic->key;
+
+	if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
+		return NULL;
+
+	return cfqd;
+}
+
 /*
  * We regard a request as SYNC, if it's either a read or has the SYNC bit
  * set (in which case it could also be direct WRITE).
@@ -2510,11 +2532,12 @@ static void cfq_cic_free(struct cfq_io_context *cic)
 static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
 {
 	unsigned long flags;
+	unsigned long dead_key = (unsigned long) cic->key;
 
-	BUG_ON(!cic->dead_key);
+	BUG_ON(!(dead_key & CIC_DEAD_KEY));
 
 	spin_lock_irqsave(&ioc->lock, flags);
-	radix_tree_delete(&ioc->radix_root, cic->dead_key);
+	radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
 	hlist_del_rcu(&cic->cic_list);
 	spin_unlock_irqrestore(&ioc->lock, flags);
 
@@ -2537,15 +2560,10 @@ static void cfq_free_io_context(struct io_context *ioc)
 	__call_for_each_cic(ioc, cic_free_func);
 }
 
-static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+static void cfq_put_cooperator(struct cfq_queue *cfqq)
 {
 	struct cfq_queue *__cfqq, *next;
 
-	if (unlikely(cfqq == cfqd->active_queue)) {
-		__cfq_slice_expired(cfqd, cfqq, 0);
-		cfq_schedule_dispatch(cfqd);
-	}
-
 	/*
 	 * If this queue was scheduled to merge with another queue, be
 	 * sure to drop the reference taken on that queue (and others in
@@ -2561,6 +2579,16 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 		cfq_put_queue(__cfqq);
 		__cfqq = next;
 	}
+}
+
+static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	if (unlikely(cfqq == cfqd->active_queue)) {
+		__cfq_slice_expired(cfqd, cfqq, 0);
+		cfq_schedule_dispatch(cfqd);
+	}
+
+	cfq_put_cooperator(cfqq);
 
 	cfq_put_queue(cfqq);
 }
@@ -2573,11 +2601,10 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
 	list_del_init(&cic->queue_list);
 
 	/*
-	 * Make sure key == NULL is seen for dead queues
+	 * Make sure dead mark is seen for dead queues
 	 */
 	smp_wmb();
-	cic->dead_key = (unsigned long) cic->key;
-	cic->key = NULL;
+	cic->key = cfqd_dead_key(cfqd);
 
 	if (ioc->ioc_data == cic)
 		rcu_assign_pointer(ioc->ioc_data, NULL);
@@ -2596,7 +2623,7 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
 static void cfq_exit_single_io_context(struct io_context *ioc,
 				       struct cfq_io_context *cic)
 {
-	struct cfq_data *cfqd = cic->key;
+	struct cfq_data *cfqd = cic_to_cfqd(cic);
 
 	if (cfqd) {
 		struct request_queue *q = cfqd->queue;
@@ -2609,7 +2636,7 @@ static void cfq_exit_single_io_context(struct io_context *ioc,
 		 * race between exiting task and queue
 		 */
 		smp_read_barrier_depends();
-		if (cic->key)
+		if (cic->key == cfqd)
 			__cfq_exit_single_io_context(cfqd, cic);
 
 		spin_unlock_irqrestore(q->queue_lock, flags);
@@ -2689,7 +2716,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 
 static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
 {
-	struct cfq_data *cfqd = cic->key;
+	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *cfqq;
 	unsigned long flags;
 
@@ -2746,7 +2773,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
 {
 	struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
-	struct cfq_data *cfqd = cic->key;
+	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	unsigned long flags;
 	struct request_queue *q;
 
@@ -2883,12 +2910,13 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
 	unsigned long flags;
 
 	WARN_ON(!list_empty(&cic->queue_list));
+	BUG_ON(cic->key != cfqd_dead_key(cfqd));
 
 	spin_lock_irqsave(&ioc->lock, flags);
 
 	BUG_ON(ioc->ioc_data == cic);
 
-	radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd);
+	radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
 	hlist_del_rcu(&cic->cic_list);
 	spin_unlock_irqrestore(&ioc->lock, flags);
 
@@ -2900,7 +2928,6 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 {
 	struct cfq_io_context *cic;
 	unsigned long flags;
-	void *k;
 
 	if (unlikely(!ioc))
 		return NULL;
@@ -2917,13 +2944,11 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 	}
 
 	do {
-		cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
+		cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
 		rcu_read_unlock();
 		if (!cic)
 			break;
-		/* ->key must be copied to avoid race with cfq_exit_queue() */
-		k = cic->key;
-		if (unlikely(!k)) {
+		if (unlikely(cic->key != cfqd)) {
 			cfq_drop_dead_cic(cfqd, ioc, cic);
 			rcu_read_lock();
 			continue;
@@ -2956,7 +2981,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
 
 		spin_lock_irqsave(&ioc->lock, flags);
 		ret = radix_tree_insert(&ioc->radix_root,
-						(unsigned long) cfqd, cic);
+						cfqd->cic_index, cic);
 		if (!ret)
 			hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
 		spin_unlock_irqrestore(&ioc->lock, flags);
@@ -3516,6 +3541,9 @@ split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
 	}
 
 	cic_set_cfqq(cic, NULL, 1);
+
+	cfq_put_cooperator(cfqq);
+
 	cfq_put_queue(cfqq);
 	return NULL;
 }
@@ -3708,10 +3736,32 @@ static void cfq_exit_queue(struct elevator_queue *e)
 
 	cfq_shutdown_timer_wq(cfqd);
 
+	spin_lock(&cic_index_lock);
+	ida_remove(&cic_index_ida, cfqd->cic_index);
+	spin_unlock(&cic_index_lock);
+
 	/* Wait for cfqg->blkg->key accessors to exit their grace periods. */
 	call_rcu(&cfqd->rcu, cfq_cfqd_free);
 }
 
+static int cfq_alloc_cic_index(void)
+{
+	int index, error;
+
+	do {
+		if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
+			return -ENOMEM;
+
+		spin_lock(&cic_index_lock);
+		error = ida_get_new(&cic_index_ida, &index);
+		spin_unlock(&cic_index_lock);
+		if (error && error != -EAGAIN)
+			return error;
+	} while (error);
+
+	return index;
+}
+
 static void *cfq_init_queue(struct request_queue *q)
 {
 	struct cfq_data *cfqd;
@@ -3719,10 +3769,16 @@ static void *cfq_init_queue(struct request_queue *q)
 	struct cfq_group *cfqg;
 	struct cfq_rb_root *st;
 
+	i = cfq_alloc_cic_index();
+	if (i < 0)
+		return NULL;
+
 	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
 	if (!cfqd)
 		return NULL;
 
+	cfqd->cic_index = i;
+
 	/* Init root service tree */
 	cfqd->grp_service_tree = CFQ_RB_ROOT;
 
@@ -3984,6 +4040,7 @@ static void __exit cfq_exit(void)
 	 */
 	if (elv_ioc_count_read(cfq_ioc_count))
 		wait_for_completion(&all_gone);
+	ida_destroy(&cic_index_ida);
 	cfq_slab_kill();
 }
 
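The cic_index changes above stop using the cfq_data pointer itself as the per-io_context radix tree key and use a small IDA-allocated integer instead; cfqd_dead_key() then packs that integer plus a low tag bit into the pointer-sized cic->key field, so one word encodes either a live pointer or a "dead" index. A standalone sketch of that low-bit tagging (illustrative constants, not the kernel code; it relies on live pointers being at least 2-byte aligned so their low bit is always clear):

#include <assert.h>
#include <stdint.h>

#define DEAD_KEY		1ul	/* low bit marks a dead entry */
#define DEAD_INDEX_SHIFT	1

static void *make_dead_key(unsigned int index)
{
	return (void *)(((uintptr_t)index << DEAD_INDEX_SHIFT) | DEAD_KEY);
}

static int key_is_dead(const void *key)
{
	return (uintptr_t)key & DEAD_KEY;	/* live pointers have bit 0 clear */
}

static unsigned int dead_key_index(const void *key)
{
	assert(key_is_dead(key));
	return (uintptr_t)key >> DEAD_INDEX_SHIFT;
}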
diff --git a/block/elevator.c b/block/elevator.c
index 6df2b5056b51..923a9139106c 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -242,9 +242,11 @@ int elevator_init(struct request_queue *q, char *name)
 {
 	struct elevator_type *e = NULL;
 	struct elevator_queue *eq;
-	int ret = 0;
 	void *data;
 
+	if (unlikely(q->elevator))
+		return 0;
+
 	INIT_LIST_HEAD(&q->queue_head);
 	q->last_merge = NULL;
 	q->end_sector = 0;
@@ -284,7 +286,7 @@ int elevator_init(struct request_queue *q, char *name)
 	}
 
 	elevator_attach(q, eq, data);
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(elevator_init);
 
@@ -1097,7 +1099,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
 	struct elevator_type *__e;
 	int len = 0;
 
-	if (!q->elevator)
+	if (!q->elevator || !blk_queue_stackable(q))
 		return sprintf(name, "none\n");
 
 	elv = e->elevator_type;
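Together with the rq_pool guard in blk_init_free_list(), the new q->elevator check makes elevator_init() idempotent, which is what lets blk_init_allocated_queue_node() be re-run safely on a queue that was already set up. A tiny sketch (hypothetical types) of why the guard has to come before any state is reset:

struct elevator;

struct queue {
	struct elevator *elevator;
	int end_sector;
};

static int elevator_init_once(struct queue *q, struct elevator *e)
{
	if (q->elevator)	/* re-entry: report success, touch nothing */
		return 0;

	q->end_sector = 0;	/* safe only because the queue is known fresh */
	q->elevator = e;
	return 0;
}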
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 6081e81d5738..f1bf79d9bc0a 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -133,6 +133,28 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
 	return page;
 }
 
+static void brd_free_page(struct brd_device *brd, sector_t sector)
+{
+	struct page *page;
+	pgoff_t idx;
+
+	spin_lock(&brd->brd_lock);
+	idx = sector >> PAGE_SECTORS_SHIFT;
+	page = radix_tree_delete(&brd->brd_pages, idx);
+	spin_unlock(&brd->brd_lock);
+	if (page)
+		__free_page(page);
+}
+
+static void brd_zero_page(struct brd_device *brd, sector_t sector)
+{
+	struct page *page;
+
+	page = brd_lookup_page(brd, sector);
+	if (page)
+		clear_highpage(page);
+}
+
 /*
  * Free all backing store pages and radix tree. This must only be called when
  * there are no other users of the device.
@@ -189,6 +211,24 @@ static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
 	return 0;
 }
 
+static void discard_from_brd(struct brd_device *brd,
+			sector_t sector, size_t n)
+{
+	while (n >= PAGE_SIZE) {
+		/*
+		 * Don't want to actually discard pages here because
+		 * re-allocating the pages can result in writeback
+		 * deadlocks under heavy load.
+		 */
+		if (0)
+			brd_free_page(brd, sector);
+		else
+			brd_zero_page(brd, sector);
+		sector += PAGE_SIZE >> SECTOR_SHIFT;
+		n -= PAGE_SIZE;
+	}
+}
+
 /*
  * Copy n bytes from src to the brd starting at sector. Does not sleep.
  */
@@ -300,6 +340,12 @@ static int brd_make_request(struct request_queue *q, struct bio *bio)
 					get_capacity(bdev->bd_disk))
 		goto out;
 
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) {
+		err = 0;
+		discard_from_brd(brd, sector, bio->bi_size);
+		goto out;
+	}
+
 	rw = bio_rw(bio);
 	if (rw == READA)
 		rw = READ;
@@ -320,7 +366,7 @@ out:
 }
 
 #ifdef CONFIG_BLK_DEV_XIP
-static int brd_direct_access (struct block_device *bdev, sector_t sector,
+static int brd_direct_access(struct block_device *bdev, sector_t sector,
 			void **kaddr, unsigned long *pfn)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
@@ -437,6 +483,11 @@ static struct brd_device *brd_alloc(int i)
 	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
 	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
 
+	brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
+	brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
+	brd->brd_queue->limits.discard_zeroes_data = 1;
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
+
 	disk = brd->brd_disk = alloc_disk(1 << part_shift);
 	if (!disk)
 		goto out_free_queue;
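brd's new discard path deliberately zeroes backing pages in place instead of freeing them; brd_free_page() is kept but parked behind `if (0)` because re-allocating a page on the next write can deadlock under writeback pressure, as the in-code comment explains. Zeroing is also what justifies advertising discard_zeroes_data = 1. A userspace-flavored sketch of the loop, with assumed page/sector constants:

#include <stddef.h>
#include <string.h>

#define PAGE_SIZE	4096u
#define SECTOR_SHIFT	9	/* 512-byte sectors, 8 per page */

typedef unsigned char *(*lookup_fn)(unsigned long sector);

static void discard_range(lookup_fn lookup_page, unsigned long sector, size_t n)
{
	while (n >= PAGE_SIZE) {
		unsigned char *page = lookup_page(sector);

		if (page)			/* brd_zero_page() analogue */
			memset(page, 0, PAGE_SIZE);
		sector += PAGE_SIZE >> SECTOR_SHIFT;
		n -= PAGE_SIZE;
	}
}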
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index e1d0e2cfec72..3381505c8a6c 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -188,11 +188,11 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *cmd)
 
 	sa = h->scsi_ctlr;
 	stk = &sa->cmd_stack;
+	stk->top++;
 	if (stk->top >= CMD_STACK_SIZE) {
 		printk("cciss: scsi_cmd_free called too many times.\n");
 		BUG();
 	}
-	stk->top++;
 	stk->elem[stk->top] = (struct cciss_scsi_cmd_stack_elem_t *) cmd;
 }
 
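The cciss change is a pure ordering fix: the old code bounds-checked the stale pre-increment stk->top, so a full stack passed the check, the increment ran, and the store landed one element past the array before BUG() could ever fire. Checking the post-increment index, the one actually used for the store, traps first. Reduced to its essence (illustrative size):

#include <assert.h>

#define STACK_SIZE 16

struct stack {
	int top;		/* index of the last valid element, -1 if empty */
	void *elem[STACK_SIZE];
};

static void stack_push(struct stack *stk, void *cmd)
{
	stk->top++;
	assert(stk->top < STACK_SIZE);	/* check the index the store will use */
	stk->elem[stk->top] = cmd;
}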
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e9654c8d5b62..485ed8c7d623 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -943,8 +943,7 @@ struct drbd_conf {
 	struct drbd_work resync_work,
 			  unplug_work,
 			  md_sync_work,
-			  delay_probe_work,
-			  uuid_work;
+			  delay_probe_work;
 	struct timer_list resync_timer;
 	struct timer_list md_sync_timer;
 	struct timer_list delay_probe_timer;
@@ -1069,7 +1068,6 @@ struct drbd_conf {
 	struct timeval dps_time; /* delay-probes-start-time */
 	unsigned int dp_volume_last;  /* send_cnt of last delay probe */
 	int c_sync_rate; /* current resync rate after delay_probe magic */
-	atomic_t new_c_uuid;
 };
 
 static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1476,7 +1474,6 @@ extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
 extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
 extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
 extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
-extern int w_io_error(struct drbd_conf *, struct drbd_work *, int);
 extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
 extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
 extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
@@ -1542,7 +1539,7 @@ static inline void drbd_tcp_nodelay(struct socket *sock)
 
 static inline void drbd_tcp_quickack(struct socket *sock)
 {
-	int __user val = 1;
+	int __user val = 2;
 	(void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
 			(char __user *)&val, sizeof(val));
 }
@@ -1728,7 +1725,7 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
 	switch (mdev->ldev->dc.on_io_error) {
 	case EP_PASS_ON:
 		if (!forcedetach) {
-			if (printk_ratelimit())
+			if (__ratelimit(&drbd_ratelimit_state))
 				dev_err(DEV, "Local IO failed in %s."
 					"Passing error on...\n", where);
 			break;
@@ -2219,8 +2216,6 @@ static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
 		return 0;
 	if (test_bit(BITMAP_IO, &mdev->flags))
 		return 0;
-	if (atomic_read(&mdev->new_c_uuid))
-		return 0;
 	return 1;
 }
 
@@ -2241,9 +2236,6 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
 	 * to avoid races with the reconnect code,
 	 * we need to atomic_inc within the spinlock. */
 
-	if (atomic_read(&mdev->new_c_uuid) && atomic_add_unless(&mdev->new_c_uuid, -1, 1))
-		drbd_queue_work_front(&mdev->data.work, &mdev->uuid_work);
-
 	spin_lock_irq(&mdev->req_lock);
 	while (!__inc_ap_bio_cond(mdev)) {
 		prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
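Replacing printk_ratelimit() with __ratelimit(&drbd_ratelimit_state) gives DRBD its own rate-limit window instead of the single global one shared by every printk_ratelimit() caller, where an unrelated noisy subsystem could suppress DRBD's messages (and vice versa). A simplified userspace sketch of such a per-subsystem limiter (coarse one-second clock, illustrative fields, not the kernel implementation):

#include <stdbool.h>
#include <time.h>

struct ratelimit_state {
	time_t	begin;		/* start of the current window */
	int	interval;	/* window length in seconds */
	int	burst;		/* messages allowed per window */
	int	printed;	/* messages emitted in this window */
};

static bool ratelimit_ok(struct ratelimit_state *rs)
{
	time_t now = time(NULL);

	if (now - rs->begin >= rs->interval) {	/* window expired: reset */
		rs->begin = now;
		rs->printed = 0;
	}
	if (rs->printed >= rs->burst)
		return false;			/* suppress this message */
	rs->printed++;
	return true;
}

Because each subsystem owns its own struct ratelimit_state, its counters never interact with anyone else's.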
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index be2d2da9cdba..6b077f93acc6 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1215,18 +1215,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 			ns.pdsk == D_OUTDATED)) {
 		if (get_ldev(mdev)) {
 			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
-			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE &&
-			    !atomic_read(&mdev->new_c_uuid))
-				atomic_set(&mdev->new_c_uuid, 2);
+			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+				drbd_uuid_new_current(mdev);
+				drbd_send_uuids(mdev);
+			}
 			put_ldev(mdev);
 		}
 	}
 
 	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
-		/* Diskless peer becomes primary or got connected do diskless, primary peer. */
-		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0 &&
-		    !atomic_read(&mdev->new_c_uuid))
-			atomic_set(&mdev->new_c_uuid, 2);
+		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
+			drbd_uuid_new_current(mdev);
 
 		/* D_DISKLESS Peer becomes secondary */
 		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1350,24 +1349,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	drbd_md_sync(mdev);
 }
 
-static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
-	if (get_ldev(mdev)) {
-		if (mdev->ldev->md.uuid[UI_BITMAP] == 0) {
-			drbd_uuid_new_current(mdev);
-			if (get_net_conf(mdev)) {
-				drbd_send_uuids(mdev);
-				put_net_conf(mdev);
-			}
-			drbd_md_sync(mdev);
-		}
-		put_ldev(mdev);
-	}
-	atomic_dec(&mdev->new_c_uuid);
-	wake_up(&mdev->misc_wait);
-
-	return 1;
-}
 
 static int drbd_thread_setup(void *arg)
 {
@@ -2291,9 +2272,9 @@ static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *
  * with page_count == 0 or PageSlab.
  */
 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
-		   int offset, size_t size)
+		   int offset, size_t size, unsigned msg_flags)
 {
-	int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
+	int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
 	kunmap(page);
 	if (sent == size)
 		mdev->send_cnt += size>>9;
@@ -2301,7 +2282,7 @@ static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
 }
 
 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
-		    int offset, size_t size)
+		    int offset, size_t size, unsigned msg_flags)
 {
 	mm_segment_t oldfs = get_fs();
 	int sent, ok;
@@ -2314,14 +2295,15 @@ static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
 	 * __page_cache_release a page that would actually still be referenced
 	 * by someone, leading to some obscure delayed Oops somewhere else. */
 	if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
-		return _drbd_no_send_page(mdev, page, offset, size);
+		return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
 
+	msg_flags |= MSG_NOSIGNAL;
 	drbd_update_congested(mdev);
 	set_fs(KERNEL_DS);
 	do {
 		sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
 							offset, len,
-							MSG_NOSIGNAL);
+							msg_flags);
 		if (sent == -EAGAIN) {
 			if (we_should_drop_the_connection(mdev,
 							  mdev->data.socket))
@@ -2350,9 +2332,11 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
+	/* hint all but last page with MSG_MORE */
 	__bio_for_each_segment(bvec, bio, i, 0) {
 		if (!_drbd_no_send_page(mdev, bvec->bv_page,
-				     bvec->bv_offset, bvec->bv_len))
+				     bvec->bv_offset, bvec->bv_len,
+				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
 			return 0;
 	}
 	return 1;
@@ -2362,12 +2346,13 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
 {
 	struct bio_vec *bvec;
 	int i;
+	/* hint all but last page with MSG_MORE */
 	__bio_for_each_segment(bvec, bio, i, 0) {
 		if (!_drbd_send_page(mdev, bvec->bv_page,
-				     bvec->bv_offset, bvec->bv_len))
+				     bvec->bv_offset, bvec->bv_len,
+				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
 			return 0;
 	}
-
 	return 1;
 }
 
@@ -2375,9 +2360,11 @@ static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 {
 	struct page *page = e->pages;
 	unsigned len = e->size;
+	/* hint all but last page with MSG_MORE */
 	page_chain_for_each(page) {
 		unsigned l = min_t(unsigned, len, PAGE_SIZE);
-		if (!_drbd_send_page(mdev, page, 0, l))
+		if (!_drbd_send_page(mdev, page, 0, l,
+				page_chain_next(page) ? MSG_MORE : 0))
 			return 0;
 		len -= l;
 	}
@@ -2457,11 +2444,11 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
 	p.dp_flags = cpu_to_be32(dp_flags);
 	set_bit(UNPLUG_REMOTE, &mdev->flags);
 	ok = (sizeof(p) ==
-		drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
+		drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
 	if (ok && dgs) {
 		dgb = mdev->int_dig_out;
 		drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
-		ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
+		ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
 	}
 	if (ok) {
 		if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
@@ -2510,11 +2497,11 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
 		return 0;
 
 	ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
-					sizeof(p), MSG_MORE);
+					sizeof(p), dgs ? MSG_MORE : 0);
 	if (ok && dgs) {
 		dgb = mdev->int_dig_out;
 		drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
-		ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
+		ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
 	}
 	if (ok)
 		ok = _drbd_send_zc_ee(mdev, e);
@@ -2708,7 +2695,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	atomic_set(&mdev->net_cnt, 0);
 	atomic_set(&mdev->packet_seq, 0);
 	atomic_set(&mdev->pp_in_use, 0);
-	atomic_set(&mdev->new_c_uuid, 0);
 
 	mutex_init(&mdev->md_io_mutex);
 	mutex_init(&mdev->data.mutex);
@@ -2739,14 +2725,12 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
 	INIT_LIST_HEAD(&mdev->delay_probes);
 	INIT_LIST_HEAD(&mdev->delay_probe_work.list);
-	INIT_LIST_HEAD(&mdev->uuid_work.list);
 
 	mdev->resync_work.cb  = w_resync_inactive;
 	mdev->unplug_work.cb  = w_send_write_hint;
 	mdev->md_sync_work.cb = w_md_sync;
 	mdev->bm_io_work.w.cb = w_bitmap_io;
 	mdev->delay_probe_work.cb = w_delay_probes;
-	mdev->uuid_work.cb = w_new_current_uuid;
 	init_timer(&mdev->resync_timer);
 	init_timer(&mdev->md_sync_timer);
 	init_timer(&mdev->delay_probe_timer);
@@ -3799,7 +3783,7 @@ _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
 	if (ret) {
 		fault_count++;
 
-		if (printk_ratelimit())
+		if (__ratelimit(&drbd_ratelimit_state))
 			dev_warn(DEV, "***Simulating %s failure\n",
 				_drbd_fault_str(type));
 	}
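The msg_flags plumbing above is the "improve usage of MSG_MORE" commit: every part of a multi-part payload except the last is sent with MSG_MORE so TCP can coalesce header, digest and data into fewer segments, and when a digest follows the header it is the digest, not the header, that flushes. A plain BSD-sockets sketch of the same pattern:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* send n buffers, hinting MSG_MORE on all but the last */
static int send_parts(int fd, const struct iovec *iov, int n)
{
	for (int i = 0; i < n; i++) {
		int flags = (i == n - 1) ? 0 : MSG_MORE;

		if (send(fd, iov[i].iov_base, iov[i].iov_len, flags) !=
		    (ssize_t)iov[i].iov_len)
			return -1;
	}
	return 0;
}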
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index bc9ab7fb2cc7..dff48701b84d 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -42,7 +42,6 @@
 #include <linux/unistd.h>
 #include <linux/vmalloc.h>
 #include <linux/random.h>
-#include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/scatterlist.h>
 #include "drbd_int.h"
@@ -571,6 +570,25 @@ static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
 	return rv;
 }
 
+/* quoting tcp(7):
+ *   On individual connections, the socket buffer size must be set prior to the
+ *   listen(2) or connect(2) calls in order to have it take effect.
+ * This is our wrapper to do so.
+ */
+static void drbd_setbufsize(struct socket *sock, unsigned int snd,
+			    unsigned int rcv)
+{
+	/* open coded SO_SNDBUF, SO_RCVBUF */
+	if (snd) {
+		sock->sk->sk_sndbuf = snd;
+		sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+	}
+	if (rcv) {
+		sock->sk->sk_rcvbuf = rcv;
+		sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+	}
+}
+
 static struct socket *drbd_try_connect(struct drbd_conf *mdev)
 {
 	const char *what;
@@ -592,6 +610,8 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev)
 
 	sock->sk->sk_rcvtimeo =
 	sock->sk->sk_sndtimeo =  mdev->net_conf->try_connect_int*HZ;
+	drbd_setbufsize(sock, mdev->net_conf->sndbuf_size,
+			mdev->net_conf->rcvbuf_size);
 
 	/* explicitly bind to the configured IP as source IP
 	 * for the outgoing connections.
@@ -670,6 +690,8 @@ static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
 	s_listen->sk->sk_reuse    = 1; /* SO_REUSEADDR */
 	s_listen->sk->sk_rcvtimeo = timeo;
 	s_listen->sk->sk_sndtimeo = timeo;
+	drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
+			mdev->net_conf->rcvbuf_size);
 
 	what = "bind before listen";
 	err = s_listen->ops->bind(s_listen,
@@ -856,16 +878,6 @@ retry:
 	sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
 	msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
 
-	if (mdev->net_conf->sndbuf_size) {
-		sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size;
-		sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
-	}
-
-	if (mdev->net_conf->rcvbuf_size) {
-		sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size;
-		sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
-	}
-
 	/* NOT YET ...
 	 * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
 	 * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
@@ -1154,17 +1166,6 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
 	unsigned n_bios = 0;
 	unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
 
-	if (atomic_read(&mdev->new_c_uuid)) {
-		if (atomic_add_unless(&mdev->new_c_uuid, -1, 1)) {
-			drbd_uuid_new_current(mdev);
-			drbd_md_sync(mdev);
-
-			atomic_dec(&mdev->new_c_uuid);
-			wake_up(&mdev->misc_wait);
-		}
-		wait_event(mdev->misc_wait, !atomic_read(&mdev->new_c_uuid));
-	}
-
 	/* In most cases, we will only need one bio. But in case the lower
 	 * level restrictions happen to be different at this offset on this
 	 * side than those of the sending peer, we may need to submit the
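drbd_setbufsize() exists because of the tcp(7) rule it quotes: buffer sizes configured after connect(2) or listen(2), as the old code in drbd_connect() did, do not reliably take effect on that connection (the receive window in particular is negotiated at handshake time). The userspace analogue uses setsockopt(2), called before connect()/listen():

#include <sys/socket.h>

static void set_bufsize(int fd, int snd, int rcv)
{
	if (snd)
		setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd));
	if (rcv)
		setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv));
}

/* usage: fd = socket(...); set_bufsize(fd, snd, rcv); then connect() or listen() */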
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3397f11d0ba9..654f1ef5cbb0 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -102,32 +102,7 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
 		}
 	}
 
-	/* if it was a local io error, we want to notify our
-	 * peer about that, and see if we need to
-	 * detach the disk and stuff.
-	 * to avoid allocating some special work
-	 * struct, reuse the request. */
-
-	/* THINK
-	 * why do we do this not when we detect the error,
-	 * but delay it until it is "done", i.e. possibly
-	 * until the next barrier ack? */
-
-	if (rw == WRITE &&
-	    ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
-		if (!(req->w.list.next == LIST_POISON1 ||
-		      list_empty(&req->w.list))) {
-			/* DEBUG ASSERT only; if this triggers, we
-			 * probably corrupt the worker list here */
-			dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next);
-			dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev);
-		}
-		req->w.cb = w_io_error;
-		drbd_queue_work(&mdev->data.work, &req->w);
-		/* drbd_req_free() is done in w_io_error */
-	} else {
-		drbd_req_free(req);
-	}
+	drbd_req_free(req);
 }
 
 static void queue_barrier(struct drbd_conf *mdev)
@@ -453,9 +428,6 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state |= RQ_LOCAL_COMPLETED;
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 
-		dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n",
-		      (unsigned long long)req->sector, req->size);
-		/* and now: check how to handle local io error. */
 		__drbd_chk_io_error(mdev, FALSE);
 		_req_may_be_done(req, m);
 		put_ldev(mdev);
@@ -475,22 +447,21 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state |= RQ_LOCAL_COMPLETED;
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 
-		dev_alert(DEV, "Local READ failed sec=%llus size=%u\n",
-		      (unsigned long long)req->sector, req->size);
-		/* _req_mod(req,to_be_send); oops, recursion... */
 		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
-		req->rq_state |= RQ_NET_PENDING;
-		inc_ap_pending(mdev);
 
 		__drbd_chk_io_error(mdev, FALSE);
 		put_ldev(mdev);
-		/* NOTE: if we have no connection,
-		 * or know the peer has no good data either,
-		 * then we don't actually need to "queue_for_net_read",
-		 * but we do so anyways, since the drbd_io_error()
-		 * and the potential state change to "Diskless"
-		 * needs to be done from process context */
 
+		/* no point in retrying if there is no good remote data,
+		 * or we have no connection. */
+		if (mdev->state.pdsk != D_UP_TO_DATE) {
+			_req_may_be_done(req, m);
+			break;
+		}
+
+		/* _req_mod(req,to_be_send); oops, recursion... */
+		req->rq_state |= RQ_NET_PENDING;
+		inc_ap_pending(mdev);
 		/* fall through: _req_mod(req,queue_for_net_read); */
 
 	case queue_for_net_read:
@@ -600,6 +571,9 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		_req_may_be_done(req, m);
 		break;
 
+	case read_retry_remote_canceled:
+		req->rq_state &= ~RQ_NET_QUEUED;
+		/* fall through, in case we raced with drbd_disconnect */
 	case connection_lost_while_pending:
 		/* transfer log cleanup after connection loss */
 		/* assert something? */
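The reworked read-error case is the "fix hang on local read errors while disconnected" commit: a failed local read is handed to the network only when the peer disk is known good (pdsk == D_UP_TO_DATE); otherwise the request completes with the error instead of sitting queued forever with no peer to serve it. The decision in isolation, with illustrative stubs rather than the DRBD API:

#include <stdio.h>

enum pdsk { D_DISKLESS, D_INCONSISTENT, D_OUTDATED, D_UP_TO_DATE };

struct request { int id; };

static void complete_with_error(struct request *req)
{
	printf("req %d: completed with -EIO\n", req->id);
}

static void queue_for_remote_read(struct request *req)
{
	printf("req %d: queued for network read\n", req->id);
}

static void on_local_read_error(struct request *req, enum pdsk peer_disk)
{
	if (peer_disk != D_UP_TO_DATE)	/* pre-fix code queued unconditionally */
		complete_with_error(req);
	else
		queue_for_remote_read(req);
}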
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 16119d7056cc..02d575d24518 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -91,6 +91,7 @@ enum drbd_req_event {
 	send_failed,
 	handed_over_to_network,
 	connection_lost_while_pending,
+	read_retry_remote_canceled,
 	recv_acked_by_peer,
 	write_acked_by_peer,
 	write_acked_by_peer_and_sis, /* and set_in_sync */
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 727ff6339754..b623ceee2a4a 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -224,9 +224,6 @@ void drbd_endio_pri(struct bio *bio, int error)
 	enum drbd_req_event what;
 	int uptodate = bio_flagged(bio, BIO_UPTODATE);
 
-	if (error)
-		dev_warn(DEV, "p %s: error=%d\n",
-			 bio_data_dir(bio) == WRITE ? "write" : "read", error);
 	if (!error && !uptodate) {
 		dev_warn(DEV, "p %s: setting error to -EIO\n",
 			 bio_data_dir(bio) == WRITE ? "write" : "read");
@@ -257,20 +254,6 @@ void drbd_endio_pri(struct bio *bio, int error)
 	complete_master_bio(mdev, &m);
 }
 
-int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
-	struct drbd_request *req = container_of(w, struct drbd_request, w);
-
-	/* NOTE: mdev->ldev can be NULL by the time we get here! */
-	/* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */
-
-	/* the only way this callback is scheduled is from _req_may_be_done,
-	 * when it is done and had a local write error, see comments there */
-	drbd_req_free(req);
-
-	return TRUE;
-}
-
 int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 {
 	struct drbd_request *req = container_of(w, struct drbd_request, w);
@@ -280,12 +263,9 @@ int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	 * to give the disk the chance to relocate that block */
 
 	spin_lock_irq(&mdev->req_lock);
-	if (cancel ||
-	    mdev->state.conn < C_CONNECTED ||
-	    mdev->state.pdsk <= D_INCONSISTENT) {
-		_req_mod(req, send_canceled);
+	if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
+		_req_mod(req, read_retry_remote_canceled);
 		spin_unlock_irq(&mdev->req_lock);
-		dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
 		return 1;
 	}
 	spin_unlock_irq(&mdev->req_lock);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ea8592b90696..1d1088f48bc2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,7 +45,6 @@ struct wb_writeback_args {
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
-	unsigned int sb_pinned:1;
 };
 
 /*
@@ -193,8 +192,7 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
 }
 
 static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-				 struct wb_writeback_args *args,
-				 int wait)
+				 struct wb_writeback_args *args)
 {
 	struct bdi_work *work;
 
@@ -206,8 +204,6 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	if (work) {
 		bdi_work_init(work, args);
 		bdi_queue_work(bdi, work);
-		if (wait)
-			bdi_wait_on_work_clear(work);
 	} else {
 		struct bdi_writeback *wb = &bdi->wb;
 
@@ -234,11 +230,6 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
 		.range_cyclic	= 0,
-		/*
-		 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
-		 * lets make it explicitly clear.
-		 */
-		.sb_pinned	= 1,
 	};
 	struct bdi_work work;
 
@@ -254,23 +245,21 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
 * @bdi: the backing device to write from
 * @sb: write inodes from this super_block
 * @nr_pages: the number of pages to write
- * @sb_locked: caller already holds sb umount sem.
 *
 * Description:
 *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
 *   started when this function returns, we make no guarentees on
- *   completion. Caller specifies whether sb umount sem is held already or not.
+ *   completion. Caller need not hold sb s_umount semaphore.
 *
 */
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-			 long nr_pages, int sb_locked)
+			 long nr_pages)
 {
 	struct wb_writeback_args args = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.nr_pages	= nr_pages,
 		.range_cyclic	= 1,
-		.sb_pinned	= sb_locked,
 	};
 
 	/*
@@ -282,7 +271,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
 		args.for_background = 1;
 	}
 
-	bdi_alloc_queue_work(bdi, &args, sb_locked);
+	bdi_alloc_queue_work(bdi, &args);
 }
 
 /*
@@ -595,7 +584,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
 	/*
 	 * Caller must already hold the ref for this
 	 */
-	if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		WARN_ON(!rwsem_is_locked(&sb->s_umount));
 		return SB_NOT_PINNED;
 	}
@@ -769,7 +758,6 @@ static long wb_writeback(struct bdi_writeback *wb,
 		.for_kupdate		= args->for_kupdate,
 		.for_background		= args->for_background,
 		.range_cyclic		= args->range_cyclic,
-		.sb_pinned		= args->sb_pinned,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -912,7 +900,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 	while ((work = get_next_work_item(bdi, wb)) != NULL) {
 		struct wb_writeback_args args = work->args;
-		int post_clear;
 
 		/*
 		 * Override sync mode, in case we must wait for completion
@@ -920,13 +907,11 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		if (force_wait)
 			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
-		post_clear = WB_SYNC_ALL || args.sb_pinned;
-
 		/*
 		 * If this isn't a data integrity operation, just notify
 		 * that we have seen this work and we are now starting it.
 		 */
-		if (!post_clear)
+		if (args.sync_mode == WB_SYNC_NONE)
 			wb_clear_pending(wb, work);
 
 		wrote += wb_writeback(wb, &args);
@@ -935,7 +920,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		 * This is a data integrity writeback, so only do the
 		 * notification when we have completed the work.
 		 */
-		if (post_clear)
+		if (args.sync_mode == WB_SYNC_ALL)
 			wb_clear_pending(wb, work);
 	}
 
@@ -1011,7 +996,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
-		bdi_alloc_queue_work(bdi, &args, 0);
+		bdi_alloc_queue_work(bdi, &args);
 	}
 
 	rcu_read_unlock();
@@ -1220,18 +1205,6 @@ static void wait_sb_inodes(struct super_block *sb)
 	iput(old_inode);
 }
 
-static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
-{
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	long nr_to_write;
-
-	nr_to_write = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
-}
-
 /**
 * writeback_inodes_sb	-	writeback dirty inodes from given super_block
 * @sb: the superblock
@@ -1243,21 +1216,16 @@ static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
 */
 void writeback_inodes_sb(struct super_block *sb)
 {
-	__writeback_inodes_sb(sb, 0);
-}
-EXPORT_SYMBOL(writeback_inodes_sb);
-
-/**
- * writeback_inodes_sb_locked - writeback dirty inodes from given super_block
- * @sb: the superblock
- *
- * Like writeback_inodes_sb(), except the caller already holds the
- * sb umount sem.
- */
-void writeback_inodes_sb_locked(struct super_block *sb)
-{
-	__writeback_inodes_sb(sb, 1);
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+	long nr_to_write;
+
+	nr_to_write = nr_dirty + nr_unstable +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
 }
+EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
 * writeback_inodes_sb_if_idle - start writeback if none underway
diff --git a/fs/pipe.c b/fs/pipe.c
index db6eaaba0dd8..69c4c7c13ea9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -26,9 +26,14 @@
 
 /*
  * The max size that a non-root user is allowed to grow the pipe. Can
- * be set by root in /proc/sys/fs/pipe-max-pages
+ * be set by root in /proc/sys/fs/pipe-max-size
  */
-unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
+unsigned int pipe_max_size = 1048576;
+
+/*
+ * Minimum pipe size, as required by POSIX
+ */
+unsigned int pipe_min_size = PAGE_SIZE;
 
 /*
  * We use a start+len construction, which provides full use of the
@@ -1118,26 +1123,20 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
  * Allocate a new array of pipe buffers and copy the info over. Returns the
  * pipe size if successful, or return -ERROR on error.
  */
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
 {
 	struct pipe_buffer *bufs;
 
 	/*
-	 * Must be a power-of-2 currently
-	 */
-	if (!is_power_of_2(arg))
-		return -EINVAL;
-
-	/*
 	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
 	 * expect a lot of shrink+grow operations, just free and allocate
 	 * again like we would do for growing. If the pipe currently
 	 * contains more buffers than arg, then return busy.
 	 */
-	if (arg < pipe->nrbufs)
+	if (nr_pages < pipe->nrbufs)
 		return -EBUSY;
 
-	bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
+	bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
 	if (unlikely(!bufs))
 		return -ENOMEM;
 
@@ -1158,8 +1157,37 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 	pipe->curbuf = 0;
 	kfree(pipe->bufs);
 	pipe->bufs = bufs;
-	pipe->buffers = arg;
-	return arg;
+	pipe->buffers = nr_pages;
+	return nr_pages * PAGE_SIZE;
+}
+
+/*
+ * Currently we rely on the pipe array holding a power-of-2 number
+ * of pages.
+ */
+static inline unsigned int round_pipe_size(unsigned int size)
+{
+	unsigned long nr_pages;
+
+	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+}
+
+/*
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * will return an error.
+ */
+int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
+		 size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
+	if (ret < 0 || !write)
+		return ret;
+
+	pipe_max_size = round_pipe_size(pipe_max_size);
+	return ret;
 }
 
 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1174,23 +1202,24 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 	mutex_lock(&pipe->inode->i_mutex);
 
 	switch (cmd) {
-	case F_SETPIPE_SZ:
-		if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) {
-			ret = -EINVAL;
+	case F_SETPIPE_SZ: {
+		unsigned int size, nr_pages;
+
+		size = round_pipe_size(arg);
+		nr_pages = size >> PAGE_SHIFT;
+
+		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
+			ret = -EPERM;
 			goto out;
-		}
-		/*
-		 * The pipe needs to be at least 2 pages large to
-		 * guarantee POSIX behaviour.
-		 */
-		if (arg < 2) {
+		} else if (nr_pages < PAGE_SIZE) {
 			ret = -EINVAL;
 			goto out;
 		}
-		ret = pipe_set_size(pipe, arg);
+		ret = pipe_set_size(pipe, nr_pages);
 		break;
+		}
 	case F_GETPIPE_SZ:
-		ret = pipe->buffers;
+		ret = pipe->buffers * PAGE_SIZE;
 		break;
 	default:
 		ret = -EINVAL;
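
The fs/pipe.c changes above switch F_SETPIPE_SZ and F_GETPIPE_SZ from a page count to a byte count, with the kernel rounding each request up to a power-of-two number of pages. The following is a minimal userspace sketch of the new interface, not part of the patch; the fallback fcntl command values (1031/1032, i.e. F_LINUX_SPECIFIC_BASE + 7 and + 8) are only needed where libc headers predate these commands, and the quoted sizes assume 4 KiB pages.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#ifndef F_SETPIPE_SZ
#define F_SETPIPE_SZ	1031	/* F_LINUX_SPECIFIC_BASE + 7 */
#define F_GETPIPE_SZ	1032	/* F_LINUX_SPECIFIC_BASE + 8 */
#endif

int main(void)
{
	int fds[2];

	if (pipe(fds) < 0) {
		perror("pipe");
		return 1;
	}

	/*
	 * Ask for 100000 bytes; the kernel rounds up to a power-of-two
	 * number of pages, so F_GETPIPE_SZ reports 131072 with 4 KiB
	 * pages. Without CAP_SYS_RESOURCE, any request above
	 * /proc/sys/fs/pipe-max-size now fails with EPERM.
	 */
	if (fcntl(fds[1], F_SETPIPE_SZ, 100000) < 0)
		perror("F_SETPIPE_SZ");

	printf("pipe size: %d bytes\n", fcntl(fds[1], F_GETPIPE_SZ));

	close(fds[0]);
	close(fds[1]);
	return 0;
}
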
diff --git a/fs/splice.c b/fs/splice.c
index ac22b00d86c3..740e6b9faf7a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -354,7 +354,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			break;
 
 		error = add_to_page_cache_lru(page, mapping, index,
-				mapping_gfp_mask(mapping));
+				GFP_KERNEL);
 		if (unlikely(error)) {
 			page_cache_release(page);
 			if (error == -EEXIST)
diff --git a/fs/sync.c b/fs/sync.c
index c9f83f480ec5..15aa6f03b2da 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	if (wait)
 		sync_inodes_sb(sb);
 	else
-		writeback_inodes_sb_locked(sb);
+		writeback_inodes_sb(sb);
 
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e6e0cb5437e6..aee5f6ce166e 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -106,7 +106,7 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-				long nr_pages, int sb_locked);
+				long nr_pages);
 int bdi_writeback_task(struct bdi_writeback *wb);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_arm_supers_timer(void);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8b7f5e0914ad..09a840264d6f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1211,14 +1211,23 @@ struct work_struct;
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
 
 #ifdef CONFIG_BLK_CGROUP
+/*
+ * This should not be using sched_clock(). A real patch is in progress
+ * to fix this up, until that is in place we need to disable preemption
+ * around sched_clock() in this function and set_io_start_time_ns().
+ */
 static inline void set_start_time_ns(struct request *req)
 {
+	preempt_disable();
 	req->start_time_ns = sched_clock();
+	preempt_enable();
 }
 
 static inline void set_io_start_time_ns(struct request *req)
 {
+	preempt_disable();
 	req->io_start_time_ns = sched_clock();
+	preempt_enable();
 }
 
 static inline uint64_t rq_start_time_ns(struct request *req)
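
The preempt_disable()/preempt_enable() pairs added above exist because sched_clock() is a fast per-CPU clock that is not guaranteed to be synchronized across CPUs; if the task were migrated between sampling the clock and storing the timestamp, the value could come from a different CPU than the one later used for comparison. A hypothetical kernel-side helper showing the same pattern in isolation (the name sample_sched_clock is invented for illustration and is not part of this patch):

#include <linux/types.h>
#include <linux/preempt.h>
#include <linux/sched.h>

/* Illustration only: sample the per-CPU sched_clock() without
 * being migrated mid-read. */
static inline u64 sample_sched_clock(void)
{
	u64 now;

	preempt_disable();	/* stay on the current CPU for the read */
	now = sched_clock();
	preempt_enable();

	return now;
}
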
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 68530521ad00..30da4ae48972 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -53,7 +53,7 @@
 
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.8rc1"
+#define REL_VERSION "8.3.8rc2"
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 94
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index a0bb301afac0..64d529133031 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -7,7 +7,6 @@
 struct cfq_queue;
 struct cfq_io_context {
 	void *key;
-	unsigned long dead_key;
 
 	struct cfq_queue *cfqq[2];
 
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 16de3933c45e..445796945ac9 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -139,7 +139,9 @@ void pipe_lock(struct pipe_inode_info *);
 void pipe_unlock(struct pipe_inode_info *);
 void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
 
-extern unsigned int pipe_max_pages;
+extern unsigned int pipe_max_size, pipe_min_size;
+int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *);
+
 
 /* Drop the inode semaphore and wait for a pipe event, atomically */
 void pipe_wait(struct pipe_inode_info *pipe);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index cc97d6caf2b3..f64134653a8c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -65,15 +65,6 @@ struct writeback_control {
 	 * so we use a single control to update them
 	 */
 	unsigned no_nrwrite_index_update:1;
-
-	/*
-	 * For WB_SYNC_ALL, the sb must always be pinned. For WB_SYNC_NONE,
-	 * the writeback code will pin the sb for the caller. However,
-	 * for eg umount, the caller does WB_SYNC_NONE but already has
-	 * the sb pinned. If the below is set, caller already has the
-	 * sb pinned.
-	 */
-	unsigned sb_pinned:1;
 };
 
 /*
@@ -82,7 +73,6 @@
 struct bdi_writeback;
 int inode_wait(void *);
 void writeback_inodes_sb(struct super_block *);
-void writeback_inodes_sb_locked(struct super_block *);
 int writeback_inodes_sb_if_idle(struct super_block *);
 void sync_inodes_sb(struct super_block *);
 void writeback_inodes_wbc(struct writeback_control *wbc);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 997080f00e0b..d24f761f4876 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1471,12 +1471,12 @@ static struct ctl_table fs_table[] = {
 	},
 #endif
 	{
-		.procname	= "pipe-max-pages",
-		.data		= &pipe_max_pages,
+		.procname	= "pipe-max-size",
+		.data		= &pipe_max_size,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.extra1		= &two,
+		.proc_handler	= &pipe_proc_fn,
+		.extra1		= &pipe_min_size,
 	},
 /*
  * NOTE: do not add new entries to this table unless you have read
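
With the sysctl entry above, /proc/sys/fs/pipe-max-size accepts a byte value, and pipe_proc_fn() rounds the stored limit up to a power-of-two number of pages, with pipe_min_size as the lower bound. A rough demo follows, not part of the patch; it requires root to perform the write and assumes 4 KiB pages, under which a request of 1000000 is rounded up to 1048576.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	ssize_t n;
	int fd = open("/proc/sys/fs/pipe-max-size", O_RDWR);

	if (fd < 0) {
		perror("open");		/* writing needs root */
		return 1;
	}

	/* request a 1000000-byte limit... */
	if (write(fd, "1000000", 7) < 0)
		perror("write");

	/* ...and read back the rounded value (prints 1048576) */
	n = pread(fd, buf, sizeof(buf) - 1, 0);
	if (n > 0) {
		buf[n] = '\0';
		printf("effective limit: %s", buf);
	}

	close(fd);
	return 0;
}
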
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b289310e2c89..5fa63bdf52e4 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 	    (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
 			       + global_page_state(NR_UNSTABLE_NFS))
 					  > background_thresh)))
-		bdi_start_writeback(bdi, NULL, 0, 0);
+		bdi_start_writeback(bdi, NULL, 0);
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -707,7 +707,7 @@ void laptop_mode_timer_fn(unsigned long data)
 	 */
 
 	if (bdi_has_dirty_io(&q->backing_dev_info))
-		bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0);
+		bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages);
 }
 
 /*