author    Linus Torvalds <torvalds@linux-foundation.org>  2015-09-10 21:56:14 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-09-10 21:56:14 -0400
commit    b0a1ea51bda4c2bcdde460221e1772f3a4f8c44f
tree      9684c11b72718cd7e96e5eb93298690269ecf447
parent    33e247c7e58d335d70ecb84fd869091e2e4b8dcb
parent    69d7fde5909b614114343974cfc52cb8ff30b544
Merge branch 'for-4.3/blkcg' of git://git.kernel.dk/linux-block
Pull blk-cg updates from Jens Axboe:
 "A bit later in the cycle, but this has been in the block tree for a
  while. This is basically four patchsets from Tejun that improve our
  buffered cgroup writeback. It was dependent on the other cgroup
  changes, but they went in earlier in this cycle.

  Series 1 is a set of 5 patches with cgroup writeback updates:

   - bdi_writeback iteration fix which could lead to some wb's being
     skipped or repeated during e.g. sync under memory pressure.

   - Simplification of wb work wait mechanism.

   - Writeback tracepoints updated to report cgroup.

  Series 2 is a set of updates for the CFQ cgroup writeback handling:
  cfq has always charged all async IOs to the root cgroup. It didn't
  have much choice, as writeback didn't know about cgroups and there was
  no way to tell who to blame for a given writeback IO. Writeback
  finally grew support for cgroups and now tags each writeback IO with
  the appropriate cgroup to charge it against.

  This patchset updates cfq so that it follows the blkcg each bio is
  tagged with. Async cfq_queues are now shared across cfq_group, which
  is per-cgroup, instead of per-request_queue cfq_data. This makes all
  IOs follow the weight based IO resource distribution implemented by
  cfq.

   - Switched from GFP_ATOMIC to GFP_NOWAIT as suggested by Jeff.

   - Other misc review points addressed, acks added and rebased.

  Series 3 is the blkcg policy cleanup patches: this patchset contains
  assorted cleanups for blkcg_policy methods and blk[c]g_policy_data
  handling.

   - alloc/free added for blkg_policy_data. exit dropped.

   - alloc/free added for blkcg_policy_data.

   - blk-throttle's async percpu allocation is replaced with direct
     allocation.

   - all methods now take blk[c]g_policy_data instead of blkcg_gq or
     blkcg.

  And finally, series 4 is a set of patches cleaning up the blkcg stats
  handling: blkcg's stats have always been somewhat of a mess. This
  patchset tries to improve the situation a bit.

   - Patches added to consolidate the blkcg entry point and blkg
     creation. This in itself is an improvement and helps collecting
     common stats on bio issue.

   - per-blkg stats now accounted on bio issue rather than request
     completion so that bio based and request based drivers can behave
     the same way. The issue was spotted by Vivek.

   - cfq-iosched implements custom recursive stats and blk-throttle
     implements custom per-cpu stats. This patchset makes blkcg core
     support both by default.

   - cfq-iosched and blk-throttle keep track of the same stats multiple
     times. Unify them"

* 'for-4.3/blkcg' of git://git.kernel.dk/linux-block: (45 commits)
  blkcg: use CGROUP_WEIGHT_* scale for io.weight on the unified hierarchy
  blkcg: s/CFQ_WEIGHT_*/CFQ_WEIGHT_LEGACY_*/
  blkcg: implement interface for the unified hierarchy
  blkcg: misc preparations for unified hierarchy interface
  blkcg: separate out tg_conf_updated() from tg_set_conf()
  blkcg: move body parsing from blkg_conf_prep() to its callers
  blkcg: mark existing cftypes as legacy
  blkcg: rename subsystem name from blkio to io
  blkcg: refine error codes returned during blkcg configuration
  blkcg: remove unnecessary NULL checks from __cfqg_set_weight_device()
  blkcg: reduce stack usage of blkg_rwstat_recursive_sum()
  blkcg: remove cfqg_stats->sectors
  blkcg: move io_service_bytes and io_serviced stats into blkcg_gq
  blkcg: make blkg_[rw]stat_recursive_sum() to be able to index into blkcg_gq
  blkcg: make blkcg_[rw]stat per-cpu
  blkcg: add blkg_[rw]stat->aux_cnt and replace cfq_group->dead_stats with it
  blkcg: consolidate blkg creation in blkcg_bio_issue_check()
  blk-throttle: improve queue bypass handling
  blkcg: move root blkg lookup optimization from throtl_lookup_tg() to __blkg_lookup()
  blkcg: inline [__]blkg_lookup()
  ...
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--	fs/fs-writeback.c	139
1 file changed, 43 insertions(+), 96 deletions(-)
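The "simplification of wb work wait mechanism" mentioned above replaces the per-work single_wait/single_done flags with a counted completion (wb_completion), visible in the diff below as done->cnt, wb_wait_for_completion() and wake_up_all(): the issuer holds an initial bias plus one reference per queued work item, each finished item drops a reference, and the waiter sleeps until the count reaches zero. What follows is a minimal userspace sketch of that counting scheme, using pthreads in place of the kernel's wait queue and atomics; all names here (completion_cnt, completion_get/put/wait) are illustrative stand-ins, not the kernel API.

/* Minimal userspace sketch of a counted completion, loosely modelled on
 * the wb_completion used below.  Build: cc -pthread completion_sketch.c */
#include <pthread.h>
#include <stdio.h>

struct completion_cnt {
	pthread_mutex_t lock;
	pthread_cond_t  waitq;
	int		cnt;	/* outstanding work items + initial bias of 1 */
};

#define COMPLETION_CNT_INIT \
	{ PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 1 }

static void completion_get(struct completion_cnt *c)
{
	pthread_mutex_lock(&c->lock);
	c->cnt++;			/* one more work item to wait for */
	pthread_mutex_unlock(&c->lock);
}

static void completion_put(struct completion_cnt *c)
{
	pthread_mutex_lock(&c->lock);
	if (--c->cnt == 0)		/* last reference wakes all waiters */
		pthread_cond_broadcast(&c->waitq);
	pthread_mutex_unlock(&c->lock);
}

static void completion_wait(struct completion_cnt *c)
{
	completion_put(c);		/* drop the issuer's initial bias */
	pthread_mutex_lock(&c->lock);
	while (c->cnt != 0)
		pthread_cond_wait(&c->waitq, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void *worker(void *arg)
{
	completion_put(arg);		/* "work item" finished */
	return NULL;
}

int main(void)
{
	struct completion_cnt done = COMPLETION_CNT_INIT;
	pthread_t tid[4];

	for (int i = 0; i < 4; i++) {
		completion_get(&done);	/* like atomic_inc(&work->done->cnt) */
		pthread_create(&tid[i], NULL, worker, &done);
	}
	completion_wait(&done);		/* like wb_wait_for_completion() */
	printf("all work items completed\n");

	for (int i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

The +1 bias is what lets one completion cover an arbitrary number of work items queued before the wait starts, without racing against items that finish early.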
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ae0f438c2ee6..24489126f8ca 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -53,8 +53,6 @@ struct wb_writeback_work {
 	unsigned int for_background:1;
 	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
 	unsigned int auto_free:1;	/* free on completion */
-	unsigned int single_wait:1;
-	unsigned int single_done:1;
 	enum wb_reason reason;		/* why was writeback initiated? */
 
 	struct list_head list;		/* pending work list */
@@ -178,14 +176,11 @@ static void wb_wakeup(struct bdi_writeback *wb)
 static void wb_queue_work(struct bdi_writeback *wb,
 			  struct wb_writeback_work *work)
 {
-	trace_writeback_queue(wb->bdi, work);
+	trace_writeback_queue(wb, work);
 
 	spin_lock_bh(&wb->work_lock);
-	if (!test_bit(WB_registered, &wb->state)) {
-		if (work->single_wait)
-			work->single_done = 1;
+	if (!test_bit(WB_registered, &wb->state))
 		goto out_unlock;
-	}
 	if (work->done)
 		atomic_inc(&work->done->cnt);
 	list_add_tail(&work->list, &wb->work_list);
@@ -706,7 +701,7 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
 
 /**
  * inode_congested - test whether an inode is congested
- * @inode: inode to test for congestion
+ * @inode: inode to test for congestion (may be NULL)
  * @cong_bits: mask of WB_[a]sync_congested bits to test
  *
  * Tests whether @inode is congested.  @cong_bits is the mask of congestion
@@ -716,6 +711,9 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
  * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
  * associated with @inode is congested; otherwise, the root wb's congestion
  * state is used.
+ *
+ * @inode is allowed to be NULL as this function is often called on
+ * mapping->host which is NULL for the swapper space.
  */
 int inode_congested(struct inode *inode, int cong_bits)
 {
@@ -738,32 +736,6 @@ int inode_congested(struct inode *inode, int cong_bits)
 EXPORT_SYMBOL_GPL(inode_congested);
 
 /**
- * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
- * @bdi: bdi the work item was issued to
- * @work: work item to wait for
- *
- * Wait for the completion of @work which was issued to one of @bdi's
- * bdi_writeback's.  The caller must have set @work->single_wait before
- * issuing it.  This wait operates independently fo
- * wb_wait_for_completion() and also disables automatic freeing of @work.
- */
-static void wb_wait_for_single_work(struct backing_dev_info *bdi,
-				    struct wb_writeback_work *work)
-{
-	if (WARN_ON_ONCE(!work->single_wait))
-		return;
-
-	wait_event(bdi->wb_waitq, work->single_done);
-
-	/*
-	 * Paired with smp_wmb() in wb_do_writeback() and ensures that all
-	 * modifications to @work prior to assertion of ->single_done is
-	 * visible to the caller once this function returns.
-	 */
-	smp_rmb();
-}
-
-/**
  * wb_split_bdi_pages - split nr_pages to write according to bandwidth
  * @wb: target bdi_writeback to split @nr_pages to
  * @nr_pages: number of pages to write for the whole bdi
@@ -792,38 +764,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
 }
 
 /**
- * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
- * @wb: target bdi_writeback
- * @base_work: source wb_writeback_work
- *
- * Try to make a clone of @base_work and issue it to @wb.  If cloning
- * succeeds, %true is returned; otherwise, @base_work is issued directly
- * and %false is returned.  In the latter case, the caller is required to
- * wait for @base_work's completion using wb_wait_for_single_work().
- *
- * A clone is auto-freed on completion.  @base_work never is.
- */
-static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
-				    struct wb_writeback_work *base_work)
-{
-	struct wb_writeback_work *work;
-
-	work = kmalloc(sizeof(*work), GFP_ATOMIC);
-	if (work) {
-		*work = *base_work;
-		work->auto_free = 1;
-		work->single_wait = 0;
-	} else {
-		work = base_work;
-		work->auto_free = 0;
-		work->single_wait = 1;
-	}
-	work->single_done = 0;
-	wb_queue_work(wb, work);
-	return work != base_work;
-}
-
-/**
  * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
  * @bdi: target backing_dev_info
  * @base_work: wb_writeback_work to issue
@@ -838,15 +778,19 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 				  struct wb_writeback_work *base_work,
 				  bool skip_if_busy)
 {
-	long nr_pages = base_work->nr_pages;
-	int next_blkcg_id = 0;
+	int next_memcg_id = 0;
 	struct bdi_writeback *wb;
 	struct wb_iter iter;
 
 	might_sleep();
 restart:
 	rcu_read_lock();
-	bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+	bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+		DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+		struct wb_writeback_work fallback_work;
+		struct wb_writeback_work *work;
+		long nr_pages;
+
 		/* SYNC_ALL writes out I_DIRTY_TIME too */
 		if (!wb_has_dirty_io(wb) &&
 		    (base_work->sync_mode == WB_SYNC_NONE ||
@@ -855,13 +799,30 @@ restart:
 		if (skip_if_busy && writeback_in_progress(wb))
 			continue;
 
-		base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
-		if (!wb_clone_and_queue_work(wb, base_work)) {
-			next_blkcg_id = wb->blkcg_css->id + 1;
-			rcu_read_unlock();
-			wb_wait_for_single_work(bdi, base_work);
-			goto restart;
+		nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
+
+		work = kmalloc(sizeof(*work), GFP_ATOMIC);
+		if (work) {
+			*work = *base_work;
+			work->nr_pages = nr_pages;
+			work->auto_free = 1;
+			wb_queue_work(wb, work);
+			continue;
 		}
+
+		/* alloc failed, execute synchronously using on-stack fallback */
+		work = &fallback_work;
+		*work = *base_work;
+		work->nr_pages = nr_pages;
+		work->auto_free = 0;
+		work->done = &fallback_work_done;
+
+		wb_queue_work(wb, work);
+
+		next_memcg_id = wb->memcg_css->id + 1;
+		rcu_read_unlock();
+		wb_wait_for_completion(bdi, &fallback_work_done);
+		goto restart;
 	}
 	rcu_read_unlock();
 }
@@ -902,8 +863,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 
 	if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
 		base_work->auto_free = 0;
-		base_work->single_wait = 0;
-		base_work->single_done = 0;
 		wb_queue_work(&bdi->wb, base_work);
 	}
 }
@@ -924,7 +883,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
 	 */
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work) {
-		trace_writeback_nowork(wb->bdi);
+		trace_writeback_nowork(wb);
 		wb_wakeup(wb);
 		return;
 	}
@@ -954,7 +913,7 @@ void wb_start_background_writeback(struct bdi_writeback *wb)
 	 * We just wake up the flusher thread. It will perform background
 	 * writeback as soon as there is no other work to do.
 	 */
-	trace_writeback_wake_background(wb->bdi);
+	trace_writeback_wake_background(wb);
 	wb_wakeup(wb);
 }
 
@@ -1660,14 +1619,14 @@ static long wb_writeback(struct bdi_writeback *wb,
 	} else if (work->for_background)
 		oldest_jif = jiffies;
 
-	trace_writeback_start(wb->bdi, work);
+	trace_writeback_start(wb, work);
 	if (list_empty(&wb->b_io))
 		queue_io(wb, work);
 	if (work->sb)
 		progress = writeback_sb_inodes(work->sb, wb, work);
 	else
 		progress = __writeback_inodes_wb(wb, work);
-	trace_writeback_written(wb->bdi, work);
+	trace_writeback_written(wb, work);
 
 	wb_update_bandwidth(wb, wb_start);
 
@@ -1692,7 +1651,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 	 * we'll just busyloop.
 	 */
 	if (!list_empty(&wb->b_more_io)) {
-		trace_writeback_wait(wb->bdi, work);
+		trace_writeback_wait(wb, work);
 		inode = wb_inode(wb->b_more_io.prev);
 		spin_lock(&inode->i_lock);
 		spin_unlock(&wb->list_lock);
@@ -1797,26 +1756,14 @@ static long wb_do_writeback(struct bdi_writeback *wb)
 	set_bit(WB_writeback_running, &wb->state);
 	while ((work = get_next_work_item(wb)) != NULL) {
 		struct wb_completion *done = work->done;
-		bool need_wake_up = false;
 
-		trace_writeback_exec(wb->bdi, work);
+		trace_writeback_exec(wb, work);
 
 		wrote += wb_writeback(wb, work);
 
-		if (work->single_wait) {
-			WARN_ON_ONCE(work->auto_free);
-			/* paired w/ rmb in wb_wait_for_single_work() */
-			smp_wmb();
-			work->single_done = 1;
-			need_wake_up = true;
-		} else if (work->auto_free) {
+		if (work->auto_free)
 			kfree(work);
-		}
-
 		if (done && atomic_dec_and_test(&done->cnt))
-			need_wake_up = true;
-
-		if (need_wake_up)
 			wake_up_all(&wb->bdi->wb_waitq);
 	}
 