author		Mel Gorman <mel@csn.ul.ie>	2010-10-26 17:21:45 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-26 19:52:07 -0400
commit		0e093d99763eb4cea09f8ca4f1d01f34e121d10b (patch)
tree		fad38f9c3651c81db298521141a79d9468f71986
parent		08fc468f4eaf6683bae5bdb94743a09d8630cb80 (diff)
writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone
If congestion_wait() is called with no BDI congested, the caller will sleep
for the full timeout and this may be an unnecessary sleep. This patch adds a
wait_iff_congested() that checks congestion and only sleeps if a BDI is
congested; otherwise it calls cond_resched() to ensure the caller is not
hogging the CPU longer than its quota, but it does not sleep.

This is aimed at reducing some of the major desktop stalls reported during
IO. For example, while kswapd is operating, it calls congestion_wait() but
it could just have been reclaiming clean page cache pages with no
congestion. Without this patch, it would sleep for a full timeout; after
this patch, it just calls schedule() if it has been on the CPU too long.
Similar logic applies to direct reclaimers that are not making enough
progress.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
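[Editor's note: as an illustration of the behavioural difference the changelog describes, and not part of the commit, here is a minimal sketch of a reclaim-style retry loop. make_reclaim_progress() is a hypothetical placeholder; the real call sites converted by this patch are visible in the mm/page_alloc.c and mm/vmscan.c hunks below.]

/*
 * Illustrative sketch only, not from this commit. make_reclaim_progress()
 * stands in for reclaim work that may or may not hit a congested BDI.
 */
static void reclaim_retry_loop(struct zone *zone)
{
	while (!make_reclaim_progress(zone)) {
		/*
		 * Old behaviour: congestion_wait(BLK_RW_ASYNC, HZ/50) sleeps
		 * for the full timeout even when no BDI is congested.
		 *
		 * New behaviour: wait_iff_congested() sleeps only if some BDI
		 * is congested and this zone recently saw congestion
		 * (ZONE_CONGESTED set); otherwise it just cond_resched()s
		 * and returns, so the retry happens without a stall.
		 */
		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50);
	}
}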
-rw-r--r--	include/linux/backing-dev.h	2
-rw-r--r--	include/linux/mmzone.h	8
-rw-r--r--	include/trace/events/writeback.h	7
-rw-r--r--	mm/backing-dev.c	61
-rw-r--r--	mm/page_alloc.c	4
-rw-r--r--	mm/vmscan.c	42
6 files changed, 112 insertions, 12 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 35b00746c712..f1b402a50679 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -285,7 +285,7 @@ enum {
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
 void set_bdi_congested(struct backing_dev_info *bdi, int sync);
 long congestion_wait(int sync, long timeout);
-
+long wait_iff_congested(struct zone *zone, int sync, long timeout);
 
 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c3c17fb675ee..39c24ebe9cfd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -423,6 +423,9 @@ struct zone {
 typedef enum {
 	ZONE_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
 	ZONE_OOM_LOCKED,		/* zone is in OOM killer zonelist */
+	ZONE_CONGESTED,			/* zone has many dirty pages backed by
+					 * a congested BDI
+					 */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -440,6 +443,11 @@ static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
 	clear_bit(flag, &zone->flags);
 }
 
+static inline int zone_is_reclaim_congested(const struct zone *zone)
+{
+	return test_bit(ZONE_CONGESTED, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index d2b2654606ec..89a2b2db4375 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -179,6 +179,13 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,
 	TP_ARGS(usec_timeout, usec_delayed)
 );
 
+DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
+
+	TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
+
+	TP_ARGS(usec_timeout, usec_delayed)
+);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 55627306abe0..5ad3c106606b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
 	};
+static atomic_t nr_bdi_congested[2];
 
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
@@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
 	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	clear_bit(bit, &bdi->state);
+	if (test_and_clear_bit(bit, &bdi->state))
+		atomic_dec(&nr_bdi_congested[sync]);
 	smp_mb__after_clear_bit();
 	if (waitqueue_active(wqh))
 		wake_up(wqh);
@@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 	enum bdi_state bit;
 
 	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	set_bit(bit, &bdi->state);
+	if (!test_and_set_bit(bit, &bdi->state))
+		atomic_inc(&nr_bdi_congested[sync]);
 }
 EXPORT_SYMBOL(set_bdi_congested);
 
@@ -779,3 +782,57 @@ long congestion_wait(int sync, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absense of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+	long ret;
+	unsigned long start = jiffies;
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+	/*
+	 * If there is no congestion, or heavy congestion is not being
+	 * encountered in the current zone, yield if necessary instead
+	 * of sleeping on the congestion queue
+	 */
+	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+			!zone_is_reclaim_congested(zone)) {
+		cond_resched();
+
+		/* In case we scheduled, work out time remaining */
+		ret = timeout - (jiffies - start);
+		if (ret < 0)
+			ret = 0;
+
+		goto out;
+	}
+
+	/* Sleep until uncongested or a write happens */
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	ret = io_schedule_timeout(timeout);
+	finish_wait(wqh, &wait);
+
+out:
+	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+					jiffies_to_usecs(jiffies - start));
+
+	return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6a683f819439..b13bc5e5bd7d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1907,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 			preferred_zone, migratetype);
 
 		if (!page && gfp_mask & __GFP_NOFAIL)
-			congestion_wait(BLK_RW_ASYNC, HZ/50);
+			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 	} while (!page && (gfp_mask & __GFP_NOFAIL));
 
 	return page;
@@ -2095,7 +2095,7 @@ rebalance:
 	pages_reclaimed += did_some_progress;
 	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
 		/* Wait for some write requests to complete then retry */
-		congestion_wait(BLK_RW_ASYNC, HZ/50);
+		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
 	}
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 130ad0239f52..30fd658bb289 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -401,10 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 	}
 	if (mapping->a_ops->writepage == NULL)
 		return PAGE_ACTIVATE;
-	if (!may_write_to_queue(mapping->backing_dev_info, sc)) {
-		disable_lumpy_reclaim_mode(sc);
+	if (!may_write_to_queue(mapping->backing_dev_info, sc))
 		return PAGE_KEEP;
-	}
 
 	if (clear_page_dirty_for_io(page)) {
 		int res;
@@ -681,11 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
+				      struct zone *zone,
 				      struct scan_control *sc)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
 	int pgactivate = 0;
+	unsigned long nr_dirty = 0;
+	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
 
 	cond_resched();
@@ -705,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			goto keep;
 
 		VM_BUG_ON(PageActive(page));
+		VM_BUG_ON(page_zone(page) != zone);
 
 		sc->nr_scanned++;
 
@@ -782,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		}
 
 		if (PageDirty(page)) {
+			nr_dirty++;
+
 			if (references == PAGEREF_RECLAIM_CLEAN)
 				goto keep_locked;
 			if (!may_enter_fs)
@@ -792,6 +796,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			/* Page is dirty, try to write it out here */
 			switch (pageout(page, mapping, sc)) {
 			case PAGE_KEEP:
+				nr_congested++;
 				goto keep_locked;
 			case PAGE_ACTIVATE:
 				goto activate_locked;
@@ -902,6 +907,15 @@ keep_lumpy:
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 
+	/*
+	 * Tag a zone as congested if all the dirty pages encountered were
+	 * backed by a congested BDI. In this case, reclaimers should just
+	 * back off and wait for congestion to clear because further reclaim
+	 * will encounter the same problem
+	 */
+	if (nr_dirty == nr_congested)
+		zone_set_flag(zone, ZONE_CONGESTED);
+
 	free_page_list(&free_pages);
 
 	list_splice(&ret_pages, page_list);
@@ -1386,12 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
 	spin_unlock_irq(&zone->lru_lock);
 
-	nr_reclaimed = shrink_page_list(&page_list, sc);
+	nr_reclaimed = shrink_page_list(&page_list, zone, sc);
 
 	/* Check if we should syncronously wait for writeback */
 	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
 		set_lumpy_reclaim_mode(priority, sc, true);
-		nr_reclaimed += shrink_page_list(&page_list, sc);
+		nr_reclaimed += shrink_page_list(&page_list, zone, sc);
 	}
 
 	local_irq_disable();
@@ -1982,8 +1996,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 	/* Take a nap, wait for some writeback to complete */
 	if (!sc->hibernation_mode && sc->nr_scanned &&
-			priority < DEF_PRIORITY - 2)
-		congestion_wait(BLK_RW_ASYNC, HZ/10);
+			priority < DEF_PRIORITY - 2) {
+		struct zone *preferred_zone;
+
+		first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+						NULL, &preferred_zone);
+		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
+	}
 	}
 
 out:
@@ -2282,6 +2301,15 @@ loop_again:
 				if (!zone_watermark_ok(zone, order,
 					min_wmark_pages(zone), end_zone, 0))
 					has_under_min_watermark_zone = 1;
+			} else {
+				/*
+				 * If a zone reaches its high watermark,
+				 * consider it to be no longer congested. It's
+				 * possible there are dirty pages backed by
+				 * congested BDIs but as pressure is relieved,
+				 * spectulatively avoid congestion waits
+				 */
+				zone_clear_flag(zone, ZONE_CONGESTED);
 			}
 
 		}