 include/linux/backing-dev.h      |  2
 include/linux/mmzone.h           |  8
 include/trace/events/writeback.h |  7
 mm/backing-dev.c                 | 61
 mm/page_alloc.c                  |  4
 mm/vmscan.c                      | 42
 6 files changed, 112 insertions(+), 12 deletions(-)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 35b00746c712..f1b402a50679 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -285,7 +285,7 @@ enum {
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
 void set_bdi_congested(struct backing_dev_info *bdi, int sync);
 long congestion_wait(int sync, long timeout);
-
+long wait_iff_congested(struct zone *zone, int sync, long timeout);
 
 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c3c17fb675ee..39c24ebe9cfd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -423,6 +423,9 @@ struct zone {
 typedef enum {
 	ZONE_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
 	ZONE_OOM_LOCKED,		/* zone is in OOM killer zonelist */
+	ZONE_CONGESTED,			/* zone has many dirty pages backed by
+					 * a congested BDI
+					 */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -440,6 +443,11 @@ static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
 	clear_bit(flag, &zone->flags);
 }
 
+static inline int zone_is_reclaim_congested(const struct zone *zone)
+{
+	return test_bit(ZONE_CONGESTED, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index d2b2654606ec..89a2b2db4375 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -179,6 +179,13 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,
 	TP_ARGS(usec_timeout, usec_delayed)
 );
 
+DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
+
+	TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
+
+	TP_ARGS(usec_timeout, usec_delayed)
+);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 55627306abe0..5ad3c106606b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
 	};
+static atomic_t nr_bdi_congested[2];
 
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
@@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
 	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	clear_bit(bit, &bdi->state);
+	if (test_and_clear_bit(bit, &bdi->state))
+		atomic_dec(&nr_bdi_congested[sync]);
 	smp_mb__after_clear_bit();
 	if (waitqueue_active(wqh))
 		wake_up(wqh);
@@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 	enum bdi_state bit;
 
 	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	set_bit(bit, &bdi->state);
+	if (!test_and_set_bit(bit, &bdi->state))
+		atomic_inc(&nr_bdi_congested[sync]);
 }
 EXPORT_SYMBOL(set_bdi_congested);
 
@@ -779,3 +782,57 @@ long congestion_wait(int sync, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+	long ret;
+	unsigned long start = jiffies;
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+	/*
+	 * If there is no congestion, or heavy congestion is not being
+	 * encountered in the current zone, yield if necessary instead
+	 * of sleeping on the congestion queue
+	 */
+	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+			!zone_is_reclaim_congested(zone)) {
+		cond_resched();
+
+		/* In case we scheduled, work out time remaining */
+		ret = timeout - (jiffies - start);
+		if (ret < 0)
+			ret = 0;
+
+		goto out;
+	}
+
+	/* Sleep until uncongested or a write happens */
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	ret = io_schedule_timeout(timeout);
+	finish_wait(wqh, &wait);
+
+out:
+	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+					jiffies_to_usecs(jiffies - start));
+
+	return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
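Before the allocator and reclaim callers below, here is a minimal user-space sketch of the decision this new function makes: sleep only when at least one backing device is congested and the zone of interest has been tagged congested, otherwise yield and keep the full timeout. It is illustrative only; zone_model, nr_congested, wait_iff_congested_model and the nanosleep()-based back-off are stand-ins, not the kernel implementation.

#define _POSIX_C_SOURCE 200809L
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static atomic_int nr_congested;           /* models nr_bdi_congested[sync] */

struct zone_model {
	atomic_int reclaim_congested;     /* models the ZONE_CONGESTED bit */
};

/* Back off for up to timeout_ms; returns the milliseconds "left over". */
static long wait_iff_congested_model(struct zone_model *zone, long timeout_ms)
{
	/* No congested device, or this zone is not congested: do not sleep. */
	if (atomic_load(&nr_congested) == 0 ||
	    !atomic_load(&zone->reclaim_congested))
		return timeout_ms;        /* caller keeps its full budget */

	/* Congested: back off, standing in for the congestion wait queue. */
	struct timespec ts = {
		.tv_sec = timeout_ms / 1000,
		.tv_nsec = (timeout_ms % 1000) * 1000000L,
	};
	nanosleep(&ts, NULL);
	return 0;                         /* slept for the full timeout */
}

int main(void)
{
	struct zone_model zone = { 0 };

	/* Nothing congested: returns immediately with the full timeout. */
	printf("left: %ld ms\n", wait_iff_congested_model(&zone, 20));

	/* A device and the zone are both congested: this call sleeps. */
	atomic_fetch_add(&nr_congested, 1);
	atomic_store(&zone.reclaim_congested, 1);
	printf("left: %ld ms\n", wait_iff_congested_model(&zone, 20));
	return 0;
}

The point the sketch makes is the same one the patch makes against congestion_wait(): the common, uncongested case costs nothing more than a cond_resched()-style yield.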
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6a683f819439..b13bc5e5bd7d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1907,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 			preferred_zone, migratetype);
 
 		if (!page && gfp_mask & __GFP_NOFAIL)
-			congestion_wait(BLK_RW_ASYNC, HZ/50);
+			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 	} while (!page && (gfp_mask & __GFP_NOFAIL));
 
 	return page;
@@ -2095,7 +2095,7 @@ rebalance:
 	pages_reclaimed += did_some_progress;
 	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
 		/* Wait for some write requests to complete then retry */
-		congestion_wait(BLK_RW_ASYNC, HZ/50);
+		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
 	}
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 130ad0239f52..30fd658bb289 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -401,10 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 	}
 	if (mapping->a_ops->writepage == NULL)
 		return PAGE_ACTIVATE;
-	if (!may_write_to_queue(mapping->backing_dev_info, sc)) {
-		disable_lumpy_reclaim_mode(sc);
+	if (!may_write_to_queue(mapping->backing_dev_info, sc))
 		return PAGE_KEEP;
-	}
 
 	if (clear_page_dirty_for_io(page)) {
 		int res;
@@ -681,11 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
+				      struct zone *zone,
 				      struct scan_control *sc)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
 	int pgactivate = 0;
+	unsigned long nr_dirty = 0;
+	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
 
 	cond_resched();
@@ -705,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			goto keep;
 
 		VM_BUG_ON(PageActive(page));
+		VM_BUG_ON(page_zone(page) != zone);
 
 		sc->nr_scanned++;
 
@@ -782,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		}
 
 		if (PageDirty(page)) {
+			nr_dirty++;
+
 			if (references == PAGEREF_RECLAIM_CLEAN)
 				goto keep_locked;
 			if (!may_enter_fs)
@@ -792,6 +796,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			/* Page is dirty, try to write it out here */
 			switch (pageout(page, mapping, sc)) {
 			case PAGE_KEEP:
+				nr_congested++;
 				goto keep_locked;
 			case PAGE_ACTIVATE:
 				goto activate_locked;
@@ -902,6 +907,15 @@ keep_lumpy:
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 
+	/*
+	 * Tag a zone as congested if all the dirty pages encountered were
+	 * backed by a congested BDI. In this case, reclaimers should just
+	 * back off and wait for congestion to clear because further reclaim
+	 * will encounter the same problem
+	 */
+	if (nr_dirty == nr_congested)
+		zone_set_flag(zone, ZONE_CONGESTED);
+
 	free_page_list(&free_pages);
 
 	list_splice(&ret_pages, page_list);
@@ -1386,12 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
 	spin_unlock_irq(&zone->lru_lock);
 
-	nr_reclaimed = shrink_page_list(&page_list, sc);
+	nr_reclaimed = shrink_page_list(&page_list, zone, sc);
 
 	/* Check if we should syncronously wait for writeback */
 	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
 		set_lumpy_reclaim_mode(priority, sc, true);
-		nr_reclaimed += shrink_page_list(&page_list, sc);
+		nr_reclaimed += shrink_page_list(&page_list, zone, sc);
 	}
 
 	local_irq_disable();
@@ -1982,8 +1996,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 		/* Take a nap, wait for some writeback to complete */
 		if (!sc->hibernation_mode && sc->nr_scanned &&
-		    priority < DEF_PRIORITY - 2)
-			congestion_wait(BLK_RW_ASYNC, HZ/10);
+		    priority < DEF_PRIORITY - 2) {
+			struct zone *preferred_zone;
+
+			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+						NULL, &preferred_zone);
+			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
+		}
 	}
 
 out:
@@ -2282,6 +2301,15 @@ loop_again:
 				if (!zone_watermark_ok(zone, order,
 					    min_wmark_pages(zone), end_zone, 0))
 					has_under_min_watermark_zone = 1;
+			} else {
+				/*
+				 * If a zone reaches its high watermark,
+				 * consider it to be no longer congested. It's
+				 * possible there are dirty pages backed by
+				 * congested BDIs but as pressure is relieved,
+				 * speculatively avoid congestion waits
+				 */
+				zone_clear_flag(zone, ZONE_CONGESTED);
 			}
 
 		}
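To summarise the lifecycle the mm/vmscan.c changes give the new flag, here is a small illustrative sketch in plain C: shrink_page_list() tags the zone when every dirty page it encountered was stuck behind a congested BDI, and balance_pgdat() clears the tag once the zone is back over its high watermark, so wait_iff_congested() stops sleeping for it. The names zone_model, page_model, reclaim_pages() and balance_zone() are hypothetical stand-ins for the kernel structures and functions.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct zone_model {
	bool congested;              /* models the ZONE_CONGESTED flag */
	unsigned long free_pages;
	unsigned long high_wmark;
};

struct page_model {
	bool dirty;
	bool backing_dev_congested;  /* pageout() would return PAGE_KEEP */
};

/* Models the nr_dirty/nr_congested accounting added to shrink_page_list(). */
static void reclaim_pages(struct zone_model *zone,
			  const struct page_model *pages, size_t n)
{
	unsigned long nr_dirty = 0, nr_congested = 0;

	for (size_t i = 0; i < n; i++) {
		if (!pages[i].dirty)
			continue;
		nr_dirty++;
		if (pages[i].backing_dev_congested)
			nr_congested++;
	}

	/* Every dirty page sat behind a congested BDI: tag the zone. */
	if (nr_dirty == nr_congested)
		zone->congested = true;
}

/* Models the watermark check in balance_pgdat(): pressure relieved. */
static void balance_zone(struct zone_model *zone)
{
	if (zone->free_pages >= zone->high_wmark)
		zone->congested = false;
}

int main(void)
{
	struct zone_model zone = { .free_pages = 10, .high_wmark = 100 };
	struct page_model pages[] = {
		{ .dirty = true,  .backing_dev_congested = true },
		{ .dirty = false, .backing_dev_congested = false },
		{ .dirty = true,  .backing_dev_congested = true },
	};

	reclaim_pages(&zone, pages, sizeof(pages) / sizeof(pages[0]));
	printf("congested after reclaim: %d\n", zone.congested);

	zone.free_pages = 200;       /* zone met its high watermark */
	balance_zone(&zone);
	printf("congested after balance: %d\n", zone.congested);
	return 0;
}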