-rw-r--r--  include/linux/backing-dev.h       |  2
-rw-r--r--  include/linux/mmzone.h            |  8
-rw-r--r--  include/trace/events/writeback.h  |  7
-rw-r--r--  mm/backing-dev.c                  | 61
-rw-r--r--  mm/page_alloc.c                   |  4
-rw-r--r--  mm/vmscan.c                       | 42
6 files changed, 112 insertions(+), 12 deletions(-)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 35b00746c712..f1b402a50679 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -285,7 +285,7 @@ enum {
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
 void set_bdi_congested(struct backing_dev_info *bdi, int sync);
 long congestion_wait(int sync, long timeout);
-
+long wait_iff_congested(struct zone *zone, int sync, long timeout);
 
 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c3c17fb675ee..39c24ebe9cfd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -423,6 +423,9 @@ struct zone {
 typedef enum {
 	ZONE_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
 	ZONE_OOM_LOCKED,		/* zone is in OOM killer zonelist */
+	ZONE_CONGESTED,			/* zone has many dirty pages backed by
+					 * a congested BDI
+					 */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -440,6 +443,11 @@ static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
 	clear_bit(flag, &zone->flags);
 }
 
+static inline int zone_is_reclaim_congested(const struct zone *zone)
+{
+	return test_bit(ZONE_CONGESTED, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index d2b2654606ec..89a2b2db4375 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -179,6 +179,13 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,
 	TP_ARGS(usec_timeout, usec_delayed)
 );
 
+DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
+
+	TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
+
+	TP_ARGS(usec_timeout, usec_delayed)
+);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 55627306abe0..5ad3c106606b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
 	};
+static atomic_t nr_bdi_congested[2];
 
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
@@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
 	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	clear_bit(bit, &bdi->state);
+	if (test_and_clear_bit(bit, &bdi->state))
+		atomic_dec(&nr_bdi_congested[sync]);
 	smp_mb__after_clear_bit();
 	if (waitqueue_active(wqh))
 		wake_up(wqh);
@@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 	enum bdi_state bit;
 
 	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	set_bit(bit, &bdi->state);
+	if (!test_and_set_bit(bit, &bdi->state))
+		atomic_inc(&nr_bdi_congested[sync]);
 }
 EXPORT_SYMBOL(set_bdi_congested);
 
@@ -779,3 +782,57 @@ long congestion_wait(int sync, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone having experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+	long ret;
+	unsigned long start = jiffies;
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+	/*
+	 * If there is no congestion, or heavy congestion is not being
+	 * encountered in the current zone, yield if necessary instead
+	 * of sleeping on the congestion queue
+	 */
+	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+			!zone_is_reclaim_congested(zone)) {
+		cond_resched();
+
+		/* In case we scheduled, work out time remaining */
+		ret = timeout - (jiffies - start);
+		if (ret < 0)
+			ret = 0;
+
+		goto out;
+	}
+
+	/* Sleep until uncongested or a write happens */
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	ret = io_schedule_timeout(timeout);
+	finish_wait(wqh, &wait);
+
+out:
+	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+					jiffies_to_usecs(jiffies - start));
+
+	return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
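
For readers unfamiliar with the new interface, here is a minimal userspace sketch of the decision logic wait_iff_congested() implements: when no BDI is congested, or the zone of interest has not been flagged congested, the caller only yields the CPU; otherwise it waits for up to the given timeout. The names model_wait_iff_congested, bdi_congested_count and zone_congested are hypothetical stand-ins invented for this sketch; the kernel code above uses nr_bdi_congested[], ZONE_CONGESTED and the congestion_wqh wait queue, and can be woken early by clear_bdi_congested().

/*
 * Hypothetical userspace model of the wait_iff_congested() decision logic.
 * None of these names are kernel symbols; the code only mirrors the control
 * flow of the function added above.
 */
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static atomic_int bdi_congested_count;	/* stand-in for nr_bdi_congested[sync] */
static atomic_bool zone_congested;	/* stand-in for the ZONE_CONGESTED bit */

static long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000L + ts.tv_nsec / 1000000L;
}

/* Returns the milliseconds left of @timeout_ms, 0 if the full timeout was spent. */
static long model_wait_iff_congested(long timeout_ms)
{
	long start = now_ms();
	long ret;

	/* No congested BDI, or this zone is not congested: just yield. */
	if (atomic_load(&bdi_congested_count) == 0 ||
	    !atomic_load(&zone_congested)) {
		sched_yield();
		ret = timeout_ms - (now_ms() - start);
		return ret < 0 ? 0 : ret;
	}

	/*
	 * The kernel sleeps on congestion_wqh[sync] and may be woken early by
	 * clear_bdi_congested(); this model simply sleeps the full timeout.
	 */
	struct timespec ts = {
		.tv_sec = timeout_ms / 1000,
		.tv_nsec = (timeout_ms % 1000) * 1000000L,
	};
	nanosleep(&ts, NULL);
	return 0;
}

int main(void)
{
	/* Uncongested case: nearly the whole timeout is still left. */
	printf("uncongested: %ld ms left\n", model_wait_iff_congested(100));

	/* Congested case: the caller really waits. */
	atomic_store(&bdi_congested_count, 1);
	atomic_store(&zone_congested, true);
	printf("congested:   %ld ms left\n", model_wait_iff_congested(100));
	return 0;
}

Built with any C11 compiler (for <stdatomic.h>), the first call returns almost the whole budget while the second blocks for the full 100 ms, which is the behaviour the callers below rely on.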
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6a683f819439..b13bc5e5bd7d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1907,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 			preferred_zone, migratetype);
 
 		if (!page && gfp_mask & __GFP_NOFAIL)
-			congestion_wait(BLK_RW_ASYNC, HZ/50);
+			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 	} while (!page && (gfp_mask & __GFP_NOFAIL));
 
 	return page;
@@ -2095,7 +2095,7 @@ rebalance:
 	pages_reclaimed += did_some_progress;
 	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
 		/* Wait for some write requests to complete then retry */
-		congestion_wait(BLK_RW_ASYNC, HZ/50);
+		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
 	}
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 130ad0239f52..30fd658bb289 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -401,10 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 	}
 	if (mapping->a_ops->writepage == NULL)
 		return PAGE_ACTIVATE;
-	if (!may_write_to_queue(mapping->backing_dev_info, sc)) {
-		disable_lumpy_reclaim_mode(sc);
+	if (!may_write_to_queue(mapping->backing_dev_info, sc))
 		return PAGE_KEEP;
-	}
 
 	if (clear_page_dirty_for_io(page)) {
 		int res;
@@ -681,11 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
+				      struct zone *zone,
 				      struct scan_control *sc)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
 	int pgactivate = 0;
+	unsigned long nr_dirty = 0;
+	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
 
 	cond_resched();
@@ -705,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			goto keep;
 
 		VM_BUG_ON(PageActive(page));
+		VM_BUG_ON(page_zone(page) != zone);
 
 		sc->nr_scanned++;
 
@@ -782,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		}
 
 		if (PageDirty(page)) {
+			nr_dirty++;
+
 			if (references == PAGEREF_RECLAIM_CLEAN)
 				goto keep_locked;
 			if (!may_enter_fs)
@@ -792,6 +796,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			/* Page is dirty, try to write it out here */
 			switch (pageout(page, mapping, sc)) {
 			case PAGE_KEEP:
+				nr_congested++;
 				goto keep_locked;
 			case PAGE_ACTIVATE:
 				goto activate_locked;
@@ -902,6 +907,15 @@ keep_lumpy:
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 
+	/*
+	 * Tag a zone as congested if all the dirty pages encountered were
+	 * backed by a congested BDI. In this case, reclaimers should just
+	 * back off and wait for congestion to clear because further reclaim
+	 * will encounter the same problem
+	 */
+	if (nr_dirty == nr_congested)
+		zone_set_flag(zone, ZONE_CONGESTED);
+
 	free_page_list(&free_pages);
 
 	list_splice(&ret_pages, page_list);
@@ -1386,12 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
 	spin_unlock_irq(&zone->lru_lock);
 
-	nr_reclaimed = shrink_page_list(&page_list, sc);
+	nr_reclaimed = shrink_page_list(&page_list, zone, sc);
 
 	/* Check if we should syncronously wait for writeback */
 	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
 		set_lumpy_reclaim_mode(priority, sc, true);
-		nr_reclaimed += shrink_page_list(&page_list, sc);
+		nr_reclaimed += shrink_page_list(&page_list, zone, sc);
 	}
 
 	local_irq_disable();
@@ -1982,8 +1996,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 		/* Take a nap, wait for some writeback to complete */
 		if (!sc->hibernation_mode && sc->nr_scanned &&
-		    priority < DEF_PRIORITY - 2)
-			congestion_wait(BLK_RW_ASYNC, HZ/10);
+		    priority < DEF_PRIORITY - 2) {
+			struct zone *preferred_zone;
+
+			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+						NULL, &preferred_zone);
+			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
+		}
 	}
 
 out:
@@ -2282,6 +2301,15 @@ loop_again:
 				if (!zone_watermark_ok(zone, order,
 					min_wmark_pages(zone), end_zone, 0))
 					has_under_min_watermark_zone = 1;
+			} else {
+				/*
+				 * If a zone reaches its high watermark,
+				 * consider it to be no longer congested. It's
+				 * possible there are dirty pages backed by
+				 * congested BDIs but as pressure is relieved,
+				 * speculatively avoid congestion waits
+				 */
+				zone_clear_flag(zone, ZONE_CONGESTED);
 			}
 
 		}
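
To summarize the ZONE_CONGESTED lifecycle this patch introduces, the following is a rough, hypothetical userspace model: shrink_page_list() sets the flag when every dirty page it encountered was backed by a congested BDI, kswapd's balance_pgdat() clears it once the zone meets its high watermark, and wait_iff_congested() consults it before deciding to sleep. struct zone_model and the helper names below are invented for the sketch and are not kernel symbols.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for the bits of struct zone this patch touches. */
struct zone_model {
	bool congested;			/* stand-in for ZONE_CONGESTED */
	unsigned long free_pages;
	unsigned long high_wmark;
};

/* Mirrors shrink_page_list(): tag the zone when all dirty pages hit congested BDIs. */
static void scan_pages(struct zone_model *zone, unsigned long nr_dirty,
		       unsigned long nr_congested)
{
	if (nr_dirty == nr_congested)
		zone->congested = true;
}

/* Mirrors balance_pgdat(): once the high watermark is met, speculatively clear it. */
static void balance_zone(struct zone_model *zone)
{
	if (zone->free_pages >= zone->high_wmark)
		zone->congested = false;
}

int main(void)
{
	struct zone_model zone = { .free_pages = 100, .high_wmark = 500 };

	scan_pages(&zone, 32, 32);	/* every dirty page was on a congested BDI */
	printf("after scan:    congested=%d\n", zone.congested);

	zone.free_pages = 600;		/* kswapd made progress */
	balance_zone(&zone);
	printf("after balance: congested=%d\n", zone.congested);
	return 0;
}

This follows the comment in the final hunk: once pressure is relieved, reclaimers stop paying the congestion wait even if some dirty pages on congested BDIs remain.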