[PATCH] separate bdi congestion functions from queue congestion functions

Separate out the concept of "queue congestion" from "backing-dev congestion".
Congestion is a backing-dev concept, not a queue concept.

The blk_* congestion functions are retained, as wrappers around the core
backing-dev congestion functions.

This proper layering is needed so that NFS can cleanly use the congestion
functions, and so that CONFIG_BLOCK=n actually links.

Cc: "Thomas Maier" <balagi@justmail.de>
Cc: "Jens Axboe" <jens.axboe@oracle.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Howells <dhowells@redhat.com>
Cc: Peter Osterlund <petero2@telia.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Andrew Morton <akpm@osdl.org> 2006-10-20 02:28:16 -0400
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-10-20 13:26:35 -0400
commit: 3fcfab16c5b86eaa3db3a9a31adba550c5b67141 (patch)
tree: bd348fa081b8fbec2c79fbf8f173a306d70b2b2c /mm
parent: 79e2de4bc53d7ca2a8eedee49e4a92479b4b530e (diff)
6 files changed, 83 insertions, 20 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 12b3a4eee88d..f3c077eb0b8e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,8 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 obj-y                   := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
                           page_alloc.o page-writeback.o pdflush.o \
                           readahead.o swap.o truncate.o vmscan.o \
-                           prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
+                           prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
+                           $(mmu-y)
 ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
 obj-y                   += bounce.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
new file mode 100644
index 000000000000..f50a2811f9dc
--- /dev/null
+++ b/mm/backing-dev.c
@@ -0,0 +1,69 @@
+#include <linux/wait.h>
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+/*
+ * Wait queues for tasks sleeping in congestion_wait().  The array is indexed
+ * directly by the rw argument of the functions in this file, so it holds one
+ * queue per direction.
+ * NOTE(review): this relies on the rw values passed in (READ/WRITE) being
+ * 0 and 1 -- confirm against their definitions; no bounds check is done.
+ */
+static wait_queue_head_t congestion_wqh[2] = {
+                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
+                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
+        };
+/**
+ * clear_bdi_congested - drop a backing_dev's congestion flag and wake waiters
+ * @bdi: backing_dev_info whose state word is updated
+ * @rw: WRITE selects BDI_write_congested, any other value selects
+ *      BDI_read_congested; also used as the index into congestion_wqh[]
+ *
+ * Clears the selected bit in @bdi->state, then wakes the congestion wait
+ * queue for @rw if any task is queued on it.
+ */
+void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
+{
+        wait_queue_head_t *waiters = congestion_wqh + rw;
+        enum bdi_state flag = BDI_read_congested;
+        if (rw == WRITE)
+                flag = BDI_write_congested;
+        clear_bit(flag, &bdi->state);
+        /* Barrier sits between the bit clear and the waitqueue check below. */
+        smp_mb__after_clear_bit();
+        if (!waitqueue_active(waiters))
+                return;
+        wake_up(waiters);
+}
+EXPORT_SYMBOL(clear_bdi_congested);
+/**
+ * set_bdi_congested - flag a backing_dev as congested in one direction
+ * @bdi: backing_dev_info whose state word is updated
+ * @rw: WRITE selects BDI_write_congested, any other value selects
+ *      BDI_read_congested
+ *
+ * Only sets the bit in @bdi->state; no wait queue is touched here.
+ */
+void set_bdi_congested(struct backing_dev_info *bdi, int rw)
+{
+        enum bdi_state flag = BDI_read_congested;
+        if (rw == WRITE)
+                flag = BDI_write_congested;
+        set_bit(flag, &bdi->state);
+}
+EXPORT_SYMBOL(set_bdi_congested);
+/**
+ * congestion_wait - sleep on the congestion wait queue for one direction
+ * @rw: READ or WRITE; selects which entry of congestion_wqh[] to sleep on
+ * @timeout: upper bound on the sleep, in jiffies
+ *
+ * Queues the caller on the wait queue for @rw in TASK_UNINTERRUPTIBLE state
+ * and sleeps through io_schedule_timeout() for at most @timeout jiffies.
+ * The queue is not tied to any particular backing_dev: it is woken by
+ * clear_bdi_congested() for any backing_dev, and by congestion_end().
+ *
+ * Returns the value handed back by io_schedule_timeout().
+ */
+long congestion_wait(int rw, long timeout)
+{
+        wait_queue_head_t *queue = congestion_wqh + rw;
+        DEFINE_WAIT(waiter);
+        long rc;
+        prepare_to_wait(queue, &waiter, TASK_UNINTERRUPTIBLE);
+        rc = io_schedule_timeout(timeout);
+        finish_wait(queue, &waiter);
+        return rc;
+}
+EXPORT_SYMBOL(congestion_wait);
+/**
+ * congestion_end - wake tasks sleeping in congestion_wait()
+ * @rw: READ or WRITE; selects which entry of congestion_wqh[] to wake
+ *
+ * Does nothing when no task is queued on the selected wait queue.
+ */
+void congestion_end(int rw)
+{
+        wait_queue_head_t *sleepers = congestion_wqh + rw;
+        if (!waitqueue_active(sleepers))
+                return;
+        wake_up(sleepers);
+}
+EXPORT_SYMBOL(congestion_end);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a0f339057449..8d9b19f239c3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -222,7 +222,7 @@ static void balance_dirty_pages(struct address_space *mapping)
                        if (pages_written >= write_chunk)
                                break;          /* We've done our duty */
                }
-                blk_congestion_wait(WRITE, HZ/10);
+                congestion_wait(WRITE, HZ/10);
        }
        if (nr_reclaimable + global_page_state(NR_WRITEBACK)
@@ -314,7 +314,7 @@ void throttle_vm_writeout(void)
                if (global_page_state(NR_UNSTABLE_NFS) +
                        global_page_state(NR_WRITEBACK) <= dirty_thresh)
                                break;
-                blk_congestion_wait(WRITE, HZ/10);
+                congestion_wait(WRITE, HZ/10);
        }
 }
@@ -351,7 +351,7 @@ static void background_writeout(unsigned long _min_pages)
                min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
                if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
                        /* Wrote less than expected */
-                        blk_congestion_wait(WRITE, HZ/10);
+                        congestion_wait(WRITE, HZ/10);
                        if (!wbc.encountered_congestion)
                                break;
                }
@@ -422,7 +422,7 @@ static void wb_kupdate(unsigned long arg)
                writeback_inodes(&wbc);
                if (wbc.nr_to_write > 0) {
                        if (wbc.encountered_congestion)
-                                blk_congestion_wait(WRITE, HZ/10);
+                                congestion_wait(WRITE, HZ/10);
                        else
                                break;  /* All the old data is written */
                }
@@ -956,15 +956,6 @@ int test_set_page_writeback(struct page *page)
 EXPORT_SYMBOL(test_set_page_writeback);
 /*
- * Wakes up tasks that are being throttled due to writeback congestion
- */
-void writeback_congestion_end(void)
-{
-        blk_congestion_end(WRITE);
-}
-EXPORT_SYMBOL(writeback_congestion_end);
-/*
 * Return true if any of the pages in the mapping are marged with the
 * passed tag.
 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40db96a655d0..afee38f04d84 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -39,6 +39,7 @@
 #include <linux/stop_machine.h>
 #include <linux/sort.h>
 #include <linux/pfn.h>
+#include <linux/backing-dev.h>
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1050,7 +1051,7 @@ nofail_alloc:
                        if (page)
                                goto got_pg;
                        if (gfp_mask & __GFP_NOFAIL) {
-                                blk_congestion_wait(WRITE, HZ/50);
+                                congestion_wait(WRITE, HZ/50);
                                goto nofail_alloc;
                        }
                }
@@ -1113,7 +1114,7 @@ rebalance:
                        do_retry = 1;
        }
        if (do_retry) {
-                blk_congestion_wait(WRITE, HZ/50);
+                congestion_wait(WRITE, HZ/50);
                goto rebalance;
        }
diff --git a/mm/shmem.c b/mm/shmem.c
index b378f66cf2f9..4959535fc14c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -48,6 +48,7 @@
 #include <linux/ctype.h>
 #include <linux/migrate.h>
 #include <linux/highmem.h>
+#include <linux/backing-dev.h>
 #include <asm/uaccess.h>
 #include <asm/div64.h>
@@ -1131,7 +1132,7 @@ repeat:
                        page_cache_release(swappage);
                        if (error == -ENOMEM) {
                                /* let kswapd refresh zone for GFP_ATOMICs */
-                                blk_congestion_wait(WRITE, HZ/50);
+                                congestion_wait(WRITE, HZ/50);
                        }
                        goto repeat;
                }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index af73c14f9d88..f05527bf792b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1059,7 +1059,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
                /* Take a nap, wait for some writeback to complete */
                if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
-                        blk_congestion_wait(WRITE, HZ/10);
+                        congestion_wait(WRITE, HZ/10);
        }
        /* top priority shrink_caches still had more to do? don't OOM, then */
        if (!sc.all_unreclaimable)
@@ -1214,7 +1214,7 @@ scan:
                 * another pass across the zones.
                 */
                if (total_scanned && priority < DEF_PRIORITY - 2)
-                        blk_congestion_wait(WRITE, HZ/10);
+                        congestion_wait(WRITE, HZ/10);
                /*
                 * We do this so kswapd doesn't build up large priorities for
@@ -1458,7 +1458,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
                                goto out;
                        if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
-                                blk_congestion_wait(WRITE, HZ / 10);
+                                congestion_wait(WRITE, HZ / 10);
                }
                lru_pages = 0;
author	Andrew Morton <akpm@osdl.org>	2006-10-20 02:28:16 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-10-20 13:26:35 -0400
commit	3fcfab16c5b86eaa3db3a9a31adba550c5b67141 (patch)
tree	bd348fa081b8fbec2c79fbf8f173a306d70b2b2c /mm
parent	79e2de4bc53d7ca2a8eedee49e4a92479b4b530e (diff)