path: root/mm/swapfile.c
author     Hugh Dickins <hugh@veritas.com>                    2009-01-06 17:39:53 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>     2009-01-06 18:59:05 -0500
commit     7992fde72ce06c73280a1939b7a1e903bc95ef85 (patch)
tree       8e8ef30ec4e29b325f70c2d01d2a9def192b5c64 /mm/swapfile.c
parent     6a6ba83175c029c7820765bae44692266b29e67a (diff)
swapfile: swap allocation use discard
When scan_swap_map() finds a free cluster of swap pages to allocate, discard
the old contents of the cluster if the device supports discard.  But don't
bother when swap is so fragmented that we allocate single pages.  Be careful
about racing allocations made while we're scanning for a cluster; and hold up
allocations made while we're discarding.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Joern Engel <joern@logfs.org>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Donjun Shin <djshin90@gmail.com>
Cc: Tejun Heo <teheo@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--   mm/swapfile.c   119
1 file changed, 118 insertions(+), 1 deletion(-)
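As orientation before the diff: the new discard_swap_cluster() walks the swap extent list and converts each run of swap pages into 512-byte sector units before handing the range to blkdev_issue_discard(). A minimal userspace sketch of just that unit conversion follows; PAGE_SHIFT is assumed to be 12 (4 KiB pages) and pages_to_sectors() is a hypothetical helper for illustration only, not part of the patch.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed 4 KiB pages; the real macro is per-arch */

/*
 * Convert a run of swap pages (page units) into a block-device range
 * (512-byte sector units), mirroring the "<<= PAGE_SHIFT - 9" shifts
 * that discard_swap_cluster() applies before blkdev_issue_discard().
 */
static void pages_to_sectors(uint64_t start_page, uint64_t nr_pages,
                             uint64_t *start_sector, uint64_t *nr_sectors)
{
        *start_sector = start_page << (PAGE_SHIFT - 9);
        *nr_sectors = nr_pages << (PAGE_SHIFT - 9);
}

int main(void)
{
        uint64_t start, len;

        /* One SWAPFILE_CLUSTER-sized run (256 pages) starting at page 1024. */
        pages_to_sectors(1024, 256, &start, &len);
        printf("discard sectors %llu..%llu\n",
               (unsigned long long)start,
               (unsigned long long)(start + len - 1));
        return 0;
}

With 4 KiB pages each swap page covers eight 512-byte sectors, so the 256-page cluster above maps to sectors 8192..10239.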
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fbeb4bb8eb50..ca75b9e7c09f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -115,14 +115,62 @@ static int discard_swap(struct swap_info_struct *si)
 	return err;		/* That will often be -EOPNOTSUPP */
 }
 
+/*
+ * swap allocation tell device that a cluster of swap can now be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static void discard_swap_cluster(struct swap_info_struct *si,
+				 pgoff_t start_page, pgoff_t nr_pages)
+{
+	struct swap_extent *se = si->curr_swap_extent;
+	int found_extent = 0;
+
+	while (nr_pages) {
+		struct list_head *lh;
+
+		if (se->start_page <= start_page &&
+		    start_page < se->start_page + se->nr_pages) {
+			pgoff_t offset = start_page - se->start_page;
+			sector_t start_block = se->start_block + offset;
+			pgoff_t nr_blocks = se->nr_pages - offset;
+
+			if (nr_blocks > nr_pages)
+				nr_blocks = nr_pages;
+			start_page += nr_blocks;
+			nr_pages -= nr_blocks;
+
+			if (!found_extent++)
+				si->curr_swap_extent = se;
+
+			start_block <<= PAGE_SHIFT - 9;
+			nr_blocks <<= PAGE_SHIFT - 9;
+			if (blkdev_issue_discard(si->bdev, start_block,
+						nr_blocks, GFP_NOIO))
+				break;
+		}
+
+		lh = se->list.next;
+		if (lh == &si->extent_list)
+			lh = lh->next;
+		se = list_entry(lh, struct swap_extent, list);
+	}
+}
+
+static int wait_for_discard(void *word)
+{
+	schedule();
+	return 0;
+}
+
 #define SWAPFILE_CLUSTER	256
 #define LATENCY_LIMIT		256
 
 static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 {
 	unsigned long offset;
-	unsigned long last_in_cluster;
+	unsigned long last_in_cluster = 0;
 	int latency_ration = LATENCY_LIMIT;
+	int found_free_cluster = 0;
 
 	/*
 	 * We try to cluster swap pages by allocating them sequentially
@@ -142,6 +190,19 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 			si->cluster_nr = SWAPFILE_CLUSTER - 1;
 			goto checks;
 		}
+		if (si->flags & SWP_DISCARDABLE) {
+			/*
+			 * Start range check on racing allocations, in case
+			 * they overlap the cluster we eventually decide on
+			 * (we scan without swap_lock to allow preemption).
+			 * It's hardly conceivable that cluster_nr could be
+			 * wrapped during our scan, but don't depend on it.
+			 */
+			if (si->lowest_alloc)
+				goto checks;
+			si->lowest_alloc = si->max;
+			si->highest_alloc = 0;
+		}
 		spin_unlock(&swap_lock);
 
 		offset = si->lowest_bit;
@@ -156,6 +217,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 				offset -= SWAPFILE_CLUSTER - 1;
 				si->cluster_next = offset;
 				si->cluster_nr = SWAPFILE_CLUSTER - 1;
+				found_free_cluster = 1;
 				goto checks;
 			}
 			if (unlikely(--latency_ration < 0)) {
@@ -167,6 +229,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 		offset = si->lowest_bit;
 		spin_lock(&swap_lock);
 		si->cluster_nr = SWAPFILE_CLUSTER - 1;
+		si->lowest_alloc = 0;
 	}
 
 checks:
@@ -191,6 +254,60 @@ checks:
 	si->swap_map[offset] = 1;
 	si->cluster_next = offset + 1;
 	si->flags -= SWP_SCANNING;
+
+	if (si->lowest_alloc) {
+		/*
+		 * Only set when SWP_DISCARDABLE, and there's a scan
+		 * for a free cluster in progress or just completed.
+		 */
+		if (found_free_cluster) {
+			/*
+			 * To optimize wear-levelling, discard the
+			 * old data of the cluster, taking care not to
+			 * discard any of its pages that have already
+			 * been allocated by racing tasks (offset has
+			 * already stepped over any at the beginning).
+			 */
+			if (offset < si->highest_alloc &&
+			    si->lowest_alloc <= last_in_cluster)
+				last_in_cluster = si->lowest_alloc - 1;
+			si->flags |= SWP_DISCARDING;
+			spin_unlock(&swap_lock);
+
+			if (offset < last_in_cluster)
+				discard_swap_cluster(si, offset,
+					last_in_cluster - offset + 1);
+
+			spin_lock(&swap_lock);
+			si->lowest_alloc = 0;
+			si->flags &= ~SWP_DISCARDING;
+
+			smp_mb();	/* wake_up_bit advises this */
+			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
+
+		} else if (si->flags & SWP_DISCARDING) {
+			/*
+			 * Delay using pages allocated by racing tasks
+			 * until the whole discard has been issued. We
+			 * could defer that delay until swap_writepage,
+			 * but it's easier to keep this self-contained.
+			 */
+			spin_unlock(&swap_lock);
+			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
+				wait_for_discard, TASK_UNINTERRUPTIBLE);
+			spin_lock(&swap_lock);
+		} else {
+			/*
+			 * Note pages allocated by racing tasks while
+			 * scan for a free cluster is in progress, so
+			 * that its final discard can exclude them.
+			 */
+			if (offset < si->lowest_alloc)
+				si->lowest_alloc = offset;
+			if (offset > si->highest_alloc)
+				si->highest_alloc = offset;
+		}
+	}
 	return offset;
 
 scan:
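The SWP_DISCARDING flag plus the wait_on_bit()/wake_up_bit() pair in the hunk above is what "hold up allocations made while we're discarding" amounts to: a racing task that grabbed a page inside the cluster being discarded sleeps until the discarder clears the bit and wakes it. A rough userspace analogue of that hand-off, sketched with a pthread condition variable instead of the kernel bit-wait API (the structure and names are illustrative assumptions, not the kernel code):

#include <pthread.h>
#include <stdbool.h>

struct swap_area {
        pthread_mutex_t lock;
        pthread_cond_t discard_done;
        bool discarding;                /* plays the role of SWP_DISCARDING */
};

static struct swap_area area = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .discard_done = PTHREAD_COND_INITIALIZER,
        .discarding = false,
};

/*
 * Discarder: set the flag, drop the lock while the (slow) discard is
 * issued, then clear the flag and wake every waiter -- the same order
 * as SWP_DISCARDING / wake_up_bit() in scan_swap_map().
 */
static void discard_cluster(struct swap_area *a)
{
        pthread_mutex_lock(&a->lock);
        a->discarding = true;
        pthread_mutex_unlock(&a->lock);

        /* ... issue the discard for the whole cluster here ... */

        pthread_mutex_lock(&a->lock);
        a->discarding = false;
        pthread_cond_broadcast(&a->discard_done);
        pthread_mutex_unlock(&a->lock);
}

/*
 * Racing allocator: before using a page taken from that cluster, wait
 * until any in-flight discard has finished (the wait_on_bit() side).
 */
static void wait_for_cluster(struct swap_area *a)
{
        pthread_mutex_lock(&a->lock);
        while (a->discarding)
                pthread_cond_wait(&a->discard_done, &a->lock);
        pthread_mutex_unlock(&a->lock);
}

int main(void)
{
        discard_cluster(&area);         /* discard completes, flag cleared */
        wait_for_cluster(&area);        /* returns at once: nothing in flight */
        return 0;
}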