Diffstat (limited to 'mm/swapfile.c')

 mm/swapfile.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 118 insertions(+), 1 deletion(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fbeb4bb8eb50..ca75b9e7c09f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -115,14 +115,62 @@ static int discard_swap(struct swap_info_struct *si)
 	return err;		/* That will often be -EOPNOTSUPP */
 }
 
+/*
+ * swap allocation tell device that a cluster of swap can now be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static void discard_swap_cluster(struct swap_info_struct *si,
+				 pgoff_t start_page, pgoff_t nr_pages)
+{
+	struct swap_extent *se = si->curr_swap_extent;
+	int found_extent = 0;
+
+	while (nr_pages) {
+		struct list_head *lh;
+
+		if (se->start_page <= start_page &&
+		    start_page < se->start_page + se->nr_pages) {
+			pgoff_t offset = start_page - se->start_page;
+			sector_t start_block = se->start_block + offset;
+			pgoff_t nr_blocks = se->nr_pages - offset;
+
+			if (nr_blocks > nr_pages)
+				nr_blocks = nr_pages;
+			start_page += nr_blocks;
+			nr_pages -= nr_blocks;
+
+			if (!found_extent++)
+				si->curr_swap_extent = se;
+
+			start_block <<= PAGE_SHIFT - 9;
+			nr_blocks <<= PAGE_SHIFT - 9;
+			if (blkdev_issue_discard(si->bdev, start_block,
+							nr_blocks, GFP_NOIO))
+				break;
+		}
+
+		lh = se->list.next;
+		if (lh == &si->extent_list)
+			lh = lh->next;
+		se = list_entry(lh, struct swap_extent, list);
+	}
+}
+
+static int wait_for_discard(void *word)
+{
+	schedule();
+	return 0;
+}
+
 #define SWAPFILE_CLUSTER	256
 #define LATENCY_LIMIT		256
 
 static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 {
 	unsigned long offset;
-	unsigned long last_in_cluster;
+	unsigned long last_in_cluster = 0;
 	int latency_ration = LATENCY_LIMIT;
+	int found_free_cluster = 0;
 
 	/*
 	 * We try to cluster swap pages by allocating them sequentially
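
In discard_swap_cluster() above, each swap extent records how many pages it covers and the page-size disk block where they start, so a run of swap page offsets is mapped extent by extent onto the device and then converted to 512-byte sectors: with 4 KiB pages, PAGE_SHIFT is 12, so the << (PAGE_SHIFT - 9) shifts multiply by 8. The user-space sketch below models only that arithmetic under those assumptions; "struct extent" and discard_run() are illustrative names, not kernel API, and a sorted array stands in for the kernel's circular extent list.

/*
 * Illustrative user-space model of the page-to-sector arithmetic in
 * discard_swap_cluster().  Assumes PAGE_SHIFT == 12 (4 KiB pages), so one
 * page-size block is 8 sectors of 512 bytes.  Not kernel code.
 */
#include <stdio.h>

#define PAGE_SHIFT	12

struct extent {
	unsigned long start_page;		/* first swap page offset covered */
	unsigned long nr_pages;			/* pages covered by this extent */
	unsigned long long start_block;		/* backing block, in page-size units */
};

static void discard_run(const struct extent *map, int nr_ext,
			unsigned long start_page, unsigned long nr_pages)
{
	int i;

	for (i = 0; i < nr_ext && nr_pages; i++) {
		const struct extent *se = &map[i];

		if (se->start_page <= start_page &&
		    start_page < se->start_page + se->nr_pages) {
			unsigned long offset = start_page - se->start_page;
			unsigned long long start_block = se->start_block + offset;
			unsigned long nr_blocks = se->nr_pages - offset;
			unsigned long long first, count;

			/* Clamp to this extent, then step past what it covered */
			if (nr_blocks > nr_pages)
				nr_blocks = nr_pages;
			start_page += nr_blocks;
			nr_pages -= nr_blocks;

			/* Same conversion as start_block <<= PAGE_SHIFT - 9 */
			first = start_block << (PAGE_SHIFT - 9);
			count = (unsigned long long)nr_blocks << (PAGE_SHIFT - 9);
			printf("discard sectors %llu..%llu (%lu pages)\n",
			       first, first + count - 1, nr_blocks);
		}
	}
}

int main(void)
{
	/* Two extents: swap pages 0..99 -> block 1000.., pages 100..199 -> 5000.. */
	struct extent map[] = {
		{ .start_page = 0,   .nr_pages = 100, .start_block = 1000 },
		{ .start_page = 100, .nr_pages = 100, .start_block = 5000 },
	};

	/* A 20-page run starting at swap offset 90 crosses the extent boundary */
	discard_run(map, 2, 90, 20);
	return 0;
}

Running this prints two sector ranges (8720..8799 and 40000..40079), one per extent, mirroring how the kernel issues one blkdev_issue_discard() per extent crossed.
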
@@ -142,6 +190,19 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 			si->cluster_nr = SWAPFILE_CLUSTER - 1;
 			goto checks;
 		}
+		if (si->flags & SWP_DISCARDABLE) {
+			/*
+			 * Start range check on racing allocations, in case
+			 * they overlap the cluster we eventually decide on
+			 * (we scan without swap_lock to allow preemption).
+			 * It's hardly conceivable that cluster_nr could be
+			 * wrapped during our scan, but don't depend on it.
+			 */
+			if (si->lowest_alloc)
+				goto checks;
+			si->lowest_alloc = si->max;
+			si->highest_alloc = 0;
+		}
 		spin_unlock(&swap_lock);
 
 		offset = si->lowest_bit;
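
The SWP_DISCARDABLE branch above primes an intentionally empty range: lowest_alloc starts at si->max and highest_alloc at 0, so the first racing allocation noted later (see the final hunk) collapses it into a valid single-entry range through plain min/max updates. A minimal sketch of that idiom follows; the struct and function names are illustrative only, not kernel code.

#include <stdio.h>

struct range {
	unsigned long lowest;	/* analogue of si->lowest_alloc */
	unsigned long highest;	/* analogue of si->highest_alloc */
};

/* Widen the tracked range to include one more allocated offset */
static void track_alloc(struct range *r, unsigned long offset)
{
	if (offset < r->lowest)
		r->lowest = offset;
	if (offset > r->highest)
		r->highest = offset;
}

int main(void)
{
	/* "Empty" range: lowest at the maximum ("si->max"), highest at zero */
	struct range r = { .lowest = 1UL << 20, .highest = 0 };

	track_alloc(&r, 300);	/* a racing task allocates offset 300 */
	track_alloc(&r, 270);	/* another racing allocation */

	/* The range now bounds every racing allocation: 270..300 */
	printf("racing allocations span %lu..%lu\n", r.lowest, r.highest);
	return 0;
}
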
@@ -156,6 +217,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 				offset -= SWAPFILE_CLUSTER - 1;
 				si->cluster_next = offset;
 				si->cluster_nr = SWAPFILE_CLUSTER - 1;
+				found_free_cluster = 1;
 				goto checks;
 			}
 			if (unlikely(--latency_ration < 0)) {
@@ -167,6 +229,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 		offset = si->lowest_bit;
 		spin_lock(&swap_lock);
 		si->cluster_nr = SWAPFILE_CLUSTER - 1;
+		si->lowest_alloc = 0;
 	}
 
 checks:
@@ -191,6 +254,60 @@ checks:
 	si->swap_map[offset] = 1;
 	si->cluster_next = offset + 1;
 	si->flags -= SWP_SCANNING;
+
+	if (si->lowest_alloc) {
+		/*
+		 * Only set when SWP_DISCARDABLE, and there's a scan
+		 * for a free cluster in progress or just completed.
+		 */
+		if (found_free_cluster) {
+			/*
+			 * To optimize wear-levelling, discard the
+			 * old data of the cluster, taking care not to
+			 * discard any of its pages that have already
+			 * been allocated by racing tasks (offset has
+			 * already stepped over any at the beginning).
+			 */
+			if (offset < si->highest_alloc &&
+			    si->lowest_alloc <= last_in_cluster)
+				last_in_cluster = si->lowest_alloc - 1;
+			si->flags |= SWP_DISCARDING;
+			spin_unlock(&swap_lock);
+
+			if (offset < last_in_cluster)
+				discard_swap_cluster(si, offset,
+					last_in_cluster - offset + 1);
+
+			spin_lock(&swap_lock);
+			si->lowest_alloc = 0;
+			si->flags &= ~SWP_DISCARDING;
+
+			smp_mb();	/* wake_up_bit advises this */
+			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
+
+		} else if (si->flags & SWP_DISCARDING) {
+			/*
+			 * Delay using pages allocated by racing tasks
+			 * until the whole discard has been issued. We
+			 * could defer that delay until swap_writepage,
+			 * but it's easier to keep this self-contained.
+			 */
+			spin_unlock(&swap_lock);
+			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
+					wait_for_discard, TASK_UNINTERRUPTIBLE);
+			spin_lock(&swap_lock);
+		} else {
+			/*
+			 * Note pages allocated by racing tasks while
+			 * scan for a free cluster is in progress, so
+			 * that its final discard can exclude them.
+			 */
+			if (offset < si->lowest_alloc)
+				si->lowest_alloc = offset;
+			if (offset > si->highest_alloc)
+				si->highest_alloc = offset;
+		}
+	}
 	return offset;
 
 scan:
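
The final hunk closes the race between the task issuing the discard and tasks that allocated pages from the same cluster while the scan ran without swap_lock: the discarder sets SWP_DISCARDING, drops the lock around discard_swap_cluster(), then clears the bit and calls wake_up_bit(), while racing allocators that see the bit sleep in wait_on_bit() until the whole discard has been issued. Below is a rough user-space analogue of that handshake; it assumes a pthread mutex and condition variable in place of swap_lock and the kernel's bit-wait API, and every name in it is illustrative.

/*
 * User-space model of the SWP_DISCARDING handshake.  A mutex/condvar pair
 * stands in for swap_lock plus wait_on_bit()/wake_up_bit().  Build with
 * -lpthread.  Not kernel code.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;	/* "swap_lock" */
static pthread_cond_t discard_done = PTHREAD_COND_INITIALIZER;
static int discarding;						/* "SWP_DISCARDING" */

/*
 * The task that found a free cluster: mark the discard in flight, drop the
 * lock while the (slow) discard is issued, then clear the flag and wake
 * anyone who allocated from that cluster in the meantime.
 */
static void *issue_discard(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	discarding = 1;
	pthread_mutex_unlock(&lock);

	usleep(100 * 1000);		/* stand-in for discard_swap_cluster() */

	pthread_mutex_lock(&lock);
	discarding = 0;
	pthread_cond_broadcast(&discard_done);	/* role of wake_up_bit() */
	pthread_mutex_unlock(&lock);
	return NULL;
}

/*
 * A racing allocator: before using its page it waits until the whole
 * discard has been issued, mirroring the wait_on_bit() branch.
 */
static void *racing_alloc(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (discarding)
		pthread_cond_wait(&discard_done, &lock);	/* role of wait_on_bit() */
	pthread_mutex_unlock(&lock);
	printf("allocation may proceed: discard fully issued\n");
	return NULL;
}

int main(void)
{
	pthread_t d, a;

	pthread_create(&d, NULL, issue_discard, NULL);
	usleep(10 * 1000);		/* let the discarder start first */
	pthread_create(&a, NULL, racing_alloc, NULL);
	pthread_join(d, NULL);
	pthread_join(a, NULL);
	return 0;
}
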