author		Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>	2007-10-16 04:24:52 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-16 12:42:54 -0400
commit		902aaed0d983dfd459fcb2b678608d4584782200 (patch)
tree		00b18597d87101ba1b37e5dcfde1d91312870160 /mm/swap.c
parent		754af6f5a85fcd1ecb456851d20c65e4c6ce10ab (diff)
mm: use pagevec to rotate reclaimable page
While running a memory-intensive load, system response deteriorated just
after swap-out started.
The cause of this problem is that when a PG_reclaim page is moved to the tail
of the inactive LRU list in rotate_reclaimable_page(), the lru_lock spinlock is
acquired on every page writeback completion. This degrades system performance
and lengthens interrupt hold-off time once swap-out has started.
The following patch solves this problem. I use a pagevec to batch the rotation
of reclaimable pages, which mitigates lru_lock contention and reduces interrupt
hold-off time.
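To illustrate the batching idea, the following is a minimal user-space sketch
rather than the kernel code: a pthread mutex stands in for zone->lru_lock, and
PVEC_SIZE, move_to_inactive_tail() and rotate_reclaimable_page_batched() are
illustrative names for PAGEVEC_SIZE, the list_move_tail() onto the inactive
list, and the per-writeback entry point. The only point is that the lock is
taken once per full batch instead of once per completed writeback.

#include <pthread.h>

#define PVEC_SIZE 14			/* stands in for PAGEVEC_SIZE */

struct page;				/* opaque in this sketch */

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

struct pagevec {
	unsigned int nr;
	struct page *pages[PVEC_SIZE];
};

/* Stand-in for list_move_tail(&page->lru, &zone->inactive_list). */
static void move_to_inactive_tail(struct page *page)
{
	(void)page;
}

/* Flush the whole batch under a single lock acquisition. */
static void pagevec_move_tail(struct pagevec *pvec)
{
	unsigned int i;

	pthread_mutex_lock(&lru_lock);
	for (i = 0; i < pvec->nr; i++)
		move_to_inactive_tail(pvec->pages[i]);
	pthread_mutex_unlock(&lru_lock);
	pvec->nr = 0;
}

/* Called once per page at end of writeback; usually just an array store. */
static void rotate_reclaimable_page_batched(struct pagevec *pvec,
					    struct page *page)
{
	pvec->pages[pvec->nr++] = page;
	if (pvec->nr == PVEC_SIZE)
		pagevec_move_tail(pvec);
}

Before the patch every rotated page corresponds to one lock/unlock pair; with
the batch it is roughly one pair per PVEC_SIZE pages, which is where the
reduced lru_lock contention and shorter interrupt hold-off come from.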
To measure responsiveness under memory-intensive load, I ran a test that
allocates and touches pages in multiple processes while flood-pinging the
test machine.
The test results are:
-2.6.23-rc5
--- testmachine ping statistics ---
3000 packets transmitted, 3000 received, 0% packet loss, time 53222ms
rtt min/avg/max/mdev = 0.074/0.652/172.228/7.176 ms, pipe 11, ipg/ewma 17.746/0.092 ms

-2.6.23-rc5-patched
--- testmachine ping statistics ---
3000 packets transmitted, 3000 received, 0% packet loss, time 51924ms
rtt min/avg/max/mdev = 0.072/0.108/3.884/0.114 ms, pipe 2, ipg/ewma 17.314/0.091 ms
The maximum round-trip time improved dramatically.
Test machine spec: 4 CPUs (3.16GHz, Hyper-Threading enabled), 8GB memory, 8GB swap.
I ran the ping test again to check for performance deterioration caused by
taking a reference on the page (to pin it while it sits in the per-CPU pagevec).
-2.6.23-rc6-with-modifiedpatch
--- testmachine ping statistics ---
3000 packets transmitted, 3000 received, 0% packet loss, time 53386ms
rtt min/avg/max/mdev = 0.074/0.110/4.716/0.147 ms, pipe 2, ipg/ewma 17.801/0.129 ms
The result for my original patch is as follows.
-2.6.23-rc5-with-originalpatch
--- testmachine ping statistics ---
3000 packets transmitted, 3000 received, 0% packet loss, time 51924ms
rtt min/avg/max/mdev = 0.072/0.108/3.884/0.114 ms, pipe 2, ipg/ewma 17.314/0.091 ms
The influence on response time was small.
[akpm@linux-foundation.org: fix uninitialised var warning]
[hugh@veritas.com: fix locking]
[randy.dunlap@oracle.com: fix function declaration]
[hugh@veritas.com: fix BUG at include/linux/mm.h:220!]
[hugh@veritas.com: kill redundancy in rotate_reclaimable_page]
[hugh@veritas.com: move_tail_pages into lru_add_drain]
Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swap.c')
-rw-r--r--   mm/swap.c | 104
1 files changed, 74 insertions, 30 deletions
@@ -32,6 +32,10 @@
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
+static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
+static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
+static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
+
 /*
  * This path almost never happens for VM activity - pages are normally
  * freed via pagevecs. But it gets used by networking.
@@ -92,23 +96,47 @@ void put_pages_list(struct list_head *pages)
 EXPORT_SYMBOL(put_pages_list);
 
 /*
+ * pagevec_move_tail() must be called with IRQ disabled.
+ * Otherwise this may cause nasty races.
+ */
+static void pagevec_move_tail(struct pagevec *pvec)
+{
+	int i;
+	int pgmoved = 0;
+	struct zone *zone = NULL;
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+		struct zone *pagezone = page_zone(page);
+
+		if (pagezone != zone) {
+			if (zone)
+				spin_unlock(&zone->lru_lock);
+			zone = pagezone;
+			spin_lock(&zone->lru_lock);
+		}
+		if (PageLRU(page) && !PageActive(page)) {
+			list_move_tail(&page->lru, &zone->inactive_list);
+			pgmoved++;
+		}
+	}
+	if (zone)
+		spin_unlock(&zone->lru_lock);
+	__count_vm_events(PGROTATED, pgmoved);
+	release_pages(pvec->pages, pvec->nr, pvec->cold);
+	pagevec_reinit(pvec);
+}
+
+/*
  * Writeback is about to end against a page which has been marked for immediate
  * reclaim. If it still appears to be reclaimable, move it to the tail of the
- * inactive list. The page still has PageWriteback set, which will pin it.
- *
- * We don't expect many pages to come through here, so don't bother batching
- * things up.
- *
- * To avoid placing the page at the tail of the LRU while PG_writeback is still
- * set, this function will clear PG_writeback before performing the page
- * motion. Do that inside the lru lock because once PG_writeback is cleared
- * we may not touch the page.
+ * inactive list.
  *
  * Returns zero if it cleared PG_writeback.
  */
 int rotate_reclaimable_page(struct page *page)
 {
-	struct zone *zone;
+	struct pagevec *pvec;
 	unsigned long flags;
 
 	if (PageLocked(page))
@@ -120,15 +148,16 @@ int rotate_reclaimable_page(struct page *page)
 	if (!PageLRU(page))
 		return 1;
 
-	zone = page_zone(page);
-	spin_lock_irqsave(&zone->lru_lock, flags);
-	if (PageLRU(page) && !PageActive(page)) {
-		list_move_tail(&page->lru, &zone->inactive_list);
-		__count_vm_event(PGROTATED);
-	}
+	page_cache_get(page);
+	local_irq_save(flags);
+	pvec = &__get_cpu_var(lru_rotate_pvecs);
+	if (!pagevec_add(pvec, page))
+		pagevec_move_tail(pvec);
+	local_irq_restore(flags);
+
 	if (!test_clear_page_writeback(page))
 		BUG();
-	spin_unlock_irqrestore(&zone->lru_lock, flags);
+
 	return 0;
 }
 
@@ -172,9 +201,6 @@ EXPORT_SYMBOL(mark_page_accessed);
  * lru_cache_add: add a page to the page lists
  * @page: the page to add
  */
-static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
-static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
-
 void fastcall lru_cache_add(struct page *page)
 {
 	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
@@ -195,21 +221,37 @@ void fastcall lru_cache_add_active(struct page *page)
 	put_cpu_var(lru_add_active_pvecs);
 }
 
-static void __lru_add_drain(int cpu)
+/*
+ * Drain pages out of the cpu's pagevecs.
+ * Either "cpu" is the current CPU, and preemption has already been
+ * disabled; or "cpu" is being hot-unplugged, and is already dead.
+ */
+static void drain_cpu_pagevecs(int cpu)
 {
-	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
+	struct pagevec *pvec;
 
-	/* CPU is dead, so no locking needed. */
+	pvec = &per_cpu(lru_add_pvecs, cpu);
 	if (pagevec_count(pvec))
 		__pagevec_lru_add(pvec);
+
 	pvec = &per_cpu(lru_add_active_pvecs, cpu);
 	if (pagevec_count(pvec))
 		__pagevec_lru_add_active(pvec);
+
+	pvec = &per_cpu(lru_rotate_pvecs, cpu);
+	if (pagevec_count(pvec)) {
+		unsigned long flags;
+
+		/* No harm done if a racing interrupt already did this */
+		local_irq_save(flags);
+		pagevec_move_tail(pvec);
+		local_irq_restore(flags);
+	}
 }
 
 void lru_add_drain(void)
 {
-	__lru_add_drain(get_cpu());
+	drain_cpu_pagevecs(get_cpu());
 	put_cpu();
 }
 
@@ -256,6 +298,7 @@ void release_pages(struct page **pages, int nr, int cold)
 	int i;
 	struct pagevec pages_to_free;
 	struct zone *zone = NULL;
+	unsigned long uninitialized_var(flags);
 
 	pagevec_init(&pages_to_free, cold);
 	for (i = 0; i < nr; i++) {
@@ -263,7 +306,7 @@ void release_pages(struct page **pages, int nr, int cold)
 
 		if (unlikely(PageCompound(page))) {
 			if (zone) {
-				spin_unlock_irq(&zone->lru_lock);
+				spin_unlock_irqrestore(&zone->lru_lock, flags);
 				zone = NULL;
 			}
 			put_compound_page(page);
@@ -277,9 +320,10 @@ void release_pages(struct page **pages, int nr, int cold)
 			struct zone *pagezone = page_zone(page);
 			if (pagezone != zone) {
 				if (zone)
-					spin_unlock_irq(&zone->lru_lock);
+					spin_unlock_irqrestore(&zone->lru_lock,
+									flags);
 				zone = pagezone;
-				spin_lock_irq(&zone->lru_lock);
+				spin_lock_irqsave(&zone->lru_lock, flags);
 			}
 			VM_BUG_ON(!PageLRU(page));
 			__ClearPageLRU(page);
@@ -288,7 +332,7 @@ void release_pages(struct page **pages, int nr, int cold)
 
 		if (!pagevec_add(&pages_to_free, page)) {
 			if (zone) {
-				spin_unlock_irq(&zone->lru_lock);
+				spin_unlock_irqrestore(&zone->lru_lock, flags);
 				zone = NULL;
 			}
 			__pagevec_free(&pages_to_free);
@@ -296,7 +340,7 @@ void release_pages(struct page **pages, int nr, int cold)
 		}
 	}
 	if (zone)
-		spin_unlock_irq(&zone->lru_lock);
+		spin_unlock_irqrestore(&zone->lru_lock, flags);
 
 	pagevec_free(&pages_to_free);
 }
@@ -489,7 +533,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
 		atomic_add(*committed, &vm_committed_space);
 		*committed = 0;
-		__lru_add_drain((long)hcpu);
+		drain_cpu_pagevecs((long)hcpu);
 	}
 	return NOTIFY_OK;
 }