aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMel Gorman <mel@csn.ul.ie>2009-09-21 20:03:19 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-09-22 10:17:39 -0400
commit5f8dcc21211a3d4e3a7a5ca366b469fb88117f61 (patch)
tree4bbb1b55c7787462fe313c7c003e77823c032422
parent5d863b89688e5811cd9e5bd0082cb38abe03adf3 (diff)
page-allocator: split per-cpu list into one-list-per-migrate-type
The following two patches remove searching in the page allocator fast-path by maintaining multiple free-lists in the per-cpu structure. At the time the search was introduced, increasing the size of the per-cpu structures would waste a lot of memory as per-cpu structures were statically allocated at compile-time. This is no longer the case. The patches are as follows. They are based on mmotm-2009-08-27. Patch 1 adds multiple lists to struct per_cpu_pages, one per migratetype that can be stored on the PCP lists. Patch 2 notes that the pcpu drain path checks empty lists multiple times. The patch reduces the number of checks by maintaining a count of free lists encountered. Lists containing pages will then free multiple pages in batch. The patches were tested with kernbench, netperf udp/tcp, hackbench and sysbench. The netperf tests were not bound to any CPU in particular and were run such that there is 99% confidence that the reported results are within 1% of the estimated mean. sysbench was run with a postgres background and read-only tests. Similar to netperf, it was run multiple times so that there is 99% confidence that its results are within 1%.
The patches were tested on x86, x86-64 and ppc64 as follows. x86: Intel Pentium D 3GHz with 8G RAM (no-brand machine). kernbench - No significant difference, variance well within noise; netperf-udp - 1.34% to 2.28% gain; netperf-tcp - 0.45% to 1.22% gain; hackbench - Small variances, very close to noise; sysbench - Very small gains. x86-64: AMD Phenom 9950 1.3GHz with 8G RAM (no-brand machine). kernbench - No significant difference, variance well within noise; netperf-udp - 1.83% to 10.42% gains; netperf-tcp - Not conclusive until buffer >= PAGE_SIZE (4096: +15.83%; 8192: +0.34%, not significant; 16384: +1%); hackbench - Small gains, very close to noise; sysbench - 0.79% to 1.6% gain. ppc64: PPC970MP 2.5GHz with 10GB RAM (it's a terrasoft powerstation). kernbench - No significant difference, variance well within noise; netperf-udp - 2-3% gain for almost all buffer sizes tested; netperf-tcp - losses on small buffers, gains on larger buffers, possibly indicating some bad caching effect; hackbench - No significant difference; sysbench - 2-4% gain. This patch: Currently the per-cpu page allocator searches the PCP list for pages of the correct migrate-type to reduce the possibility of pages being inappropriately placed from a fragmentation perspective. This search is potentially expensive in a fast-path and undesirable. Splitting the per-cpu list into multiple lists increases the size of a per-cpu structure and this was potentially a major problem at the time the search was introduced. This problem has been mitigated as now only the necessary number of structures is allocated for the running system. This patch replaces a list search in the per-cpu allocator with one list per migrate type. The potential snag with this approach is when bulk freeing pages. We free pages round-robin by migrate type, which has little bearing on the cache hotness of the page and potentially checks empty lists repeatedly in the event that the majority of PCP pages are of one type.
Signed-off-by: Mel Gorman <mel@csn.ul.ie> Acked-by: Nick Piggin <npiggin@suse.de> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/mmzone.h5
-rw-r--r--mm/page_alloc.c106
2 files changed, 63 insertions, 48 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c188ea624c74..652ef01be582 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -38,6 +38,7 @@
38#define MIGRATE_UNMOVABLE 0 38#define MIGRATE_UNMOVABLE 0
39#define MIGRATE_RECLAIMABLE 1 39#define MIGRATE_RECLAIMABLE 1
40#define MIGRATE_MOVABLE 2 40#define MIGRATE_MOVABLE 2
41#define MIGRATE_PCPTYPES 3 /* the number of types on the pcp lists */
41#define MIGRATE_RESERVE 3 42#define MIGRATE_RESERVE 3
42#define MIGRATE_ISOLATE 4 /* can't allocate from here */ 43#define MIGRATE_ISOLATE 4 /* can't allocate from here */
43#define MIGRATE_TYPES 5 44#define MIGRATE_TYPES 5
@@ -169,7 +170,9 @@ struct per_cpu_pages {
169 int count; /* number of pages in the list */ 170 int count; /* number of pages in the list */
170 int high; /* high watermark, emptying needed */ 171 int high; /* high watermark, emptying needed */
171 int batch; /* chunk size for buddy add/remove */ 172 int batch; /* chunk size for buddy add/remove */
172 struct list_head list; /* the list of pages */ 173
174 /* Lists of pages, one per migrate type stored on the pcp-lists */
175 struct list_head lists[MIGRATE_PCPTYPES];
173}; 176};
174 177
175struct per_cpu_pageset { 178struct per_cpu_pageset {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 84d9da1e8f4c..1b1c39e6a9b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -511,7 +511,7 @@ static inline int free_pages_check(struct page *page)
511} 511}
512 512
513/* 513/*
514 * Frees a list of pages. 514 * Frees a number of pages from the PCP lists
515 * Assumes all pages on list are in same zone, and of same order. 515 * Assumes all pages on list are in same zone, and of same order.
516 * count is the number of pages to free. 516 * count is the number of pages to free.
517 * 517 *
@@ -521,23 +521,36 @@ static inline int free_pages_check(struct page *page)
521 * And clear the zone's pages_scanned counter, to hold off the "all pages are 521 * And clear the zone's pages_scanned counter, to hold off the "all pages are
522 * pinned" detection logic. 522 * pinned" detection logic.
523 */ 523 */
524static void free_pages_bulk(struct zone *zone, int count, 524static void free_pcppages_bulk(struct zone *zone, int count,
525 struct list_head *list, int order) 525 struct per_cpu_pages *pcp)
526{ 526{
527 int migratetype = 0;
528
527 spin_lock(&zone->lock); 529 spin_lock(&zone->lock);
528 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 530 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
529 zone->pages_scanned = 0; 531 zone->pages_scanned = 0;
530 532
531 __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); 533 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
532 while (count--) { 534 while (count--) {
533 struct page *page; 535 struct page *page;
536 struct list_head *list;
537
538 /*
539 * Remove pages from lists in a round-robin fashion. This spinning
540 * around potentially empty lists is bloody awful, alternatives that
541 * don't suck are welcome
542 */
543 do {
544 if (++migratetype == MIGRATE_PCPTYPES)
545 migratetype = 0;
546 list = &pcp->lists[migratetype];
547 } while (list_empty(list));
534 548
535 VM_BUG_ON(list_empty(list));
536 page = list_entry(list->prev, struct page, lru); 549 page = list_entry(list->prev, struct page, lru);
537 /* have to delete it as __free_one_page list manipulates */ 550 /* have to delete it as __free_one_page list manipulates */
538 list_del(&page->lru); 551 list_del(&page->lru);
539 trace_mm_page_pcpu_drain(page, order, page_private(page)); 552 trace_mm_page_pcpu_drain(page, 0, migratetype);
540 __free_one_page(page, zone, order, page_private(page)); 553 __free_one_page(page, zone, 0, migratetype);
541 } 554 }
542 spin_unlock(&zone->lock); 555 spin_unlock(&zone->lock);
543} 556}
@@ -953,7 +966,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
953 to_drain = pcp->batch; 966 to_drain = pcp->batch;
954 else 967 else
955 to_drain = pcp->count; 968 to_drain = pcp->count;
956 free_pages_bulk(zone, to_drain, &pcp->list, 0); 969 free_pcppages_bulk(zone, to_drain, pcp);
957 pcp->count -= to_drain; 970 pcp->count -= to_drain;
958 local_irq_restore(flags); 971 local_irq_restore(flags);
959} 972}
@@ -979,7 +992,7 @@ static void drain_pages(unsigned int cpu)
979 992
980 pcp = &pset->pcp; 993 pcp = &pset->pcp;
981 local_irq_save(flags); 994 local_irq_save(flags);
982 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 995 free_pcppages_bulk(zone, pcp->count, pcp);
983 pcp->count = 0; 996 pcp->count = 0;
984 local_irq_restore(flags); 997 local_irq_restore(flags);
985 } 998 }
@@ -1045,6 +1058,7 @@ static void free_hot_cold_page(struct page *page, int cold)
1045 struct zone *zone = page_zone(page); 1058 struct zone *zone = page_zone(page);
1046 struct per_cpu_pages *pcp; 1059 struct per_cpu_pages *pcp;
1047 unsigned long flags; 1060 unsigned long flags;
1061 int migratetype;
1048 int wasMlocked = __TestClearPageMlocked(page); 1062 int wasMlocked = __TestClearPageMlocked(page);
1049 1063
1050 kmemcheck_free_shadow(page, 0); 1064 kmemcheck_free_shadow(page, 0);
@@ -1062,21 +1076,39 @@ static void free_hot_cold_page(struct page *page, int cold)
1062 kernel_map_pages(page, 1, 0); 1076 kernel_map_pages(page, 1, 0);
1063 1077
1064 pcp = &zone_pcp(zone, get_cpu())->pcp; 1078 pcp = &zone_pcp(zone, get_cpu())->pcp;
1065 set_page_private(page, get_pageblock_migratetype(page)); 1079 migratetype = get_pageblock_migratetype(page);
1080 set_page_private(page, migratetype);
1066 local_irq_save(flags); 1081 local_irq_save(flags);
1067 if (unlikely(wasMlocked)) 1082 if (unlikely(wasMlocked))
1068 free_page_mlock(page); 1083 free_page_mlock(page);
1069 __count_vm_event(PGFREE); 1084 __count_vm_event(PGFREE);
1070 1085
1086 /*
1087 * We only track unmovable, reclaimable and movable on pcp lists.
1088 * Free ISOLATE pages back to the allocator because they are being
1089 * offlined but treat RESERVE as movable pages so we can get those
1090 * areas back if necessary. Otherwise, we may have to free
1091 * excessively into the page allocator
1092 */
1093 if (migratetype >= MIGRATE_PCPTYPES) {
1094 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1095 free_one_page(zone, page, 0, migratetype);
1096 goto out;
1097 }
1098 migratetype = MIGRATE_MOVABLE;
1099 }
1100
1071 if (cold) 1101 if (cold)
1072 list_add_tail(&page->lru, &pcp->list); 1102 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1073 else 1103 else
1074 list_add(&page->lru, &pcp->list); 1104 list_add(&page->lru, &pcp->lists[migratetype]);
1075 pcp->count++; 1105 pcp->count++;
1076 if (pcp->count >= pcp->high) { 1106 if (pcp->count >= pcp->high) {
1077 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1107 free_pcppages_bulk(zone, pcp->batch, pcp);
1078 pcp->count -= pcp->batch; 1108 pcp->count -= pcp->batch;
1079 } 1109 }
1110
1111out:
1080 local_irq_restore(flags); 1112 local_irq_restore(flags);
1081 put_cpu(); 1113 put_cpu();
1082} 1114}
@@ -1134,46 +1166,24 @@ again:
1134 cpu = get_cpu(); 1166 cpu = get_cpu();
1135 if (likely(order == 0)) { 1167 if (likely(order == 0)) {
1136 struct per_cpu_pages *pcp; 1168 struct per_cpu_pages *pcp;
1169 struct list_head *list;
1137 1170
1138 pcp = &zone_pcp(zone, cpu)->pcp; 1171 pcp = &zone_pcp(zone, cpu)->pcp;
1172 list = &pcp->lists[migratetype];
1139 local_irq_save(flags); 1173 local_irq_save(flags);
1140 if (!pcp->count) { 1174 if (list_empty(list)) {
1141 pcp->count = rmqueue_bulk(zone, 0,
1142 pcp->batch, &pcp->list,
1143 migratetype, cold);
1144 if (unlikely(!pcp->count))
1145 goto failed;
1146 }
1147
1148 /* Find a page of the appropriate migrate type */
1149 if (cold) {
1150 list_for_each_entry_reverse(page, &pcp->list, lru)
1151 if (page_private(page) == migratetype)
1152 break;
1153 } else {
1154 list_for_each_entry(page, &pcp->list, lru)
1155 if (page_private(page) == migratetype)
1156 break;
1157 }
1158
1159 /* Allocate more to the pcp list if necessary */
1160 if (unlikely(&page->lru == &pcp->list)) {
1161 int get_one_page = 0;
1162
1163 pcp->count += rmqueue_bulk(zone, 0, 1175 pcp->count += rmqueue_bulk(zone, 0,
1164 pcp->batch, &pcp->list, 1176 pcp->batch, list,
1165 migratetype, cold); 1177 migratetype, cold);
1166 list_for_each_entry(page, &pcp->list, lru) { 1178 if (unlikely(list_empty(list)))
1167 if (get_pageblock_migratetype(page) !=
1168 MIGRATE_ISOLATE) {
1169 get_one_page = 1;
1170 break;
1171 }
1172 }
1173 if (!get_one_page)
1174 goto failed; 1179 goto failed;
1175 } 1180 }
1176 1181
1182 if (cold)
1183 page = list_entry(list->prev, struct page, lru);
1184 else
1185 page = list_entry(list->next, struct page, lru);
1186
1177 list_del(&page->lru); 1187 list_del(&page->lru);
1178 pcp->count--; 1188 pcp->count--;
1179 } else { 1189 } else {
@@ -3024,6 +3034,7 @@ static int zone_batchsize(struct zone *zone)
3024static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 3034static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
3025{ 3035{
3026 struct per_cpu_pages *pcp; 3036 struct per_cpu_pages *pcp;
3037 int migratetype;
3027 3038
3028 memset(p, 0, sizeof(*p)); 3039 memset(p, 0, sizeof(*p));
3029 3040
@@ -3031,7 +3042,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
3031 pcp->count = 0; 3042 pcp->count = 0;
3032 pcp->high = 6 * batch; 3043 pcp->high = 6 * batch;
3033 pcp->batch = max(1UL, 1 * batch); 3044 pcp->batch = max(1UL, 1 * batch);
3034 INIT_LIST_HEAD(&pcp->list); 3045 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3046 INIT_LIST_HEAD(&pcp->lists[migratetype]);
3035} 3047}
3036 3048
3037/* 3049/*
@@ -3223,7 +3235,7 @@ static int __zone_pcp_update(void *data)
3223 pcp = &pset->pcp; 3235 pcp = &pset->pcp;
3224 3236
3225 local_irq_save(flags); 3237 local_irq_save(flags);
3226 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 3238 free_pcppages_bulk(zone, pcp->count, pcp);
3227 setup_pageset(pset, batch); 3239 setup_pageset(pset, batch);
3228 local_irq_restore(flags); 3240 local_irq_restore(flags);
3229 } 3241 }