Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 234
 1 file changed, 190 insertions(+), 44 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8deb9d0fd5b1..bde9ea1ab26f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -162,6 +162,53 @@ static unsigned long __meminitdata dma_reserve;
 EXPORT_SYMBOL(movable_zone);
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
+#ifdef CONFIG_PREEMPT_RT
+static DEFINE_PER_CPU_LOCKED(int, pcp_locks);
+#endif
+
+static inline void __lock_cpu_pcp(unsigned long *flags, int cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+        spin_lock(&__get_cpu_lock(pcp_locks, cpu));
+        *flags = 0;
+#else
+        local_irq_save(*flags);
+#endif
+}
+
+static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+        (void)get_cpu_var_locked(pcp_locks, this_cpu);
+        *flags = 0;
+#else
+        local_irq_save(*flags);
+        *this_cpu = smp_processor_id();
+#endif
+}
+
+static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+        put_cpu_var_locked(pcp_locks, this_cpu);
+#else
+        local_irq_restore(flags);
+#endif
+}
+
+static struct per_cpu_pageset *
+get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu)
+{
+        lock_cpu_pcp(flags, this_cpu);
+        return zone_pcp(zone, *this_cpu);
+}
+
+static void
+put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu)
+{
+        unlock_cpu_pcp(flags, this_cpu);
+}
+
 #if MAX_NUMNODES > 1
 int nr_node_ids __read_mostly = MAX_NUMNODES;
 int nr_online_nodes __read_mostly = 1;
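The block added above is the core of the RT conversion: on !PREEMPT_RT the helpers collapse to the familiar local_irq_save()/local_irq_restore() pairs, while on PREEMPT_RT they take a per-CPU sleeping lock (pcp_locks) instead of disabling interrupts. A minimal usage sketch, using the patch's own helpers but a made-up caller name (illustrative only, not part of the patch):

static void example_touch_pcp(struct zone *zone)
{
        struct per_cpu_pageset *pset;
        unsigned long flags;
        int this_cpu;

        /*
         * !PREEMPT_RT: interrupts off, this_cpu = smp_processor_id().
         * PREEMPT_RT:  the per-CPU pcp_locks lock is held (may sleep) and
         *              this_cpu is pinned to the locked slot.
         */
        pset = get_zone_pcp(zone, &flags, &this_cpu);

        /* ... operate on pset->pcp under the protection above ... */

        put_zone_pcp(zone, flags, this_cpu);
}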
@@ -524,16 +571,48 @@ static inline int free_pages_check(struct page *page)
  * pinned" detection logic.
  */
 static void free_pcppages_bulk(struct zone *zone, int count,
                                        struct per_cpu_pages *pcp)
 {
        int migratetype = 0;
-       int batch_free = 0;
+       unsigned long flags;
 
-       spin_lock(&zone->lock);
+       spin_lock_irqsave(&zone->lock, flags);
        zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
        zone->pages_scanned = 0;
 
        __mod_zone_page_state(zone, NR_FREE_PAGES, count);
+
+       for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) {
+               struct list_head *list = &pcp->lists[migratetype];
+
+               while (!list_empty(list)) {
+                       struct page *page;
+
+                       page = list_first_entry(list, struct page, lru);
+                       /* must delete as __free_one_page list manipulates */
+                       list_del(&page->lru);
+                       /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+                       __free_one_page(page, zone, 0, page_private(page));
+                       trace_mm_page_pcpu_drain(page, 0, page_private(page));
+#ifdef CONFIG_PREEMPT_RT
+                       cond_resched_lock(&zone->lock);
+#endif
+                       count--;
+               }
+       }
+       WARN_ON(count != 0);
+       spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
+                             struct per_cpu_pages *dst)
+{
+       int migratetype, batch_free = 0;
+
+       for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
+               INIT_LIST_HEAD(&dst->lists[migratetype]);
+       migratetype = 0;
+
        while (count) {
                struct page *page;
                struct list_head *list;
@@ -549,38 +628,36 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                        batch_free++;
                        if (++migratetype == MIGRATE_PCPTYPES)
                                migratetype = 0;
-                       list = &pcp->lists[migratetype];
+                       list = &src->lists[migratetype];
                } while (list_empty(list));
 
                do {
-                       page = list_entry(list->prev, struct page, lru);
+                       page = list_last_entry(list, struct page, lru);
                        /* must delete as __free_one_page list manipulates */
                        list_del(&page->lru);
-                       /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
-                       __free_one_page(page, zone, 0, page_private(page));
-                       trace_mm_page_pcpu_drain(page, 0, page_private(page));
+                       list_add(&page->lru, &dst->lists[migratetype]);
                } while (--count && --batch_free && !list_empty(list));
        }
-       spin_unlock(&zone->lock);
 }
 
 static void free_one_page(struct zone *zone, struct page *page, int order,
                                int migratetype)
 {
-       spin_lock(&zone->lock);
+       unsigned long flags;
+
+       spin_lock_irqsave(&zone->lock, flags);
        zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
        zone->pages_scanned = 0;
 
        __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
        __free_one_page(page, zone, order, migratetype);
-       spin_unlock(&zone->lock);
+       spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
        unsigned long flags;
-       int i;
-       int bad = 0;
+       int i, this_cpu, bad = 0;
        int wasMlocked = __TestClearPageMlocked(page);
 
        kmemcheck_free_shadow(page, order);
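Taken together, the two hunks above split the old bulk-free path in two: isolate_pcp_pages() detaches pages from the per-CPU lists onto a caller-provided per_cpu_pages while only the pcp protection is held, and free_pcppages_bulk() then returns them to the buddy lists under zone->lock alone, with interrupts enabled and, on RT, with cond_resched_lock() breaking up long drains. A sketch of the resulting calling pattern, mirroring drain_zone_pages() later in the diff (the wrapper name is hypothetical; the callees are the patch's own):

static void example_drain_batch(struct zone *zone, struct per_cpu_pages *pcp,
                                int batch)
{
        struct per_cpu_pages dst;
        unsigned long flags;
        int this_cpu;

        /* Phase 1: detach pages, holding only the per-CPU pcp protection. */
        lock_cpu_pcp(&flags, &this_cpu);
        isolate_pcp_pages(batch, pcp, &dst);
        pcp->count -= batch;
        unlock_cpu_pcp(flags, this_cpu);

        /* Phase 2: hand them back to the buddy allocator under zone->lock. */
        free_pcppages_bulk(zone, batch, &dst);
}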
@@ -598,13 +675,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
        arch_free_page(page, order);
        kernel_map_pages(page, 1 << order, 0);
 
-       local_irq_save(flags);
+       lock_cpu_pcp(&flags, &this_cpu);
        if (unlikely(wasMlocked))
                free_page_mlock(page);
-       __count_vm_events(PGFREE, 1 << order);
+       count_vm_events(PGFREE, 1 << order);
+       unlock_cpu_pcp(flags, this_cpu);
        free_one_page(page_zone(page), page, order,
                                        get_pageblock_migratetype(page));
-       local_irq_restore(flags);
 }
 
 /*
@@ -979,17 +1056,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  */
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
+       struct per_cpu_pages dst;
        unsigned long flags;
-       int to_drain;
+       int to_drain, this_cpu;
 
-       local_irq_save(flags);
+       lock_cpu_pcp(&flags, &this_cpu);
        if (pcp->count >= pcp->batch)
                to_drain = pcp->batch;
        else
                to_drain = pcp->count;
-       free_pcppages_bulk(zone, to_drain, pcp);
+       isolate_pcp_pages(to_drain, pcp, &dst);
        pcp->count -= to_drain;
-       local_irq_restore(flags);
+       unlock_cpu_pcp(flags, this_cpu);
+       free_pcppages_bulk(zone, to_drain, &dst);
 }
 #endif
 
@@ -1007,15 +1086,23 @@ static void drain_pages(unsigned int cpu)
 
        for_each_populated_zone(zone) {
                struct per_cpu_pageset *pset;
-               struct per_cpu_pages *pcp;
+               struct per_cpu_pages *pcp, dst;
+               int count;
 
+               __lock_cpu_pcp(&flags, cpu);
                pset = zone_pcp(zone, cpu);
+               if (!pset) {
+                       unlock_cpu_pcp(flags, cpu);
+                       WARN_ON(1);
+                       continue;
+               }
 
                pcp = &pset->pcp;
-               local_irq_save(flags);
-               free_pcppages_bulk(zone, pcp->count, pcp);
+               isolate_pcp_pages(pcp->count, pcp, &dst);
+               count = pcp->count;
                pcp->count = 0;
-               local_irq_restore(flags);
+               unlock_cpu_pcp(flags, cpu);
+               free_pcppages_bulk(zone, count, &dst);
        }
 }
 
@@ -1027,12 +1114,52 @@ void drain_local_pages(void *arg)
        drain_pages(smp_processor_id());
 }
 
+#ifdef CONFIG_PREEMPT_RT
+static void drain_local_pages_work(struct work_struct *wrk)
+{
+       drain_pages(smp_processor_id());
+}
+#endif
+
 /*
  * Spill all the per-cpu pages from all CPUs back into the buddy allocator
  */
 void drain_all_pages(void)
 {
+#ifdef CONFIG_PREEMPT_RT
+       /*
+        * HACK!!!!!
+        * For RT we can't use IPIs to run drain_local_pages, since
+        * that code takes spinlocks which now sleep.
+        * But schedule_on_each_cpu will call kzalloc, which will
+        * call the page allocator, which is what called us here.
+        *
+        * Luckily, there's a condition to get here, and that is if
+        * the order passed in to alloc_pages is greater than 0
+        * (the allocation is larger than one page).  The slabs only
+        * allocate what is needed, and the allocation made by
+        * schedule_on_each_cpu is "sizeof(void *) * nr_cpu_ids" bytes.
+        *
+        * So we can safely call schedule_on_each_cpu if that number
+        * is less than a page.  Otherwise don't bother, but at least
+        * warn about the problem.
+        *
+        * And yes, this is one big hack.  Please fix ;-)
+        */
+       if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE)
+               schedule_on_each_cpu(drain_local_pages_work);
+       else {
+               static int once;
+               if (!once) {
+                       printk(KERN_ERR "Can't drain all CPUs due to possible recursion\n");
+                       once = 1;
+               }
+               drain_local_pages(NULL);
+       }
+
+#else
        on_each_cpu(drain_local_pages, NULL, 1);
+#endif
 }
 
 #ifdef CONFIG_HIBERNATION
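A worked example of the bound in the comment above (illustrative numbers, not part of the patch): with 8-byte pointers and 4 KiB pages,

        sizeof(void *) * nr_cpu_ids < PAGE_SIZE
        8 * nr_cpu_ids < 4096   =>   nr_cpu_ids <= 511

so the "sizeof(void *) * nr_cpu_ids" buffer that schedule_on_each_cpu() allocates stays below one page, i.e. it remains an order-0 allocation and cannot re-enter drain_all_pages(), which (per the comment) is only reached from order > 0 allocations.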
@@ -1077,9 +1204,10 @@ void mark_free_pages(struct zone *zone)
 static void free_hot_cold_page(struct page *page, int cold)
 {
        struct zone *zone = page_zone(page);
+       struct per_cpu_pageset *pset;
        struct per_cpu_pages *pcp;
        unsigned long flags;
-       int migratetype;
+       int migratetype, this_cpu, count;
        int wasMlocked = __TestClearPageMlocked(page);
 
        kmemcheck_free_shadow(page, 0);
@@ -1096,13 +1224,13 @@ static void free_hot_cold_page(struct page *page, int cold)
        arch_free_page(page, 0);
        kernel_map_pages(page, 1, 0);
 
-       pcp = &zone_pcp(zone, get_cpu())->pcp;
+       pset = get_zone_pcp(zone, &flags, &this_cpu);
+       pcp = &pset->pcp;
        migratetype = get_pageblock_migratetype(page);
        set_page_private(page, migratetype);
-       local_irq_save(flags);
        if (unlikely(wasMlocked))
                free_page_mlock(page);
-       __count_vm_event(PGFREE);
+       count_vm_event(PGFREE);
 
        /*
         * We only track unmovable, reclaimable and movable on pcp lists.
@@ -1125,13 +1253,17 @@ static void free_hot_cold_page(struct page *page, int cold)
        list_add(&page->lru, &pcp->lists[migratetype]);
        pcp->count++;
        if (pcp->count >= pcp->high) {
-               free_pcppages_bulk(zone, pcp->batch, pcp);
+               struct per_cpu_pages dst;
+
+               isolate_pcp_pages(pcp->batch, pcp, &dst);
                pcp->count -= pcp->batch;
+               count = pcp->batch;
+               put_zone_pcp(zone, flags, this_cpu);
+               free_pcppages_bulk(zone, count, &dst);
+               return;
        }
-
 out:
-       local_irq_restore(flags);
-       put_cpu();
+       put_zone_pcp(zone, flags, this_cpu);
 }
 
 void free_hot_page(struct page *page)
@@ -1181,17 +1313,17 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
        unsigned long flags;
        struct page *page;
        int cold = !!(gfp_flags & __GFP_COLD);
-       int cpu;
+       struct per_cpu_pageset *pset;
+       int this_cpu;
 
 again:
-       cpu = get_cpu();
+       pset = get_zone_pcp(zone, &flags, &this_cpu);
+
        if (likely(order == 0)) {
-               struct per_cpu_pages *pcp;
+               struct per_cpu_pages *pcp = &pset->pcp;
                struct list_head *list;
 
-               pcp = &zone_pcp(zone, cpu)->pcp;
                list = &pcp->lists[migratetype];
-               local_irq_save(flags);
                if (list_empty(list)) {
                        pcp->count += rmqueue_bulk(zone, 0,
                                        pcp->batch, list,
@@ -1221,7 +1353,7 @@ again:
                 */
                WARN_ON_ONCE(order > 1);
        }
-       spin_lock_irqsave(&zone->lock, flags);
+       spin_lock(&zone->lock);
        page = __rmqueue(zone, order, migratetype);
        spin_unlock(&zone->lock);
        if (!page)
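The spin_lock_irqsave() -> spin_lock() change just above works because buffered_rmqueue() now enters through get_zone_pcp(): on !PREEMPT_RT that helper has already disabled interrupts, and on PREEMPT_RT zone->lock is a sleeping lock under the RT spinlock substitution, so saving the IRQ flags again buys nothing. A rough picture of the nesting after this change (illustrative sketch, not patch text):

        /*
         * get_zone_pcp()            IRQs off (!RT) or per-CPU pcp lock held (RT)
         *     spin_lock(&zone->lock);
         *     page = __rmqueue(zone, order, migratetype);
         *     spin_unlock(&zone->lock);
         * put_zone_pcp()
         */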
@@ -1231,8 +1363,7 @@ again:
 
        __count_zone_vm_events(PGALLOC, zone, 1 << order);
        zone_statistics(preferred_zone, zone);
-       local_irq_restore(flags);
-       put_cpu();
+       put_zone_pcp(zone, flags, this_cpu);
 
        VM_BUG_ON(bad_range(zone, page));
        if (prep_new_page(page, order, gfp_flags))
@@ -1240,8 +1371,7 @@ again:
                return page;
 
 failed:
-       local_irq_restore(flags);
-       put_cpu();
+       put_zone_pcp(zone, flags, this_cpu);
        return NULL;
 }
 
@@ -3159,7 +3289,23 @@ static inline void free_zone_pagesets(int cpu)
        struct zone *zone;
 
        for_each_zone(zone) {
-               struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+               unsigned long flags;
+               struct per_cpu_pageset *pset;
+
+               /*
+                * On PREEMPT_RT the allocator is preemptible, therefore
+                * kstopmachine can preempt a process in the middle of an
+                * allocation; freeing the pset underneath such a process
+                * isn't a good idea.
+                *
+                * Take the per-cpu pcp lock to allow the task to complete
+                * before we free it.  New tasks will be held off by the
+                * cpu_online() check in get_cpu_var_locked().
+                */
+               __lock_cpu_pcp(&flags, cpu);
+               pset = zone_pcp(zone, cpu);
+               zone_pcp(zone, cpu) = NULL;
+               unlock_cpu_pcp(flags, cpu);
 
                /* Free per_cpu_pageset if it is slab allocated */
                if (pset != &boot_pageset[cpu])
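The ordering established above is what makes pageset teardown safe on PREEMPT_RT: the pageset is detached only after any allocator task that was preempted inside its pcp section has released the per-CPU lock, and it is freed only after that. A sketch of the resulting sequence (illustrative; the kfree() tail is the surrounding function's existing behaviour, shown only for context):

        __lock_cpu_pcp(&flags, cpu);    /* wait for a preempted pcp user (RT)  */
        pset = zone_pcp(zone, cpu);
        zone_pcp(zone, cpu) = NULL;     /* later lookups see no pageset        */
        unlock_cpu_pcp(flags, cpu);

        /* Free per_cpu_pageset if it is slab allocated */
        if (pset != &boot_pageset[cpu])
                kfree(pset);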