Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 234 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 190 insertions(+), 44 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8deb9d0fd5b1..bde9ea1ab26f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -162,6 +162,53 @@ static unsigned long __meminitdata dma_reserve;
 EXPORT_SYMBOL(movable_zone);
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
+#ifdef CONFIG_PREEMPT_RT
+static DEFINE_PER_CPU_LOCKED(int, pcp_locks);
+#endif
+
+static inline void __lock_cpu_pcp(unsigned long *flags, int cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+	spin_lock(&__get_cpu_lock(pcp_locks, cpu));
+	*flags = 0;
+#else
+	local_irq_save(*flags);
+#endif
+}
+
+static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+	(void)get_cpu_var_locked(pcp_locks, this_cpu);
+	*flags = 0;
+#else
+	local_irq_save(*flags);
+	*this_cpu = smp_processor_id();
+#endif
+}
+
+static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu)
+{
+#ifdef CONFIG_PREEMPT_RT
+	put_cpu_var_locked(pcp_locks, this_cpu);
+#else
+	local_irq_restore(flags);
+#endif
+}
+
+static struct per_cpu_pageset *
+get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu)
+{
+	lock_cpu_pcp(flags, this_cpu);
+	return zone_pcp(zone, *this_cpu);
+}
+
+static void
+put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu)
+{
+	unlock_cpu_pcp(flags, this_cpu);
+}
+
 #if MAX_NUMNODES > 1
 int nr_node_ids __read_mostly = MAX_NUMNODES;
 int nr_online_nodes __read_mostly = 1;
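
The helpers introduced above hide the difference between mainline (disable interrupts, which implicitly keeps the task on its CPU) and PREEMPT_RT (take a per-CPU sleeping lock, leaving the section preemptible). A rough user-space model of that split, with an array of pthread mutexes standing in for the locked per-CPU variable; NR_CPUS_MODEL, pcp_lock_model, lock_cpu_model and unlock_cpu_model are invented names for this sketch, not kernel API:

/* Build with: gcc -pthread pcp_lock_model.c */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS_MODEL 4

/* One lock per "CPU", standing in for DEFINE_PER_CPU_LOCKED(pcp_locks). */
static pthread_mutex_t pcp_lock_model[NR_CPUS_MODEL] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Counterpart of __lock_cpu_pcp(): serialize access to one CPU's data. */
static void lock_cpu_model(int cpu)
{
	pthread_mutex_lock(&pcp_lock_model[cpu]);
}

static void unlock_cpu_model(int cpu)
{
	pthread_mutex_unlock(&pcp_lock_model[cpu]);
}

int main(void)
{
	int cpu = 1;

	/* The holder may sleep inside the critical section -- that is the
	 * point of replacing local_irq_save() with a per-CPU lock on RT. */
	lock_cpu_model(cpu);
	printf("holding pcp lock for cpu %d\n", cpu);
	unlock_cpu_model(cpu);
	return 0;
}
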
@@ -524,16 +571,48 @@ static inline int free_pages_check(struct page *page)
  * pinned" detection logic.
  */
 static void free_pcppages_bulk(struct zone *zone, int count,
 					struct per_cpu_pages *pcp)
 {
 	int migratetype = 0;
-	int batch_free = 0;
+	unsigned long flags;
 
-	spin_lock(&zone->lock);
+	spin_lock_irqsave(&zone->lock, flags);
 	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
 	zone->pages_scanned = 0;
 
 	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
+
+	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) {
+		struct list_head *list = &pcp->lists[migratetype];
+
+		while (!list_empty(list)) {
+			struct page *page;
+
+			page = list_first_entry(list, struct page, lru);
+			/* must delete as __free_one_page list manipulates */
+			list_del(&page->lru);
+			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+			__free_one_page(page, zone, 0, page_private(page));
+			trace_mm_page_pcpu_drain(page, 0, page_private(page));
+#ifdef CONFIG_PREEMPT_RT
+			cond_resched_lock(&zone->lock);
+#endif
+			count--;
+		}
+	}
+	WARN_ON(count != 0);
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
+			      struct per_cpu_pages *dst)
+{
+	int migratetype, batch_free = 0;
+
+	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
+		INIT_LIST_HEAD(&dst->lists[migratetype]);
+	migratetype = 0;
+
 	while (count) {
 		struct page *page;
 		struct list_head *list;
@@ -549,38 +628,36 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 				batch_free++;
 			if (++migratetype == MIGRATE_PCPTYPES)
 				migratetype = 0;
-			list = &pcp->lists[migratetype];
+			list = &src->lists[migratetype];
 		} while (list_empty(list));
 
 		do {
-			page = list_entry(list->prev, struct page, lru);
+			page = list_last_entry(list, struct page, lru);
 			/* must delete as __free_one_page list manipulates */
 			list_del(&page->lru);
-			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
-			__free_one_page(page, zone, 0, page_private(page));
-			trace_mm_page_pcpu_drain(page, 0, page_private(page));
+			list_add(&page->lru, &dst->lists[migratetype]);
 		} while (--count && --batch_free && !list_empty(list));
 	}
-	spin_unlock(&zone->lock);
 }
 
 static void free_one_page(struct zone *zone, struct page *page, int order,
 				int migratetype)
 {
-	spin_lock(&zone->lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&zone->lock, flags);
 	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
 	zone->pages_scanned = 0;
 
 	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
 	__free_one_page(page, zone, order, migratetype);
-	spin_unlock(&zone->lock);
+	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
 	unsigned long flags;
-	int i;
-	int bad = 0;
+	int i, this_cpu, bad = 0;
 	int wasMlocked = __TestClearPageMlocked(page);
 
 	kmemcheck_free_shadow(page, order);
@@ -598,13 +675,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	arch_free_page(page, order);
 	kernel_map_pages(page, 1 << order, 0);
 
-	local_irq_save(flags);
+	lock_cpu_pcp(&flags, &this_cpu);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
-	__count_vm_events(PGFREE, 1 << order);
+	count_vm_events(PGFREE, 1 << order);
+	unlock_cpu_pcp(flags, this_cpu);
 	free_one_page(page_zone(page), page, order,
 					get_pageblock_migratetype(page));
-	local_irq_restore(flags);
 }
 
 /*
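
With free_pcppages_bulk() now only accepting already-isolated pages, the pattern is: pull a batch off the per-CPU lists into a private list while the per-CPU state is protected, then hand that private list to the buddy allocator under zone->lock, so the two critical sections never nest. A minimal user-space sketch of that two-phase drain using a singly linked list; struct node, isolate() and free_bulk() are invented names, not kernel API:

#include <stdio.h>
#include <stdlib.h>

struct node {
	int page_id;
	struct node *next;
};

/* Phase 1: detach up to 'count' nodes from *src onto a private list,
 * the way isolate_pcp_pages() moves pages off the pcp lists. */
static struct node *isolate(struct node **src, int count)
{
	struct node *dst = NULL;

	while (count-- && *src) {
		struct node *n = *src;
		*src = n->next;
		n->next = dst;
		dst = n;
	}
	return dst;
}

/* Phase 2: walk the private list, as free_pcppages_bulk() does under
 * zone->lock, without touching the per-CPU list again. */
static void free_bulk(struct node *dst)
{
	while (dst) {
		struct node *n = dst;
		dst = dst->next;
		printf("freeing page %d to buddy\n", n->page_id);
		free(n);
	}
}

int main(void)
{
	struct node *pcp = NULL;

	for (int i = 0; i < 5; i++) {
		struct node *n = malloc(sizeof(*n));
		n->page_id = i;
		n->next = pcp;
		pcp = n;
	}
	free_bulk(isolate(&pcp, 3));	/* drain a batch of three */
	free_bulk(isolate(&pcp, 3));	/* drains the remaining two */
	return 0;
}

The order matters: once the batch sits on the private list, the per-CPU lock (or IRQ-off section) can be dropped before the potentially long walk under zone->lock, which is what keeps latencies bounded on RT.
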
@@ -979,17 +1056,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  */
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
+	struct per_cpu_pages dst;
 	unsigned long flags;
-	int to_drain;
+	int to_drain, this_cpu;
 
-	local_irq_save(flags);
+	lock_cpu_pcp(&flags, &this_cpu);
 	if (pcp->count >= pcp->batch)
 		to_drain = pcp->batch;
 	else
 		to_drain = pcp->count;
-	free_pcppages_bulk(zone, to_drain, pcp);
+	isolate_pcp_pages(to_drain, pcp, &dst);
 	pcp->count -= to_drain;
-	local_irq_restore(flags);
+	unlock_cpu_pcp(flags, this_cpu);
+	free_pcppages_bulk(zone, to_drain, &dst);
 }
 #endif
 
@@ -1007,15 +1086,23 @@ static void drain_pages(unsigned int cpu)
 
 	for_each_populated_zone(zone) {
 		struct per_cpu_pageset *pset;
-		struct per_cpu_pages *pcp;
+		struct per_cpu_pages *pcp, dst;
+		int count;
 
+		__lock_cpu_pcp(&flags, cpu);
 		pset = zone_pcp(zone, cpu);
+		if (!pset) {
+			unlock_cpu_pcp(flags, cpu);
+			WARN_ON(1);
+			continue;
+		}
 
 		pcp = &pset->pcp;
-		local_irq_save(flags);
-		free_pcppages_bulk(zone, pcp->count, pcp);
+		isolate_pcp_pages(pcp->count, pcp, &dst);
+		count = pcp->count;
 		pcp->count = 0;
-		local_irq_restore(flags);
+		unlock_cpu_pcp(flags, cpu);
+		free_pcppages_bulk(zone, count, &dst);
 	}
 }
 
@@ -1027,12 +1114,52 @@ void drain_local_pages(void *arg)
 	drain_pages(smp_processor_id());
 }
 
+#ifdef CONFIG_PREEMPT_RT
+static void drain_local_pages_work(struct work_struct *wrk)
+{
+	drain_pages(smp_processor_id());
+}
+#endif
+
 /*
  * Spill all the per-cpu pages from all CPUs back into the buddy allocator
  */
 void drain_all_pages(void)
 {
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * HACK!!!!!
+	 * For RT we can't use IPIs to run drain_local_pages, since
+	 * that code will call spin_locks that will now sleep.
+	 * But, schedule_on_each_cpu will call kzalloc, which will
+	 * call page_alloc which was what calls this.
+	 *
+	 * Luckily, there's a condition to get here, and that is if
+	 * the order passed in to alloc_pages is greater than 0
+	 * (alloced more than a page size).  The slabs only allocate
+	 * what is needed, and the allocation made by schedule_on_each_cpu
+	 * does an alloc of "sizeof(void *)*nr_cpu_ids".
+	 *
+	 * So we can safely call schedule_on_each_cpu if that number
+	 * is less than a page. Otherwise don't bother. At least warn of
+	 * this issue.
+	 *
+	 * And yes, this is one big hack. Please fix ;-)
+	 */
+	if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE)
+		schedule_on_each_cpu(drain_local_pages_work);
+	else {
+		static int once;
+		if (!once) {
+			printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n");
+			once = 1;
+		}
+		drain_local_pages(NULL);
+	}
+
+#else
 	on_each_cpu(drain_local_pages, NULL, 1);
+#endif
 }
 
 #ifdef CONFIG_HIBERNATION
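
The guard above relies on schedule_on_each_cpu() allocating roughly one pointer per possible CPU, and on order-0 allocations never reaching drain_all_pages(); as long as that per-CPU array fits in a single page, the work-queue path cannot recurse back into this drain. A small stand-alone check of the arithmetic, assuming a 4 KiB page (PAGE_SIZE_MODEL and the CPU counts are example values, not taken from the kernel):

#include <stdio.h>

#define PAGE_SIZE_MODEL 4096UL	/* typical 4 KiB page, example value */

int main(void)
{
	unsigned long nr_cpu_ids;

	for (nr_cpu_ids = 64; nr_cpu_ids <= 1024; nr_cpu_ids *= 2) {
		unsigned long need = sizeof(void *) * nr_cpu_ids;

		printf("%4lu cpus -> %5lu bytes: %s\n", nr_cpu_ids, need,
		       need < PAGE_SIZE_MODEL ?
		       "fits in a page, safe to schedule_on_each_cpu" :
		       "order > 0 allocation, fall back to local drain");
	}
	return 0;
}

On a 64-bit build this flips at 512 CPUs (8 * 512 = 4096 bytes), which is exactly the point where the patch falls back to draining only the local CPU and printing the warning once.
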
@@ -1077,9 +1204,10 @@ void mark_free_pages(struct zone *zone)
 static void free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
+	struct per_cpu_pageset *pset;
 	struct per_cpu_pages *pcp;
 	unsigned long flags;
-	int migratetype;
+	int migratetype, this_cpu, count;
 	int wasMlocked = __TestClearPageMlocked(page);
 
 	kmemcheck_free_shadow(page, 0);
@@ -1096,13 +1224,13 @@ static void free_hot_cold_page(struct page *page, int cold)
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
-	pcp = &zone_pcp(zone, get_cpu())->pcp;
+	pset = get_zone_pcp(zone, &flags, &this_cpu);
+	pcp = &pset->pcp;
 	migratetype = get_pageblock_migratetype(page);
 	set_page_private(page, migratetype);
-	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
-	__count_vm_event(PGFREE);
+	count_vm_event(PGFREE);
 
 	/*
 	 * We only track unmovable, reclaimable and movable on pcp lists.
@@ -1125,13 +1253,17 @@ static void free_hot_cold_page(struct page *page, int cold)
 		list_add(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
-		free_pcppages_bulk(zone, pcp->batch, pcp);
+		struct per_cpu_pages dst;
+
+		isolate_pcp_pages(pcp->batch, pcp, &dst);
 		pcp->count -= pcp->batch;
+		count = pcp->batch;
+		put_zone_pcp(zone, flags, this_cpu);
+		free_pcppages_bulk(zone, count, &dst);
+		return;
 	}
-
 out:
-	local_irq_restore(flags);
-	put_cpu();
+	put_zone_pcp(zone, flags, this_cpu);
 }
 
 void free_hot_page(struct page *page)
@@ -1181,17 +1313,17 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
-	int cpu;
+	struct per_cpu_pageset *pset;
+	int this_cpu;
 
 again:
-	cpu = get_cpu();
+	pset = get_zone_pcp(zone, &flags, &this_cpu);
+
 	if (likely(order == 0)) {
-		struct per_cpu_pages *pcp;
+		struct per_cpu_pages *pcp = &pset->pcp;
 		struct list_head *list;
 
-		pcp = &zone_pcp(zone, cpu)->pcp;
 		list = &pcp->lists[migratetype];
-		local_irq_save(flags);
 		if (list_empty(list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
 					pcp->batch, list,
@@ -1221,7 +1353,7 @@ again:
 		 */
 		WARN_ON_ONCE(order > 1);
 	}
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock(&zone->lock);
 	page = __rmqueue(zone, order, migratetype);
 	spin_unlock(&zone->lock);
 	if (!page)
@@ -1231,8 +1363,7 @@ again:
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone);
-	local_irq_restore(flags);
-	put_cpu();
+	put_zone_pcp(zone, flags, this_cpu);
 
 	VM_BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
@@ -1240,8 +1371,7 @@ again:
 		return page;
 
 failed:
-	local_irq_restore(flags);
-	put_cpu();
+	put_zone_pcp(zone, flags, this_cpu);
 	return NULL;
 }
 
@@ -3159,7 +3289,23 @@ static inline void free_zone_pagesets(int cpu)
 	struct zone *zone;
 
 	for_each_zone(zone) {
-		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+		unsigned long flags;
+		struct per_cpu_pageset *pset;
+
+		/*
+		 * On PREEMPT_RT the allocator is preemptible, therefore
+		 * kstopmachine can preempt a process in the middle of an
+		 * allocation, freeing the pset underneath such a process
+		 * isn't a good idea.
+		 *
+		 * Take the per-cpu pcp lock to allow the task to complete
+		 * before we free it. New tasks will be held off by the
+		 * cpu_online() check in get_cpu_var_locked().
+		 */
+		__lock_cpu_pcp(&flags, cpu);
+		pset = zone_pcp(zone, cpu);
+		zone_pcp(zone, cpu) = NULL;
+		unlock_cpu_pcp(flags, cpu);
 
 		/* Free per_cpu_pageset if it is slab allocated */
 		if (pset != &boot_pageset[cpu])
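
The comment in this last hunk describes a detach-then-free pattern: take the per-CPU lock, unhook the pageset pointer so no new user can look it up, drop the lock, and only then free the memory. A hedged user-space rendering of that sequence with a pthread mutex; struct pageset_model, cpu_pageset and retire_pageset() are made-up names for the illustration:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct pageset_model {
	int dummy_stats;
};

static pthread_mutex_t cpu_lock = PTHREAD_MUTEX_INITIALIZER;
static struct pageset_model *cpu_pageset;	/* stands in for zone_pcp(zone, cpu) */

static void retire_pageset(void)
{
	struct pageset_model *pset;

	/* Detach under the lock so a concurrent user has either finished
	 * already or sees NULL, mirroring __lock_cpu_pcp() followed by
	 * zone_pcp(zone, cpu) = NULL in the patch. */
	pthread_mutex_lock(&cpu_lock);
	pset = cpu_pageset;
	cpu_pageset = NULL;
	pthread_mutex_unlock(&cpu_lock);

	/* Free outside the lock; nobody can reach the object any more. */
	free(pset);
}

int main(void)
{
	cpu_pageset = calloc(1, sizeof(*cpu_pageset));
	retire_pageset();
	printf("pageset retired: %p\n", (void *)cpu_pageset);
	return 0;
}

Freeing outside the lock is safe precisely because the pointer was cleared inside it: any racing allocation either completed before the detach or finds NULL.
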