author    Thomas Gleixner <tglx@linutronix.de>  2009-07-28 18:00:16 -0400
committer Thomas Gleixner <tglx@linutronix.de>  2009-07-28 18:00:16 -0400
commit    ba36d1d9dd11b98a0bdee1d15ef2a11148905805 (patch)
tree      7749d3ba1d71aaa62a8dab72cca8820e27af7069 /mm
parent    55f9e9a3b3a3229f0ee73c1c2f990785bbf2ff88 (diff)
parent    104f75cb1a751a023beddacf56ca6c19ed90ce6c (diff)
Merge branch 'rt/mm' into rt/base
Conflicts:
	include/linux/percpu.h

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'mm')
-rw-r--r--  mm/memcontrol.c    7
-rw-r--r--  mm/memory.c        7
-rw-r--r--  mm/page_alloc.c  205
-rw-r--r--  mm/quicklist.c    15
-rw-r--r--  mm/slab.c        581
-rw-r--r--  mm/swap.c        107
-rw-r--r--  mm/vmscan.c       10
-rw-r--r--  mm/vmstat.c       23
8 files changed, 718 insertions, 237 deletions
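
The diffs below lean heavily on the -rt per-CPU "locked" accessors (DEFINE_PER_CPU_LOCKED, get_cpu_var_locked(), put_cpu_var_locked(), __get_cpu_lock()): instead of serializing per-CPU state by disabling interrupts or preemption, each CPU's instance carries its own spinlock, so the critical sections stay preemptible on PREEMPT_RT and a remote CPU's instance can be reached simply by taking its lock. The user-space model below is only a sketch of that pattern under that assumption; it is not the kernel implementation, and all names in it (percpu_locked, get_var_locked, put_var_locked) are invented for illustration.

/*
 * User-space model of the "per-CPU data protected by a per-CPU lock"
 * pattern used throughout this patch. Illustrative only; none of these
 * names exist in the kernel.
 */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

struct percpu_locked {
	pthread_spinlock_t lock;	/* stands in for __get_cpu_lock(var, cpu) */
	long value;			/* stands in for the per-CPU variable */
};

static struct percpu_locked pcp[NR_CPUS];

/* get_cpu_var_locked()-style accessor: lock the data, not the CPU. */
static long *get_var_locked(int cpu)
{
	pthread_spin_lock(&pcp[cpu].lock);
	return &pcp[cpu].value;
}

/* put_cpu_var_locked()-style release. */
static void put_var_locked(int cpu)
{
	pthread_spin_unlock(&pcp[cpu].lock);
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_spin_init(&pcp[cpu].lock, PTHREAD_PROCESS_PRIVATE);

	/* Any CPU's instance, including a remote one, is reached via its lock. */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		long *v = get_var_locked(cpu);
		*v += cpu;
		put_var_locked(cpu);
	}

	printf("cpu0 value: %ld\n", pcp[0].value);
	return 0;
}

The trade-off, as the slab.c comment further down puts it, is a possible remote memory access if the task migrates, in exchange for keeping the code preemptible.
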
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e717964cb5a0..e5159e2ff807 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -948,13 +948,14 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
948 goto done; 948 goto done;
949 949
950 /* 950 /*
951 * Preemption is already disabled, we don't need get_cpu() 951 * Preemption is already disabled, we don't need get_cpu(), but
952 * that's not true for RT :)
952 */ 953 */
953 cpu = smp_processor_id(); 954 cpu = get_cpu();
954 stat = &mem->stat; 955 stat = &mem->stat;
955 cpustat = &stat->cpustat[cpu]; 956 cpustat = &stat->cpustat[cpu];
956
957 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); 957 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
958 put_cpu();
958done: 959done:
959 unlock_page_cgroup(pc); 960 unlock_page_cgroup(pc);
960} 961}
diff --git a/mm/memory.c b/mm/memory.c
index 2d2fc7a3db52..f5579956fa4c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -922,10 +922,13 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
922 return addr; 922 return addr;
923} 923}
924 924
925#ifdef CONFIG_PREEMPT 925#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT)
926# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) 926# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
927#else 927#else
928/* No preempt: go for improved straight-line efficiency */ 928/*
929 * No preempt: go for improved straight-line efficiency
930 * on PREEMPT_RT this is not a critical latency-path.
931 */
929# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) 932# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
930#endif 933#endif
931 934
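
The hunk above only changes which configurations get the small block: with PREEMPT_RT the zap path is preemptible anyway, so the large, more efficient block size is acceptable. The point of ZAP_BLOCK_SIZE in general is to bound how much teardown work happens between rescheduling opportunities. The self-contained sketch below shows that chunking shape only; the block size, the names, and the sched_yield() stand-in for cond_resched() are illustrative, not taken from mm/memory.c.

#include <sched.h>
#include <stddef.h>
#include <stdio.h>

#define BLOCK_SIZE 8	/* stands in for a ZAP_BLOCK_SIZE worth of pages */

/*
 * Process [start, end) in bounded blocks, yielding between blocks so the
 * worst-case latency is set by BLOCK_SIZE, not by the size of the range.
 */
static void process_range(long *data, size_t start, size_t end)
{
	while (start < end) {
		size_t stop = start + BLOCK_SIZE;
		size_t i;

		if (stop > end)
			stop = end;
		for (i = start; i < stop; i++)
			data[i] = 0;		/* the actual "zap" work */
		start = stop;
		sched_yield();			/* stands in for cond_resched() */
	}
}

int main(void)
{
	long data[100];
	size_t i;

	for (i = 0; i < 100; i++)
		data[i] = (long)i;
	process_range(data, 0, 100);
	printf("data[99] = %ld\n", data[99]);
	return 0;
}
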
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index caa92689aac9..910b62810a1e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -161,6 +161,53 @@ static unsigned long __meminitdata dma_reserve;
161 EXPORT_SYMBOL(movable_zone); 161 EXPORT_SYMBOL(movable_zone);
162#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 162#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
163 163
164#ifdef CONFIG_PREEMPT_RT
165static DEFINE_PER_CPU_LOCKED(int, pcp_locks);
166#endif
167
168static inline void __lock_cpu_pcp(unsigned long *flags, int cpu)
169{
170#ifdef CONFIG_PREEMPT_RT
171 spin_lock(&__get_cpu_lock(pcp_locks, cpu));
172 flags = 0;
173#else
174 local_irq_save(*flags);
175#endif
176}
177
178static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu)
179{
180#ifdef CONFIG_PREEMPT_RT
181 (void)get_cpu_var_locked(pcp_locks, this_cpu);
182 flags = 0;
183#else
184 local_irq_save(*flags);
185 *this_cpu = smp_processor_id();
186#endif
187}
188
189static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu)
190{
191#ifdef CONFIG_PREEMPT_RT
192 put_cpu_var_locked(pcp_locks, this_cpu);
193#else
194 local_irq_restore(flags);
195#endif
196}
197
198static struct per_cpu_pageset *
199get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu)
200{
201 lock_cpu_pcp(flags, this_cpu);
202 return zone_pcp(zone, *this_cpu);
203}
204
205static void
206put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu)
207{
208 unlock_cpu_pcp(flags, this_cpu);
209}
210
164#if MAX_NUMNODES > 1 211#if MAX_NUMNODES > 1
165int nr_node_ids __read_mostly = MAX_NUMNODES; 212int nr_node_ids __read_mostly = MAX_NUMNODES;
166int nr_online_nodes __read_mostly = 1; 213int nr_online_nodes __read_mostly = 1;
@@ -523,7 +570,9 @@ static inline int free_pages_check(struct page *page)
523static void free_pages_bulk(struct zone *zone, int count, 570static void free_pages_bulk(struct zone *zone, int count,
524 struct list_head *list, int order) 571 struct list_head *list, int order)
525{ 572{
526 spin_lock(&zone->lock); 573 unsigned long flags;
574
575 spin_lock_irqsave(&zone->lock, flags);
527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 576 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
528 zone->pages_scanned = 0; 577 zone->pages_scanned = 0;
529 578
@@ -536,27 +585,31 @@ static void free_pages_bulk(struct zone *zone, int count,
536 /* have to delete it as __free_one_page list manipulates */ 585 /* have to delete it as __free_one_page list manipulates */
537 list_del(&page->lru); 586 list_del(&page->lru);
538 __free_one_page(page, zone, order, page_private(page)); 587 __free_one_page(page, zone, order, page_private(page));
588#ifdef CONFIG_PREEMPT_RT
589 cond_resched_lock(&zone->lock);
590#endif
539 } 591 }
540 spin_unlock(&zone->lock); 592 spin_unlock_irqrestore(&zone->lock, flags);
541} 593}
542 594
543static void free_one_page(struct zone *zone, struct page *page, int order, 595static void free_one_page(struct zone *zone, struct page *page, int order,
544 int migratetype) 596 int migratetype)
545{ 597{
546 spin_lock(&zone->lock); 598 unsigned long flags;
599
600 spin_lock_irqsave(&zone->lock, flags);
547 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 601 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
548 zone->pages_scanned = 0; 602 zone->pages_scanned = 0;
549 603
550 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 604 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
551 __free_one_page(page, zone, order, migratetype); 605 __free_one_page(page, zone, order, migratetype);
552 spin_unlock(&zone->lock); 606 spin_unlock_irqrestore(&zone->lock, flags);
553} 607}
554 608
555static void __free_pages_ok(struct page *page, unsigned int order) 609static void __free_pages_ok(struct page *page, unsigned int order)
556{ 610{
557 unsigned long flags; 611 unsigned long flags;
558 int i; 612 int i, this_cpu, bad = 0;
559 int bad = 0;
560 int wasMlocked = TestClearPageMlocked(page); 613 int wasMlocked = TestClearPageMlocked(page);
561 614
562 kmemcheck_free_shadow(page, order); 615 kmemcheck_free_shadow(page, order);
@@ -574,13 +627,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
574 arch_free_page(page, order); 627 arch_free_page(page, order);
575 kernel_map_pages(page, 1 << order, 0); 628 kernel_map_pages(page, 1 << order, 0);
576 629
577 local_irq_save(flags); 630 lock_cpu_pcp(&flags, &this_cpu);
578 if (unlikely(wasMlocked)) 631 if (unlikely(wasMlocked))
579 free_page_mlock(page); 632 free_page_mlock(page);
580 __count_vm_events(PGFREE, 1 << order); 633 count_vm_events(PGFREE, 1 << order);
634 unlock_cpu_pcp(flags, this_cpu);
581 free_one_page(page_zone(page), page, order, 635 free_one_page(page_zone(page), page, order,
582 get_pageblock_migratetype(page)); 636 get_pageblock_migratetype(page));
583 local_irq_restore(flags);
584} 637}
585 638
586/* 639/*
@@ -910,6 +963,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
910 return i; 963 return i;
911} 964}
912 965
966static void
967isolate_pcp_pages(int count, struct list_head *src, struct list_head *dst)
968{
969 while (count--) {
970 struct page *page = list_last_entry(src, struct page, lru);
971 list_move(&page->lru, dst);
972 }
973}
974
975
913#ifdef CONFIG_NUMA 976#ifdef CONFIG_NUMA
914/* 977/*
915 * Called from the vmstat counter updater to drain pagesets of this 978 * Called from the vmstat counter updater to drain pagesets of this
@@ -921,17 +984,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
921 */ 984 */
922void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 985void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
923{ 986{
987 LIST_HEAD(free_list);
924 unsigned long flags; 988 unsigned long flags;
925 int to_drain; 989 int to_drain;
990 int this_cpu;
926 991
927 local_irq_save(flags); 992 lock_cpu_pcp(&flags, &this_cpu);
928 if (pcp->count >= pcp->batch) 993 if (pcp->count >= pcp->batch)
929 to_drain = pcp->batch; 994 to_drain = pcp->batch;
930 else 995 else
931 to_drain = pcp->count; 996 to_drain = pcp->count;
932 free_pages_bulk(zone, to_drain, &pcp->list, 0); 997 isolate_pcp_pages(to_drain, &pcp->list, &free_list);
933 pcp->count -= to_drain; 998 pcp->count -= to_drain;
934 local_irq_restore(flags); 999 unlock_cpu_pcp(flags, this_cpu);
1000 free_pages_bulk(zone, to_drain, &free_list, 0);
935} 1001}
936#endif 1002#endif
937 1003
@@ -950,14 +1016,22 @@ static void drain_pages(unsigned int cpu)
950 for_each_populated_zone(zone) { 1016 for_each_populated_zone(zone) {
951 struct per_cpu_pageset *pset; 1017 struct per_cpu_pageset *pset;
952 struct per_cpu_pages *pcp; 1018 struct per_cpu_pages *pcp;
1019 LIST_HEAD(free_list);
1020 int count;
953 1021
1022 __lock_cpu_pcp(&flags, cpu);
954 pset = zone_pcp(zone, cpu); 1023 pset = zone_pcp(zone, cpu);
955 1024 if (!pset) {
1025 unlock_cpu_pcp(flags, cpu);
1026 WARN_ON(1);
1027 continue;
1028 }
956 pcp = &pset->pcp; 1029 pcp = &pset->pcp;
957 local_irq_save(flags); 1030 isolate_pcp_pages(pcp->count, &pcp->list, &free_list);
958 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 1031 count = pcp->count;
959 pcp->count = 0; 1032 pcp->count = 0;
960 local_irq_restore(flags); 1033 unlock_cpu_pcp(flags, cpu);
1034 free_pages_bulk(zone, count, &free_list, 0);
961 } 1035 }
962} 1036}
963 1037
@@ -969,12 +1043,52 @@ void drain_local_pages(void *arg)
969 drain_pages(smp_processor_id()); 1043 drain_pages(smp_processor_id());
970} 1044}
971 1045
1046#ifdef CONFIG_PREEMPT_RT
1047static void drain_local_pages_work(struct work_struct *wrk)
1048{
1049 drain_pages(smp_processor_id());
1050}
1051#endif
1052
972/* 1053/*
973 * Spill all the per-cpu pages from all CPUs back into the buddy allocator 1054 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
974 */ 1055 */
975void drain_all_pages(void) 1056void drain_all_pages(void)
976{ 1057{
1058#ifdef CONFIG_PREEMPT_RT
1059 /*
1060 * HACK!!!!!
1061 * For RT we can't use IPIs to run drain_local_pages, since
1062 * that code will call spin_locks that will now sleep.
1063 * But, schedule_on_each_cpu will call kzalloc, which will
1064 * call page_alloc which was what calls this.
1065 *
1066 * Luckily, there's a condition to get here, and that is if
1067 * the order passed in to alloc_pages is greater than 0
1068 * (alloced more than a page size). The slabs only allocate
1069 * what is needed, and the allocation made by schedule_on_each_cpu
1070 * does an alloc of "sizeof(void *)*nr_cpu_ids".
1071 *
1072 * So we can safely call schedule_on_each_cpu if that number
1073 * is less than a page. Otherwise don't bother. At least warn of
1074 * this issue.
1075 *
1076 * And yes, this is one big hack. Please fix ;-)
1077 */
1078 if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE)
1079 schedule_on_each_cpu(drain_local_pages_work);
1080 else {
1081 static int once;
1082 if (!once) {
1083 printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n");
1084 once = 1;
1085 }
1086 drain_local_pages(NULL);
1087 }
1088
1089#else
977 on_each_cpu(drain_local_pages, NULL, 1); 1090 on_each_cpu(drain_local_pages, NULL, 1);
1091#endif
978} 1092}
979 1093
980#ifdef CONFIG_HIBERNATION 1094#ifdef CONFIG_HIBERNATION
@@ -1019,9 +1133,10 @@ void mark_free_pages(struct zone *zone)
1019static void free_hot_cold_page(struct page *page, int cold) 1133static void free_hot_cold_page(struct page *page, int cold)
1020{ 1134{
1021 struct zone *zone = page_zone(page); 1135 struct zone *zone = page_zone(page);
1136 struct per_cpu_pageset *pset;
1022 struct per_cpu_pages *pcp; 1137 struct per_cpu_pages *pcp;
1023 unsigned long flags; 1138 unsigned long flags;
1024 int wasMlocked = TestClearPageMlocked(page); 1139 int count, this_cpu, wasMlocked = TestClearPageMlocked(page);
1025 1140
1026 kmemcheck_free_shadow(page, 0); 1141 kmemcheck_free_shadow(page, 0);
1027 1142
@@ -1037,12 +1152,12 @@ static void free_hot_cold_page(struct page *page, int cold)
1037 arch_free_page(page, 0); 1152 arch_free_page(page, 0);
1038 kernel_map_pages(page, 1, 0); 1153 kernel_map_pages(page, 1, 0);
1039 1154
1040 pcp = &zone_pcp(zone, get_cpu())->pcp; 1155 pset = get_zone_pcp(zone, &flags, &this_cpu);
1156 pcp = &pset->pcp;
1041 set_page_private(page, get_pageblock_migratetype(page)); 1157 set_page_private(page, get_pageblock_migratetype(page));
1042 local_irq_save(flags);
1043 if (unlikely(wasMlocked)) 1158 if (unlikely(wasMlocked))
1044 free_page_mlock(page); 1159 free_page_mlock(page);
1045 __count_vm_event(PGFREE); 1160 count_vm_event(PGFREE);
1046 1161
1047 if (cold) 1162 if (cold)
1048 list_add_tail(&page->lru, &pcp->list); 1163 list_add_tail(&page->lru, &pcp->list);
@@ -1050,11 +1165,15 @@ static void free_hot_cold_page(struct page *page, int cold)
1050 list_add(&page->lru, &pcp->list); 1165 list_add(&page->lru, &pcp->list);
1051 pcp->count++; 1166 pcp->count++;
1052 if (pcp->count >= pcp->high) { 1167 if (pcp->count >= pcp->high) {
1053 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1168 LIST_HEAD(free_list);
1169
1170 isolate_pcp_pages(pcp->batch, &pcp->list, &free_list);
1054 pcp->count -= pcp->batch; 1171 pcp->count -= pcp->batch;
1055 } 1172 count = pcp->batch;
1056 local_irq_restore(flags); 1173 put_zone_pcp(zone, flags, this_cpu);
1057 put_cpu(); 1174 free_pages_bulk(zone, count, &free_list, 0);
1175 } else
1176 put_zone_pcp(zone, flags, this_cpu);
1058} 1177}
1059 1178
1060void free_hot_page(struct page *page) 1179void free_hot_page(struct page *page)
@@ -1108,15 +1227,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1108 unsigned long flags; 1227 unsigned long flags;
1109 struct page *page; 1228 struct page *page;
1110 int cold = !!(gfp_flags & __GFP_COLD); 1229 int cold = !!(gfp_flags & __GFP_COLD);
1111 int cpu; 1230 struct per_cpu_pageset *pset;
1231 int this_cpu;
1112 1232
1113again: 1233again:
1114 cpu = get_cpu(); 1234 pset = get_zone_pcp(zone, &flags, &this_cpu);
1235
1115 if (likely(order == 0)) { 1236 if (likely(order == 0)) {
1116 struct per_cpu_pages *pcp; 1237 struct per_cpu_pages *pcp = &pset->pcp;
1117 1238
1118 pcp = &zone_pcp(zone, cpu)->pcp;
1119 local_irq_save(flags);
1120 if (!pcp->count) { 1239 if (!pcp->count) {
1121 pcp->count = rmqueue_bulk(zone, 0, 1240 pcp->count = rmqueue_bulk(zone, 0,
1122 pcp->batch, &pcp->list, migratetype); 1241 pcp->batch, &pcp->list, migratetype);
@@ -1158,7 +1277,7 @@ again:
1158 */ 1277 */
1159 WARN_ON_ONCE(order > 1); 1278 WARN_ON_ONCE(order > 1);
1160 } 1279 }
1161 spin_lock_irqsave(&zone->lock, flags); 1280 spin_lock(&zone->lock);
1162 page = __rmqueue(zone, order, migratetype); 1281 page = __rmqueue(zone, order, migratetype);
1163 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); 1282 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1164 spin_unlock(&zone->lock); 1283 spin_unlock(&zone->lock);
@@ -1168,8 +1287,7 @@ again:
1168 1287
1169 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1288 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1170 zone_statistics(preferred_zone, zone); 1289 zone_statistics(preferred_zone, zone);
1171 local_irq_restore(flags); 1290 put_zone_pcp(zone, flags, this_cpu);
1172 put_cpu();
1173 1291
1174 VM_BUG_ON(bad_range(zone, page)); 1292 VM_BUG_ON(bad_range(zone, page));
1175 if (prep_new_page(page, order, gfp_flags)) 1293 if (prep_new_page(page, order, gfp_flags))
@@ -1177,8 +1295,7 @@ again:
1177 return page; 1295 return page;
1178 1296
1179failed: 1297failed:
1180 local_irq_restore(flags); 1298 put_zone_pcp(zone, flags, this_cpu);
1181 put_cpu();
1182 return NULL; 1299 return NULL;
1183} 1300}
1184 1301
@@ -3036,7 +3153,23 @@ static inline void free_zone_pagesets(int cpu)
3036 struct zone *zone; 3153 struct zone *zone;
3037 3154
3038 for_each_zone(zone) { 3155 for_each_zone(zone) {
3039 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 3156 unsigned long flags;
3157 struct per_cpu_pageset *pset;
3158
3159 /*
3160 * On PREEMPT_RT the allocator is preemptible, therefore
3161 * kstopmachine can preempt a process in the middle of an
3162 * allocation, freeing the pset underneath such a process
3163 * isn't a good idea.
3164 *
3165 * Take the per-cpu pcp lock to allow the task to complete
3166 * before we free it. New tasks will be held off by the
3167 * cpu_online() check in get_cpu_var_locked().
3168 */
3169 __lock_cpu_pcp(&flags, cpu);
3170 pset = zone_pcp(zone, cpu);
3171 zone_pcp(zone, cpu) = NULL;
3172 unlock_cpu_pcp(flags, cpu);
3040 3173
3041 /* Free per_cpu_pageset if it is slab allocated */ 3174 /* Free per_cpu_pageset if it is slab allocated */
3042 if (pset != &boot_pageset[cpu]) 3175 if (pset != &boot_pageset[cpu])
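
A recurring shape in the page_alloc.c changes above: pages are first moved off the per-CPU list into a private list while the pcp lock is held (isolate_pcp_pages()), and only afterwards handed to free_pages_bulk(), which now takes the zone lock with irqsave itself. The pcp critical section stays short and the two locks never nest. The user-space sketch below models only that "isolate under lock A, then process under lock B" shape; the types, names, and printf output are invented for illustration.

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int id; };

static pthread_mutex_t pcp_lock = PTHREAD_MUTEX_INITIALIZER;	/* per-CPU list lock */
static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;	/* buddy/zone lock   */
static struct node *pcp_list;					/* per-CPU free list */

/*
 * Move up to 'count' nodes from the pcp list onto a private list
 * (isolate_pcp_pages() analogue). Caller holds pcp_lock.
 */
static struct node *isolate(int count)
{
	struct node *head = NULL;

	while (count-- && pcp_list) {
		struct node *n = pcp_list;

		pcp_list = n->next;
		n->next = head;
		head = n;
	}
	return head;
}

/* free_pages_bulk() analogue: runs under the zone lock only. */
static void free_bulk(struct node *head)
{
	struct node *n;

	pthread_mutex_lock(&zone_lock);
	for (n = head; n; n = n->next)
		printf("freeing node %d\n", n->id);
	pthread_mutex_unlock(&zone_lock);
}

static void drain(int batch)
{
	struct node *batch_list;

	pthread_mutex_lock(&pcp_lock);
	batch_list = isolate(batch);
	pthread_mutex_unlock(&pcp_lock);	/* pcp lock dropped ...              */
	free_bulk(batch_list);			/* ... before the zone lock is taken */
}

int main(void)
{
	static struct node nodes[3] = { { NULL, 0 }, { &nodes[0], 1 }, { &nodes[1], 2 } };

	pcp_list = &nodes[2];
	drain(2);
	return 0;
}
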
diff --git a/mm/quicklist.c b/mm/quicklist.c
index e66d07d1b4ff..03341b014c2b 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -19,7 +19,7 @@
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/quicklist.h> 20#include <linux/quicklist.h>
21 21
22DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; 22DEFINE_PER_CPU_LOCKED(struct quicklist, quicklist)[CONFIG_NR_QUICK];
23 23
24#define FRACTION_OF_NODE_MEM 16 24#define FRACTION_OF_NODE_MEM 16
25 25
@@ -66,17 +66,14 @@ void quicklist_trim(int nr, void (*dtor)(void *),
66{ 66{
67 long pages_to_free; 67 long pages_to_free;
68 struct quicklist *q; 68 struct quicklist *q;
69 int cpu;
69 70
70 q = &get_cpu_var(quicklist)[nr]; 71 q = &get_cpu_var_locked(quicklist, &cpu)[nr];
71 if (q->nr_pages > min_pages) { 72 if (q->nr_pages > min_pages) {
72 pages_to_free = min_pages_to_free(q, min_pages, max_free); 73 pages_to_free = min_pages_to_free(q, min_pages, max_free);
73 74
74 while (pages_to_free > 0) { 75 while (pages_to_free > 0) {
75 /* 76 void *p = __quicklist_alloc(q);
76 * We pass a gfp_t of 0 to quicklist_alloc here
77 * because we will never call into the page allocator.
78 */
79 void *p = quicklist_alloc(nr, 0, NULL);
80 77
81 if (dtor) 78 if (dtor)
82 dtor(p); 79 dtor(p);
@@ -84,7 +81,7 @@ void quicklist_trim(int nr, void (*dtor)(void *),
84 pages_to_free--; 81 pages_to_free--;
85 } 82 }
86 } 83 }
87 put_cpu_var(quicklist); 84 put_cpu_var_locked(quicklist, cpu);
88} 85}
89 86
90unsigned long quicklist_total_size(void) 87unsigned long quicklist_total_size(void)
@@ -94,7 +91,7 @@ unsigned long quicklist_total_size(void)
94 struct quicklist *ql, *q; 91 struct quicklist *ql, *q;
95 92
96 for_each_online_cpu(cpu) { 93 for_each_online_cpu(cpu) {
97 ql = per_cpu(quicklist, cpu); 94 ql = per_cpu_var_locked(quicklist, cpu);
98 for (q = ql; q < ql + CONFIG_NR_QUICK; q++) 95 for (q = ql; q < ql + CONFIG_NR_QUICK; q++)
99 count += q->nr_pages; 96 count += q->nr_pages;
100 } 97 }
diff --git a/mm/slab.c b/mm/slab.c
index 7b5d4deacfcd..a4bd9068c557 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -121,6 +121,138 @@
121#include <asm/page.h> 121#include <asm/page.h>
122 122
123/* 123/*
124 * On !PREEMPT_RT, raw irq flags are used as a per-CPU locking
125 * mechanism.
126 *
127 * On PREEMPT_RT, we use per-CPU locks for this. That's why the
128 * calling convention is changed slightly: a new 'flags' argument
129 * is passed to 'irq disable/enable' - the PREEMPT_RT code stores
130 * the CPU number of the lock there.
131 */
132#ifndef CONFIG_PREEMPT_RT
133
134# define slab_irq_disable(cpu) \
135 do { local_irq_disable(); (cpu) = smp_processor_id(); } while (0)
136# define slab_irq_enable(cpu) local_irq_enable()
137
138static inline void slab_irq_disable_this_rt(int cpu)
139{
140}
141
142static inline void slab_irq_enable_rt(int cpu)
143{
144}
145
146# define slab_irq_save(flags, cpu) \
147 do { local_irq_save(flags); (cpu) = smp_processor_id(); } while (0)
148# define slab_irq_restore(flags, cpu) local_irq_restore(flags)
149
150/*
151 * In the __GFP_WAIT case we enable/disable interrupts on !PREEMPT_RT,
152 * which has no per-CPU locking effect since we are holding the cache
153 * lock in that case already.
154 */
155static void slab_irq_enable_GFP_WAIT(gfp_t flags, int *cpu)
156{
157 if (flags & __GFP_WAIT)
158 local_irq_enable();
159}
160
161static void slab_irq_disable_GFP_WAIT(gfp_t flags, int *cpu)
162{
163 if (flags & __GFP_WAIT)
164 local_irq_disable();
165}
166
167# define slab_spin_lock_irq(lock, cpu) \
168 do { spin_lock_irq(lock); (cpu) = smp_processor_id(); } while (0)
169# define slab_spin_unlock_irq(lock, cpu) spin_unlock_irq(lock)
170
171# define slab_spin_lock_irqsave(lock, flags, cpu) \
172 do { spin_lock_irqsave(lock, flags); (cpu) = smp_processor_id(); } while (0)
173# define slab_spin_unlock_irqrestore(lock, flags, cpu) \
174 do { spin_unlock_irqrestore(lock, flags); } while (0)
175
176#else /* CONFIG_PREEMPT_RT */
177
178/*
179 * Instead of serializing the per-cpu state by disabling interrupts we do so
180 * by a lock. This keeps the code preemptable - albeit at the cost of remote
181 * memory access when the task does get migrated away.
182 */
183DEFINE_PER_CPU_LOCKED(struct list_head, slab) = { 0, };
184
185static void _slab_irq_disable(int *cpu)
186{
187 (void)get_cpu_var_locked(slab, cpu);
188}
189
190#define slab_irq_disable(cpu) _slab_irq_disable(&(cpu))
191
192static inline void slab_irq_enable(int cpu)
193{
194 LIST_HEAD(list);
195
196 list_splice_init(&__get_cpu_var_locked(slab, cpu), &list);
197 put_cpu_var_locked(slab, cpu);
198
199 while (!list_empty(&list)) {
200 struct page *page = list_first_entry(&list, struct page, lru);
201 list_del(&page->lru);
202 __free_pages(page, page->index);
203 }
204}
205
206static inline void slab_irq_disable_this_rt(int cpu)
207{
208 spin_lock(&__get_cpu_lock(slab, cpu));
209}
210
211static inline void slab_irq_enable_rt(int cpu)
212{
213 LIST_HEAD(list);
214
215 list_splice_init(&__get_cpu_var_locked(slab, cpu), &list);
216 spin_unlock(&__get_cpu_lock(slab, cpu));
217
218 while (!list_empty(&list)) {
219 struct page *page = list_first_entry(&list, struct page, lru);
220 list_del(&page->lru);
221 __free_pages(page, page->index);
222 }
223}
224
225# define slab_irq_save(flags, cpu) \
226 do { slab_irq_disable(cpu); (void) (flags); } while (0)
227# define slab_irq_restore(flags, cpu) \
228 do { slab_irq_enable(cpu); (void) (flags); } while (0)
229
230/*
231 * On PREEMPT_RT we have to drop the locks unconditionally to avoid lock
232 * recursion on the cache_grow()->alloc_slabmgmt() path.
233 */
234static void slab_irq_enable_GFP_WAIT(gfp_t flags, int *cpu)
235{
236 slab_irq_enable(*cpu);
237}
238
239static void slab_irq_disable_GFP_WAIT(gfp_t flags, int *cpu)
240{
241 slab_irq_disable(*cpu);
242}
243
244# define slab_spin_lock_irq(lock, cpu) \
245 do { slab_irq_disable(cpu); spin_lock(lock); } while (0)
246# define slab_spin_unlock_irq(lock, cpu) \
247 do { spin_unlock(lock); slab_irq_enable(cpu); } while (0)
248# define slab_spin_lock_irqsave(lock, flags, cpu) \
249 do { slab_irq_disable(cpu); spin_lock_irqsave(lock, flags); } while (0)
250# define slab_spin_unlock_irqrestore(lock, flags, cpu) \
251 do { spin_unlock_irqrestore(lock, flags); slab_irq_enable(cpu); } while (0)
252
253#endif /* CONFIG_PREEMPT_RT */
254
255/*
124 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 256 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
125 * 0 for faster, smaller code (especially in the critical paths). 257 * 0 for faster, smaller code (especially in the critical paths).
126 * 258 *
@@ -316,7 +448,7 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
316static int drain_freelist(struct kmem_cache *cache, 448static int drain_freelist(struct kmem_cache *cache,
317 struct kmem_list3 *l3, int tofree); 449 struct kmem_list3 *l3, int tofree);
318static void free_block(struct kmem_cache *cachep, void **objpp, int len, 450static void free_block(struct kmem_cache *cachep, void **objpp, int len,
319 int node); 451 int node, int *this_cpu);
320static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); 452static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
321static void cache_reap(struct work_struct *unused); 453static void cache_reap(struct work_struct *unused);
322 454
@@ -687,9 +819,10 @@ int slab_is_available(void)
687 819
688static DEFINE_PER_CPU(struct delayed_work, reap_work); 820static DEFINE_PER_CPU(struct delayed_work, reap_work);
689 821
690static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 822static inline struct array_cache *
823cpu_cache_get(struct kmem_cache *cachep, int this_cpu)
691{ 824{
692 return cachep->array[smp_processor_id()]; 825 return cachep->array[this_cpu];
693} 826}
694 827
695static inline struct kmem_cache *__find_general_cachep(size_t size, 828static inline struct kmem_cache *__find_general_cachep(size_t size,
@@ -930,7 +1063,7 @@ static int transfer_objects(struct array_cache *to,
930#ifndef CONFIG_NUMA 1063#ifndef CONFIG_NUMA
931 1064
932#define drain_alien_cache(cachep, alien) do { } while (0) 1065#define drain_alien_cache(cachep, alien) do { } while (0)
933#define reap_alien(cachep, l3) do { } while (0) 1066#define reap_alien(cachep, l3, this_cpu) 0
934 1067
935static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 1068static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
936{ 1069{
@@ -941,27 +1074,28 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
941{ 1074{
942} 1075}
943 1076
944static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1077static inline int
1078cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu)
945{ 1079{
946 return 0; 1080 return 0;
947} 1081}
948 1082
949static inline void *alternate_node_alloc(struct kmem_cache *cachep, 1083static inline void *alternate_node_alloc(struct kmem_cache *cachep,
950 gfp_t flags) 1084 gfp_t flags, int *this_cpu)
951{ 1085{
952 return NULL; 1086 return NULL;
953} 1087}
954 1088
955static inline void *____cache_alloc_node(struct kmem_cache *cachep, 1089static inline void *____cache_alloc_node(struct kmem_cache *cachep,
956 gfp_t flags, int nodeid) 1090 gfp_t flags, int nodeid, int *this_cpu)
957{ 1091{
958 return NULL; 1092 return NULL;
959} 1093}
960 1094
961#else /* CONFIG_NUMA */ 1095#else /* CONFIG_NUMA */
962 1096
963static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 1097static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int, int *);
964static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1098static void *alternate_node_alloc(struct kmem_cache *, gfp_t, int *);
965 1099
966static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 1100static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
967{ 1101{
@@ -1002,7 +1136,8 @@ static void free_alien_cache(struct array_cache **ac_ptr)
1002} 1136}
1003 1137
1004static void __drain_alien_cache(struct kmem_cache *cachep, 1138static void __drain_alien_cache(struct kmem_cache *cachep,
1005 struct array_cache *ac, int node) 1139 struct array_cache *ac, int node,
1140 int *this_cpu)
1006{ 1141{
1007 struct kmem_list3 *rl3 = cachep->nodelists[node]; 1142 struct kmem_list3 *rl3 = cachep->nodelists[node];
1008 1143
@@ -1016,7 +1151,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
1016 if (rl3->shared) 1151 if (rl3->shared)
1017 transfer_objects(rl3->shared, ac, ac->limit); 1152 transfer_objects(rl3->shared, ac, ac->limit);
1018 1153
1019 free_block(cachep, ac->entry, ac->avail, node); 1154 free_block(cachep, ac->entry, ac->avail, node, this_cpu);
1020 ac->avail = 0; 1155 ac->avail = 0;
1021 spin_unlock(&rl3->list_lock); 1156 spin_unlock(&rl3->list_lock);
1022 } 1157 }
@@ -1025,38 +1160,42 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
1025/* 1160/*
1026 * Called from cache_reap() to regularly drain alien caches round robin. 1161 * Called from cache_reap() to regularly drain alien caches round robin.
1027 */ 1162 */
1028static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) 1163static int
1164reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3, int *this_cpu)
1029{ 1165{
1030 int node = __get_cpu_var(reap_node); 1166 int node = per_cpu(reap_node, *this_cpu);
1031 1167
1032 if (l3->alien) { 1168 if (l3->alien) {
1033 struct array_cache *ac = l3->alien[node]; 1169 struct array_cache *ac = l3->alien[node];
1034 1170
1035 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { 1171 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1036 __drain_alien_cache(cachep, ac, node); 1172 __drain_alien_cache(cachep, ac, node, this_cpu);
1037 spin_unlock_irq(&ac->lock); 1173 spin_unlock_irq(&ac->lock);
1174 return 1;
1038 } 1175 }
1039 } 1176 }
1177 return 0;
1040} 1178}
1041 1179
1042static void drain_alien_cache(struct kmem_cache *cachep, 1180static void drain_alien_cache(struct kmem_cache *cachep,
1043 struct array_cache **alien) 1181 struct array_cache **alien)
1044{ 1182{
1045 int i = 0; 1183 int i = 0, this_cpu;
1046 struct array_cache *ac; 1184 struct array_cache *ac;
1047 unsigned long flags; 1185 unsigned long flags;
1048 1186
1049 for_each_online_node(i) { 1187 for_each_online_node(i) {
1050 ac = alien[i]; 1188 ac = alien[i];
1051 if (ac) { 1189 if (ac) {
1052 spin_lock_irqsave(&ac->lock, flags); 1190 slab_spin_lock_irqsave(&ac->lock, flags, this_cpu);
1053 __drain_alien_cache(cachep, ac, i); 1191 __drain_alien_cache(cachep, ac, i, &this_cpu);
1054 spin_unlock_irqrestore(&ac->lock, flags); 1192 slab_spin_unlock_irqrestore(&ac->lock, flags, this_cpu);
1055 } 1193 }
1056 } 1194 }
1057} 1195}
1058 1196
1059static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1197static inline int
1198cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu)
1060{ 1199{
1061 struct slab *slabp = virt_to_slab(objp); 1200 struct slab *slabp = virt_to_slab(objp);
1062 int nodeid = slabp->nodeid; 1201 int nodeid = slabp->nodeid;
@@ -1064,7 +1203,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1064 struct array_cache *alien = NULL; 1203 struct array_cache *alien = NULL;
1065 int node; 1204 int node;
1066 1205
1067 node = numa_node_id(); 1206 node = cpu_to_node(*this_cpu);
1068 1207
1069 /* 1208 /*
1070 * Make sure we are not freeing a object from another node to the array 1209 * Make sure we are not freeing a object from another node to the array
@@ -1080,20 +1219,20 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1080 spin_lock(&alien->lock); 1219 spin_lock(&alien->lock);
1081 if (unlikely(alien->avail == alien->limit)) { 1220 if (unlikely(alien->avail == alien->limit)) {
1082 STATS_INC_ACOVERFLOW(cachep); 1221 STATS_INC_ACOVERFLOW(cachep);
1083 __drain_alien_cache(cachep, alien, nodeid); 1222 __drain_alien_cache(cachep, alien, nodeid, this_cpu);
1084 } 1223 }
1085 alien->entry[alien->avail++] = objp; 1224 alien->entry[alien->avail++] = objp;
1086 spin_unlock(&alien->lock); 1225 spin_unlock(&alien->lock);
1087 } else { 1226 } else {
1088 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1227 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1089 free_block(cachep, &objp, 1, nodeid); 1228 free_block(cachep, &objp, 1, nodeid, this_cpu);
1090 spin_unlock(&(cachep->nodelists[nodeid])->list_lock); 1229 spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1091 } 1230 }
1092 return 1; 1231 return 1;
1093} 1232}
1094#endif 1233#endif
1095 1234
1096static void __cpuinit cpuup_canceled(long cpu) 1235static void __cpuinit cpuup_canceled(int cpu)
1097{ 1236{
1098 struct kmem_cache *cachep; 1237 struct kmem_cache *cachep;
1099 struct kmem_list3 *l3 = NULL; 1238 struct kmem_list3 *l3 = NULL;
@@ -1104,6 +1243,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1104 struct array_cache *nc; 1243 struct array_cache *nc;
1105 struct array_cache *shared; 1244 struct array_cache *shared;
1106 struct array_cache **alien; 1245 struct array_cache **alien;
1246 int orig_cpu = cpu;
1107 1247
1108 /* cpu is dead; no one can alloc from it. */ 1248 /* cpu is dead; no one can alloc from it. */
1109 nc = cachep->array[cpu]; 1249 nc = cachep->array[cpu];
@@ -1118,7 +1258,8 @@ static void __cpuinit cpuup_canceled(long cpu)
1118 /* Free limit for this kmem_list3 */ 1258 /* Free limit for this kmem_list3 */
1119 l3->free_limit -= cachep->batchcount; 1259 l3->free_limit -= cachep->batchcount;
1120 if (nc) 1260 if (nc)
1121 free_block(cachep, nc->entry, nc->avail, node); 1261 free_block(cachep, nc->entry, nc->avail, node,
1262 &cpu);
1122 1263
1123 if (!cpus_empty(*mask)) { 1264 if (!cpus_empty(*mask)) {
1124 spin_unlock_irq(&l3->list_lock); 1265 spin_unlock_irq(&l3->list_lock);
@@ -1128,7 +1269,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1128 shared = l3->shared; 1269 shared = l3->shared;
1129 if (shared) { 1270 if (shared) {
1130 free_block(cachep, shared->entry, 1271 free_block(cachep, shared->entry,
1131 shared->avail, node); 1272 shared->avail, node, &cpu);
1132 l3->shared = NULL; 1273 l3->shared = NULL;
1133 } 1274 }
1134 1275
@@ -1144,6 +1285,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1144 } 1285 }
1145free_array_cache: 1286free_array_cache:
1146 kfree(nc); 1287 kfree(nc);
1288 BUG_ON(cpu != orig_cpu);
1147 } 1289 }
1148 /* 1290 /*
1149 * In the previous loop, all the objects were freed to 1291 * In the previous loop, all the objects were freed to
@@ -1158,7 +1300,7 @@ free_array_cache:
1158 } 1300 }
1159} 1301}
1160 1302
1161static int __cpuinit cpuup_prepare(long cpu) 1303static int __cpuinit cpuup_prepare(int cpu)
1162{ 1304{
1163 struct kmem_cache *cachep; 1305 struct kmem_cache *cachep;
1164 struct kmem_list3 *l3 = NULL; 1306 struct kmem_list3 *l3 = NULL;
@@ -1266,10 +1408,19 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1266 long cpu = (long)hcpu; 1408 long cpu = (long)hcpu;
1267 int err = 0; 1409 int err = 0;
1268 1410
1411
1269 switch (action) { 1412 switch (action) {
1270 case CPU_UP_PREPARE: 1413 case CPU_UP_PREPARE:
1271 case CPU_UP_PREPARE_FROZEN: 1414 case CPU_UP_PREPARE_FROZEN:
1272 mutex_lock(&cache_chain_mutex); 1415 mutex_lock(&cache_chain_mutex);
1416 /*
1417 * lock/unlock cycle to push any holders away -- no new ones
1418 * can come in due to the cpu still being offline.
1419 *
1420 * XXX -- weird case anyway, can it happen?
1421 */
1422 slab_irq_disable_this_rt(cpu);
1423 slab_irq_enable_rt(cpu);
1273 err = cpuup_prepare(cpu); 1424 err = cpuup_prepare(cpu);
1274 mutex_unlock(&cache_chain_mutex); 1425 mutex_unlock(&cache_chain_mutex);
1275 break; 1426 break;
@@ -1309,10 +1460,14 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1309 case CPU_UP_CANCELED: 1460 case CPU_UP_CANCELED:
1310 case CPU_UP_CANCELED_FROZEN: 1461 case CPU_UP_CANCELED_FROZEN:
1311 mutex_lock(&cache_chain_mutex); 1462 mutex_lock(&cache_chain_mutex);
1463 slab_irq_disable_this_rt(cpu);
1312 cpuup_canceled(cpu); 1464 cpuup_canceled(cpu);
1465 slab_irq_enable_rt(cpu);
1313 mutex_unlock(&cache_chain_mutex); 1466 mutex_unlock(&cache_chain_mutex);
1314 break; 1467 break;
1315 } 1468 }
1469
1470
1316 return err ? NOTIFY_BAD : NOTIFY_OK; 1471 return err ? NOTIFY_BAD : NOTIFY_OK;
1317} 1472}
1318 1473
@@ -1370,6 +1525,12 @@ void __init kmem_cache_init(void)
1370 int order; 1525 int order;
1371 int node; 1526 int node;
1372 1527
1528#ifdef CONFIG_PREEMPT_RT
1529 for_each_possible_cpu(i) {
1530 INIT_LIST_HEAD(&__get_cpu_var_locked(slab, i));
1531 }
1532#endif
1533
1373 if (num_possible_nodes() == 1) 1534 if (num_possible_nodes() == 1)
1374 use_alien_caches = 0; 1535 use_alien_caches = 0;
1375 1536
@@ -1499,32 +1660,34 @@ void __init kmem_cache_init(void)
1499 /* 4) Replace the bootstrap head arrays */ 1660 /* 4) Replace the bootstrap head arrays */
1500 { 1661 {
1501 struct array_cache *ptr; 1662 struct array_cache *ptr;
1663 int cpu = smp_processor_id();
1502 1664
1503 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1665 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1504 1666
1505 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1667 BUG_ON(cpu_cache_get(&cache_cache, cpu) !=
1506 memcpy(ptr, cpu_cache_get(&cache_cache), 1668 &initarray_cache.cache);
1669 memcpy(ptr, cpu_cache_get(&cache_cache, cpu),
1507 sizeof(struct arraycache_init)); 1670 sizeof(struct arraycache_init));
1508 /* 1671 /*
1509 * Do not assume that spinlocks can be initialized via memcpy: 1672 * Do not assume that spinlocks can be initialized via memcpy:
1510 */ 1673 */
1511 spin_lock_init(&ptr->lock); 1674 spin_lock_init(&ptr->lock);
1512 1675
1513 cache_cache.array[smp_processor_id()] = ptr; 1676 cache_cache.array[cpu] = ptr;
1514 1677
1515 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1678 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1516 1679
1517 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) 1680 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, cpu)
1518 != &initarray_generic.cache); 1681 != &initarray_generic.cache);
1519 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), 1682 memcpy(ptr,
1683 cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, cpu),
1520 sizeof(struct arraycache_init)); 1684 sizeof(struct arraycache_init));
1521 /* 1685 /*
1522 * Do not assume that spinlocks can be initialized via memcpy: 1686 * Do not assume that spinlocks can be initialized via memcpy:
1523 */ 1687 */
1524 spin_lock_init(&ptr->lock); 1688 spin_lock_init(&ptr->lock);
1525 1689
1526 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1690 malloc_sizes[INDEX_AC].cs_cachep->array[cpu] = ptr;
1527 ptr;
1528 } 1691 }
1529 /* 5) Replace the bootstrap kmem_list3's */ 1692 /* 5) Replace the bootstrap kmem_list3's */
1530 { 1693 {
@@ -1642,12 +1805,14 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1642/* 1805/*
1643 * Interface to system's page release. 1806 * Interface to system's page release.
1644 */ 1807 */
1645static void kmem_freepages(struct kmem_cache *cachep, void *addr) 1808static void kmem_freepages(struct kmem_cache *cachep, void *addr, int cpu)
1646{ 1809{
1647 unsigned long i = (1 << cachep->gfporder); 1810 unsigned long i = (1 << cachep->gfporder);
1648 struct page *page = virt_to_page(addr); 1811 struct page *page, *basepage = virt_to_page(addr);
1649 const unsigned long nr_freed = i; 1812 const unsigned long nr_freed = i;
1650 1813
1814 page = basepage;
1815
1651 kmemcheck_free_shadow(page, cachep->gfporder); 1816 kmemcheck_free_shadow(page, cachep->gfporder);
1652 1817
1653 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1818 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
@@ -1656,6 +1821,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1656 else 1821 else
1657 sub_zone_page_state(page_zone(page), 1822 sub_zone_page_state(page_zone(page),
1658 NR_SLAB_UNRECLAIMABLE, nr_freed); 1823 NR_SLAB_UNRECLAIMABLE, nr_freed);
1824
1659 while (i--) { 1825 while (i--) {
1660 BUG_ON(!PageSlab(page)); 1826 BUG_ON(!PageSlab(page));
1661 __ClearPageSlab(page); 1827 __ClearPageSlab(page);
@@ -1663,6 +1829,13 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1663 } 1829 }
1664 if (current->reclaim_state) 1830 if (current->reclaim_state)
1665 current->reclaim_state->reclaimed_slab += nr_freed; 1831 current->reclaim_state->reclaimed_slab += nr_freed;
1832
1833#ifdef CONFIG_PREEMPT_RT
1834 if (cpu >= 0) {
1835 basepage->index = cachep->gfporder;
1836 list_add(&basepage->lru, &__get_cpu_var_locked(slab, cpu));
1837 } else
1838#endif
1666 free_pages((unsigned long)addr, cachep->gfporder); 1839 free_pages((unsigned long)addr, cachep->gfporder);
1667} 1840}
1668 1841
@@ -1671,7 +1844,7 @@ static void kmem_rcu_free(struct rcu_head *head)
1671 struct slab_rcu *slab_rcu = (struct slab_rcu *)head; 1844 struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1672 struct kmem_cache *cachep = slab_rcu->cachep; 1845 struct kmem_cache *cachep = slab_rcu->cachep;
1673 1846
1674 kmem_freepages(cachep, slab_rcu->addr); 1847 kmem_freepages(cachep, slab_rcu->addr, -1);
1675 if (OFF_SLAB(cachep)) 1848 if (OFF_SLAB(cachep))
1676 kmem_cache_free(cachep->slabp_cache, slab_rcu); 1849 kmem_cache_free(cachep->slabp_cache, slab_rcu);
1677} 1850}
@@ -1691,7 +1864,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1691 1864
1692 *addr++ = 0x12345678; 1865 *addr++ = 0x12345678;
1693 *addr++ = caller; 1866 *addr++ = caller;
1694 *addr++ = smp_processor_id(); 1867 *addr++ = raw_smp_processor_id();
1695 size -= 3 * sizeof(unsigned long); 1868 size -= 3 * sizeof(unsigned long);
1696 { 1869 {
1697 unsigned long *sptr = &caller; 1870 unsigned long *sptr = &caller;
@@ -1881,6 +2054,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
1881} 2054}
1882#endif 2055#endif
1883 2056
2057static void
2058__cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu);
2059
2060
1884/** 2061/**
1885 * slab_destroy - destroy and release all objects in a slab 2062 * slab_destroy - destroy and release all objects in a slab
1886 * @cachep: cache pointer being destroyed 2063 * @cachep: cache pointer being destroyed
@@ -1890,7 +2067,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
1890 * Before calling the slab must have been unlinked from the cache. The 2067 * Before calling the slab must have been unlinked from the cache. The
1891 * cache-lock is not held/needed. 2068 * cache-lock is not held/needed.
1892 */ 2069 */
1893static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) 2070static void
2071slab_destroy(struct kmem_cache *cachep, struct slab *slabp, int *this_cpu)
1894{ 2072{
1895 void *addr = slabp->s_mem - slabp->colouroff; 2073 void *addr = slabp->s_mem - slabp->colouroff;
1896 2074
@@ -1903,9 +2081,13 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1903 slab_rcu->addr = addr; 2081 slab_rcu->addr = addr;
1904 call_rcu(&slab_rcu->head, kmem_rcu_free); 2082 call_rcu(&slab_rcu->head, kmem_rcu_free);
1905 } else { 2083 } else {
1906 kmem_freepages(cachep, addr); 2084 kmem_freepages(cachep, addr, *this_cpu);
1907 if (OFF_SLAB(cachep)) 2085 if (OFF_SLAB(cachep)) {
1908 kmem_cache_free(cachep->slabp_cache, slabp); 2086 if (this_cpu)
2087 __cache_free(cachep->slabp_cache, slabp, this_cpu);
2088 else
2089 kmem_cache_free(cachep->slabp_cache, slabp);
2090 }
1909 } 2091 }
1910} 2092}
1911 2093
@@ -2002,6 +2184,8 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2002 2184
2003static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2185static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2004{ 2186{
2187 int this_cpu;
2188
2005 if (g_cpucache_up == FULL) 2189 if (g_cpucache_up == FULL)
2006 return enable_cpucache(cachep, gfp); 2190 return enable_cpucache(cachep, gfp);
2007 2191
@@ -2045,10 +2229,12 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2045 jiffies + REAPTIMEOUT_LIST3 + 2229 jiffies + REAPTIMEOUT_LIST3 +
2046 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 2230 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2047 2231
2048 cpu_cache_get(cachep)->avail = 0; 2232 this_cpu = raw_smp_processor_id();
2049 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 2233
2050 cpu_cache_get(cachep)->batchcount = 1; 2234 cpu_cache_get(cachep, this_cpu)->avail = 0;
2051 cpu_cache_get(cachep)->touched = 0; 2235 cpu_cache_get(cachep, this_cpu)->limit = BOOT_CPUCACHE_ENTRIES;
2236 cpu_cache_get(cachep, this_cpu)->batchcount = 1;
2237 cpu_cache_get(cachep, this_cpu)->touched = 0;
2052 cachep->batchcount = 1; 2238 cachep->batchcount = 1;
2053 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2239 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2054 return 0; 2240 return 0;
@@ -2358,19 +2544,19 @@ EXPORT_SYMBOL(kmem_cache_create);
2358#if DEBUG 2544#if DEBUG
2359static void check_irq_off(void) 2545static void check_irq_off(void)
2360{ 2546{
2547/*
2548 * On PREEMPT_RT we use locks to protect the per-CPU lists,
2549 * and keep interrupts enabled.
2550 */
2551#ifndef CONFIG_PREEMPT_RT
2361 BUG_ON(!irqs_disabled()); 2552 BUG_ON(!irqs_disabled());
2553#endif
2362} 2554}
2363 2555
2364static void check_irq_on(void) 2556static void check_irq_on(void)
2365{ 2557{
2558#ifndef CONFIG_PREEMPT_RT
2366 BUG_ON(irqs_disabled()); 2559 BUG_ON(irqs_disabled());
2367}
2368
2369static void check_spinlock_acquired(struct kmem_cache *cachep)
2370{
2371#ifdef CONFIG_SMP
2372 check_irq_off();
2373 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
2374#endif 2560#endif
2375} 2561}
2376 2562
@@ -2385,34 +2571,67 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2385#else 2571#else
2386#define check_irq_off() do { } while(0) 2572#define check_irq_off() do { } while(0)
2387#define check_irq_on() do { } while(0) 2573#define check_irq_on() do { } while(0)
2388#define check_spinlock_acquired(x) do { } while(0)
2389#define check_spinlock_acquired_node(x, y) do { } while(0) 2574#define check_spinlock_acquired_node(x, y) do { } while(0)
2390#endif 2575#endif
2391 2576
2392static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 2577static int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2393 struct array_cache *ac, 2578 struct array_cache *ac,
2394 int force, int node); 2579 int force, int node);
2395 2580
2396static void do_drain(void *arg) 2581static void __do_drain(void *arg, int this_cpu)
2397{ 2582{
2398 struct kmem_cache *cachep = arg; 2583 struct kmem_cache *cachep = arg;
2584 int node = cpu_to_node(this_cpu);
2399 struct array_cache *ac; 2585 struct array_cache *ac;
2400 int node = numa_node_id();
2401 2586
2402 check_irq_off(); 2587 check_irq_off();
2403 ac = cpu_cache_get(cachep); 2588 ac = cpu_cache_get(cachep, this_cpu);
2404 spin_lock(&cachep->nodelists[node]->list_lock); 2589 spin_lock(&cachep->nodelists[node]->list_lock);
2405 free_block(cachep, ac->entry, ac->avail, node); 2590 free_block(cachep, ac->entry, ac->avail, node, &this_cpu);
2406 spin_unlock(&cachep->nodelists[node]->list_lock); 2591 spin_unlock(&cachep->nodelists[node]->list_lock);
2407 ac->avail = 0; 2592 ac->avail = 0;
2408} 2593}
2409 2594
2595#ifdef CONFIG_PREEMPT_RT
2596static void do_drain(void *arg, int this_cpu)
2597{
2598 __do_drain(arg, this_cpu);
2599}
2600#else
2601static void do_drain(void *arg)
2602{
2603 __do_drain(arg, smp_processor_id());
2604}
2605#endif
2606
2607#ifdef CONFIG_PREEMPT_RT
2608/*
2609 * execute func() for all CPUs. On PREEMPT_RT we dont actually have
2610 * to run on the remote CPUs - we only have to take their CPU-locks.
2611 * (This is a rare operation, so cacheline bouncing is not an issue.)
2612 */
2613static void
2614slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg)
2615{
2616 unsigned int i;
2617
2618 check_irq_on();
2619 for_each_online_cpu(i) {
2620 spin_lock(&__get_cpu_lock(slab, i));
2621 func(arg, i);
2622 spin_unlock(&__get_cpu_lock(slab, i));
2623 }
2624}
2625#else
2626# define slab_on_each_cpu(func, cachep) on_each_cpu(func, cachep, 1)
2627#endif
2628
2410static void drain_cpu_caches(struct kmem_cache *cachep) 2629static void drain_cpu_caches(struct kmem_cache *cachep)
2411{ 2630{
2412 struct kmem_list3 *l3; 2631 struct kmem_list3 *l3;
2413 int node; 2632 int node;
2414 2633
2415 on_each_cpu(do_drain, cachep, 1); 2634 slab_on_each_cpu(do_drain, cachep);
2416 check_irq_on(); 2635 check_irq_on();
2417 for_each_online_node(node) { 2636 for_each_online_node(node) {
2418 l3 = cachep->nodelists[node]; 2637 l3 = cachep->nodelists[node];
@@ -2437,16 +2656,16 @@ static int drain_freelist(struct kmem_cache *cache,
2437 struct kmem_list3 *l3, int tofree) 2656 struct kmem_list3 *l3, int tofree)
2438{ 2657{
2439 struct list_head *p; 2658 struct list_head *p;
2440 int nr_freed; 2659 int nr_freed, this_cpu;
2441 struct slab *slabp; 2660 struct slab *slabp;
2442 2661
2443 nr_freed = 0; 2662 nr_freed = 0;
2444 while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { 2663 while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2445 2664
2446 spin_lock_irq(&l3->list_lock); 2665 slab_spin_lock_irq(&l3->list_lock, this_cpu);
2447 p = l3->slabs_free.prev; 2666 p = l3->slabs_free.prev;
2448 if (p == &l3->slabs_free) { 2667 if (p == &l3->slabs_free) {
2449 spin_unlock_irq(&l3->list_lock); 2668 slab_spin_unlock_irq(&l3->list_lock, this_cpu);
2450 goto out; 2669 goto out;
2451 } 2670 }
2452 2671
@@ -2455,13 +2674,9 @@ static int drain_freelist(struct kmem_cache *cache,
2455 BUG_ON(slabp->inuse); 2674 BUG_ON(slabp->inuse);
2456#endif 2675#endif
2457 list_del(&slabp->list); 2676 list_del(&slabp->list);
2458 /*
2459 * Safe to drop the lock. The slab is no longer linked
2460 * to the cache.
2461 */
2462 l3->free_objects -= cache->num; 2677 l3->free_objects -= cache->num;
2463 spin_unlock_irq(&l3->list_lock); 2678 slab_destroy(cache, slabp, &this_cpu);
2464 slab_destroy(cache, slabp); 2679 slab_spin_unlock_irq(&l3->list_lock, this_cpu);
2465 nr_freed++; 2680 nr_freed++;
2466 } 2681 }
2467out: 2682out:
@@ -2725,8 +2940,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2725 * Grow (by 1) the number of slabs within a cache. This is called by 2940 * Grow (by 1) the number of slabs within a cache. This is called by
2726 * kmem_cache_alloc() when there are no active objs left in a cache. 2941 * kmem_cache_alloc() when there are no active objs left in a cache.
2727 */ 2942 */
2728static int cache_grow(struct kmem_cache *cachep, 2943static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid,
2729 gfp_t flags, int nodeid, void *objp) 2944 void *objp, int *this_cpu)
2730{ 2945{
2731 struct slab *slabp; 2946 struct slab *slabp;
2732 size_t offset; 2947 size_t offset;
@@ -2754,8 +2969,7 @@ static int cache_grow(struct kmem_cache *cachep,
2754 2969
2755 offset *= cachep->colour_off; 2970 offset *= cachep->colour_off;
2756 2971
2757 if (local_flags & __GFP_WAIT) 2972 slab_irq_enable_GFP_WAIT(local_flags, this_cpu);
2758 local_irq_enable();
2759 2973
2760 /* 2974 /*
2761 * The test for missing atomic flag is performed here, rather than 2975 * The test for missing atomic flag is performed here, rather than
@@ -2784,8 +2998,8 @@ static int cache_grow(struct kmem_cache *cachep,
2784 2998
2785 cache_init_objs(cachep, slabp); 2999 cache_init_objs(cachep, slabp);
2786 3000
2787 if (local_flags & __GFP_WAIT) 3001 slab_irq_disable_GFP_WAIT(local_flags, this_cpu);
2788 local_irq_disable(); 3002
2789 check_irq_off(); 3003 check_irq_off();
2790 spin_lock(&l3->list_lock); 3004 spin_lock(&l3->list_lock);
2791 3005
@@ -2796,10 +3010,9 @@ static int cache_grow(struct kmem_cache *cachep,
2796 spin_unlock(&l3->list_lock); 3010 spin_unlock(&l3->list_lock);
2797 return 1; 3011 return 1;
2798opps1: 3012opps1:
2799 kmem_freepages(cachep, objp); 3013 kmem_freepages(cachep, objp, -1);
2800failed: 3014failed:
2801 if (local_flags & __GFP_WAIT) 3015 slab_irq_disable_GFP_WAIT(local_flags, this_cpu);
2802 local_irq_disable();
2803 return 0; 3016 return 0;
2804} 3017}
2805 3018
@@ -2921,7 +3134,8 @@ bad:
2921#define check_slabp(x,y) do { } while(0) 3134#define check_slabp(x,y) do { } while(0)
2922#endif 3135#endif
2923 3136
2924static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 3137static void *
3138cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, int *this_cpu)
2925{ 3139{
2926 int batchcount; 3140 int batchcount;
2927 struct kmem_list3 *l3; 3141 struct kmem_list3 *l3;
@@ -2931,7 +3145,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2931retry: 3145retry:
2932 check_irq_off(); 3146 check_irq_off();
2933 node = numa_node_id(); 3147 node = numa_node_id();
2934 ac = cpu_cache_get(cachep); 3148 ac = cpu_cache_get(cachep, *this_cpu);
2935 batchcount = ac->batchcount; 3149 batchcount = ac->batchcount;
2936 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 3150 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2937 /* 3151 /*
@@ -2941,7 +3155,7 @@ retry:
2941 */ 3155 */
2942 batchcount = BATCHREFILL_LIMIT; 3156 batchcount = BATCHREFILL_LIMIT;
2943 } 3157 }
2944 l3 = cachep->nodelists[node]; 3158 l3 = cachep->nodelists[cpu_to_node(*this_cpu)];
2945 3159
2946 BUG_ON(ac->avail > 0 || !l3); 3160 BUG_ON(ac->avail > 0 || !l3);
2947 spin_lock(&l3->list_lock); 3161 spin_lock(&l3->list_lock);
@@ -2964,7 +3178,7 @@ retry:
2964 3178
2965 slabp = list_entry(entry, struct slab, list); 3179 slabp = list_entry(entry, struct slab, list);
2966 check_slabp(cachep, slabp); 3180 check_slabp(cachep, slabp);
2967 check_spinlock_acquired(cachep); 3181 check_spinlock_acquired_node(cachep, cpu_to_node(*this_cpu));
2968 3182
2969 /* 3183 /*
2970 * The slab was either on partial or free list so 3184 * The slab was either on partial or free list so
@@ -2978,8 +3192,9 @@ retry:
2978 STATS_INC_ACTIVE(cachep); 3192 STATS_INC_ACTIVE(cachep);
2979 STATS_SET_HIGH(cachep); 3193 STATS_SET_HIGH(cachep);
2980 3194
2981 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 3195 ac->entry[ac->avail++] =
2982 node); 3196 slab_get_obj(cachep, slabp,
3197 cpu_to_node(*this_cpu));
2983 } 3198 }
2984 check_slabp(cachep, slabp); 3199 check_slabp(cachep, slabp);
2985 3200
@@ -2998,10 +3213,10 @@ alloc_done:
2998 3213
2999 if (unlikely(!ac->avail)) { 3214 if (unlikely(!ac->avail)) {
3000 int x; 3215 int x;
3001 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3216 x = cache_grow(cachep, flags | GFP_THISNODE, cpu_to_node(*this_cpu), NULL, this_cpu);
3002 3217
3003 /* cache_grow can reenable interrupts, then ac could change. */ 3218 /* cache_grow can reenable interrupts, then ac could change. */
3004 ac = cpu_cache_get(cachep); 3219 ac = cpu_cache_get(cachep, *this_cpu);
3005 if (!x && ac->avail == 0) /* no objects in sight? abort */ 3220 if (!x && ac->avail == 0) /* no objects in sight? abort */
3006 return NULL; 3221 return NULL;
3007 3222
@@ -3088,21 +3303,22 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3088 return should_failslab(obj_size(cachep), flags); 3303 return should_failslab(obj_size(cachep), flags);
3089} 3304}
3090 3305
3091static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3306static inline void *
3307____cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu)
3092{ 3308{
3093 void *objp; 3309 void *objp;
3094 struct array_cache *ac; 3310 struct array_cache *ac;
3095 3311
3096 check_irq_off(); 3312 check_irq_off();
3097 3313
3098 ac = cpu_cache_get(cachep); 3314 ac = cpu_cache_get(cachep, *this_cpu);
3099 if (likely(ac->avail)) { 3315 if (likely(ac->avail)) {
3100 STATS_INC_ALLOCHIT(cachep); 3316 STATS_INC_ALLOCHIT(cachep);
3101 ac->touched = 1; 3317 ac->touched = 1;
3102 objp = ac->entry[--ac->avail]; 3318 objp = ac->entry[--ac->avail];
3103 } else { 3319 } else {
3104 STATS_INC_ALLOCMISS(cachep); 3320 STATS_INC_ALLOCMISS(cachep);
3105 objp = cache_alloc_refill(cachep, flags); 3321 objp = cache_alloc_refill(cachep, flags, this_cpu);
3106 } 3322 }
3107 /* 3323 /*
3108 * To avoid a false negative, if an object that is in one of the 3324 * To avoid a false negative, if an object that is in one of the
@@ -3120,7 +3336,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3120 * If we are in_interrupt, then process context, including cpusets and 3336 * If we are in_interrupt, then process context, including cpusets and
3121 * mempolicy, may not apply and should not be used for allocation policy. 3337 * mempolicy, may not apply and should not be used for allocation policy.
3122 */ 3338 */
3123static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 3339static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags,
3340 int *this_cpu)
3124{ 3341{
3125 int nid_alloc, nid_here; 3342 int nid_alloc, nid_here;
3126 3343
@@ -3132,7 +3349,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3132 else if (current->mempolicy) 3349 else if (current->mempolicy)
3133 nid_alloc = slab_node(current->mempolicy); 3350 nid_alloc = slab_node(current->mempolicy);
3134 if (nid_alloc != nid_here) 3351 if (nid_alloc != nid_here)
3135 return ____cache_alloc_node(cachep, flags, nid_alloc); 3352 return ____cache_alloc_node(cachep, flags, nid_alloc, this_cpu);
3136 return NULL; 3353 return NULL;
3137} 3354}
3138 3355
@@ -3144,7 +3361,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3144 * allocator to do its reclaim / fallback magic. We then insert the 3361 * allocator to do its reclaim / fallback magic. We then insert the
3145 * slab into the proper nodelist and then allocate from it. 3362 * slab into the proper nodelist and then allocate from it.
3146 */ 3363 */
3147static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3364static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu)
3148{ 3365{
3149 struct zonelist *zonelist; 3366 struct zonelist *zonelist;
3150 gfp_t local_flags; 3367 gfp_t local_flags;
@@ -3172,7 +3389,8 @@ retry:
3172 cache->nodelists[nid] && 3389 cache->nodelists[nid] &&
3173 cache->nodelists[nid]->free_objects) { 3390 cache->nodelists[nid]->free_objects) {
3174 obj = ____cache_alloc_node(cache, 3391 obj = ____cache_alloc_node(cache,
3175 flags | GFP_THISNODE, nid); 3392 flags | GFP_THISNODE, nid,
3393 this_cpu);
3176 if (obj) 3394 if (obj)
3177 break; 3395 break;
3178 } 3396 }
@@ -3185,20 +3403,21 @@ retry:
3185 * We may trigger various forms of reclaim on the allowed 3403 * We may trigger various forms of reclaim on the allowed
3186 * set and go into memory reserves if necessary. 3404 * set and go into memory reserves if necessary.
3187 */ 3405 */
3188 if (local_flags & __GFP_WAIT) 3406 slab_irq_enable_GFP_WAIT(local_flags, this_cpu);
3189 local_irq_enable(); 3407
3190 kmem_flagcheck(cache, flags); 3408 kmem_flagcheck(cache, flags);
3191 obj = kmem_getpages(cache, local_flags, numa_node_id()); 3409 obj = kmem_getpages(cache, local_flags, cpu_to_node(*this_cpu));
3192 if (local_flags & __GFP_WAIT) 3410
3193 local_irq_disable(); 3411 slab_irq_disable_GFP_WAIT(local_flags, this_cpu);
3412
3194 if (obj) { 3413 if (obj) {
3195 /* 3414 /*
3196 * Insert into the appropriate per node queues 3415 * Insert into the appropriate per node queues
3197 */ 3416 */
3198 nid = page_to_nid(virt_to_page(obj)); 3417 nid = page_to_nid(virt_to_page(obj));
3199 if (cache_grow(cache, flags, nid, obj)) { 3418 if (cache_grow(cache, flags, nid, obj, this_cpu)) {
3200 obj = ____cache_alloc_node(cache, 3419 obj = ____cache_alloc_node(cache,
3201 flags | GFP_THISNODE, nid); 3420 flags | GFP_THISNODE, nid, this_cpu);
3202 if (!obj) 3421 if (!obj)
3203 /* 3422 /*
3204 * Another processor may allocate the 3423 * Another processor may allocate the
@@ -3219,7 +3438,7 @@ retry:
3219 * An interface to enable slab creation on nodeid 3438 * An interface to enable slab creation on nodeid
3220 */ 3439 */
3221static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3440static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3222 int nodeid) 3441 int nodeid, int *this_cpu)
3223{ 3442{
3224 struct list_head *entry; 3443 struct list_head *entry;
3225 struct slab *slabp; 3444 struct slab *slabp;
@@ -3267,11 +3486,11 @@ retry:
3267 3486
3268must_grow: 3487must_grow:
3269 spin_unlock(&l3->list_lock); 3488 spin_unlock(&l3->list_lock);
3270 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); 3489 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL, this_cpu);
3271 if (x) 3490 if (x)
3272 goto retry; 3491 goto retry;
3273 3492
3274 return fallback_alloc(cachep, flags); 3493 return fallback_alloc(cachep, flags, this_cpu);
3275 3494
3276done: 3495done:
3277 return obj; 3496 return obj;
@@ -3294,6 +3513,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3294 void *caller) 3513 void *caller)
3295{ 3514{
3296 unsigned long save_flags; 3515 unsigned long save_flags;
3516 int this_cpu, this_node;
3297 void *ptr; 3517 void *ptr;
3298 3518
3299 flags &= gfp_allowed_mask; 3519 flags &= gfp_allowed_mask;
@@ -3304,32 +3524,34 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3304 return NULL; 3524 return NULL;
3305 3525
3306 cache_alloc_debugcheck_before(cachep, flags); 3526 cache_alloc_debugcheck_before(cachep, flags);
3307 local_irq_save(save_flags);
3308 3527
3528 slab_irq_save(save_flags, this_cpu);
3529
3530 this_node = cpu_to_node(this_cpu);
3309 if (unlikely(nodeid == -1)) 3531 if (unlikely(nodeid == -1))
3310 nodeid = numa_node_id(); 3532 nodeid = this_node;
3311 3533
3312 if (unlikely(!cachep->nodelists[nodeid])) { 3534 if (unlikely(!cachep->nodelists[nodeid])) {
3313 /* Node not bootstrapped yet */ 3535 /* Node not bootstrapped yet */
3314 ptr = fallback_alloc(cachep, flags); 3536 ptr = fallback_alloc(cachep, flags, &this_cpu);
3315 goto out; 3537 goto out;
3316 } 3538 }
3317 3539
3318 if (nodeid == numa_node_id()) { 3540 if (nodeid == this_node) {
3319 /* 3541 /*
3320 * Use the locally cached objects if possible. 3542 * Use the locally cached objects if possible.
3321 * However ____cache_alloc does not allow fallback 3543 * However ____cache_alloc does not allow fallback
3322 * to other nodes. It may fail while we still have 3544 * to other nodes. It may fail while we still have
3323 * objects on other nodes available. 3545 * objects on other nodes available.
3324 */ 3546 */
3325 ptr = ____cache_alloc(cachep, flags); 3547 ptr = ____cache_alloc(cachep, flags, &this_cpu);
3326 if (ptr) 3548 if (ptr)
3327 goto out; 3549 goto out;
3328 } 3550 }
3329 /* ___cache_alloc_node can fall back to other nodes */ 3551 /* ___cache_alloc_node can fall back to other nodes */
3330 ptr = ____cache_alloc_node(cachep, flags, nodeid); 3552 ptr = ____cache_alloc_node(cachep, flags, nodeid, &this_cpu);
3331 out: 3553 out:
3332 local_irq_restore(save_flags); 3554 slab_irq_restore(save_flags, this_cpu);
3333 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3555 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3334 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, 3556 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
3335 flags); 3557 flags);
@@ -3344,33 +3566,33 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3344} 3566}
3345 3567
3346static __always_inline void * 3568static __always_inline void *
3347__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) 3569__do_cache_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu)
3348{ 3570{
3349 void *objp; 3571 void *objp;
3350 3572
3351 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { 3573 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3352 objp = alternate_node_alloc(cache, flags); 3574 objp = alternate_node_alloc(cache, flags, this_cpu);
3353 if (objp) 3575 if (objp)
3354 goto out; 3576 goto out;
3355 } 3577 }
3356 objp = ____cache_alloc(cache, flags);
3357 3578
3579 objp = ____cache_alloc(cache, flags, this_cpu);
3358 /* 3580 /*
3359 * We may just have run out of memory on the local node. 3581 * We may just have run out of memory on the local node.
3360 * ____cache_alloc_node() knows how to locate memory on other nodes 3582 * ____cache_alloc_node() knows how to locate memory on other nodes
3361 */ 3583 */
3362 if (!objp) 3584 if (!objp)
3363 objp = ____cache_alloc_node(cache, flags, numa_node_id()); 3585 objp = ____cache_alloc_node(cache, flags,
3364 3586 cpu_to_node(*this_cpu), this_cpu);
3365 out: 3587 out:
3366 return objp; 3588 return objp;
3367} 3589}
3368#else 3590#else
3369 3591
3370static __always_inline void * 3592static __always_inline void *
3371__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3593__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu)
3372{ 3594{
3373 return ____cache_alloc(cachep, flags); 3595 return ____cache_alloc(cachep, flags, this_cpu);
3374} 3596}
3375 3597
3376#endif /* CONFIG_NUMA */ 3598#endif /* CONFIG_NUMA */
@@ -3379,6 +3601,7 @@ static __always_inline void *
3379__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) 3601__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3380{ 3602{
3381 unsigned long save_flags; 3603 unsigned long save_flags;
3604 int this_cpu;
3382 void *objp; 3605 void *objp;
3383 3606
3384 flags &= gfp_allowed_mask; 3607 flags &= gfp_allowed_mask;
@@ -3389,9 +3612,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3389 return NULL; 3612 return NULL;
3390 3613
3391 cache_alloc_debugcheck_before(cachep, flags); 3614 cache_alloc_debugcheck_before(cachep, flags);
3392 local_irq_save(save_flags); 3615 slab_irq_save(save_flags, this_cpu);
3393 objp = __do_cache_alloc(cachep, flags); 3616 objp = __do_cache_alloc(cachep, flags, &this_cpu);
3394 local_irq_restore(save_flags); 3617 slab_irq_restore(save_flags, this_cpu);
3395 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3618 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3396 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, 3619 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
3397 flags); 3620 flags);
@@ -3410,7 +3633,7 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3410 * Caller needs to acquire correct kmem_list's list_lock 3633 * Caller needs to acquire correct kmem_list's list_lock
3411 */ 3634 */
3412static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, 3635static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3413 int node) 3636 int node, int *this_cpu)
3414{ 3637{
3415 int i; 3638 int i;
3416 struct kmem_list3 *l3; 3639 struct kmem_list3 *l3;
@@ -3439,7 +3662,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3439 * a different cache, refer to comments before 3662 * a different cache, refer to comments before
3440 * alloc_slabmgmt. 3663 * alloc_slabmgmt.
3441 */ 3664 */
3442 slab_destroy(cachep, slabp); 3665 slab_destroy(cachep, slabp, this_cpu);
3443 } else { 3666 } else {
3444 list_add(&slabp->list, &l3->slabs_free); 3667 list_add(&slabp->list, &l3->slabs_free);
3445 } 3668 }
@@ -3453,11 +3676,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3453 } 3676 }
3454} 3677}
3455 3678
3456static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) 3679static void
3680cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac, int *this_cpu)
3457{ 3681{
3458 int batchcount; 3682 int batchcount;
3459 struct kmem_list3 *l3; 3683 struct kmem_list3 *l3;
3460 int node = numa_node_id(); 3684 int node = cpu_to_node(*this_cpu);
3461 3685
3462 batchcount = ac->batchcount; 3686 batchcount = ac->batchcount;
3463#if DEBUG 3687#if DEBUG
@@ -3479,7 +3703,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3479 } 3703 }
3480 } 3704 }
3481 3705
3482 free_block(cachep, ac->entry, batchcount, node); 3706 free_block(cachep, ac->entry, batchcount, node, this_cpu);
3483free_done: 3707free_done:
3484#if STATS 3708#if STATS
3485 { 3709 {
@@ -3508,9 +3732,10 @@ free_done:
3508 * Release an obj back to its cache. If the obj has a constructed state, it must 3732 * Release an obj back to its cache. If the obj has a constructed state, it must
3509 * be in this state _before_ it is released. Called with disabled ints. 3733 * be in this state _before_ it is released. Called with disabled ints.
3510 */ 3734 */
3511static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3735static inline void
3736__cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu)
3512{ 3737{
3513 struct array_cache *ac = cpu_cache_get(cachep); 3738 struct array_cache *ac = cpu_cache_get(cachep, *this_cpu);
3514 3739
3515 check_irq_off(); 3740 check_irq_off();
3516 kmemleak_free_recursive(objp, cachep->flags); 3741 kmemleak_free_recursive(objp, cachep->flags);
@@ -3525,7 +3750,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3525 * variable to skip the call, which is most likely to be present in 3750 * variable to skip the call, which is most likely to be present in
3526 * the cache. 3751 * the cache.
3527 */ 3752 */
3528 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) 3753 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp, this_cpu))
3529 return; 3754 return;
3530 3755
3531 if (likely(ac->avail < ac->limit)) { 3756 if (likely(ac->avail < ac->limit)) {
@@ -3534,7 +3759,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3534 return; 3759 return;
3535 } else { 3760 } else {
3536 STATS_INC_FREEMISS(cachep); 3761 STATS_INC_FREEMISS(cachep);
3537 cache_flusharray(cachep, ac); 3762 cache_flusharray(cachep, ac, this_cpu);
3538 ac->entry[ac->avail++] = objp; 3763 ac->entry[ac->avail++] = objp;
3539 } 3764 }
3540} 3765}
@@ -3733,13 +3958,14 @@ EXPORT_SYMBOL(__kmalloc);
3733void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3958void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3734{ 3959{
3735 unsigned long flags; 3960 unsigned long flags;
3961 int this_cpu;
3736 3962
3737 local_irq_save(flags); 3963 slab_irq_save(flags, this_cpu);
3738 debug_check_no_locks_freed(objp, obj_size(cachep)); 3964 debug_check_no_locks_freed(objp, obj_size(cachep));
3739 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3965 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3740 debug_check_no_obj_freed(objp, obj_size(cachep)); 3966 debug_check_no_obj_freed(objp, obj_size(cachep));
3741 __cache_free(cachep, objp); 3967 __cache_free(cachep, objp, &this_cpu);
3742 local_irq_restore(flags); 3968 slab_irq_restore(flags, this_cpu);
3743 3969
3744 trace_kmem_cache_free(_RET_IP_, objp); 3970 trace_kmem_cache_free(_RET_IP_, objp);
3745} 3971}
@@ -3758,18 +3984,19 @@ void kfree(const void *objp)
3758{ 3984{
3759 struct kmem_cache *c; 3985 struct kmem_cache *c;
3760 unsigned long flags; 3986 unsigned long flags;
3987 int this_cpu;
3761 3988
3762 trace_kfree(_RET_IP_, objp); 3989 trace_kfree(_RET_IP_, objp);
3763 3990
3764 if (unlikely(ZERO_OR_NULL_PTR(objp))) 3991 if (unlikely(ZERO_OR_NULL_PTR(objp)))
3765 return; 3992 return;
3766 local_irq_save(flags); 3993 slab_irq_save(flags, this_cpu);
3767 kfree_debugcheck(objp); 3994 kfree_debugcheck(objp);
3768 c = virt_to_cache(objp); 3995 c = virt_to_cache(objp);
3769 debug_check_no_locks_freed(objp, obj_size(c)); 3996 debug_check_no_locks_freed(objp, obj_size(c));
3770 debug_check_no_obj_freed(objp, obj_size(c)); 3997 debug_check_no_obj_freed(objp, obj_size(c));
3771 __cache_free(c, (void *)objp); 3998 __cache_free(c, (void *)objp, &this_cpu);
3772 local_irq_restore(flags); 3999 slab_irq_restore(flags, this_cpu);
3773} 4000}
3774EXPORT_SYMBOL(kfree); 4001EXPORT_SYMBOL(kfree);
3775 4002
@@ -3790,7 +4017,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
3790 */ 4017 */
3791static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) 4018static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3792{ 4019{
3793 int node; 4020 int node, this_cpu;
3794 struct kmem_list3 *l3; 4021 struct kmem_list3 *l3;
3795 struct array_cache *new_shared; 4022 struct array_cache *new_shared;
3796 struct array_cache **new_alien = NULL; 4023 struct array_cache **new_alien = NULL;
@@ -3818,11 +4045,11 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3818 if (l3) { 4045 if (l3) {
3819 struct array_cache *shared = l3->shared; 4046 struct array_cache *shared = l3->shared;
3820 4047
3821 spin_lock_irq(&l3->list_lock); 4048 slab_spin_lock_irq(&l3->list_lock, this_cpu);
3822 4049
3823 if (shared) 4050 if (shared)
3824 free_block(cachep, shared->entry, 4051 free_block(cachep, shared->entry,
3825 shared->avail, node); 4052 shared->avail, node, &this_cpu);
3826 4053
3827 l3->shared = new_shared; 4054 l3->shared = new_shared;
3828 if (!l3->alien) { 4055 if (!l3->alien) {
@@ -3831,7 +4058,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3831 } 4058 }
3832 l3->free_limit = (1 + nr_cpus_node(node)) * 4059 l3->free_limit = (1 + nr_cpus_node(node)) *
3833 cachep->batchcount + cachep->num; 4060 cachep->batchcount + cachep->num;
3834 spin_unlock_irq(&l3->list_lock); 4061 slab_spin_unlock_irq(&l3->list_lock, this_cpu);
3835 kfree(shared); 4062 kfree(shared);
3836 free_alien_cache(new_alien); 4063 free_alien_cache(new_alien);
3837 continue; 4064 continue;
@@ -3878,24 +4105,36 @@ struct ccupdate_struct {
3878 struct array_cache *new[NR_CPUS]; 4105 struct array_cache *new[NR_CPUS];
3879}; 4106};
3880 4107
3881static void do_ccupdate_local(void *info) 4108static void __do_ccupdate_local(void *info, int this_cpu)
3882{ 4109{
3883 struct ccupdate_struct *new = info; 4110 struct ccupdate_struct *new = info;
3884 struct array_cache *old; 4111 struct array_cache *old;
3885 4112
3886 check_irq_off(); 4113 check_irq_off();
3887 old = cpu_cache_get(new->cachep); 4114 old = cpu_cache_get(new->cachep, this_cpu);
3888 4115
3889 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 4116 new->cachep->array[this_cpu] = new->new[this_cpu];
3890 new->new[smp_processor_id()] = old; 4117 new->new[this_cpu] = old;
3891} 4118}
3892 4119
4120#ifdef CONFIG_PREEMPT_RT
4121static void do_ccupdate_local(void *arg, int this_cpu)
4122{
4123 __do_ccupdate_local(arg, this_cpu);
4124}
4125#else
4126static void do_ccupdate_local(void *arg)
4127{
4128 __do_ccupdate_local(arg, smp_processor_id());
4129}
4130#endif
4131
3893/* Always called with the cache_chain_mutex held */ 4132/* Always called with the cache_chain_mutex held */
3894static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 4133static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3895 int batchcount, int shared, gfp_t gfp) 4134 int batchcount, int shared, gfp_t gfp)
3896{ 4135{
3897 struct ccupdate_struct *new; 4136 struct ccupdate_struct *new;
3898 int i; 4137 int i, this_cpu;
3899 4138
3900 new = kzalloc(sizeof(*new), gfp); 4139 new = kzalloc(sizeof(*new), gfp);
3901 if (!new) 4140 if (!new)
@@ -3913,7 +4152,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3913 } 4152 }
3914 new->cachep = cachep; 4153 new->cachep = cachep;
3915 4154
3916 on_each_cpu(do_ccupdate_local, (void *)new, 1); 4155 slab_on_each_cpu(do_ccupdate_local, (void *)new);
3917 4156
3918 check_irq_on(); 4157 check_irq_on();
3919 cachep->batchcount = batchcount; 4158 cachep->batchcount = batchcount;
@@ -3924,9 +4163,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3924 struct array_cache *ccold = new->new[i]; 4163 struct array_cache *ccold = new->new[i];
3925 if (!ccold) 4164 if (!ccold)
3926 continue; 4165 continue;
3927 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 4166 slab_spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock,
3928 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); 4167 this_cpu);
3929 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 4168 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i),
4169 &this_cpu);
4170 slab_spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock,
4171 this_cpu);
3930 kfree(ccold); 4172 kfree(ccold);
3931 } 4173 }
3932 kfree(new); 4174 kfree(new);
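
The hunk above also swaps on_each_cpu() for slab_on_each_cpu() when redistributing the per-CPU array caches. On PREEMPT_RT the callback cannot run from IPI context with interrupts hard-disabled, because it needs the per-CPU slab lock, which is a sleeping lock there; hence the extra do_ccupdate_local() variant that receives an explicit this_cpu. The sketch below shows the shape such a helper would plausibly take -- its real definition appears earlier in the mm/slab.c diff and is not visible in this excerpt, so the body and the lock name are assumptions:

#ifndef CONFIG_PREEMPT_RT
/* mainline: keep the old IPI-based broadcast */
static void slab_on_each_cpu(void (*func)(void *), void *arg)
{
	on_each_cpu(func, arg, 1);
}
#else
/*
 * PREEMPT_RT sketch (assumed): walk the online CPUs and run the
 * callback locally while holding that CPU's slab lock, so the callback
 * may take sleeping locks.  "slab_irq_locks" is a hypothetical name
 * for the per-CPU locked variable guarding the slab fast path.
 */
static void slab_on_each_cpu(void (*func)(void *, int), void *arg)
{
	int cpu;

	for_each_online_cpu(cpu) {
		spin_lock(&__get_cpu_lock(slab_irq_locks, cpu));
		func(arg, cpu);
		spin_unlock(&__get_cpu_lock(slab_irq_locks, cpu));
	}
}
#endif
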
@@ -3991,29 +4233,31 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3991 * Drain an array if it contains any elements taking the l3 lock only if 4233 * Drain an array if it contains any elements taking the l3 lock only if
3992 * necessary. Note that the l3 listlock also protects the array_cache 4234 * necessary. Note that the l3 listlock also protects the array_cache
3993 * if drain_array() is used on the shared array. 4235 * if drain_array() is used on the shared array.
 4236 * Returns non-zero if some work was done.
3994 */ 4237 */
3995void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 4238int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3996 struct array_cache *ac, int force, int node) 4239 struct array_cache *ac, int force, int node)
3997{ 4240{
3998 int tofree; 4241 int tofree, this_cpu;
3999 4242
4000 if (!ac || !ac->avail) 4243 if (!ac || !ac->avail)
4001 return; 4244 return 0;
4002 if (ac->touched && !force) { 4245 if (ac->touched && !force) {
4003 ac->touched = 0; 4246 ac->touched = 0;
4004 } else { 4247 } else {
4005 spin_lock_irq(&l3->list_lock); 4248 slab_spin_lock_irq(&l3->list_lock, this_cpu);
4006 if (ac->avail) { 4249 if (ac->avail) {
4007 tofree = force ? ac->avail : (ac->limit + 4) / 5; 4250 tofree = force ? ac->avail : (ac->limit + 4) / 5;
4008 if (tofree > ac->avail) 4251 if (tofree > ac->avail)
4009 tofree = (ac->avail + 1) / 2; 4252 tofree = (ac->avail + 1) / 2;
4010 free_block(cachep, ac->entry, tofree, node); 4253 free_block(cachep, ac->entry, tofree, node, &this_cpu);
4011 ac->avail -= tofree; 4254 ac->avail -= tofree;
4012 memmove(ac->entry, &(ac->entry[tofree]), 4255 memmove(ac->entry, &(ac->entry[tofree]),
4013 sizeof(void *) * ac->avail); 4256 sizeof(void *) * ac->avail);
4014 } 4257 }
4015 spin_unlock_irq(&l3->list_lock); 4258 slab_spin_unlock_irq(&l3->list_lock, this_cpu);
4016 } 4259 }
4260 return 1;
4017} 4261}
4018 4262
4019/** 4263/**
@@ -4030,10 +4274,11 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
4030 */ 4274 */
4031static void cache_reap(struct work_struct *w) 4275static void cache_reap(struct work_struct *w)
4032{ 4276{
4277 int this_cpu = raw_smp_processor_id(), node = cpu_to_node(this_cpu);
4033 struct kmem_cache *searchp; 4278 struct kmem_cache *searchp;
4034 struct kmem_list3 *l3; 4279 struct kmem_list3 *l3;
4035 int node = numa_node_id();
4036 struct delayed_work *work = to_delayed_work(w); 4280 struct delayed_work *work = to_delayed_work(w);
4281 int work_done = 0;
4037 4282
4038 if (!mutex_trylock(&cache_chain_mutex)) 4283 if (!mutex_trylock(&cache_chain_mutex))
4039 /* Give up. Setup the next iteration. */ 4284 /* Give up. Setup the next iteration. */
@@ -4049,9 +4294,12 @@ static void cache_reap(struct work_struct *w)
4049 */ 4294 */
4050 l3 = searchp->nodelists[node]; 4295 l3 = searchp->nodelists[node];
4051 4296
4052 reap_alien(searchp, l3); 4297 work_done += reap_alien(searchp, l3, &this_cpu);
4298
4299 node = cpu_to_node(this_cpu);
4053 4300
4054 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); 4301 work_done += drain_array(searchp, l3,
4302 cpu_cache_get(searchp, this_cpu), 0, node);
4055 4303
4056 /* 4304 /*
4057 * These are racy checks but it does not matter 4305 * These are racy checks but it does not matter
@@ -4062,7 +4310,7 @@ static void cache_reap(struct work_struct *w)
4062 4310
4063 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 4311 l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4064 4312
4065 drain_array(searchp, l3, l3->shared, 0, node); 4313 work_done += drain_array(searchp, l3, l3->shared, 0, node);
4066 4314
4067 if (l3->free_touched) 4315 if (l3->free_touched)
4068 l3->free_touched = 0; 4316 l3->free_touched = 0;
@@ -4081,7 +4329,8 @@ next:
4081 next_reap_node(); 4329 next_reap_node();
4082out: 4330out:
4083 /* Set up the next iteration */ 4331 /* Set up the next iteration */
4084 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); 4332 schedule_delayed_work(work,
4333 round_jiffies_relative((1+!work_done) * REAPTIMEOUT_CPUC));
4085} 4334}
4086 4335
4087#ifdef CONFIG_SLABINFO 4336#ifdef CONFIG_SLABINFO
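
cache_reap() now accumulates a work_done count from reap_alien() and drain_array() and feeds it into the rescheduling decision: (1 + !work_done) evaluates to 1 when anything was freed and to 2 when the pass was a no-op, so an idle cache is only reaped half as often. The same arithmetic in isolation (the helper name below is purely illustrative, not part of the patch):

/* Illustration only -- not part of the patch. */
static unsigned long next_reap_delay(int work_done)
{
	/*
	 * work_done != 0: something was freed, keep the normal interval.
	 * work_done == 0: nothing to do, back off to twice the interval.
	 */
	return round_jiffies_relative((1 + !work_done) * REAPTIMEOUT_CPUC);
}
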
@@ -4140,7 +4389,7 @@ static int s_show(struct seq_file *m, void *p)
4140 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 4389 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4141 const char *name; 4390 const char *name;
4142 char *error = NULL; 4391 char *error = NULL;
4143 int node; 4392 int this_cpu, node;
4144 struct kmem_list3 *l3; 4393 struct kmem_list3 *l3;
4145 4394
4146 active_objs = 0; 4395 active_objs = 0;
@@ -4151,7 +4400,7 @@ static int s_show(struct seq_file *m, void *p)
4151 continue; 4400 continue;
4152 4401
4153 check_irq_on(); 4402 check_irq_on();
4154 spin_lock_irq(&l3->list_lock); 4403 slab_spin_lock_irq(&l3->list_lock, this_cpu);
4155 4404
4156 list_for_each_entry(slabp, &l3->slabs_full, list) { 4405 list_for_each_entry(slabp, &l3->slabs_full, list) {
4157 if (slabp->inuse != cachep->num && !error) 4406 if (slabp->inuse != cachep->num && !error)
@@ -4176,7 +4425,7 @@ static int s_show(struct seq_file *m, void *p)
4176 if (l3->shared) 4425 if (l3->shared)
4177 shared_avail += l3->shared->avail; 4426 shared_avail += l3->shared->avail;
4178 4427
4179 spin_unlock_irq(&l3->list_lock); 4428 slab_spin_unlock_irq(&l3->list_lock, this_cpu);
4180 } 4429 }
4181 num_slabs += active_slabs; 4430 num_slabs += active_slabs;
4182 num_objs = num_slabs * cachep->num; 4431 num_objs = num_slabs * cachep->num;
@@ -4386,7 +4635,7 @@ static int leaks_show(struct seq_file *m, void *p)
4386 struct kmem_list3 *l3; 4635 struct kmem_list3 *l3;
4387 const char *name; 4636 const char *name;
4388 unsigned long *n = m->private; 4637 unsigned long *n = m->private;
4389 int node; 4638 int node, this_cpu;
4390 int i; 4639 int i;
4391 4640
4392 if (!(cachep->flags & SLAB_STORE_USER)) 4641 if (!(cachep->flags & SLAB_STORE_USER))
@@ -4404,13 +4653,13 @@ static int leaks_show(struct seq_file *m, void *p)
4404 continue; 4653 continue;
4405 4654
4406 check_irq_on(); 4655 check_irq_on();
4407 spin_lock_irq(&l3->list_lock); 4656 slab_spin_lock_irq(&l3->list_lock, this_cpu);
4408 4657
4409 list_for_each_entry(slabp, &l3->slabs_full, list) 4658 list_for_each_entry(slabp, &l3->slabs_full, list)
4410 handle_slab(n, cachep, slabp); 4659 handle_slab(n, cachep, slabp);
4411 list_for_each_entry(slabp, &l3->slabs_partial, list) 4660 list_for_each_entry(slabp, &l3->slabs_partial, list)
4412 handle_slab(n, cachep, slabp); 4661 handle_slab(n, cachep, slabp);
4413 spin_unlock_irq(&l3->list_lock); 4662 slab_spin_unlock_irq(&l3->list_lock, this_cpu);
4414 } 4663 }
4415 name = cachep->name; 4664 name = cachep->name;
4416 if (n[0] == n[1]) { 4665 if (n[0] == n[1]) {
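
All of the mm/slab.c hunks above rely on a small family of slab_* wrappers (slab_irq_save/restore, slab_spin_lock_irq/unlock_irq, slab_irq_enable_GFP_WAIT/disable_GFP_WAIT) whose definitions sit earlier in this file's diff and are not shown in this excerpt. Their job is to replace "disable interrupts and implicitly stay on this CPU" with "take a per-CPU sleeping lock and report which CPU was chosen", which is why nearly every internal slab function now threads an explicit this_cpu parameter. A hedged sketch of the expected semantics, mirroring the lock_cpu_pcp() helpers in mm/page_alloc.c; the exact spellings and the lock name are assumptions:

#ifndef CONFIG_PREEMPT_RT
/* mainline behaviour: really disable interrupts, CPU is implicit */
# define slab_irq_save(flags, this_cpu) \
	do { local_irq_save(flags); (this_cpu) = smp_processor_id(); } while (0)
# define slab_irq_restore(flags, this_cpu)	local_irq_restore(flags)

/* the old "__GFP_WAIT => briefly reenable interrupts" dance */
# define slab_irq_enable_GFP_WAIT(gfp, this_cpu) \
	do { if ((gfp) & __GFP_WAIT) local_irq_enable(); } while (0)
# define slab_irq_disable_GFP_WAIT(gfp, this_cpu) \
	do { if ((gfp) & __GFP_WAIT) local_irq_disable(); } while (0)
#else
/* assumed name for the per-CPU locked variable guarding the slab path */
static DEFINE_PER_CPU_LOCKED(int, slab_irq_locks);

# define slab_irq_save(flags, this_cpu) \
	do { (void)(flags); (void)get_cpu_var_locked(slab_irq_locks, &(this_cpu)); } while (0)
# define slab_irq_restore(flags, this_cpu) \
	do { (void)(flags); put_cpu_var_locked(slab_irq_locks, this_cpu); } while (0)

/*
 * Interrupts were never disabled, so for __GFP_WAIT allocations the
 * per-CPU lock is dropped and retaken around the page allocator call
 * instead.  The task may land on a different CPU afterwards, which is
 * why callers re-read *this_cpu (e.g. cpu_to_node(*this_cpu)) after
 * slab_irq_disable_GFP_WAIT().
 */
# define slab_irq_enable_GFP_WAIT(gfp, this_cpu) \
	do { if ((gfp) & __GFP_WAIT) put_cpu_var_locked(slab_irq_locks, *(this_cpu)); } while (0)
# define slab_irq_disable_GFP_WAIT(gfp, this_cpu) \
	do { if ((gfp) & __GFP_WAIT) (void)get_cpu_var_locked(slab_irq_locks, (this_cpu)); } while (0)
#endif
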
diff --git a/mm/swap.c b/mm/swap.c
index cb29ae5d33ab..a981acde8554 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,15 +30,92 @@
30#include <linux/notifier.h> 30#include <linux/notifier.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/interrupt.h>
33 34
34#include "internal.h" 35#include "internal.h"
35 36
36/* How many pages do we try to swap or page in/out together? */ 37/* How many pages do we try to swap or page in/out together? */
37int page_cluster; 38int page_cluster;
38 39
40#ifdef CONFIG_PREEMPT_RT
41/*
42 * On PREEMPT_RT we don't want to disable preemption for cpu variables.
43 * We grab a cpu and then use that cpu to lock the variables accordingly.
44 *
45 * (On !PREEMPT_RT this turns into normal preempt-off sections, as before.)
46 */
47static DEFINE_PER_CPU_LOCKED(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
48static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_rotate_pvecs);
49
50#define swap_get_cpu_var_irq_save(var, flags, cpu) \
51 ({ \
52 (void)flags; \
53 &get_cpu_var_locked(var, &cpu); \
54 })
55
56#define swap_put_cpu_var_irq_restore(var, flags, cpu) \
57 put_cpu_var_locked(var, cpu)
58
59#define swap_get_cpu_var(var, cpu) \
60 &get_cpu_var_locked(var, &cpu)
61
62#define swap_put_cpu_var(var, cpu) \
63 put_cpu_var_locked(var, cpu)
64
65#define swap_per_cpu_lock(var, cpu) \
66 ({ \
67 spin_lock(&__get_cpu_lock(var, cpu)); \
68 &__get_cpu_var_locked(var, cpu); \
69 })
70
71#define swap_per_cpu_unlock(var, cpu) \
72 spin_unlock(&__get_cpu_lock(var, cpu));
73
74#define swap_get_cpu() raw_smp_processor_id()
75
76#define swap_put_cpu() do { } while (0)
77
78#define swap_irq_save(flags) do { (void)flags; } while (0)
79
80#define swap_irq_restore(flags) do { (void)flags; } while (0)
81
82#else
83
39static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); 84static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
40static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 85static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
41 86
87#define swap_get_cpu_var_irq_save(var, flags, cpu) \
88 ({ \
89 (void)cpu; \
90 local_irq_save(flags); \
91 &__get_cpu_var(var); \
92 })
93
94#define swap_put_cpu_var_irq_restore(var, flags, cpu) \
95 local_irq_restore(flags)
96
97#define swap_get_cpu_var(var, cpu) \
98 ({ \
99 (void)cpu; \
100 &get_cpu_var(var); \
101 })
102
103#define swap_put_cpu_var(var, cpu) put_cpu_var(var)
104
105#define swap_per_cpu_lock(var, cpu) &per_cpu(var, cpu)
106
107#define swap_per_cpu_unlock(var, cpu) do { } while (0)
108
109#define swap_get_cpu() get_cpu()
110
111#define swap_put_cpu() put_cpu()
112
113#define swap_irq_save(flags) local_irq_save(flags)
114
115#define swap_irq_restore(flags) local_irq_restore(flags)
116
117#endif
118
42/* 119/*
43 * This path almost never happens for VM activity - pages are normally 120 * This path almost never happens for VM activity - pages are normally
44 * freed via pagevecs. But it gets used by networking. 121 * freed via pagevecs. But it gets used by networking.
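
The swap_*() wrappers above (like the pcp_locks and slab paths in the other files) sit on top of the -rt "locked per-CPU variable" primitives from include/linux/percpu.h -- the file flagged as a merge conflict in this commit. A hedged sketch of what those primitives amount to; the bodies below are illustrative assumptions, not the header's actual text:

/* one spinlock (a sleeping lock on -rt) next to each per-CPU instance */
#define DEFINE_PER_CPU_LOCKED(type, name)				\
	DEFINE_PER_CPU(spinlock_t, __lock_##name);			\
	DEFINE_PER_CPU(type, __var_##name)

#define __get_cpu_lock(name, cpu)	per_cpu(__lock_##name, cpu)
#define __get_cpu_var_locked(name, cpu)	per_cpu(__var_##name, cpu)

/*
 * Pick the current CPU, take its lock and report which CPU was chosen.
 * Preemption stays enabled; if the task migrates afterwards it simply
 * keeps operating on (and eventually unlocks) the CPU it locked.
 */
#define get_cpu_var_locked(name, cpuptr)				\
(*({									\
	int __cpu = raw_smp_processor_id();				\
	*(cpuptr) = __cpu;						\
	spin_lock(&__get_cpu_lock(name, __cpu));			\
	&__get_cpu_var_locked(name, __cpu);				\
}))

#define put_cpu_var_locked(name, cpu)					\
	spin_unlock(&__get_cpu_lock(name, cpu))
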
@@ -141,13 +218,13 @@ void rotate_reclaimable_page(struct page *page)
141 !PageUnevictable(page) && PageLRU(page)) { 218 !PageUnevictable(page) && PageLRU(page)) {
142 struct pagevec *pvec; 219 struct pagevec *pvec;
143 unsigned long flags; 220 unsigned long flags;
221 int cpu;
144 222
145 page_cache_get(page); 223 page_cache_get(page);
146 local_irq_save(flags); 224 pvec = swap_get_cpu_var_irq_save(lru_rotate_pvecs, flags, cpu);
147 pvec = &__get_cpu_var(lru_rotate_pvecs);
148 if (!pagevec_add(pvec, page)) 225 if (!pagevec_add(pvec, page))
149 pagevec_move_tail(pvec); 226 pagevec_move_tail(pvec);
150 local_irq_restore(flags); 227 swap_put_cpu_var_irq_restore(lru_rotate_pvecs, flags, cpu);
151 } 228 }
152} 229}
153 230
@@ -216,12 +293,14 @@ EXPORT_SYMBOL(mark_page_accessed);
216 293
217void __lru_cache_add(struct page *page, enum lru_list lru) 294void __lru_cache_add(struct page *page, enum lru_list lru)
218{ 295{
219 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; 296 struct pagevec *pvec;
297 int cpu;
220 298
299 pvec = swap_get_cpu_var(lru_add_pvecs, cpu)[lru];
221 page_cache_get(page); 300 page_cache_get(page);
222 if (!pagevec_add(pvec, page)) 301 if (!pagevec_add(pvec, page))
223 ____pagevec_lru_add(pvec, lru); 302 ____pagevec_lru_add(pvec, lru);
224 put_cpu_var(lru_add_pvecs); 303 swap_put_cpu_var(lru_add_pvecs, cpu);
225} 304}
226 305
227/** 306/**
@@ -271,31 +350,33 @@ void add_page_to_unevictable_list(struct page *page)
271 */ 350 */
272static void drain_cpu_pagevecs(int cpu) 351static void drain_cpu_pagevecs(int cpu)
273{ 352{
274 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); 353 struct pagevec *pvecs, *pvec;
275 struct pagevec *pvec;
276 int lru; 354 int lru;
277 355
356 pvecs = swap_per_cpu_lock(lru_add_pvecs, cpu)[0];
278 for_each_lru(lru) { 357 for_each_lru(lru) {
279 pvec = &pvecs[lru - LRU_BASE]; 358 pvec = &pvecs[lru - LRU_BASE];
280 if (pagevec_count(pvec)) 359 if (pagevec_count(pvec))
281 ____pagevec_lru_add(pvec, lru); 360 ____pagevec_lru_add(pvec, lru);
282 } 361 }
362 swap_per_cpu_unlock(lru_add_pvecs, cpu);
283 363
284 pvec = &per_cpu(lru_rotate_pvecs, cpu); 364 pvec = swap_per_cpu_lock(lru_rotate_pvecs, cpu);
285 if (pagevec_count(pvec)) { 365 if (pagevec_count(pvec)) {
286 unsigned long flags; 366 unsigned long flags;
287 367
288 /* No harm done if a racing interrupt already did this */ 368 /* No harm done if a racing interrupt already did this */
289 local_irq_save(flags); 369 swap_irq_save(flags);
290 pagevec_move_tail(pvec); 370 pagevec_move_tail(pvec);
291 local_irq_restore(flags); 371 swap_irq_restore(flags);
292 } 372 }
373 swap_per_cpu_unlock(lru_rotate_pvecs, cpu);
293} 374}
294 375
295void lru_add_drain(void) 376void lru_add_drain(void)
296{ 377{
297 drain_cpu_pagevecs(get_cpu()); 378 drain_cpu_pagevecs(swap_get_cpu());
298 put_cpu(); 379 swap_put_cpu();
299} 380}
300 381
301static void lru_add_drain_per_cpu(struct work_struct *dummy) 382static void lru_add_drain_per_cpu(struct work_struct *dummy)
@@ -369,7 +450,7 @@ void release_pages(struct page **pages, int nr, int cold)
369 } 450 }
370 __pagevec_free(&pages_to_free); 451 __pagevec_free(&pages_to_free);
371 pagevec_reinit(&pages_to_free); 452 pagevec_reinit(&pages_to_free);
372 } 453 }
373 } 454 }
374 if (zone) 455 if (zone)
375 spin_unlock_irqrestore(&zone->lru_lock, flags); 456 spin_unlock_irqrestore(&zone->lru_lock, flags);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dea7abd31098..6911d54ff9c8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -23,6 +23,7 @@
23#include <linux/file.h> 23#include <linux/file.h>
24#include <linux/writeback.h> 24#include <linux/writeback.h>
25#include <linux/blkdev.h> 25#include <linux/blkdev.h>
26#include <linux/interrupt.h>
26#include <linux/buffer_head.h> /* for try_to_release_page(), 27#include <linux/buffer_head.h> /* for try_to_release_page(),
27 buffer_heads_over_limit */ 28 buffer_heads_over_limit */
28#include <linux/mm_inline.h> 29#include <linux/mm_inline.h>
@@ -1118,7 +1119,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1118 } 1119 }
1119 1120
1120 nr_reclaimed += nr_freed; 1121 nr_reclaimed += nr_freed;
1121 local_irq_disable(); 1122 local_irq_disable_nort();
1122 if (current_is_kswapd()) { 1123 if (current_is_kswapd()) {
1123 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); 1124 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
1124 __count_vm_events(KSWAPD_STEAL, nr_freed); 1125 __count_vm_events(KSWAPD_STEAL, nr_freed);
@@ -1159,9 +1160,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1159 } 1160 }
1160 } 1161 }
1161 } while (nr_scanned < max_scan); 1162 } while (nr_scanned < max_scan);
1163 /*
1164 * Non-PREEMPT_RT relies on IRQs-off protecting the page_states
1165 * per-CPU data. PREEMPT_RT has that data protected even in
1166 * __mod_page_state(), so no need to keep IRQs disabled.
1167 */
1162 spin_unlock(&zone->lru_lock); 1168 spin_unlock(&zone->lru_lock);
1163done: 1169done:
1164 local_irq_enable(); 1170 local_irq_enable_nort();
1165 pagevec_release(&pvec); 1171 pagevec_release(&pvec);
1166 return nr_reclaimed; 1172 return nr_reclaimed;
1167} 1173}
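
local_irq_disable_nort()/local_irq_enable_nort() are the usual -rt annotations: on mainline they are plain local_irq_disable()/local_irq_enable(), while on PREEMPT_RT they compile away because, as the new comment notes, the per-CPU statistics touched here are already protected inside __mod_page_state() and friends. Roughly (exact spelling assumed, the real definitions live in the -rt headers):

/* Hedged sketch of the -rt "_nort" helpers. */
#ifdef CONFIG_PREEMPT_RT
# define local_irq_disable_nort()	do { } while (0)
# define local_irq_enable_nort()	do { } while (0)
#else
# define local_irq_disable_nort()	local_irq_disable()
# define local_irq_enable_nort()	local_irq_enable()
#endif
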
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 138bed53706e..9f7c001f1820 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -149,17 +149,16 @@ static void refresh_zone_stat_thresholds(void)
149void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 149void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
150 int delta) 150 int delta)
151{ 151{
152 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 152 struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu());
153 s8 *p = pcp->vm_stat_diff + item; 153 s8 *p = pcp->vm_stat_diff + item;
154 long x; 154 long x = delta + *p;
155
156 x = delta + *p;
157 155
158 if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { 156 if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
159 zone_page_state_add(x, zone, item); 157 zone_page_state_add(x, zone, item);
160 x = 0; 158 x = 0;
161 } 159 }
162 *p = x; 160 *p = x;
161 put_cpu();
163} 162}
164EXPORT_SYMBOL(__mod_zone_page_state); 163EXPORT_SYMBOL(__mod_zone_page_state);
165 164
@@ -202,7 +201,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
202 */ 201 */
203void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 202void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
204{ 203{
205 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 204 struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu());
206 s8 *p = pcp->vm_stat_diff + item; 205 s8 *p = pcp->vm_stat_diff + item;
207 206
208 (*p)++; 207 (*p)++;
@@ -213,17 +212,28 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
213 zone_page_state_add(*p + overstep, zone, item); 212 zone_page_state_add(*p + overstep, zone, item);
214 *p = -overstep; 213 *p = -overstep;
215 } 214 }
215 put_cpu();
216} 216}
217 217
218void __inc_zone_page_state(struct page *page, enum zone_stat_item item) 218void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
219{ 219{
220#ifdef CONFIG_PREEMPT_RT
221 unsigned long flags;
222 struct zone *zone;
223
224 zone = page_zone(page);
225 local_irq_save(flags);
226 __inc_zone_state(zone, item);
227 local_irq_restore(flags);
228#else
220 __inc_zone_state(page_zone(page), item); 229 __inc_zone_state(page_zone(page), item);
230#endif
221} 231}
222EXPORT_SYMBOL(__inc_zone_page_state); 232EXPORT_SYMBOL(__inc_zone_page_state);
223 233
224void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 234void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
225{ 235{
226 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 236 struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu());
227 s8 *p = pcp->vm_stat_diff + item; 237 s8 *p = pcp->vm_stat_diff + item;
228 238
229 (*p)--; 239 (*p)--;
@@ -234,6 +244,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
234 zone_page_state_add(*p - overstep, zone, item); 244 zone_page_state_add(*p - overstep, zone, item);
235 *p = overstep; 245 *p = overstep;
236 } 246 }
247 put_cpu();
237} 248}
238 249
239void __dec_zone_page_state(struct page *page, enum zone_stat_item item) 250void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
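
The vmstat hunks all apply one pattern: bracket the per-CPU counter update with get_cpu()/put_cpu() instead of assuming the caller already disabled preemption, and give __inc_zone_page_state() an explicit irq-save wrapper on PREEMPT_RT since it can now be reached with interrupts enabled. The same shape in isolation (a generic illustration with a made-up counter, not kernel code):

/* Generic illustration of the get_cpu()/put_cpu() pattern used above. */
static DEFINE_PER_CPU(long, demo_counter);	/* hypothetical */

static void demo_counter_add(long delta)
{
	int cpu = get_cpu();			/* pins us to this CPU */

	per_cpu(demo_counter, cpu) += delta;	/* no migration possible here */

	put_cpu();				/* re-enables preemption */
}
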