From 9b5f4aab419ba370062e28231f6910292813a9c9 Mon Sep 17 00:00:00 2001 From: Namhoon Kim Date: Mon, 5 Jun 2017 12:26:29 -0400 Subject: per-partition buddy allocator --- include/linux/gfp.h | 7 +- include/linux/mmzone.h | 19 ++- include/litmus/page_dev.h | 2 + litmus/bank_proc.c | 22 +-- litmus/color_shm.c | 8 +- litmus/litmus.c | 39 +++-- litmus/page_dev.c | 154 +++++++++-------- mm/page_alloc.c | 417 ++++++++++++++++++++++++++++++++++++---------- mm/vmstat.c | 89 +++++++++- 9 files changed, 553 insertions(+), 204 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 15928f0647e4..10a4601c558b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -35,6 +35,7 @@ struct vm_area_struct; #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u #define ___GFP_WRITE 0x1000000u +#define ___GFP_COLOR 0x2000000u /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -94,6 +95,7 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ +#define __GFP_COLOR ((__force gfp_t)___GFP_COLOR) /* Colored page request */ /* * This may seem redundant, but it's a way of annotating false positives vs. @@ -101,7 +103,7 @@ struct vm_area_struct; */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 26 /* Room for N __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ @@ -146,6 +148,9 @@ struct vm_area_struct; /* 4GB DMA on some platforms */ #define GFP_DMA32 __GFP_DMA32 +/* Colored page requests */ +#define GFP_COLOR __GFP_COLOR + /* Convert GFP flags to their corresponding migrate type */ static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 92084abf3cf5..d28f7ef8228d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -38,11 +38,13 @@ /* For page coloring - This address decoding is used in imx6-sabresd * platform without bank interleaving . 
*/ -#define BANK_MASK 0x38000000 -#define BANK_SHIFT 27 - -#define CACHE_MASK 0x0000f000 -#define CACHE_SHIFT 12 +#define BANK_MASK 0x38000000 +#define BANK_SHIFT 27 +#define CACHE_MASK 0x0000f000 +#define CACHE_SHIFT 12 +#define MAX_NUM_COLOR 16 +#define MAX_NUM_BANK 8 +#define MAX_PARTITIONED_ORDER 3 enum { MIGRATE_UNMOVABLE, @@ -485,7 +487,8 @@ struct zone { ZONE_PADDING(_pad1_) /* free areas of different sizes */ struct free_area free_area[MAX_ORDER]; - + struct free_area free_area_d[NR_CPUS][MAX_PARTITIONED_ORDER]; + /* zone flags, see below */ unsigned long flags; @@ -532,7 +535,9 @@ struct zone { /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; #endif - + + struct list_head color_list[MAX_NUM_COLOR * MAX_NUM_BANK]; + DECLARE_BITMAP(color_map, MAX_NUM_COLOR*MAX_NUM_BANK); ZONE_PADDING(_pad3_) /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; diff --git a/include/litmus/page_dev.h b/include/litmus/page_dev.h index 9dac293651f0..f1791469cba1 100644 --- a/include/litmus/page_dev.h +++ b/include/litmus/page_dev.h @@ -26,5 +26,7 @@ int llc_partition_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); int dram_partition_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +int bank_to_partition(unsigned int bank); +int is_in_llc_partition(struct page* page, int cpu); #endif /* _LITMUS_PAGE_DEV_H */ \ No newline at end of file diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c index 097cff177a2d..353d38dbe9d6 100644 --- a/litmus/bank_proc.c +++ b/litmus/bank_proc.c @@ -120,26 +120,6 @@ unsigned int two_exp(unsigned int e) return v; } -unsigned int num_by_bitmask_index(unsigned int bitmask, unsigned int index) -{ - unsigned int pos = 0; - - while(true) - { - if(index ==0 && (bitmask & 1)==1) - { - break; - } - if(index !=0 && (bitmask & 1)==1){ - index--; - } - pos++; - bitmask = bitmask >>1; - - } - return pos; -} - /* helper functions to find the next colored pool index */ static inline unsigned int first_index(unsigned long node) { @@ -160,7 +140,7 @@ static inline unsigned int first_index(unsigned long node) static inline unsigned int last_index(unsigned long node) { - unsigned int bank_no = 7, color_no = 15; + unsigned int bank_no = NUM_BANKS-1, color_no = NUM_COLORS-1; while(bank_no >= 0) { if ((bank_partition[node]>>bank_no) & 0x1) diff --git a/litmus/color_shm.c b/litmus/color_shm.c index d4913cd5f213..084262335466 100644 --- a/litmus/color_shm.c +++ b/litmus/color_shm.c @@ -264,10 +264,10 @@ static int litmus_color_shm_mmap(struct file *filp, struct vm_area_struct *vma) TRACE_CUR("flags=0x%lx prot=0x%lx\n", vma->vm_flags, pgprot_val(vma->vm_page_prot)); out: - color_param.color == 0x00000000; - color_param.bank == 0x00000000; - color_offset.offset == 0xffffffff; - color_offset.lock == -1; + color_param.color = 0x00000000; + color_param.bank = 0x00000000; + color_offset.offset = 0xffffffff; + color_offset.lock = -1; return err; diff --git a/litmus/litmus.c b/litmus/litmus.c index 84446acb0869..1105408e405a 100644 --- a/litmus/litmus.c +++ b/litmus/litmus.c @@ -343,15 +343,19 @@ asmlinkage long sys_reservation_destroy(unsigned int reservation_id, int cpu) static unsigned long color_mask; -static inline unsigned long page_color(struct page *page) -{ - return ((page_to_phys(page) & color_mask) >> PAGE_SHIFT); -} - extern int isolate_lru_page(struct page *page); extern void putback_movable_page(struct page *page); extern struct page 
*new_alloc_page(struct page *page, unsigned long node, int **x); +static struct page *alloc_colored_page(struct page *page, unsigned long node, int **result) +{ + struct page *newpage; + + newpage = alloc_pages(GFP_HIGHUSER_MOVABLE|GFP_COLOR, 0); + + return newpage; +} + #define INVALID_PFN (0xffffffff) LIST_HEAD(shared_lib_pages); @@ -479,7 +483,7 @@ asmlinkage long sys_set_page_color(int cpu) /* Migrate private pages */ if (!list_empty(&pagelist)) { - ret = migrate_pages(&pagelist, new_alloc_page, NULL, node, MIGRATE_SYNC, MR_SYSCALL); + ret = migrate_pages(&pagelist, alloc_colored_page, NULL, node, MIGRATE_SYNC, MR_SYSCALL); TRACE_TASK(current, "%ld pages not migrated.\n", ret); nr_not_migrated = ret; if (ret) { @@ -489,7 +493,7 @@ asmlinkage long sys_set_page_color(int cpu) /* Replicate shared pages */ if (!list_empty(&task_shared_pagelist)) { - ret = replicate_pages(&task_shared_pagelist, new_alloc_page, NULL, node, MIGRATE_SYNC, MR_SYSCALL); + ret = replicate_pages(&task_shared_pagelist, alloc_colored_page, NULL, node, MIGRATE_SYNC, MR_SYSCALL); TRACE_TASK(current, "%ld shared pages not migrated.\n", ret); nr_not_migrated += ret; if (ret) { @@ -501,12 +505,27 @@ asmlinkage long sys_set_page_color(int cpu) TRACE_TASK(current, "nr_pages = %d nr_failed = %d nr_not_migrated = %d\n", nr_pages, nr_failed, nr_not_migrated); printk(KERN_INFO "node = %ld, nr_private_pages = %d, nr_shared_pages = %d, nr_failed_to_isolate_lru = %d, nr_not_migrated = %d\n", node, nr_pages, nr_shared_pages, nr_failed, nr_not_migrated); - - flush_cache(1); return nr_not_migrated; } +#define BANK_MASK 0x38000000 +#define BANK_SHIFT 27 +#define CACHE_MASK 0x0000f000 +#define CACHE_SHIFT 12 + +/* Decoding page color, 0~15 */ +static inline unsigned int page_color(struct page *page) +{ + return ((page_to_phys(page)& CACHE_MASK) >> CACHE_SHIFT); +} + +/* Decoding page bank number, 0~7 */ +static inline unsigned int page_bank(struct page *page) +{ + return ((page_to_phys(page)& BANK_MASK) >> BANK_SHIFT); +} + /* sys_test_call() is a test system call for debugging */ asmlinkage long sys_test_call(unsigned int param) { @@ -549,7 +568,7 @@ asmlinkage long sys_test_call(unsigned int param) continue; } - TRACE_TASK(current, "addr: %08x, phy: %08x, pfn: %05lx, _mapcount: %d, _count: %d flags: %s%s%s mapping: %p\n", vma_itr->vm_start + PAGE_SIZE*i, page_to_phys(old_page), page_to_pfn(old_page), page_mapcount(old_page), page_count(old_page), vma_itr->vm_flags&VM_READ?"r":"-", vma_itr->vm_flags&VM_WRITE?"w":"-", vma_itr->vm_flags&VM_EXEC?"x":"-", &(old_page->mapping)); + TRACE_TASK(current, "addr: %08x, phy: %08x, color: %d, bank: %d, pfn: %05lx, _mapcount: %d, _count: %d flags: %s%s%s mapping: %p\n", vma_itr->vm_start + PAGE_SIZE*i, page_to_phys(old_page), page_color(old_page), page_bank(old_page), page_to_pfn(old_page), page_mapcount(old_page), page_count(old_page), vma_itr->vm_flags&VM_READ?"r":"-", vma_itr->vm_flags&VM_WRITE?"w":"-", vma_itr->vm_flags&VM_EXEC?"x":"-", &(old_page->mapping)); put_page(old_page); } vma_itr = vma_itr->vm_next; diff --git a/litmus/page_dev.c b/litmus/page_dev.c index 1e91b989dae2..8e29e68ed89a 100644 --- a/litmus/page_dev.c +++ b/litmus/page_dev.c @@ -7,36 +7,90 @@ #include -#define NR_PARTITIONS 9 +// This Address Decoding is used in imx6-sabredsd platform +#define NUM_BANKS 8 +#define BANK_MASK 0x38000000 +#define BANK_SHIFT 27 + +#define NUM_COLORS 16 +#define CACHE_MASK 0x0000f000 +#define CACHE_SHIFT 12 + +#define NR_LLC_PARTITIONS 9 +#define NR_DRAM_PARTITIONS 5 struct mutex dev_mutex; /* 
Initial partitions for LLC and DRAM bank */
 /* 4 colors for each core, all colors for Level C */
-unsigned int llc_partition[NR_PARTITIONS] = {
-	0x00000003, /* Core 0, and Level A*/
-	0x00000003, /* Core 0, and Level B*/
-	0x0000000C, /* Core 1, and Level A*/
-	0x0000000C, /* Core 1, and Level B*/
-	0x00000030, /* Core 2, and Level A*/
-	0x00000030, /* Core 2, and Level B*/
-	0x000000C0, /* Core 3, and Level A*/
-	0x000000C0, /* Core 3, and Level B*/
+unsigned int llc_partition[NR_LLC_PARTITIONS] = {
+	0x0000000f, /* Core 0, and Level A*/
+	0x0000000f, /* Core 0, and Level B*/
+	0x000000f0, /* Core 1, and Level A*/
+	0x000000f0, /* Core 1, and Level B*/
+	0x00000f00, /* Core 2, and Level A*/
+	0x00000f00, /* Core 2, and Level B*/
+	0x0000f000, /* Core 3, and Level A*/
+	0x0000f000, /* Core 3, and Level B*/
 	0x0000ffff, /* Level C */
 };
 
 /* 1 bank for each core, 4 banks for Level C */
-unsigned int dram_partition[NR_PARTITIONS] = {
-	0x00000010, /* Core 0, and Level A*/
-	0x00000010, /* Core 0, and Level B*/
-	0x00000020, /* Core 1, and Level A*/
-	0x00000020, /* Core 1, and Level B*/
-	0x00000040, /* Core 2, and Level A*/
-	0x00000040, /* Core 2, and Level B*/
-	0x00000080, /* Core 3, and Level A*/
-	0x00000080, /* Core 3, and Level B*/
-	0x0000000c, /* Level C */
+unsigned int dram_partition[NR_DRAM_PARTITIONS] = {
+	0x00000010,
+	0x00000020,
+	0x00000040,
+	0x00000080,
+	0x0000000f,
+};
+/*
+unsigned int dram_partition[NR_DRAM_PARTITIONS] = {
+	0x00000001,
+	0x00000002,
+	0x00000004,
+	0x00000008,
+	0x000000f0,
 };
+*/
+
+/* Decoding page color, 0~15 */
+static inline unsigned int page_color(struct page *page)
+{
+	return ((page_to_phys(page)& CACHE_MASK) >> CACHE_SHIFT);
+}
+
+/* Decoding page bank number, 0~7 */
+static inline unsigned int page_bank(struct page *page)
+{
+	return ((page_to_phys(page)& BANK_MASK) >> BANK_SHIFT);
+}
+
+int bank_to_partition(unsigned int bank)
+{
+	int i;
+	unsigned int bank_bit = 0x1<<bank;
+
+	for (i = 0; i < NR_DRAM_PARTITIONS; i++) {
+		if (dram_partition[i] & bank_bit)
+			return i;
+	}
+
+	return -EINVAL;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
 #include "internal.h"
+// This Address Decoding is used in the imx6-sabresd platform
+#define BANK_MASK 0x38000000
+#define BANK_SHIFT 27
+
+#define CACHE_MASK 0x0000f000
+#define CACHE_SHIFT 12
+#define MAX_COLOR_NODE	128
+
+/* Decoding page color, 0~15 */
+static inline unsigned int page_color(struct page *page)
+{
+	return ((page_to_phys(page)& CACHE_MASK) >> CACHE_SHIFT);
+}
+
+/* Decoding page bank number, 0~7 */
+static inline unsigned int page_bank(struct page *page)
+{
+	return ((page_to_phys(page)& BANK_MASK) >> BANK_SHIFT);
+}
+
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION	(8)
@@ -583,76 +603,153 @@ static inline void __free_one_page(struct page *page,
 	unsigned long combined_idx;
 	unsigned long uninitialized_var(buddy_idx);
 	struct page *buddy;
-	int max_order = MAX_ORDER;
+	int max_order, parti_no;
+
+	parti_no = bank_to_partition(page_bank(page));
+	if (parti_no < 0 || parti_no > NR_CPUS)
+		printk(KERN_ALERT "PART_NO %d\n", parti_no);
+	BUG_ON(parti_no < 0 || parti_no > NR_CPUS);
+
+	if (parti_no < NR_CPUS)
+		printk(KERN_ALERT "pfn = %lx, part_no = %d order = %d\n", pfn, parti_no, order);
+
+	if (parti_no == NR_CPUS) {
+		max_order = MAX_ORDER;
+
+		VM_BUG_ON(!zone_is_initialized(zone));
+		VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
+
+		VM_BUG_ON(migratetype == -1);
+		if (is_migrate_isolate(migratetype)) {
+			/*
+			 * We restrict max order of merging to prevent merge
+			 * between freepages on isolate pageblock and normal
+			 * pageblock. 
Without this, pageblock isolation + * could cause incorrect freepage accounting. + */ + max_order = min(MAX_ORDER, pageblock_order + 1); + } else { + __mod_zone_freepage_state(zone, 1 << order, migratetype); + } - VM_BUG_ON(!zone_is_initialized(zone)); - VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); + page_idx = pfn & ((1 << max_order) - 1); + + VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(bad_range(zone, page), page); + + while (order < max_order - 1) { + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + if (!page_is_buddy(page, buddy, order)) + break; + /* + * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, + * merge with it and move up one order. + */ + if (page_is_guard(buddy)) { + clear_page_guard(zone, buddy, order, migratetype); + } else { + list_del(&buddy->lru); + zone->free_area[order].nr_free--; + rmv_page_order(buddy); + } + combined_idx = buddy_idx & page_idx; + page = page + (combined_idx - page_idx); + page_idx = combined_idx; + order++; + } + set_page_order(page, order); - VM_BUG_ON(migratetype == -1); - if (is_migrate_isolate(migratetype)) { /* - * We restrict max order of merging to prevent merge - * between freepages on isolate pageblock and normal - * pageblock. Without this, pageblock isolation - * could cause incorrect freepage accounting. + * If this is not the largest possible page, check if the buddy + * of the next-highest order is free. If it is, it's possible + * that pages are being freed that will coalesce soon. In case, + * that is happening, add the free page to the tail of the list + * so it's less likely to be used soon and more likely to be merged + * as a higher order page */ - max_order = min(MAX_ORDER, pageblock_order + 1); - } else { - __mod_zone_freepage_state(zone, 1 << order, migratetype); - } + if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { + struct page *higher_page, *higher_buddy; + combined_idx = buddy_idx & page_idx; + higher_page = page + (combined_idx - page_idx); + buddy_idx = __find_buddy_index(combined_idx, order + 1); + higher_buddy = higher_page + (buddy_idx - combined_idx); + if (page_is_buddy(higher_page, higher_buddy, order + 1)) { + list_add_tail(&page->lru, + &zone->free_area[order].free_list[migratetype]); + goto out; + } + } - page_idx = pfn & ((1 << max_order) - 1); + list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); +out: + zone->free_area[order].nr_free++; + } else { + max_order = MAX_PARTITIONED_ORDER; - VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); - VM_BUG_ON_PAGE(bad_range(zone, page), page); + VM_BUG_ON(!zone_is_initialized(zone)); + VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); - while (order < max_order - 1) { - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); - if (!page_is_buddy(page, buddy, order)) - break; - /* - * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, - * merge with it and move up one order. 
- */ - if (page_is_guard(buddy)) { - clear_page_guard(zone, buddy, order, migratetype); + VM_BUG_ON(migratetype == -1); + if (is_migrate_isolate(migratetype)) { + max_order = min(MAX_PARTITIONED_ORDER, pageblock_order + 1); } else { - list_del(&buddy->lru); - zone->free_area[order].nr_free--; - rmv_page_order(buddy); + __mod_zone_freepage_state(zone, 1 << order, migratetype); } - combined_idx = buddy_idx & page_idx; - page = page + (combined_idx - page_idx); - page_idx = combined_idx; - order++; - } - set_page_order(page, order); - /* - * If this is not the largest possible page, check if the buddy - * of the next-highest order is free. If it is, it's possible - * that pages are being freed that will coalesce soon. In case, - * that is happening, add the free page to the tail of the list - * so it's less likely to be used soon and more likely to be merged - * as a higher order page - */ - if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { - struct page *higher_page, *higher_buddy; - combined_idx = buddy_idx & page_idx; - higher_page = page + (combined_idx - page_idx); - buddy_idx = __find_buddy_index(combined_idx, order + 1); - higher_buddy = higher_page + (buddy_idx - combined_idx); - if (page_is_buddy(higher_page, higher_buddy, order + 1)) { - list_add_tail(&page->lru, - &zone->free_area[order].free_list[migratetype]); - goto out; + page_idx = pfn & ((1 << max_order) - 1); + + VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(bad_range(zone, page), page); + + while (order < max_order - 1) { + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + if (!page_is_buddy(page, buddy, order)) + break; + + if (page_is_guard(buddy)) { + clear_page_guard(zone, buddy, order, migratetype); + } else { + list_del(&buddy->lru); + zone->free_area_d[parti_no][order].nr_free--; + rmv_page_order(buddy); + } + combined_idx = buddy_idx & page_idx; + page = page + (combined_idx - page_idx); + page_idx = combined_idx; + order++; + } + set_page_order(page, order); + + if ((order < MAX_PARTITIONED_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { + struct page *higher_page, *higher_buddy; + combined_idx = buddy_idx & page_idx; + higher_page = page + (combined_idx - page_idx); + buddy_idx = __find_buddy_index(combined_idx, order + 1); + higher_buddy = higher_page + (buddy_idx - combined_idx); + if (page_is_buddy(higher_page, higher_buddy, order + 1)) { + list_add_tail(&page->lru, + &zone->free_area_d[parti_no][order].free_list[migratetype]); + zone->free_area_d[parti_no][order].nr_free++; + return; + } } - } - list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); -out: - zone->free_area[order].nr_free++; + if (order >= MAX_PARTITIONED_ORDER) { + int n_idx = 0; + struct page *lower_page; + for (n_idx = 0 ; n_idx < (1 << (order - MAX_PARTITIONED_ORDER + 1)); n_idx++) { + lower_page = page + (n_idx << (MAX_PARTITIONED_ORDER - 1)); + set_page_order(lower_page, MAX_PARTITIONED_ORDER-1); + list_add(&lower_page->lru, &zone->free_area_d[parti_no][MAX_PARTITIONED_ORDER-1].free_list[migratetype]); + zone->free_area_d[parti_no][MAX_PARTITIONED_ORDER-1].nr_free++; + } + } else { + list_add(&page->lru, &zone->free_area_d[parti_no][order].free_list[migratetype]); + zone->free_area_d[parti_no][order].nr_free++; + } + } } static inline int free_pages_check(struct page *page) @@ -997,34 +1094,140 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, return 0; } +/* Kernel page coloring */ + +/* build colored 
page list */
+static void build_colored_pages(struct zone *zone, struct page *page, int order)
+{
+	int i, color, bank;
+
+	list_del(&page->lru);
+	zone->free_area[order].nr_free--;
+
+	/* insert pages to zone->color_list[] */
+	for (i = 0; i < (1<<order); i++) {
+		int node;
+
+		color = page_color(&page[i]);
+		bank = page_bank(&page[i]);
+		node = bank*MAX_NUM_COLOR + color;
+
+		list_add_tail(&page[i].lru, &zone->color_list[node]);
+		bitmap_set(zone->color_map, node, 1);
+		zone->free_area[0].nr_free++;
+		rmv_page_order(&page[i]);
+	}
+}
+
+int color_seq_index[9] = {
+	0, /* Core 0, and Level A*/
+	0, /* Core 0, and Level B*/
+	0, /* Core 1, and Level A*/
+	0, /* Core 1, and Level B*/
+	0, /* Core 2, and Level A*/
+	0, /* Core 2, and Level B*/
+	0, /* Core 3, and Level A*/
+	0, /* Core 3, and Level B*/
+	0, /* Level C */
+};
+
+/* return a colored page */
+static inline struct page *get_colored_page(struct zone *zone, unsigned long req_color_map[BITS_TO_LONGS(MAX_COLOR_NODE)], int order, int partition)
+{
+	struct page *page;
+	unsigned int color, bank, index;
+	int i;
+	DECLARE_BITMAP(candidate_bit, MAX_COLOR_NODE);
+
+	/* if req_color_map does not exist in zone, return NULL */
+	if (!bitmap_intersects(zone->color_map, req_color_map, MAX_COLOR_NODE))
+		return NULL;
+
+	bitmap_and(candidate_bit, zone->color_map, req_color_map, MAX_COLOR_NODE);
+	index = color_seq_index[partition];
+
+	for_each_set_bit(i, candidate_bit, MAX_COLOR_NODE) {
+		if (index-- <= 0)
+			break;
+	}
+
+	BUG_ON(i >= MAX_COLOR_NODE);
+	BUG_ON(list_empty(&zone->color_list[i]));
+
+	page = list_entry(zone->color_list[i].next, struct page, lru);
+
+	list_del(&page->lru);
+
+	if (list_empty(&zone->color_list[i]))
+		bitmap_clear(zone->color_map, i, 1);
+
+	zone->free_area[0].nr_free--;
+	color = page_color(page);
+	bank = page_bank(page);
+	printk(KERN_INFO "color=%d, bank=%d allocated\n", color, bank);
+	return page;
+}
+
 /*
  * Go through the free lists for the given migratetype and remove
  * the smallest available page from the freelists
  */
 static inline struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
-						int migratetype)
+						int migratetype, int color_req)
 {
 	unsigned int current_order;
 	struct free_area *area;
 	struct page *page;
+	int cpu = raw_smp_processor_id();
+
+	if (order == 0 && color_req == 1) {
+		int found;
+		/* Colored page request with order = 0 */
+		if (is_realtime(current))
+			printk(KERN_INFO "COLORED PAGE IS REQUESTED on CPU%d\n", cpu);
+		/* Find a page of the appropriate size in the preferred list */
+		for (current_order = order; current_order < MAX_PARTITIONED_ORDER; ++current_order) {
+			area = &(zone->free_area_d[cpu][current_order]);
+			if (list_empty(&area->free_list[migratetype]))
+				continue;
+
+			/* take the first page that falls in this CPU's LLC
+			 * partition; if none qualifies at this order, move on
+			 * instead of walking past the end of the list */
+			found = 0;
+			list_for_each_entry(page, &area->free_list[migratetype], lru) {
+				if (is_in_llc_partition(page, cpu)) {
+					found = 1;
+					break;
+				}
+			}
+			if (!found)
+				continue;
+
+			list_del(&page->lru);
+			rmv_page_order(page);
+			area->nr_free--;
+			expand(zone, page, order, current_order, area, migratetype);
+			set_freepage_migratetype(page, migratetype);
+			return page;
+		}
+	} else {
+		/* Buddy allocator */
+		/* Find a page of the appropriate size in the preferred list */
+		for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+			area = &(zone->free_area[current_order]);
+			if (list_empty(&area->free_list[migratetype]))
+				continue;
 
-	/* Find a page of the appropriate size in the preferred list */
-	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
-		area = &(zone->free_area[current_order]);
-		if (list_empty(&area->free_list[migratetype]))
-			continue;
-
-		page = 
list_entry(area->free_list[migratetype].next,
-							struct page, lru);
-		list_del(&page->lru);
-		rmv_page_order(page);
-		area->nr_free--;
-		expand(zone, page, order, current_order, area, migratetype);
-		set_freepage_migratetype(page, migratetype);
-		return page;
+			page = list_entry(area->free_list[migratetype].next,
+							struct page, lru);
+			list_del(&page->lru);
+			rmv_page_order(page);
+			area->nr_free--;
+			expand(zone, page, order, current_order, area, migratetype);
+			set_freepage_migratetype(page, migratetype);
+			return page;
+		}
 	}
-
+
 	return NULL;
 }
 
@@ -1050,7 +1253,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
 static struct page *__rmqueue_cma_fallback(struct zone *zone,
 					unsigned int order)
 {
-	return __rmqueue_smallest(zone, order, MIGRATE_CMA);
+	return __rmqueue_smallest(zone, order, MIGRATE_CMA, 0);
 }
 #else
 static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
@@ -1291,12 +1494,12 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
  * Call me with the zone->lock already held.
  */
 static struct page *__rmqueue(struct zone *zone, unsigned int order,
-						int migratetype)
+						int migratetype, int color_req)
 {
 	struct page *page;
 
 retry_reserve:
-	page = __rmqueue_smallest(zone, order, migratetype);
+	page = __rmqueue_smallest(zone, order, migratetype, color_req);
 
 	if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
 		if (migratetype == MIGRATE_MOVABLE)
@@ -1333,7 +1536,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
-		struct page *page = __rmqueue(zone, order, migratetype);
+		struct page *page = __rmqueue(zone, order, migratetype, 1);
 		if (unlikely(page == NULL))
 			break;
 
@@ -1543,6 +1746,8 @@ void free_hot_cold_page(struct page *page, bool cold)
 	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
+	int cpu;
+	int is_local, is_in_pcp;
 
 	if (!free_pages_prepare(page, 0))
 		return;
@@ -1566,19 +1771,33 @@ void free_hot_cold_page(struct page *page, bool cold)
 		}
 		migratetype = MIGRATE_MOVABLE;
 	}
-
-	pcp = &this_cpu_ptr(zone->pageset)->pcp;
-	if (!cold)
-		list_add(&page->lru, &pcp->lists[migratetype]);
+
+	cpu = bank_to_partition(page_bank(page));
+	BUG_ON(cpu < 0);
+
+	if (cpu == smp_processor_id())
+		is_local = 1;
 	else
-		list_add_tail(&page->lru, &pcp->lists[migratetype]);
-	pcp->count++;
-	if (pcp->count >= pcp->high) {
-		unsigned long batch = READ_ONCE(pcp->batch);
-		free_pcppages_bulk(zone, batch, pcp);
-		pcp->count -= batch;
+		is_local = 0;
+
+	is_in_pcp = is_in_llc_partition(page, smp_processor_id());
+	if (cpu != NR_CPUS)
+		printk(KERN_ALERT "CPU%d Free order-0 page bank = %d, color = %d, is_local %d is_in_pcp %d\n", smp_processor_id(), page_bank(page), page_color(page), is_local, is_in_pcp);
+	if (is_local && is_in_pcp) {
+		pcp = &this_cpu_ptr(zone->pageset)->pcp;
+		if (!cold)
+			list_add(&page->lru, &pcp->lists[migratetype]);
+		else
+			list_add_tail(&page->lru, &pcp->lists[migratetype]);
+		pcp->count++;
+		if (pcp->count >= pcp->high) {
+			unsigned long batch = READ_ONCE(pcp->batch);
+			free_pcppages_bulk(zone, batch, pcp);
+			pcp->count -= batch;
+		}
+	} else {
+		/* calling __free_page() here would recurse into free_hot_cold_page() */
+		free_one_page(zone, page, pfn, 0, migratetype);
 	}
-
 out:
 	local_irq_restore(flags);
 }
@@ -1706,8 +1925,9 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 	bool cold = ((gfp_flags & __GFP_COLD) != 0);
-
-	if (likely(order == 0)) {
+	bool colored_req = ((gfp_flags & __GFP_COLOR) != 0);
+
+	if (likely(order == 0) && colored_req) {
 		struct per_cpu_pages *pcp;
 		struct list_head 
*list; @@ -1744,7 +1964,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, WARN_ON_ONCE(order > 1); } spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order, migratetype); + page = __rmqueue(zone, order, migratetype, 0); spin_unlock(&zone->lock); if (!page) goto failed; @@ -4231,10 +4451,23 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, static void __meminit zone_init_free_lists(struct zone *zone) { unsigned int order, t; + int cpu; + for_each_migratetype_order(order, t) { INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); zone->free_area[order].nr_free = 0; } + + /* Initialize per-partition free_area data structures */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + for (order = 0; order < MAX_PARTITIONED_ORDER; order++) { + for (t = 0; t < MIGRATE_TYPES; t++) { + INIT_LIST_HEAD(&zone->free_area_d[cpu][order].free_list[t]); + zone->free_area_d[cpu][order].nr_free = 0; + printk(KERN_ALERT "free_area_d[%d][%d].free_list[%d] init.\n", cpu, order, t); + } + } + } } #ifndef __HAVE_ARCH_MEMMAP_INIT diff --git a/mm/vmstat.c b/mm/vmstat.c index 4f5cd974e11a..4bbf65f7335b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -956,6 +956,67 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, } } +static void pagetypeinfo_showpartitionfree_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone, int cpu) +{ + int order, mtype; + + for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) { + seq_printf(m, "Node %4d, zone %8s, type %12s ", + pgdat->node_id, + zone->name, + migratetype_names[mtype]); + for (order = 0; order < MAX_PARTITIONED_ORDER; ++order) { + unsigned long freecount = 0; + struct free_area *area; + struct list_head *curr; + + area = &(zone->free_area_d[cpu][order]); + + list_for_each(curr, &area->free_list[mtype]) + freecount++; + seq_printf(m, "%6lu ", freecount); + } + seq_putc(m, '\n'); + } +} + +static void walk_zones_in_node_in_partition(struct seq_file *m, pg_data_t *pgdat, + int cpu, void (*print)(struct seq_file *m, pg_data_t *, struct zone *, int)) +{ + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + print(m, pgdat, zone, cpu); + spin_unlock_irqrestore(&zone->lock, flags); + } +} + +/* Print out the free pages at each order for each migatetype and partition */ +static int pagetypeinfo_showpartitioned(struct seq_file *m, void *arg) +{ + int order, cpu; + pg_data_t *pgdat = (pg_data_t *)arg; + + for_each_online_cpu(cpu) { + /* Print header */ + seq_putc(m, '\n'); + seq_printf(m, "CPU%d %-43s ", cpu, "free pages count per migrate type at order"); + for (order = 0; order < MAX_PARTITIONED_ORDER; ++order) + seq_printf(m, "%6d ", order); + seq_putc(m, '\n'); + + walk_zones_in_node_in_partition(m, pgdat, cpu, pagetypeinfo_showpartitionfree_print); + } + return 0; +} + /* Print out the free pages at each order for each migatetype */ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) { @@ -1138,7 +1199,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) pagetypeinfo_showfree(m, pgdat); pagetypeinfo_showblockcount(m, pgdat); pagetypeinfo_showmixedcount(m, pgdat); - + pagetypeinfo_showpartitioned(m, pgdat); return 0; } @@ -1180,10 +1241,27 @@ static const struct file_operations pagetypeinfo_file_ops = { .release = seq_release, }; +#define BANK_MASK 0x38000000 +#define BANK_SHIFT 27 +#define CACHE_MASK 
0x0000f000 +#define CACHE_SHIFT 12 +/* Decoding page bank number, 0~7 */ +static inline unsigned int page_bank(struct page *page) +{ + return ((page_to_phys(page)& BANK_MASK) >> BANK_SHIFT); +} +/* Decoding page color, 0~15 */ +static inline unsigned int page_color(struct page *page) +{ + return ((page_to_phys(page)& CACHE_MASK) >> CACHE_SHIFT); +} + static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { int i; + int mtype; + seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); seq_printf(m, "\n pages free %lu" @@ -1232,6 +1310,15 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n vm stats threshold: %d", pageset->stat_threshold); #endif + /* test */ + seq_printf(m, "\n"); + for (mtype = 0; mtype < MIGRATE_PCPTYPES; mtype++) { + struct page *p; + list_for_each_entry(p, &pageset->pcp.lists[mtype], lru) { + if (p) + seq_printf(m, "page bank = %d color = %d\n", page_bank(p), page_color(p)); + } + } } seq_printf(m, "\n all_unreclaimable: %u" -- cgit v1.2.2
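
A note on the address decoding used throughout the series: on the i.MX6 SABRE-SD mapping hard-coded above, bits 12-15 of a physical address select one of MAX_NUM_COLOR = 16 LLC colors and bits 27-29 select one of MAX_NUM_BANK = 8 DRAM banks, so every page lands in one of 128 (bank, color) buckets, matching the MAX_COLOR_NODE = 128 list heads in zone->color_list[]. The stand-alone sketch below only exercises that arithmetic; the bucket formula mirrors the one used in build_colored_pages() and is an illustration, not author-verified code.

#include <stdio.h>
#include <stdint.h>

#define BANK_MASK   0x38000000
#define BANK_SHIFT  27
#define CACHE_MASK  0x0000f000
#define CACHE_SHIFT 12

/* color: physical address bits 12-15 (16 LLC colors) */
static unsigned int phys_color(uint32_t phys) { return (phys & CACHE_MASK) >> CACHE_SHIFT; }
/* bank: physical address bits 27-29 (8 DRAM banks) */
static unsigned int phys_bank(uint32_t phys) { return (phys & BANK_MASK) >> BANK_SHIFT; }

int main(void)
{
	uint32_t phys = 0x1a345000;	/* example physical page address */
	unsigned int color = phys_color(phys);
	unsigned int bank = phys_bank(phys);
	/* one list head per (bank, color) pair; 8 * 16 = 128 == MAX_COLOR_NODE */
	unsigned int node = bank * 16 + color;

	printf("phys=0x%08x -> color=%u bank=%u node=%u\n",
	       (unsigned)phys, color, bank, node);
	return 0;
}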
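
The partitioned branch of __free_one_page() caps buddy merging at MAX_PARTITIONED_ORDER and, when handed a block of order >= MAX_PARTITIONED_ORDER, splits it into 1 << (order - MAX_PARTITIONED_ORDER + 1) chunks of order MAX_PARTITIONED_ORDER - 1 before queueing them on free_area_d[]. A quick user-space check that the split covers the freed block exactly, with MAX_PARTITIONED_ORDER = 3 as in the patch:

#include <stdio.h>

#define MAX_PARTITIONED_ORDER 3

int main(void)
{
	int order;

	for (order = MAX_PARTITIONED_ORDER; order <= 6; order++) {
		int chunks = 1 << (order - MAX_PARTITIONED_ORDER + 1);
		int pages_per_chunk = 1 << (MAX_PARTITIONED_ORDER - 1);

		/* chunks * pages_per_chunk must equal the 2^order pages freed */
		printf("order %d: %d chunks x %d pages = %d (block = %d)\n",
		       order, chunks, pages_per_chunk,
		       chunks * pages_per_chunk, 1 << order);
	}
	return 0;
}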
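
On the allocation side, a caller opts into the per-partition lists by OR-ing __GFP_COLOR (GFP_COLOR) into the gfp mask of an order-0 request, which steers buffered_rmqueue() into __rmqueue_smallest()'s colored path on the calling CPU. The patch's own alloc_colored_page() in litmus/litmus.c is the canonical caller; below is a minimal sketch of a hypothetical in-kernel user (grab_colored_page() is not part of the patch):

#include <linux/gfp.h>
#include <linux/mm.h>

/* Hypothetical helper: ask the partitioned allocator for one order-0
 * page drawn from the calling CPU's DRAM bank / LLC color partition. */
static struct page *grab_colored_page(void)
{
	struct page *page;

	/* GFP_COLOR routes the request to free_area_d[] for this CPU */
	page = alloc_pages(GFP_HIGHUSER_MOVABLE | GFP_COLOR, 0);
	if (!page)
		return NULL;	/* the partition's free lists were empty */

	return page;		/* release later with __free_page() */
}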