From c78e93630d15b5f5774213aad9bdc9f52473a89b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 12 Nov 2013 15:08:15 -0800 Subject: mm: do not walk all of system memory during show_mem It has been reported on very large machines that show_mem is taking almost 5 minutes to display information. This is a serious problem if there is an OOM storm. The bulk of the cost is in show_mem doing a very expensive PFN walk to give us the following information Total RAM: Also available as totalram_pages Highmem pages: Also available as totalhigh_pages Reserved pages: Can be inferred from the zone structure Shared pages: PFN walk required Unshared pages: PFN walk required Quick pages: Per-cpu walk required Only the shared/unshared pages requires a full PFN walk but that information is useless. It is also inaccurate as page pins of unshared pages would be accounted for as shared. Even if the information was accurate, I'm struggling to think how the shared/unshared information could be useful for debugging OOM conditions. Maybe it was useful before rmap existed when reclaiming shared pages was costly but it is less relevant today. The PFN walk could be optimised a bit but why bother as the information is useless. This patch deletes the PFN walker and infers the total RAM, highmem and reserved pages count from struct zone. It omits the shared/unshared page usage on the grounds that it is useless. It also corrects the reporting of HighMem as HighMem/MovableOnly as ZONE_MOVABLE has similar problems to HighMem with respect to lowmem/highmem exhaustion. Signed-off-by: Mel Gorman Cc: David Rientjes Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/show_mem.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) (limited to 'lib') diff --git a/lib/show_mem.c b/lib/show_mem.c index b7c72311ad0c..5847a4921b8e 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -12,8 +12,7 @@ void show_mem(unsigned int filter) { pg_data_t *pgdat; - unsigned long total = 0, reserved = 0, shared = 0, - nonshared = 0, highmem = 0; + unsigned long total = 0, reserved = 0, highmem = 0; printk("Mem-Info:\n"); show_free_areas(filter); @@ -22,43 +21,27 @@ void show_mem(unsigned int filter) return; for_each_online_pgdat(pgdat) { - unsigned long i, flags; + unsigned long flags; + int zoneid; pgdat_resize_lock(pgdat, &flags); - for (i = 0; i < pgdat->node_spanned_pages; i++) { - struct page *page; - unsigned long pfn = pgdat->node_start_pfn + i; - - if (unlikely(!(i % MAX_ORDER_NR_PAGES))) - touch_nmi_watchdog(); - - if (!pfn_valid(pfn)) + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) continue; - page = pfn_to_page(pfn); - - if (PageHighMem(page)) - highmem++; + total += zone->present_pages; + reserved = zone->present_pages - zone->managed_pages; - if (PageReserved(page)) - reserved++; - else if (page_count(page) == 1) - nonshared++; - else if (page_count(page) > 1) - shared += page_count(page) - 1; - - total++; + if (is_highmem_idx(zoneid)) + highmem += zone->present_pages; } pgdat_resize_unlock(pgdat, &flags); } printk("%lu pages RAM\n", total); -#ifdef CONFIG_HIGHMEM - printk("%lu pages HighMem\n", highmem); -#endif + printk("%lu pages HighMem/MovableOnly\n", highmem); printk("%lu pages reserved\n", reserved); - printk("%lu pages shared\n", shared); - printk("%lu pages non-shared\n", nonshared); #ifdef CONFIG_QUICKLIST printk("%lu pages in pagetable cache\n", quicklist_total_size()); -- cgit v1.2.2 From 623fd8072c7c4d77a184bc9e35192acf480c18e4 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Tue, 12 Nov 2013 15:08:34 -0800 Subject: percpu: add test module for various percpu operations Tests various percpu operations. Enable with CONFIG_PERCPU_TEST=m. Signed-off-by: Greg Thelen Acked-by: Tejun Heo Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 9 ++++ lib/Makefile | 2 + lib/percpu_test.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+) create mode 100644 lib/percpu_test.c (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ebef88f61b7d..db25707aa41b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1481,6 +1481,15 @@ config INTERVAL_TREE_TEST help A benchmark measuring the performance of the interval tree library +config PERCPU_TEST + tristate "Per cpu operations test" + depends on m && DEBUG_KERNEL + help + Enable this option to build test module which validates per-cpu + operations. + + If unsure, say N. + config ATOMIC64_SELFTEST bool "Perform an atomic64_t self-test at boot" help diff --git a/lib/Makefile b/lib/Makefile index f3bb2cb98adf..bb016e116ba4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -157,6 +157,8 @@ obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o interval_tree_test-objs := interval_tree_test_main.o interval_tree.o +obj-$(CONFIG_PERCPU_TEST) += percpu_test.o + obj-$(CONFIG_ASN1) += asn1_decoder.o obj-$(CONFIG_FONT_SUPPORT) += fonts/ diff --git a/lib/percpu_test.c b/lib/percpu_test.c new file mode 100644 index 000000000000..0b5d14dadd1a --- /dev/null +++ b/lib/percpu_test.c @@ -0,0 +1,138 @@ +#include + +/* validate @native and @pcp counter values match @expected */ +#define CHECK(native, pcp, expected) \ + do { \ + WARN((native) != (expected), \ + "raw %ld (0x%lx) != expected %lld (0x%llx)", \ + (native), (native), \ + (long long)(expected), (long long)(expected)); \ + WARN(__this_cpu_read(pcp) != (expected), \ + "pcp %ld (0x%lx) != expected %lld (0x%llx)", \ + __this_cpu_read(pcp), __this_cpu_read(pcp), \ + (long long)(expected), (long long)(expected)); \ + } while (0) + +static DEFINE_PER_CPU(long, long_counter); +static DEFINE_PER_CPU(unsigned long, ulong_counter); + +static int __init percpu_test_init(void) +{ + /* + * volatile prevents compiler from optimizing it uses, otherwise the + * +ul_one/-ul_one below would replace with inc/dec instructions. + */ + volatile unsigned int ui_one = 1; + long l = 0; + unsigned long ul = 0; + + pr_info("percpu test start\n"); + + preempt_disable(); + + l += -1; + __this_cpu_add(long_counter, -1); + CHECK(l, long_counter, -1); + + l += 1; + __this_cpu_add(long_counter, 1); + CHECK(l, long_counter, 0); + + ul = 0; + __this_cpu_write(ulong_counter, 0); + + ul += 1UL; + __this_cpu_add(ulong_counter, 1UL); + CHECK(ul, ulong_counter, 1); + + ul += -1UL; + __this_cpu_add(ulong_counter, -1UL); + CHECK(ul, ulong_counter, 0); + + ul += -(unsigned long)1; + __this_cpu_add(ulong_counter, -(unsigned long)1); + CHECK(ul, ulong_counter, -1); + + ul = 0; + __this_cpu_write(ulong_counter, 0); + + ul -= 1; + __this_cpu_dec(ulong_counter); + CHECK(ul, ulong_counter, -1); + CHECK(ul, ulong_counter, ULONG_MAX); + + l += -ui_one; + __this_cpu_add(long_counter, -ui_one); + CHECK(l, long_counter, 0xffffffff); + + l += ui_one; + __this_cpu_add(long_counter, ui_one); + CHECK(l, long_counter, (long)0x100000000LL); + + + l = 0; + __this_cpu_write(long_counter, 0); + + l -= ui_one; + __this_cpu_sub(long_counter, ui_one); + CHECK(l, long_counter, -1); + + l = 0; + __this_cpu_write(long_counter, 0); + + l += ui_one; + __this_cpu_add(long_counter, ui_one); + CHECK(l, long_counter, 1); + + l += -ui_one; + __this_cpu_add(long_counter, -ui_one); + CHECK(l, long_counter, (long)0x100000000LL); + + l = 0; + __this_cpu_write(long_counter, 0); + + l -= ui_one; + this_cpu_sub(long_counter, ui_one); + CHECK(l, long_counter, -1); + CHECK(l, long_counter, ULONG_MAX); + + ul = 0; + __this_cpu_write(ulong_counter, 0); + + ul += ui_one; + __this_cpu_add(ulong_counter, ui_one); + CHECK(ul, ulong_counter, 1); + + ul = 0; + __this_cpu_write(ulong_counter, 0); + + ul -= ui_one; + __this_cpu_sub(ulong_counter, ui_one); + CHECK(ul, ulong_counter, -1); + CHECK(ul, ulong_counter, ULONG_MAX); + + ul = 3; + __this_cpu_write(ulong_counter, 3); + + ul = this_cpu_sub_return(ulong_counter, ui_one); + CHECK(ul, ulong_counter, 2); + + ul = __this_cpu_sub_return(ulong_counter, ui_one); + CHECK(ul, ulong_counter, 1); + + preempt_enable(); + + pr_info("percpu test done\n"); + return -EAGAIN; /* Fail will directly unload the module */ +} + +static void __exit percpu_test_exit(void) +{ +} + +module_init(percpu_test_init) +module_exit(percpu_test_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Greg Thelen"); +MODULE_DESCRIPTION("percpu operations test"); -- cgit v1.2.2 From 312b4e226951f707e120b95b118cbc14f3d162b2 Mon Sep 17 00:00:00 2001 From: Ryan Mallon Date: Tue, 12 Nov 2013 15:08:51 -0800 Subject: vsprintf: check real user/group id for %pK Some setuid binaries will allow reading of files which have read permission by the real user id. This is problematic with files which use %pK because the file access permission is checked at open() time, but the kptr_restrict setting is checked at read() time. If a setuid binary opens a %pK file as an unprivileged user, and then elevates permissions before reading the file, then kernel pointer values may be leaked. This happens for example with the setuid pppd application on Ubuntu 12.04: $ head -1 /proc/kallsyms 00000000 T startup_32 $ pppd file /proc/kallsyms pppd: In file /proc/kallsyms: unrecognized option 'c1000000' This will only leak the pointer value from the first line, but other setuid binaries may leak more information. Fix this by adding a check that in addition to the current process having CAP_SYSLOG, that effective user and group ids are equal to the real ids. If a setuid binary reads the contents of a file which uses %pK then the pointer values will be printed as NULL if the real user is unprivileged. Update the sysctl documentation to reflect the changes, and also correct the documentation to state the kptr_restrict=0 is the default. This is a only temporary solution to the issue. The correct solution is to do the permission check at open() time on files, and to replace %pK with a function which checks the open() time permission. %pK uses in printk should be removed since no sane permission check can be done, and instead protected by using dmesg_restrict. Signed-off-by: Ryan Mallon Cc: Kees Cook Cc: Alexander Viro Cc: Joe Perches Cc: "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 26559bdb4c49..d76555c45ff4 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include /* for PAGE_SIZE */ @@ -1312,11 +1313,37 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, spec.field_width = default_width; return string(buf, end, "pK-error", spec); } - if (!((kptr_restrict == 0) || - (kptr_restrict == 1 && - has_capability_noaudit(current, CAP_SYSLOG)))) + + switch (kptr_restrict) { + case 0: + /* Always print %pK values */ + break; + case 1: { + /* + * Only print the real pointer value if the current + * process has CAP_SYSLOG and is running with the + * same credentials it started with. This is because + * access to files is checked at open() time, but %pK + * checks permission at read() time. We don't want to + * leak pointer values if a binary opens a file using + * %pK and then elevates privileges before reading it. + */ + const struct cred *cred = current_cred(); + + if (!has_capability_noaudit(current, CAP_SYSLOG) || + !uid_eq(cred->euid, cred->uid) || + !gid_eq(cred->egid, cred->gid)) + ptr = NULL; + break; + } + case 2: + default: + /* Always print 0's for %pK */ ptr = NULL; + break; + } break; + case 'N': switch (fmt[1]) { case 'F': -- cgit v1.2.2 From d3773ba13c8ca63a4ee9e7926813f6e82be27a6c Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Tue, 12 Nov 2013 15:09:49 -0800 Subject: lib/debugobjects.c: remove unnecessary work pending test Remove unnecessary work pending test before calling schedule_work(). It has been tested in queue_work_on() already. No functional changed. Signed-off-by: Xie XiuQi Reviewed-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/debugobjects.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index bf2c8b1043d8..e0731c3db706 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -196,7 +196,7 @@ static void free_object(struct debug_obj *obj) * initialized: */ if (obj_pool_free > ODEBUG_POOL_SIZE && obj_cache) - sched = keventd_up() && !work_pending(&debug_obj_work); + sched = keventd_up(); hlist_add_head(&obj->node, &obj_pool); obj_pool_free++; obj_pool_used--; -- cgit v1.2.2 From c0d92a57a88586586b8cb9c7ac149bd10bc40d11 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Tue, 12 Nov 2013 15:09:50 -0800 Subject: lib/vsprintf.c: document formats for dentry and struct file Looks like these were added to Documentation/printk-formats.txt but not the in-file table. Signed-off-by: Olof Johansson Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index d76555c45ff4..48586ac3a62e 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1219,6 +1219,8 @@ int kptr_restrict __read_mostly; * The maximum supported length is 64 bytes of the input. Consider * to use print_hex_dump() for the larger input. * - 'a' For a phys_addr_t type and its derivative types (passed by reference) + * - 'd[234]' For a dentry name (optionally 2-4 last components) + * - 'D[234]' Same as 'd' but for a struct file * * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 * function pointers are really function descriptors, which contain a -- cgit v1.2.2 From ff6092a8a413f3db1938b4606e078fa751ed6cdb Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Tue, 12 Nov 2013 15:09:51 -0800 Subject: lib/digsig.c: use ERR_CAST inlined function instead of ERR_PTR(PTR_ERR(...)) Signed-off-by: Duan Jiong Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/digsig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/digsig.c b/lib/digsig.c index 2f31e6a45f0a..8793aeda30ca 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -209,7 +209,7 @@ int digsig_verify(struct key *keyring, const char *sig, int siglen, kref = keyring_search(make_key_ref(keyring, 1UL), &key_type_user, name); if (IS_ERR(kref)) - key = ERR_PTR(PTR_ERR(kref)); + key = ERR_CAST(kref); else key = key_ref_to_ptr(kref); } else { -- cgit v1.2.2 From 684f0d3d14f2744ae7ad79063b21909e32bf444e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 12 Nov 2013 15:09:52 -0800 Subject: lib/genalloc: add a helper function for DMA buffer allocation When using pool space for DMA buffer, there might be duplicated calling of gen_pool_alloc() and gen_pool_virt_to_phys() in each implementation. Thus it's better to add a simple helper function, a compatible one to the common dma_alloc_coherent(), to save some code. Signed-off-by: Nicolin Chen Cc: "Hans J. Koch" Cc: Dan Williams Cc: Eric Miao Cc: Grant Likely Cc: Greg Kroah-Hartman Cc: Haojian Zhuang Cc: Jaroslav Kysela Cc: Kevin Hilman Cc: Liam Girdwood Cc: Mark Brown Cc: Mauro Carvalho Chehab Cc: Rob Herring Cc: Russell King Cc: Sekhar Nori Cc: Takashi Iwai Cc: Vinod Koul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'lib') diff --git a/lib/genalloc.c b/lib/genalloc.c index 26cf20be72b7..dda31168844f 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -312,6 +312,34 @@ retry: } EXPORT_SYMBOL(gen_pool_alloc); +/** + * gen_pool_dma_alloc - allocate special memory from the pool for DMA usage + * @pool: pool to allocate from + * @size: number of bytes to allocate from the pool + * @dma: dma-view physical address + * + * Allocate the requested number of bytes from the specified pool. + * Uses the pool allocation function (with first-fit algorithm by default). + * Can not be used in NMI handler on architectures without + * NMI-safe cmpxchg implementation. + */ +void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma) +{ + unsigned long vaddr; + + if (!pool) + return NULL; + + vaddr = gen_pool_alloc(pool, size); + if (!vaddr) + return NULL; + + *dma = gen_pool_virt_to_phys(pool, vaddr); + + return (void *)vaddr; +} +EXPORT_SYMBOL(gen_pool_dma_alloc); + /** * gen_pool_free - free allocated special memory back to the pool * @pool: pool to free to -- cgit v1.2.2