aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndi Kleen <ak@linux.intel.com>2012-12-11 19:01:34 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-11 20:22:25 -0500
commit42d7395feb56f0655cd8b68e06fc6063823449f8 (patch)
tree47cfbad1737d98d9752a2aab7e525f1fe5194d27
parentff604cf6d41f1e05f34762e1d764fe14a0f5f964 (diff)
mm: support more pagesizes for MAP_HUGETLB/SHM_HUGETLB
There was some desire in large applications using MAP_HUGETLB or SHM_HUGETLB to use 1GB huge pages on some mappings, and stay with 2MB on others. This is useful together with NUMA policy: use 2MB interleaving on some mappings, but 1GB on local mappings. This patch extends the IPC/SHM syscall interfaces slightly to allow specifying the page size. It borrows some upper bits in the existing flag arguments and allows encoding the log of the desired page size in addition to the *_HUGETLB flag. When 0 is specified the default size is used; this makes the change fully compatible. Extending the internal hugetlb code to handle this is straightforward. Instead of a single mount it just keeps an array of them and selects the right mount based on the specified page size. When no page size is specified it uses the mount of the default page size. The change is not visible in /proc/mounts because internal mounts don't appear there. It also has very little overhead: the additional mounts just consume a super block, but no more memory when not used. I also exported the new flags to the user headers (they were previously under __KERNEL__). Right now only symbols for x86 and some other architectures for 1GB and 2MB are defined. The interface should already work for all other architectures though. Only architectures that define multiple hugetlb sizes actually need it (that is currently x86, tile, powerpc). However tile and powerpc have user configurable hugetlb sizes, so it's not easy to add defines. A program on those architectures would need to query sysfs and use the appropriate log2. 
[akpm@linux-foundation.org: cleanups] [rientjes@google.com: fix build] [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Andi Kleen <ak@linux.intel.com> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Acked-by: Rik van Riel <riel@redhat.com> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hillf Danton <dhillf@gmail.com> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--arch/alpha/include/asm/mman.h11
-rw-r--r--arch/mips/include/uapi/asm/mman.h11
-rw-r--r--arch/parisc/include/uapi/asm/mman.h11
-rw-r--r--arch/x86/include/asm/mman.h3
-rw-r--r--arch/xtensa/include/uapi/asm/mman.h11
-rw-r--r--fs/hugetlbfs/inode.c63
-rw-r--r--include/linux/hugetlb.h7
-rw-r--r--include/linux/shm.h15
-rw-r--r--include/uapi/asm-generic/mman-common.h11
-rw-r--r--include/uapi/asm-generic/mman.h2
-rw-r--r--ipc/shm.c3
-rw-r--r--mm/mmap.c5
12 files changed, 135 insertions, 18 deletions
diff --git a/arch/alpha/include/asm/mman.h b/arch/alpha/include/asm/mman.h
index cbeb3616a28e..0086b472bc2b 100644
--- a/arch/alpha/include/asm/mman.h
+++ b/arch/alpha/include/asm/mman.h
@@ -63,4 +63,15 @@
63/* compatibility flags */ 63/* compatibility flags */
64#define MAP_FILE 0 64#define MAP_FILE 0
65 65
66/*
67 * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
68 * This gives us 6 bits, which is enough until someone invents 128 bit address
69 * spaces.
70 *
71 * Assume these are all power of twos.
72 * When 0 use the default page size.
73 */
74#define MAP_HUGE_SHIFT 26
75#define MAP_HUGE_MASK 0x3f
76
66#endif /* __ALPHA_MMAN_H__ */ 77#endif /* __ALPHA_MMAN_H__ */
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 46d3da0d4b92..9a936ac9a942 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -87,4 +87,15 @@
87/* compatibility flags */ 87/* compatibility flags */
88#define MAP_FILE 0 88#define MAP_FILE 0
89 89
90/*
91 * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
92 * This gives us 6 bits, which is enough until someone invents 128 bit address
93 * spaces.
94 *
95 * Assume these are all power of twos.
96 * When 0 use the default page size.
97 */
98#define MAP_HUGE_SHIFT 26
99#define MAP_HUGE_MASK 0x3f
100
90#endif /* _ASM_MMAN_H */ 101#endif /* _ASM_MMAN_H */
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 12219ebce869..294d251ca7b2 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -70,4 +70,15 @@
70#define MAP_FILE 0 70#define MAP_FILE 0
71#define MAP_VARIABLE 0 71#define MAP_VARIABLE 0
72 72
73/*
74 * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
75 * This gives us 6 bits, which is enough until someone invents 128 bit address
76 * spaces.
77 *
78 * Assume these are all power of twos.
79 * When 0 use the default page size.
80 */
81#define MAP_HUGE_SHIFT 26
82#define MAP_HUGE_MASK 0x3f
83
73#endif /* __PARISC_MMAN_H__ */ 84#endif /* __PARISC_MMAN_H__ */
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
index 593e51d4643f..513b05f15bb4 100644
--- a/arch/x86/include/asm/mman.h
+++ b/arch/x86/include/asm/mman.h
@@ -3,6 +3,9 @@
3 3
4#define MAP_32BIT 0x40 /* only give out 32bit addresses */ 4#define MAP_32BIT 0x40 /* only give out 32bit addresses */
5 5
6#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
7#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
8
6#include <asm-generic/mman.h> 9#include <asm-generic/mman.h>
7 10
8#endif /* _ASM_X86_MMAN_H */ 11#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 25bc6c1309c3..00eed6786d7e 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -93,4 +93,15 @@
93/* compatibility flags */ 93/* compatibility flags */
94#define MAP_FILE 0 94#define MAP_FILE 0
95 95
96/*
97 * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
98 * This gives us 6 bits, which is enough until someone invents 128 bit address
99 * spaces.
100 *
101 * Assume these are all power of twos.
102 * When 0 use the default page size.
103 */
104#define MAP_HUGE_SHIFT 26
105#define MAP_HUGE_MASK 0x3f
106
96#endif /* _XTENSA_MMAN_H */ 107#endif /* _XTENSA_MMAN_H */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c5bc355d8243..21b8a4875237 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -923,7 +923,7 @@ static struct file_system_type hugetlbfs_fs_type = {
923 .kill_sb = kill_litter_super, 923 .kill_sb = kill_litter_super,
924}; 924};
925 925
926static struct vfsmount *hugetlbfs_vfsmount; 926static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
927 927
928static int can_do_hugetlb_shm(void) 928static int can_do_hugetlb_shm(void)
929{ 929{
@@ -932,9 +932,22 @@ static int can_do_hugetlb_shm(void)
932 return capable(CAP_IPC_LOCK) || in_group_p(shm_group); 932 return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
933} 933}
934 934
935static int get_hstate_idx(int page_size_log)
936{
937 struct hstate *h;
938
939 if (!page_size_log)
940 return default_hstate_idx;
941 h = size_to_hstate(1 << page_size_log);
942 if (!h)
943 return -1;
944 return h - hstates;
945}
946
935struct file *hugetlb_file_setup(const char *name, unsigned long addr, 947struct file *hugetlb_file_setup(const char *name, unsigned long addr,
936 size_t size, vm_flags_t acctflag, 948 size_t size, vm_flags_t acctflag,
937 struct user_struct **user, int creat_flags) 949 struct user_struct **user,
950 int creat_flags, int page_size_log)
938{ 951{
939 int error = -ENOMEM; 952 int error = -ENOMEM;
940 struct file *file; 953 struct file *file;
@@ -944,9 +957,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
944 struct qstr quick_string; 957 struct qstr quick_string;
945 struct hstate *hstate; 958 struct hstate *hstate;
946 unsigned long num_pages; 959 unsigned long num_pages;
960 int hstate_idx;
961
962 hstate_idx = get_hstate_idx(page_size_log);
963 if (hstate_idx < 0)
964 return ERR_PTR(-ENODEV);
947 965
948 *user = NULL; 966 *user = NULL;
949 if (!hugetlbfs_vfsmount) 967 if (!hugetlbfs_vfsmount[hstate_idx])
950 return ERR_PTR(-ENOENT); 968 return ERR_PTR(-ENOENT);
951 969
952 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 970 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
@@ -963,7 +981,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
963 } 981 }
964 } 982 }
965 983
966 root = hugetlbfs_vfsmount->mnt_root; 984 root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
967 quick_string.name = name; 985 quick_string.name = name;
968 quick_string.len = strlen(quick_string.name); 986 quick_string.len = strlen(quick_string.name);
969 quick_string.hash = 0; 987 quick_string.hash = 0;
@@ -971,7 +989,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
971 if (!path.dentry) 989 if (!path.dentry)
972 goto out_shm_unlock; 990 goto out_shm_unlock;
973 991
974 path.mnt = mntget(hugetlbfs_vfsmount); 992 path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
975 error = -ENOSPC; 993 error = -ENOSPC;
976 inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); 994 inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
977 if (!inode) 995 if (!inode)
@@ -1011,8 +1029,9 @@ out_shm_unlock:
1011 1029
1012static int __init init_hugetlbfs_fs(void) 1030static int __init init_hugetlbfs_fs(void)
1013{ 1031{
1032 struct hstate *h;
1014 int error; 1033 int error;
1015 struct vfsmount *vfsmount; 1034 int i;
1016 1035
1017 error = bdi_init(&hugetlbfs_backing_dev_info); 1036 error = bdi_init(&hugetlbfs_backing_dev_info);
1018 if (error) 1037 if (error)
@@ -1029,14 +1048,26 @@ static int __init init_hugetlbfs_fs(void)
1029 if (error) 1048 if (error)
1030 goto out; 1049 goto out;
1031 1050
1032 vfsmount = kern_mount(&hugetlbfs_fs_type); 1051 i = 0;
1052 for_each_hstate(h) {
1053 char buf[50];
1054 unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
1033 1055
1034 if (!IS_ERR(vfsmount)) { 1056 snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
1035 hugetlbfs_vfsmount = vfsmount; 1057 hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
1036 return 0; 1058 buf);
1037 }
1038 1059
1039 error = PTR_ERR(vfsmount); 1060 if (IS_ERR(hugetlbfs_vfsmount[i])) {
1061 pr_err("hugetlb: Cannot mount internal hugetlbfs for "
1062 "page size %uK", ps_kb);
1063 error = PTR_ERR(hugetlbfs_vfsmount[i]);
1064 hugetlbfs_vfsmount[i] = NULL;
1065 }
1066 i++;
1067 }
1068 /* Non default hstates are optional */
1069 if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
1070 return 0;
1040 1071
1041 out: 1072 out:
1042 kmem_cache_destroy(hugetlbfs_inode_cachep); 1073 kmem_cache_destroy(hugetlbfs_inode_cachep);
@@ -1047,13 +1078,19 @@ static int __init init_hugetlbfs_fs(void)
1047 1078
1048static void __exit exit_hugetlbfs_fs(void) 1079static void __exit exit_hugetlbfs_fs(void)
1049{ 1080{
1081 struct hstate *h;
1082 int i;
1083
1084
1050 /* 1085 /*
1051 * Make sure all delayed rcu free inodes are flushed before we 1086 * Make sure all delayed rcu free inodes are flushed before we
1052 * destroy cache. 1087 * destroy cache.
1053 */ 1088 */
1054 rcu_barrier(); 1089 rcu_barrier();
1055 kmem_cache_destroy(hugetlbfs_inode_cachep); 1090 kmem_cache_destroy(hugetlbfs_inode_cachep);
1056 kern_unmount(hugetlbfs_vfsmount); 1091 i = 0;
1092 for_each_hstate(h)
1093 kern_unmount(hugetlbfs_vfsmount[i++]);
1057 unregister_filesystem(&hugetlbfs_fs_type); 1094 unregister_filesystem(&hugetlbfs_fs_type);
1058 bdi_destroy(&hugetlbfs_backing_dev_info); 1095 bdi_destroy(&hugetlbfs_backing_dev_info);
1059} 1096}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 225164842ab6..3e7fa1acf09c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -183,7 +183,8 @@ extern const struct file_operations hugetlbfs_file_operations;
183extern const struct vm_operations_struct hugetlb_vm_ops; 183extern const struct vm_operations_struct hugetlb_vm_ops;
184struct file *hugetlb_file_setup(const char *name, unsigned long addr, 184struct file *hugetlb_file_setup(const char *name, unsigned long addr,
185 size_t size, vm_flags_t acct, 185 size_t size, vm_flags_t acct,
186 struct user_struct **user, int creat_flags); 186 struct user_struct **user, int creat_flags,
187 int page_size_log);
187 188
188static inline int is_file_hugepages(struct file *file) 189static inline int is_file_hugepages(struct file *file)
189{ 190{
@@ -195,12 +196,14 @@ static inline int is_file_hugepages(struct file *file)
195 return 0; 196 return 0;
196} 197}
197 198
199
198#else /* !CONFIG_HUGETLBFS */ 200#else /* !CONFIG_HUGETLBFS */
199 201
200#define is_file_hugepages(file) 0 202#define is_file_hugepages(file) 0
201static inline struct file * 203static inline struct file *
202hugetlb_file_setup(const char *name, unsigned long addr, size_t size, 204hugetlb_file_setup(const char *name, unsigned long addr, size_t size,
203 vm_flags_t acctflag, struct user_struct **user, int creat_flags) 205 vm_flags_t acctflag, struct user_struct **user, int creat_flags,
206 int page_size_log)
204{ 207{
205 return ERR_PTR(-ENOSYS); 208 return ERR_PTR(-ENOSYS);
206} 209}
diff --git a/include/linux/shm.h b/include/linux/shm.h
index bcf8a6a3ec00..429c1995d756 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -29,6 +29,21 @@ struct shmid_kernel /* private to the kernel */
29#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ 29#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
30#define SHM_NORESERVE 010000 /* don't check for reservations */ 30#define SHM_NORESERVE 010000 /* don't check for reservations */
31 31
32/* Bits [26:31] are reserved */
33
34/*
35 * When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
36 * This gives us 6 bits, which is enough until someone invents 128 bit address
37 * spaces.
38 *
39 * Assume these are all power of twos.
40 * When 0 use the default page size.
41 */
42#define SHM_HUGE_SHIFT 26
43#define SHM_HUGE_MASK 0x3f
44#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
45#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
46
32#ifdef CONFIG_SYSVIPC 47#ifdef CONFIG_SYSVIPC
33long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, 48long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr,
34 unsigned long shmlba); 49 unsigned long shmlba);
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index d030d2c2647a..4164529a94f9 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -55,4 +55,15 @@
55/* compatibility flags */ 55/* compatibility flags */
56#define MAP_FILE 0 56#define MAP_FILE 0
57 57
58/*
59 * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
60 * This gives us 6 bits, which is enough until someone invents 128 bit address
61 * spaces.
62 *
63 * Assume these are all power of twos.
64 * When 0 use the default page size.
65 */
66#define MAP_HUGE_SHIFT 26
67#define MAP_HUGE_MASK 0x3f
68
58#endif /* __ASM_GENERIC_MMAN_COMMON_H */ 69#endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index 32c8bd6a196d..e9fe6fd2a074 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -13,6 +13,8 @@
13#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ 13#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
14#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ 14#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
15 15
16/* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
17
16#define MCL_CURRENT 1 /* lock all current mappings */ 18#define MCL_CURRENT 1 /* lock all current mappings */
17#define MCL_FUTURE 2 /* lock all future mappings */ 19#define MCL_FUTURE 2 /* lock all future mappings */
18 20
diff --git a/ipc/shm.c b/ipc/shm.c
index dff40c9f73c9..4fa6d8fee730 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -495,7 +495,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
495 if (shmflg & SHM_NORESERVE) 495 if (shmflg & SHM_NORESERVE)
496 acctflag = VM_NORESERVE; 496 acctflag = VM_NORESERVE;
497 file = hugetlb_file_setup(name, 0, size, acctflag, 497 file = hugetlb_file_setup(name, 0, size, acctflag,
498 &shp->mlock_user, HUGETLB_SHMFS_INODE); 498 &shp->mlock_user, HUGETLB_SHMFS_INODE,
499 (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
499 } else { 500 } else {
500 /* 501 /*
501 * Do not allow no accounting for OVERCOMMIT_NEVER, even 502 * Do not allow no accounting for OVERCOMMIT_NEVER, even
diff --git a/mm/mmap.c b/mm/mmap.c
index 9a796c41e7d9..ebf19031c5e4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1153,8 +1153,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1153 * memory so no accounting is necessary 1153 * memory so no accounting is necessary
1154 */ 1154 */
1155 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, 1155 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
1156 VM_NORESERVE, &user, 1156 VM_NORESERVE,
1157 HUGETLB_ANONHUGE_INODE); 1157 &user, HUGETLB_ANONHUGE_INODE,
1158 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1158 if (IS_ERR(file)) 1159 if (IS_ERR(file))
1159 return PTR_ERR(file); 1160 return PTR_ERR(file);
1160 } 1161 }