about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Robin Holt <holt@sgi.com> 2006-01-14 16:20:48 -0500
committer Linus Torvalds <torvalds@g5.osdl.org> 2006-01-14 21:27:07 -0500
commit 7339ff8302fd70aabf5f1ae26e0c4905fa74a495 (patch)
tree 38ee561d51b7e4db7c0d6dd9ebd9fc22c2b6ab88
parent 852cf918dcf2ae46468b425e679fbcbf0ea8fdbb (diff)
[PATCH] Add tmpfs options for memory placement policies
Anything that writes into a tmpfs filesystem is liable to disproportionately decrease the available memory on a particular node. Since there's no telling what sort of application (e.g. dd/cp/cat) might be dropping large files there, this lets the admin choose the appropriate default behavior for their site's situation.

Introduce a tmpfs mount option which allows specifying a memory policy and a second option to specify the nodelist for that policy. With the default policy, tmpfs will behave as it does today. This patch adds support for preferred, bind, and interleave policies.

The default policy will cause pages to be added to tmpfs files on the node which is doing the writing. Some jobs expect a single process to create and manage the tmpfs files. This results in a node which has a significantly reduced number of free pages.

With this patch, the administrator can specify the policy and nodes for that policy where they would prefer allocations.

This patch was originally written by Brent Casavant and Hugh Dickins. I added support for the bind and preferred policies and the mpol_nodelist mount option.

Signed-off-by: Brent Casavant <bcasavan@sgi.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- Documentation/filesystems/tmpfs.txt 12
-rw-r--r-- fs/hugetlbfs/inode.c 2
-rw-r--r-- include/linux/mempolicy.h 11
-rw-r--r-- include/linux/shmem_fs.h 2
-rw-r--r-- mm/mempolicy.c 24
-rw-r--r-- mm/shmem.c 39
6 files changed, 75 insertions, 15 deletions
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index 0d783c504ead..dbe4d87d2615 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -78,6 +78,18 @@ use up all the memory on the machine; but enhances the scalability of
78that instance in a system with many cpus making intensive use of it. 78that instance in a system with many cpus making intensive use of it.
79 79
80 80
81tmpfs has a mount option to set the NUMA memory allocation policy for
82all files in that instance:
83mpol=interleave prefers to allocate memory from each node in turn
84mpol=default prefers to allocate memory from the local node
85mpol=bind prefers to allocate from mpol_nodelist
86mpol=preferred prefers to allocate from first node in mpol_nodelist
87
88The following mount option is used in conjunction with mpol=interleave,
89mpol=bind or mpol=preferred:
90mpol_nodelist: nodelist suitable for parsing with nodelist_parse.
91
92
81To specify the initial root directory you can use the following mount 93To specify the initial root directory you can use the following mount
82options: 94options:
83 95
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ab4c3a9d51b8..f568102da1e8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -402,7 +402,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
402 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 402 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
403 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 403 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
404 info = HUGETLBFS_I(inode); 404 info = HUGETLBFS_I(inode);
405 mpol_shared_policy_init(&info->policy); 405 mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
406 switch (mode & S_IFMT) { 406 switch (mode & S_IFMT) {
407 default: 407 default:
408 init_special_inode(inode, mode, dev); 408 init_special_inode(inode, mode, dev);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index c7ac77e873b3..d6a53ed6ab6c 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -132,12 +132,8 @@ struct shared_policy {
132 spinlock_t lock; 132 spinlock_t lock;
133}; 133};
134 134
135static inline void mpol_shared_policy_init(struct shared_policy *info) 135void mpol_shared_policy_init(struct shared_policy *info, int policy,
136{ 136 nodemask_t *nodes);
137 info->root = RB_ROOT;
138 spin_lock_init(&info->lock);
139}
140
141int mpol_set_shared_policy(struct shared_policy *info, 137int mpol_set_shared_policy(struct shared_policy *info,
142 struct vm_area_struct *vma, 138 struct vm_area_struct *vma,
143 struct mempolicy *new); 139 struct mempolicy *new);
@@ -211,7 +207,8 @@ static inline int mpol_set_shared_policy(struct shared_policy *info,
211 return -EINVAL; 207 return -EINVAL;
212} 208}
213 209
214static inline void mpol_shared_policy_init(struct shared_policy *info) 210static inline void mpol_shared_policy_init(struct shared_policy *info,
211 int policy, nodemask_t *nodes)
215{ 212{
216} 213}
217 214
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index c3e598276e78..c057f0b32318 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -26,6 +26,8 @@ struct shmem_sb_info {
26 unsigned long free_blocks; /* How many are left for allocation */ 26 unsigned long free_blocks; /* How many are left for allocation */
27 unsigned long max_inodes; /* How many inodes are allowed */ 27 unsigned long max_inodes; /* How many inodes are allowed */
28 unsigned long free_inodes; /* How many are left for allocation */ 28 unsigned long free_inodes; /* How many are left for allocation */
29 int policy; /* Default NUMA memory alloc policy */
30 nodemask_t policy_nodes; /* nodemask for preferred and bind */
29 spinlock_t stat_lock; 31 spinlock_t stat_lock;
30}; 32};
31 33
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b62cab575a84..3171f884d245 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1359,6 +1359,30 @@ restart:
1359 return 0; 1359 return 0;
1360} 1360}
1361 1361
1362void mpol_shared_policy_init(struct shared_policy *info, int policy,
1363 nodemask_t *policy_nodes)
1364{
1365 info->root = RB_ROOT;
1366 spin_lock_init(&info->lock);
1367
1368 if (policy != MPOL_DEFAULT) {
1369 struct mempolicy *newpol;
1370
1371 /* Falls back to MPOL_DEFAULT on any error */
1372 newpol = mpol_new(policy, policy_nodes);
1373 if (!IS_ERR(newpol)) {
1374 /* Create pseudo-vma that contains just the policy */
1375 struct vm_area_struct pvma;
1376
1377 memset(&pvma, 0, sizeof(struct vm_area_struct));
1378 /* Policy covers entire file */
1379 pvma.vm_end = TASK_SIZE;
1380 mpol_set_shared_policy(info, &pvma, newpol);
1381 mpol_free(newpol);
1382 }
1383 }
1384}
1385
1362int mpol_set_shared_policy(struct shared_policy *info, 1386int mpol_set_shared_policy(struct shared_policy *info,
1363 struct vm_area_struct *vma, struct mempolicy *npol) 1387 struct vm_area_struct *vma, struct mempolicy *npol)
1364{ 1388{
diff --git a/mm/shmem.c b/mm/shmem.c
index 343b3c0937e5..ce501bce1c2e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1316,7 +1316,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1316 case S_IFREG: 1316 case S_IFREG:
1317 inode->i_op = &shmem_inode_operations; 1317 inode->i_op = &shmem_inode_operations;
1318 inode->i_fop = &shmem_file_operations; 1318 inode->i_fop = &shmem_file_operations;
1319 mpol_shared_policy_init(&info->policy); 1319 mpol_shared_policy_init(&info->policy, sbinfo->policy,
1320 &sbinfo->policy_nodes);
1320 break; 1321 break;
1321 case S_IFDIR: 1322 case S_IFDIR:
1322 inode->i_nlink++; 1323 inode->i_nlink++;
@@ -1330,7 +1331,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1330 * Must not load anything in the rbtree, 1331 * Must not load anything in the rbtree,
1331 * mpol_free_shared_policy will not be called. 1332 * mpol_free_shared_policy will not be called.
1332 */ 1333 */
1333 mpol_shared_policy_init(&info->policy); 1334 mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
1335 NULL);
1334 break; 1336 break;
1335 } 1337 }
1336 } else if (sbinfo->max_inodes) { 1338 } else if (sbinfo->max_inodes) {
@@ -1843,7 +1845,9 @@ static struct inode_operations shmem_symlink_inode_operations = {
1843 .put_link = shmem_put_link, 1845 .put_link = shmem_put_link,
1844}; 1846};
1845 1847
1846static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes) 1848static int shmem_parse_options(char *options, int *mode, uid_t *uid,
1849 gid_t *gid, unsigned long *blocks, unsigned long *inodes,
1850 int *policy, nodemask_t *policy_nodes)
1847{ 1851{
1848 char *this_char, *value, *rest; 1852 char *this_char, *value, *rest;
1849 1853
@@ -1897,6 +1901,19 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid,
1897 *gid = simple_strtoul(value,&rest,0); 1901 *gid = simple_strtoul(value,&rest,0);
1898 if (*rest) 1902 if (*rest)
1899 goto bad_val; 1903 goto bad_val;
1904 } else if (!strcmp(this_char,"mpol")) {
1905 if (!strcmp(value,"default"))
1906 *policy = MPOL_DEFAULT;
1907 else if (!strcmp(value,"preferred"))
1908 *policy = MPOL_PREFERRED;
1909 else if (!strcmp(value,"bind"))
1910 *policy = MPOL_BIND;
1911 else if (!strcmp(value,"interleave"))
1912 *policy = MPOL_INTERLEAVE;
1913 else
1914 goto bad_val;
1915 } else if (!strcmp(this_char,"mpol_nodelist")) {
1916 nodelist_parse(value, *policy_nodes);
1900 } else { 1917 } else {
1901 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 1918 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
1902 this_char); 1919 this_char);
@@ -1917,12 +1934,14 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
1917 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1934 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1918 unsigned long max_blocks = sbinfo->max_blocks; 1935 unsigned long max_blocks = sbinfo->max_blocks;
1919 unsigned long max_inodes = sbinfo->max_inodes; 1936 unsigned long max_inodes = sbinfo->max_inodes;
1937 int policy = sbinfo->policy;
1938 nodemask_t policy_nodes = sbinfo->policy_nodes;
1920 unsigned long blocks; 1939 unsigned long blocks;
1921 unsigned long inodes; 1940 unsigned long inodes;
1922 int error = -EINVAL; 1941 int error = -EINVAL;
1923 1942
1924 if (shmem_parse_options(data, NULL, NULL, NULL, 1943 if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
1925 &max_blocks, &max_inodes)) 1944 &max_inodes, &policy, &policy_nodes))
1926 return error; 1945 return error;
1927 1946
1928 spin_lock(&sbinfo->stat_lock); 1947 spin_lock(&sbinfo->stat_lock);
@@ -1948,6 +1967,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
1948 sbinfo->free_blocks = max_blocks - blocks; 1967 sbinfo->free_blocks = max_blocks - blocks;
1949 sbinfo->max_inodes = max_inodes; 1968 sbinfo->max_inodes = max_inodes;
1950 sbinfo->free_inodes = max_inodes - inodes; 1969 sbinfo->free_inodes = max_inodes - inodes;
1970 sbinfo->policy = policy;
1971 sbinfo->policy_nodes = policy_nodes;
1951out: 1972out:
1952 spin_unlock(&sbinfo->stat_lock); 1973 spin_unlock(&sbinfo->stat_lock);
1953 return error; 1974 return error;
@@ -1972,6 +1993,8 @@ static int shmem_fill_super(struct super_block *sb,
1972 struct shmem_sb_info *sbinfo; 1993 struct shmem_sb_info *sbinfo;
1973 unsigned long blocks = 0; 1994 unsigned long blocks = 0;
1974 unsigned long inodes = 0; 1995 unsigned long inodes = 0;
1996 int policy = MPOL_DEFAULT;
1997 nodemask_t policy_nodes = node_online_map;
1975 1998
1976#ifdef CONFIG_TMPFS 1999#ifdef CONFIG_TMPFS
1977 /* 2000 /*
@@ -1984,8 +2007,8 @@ static int shmem_fill_super(struct super_block *sb,
1984 inodes = totalram_pages - totalhigh_pages; 2007 inodes = totalram_pages - totalhigh_pages;
1985 if (inodes > blocks) 2008 if (inodes > blocks)
1986 inodes = blocks; 2009 inodes = blocks;
1987 if (shmem_parse_options(data, &mode, &uid, &gid, 2010 if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
1988 &blocks, &inodes)) 2011 &inodes, &policy, &policy_nodes))
1989 return -EINVAL; 2012 return -EINVAL;
1990 } 2013 }
1991#else 2014#else
@@ -2003,6 +2026,8 @@ static int shmem_fill_super(struct super_block *sb,
2003 sbinfo->free_blocks = blocks; 2026 sbinfo->free_blocks = blocks;
2004 sbinfo->max_inodes = inodes; 2027 sbinfo->max_inodes = inodes;
2005 sbinfo->free_inodes = inodes; 2028 sbinfo->free_inodes = inodes;
2029 sbinfo->policy = policy;
2030 sbinfo->policy_nodes = policy_nodes;
2006 2031
2007 sb->s_fs_info = sbinfo; 2032 sb->s_fs_info = sbinfo;
2008 sb->s_maxbytes = SHMEM_MAX_BYTES; 2033 sb->s_maxbytes = SHMEM_MAX_BYTES;