Diffstat (limited to 'fs/proc')
-rw-r--r--  fs/proc/base.c          462
-rw-r--r--  fs/proc/fd.c            114
-rw-r--r--  fs/proc/generic.c       100
-rw-r--r--  fs/proc/internal.h       10
-rw-r--r--  fs/proc/kcore.c           2
-rw-r--r--  fs/proc/namespaces.c     87
-rw-r--r--  fs/proc/proc_net.c        9
-rw-r--r--  fs/proc/proc_sysctl.c    78
-rw-r--r--  fs/proc/root.c           19
-rw-r--r--  fs/proc/task_mmu.c      168
-rw-r--r--  fs/proc/uptime.c          3
-rw-r--r--  fs/proc/vmcore.c        694
12 files changed, 950 insertions, 796 deletions
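
The patch below converts the procfs directory handlers from the old ->readdir()/filldir callback pair to the ->iterate()/struct dir_context interface: hand-rolled "." and ".." emission and filp->f_pos bookkeeping are replaced by dir_emit_dots(), dir_emit() and ctx->pos. For orientation, a minimal sketch of the target pattern follows; the handler name, entry list and inode numbers are made up for illustration and are not part of the patch.

#include <linux/fs.h>           /* struct dir_context, dir_emit(), dir_emit_dots() */
#include <linux/kernel.h>       /* ARRAY_SIZE() */
#include <linux/string.h>       /* strlen() */

/* Hypothetical directory contents, purely for illustration. */
static const char *const example_names[] = { "alpha", "beta" };

static int example_dir_iterate(struct file *file, struct dir_context *ctx)
{
        unsigned long i;

        /* Emits "." and ".." and advances ctx->pos past them. */
        if (!dir_emit_dots(file, ctx))
                return 0;

        for (i = ctx->pos - 2; i < ARRAY_SIZE(example_names); i++) {
                /*
                 * dir_emit() returns false once the user buffer is full;
                 * stop here and let the next getdents() call resume at
                 * the saved ctx->pos.
                 */
                if (!dir_emit(ctx, example_names[i], strlen(example_names[i]),
                              1000 + i /* hypothetical inode number */, DT_REG))
                        return 0;
                ctx->pos++;
        }
        return 0;
}

static const struct file_operations example_dir_operations = {
        .read    = generic_read_dir,
        .iterate = example_dir_iterate,
        .llseek  = default_llseek,
};
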
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c3834dad09b3..1485e38daaa3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1681,46 +1681,34 @@ const struct dentry_operations pid_dentry_operations =
1681 * reported by readdir in sync with the inode numbers reported 1681 * reported by readdir in sync with the inode numbers reported
1682 * by stat. 1682 * by stat.
1683 */ 1683 */
1684int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 1684bool proc_fill_cache(struct file *file, struct dir_context *ctx,
1685 const char *name, int len, 1685 const char *name, int len,
1686 instantiate_t instantiate, struct task_struct *task, const void *ptr) 1686 instantiate_t instantiate, struct task_struct *task, const void *ptr)
1687{ 1687{
1688 struct dentry *child, *dir = filp->f_path.dentry; 1688 struct dentry *child, *dir = file->f_path.dentry;
1689 struct qstr qname = QSTR_INIT(name, len);
1689 struct inode *inode; 1690 struct inode *inode;
1690 struct qstr qname; 1691 unsigned type;
1691 ino_t ino = 0; 1692 ino_t ino;
1692 unsigned type = DT_UNKNOWN;
1693
1694 qname.name = name;
1695 qname.len = len;
1696 qname.hash = full_name_hash(name, len);
1697 1693
1698 child = d_lookup(dir, &qname); 1694 child = d_hash_and_lookup(dir, &qname);
1699 if (!child) { 1695 if (!child) {
1700 struct dentry *new; 1696 child = d_alloc(dir, &qname);
1701 new = d_alloc(dir, &qname); 1697 if (!child)
1702 if (new) { 1698 goto end_instantiate;
1703 child = instantiate(dir->d_inode, new, task, ptr); 1699 if (instantiate(dir->d_inode, child, task, ptr) < 0) {
1704 if (child) 1700 dput(child);
1705 dput(new); 1701 goto end_instantiate;
1706 else
1707 child = new;
1708 } 1702 }
1709 } 1703 }
1710 if (!child || IS_ERR(child) || !child->d_inode)
1711 goto end_instantiate;
1712 inode = child->d_inode; 1704 inode = child->d_inode;
1713 if (inode) { 1705 ino = inode->i_ino;
1714 ino = inode->i_ino; 1706 type = inode->i_mode >> 12;
1715 type = inode->i_mode >> 12;
1716 }
1717 dput(child); 1707 dput(child);
1708 return dir_emit(ctx, name, len, ino, type);
1709
1718end_instantiate: 1710end_instantiate:
1719 if (!ino) 1711 return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
1720 ino = find_inode_number(dir, &qname);
1721 if (!ino)
1722 ino = 1;
1723 return filldir(dirent, name, len, filp->f_pos, ino, type);
1724} 1712}
1725 1713
1726#ifdef CONFIG_CHECKPOINT_RESTORE 1714#ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1846,7 +1834,7 @@ struct map_files_info {
1846 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ 1834 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
1847}; 1835};
1848 1836
1849static struct dentry * 1837static int
1850proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, 1838proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1851 struct task_struct *task, const void *ptr) 1839 struct task_struct *task, const void *ptr)
1852{ 1840{
@@ -1856,7 +1844,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1856 1844
1857 inode = proc_pid_make_inode(dir->i_sb, task); 1845 inode = proc_pid_make_inode(dir->i_sb, task);
1858 if (!inode) 1846 if (!inode)
1859 return ERR_PTR(-ENOENT); 1847 return -ENOENT;
1860 1848
1861 ei = PROC_I(inode); 1849 ei = PROC_I(inode);
1862 ei->op.proc_get_link = proc_map_files_get_link; 1850 ei->op.proc_get_link = proc_map_files_get_link;
@@ -1873,7 +1861,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1873 d_set_d_op(dentry, &tid_map_files_dentry_operations); 1861 d_set_d_op(dentry, &tid_map_files_dentry_operations);
1874 d_add(dentry, inode); 1862 d_add(dentry, inode);
1875 1863
1876 return NULL; 1864 return 0;
1877} 1865}
1878 1866
1879static struct dentry *proc_map_files_lookup(struct inode *dir, 1867static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -1882,23 +1870,23 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
1882 unsigned long vm_start, vm_end; 1870 unsigned long vm_start, vm_end;
1883 struct vm_area_struct *vma; 1871 struct vm_area_struct *vma;
1884 struct task_struct *task; 1872 struct task_struct *task;
1885 struct dentry *result; 1873 int result;
1886 struct mm_struct *mm; 1874 struct mm_struct *mm;
1887 1875
1888 result = ERR_PTR(-EPERM); 1876 result = -EPERM;
1889 if (!capable(CAP_SYS_ADMIN)) 1877 if (!capable(CAP_SYS_ADMIN))
1890 goto out; 1878 goto out;
1891 1879
1892 result = ERR_PTR(-ENOENT); 1880 result = -ENOENT;
1893 task = get_proc_task(dir); 1881 task = get_proc_task(dir);
1894 if (!task) 1882 if (!task)
1895 goto out; 1883 goto out;
1896 1884
1897 result = ERR_PTR(-EACCES); 1885 result = -EACCES;
1898 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 1886 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1899 goto out_put_task; 1887 goto out_put_task;
1900 1888
1901 result = ERR_PTR(-ENOENT); 1889 result = -ENOENT;
1902 if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) 1890 if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
1903 goto out_put_task; 1891 goto out_put_task;
1904 1892
@@ -1921,7 +1909,7 @@ out_no_vma:
1921out_put_task: 1909out_put_task:
1922 put_task_struct(task); 1910 put_task_struct(task);
1923out: 1911out:
1924 return result; 1912 return ERR_PTR(result);
1925} 1913}
1926 1914
1927static const struct inode_operations proc_map_files_inode_operations = { 1915static const struct inode_operations proc_map_files_inode_operations = {
@@ -1931,14 +1919,15 @@ static const struct inode_operations proc_map_files_inode_operations = {
1931}; 1919};
1932 1920
1933static int 1921static int
1934proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) 1922proc_map_files_readdir(struct file *file, struct dir_context *ctx)
1935{ 1923{
1936 struct dentry *dentry = filp->f_path.dentry;
1937 struct inode *inode = dentry->d_inode;
1938 struct vm_area_struct *vma; 1924 struct vm_area_struct *vma;
1939 struct task_struct *task; 1925 struct task_struct *task;
1940 struct mm_struct *mm; 1926 struct mm_struct *mm;
1941 ino_t ino; 1927 unsigned long nr_files, pos, i;
1928 struct flex_array *fa = NULL;
1929 struct map_files_info info;
1930 struct map_files_info *p;
1942 int ret; 1931 int ret;
1943 1932
1944 ret = -EPERM; 1933 ret = -EPERM;
@@ -1946,7 +1935,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
1946 goto out; 1935 goto out;
1947 1936
1948 ret = -ENOENT; 1937 ret = -ENOENT;
1949 task = get_proc_task(inode); 1938 task = get_proc_task(file_inode(file));
1950 if (!task) 1939 if (!task)
1951 goto out; 1940 goto out;
1952 1941
@@ -1955,91 +1944,73 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
1955 goto out_put_task; 1944 goto out_put_task;
1956 1945
1957 ret = 0; 1946 ret = 0;
1958 switch (filp->f_pos) { 1947 if (!dir_emit_dots(file, ctx))
1959 case 0: 1948 goto out_put_task;
1960 ino = inode->i_ino;
1961 if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
1962 goto out_put_task;
1963 filp->f_pos++;
1964 case 1:
1965 ino = parent_ino(dentry);
1966 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
1967 goto out_put_task;
1968 filp->f_pos++;
1969 default:
1970 {
1971 unsigned long nr_files, pos, i;
1972 struct flex_array *fa = NULL;
1973 struct map_files_info info;
1974 struct map_files_info *p;
1975
1976 mm = get_task_mm(task);
1977 if (!mm)
1978 goto out_put_task;
1979 down_read(&mm->mmap_sem);
1980 1949
1981 nr_files = 0; 1950 mm = get_task_mm(task);
1951 if (!mm)
1952 goto out_put_task;
1953 down_read(&mm->mmap_sem);
1982 1954
1983 /* 1955 nr_files = 0;
1984 * We need two passes here:
1985 *
1986 * 1) Collect vmas of mapped files with mmap_sem taken
1987 * 2) Release mmap_sem and instantiate entries
1988 *
1989 * otherwise we get lockdep complained, since filldir()
1990 * routine might require mmap_sem taken in might_fault().
1991 */
1992 1956
1993 for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { 1957 /*
1994 if (vma->vm_file && ++pos > filp->f_pos) 1958 * We need two passes here:
1995 nr_files++; 1959 *
1996 } 1960 * 1) Collect vmas of mapped files with mmap_sem taken
1961 * 2) Release mmap_sem and instantiate entries
1962 *
1963 * otherwise we get lockdep complained, since filldir()
1964 * routine might require mmap_sem taken in might_fault().
1965 */
1997 1966
1998 if (nr_files) { 1967 for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
1999 fa = flex_array_alloc(sizeof(info), nr_files, 1968 if (vma->vm_file && ++pos > ctx->pos)
2000 GFP_KERNEL); 1969 nr_files++;
2001 if (!fa || flex_array_prealloc(fa, 0, nr_files, 1970 }
2002 GFP_KERNEL)) { 1971
2003 ret = -ENOMEM; 1972 if (nr_files) {
2004 if (fa) 1973 fa = flex_array_alloc(sizeof(info), nr_files,
2005 flex_array_free(fa); 1974 GFP_KERNEL);
2006 up_read(&mm->mmap_sem); 1975 if (!fa || flex_array_prealloc(fa, 0, nr_files,
2007 mmput(mm); 1976 GFP_KERNEL)) {
2008 goto out_put_task; 1977 ret = -ENOMEM;
2009 } 1978 if (fa)
2010 for (i = 0, vma = mm->mmap, pos = 2; vma; 1979 flex_array_free(fa);
2011 vma = vma->vm_next) { 1980 up_read(&mm->mmap_sem);
2012 if (!vma->vm_file) 1981 mmput(mm);
2013 continue; 1982 goto out_put_task;
2014 if (++pos <= filp->f_pos)
2015 continue;
2016
2017 info.mode = vma->vm_file->f_mode;
2018 info.len = snprintf(info.name,
2019 sizeof(info.name), "%lx-%lx",
2020 vma->vm_start, vma->vm_end);
2021 if (flex_array_put(fa, i++, &info, GFP_KERNEL))
2022 BUG();
2023 }
2024 } 1983 }
2025 up_read(&mm->mmap_sem); 1984 for (i = 0, vma = mm->mmap, pos = 2; vma;
2026 1985 vma = vma->vm_next) {
2027 for (i = 0; i < nr_files; i++) { 1986 if (!vma->vm_file)
2028 p = flex_array_get(fa, i); 1987 continue;
2029 ret = proc_fill_cache(filp, dirent, filldir, 1988 if (++pos <= ctx->pos)
2030 p->name, p->len, 1989 continue;
2031 proc_map_files_instantiate, 1990
2032 task, 1991 info.mode = vma->vm_file->f_mode;
2033 (void *)(unsigned long)p->mode); 1992 info.len = snprintf(info.name,
2034 if (ret) 1993 sizeof(info.name), "%lx-%lx",
2035 break; 1994 vma->vm_start, vma->vm_end);
2036 filp->f_pos++; 1995 if (flex_array_put(fa, i++, &info, GFP_KERNEL))
1996 BUG();
2037 } 1997 }
2038 if (fa)
2039 flex_array_free(fa);
2040 mmput(mm);
2041 } 1998 }
1999 up_read(&mm->mmap_sem);
2000
2001 for (i = 0; i < nr_files; i++) {
2002 p = flex_array_get(fa, i);
2003 if (!proc_fill_cache(file, ctx,
2004 p->name, p->len,
2005 proc_map_files_instantiate,
2006 task,
2007 (void *)(unsigned long)p->mode))
2008 break;
2009 ctx->pos++;
2042 } 2010 }
2011 if (fa)
2012 flex_array_free(fa);
2013 mmput(mm);
2043 2014
2044out_put_task: 2015out_put_task:
2045 put_task_struct(task); 2016 put_task_struct(task);
@@ -2049,7 +2020,7 @@ out:
2049 2020
2050static const struct file_operations proc_map_files_operations = { 2021static const struct file_operations proc_map_files_operations = {
2051 .read = generic_read_dir, 2022 .read = generic_read_dir,
2052 .readdir = proc_map_files_readdir, 2023 .iterate = proc_map_files_readdir,
2053 .llseek = default_llseek, 2024 .llseek = default_llseek,
2054}; 2025};
2055 2026
@@ -2152,13 +2123,12 @@ static const struct file_operations proc_timers_operations = {
2152}; 2123};
2153#endif /* CONFIG_CHECKPOINT_RESTORE */ 2124#endif /* CONFIG_CHECKPOINT_RESTORE */
2154 2125
2155static struct dentry *proc_pident_instantiate(struct inode *dir, 2126static int proc_pident_instantiate(struct inode *dir,
2156 struct dentry *dentry, struct task_struct *task, const void *ptr) 2127 struct dentry *dentry, struct task_struct *task, const void *ptr)
2157{ 2128{
2158 const struct pid_entry *p = ptr; 2129 const struct pid_entry *p = ptr;
2159 struct inode *inode; 2130 struct inode *inode;
2160 struct proc_inode *ei; 2131 struct proc_inode *ei;
2161 struct dentry *error = ERR_PTR(-ENOENT);
2162 2132
2163 inode = proc_pid_make_inode(dir->i_sb, task); 2133 inode = proc_pid_make_inode(dir->i_sb, task);
2164 if (!inode) 2134 if (!inode)
@@ -2177,9 +2147,9 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
2177 d_add(dentry, inode); 2147 d_add(dentry, inode);
2178 /* Close the race of the process dying before we return the dentry */ 2148 /* Close the race of the process dying before we return the dentry */
2179 if (pid_revalidate(dentry, 0)) 2149 if (pid_revalidate(dentry, 0))
2180 error = NULL; 2150 return 0;
2181out: 2151out:
2182 return error; 2152 return -ENOENT;
2183} 2153}
2184 2154
2185static struct dentry *proc_pident_lookup(struct inode *dir, 2155static struct dentry *proc_pident_lookup(struct inode *dir,
@@ -2187,11 +2157,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
2187 const struct pid_entry *ents, 2157 const struct pid_entry *ents,
2188 unsigned int nents) 2158 unsigned int nents)
2189{ 2159{
2190 struct dentry *error; 2160 int error;
2191 struct task_struct *task = get_proc_task(dir); 2161 struct task_struct *task = get_proc_task(dir);
2192 const struct pid_entry *p, *last; 2162 const struct pid_entry *p, *last;
2193 2163
2194 error = ERR_PTR(-ENOENT); 2164 error = -ENOENT;
2195 2165
2196 if (!task) 2166 if (!task)
2197 goto out_no_task; 2167 goto out_no_task;
@@ -2214,70 +2184,33 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
2214out: 2184out:
2215 put_task_struct(task); 2185 put_task_struct(task);
2216out_no_task: 2186out_no_task:
2217 return error; 2187 return ERR_PTR(error);
2218}
2219
2220static int proc_pident_fill_cache(struct file *filp, void *dirent,
2221 filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2222{
2223 return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2224 proc_pident_instantiate, task, p);
2225} 2188}
2226 2189
2227static int proc_pident_readdir(struct file *filp, 2190static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
2228 void *dirent, filldir_t filldir,
2229 const struct pid_entry *ents, unsigned int nents) 2191 const struct pid_entry *ents, unsigned int nents)
2230{ 2192{
2231 int i; 2193 struct task_struct *task = get_proc_task(file_inode(file));
2232 struct dentry *dentry = filp->f_path.dentry; 2194 const struct pid_entry *p;
2233 struct inode *inode = dentry->d_inode;
2234 struct task_struct *task = get_proc_task(inode);
2235 const struct pid_entry *p, *last;
2236 ino_t ino;
2237 int ret;
2238 2195
2239 ret = -ENOENT;
2240 if (!task) 2196 if (!task)
2241 goto out_no_task; 2197 return -ENOENT;
2242 2198
2243 ret = 0; 2199 if (!dir_emit_dots(file, ctx))
2244 i = filp->f_pos; 2200 goto out;
2245 switch (i) { 2201
2246 case 0: 2202 if (ctx->pos >= nents + 2)
2247 ino = inode->i_ino; 2203 goto out;
2248 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
2249 goto out;
2250 i++;
2251 filp->f_pos++;
2252 /* fall through */
2253 case 1:
2254 ino = parent_ino(dentry);
2255 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
2256 goto out;
2257 i++;
2258 filp->f_pos++;
2259 /* fall through */
2260 default:
2261 i -= 2;
2262 if (i >= nents) {
2263 ret = 1;
2264 goto out;
2265 }
2266 p = ents + i;
2267 last = &ents[nents - 1];
2268 while (p <= last) {
2269 if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
2270 goto out;
2271 filp->f_pos++;
2272 p++;
2273 }
2274 }
2275 2204
2276 ret = 1; 2205 for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
2206 if (!proc_fill_cache(file, ctx, p->name, p->len,
2207 proc_pident_instantiate, task, p))
2208 break;
2209 ctx->pos++;
2210 }
2277out: 2211out:
2278 put_task_struct(task); 2212 put_task_struct(task);
2279out_no_task: 2213 return 0;
2280 return ret;
2281} 2214}
2282 2215
2283#ifdef CONFIG_SECURITY 2216#ifdef CONFIG_SECURITY
@@ -2362,16 +2295,15 @@ static const struct pid_entry attr_dir_stuff[] = {
2362 REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), 2295 REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2363}; 2296};
2364 2297
2365static int proc_attr_dir_readdir(struct file * filp, 2298static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
2366 void * dirent, filldir_t filldir)
2367{ 2299{
2368 return proc_pident_readdir(filp,dirent,filldir, 2300 return proc_pident_readdir(file, ctx,
2369 attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); 2301 attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
2370} 2302}
2371 2303
2372static const struct file_operations proc_attr_dir_operations = { 2304static const struct file_operations proc_attr_dir_operations = {
2373 .read = generic_read_dir, 2305 .read = generic_read_dir,
2374 .readdir = proc_attr_dir_readdir, 2306 .iterate = proc_attr_dir_readdir,
2375 .llseek = default_llseek, 2307 .llseek = default_llseek,
2376}; 2308};
2377 2309
@@ -2725,16 +2657,15 @@ static const struct pid_entry tgid_base_stuff[] = {
2725#endif 2657#endif
2726}; 2658};
2727 2659
2728static int proc_tgid_base_readdir(struct file * filp, 2660static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
2729 void * dirent, filldir_t filldir)
2730{ 2661{
2731 return proc_pident_readdir(filp,dirent,filldir, 2662 return proc_pident_readdir(file, ctx,
2732 tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); 2663 tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
2733} 2664}
2734 2665
2735static const struct file_operations proc_tgid_base_operations = { 2666static const struct file_operations proc_tgid_base_operations = {
2736 .read = generic_read_dir, 2667 .read = generic_read_dir,
2737 .readdir = proc_tgid_base_readdir, 2668 .iterate = proc_tgid_base_readdir,
2738 .llseek = default_llseek, 2669 .llseek = default_llseek,
2739}; 2670};
2740 2671
@@ -2836,11 +2767,10 @@ void proc_flush_task(struct task_struct *task)
2836 } 2767 }
2837} 2768}
2838 2769
2839static struct dentry *proc_pid_instantiate(struct inode *dir, 2770static int proc_pid_instantiate(struct inode *dir,
2840 struct dentry * dentry, 2771 struct dentry * dentry,
2841 struct task_struct *task, const void *ptr) 2772 struct task_struct *task, const void *ptr)
2842{ 2773{
2843 struct dentry *error = ERR_PTR(-ENOENT);
2844 struct inode *inode; 2774 struct inode *inode;
2845 2775
2846 inode = proc_pid_make_inode(dir->i_sb, task); 2776 inode = proc_pid_make_inode(dir->i_sb, task);
@@ -2860,14 +2790,14 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
2860 d_add(dentry, inode); 2790 d_add(dentry, inode);
2861 /* Close the race of the process dying before we return the dentry */ 2791 /* Close the race of the process dying before we return the dentry */
2862 if (pid_revalidate(dentry, 0)) 2792 if (pid_revalidate(dentry, 0))
2863 error = NULL; 2793 return 0;
2864out: 2794out:
2865 return error; 2795 return -ENOENT;
2866} 2796}
2867 2797
2868struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 2798struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
2869{ 2799{
2870 struct dentry *result = NULL; 2800 int result = 0;
2871 struct task_struct *task; 2801 struct task_struct *task;
2872 unsigned tgid; 2802 unsigned tgid;
2873 struct pid_namespace *ns; 2803 struct pid_namespace *ns;
@@ -2888,7 +2818,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign
2888 result = proc_pid_instantiate(dir, dentry, task, NULL); 2818 result = proc_pid_instantiate(dir, dentry, task, NULL);
2889 put_task_struct(task); 2819 put_task_struct(task);
2890out: 2820out:
2891 return result; 2821 return ERR_PTR(result);
2892} 2822}
2893 2823
2894/* 2824/*
@@ -2936,58 +2866,42 @@ retry:
2936 2866
2937#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) 2867#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1)
2938 2868
2939static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2940 struct tgid_iter iter)
2941{
2942 char name[PROC_NUMBUF];
2943 int len = snprintf(name, sizeof(name), "%d", iter.tgid);
2944 return proc_fill_cache(filp, dirent, filldir, name, len,
2945 proc_pid_instantiate, iter.task, NULL);
2946}
2947
2948static int fake_filldir(void *buf, const char *name, int namelen,
2949 loff_t offset, u64 ino, unsigned d_type)
2950{
2951 return 0;
2952}
2953
2954/* for the /proc/ directory itself, after non-process stuff has been done */ 2869/* for the /proc/ directory itself, after non-process stuff has been done */
2955int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 2870int proc_pid_readdir(struct file *file, struct dir_context *ctx)
2956{ 2871{
2957 struct tgid_iter iter; 2872 struct tgid_iter iter;
2958 struct pid_namespace *ns; 2873 struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info;
2959 filldir_t __filldir; 2874 loff_t pos = ctx->pos;
2960 loff_t pos = filp->f_pos;
2961 2875
2962 if (pos >= PID_MAX_LIMIT + TGID_OFFSET) 2876 if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
2963 goto out; 2877 return 0;
2964 2878
2965 if (pos == TGID_OFFSET - 1) { 2879 if (pos == TGID_OFFSET - 1) {
2966 if (proc_fill_cache(filp, dirent, filldir, "self", 4, 2880 struct inode *inode = ns->proc_self->d_inode;
2967 NULL, NULL, NULL) < 0) 2881 if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
2968 goto out; 2882 return 0;
2969 iter.tgid = 0; 2883 iter.tgid = 0;
2970 } else { 2884 } else {
2971 iter.tgid = pos - TGID_OFFSET; 2885 iter.tgid = pos - TGID_OFFSET;
2972 } 2886 }
2973 iter.task = NULL; 2887 iter.task = NULL;
2974 ns = filp->f_dentry->d_sb->s_fs_info;
2975 for (iter = next_tgid(ns, iter); 2888 for (iter = next_tgid(ns, iter);
2976 iter.task; 2889 iter.task;
2977 iter.tgid += 1, iter = next_tgid(ns, iter)) { 2890 iter.tgid += 1, iter = next_tgid(ns, iter)) {
2978 if (has_pid_permissions(ns, iter.task, 2)) 2891 char name[PROC_NUMBUF];
2979 __filldir = filldir; 2892 int len;
2980 else 2893 if (!has_pid_permissions(ns, iter.task, 2))
2981 __filldir = fake_filldir; 2894 continue;
2982 2895
2983 filp->f_pos = iter.tgid + TGID_OFFSET; 2896 len = snprintf(name, sizeof(name), "%d", iter.tgid);
2984 if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { 2897 ctx->pos = iter.tgid + TGID_OFFSET;
2898 if (!proc_fill_cache(file, ctx, name, len,
2899 proc_pid_instantiate, iter.task, NULL)) {
2985 put_task_struct(iter.task); 2900 put_task_struct(iter.task);
2986 goto out; 2901 return 0;
2987 } 2902 }
2988 } 2903 }
2989 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; 2904 ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
2990out:
2991 return 0; 2905 return 0;
2992} 2906}
2993 2907
@@ -3075,11 +2989,10 @@ static const struct pid_entry tid_base_stuff[] = {
3075#endif 2989#endif
3076}; 2990};
3077 2991
3078static int proc_tid_base_readdir(struct file * filp, 2992static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
3079 void * dirent, filldir_t filldir)
3080{ 2993{
3081 return proc_pident_readdir(filp,dirent,filldir, 2994 return proc_pident_readdir(file, ctx,
3082 tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); 2995 tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3083} 2996}
3084 2997
3085static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 2998static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -3090,7 +3003,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
3090 3003
3091static const struct file_operations proc_tid_base_operations = { 3004static const struct file_operations proc_tid_base_operations = {
3092 .read = generic_read_dir, 3005 .read = generic_read_dir,
3093 .readdir = proc_tid_base_readdir, 3006 .iterate = proc_tid_base_readdir,
3094 .llseek = default_llseek, 3007 .llseek = default_llseek,
3095}; 3008};
3096 3009
@@ -3100,10 +3013,9 @@ static const struct inode_operations proc_tid_base_inode_operations = {
3100 .setattr = proc_setattr, 3013 .setattr = proc_setattr,
3101}; 3014};
3102 3015
3103static struct dentry *proc_task_instantiate(struct inode *dir, 3016static int proc_task_instantiate(struct inode *dir,
3104 struct dentry *dentry, struct task_struct *task, const void *ptr) 3017 struct dentry *dentry, struct task_struct *task, const void *ptr)
3105{ 3018{
3106 struct dentry *error = ERR_PTR(-ENOENT);
3107 struct inode *inode; 3019 struct inode *inode;
3108 inode = proc_pid_make_inode(dir->i_sb, task); 3020 inode = proc_pid_make_inode(dir->i_sb, task);
3109 3021
@@ -3122,14 +3034,14 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
3122 d_add(dentry, inode); 3034 d_add(dentry, inode);
3123 /* Close the race of the process dying before we return the dentry */ 3035 /* Close the race of the process dying before we return the dentry */
3124 if (pid_revalidate(dentry, 0)) 3036 if (pid_revalidate(dentry, 0))
3125 error = NULL; 3037 return 0;
3126out: 3038out:
3127 return error; 3039 return -ENOENT;
3128} 3040}
3129 3041
3130static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 3042static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
3131{ 3043{
3132 struct dentry *result = ERR_PTR(-ENOENT); 3044 int result = -ENOENT;
3133 struct task_struct *task; 3045 struct task_struct *task;
3134 struct task_struct *leader = get_proc_task(dir); 3046 struct task_struct *leader = get_proc_task(dir);
3135 unsigned tid; 3047 unsigned tid;
@@ -3159,7 +3071,7 @@ out_drop_task:
3159out: 3071out:
3160 put_task_struct(leader); 3072 put_task_struct(leader);
3161out_no_task: 3073out_no_task:
3162 return result; 3074 return ERR_PTR(result);
3163} 3075}
3164 3076
3165/* 3077/*
@@ -3231,30 +3143,16 @@ static struct task_struct *next_tid(struct task_struct *start)
3231 return pos; 3143 return pos;
3232} 3144}
3233 3145
3234static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
3235 struct task_struct *task, int tid)
3236{
3237 char name[PROC_NUMBUF];
3238 int len = snprintf(name, sizeof(name), "%d", tid);
3239 return proc_fill_cache(filp, dirent, filldir, name, len,
3240 proc_task_instantiate, task, NULL);
3241}
3242
3243/* for the /proc/TGID/task/ directories */ 3146/* for the /proc/TGID/task/ directories */
3244static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) 3147static int proc_task_readdir(struct file *file, struct dir_context *ctx)
3245{ 3148{
3246 struct dentry *dentry = filp->f_path.dentry;
3247 struct inode *inode = dentry->d_inode;
3248 struct task_struct *leader = NULL; 3149 struct task_struct *leader = NULL;
3249 struct task_struct *task; 3150 struct task_struct *task = get_proc_task(file_inode(file));
3250 int retval = -ENOENT;
3251 ino_t ino;
3252 int tid;
3253 struct pid_namespace *ns; 3151 struct pid_namespace *ns;
3152 int tid;
3254 3153
3255 task = get_proc_task(inode);
3256 if (!task) 3154 if (!task)
3257 goto out_no_task; 3155 return -ENOENT;
3258 rcu_read_lock(); 3156 rcu_read_lock();
3259 if (pid_alive(task)) { 3157 if (pid_alive(task)) {
3260 leader = task->group_leader; 3158 leader = task->group_leader;
@@ -3263,46 +3161,36 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
3263 rcu_read_unlock(); 3161 rcu_read_unlock();
3264 put_task_struct(task); 3162 put_task_struct(task);
3265 if (!leader) 3163 if (!leader)
3266 goto out_no_task; 3164 return -ENOENT;
3267 retval = 0;
3268 3165
3269 switch ((unsigned long)filp->f_pos) { 3166 if (!dir_emit_dots(file, ctx))
3270 case 0: 3167 goto out;
3271 ino = inode->i_ino;
3272 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
3273 goto out;
3274 filp->f_pos++;
3275 /* fall through */
3276 case 1:
3277 ino = parent_ino(dentry);
3278 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
3279 goto out;
3280 filp->f_pos++;
3281 /* fall through */
3282 }
3283 3168
3284 /* f_version caches the tgid value that the last readdir call couldn't 3169 /* f_version caches the tgid value that the last readdir call couldn't
3285 * return. lseek aka telldir automagically resets f_version to 0. 3170 * return. lseek aka telldir automagically resets f_version to 0.
3286 */ 3171 */
3287 ns = filp->f_dentry->d_sb->s_fs_info; 3172 ns = file->f_dentry->d_sb->s_fs_info;
3288 tid = (int)filp->f_version; 3173 tid = (int)file->f_version;
3289 filp->f_version = 0; 3174 file->f_version = 0;
3290 for (task = first_tid(leader, tid, filp->f_pos - 2, ns); 3175 for (task = first_tid(leader, tid, ctx->pos - 2, ns);
3291 task; 3176 task;
3292 task = next_tid(task), filp->f_pos++) { 3177 task = next_tid(task), ctx->pos++) {
3178 char name[PROC_NUMBUF];
3179 int len;
3293 tid = task_pid_nr_ns(task, ns); 3180 tid = task_pid_nr_ns(task, ns);
3294 if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { 3181 len = snprintf(name, sizeof(name), "%d", tid);
3182 if (!proc_fill_cache(file, ctx, name, len,
3183 proc_task_instantiate, task, NULL)) {
3295 /* returning this tgid failed, save it as the first 3184 /* returning this tgid failed, save it as the first
3296 * pid for the next readir call */ 3185 * pid for the next readir call */
3297 filp->f_version = (u64)tid; 3186 file->f_version = (u64)tid;
3298 put_task_struct(task); 3187 put_task_struct(task);
3299 break; 3188 break;
3300 } 3189 }
3301 } 3190 }
3302out: 3191out:
3303 put_task_struct(leader); 3192 put_task_struct(leader);
3304out_no_task: 3193 return 0;
3305 return retval;
3306} 3194}
3307 3195
3308static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 3196static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -3328,6 +3216,6 @@ static const struct inode_operations proc_task_inode_operations = {
3328 3216
3329static const struct file_operations proc_task_operations = { 3217static const struct file_operations proc_task_operations = {
3330 .read = generic_read_dir, 3218 .read = generic_read_dir,
3331 .readdir = proc_task_readdir, 3219 .iterate = proc_task_readdir,
3332 .llseek = default_llseek, 3220 .llseek = default_llseek,
3333}; 3221};
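
The converted /proc listings above can be observed from ordinary userspace with getdents64(): with a deliberately small buffer the kernel fills as many entries as fit, returns, and resumes from the saved position (ctx->pos on the kernel side) on the next call. The program below is a generic illustration, not part of the patch; the struct layout follows the getdents64(2) ABI.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

struct linux_dirent64 {
        unsigned long long d_ino;
        long long          d_off;
        unsigned short     d_reclen;
        unsigned char      d_type;
        char               d_name[];
};

int main(void)
{
        char buf[512];          /* small on purpose: forces several getdents64 calls */
        int fd = open("/proc", O_RDONLY | O_DIRECTORY);
        long nread, off;

        if (fd < 0)
                return 1;
        /*
         * Each syscall returns as many entries as fit in buf; the kernel
         * resumes from the stored directory position on the next call.
         */
        while ((nread = syscall(SYS_getdents64, fd, buf, sizeof(buf))) > 0) {
                for (off = 0; off < nread; ) {
                        struct linux_dirent64 *d =
                                (struct linux_dirent64 *)(buf + off);
                        printf("%-10llu %s\n", d->d_ino, d->d_name);
                        off += d->d_reclen;
                }
        }
        close(fd);
        return 0;
}
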
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index d7a4a28ef630..75f2890abbd8 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -167,11 +167,10 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
167 return ret; 167 return ret;
168} 168}
169 169
170static struct dentry * 170static int
171proc_fd_instantiate(struct inode *dir, struct dentry *dentry, 171proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
172 struct task_struct *task, const void *ptr) 172 struct task_struct *task, const void *ptr)
173{ 173{
174 struct dentry *error = ERR_PTR(-ENOENT);
175 unsigned fd = (unsigned long)ptr; 174 unsigned fd = (unsigned long)ptr;
176 struct proc_inode *ei; 175 struct proc_inode *ei;
177 struct inode *inode; 176 struct inode *inode;
@@ -194,9 +193,9 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
194 193
195 /* Close the race of the process dying before we return the dentry */ 194 /* Close the race of the process dying before we return the dentry */
196 if (tid_fd_revalidate(dentry, 0)) 195 if (tid_fd_revalidate(dentry, 0))
197 error = NULL; 196 return 0;
198 out: 197 out:
199 return error; 198 return -ENOENT;
200} 199}
201 200
202static struct dentry *proc_lookupfd_common(struct inode *dir, 201static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -204,7 +203,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
204 instantiate_t instantiate) 203 instantiate_t instantiate)
205{ 204{
206 struct task_struct *task = get_proc_task(dir); 205 struct task_struct *task = get_proc_task(dir);
207 struct dentry *result = ERR_PTR(-ENOENT); 206 int result = -ENOENT;
208 unsigned fd = name_to_int(dentry); 207 unsigned fd = name_to_int(dentry);
209 208
210 if (!task) 209 if (!task)
@@ -216,77 +215,61 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
216out: 215out:
217 put_task_struct(task); 216 put_task_struct(task);
218out_no_task: 217out_no_task:
219 return result; 218 return ERR_PTR(result);
220} 219}
221 220
222static int proc_readfd_common(struct file * filp, void * dirent, 221static int proc_readfd_common(struct file *file, struct dir_context *ctx,
223 filldir_t filldir, instantiate_t instantiate) 222 instantiate_t instantiate)
224{ 223{
225 struct dentry *dentry = filp->f_path.dentry; 224 struct task_struct *p = get_proc_task(file_inode(file));
226 struct inode *inode = dentry->d_inode;
227 struct task_struct *p = get_proc_task(inode);
228 struct files_struct *files; 225 struct files_struct *files;
229 unsigned int fd, ino; 226 unsigned int fd;
230 int retval;
231 227
232 retval = -ENOENT;
233 if (!p) 228 if (!p)
234 goto out_no_task; 229 return -ENOENT;
235 retval = 0;
236
237 fd = filp->f_pos;
238 switch (fd) {
239 case 0:
240 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
241 goto out;
242 filp->f_pos++;
243 case 1:
244 ino = parent_ino(dentry);
245 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
246 goto out;
247 filp->f_pos++;
248 default:
249 files = get_files_struct(p);
250 if (!files)
251 goto out;
252 rcu_read_lock();
253 for (fd = filp->f_pos - 2;
254 fd < files_fdtable(files)->max_fds;
255 fd++, filp->f_pos++) {
256 char name[PROC_NUMBUF];
257 int len;
258 int rv;
259
260 if (!fcheck_files(files, fd))
261 continue;
262 rcu_read_unlock();
263 230
264 len = snprintf(name, sizeof(name), "%d", fd); 231 if (!dir_emit_dots(file, ctx))
265 rv = proc_fill_cache(filp, dirent, filldir, 232 goto out;
266 name, len, instantiate, p, 233 if (!dir_emit_dots(file, ctx))
267 (void *)(unsigned long)fd); 234 goto out;
268 if (rv < 0) 235 files = get_files_struct(p);
269 goto out_fd_loop; 236 if (!files)
270 rcu_read_lock(); 237 goto out;
271 } 238
272 rcu_read_unlock(); 239 rcu_read_lock();
273out_fd_loop: 240 for (fd = ctx->pos - 2;
274 put_files_struct(files); 241 fd < files_fdtable(files)->max_fds;
242 fd++, ctx->pos++) {
243 char name[PROC_NUMBUF];
244 int len;
245
246 if (!fcheck_files(files, fd))
247 continue;
248 rcu_read_unlock();
249
250 len = snprintf(name, sizeof(name), "%d", fd);
251 if (!proc_fill_cache(file, ctx,
252 name, len, instantiate, p,
253 (void *)(unsigned long)fd))
254 goto out_fd_loop;
255 rcu_read_lock();
275 } 256 }
257 rcu_read_unlock();
258out_fd_loop:
259 put_files_struct(files);
276out: 260out:
277 put_task_struct(p); 261 put_task_struct(p);
278out_no_task: 262 return 0;
279 return retval;
280} 263}
281 264
282static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) 265static int proc_readfd(struct file *file, struct dir_context *ctx)
283{ 266{
284 return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); 267 return proc_readfd_common(file, ctx, proc_fd_instantiate);
285} 268}
286 269
287const struct file_operations proc_fd_operations = { 270const struct file_operations proc_fd_operations = {
288 .read = generic_read_dir, 271 .read = generic_read_dir,
289 .readdir = proc_readfd, 272 .iterate = proc_readfd,
290 .llseek = default_llseek, 273 .llseek = default_llseek,
291}; 274};
292 275
@@ -316,11 +299,10 @@ const struct inode_operations proc_fd_inode_operations = {
316 .setattr = proc_setattr, 299 .setattr = proc_setattr,
317}; 300};
318 301
319static struct dentry * 302static int
320proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, 303proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
321 struct task_struct *task, const void *ptr) 304 struct task_struct *task, const void *ptr)
322{ 305{
323 struct dentry *error = ERR_PTR(-ENOENT);
324 unsigned fd = (unsigned long)ptr; 306 unsigned fd = (unsigned long)ptr;
325 struct proc_inode *ei; 307 struct proc_inode *ei;
326 struct inode *inode; 308 struct inode *inode;
@@ -340,9 +322,9 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
340 322
341 /* Close the race of the process dying before we return the dentry */ 323 /* Close the race of the process dying before we return the dentry */
342 if (tid_fd_revalidate(dentry, 0)) 324 if (tid_fd_revalidate(dentry, 0))
343 error = NULL; 325 return 0;
344 out: 326 out:
345 return error; 327 return -ENOENT;
346} 328}
347 329
348static struct dentry * 330static struct dentry *
@@ -351,9 +333,9 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
351 return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); 333 return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
352} 334}
353 335
354static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) 336static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
355{ 337{
356 return proc_readfd_common(filp, dirent, filldir, 338 return proc_readfd_common(file, ctx,
357 proc_fdinfo_instantiate); 339 proc_fdinfo_instantiate);
358} 340}
359 341
@@ -364,6 +346,6 @@ const struct inode_operations proc_fdinfo_inode_operations = {
364 346
365const struct file_operations proc_fdinfo_operations = { 347const struct file_operations proc_fdinfo_operations = {
366 .read = generic_read_dir, 348 .read = generic_read_dir,
367 .readdir = proc_readfdinfo, 349 .iterate = proc_readfdinfo,
368 .llseek = default_llseek, 350 .llseek = default_llseek,
369}; 351};
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index a2596afffae6..94441a407337 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -233,76 +233,52 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
233 * value of the readdir() call, as long as it's non-negative 233 * value of the readdir() call, as long as it's non-negative
234 * for success.. 234 * for success..
235 */ 235 */
236int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, 236int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
237 filldir_t filldir) 237 struct dir_context *ctx)
238{ 238{
239 unsigned int ino;
240 int i; 239 int i;
241 struct inode *inode = file_inode(filp);
242 int ret = 0;
243
244 ino = inode->i_ino;
245 i = filp->f_pos;
246 switch (i) {
247 case 0:
248 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
249 goto out;
250 i++;
251 filp->f_pos++;
252 /* fall through */
253 case 1:
254 if (filldir(dirent, "..", 2, i,
255 parent_ino(filp->f_path.dentry),
256 DT_DIR) < 0)
257 goto out;
258 i++;
259 filp->f_pos++;
260 /* fall through */
261 default:
262 spin_lock(&proc_subdir_lock);
263 de = de->subdir;
264 i -= 2;
265 for (;;) {
266 if (!de) {
267 ret = 1;
268 spin_unlock(&proc_subdir_lock);
269 goto out;
270 }
271 if (!i)
272 break;
273 de = de->next;
274 i--;
275 }
276 240
277 do { 241 if (!dir_emit_dots(file, ctx))
278 struct proc_dir_entry *next; 242 return 0;
279 243
280 /* filldir passes info to user space */ 244 spin_lock(&proc_subdir_lock);
281 pde_get(de); 245 de = de->subdir;
282 spin_unlock(&proc_subdir_lock); 246 i = ctx->pos - 2;
283 if (filldir(dirent, de->name, de->namelen, filp->f_pos, 247 for (;;) {
284 de->low_ino, de->mode >> 12) < 0) { 248 if (!de) {
285 pde_put(de);
286 goto out;
287 }
288 spin_lock(&proc_subdir_lock);
289 filp->f_pos++;
290 next = de->next;
291 pde_put(de);
292 de = next;
293 } while (de);
294 spin_unlock(&proc_subdir_lock); 249 spin_unlock(&proc_subdir_lock);
250 return 0;
251 }
252 if (!i)
253 break;
254 de = de->next;
255 i--;
295 } 256 }
296 ret = 1; 257
297out: 258 do {
298 return ret; 259 struct proc_dir_entry *next;
260 pde_get(de);
261 spin_unlock(&proc_subdir_lock);
262 if (!dir_emit(ctx, de->name, de->namelen,
263 de->low_ino, de->mode >> 12)) {
264 pde_put(de);
265 return 0;
266 }
267 spin_lock(&proc_subdir_lock);
268 ctx->pos++;
269 next = de->next;
270 pde_put(de);
271 de = next;
272 } while (de);
273 spin_unlock(&proc_subdir_lock);
274 return 0;
299} 275}
300 276
301int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) 277int proc_readdir(struct file *file, struct dir_context *ctx)
302{ 278{
303 struct inode *inode = file_inode(filp); 279 struct inode *inode = file_inode(file);
304 280
305 return proc_readdir_de(PDE(inode), filp, dirent, filldir); 281 return proc_readdir_de(PDE(inode), file, ctx);
306} 282}
307 283
308/* 284/*
@@ -313,7 +289,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
313static const struct file_operations proc_dir_operations = { 289static const struct file_operations proc_dir_operations = {
314 .llseek = generic_file_llseek, 290 .llseek = generic_file_llseek,
315 .read = generic_read_dir, 291 .read = generic_read_dir,
316 .readdir = proc_readdir, 292 .iterate = proc_readdir,
317}; 293};
318 294
319/* 295/*
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index d600fb098b6a..651d09a11dde 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -165,14 +165,14 @@ extern int proc_setattr(struct dentry *, struct iattr *);
165extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); 165extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
166extern int pid_revalidate(struct dentry *, unsigned int); 166extern int pid_revalidate(struct dentry *, unsigned int);
167extern int pid_delete_dentry(const struct dentry *); 167extern int pid_delete_dentry(const struct dentry *);
168extern int proc_pid_readdir(struct file *, void *, filldir_t); 168extern int proc_pid_readdir(struct file *, struct dir_context *);
169extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); 169extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
170extern loff_t mem_lseek(struct file *, loff_t, int); 170extern loff_t mem_lseek(struct file *, loff_t, int);
171 171
172/* Lookups */ 172/* Lookups */
173typedef struct dentry *instantiate_t(struct inode *, struct dentry *, 173typedef int instantiate_t(struct inode *, struct dentry *,
174 struct task_struct *, const void *); 174 struct task_struct *, const void *);
175extern int proc_fill_cache(struct file *, void *, filldir_t, const char *, int, 175extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
176 instantiate_t, struct task_struct *, const void *); 176 instantiate_t, struct task_struct *, const void *);
177 177
178/* 178/*
@@ -183,8 +183,8 @@ extern spinlock_t proc_subdir_lock;
183extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); 183extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
184extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, 184extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
185 struct dentry *); 185 struct dentry *);
186extern int proc_readdir(struct file *, void *, filldir_t); 186extern int proc_readdir(struct file *, struct dir_context *);
187extern int proc_readdir_de(struct proc_dir_entry *, struct file *, void *, filldir_t); 187extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);
188 188
189static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) 189static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
190{ 190{
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 0a22194e5d58..06ea155e1a59 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -408,7 +408,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
408 prpsinfo.pr_zomb = 0; 408 prpsinfo.pr_zomb = 0;
409 409
410 strcpy(prpsinfo.pr_fname, "vmlinux"); 410 strcpy(prpsinfo.pr_fname, "vmlinux");
411 strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ); 411 strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
412 412
413 nhdr->p_filesz += notesize(&notes[1]); 413 nhdr->p_filesz += notesize(&notes[1]);
414 bufp = storenote(&notes[1], bufp); 414 bufp = storenote(&notes[1], bufp);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 54bdc6701e9f..49a7fff2e83a 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -187,13 +187,12 @@ static const struct inode_operations proc_ns_link_inode_operations = {
187 .setattr = proc_setattr, 187 .setattr = proc_setattr,
188}; 188};
189 189
190static struct dentry *proc_ns_instantiate(struct inode *dir, 190static int proc_ns_instantiate(struct inode *dir,
191 struct dentry *dentry, struct task_struct *task, const void *ptr) 191 struct dentry *dentry, struct task_struct *task, const void *ptr)
192{ 192{
193 const struct proc_ns_operations *ns_ops = ptr; 193 const struct proc_ns_operations *ns_ops = ptr;
194 struct inode *inode; 194 struct inode *inode;
195 struct proc_inode *ei; 195 struct proc_inode *ei;
196 struct dentry *error = ERR_PTR(-ENOENT);
197 196
198 inode = proc_pid_make_inode(dir->i_sb, task); 197 inode = proc_pid_make_inode(dir->i_sb, task);
199 if (!inode) 198 if (!inode)
@@ -208,90 +207,52 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
208 d_add(dentry, inode); 207 d_add(dentry, inode);
209 /* Close the race of the process dying before we return the dentry */ 208 /* Close the race of the process dying before we return the dentry */
210 if (pid_revalidate(dentry, 0)) 209 if (pid_revalidate(dentry, 0))
211 error = NULL; 210 return 0;
212out: 211out:
213 return error; 212 return -ENOENT;
214}
215
216static int proc_ns_fill_cache(struct file *filp, void *dirent,
217 filldir_t filldir, struct task_struct *task,
218 const struct proc_ns_operations *ops)
219{
220 return proc_fill_cache(filp, dirent, filldir,
221 ops->name, strlen(ops->name),
222 proc_ns_instantiate, task, ops);
223} 213}
224 214
225static int proc_ns_dir_readdir(struct file *filp, void *dirent, 215static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
226 filldir_t filldir)
227{ 216{
228 int i; 217 struct task_struct *task = get_proc_task(file_inode(file));
229 struct dentry *dentry = filp->f_path.dentry;
230 struct inode *inode = dentry->d_inode;
231 struct task_struct *task = get_proc_task(inode);
232 const struct proc_ns_operations **entry, **last; 218 const struct proc_ns_operations **entry, **last;
233 ino_t ino;
234 int ret;
235 219
236 ret = -ENOENT;
237 if (!task) 220 if (!task)
238 goto out_no_task; 221 return -ENOENT;
239 222
240 ret = 0; 223 if (!dir_emit_dots(file, ctx))
241 i = filp->f_pos; 224 goto out;
242 switch (i) { 225 if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
243 case 0: 226 goto out;
244 ino = inode->i_ino; 227 entry = ns_entries + (ctx->pos - 2);
245 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) 228 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
246 goto out; 229 while (entry <= last) {
247 i++; 230 const struct proc_ns_operations *ops = *entry;
248 filp->f_pos++; 231 if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
249 /* fall through */ 232 proc_ns_instantiate, task, ops))
250 case 1: 233 break;
251 ino = parent_ino(dentry); 234 ctx->pos++;
252 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) 235 entry++;
253 goto out;
254 i++;
255 filp->f_pos++;
256 /* fall through */
257 default:
258 i -= 2;
259 if (i >= ARRAY_SIZE(ns_entries)) {
260 ret = 1;
261 goto out;
262 }
263 entry = ns_entries + i;
264 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
265 while (entry <= last) {
266 if (proc_ns_fill_cache(filp, dirent, filldir,
267 task, *entry) < 0)
268 goto out;
269 filp->f_pos++;
270 entry++;
271 }
272 } 236 }
273
274 ret = 1;
275out: 237out:
276 put_task_struct(task); 238 put_task_struct(task);
277out_no_task: 239 return 0;
278 return ret;
279} 240}
280 241
281const struct file_operations proc_ns_dir_operations = { 242const struct file_operations proc_ns_dir_operations = {
282 .read = generic_read_dir, 243 .read = generic_read_dir,
283 .readdir = proc_ns_dir_readdir, 244 .iterate = proc_ns_dir_readdir,
284}; 245};
285 246
286static struct dentry *proc_ns_dir_lookup(struct inode *dir, 247static struct dentry *proc_ns_dir_lookup(struct inode *dir,
287 struct dentry *dentry, unsigned int flags) 248 struct dentry *dentry, unsigned int flags)
288{ 249{
289 struct dentry *error; 250 int error;
290 struct task_struct *task = get_proc_task(dir); 251 struct task_struct *task = get_proc_task(dir);
291 const struct proc_ns_operations **entry, **last; 252 const struct proc_ns_operations **entry, **last;
292 unsigned int len = dentry->d_name.len; 253 unsigned int len = dentry->d_name.len;
293 254
294 error = ERR_PTR(-ENOENT); 255 error = -ENOENT;
295 256
296 if (!task) 257 if (!task)
297 goto out_no_task; 258 goto out_no_task;
@@ -310,7 +271,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
310out: 271out:
311 put_task_struct(task); 272 put_task_struct(task);
312out_no_task: 273out_no_task:
313 return error; 274 return ERR_PTR(error);
314} 275}
315 276
316const struct inode_operations proc_ns_dir_inode_operations = { 277const struct inode_operations proc_ns_dir_inode_operations = {
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 986e83220d56..4677bb7dc7c2 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -160,16 +160,15 @@ const struct inode_operations proc_net_inode_operations = {
160 .getattr = proc_tgid_net_getattr, 160 .getattr = proc_tgid_net_getattr,
161}; 161};
162 162
163static int proc_tgid_net_readdir(struct file *filp, void *dirent, 163static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
164 filldir_t filldir)
165{ 164{
166 int ret; 165 int ret;
167 struct net *net; 166 struct net *net;
168 167
169 ret = -EINVAL; 168 ret = -EINVAL;
170 net = get_proc_task_net(file_inode(filp)); 169 net = get_proc_task_net(file_inode(file));
171 if (net != NULL) { 170 if (net != NULL) {
172 ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); 171 ret = proc_readdir_de(net->proc_net, file, ctx);
173 put_net(net); 172 put_net(net);
174 } 173 }
175 return ret; 174 return ret;
@@ -178,7 +177,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
178const struct file_operations proc_net_operations = { 177const struct file_operations proc_net_operations = {
179 .llseek = generic_file_llseek, 178 .llseek = generic_file_llseek,
180 .read = generic_read_dir, 179 .read = generic_read_dir,
181 .readdir = proc_tgid_net_readdir, 180 .iterate = proc_tgid_net_readdir,
182}; 181};
183 182
184static __net_init int proc_net_ns_init(struct net *net) 183static __net_init int proc_net_ns_init(struct net *net)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ac05f33a0dde..71290463a1d3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -573,12 +573,12 @@ out:
573 return ret; 573 return ret;
574} 574}
575 575
576static int proc_sys_fill_cache(struct file *filp, void *dirent, 576static bool proc_sys_fill_cache(struct file *file,
577 filldir_t filldir, 577 struct dir_context *ctx,
578 struct ctl_table_header *head, 578 struct ctl_table_header *head,
579 struct ctl_table *table) 579 struct ctl_table *table)
580{ 580{
581 struct dentry *child, *dir = filp->f_path.dentry; 581 struct dentry *child, *dir = file->f_path.dentry;
582 struct inode *inode; 582 struct inode *inode;
583 struct qstr qname; 583 struct qstr qname;
584 ino_t ino = 0; 584 ino_t ino = 0;
@@ -595,38 +595,38 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
595 inode = proc_sys_make_inode(dir->d_sb, head, table); 595 inode = proc_sys_make_inode(dir->d_sb, head, table);
596 if (!inode) { 596 if (!inode) {
597 dput(child); 597 dput(child);
598 return -ENOMEM; 598 return false;
599 } else { 599 } else {
600 d_set_d_op(child, &proc_sys_dentry_operations); 600 d_set_d_op(child, &proc_sys_dentry_operations);
601 d_add(child, inode); 601 d_add(child, inode);
602 } 602 }
603 } else { 603 } else {
604 return -ENOMEM; 604 return false;
605 } 605 }
606 } 606 }
607 inode = child->d_inode; 607 inode = child->d_inode;
608 ino = inode->i_ino; 608 ino = inode->i_ino;
609 type = inode->i_mode >> 12; 609 type = inode->i_mode >> 12;
610 dput(child); 610 dput(child);
611 return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); 611 return dir_emit(ctx, qname.name, qname.len, ino, type);
612} 612}
613 613
614static int proc_sys_link_fill_cache(struct file *filp, void *dirent, 614static bool proc_sys_link_fill_cache(struct file *file,
615 filldir_t filldir, 615 struct dir_context *ctx,
616 struct ctl_table_header *head, 616 struct ctl_table_header *head,
617 struct ctl_table *table) 617 struct ctl_table *table)
618{ 618{
619 int err, ret = 0; 619 bool ret = true;
620 head = sysctl_head_grab(head); 620 head = sysctl_head_grab(head);
621 621
622 if (S_ISLNK(table->mode)) { 622 if (S_ISLNK(table->mode)) {
623 /* It is not an error if we can not follow the link ignore it */ 623 /* It is not an error if we can not follow the link ignore it */
624 err = sysctl_follow_link(&head, &table, current->nsproxy); 624 int err = sysctl_follow_link(&head, &table, current->nsproxy);
625 if (err) 625 if (err)
626 goto out; 626 goto out;
627 } 627 }
628 628
629 ret = proc_sys_fill_cache(filp, dirent, filldir, head, table); 629 ret = proc_sys_fill_cache(file, ctx, head, table);
630out: 630out:
631 sysctl_head_finish(head); 631 sysctl_head_finish(head);
632 return ret; 632 return ret;
@@ -634,67 +634,50 @@ out:
634 634
635static int scan(struct ctl_table_header *head, ctl_table *table, 635static int scan(struct ctl_table_header *head, ctl_table *table,
636 unsigned long *pos, struct file *file, 636 unsigned long *pos, struct file *file,
637 void *dirent, filldir_t filldir) 637 struct dir_context *ctx)
638{ 638{
639 int res; 639 bool res;
640 640
641 if ((*pos)++ < file->f_pos) 641 if ((*pos)++ < ctx->pos)
642 return 0; 642 return true;
643 643
644 if (unlikely(S_ISLNK(table->mode))) 644 if (unlikely(S_ISLNK(table->mode)))
645 res = proc_sys_link_fill_cache(file, dirent, filldir, head, table); 645 res = proc_sys_link_fill_cache(file, ctx, head, table);
646 else 646 else
647 res = proc_sys_fill_cache(file, dirent, filldir, head, table); 647 res = proc_sys_fill_cache(file, ctx, head, table);
648 648
649 if (res == 0) 649 if (res)
650 file->f_pos = *pos; 650 ctx->pos = *pos;
651 651
652 return res; 652 return res;
653} 653}
654 654
655static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) 655static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
656{ 656{
657 struct dentry *dentry = filp->f_path.dentry; 657 struct ctl_table_header *head = grab_header(file_inode(file));
658 struct inode *inode = dentry->d_inode;
659 struct ctl_table_header *head = grab_header(inode);
660 struct ctl_table_header *h = NULL; 658 struct ctl_table_header *h = NULL;
661 struct ctl_table *entry; 659 struct ctl_table *entry;
662 struct ctl_dir *ctl_dir; 660 struct ctl_dir *ctl_dir;
663 unsigned long pos; 661 unsigned long pos;
664 int ret = -EINVAL;
665 662
666 if (IS_ERR(head)) 663 if (IS_ERR(head))
667 return PTR_ERR(head); 664 return PTR_ERR(head);
668 665
669 ctl_dir = container_of(head, struct ctl_dir, header); 666 ctl_dir = container_of(head, struct ctl_dir, header);
670 667
671 ret = 0; 668 if (!dir_emit_dots(file, ctx))
672 /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ 669 return 0;
673 if (filp->f_pos == 0) { 670
674 if (filldir(dirent, ".", 1, filp->f_pos,
675 inode->i_ino, DT_DIR) < 0)
676 goto out;
677 filp->f_pos++;
678 }
679 if (filp->f_pos == 1) {
680 if (filldir(dirent, "..", 2, filp->f_pos,
681 parent_ino(dentry), DT_DIR) < 0)
682 goto out;
683 filp->f_pos++;
684 }
685 pos = 2; 671 pos = 2;
686 672
687 for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { 673 for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
688 ret = scan(h, entry, &pos, filp, dirent, filldir); 674 if (!scan(h, entry, &pos, file, ctx)) {
689 if (ret) {
690 sysctl_head_finish(h); 675 sysctl_head_finish(h);
691 break; 676 break;
692 } 677 }
693 } 678 }
694 ret = 1;
695out:
696 sysctl_head_finish(head); 679 sysctl_head_finish(head);
697 return ret; 680 return 0;
698} 681}
699 682
700static int proc_sys_permission(struct inode *inode, int mask) 683static int proc_sys_permission(struct inode *inode, int mask)
@@ -769,7 +752,7 @@ static const struct file_operations proc_sys_file_operations = {
769 752
770static const struct file_operations proc_sys_dir_file_operations = { 753static const struct file_operations proc_sys_dir_file_operations = {
771 .read = generic_read_dir, 754 .read = generic_read_dir,
772 .readdir = proc_sys_readdir, 755 .iterate = proc_sys_readdir,
773 .llseek = generic_file_llseek, 756 .llseek = generic_file_llseek,
774}; 757};
775 758
@@ -813,15 +796,16 @@ static int sysctl_is_seen(struct ctl_table_header *p)
813 return res; 796 return res;
814} 797}
815 798
816static int proc_sys_compare(const struct dentry *parent, 799static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry,
817 const struct inode *pinode,
818 const struct dentry *dentry, const struct inode *inode,
819 unsigned int len, const char *str, const struct qstr *name) 800 unsigned int len, const char *str, const struct qstr *name)
820{ 801{
821 struct ctl_table_header *head; 802 struct ctl_table_header *head;
803 struct inode *inode;
804
822 /* Although proc doesn't have negative dentries, rcu-walk means 805 /* Although proc doesn't have negative dentries, rcu-walk means
823 * that inode here can be NULL */ 806 * that inode here can be NULL */
824 /* AV: can it, indeed? */ 807 /* AV: can it, indeed? */
808 inode = ACCESS_ONCE(dentry->d_inode);
825 if (!inode) 809 if (!inode)
826 return 1; 810 return 1;
827 if (name->len != len) 811 if (name->len != len)
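For readers following the readdir() to iterate() conversion above, a minimal sketch of the dir_context pattern that proc_sys_readdir() now uses. This is not proc code: the filesystem name, entry table and inode numbers below are hypothetical, but dir_emit_dots() and dir_emit() are the helpers the converted code relies on.

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/string.h>

static const char * const example_names[] = { "alpha", "beta" };

static int example_iterate(struct file *file, struct dir_context *ctx)
{
	unsigned int i;

	/* dir_emit_dots() emits "." and ".." and advances ctx->pos to 2. */
	if (!dir_emit_dots(file, ctx))
		return 0;

	for (i = ctx->pos - 2; i < ARRAY_SIZE(example_names); i++) {
		/* dir_emit() returns false once the user buffer is full. */
		if (!dir_emit(ctx, example_names[i], strlen(example_names[i]),
			      i + 100 /* hypothetical inode number */, DT_REG))
			return 0;
		ctx->pos++;
	}
	return 0;
}

static const struct file_operations example_dir_fops = {
	.read		= generic_read_dir,
	.iterate	= example_iterate,
	.llseek		= generic_file_llseek,
};

dir_emit() returning false signals a full buffer, which is why scan() above only advances ctx->pos when proc_sys_fill_cache() reports success.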
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 41a6ea93f486..229e366598da 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -202,21 +202,14 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr
202 return proc_pid_lookup(dir, dentry, flags); 202 return proc_pid_lookup(dir, dentry, flags);
203} 203}
204 204
205static int proc_root_readdir(struct file * filp, 205static int proc_root_readdir(struct file *file, struct dir_context *ctx)
206 void * dirent, filldir_t filldir)
207{ 206{
208 unsigned int nr = filp->f_pos; 207 if (ctx->pos < FIRST_PROCESS_ENTRY) {
209 int ret; 208 proc_readdir(file, ctx);
210 209 ctx->pos = FIRST_PROCESS_ENTRY;
211 if (nr < FIRST_PROCESS_ENTRY) {
212 int error = proc_readdir(filp, dirent, filldir);
213 if (error <= 0)
214 return error;
215 filp->f_pos = FIRST_PROCESS_ENTRY;
216 } 210 }
217 211
218 ret = proc_pid_readdir(filp, dirent, filldir); 212 return proc_pid_readdir(file, ctx);
219 return ret;
220} 213}
221 214
222/* 215/*
@@ -226,7 +219,7 @@ static int proc_root_readdir(struct file * filp,
226 */ 219 */
227static const struct file_operations proc_root_operations = { 220static const struct file_operations proc_root_operations = {
228 .read = generic_read_dir, 221 .read = generic_read_dir,
229 .readdir = proc_root_readdir, 222 .iterate = proc_root_readdir,
230 .llseek = default_llseek, 223 .llseek = default_llseek,
231}; 224};
232 225
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d864d56..107d026f5d6e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
11#include <linux/rmap.h> 11#include <linux/rmap.h>
12#include <linux/swap.h> 12#include <linux/swap.h>
13#include <linux/swapops.h> 13#include <linux/swapops.h>
14#include <linux/mmu_notifier.h>
14 15
15#include <asm/elf.h> 16#include <asm/elf.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
@@ -688,10 +689,66 @@ const struct file_operations proc_tid_smaps_operations = {
688 .release = seq_release_private, 689 .release = seq_release_private,
689}; 690};
690 691
692/*
693 * We do not want to have constant page-shift bits sitting in
694 * pagemap entries and are about to reuse them some time soon.
695 *
696 * Here's the "migration strategy":
697 * 1. when the system boots these bits remain what they are,
698 * but a warning about the future change is printed in the log;
699 * 2. once anyone clears soft-dirty bits via the clear_refs file,
700 * this flag is set to denote that the user is aware of the
701 * new API and that the page-shift bits change their meaning.
702 * The respective warning is printed in dmesg;
703 * 3. In a couple of releases we will remove all the mentions
704 * of page-shift in pagemap entries.
705 */
706
707static bool soft_dirty_cleared __read_mostly;
708
709enum clear_refs_types {
710 CLEAR_REFS_ALL = 1,
711 CLEAR_REFS_ANON,
712 CLEAR_REFS_MAPPED,
713 CLEAR_REFS_SOFT_DIRTY,
714 CLEAR_REFS_LAST,
715};
716
717struct clear_refs_private {
718 struct vm_area_struct *vma;
719 enum clear_refs_types type;
720};
721
722static inline void clear_soft_dirty(struct vm_area_struct *vma,
723 unsigned long addr, pte_t *pte)
724{
725#ifdef CONFIG_MEM_SOFT_DIRTY
726 /*
727 * The soft-dirty tracker uses #PF-s to catch writes
728 * to pages, so write-protect the pte as well. See the
729 * Documentation/vm/soft-dirty.txt for full description
730 * of how soft-dirty works.
731 */
732 pte_t ptent = *pte;
733
734 if (pte_present(ptent)) {
735 ptent = pte_wrprotect(ptent);
736 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
737 } else if (is_swap_pte(ptent)) {
738 ptent = pte_swp_clear_soft_dirty(ptent);
739 } else if (pte_file(ptent)) {
740 ptent = pte_file_clear_soft_dirty(ptent);
741 }
742
743 set_pte_at(vma->vm_mm, addr, pte, ptent);
744#endif
745}
746
691static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 747static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
692 unsigned long end, struct mm_walk *walk) 748 unsigned long end, struct mm_walk *walk)
693{ 749{
694 struct vm_area_struct *vma = walk->private; 750 struct clear_refs_private *cp = walk->private;
751 struct vm_area_struct *vma = cp->vma;
695 pte_t *pte, ptent; 752 pte_t *pte, ptent;
696 spinlock_t *ptl; 753 spinlock_t *ptl;
697 struct page *page; 754 struct page *page;
@@ -703,6 +760,12 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
703 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 760 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
704 for (; addr != end; pte++, addr += PAGE_SIZE) { 761 for (; addr != end; pte++, addr += PAGE_SIZE) {
705 ptent = *pte; 762 ptent = *pte;
763
764 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
765 clear_soft_dirty(vma, addr, pte);
766 continue;
767 }
768
706 if (!pte_present(ptent)) 769 if (!pte_present(ptent))
707 continue; 770 continue;
708 771
@@ -719,10 +782,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
719 return 0; 782 return 0;
720} 783}
721 784
722#define CLEAR_REFS_ALL 1
723#define CLEAR_REFS_ANON 2
724#define CLEAR_REFS_MAPPED 3
725
726static ssize_t clear_refs_write(struct file *file, const char __user *buf, 785static ssize_t clear_refs_write(struct file *file, const char __user *buf,
727 size_t count, loff_t *ppos) 786 size_t count, loff_t *ppos)
728{ 787{
@@ -730,7 +789,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
730 char buffer[PROC_NUMBUF]; 789 char buffer[PROC_NUMBUF];
731 struct mm_struct *mm; 790 struct mm_struct *mm;
732 struct vm_area_struct *vma; 791 struct vm_area_struct *vma;
733 int type; 792 enum clear_refs_types type;
793 int itype;
734 int rv; 794 int rv;
735 795
736 memset(buffer, 0, sizeof(buffer)); 796 memset(buffer, 0, sizeof(buffer));
@@ -738,23 +798,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
738 count = sizeof(buffer) - 1; 798 count = sizeof(buffer) - 1;
739 if (copy_from_user(buffer, buf, count)) 799 if (copy_from_user(buffer, buf, count))
740 return -EFAULT; 800 return -EFAULT;
741 rv = kstrtoint(strstrip(buffer), 10, &type); 801 rv = kstrtoint(strstrip(buffer), 10, &itype);
742 if (rv < 0) 802 if (rv < 0)
743 return rv; 803 return rv;
744 if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) 804 type = (enum clear_refs_types)itype;
805 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
745 return -EINVAL; 806 return -EINVAL;
807
808 if (type == CLEAR_REFS_SOFT_DIRTY) {
809 soft_dirty_cleared = true;
810 pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
811 "See the linux/Documentation/vm/pagemap.txt for details.\n");
812 }
813
746 task = get_proc_task(file_inode(file)); 814 task = get_proc_task(file_inode(file));
747 if (!task) 815 if (!task)
748 return -ESRCH; 816 return -ESRCH;
749 mm = get_task_mm(task); 817 mm = get_task_mm(task);
750 if (mm) { 818 if (mm) {
819 struct clear_refs_private cp = {
820 .type = type,
821 };
751 struct mm_walk clear_refs_walk = { 822 struct mm_walk clear_refs_walk = {
752 .pmd_entry = clear_refs_pte_range, 823 .pmd_entry = clear_refs_pte_range,
753 .mm = mm, 824 .mm = mm,
825 .private = &cp,
754 }; 826 };
755 down_read(&mm->mmap_sem); 827 down_read(&mm->mmap_sem);
828 if (type == CLEAR_REFS_SOFT_DIRTY)
829 mmu_notifier_invalidate_range_start(mm, 0, -1);
756 for (vma = mm->mmap; vma; vma = vma->vm_next) { 830 for (vma = mm->mmap; vma; vma = vma->vm_next) {
757 clear_refs_walk.private = vma; 831 cp.vma = vma;
758 if (is_vm_hugetlb_page(vma)) 832 if (is_vm_hugetlb_page(vma))
759 continue; 833 continue;
760 /* 834 /*
@@ -773,6 +847,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
773 walk_page_range(vma->vm_start, vma->vm_end, 847 walk_page_range(vma->vm_start, vma->vm_end,
774 &clear_refs_walk); 848 &clear_refs_walk);
775 } 849 }
850 if (type == CLEAR_REFS_SOFT_DIRTY)
851 mmu_notifier_invalidate_range_end(mm, 0, -1);
776 flush_tlb_mm(mm); 852 flush_tlb_mm(mm);
777 up_read(&mm->mmap_sem); 853 up_read(&mm->mmap_sem);
778 mmput(mm); 854 mmput(mm);
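The migration comment and the CLEAR_REFS_SOFT_DIRTY hook above translate into the following userspace workflow, shown here as a rough sketch: write "4" to /proc/self/clear_refs, dirty a page, then test bit 55 of the matching /proc/self/pagemap entry. It assumes a kernel with CONFIG_MEM_SOFT_DIRTY and the post-migration bit layout; the buffer size and demo flow are illustrative only.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define PM_SOFT_DIRTY (1ULL << 55)	/* post-migration meaning of bit 55 */

/* Read the pagemap entry covering the page that contains @addr. */
static uint64_t pagemap_entry(const void *addr)
{
	uint64_t entry = 0;
	long psz = sysconf(_SC_PAGESIZE);
	off_t off = (uintptr_t)addr / psz * sizeof(entry);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0 || pread(fd, &entry, sizeof(entry), off) != sizeof(entry))
		perror("pagemap");
	if (fd >= 0)
		close(fd);
	return entry;
}

int main(void)
{
	char *page = malloc(4096);
	int fd = open("/proc/self/clear_refs", O_WRONLY);

	if (!page || fd < 0) {
		perror("setup");
		return 1;
	}
	page[0] = 1;		/* make sure the page is populated */
	write(fd, "4", 1);	/* 4 == CLEAR_REFS_SOFT_DIRTY */
	close(fd);

	page[0] = 2;		/* the write fault sets soft-dirty again */
	printf("soft-dirty after write: %s\n",
	       (pagemap_entry(page) & PM_SOFT_DIRTY) ? "yes" : "no");
	return 0;
}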
@@ -792,14 +868,15 @@ typedef struct {
792} pagemap_entry_t; 868} pagemap_entry_t;
793 869
794struct pagemapread { 870struct pagemapread {
795 int pos, len; 871 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
796 pagemap_entry_t *buffer; 872 pagemap_entry_t *buffer;
873 bool v2;
797}; 874};
798 875
799#define PAGEMAP_WALK_SIZE (PMD_SIZE) 876#define PAGEMAP_WALK_SIZE (PMD_SIZE)
800#define PAGEMAP_WALK_MASK (PMD_MASK) 877#define PAGEMAP_WALK_MASK (PMD_MASK)
801 878
802#define PM_ENTRY_BYTES sizeof(u64) 879#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
803#define PM_STATUS_BITS 3 880#define PM_STATUS_BITS 3
804#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) 881#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
805#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) 882#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
@@ -807,14 +884,17 @@ struct pagemapread {
807#define PM_PSHIFT_BITS 6 884#define PM_PSHIFT_BITS 6
808#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) 885#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
809#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) 886#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
810#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) 887#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
811#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) 888#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
812#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) 889#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
890/* in "new" pagemap pshift bits are occupied with more status bits */
891#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
813 892
893#define __PM_SOFT_DIRTY (1LL)
814#define PM_PRESENT PM_STATUS(4LL) 894#define PM_PRESENT PM_STATUS(4LL)
815#define PM_SWAP PM_STATUS(2LL) 895#define PM_SWAP PM_STATUS(2LL)
816#define PM_FILE PM_STATUS(1LL) 896#define PM_FILE PM_STATUS(1LL)
817#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) 897#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
818#define PM_END_OF_BUFFER 1 898#define PM_END_OF_BUFFER 1
819 899
820static inline pagemap_entry_t make_pme(u64 val) 900static inline pagemap_entry_t make_pme(u64 val)
@@ -837,7 +917,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
837 struct pagemapread *pm = walk->private; 917 struct pagemapread *pm = walk->private;
838 unsigned long addr; 918 unsigned long addr;
839 int err = 0; 919 int err = 0;
840 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); 920 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
841 921
842 for (addr = start; addr < end; addr += PAGE_SIZE) { 922 for (addr = start; addr < end; addr += PAGE_SIZE) {
843 err = add_to_pagemap(addr, &pme, pm); 923 err = add_to_pagemap(addr, &pme, pm);
@@ -847,38 +927,43 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
847 return err; 927 return err;
848} 928}
849 929
850static void pte_to_pagemap_entry(pagemap_entry_t *pme, 930static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
851 struct vm_area_struct *vma, unsigned long addr, pte_t pte) 931 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
852{ 932{
853 u64 frame, flags; 933 u64 frame, flags;
854 struct page *page = NULL; 934 struct page *page = NULL;
935 int flags2 = 0;
855 936
856 if (pte_present(pte)) { 937 if (pte_present(pte)) {
857 frame = pte_pfn(pte); 938 frame = pte_pfn(pte);
858 flags = PM_PRESENT; 939 flags = PM_PRESENT;
859 page = vm_normal_page(vma, addr, pte); 940 page = vm_normal_page(vma, addr, pte);
860 } else if (is_swap_pte(pte)) { 941 } else if (is_swap_pte(pte)) {
861 swp_entry_t entry = pte_to_swp_entry(pte); 942 swp_entry_t entry;
862 943 if (pte_swp_soft_dirty(pte))
944 flags2 |= __PM_SOFT_DIRTY;
945 entry = pte_to_swp_entry(pte);
863 frame = swp_type(entry) | 946 frame = swp_type(entry) |
864 (swp_offset(entry) << MAX_SWAPFILES_SHIFT); 947 (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
865 flags = PM_SWAP; 948 flags = PM_SWAP;
866 if (is_migration_entry(entry)) 949 if (is_migration_entry(entry))
867 page = migration_entry_to_page(entry); 950 page = migration_entry_to_page(entry);
868 } else { 951 } else {
869 *pme = make_pme(PM_NOT_PRESENT); 952 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
870 return; 953 return;
871 } 954 }
872 955
873 if (page && !PageAnon(page)) 956 if (page && !PageAnon(page))
874 flags |= PM_FILE; 957 flags |= PM_FILE;
958 if (pte_soft_dirty(pte))
959 flags2 |= __PM_SOFT_DIRTY;
875 960
876 *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); 961 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
877} 962}
878 963
879#ifdef CONFIG_TRANSPARENT_HUGEPAGE 964#ifdef CONFIG_TRANSPARENT_HUGEPAGE
880static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, 965static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
881 pmd_t pmd, int offset) 966 pmd_t pmd, int offset, int pmd_flags2)
882{ 967{
883 /* 968 /*
884 * Currently pmd for thp is always present because thp can not be 969 * Currently pmd for thp is always present because thp can not be
@@ -887,13 +972,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
887 */ 972 */
888 if (pmd_present(pmd)) 973 if (pmd_present(pmd))
889 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) 974 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
890 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); 975 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
891 else 976 else
892 *pme = make_pme(PM_NOT_PRESENT); 977 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
893} 978}
894#else 979#else
895static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, 980static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
896 pmd_t pmd, int offset) 981 pmd_t pmd, int offset, int pmd_flags2)
897{ 982{
898} 983}
899#endif 984#endif
@@ -905,17 +990,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
905 struct pagemapread *pm = walk->private; 990 struct pagemapread *pm = walk->private;
906 pte_t *pte; 991 pte_t *pte;
907 int err = 0; 992 int err = 0;
908 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); 993 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
909 994
910 /* find the first VMA at or above 'addr' */ 995 /* find the first VMA at or above 'addr' */
911 vma = find_vma(walk->mm, addr); 996 vma = find_vma(walk->mm, addr);
912 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { 997 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
998 int pmd_flags2;
999
1000 pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
913 for (; addr != end; addr += PAGE_SIZE) { 1001 for (; addr != end; addr += PAGE_SIZE) {
914 unsigned long offset; 1002 unsigned long offset;
915 1003
916 offset = (addr & ~PAGEMAP_WALK_MASK) >> 1004 offset = (addr & ~PAGEMAP_WALK_MASK) >>
917 PAGE_SHIFT; 1005 PAGE_SHIFT;
918 thp_pmd_to_pagemap_entry(&pme, *pmd, offset); 1006 thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
919 err = add_to_pagemap(addr, &pme, pm); 1007 err = add_to_pagemap(addr, &pme, pm);
920 if (err) 1008 if (err)
921 break; 1009 break;
@@ -932,7 +1020,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
932 * and need a new, higher one */ 1020 * and need a new, higher one */
933 if (vma && (addr >= vma->vm_end)) { 1021 if (vma && (addr >= vma->vm_end)) {
934 vma = find_vma(walk->mm, addr); 1022 vma = find_vma(walk->mm, addr);
935 pme = make_pme(PM_NOT_PRESENT); 1023 pme = make_pme(PM_NOT_PRESENT(pm->v2));
936 } 1024 }
937 1025
938 /* check that 'vma' actually covers this address, 1026 /* check that 'vma' actually covers this address,
@@ -940,7 +1028,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
940 if (vma && (vma->vm_start <= addr) && 1028 if (vma && (vma->vm_start <= addr) &&
941 !is_vm_hugetlb_page(vma)) { 1029 !is_vm_hugetlb_page(vma)) {
942 pte = pte_offset_map(pmd, addr); 1030 pte = pte_offset_map(pmd, addr);
943 pte_to_pagemap_entry(&pme, vma, addr, *pte); 1031 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
944 /* unmap before userspace copy */ 1032 /* unmap before userspace copy */
945 pte_unmap(pte); 1033 pte_unmap(pte);
946 } 1034 }
@@ -955,14 +1043,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
955} 1043}
956 1044
957#ifdef CONFIG_HUGETLB_PAGE 1045#ifdef CONFIG_HUGETLB_PAGE
958static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, 1046static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
959 pte_t pte, int offset) 1047 pte_t pte, int offset)
960{ 1048{
961 if (pte_present(pte)) 1049 if (pte_present(pte))
962 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) 1050 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
963 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); 1051 | PM_STATUS2(pm->v2, 0) | PM_PRESENT);
964 else 1052 else
965 *pme = make_pme(PM_NOT_PRESENT); 1053 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
966} 1054}
967 1055
968/* This function walks within one hugetlb entry in the single call */ 1056/* This function walks within one hugetlb entry in the single call */
@@ -976,7 +1064,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
976 1064
977 for (; addr != end; addr += PAGE_SIZE) { 1065 for (; addr != end; addr += PAGE_SIZE) {
978 int offset = (addr & ~hmask) >> PAGE_SHIFT; 1066 int offset = (addr & ~hmask) >> PAGE_SHIFT;
979 huge_pte_to_pagemap_entry(&pme, *pte, offset); 1067 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
980 err = add_to_pagemap(addr, &pme, pm); 1068 err = add_to_pagemap(addr, &pme, pm);
981 if (err) 1069 if (err)
982 return err; 1070 return err;
@@ -1038,8 +1126,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
1038 if (!count) 1126 if (!count)
1039 goto out_task; 1127 goto out_task;
1040 1128
1041 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 1129 pm.v2 = soft_dirty_cleared;
1042 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); 1130 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1131 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
1043 ret = -ENOMEM; 1132 ret = -ENOMEM;
1044 if (!pm.buffer) 1133 if (!pm.buffer)
1045 goto out_task; 1134 goto out_task;
@@ -1110,9 +1199,18 @@ out:
1110 return ret; 1199 return ret;
1111} 1200}
1112 1201
1202static int pagemap_open(struct inode *inode, struct file *file)
1203{
1204 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
1205 "to stop being page-shift some time soon. See the "
1206 "linux/Documentation/vm/pagemap.txt for details.\n");
1207 return 0;
1208}
1209
1113const struct file_operations proc_pagemap_operations = { 1210const struct file_operations proc_pagemap_operations = {
1114 .llseek = mem_lseek, /* borrow this */ 1211 .llseek = mem_lseek, /* borrow this */
1115 .read = pagemap_read, 1212 .read = pagemap_read,
1213 .open = pagemap_open,
1116}; 1214};
1117#endif /* CONFIG_PROC_PAGE_MONITOR */ 1215#endif /* CONFIG_PROC_PAGE_MONITOR */
1118 1216
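To make the new entry layout concrete, here is a small decoder for a single /proc/PID/pagemap entry, mirroring the PM_* macros above: the PFN (or swap type/offset) sits in bits 0-54, the reused bit 55 carries __PM_SOFT_DIRTY once the "v2" mode is active, and PM_FILE/PM_SWAP/PM_PRESENT occupy bits 61-63. The sample value passed in main() is made up.

#include <stdint.h>
#include <stdio.h>

/* Bit positions follow the PM_* macros above: PM_STATUS occupies bits
 * 61-63, the former page-shift field starts at bit 55, PFN is 0-54. */
static void decode_pagemap(uint64_t e)
{
	printf("present=%llu swap=%llu file=%llu soft_dirty=%llu pfn/swap=0x%llx\n",
	       (unsigned long long)(e >> 63 & 1),	/* PM_PRESENT      */
	       (unsigned long long)(e >> 62 & 1),	/* PM_SWAP         */
	       (unsigned long long)(e >> 61 & 1),	/* PM_FILE         */
	       (unsigned long long)(e >> 55 & 1),	/* __PM_SOFT_DIRTY */
	       (unsigned long long)(e & ((1ULL << 55) - 1)));
}

int main(void)
{
	decode_pagemap(0x80800000000abcdeULL);	/* made-up sample entry */
	return 0;
}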
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 9610ac772d7e..061894625903 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -20,8 +20,7 @@ static int uptime_proc_show(struct seq_file *m, void *v)
20 for_each_possible_cpu(i) 20 for_each_possible_cpu(i)
21 idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; 21 idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
22 22
23 do_posix_clock_monotonic_gettime(&uptime); 23 get_monotonic_boottime(&uptime);
24 monotonic_to_bootbased(&uptime);
25 nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; 24 nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
26 idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); 25 idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
27 idle.tv_nsec = rem; 26 idle.tv_nsec = rem;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 17f7e080d7ff..a1a16eb97c7b 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -20,6 +20,7 @@
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/crash_dump.h> 21#include <linux/crash_dump.h>
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/vmalloc.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24#include <asm/io.h> 25#include <asm/io.h>
25#include "internal.h" 26#include "internal.h"
@@ -32,6 +33,10 @@ static LIST_HEAD(vmcore_list);
32/* Stores the pointer to the buffer containing kernel elf core headers. */ 33/* Stores the pointer to the buffer containing kernel elf core headers. */
33static char *elfcorebuf; 34static char *elfcorebuf;
34static size_t elfcorebuf_sz; 35static size_t elfcorebuf_sz;
36static size_t elfcorebuf_sz_orig;
37
38static char *elfnotes_buf;
39static size_t elfnotes_sz;
35 40
36/* Total size of vmcore file. */ 41/* Total size of vmcore file. */
37static u64 vmcore_size; 42static u64 vmcore_size;
@@ -118,27 +123,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
118 return read; 123 return read;
119} 124}
120 125
121/* Maps vmcore file offset to respective physical address in memory. */
122static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
123 struct vmcore **m_ptr)
124{
125 struct vmcore *m;
126 u64 paddr;
127
128 list_for_each_entry(m, vc_list, list) {
129 u64 start, end;
130 start = m->offset;
131 end = m->offset + m->size - 1;
132 if (offset >= start && offset <= end) {
133 paddr = m->paddr + offset - start;
134 *m_ptr = m;
135 return paddr;
136 }
137 }
138 *m_ptr = NULL;
139 return 0;
140}
141
142/* Read from the ELF header and then the crash dump. On error, negative value is 126/* Read from the ELF header and then the crash dump. On error, negative value is
143 * returned otherwise number of bytes read are returned. 127 * returned otherwise number of bytes read are returned.
144 */ 128 */
@@ -147,8 +131,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
147{ 131{
148 ssize_t acc = 0, tmp; 132 ssize_t acc = 0, tmp;
149 size_t tsz; 133 size_t tsz;
150 u64 start, nr_bytes; 134 u64 start;
151 struct vmcore *curr_m = NULL; 135 struct vmcore *m = NULL;
152 136
153 if (buflen == 0 || *fpos >= vmcore_size) 137 if (buflen == 0 || *fpos >= vmcore_size)
154 return 0; 138 return 0;
@@ -159,9 +143,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
159 143
160 /* Read ELF core header */ 144 /* Read ELF core header */
161 if (*fpos < elfcorebuf_sz) { 145 if (*fpos < elfcorebuf_sz) {
162 tsz = elfcorebuf_sz - *fpos; 146 tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
163 if (buflen < tsz)
164 tsz = buflen;
165 if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) 147 if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
166 return -EFAULT; 148 return -EFAULT;
167 buflen -= tsz; 149 buflen -= tsz;
@@ -174,39 +156,161 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
174 return acc; 156 return acc;
175 } 157 }
176 158
177 start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); 159 /* Read Elf note segment */
178 if (!curr_m) 160 if (*fpos < elfcorebuf_sz + elfnotes_sz) {
179 return -EINVAL; 161 void *kaddr;
180
181 while (buflen) {
182 tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK));
183 162
184 /* Calculate left bytes in current memory segment. */ 163 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
185 nr_bytes = (curr_m->size - (start - curr_m->paddr)); 164 kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
186 if (tsz > nr_bytes) 165 if (copy_to_user(buffer, kaddr, tsz))
187 tsz = nr_bytes; 166 return -EFAULT;
188
189 tmp = read_from_oldmem(buffer, tsz, &start, 1);
190 if (tmp < 0)
191 return tmp;
192 buflen -= tsz; 167 buflen -= tsz;
193 *fpos += tsz; 168 *fpos += tsz;
194 buffer += tsz; 169 buffer += tsz;
195 acc += tsz; 170 acc += tsz;
196 if (start >= (curr_m->paddr + curr_m->size)) { 171
197 if (curr_m->list.next == &vmcore_list) 172 /* leave now if filled buffer already */
198 return acc; /*EOF*/ 173 if (buflen == 0)
199 curr_m = list_entry(curr_m->list.next, 174 return acc;
200 struct vmcore, list); 175 }
201 start = curr_m->paddr; 176
177 list_for_each_entry(m, &vmcore_list, list) {
178 if (*fpos < m->offset + m->size) {
179 tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
180 start = m->paddr + *fpos - m->offset;
181 tmp = read_from_oldmem(buffer, tsz, &start, 1);
182 if (tmp < 0)
183 return tmp;
184 buflen -= tsz;
185 *fpos += tsz;
186 buffer += tsz;
187 acc += tsz;
188
189 /* leave now if filled buffer already */
190 if (buflen == 0)
191 return acc;
202 } 192 }
203 } 193 }
194
204 return acc; 195 return acc;
205} 196}
206 197
198/**
199 * alloc_elfnotes_buf - allocate buffer for ELF note segment in
200 * vmalloc memory
201 *
202 * @notes_sz: size of buffer
203 *
204 * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
205 * the buffer to user-space by means of remap_vmalloc_range().
206 *
207 * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
208 * disabled and there's no need to allow users to mmap the buffer.
209 */
210static inline char *alloc_elfnotes_buf(size_t notes_sz)
211{
212#ifdef CONFIG_MMU
213 return vmalloc_user(notes_sz);
214#else
215 return vzalloc(notes_sz);
216#endif
217}
218
219/*
220 * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
221 * essential for mmap_vmcore() in order to map physically
222 * non-contiguous objects (ELF header, ELF note segment and memory
223 * regions in the 1st kernel pointed to by PT_LOAD entries) into
224 * virtually contiguous user-space in ELF layout.
225 */
226#if defined(CONFIG_MMU) && !defined(CONFIG_S390)
227static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
228{
229 size_t size = vma->vm_end - vma->vm_start;
230 u64 start, end, len, tsz;
231 struct vmcore *m;
232
233 start = (u64)vma->vm_pgoff << PAGE_SHIFT;
234 end = start + size;
235
236 if (size > vmcore_size || end > vmcore_size)
237 return -EINVAL;
238
239 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
240 return -EPERM;
241
242 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
243 vma->vm_flags |= VM_MIXEDMAP;
244
245 len = 0;
246
247 if (start < elfcorebuf_sz) {
248 u64 pfn;
249
250 tsz = min(elfcorebuf_sz - (size_t)start, size);
251 pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT;
252 if (remap_pfn_range(vma, vma->vm_start, pfn, tsz,
253 vma->vm_page_prot))
254 return -EAGAIN;
255 size -= tsz;
256 start += tsz;
257 len += tsz;
258
259 if (size == 0)
260 return 0;
261 }
262
263 if (start < elfcorebuf_sz + elfnotes_sz) {
264 void *kaddr;
265
266 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
267 kaddr = elfnotes_buf + start - elfcorebuf_sz;
268 if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
269 kaddr, tsz))
270 goto fail;
271 size -= tsz;
272 start += tsz;
273 len += tsz;
274
275 if (size == 0)
276 return 0;
277 }
278
279 list_for_each_entry(m, &vmcore_list, list) {
280 if (start < m->offset + m->size) {
281 u64 paddr = 0;
282
283 tsz = min_t(size_t, m->offset + m->size - start, size);
284 paddr = m->paddr + start - m->offset;
285 if (remap_pfn_range(vma, vma->vm_start + len,
286 paddr >> PAGE_SHIFT, tsz,
287 vma->vm_page_prot))
288 goto fail;
289 size -= tsz;
290 start += tsz;
291 len += tsz;
292
293 if (size == 0)
294 return 0;
295 }
296 }
297
298 return 0;
299fail:
300 do_munmap(vma->vm_mm, vma->vm_start, len);
301 return -EAGAIN;
302}
303#else
304static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
305{
306 return -ENOSYS;
307}
308#endif
309
207static const struct file_operations proc_vmcore_operations = { 310static const struct file_operations proc_vmcore_operations = {
208 .read = read_vmcore, 311 .read = read_vmcore,
209 .llseek = default_llseek, 312 .llseek = default_llseek,
313 .mmap = mmap_vmcore,
210}; 314};
211 315
212static struct vmcore* __init get_new_element(void) 316static struct vmcore* __init get_new_element(void)
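As a rough illustration of what the new mmap_vmcore() enables on MMU systems, a dump tool running in the kdump capture kernel could map /proc/vmcore directly instead of copying it through read(). The sketch below assumes a 64-bit capture kernel and keeps error handling minimal; on kernels without this mmap handler the mmap() call simply fails.

#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/vmcore", O_RDONLY);
	long psz = sysconf(_SC_PAGESIZE);
	void *p;

	if (fd < 0) {
		perror("open");		/* only exists in a capture kernel */
		return 1;
	}
	/* The ELF header lives in the first, page-aligned part of the file. */
	p = mmap(NULL, psz, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");		/* e.g. !CONFIG_MMU or older kernels */
		return 1;
	}
	printf("e_phnum=%u\n", (unsigned)((Elf64_Ehdr *)p)->e_phnum);
	munmap(p, psz);
	close(fd);
	return 0;
}

Page-aligning elfcorebuf, the note buffer and the PT_LOAD offsets (see the roundup()/rounddown() changes below) is what makes these remap calls possible.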
@@ -214,61 +318,40 @@ static struct vmcore* __init get_new_element(void)
214 return kzalloc(sizeof(struct vmcore), GFP_KERNEL); 318 return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
215} 319}
216 320
217static u64 __init get_vmcore_size_elf64(char *elfptr) 321static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
322 struct list_head *vc_list)
218{ 323{
219 int i;
220 u64 size;
221 Elf64_Ehdr *ehdr_ptr;
222 Elf64_Phdr *phdr_ptr;
223
224 ehdr_ptr = (Elf64_Ehdr *)elfptr;
225 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
226 size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr));
227 for (i = 0; i < ehdr_ptr->e_phnum; i++) {
228 size += phdr_ptr->p_memsz;
229 phdr_ptr++;
230 }
231 return size;
232}
233
234static u64 __init get_vmcore_size_elf32(char *elfptr)
235{
236 int i;
237 u64 size; 324 u64 size;
238 Elf32_Ehdr *ehdr_ptr; 325 struct vmcore *m;
239 Elf32_Phdr *phdr_ptr;
240 326
241 ehdr_ptr = (Elf32_Ehdr *)elfptr; 327 size = elfsz + elfnotesegsz;
242 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); 328 list_for_each_entry(m, vc_list, list) {
243 size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); 329 size += m->size;
244 for (i = 0; i < ehdr_ptr->e_phnum; i++) {
245 size += phdr_ptr->p_memsz;
246 phdr_ptr++;
247 } 330 }
248 return size; 331 return size;
249} 332}
250 333
251/* Merges all the PT_NOTE headers into one. */ 334/**
252static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, 335 * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry
253 struct list_head *vc_list) 336 *
337 * @ehdr_ptr: ELF header
338 *
339 * This function updates p_memsz member of each PT_NOTE entry in the
340 * program header table pointed to by @ehdr_ptr to real size of ELF
341 * note segment.
342 */
343static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
254{ 344{
255 int i, nr_ptnote=0, rc=0; 345 int i, rc=0;
256 char *tmp; 346 Elf64_Phdr *phdr_ptr;
257 Elf64_Ehdr *ehdr_ptr;
258 Elf64_Phdr phdr, *phdr_ptr;
259 Elf64_Nhdr *nhdr_ptr; 347 Elf64_Nhdr *nhdr_ptr;
260 u64 phdr_sz = 0, note_off;
261 348
262 ehdr_ptr = (Elf64_Ehdr *)elfptr; 349 phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
263 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
264 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 350 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
265 int j;
266 void *notes_section; 351 void *notes_section;
267 struct vmcore *new;
268 u64 offset, max_sz, sz, real_sz = 0; 352 u64 offset, max_sz, sz, real_sz = 0;
269 if (phdr_ptr->p_type != PT_NOTE) 353 if (phdr_ptr->p_type != PT_NOTE)
270 continue; 354 continue;
271 nr_ptnote++;
272 max_sz = phdr_ptr->p_memsz; 355 max_sz = phdr_ptr->p_memsz;
273 offset = phdr_ptr->p_offset; 356 offset = phdr_ptr->p_offset;
274 notes_section = kmalloc(max_sz, GFP_KERNEL); 357 notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -280,7 +363,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
280 return rc; 363 return rc;
281 } 364 }
282 nhdr_ptr = notes_section; 365 nhdr_ptr = notes_section;
283 for (j = 0; j < max_sz; j += sz) { 366 while (real_sz < max_sz) {
284 if (nhdr_ptr->n_namesz == 0) 367 if (nhdr_ptr->n_namesz == 0)
285 break; 368 break;
286 sz = sizeof(Elf64_Nhdr) + 369 sz = sizeof(Elf64_Nhdr) +
@@ -289,26 +372,122 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
289 real_sz += sz; 372 real_sz += sz;
290 nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); 373 nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
291 } 374 }
292
293 /* Add this contiguous chunk of notes section to vmcore list.*/
294 new = get_new_element();
295 if (!new) {
296 kfree(notes_section);
297 return -ENOMEM;
298 }
299 new->paddr = phdr_ptr->p_offset;
300 new->size = real_sz;
301 list_add_tail(&new->list, vc_list);
302 phdr_sz += real_sz;
303 kfree(notes_section); 375 kfree(notes_section);
376 phdr_ptr->p_memsz = real_sz;
377 }
378
379 return 0;
380}
381
382/**
383 * get_note_number_and_size_elf64 - get the number of PT_NOTE program
384 * headers and sum of real size of their ELF note segment headers and
385 * data.
386 *
387 * @ehdr_ptr: ELF header
388 * @nr_ptnote: buffer for the number of PT_NOTE program headers
389 * @sz_ptnote: buffer for size of unique PT_NOTE program header
390 *
391 * This function is used to merge multiple PT_NOTE program headers
392 * into a unique single one. The resulting unique entry will have
393 * @sz_ptnote in its phdr->p_memsz.
394 *
395 * It is assumed that program headers with PT_NOTE type pointed to by
396 * @ehdr_ptr have already been updated by update_note_header_size_elf64
397 * and each of PT_NOTE program headers has actual ELF note segment
398 * size in its p_memsz member.
399 */
400static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr,
401 int *nr_ptnote, u64 *sz_ptnote)
402{
403 int i;
404 Elf64_Phdr *phdr_ptr;
405
406 *nr_ptnote = *sz_ptnote = 0;
407
408 phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
409 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
410 if (phdr_ptr->p_type != PT_NOTE)
411 continue;
412 *nr_ptnote += 1;
413 *sz_ptnote += phdr_ptr->p_memsz;
414 }
415
416 return 0;
417}
418
419/**
420 * copy_notes_elf64 - copy ELF note segments in a given buffer
421 *
422 * @ehdr_ptr: ELF header
423 * @notes_buf: buffer into which ELF note segments are copied
424 *
425 * This function is used to copy ELF note segment in the 1st kernel
426 * into the buffer @notes_buf in the 2nd kernel. It is assumed that
427 * size of the buffer @notes_buf is equal to or larger than sum of the
428 * real ELF note segment headers and data.
429 *
430 * It is assumed that program headers with PT_NOTE type pointed to by
431 * @ehdr_ptr have already been updated by update_note_header_size_elf64
432 * and each of PT_NOTE program headers has actual ELF note segment
433 * size in its p_memsz member.
434 */
435static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
436{
437 int i, rc=0;
438 Elf64_Phdr *phdr_ptr;
439
440 phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1);
441
442 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
443 u64 offset;
444 if (phdr_ptr->p_type != PT_NOTE)
445 continue;
446 offset = phdr_ptr->p_offset;
447 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
448 if (rc < 0)
449 return rc;
450 notes_buf += phdr_ptr->p_memsz;
304 } 451 }
305 452
453 return 0;
454}
455
456/* Merges all the PT_NOTE headers into one. */
457static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
458 char **notes_buf, size_t *notes_sz)
459{
460 int i, nr_ptnote=0, rc=0;
461 char *tmp;
462 Elf64_Ehdr *ehdr_ptr;
463 Elf64_Phdr phdr;
464 u64 phdr_sz = 0, note_off;
465
466 ehdr_ptr = (Elf64_Ehdr *)elfptr;
467
468 rc = update_note_header_size_elf64(ehdr_ptr);
469 if (rc < 0)
470 return rc;
471
472 rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz);
473 if (rc < 0)
474 return rc;
475
476 *notes_sz = roundup(phdr_sz, PAGE_SIZE);
477 *notes_buf = alloc_elfnotes_buf(*notes_sz);
478 if (!*notes_buf)
479 return -ENOMEM;
480
481 rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
482 if (rc < 0)
483 return rc;
484
306 /* Prepare merged PT_NOTE program header. */ 485 /* Prepare merged PT_NOTE program header. */
307 phdr.p_type = PT_NOTE; 486 phdr.p_type = PT_NOTE;
308 phdr.p_flags = 0; 487 phdr.p_flags = 0;
309 note_off = sizeof(Elf64_Ehdr) + 488 note_off = sizeof(Elf64_Ehdr) +
310 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); 489 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
311 phdr.p_offset = note_off; 490 phdr.p_offset = roundup(note_off, PAGE_SIZE);
312 phdr.p_vaddr = phdr.p_paddr = 0; 491 phdr.p_vaddr = phdr.p_paddr = 0;
313 phdr.p_filesz = phdr.p_memsz = phdr_sz; 492 phdr.p_filesz = phdr.p_memsz = phdr_sz;
314 phdr.p_align = 0; 493 phdr.p_align = 0;
@@ -322,6 +501,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
322 i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); 501 i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
323 *elfsz = *elfsz - i; 502 *elfsz = *elfsz - i;
324 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); 503 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
504 memset(elfptr + *elfsz, 0, i);
505 *elfsz = roundup(*elfsz, PAGE_SIZE);
325 506
326 /* Modify e_phnum to reflect merged headers. */ 507 /* Modify e_phnum to reflect merged headers. */
327 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; 508 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -329,27 +510,27 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
329 return 0; 510 return 0;
330} 511}
331 512
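For reference, a standalone sketch of the ELF note walk that update_note_header_size_elf64() and copy_notes_elf64() depend on: each entry is an Elf64_Nhdr followed by its name and descriptor, both padded to 4-byte boundaries as the ELF specification requires. The helper names below are illustrative; the in-kernel loop additionally bounds the walk by the PT_NOTE header's p_memsz and stores the summed real size back into p_memsz.

#include <elf.h>
#include <stdio.h>

static size_t note_entry_size(const Elf64_Nhdr *n)
{
	/* Name and descriptor are each padded to a 4-byte boundary. */
	return sizeof(*n) +
	       ((n->n_namesz + 3) & ~3UL) +
	       ((n->n_descsz + 3) & ~3UL);
}

static size_t notes_real_size(const void *seg, size_t max_sz)
{
	size_t real_sz = 0;

	while (real_sz < max_sz) {
		const Elf64_Nhdr *n =
			(const Elf64_Nhdr *)((const char *)seg + real_sz);

		if (n->n_namesz == 0)	/* terminating empty note */
			break;
		real_sz += note_entry_size(n);
	}
	return real_sz;
}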
332/* Merges all the PT_NOTE headers into one. */ 513/**
333static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, 514 * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry
334 struct list_head *vc_list) 515 *
516 * @ehdr_ptr: ELF header
517 *
518 * This function updates p_memsz member of each PT_NOTE entry in the
519 * program header table pointed to by @ehdr_ptr to real size of ELF
520 * note segment.
521 */
522static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
335{ 523{
336 int i, nr_ptnote=0, rc=0; 524 int i, rc=0;
337 char *tmp; 525 Elf32_Phdr *phdr_ptr;
338 Elf32_Ehdr *ehdr_ptr;
339 Elf32_Phdr phdr, *phdr_ptr;
340 Elf32_Nhdr *nhdr_ptr; 526 Elf32_Nhdr *nhdr_ptr;
341 u64 phdr_sz = 0, note_off;
342 527
343 ehdr_ptr = (Elf32_Ehdr *)elfptr; 528 phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
344 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
345 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 529 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
346 int j;
347 void *notes_section; 530 void *notes_section;
348 struct vmcore *new;
349 u64 offset, max_sz, sz, real_sz = 0; 531 u64 offset, max_sz, sz, real_sz = 0;
350 if (phdr_ptr->p_type != PT_NOTE) 532 if (phdr_ptr->p_type != PT_NOTE)
351 continue; 533 continue;
352 nr_ptnote++;
353 max_sz = phdr_ptr->p_memsz; 534 max_sz = phdr_ptr->p_memsz;
354 offset = phdr_ptr->p_offset; 535 offset = phdr_ptr->p_offset;
355 notes_section = kmalloc(max_sz, GFP_KERNEL); 536 notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -361,7 +542,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
361 return rc; 542 return rc;
362 } 543 }
363 nhdr_ptr = notes_section; 544 nhdr_ptr = notes_section;
364 for (j = 0; j < max_sz; j += sz) { 545 while (real_sz < max_sz) {
365 if (nhdr_ptr->n_namesz == 0) 546 if (nhdr_ptr->n_namesz == 0)
366 break; 547 break;
367 sz = sizeof(Elf32_Nhdr) + 548 sz = sizeof(Elf32_Nhdr) +
@@ -370,26 +551,122 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
370 real_sz += sz; 551 real_sz += sz;
371 nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); 552 nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
372 } 553 }
373
374 /* Add this contiguous chunk of notes section to vmcore list.*/
375 new = get_new_element();
376 if (!new) {
377 kfree(notes_section);
378 return -ENOMEM;
379 }
380 new->paddr = phdr_ptr->p_offset;
381 new->size = real_sz;
382 list_add_tail(&new->list, vc_list);
383 phdr_sz += real_sz;
384 kfree(notes_section); 554 kfree(notes_section);
555 phdr_ptr->p_memsz = real_sz;
556 }
557
558 return 0;
559}
560
561/**
562 * get_note_number_and_size_elf32 - get the number of PT_NOTE program
563 * headers and sum of real size of their ELF note segment headers and
564 * data.
565 *
566 * @ehdr_ptr: ELF header
567 * @nr_ptnote: buffer for the number of PT_NOTE program headers
568 * @sz_ptnote: buffer for size of unique PT_NOTE program header
569 *
570 * This function is used to merge multiple PT_NOTE program headers
571 * into a unique single one. The resulting unique entry will have
572 * @sz_ptnote in its phdr->p_mem.
573 *
574 * It is assumed that program headers with PT_NOTE type pointed to by
575 * @ehdr_ptr has already been updated by update_note_header_size_elf32
576 * and each of PT_NOTE program headers has actual ELF note segment
577 * size in its p_memsz member.
578 */
579static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr,
580 int *nr_ptnote, u64 *sz_ptnote)
581{
582 int i;
583 Elf32_Phdr *phdr_ptr;
584
585 *nr_ptnote = *sz_ptnote = 0;
586
587 phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
588 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
589 if (phdr_ptr->p_type != PT_NOTE)
590 continue;
591 *nr_ptnote += 1;
592 *sz_ptnote += phdr_ptr->p_memsz;
593 }
594
595 return 0;
596}
597
598/**
599 * copy_notes_elf32 - copy ELF note segments in a given buffer
600 *
601 * @ehdr_ptr: ELF header
602 * @notes_buf: buffer into which ELF note segments are copied
603 *
604 * This function is used to copy ELF note segment in the 1st kernel
605 * into the buffer @notes_buf in the 2nd kernel. It is assumed that
606 * size of the buffer @notes_buf is equal to or larger than sum of the
607 * real ELF note segment headers and data.
608 *
609 * It is assumed that program headers with PT_NOTE type pointed to by
610 * @ehdr_ptr have already been updated by update_note_header_size_elf32
611 * and each of PT_NOTE program headers has actual ELF note segment
612 * size in its p_memsz member.
613 */
614static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
615{
616 int i, rc=0;
617 Elf32_Phdr *phdr_ptr;
618
619 phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1);
620
621 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
622 u64 offset;
623 if (phdr_ptr->p_type != PT_NOTE)
624 continue;
625 offset = phdr_ptr->p_offset;
626 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
627 if (rc < 0)
628 return rc;
629 notes_buf += phdr_ptr->p_memsz;
385 } 630 }
386 631
632 return 0;
633}
634
635/* Merges all the PT_NOTE headers into one. */
636static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
637 char **notes_buf, size_t *notes_sz)
638{
639 int i, nr_ptnote=0, rc=0;
640 char *tmp;
641 Elf32_Ehdr *ehdr_ptr;
642 Elf32_Phdr phdr;
643 u64 phdr_sz = 0, note_off;
644
645 ehdr_ptr = (Elf32_Ehdr *)elfptr;
646
647 rc = update_note_header_size_elf32(ehdr_ptr);
648 if (rc < 0)
649 return rc;
650
651 rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz);
652 if (rc < 0)
653 return rc;
654
655 *notes_sz = roundup(phdr_sz, PAGE_SIZE);
656 *notes_buf = alloc_elfnotes_buf(*notes_sz);
657 if (!*notes_buf)
658 return -ENOMEM;
659
660 rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
661 if (rc < 0)
662 return rc;
663
387 /* Prepare merged PT_NOTE program header. */ 664 /* Prepare merged PT_NOTE program header. */
388 phdr.p_type = PT_NOTE; 665 phdr.p_type = PT_NOTE;
389 phdr.p_flags = 0; 666 phdr.p_flags = 0;
390 note_off = sizeof(Elf32_Ehdr) + 667 note_off = sizeof(Elf32_Ehdr) +
391 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); 668 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr);
392 phdr.p_offset = note_off; 669 phdr.p_offset = roundup(note_off, PAGE_SIZE);
393 phdr.p_vaddr = phdr.p_paddr = 0; 670 phdr.p_vaddr = phdr.p_paddr = 0;
394 phdr.p_filesz = phdr.p_memsz = phdr_sz; 671 phdr.p_filesz = phdr.p_memsz = phdr_sz;
395 phdr.p_align = 0; 672 phdr.p_align = 0;
@@ -403,6 +680,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
403 i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); 680 i = (nr_ptnote - 1) * sizeof(Elf32_Phdr);
404 *elfsz = *elfsz - i; 681 *elfsz = *elfsz - i;
405 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); 682 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr)));
683 memset(elfptr + *elfsz, 0, i);
684 *elfsz = roundup(*elfsz, PAGE_SIZE);
406 685
407 /* Modify e_phnum to reflect merged headers. */ 686 /* Modify e_phnum to reflect merged headers. */
408 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; 687 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -414,6 +693,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
414 * the new offset fields of exported program headers. */ 693 * the new offset fields of exported program headers. */
415static int __init process_ptload_program_headers_elf64(char *elfptr, 694static int __init process_ptload_program_headers_elf64(char *elfptr,
416 size_t elfsz, 695 size_t elfsz,
696 size_t elfnotes_sz,
417 struct list_head *vc_list) 697 struct list_head *vc_list)
418{ 698{
419 int i; 699 int i;
@@ -425,32 +705,38 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
425 ehdr_ptr = (Elf64_Ehdr *)elfptr; 705 ehdr_ptr = (Elf64_Ehdr *)elfptr;
426 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ 706 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
427 707
428 /* First program header is PT_NOTE header. */ 708 /* Skip Elf header, program headers and Elf note segment. */
429 vmcore_off = sizeof(Elf64_Ehdr) + 709 vmcore_off = elfsz + elfnotes_sz;
430 (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) +
431 phdr_ptr->p_memsz; /* Note sections */
432 710
433 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 711 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
712 u64 paddr, start, end, size;
713
434 if (phdr_ptr->p_type != PT_LOAD) 714 if (phdr_ptr->p_type != PT_LOAD)
435 continue; 715 continue;
436 716
717 paddr = phdr_ptr->p_offset;
718 start = rounddown(paddr, PAGE_SIZE);
719 end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
720 size = end - start;
721
437 /* Add this contiguous chunk of memory to vmcore list.*/ 722 /* Add this contiguous chunk of memory to vmcore list.*/
438 new = get_new_element(); 723 new = get_new_element();
439 if (!new) 724 if (!new)
440 return -ENOMEM; 725 return -ENOMEM;
441 new->paddr = phdr_ptr->p_offset; 726 new->paddr = start;
442 new->size = phdr_ptr->p_memsz; 727 new->size = size;
443 list_add_tail(&new->list, vc_list); 728 list_add_tail(&new->list, vc_list);
444 729
445 /* Update the program header offset. */ 730 /* Update the program header offset. */
446 phdr_ptr->p_offset = vmcore_off; 731 phdr_ptr->p_offset = vmcore_off + (paddr - start);
447 vmcore_off = vmcore_off + phdr_ptr->p_memsz; 732 vmcore_off = vmcore_off + size;
448 } 733 }
449 return 0; 734 return 0;
450} 735}
451 736
452static int __init process_ptload_program_headers_elf32(char *elfptr, 737static int __init process_ptload_program_headers_elf32(char *elfptr,
453 size_t elfsz, 738 size_t elfsz,
739 size_t elfnotes_sz,
454 struct list_head *vc_list) 740 struct list_head *vc_list)
455{ 741{
456 int i; 742 int i;
@@ -462,43 +748,44 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
462 ehdr_ptr = (Elf32_Ehdr *)elfptr; 748 ehdr_ptr = (Elf32_Ehdr *)elfptr;
463 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ 749 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
464 750
465 /* First program header is PT_NOTE header. */ 751 /* Skip Elf header, program headers and Elf note segment. */
466 vmcore_off = sizeof(Elf32_Ehdr) + 752 vmcore_off = elfsz + elfnotes_sz;
467 (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) +
468 phdr_ptr->p_memsz; /* Note sections */
469 753
470 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 754 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
755 u64 paddr, start, end, size;
756
471 if (phdr_ptr->p_type != PT_LOAD) 757 if (phdr_ptr->p_type != PT_LOAD)
472 continue; 758 continue;
473 759
760 paddr = phdr_ptr->p_offset;
761 start = rounddown(paddr, PAGE_SIZE);
762 end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
763 size = end - start;
764
474 /* Add this contiguous chunk of memory to vmcore list.*/ 765 /* Add this contiguous chunk of memory to vmcore list.*/
475 new = get_new_element(); 766 new = get_new_element();
476 if (!new) 767 if (!new)
477 return -ENOMEM; 768 return -ENOMEM;
478 new->paddr = phdr_ptr->p_offset; 769 new->paddr = start;
479 new->size = phdr_ptr->p_memsz; 770 new->size = size;
480 list_add_tail(&new->list, vc_list); 771 list_add_tail(&new->list, vc_list);
481 772
482 /* Update the program header offset */ 773 /* Update the program header offset */
483 phdr_ptr->p_offset = vmcore_off; 774 phdr_ptr->p_offset = vmcore_off + (paddr - start);
484 vmcore_off = vmcore_off + phdr_ptr->p_memsz; 775 vmcore_off = vmcore_off + size;
485 } 776 }
486 return 0; 777 return 0;
487} 778}
488 779
489/* Sets offset fields of vmcore elements. */ 780/* Sets offset fields of vmcore elements. */
490static void __init set_vmcore_list_offsets_elf64(char *elfptr, 781static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
491 struct list_head *vc_list) 782 struct list_head *vc_list)
492{ 783{
493 loff_t vmcore_off; 784 loff_t vmcore_off;
494 Elf64_Ehdr *ehdr_ptr;
495 struct vmcore *m; 785 struct vmcore *m;
496 786
497 ehdr_ptr = (Elf64_Ehdr *)elfptr; 787 /* Skip Elf header, program headers and Elf note segment. */
498 788 vmcore_off = elfsz + elfnotes_sz;
499 /* Skip Elf header and program headers. */
500 vmcore_off = sizeof(Elf64_Ehdr) +
501 (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr);
502 789
503 list_for_each_entry(m, vc_list, list) { 790 list_for_each_entry(m, vc_list, list) {
504 m->offset = vmcore_off; 791 m->offset = vmcore_off;
@@ -506,24 +793,12 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr,
506 } 793 }
507} 794}
508 795
509/* Sets offset fields of vmcore elements. */ 796static void free_elfcorebuf(void)
510static void __init set_vmcore_list_offsets_elf32(char *elfptr,
511 struct list_head *vc_list)
512{ 797{
513 loff_t vmcore_off; 798 free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
514 Elf32_Ehdr *ehdr_ptr; 799 elfcorebuf = NULL;
515 struct vmcore *m; 800 vfree(elfnotes_buf);
516 801 elfnotes_buf = NULL;
517 ehdr_ptr = (Elf32_Ehdr *)elfptr;
518
519 /* Skip Elf header and program headers. */
520 vmcore_off = sizeof(Elf32_Ehdr) +
521 (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr);
522
523 list_for_each_entry(m, vc_list, list) {
524 m->offset = vmcore_off;
525 vmcore_off += m->size;
526 }
527} 802}
528 803
529static int __init parse_crash_elf64_headers(void) 804static int __init parse_crash_elf64_headers(void)
@@ -554,31 +829,32 @@ static int __init parse_crash_elf64_headers(void)
554 } 829 }
555 830
556 /* Read in all elf headers. */ 831 /* Read in all elf headers. */
557 elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); 832 elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) +
558 elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); 833 ehdr.e_phnum * sizeof(Elf64_Phdr);
834 elfcorebuf_sz = elfcorebuf_sz_orig;
835 elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
836 get_order(elfcorebuf_sz_orig));
559 if (!elfcorebuf) 837 if (!elfcorebuf)
560 return -ENOMEM; 838 return -ENOMEM;
561 addr = elfcorehdr_addr; 839 addr = elfcorehdr_addr;
562 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); 840 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
563 if (rc < 0) { 841 if (rc < 0)
564 kfree(elfcorebuf); 842 goto fail;
565 return rc;
566 }
567 843
568 /* Merge all PT_NOTE headers into one. */ 844 /* Merge all PT_NOTE headers into one. */
569 rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); 845 rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz,
570 if (rc) { 846 &elfnotes_buf, &elfnotes_sz);
571 kfree(elfcorebuf); 847 if (rc)
572 return rc; 848 goto fail;
573 }
574 rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, 849 rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
575 &vmcore_list); 850 elfnotes_sz, &vmcore_list);
576 if (rc) { 851 if (rc)
577 kfree(elfcorebuf); 852 goto fail;
578 return rc; 853 set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
579 }
580 set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list);
581 return 0; 854 return 0;
855fail:
856 free_elfcorebuf();
857 return rc;
582} 858}
583 859
584static int __init parse_crash_elf32_headers(void) 860static int __init parse_crash_elf32_headers(void)
@@ -609,31 +885,31 @@ static int __init parse_crash_elf32_headers(void)
609 } 885 }
610 886
611 /* Read in all elf headers. */ 887 /* Read in all elf headers. */
612 elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); 888 elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
613 elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); 889 elfcorebuf_sz = elfcorebuf_sz_orig;
890 elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
891 get_order(elfcorebuf_sz_orig));
614 if (!elfcorebuf) 892 if (!elfcorebuf)
615 return -ENOMEM; 893 return -ENOMEM;
616 addr = elfcorehdr_addr; 894 addr = elfcorehdr_addr;
617 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); 895 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
618 if (rc < 0) { 896 if (rc < 0)
619 kfree(elfcorebuf); 897 goto fail;
620 return rc;
621 }
622 898
623 /* Merge all PT_NOTE headers into one. */ 899 /* Merge all PT_NOTE headers into one. */
624 rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); 900 rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz,
625 if (rc) { 901 &elfnotes_buf, &elfnotes_sz);
626 kfree(elfcorebuf); 902 if (rc)
627 return rc; 903 goto fail;
628 }
629 rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, 904 rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz,
630 &vmcore_list); 905 elfnotes_sz, &vmcore_list);
631 if (rc) { 906 if (rc)
632 kfree(elfcorebuf); 907 goto fail;
633 return rc; 908 set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
634 }
635 set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list);
636 return 0; 909 return 0;
910fail:
911 free_elfcorebuf();
912 return rc;
637} 913}
638 914
639static int __init parse_crash_elf_headers(void) 915static int __init parse_crash_elf_headers(void)
@@ -655,20 +931,19 @@ static int __init parse_crash_elf_headers(void)
655 rc = parse_crash_elf64_headers(); 931 rc = parse_crash_elf64_headers();
656 if (rc) 932 if (rc)
657 return rc; 933 return rc;
658
659 /* Determine vmcore size. */
660 vmcore_size = get_vmcore_size_elf64(elfcorebuf);
661 } else if (e_ident[EI_CLASS] == ELFCLASS32) { 934 } else if (e_ident[EI_CLASS] == ELFCLASS32) {
662 rc = parse_crash_elf32_headers(); 935 rc = parse_crash_elf32_headers();
663 if (rc) 936 if (rc)
664 return rc; 937 return rc;
665
666 /* Determine vmcore size. */
667 vmcore_size = get_vmcore_size_elf32(elfcorebuf);
668 } else { 938 } else {
669 pr_warn("Warning: Core image elf header is not sane\n"); 939 pr_warn("Warning: Core image elf header is not sane\n");
670 return -EINVAL; 940 return -EINVAL;
671 } 941 }
942
943 /* Determine vmcore size. */
944 vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
945 &vmcore_list);
946
672 return 0; 947 return 0;
673} 948}
674 949
@@ -711,7 +986,6 @@ void vmcore_cleanup(void)
711 list_del(&m->list); 986 list_del(&m->list);
712 kfree(m); 987 kfree(m);
713 } 988 }
714 kfree(elfcorebuf); 989 free_elfcorebuf();
715 elfcorebuf = NULL;
716} 990}
717EXPORT_SYMBOL_GPL(vmcore_cleanup); 991EXPORT_SYMBOL_GPL(vmcore_cleanup);