Diffstat (limited to 'fs')
-rw-r--r--  fs/aio.c                        2
-rw-r--r--  fs/block_dev.c                  1
-rw-r--r--  fs/buffer.c                    34
-rw-r--r--  fs/cachefiles/rdwr.c           30
-rw-r--r--  fs/configfs/file.c              2
-rw-r--r--  fs/coredump.c                 121
-rw-r--r--  fs/eventpoll.c                 12
-rw-r--r--  fs/exec.c                       7
-rw-r--r--  fs/ext3/inode.c                 1
-rw-r--r--  fs/fat/misc.c                   5
-rw-r--r--  fs/fuse/inode.c                 2
-rw-r--r--  fs/hppfs/hppfs.c               11
-rw-r--r--  fs/lockd/svc.c                  2
-rw-r--r--  fs/ncpfs/mmap.c                 2
-rw-r--r--  fs/nfs/callback.c               5
-rw-r--r--  fs/nfs/dir.c                    7
-rw-r--r--  fs/nfs/file.c                  30
-rw-r--r--  fs/nfs/nfs4state.c              2
-rw-r--r--  fs/nilfs2/alloc.c              63
-rw-r--r--  fs/nilfs2/alloc.h               2
-rw-r--r--  fs/nilfs2/ifile.c              22
-rw-r--r--  fs/nilfs2/ifile.h               2
-rw-r--r--  fs/nilfs2/inode.c               8
-rw-r--r--  fs/nilfs2/segment.c             4
-rw-r--r--  fs/nilfs2/super.c              31
-rw-r--r--  fs/nilfs2/the_nilfs.c           4
-rw-r--r--  fs/nilfs2/the_nilfs.h           4
-rw-r--r--  fs/ocfs2/alloc.c                8
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c   19
-rw-r--r--  fs/ocfs2/cluster/quorum.c       2
-rw-r--r--  fs/ocfs2/cluster/tcp.c         29
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c          1
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c      7
-rw-r--r--  fs/ocfs2/journal.h              3
-rw-r--r--  fs/ocfs2/namei.c               70
-rw-r--r--  fs/ocfs2/ocfs2.h                1
-rw-r--r--  fs/ocfs2/suballoc.c            37
-rw-r--r--  fs/ocfs2/super.c                6
-rw-r--r--  fs/ocfs2/xattr.c               18
-rw-r--r--  fs/proc/kcore.c                 2
-rw-r--r--  fs/proc/task_mmu.c            145
-rw-r--r--  fs/proc/uptime.c                3
-rw-r--r--  fs/proc/vmcore.c              694
43 files changed, 991 insertions(+), 470 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index a8ecc8313fb0..9b5ca1137419 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -625,7 +625,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 
 	/*
 	 * Add a completion event to the ring buffer. Must be done holding
-	 * ctx->ctx_lock to prevent other code from messing with the tail
+	 * ctx->completion_lock to prevent other code from messing with the tail
 	 * pointer since we might be called from irq context.
 	 */
 	spin_lock_irqsave(&ctx->completion_lock, flags);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 431b6a04ebfd..bb43ce081d6e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1562,6 +1562,7 @@ static const struct address_space_operations def_blk_aops = {
 	.writepages	= generic_writepages,
 	.releasepage	= blkdev_releasepage,
 	.direct_IO	= blkdev_direct_IO,
+	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
 const struct file_operations def_blk_fops = {
diff --git a/fs/buffer.c b/fs/buffer.c
index f93392e2df12..4d7433534f5c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -83,6 +83,40 @@ void unlock_buffer(struct buffer_head *bh)
 EXPORT_SYMBOL(unlock_buffer);
 
 /*
+ * Returns if the page has dirty or writeback buffers. If all the buffers
+ * are unlocked and clean then the PageDirty information is stale. If
+ * any of the pages are locked, it is assumed they are locked for IO.
+ */
+void buffer_check_dirty_writeback(struct page *page,
+				     bool *dirty, bool *writeback)
+{
+	struct buffer_head *head, *bh;
+	*dirty = false;
+	*writeback = false;
+
+	BUG_ON(!PageLocked(page));
+
+	if (!page_has_buffers(page))
+		return;
+
+	if (PageWriteback(page))
+		*writeback = true;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (buffer_locked(bh))
+			*writeback = true;
+
+		if (buffer_dirty(bh))
+			*dirty = true;
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+EXPORT_SYMBOL(buffer_check_dirty_writeback);
+
+/*
  * Block until a buffer comes unlocked.  This doesn't stop it
  * from becoming locked again - you have to lock it yourself
  * if you want to preserve its state.
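
The hunk above and the one-line hooks in fs/block_dev.c and fs/ext3/inode.c are two halves of the same interface change: buffer_check_dirty_writeback() gives the VM a cheap answer to "is this page dirty or under IO?" for any buffer_head-backed mapping. A minimal sketch of how a filesystem opts in, assuming a hypothetical filesystem "myfs" (only .is_dirty_writeback comes from this diff; the other callbacks are placeholder names):

static const struct address_space_operations myfs_aops = {
	.readpage		= myfs_readpage,	/* hypothetical */
	.writepage		= myfs_writepage,	/* hypothetical */
	/* reuse the generic helper added in fs/buffer.c above */
	.is_dirty_writeback	= buffer_check_dirty_writeback,
};
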
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 317f9ee9c991..ebaff368120d 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -12,6 +12,7 @@
 #include <linux/mount.h>
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/swap.h>
 #include "internal.h"
 
 /*
@@ -227,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
  */
 static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 					    struct fscache_retrieval *op,
-					    struct page *netpage,
-					    struct pagevec *pagevec)
+					    struct page *netpage)
 {
 	struct cachefiles_one_read *monitor;
 	struct address_space *bmapping;
@@ -237,8 +237,6 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 
 	_enter("");
 
-	pagevec_reinit(pagevec);
-
 	_debug("read back %p{%lu,%d}",
 	       netpage, netpage->index, page_count(netpage));
 
@@ -283,9 +281,7 @@ installed_new_backing_page:
 	backpage = newpage;
 	newpage = NULL;
 
-	page_cache_get(backpage);
-	pagevec_add(pagevec, backpage);
-	__pagevec_lru_add_file(pagevec);
+	lru_cache_add_file(backpage);
 
 read_backing_page:
 	ret = bmapping->a_ops->readpage(NULL, backpage);
291 ret = bmapping->a_ops->readpage(NULL, backpage); 287 ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -452,8 +448,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 	if (block) {
 		/* submit the apparently valid page to the backing fs to be
 		 * read from disk */
-		ret = cachefiles_read_backing_file_one(object, op, page,
-						       &pagevec);
+		ret = cachefiles_read_backing_file_one(object, op, page);
 	} else if (cachefiles_has_space(cache, 0, 1) == 0) {
 		/* there's space in the cache we can use */
 		fscache_mark_page_cached(op, page);
@@ -482,14 +477,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 {
 	struct cachefiles_one_read *monitor = NULL;
 	struct address_space *bmapping = object->backer->d_inode->i_mapping;
-	struct pagevec lru_pvec;
 	struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
 	int ret = 0;
 
 	_enter("");
 
-	pagevec_init(&lru_pvec, 0);
-
 	list_for_each_entry_safe(netpage, _n, list, lru) {
 		list_del(&netpage->lru);
 
@@ -534,9 +526,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 		backpage = newpage;
 		newpage = NULL;
 
-		page_cache_get(backpage);
-		if (!pagevec_add(&lru_pvec, backpage))
-			__pagevec_lru_add_file(&lru_pvec);
+		lru_cache_add_file(backpage);
 
 	reread_backing_page:
 		ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -559,9 +549,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 			goto nomem;
 		}
 
-		page_cache_get(netpage);
-		if (!pagevec_add(&lru_pvec, netpage))
-			__pagevec_lru_add_file(&lru_pvec);
+		lru_cache_add_file(netpage);
 
 		/* install a monitor */
 		page_cache_get(netpage);
@@ -643,9 +631,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 
 		fscache_mark_page_cached(op, netpage);
 
-		page_cache_get(netpage);
-		if (!pagevec_add(&lru_pvec, netpage))
-			__pagevec_lru_add_file(&lru_pvec);
+		lru_cache_add_file(netpage);
 
 		/* the netpage is unlocked and marked up to date here */
 		fscache_end_io(op, netpage, 0);
@@ -661,8 +647,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 
 out:
 	/* tidy up */
-	pagevec_lru_add_file(&lru_pvec);
-
 	if (newpage)
 		page_cache_release(newpage);
 	if (netpage)
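
Every hunk in this file performs the same conversion: the open-coded pagevec batching collapses into a single lru_cache_add_file() call. A hedged sketch of the before/after, assuming "page" is a page the caller already holds:

	/* old idiom: take an extra reference for the pagevec and
	 * drain it to the LRU in batches */
	page_cache_get(page);
	if (!pagevec_add(&lru_pvec, page))
		__pagevec_lru_add_file(&lru_pvec);
	/* ... and on every exit path ... */
	pagevec_lru_add_file(&lru_pvec);

	/* new idiom: lru_cache_add_file() takes its own page reference
	 * and batches internally, so the local pagevec (and the extra
	 * page_cache_get()) go away entirely */
	lru_cache_add_file(page);
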
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 2b6cb23dd14e..1d1c41f1014d 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
 	mutex_lock(&buffer->mutex);
 	len = fill_write_buffer(buffer, buf, count);
 	if (len > 0)
-		len = flush_write_buffer(file->f_path.dentry, buffer, count);
+		len = flush_write_buffer(file->f_path.dentry, buffer, len);
 	if (len > 0)
 		*ppos += len;
 	mutex_unlock(&buffer->mutex);
diff --git a/fs/coredump.c b/fs/coredump.c
index dafafbafa731..72f816d6cad9 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -45,69 +45,79 @@
 #include <trace/events/sched.h>
 
 int core_uses_pid;
-char core_pattern[CORENAME_MAX_SIZE] = "core";
 unsigned int core_pipe_limit;
+char core_pattern[CORENAME_MAX_SIZE] = "core";
+static int core_name_size = CORENAME_MAX_SIZE;
 
 struct core_name {
 	char *corename;
 	int used, size;
 };
-static atomic_t call_count = ATOMIC_INIT(1);
 
 /* The maximal length of core_pattern is also specified in sysctl.c */
 
-static int expand_corename(struct core_name *cn)
+static int expand_corename(struct core_name *cn, int size)
 {
-	char *old_corename = cn->corename;
-
-	cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
-	cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
+	char *corename = krealloc(cn->corename, size, GFP_KERNEL);
 
-	if (!cn->corename) {
-		kfree(old_corename);
+	if (!corename)
 		return -ENOMEM;
-	}
 
+	if (size > core_name_size) /* racy but harmless */
+		core_name_size = size;
+
+	cn->size = ksize(corename);
+	cn->corename = corename;
 	return 0;
 }
 
+static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
+{
+	int free, need;
+
+again:
+	free = cn->size - cn->used;
+	need = vsnprintf(cn->corename + cn->used, free, fmt, arg);
+	if (need < free) {
+		cn->used += need;
+		return 0;
+	}
+
+	if (!expand_corename(cn, cn->size + need - free + 1))
+		goto again;
+
+	return -ENOMEM;
+}
+
 static int cn_printf(struct core_name *cn, const char *fmt, ...)
 {
-	char *cur;
-	int need;
-	int ret;
 	va_list arg;
+	int ret;
 
 	va_start(arg, fmt);
-	need = vsnprintf(NULL, 0, fmt, arg);
+	ret = cn_vprintf(cn, fmt, arg);
 	va_end(arg);
 
-	if (likely(need < cn->size - cn->used - 1))
-		goto out_printf;
+	return ret;
+}
 
-	ret = expand_corename(cn);
-	if (ret)
-		goto expand_fail;
+static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
+{
+	int cur = cn->used;
+	va_list arg;
+	int ret;
 
-out_printf:
-	cur = cn->corename + cn->used;
 	va_start(arg, fmt);
-	vsnprintf(cur, need + 1, fmt, arg);
+	ret = cn_vprintf(cn, fmt, arg);
 	va_end(arg);
-	cn->used += need;
-	return 0;
 
-expand_fail:
+	for (; cur < cn->used; ++cur) {
+		if (cn->corename[cur] == '/')
+			cn->corename[cur] = '!';
+	}
 	return ret;
 }
 
-static void cn_escape(char *str)
-{
-	for (; *str; str++)
-		if (*str == '/')
-			*str = '!';
-}
-
 static int cn_print_exe_file(struct core_name *cn)
 {
 	struct file *exe_file;
@@ -115,12 +125,8 @@ static int cn_print_exe_file(struct core_name *cn)
 	int ret;
 
 	exe_file = get_mm_exe_file(current->mm);
-	if (!exe_file) {
-		char *commstart = cn->corename + cn->used;
-		ret = cn_printf(cn, "%s (path unknown)", current->comm);
-		cn_escape(commstart);
-		return ret;
-	}
+	if (!exe_file)
+		return cn_esc_printf(cn, "%s (path unknown)", current->comm);
 
 	pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
 	if (!pathbuf) {
@@ -134,9 +140,7 @@ static int cn_print_exe_file(struct core_name *cn)
 		goto free_buf;
 	}
 
-	cn_escape(path);
-
-	ret = cn_printf(cn, "%s", path);
+	ret = cn_esc_printf(cn, "%s", path);
 
 free_buf:
 	kfree(pathbuf);
@@ -157,19 +161,19 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 	int pid_in_pattern = 0;
 	int err = 0;
 
-	cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
-	cn->corename = kmalloc(cn->size, GFP_KERNEL);
 	cn->used = 0;
-
-	if (!cn->corename)
+	cn->corename = NULL;
+	if (expand_corename(cn, core_name_size))
 		return -ENOMEM;
+	cn->corename[0] = '\0';
+
+	if (ispipe)
+		++pat_ptr;
 
 	/* Repeat as long as we have more pattern to process and more output
 	   space */
 	while (*pat_ptr) {
 		if (*pat_ptr != '%') {
-			if (*pat_ptr == 0)
-				goto out;
 			err = cn_printf(cn, "%c", *pat_ptr++);
 		} else {
 			switch (*++pat_ptr) {
@@ -210,22 +214,16 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 				break;
 			}
 			/* hostname */
-			case 'h': {
-				char *namestart = cn->corename + cn->used;
+			case 'h':
 				down_read(&uts_sem);
-				err = cn_printf(cn, "%s",
+				err = cn_esc_printf(cn, "%s",
 					      utsname()->nodename);
 				up_read(&uts_sem);
-				cn_escape(namestart);
 				break;
-			}
 			/* executable */
-			case 'e': {
-				char *commstart = cn->corename + cn->used;
-				err = cn_printf(cn, "%s", current->comm);
-				cn_escape(commstart);
+			case 'e':
+				err = cn_esc_printf(cn, "%s", current->comm);
 				break;
-			}
 			case 'E':
 				err = cn_print_exe_file(cn);
 				break;
@@ -244,6 +242,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 			return err;
 	}
 
+out:
 	/* Backward compatibility with core_uses_pid:
 	 *
 	 * If core_pattern does not include a %p (as is the default)
@@ -254,7 +253,6 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 		if (err)
 			return err;
 	}
-out:
 	return ispipe;
 }
 
260 258
@@ -549,7 +547,7 @@ void do_coredump(siginfo_t *siginfo)
 	if (ispipe < 0) {
 		printk(KERN_WARNING "format_corename failed\n");
 		printk(KERN_WARNING "Aborting core\n");
-		goto fail_corename;
+		goto fail_unlock;
 	}
 
 	if (cprm.limit == 1) {
@@ -584,7 +582,7 @@ void do_coredump(siginfo_t *siginfo)
 			goto fail_dropcount;
 		}
 
-		helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
+		helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL);
 		if (!helper_argv) {
 			printk(KERN_WARNING "%s failed to allocate memory\n",
 			       __func__);
@@ -601,7 +599,7 @@ void do_coredump(siginfo_t *siginfo)
 
 		argv_free(helper_argv);
 		if (retval) {
-			printk(KERN_INFO "Core dump to %s pipe failed\n",
+			printk(KERN_INFO "Core dump to |%s pipe failed\n",
 			       cn.corename);
 			goto close_fail;
 		}
@@ -669,7 +667,6 @@ fail_dropcount:
 	atomic_dec(&core_dump_count);
 fail_unlock:
 	kfree(cn.corename);
-fail_corename:
 	coredump_finish(mm, core_dumped);
 	revert_creds(old_cred);
 fail_creds:
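
The heart of this rewrite is cn_vprintf(): instead of sizing the buffer up front from a global call counter, it prints into whatever space is left and, on truncation, grows the buffer by exactly the reported shortfall and retries. A runnable userspace illustration of the same pattern, with illustrative names (va_copy() is used because portable C may not reuse a va_list after vsnprintf()):

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* start with buf = NULL, used = size = 0; realloc(NULL, n) acts as malloc */
struct name_buf { char *buf; int used, size; };

static int buf_vprintf(struct name_buf *n, const char *fmt, va_list ap)
{
	for (;;) {
		int free_space = n->size - n->used;
		va_list cp;
		char *p;
		int need;

		va_copy(cp, ap);
		need = vsnprintf(n->buf + n->used, free_space, fmt, cp);
		va_end(cp);
		if (need < free_space) {	/* it fit, including the NUL */
			n->used += need;
			return 0;
		}
		/* grow by the shortfall plus the terminating NUL and retry */
		p = realloc(n->buf, n->size + need - free_space + 1);
		if (!p)
			return -1;
		n->buf = p;
		n->size += need - free_space + 1;
	}
}
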
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 0cff4434880d..9ad17b15b454 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1977,8 +1977,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 			return -EINVAL;
 		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
 			return -EFAULT;
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+		sigsaved = current->blocked;
+		set_current_blocked(&ksigmask);
 	}
 
 	error = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -1995,7 +1995,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 			       sizeof(sigsaved));
 			set_restore_sigmask();
 		} else
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+			set_current_blocked(&sigsaved);
 	}
 
 	return error;
@@ -2022,8 +2022,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 		if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
 			return -EFAULT;
 		sigset_from_compat(&ksigmask, &csigmask);
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+		sigsaved = current->blocked;
+		set_current_blocked(&ksigmask);
 	}
 
 	err = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -2040,7 +2040,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 			       sizeof(sigsaved));
 			set_restore_sigmask();
 		} else
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+			set_current_blocked(&sigsaved);
 	}
 
 	return err;
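
Both the native and compat paths now save and restore the mask the same way. The explicit sigdelsetmask() could be dropped because set_current_blocked() refuses to block SIGKILL/SIGSTOP itself; on the signal path the original mask is parked in saved_sigmask instead of being restored immediately, so a handler runs with the caller-supplied mask. A hedged kernel-style sketch of the shape, where wait_for_events() stands in for the actual wait:

	sigset_t ksigmask, sigsaved;
	int error;

	/* install the caller-supplied mask for the duration of the wait;
	 * set_current_blocked() silently drops SIGKILL/SIGSTOP */
	sigsaved = current->blocked;
	set_current_blocked(&ksigmask);

	error = wait_for_events();		/* stand-in for the real wait */

	/* restore the original mask only on the non-signal path */
	if (error != -EINTR)
		set_current_blocked(&sigsaved);
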
diff --git a/fs/exec.c b/fs/exec.c
index 03b907cfd765..9c73def87642 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -932,6 +932,7 @@ static int de_thread(struct task_struct *tsk)
 	 *         also take its birthdate (always earlier than our own).
 	 */
 	tsk->start_time = leader->start_time;
+	tsk->real_start_time = leader->real_start_time;
 
 	BUG_ON(!same_thread_group(leader, tsk));
 	BUG_ON(has_group_leader_pid(tsk));
@@ -947,9 +948,8 @@ static int de_thread(struct task_struct *tsk)
 		 * Note: The old leader also uses this pid until release_task
 		 *       is called.  Odd but simple and correct.
 		 */
-		detach_pid(tsk, PIDTYPE_PID);
 		tsk->pid = leader->pid;
-		attach_pid(tsk, PIDTYPE_PID, task_pid(leader));
+		change_pid(tsk, PIDTYPE_PID, task_pid(leader));
 		transfer_pid(leader, tsk, PIDTYPE_PGID);
 		transfer_pid(leader, tsk, PIDTYPE_SID);
 
@@ -1465,7 +1465,6 @@ static int do_execve_common(const char *filename,
 	struct files_struct *displaced;
 	bool clear_in_exec;
 	int retval;
-	const struct cred *cred = current_cred();
 
 	/*
 	 * We move the actual failure in case of RLIMIT_NPROC excess from
@@ -1474,7 +1473,7 @@ static int do_execve_common(const char *filename,
 	 * whether NPROC limit is still exceeded.
 	 */
 	if ((current->flags & PF_NPROC_EXCEEDED) &&
-	    atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
+	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
 		retval = -EAGAIN;
 		goto out_ret;
 	}
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f67668f724ba..2bd85486b879 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1985,6 +1985,7 @@ static const struct address_space_operations ext3_ordered_aops = {
 	.direct_IO		= ext3_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
 
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 359d307b5507..628e22a5a543 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
 		va_start(args, fmt);
 		vaf.fmt = fmt;
 		vaf.va = &args;
-		printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf);
+		fat_msg(sb, KERN_ERR, "error, %pV", &vaf);
 		va_end(args);
 	}
 
@@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
 		panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
 	else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
 		sb->s_flags |= MS_RDONLY;
-		printk(KERN_ERR "FAT-fs (%s): Filesystem has been "
-				"set read-only\n", sb->s_id);
+		fat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
 	}
 }
 EXPORT_SYMBOL_GPL(__fat_fs_error);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9a0cdde14a08..0b578598c6ac 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -785,7 +785,7 @@ static const struct super_operations fuse_super_operations = {
 static void sanitize_global_limit(unsigned *limit)
 {
 	if (*limit == 0)
-		*limit = ((num_physpages << PAGE_SHIFT) >> 13) /
+		*limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
 			 sizeof(struct fuse_req);
 
 	if (*limit >= 1 << 16)
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index fc90ab11c340..4338ff32959d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -69,7 +69,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
 	struct dentry *parent;
 	char *root, *name;
 	const char *seg_name;
-	int len, seg_len;
+	int len, seg_len, root_len;
 
 	len = 0;
 	parent = dentry;
@@ -81,7 +81,8 @@ static char *dentry_name(struct dentry *dentry, int extra)
 	}
 
 	root = "proc";
-	len += strlen(root);
+	root_len = strlen(root);
+	len += root_len;
 	name = kmalloc(len + extra + 1, GFP_KERNEL);
 	if (name == NULL)
 		return NULL;
@@ -91,7 +92,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
 	while (parent->d_parent != parent) {
 		if (is_pid(parent)) {
 			seg_name = "pid";
-			seg_len = strlen("pid");
+			seg_len = strlen(seg_name);
 		}
 		else {
 			seg_name = parent->d_name.name;
@@ -100,10 +101,10 @@ static char *dentry_name(struct dentry *dentry, int extra)
 
 		len -= seg_len + 1;
 		name[len] = '/';
-		strncpy(&name[len + 1], seg_name, seg_len);
+		memcpy(&name[len + 1], seg_name, seg_len);
 		parent = parent->d_parent;
 	}
-	strncpy(name, root, strlen(root));
+	memcpy(name, root, root_len);
 	return name;
 }
 
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index a2aa97d45670..10d6c41aecad 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -305,7 +305,7 @@ static int lockd_start_svc(struct svc_serv *serv)
 	svc_sock_update_bufs(serv);
 	serv->sv_maxconn = nlm_max_connections;
 
-	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
+	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name);
 	if (IS_ERR(nlmsvc_task)) {
 		error = PTR_ERR(nlmsvc_task);
 		printk(KERN_WARNING
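
This one-liner, and the matching changes in fs/nfs/callback.c and fs/nfs/nfs4state.c below, close the same hole: kthread_run()'s name argument is a printf-style format, so a name that is not a string literal must be passed as data behind "%s". A hedged sketch of the bug class, with illustrative names:

	/* unsafe: if name ever contains '%', it is parsed as a format */
	task = kthread_run(threadfn, data, name);

	/* safe: name is treated as inert data */
	task = kthread_run(threadfn, data, "%s", name);
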
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index ee24df5af1f9..3c5dd55d284c 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 	/* we do not support files bigger than 4GB... We eventually
 	   supports just 4GB... */
-	if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff
+	if (vma_pages(vma) + vma->vm_pgoff
 	   > (1U << (32 - PAGE_SHIFT)))
 		return -EFBIG;
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index cff089a412c7..da6a43d19aa3 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -211,7 +211,6 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
 	struct svc_rqst *rqstp;
 	int (*callback_svc)(void *vrqstp);
 	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
-	char svc_name[12];
 	int ret;
 
 	nfs_callback_bc_serv(minorversion, xprt, serv);
@@ -235,10 +234,10 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
 
 	svc_sock_update_bufs(serv);
 
-	sprintf(svc_name, "nfsv4.%u-svc", minorversion);
 	cb_info->serv = serv;
 	cb_info->rqst = rqstp;
-	cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name);
+	cb_info->task = kthread_run(callback_svc, cb_info->rqst,
+				    "nfsv4.%u-svc", minorversion);
 	if (IS_ERR(cb_info->task)) {
 		ret = PTR_ERR(cb_info->task);
 		svc_exit_thread(cb_info->rqst);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5d051419527b..d7ed697133f0 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,6 +33,7 @@
 #include <linux/pagevec.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/swap.h>
 #include <linux/sched.h>
 #include <linux/kmemleak.h>
 #include <linux/xattr.h>
@@ -1758,7 +1759,6 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
  */
 int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 {
-	struct pagevec lru_pvec;
 	struct page *page;
 	char *kaddr;
 	struct iattr attr;
@@ -1798,11 +1798,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	 * No big deal if we can't add this page to the page cache here.
 	 * READLINK will get the missing page from the server if needed.
 	 */
-	pagevec_init(&lru_pvec, 0);
-	if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
+	if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0,
 							GFP_KERNEL)) {
-		pagevec_add(&lru_pvec, page);
-		pagevec_lru_add_file(&lru_pvec);
 		SetPageUptodate(page);
 		unlock_page(page);
 	} else
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6b4a79f4ad1d..94e94bd11aae 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -495,6 +495,35 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 	return nfs_fscache_release_page(page, gfp);
 }
 
+static void nfs_check_dirty_writeback(struct page *page,
+				bool *dirty, bool *writeback)
+{
+	struct nfs_inode *nfsi;
+	struct address_space *mapping = page_file_mapping(page);
+
+	if (!mapping || PageSwapCache(page))
+		return;
+
+	/*
+	 * Check if an unstable page is currently being committed and
+	 * if so, have the VM treat it as if the page is under writeback
+	 * so it will not block due to pages that will shortly be freeable.
+	 */
+	nfsi = NFS_I(mapping->host);
+	if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
+		*writeback = true;
+		return;
+	}
+
+	/*
+	 * If PagePrivate() is set, then the page is not freeable and as the
+	 * inode is not being committed, it's not going to be cleaned in the
+	 * near future so treat it as dirty
+	 */
+	if (PagePrivate(page))
+		*dirty = true;
+}
+
 /*
  * Attempt to clear the private state associated with a page when an error
  * occurs that requires the cached contents of an inode to be written back or
@@ -542,6 +571,7 @@ const struct address_space_operations nfs_file_aops = {
 	.direct_IO = nfs_direct_IO,
 	.migratepage = nfs_migrate_page,
 	.launder_page = nfs_launder_page,
+	.is_dirty_writeback = nfs_check_dirty_writeback,
 	.error_remove_page = generic_error_remove_page,
 #ifdef CONFIG_NFS_SWAP
 	.swap_activate = nfs_swap_activate,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index ff10b4aa534c..55418811a55a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1194,7 +1194,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
 	snprintf(buf, sizeof(buf), "%s-manager",
 			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
 	rcu_read_unlock();
-	task = kthread_run(nfs4_run_state_manager, clp, buf);
+	task = kthread_run(nfs4_run_state_manager, clp, "%s", buf);
 	if (IS_ERR(task)) {
 		printk(KERN_ERR "%s: kthread_run: %ld\n",
 			__func__, PTR_ERR(task));
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index eed4d7b26249..741fd02e0444 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -398,6 +398,69 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
 }
 
 /**
+ * nilfs_palloc_count_desc_blocks - count descriptor blocks number
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: descriptor blocks number [out]
+ */
+static int nilfs_palloc_count_desc_blocks(struct inode *inode,
+					    unsigned long *desc_blocks)
+{
+	unsigned long blknum;
+	int ret;
+
+	ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum);
+	if (likely(!ret))
+		*desc_blocks = DIV_ROUND_UP(
+			blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block);
+	return ret;
+}
+
+/**
+ * nilfs_palloc_mdt_file_can_grow - check potential opportunity for
+ *					MDT file growing
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: known current descriptor blocks count
+ */
+static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode,
+						    unsigned long desc_blocks)
+{
+	return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) <
+			nilfs_palloc_groups_count(inode);
+}
+
+/**
+ * nilfs_palloc_count_max_entries - count max number of entries that can be
+ *					described by descriptor blocks count
+ * @inode: inode of metadata file using this allocator
+ * @nused: current number of used entries
+ * @nmaxp: max number of entries [out]
+ */
+int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp)
+{
+	unsigned long desc_blocks = 0;
+	u64 entries_per_desc_block, nmax;
+	int err;
+
+	err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks);
+	if (unlikely(err))
+		return err;
+
+	entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) *
+				nilfs_palloc_groups_per_desc_block(inode);
+	nmax = entries_per_desc_block * desc_blocks;
+
+	if (nused == nmax &&
+			nilfs_palloc_mdt_file_can_grow(inode, desc_blocks))
+		nmax += entries_per_desc_block;
+
+	if (nused > nmax)
+		return -ERANGE;
+
+	*nmaxp = nmax;
+	return 0;
+}
+
+/**
  * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
  * @inode: inode of metadata file using this allocator
  * @req: nilfs_palloc_req structure exchanged for the allocation
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index fb7238100548..4bd6451b5703 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -48,6 +48,8 @@ int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
 void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
 				   const struct buffer_head *, void *);
 
+int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *);
+
 /**
  * nilfs_palloc_req - persistent allocator request and reply
  * @pr_entry_nr: entry number (vblocknr or inode number)
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index d8e65bde083c..6548c7851b48 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -160,6 +160,28 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
 }
 
 /**
+ * nilfs_ifile_count_free_inodes - calculate free inodes count
+ * @ifile: ifile inode
+ * @nmaxinodes: current maximum of available inodes count [out]
+ * @nfreeinodes: free inodes count [out]
+ */
+int nilfs_ifile_count_free_inodes(struct inode *ifile,
+				   u64 *nmaxinodes, u64 *nfreeinodes)
+{
+	u64 nused;
+	int err;
+
+	*nmaxinodes = 0;
+	*nfreeinodes = 0;
+
+	nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count);
+	err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes);
+	if (likely(!err))
+		*nfreeinodes = *nmaxinodes - nused;
+	return err;
+}
+
+/**
  * nilfs_ifile_read - read or get ifile inode
  * @sb: super block instance
  * @root: root object
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 59b6f2b51df6..679674d13372 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
 int nilfs_ifile_delete_inode(struct inode *, ino_t);
 int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
 
+int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *);
+
 int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
 		     size_t inode_size, struct nilfs_inode *raw_inode,
 		     struct inode **inodep);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index bccfec8343c5..b1a5277cfd18 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -54,7 +54,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n)
 
 	inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
 	if (root)
-		atomic_add(n, &root->blocks_count);
+		atomic64_add(n, &root->blocks_count);
 }
 
 void nilfs_inode_sub_blocks(struct inode *inode, int n)
@@ -63,7 +63,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n)
 
 	inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
 	if (root)
-		atomic_sub(n, &root->blocks_count);
+		atomic64_sub(n, &root->blocks_count);
 }
 
 /**
@@ -369,7 +369,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
 		goto failed_ifile_create_inode;
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
-	atomic_inc(&root->inodes_count);
+	atomic64_inc(&root->inodes_count);
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -801,7 +801,7 @@ void nilfs_evict_inode(struct inode *inode)
 
 	ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
 	if (!ret)
-		atomic_dec(&ii->i_root->inodes_count);
+		atomic64_dec(&ii->i_root->inodes_count);
 
 	nilfs_clear_inode(inode);
 
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a5752a589932..bd88a7461063 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -835,9 +835,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 	raw_cp->cp_snapshot_list.ssl_next = 0;
 	raw_cp->cp_snapshot_list.ssl_prev = 0;
 	raw_cp->cp_inodes_count =
-		cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
+		cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count));
 	raw_cp->cp_blocks_count =
-		cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
+		cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count));
 	raw_cp->cp_nblk_inc =
 		cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
 	raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c7d1f9f18b09..1427de5ebf4d 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -554,8 +554,10 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 	if (err)
 		goto failed_bh;
 
-	atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
-	atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
+	atomic64_set(&root->inodes_count,
+		     le64_to_cpu(raw_cp->cp_inodes_count));
+	atomic64_set(&root->blocks_count,
+		     le64_to_cpu(raw_cp->cp_blocks_count));
 
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
 
@@ -609,6 +611,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	unsigned long overhead;
 	unsigned long nrsvblocks;
 	sector_t nfreeblocks;
+	u64 nmaxinodes, nfreeinodes;
 	int err;
 
 	/*
@@ -633,14 +636,34 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (unlikely(err))
 		return err;
 
+	err = nilfs_ifile_count_free_inodes(root->ifile,
+					&nmaxinodes, &nfreeinodes);
+	if (unlikely(err)) {
+		printk(KERN_WARNING
+			"NILFS warning: fail to count free inodes: err %d.\n",
+			err);
+		if (err == -ERANGE) {
+			/*
+			 * If nilfs_palloc_count_max_entries() returns
+			 * -ERANGE error code then we simply treat
+			 * curent inodes count as maximum possible and
+			 * zero as free inodes value.
+			 */
+			nmaxinodes = atomic64_read(&root->inodes_count);
+			nfreeinodes = 0;
+			err = 0;
+		} else
+			return err;
+	}
+
 	buf->f_type = NILFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = blocks - overhead;
 	buf->f_bfree = nfreeblocks;
 	buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
 		(buf->f_bfree - nrsvblocks) : 0;
-	buf->f_files = atomic_read(&root->inodes_count);
-	buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
+	buf->f_files = nmaxinodes;
+	buf->f_ffree = nfreeinodes;
 	buf->f_namelen = NILFS_NAME_LEN;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 41e6a04a561f..94c451ce6d24 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -764,8 +764,8 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
 	new->ifile = NULL;
 	new->nilfs = nilfs;
 	atomic_set(&new->count, 1);
-	atomic_set(&new->inodes_count, 0);
-	atomic_set(&new->blocks_count, 0);
+	atomic64_set(&new->inodes_count, 0);
+	atomic64_set(&new->blocks_count, 0);
 
 	rb_link_node(&new->rb_node, parent, p);
 	rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index be1267a34cea..de8cc53b4a5c 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -241,8 +241,8 @@ struct nilfs_root {
 	struct the_nilfs *nilfs;
 	struct inode *ifile;
 
-	atomic_t inodes_count;
-	atomic_t blocks_count;
+	atomic64_t inodes_count;
+	atomic64_t blocks_count;
 };
 
 /* Special checkpoint number */
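
The whole nilfs2 series above is mechanical fallout of this two-line type change: atomic_t is 32-bit on every architecture, so inode and block counts could wrap on large volumes, while the on-disk cp_inodes_count/cp_blocks_count fields are already 64-bit. The accessors rename one-for-one; a minimal sketch:

	atomic64_t count = ATOMIC64_INIT(0);
	long long val;

	atomic64_inc(&count);		/* was atomic_inc()  */
	atomic64_add(8, &count);	/* was atomic_add()  */
	val = atomic64_read(&count);	/* was atomic_read() */
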
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index b8a9d87231b1..17e6bdde96c5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5655,7 +5655,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 						       &ref_tree, NULL);
 		if (ret) {
 			mlog_errno(ret);
-			goto out;
+			goto bail;
 		}
 
 		ret = ocfs2_prepare_refcount_change_for_del(inode,
@@ -5666,7 +5666,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 							    &extra_blocks);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out;
+			goto bail;
 		}
 	}
 
@@ -5674,7 +5674,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 						 extra_blocks);
 	if (ret) {
 		mlog_errno(ret);
-		return ret;
+		goto bail;
 	}
 
 	mutex_lock(&tl_inode->i_mutex);
@@ -5734,7 +5734,7 @@ out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
 	mutex_unlock(&tl_inode->i_mutex);
-
+bail:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
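
The four hunks converge every failure in ocfs2_remove_btree_range() onto a single exit label so the allocation contexts are always freed; the bare "return ret" in the middle hunk had been leaking them. A hedged sketch of the shape, with hypothetical helpers:

int op(void)
{
	int ret;

	ret = prepare();		/* hypothetical */
	if (ret)
		goto bail;		/* lock not yet taken: skip "out" */

	mutex_lock(&lock);
	ret = step_one();		/* hypothetical */
	if (ret)
		goto out;
	ret = step_two();		/* hypothetical */
out:
	mutex_unlock(&lock);
bail:
	free_contexts();		/* runs on every path, fixing the leak */
	return ret;
}
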
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 42252bf64b51..5c1c864e81cc 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -176,7 +176,7 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
 	}
 }
 
-static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
+static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
 {
 	int ret = -1;
 
@@ -500,7 +500,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
 	}
 
 	atomic_inc(&write_wc->wc_num_reqs);
-	submit_bio(WRITE, bio);
+	submit_bio(WRITE_SYNC, bio);
 
 	status = 0;
 bail:
@@ -2271,7 +2271,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
 		if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
 			continue;
 
-		ret = o2hb_global_hearbeat_mode_set(i);
+		ret = o2hb_global_heartbeat_mode_set(i);
 		if (!ret)
 			printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
 			       o2hb_heartbeat_mode_desc[i]);
@@ -2304,7 +2304,7 @@ static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
 	NULL,
 };
 
-static struct configfs_item_operations o2hb_hearbeat_group_item_ops = {
+static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
 	.show_attribute		= o2hb_heartbeat_group_show,
 	.store_attribute	= o2hb_heartbeat_group_store,
 };
@@ -2316,7 +2316,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
 
 static struct config_item_type o2hb_heartbeat_group_type = {
 	.ct_group_ops	= &o2hb_heartbeat_group_group_ops,
-	.ct_item_ops	= &o2hb_hearbeat_group_item_ops,
+	.ct_item_ops	= &o2hb_heartbeat_group_item_ops,
 	.ct_attrs	= o2hb_heartbeat_group_attrs,
 	.ct_owner	= THIS_MODULE,
 };
@@ -2389,6 +2389,9 @@ static int o2hb_region_pin(const char *region_uuid)
 	assert_spin_locked(&o2hb_live_lock);
 
 	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		if (reg->hr_item_dropped)
+			continue;
+
 		uuid = config_item_name(&reg->hr_item);
 
 		/* local heartbeat */
@@ -2439,6 +2442,9 @@ static void o2hb_region_unpin(const char *region_uuid)
 	assert_spin_locked(&o2hb_live_lock);
 
 	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		if (reg->hr_item_dropped)
+			continue;
+
 		uuid = config_item_name(&reg->hr_item);
 		if (region_uuid) {
 			if (strcmp(region_uuid, uuid))
@@ -2654,6 +2660,9 @@ int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
 
 	p = region_uuids;
 	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		if (reg->hr_item_dropped)
+			continue;
+
 		mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
 		if (numregs < max_regions) {
 			memcpy(p, config_item_name(&reg->hr_item),
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index c19897d0fe14..1ec141e758d7 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -264,7 +264,7 @@ void o2quo_hb_still_up(u8 node)
 /* This is analogous to hb_up.  as a node's connection comes up we delay the
  * quorum decision until we see it heartbeating.  the hold will be droped in
  * hb_up or hb_down.  it might be perpetuated by con_err until hb_down.  if
- * it's already heartbeating we we might be dropping a hold that conn_up got.
+ * it's already heartbeating we might be dropping a hold that conn_up got.
  * */
 void o2quo_conn_up(u8 node)
 {
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa88bd8bcedc..d644dc611425 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -406,6 +406,9 @@ static void sc_kref_release(struct kref *kref)
 	sc->sc_node = NULL;
 
 	o2net_debug_del_sc(sc);
+
+	if (sc->sc_page)
+		__free_page(sc->sc_page);
 	kfree(sc);
 }
 
@@ -630,19 +633,19 @@ static void o2net_state_change(struct sock *sk)
 	state_change = sc->sc_state_change;
 
 	switch(sk->sk_state) {
-		/* ignore connecting sockets as they make progress */
-		case TCP_SYN_SENT:
-		case TCP_SYN_RECV:
-			break;
-		case TCP_ESTABLISHED:
-			o2net_sc_queue_work(sc, &sc->sc_connect_work);
-			break;
-		default:
-			printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
-			      " shutdown, state %d\n",
-			      SC_NODEF_ARGS(sc), sk->sk_state);
-			o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
-			break;
+	/* ignore connecting sockets as they make progress */
+	case TCP_SYN_SENT:
+	case TCP_SYN_RECV:
+		break;
+	case TCP_ESTABLISHED:
+		o2net_sc_queue_work(sc, &sc->sc_connect_work);
+		break;
+	default:
+		printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
+		       " shutdown, state %d\n",
+		       SC_NODEF_ARGS(sc), sk->sk_state);
+		o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+		break;
 	}
 out:
 	read_unlock(&sk->sk_callback_lock);
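
The sc_page fix in the first hunk follows the usual kref discipline: every resource the object owns is released exactly once, in the kref release callback, never at the call sites that drop references. A hedged sketch with hypothetical names:

struct conn {
	struct kref	ref;
	struct page	*page;		/* owned by the object */
};

static void conn_release(struct kref *kref)
{
	struct conn *c = container_of(kref, struct conn, ref);

	if (c->page)
		__free_page(c->page);	/* freed here and nowhere else */
	kfree(c);
}

/* callers simply drop their reference:
 *	kref_put(&c->ref, conn_release);
 */
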
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 975810b98492..47e67c2d228f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -178,6 +178,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
 				     lock->ml.node);
 		}
 	} else {
+		status = DLM_NORMAL;
 		dlm_lock_get(lock);
 		list_add_tail(&lock->list, &res->blocked);
 		kick_thread = 1;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index e68588e6b1e8..773bd32bfd8c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -55,9 +55,6 @@
 static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
 
 static int dlm_recovery_thread(void *data);
-void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
-int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
-void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
 static int dlm_do_recovery(struct dlm_ctxt *dlm);
 
 static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
@@ -789,7 +786,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
 					  u8 dead_node)
 {
 	struct dlm_lock_request lr;
-	enum dlm_status ret;
+	int ret;
 
 	mlog(0, "\n");
 
@@ -802,7 +799,6 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
 	lr.dead_node = dead_node;
 
 	// send message
-	ret = DLM_NOLOCKMGR;
 	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
 				 &lr, sizeof(lr), request_from, NULL);
 
@@ -2696,6 +2692,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2696 dlm->name, br->node_idx, br->dead_node, 2692 dlm->name, br->node_idx, br->dead_node,
2697 dlm->reco.dead_node, dlm->reco.new_master); 2693 dlm->reco.dead_node, dlm->reco.new_master);
2698 spin_unlock(&dlm->spinlock); 2694 spin_unlock(&dlm->spinlock);
2695 dlm_put(dlm);
2699 return -EAGAIN; 2696 return -EAGAIN;
2700 } 2697 }
2701 spin_unlock(&dlm->spinlock); 2698 spin_unlock(&dlm->spinlock);
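
The dlm_begin_reco_handler hunk adds the dlm_put() that was missing on the -EAGAIN path: the handler takes a reference on the dlm context on entry, so every early return must drop it. A self-contained sketch of that get/put discipline; ctx_get/ctx_put are hypothetical stand-ins for dlm_grab/dlm_put:

#include <stdio.h>

struct ctx { int refcount; };

static void ctx_get(struct ctx *c)
{
	c->refcount++;
}

static void ctx_put(struct ctx *c)
{
	if (--c->refcount == 0)
		printf("last reference dropped, freeing\n");
}

static int handler(struct ctx *c, int busy)
{
	ctx_get(c);
	if (busy) {
		ctx_put(c);	/* the fix: don't leak on the early return */
		return -11;	/* -EAGAIN */
	}
	/* ... do the real work ... */
	ctx_put(c);
	return 0;
}

int main(void)
{
	struct ctx c = { .refcount = 1 };

	handler(&c, 1);		/* early-return path, reference balanced */
	ctx_put(&c);
	return 0;
}
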
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index a3385b63ff5e..96f9ac237e86 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -200,7 +200,6 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
200 200
201static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) 201static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
202{ 202{
203 atomic_set(&osb->needs_checkpoint, 1);
204 wake_up(&osb->checkpoint_event); 203 wake_up(&osb->checkpoint_event);
205} 204}
206 205
@@ -538,7 +537,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
538 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); 537 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
539 538
540 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + 539 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
541 ocfs2_quota_trans_credits(sb); 540 ocfs2_quota_trans_credits(sb) + bits_wanted;
542} 541}
543 542
544static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 543static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
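
The journal.h hunk folds bits_wanted into the extend-credit sum, so a transaction that allocates many bits no longer under-reserves journal credits. A back-of-the-envelope check with made-up values; every term below is a hypothetical stand-in, not ocfs2's real numbers:

#include <stdio.h>

int main(void)
{
	int bitmap_blocks = 2;
	int sysfile_bitmap_blocks = 1;
	int extent_blocks = 3;		/* 1 + 1 + tree depth */
	int quota_credits = 4;
	int bits_wanted = 8;		/* bits requested from the allocator */

	int old_credits = bitmap_blocks + sysfile_bitmap_blocks +
			  extent_blocks + quota_credits;
	int new_credits = old_credits + bits_wanted;

	printf("old=%d new=%d\n", old_credits, new_credits);	/* 10 vs 18 */
	return 0;
}
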
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b4a5cdf9dbc5..be3f8676a438 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -522,7 +522,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
522 522
523 fe->i_last_eb_blk = 0; 523 fe->i_last_eb_blk = 0;
524 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); 524 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
525 le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); 525 fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
526 fe->i_atime = fe->i_ctime = fe->i_mtime = 526 fe->i_atime = fe->i_ctime = fe->i_mtime =
527 cpu_to_le64(CURRENT_TIME.tv_sec); 527 cpu_to_le64(CURRENT_TIME.tv_sec);
528 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = 528 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
@@ -773,7 +773,7 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry)
773 return ret; 773 return ret;
774} 774}
775 775
776static inline int inode_is_unlinkable(struct inode *inode) 776static inline int ocfs2_inode_is_unlinkable(struct inode *inode)
777{ 777{
778 if (S_ISDIR(inode->i_mode)) { 778 if (S_ISDIR(inode->i_mode)) {
779 if (inode->i_nlink == 2) 779 if (inode->i_nlink == 2)
@@ -791,6 +791,7 @@ static int ocfs2_unlink(struct inode *dir,
791{ 791{
792 int status; 792 int status;
793 int child_locked = 0; 793 int child_locked = 0;
794 bool is_unlinkable = false;
794 struct inode *inode = dentry->d_inode; 795 struct inode *inode = dentry->d_inode;
795 struct inode *orphan_dir = NULL; 796 struct inode *orphan_dir = NULL;
796 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 797 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -865,7 +866,7 @@ static int ocfs2_unlink(struct inode *dir,
865 goto leave; 866 goto leave;
866 } 867 }
867 868
868 if (inode_is_unlinkable(inode)) { 869 if (ocfs2_inode_is_unlinkable(inode)) {
869 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 870 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
870 OCFS2_I(inode)->ip_blkno, 871 OCFS2_I(inode)->ip_blkno,
871 orphan_name, &orphan_insert); 872 orphan_name, &orphan_insert);
@@ -873,6 +874,7 @@ static int ocfs2_unlink(struct inode *dir,
873 mlog_errno(status); 874 mlog_errno(status);
874 goto leave; 875 goto leave;
875 } 876 }
877 is_unlinkable = true;
876 } 878 }
877 879
878 handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb)); 880 handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
@@ -892,15 +894,6 @@ static int ocfs2_unlink(struct inode *dir,
892 894
893 fe = (struct ocfs2_dinode *) fe_bh->b_data; 895 fe = (struct ocfs2_dinode *) fe_bh->b_data;
894 896
895 if (inode_is_unlinkable(inode)) {
896 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
897 &orphan_insert, orphan_dir);
898 if (status < 0) {
899 mlog_errno(status);
900 goto leave;
901 }
902 }
903
904 /* delete the name from the parent dir */ 897 /* delete the name from the parent dir */
905 status = ocfs2_delete_entry(handle, dir, &lookup); 898 status = ocfs2_delete_entry(handle, dir, &lookup);
906 if (status < 0) { 899 if (status < 0) {
@@ -923,6 +916,14 @@ static int ocfs2_unlink(struct inode *dir,
923 mlog_errno(status); 916 mlog_errno(status);
924 if (S_ISDIR(inode->i_mode)) 917 if (S_ISDIR(inode->i_mode))
925 inc_nlink(dir); 918 inc_nlink(dir);
919 goto leave;
920 }
921
922 if (is_unlinkable) {
923 status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
924 orphan_name, &orphan_insert, orphan_dir);
925 if (status < 0)
926 mlog_errno(status);
926 } 927 }
927 928
928leave: 929leave:
@@ -2012,6 +2013,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2012 goto leave; 2013 goto leave;
2013 } 2014 }
2014 2015
2016 /*
2017 * We're going to journal the change of i_flags and i_orphaned_slot.
2018 * It's safe anyway, though some callers may duplicate the journaling.
2019 * Journaling within the function just makes the logic look more
2020 * straightforward.
2021 */
2022 status = ocfs2_journal_access_di(handle,
2023 INODE_CACHE(inode),
2024 fe_bh,
2025 OCFS2_JOURNAL_ACCESS_WRITE);
2026 if (status < 0) {
2027 mlog_errno(status);
2028 goto leave;
2029 }
2030
2015 /* we're a cluster, and nlink can change on disk from 2031 /* we're a cluster, and nlink can change on disk from
2016 * underneath us... */ 2032 * underneath us... */
2017 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2033 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
@@ -2026,25 +2042,10 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2026 orphan_dir_bh, lookup); 2042 orphan_dir_bh, lookup);
2027 if (status < 0) { 2043 if (status < 0) {
2028 mlog_errno(status); 2044 mlog_errno(status);
2029 goto leave; 2045 goto rollback;
2030 }
2031
2032 /*
2033 * We're going to journal the change of i_flags and i_orphaned_slot.
2034 * It's safe anyway, though some callers may duplicate the journaling.
2035 * Journaling within the function just makes the logic look more
2036 * straightforward.
2037 */
2038 status = ocfs2_journal_access_di(handle,
2039 INODE_CACHE(inode),
2040 fe_bh,
2041 OCFS2_JOURNAL_ACCESS_WRITE);
2042 if (status < 0) {
2043 mlog_errno(status);
2044 goto leave;
2045 } 2046 }
2046 2047
2047 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 2048 fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
2048 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; 2049 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
2049 2050
2050 /* Record which orphan dir our inode now resides 2051 /* Record which orphan dir our inode now resides
@@ -2057,11 +2058,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2057 trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno, 2058 trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
2058 osb->slot_num); 2059 osb->slot_num);
2059 2060
2061rollback:
2062 if (status < 0) {
2063 if (S_ISDIR(inode->i_mode))
2064 ocfs2_add_links_count(orphan_fe, -1);
2065 set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
2066 }
2067
2060leave: 2068leave:
2061 brelse(orphan_dir_bh); 2069 brelse(orphan_dir_bh);
2062 2070
2063 if (status)
2064 mlog_errno(status);
2065 return status; 2071 return status;
2066} 2072}
2067 2073
@@ -2434,7 +2440,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2434 } 2440 }
2435 2441
2436 di = (struct ocfs2_dinode *)di_bh->b_data; 2442 di = (struct ocfs2_dinode *)di_bh->b_data;
2437 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); 2443 di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);
2438 di->i_orphaned_slot = 0; 2444 di->i_orphaned_slot = 0;
2439 set_nlink(inode, 1); 2445 set_nlink(inode, 1);
2440 ocfs2_set_links_count(di, inode->i_nlink); 2446 ocfs2_set_links_count(di, inode->i_nlink);
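
Two reorderings in namei.c are worth spelling out: ocfs2_unlink now adds the inode to the orphan directory only after the directory entry has been deleted successfully, and ocfs2_orphan_add takes journal access to the inode before touching the orphan directory, rolling back the orphan dir's link count if indexing fails. A compact sketch of that prepare/modify/rollback shape, with illustrative names rather than the ocfs2 API:

#include <stdio.h>

static int journal_access(const char *what, int fail)
{
	if (fail) {
		printf("journal access to %s failed\n", what);
		return -5;	/* -EIO */
	}
	return 0;
}

static int orphan_add(int fail_index_step)
{
	int status;
	int nlink = 2;		/* orphan dir link count */

	/* moved up: get journal access before making any change */
	status = journal_access("inode", 0);
	if (status < 0)
		return status;

	nlink++;		/* speculative link-count bump */
	status = journal_access("orphan dir index", fail_index_step);
	if (status < 0)
		goto rollback;

	printf("inode orphaned, orphan dir nlink=%d\n", nlink);
	return 0;

rollback:
	nlink--;		/* undo the speculative bump */
	printf("rolled back, orphan dir nlink=%d\n", nlink);
	return status;
}

int main(void)
{
	orphan_add(0);
	orphan_add(1);
	return 0;
}
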
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d355e6e36b36..3a903470c794 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -347,7 +347,6 @@ struct ocfs2_super
347 struct task_struct *recovery_thread_task; 347 struct task_struct *recovery_thread_task;
348 int disable_recovery; 348 int disable_recovery;
349 wait_queue_head_t checkpoint_event; 349 wait_queue_head_t checkpoint_event;
350 atomic_t needs_checkpoint;
351 struct ocfs2_journal *journal; 350 struct ocfs2_journal *journal;
352 unsigned long osb_commit_interval; 351 unsigned long osb_commit_interval;
353 352
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index b7e74b580c0f..5397c07ce608 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1422,7 +1422,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1422 int status; 1422 int status;
1423 /* there is a really tiny chance the journal calls could fail, 1423 /* there is a really tiny chance the journal calls could fail,
1424 * but we wouldn't want inconsistent blocks in *any* case. */ 1424 * but we wouldn't want inconsistent blocks in *any* case. */
1425 u64 fe_ptr, bg_ptr, prev_bg_ptr; 1425 u64 bg_ptr, prev_bg_ptr;
1426 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 1426 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1427 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 1427 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1428 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; 1428 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
@@ -1437,51 +1437,44 @@ static int ocfs2_relink_block_group(handle_t *handle,
1437 (unsigned long long)le64_to_cpu(bg->bg_blkno), 1437 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1438 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); 1438 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1439 1439
1440 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1441 bg_ptr = le64_to_cpu(bg->bg_next_group); 1440 bg_ptr = le64_to_cpu(bg->bg_next_group);
1442 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); 1441 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1443 1442
1444 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1443 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1445 prev_bg_bh, 1444 prev_bg_bh,
1446 OCFS2_JOURNAL_ACCESS_WRITE); 1445 OCFS2_JOURNAL_ACCESS_WRITE);
1447 if (status < 0) { 1446 if (status < 0)
1448 mlog_errno(status); 1447 goto out;
1449 goto out_rollback;
1450 }
1451 1448
1452 prev_bg->bg_next_group = bg->bg_next_group; 1449 prev_bg->bg_next_group = bg->bg_next_group;
1453 ocfs2_journal_dirty(handle, prev_bg_bh); 1450 ocfs2_journal_dirty(handle, prev_bg_bh);
1454 1451
1455 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1452 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1456 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1453 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1457 if (status < 0) { 1454 if (status < 0)
1458 mlog_errno(status); 1455 goto out_rollback_prev_bg;
1459 goto out_rollback;
1460 }
1461 1456
1462 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1457 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1463 ocfs2_journal_dirty(handle, bg_bh); 1458 ocfs2_journal_dirty(handle, bg_bh);
1464 1459
1465 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1460 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1466 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1461 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1467 if (status < 0) { 1462 if (status < 0)
1468 mlog_errno(status); 1463 goto out_rollback_bg;
1469 goto out_rollback;
1470 }
1471 1464
1472 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1465 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1473 ocfs2_journal_dirty(handle, fe_bh); 1466 ocfs2_journal_dirty(handle, fe_bh);
1474 1467
1475out_rollback: 1468out:
1476 if (status < 0) { 1469 if (status < 0)
1477 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1478 bg->bg_next_group = cpu_to_le64(bg_ptr);
1479 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1480 }
1481
1482 if (status)
1483 mlog_errno(status); 1470 mlog_errno(status);
1484 return status; 1471 return status;
1472
1473out_rollback_bg:
1474 bg->bg_next_group = cpu_to_le64(bg_ptr);
1475out_rollback_prev_bg:
1476 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1477 goto out;
1485} 1478}
1486 1479
1487static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 1480static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
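
The suballoc.c rewrite replaces one catch-all rollback block with staged labels, so each failure undoes only the steps that actually ran; fe_ptr no longer needs saving because the chain record is modified last, after the final journal-access call can no longer fail. A self-contained illustration of the staged-goto unwind; the step names are stand-ins for the journal-access calls:

#include <stdio.h>

static int step(const char *name, int fail)
{
	if (fail) {
		printf("%s failed\n", name);
		return -5;	/* -EIO */
	}
	printf("%s ok\n", name);
	return 0;
}

static int relink(int fail_at)
{
	int status;
	int prev = 1, cur = 2;	/* saved next-group pointers */

	status = step("journal prev_bg", fail_at == 1);
	if (status < 0)
		goto out;
	prev = 99;		/* modify prev_bg */

	status = step("journal bg", fail_at == 2);
	if (status < 0)
		goto out_rollback_prev_bg;
	cur = 99;		/* modify bg */

	status = step("journal fe", fail_at == 3);
	if (status < 0)
		goto out_rollback_bg;

	return 0;		/* success: nothing to unwind */

out_rollback_bg:
	cur = 2;		/* restore bg->bg_next_group */
out_rollback_prev_bg:
	prev = 1;		/* restore prev_bg->bg_next_group */
out:
	printf("unwound: prev=%d cur=%d\n", prev, cur);
	return status;
}

int main(void)
{
	relink(0);
	relink(3);
	return 0;
}
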
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 01b85165552b..854d80955bf8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -286,10 +286,9 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
286 spin_unlock(&osb->osb_lock); 286 spin_unlock(&osb->osb_lock);
287 287
288 out += snprintf(buf + out, len - out, 288 out += snprintf(buf + out, len - out,
289 "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit", 289 "%10s => Pid: %d Interval: %lu\n", "Commit",
290 (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), 290 (osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
291 osb->osb_commit_interval, 291 osb->osb_commit_interval);
292 atomic_read(&osb->needs_checkpoint));
293 292
294 out += snprintf(buf + out, len - out, 293 out += snprintf(buf + out, len - out,
295 "%10s => State: %d TxnId: %lu NumTxns: %d\n", 294 "%10s => State: %d TxnId: %lu NumTxns: %d\n",
@@ -2154,7 +2153,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2154 } 2153 }
2155 2154
2156 init_waitqueue_head(&osb->checkpoint_event); 2155 init_waitqueue_head(&osb->checkpoint_event);
2157 atomic_set(&osb->needs_checkpoint, 0);
2158 2156
2159 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 2157 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
2160 2158
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2e3ea308c144..317ef0abccbb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2751,7 +2751,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2751{ 2751{
2752 int ret; 2752 int ret;
2753 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2753 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2754 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
2755 struct ocfs2_xa_loc loc; 2754 struct ocfs2_xa_loc loc;
2756 2755
2757 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) 2756 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
@@ -2759,13 +2758,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2759 2758
2760 down_write(&oi->ip_alloc_sem); 2759 down_write(&oi->ip_alloc_sem);
2761 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { 2760 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
2762 if (!ocfs2_xattr_has_space_inline(inode, di)) {
2763 ret = -ENOSPC;
2764 goto out;
2765 }
2766 }
2767
2768 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
2769 ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt); 2761 ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
2770 if (ret) { 2762 if (ret) {
2771 if (ret != -ENOSPC) 2763 if (ret != -ENOSPC)
@@ -6499,6 +6491,16 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
6499 } 6491 }
6500 6492
6501 new_oi = OCFS2_I(args->new_inode); 6493 new_oi = OCFS2_I(args->new_inode);
6494 /*
6495 * Adjust extent record count to reserve space for extended attribute.
6496 * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
6497 */
6498 if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
6499 !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
6500 struct ocfs2_extent_list *el = &new_di->id2.i_list;
6501 le16_add_cpu(&el->l_count, -(inline_size /
6502 sizeof(struct ocfs2_extent_rec)));
6503 }
6502 spin_lock(&new_oi->ip_lock); 6504 spin_lock(&new_oi->ip_lock);
6503 new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL; 6505 new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
6504 new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); 6506 new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
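
The reflink hunk shrinks the destination inode's extent list capacity (l_count) by however many extent records the inline xattr area displaces, so the reserved space is not handed out twice. A quick arithmetic check with hypothetical sizes; real ocfs2 values depend on block size and struct layout:

#include <stdio.h>

int main(void)
{
	unsigned inline_size = 256;	/* hypothetical inline xattr area */
	unsigned extent_rec_size = 16;	/* hypothetical extent record size */
	unsigned l_count = 243;		/* hypothetical record capacity */

	l_count -= inline_size / extent_rec_size;
	printf("l_count after reserving xattr space: %u\n", l_count);	/* 227 */
	return 0;
}
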
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 0a22194e5d58..06ea155e1a59 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -408,7 +408,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
408 prpsinfo.pr_zomb = 0; 408 prpsinfo.pr_zomb = 0;
409 409
410 strcpy(prpsinfo.pr_fname, "vmlinux"); 410 strcpy(prpsinfo.pr_fname, "vmlinux");
411 strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ); 411 strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
412 412
413 nhdr->p_filesz += notesize(&notes[1]); 413 nhdr->p_filesz += notesize(&notes[1]);
414 bufp = storenote(&notes[1], bufp); 414 bufp = storenote(&notes[1], bufp);
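
The kcore.c change swaps strncpy() for strlcpy() when copying saved_command_line into the fixed-size pr_psargs field: strncpy() leaves the destination unterminated whenever the source fills the buffer, while strlcpy() always NUL-terminates. A userspace demonstration; strlcpy is open-coded here because not every libc ships it:

#include <stdio.h>
#include <string.h>

static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
	size_t len = strlen(src);

	if (size) {
		size_t n = len < size - 1 ? len : size - 1;
		memcpy(dst, src, n);
		dst[n] = '\0';	/* guaranteed termination */
	}
	return len;		/* length it tried to create */
}

int main(void)
{
	char a[8], b[8];
	const char *cmdline = "root=/dev/sda1 quiet splash";

	strncpy(a, cmdline, sizeof(a));		/* a is NOT NUL-terminated */
	my_strlcpy(b, cmdline, sizeof(b));	/* b always is */

	printf("strlcpy result: \"%s\"\n", b);	/* "root=/d" */
	/* printing 'a' with %s here would read past the buffer */
	return 0;
}
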
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d864d56..dbf61f6174f0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
11#include <linux/rmap.h> 11#include <linux/rmap.h>
12#include <linux/swap.h> 12#include <linux/swap.h>
13#include <linux/swapops.h> 13#include <linux/swapops.h>
14#include <linux/mmu_notifier.h>
14 15
15#include <asm/elf.h> 16#include <asm/elf.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
@@ -688,10 +689,58 @@ const struct file_operations proc_tid_smaps_operations = {
688 .release = seq_release_private, 689 .release = seq_release_private,
689}; 690};
690 691
692/*
693 * We do not want to have constant page-shift bits sitting in
694 * pagemap entries and are about to reuse them some time soon.
695 *
696 * Here's the "migration strategy":
697 * 1. when the system boots these bits remain what they are,
698 * but a warning about future change is printed in log;
699 * 2. once anyone clears soft-dirty bits via clear_refs file,
700 * these flag is set to denote, that user is aware of the
701 * new API and those page-shift bits change their meaning.
702 * The respective warning is printed in dmesg;
703 * 3. In a couple of releases we will remove all the mentions
704 * of page-shift in pagemap entries.
705 */
706
707static bool soft_dirty_cleared __read_mostly;
708
709enum clear_refs_types {
710 CLEAR_REFS_ALL = 1,
711 CLEAR_REFS_ANON,
712 CLEAR_REFS_MAPPED,
713 CLEAR_REFS_SOFT_DIRTY,
714 CLEAR_REFS_LAST,
715};
716
717struct clear_refs_private {
718 struct vm_area_struct *vma;
719 enum clear_refs_types type;
720};
721
722static inline void clear_soft_dirty(struct vm_area_struct *vma,
723 unsigned long addr, pte_t *pte)
724{
725#ifdef CONFIG_MEM_SOFT_DIRTY
726 /*
727 * The soft-dirty tracker uses #PF-s to catch writes
728 * to pages, so write-protect the pte as well. See the
729 * Documentation/vm/soft-dirty.txt for full description
730 * of how soft-dirty works.
731 */
732 pte_t ptent = *pte;
733 ptent = pte_wrprotect(ptent);
734 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
735 set_pte_at(vma->vm_mm, addr, pte, ptent);
736#endif
737}
738
691static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 739static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
692 unsigned long end, struct mm_walk *walk) 740 unsigned long end, struct mm_walk *walk)
693{ 741{
694 struct vm_area_struct *vma = walk->private; 742 struct clear_refs_private *cp = walk->private;
743 struct vm_area_struct *vma = cp->vma;
695 pte_t *pte, ptent; 744 pte_t *pte, ptent;
696 spinlock_t *ptl; 745 spinlock_t *ptl;
697 struct page *page; 746 struct page *page;
@@ -706,6 +755,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
706 if (!pte_present(ptent)) 755 if (!pte_present(ptent))
707 continue; 756 continue;
708 757
758 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
759 clear_soft_dirty(vma, addr, pte);
760 continue;
761 }
762
709 page = vm_normal_page(vma, addr, ptent); 763 page = vm_normal_page(vma, addr, ptent);
710 if (!page) 764 if (!page)
711 continue; 765 continue;
@@ -719,10 +773,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
719 return 0; 773 return 0;
720} 774}
721 775
722#define CLEAR_REFS_ALL 1
723#define CLEAR_REFS_ANON 2
724#define CLEAR_REFS_MAPPED 3
725
726static ssize_t clear_refs_write(struct file *file, const char __user *buf, 776static ssize_t clear_refs_write(struct file *file, const char __user *buf,
727 size_t count, loff_t *ppos) 777 size_t count, loff_t *ppos)
728{ 778{
@@ -730,7 +780,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
730 char buffer[PROC_NUMBUF]; 780 char buffer[PROC_NUMBUF];
731 struct mm_struct *mm; 781 struct mm_struct *mm;
732 struct vm_area_struct *vma; 782 struct vm_area_struct *vma;
733 int type; 783 enum clear_refs_types type;
784 int itype;
734 int rv; 785 int rv;
735 786
736 memset(buffer, 0, sizeof(buffer)); 787 memset(buffer, 0, sizeof(buffer));
@@ -738,23 +789,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
738 count = sizeof(buffer) - 1; 789 count = sizeof(buffer) - 1;
739 if (copy_from_user(buffer, buf, count)) 790 if (copy_from_user(buffer, buf, count))
740 return -EFAULT; 791 return -EFAULT;
741 rv = kstrtoint(strstrip(buffer), 10, &type); 792 rv = kstrtoint(strstrip(buffer), 10, &itype);
742 if (rv < 0) 793 if (rv < 0)
743 return rv; 794 return rv;
744 if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) 795 type = (enum clear_refs_types)itype;
796 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
745 return -EINVAL; 797 return -EINVAL;
798
799 if (type == CLEAR_REFS_SOFT_DIRTY) {
800 soft_dirty_cleared = true;
801 pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
802 "See the linux/Documentation/vm/pagemap.txt for details.\n");
803 }
804
746 task = get_proc_task(file_inode(file)); 805 task = get_proc_task(file_inode(file));
747 if (!task) 806 if (!task)
748 return -ESRCH; 807 return -ESRCH;
749 mm = get_task_mm(task); 808 mm = get_task_mm(task);
750 if (mm) { 809 if (mm) {
810 struct clear_refs_private cp = {
811 .type = type,
812 };
751 struct mm_walk clear_refs_walk = { 813 struct mm_walk clear_refs_walk = {
752 .pmd_entry = clear_refs_pte_range, 814 .pmd_entry = clear_refs_pte_range,
753 .mm = mm, 815 .mm = mm,
816 .private = &cp,
754 }; 817 };
755 down_read(&mm->mmap_sem); 818 down_read(&mm->mmap_sem);
819 if (type == CLEAR_REFS_SOFT_DIRTY)
820 mmu_notifier_invalidate_range_start(mm, 0, -1);
756 for (vma = mm->mmap; vma; vma = vma->vm_next) { 821 for (vma = mm->mmap; vma; vma = vma->vm_next) {
757 clear_refs_walk.private = vma; 822 cp.vma = vma;
758 if (is_vm_hugetlb_page(vma)) 823 if (is_vm_hugetlb_page(vma))
759 continue; 824 continue;
760 /* 825 /*
@@ -773,6 +838,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
773 walk_page_range(vma->vm_start, vma->vm_end, 838 walk_page_range(vma->vm_start, vma->vm_end,
774 &clear_refs_walk); 839 &clear_refs_walk);
775 } 840 }
841 if (type == CLEAR_REFS_SOFT_DIRTY)
842 mmu_notifier_invalidate_range_end(mm, 0, -1);
776 flush_tlb_mm(mm); 843 flush_tlb_mm(mm);
777 up_read(&mm->mmap_sem); 844 up_read(&mm->mmap_sem);
778 mmput(mm); 845 mmput(mm);
@@ -794,6 +861,7 @@ typedef struct {
794struct pagemapread { 861struct pagemapread {
795 int pos, len; 862 int pos, len;
796 pagemap_entry_t *buffer; 863 pagemap_entry_t *buffer;
864 bool v2;
797}; 865};
798 866
799#define PAGEMAP_WALK_SIZE (PMD_SIZE) 867#define PAGEMAP_WALK_SIZE (PMD_SIZE)
@@ -807,14 +875,17 @@ struct pagemapread {
807#define PM_PSHIFT_BITS 6 875#define PM_PSHIFT_BITS 6
808#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) 876#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
809#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) 877#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
810#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) 878#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
811#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) 879#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
812#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) 880#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
881/* in the "new" pagemap the pshift bits are occupied by more status bits */
882#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
813 883
884#define __PM_SOFT_DIRTY (1LL)
814#define PM_PRESENT PM_STATUS(4LL) 885#define PM_PRESENT PM_STATUS(4LL)
815#define PM_SWAP PM_STATUS(2LL) 886#define PM_SWAP PM_STATUS(2LL)
816#define PM_FILE PM_STATUS(1LL) 887#define PM_FILE PM_STATUS(1LL)
817#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) 888#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
818#define PM_END_OF_BUFFER 1 889#define PM_END_OF_BUFFER 1
819 890
820static inline pagemap_entry_t make_pme(u64 val) 891static inline pagemap_entry_t make_pme(u64 val)
@@ -837,7 +908,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
837 struct pagemapread *pm = walk->private; 908 struct pagemapread *pm = walk->private;
838 unsigned long addr; 909 unsigned long addr;
839 int err = 0; 910 int err = 0;
840 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); 911 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
841 912
842 for (addr = start; addr < end; addr += PAGE_SIZE) { 913 for (addr = start; addr < end; addr += PAGE_SIZE) {
843 err = add_to_pagemap(addr, &pme, pm); 914 err = add_to_pagemap(addr, &pme, pm);
@@ -847,11 +918,12 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
847 return err; 918 return err;
848} 919}
849 920
850static void pte_to_pagemap_entry(pagemap_entry_t *pme, 921static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
851 struct vm_area_struct *vma, unsigned long addr, pte_t pte) 922 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
852{ 923{
853 u64 frame, flags; 924 u64 frame, flags;
854 struct page *page = NULL; 925 struct page *page = NULL;
926 int flags2 = 0;
855 927
856 if (pte_present(pte)) { 928 if (pte_present(pte)) {
857 frame = pte_pfn(pte); 929 frame = pte_pfn(pte);
@@ -866,19 +938,21 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme,
866 if (is_migration_entry(entry)) 938 if (is_migration_entry(entry))
867 page = migration_entry_to_page(entry); 939 page = migration_entry_to_page(entry);
868 } else { 940 } else {
869 *pme = make_pme(PM_NOT_PRESENT); 941 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
870 return; 942 return;
871 } 943 }
872 944
873 if (page && !PageAnon(page)) 945 if (page && !PageAnon(page))
874 flags |= PM_FILE; 946 flags |= PM_FILE;
947 if (pte_soft_dirty(pte))
948 flags2 |= __PM_SOFT_DIRTY;
875 949
876 *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); 950 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
877} 951}
878 952
879#ifdef CONFIG_TRANSPARENT_HUGEPAGE 953#ifdef CONFIG_TRANSPARENT_HUGEPAGE
880static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, 954static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
881 pmd_t pmd, int offset) 955 pmd_t pmd, int offset, int pmd_flags2)
882{ 956{
883 /* 957 /*
884 * Currently pmd for thp is always present because thp can not be 958 * Currently pmd for thp is always present because thp can not be
@@ -887,13 +961,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
887 */ 961 */
888 if (pmd_present(pmd)) 962 if (pmd_present(pmd))
889 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) 963 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
890 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); 964 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
891 else 965 else
892 *pme = make_pme(PM_NOT_PRESENT); 966 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
893} 967}
894#else 968#else
895static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, 969static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
896 pmd_t pmd, int offset) 970 pmd_t pmd, int offset, int pmd_flags2)
897{ 971{
898} 972}
899#endif 973#endif
@@ -905,17 +979,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
905 struct pagemapread *pm = walk->private; 979 struct pagemapread *pm = walk->private;
906 pte_t *pte; 980 pte_t *pte;
907 int err = 0; 981 int err = 0;
908 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); 982 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
909 983
910 /* find the first VMA at or above 'addr' */ 984 /* find the first VMA at or above 'addr' */
911 vma = find_vma(walk->mm, addr); 985 vma = find_vma(walk->mm, addr);
912 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { 986 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
987 int pmd_flags2;
988
989 pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
913 for (; addr != end; addr += PAGE_SIZE) { 990 for (; addr != end; addr += PAGE_SIZE) {
914 unsigned long offset; 991 unsigned long offset;
915 992
916 offset = (addr & ~PAGEMAP_WALK_MASK) >> 993 offset = (addr & ~PAGEMAP_WALK_MASK) >>
917 PAGE_SHIFT; 994 PAGE_SHIFT;
918 thp_pmd_to_pagemap_entry(&pme, *pmd, offset); 995 thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
919 err = add_to_pagemap(addr, &pme, pm); 996 err = add_to_pagemap(addr, &pme, pm);
920 if (err) 997 if (err)
921 break; 998 break;
@@ -932,7 +1009,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
932 * and need a new, higher one */ 1009 * and need a new, higher one */
933 if (vma && (addr >= vma->vm_end)) { 1010 if (vma && (addr >= vma->vm_end)) {
934 vma = find_vma(walk->mm, addr); 1011 vma = find_vma(walk->mm, addr);
935 pme = make_pme(PM_NOT_PRESENT); 1012 pme = make_pme(PM_NOT_PRESENT(pm->v2));
936 } 1013 }
937 1014
938 /* check that 'vma' actually covers this address, 1015 /* check that 'vma' actually covers this address,
@@ -940,7 +1017,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
940 if (vma && (vma->vm_start <= addr) && 1017 if (vma && (vma->vm_start <= addr) &&
941 !is_vm_hugetlb_page(vma)) { 1018 !is_vm_hugetlb_page(vma)) {
942 pte = pte_offset_map(pmd, addr); 1019 pte = pte_offset_map(pmd, addr);
943 pte_to_pagemap_entry(&pme, vma, addr, *pte); 1020 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
944 /* unmap before userspace copy */ 1021 /* unmap before userspace copy */
945 pte_unmap(pte); 1022 pte_unmap(pte);
946 } 1023 }
@@ -955,14 +1032,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
955} 1032}
956 1033
957#ifdef CONFIG_HUGETLB_PAGE 1034#ifdef CONFIG_HUGETLB_PAGE
958static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, 1035static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
959 pte_t pte, int offset) 1036 pte_t pte, int offset)
960{ 1037{
961 if (pte_present(pte)) 1038 if (pte_present(pte))
962 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) 1039 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
963 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); 1040 | PM_STATUS2(pm->v2, 0) | PM_PRESENT);
964 else 1041 else
965 *pme = make_pme(PM_NOT_PRESENT); 1042 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
966} 1043}
967 1044
968/* This function walks within one hugetlb entry in the single call */ 1045/* This function walks within one hugetlb entry in the single call */
@@ -976,7 +1053,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
976 1053
977 for (; addr != end; addr += PAGE_SIZE) { 1054 for (; addr != end; addr += PAGE_SIZE) {
978 int offset = (addr & ~hmask) >> PAGE_SHIFT; 1055 int offset = (addr & ~hmask) >> PAGE_SHIFT;
979 huge_pte_to_pagemap_entry(&pme, *pte, offset); 1056 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
980 err = add_to_pagemap(addr, &pme, pm); 1057 err = add_to_pagemap(addr, &pme, pm);
981 if (err) 1058 if (err)
982 return err; 1059 return err;
@@ -1038,6 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
1038 if (!count) 1115 if (!count)
1039 goto out_task; 1116 goto out_task;
1040 1117
1118 pm.v2 = soft_dirty_cleared;
1041 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 1119 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1042 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); 1120 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
1043 ret = -ENOMEM; 1121 ret = -ENOMEM;
@@ -1110,9 +1188,18 @@ out:
1110 return ret; 1188 return ret;
1111} 1189}
1112 1190
1191static int pagemap_open(struct inode *inode, struct file *file)
1192{
1193 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
1194 "to stop being page-shift some time soon. See the "
1195 "linux/Documentation/vm/pagemap.txt for details.\n");
1196 return 0;
1197}
1198
1113const struct file_operations proc_pagemap_operations = { 1199const struct file_operations proc_pagemap_operations = {
1114 .llseek = mem_lseek, /* borrow this */ 1200 .llseek = mem_lseek, /* borrow this */
1115 .read = pagemap_read, 1201 .read = pagemap_read,
1202 .open = pagemap_open,
1116}; 1203};
1117#endif /* CONFIG_PROC_PAGE_MONITOR */ 1204#endif /* CONFIG_PROC_PAGE_MONITOR */
1118 1205
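
Taken together, the task_mmu.c changes introduce the soft-dirty interface: writing 4 (CLEAR_REFS_SOFT_DIRTY) to /proc/PID/clear_refs write-protects every pte and clears its soft-dirty flag, and subsequent writes set bit 55 in the corresponding /proc/PID/pagemap entries. A userspace sketch of the round trip, assuming a kernel built with CONFIG_MEM_SOFT_DIRTY and these patches:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>

static uint64_t pagemap_entry(const void *addr)
{
	uint64_t ent = 0;
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd >= 0) {
		/* one 8-byte entry per virtual page */
		off_t off = ((uintptr_t)addr / sysconf(_SC_PAGESIZE)) * 8;
		pread(fd, &ent, sizeof(ent), off);
		close(fd);
	}
	return ent;
}

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *page = aligned_alloc(psz, psz);
	int fd;

	page[0] = 1;			/* fault the page in */

	fd = open("/proc/self/clear_refs", O_WRONLY);
	if (fd < 0)
		return 1;
	write(fd, "4", 1);		/* CLEAR_REFS_SOFT_DIRTY */
	close(fd);

	printf("soft-dirty after clear: %d\n",
	       (int)((pagemap_entry(page) >> 55) & 1));	/* expect 0 */
	page[0] = 2;			/* write fault sets the bit again */
	printf("soft-dirty after write: %d\n",
	       (int)((pagemap_entry(page) >> 55) & 1));	/* expect 1 */
	return 0;
}
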
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 9610ac772d7e..061894625903 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -20,8 +20,7 @@ static int uptime_proc_show(struct seq_file *m, void *v)
20 for_each_possible_cpu(i) 20 for_each_possible_cpu(i)
21 idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; 21 idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
22 22
23 do_posix_clock_monotonic_gettime(&uptime); 23 get_monotonic_boottime(&uptime);
24 monotonic_to_bootbased(&uptime);
25 nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; 24 nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
26 idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); 25 idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
27 idle.tv_nsec = rem; 26 idle.tv_nsec = rem;
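
The uptime fix folds the two-step monotonic-plus-bootbased conversion into a single get_monotonic_boottime() call, so /proc/uptime keeps counting across suspend. The userspace analogue of the distinction is CLOCK_BOOTTIME versus CLOCK_MONOTONIC:

#define _GNU_SOURCE
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec mono, boot;

	clock_gettime(CLOCK_MONOTONIC, &mono);	/* stops across suspend */
	clock_gettime(CLOCK_BOOTTIME, &boot);	/* keeps counting, like
						 * get_monotonic_boottime() */
	printf("monotonic: %ld.%09ld\n", (long)mono.tv_sec, mono.tv_nsec);
	printf("boottime:  %ld.%09ld\n", (long)boot.tv_sec, boot.tv_nsec);
	return 0;
}
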
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 17f7e080d7ff..28503172f2e4 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -20,6 +20,7 @@
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/crash_dump.h> 21#include <linux/crash_dump.h>
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/vmalloc.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24#include <asm/io.h> 25#include <asm/io.h>
25#include "internal.h" 26#include "internal.h"
@@ -32,6 +33,10 @@ static LIST_HEAD(vmcore_list);
32/* Stores the pointer to the buffer containing kernel elf core headers. */ 33/* Stores the pointer to the buffer containing kernel elf core headers. */
33static char *elfcorebuf; 34static char *elfcorebuf;
34static size_t elfcorebuf_sz; 35static size_t elfcorebuf_sz;
36static size_t elfcorebuf_sz_orig;
37
38static char *elfnotes_buf;
39static size_t elfnotes_sz;
35 40
36/* Total size of vmcore file. */ 41/* Total size of vmcore file. */
37static u64 vmcore_size; 42static u64 vmcore_size;
@@ -118,27 +123,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
118 return read; 123 return read;
119} 124}
120 125
121/* Maps vmcore file offset to respective physical address in memory. */
122static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
123 struct vmcore **m_ptr)
124{
125 struct vmcore *m;
126 u64 paddr;
127
128 list_for_each_entry(m, vc_list, list) {
129 u64 start, end;
130 start = m->offset;
131 end = m->offset + m->size - 1;
132 if (offset >= start && offset <= end) {
133 paddr = m->paddr + offset - start;
134 *m_ptr = m;
135 return paddr;
136 }
137 }
138 *m_ptr = NULL;
139 return 0;
140}
141
142/* Read from the ELF header and then the crash dump. On error, negative value is 126/* Read from the ELF header and then the crash dump. On error, negative value is
143 * returned otherwise number of bytes read are returned. 127 * returned otherwise number of bytes read are returned.
144 */ 128 */
@@ -147,8 +131,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
147{ 131{
148 ssize_t acc = 0, tmp; 132 ssize_t acc = 0, tmp;
149 size_t tsz; 133 size_t tsz;
150 u64 start, nr_bytes; 134 u64 start;
151 struct vmcore *curr_m = NULL; 135 struct vmcore *m = NULL;
152 136
153 if (buflen == 0 || *fpos >= vmcore_size) 137 if (buflen == 0 || *fpos >= vmcore_size)
154 return 0; 138 return 0;
@@ -159,9 +143,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
159 143
160 /* Read ELF core header */ 144 /* Read ELF core header */
161 if (*fpos < elfcorebuf_sz) { 145 if (*fpos < elfcorebuf_sz) {
162 tsz = elfcorebuf_sz - *fpos; 146 tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
163 if (buflen < tsz)
164 tsz = buflen;
165 if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) 147 if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
166 return -EFAULT; 148 return -EFAULT;
167 buflen -= tsz; 149 buflen -= tsz;
@@ -174,39 +156,161 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
174 return acc; 156 return acc;
175 } 157 }
176 158
177 start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); 159 /* Read Elf note segment */
178 if (!curr_m) 160 if (*fpos < elfcorebuf_sz + elfnotes_sz) {
179 return -EINVAL; 161 void *kaddr;
180
181 while (buflen) {
182 tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK));
183 162
184 /* Calculate left bytes in current memory segment. */ 163 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
185 nr_bytes = (curr_m->size - (start - curr_m->paddr)); 164 kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
186 if (tsz > nr_bytes) 165 if (copy_to_user(buffer, kaddr, tsz))
187 tsz = nr_bytes; 166 return -EFAULT;
188
189 tmp = read_from_oldmem(buffer, tsz, &start, 1);
190 if (tmp < 0)
191 return tmp;
192 buflen -= tsz; 167 buflen -= tsz;
193 *fpos += tsz; 168 *fpos += tsz;
194 buffer += tsz; 169 buffer += tsz;
195 acc += tsz; 170 acc += tsz;
196 if (start >= (curr_m->paddr + curr_m->size)) { 171
197 if (curr_m->list.next == &vmcore_list) 172 /* leave now if filled buffer already */
198 return acc; /*EOF*/ 173 if (buflen == 0)
199 curr_m = list_entry(curr_m->list.next, 174 return acc;
200 struct vmcore, list); 175 }
201 start = curr_m->paddr; 176
177 list_for_each_entry(m, &vmcore_list, list) {
178 if (*fpos < m->offset + m->size) {
179 tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
180 start = m->paddr + *fpos - m->offset;
181 tmp = read_from_oldmem(buffer, tsz, &start, 1);
182 if (tmp < 0)
183 return tmp;
184 buflen -= tsz;
185 *fpos += tsz;
186 buffer += tsz;
187 acc += tsz;
188
189 /* leave now if filled buffer already */
190 if (buflen == 0)
191 return acc;
202 } 192 }
203 } 193 }
194
204 return acc; 195 return acc;
205} 196}
206 197
198/**
199 * alloc_elfnotes_buf - allocate buffer for ELF note segment in
200 * vmalloc memory
201 *
202 * @notes_sz: size of buffer
203 *
204 * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
205 * the buffer to user-space by means of remap_vmalloc_range().
206 *
207 * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
208 * disabled and there's no need to allow users to mmap the buffer.
209 */
210static inline char *alloc_elfnotes_buf(size_t notes_sz)
211{
212#ifdef CONFIG_MMU
213 return vmalloc_user(notes_sz);
214#else
215 return vzalloc(notes_sz);
216#endif
217}
218
219/*
220 * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
221 * essential for mmap_vmcore() in order to map physically
222 * non-contiguous objects (ELF header, ELF note segment and memory
223 * regions in the 1st kernel pointed to by PT_LOAD entries) into
224 * virtually contiguous user-space in ELF layout.
225 */
226#ifdef CONFIG_MMU
227static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
228{
229 size_t size = vma->vm_end - vma->vm_start;
230 u64 start, end, len, tsz;
231 struct vmcore *m;
232
233 start = (u64)vma->vm_pgoff << PAGE_SHIFT;
234 end = start + size;
235
236 if (size > vmcore_size || end > vmcore_size)
237 return -EINVAL;
238
239 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
240 return -EPERM;
241
242 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
243 vma->vm_flags |= VM_MIXEDMAP;
244
245 len = 0;
246
247 if (start < elfcorebuf_sz) {
248 u64 pfn;
249
250 tsz = min(elfcorebuf_sz - (size_t)start, size);
251 pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT;
252 if (remap_pfn_range(vma, vma->vm_start, pfn, tsz,
253 vma->vm_page_prot))
254 return -EAGAIN;
255 size -= tsz;
256 start += tsz;
257 len += tsz;
258
259 if (size == 0)
260 return 0;
261 }
262
263 if (start < elfcorebuf_sz + elfnotes_sz) {
264 void *kaddr;
265
266 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
267 kaddr = elfnotes_buf + start - elfcorebuf_sz;
268 if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
269 kaddr, tsz))
270 goto fail;
271 size -= tsz;
272 start += tsz;
273 len += tsz;
274
275 if (size == 0)
276 return 0;
277 }
278
279 list_for_each_entry(m, &vmcore_list, list) {
280 if (start < m->offset + m->size) {
281 u64 paddr = 0;
282
283 tsz = min_t(size_t, m->offset + m->size - start, size);
284 paddr = m->paddr + start - m->offset;
285 if (remap_pfn_range(vma, vma->vm_start + len,
286 paddr >> PAGE_SHIFT, tsz,
287 vma->vm_page_prot))
288 goto fail;
289 size -= tsz;
290 start += tsz;
291 len += tsz;
292
293 if (size == 0)
294 return 0;
295 }
296 }
297
298 return 0;
299fail:
300 do_munmap(vma->vm_mm, vma->vm_start, len);
301 return -EAGAIN;
302}
303#else
304static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
305{
306 return -ENOSYS;
307}
308#endif
309
207static const struct file_operations proc_vmcore_operations = { 310static const struct file_operations proc_vmcore_operations = {
208 .read = read_vmcore, 311 .read = read_vmcore,
209 .llseek = default_llseek, 312 .llseek = default_llseek,
313 .mmap = mmap_vmcore,
210}; 314};
211 315
212static struct vmcore* __init get_new_element(void) 316static struct vmcore* __init get_new_element(void)
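
read_vmcore is restructured around the new layout (ELF headers, then the merged note buffer, then PT_LOAD data), and the new mmap_vmcore stitches those three physically disjoint pieces into one contiguous user mapping: remap_pfn_range for the page-aligned header buffer and old memory, remap_vmalloc_range_partial for the vmalloc'ed note buffer. A userspace sketch of what the mmap handler enables in a kdump capture kernel (outside one, the open simply fails):

#include <stdio.h>
#include <elf.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	int fd = open("/proc/vmcore", O_RDONLY);

	if (fd < 0) {
		perror("open /proc/vmcore");	/* normal outside kdump */
		return 1;
	}

	/* the ELF header lives at offset 0 and is now page-aligned */
	void *hdr = mmap(NULL, psz, PROT_READ, MAP_PRIVATE, fd, 0);
	if (hdr == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}

	Elf64_Ehdr *eh = hdr;
	printf("phnum=%u phoff=%llu\n", eh->e_phnum,
	       (unsigned long long)eh->e_phoff);
	munmap(hdr, psz);
	close(fd);
	return 0;
}
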
@@ -214,61 +318,40 @@ static struct vmcore* __init get_new_element(void)
214 return kzalloc(sizeof(struct vmcore), GFP_KERNEL); 318 return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
215} 319}
216 320
217static u64 __init get_vmcore_size_elf64(char *elfptr) 321static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
322 struct list_head *vc_list)
218{ 323{
219 int i;
220 u64 size;
221 Elf64_Ehdr *ehdr_ptr;
222 Elf64_Phdr *phdr_ptr;
223
224 ehdr_ptr = (Elf64_Ehdr *)elfptr;
225 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
226 size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr));
227 for (i = 0; i < ehdr_ptr->e_phnum; i++) {
228 size += phdr_ptr->p_memsz;
229 phdr_ptr++;
230 }
231 return size;
232}
233
234static u64 __init get_vmcore_size_elf32(char *elfptr)
235{
236 int i;
237 u64 size; 324 u64 size;
238 Elf32_Ehdr *ehdr_ptr; 325 struct vmcore *m;
239 Elf32_Phdr *phdr_ptr;
240 326
241 ehdr_ptr = (Elf32_Ehdr *)elfptr; 327 size = elfsz + elfnotesegsz;
242 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); 328 list_for_each_entry(m, vc_list, list) {
243 size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); 329 size += m->size;
244 for (i = 0; i < ehdr_ptr->e_phnum; i++) {
245 size += phdr_ptr->p_memsz;
246 phdr_ptr++;
247 } 330 }
248 return size; 331 return size;
249} 332}
250 333
251/* Merges all the PT_NOTE headers into one. */ 334/**
252static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, 335 * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry
253 struct list_head *vc_list) 336 *
337 * @ehdr_ptr: ELF header
338 *
339 * This function updates the p_memsz member of each PT_NOTE entry in the
340 * program header table pointed to by @ehdr_ptr to the real size of the
341 * ELF note segment.
342 */
343static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
254{ 344{
255 int i, nr_ptnote=0, rc=0; 345 int i, rc=0;
256 char *tmp; 346 Elf64_Phdr *phdr_ptr;
257 Elf64_Ehdr *ehdr_ptr;
258 Elf64_Phdr phdr, *phdr_ptr;
259 Elf64_Nhdr *nhdr_ptr; 347 Elf64_Nhdr *nhdr_ptr;
260 u64 phdr_sz = 0, note_off;
261 348
262 ehdr_ptr = (Elf64_Ehdr *)elfptr; 349 phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
263 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
264 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 350 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
265 int j;
266 void *notes_section; 351 void *notes_section;
267 struct vmcore *new;
268 u64 offset, max_sz, sz, real_sz = 0; 352 u64 offset, max_sz, sz, real_sz = 0;
269 if (phdr_ptr->p_type != PT_NOTE) 353 if (phdr_ptr->p_type != PT_NOTE)
270 continue; 354 continue;
271 nr_ptnote++;
272 max_sz = phdr_ptr->p_memsz; 355 max_sz = phdr_ptr->p_memsz;
273 offset = phdr_ptr->p_offset; 356 offset = phdr_ptr->p_offset;
274 notes_section = kmalloc(max_sz, GFP_KERNEL); 357 notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -280,7 +363,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
280 return rc; 363 return rc;
281 } 364 }
282 nhdr_ptr = notes_section; 365 nhdr_ptr = notes_section;
283 for (j = 0; j < max_sz; j += sz) { 366 while (real_sz < max_sz) {
284 if (nhdr_ptr->n_namesz == 0) 367 if (nhdr_ptr->n_namesz == 0)
285 break; 368 break;
286 sz = sizeof(Elf64_Nhdr) + 369 sz = sizeof(Elf64_Nhdr) +
@@ -289,26 +372,122 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
289 real_sz += sz; 372 real_sz += sz;
290 nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); 373 nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
291 } 374 }
292
293 /* Add this contiguous chunk of notes section to vmcore list.*/
294 new = get_new_element();
295 if (!new) {
296 kfree(notes_section);
297 return -ENOMEM;
298 }
299 new->paddr = phdr_ptr->p_offset;
300 new->size = real_sz;
301 list_add_tail(&new->list, vc_list);
302 phdr_sz += real_sz;
303 kfree(notes_section); 375 kfree(notes_section);
376 phdr_ptr->p_memsz = real_sz;
377 }
378
379 return 0;
380}
381
382/**
383 * get_note_number_and_size_elf64 - get the number of PT_NOTE program
384 * headers and sum of real size of their ELF note segment headers and
385 * data.
386 *
387 * @ehdr_ptr: ELF header
388 * @nr_ptnote: buffer for the number of PT_NOTE program headers
389 * @sz_ptnote: buffer for size of unique PT_NOTE program header
390 *
391 * This function is used to merge multiple PT_NOTE program headers
392 * into a single unique one. The resulting unique entry will have
393 * @sz_ptnote in its phdr->p_memsz.
394 *
395 * It is assumed that the PT_NOTE program headers pointed to by
396 * @ehdr_ptr have already been updated by update_note_header_size_elf64
397 * and that each PT_NOTE program header has the actual ELF note segment
398 * size in its p_memsz member.
399 */
400static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr,
401 int *nr_ptnote, u64 *sz_ptnote)
402{
403 int i;
404 Elf64_Phdr *phdr_ptr;
405
406 *nr_ptnote = *sz_ptnote = 0;
407
408 phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
409 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
410 if (phdr_ptr->p_type != PT_NOTE)
411 continue;
412 *nr_ptnote += 1;
413 *sz_ptnote += phdr_ptr->p_memsz;
414 }
415
416 return 0;
417}
418
419/**
420 * copy_notes_elf64 - copy ELF note segments into a given buffer
421 *
422 * @ehdr_ptr: ELF header
423 * @notes_buf: buffer into which ELF note segments are copied
424 *
425 * This function is used to copy the ELF note segment from the 1st kernel
426 * into the buffer @notes_buf in the 2nd kernel. It is assumed that the
427 * size of the buffer @notes_buf is equal to or larger than the sum of the
428 * real ELF note segment headers and data.
429 *
430 * It is assumed that the PT_NOTE program headers pointed to by
431 * @ehdr_ptr have already been updated by update_note_header_size_elf64
432 * and that each PT_NOTE program header has the actual ELF note segment
433 * size in its p_memsz member.
434 */
435static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
436{
437 int i, rc=0;
438 Elf64_Phdr *phdr_ptr;
439
440 phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1);
441
442 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
443 u64 offset;
444 if (phdr_ptr->p_type != PT_NOTE)
445 continue;
446 offset = phdr_ptr->p_offset;
447 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
448 if (rc < 0)
449 return rc;
450 notes_buf += phdr_ptr->p_memsz;
304 } 451 }
305 452
453 return 0;
454}
455
456/* Merges all the PT_NOTE headers into one. */
457static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
458 char **notes_buf, size_t *notes_sz)
459{
460 int i, nr_ptnote=0, rc=0;
461 char *tmp;
462 Elf64_Ehdr *ehdr_ptr;
463 Elf64_Phdr phdr;
464 u64 phdr_sz = 0, note_off;
465
466 ehdr_ptr = (Elf64_Ehdr *)elfptr;
467
468 rc = update_note_header_size_elf64(ehdr_ptr);
469 if (rc < 0)
470 return rc;
471
472 rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz);
473 if (rc < 0)
474 return rc;
475
476 *notes_sz = roundup(phdr_sz, PAGE_SIZE);
477 *notes_buf = alloc_elfnotes_buf(*notes_sz);
478 if (!*notes_buf)
479 return -ENOMEM;
480
481 rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
482 if (rc < 0)
483 return rc;
484
306 /* Prepare merged PT_NOTE program header. */ 485 /* Prepare merged PT_NOTE program header. */
307 phdr.p_type = PT_NOTE; 486 phdr.p_type = PT_NOTE;
308 phdr.p_flags = 0; 487 phdr.p_flags = 0;
309 note_off = sizeof(Elf64_Ehdr) + 488 note_off = sizeof(Elf64_Ehdr) +
310 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); 489 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
311 phdr.p_offset = note_off; 490 phdr.p_offset = roundup(note_off, PAGE_SIZE);
312 phdr.p_vaddr = phdr.p_paddr = 0; 491 phdr.p_vaddr = phdr.p_paddr = 0;
313 phdr.p_filesz = phdr.p_memsz = phdr_sz; 492 phdr.p_filesz = phdr.p_memsz = phdr_sz;
314 phdr.p_align = 0; 493 phdr.p_align = 0;
@@ -322,6 +501,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
322 i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); 501 i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
323 *elfsz = *elfsz - i; 502 *elfsz = *elfsz - i;
324 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); 503 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
504 memset(elfptr + *elfsz, 0, i);
505 *elfsz = roundup(*elfsz, PAGE_SIZE);
325 506
326 /* Modify e_phnum to reflect merged headers. */ 507 /* Modify e_phnum to reflect merged headers. */
327 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; 508 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
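
After merging, the layout relies on both the header block and the note segment being rounded up to page boundaries (the memset zeroes the slack left by the removed program headers), which is what makes the remap_pfn_range/remap_vmalloc_range split in mmap_vmcore possible. A quick check of the arithmetic using the kernel's roundup() definition and made-up sizes:

#include <stdio.h>

#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

int main(void)
{
	unsigned long page = 4096;
	unsigned long ehdr_plus_phdrs = 64 + 2 * 56;	/* Elf64: ehdr + 2 phdrs */
	unsigned long notes = 5000;			/* merged note segment */

	unsigned long elfcorebuf_sz = roundup(ehdr_plus_phdrs, page);	/* 4096 */
	unsigned long elfnotes_sz = roundup(notes, page);		/* 8192 */

	printf("headers at [0, %lu), notes at [%lu, %lu), memory after\n",
	       elfcorebuf_sz, elfcorebuf_sz, elfcorebuf_sz + elfnotes_sz);
	return 0;
}
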
@@ -329,27 +510,27 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
329 return 0; 510 return 0;
330} 511}
331 512
332/* Merges all the PT_NOTE headers into one. */ 513/**
333static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, 514 * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry
334 struct list_head *vc_list) 515 *
516 * @ehdr_ptr: ELF header
517 *
518 * This function updates the p_memsz member of each PT_NOTE entry in the
519 * program header table pointed to by @ehdr_ptr to the real size of the
520 * ELF note segment.
521 */
522static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
335{ 523{
336 int i, nr_ptnote=0, rc=0; 524 int i, rc=0;
337 char *tmp; 525 Elf32_Phdr *phdr_ptr;
338 Elf32_Ehdr *ehdr_ptr;
339 Elf32_Phdr phdr, *phdr_ptr;
340 Elf32_Nhdr *nhdr_ptr; 526 Elf32_Nhdr *nhdr_ptr;
341 u64 phdr_sz = 0, note_off;
342 527
343 ehdr_ptr = (Elf32_Ehdr *)elfptr; 528 phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
344 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
345 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 529 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
346 int j;
347 void *notes_section; 530 void *notes_section;
348 struct vmcore *new;
349 u64 offset, max_sz, sz, real_sz = 0; 531 u64 offset, max_sz, sz, real_sz = 0;
350 if (phdr_ptr->p_type != PT_NOTE) 532 if (phdr_ptr->p_type != PT_NOTE)
351 continue; 533 continue;
352 nr_ptnote++;
353 max_sz = phdr_ptr->p_memsz; 534 max_sz = phdr_ptr->p_memsz;
354 offset = phdr_ptr->p_offset; 535 offset = phdr_ptr->p_offset;
355 notes_section = kmalloc(max_sz, GFP_KERNEL); 536 notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -361,7 +542,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
361 return rc; 542 return rc;
362 } 543 }
363 nhdr_ptr = notes_section; 544 nhdr_ptr = notes_section;
364 for (j = 0; j < max_sz; j += sz) { 545 while (real_sz < max_sz) {
365 if (nhdr_ptr->n_namesz == 0) 546 if (nhdr_ptr->n_namesz == 0)
366 break; 547 break;
367 sz = sizeof(Elf32_Nhdr) + 548 sz = sizeof(Elf32_Nhdr) +
@@ -370,26 +551,122 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
370 real_sz += sz; 551 real_sz += sz;
371 nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); 552 nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
372 } 553 }
373
374 /* Add this contiguous chunk of notes section to vmcore list.*/
375 new = get_new_element();
376 if (!new) {
377 kfree(notes_section);
378 return -ENOMEM;
379 }
380 new->paddr = phdr_ptr->p_offset;
381 new->size = real_sz;
382 list_add_tail(&new->list, vc_list);
383 phdr_sz += real_sz;
384 kfree(notes_section); 554 kfree(notes_section);
555 phdr_ptr->p_memsz = real_sz;
556 }
557
558 return 0;
559}
560
561/**
562 * get_note_number_and_size_elf32 - get the number of PT_NOTE program
563 * headers and sum of real size of their ELF note segment headers and
564 * data.
565 *
566 * @ehdr_ptr: ELF header
567 * @nr_ptnote: buffer for the number of PT_NOTE program headers
568 * @sz_ptnote: buffer for size of unique PT_NOTE program header
569 *
570 * This function is used to merge multiple PT_NOTE program headers
571 * into a single unique one. The resulting unique entry will have
572 * @sz_ptnote in its phdr->p_memsz.
573 *
574 * It is assumed that the PT_NOTE program headers pointed to by
575 * @ehdr_ptr have already been updated by update_note_header_size_elf32
576 * and that each PT_NOTE program header has the actual ELF note segment
577 * size in its p_memsz member.
578 */
579static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr,
580 int *nr_ptnote, u64 *sz_ptnote)
581{
582 int i;
583 Elf32_Phdr *phdr_ptr;
584
585 *nr_ptnote = *sz_ptnote = 0;
586
587 phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
588 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
589 if (phdr_ptr->p_type != PT_NOTE)
590 continue;
591 *nr_ptnote += 1;
592 *sz_ptnote += phdr_ptr->p_memsz;
593 }
594
595 return 0;
596}
597
598/**
599 * copy_notes_elf32 - copy ELF note segments into a given buffer
600 *
601 * @ehdr_ptr: ELF header
602 * @notes_buf: buffer into which ELF note segments are copied
603 *
604 * This function is used to copy the ELF note segment from the 1st kernel
605 * into the buffer @notes_buf in the 2nd kernel. It is assumed that the
606 * size of the buffer @notes_buf is equal to or larger than the sum of the
607 * real ELF note segment headers and data.
608 *
609 * It is assumed that the PT_NOTE program headers pointed to by
610 * @ehdr_ptr have already been updated by update_note_header_size_elf32
611 * and that each PT_NOTE program header has the actual ELF note segment
612 * size in its p_memsz member.
613 */
614static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
615{
616 int i, rc=0;
617 Elf32_Phdr *phdr_ptr;
618
619 phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1);
620
621 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
622 u64 offset;
623 if (phdr_ptr->p_type != PT_NOTE)
624 continue;
625 offset = phdr_ptr->p_offset;
626 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
627 if (rc < 0)
628 return rc;
629 notes_buf += phdr_ptr->p_memsz;
385 } 630 }
386 631
632 return 0;
633}
634
635/* Merges all the PT_NOTE headers into one. */
636static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
637 char **notes_buf, size_t *notes_sz)
638{
639 int i, nr_ptnote=0, rc=0;
640 char *tmp;
641 Elf32_Ehdr *ehdr_ptr;
642 Elf32_Phdr phdr;
643 u64 phdr_sz = 0, note_off;
644
645 ehdr_ptr = (Elf32_Ehdr *)elfptr;
646
647 rc = update_note_header_size_elf32(ehdr_ptr);
648 if (rc < 0)
649 return rc;
650
651 rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz);
652 if (rc < 0)
653 return rc;
654
655 *notes_sz = roundup(phdr_sz, PAGE_SIZE);
656 *notes_buf = alloc_elfnotes_buf(*notes_sz);
657 if (!*notes_buf)
658 return -ENOMEM;
659
660 rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
661 if (rc < 0)
662 return rc;
663
387 /* Prepare merged PT_NOTE program header. */ 664 /* Prepare merged PT_NOTE program header. */
388 phdr.p_type = PT_NOTE; 665 phdr.p_type = PT_NOTE;
389 phdr.p_flags = 0; 666 phdr.p_flags = 0;
390 note_off = sizeof(Elf32_Ehdr) + 667 note_off = sizeof(Elf32_Ehdr) +
391 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); 668 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr);
392 phdr.p_offset = note_off; 669 phdr.p_offset = roundup(note_off, PAGE_SIZE);
393 phdr.p_vaddr = phdr.p_paddr = 0; 670 phdr.p_vaddr = phdr.p_paddr = 0;
394 phdr.p_filesz = phdr.p_memsz = phdr_sz; 671 phdr.p_filesz = phdr.p_memsz = phdr_sz;
395 phdr.p_align = 0; 672 phdr.p_align = 0;
@@ -403,6 +680,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
403 i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); 680 i = (nr_ptnote - 1) * sizeof(Elf32_Phdr);
404 *elfsz = *elfsz - i; 681 *elfsz = *elfsz - i;
405 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); 682 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr)));
683 memset(elfptr + *elfsz, 0, i);
684 *elfsz = roundup(*elfsz, PAGE_SIZE);
406 685
407 /* Modify e_phnum to reflect merged headers. */ 686 /* Modify e_phnum to reflect merged headers. */
408 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; 687 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -414,6 +693,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
414 * the new offset fields of exported program headers. */ 693 * the new offset fields of exported program headers. */
415static int __init process_ptload_program_headers_elf64(char *elfptr, 694static int __init process_ptload_program_headers_elf64(char *elfptr,
416 size_t elfsz, 695 size_t elfsz,
696 size_t elfnotes_sz,
417 struct list_head *vc_list) 697 struct list_head *vc_list)
418{ 698{
419 int i; 699 int i;
@@ -425,32 +705,38 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
425 ehdr_ptr = (Elf64_Ehdr *)elfptr; 705 ehdr_ptr = (Elf64_Ehdr *)elfptr;
426 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ 706 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
427 707
428 /* First program header is PT_NOTE header. */ 708 /* Skip Elf header, program headers and Elf note segment. */
429 vmcore_off = sizeof(Elf64_Ehdr) + 709 vmcore_off = elfsz + elfnotes_sz;
430 (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) +
431 phdr_ptr->p_memsz; /* Note sections */
432 710
433 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 711 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
712 u64 paddr, start, end, size;
713
434 if (phdr_ptr->p_type != PT_LOAD) 714 if (phdr_ptr->p_type != PT_LOAD)
435 continue; 715 continue;
436 716
717 paddr = phdr_ptr->p_offset;
718 start = rounddown(paddr, PAGE_SIZE);
719 end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
720 size = end - start;
721
437 /* Add this contiguous chunk of memory to vmcore list.*/ 722 /* Add this contiguous chunk of memory to vmcore list.*/
438 new = get_new_element(); 723 new = get_new_element();
439 if (!new) 724 if (!new)
440 return -ENOMEM; 725 return -ENOMEM;
441 new->paddr = phdr_ptr->p_offset; 726 new->paddr = start;
442 new->size = phdr_ptr->p_memsz; 727 new->size = size;
443 list_add_tail(&new->list, vc_list); 728 list_add_tail(&new->list, vc_list);
444 729
445 /* Update the program header offset. */ 730 /* Update the program header offset. */
446 phdr_ptr->p_offset = vmcore_off; 731 phdr_ptr->p_offset = vmcore_off + (paddr - start);
447 vmcore_off = vmcore_off + phdr_ptr->p_memsz; 732 vmcore_off = vmcore_off + size;
448 } 733 }
449 return 0; 734 return 0;
450} 735}
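
The rounddown/roundup pair keeps each exported chunk page-aligned while
preserving the segment's offset within its first page. A worked example with
hypothetical values paddr = 0x100078, p_memsz = 0x2000 and PAGE_SIZE = 0x1000:

    start    = rounddown(0x100078, 0x1000)        = 0x100000
    end      = roundup(0x100078 + 0x2000, 0x1000) = 0x103000
    size     = end - start                        = 0x3000
    p_offset = vmcore_off + (0x100078 - 0x100000) = vmcore_off + 0x78

Since vmcore_off only ever advances by page-multiple sizes from a page-aligned
start, p_offset keeps the same sub-page offset as the original physical
address.
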
451 736
452static int __init process_ptload_program_headers_elf32(char *elfptr, 737static int __init process_ptload_program_headers_elf32(char *elfptr,
453 size_t elfsz, 738 size_t elfsz,
739 size_t elfnotes_sz,
454 struct list_head *vc_list) 740 struct list_head *vc_list)
455{ 741{
456 int i; 742 int i;
@@ -462,43 +748,44 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
462 ehdr_ptr = (Elf32_Ehdr *)elfptr; 748 ehdr_ptr = (Elf32_Ehdr *)elfptr;
463 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ 749 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
464 750
465 /* First program header is PT_NOTE header. */ 751 /* Skip Elf header, program headers and Elf note segment. */
466 vmcore_off = sizeof(Elf32_Ehdr) + 752 vmcore_off = elfsz + elfnotes_sz;
467 (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) +
468 phdr_ptr->p_memsz; /* Note sections */
469 753
470 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 754 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
755 u64 paddr, start, end, size;
756
471 if (phdr_ptr->p_type != PT_LOAD) 757 if (phdr_ptr->p_type != PT_LOAD)
472 continue; 758 continue;
473 759
760 paddr = phdr_ptr->p_offset;
761 start = rounddown(paddr, PAGE_SIZE);
762 end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
763 size = end - start;
764
474 /* Add this contiguous chunk of memory to vmcore list.*/ 765 /* Add this contiguous chunk of memory to vmcore list.*/
475 new = get_new_element(); 766 new = get_new_element();
476 if (!new) 767 if (!new)
477 return -ENOMEM; 768 return -ENOMEM;
478 new->paddr = phdr_ptr->p_offset; 769 new->paddr = start;
479 new->size = phdr_ptr->p_memsz; 770 new->size = size;
480 list_add_tail(&new->list, vc_list); 771 list_add_tail(&new->list, vc_list);
481 772
482 /* Update the program header offset */ 773 /* Update the program header offset */
483 phdr_ptr->p_offset = vmcore_off; 774 phdr_ptr->p_offset = vmcore_off + (paddr - start);
484 vmcore_off = vmcore_off + phdr_ptr->p_memsz; 775 vmcore_off = vmcore_off + size;
485 } 776 }
486 return 0; 777 return 0;
487} 778}
488 779
489/* Sets offset fields of vmcore elements. */ 780/* Sets offset fields of vmcore elements. */
490static void __init set_vmcore_list_offsets_elf64(char *elfptr, 781static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
491 struct list_head *vc_list) 782 struct list_head *vc_list)
492{ 783{
493 loff_t vmcore_off; 784 loff_t vmcore_off;
494 Elf64_Ehdr *ehdr_ptr;
495 struct vmcore *m; 785 struct vmcore *m;
496 786
497 ehdr_ptr = (Elf64_Ehdr *)elfptr; 787 /* Skip Elf header, program headers and Elf note segment. */
498 788 vmcore_off = elfsz + elfnotes_sz;
499 /* Skip Elf header and program headers. */
500 vmcore_off = sizeof(Elf64_Ehdr) +
501 (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr);
502 789
503 list_for_each_entry(m, vc_list, list) { 790 list_for_each_entry(m, vc_list, list) {
504 m->offset = vmcore_off; 791 m->offset = vmcore_off;
@@ -506,24 +793,12 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr,
506 } 793 }
507} 794}
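
Worked example with hypothetical sizes: for elfcorebuf_sz = 0x1000,
elfnotes_sz = 0x1000 and two chunks of 0x3000 and 0x2000 bytes on the list,
the loop assigns file offsets 0x2000 and 0x5000 respectively.
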
508 795
509/* Sets offset fields of vmcore elements. */ 796static void free_elfcorebuf(void)
510static void __init set_vmcore_list_offsets_elf32(char *elfptr,
511 struct list_head *vc_list)
512{ 797{
513 loff_t vmcore_off; 798 free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
514 Elf32_Ehdr *ehdr_ptr; 799 elfcorebuf = NULL;
515 struct vmcore *m; 800 vfree(elfnotes_buf);
516 801 elfnotes_buf = NULL;
517 ehdr_ptr = (Elf32_Ehdr *)elfptr;
518
519 /* Skip Elf header and program headers. */
520 vmcore_off = sizeof(Elf32_Ehdr) +
521 (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr);
522
523 list_for_each_entry(m, vc_list, list) {
524 m->offset = vmcore_off;
525 vmcore_off += m->size;
526 }
527} 802}
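
Note the pairing here: free_pages() with get_order(elfcorebuf_sz_orig) undoes
the __get_free_pages() allocation in the parse routines below, while vfree()
undoes alloc_elfnotes_buf(), which this call implies is vmalloc-backed. Both
free_pages() on a zero address and vfree(NULL) are no-ops, so the helper is
safe on the partially initialized state reached from the fail: paths.
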
528 803
529static int __init parse_crash_elf64_headers(void) 804static int __init parse_crash_elf64_headers(void)
@@ -554,31 +829,32 @@ static int __init parse_crash_elf64_headers(void)
554 } 829 }
555 830
556 /* Read in all elf headers. */ 831 /* Read in all elf headers. */
557 elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); 832 elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) +
558 elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); 833 ehdr.e_phnum * sizeof(Elf64_Phdr);
834 elfcorebuf_sz = elfcorebuf_sz_orig;
835 elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
836 get_order(elfcorebuf_sz_orig));
559 if (!elfcorebuf) 837 if (!elfcorebuf)
560 return -ENOMEM; 838 return -ENOMEM;
561 addr = elfcorehdr_addr; 839 addr = elfcorehdr_addr;
562 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); 840 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
563 if (rc < 0) { 841 if (rc < 0)
564 kfree(elfcorebuf); 842 goto fail;
565 return rc;
566 }
567 843
568 /* Merge all PT_NOTE headers into one. */ 844 /* Merge all PT_NOTE headers into one. */
569 rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); 845 rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz,
570 if (rc) { 846 &elfnotes_buf, &elfnotes_sz);
571 kfree(elfcorebuf); 847 if (rc)
572 return rc; 848 goto fail;
573 }
574 rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, 849 rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
575 &vmcore_list); 850 elfnotes_sz, &vmcore_list);
576 if (rc) { 851 if (rc)
577 kfree(elfcorebuf); 852 goto fail;
578 return rc; 853 set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
579 }
580 set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list);
581 return 0; 854 return 0;
855fail:
856 free_elfcorebuf();
857 return rc;
582} 858}
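
Once this function succeeds, every region of the exported file starts on a
page boundary; schematically:

    offset 0:                            ELF header + merged program headers
                                         (elfcorebuf_sz, zero-padded to a page multiple)
    offset elfcorebuf_sz:                merged ELF note segment (elfnotes_sz, a page multiple)
    offset elfcorebuf_sz + elfnotes_sz:  page-aligned PT_LOAD chunks from vmcore_list
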
583 859
584static int __init parse_crash_elf32_headers(void) 860static int __init parse_crash_elf32_headers(void)
@@ -609,31 +885,31 @@ static int __init parse_crash_elf32_headers(void)
609 } 885 }
610 886
611 /* Read in all elf headers. */ 887 /* Read in all elf headers. */
612 elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); 888 elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
613 elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); 889 elfcorebuf_sz = elfcorebuf_sz_orig;
890 elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
891 get_order(elfcorebuf_sz_orig));
614 if (!elfcorebuf) 892 if (!elfcorebuf)
615 return -ENOMEM; 893 return -ENOMEM;
616 addr = elfcorehdr_addr; 894 addr = elfcorehdr_addr;
617 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); 895 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
618 if (rc < 0) { 896 if (rc < 0)
619 kfree(elfcorebuf); 897 goto fail;
620 return rc;
621 }
622 898
623 /* Merge all PT_NOTE headers into one. */ 899 /* Merge all PT_NOTE headers into one. */
624 rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); 900 rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz,
625 if (rc) { 901 &elfnotes_buf, &elfnotes_sz);
626 kfree(elfcorebuf); 902 if (rc)
627 return rc; 903 goto fail;
628 }
629 rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, 904 rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz,
630 &vmcore_list); 905 elfnotes_sz, &vmcore_list);
631 if (rc) { 906 if (rc)
632 kfree(elfcorebuf); 907 goto fail;
633 return rc; 908 set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
634 }
635 set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list);
636 return 0; 909 return 0;
910fail:
911 free_elfcorebuf();
912 return rc;
637} 913}
638 914
639static int __init parse_crash_elf_headers(void) 915static int __init parse_crash_elf_headers(void)
@@ -655,20 +931,19 @@ static int __init parse_crash_elf_headers(void)
655 rc = parse_crash_elf64_headers(); 931 rc = parse_crash_elf64_headers();
656 if (rc) 932 if (rc)
657 return rc; 933 return rc;
658
659 /* Determine vmcore size. */
660 vmcore_size = get_vmcore_size_elf64(elfcorebuf);
661 } else if (e_ident[EI_CLASS] == ELFCLASS32) { 934 } else if (e_ident[EI_CLASS] == ELFCLASS32) {
662 rc = parse_crash_elf32_headers(); 935 rc = parse_crash_elf32_headers();
663 if (rc) 936 if (rc)
664 return rc; 937 return rc;
665
666 /* Determine vmcore size. */
667 vmcore_size = get_vmcore_size_elf32(elfcorebuf);
668 } else { 938 } else {
669 pr_warn("Warning: Core image elf header is not sane\n"); 939 pr_warn("Warning: Core image elf header is not sane\n");
670 return -EINVAL; 940 return -EINVAL;
671 } 941 }
942
943 /* Determine vmcore size. */
944 vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
945 &vmcore_list);
946
672 return 0; 947 return 0;
673} 948}
674 949
@@ -711,7 +986,6 @@ void vmcore_cleanup(void)
711 list_del(&m->list); 986 list_del(&m->list);
712 kfree(m); 987 kfree(m);
713 } 988 }
714 kfree(elfcorebuf); 989 free_elfcorebuf();
715 elfcorebuf = NULL;
716} 990}
717EXPORT_SYMBOL_GPL(vmcore_cleanup); 991EXPORT_SYMBOL_GPL(vmcore_cleanup);