author		Linus Torvalds <torvalds@linux-foundation.org>	2016-09-19 19:08:03 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-09-19 19:08:03 -0400
commit		d2ffb0103aaefa9b169da042cf39ce27bfb6cdbb (patch)
tree		967273cfc51bf649cf5f9f4f4ad0cf0be4b633fc
parent		7fadce0d60d09427e0027d3d468781b08ca0b3d1 (diff)
parent		b92ae139c308c5223521ed6ec022148b81312809 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge fixes from Andrew Morton:
 "20 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  rapidio/rio_cm: avoid GFP_KERNEL in atomic context
  Revert "ocfs2: bump up o2cb network protocol version"
  ocfs2: fix start offset to ocfs2_zero_range_for_truncate()
  cgroup: duplicate cgroup reference when cloning sockets
  mm: memcontrol: make per-cpu charge cache IRQ-safe for socket accounting
  ocfs2: fix double unlock in case retry after free truncate log
  fanotify: fix list corruption in fanotify_get_response()
  fsnotify: add a way to stop queueing events on group shutdown
  ocfs2: fix trans extend while free cached blocks
  ocfs2: fix trans extend while flush truncate log
  ipc/shm: fix crash if CONFIG_SHMEM is not set
  mm: fix the page_swap_info() BUG_ON check
  autofs: use dentry flags to block walks during expire
  MAINTAINERS: update email for VLYNQ bus entry
  mm: avoid endless recursion in dump_page()
  mm, thp: fix leaking mapped pte in __collapse_huge_page_swapin()
  khugepaged: fix use-after-free in collapse_huge_page()
  MAINTAINERS: Maik has moved
  ocfs2/dlm: fix race between convert and migration
  mem-hotplug: don't clear the only node in new_node_page()
-rw-r--r--	MAINTAINERS				|  4
-rw-r--r--	drivers/rapidio/rio_cm.c		| 19
-rw-r--r--	fs/autofs4/expire.c			| 55
-rw-r--r--	fs/notify/fanotify/fanotify.c		| 13
-rw-r--r--	fs/notify/fanotify/fanotify_user.c	| 36
-rw-r--r--	fs/notify/group.c			| 19
-rw-r--r--	fs/notify/notification.c		| 23
-rw-r--r--	fs/ocfs2/alloc.c			| 56
-rw-r--r--	fs/ocfs2/cluster/tcp_internal.h		|  5
-rw-r--r--	fs/ocfs2/dlm/dlmconvert.c		| 12
-rw-r--r--	fs/ocfs2/file.c				| 34
-rw-r--r--	fs/ocfs2/suballoc.c			| 14
-rw-r--r--	fs/ramfs/file-mmu.c			|  9
-rw-r--r--	include/linux/fsnotify_backend.h	|  6
-rw-r--r--	kernel/cgroup.c				|  6
-rw-r--r--	mm/debug.c				|  6
-rw-r--r--	mm/khugepaged.c				| 25
-rw-r--r--	mm/memcontrol.c				| 31
-rw-r--r--	mm/memory_hotplug.c			|  4
-rw-r--r--	mm/page_io.c				|  3
-rw-r--r--	mm/swapfile.c				|  1
-rw-r--r--	net/core/sock.c				|  5
22 files changed, 240 insertions(+), 146 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index 644ff65d336d..a0ce40f4c66c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6103,7 +6103,7 @@ S:	Supported
 F:	drivers/cpufreq/intel_pstate.c
 
 INTEL FRAMEBUFFER DRIVER (excluding 810 and 815)
-M:	Maik Broemme <mbroemme@plusserver.de>
+M:	Maik Broemme <mbroemme@libmpq.org>
 L:	linux-fbdev@vger.kernel.org
 S:	Maintained
 F:	Documentation/fb/intelfb.txt
@@ -12569,7 +12569,7 @@ F:	include/linux/if_*vlan.h
 F:	net/8021q/
 
 VLYNQ BUS
-M:	Florian Fainelli <florian@openwrt.org>
+M:	Florian Fainelli <f.fainelli@gmail.com>
 L:	openwrt-devel@lists.openwrt.org (subscribers-only)
 S:	Maintained
 F:	drivers/vlynq/vlynq.c
diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c
index 3fa17ac8df54..cebc296463ad 100644
--- a/drivers/rapidio/rio_cm.c
+++ b/drivers/rapidio/rio_cm.c
@@ -2247,17 +2247,30 @@ static int rio_cm_shutdown(struct notifier_block *nb, unsigned long code,
 {
 	struct rio_channel *ch;
 	unsigned int i;
+	LIST_HEAD(list);
 
 	riocm_debug(EXIT, ".");
 
+	/*
+	 * If there are any channels left in connected state send
+	 * close notification to the connection partner.
+	 * First build a list of channels that require a closing
+	 * notification because function riocm_send_close() should
+	 * be called outside of spinlock protected code.
+	 */
 	spin_lock_bh(&idr_lock);
 	idr_for_each_entry(&ch_idr, ch, i) {
-		riocm_debug(EXIT, "close ch %d", ch->id);
-		if (ch->state == RIO_CM_CONNECTED)
-			riocm_send_close(ch);
+		if (ch->state == RIO_CM_CONNECTED) {
+			riocm_debug(EXIT, "close ch %d", ch->id);
+			idr_remove(&ch_idr, ch->id);
+			list_add(&ch->ch_node, &list);
+		}
 	}
 	spin_unlock_bh(&idr_lock);
 
+	list_for_each_entry(ch, &list, ch_node)
+		riocm_send_close(ch);
+
 	return NOTIFY_DONE;
 }
 
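
The rio_cm hunk above is the "avoid GFP_KERNEL in atomic context" fix: riocm_send_close() can sleep, so the patch first collects the connected channels on a private list under idr_lock and only sends the close notifications once the lock is dropped. Below is a minimal, self-contained userspace sketch of that collect-then-act pattern; pthread spinlocks stand in for the kernel's spin_lock_bh(), and every name in it is hypothetical rather than the driver's real API.

/* build with: cc -o rio_shutdown rio_shutdown.c -lpthread */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct channel {
	int id;
	int connected;
	struct channel *next;		/* global channel table */
	struct channel *close_next;	/* private deferred-close list */
};

static pthread_spinlock_t table_lock;
static struct channel *table;

/* May block (simulated here), so it must run with no spinlock held. */
static void send_close(struct channel *ch)
{
	usleep(1000);
	printf("close ch %d\n", ch->id);
}

static void shutdown_all(void)
{
	struct channel *ch, *close_list = NULL;

	/* Pass 1: under the lock, only collect; never sleep. */
	pthread_spin_lock(&table_lock);
	for (ch = table; ch; ch = ch->next) {
		if (ch->connected) {
			ch->close_next = close_list;
			close_list = ch;
		}
	}
	pthread_spin_unlock(&table_lock);

	/* Pass 2: the sleeping work, lock already dropped. */
	for (ch = close_list; ch; ch = ch->close_next)
		send_close(ch);
}

int main(void)
{
	struct channel chs[3] = {
		{ .id = 0, .connected = 1 },
		{ .id = 1 },
		{ .id = 2, .connected = 1 },
	};

	pthread_spin_init(&table_lock, PTHREAD_PROCESS_PRIVATE);
	chs[0].next = &chs[1];
	chs[1].next = &chs[2];
	table = &chs[0];
	shutdown_all();
	return 0;
}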
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index b493909e7492..d8e6d421c27f 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -417,6 +417,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 	}
 	return NULL;
 }
+
 /*
  * Find an eligible tree to time-out
  * A tree is eligible if :-
@@ -432,6 +433,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	struct dentry *root = sb->s_root;
 	struct dentry *dentry;
 	struct dentry *expired;
+	struct dentry *found;
 	struct autofs_info *ino;
 
 	if (!root)
@@ -442,31 +444,46 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 
 	dentry = NULL;
 	while ((dentry = get_next_positive_subdir(dentry, root))) {
+		int flags = how;
+
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
-		if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
-			expired = NULL;
-		else
-			expired = should_expire(dentry, mnt, timeout, how);
-		if (!expired) {
+		if (ino->flags & AUTOFS_INF_WANT_EXPIRE) {
 			spin_unlock(&sbi->fs_lock);
 			continue;
 		}
+		spin_unlock(&sbi->fs_lock);
+
+		expired = should_expire(dentry, mnt, timeout, flags);
+		if (!expired)
+			continue;
+
+		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(expired);
 		ino->flags |= AUTOFS_INF_WANT_EXPIRE;
 		spin_unlock(&sbi->fs_lock);
 		synchronize_rcu();
-		spin_lock(&sbi->fs_lock);
-		if (should_expire(expired, mnt, timeout, how)) {
-			if (expired != dentry)
-				dput(dentry);
-			goto found;
-		}
 
+		/* Make sure a reference is not taken on found if
+		 * things have changed.
+		 */
+		flags &= ~AUTOFS_EXP_LEAVES;
+		found = should_expire(expired, mnt, timeout, how);
+		if (!found || found != expired)
+			/* Something has changed, continue */
+			goto next;
+
+		if (expired != dentry)
+			dput(dentry);
+
+		spin_lock(&sbi->fs_lock);
+		goto found;
+next:
+		spin_lock(&sbi->fs_lock);
 		ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
+		spin_unlock(&sbi->fs_lock);
 		if (expired != dentry)
 			dput(expired);
-		spin_unlock(&sbi->fs_lock);
 	}
 	return NULL;
 
@@ -483,6 +500,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	int status;
+	int state;
 
 	/* Block on any pending expire */
 	if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE))
@@ -490,8 +508,19 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
 	if (rcu_walk)
 		return -ECHILD;
 
+retry:
 	spin_lock(&sbi->fs_lock);
-	if (ino->flags & AUTOFS_INF_EXPIRING) {
+	state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING);
+	if (state == AUTOFS_INF_WANT_EXPIRE) {
+		spin_unlock(&sbi->fs_lock);
+		/*
+		 * Possibly being selected for expire, wait until
+		 * it's selected or not.
+		 */
+		schedule_timeout_uninterruptible(HZ/10);
+		goto retry;
+	}
+	if (state & AUTOFS_INF_EXPIRING) {
 		spin_unlock(&sbi->fs_lock);
 
 		pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
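
The autofs4_expire_wait() hunk introduces a poll-and-retry wait: when only the "want expire" bit is set, selection is still undecided, so the waiter backs off briefly and re-reads the state under the lock. A small pthread sketch of that wait-until-decided loop, with hypothetical names and nanosleep standing in for schedule_timeout_uninterruptible(HZ/10):

/* build with: cc -o expire_wait expire_wait.c -lpthread */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define WANT_EXPIRE 0x1		/* selection in progress */
#define EXPIRING    0x2		/* selected, expire running */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int flags = WANT_EXPIRE;

static void *selector(void *arg)
{
	(void)arg;
	nanosleep(&(struct timespec){ .tv_nsec = 250 * 1000 * 1000 }, NULL);
	pthread_mutex_lock(&lock);
	flags |= EXPIRING;	/* decision made: it will be expired */
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void expire_wait(void)
{
	int state;

retry:
	pthread_mutex_lock(&lock);
	state = flags & (WANT_EXPIRE | EXPIRING);
	if (state == WANT_EXPIRE) {
		/* Possibly being selected: back off ~100ms, look again. */
		pthread_mutex_unlock(&lock);
		nanosleep(&(struct timespec){ .tv_nsec = 100 * 1000 * 1000 },
			  NULL);
		goto retry;
	}
	pthread_mutex_unlock(&lock);

	if (state & EXPIRING)
		printf("expire in progress, now block on it\n");
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, selector, NULL);
	expire_wait();
	pthread_join(t, NULL);
	return 0;
}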
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index d2f97ecca6a5..e0e5f7c3c99f 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -67,18 +67,7 @@ static int fanotify_get_response(struct fsnotify_group *group,
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	wait_event(group->fanotify_data.access_waitq, event->response ||
-				atomic_read(&group->fanotify_data.bypass_perm));
-
-	if (!event->response) {	/* bypass_perm set */
-		/*
-		 * Event was canceled because group is being destroyed. Remove
-		 * it from group's event list because we are responsible for
-		 * freeing the permission event.
-		 */
-		fsnotify_remove_event(group, &event->fae.fse);
-		return 0;
-	}
+	wait_event(group->fanotify_data.access_waitq, event->response);
 
 	/* userspace responded, convert to something usable */
 	switch (event->response) {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8e8e6bcd1d43..a64313868d3a 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -358,16 +358,20 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	struct fanotify_perm_event_info *event, *next;
+	struct fsnotify_event *fsn_event;
 
 	/*
-	 * There may be still new events arriving in the notification queue
-	 * but since userspace cannot use fanotify fd anymore, no event can
-	 * enter or leave access_list by now.
+	 * Stop new events from arriving in the notification queue. since
+	 * userspace cannot use fanotify fd anymore, no event can enter or
+	 * leave access_list by now either.
 	 */
-	spin_lock(&group->fanotify_data.access_lock);
-
-	atomic_inc(&group->fanotify_data.bypass_perm);
+	fsnotify_group_stop_queueing(group);
 
+	/*
+	 * Process all permission events on access_list and notification queue
+	 * and simulate reply from userspace.
+	 */
+	spin_lock(&group->fanotify_data.access_lock);
 	list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
 				 fae.fse.list) {
 		pr_debug("%s: found group=%p event=%p\n", __func__, group,
@@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 	spin_unlock(&group->fanotify_data.access_lock);
 
 	/*
-	 * Since bypass_perm is set, newly queued events will not wait for
-	 * access response. Wake up the already sleeping ones now.
-	 * synchronize_srcu() in fsnotify_destroy_group() will wait for all
-	 * processes sleeping in fanotify_handle_event() waiting for access
-	 * response and thus also for all permission events to be freed.
+	 * Destroy all non-permission events. For permission events just
+	 * dequeue them and set the response. They will be freed once the
+	 * response is consumed and fanotify_get_response() returns.
 	 */
+	mutex_lock(&group->notification_mutex);
+	while (!fsnotify_notify_queue_is_empty(group)) {
+		fsn_event = fsnotify_remove_first_event(group);
+		if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS))
+			fsnotify_destroy_event(group, fsn_event);
+		else
+			FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
+	}
+	mutex_unlock(&group->notification_mutex);
+
+	/* Response for all permission events it set, wakeup waiters */
 	wake_up(&group->fanotify_data.access_waitq);
 #endif
 
@@ -755,7 +768,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	spin_lock_init(&group->fanotify_data.access_lock);
 	init_waitqueue_head(&group->fanotify_data.access_waitq);
 	INIT_LIST_HEAD(&group->fanotify_data.access_list);
-	atomic_set(&group->fanotify_data.bypass_perm, 0);
 #endif
 	switch (flags & FAN_ALL_CLASS_BITS) {
 	case FAN_CLASS_NOTIF:
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 3e2dd85be5dd..b47f7cfdcaa4 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -40,6 +40,17 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
 }
 
 /*
+ * Stop queueing new events for this group. Once this function returns
+ * fsnotify_add_event() will not add any new events to the group's queue.
+ */
+void fsnotify_group_stop_queueing(struct fsnotify_group *group)
+{
+	mutex_lock(&group->notification_mutex);
+	group->shutdown = true;
+	mutex_unlock(&group->notification_mutex);
+}
+
+/*
  * Trying to get rid of a group. Remove all marks, flush all events and release
  * the group reference.
  * Note that another thread calling fsnotify_clear_marks_by_group() may still
@@ -47,6 +58,14 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
  */
 void fsnotify_destroy_group(struct fsnotify_group *group)
 {
+	/*
+	 * Stop queueing new events. The code below is careful enough to not
+	 * require this but fanotify needs to stop queuing events even before
+	 * fsnotify_destroy_group() is called and this makes the other callers
+	 * of fsnotify_destroy_group() to see the same behavior.
+	 */
+	fsnotify_group_stop_queueing(group);
+
 	/* clear all inode marks for this group, attach them to destroy_list */
 	fsnotify_detach_group_marks(group);
 
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index a95d8e037aeb..e455e83ceeeb 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -82,7 +82,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
  * Add an event to the group notification queue. The group can later pull this
  * event off the queue to deal with. The function returns 0 if the event was
  * added to the queue, 1 if the event was merged with some other queued event,
- * 2 if the queue of events has overflown.
+ * 2 if the event was not queued - either the queue of events has overflown
+ * or the group is shutting down.
  */
 int fsnotify_add_event(struct fsnotify_group *group,
 		       struct fsnotify_event *event,
@@ -96,6 +97,11 @@ int fsnotify_add_event(struct fsnotify_group *group,
 
 	mutex_lock(&group->notification_mutex);
 
+	if (group->shutdown) {
+		mutex_unlock(&group->notification_mutex);
+		return 2;
+	}
+
 	if (group->q_len >= group->max_events) {
 		ret = 2;
 		/* Queue overflow event only if it isn't already queued */
@@ -126,21 +132,6 @@ queue:
 }
 
 /*
- * Remove @event from group's notification queue. It is the responsibility of
- * the caller to destroy the event.
- */
-void fsnotify_remove_event(struct fsnotify_group *group,
-			   struct fsnotify_event *event)
-{
-	mutex_lock(&group->notification_mutex);
-	if (!list_empty(&event->list)) {
-		list_del_init(&event->list);
-		group->q_len--;
-	}
-	mutex_unlock(&group->notification_mutex);
-}
-
-/*
  * Remove and return the first event from the notification list. It is the
  * responsibility of the caller to destroy the obtained event
  */
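
Taken together, the fsnotify hunks replace the fanotify-only bypass_perm counter with a generic shutdown flag that fsnotify_add_event() checks under notification_mutex. A compact userspace sketch of that flag, with a pthread mutex in place of notification_mutex and illustrative names throughout:

/* build with: cc -o shutdownq shutdownq.c -lpthread */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct group {
	pthread_mutex_t lock;	/* plays the role of notification_mutex */
	bool shutdown;
	int q_len, max_events;
};

/* After this returns, add_event() refuses every new event. */
static void group_stop_queueing(struct group *g)
{
	pthread_mutex_lock(&g->lock);
	g->shutdown = true;
	pthread_mutex_unlock(&g->lock);
}

/* Returns 0 if queued, 2 if dropped (overflow or shutting down). */
static int add_event(struct group *g)
{
	int ret = 0;

	pthread_mutex_lock(&g->lock);
	if (g->shutdown || g->q_len >= g->max_events)
		ret = 2;
	else
		g->q_len++;
	pthread_mutex_unlock(&g->lock);
	return ret;
}

int main(void)
{
	struct group g = { PTHREAD_MUTEX_INITIALIZER, false, 0, 4 };

	printf("%d\n", add_event(&g));	/* 0: queued */
	group_stop_queueing(&g);
	printf("%d\n", add_event(&g));	/* 2: rejected after shutdown */
	return 0;
}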
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 7dabbc31060e..f165f867f332 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5922,7 +5922,6 @@ bail:
 }
 
 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
-					 handle_t *handle,
 					 struct inode *data_alloc_inode,
 					 struct buffer_head *data_alloc_bh)
 {
@@ -5935,11 +5934,19 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 	struct ocfs2_truncate_log *tl;
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	handle_t *handle;
 
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
 	tl = &di->id2.i_dealloc;
 	i = le16_to_cpu(tl->tl_used) - 1;
 	while (i >= 0) {
+		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail;
+		}
+
 		/* Caller has given us at least enough credits to
 		 * update the truncate log dinode */
 		status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
@@ -5974,12 +5981,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 			}
 		}
 
-		status = ocfs2_extend_trans(handle,
-					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
+		ocfs2_commit_trans(osb, handle);
 		i--;
 	}
 
@@ -5994,7 +5996,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 {
 	int status;
 	unsigned int num_to_flush;
-	handle_t *handle;
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct inode *data_alloc_inode = NULL;
 	struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -6038,21 +6039,11 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 		goto out_mutex;
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto out_unlock;
-	}
-
-	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
+	status = ocfs2_replay_truncate_records(osb, data_alloc_inode,
 					       data_alloc_bh);
 	if (status < 0)
 		mlog_errno(status);
 
-	ocfs2_commit_trans(osb, handle);
-
-out_unlock:
 	brelse(data_alloc_bh);
 	ocfs2_inode_unlock(data_alloc_inode, 1);
 
@@ -6413,43 +6404,34 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
 		goto out_mutex;
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out_unlock;
-	}
-
 	while (head) {
 		if (head->free_bg)
 			bg_blkno = head->free_bg;
 		else
 			bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
 							      head->free_bit);
+		handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			mlog_errno(ret);
+			goto out_unlock;
+		}
+
 		trace_ocfs2_free_cached_blocks(
 			(unsigned long long)head->free_blk, head->free_bit);
 
 		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
 					       head->free_bit, bg_blkno, 1);
-		if (ret) {
+		if (ret)
 			mlog_errno(ret);
-			goto out_journal;
-		}
 
-		ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
-		if (ret) {
-			mlog_errno(ret);
-			goto out_journal;
-		}
+		ocfs2_commit_trans(osb, handle);
 
 		tmp = head;
 		head = head->free_next;
 		kfree(tmp);
 	}
 
-out_journal:
-	ocfs2_commit_trans(osb, handle);
-
 out_unlock:
 	ocfs2_inode_unlock(inode, 1);
 	brelse(di_bh);
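
Both ocfs2/alloc.c fixes above ("fix trans extend while flush truncate log" and "fix trans extend while free cached blocks") drop the pattern of repeatedly extending one long-lived journal transaction and instead start and commit a short transaction per record, so each iteration needs only a bounded number of credits. A schematic sketch of that per-iteration begin/commit shape; the transaction calls here are stubs, not the real ocfs2 journal API:

/* build with: cc -o replay replay.c */
#include <stdio.h>

/* Stub transaction API: not ocfs2, just the begin/commit shape. */
struct txn { int dummy; };

static struct txn *txn_begin(void)
{
	static struct txn t;
	printf("begin (fresh, bounded credits)\n");
	return &t;
}

static void txn_commit(struct txn *t)
{
	(void)t;
	printf("commit\n");
}

static int process_record(int i)
{
	printf("  record %d\n", i);
	return 0;
}

static int replay_records(int nr)
{
	int i, ret = 0;

	for (i = nr - 1; i >= 0; i--) {
		/* One short transaction per record, instead of one long
		 * transaction that must be repeatedly extended. */
		struct txn *t = txn_begin();

		ret = process_record(i);
		txn_commit(t);
		if (ret)
			break;
	}
	return ret;
}

int main(void)
{
	return replay_records(3);
}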
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 94b18369b1cc..b95e7df5b76a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -44,9 +44,6 @@
  * version here in tcp_internal.h should not need to be bumped for
  * filesystem locking changes.
  *
- * New in version 12
- *	- Negotiate hb timeout when storage is down.
- *
  * New in version 11
  *	- Negotiation of filesystem locking in the dlm join.
  *
@@ -78,7 +75,7 @@
  * - full 64 bit i_size in the metadata lock lvbs
  * - introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 12ULL
+#define O2NET_PROTOCOL_VERSION 11ULL
 struct o2net_handshake {
 	__be64 protocol_version;
 	__be64 connector_id;
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index cdeafb4e7ed6..0bb128659d4b 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -268,7 +268,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
 			  struct dlm_lock *lock, int flags, int type)
 {
 	enum dlm_status status;
-	u8 old_owner = res->owner;
 
 	mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
 	     lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -335,7 +334,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
 
 	spin_lock(&res->spinlock);
 	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
-	lock->convert_pending = 0;
 	/* if it failed, move it back to granted queue.
 	 * if master returns DLM_NORMAL and then down before sending ast,
 	 * it may have already been moved to granted queue, reset to
@@ -344,12 +342,14 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
 		if (status != DLM_NOTQUEUED)
 			dlm_error(status);
 		dlm_revert_pending_convert(res, lock);
-	} else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
-			(old_owner != res->owner)) {
-		mlog(0, "res %.*s is in recovering or has been recovered.\n",
-				res->lockname.len, res->lockname.name);
+	} else if (!lock->convert_pending) {
+		mlog(0, "%s: res %.*s, owner died and lock has been moved back "
+				"to granted list, retry convert.\n",
+				dlm->name, res->lockname.len, res->lockname.name);
 		status = DLM_RECOVERING;
 	}
+
+	lock->convert_pending = 0;
 bail:
 	spin_unlock(&res->spinlock);
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4e7b0dc22450..0b055bfb8e86 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1506,7 +1506,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
 				       u64 start, u64 len)
 {
 	int ret = 0;
-	u64 tmpend, end = start + len;
+	u64 tmpend = 0;
+	u64 end = start + len;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	unsigned int csize = osb->s_clustersize;
 	handle_t *handle;
@@ -1538,18 +1539,31 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
 	}
 
 	/*
-	 * We want to get the byte offset of the end of the 1st cluster.
+	 * If start is on a cluster boundary and end is somewhere in another
+	 * cluster, we have not COWed the cluster starting at start, unless
+	 * end is also within the same cluster. So, in this case, we skip this
+	 * first call to ocfs2_zero_range_for_truncate() truncate and move on
+	 * to the next one.
 	 */
-	tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
-	if (tmpend > end)
-		tmpend = end;
+	if ((start & (csize - 1)) != 0) {
+		/*
+		 * We want to get the byte offset of the end of the 1st
+		 * cluster.
+		 */
+		tmpend = (u64)osb->s_clustersize +
+			(start & ~(osb->s_clustersize - 1));
+		if (tmpend > end)
+			tmpend = end;
 
-	trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
-						 (unsigned long long)tmpend);
+		trace_ocfs2_zero_partial_clusters_range1(
+			(unsigned long long)start,
+			(unsigned long long)tmpend);
 
-	ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
-	if (ret)
-		mlog_errno(ret);
+		ret = ocfs2_zero_range_for_truncate(inode, handle, start,
+						    tmpend);
+		if (ret)
+			mlog_errno(ret);
+	}
 
 	if (tmpend < end) {
 		/*
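
The ocfs2_zero_partial_clusters() fix keys on whether start sits on a cluster boundary, using the usual power-of-two mask tests: start & (csize - 1) is zero exactly when start is aligned, and csize + (start & ~(csize - 1)) is the byte offset of the end of start's cluster. A tiny standalone demonstration of that arithmetic (the sample values are arbitrary):

/* build with: cc -o clusters clusters.c */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t csize = 4096;	/* cluster size, a power of two */
	uint64_t starts[] = { 0, 4096, 5000 };

	for (int i = 0; i < 3; i++) {
		uint64_t start = starts[i];
		/* end of the cluster containing start */
		uint64_t tmpend = csize + (start & ~(csize - 1));

		printf("start=%5llu aligned=%-3s cluster-end=%llu\n",
		       (unsigned long long)start,
		       (start & (csize - 1)) ? "no" : "yes",
		       (unsigned long long)tmpend);
	}
	return 0;
}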
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index ea47120a85ff..6ad3533940ba 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1199,14 +1199,24 @@ retry:
 		inode_unlock((*ac)->ac_inode);
 
 		ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
-		if (ret == 1)
+		if (ret == 1) {
+			iput((*ac)->ac_inode);
+			(*ac)->ac_inode = NULL;
 			goto retry;
+		}
 
 		if (ret < 0)
 			mlog_errno(ret);
 
 		inode_lock((*ac)->ac_inode);
-		ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
+		ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			inode_unlock((*ac)->ac_inode);
+			iput((*ac)->ac_inode);
+			(*ac)->ac_inode = NULL;
+			goto bail;
+		}
 	}
 	if (status < 0) {
 		if (status != -ENOSPC)
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 183a212694bf..12af0490322f 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -27,9 +27,17 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/ramfs.h>
+#include <linux/sched.h>
 
 #include "internal.h"
 
+static unsigned long ramfs_mmu_get_unmapped_area(struct file *file,
+		unsigned long addr, unsigned long len, unsigned long pgoff,
+		unsigned long flags)
+{
+	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+}
+
 const struct file_operations ramfs_file_operations = {
 	.read_iter	= generic_file_read_iter,
 	.write_iter	= generic_file_write_iter,
@@ -38,6 +46,7 @@ const struct file_operations ramfs_file_operations = {
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.llseek		= generic_file_llseek,
+	.get_unmapped_area	= ramfs_mmu_get_unmapped_area,
 };
 
 const struct inode_operations ramfs_file_inode_operations = {
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 58205f33af02..7268ed076be8 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -148,6 +148,7 @@ struct fsnotify_group {
 #define FS_PRIO_1	1 /* fanotify content based access control */
 #define FS_PRIO_2	2 /* fanotify pre-content access */
 	unsigned int priority;
+	bool shutdown;		/* group is being shut down, don't queue more events */
 
 	/* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
 	struct mutex mark_mutex;	/* protect marks_list */
@@ -179,7 +180,6 @@ struct fsnotify_group {
 		spinlock_t access_lock;
 		struct list_head access_list;
 		wait_queue_head_t access_waitq;
-		atomic_t bypass_perm;
 #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
 		int f_flags;
 		unsigned int max_marks;
@@ -292,6 +292,8 @@ extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *op
 extern void fsnotify_get_group(struct fsnotify_group *group);
 /* drop reference on a group from fsnotify_alloc_group */
 extern void fsnotify_put_group(struct fsnotify_group *group);
+/* group destruction begins, stop queuing new events */
+extern void fsnotify_group_stop_queueing(struct fsnotify_group *group);
 /* destroy group */
 extern void fsnotify_destroy_group(struct fsnotify_group *group);
 /* fasync handler function */
@@ -304,8 +306,6 @@ extern int fsnotify_add_event(struct fsnotify_group *group,
 			      struct fsnotify_event *event,
 			      int (*merge)(struct list_head *,
 					   struct fsnotify_event *));
-/* Remove passed event from groups notification queue */
-extern void fsnotify_remove_event(struct fsnotify_group *group, struct fsnotify_event *event);
 /* true if the group notification queue is empty */
 extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
 /* return, but do not dequeue the first event on the notification queue */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d1c51b7f5221..5e8dab5bf9ad 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6270,6 +6270,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 	if (cgroup_sk_alloc_disabled)
 		return;
 
+	/* Socket clone path */
+	if (skcd->val) {
+		cgroup_get(sock_cgroup_ptr(skcd));
+		return;
+	}
+
 	rcu_read_lock();
 
 	while (true) {
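
The cgroup_sk_alloc() change distinguishes the clone path, where the child socket's cgroup pointer was already memcpy'd from the parent, so only an extra reference is needed rather than a fresh lookup. A self-contained sketch of that refcount rule using C11 atomics; cgroup_get() and the surrounding names are simplified stand-ins, not the kernel API:

/* build with: cc -o skref skref.c */
#include <stdatomic.h>
#include <stdio.h>

struct cgroup { atomic_int refcnt; };

static void cgroup_get(struct cgroup *cgrp)
{
	atomic_fetch_add(&cgrp->refcnt, 1);
}

/* Called for both fresh sockets and clones; *skcd is the socket's slot. */
static void sk_cgroup_attach(struct cgroup **skcd, struct cgroup *curr)
{
	if (*skcd) {
		/* Clone path: the pointer was copied from the parent
		 * socket, so it must carry its own reference. */
		cgroup_get(*skcd);
		return;
	}
	cgroup_get(curr);
	*skcd = curr;
}

int main(void)
{
	struct cgroup cg = { 1 };
	struct cgroup *parent = NULL, *child;

	sk_cgroup_attach(&parent, &cg);	/* fresh socket: lookup + ref */
	child = parent;			/* clone copies the pointer */
	sk_cgroup_attach(&child, &cg);	/* clone: extra ref, no lookup */

	printf("refcnt=%d\n", atomic_load(&cg.refcnt));	/* prints 3 */
	return 0;
}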
diff --git a/mm/debug.c b/mm/debug.c
index 8865bfb41b0b..74c7cae4f683 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -42,9 +42,11 @@ const struct trace_print_flags vmaflag_names[] = {
 
 void __dump_page(struct page *page, const char *reason)
 {
+	int mapcount = PageSlab(page) ? 0 : page_mapcount(page);
+
 	pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
-		  page, page_ref_count(page), page_mapcount(page),
-		  page->mapping, page->index);
+		  page, page_ref_count(page), mapcount,
+		  page->mapping, page_to_pgoff(page));
 	if (PageCompound(page))
 		pr_cont(" compound_mapcount: %d", compound_mapcount(page));
 	pr_cont("\n");
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 79c52d0061af..728d7790dc2d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -838,7 +838,8 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
  * value (scan code).
  */
 
-static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address)
+static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
+		struct vm_area_struct **vmap)
 {
 	struct vm_area_struct *vma;
 	unsigned long hstart, hend;
@@ -846,7 +847,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address)
 	if (unlikely(khugepaged_test_exit(mm)))
 		return SCAN_ANY_PROCESS;
 
-	vma = find_vma(mm, address);
+	*vmap = vma = find_vma(mm, address);
 	if (!vma)
 		return SCAN_VMA_NULL;
 
@@ -881,6 +882,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 		.pmd = pmd,
 	};
 
+	/* we only decide to swapin, if there is enough young ptes */
+	if (referenced < HPAGE_PMD_NR/2) {
+		trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
+		return false;
+	}
 	fe.pte = pte_offset_map(pmd, address);
 	for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
 			fe.pte++, fe.address += PAGE_SIZE) {
@@ -888,17 +894,12 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 		if (!is_swap_pte(pteval))
 			continue;
 		swapped_in++;
-		/* we only decide to swapin, if there is enough young ptes */
-		if (referenced < HPAGE_PMD_NR/2) {
-			trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
-			return false;
-		}
 		ret = do_swap_page(&fe, pteval);
 
 		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
 		if (ret & VM_FAULT_RETRY) {
 			down_read(&mm->mmap_sem);
-			if (hugepage_vma_revalidate(mm, address)) {
+			if (hugepage_vma_revalidate(mm, address, &fe.vma)) {
 				/* vma is no longer available, don't continue to swapin */
 				trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
 				return false;
@@ -923,7 +924,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 static void collapse_huge_page(struct mm_struct *mm,
 				   unsigned long address,
 				   struct page **hpage,
-				   struct vm_area_struct *vma,
 				   int node, int referenced)
 {
 	pmd_t *pmd, _pmd;
@@ -933,6 +933,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spinlock_t *pmd_ptl, *pte_ptl;
 	int isolated = 0, result = 0;
 	struct mem_cgroup *memcg;
+	struct vm_area_struct *vma;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;	/* For mmu_notifiers */
 	gfp_t gfp;
@@ -961,7 +962,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	}
 
 	down_read(&mm->mmap_sem);
-	result = hugepage_vma_revalidate(mm, address);
+	result = hugepage_vma_revalidate(mm, address, &vma);
 	if (result) {
 		mem_cgroup_cancel_charge(new_page, memcg, true);
 		up_read(&mm->mmap_sem);
@@ -994,7 +995,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * handled by the anon_vma lock + PG_lock.
 	 */
 	down_write(&mm->mmap_sem);
-	result = hugepage_vma_revalidate(mm, address);
+	result = hugepage_vma_revalidate(mm, address, &vma);
 	if (result)
 		goto out;
 	/* check if the pmd is still valid */
@@ -1202,7 +1203,7 @@ out_unmap:
 	if (ret) {
 		node = khugepaged_find_target_node();
 		/* collapse_huge_page will return with the mmap_sem released */
-		collapse_huge_page(mm, address, hpage, vma, node, referenced);
+		collapse_huge_page(mm, address, hpage, node, referenced);
 	}
 out:
 	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
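
The khugepaged use-after-free fix makes hugepage_vma_revalidate() hand back the freshly looked-up VMA through an out-parameter, so that after mmap_sem has been dropped and retaken the caller can never keep dereferencing a stale pointer. Stripped to its essence, with all names hypothetical:

/* build with: cc -o reval reval.c */
#include <stdio.h>

struct vma { int start, end; };

static struct vma table_entry = { 0, 4096 };

/*
 * Fresh lookup that also publishes the result through *vmap, so the
 * caller's old pointer is always replaced before it can be used again.
 */
static int revalidate(int addr, struct vma **vmap)
{
	*vmap = &table_entry;
	if (addr < table_entry.start || addr >= table_entry.end)
		return -1;	/* range no longer suitable */
	return 0;
}

int main(void)
{
	struct vma *vma = NULL;	/* any earlier value may be stale */

	/* ...lock dropped and retaken here; old vma must not be trusted... */
	if (revalidate(128, &vma))
		return 1;
	printf("vma [%d,%d)\n", vma->start, vma->end);
	return 0;
}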
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9a6a51a7c416..4be518d4e68a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1740,17 +1740,22 @@ static DEFINE_MUTEX(percpu_charge_mutex);
 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	struct memcg_stock_pcp *stock;
+	unsigned long flags;
 	bool ret = false;
 
 	if (nr_pages > CHARGE_BATCH)
 		return ret;
 
-	stock = &get_cpu_var(memcg_stock);
+	local_irq_save(flags);
+
+	stock = this_cpu_ptr(&memcg_stock);
 	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
 		stock->nr_pages -= nr_pages;
 		ret = true;
 	}
-	put_cpu_var(memcg_stock);
+
+	local_irq_restore(flags);
+
 	return ret;
 }
 
@@ -1771,15 +1776,18 @@ static void drain_stock(struct memcg_stock_pcp *stock)
 	stock->cached = NULL;
 }
 
-/*
- * This must be called under preempt disabled or must be called by
- * a thread which is pinned to local cpu.
- */
 static void drain_local_stock(struct work_struct *dummy)
 {
-	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
+	struct memcg_stock_pcp *stock;
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	stock = this_cpu_ptr(&memcg_stock);
 	drain_stock(stock);
 	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+
+	local_irq_restore(flags);
 }
 
 /*
@@ -1788,14 +1796,19 @@ static void drain_local_stock(struct work_struct *dummy)
  */
 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
-	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
+	struct memcg_stock_pcp *stock;
+	unsigned long flags;
+
+	local_irq_save(flags);
 
+	stock = this_cpu_ptr(&memcg_stock);
 	if (stock->cached != memcg) { /* reset if necessary */
 		drain_stock(stock);
 		stock->cached = memcg;
 	}
 	stock->nr_pages += nr_pages;
-	put_cpu_var(memcg_stock);
+
+	local_irq_restore(flags);
 }
 
 /*
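
The memcontrol hunks swap get_cpu_var()/put_cpu_var(), which only disable preemption, for local_irq_save()/restore(), because the per-cpu charge cache is now also touched from IRQ context by socket-memory accounting. A rough userspace analog, assuming the reader accepts a signal handler as the stand-in for an interrupt: blocking the signal around the read-modify-write plays the role of masking IRQs, and all names here are illustrative.

/* build with: cc -o stock stock.c */
#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t stock_pages = 8;

/* Stand-in for an interrupt handler that also touches the cache. */
static void irq_handler(int sig)
{
	(void)sig;
	stock_pages += 4;
}

static int consume_stock(int nr)
{
	sigset_t block, old;
	int ok = 0;

	sigemptyset(&block);
	sigaddset(&block, SIGALRM);
	sigprocmask(SIG_BLOCK, &block, &old);	/* ~ local_irq_save(flags) */

	if (stock_pages >= nr) {		/* RMW now can't be interrupted */
		stock_pages -= nr;
		ok = 1;
	}

	sigprocmask(SIG_SETMASK, &old, NULL);	/* ~ local_irq_restore(flags) */
	return ok;
}

int main(void)
{
	int ok;

	signal(SIGALRM, irq_handler);
	ok = consume_stock(4);
	printf("consumed=%d left=%d\n", ok, (int)stock_pages);
	return 0;
}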
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 41266dc29f33..b58906b6215c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1567,7 +1567,9 @@ static struct page *new_node_page(struct page *page, unsigned long private,
 		return alloc_huge_page_node(page_hstate(compound_head(page)),
 					next_node_in(nid, nmask));
 
-	node_clear(nid, nmask);
+	if (nid != next_node_in(nid, nmask))
+		node_clear(nid, nmask);
+
 	if (PageHighMem(page)
 	    || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
 		gfp_mask |= __GFP_HIGHMEM;
diff --git a/mm/page_io.c b/mm/page_io.c
index 16bd82fad38c..eafe5ddc2b54 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -264,6 +264,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 	int ret;
 	struct swap_info_struct *sis = page_swap_info(page);
 
+	BUG_ON(!PageSwapCache(page));
 	if (sis->flags & SWP_FILE) {
 		struct kiocb kiocb;
 		struct file *swap_file = sis->swap_file;
@@ -337,6 +338,7 @@ int swap_readpage(struct page *page)
 	int ret = 0;
 	struct swap_info_struct *sis = page_swap_info(page);
 
+	BUG_ON(!PageSwapCache(page));
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(PageUptodate(page), page);
 	if (frontswap_load(page) == 0) {
@@ -386,6 +388,7 @@ int swap_set_page_dirty(struct page *page)
 
 	if (sis->flags & SWP_FILE) {
 		struct address_space *mapping = sis->swap_file->f_mapping;
+		BUG_ON(!PageSwapCache(page));
 		return mapping->a_ops->set_page_dirty(page);
 	} else {
 		return __set_page_dirty_no_writeback(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 78cfa292a29a..2657accc6e2b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2724,7 +2724,6 @@ int swapcache_prepare(swp_entry_t entry)
 struct swap_info_struct *page_swap_info(struct page *page)
 {
 	swp_entry_t swap = { .val = page_private(page) };
-	BUG_ON(!PageSwapCache(page));
 	return swap_info[swp_type(swap)];
 }
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 25dab8b60223..fd7b41edf1ce 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1362,7 +1362,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 		if (!try_module_get(prot->owner))
 			goto out_free_sec;
 		sk_tx_queue_clear(sk);
-		cgroup_sk_alloc(&sk->sk_cgrp_data);
 	}
 
 	return sk;
@@ -1422,6 +1421,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		sock_net_set(sk, net);
 		atomic_set(&sk->sk_wmem_alloc, 1);
 
+		cgroup_sk_alloc(&sk->sk_cgrp_data);
 		sock_update_classid(&sk->sk_cgrp_data);
 		sock_update_netprioidx(&sk->sk_cgrp_data);
 	}
@@ -1566,6 +1566,9 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 	newsk->sk_priority = 0;
 	newsk->sk_incoming_cpu = raw_smp_processor_id();
 	atomic64_set(&newsk->sk_cookie, 0);
+
+	cgroup_sk_alloc(&newsk->sk_cgrp_data);
+
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.txt for details)