author     Linus Torvalds <torvalds@linux-foundation.org>  2016-05-27 17:56:59 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-05-27 17:56:59 -0400
commit     af7d93729c7c2beadea8ec5a6e66c53bef0e6290 (patch)
tree       d807ab034c13fe7e758c8ca11fb8ee38e9ceb38c
parent     564884fbdecaea56fb65f2f32963059d3049b967 (diff)
parent     11e685672a0861ce136cc4e7f6fdd11e5390b1fa (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates and fixes from Andrew Morton:
 - late-breaking ocfs2 updates
 - random bunch of fixes

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: disable DEFERRED_STRUCT_PAGE_INIT on !NO_BOOTMEM
  mm/memcontrol.c: move comments for get_mctgt_type() to proper position
  mm/memcontrol.c: fix the margin computation in mem_cgroup_margin()
  mm/cma: silence warnings due to max() usage
  mm: thp: avoid false positive VM_BUG_ON_PAGE in page_move_anon_rmap()
  oom_reaper: close race with exiting task
  mm: use early_pfn_to_nid in register_page_bootmem_info_node
  mm: use early_pfn_to_nid in page_ext_init
  MAINTAINERS: Kdump maintainers update
  MAINTAINERS: add kexec_core.c and kexec_file.c
  mm: oom: do not reap task if there are live threads in threadgroup
  direct-io: fix direct write stale data exposure from concurrent buffered read
  ocfs2: bump up o2cb network protocol version
  ocfs2: o2hb: fix hb hung time
  ocfs2: o2hb: don't negotiate if last hb fail
  ocfs2: o2hb: add some user/debug log
  ocfs2: o2hb: add NEGOTIATE_APPROVE message
  ocfs2: o2hb: add NEGO_TIMEOUT message
  ocfs2: o2hb: add negotiate timer
-rw-r--r--  MAINTAINERS                      |    7
-rw-r--r--  fs/direct-io.c                   |   14
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c     |  180
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h  |    5
-rw-r--r--  init/main.c                      |    3
-rw-r--r--  mm/Kconfig                       |    2
-rw-r--r--  mm/cma.c                         |    7
-rw-r--r--  mm/memcontrol.c                  |   39
-rw-r--r--  mm/memory_hotplug.c              |    2
-rw-r--r--  mm/oom_kill.c                    |   32
-rw-r--r--  mm/page_ext.c                    |    4
-rw-r--r--  mm/rmap.c                        |    2
12 files changed, 246 insertions(+), 51 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index f2d7337ebdb3..f466673f86ff 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6421,8 +6421,9 @@ F:	Documentation/kbuild/kconfig-language.txt
 F:	scripts/kconfig/
 
 KDUMP
-M:	Vivek Goyal <vgoyal@redhat.com>
-M:	Haren Myneni <hbabu@us.ibm.com>
+M:	Dave Young <dyoung@redhat.com>
+M:	Baoquan He <bhe@redhat.com>
+R:	Vivek Goyal <vgoyal@redhat.com>
 L:	kexec@lists.infradead.org
 W:	http://lse.sourceforge.net/kdump/
 S:	Maintained
@@ -6568,7 +6569,7 @@ L:	kexec@lists.infradead.org
 S:	Maintained
 F:	include/linux/kexec.h
 F:	include/uapi/linux/kexec.h
-F:	kernel/kexec.c
+F:	kernel/kexec*
 
 KEYS/KEYRINGS:
 M:	David Howells <dhowells@redhat.com>
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3bf3f20f8ecc..f3b4408be590 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -628,11 +628,11 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 	map_bh->b_size = fs_count << i_blkbits;
 
 	/*
-	 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
-	 * forbid block creations: only overwrites are permitted.
-	 * We will return early to the caller once we see an
-	 * unmapped buffer head returned, and the caller will fall
-	 * back to buffered I/O.
+	 * For writes that could fill holes inside i_size on a
+	 * DIO_SKIP_HOLES filesystem we forbid block creations: only
+	 * overwrites are permitted. We will return early to the caller
+	 * once we see an unmapped buffer head returned, and the caller
+	 * will fall back to buffered I/O.
 	 *
 	 * Otherwise the decision is left to the get_blocks method,
 	 * which may decide to handle it or also return an unmapped
@@ -640,8 +640,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 	 */
 	create = dio->rw & WRITE;
 	if (dio->flags & DIO_SKIP_HOLES) {
-		if (sdio->block_in_file < (i_size_read(dio->inode) >>
-					sdio->blkbits))
+		if (fs_startblk <= ((i_size_read(dio->inode) - 1) >>
+					i_blkbits))
 			create = 0;
 	}
 
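The hunk above changes the comparison's units: sdio->block_in_file counts dio-sized sub-blocks, so for a sparse file shorter than one sub-block the old test degenerated to 0 < 0, the write was treated as extending, and a block could be mapped inside i_size before any data reached it, which a concurrent buffered read could then expose. A standalone arithmetic sketch of the two checks, with illustrative sizes (a 256-byte sparse file, 4K fs blocks, 512-byte dio blocks; not taken from any real trace):

/* Illustrative arithmetic only: shows why the old sub-block check in
 * get_more_blocks() mislabels a write as "beyond i_size" while the new
 * fs-block check does not. All constants below are hypothetical. */
#include <stdio.h>

int main(void)
{
	unsigned long long i_size = 256;      /* sparse file, smaller than one fs block */
	unsigned i_blkbits = 12;              /* 4096-byte fs blocks */
	unsigned dio_blkbits = 9;             /* 512-byte dio blocks */
	unsigned long long block_in_file = 0; /* write starts at offset 0 */
	unsigned long long fs_startblk = block_in_file >> (i_blkbits - dio_blkbits);

	/* Old check: dio-block index against i_size in dio blocks. */
	int old_create = !(block_in_file < (i_size >> dio_blkbits));
	/* New check: fs-block index against the block holding the last byte. */
	int new_create = !(fs_startblk <= ((i_size - 1) >> i_blkbits));

	printf("old: create=%d (wrongly allocates inside i_size)\n", old_create);
	printf("new: create=%d (overwrite only, no stale-data window)\n", new_create);
	return 0;
}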
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a8d15beee5cb..6aaf3e351391 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -272,10 +272,21 @@ struct o2hb_region {
 	struct delayed_work	hr_write_timeout_work;
 	unsigned long		hr_last_timeout_start;
 
+	/* negotiate timer, used to negotiate extending hb timeout. */
+	struct delayed_work	hr_nego_timeout_work;
+	unsigned long		hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
 	/* Used during o2hb_check_slot to hold a copy of the block
 	 * being checked because we temporarily have to zero out the
 	 * crc field. */
 	struct o2hb_disk_heartbeat_block *hr_tmp_block;
+
+	/* Message key for negotiate timeout message. */
+	unsigned int		hr_key;
+	struct list_head	hr_handler_list;
+
+	/* last hb status, 0 for success, other value for error. */
+	int			hr_last_hb_status;
 };
 
 struct o2hb_bio_wait_ctxt {
@@ -284,6 +295,17 @@ struct o2hb_bio_wait_ctxt {
 	int			wc_error;
 };
 
+#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2)
+
+enum {
+	O2HB_NEGO_TIMEOUT_MSG = 1,
+	O2HB_NEGO_APPROVE_MSG = 2,
+};
+
+struct o2hb_nego_msg {
+	u8 node_num;
+};
+
 static void o2hb_write_timeout(struct work_struct *work)
 {
 	int failed, quorum;
@@ -319,7 +341,7 @@ static void o2hb_write_timeout(struct work_struct *work)
 	o2quo_disk_timeout();
 }
 
-static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+static void o2hb_arm_timeout(struct o2hb_region *reg)
 {
 	/* Arm writeout only after thread reaches steady state */
 	if (atomic_read(&reg->hr_steady_iterations) != 0)
@@ -334,14 +356,132 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
 		spin_unlock(&o2hb_live_lock);
 	}
 	cancel_delayed_work(&reg->hr_write_timeout_work);
-	reg->hr_last_timeout_start = jiffies;
 	schedule_delayed_work(&reg->hr_write_timeout_work,
 			      msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+
+	cancel_delayed_work(&reg->hr_nego_timeout_work);
+	/* negotiate timeout must be less than write timeout. */
+	schedule_delayed_work(&reg->hr_nego_timeout_work,
+			      msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS));
+	memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap));
 }
 
-static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+static void o2hb_disarm_timeout(struct o2hb_region *reg)
 {
 	cancel_delayed_work_sync(&reg->hr_write_timeout_work);
+	cancel_delayed_work_sync(&reg->hr_nego_timeout_work);
+}
+
+static int o2hb_send_nego_msg(int key, int type, u8 target)
+{
+	struct o2hb_nego_msg msg;
+	int status, ret;
+
+	msg.node_num = o2nm_this_node();
+again:
+	ret = o2net_send_message(type, key, &msg, sizeof(msg),
+			target, &status);
+
+	if (ret == -EAGAIN || ret == -ENOMEM) {
+		msleep(100);
+		goto again;
+	}
+
+	return ret;
+}
+
+static void o2hb_nego_timeout(struct work_struct *work)
+{
+	unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int master_node, i, ret;
+	struct o2hb_region *reg;
+
+	reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work);
+	/* don't negotiate timeout if last hb failed since it is very
+	 * possible io failed. Should let write timeout fence self.
+	 */
+	if (reg->hr_last_hb_status)
+		return;
+
+	o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+	/* lowest node as master node to make negotiate decision. */
+	master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0);
+
+	if (master_node == o2nm_this_node()) {
+		if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
+			printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n",
+				o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
+				config_item_name(&reg->hr_item), reg->hr_dev_name);
+			set_bit(master_node, reg->hr_nego_node_bitmap);
+		}
+		if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
+				sizeof(reg->hr_nego_node_bitmap))) {
+			/* check negotiate bitmap every second to do timeout
+			 * approve decision.
+			 */
+			schedule_delayed_work(&reg->hr_nego_timeout_work,
+				msecs_to_jiffies(1000));
+
+			return;
+		}
+
+		printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n",
+			config_item_name(&reg->hr_item), reg->hr_dev_name);
+		/* approve negotiate timeout request. */
+		o2hb_arm_timeout(reg);
+
+		i = -1;
+		while ((i = find_next_bit(live_node_bitmap,
+				O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+			if (i == master_node)
+				continue;
+
+			mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i);
+			ret = o2hb_send_nego_msg(reg->hr_key,
+					O2HB_NEGO_APPROVE_MSG, i);
+			if (ret)
+				mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n",
+					i, ret);
+		}
+	} else {
+		/* negotiate timeout with master node. */
+		printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n",
+			o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
+			reg->hr_dev_name, master_node);
+		ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
+				master_node);
+		if (ret)
+			mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n",
+				master_node, ret);
+	}
+}
+
+static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
+				void **ret_data)
+{
+	struct o2hb_region *reg = data;
+	struct o2hb_nego_msg *nego_msg;
+
+	nego_msg = (struct o2hb_nego_msg *)msg->buf;
+	printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n",
+		nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_dev_name);
+	if (nego_msg->node_num < O2NM_MAX_NODES)
+		set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
+	else
+		mlog(ML_ERROR, "got nego timeout message from bad node.\n");
+
+	return 0;
+}
+
+static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
+				void **ret_data)
+{
+	struct o2hb_region *reg = data;
+
+	printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n",
+		config_item_name(&reg->hr_item), reg->hr_dev_name);
+	o2hb_arm_timeout(reg);
+	return 0;
 }
 
 static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -1032,7 +1172,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 	/* Skip disarming the timeout if own slot has stale/bad data */
 	if (own_slot_ok) {
 		o2hb_set_quorum_device(reg);
-		o2hb_arm_write_timeout(reg);
+		o2hb_arm_timeout(reg);
+		reg->hr_last_timeout_start = jiffies;
 	}
 
 bail:
@@ -1096,6 +1237,7 @@ static int o2hb_thread(void *data)
 		before_hb = ktime_get_real();
 
 		ret = o2hb_do_disk_heartbeat(reg);
+		reg->hr_last_hb_status = ret;
 
 		after_hb = ktime_get_real();
 
@@ -1114,7 +1256,7 @@ static int o2hb_thread(void *data)
 		}
 	}
 
-	o2hb_disarm_write_timeout(reg);
+	o2hb_disarm_timeout(reg);
 
 	/* unclean stop is only used in very bad situation */
 	for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
@@ -1451,6 +1593,7 @@ static void o2hb_region_release(struct config_item *item)
 	list_del(&reg->hr_all_item);
 	spin_unlock(&o2hb_live_lock);
 
+	o2net_unregister_handler_list(&reg->hr_handler_list);
 	kfree(reg);
 }
 
@@ -1762,6 +1905,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 	}
 
 	INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
+	INIT_DELAYED_WORK(&reg->hr_nego_timeout_work, o2hb_nego_timeout);
 
 	/*
 	 * A node is considered live after it has beat LIVE_THRESHOLD
@@ -1995,13 +2139,37 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
 
 	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
 
+	/* this is the same way to generate msg key as dlm, for local heartbeat,
+	 * name is also the same, so make initial crc value different to avoid
+	 * message key conflict.
+	 */
+	reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS,
+		name, strlen(name));
+	INIT_LIST_HEAD(&reg->hr_handler_list);
+	ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key,
+			sizeof(struct o2hb_nego_msg),
+			o2hb_nego_timeout_handler,
+			reg, NULL, &reg->hr_handler_list);
+	if (ret)
+		goto free;
+
+	ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key,
+			sizeof(struct o2hb_nego_msg),
+			o2hb_nego_approve_handler,
+			reg, NULL, &reg->hr_handler_list);
+	if (ret)
+		goto unregister_handler;
+
 	ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
 	if (ret) {
 		config_item_put(&reg->hr_item);
-		goto free;
+		goto unregister_handler;
 	}
 
 	return &reg->hr_item;
+
+unregister_handler:
+	o2net_unregister_handler_list(&reg->hr_handler_list);
 free:
 	kfree(reg);
 	return ERR_PTR(ret);
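Taken together, the additions above form a small two-message protocol: each node whose heartbeat write hangs sends O2HB_NEGO_TIMEOUT_MSG to the lowest-numbered live node (the master), which records requests in hr_nego_node_bitmap and re-checks every second; only once every live node has reported a hang (storage is likely down for everyone, so fencing any one node is pointless) does the master re-arm its own timers and broadcast O2HB_NEGO_APPROVE_MSG. A toy userspace model of just the master's decision, with an 8-slot array standing in for the O2NM_MAX_NODES bitmap (sizes and node numbers are illustrative, not the kernel's):

#include <stdio.h>

#define MAX_NODES 8

/* Toy model: extend the timeout only after every live node has
 * reported a hung heartbeat write. */
static int all_live_nodes_hung(const unsigned char *live,
			       const unsigned char *nego)
{
	for (int i = 0; i < MAX_NODES; i++)
		if (live[i] && !nego[i])
			return 0;
	return 1;
}

int main(void)
{
	unsigned char live[MAX_NODES] = {1, 1, 1}; /* nodes 0..2 alive */
	unsigned char nego[MAX_NODES] = {0};

	/* Nodes report hung hb writes one by one (NEGO_TIMEOUT msgs). */
	for (int n = 0; n < 3; n++) {
		nego[n] = 1;
		if (all_live_nodes_hung(live, nego))
			printf("master: all live nodes hung, send NEGO_APPROVE\n");
		else
			printf("master: node %d hung, keep waiting\n", n);
	}
	return 0;
}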
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index b95e7df5b76a..94b18369b1cc 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -44,6 +44,9 @@
  * version here in tcp_internal.h should not need to be bumped for
  * filesystem locking changes.
  *
+ * New in version 12
+ *	- Negotiate hb timeout when storage is down.
+ *
  * New in version 11
  *	- Negotiation of filesystem locking in the dlm join.
  *
@@ -75,7 +78,7 @@
  * - full 64 bit i_size in the metadata lock lvbs
  * - introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 11ULL
+#define O2NET_PROTOCOL_VERSION 12ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;
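Bumping O2NET_PROTOCOL_VERSION acts as a compatibility fence: peers exchange struct o2net_handshake at connect time and a version mismatch ends the connection, so a pre-negotiation node is never handed the new NEGO_* messages. A minimal sketch of that style of gate (illustrative only, not the kernel's actual handshake code):

#include <stdint.h>
#include <stdio.h>

#define O2NET_PROTOCOL_VERSION 12ULL

/* Sketch of a handshake version gate: accept only an exact match. */
static int check_handshake(uint64_t peer_version)
{
	if (peer_version != O2NET_PROTOCOL_VERSION) {
		fprintf(stderr, "peer speaks v%llu, we speak v%llu: disconnect\n",
			(unsigned long long)peer_version,
			(unsigned long long)O2NET_PROTOCOL_VERSION);
		return -1;
	}
	return 0;
}

int main(void)
{
	check_handshake(11);			/* old node: refused */
	return check_handshake(12) ? 1 : 0;	/* same version: accepted */
}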
diff --git a/init/main.c b/init/main.c
index bc0f9e0bcf22..4c17fda5c2ff 100644
--- a/init/main.c
+++ b/init/main.c
@@ -607,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
+	page_ext_init();
 	debug_objects_mem_init();
 	kmemleak_init();
 	setup_per_cpu_pageset();
@@ -1003,8 +1004,6 @@ static noinline void __init kernel_init_freeable(void)
 	sched_init_smp();
 
 	page_alloc_init_late();
-	/* Initialize page ext after all struct pages are initializaed */
-	page_ext_init();
 
 	do_basic_setup();
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 22fa8189e4fc..3e2daef3c946 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -648,7 +648,7 @@ config DEFERRED_STRUCT_PAGE_INIT
 	bool "Defer initialisation of struct pages to kthreads"
 	default n
 	depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
-	depends on MEMORY_HOTPLUG
+	depends on NO_BOOTMEM && MEMORY_HOTPLUG
 	depends on !FLATMEM
 	help
 	  Ordinarily all struct pages are initialised during early boot in a
diff --git a/mm/cma.c b/mm/cma.c
index ea506eb18cd6..bd0e1412475e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -183,7 +183,8 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
 		return -EINVAL;
 
 	/* ensure minimal alignment required by mm core */
-	alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
+	alignment = PAGE_SIZE <<
+			max_t(unsigned long, MAX_ORDER - 1, pageblock_order);
 
 	/* alignment should be aligned with order_per_bit */
 	if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit))
@@ -266,8 +267,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
 	 * migratetype page by page allocator's buddy algorithm. In the case,
 	 * you couldn't get a contiguous memory, which is not what we want.
 	 */
-	alignment = max(alignment,
-		(phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
+	alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
+			  max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
 	base = ALIGN(base, alignment);
 	size = ALIGN(size, alignment);
 	limit &= ~(alignment - 1);
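The warning both hunks silence comes from the kernel's max(), which rejects arguments of mismatched types: MAX_ORDER - 1 is an int while pageblock_order can be an unsigned long variable (or a plain constant, depending on config), so the patch switches to max_t(), which casts both sides to a named type first. A userspace sketch with a simplified max_t and illustrative values for the two quantities:

/* Simplified imitation of the kernel's max_t() (uses the GCC/Clang
 * statement-expression extension, as include/linux/kernel.h does).
 * The numeric values below are hypothetical. */
#include <stdio.h>

#define max_t(type, x, y) ({			\
	type __x = (x);				\
	type __y = (y);				\
	__x > __y ? __x : __y; })

int main(void)
{
	int max_order_m1 = 11 - 1;          /* MAX_ORDER - 1 on many configs */
	unsigned long pageblock_order = 9;  /* illustrative value */

	/* Both operands are cast to unsigned long before comparing. */
	unsigned long shift = max_t(unsigned long, max_order_m1, pageblock_order);
	printf("alignment = PAGE_SIZE << %lu\n", shift);
	return 0;
}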
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f6477a9dbe7a..925b431f3f03 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1108,6 +1108,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 		limit = READ_ONCE(memcg->memsw.limit);
 		if (count <= limit)
 			margin = min(margin, limit - count);
+		else
+			margin = 0;
 	}
 
 	return margin;
@@ -4307,24 +4309,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
 	return 0;
 }
 
-/**
- * get_mctgt_type - get target type of moving charge
- * @vma: the vma the pte to be checked belongs
- * @addr: the address corresponding to the pte to be checked
- * @ptent: the pte to be checked
- * @target: the pointer the target page or swap ent will be stored(can be NULL)
- *
- * Returns
- *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
- *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
- *     move charge. if @target is not NULL, the page is stored in target->page
- *     with extra refcnt got(Callers should handle it).
- *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
- *     target for charge migration. if @target is not NULL, the entry is stored
- *     in target->ent.
- *
- * Called with pte lock held.
- */
 union mc_target {
 	struct page *page;
 	swp_entry_t ent;
@@ -4513,6 +4497,25 @@ out:
 	return ret;
 }
 
+/**
+ * get_mctgt_type - get target type of moving charge
+ * @vma: the vma the pte to be checked belongs
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: the pointer the target page or swap ent will be stored(can be NULL)
+ *
+ * Returns
+ *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
+ *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
+ *     move charge. if @target is not NULL, the page is stored in target->page
+ *     with extra refcnt got(Callers should handle it).
+ *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ *     target for charge migration. if @target is not NULL, the entry is stored
+ *     in target->ent.
+ *
+ * Called with pte lock held.
+ */
+
 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		unsigned long addr, pte_t ptent, union mc_target *target)
 {
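The else branch added to mem_cgroup_margin() matters when the memsw counter is already over its limit: the old code skipped the min() entirely and returned the plain-memory margin, overstating how much can still be charged. Worked numbers (illustrative page counts, not from any real cgroup):

/* Standalone model of the fixed margin computation. */
#include <stdio.h>

static unsigned long margin_fixed(unsigned long mem_count, unsigned long mem_limit,
				  unsigned long sw_count, unsigned long sw_limit)
{
	unsigned long margin = 0;

	if (mem_count <= mem_limit)
		margin = mem_limit - mem_count;

	if (sw_count <= sw_limit)
		margin = margin < (sw_limit - sw_count) ? margin : (sw_limit - sw_count);
	else
		margin = 0;	/* the added branch: memsw already over its limit */

	return margin;
}

int main(void)
{
	/* memory: 100 of 200 pages used; memsw: 300 of 250 used (over limit). */
	printf("margin = %lu (old code would have returned 100)\n",
	       margin_fixed(100, 200, 300, 250));
	return 0;
}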
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index caf2a14c37ad..b8ee0806415f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -300,7 +300,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 		 * multiple nodes we check that this pfn does not already
 		 * reside in some other nodes.
 		 */
-		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
 			register_page_bootmem_info_section(pfn);
 	}
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5bb2f7698ad7..dfb1ab61fb23 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -443,13 +443,29 @@ static bool __oom_reap_task(struct task_struct *tsk)
 {
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
-	struct mm_struct *mm;
+	struct mm_struct *mm = NULL;
 	struct task_struct *p;
 	struct zap_details details = {.check_swap_entries = true,
 				      .ignore_dirty = true};
 	bool ret = true;
 
 	/*
+	 * We have to make sure to not race with the victim exit path
+	 * and cause premature new oom victim selection:
+	 * __oom_reap_task		exit_mm
+	 *   atomic_inc_not_zero
+	 *				  mmput
+	 *				    atomic_dec_and_test
+	 *				  exit_oom_victim
+	 *				[...]
+	 *				out_of_memory
+	 *				  select_bad_process
+	 *				    # no TIF_MEMDIE task selects new victim
+	 *  unmap_page_range # frees some memory
+	 */
+	mutex_lock(&oom_lock);
+
+	/*
 	 * Make sure we find the associated mm_struct even when the particular
 	 * thread has already terminated and cleared its mm.
 	 * We might have race with exit path so consider our work done if there
@@ -457,19 +473,19 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 */
 	p = find_lock_task_mm(tsk);
 	if (!p)
-		return true;
+		goto unlock_oom;
 
 	mm = p->mm;
 	if (!atomic_inc_not_zero(&mm->mm_users)) {
 		task_unlock(p);
-		return true;
+		goto unlock_oom;
 	}
 
 	task_unlock(p);
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		ret = false;
-		goto out;
+		goto unlock_oom;
 	}
 
 	tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -511,13 +527,15 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 * to release its memory.
 	 */
 	set_bit(MMF_OOM_REAPED, &mm->flags);
-out:
+unlock_oom:
+	mutex_unlock(&oom_lock);
 	/*
 	 * Drop our reference but make sure the mmput slow path is called from a
 	 * different context because we shouldn't risk we get stuck there and
 	 * put the oom_reaper out of the way.
 	 */
-	mmput_async(mm);
+	if (mm)
+		mmput_async(mm);
 	return ret;
 }
 
@@ -611,8 +629,6 @@ void try_oom_reaper(struct task_struct *tsk)
 
 		if (!process_shares_mm(p, mm))
 			continue;
-		if (same_thread_group(p, tsk))
-			continue;
 		if (fatal_signal_pending(p))
 			continue;
 
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 2d864e64f7fe..44a4c029c8e7 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -390,8 +390,10 @@ void __init page_ext_init(void)
 		 * We know some arch can have a nodes layout such as
 		 * -------------pfn-------------->
 		 * N0 | N1 | N2 | N0 | N1 | N2|....
+		 *
+		 * Take into account DEFERRED_STRUCT_PAGE_INIT.
 		 */
-		if (pfn_to_nid(pfn) != nid)
+		if (early_pfn_to_nid(pfn) != nid)
 			continue;
 		if (init_section_page_ext(pfn, nid))
 			goto oom;
diff --git a/mm/rmap.c b/mm/rmap.c
index 8a839935b18c..0ea5d9071b32 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1098,6 +1098,8 @@ void page_move_anon_rmap(struct page *page,
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_VMA(!anon_vma, vma);
+	if (IS_ENABLED(CONFIG_DEBUG_VM) && PageTransHuge(page))
+		address &= HPAGE_PMD_MASK;
 	VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
 
 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
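For a THP, page->index refers to the head page, while the address passed to page_move_anon_rmap() can point anywhere in the huge-page range, so the unmasked linear_page_index() comparison could trip the VM_BUG_ON_PAGE spuriously; masking with HPAGE_PMD_MASK realigns the address to the head. Illustrative arithmetic, assuming x86-64-style 4K base pages and 2M huge pages (addresses are made up):

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;			/* 4K base pages */
	unsigned long hpage_pmd_size = 2UL << 20;	/* 2M THP */
	unsigned long hpage_pmd_mask = ~(hpage_pmd_size - 1);

	unsigned long vm_start = 0x40000000;	/* vma->vm_start, vm_pgoff = 0 */
	unsigned long head_index = 0;		/* page->index of the THP head */

	/* Fault address lands in the middle of the huge page. */
	unsigned long address = vm_start + 0x123000;

	unsigned long idx_unmasked = (address - vm_start) >> page_shift;
	unsigned long idx_masked = ((address & hpage_pmd_mask) - vm_start) >> page_shift;

	printf("unmasked linear index: %lu (false VM_BUG_ON trigger)\n", idx_unmasked);
	printf("masked linear index:   %lu == page->index %lu\n",
	       idx_masked, head_index);
	return 0;
}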