author		Linus Torvalds <torvalds@linux-foundation.org>	2016-05-27 17:56:59 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-05-27 17:56:59 -0400
commit		af7d93729c7c2beadea8ec5a6e66c53bef0e6290 (patch)
tree		d807ab034c13fe7e758c8ca11fb8ee38e9ceb38c
parent		564884fbdecaea56fb65f2f32963059d3049b967 (diff)
parent		11e685672a0861ce136cc4e7f6fdd11e5390b1fa (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates and fixes from Andrew Morton:
- late-breaking ocfs2 updates
- random bunch of fixes
* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
mm: disable DEFERRED_STRUCT_PAGE_INIT on !NO_BOOTMEM
mm/memcontrol.c: move comments for get_mctgt_type() to proper position
mm/memcontrol.c: fix the margin computation in mem_cgroup_margin()
mm/cma: silence warnings due to max() usage
mm: thp: avoid false positive VM_BUG_ON_PAGE in page_move_anon_rmap()
oom_reaper: close race with exiting task
mm: use early_pfn_to_nid in register_page_bootmem_info_node
mm: use early_pfn_to_nid in page_ext_init
MAINTAINERS: Kdump maintainers update
MAINTAINERS: add kexec_core.c and kexec_file.c
mm: oom: do not reap task if there are live threads in threadgroup
direct-io: fix direct write stale data exposure from concurrent buffered read
ocfs2: bump up o2cb network protocol version
ocfs2: o2hb: fix hb hung time
ocfs2: o2hb: don't negotiate if last hb fail
ocfs2: o2hb: add some user/debug log
ocfs2: o2hb: add NEGOTIATE_APPROVE message
ocfs2: o2hb: add NEGO_TIMEOUT message
ocfs2: o2hb: add negotiate timer
-rw-r--r--   MAINTAINERS                        7
-rw-r--r--   fs/direct-io.c                    14
-rw-r--r--   fs/ocfs2/cluster/heartbeat.c     180
-rw-r--r--   fs/ocfs2/cluster/tcp_internal.h    5
-rw-r--r--   init/main.c                        3
-rw-r--r--   mm/Kconfig                         2
-rw-r--r--   mm/cma.c                           7
-rw-r--r--   mm/memcontrol.c                   39
-rw-r--r--   mm/memory_hotplug.c                2
-rw-r--r--   mm/oom_kill.c                     32
-rw-r--r--   mm/page_ext.c                      4
-rw-r--r--   mm/rmap.c                          2
12 files changed, 246 insertions(+), 51 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index f2d7337ebdb3..f466673f86ff 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6421,8 +6421,9 @@ F: Documentation/kbuild/kconfig-language.txt
 F:	scripts/kconfig/
 
 KDUMP
-M:	Vivek Goyal <vgoyal@redhat.com>
-M:	Haren Myneni <hbabu@us.ibm.com>
+M:	Dave Young <dyoung@redhat.com>
+M:	Baoquan He <bhe@redhat.com>
+R:	Vivek Goyal <vgoyal@redhat.com>
 L:	kexec@lists.infradead.org
 W:	http://lse.sourceforge.net/kdump/
 S:	Maintained
@@ -6568,7 +6569,7 @@ L: kexec@lists.infradead.org
 S:	Maintained
 F:	include/linux/kexec.h
 F:	include/uapi/linux/kexec.h
-F:	kernel/kexec.c
+F:	kernel/kexec*
 
 KEYS/KEYRINGS:
 M:	David Howells <dhowells@redhat.com>
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3bf3f20f8ecc..f3b4408be590 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -628,11 +628,11 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 	map_bh->b_size = fs_count << i_blkbits;
 
 	/*
-	 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
-	 * forbid block creations: only overwrites are permitted.
-	 * We will return early to the caller once we see an
-	 * unmapped buffer head returned, and the caller will fall
-	 * back to buffered I/O.
+	 * For writes that could fill holes inside i_size on a
+	 * DIO_SKIP_HOLES filesystem we forbid block creations: only
+	 * overwrites are permitted. We will return early to the caller
+	 * once we see an unmapped buffer head returned, and the caller
+	 * will fall back to buffered I/O.
 	 *
 	 * Otherwise the decision is left to the get_blocks method,
 	 * which may decide to handle it or also return an unmapped
@@ -640,8 +640,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 	 */
 	create = dio->rw & WRITE;
 	if (dio->flags & DIO_SKIP_HOLES) {
-		if (sdio->block_in_file < (i_size_read(dio->inode) >>
-							sdio->blkbits))
+		if (fs_startblk <= ((i_size_read(dio->inode) - 1) >>
+							i_blkbits))
 			create = 0;
 	}
 
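To see why the old check could expose stale data, note that it compared positions in dio-block units (sdio->blkbits) while block allocation happens in filesystem-block units (i_blkbits). Below is a minimal userspace sketch of the arithmetic, with illustrative block sizes and offsets, not kernel code: a write into the hole just past a 1024-byte i_size passes the old dio-granularity check, so an fs block overlapping data inside i_size could be allocated; the new fs-block-granularity check forbids allocation and forces the fallback to buffered I/O.

#include <stdio.h>

int main(void)
{
	unsigned blkbits = 9;		/* 512-byte dio blocks */
	unsigned i_blkbits = 12;	/* 4K filesystem blocks */
	long long i_size = 1024;	/* file data ends inside fs block 0 */
	long long pos = 1024;		/* direct write into the hole at EOF */

	long long block_in_file = pos >> blkbits;	/* 2 */
	long long fs_startblk = pos >> i_blkbits;	/* 0 */

	/* old check (dio-block granularity): 2 < 2 is false, so block
	 * allocation stayed enabled even though the fs block to be
	 * allocated overlaps data inside i_size */
	int old_forbid = block_in_file < (i_size >> blkbits);

	/* new check (fs-block granularity): 0 <= 0 is true, so the
	 * write falls back to buffered I/O instead */
	int new_forbid = fs_startblk <= ((i_size - 1) >> i_blkbits);

	printf("old forbids allocation: %d, new forbids allocation: %d\n",
	       old_forbid, new_forbid);
	return 0;
}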
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a8d15beee5cb..6aaf3e351391 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -272,10 +272,21 @@ struct o2hb_region {
 	struct delayed_work	hr_write_timeout_work;
 	unsigned long		hr_last_timeout_start;
 
+	/* negotiate timer, used to negotiate extending hb timeout. */
+	struct delayed_work	hr_nego_timeout_work;
+	unsigned long		hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
 	/* Used during o2hb_check_slot to hold a copy of the block
 	 * being checked because we temporarily have to zero out the
 	 * crc field. */
 	struct o2hb_disk_heartbeat_block *hr_tmp_block;
+
+	/* Message key for negotiate timeout message. */
+	unsigned int		hr_key;
+	struct list_head	hr_handler_list;
+
+	/* last hb status, 0 for success, other value for error. */
+	int			hr_last_hb_status;
 };
 
 struct o2hb_bio_wait_ctxt {
@@ -284,6 +295,17 @@ struct o2hb_bio_wait_ctxt {
 	int               wc_error;
 };
 
+#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2)
+
+enum {
+	O2HB_NEGO_TIMEOUT_MSG = 1,
+	O2HB_NEGO_APPROVE_MSG = 2,
+};
+
+struct o2hb_nego_msg {
+	u8 node_num;
+};
+
 static void o2hb_write_timeout(struct work_struct *work)
 {
 	int failed, quorum;
@@ -319,7 +341,7 @@ static void o2hb_write_timeout(struct work_struct *work)
 	o2quo_disk_timeout();
 }
 
-static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+static void o2hb_arm_timeout(struct o2hb_region *reg)
 {
 	/* Arm writeout only after thread reaches steady state */
 	if (atomic_read(&reg->hr_steady_iterations) != 0)
@@ -334,14 +356,132 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
 		spin_unlock(&o2hb_live_lock);
 	}
 	cancel_delayed_work(&reg->hr_write_timeout_work);
-	reg->hr_last_timeout_start = jiffies;
 	schedule_delayed_work(&reg->hr_write_timeout_work,
 			      msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+
+	cancel_delayed_work(&reg->hr_nego_timeout_work);
+	/* negotiate timeout must be less than write timeout. */
+	schedule_delayed_work(&reg->hr_nego_timeout_work,
+			      msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS));
+	memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap));
 }
 
-static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+static void o2hb_disarm_timeout(struct o2hb_region *reg)
 {
 	cancel_delayed_work_sync(&reg->hr_write_timeout_work);
+	cancel_delayed_work_sync(&reg->hr_nego_timeout_work);
+}
+
+static int o2hb_send_nego_msg(int key, int type, u8 target)
+{
+	struct o2hb_nego_msg msg;
+	int status, ret;
+
+	msg.node_num = o2nm_this_node();
+again:
+	ret = o2net_send_message(type, key, &msg, sizeof(msg),
+			target, &status);
+
+	if (ret == -EAGAIN || ret == -ENOMEM) {
+		msleep(100);
+		goto again;
+	}
+
+	return ret;
+}
+
+static void o2hb_nego_timeout(struct work_struct *work)
+{
+	unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int master_node, i, ret;
+	struct o2hb_region *reg;
+
+	reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work);
+	/* don't negotiate timeout if last hb failed since it is very
+	 * possible io failed. Should let write timeout fence self.
+	 */
+	if (reg->hr_last_hb_status)
+		return;
+
+	o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+	/* lowest node as master node to make negotiate decision. */
+	master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0);
+
+	if (master_node == o2nm_this_node()) {
+		if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
+			printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n",
+				o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
+				config_item_name(&reg->hr_item), reg->hr_dev_name);
+			set_bit(master_node, reg->hr_nego_node_bitmap);
+		}
+		if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
+				sizeof(reg->hr_nego_node_bitmap))) {
+			/* check negotiate bitmap every second to do timeout
+			 * approve decision.
+			 */
+			schedule_delayed_work(&reg->hr_nego_timeout_work,
+				msecs_to_jiffies(1000));
+
+			return;
+		}
+
+		printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n",
+			config_item_name(&reg->hr_item), reg->hr_dev_name);
+		/* approve negotiate timeout request. */
+		o2hb_arm_timeout(reg);
+
+		i = -1;
+		while ((i = find_next_bit(live_node_bitmap,
+				O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+			if (i == master_node)
+				continue;
+
+			mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i);
+			ret = o2hb_send_nego_msg(reg->hr_key,
+					O2HB_NEGO_APPROVE_MSG, i);
+			if (ret)
+				mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n",
+					i, ret);
+		}
+	} else {
+		/* negotiate timeout with master node. */
+		printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n",
+			o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
+			reg->hr_dev_name, master_node);
+		ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
+				master_node);
+		if (ret)
+			mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n",
+				master_node, ret);
+	}
+}
+
+static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
+				void **ret_data)
+{
+	struct o2hb_region *reg = data;
+	struct o2hb_nego_msg *nego_msg;
+
+	nego_msg = (struct o2hb_nego_msg *)msg->buf;
+	printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n",
+		nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_dev_name);
+	if (nego_msg->node_num < O2NM_MAX_NODES)
+		set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
+	else
+		mlog(ML_ERROR, "got nego timeout message from bad node.\n");
+
+	return 0;
+}
+
+static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
+				void **ret_data)
+{
+	struct o2hb_region *reg = data;
+
+	printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n",
+		config_item_name(&reg->hr_item), reg->hr_dev_name);
+	o2hb_arm_timeout(reg);
+	return 0;
 }
 
 static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -1032,7 +1172,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 	/* Skip disarming the timeout if own slot has stale/bad data */
 	if (own_slot_ok) {
 		o2hb_set_quorum_device(reg);
-		o2hb_arm_write_timeout(reg);
+		o2hb_arm_timeout(reg);
+		reg->hr_last_timeout_start = jiffies;
 	}
 
 bail:
@@ -1096,6 +1237,7 @@ static int o2hb_thread(void *data)
 		before_hb = ktime_get_real();
 
 		ret = o2hb_do_disk_heartbeat(reg);
+		reg->hr_last_hb_status = ret;
 
 		after_hb = ktime_get_real();
 
@@ -1114,7 +1256,7 @@ static int o2hb_thread(void *data)
 		}
 	}
 
-	o2hb_disarm_write_timeout(reg);
+	o2hb_disarm_timeout(reg);
 
 	/* unclean stop is only used in very bad situation */
 	for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
@@ -1451,6 +1593,7 @@ static void o2hb_region_release(struct config_item *item)
 	list_del(&reg->hr_all_item);
 	spin_unlock(&o2hb_live_lock);
 
+	o2net_unregister_handler_list(&reg->hr_handler_list);
 	kfree(reg);
 }
 
@@ -1762,6 +1905,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 	}
 
 	INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
+	INIT_DELAYED_WORK(&reg->hr_nego_timeout_work, o2hb_nego_timeout);
 
 	/*
 	 * A node is considered live after it has beat LIVE_THRESHOLD
@@ -1995,13 +2139,37 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
 
 	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
 
+	/* this is the same way to generate msg key as dlm, for local heartbeat,
+	 * name is also the same, so make initial crc value different to avoid
+	 * message key conflict.
+	 */
+	reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS,
+		name, strlen(name));
+	INIT_LIST_HEAD(&reg->hr_handler_list);
+	ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key,
+			sizeof(struct o2hb_nego_msg),
+			o2hb_nego_timeout_handler,
+			reg, NULL, &reg->hr_handler_list);
+	if (ret)
+		goto free;
+
+	ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key,
+			sizeof(struct o2hb_nego_msg),
+			o2hb_nego_approve_handler,
+			reg, NULL, &reg->hr_handler_list);
+	if (ret)
+		goto unregister_handler;
+
 	ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
 	if (ret) {
 		config_item_put(&reg->hr_item);
-		goto free;
+		goto unregister_handler;
 	}
 
 	return &reg->hr_item;
+
+unregister_handler:
+	o2net_unregister_handler_list(&reg->hr_handler_list);
 free:
 	kfree(reg);
 	return ERR_PTR(ret);
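Stripped of the kernel plumbing, the protocol added above is: every node arms a negotiate timer at half the write timeout; a node whose hb write is hung (but whose last completed hb succeeded) asks the lowest live node to extend the timeout, and that master re-arms the timers cluster-wide only once every live node has reported the same hang, since at that point the shared storage itself is the likely culprit. A simplified standalone model of the master's decision, with made-up node counts (not the kernel implementation):

#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 8

struct nego_state {
	bool live[MAX_NODES];	/* cluster membership */
	bool hung[MAX_NODES];	/* nodes whose hb write is hung */
};

/* the lowest live node decides, mirroring find_next_bit(..., 0) */
static int master_node(const struct nego_state *s)
{
	for (int i = 0; i < MAX_NODES; i++)
		if (s->live[i])
			return i;
	return -1;
}

/* the master approves the extension only once every live node has
 * reported the same hang, i.e. the shared storage itself is suspect */
static bool master_approves(const struct nego_state *s)
{
	for (int i = 0; i < MAX_NODES; i++)
		if (s->live[i] && !s->hung[i])
			return false;
	return true;
}

int main(void)
{
	struct nego_state s = { .live = { true, true, true } };

	s.hung[0] = s.hung[1] = true;	/* node 2 still heartbeats fine */
	printf("master=%d approve=%d\n", master_node(&s), master_approves(&s));

	s.hung[2] = true;		/* now every live node is hung */
	printf("master=%d approve=%d\n", master_node(&s), master_approves(&s));
	return 0;
}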
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index b95e7df5b76a..94b18369b1cc 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -44,6 +44,9 @@
  * version here in tcp_internal.h should not need to be bumped for
  * filesystem locking changes.
  *
+ * New in version 12
+ *	- Negotiate hb timeout when storage is down.
+ *
  * New in version 11
  *	- Negotiation of filesystem locking in the dlm join.
  *
@@ -75,7 +78,7 @@
  *	- full 64 bit i_size in the metadata lock lvbs
  *	- introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 11ULL
+#define O2NET_PROTOCOL_VERSION 12ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;
diff --git a/init/main.c b/init/main.c
index bc0f9e0bcf22..4c17fda5c2ff 100644
--- a/init/main.c
+++ b/init/main.c
@@ -607,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
+	page_ext_init();
 	debug_objects_mem_init();
 	kmemleak_init();
 	setup_per_cpu_pageset();
@@ -1003,8 +1004,6 @@ static noinline void __init kernel_init_freeable(void)
 	sched_init_smp();
 
 	page_alloc_init_late();
-	/* Initialize page ext after all struct pages are initializaed */
-	page_ext_init();
 
 	do_basic_setup();
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 22fa8189e4fc..3e2daef3c946 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -648,7 +648,7 @@ config DEFERRED_STRUCT_PAGE_INIT
 	bool "Defer initialisation of struct pages to kthreads"
 	default n
 	depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
-	depends on MEMORY_HOTPLUG
+	depends on NO_BOOTMEM && MEMORY_HOTPLUG
 	depends on !FLATMEM
 	help
 	  Ordinarily all struct pages are initialised during early boot in a
diff --git a/mm/cma.c b/mm/cma.c
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -183,7 +183,8 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
 		return -EINVAL;
 
 	/* ensure minimal alignment required by mm core */
-	alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
+	alignment = PAGE_SIZE <<
+			max_t(unsigned long, MAX_ORDER - 1, pageblock_order);
 
 	/* alignment should be aligned with order_per_bit */
 	if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit))
@@ -266,8 +267,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
 	 * migratetype page by page allocator's buddy algorithm. In the case,
 	 * you couldn't get a contiguous memory, which is not what we want.
 	 */
-	alignment = max(alignment,
-		(phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
+	alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
+		  max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
 	base = ALIGN(base, alignment);
 	size = ALIGN(size, alignment);
 	limit &= ~(alignment - 1);
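The warning silenced here comes from the kernel's max(), which rejects arguments of distinct types: MAX_ORDER - 1 is an int, while pageblock_order can be an unsigned long variable (with CONFIG_HUGETLB_PAGE_SIZE_VARIABLE). max_t() casts both sides to the named type first. A userspace sketch with a simplified max_t() macro and illustrative values, just to show the semantics:

#include <stdio.h>

/* simplified stand-in for the kernel macro: cast both sides first */
#define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

int main(void)
{
	int max_order_m1 = 10;              /* MAX_ORDER - 1 on many configs */
	unsigned long pageblock_order = 9;  /* a variable on some configs */

	/* both operands are compared as unsigned long, no type clash */
	unsigned long shift = max_t(unsigned long, max_order_m1, pageblock_order);

	printf("alignment = PAGE_SIZE << %lu\n", shift);
	return 0;
}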
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f6477a9dbe7a..925b431f3f03 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1108,6 +1108,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 		limit = READ_ONCE(memcg->memsw.limit);
 		if (count <= limit)
 			margin = min(margin, limit - count);
+		else
+			margin = 0;
 	}
 
 	return margin;
@@ -4307,24 +4309,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
 	return 0;
 }
 
-/**
- * get_mctgt_type - get target type of moving charge
- * @vma: the vma the pte to be checked belongs
- * @addr: the address corresponding to the pte to be checked
- * @ptent: the pte to be checked
- * @target: the pointer the target page or swap ent will be stored(can be NULL)
- *
- * Returns
- *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
- *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
- *     move charge. if @target is not NULL, the page is stored in target->page
- *     with extra refcnt got(Callers should handle it).
- *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
- *     target for charge migration. if @target is not NULL, the entry is stored
- *     in target->ent.
- *
- * Called with pte lock held.
- */
 union mc_target {
 	struct page	*page;
 	swp_entry_t	ent;
@@ -4513,6 +4497,25 @@ out:
 	return ret;
 }
 
+/**
+ * get_mctgt_type - get target type of moving charge
+ * @vma: the vma the pte to be checked belongs
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: the pointer the target page or swap ent will be stored(can be NULL)
+ *
+ * Returns
+ *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
+ *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
+ *     move charge. if @target is not NULL, the page is stored in target->page
+ *     with extra refcnt got(Callers should handle it).
+ *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ *     target for charge migration. if @target is not NULL, the entry is stored
+ *     in target->ent.
+ *
+ * Called with pte lock held.
+ */
+
 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		unsigned long addr, pte_t ptent, union mc_target *target)
 {
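The mem_cgroup_margin() change is easiest to see with numbers: if memsw usage has climbed above its limit (possible while limits are being lowered), the old code fell through and reported the memory-only margin, overstating headroom. A standalone sketch of the fixed computation with illustrative counters (the real function also gates the memsw half on swap accounting being enabled):

#include <stdio.h>

static unsigned long margin_fixed(unsigned long mem_count, unsigned long mem_limit,
				  unsigned long memsw_count, unsigned long memsw_limit)
{
	unsigned long margin = 0;

	if (mem_count <= mem_limit)
		margin = mem_limit - mem_count;

	if (memsw_count <= memsw_limit)
		margin = margin < (memsw_limit - memsw_count) ?
				margin : (memsw_limit - memsw_count);
	else
		margin = 0;	/* the added branch: no headroom at all */

	return margin;
}

int main(void)
{
	/* memory: 100 used of 200; memsw: 300 used of 250 (over limit) */
	printf("margin = %lu\n", margin_fixed(100, 200, 300, 250)); /* 0, not 100 */
	return 0;
}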
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index caf2a14c37ad..b8ee0806415f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -300,7 +300,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 		 * multiple nodes we check that this pfn does not already
 		 * reside in some other nodes.
 		 */
-		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
 			register_page_bootmem_info_section(pfn);
 	}
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5bb2f7698ad7..dfb1ab61fb23 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -443,13 +443,29 @@ static bool __oom_reap_task(struct task_struct *tsk)
 {
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
-	struct mm_struct *mm;
+	struct mm_struct *mm = NULL;
 	struct task_struct *p;
 	struct zap_details details = {.check_swap_entries = true,
 				      .ignore_dirty = true};
 	bool ret = true;
 
 	/*
+	 * We have to make sure to not race with the victim exit path
+	 * and cause premature new oom victim selection:
+	 * __oom_reap_task		exit_mm
+	 *   atomic_inc_not_zero
+	 *				  mmput
+	 *				    atomic_dec_and_test
+	 *				  exit_oom_victim
+	 *				[...]
+	 *				out_of_memory
+	 *				  select_bad_process
+	 *				    # no TIF_MEMDIE task selects new victim
+	 *  unmap_page_range # frees some memory
+	 */
+	mutex_lock(&oom_lock);
+
+	/*
 	 * Make sure we find the associated mm_struct even when the particular
 	 * thread has already terminated and cleared its mm.
 	 * We might have race with exit path so consider our work done if there
@@ -457,19 +473,19 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 */
 	p = find_lock_task_mm(tsk);
 	if (!p)
-		return true;
+		goto unlock_oom;
 
 	mm = p->mm;
 	if (!atomic_inc_not_zero(&mm->mm_users)) {
 		task_unlock(p);
-		return true;
+		goto unlock_oom;
 	}
 
 	task_unlock(p);
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		ret = false;
-		goto out;
+		goto unlock_oom;
 	}
 
 	tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -511,13 +527,15 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 * to release its memory.
 	 */
 	set_bit(MMF_OOM_REAPED, &mm->flags);
-out:
+unlock_oom:
+	mutex_unlock(&oom_lock);
 	/*
 	 * Drop our reference but make sure the mmput slow path is called from a
 	 * different context because we shouldn't risk we get stuck there and
 	 * put the oom_reaper out of the way.
	 */
-	mmput_async(mm);
+	if (mm)
+		mmput_async(mm);
 	return ret;
 }
 
@@ -611,8 +629,6 @@ void try_oom_reaper(struct task_struct *tsk)
 
 		if (!process_shares_mm(p, mm))
 			continue;
-		if (same_thread_group(p, tsk))
-			continue;
 		if (fatal_signal_pending(p))
 			continue;
 
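Besides the oom_lock serialization added above, the reaper's safety hinges on the atomic_inc_not_zero(&mm->mm_users) step: it takes a reference only if the count has not already dropped to zero, i.e. only if exit_mm() is not already tearing the mm down. A userspace sketch of that refcount pattern using C11 atomics (not the kernel implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* take a reference only while at least one other reference exists;
 * once the count hits zero the object is being freed and must not be
 * resurrected */
static bool inc_not_zero(atomic_int *v)
{
	int old = atomic_load(v);

	while (old != 0)
		if (atomic_compare_exchange_weak(v, &old, old + 1))
			return true;	/* got a reference */
	return false;			/* owner already dropped the last ref */
}

int main(void)
{
	atomic_int users = 1;

	printf("grab while alive: %d\n", inc_not_zero(&users)); /* 1 */
	atomic_store(&users, 0);	/* the mmput() path ran */
	printf("grab after zero: %d\n", inc_not_zero(&users));  /* 0 */
	return 0;
}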
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 2d864e64f7fe..44a4c029c8e7 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -390,8 +390,10 @@ void __init page_ext_init(void)
 			 * We know some arch can have a nodes layout such as
 			 * -------------pfn-------------->
 			 * N0 | N1 | N2 | N0 | N1 | N2|....
+			 *
+			 * Take into account DEFERRED_STRUCT_PAGE_INIT.
 			 */
-			if (pfn_to_nid(pfn) != nid)
+			if (early_pfn_to_nid(pfn) != nid)
 				continue;
 			if (init_section_page_ext(pfn, nid))
 				goto oom;
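Both this change and the memory_hotplug one above swap pfn_to_nid() for early_pfn_to_nid() for the same reason: pfn_to_nid() derives the node id from the memmap, which with DEFERRED_STRUCT_PAGE_INIT is not yet valid for most pfns at the point where page_ext_init() now runs, while early_pfn_to_nid() consults the boot-time memory layout instead. A loose standalone model of that distinction, with hypothetical pfn ranges:

#include <stdio.h>

/* boot-time pfn -> nid table, the analogue of what early_pfn_to_nid()
 * consults; valid before any struct page has been initialised */
struct early_range { unsigned long start_pfn, end_pfn; int nid; };

static const struct early_range ranges[] = {
	{ 0,    1024, 0 },	/* hypothetical node 0 memory */
	{ 1024, 2048, 1 },	/* hypothetical node 1 memory */
};

static int early_pfn_to_nid(unsigned long pfn)
{
	for (unsigned long i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
		if (pfn >= ranges[i].start_pfn && pfn < ranges[i].end_pfn)
			return ranges[i].nid;
	return -1;
}

int main(void)
{
	/* pfn_to_nid() would read the nid from the struct page, which the
	 * deferred-init kthreads may not have written yet */
	printf("early lookup for pfn 1500: node %d\n", early_pfn_to_nid(1500));
	return 0;
}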
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1098,6 +1098,8 @@ void page_move_anon_rmap(struct page *page,
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_VMA(!anon_vma, vma);
+	if (IS_ENABLED(CONFIG_DEBUG_VM) && PageTransHuge(page))
+		address &= HPAGE_PMD_MASK;
 	VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
 
 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
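The rmap fix addresses a debug-only false positive: for a THP, page->index corresponds to the PMD-aligned start of the mapping, but page_move_anon_rmap() can be called with an address anywhere inside the huge page, so the unmasked linear_page_index() comparison fires spuriously. A worked userspace example with illustrative addresses (simplified index computation, not kernel code):

#include <stdio.h>

#define PAGE_SHIFT	12
#define HPAGE_PMD_MASK	(~((1UL << 21) - 1))	/* 2M huge pages */

/* simplified: index of an address within a mapping, in 4K pages */
static unsigned long linear_page_index(unsigned long vm_start,
				       unsigned long vm_pgoff,
				       unsigned long address)
{
	return ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;
}

int main(void)
{
	unsigned long vm_start = 0x200000, vm_pgoff = 0;
	unsigned long page_index = 0;		/* head page of the THP */
	unsigned long fault = 0x200000 + 0x3000; /* inside the huge page */

	/* without masking: 3 != 0, the VM_BUG_ON would fire */
	printf("unmasked: %lu vs %lu\n",
	       linear_page_index(vm_start, vm_pgoff, fault), page_index);

	/* with masking: 0 == 0, check passes as intended */
	printf("masked:   %lu vs %lu\n",
	       linear_page_index(vm_start, vm_pgoff, fault & HPAGE_PMD_MASK),
	       page_index);
	return 0;
}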