环境 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 1. CentOS 7.4 环境 [root@localhost ~]# cat /etc/redhat-release CentOS Linux release 7.4.1708 (Core) [root@localhost ~]# 2. 内核版本 [root@localhost ~]# uname -r 3.10.0-693.el7.x86_64 [root@localhost ~]# 3. root 权限 [root@localhost ~]# id uid=0(root) gid=0(root) groups=0(root) [root@localhost ~]#
准备下载 1 2 3 4 5 6 7 [chunli@localhost ~]$ wget https://buildlogs-seed.centos.org/c7.1708.00/crash/20170804014819/7.1.9-2.el7.x86_64/crash-7.1.9-2.el7.x86_64.rpm [chunli@localhost ~]$ wget https://buildlogs.centos.org/c7.1708.00/kernel/20170822030048/3.10.0-693.el7.x86_64/kernel-debuginfo-3.10.0-693.el7.x86_64.rpm [chunli@localhost ~]$ wget https://buildlogs.centos.org/c7.1708.00/kernel/20170822030048/3.10.0-693.el7.x86_64/kernel-debuginfo-common-x86_64-3.10.0-693.el7.x86_64.rpm [chunli@localhost ~]$ wget https://buildlogs.centos.org/c7.1708.00/kexec-tools/20170807142844/2.0.14-17.el7.x86_64/kexec-tools-2.0.14-17.el7.x86_64.rpm
RPM安装 1 2 3 4 5 6 7 8 9 10 11 12 [root@localhost ~]# cd crash/ [root@localhost crash]# ll total 428240 -rw-r--r-- 1 root root 2727680 Oct 20 18:26 crash-7.1.9-2.el7.x86_64.rpm -rw-r--r-- 1 root root 379021156 Oct 20 18:27 kernel-debuginfo-3.10.0-693.el7.x86_64.rpm -rw-r--r-- 1 root root 56091364 Oct 20 18:27 kernel-debuginfo-common-x86_64-3.10.0-693.el7.x86_64.rpm -rw-r--r-- 1 root root 340204 Oct 20 18:27 kexec-tools-2.0.14-17.el7.x86_64.rpm [root@localhost crash]# [root@localhost crash]# rpm -ivh kernel-debuginfo-common-x86_64-3.10.0-693.el7.x86_64.rpm [root@localhost crash]# rpm -ivh kernel-debuginfo-3.10.0-693.el7.x86_64.rpm [root@localhost crash]# rpm -ivh crash-7.1.9-2.el7.x86_64.rpm [root@localhost crash]# rpm -ivh kexec-tools-2.0.14-17.el7.x86_64.rpm
RPM验证安装 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 [root@localhost ~]# rpm -qa |grep kernel-debuginfo-common kernel-debuginfo-common-x86_64-3.10.0-693.el7.x86_64 [root@localhost ~]# [root@localhost ~]# rpm -qa |grep kernel-debuginfo-3.10 kernel-debuginfo-3.10.0-693.el7.x86_64 [root@localhost ~]# [root@localhost ~]# rpm -qa |grep crash-7.1.9 crash-7.1.9-2.el7.x86_64 [root@localhost ~]# [root@localhost ~]# rpm -qa |grep kexec-tools kexec-tools-2.0.14-17.el7.x86_64 [root@localhost ~]#
配置 Grub2 1 2 3 4 5 6 7 8 9 [root@localhost ~]# vim /etc/default/grub 在 GRUB_CMDLINE_LINUX 项中找到 crashkernel=auto, 将其改为 crashkernel=256M; 如果找不到 crashkernel 参数, 就在该项中添加 crashkernel=256M. 修改后保存退出.
Grub2 配置完成示例 1 2 3 4 5 6 7 8 9 [root@localhost ~]# cat /etc/default/grub GRUB_TIMEOUT=5 GRUB_DISTRIBUTOR="$(sed 's, release .*$,,g' /etc/system-release)" GRUB_DEFAULT=saved GRUB_DISABLE_SUBMENU=true GRUB_TERMINAL_OUTPUT="console" GRUB_CMDLINE_LINUX="crashkernel=256M rd.lvm.lv=centos/root rd.lvm.lv=centos/swap rhgb quiet console=ttyS0,115200 default_hugepagesz=1G hugepagesz=1G hugepages=40" GRUB_DISABLE_RECOVERY="true" [root@localhost ~]#
这里的配置项 console=ttyS0,115200 default_hugepagesz=1G hugepagesz=1G hugepages=40 是业务上使用的, 与 kdump 无关, 可以不用管. 只需确认已添加 crashkernel=256M 即可!
更新 grub 1 2 3 4 5 执行更新 (以下三条命令只会对实际存在的 grub.cfg 生效, 路径不存在时会被跳过) [root@localhost ~]# test -e /boot/efi/EFI/redhat/grub.cfg && grub2-mkconfig -o /boot/efi/EFI/redhat/grub.cfg [root@localhost ~]# test -e /boot/efi/EFI/centos/grub.cfg && grub2-mkconfig -o /boot/efi/EFI/centos/grub.cfg [root@localhost ~]# test -e /boot/grub2/grub.cfg && grub2-mkconfig -o /boot/grub2/grub.cfg [root@localhost ~]# reboot
验证grub 1 2 3 4 开机后 [root@localhost ~]# cat /proc/cmdline BOOT_IMAGE=/vmlinuz-3.10.0-693.el7.x86_64 root=/dev/mapper/centos-root ro crashkernel=256M rd.lvm.lv=centos/root rd.lvm.lv=centos/swap rhgb quiet console=ttyS0,115200 default_hugepagesz=1G hugepagesz=1G hugepages=40 [root@localhost ~]#
kdump 配置文件 路径(无需修改) 1 [root@localhost ~]# ls -l /etc/kdump.conf
启动 kdump 服务 1 2 3 [root@localhost ~]# systemctl status kdump.service # 检查服务状态 [root@localhost ~]# systemctl restart kdump.service # 重启(启动) kdump 服务 [root@localhost ~]# systemctl enable kdump.service # 设置开机自启动
触发测试 1 2 3 4 5 6 ## 清理目录 [root@localhost ~]# rm -rf /var/crash/* ## 触发 (注意: 执行后系统会立即崩溃, kdump 完成转储后重启, 请勿在生产环境执行) [root@localhost ~]# echo 1 > /proc/sys/kernel/sysrq [root@localhost ~]# echo c > /proc/sysrq-trigger
内核转储文件 1 2 3 4 5 6 7 8 [root@localhost ~]# tree /var/crash/ /var/crash/ └── 127.0.0.1-2021-10-20-19:08:20 ├── vmcore └── vmcore-dmesg.txt 1 directories, 2 files [root@localhost ~]#
查看系统dmesg 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 [root@localhost ~]# cat "/var/crash/127.0.0.1-2021-10-20-19:08:20/vmcore-dmesg.txt" ... [ 265.027232] SysRq : Trigger a crash [ 265.030797] BUG: unable to handle kernel NULL pointer dereference at (null) [ 265.038624] IP: [<ffffffff813fe3f6>] sysrq_handle_crash+0x16/0x20 [ 265.044725] PGD 1ae2a03067 PUD 1ae2a22067 PMD 0 [ 265.049350] Oops: 0002 [#1] SMP [ 265.052586] Modules linked in: sb_edac edac_core intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper iTCO_wdt ablk_helper cryptd iTCO_vendor_support mxm_wmi pcspkr sg shpchp i2c_i801 lpc_ich ipmi_si ipmi_devintf ipmi_msghandler wmi acpi_power_meter nfsd auth_rpcgss nfs_acl lockd grace sunrpc rte_kni(OE) igb_uio(OE) uio ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic ixgbe igb ahci libahci libata crct10dif_pclmul crct10dif_common crc32c_intel i2c_algo_bit i2c_core mdio ptp pps_core dca dm_mirror dm_region_hash dm_log dm_mod [ 265.108099] CPU: 2 PID: 13175 Comm: bash Tainted: G OE ------------ 3.10.0-693.el7.x86_64 #1 [ 265.117639] Hardware name: Joinus Tech ACB300/ATCA-ACB300, BIOS 2.0.0 06/20/2017 [ 265.125008] task: ffff88203a311fa0 ti: ffff88202ae18000 task.ti: ffff88202ae18000 [ 265.132463] RIP: 0010:[<ffffffff813fe3f6>] [<ffffffff813fe3f6>] sysrq_handle_crash+0x16/0x20 [ 265.140977] RSP: 0018:ffff88202ae1be88 EFLAGS: 00010246 [ 265.146266] RAX: 000000000000000f RBX: ffffffff81ab8100 RCX: 0000000000000000 [ 265.153376] RDX: 0000000000000000 RSI: ffff88103f88f8b8 RDI: 0000000000000063 [ 265.160482] RBP: ffff88202ae1be88 R08: ffffffff81d86dfc R09: ffffffff81dab7c3 [ 265.167589] R10: 0000000000000691 R11: 0000000000000690 R12: 0000000000000063 [ 265.174695] R13: 0000000000000000 R14: 0000000000000004 R15: 0000000000000000 [ 265.181803] FS: 00007f2bc11ab740(0000) 
GS:ffff88103f880000(0000) knlGS:0000000000000000 [ 265.189863] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 265.195587] CR2: 0000000000000000 CR3: 0000002035aba000 CR4: 00000000003407e0 [ 265.202695] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 265.209802] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 265.216908] Stack: [ 265.218910] ffff88202ae1beb8 ffffffff813fec17 0000000000000002 00007f2bc11b9000 [ 265.226318] ffff88202ae1bf48 0000000000000002 ffff88202ae1bed0 ffffffff813ff08f [ 265.233727] ffff8810e939c540 ffff88202ae1bef0 ffffffff8127018d 0000000000000002 [ 265.241133] Call Trace: [ 265.243573] [<ffffffff813fec17>] __handle_sysrq+0x107/0x170 [ 265.249218] [<ffffffff813ff08f>] write_sysrq_trigger+0x2f/0x40 [ 265.255122] [<ffffffff8127018d>] proc_reg_write+0x3d/0x80 [ 265.260598] [<ffffffff81200d2d>] vfs_write+0xbd/0x1e0 [ 265.265724] [<ffffffff81201b3f>] SyS_write+0x7f/0xe0 [ 265.270761] [<ffffffff816b4fc9>] system_call_fastpath+0x16/0x1b [ 265.276752] Code: eb 9b 45 01 f4 45 39 65 34 75 e5 4c 89 ef e8 e2 f7 ff ff eb db 0f 1f 44 00 00 55 48 89 e5 c7 05 e1 77 62 00 01 00 00 00 0f ae f8 <c6> 04 25 00 00 00 00 01 5d c3 0f 1f 44 00 00 55 31 c0 c7 05 5e [ 265.296230] RIP [<ffffffff813fe3f6>] sysrq_handle_crash+0x16/0x20 [ 265.302406] RSP <ffff88202ae1be88> [ 265.305879] CR2: 0000000000000000 [root@localhost ~]# 可以看到崩溃原因 [ 265.027232] SysRq : Trigger a crash
crash 调试内核 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 [root@localhost ~]# crash /usr/lib/debug/lib/modules/`uname -r`/vmlinux "/var/crash/127.0.0.1-2021-10-20-19:08:20/vmcore" ... GNU gdb (GDB) 7.6 Copyright (C) 2013 Free Software Foundation, Inc. License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html> This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. Type "show copying" and "show warranty" for details. This GDB was configured as "x86_64-unknown-linux-gnu"... KERNEL: /usr/lib/debug/lib/modules/3.10.0-693.el7.x86_64/vmlinux DUMPFILE: /var/crash/127.0.0.1-2021-10-20-19:08:20/vmcore [PARTIAL DUMP] CPUS: 48 DATE: Wed Oct 20 19:08:12 2021 UPTIME: 00:04:24 LOAD AVERAGE: 1.21, 5.08, 2.74 TASKS: 546 NODENAME: localhost.localdomain RELEASE: 3.10.0-693.el7.x86_64 VERSION: #1 SMP Tue Aug 22 21:09:27 UTC 2017 MACHINE: x86_64 (2200 Mhz) MEMORY: 127.9 GB PANIC: "SysRq : Trigger a crash" PID: 13175 COMMAND: "bash" TASK: ffff88203a311fa0 [THREAD_INFO: ffff88202ae18000] CPU: 2 STATE: TASK_RUNNING (SYSRQ) crash> 可以看到 崩溃信息: MEMORY: 127.9 GB PANIC: "SysRq : Trigger a crash" PID: 13175 COMMAND: "bash" 错误发生时的堆栈 crash> bt PID: 13175 TASK: ffff88203a311fa0 CPU: 2 COMMAND: "bash" #0 [ffff88202ae1bb28] machine_kexec at ffffffff8105c4cb #1 [ffff88202ae1bb88] __crash_kexec at ffffffff81104a32 #2 [ffff88202ae1bc58] crash_kexec at ffffffff81104b20 #3 [ffff88202ae1bc70] oops_end at ffffffff816ad278 #4 [ffff88202ae1bc98] no_context at ffffffff8169d29a #5 [ffff88202ae1bce8] __bad_area_nosemaphore at ffffffff8169d330 #6 
[ffff88202ae1bd30] bad_area_nosemaphore at ffffffff8169d49a #7 [ffff88202ae1bd40] __do_page_fault at ffffffff816b013e #8 [ffff88202ae1bda0] do_page_fault at ffffffff816b02e5 #9 [ffff88202ae1bdd0] page_fault at ffffffff816ac508 [exception RIP: sysrq_handle_crash+22] RIP: ffffffff813fe3f6 RSP: ffff88202ae1be88 RFLAGS: 00010246 RAX: 000000000000000f RBX: ffffffff81ab8100 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88103f88f8b8 RDI: 0000000000000063 RBP: ffff88202ae1be88 R8: ffffffff81d86dfc R9: ffffffff81dab7c3 R10: 0000000000000691 R11: 0000000000000690 R12: 0000000000000063 R13: 0000000000000000 R14: 0000000000000004 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #10 [ffff88202ae1be90] __handle_sysrq at ffffffff813fec17 #11 [ffff88202ae1bec0] write_sysrq_trigger at ffffffff813ff08f #12 [ffff88202ae1bed8] proc_reg_write at ffffffff8127018d #13 [ffff88202ae1bef8] vfs_write at ffffffff81200d2d #14 [ffff88202ae1bf38] sys_write at ffffffff81201b3f #15 [ffff88202ae1bf80] system_call_fastpath at ffffffff816b4fc9 RIP: 00007f2bc0899840 RSP: 00007ffcf7194ba8 RFLAGS: 00000246 RAX: 0000000000000001 RBX: ffffffff816b4fc9 RCX: ffffffffffffffff RDX: 0000000000000002 RSI: 00007f2bc11b9000 RDI: 0000000000000001 RBP: 00007f2bc11b9000 R8: 000000000000000a R9: 00007f2bc11ab740 R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001 R13: 0000000000000002 R14: 00007f2bc0b6d400 R15: 0000000000000002 ORIG_RAX: 0000000000000001 CS: 0033 SS: 002b crash> 显示与某个进程PID的堆栈 crash> bt 13175 PID: 13175 TASK: ffff88203a311fa0 CPU: 2 COMMAND: "bash" #0 [ffff88202ae1bb28] machine_kexec at ffffffff8105c4cb #1 [ffff88202ae1bb88] __crash_kexec at ffffffff81104a32 #2 [ffff88202ae1bc58] crash_kexec at ffffffff81104b20 #3 [ffff88202ae1bc70] oops_end at ffffffff816ad278 #4 [ffff88202ae1bc98] no_context at ffffffff8169d29a #5 [ffff88202ae1bce8] __bad_area_nosemaphore at ffffffff8169d330 #6 [ffff88202ae1bd30] bad_area_nosemaphore at ffffffff8169d49a #7 [ffff88202ae1bd40] 
__do_page_fault at ffffffff816b013e #8 [ffff88202ae1bda0] do_page_fault at ffffffff816b02e5 #9 [ffff88202ae1bdd0] page_fault at ffffffff816ac508 [exception RIP: sysrq_handle_crash+22] RIP: ffffffff813fe3f6 RSP: ffff88202ae1be88 RFLAGS: 00010246 RAX: 000000000000000f RBX: ffffffff81ab8100 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88103f88f8b8 RDI: 0000000000000063 RBP: ffff88202ae1be88 R8: ffffffff81d86dfc R9: ffffffff81dab7c3 R10: 0000000000000691 R11: 0000000000000690 R12: 0000000000000063 R13: 0000000000000000 R14: 0000000000000004 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #10 [ffff88202ae1be90] __handle_sysrq at ffffffff813fec17 #11 [ffff88202ae1bec0] write_sysrq_trigger at ffffffff813ff08f #12 [ffff88202ae1bed8] proc_reg_write at ffffffff8127018d #13 [ffff88202ae1bef8] vfs_write at ffffffff81200d2d #14 [ffff88202ae1bf38] sys_write at ffffffff81201b3f #15 [ffff88202ae1bf80] system_call_fastpath at ffffffff816b4fc9 RIP: 00007f2bc0899840 RSP: 00007ffcf7194ba8 RFLAGS: 00000246 RAX: 0000000000000001 RBX: ffffffff816b4fc9 RCX: ffffffffffffffff RDX: 0000000000000002 RSI: 00007f2bc11b9000 RDI: 0000000000000001 RBP: 00007f2bc11b9000 R8: 000000000000000a R9: 00007f2bc11ab740 R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001 R13: 0000000000000002 R14: 00007f2bc0b6d400 R15: 0000000000000002 ORIG_RAX: 0000000000000001 CS: 0033 SS: 002b crash> 显示所有CPU的堆栈信息 crash> bt -a 查看某个函数的反汇编 crash> dis machine_kexec ps 显示进程: -u 只显示用户态进程 (过滤掉内核线程), -c 显示所选进程的子进程 crash> ps crash> ps -u crash> ps -c quit 退出 crash
参考: https://www.jianshu.com/p/8e031b28d98b https://www.cnblogs.com/maojun1998/p/14392731.html https://www.cnblogs.com/xzkzzz/p/13740726.html https://www.golinuxhub.com/2018/08/how-to-configure-and-install-kdump-rhel7-crashkernel/