Linux 内核调试 异常转储 CentOS 7.4

环境

1. CentOS 7.4 环境
[root@localhost ~]# cat /etc/redhat-release
CentOS Linux release 7.4.1708 (Core)
[root@localhost ~]#


2. 内核版本
[root@localhost ~]# uname -r
3.10.0-693.el7.x86_64
[root@localhost ~]#


3. root 权限
[root@localhost ~]# id
uid=0(root) gid=0(root) groups=0(root)
[root@localhost ~]#

准备下载

[chunli@localhost ~]$ wget https://buildlogs-seed.centos.org/c7.1708.00/crash/20170804014819/7.1.9-2.el7.x86_64/crash-7.1.9-2.el7.x86_64.rpm

[chunli@localhost ~]$ wget https://buildlogs.centos.org/c7.1708.00/kernel/20170822030048/3.10.0-693.el7.x86_64/kernel-debuginfo-3.10.0-693.el7.x86_64.rpm

[chunli@localhost ~]$ wget https://buildlogs.centos.org/c7.1708.00/kernel/20170822030048/3.10.0-693.el7.x86_64/kernel-debuginfo-common-x86_64-3.10.0-693.el7.x86_64.rpm

[chunli@localhost ~]$ wget https://buildlogs.centos.org/c7.1708.00/kexec-tools/20170807142844/2.0.14-17.el7.x86_64/kexec-tools-2.0.14-17.el7.x86_64.rpm

RPM安装

[root@localhost ~]# cd crash/
[root@localhost crash]# ll
total 428240
-rw-r--r-- 1 root root 2727680 Oct 20 18:26 crash-7.1.9-2.el7.x86_64.rpm
-rw-r--r-- 1 root root 379021156 Oct 20 18:27 kernel-debuginfo-3.10.0-693.el7.x86_64.rpm
-rw-r--r-- 1 root root 56091364 Oct 20 18:27 kernel-debuginfo-common-x86_64-3.10.0-693.el7.x86_64.rpm
-rw-r--r-- 1 root root 340204 Oct 20 18:27 kexec-tools-2.0.14-17.el7.x86_64.rpm
[root@localhost crash]#
[root@localhost crash]# rpm -ivh kernel-debuginfo-common-x86_64-3.10.0-693.el7.x86_64.rpm
[root@localhost crash]# rpm -ivh kernel-debuginfo-3.10.0-693.el7.x86_64.rpm
[root@localhost crash]# rpm -ivh crash-7.1.9-2.el7.x86_64.rpm
[root@localhost crash]# rpm -ivh kexec-tools-2.0.14-17.el7.x86_64.rpm

RPM验证安装

[root@localhost ~]# rpm -qa |grep kernel-debuginfo-common
kernel-debuginfo-common-x86_64-3.10.0-693.el7.x86_64
[root@localhost ~]#

[root@localhost ~]# rpm -qa |grep kernel-debuginfo-3.10
kernel-debuginfo-3.10.0-693.el7.x86_64
[root@localhost ~]#

[root@localhost ~]# rpm -qa |grep crash-7.1.9
crash-7.1.9-2.el7.x86_64
[root@localhost ~]#

[root@localhost ~]# rpm -qa |grep kexec-tools
kexec-tools-2.0.14-17.el7.x86_64
[root@localhost ~]#

配置 Grub2

[root@localhost ~]# vim /etc/default/grub
在GRUB_CMDLINE_LINUX项中
找到
crashkernel=auto
改为
crashkernel=256M

如果找不到, 就加一个 crashkernel=256M
保存退出.

Grub2 配置完成示例

[root@localhost ~]# cat /etc/default/grub
GRUB_TIMEOUT=5
GRUB_DISTRIBUTOR="$(sed 's, release .*$,,g' /etc/system-release)"
GRUB_DEFAULT=saved
GRUB_DISABLE_SUBMENU=true
GRUB_TERMINAL_OUTPUT="console"
GRUB_CMDLINE_LINUX="crashkernel=256M rd.lvm.lv=centos/root rd.lvm.lv=centos/swap rhgb quiet console=ttyS0,115200 default_hugepagesz=1G hugepagesz=1G hugepages=40"
GRUB_DISABLE_RECOVERY="true"
[root@localhost ~]#

这里的配置项 console=ttyS0,115200 default_hugepagesz=1G hugepagesz=1G hugepages=40
是业务上使用的.可以不用管.
添加了 crashkernel=256M 即可!

更新 grub

执行更新
[root@localhost ~]# test -e /boot/efi/EFI/redhat/grub.cfg && grub2-mkconfig -o /boot/efi/EFI/redhat/grub.cfg
[root@localhost ~]# test -e /boot/efi/EFI/centos/grub.cfg && grub2-mkconfig -o /boot/efi/EFI/centos/grub.cfg
[root@localhost ~]# test -e /boot/grub2/grub.cfg && grub2-mkconfig -o /boot/grub2/grub.cfg
[root@localhost ~]# reboot

验证grub

开机后
[root@localhost ~]# cat /proc/cmdline
BOOT_IMAGE=/vmlinuz-3.10.0-693.el7.x86_64 root=/dev/mapper/centos-root ro crashkernel=256M rd.lvm.lv=centos/root rd.lvm.lv=centos/swap rhgb quiet console=ttyS0,115200 default_hugepagesz=1G hugepagesz=1G hugepages=40
[root@localhost ~]#

kdump 配置文件 路径(无需修改)

[root@localhost ~]# ls -l  /etc/kdump.conf

启动 kdump 服务

[root@localhost ~]# systemctl status kdump.service     # 检查服务状态
[root@localhost ~]# systemctl restart kdump.service    # 重启(启动) kdump 服务
[root@localhost ~]# systemctl enable kdump.service     # 设置开机自启动

触发测试

## 清理目录
[root@localhost ~]# rm -rf /var/crash/*

## 触发
[root@localhost ~]# echo 1 > /proc/sys/kernel/sysrq
[root@localhost ~]# echo c > /proc/sysrq-trigger

内核转储文件

[root@localhost ~]# tree /var/crash/
/var/crash/
└── 127.0.0.1-2021-10-20-19:08:20
    ├── vmcore
    └── vmcore-dmesg.txt

1 directories, 2 files
[root@localhost ~]#

查看系统dmesg

[root@localhost ~]# cat "/var/crash/127.0.0.1-2021-10-20-19:08:20/vmcore-dmesg.txt"
...

[ 265.027232] SysRq : Trigger a crash
[ 265.030797] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 265.038624] IP: [<ffffffff813fe3f6>] sysrq_handle_crash+0x16/0x20
[ 265.044725] PGD 1ae2a03067 PUD 1ae2a22067 PMD 0
[ 265.049350] Oops: 0002 [#1] SMP
[ 265.052586] Modules linked in: sb_edac edac_core intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper iTCO_wdt ablk_helper cryptd iTCO_vendor_support mxm_wmi pcspkr sg shpchp i2c_i801 lpc_ich ipmi_si ipmi_devintf ipmi_msghandler wmi acpi_power_meter nfsd auth_rpcgss nfs_acl lockd grace sunrpc rte_kni(OE) igb_uio(OE) uio ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic ixgbe igb ahci libahci libata crct10dif_pclmul crct10dif_common crc32c_intel i2c_algo_bit i2c_core mdio ptp pps_core dca dm_mirror dm_region_hash dm_log dm_mod
[ 265.108099] CPU: 2 PID: 13175 Comm: bash Tainted: G OE ------------ 3.10.0-693.el7.x86_64 #1
[ 265.117639] Hardware name: Joinus Tech ACB300/ATCA-ACB300, BIOS 2.0.0 06/20/2017
[ 265.125008] task: ffff88203a311fa0 ti: ffff88202ae18000 task.ti: ffff88202ae18000
[ 265.132463] RIP: 0010:[<ffffffff813fe3f6>] [<ffffffff813fe3f6>] sysrq_handle_crash+0x16/0x20
[ 265.140977] RSP: 0018:ffff88202ae1be88 EFLAGS: 00010246
[ 265.146266] RAX: 000000000000000f RBX: ffffffff81ab8100 RCX: 0000000000000000
[ 265.153376] RDX: 0000000000000000 RSI: ffff88103f88f8b8 RDI: 0000000000000063
[ 265.160482] RBP: ffff88202ae1be88 R08: ffffffff81d86dfc R09: ffffffff81dab7c3
[ 265.167589] R10: 0000000000000691 R11: 0000000000000690 R12: 0000000000000063
[ 265.174695] R13: 0000000000000000 R14: 0000000000000004 R15: 0000000000000000
[ 265.181803] FS: 00007f2bc11ab740(0000) GS:ffff88103f880000(0000) knlGS:0000000000000000
[ 265.189863] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 265.195587] CR2: 0000000000000000 CR3: 0000002035aba000 CR4: 00000000003407e0
[ 265.202695] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 265.209802] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 265.216908] Stack:
[ 265.218910] ffff88202ae1beb8 ffffffff813fec17 0000000000000002 00007f2bc11b9000
[ 265.226318] ffff88202ae1bf48 0000000000000002 ffff88202ae1bed0 ffffffff813ff08f
[ 265.233727] ffff8810e939c540 ffff88202ae1bef0 ffffffff8127018d 0000000000000002
[ 265.241133] Call Trace:
[ 265.243573] [<ffffffff813fec17>] __handle_sysrq+0x107/0x170
[ 265.249218] [<ffffffff813ff08f>] write_sysrq_trigger+0x2f/0x40
[ 265.255122] [<ffffffff8127018d>] proc_reg_write+0x3d/0x80
[ 265.260598] [<ffffffff81200d2d>] vfs_write+0xbd/0x1e0
[ 265.265724] [<ffffffff81201b3f>] SyS_write+0x7f/0xe0
[ 265.270761] [<ffffffff816b4fc9>] system_call_fastpath+0x16/0x1b
[ 265.276752] Code: eb 9b 45 01 f4 45 39 65 34 75 e5 4c 89 ef e8 e2 f7 ff ff eb db 0f 1f 44 00 00 55 48 89 e5 c7 05 e1 77 62 00 01 00 00 00 0f ae f8 <c6> 04 25 00 00 00 00 01 5d c3 0f 1f 44 00 00 55 31 c0 c7 05 5e
[ 265.296230] RIP [<ffffffff813fe3f6>] sysrq_handle_crash+0x16/0x20
[ 265.302406] RSP <ffff88202ae1be88>
[ 265.305879] CR2: 0000000000000000
[root@localhost ~]#


可以看到崩溃原因
[ 265.027232] SysRq : Trigger a crash

crash 调试内核

[root@localhost ~]# crash /usr/lib/debug/lib/modules/`uname -r`/vmlinux  "/var/crash/127.0.0.1-2021-10-20-19:08:20/vmcore"

...

GNU gdb (GDB) 7.6
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-unknown-linux-gnu"...

KERNEL: /usr/lib/debug/lib/modules/3.10.0-693.el7.x86_64/vmlinux
DUMPFILE: /var/crash/127.0.0.1-2021-10-20-19:08:20/vmcore [PARTIAL DUMP]
CPUS: 48
DATE: Wed Oct 20 19:08:12 2021
UPTIME: 00:04:24
LOAD AVERAGE: 1.21, 5.08, 2.74
TASKS: 546
NODENAME: localhost.localdomain
RELEASE: 3.10.0-693.el7.x86_64
VERSION: #1 SMP Tue Aug 22 21:09:27 UTC 2017
MACHINE: x86_64 (2200 Mhz)
MEMORY: 127.9 GB
PANIC: "SysRq : Trigger a crash"
PID: 13175
COMMAND: "bash"
TASK: ffff88203a311fa0 [THREAD_INFO: ffff88202ae18000]
CPU: 2
STATE: TASK_RUNNING (SYSRQ)

crash>

可以看到 崩溃信息:
MEMORY: 127.9 GB
PANIC: "SysRq : Trigger a crash"
PID: 13175
COMMAND: "bash"



错误发生时的堆栈
crash> bt
PID: 13175 TASK: ffff88203a311fa0 CPU: 2 COMMAND: "bash"
#0 [ffff88202ae1bb28] machine_kexec at ffffffff8105c4cb
#1 [ffff88202ae1bb88] __crash_kexec at ffffffff81104a32
#2 [ffff88202ae1bc58] crash_kexec at ffffffff81104b20
#3 [ffff88202ae1bc70] oops_end at ffffffff816ad278
#4 [ffff88202ae1bc98] no_context at ffffffff8169d29a
#5 [ffff88202ae1bce8] __bad_area_nosemaphore at ffffffff8169d330
#6 [ffff88202ae1bd30] bad_area_nosemaphore at ffffffff8169d49a
#7 [ffff88202ae1bd40] __do_page_fault at ffffffff816b013e
#8 [ffff88202ae1bda0] do_page_fault at ffffffff816b02e5
#9 [ffff88202ae1bdd0] page_fault at ffffffff816ac508
[exception RIP: sysrq_handle_crash+22]
RIP: ffffffff813fe3f6 RSP: ffff88202ae1be88 RFLAGS: 00010246
RAX: 000000000000000f RBX: ffffffff81ab8100 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff88103f88f8b8 RDI: 0000000000000063
RBP: ffff88202ae1be88 R8: ffffffff81d86dfc R9: ffffffff81dab7c3
R10: 0000000000000691 R11: 0000000000000690 R12: 0000000000000063
R13: 0000000000000000 R14: 0000000000000004 R15: 0000000000000000
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#10 [ffff88202ae1be90] __handle_sysrq at ffffffff813fec17
#11 [ffff88202ae1bec0] write_sysrq_trigger at ffffffff813ff08f
#12 [ffff88202ae1bed8] proc_reg_write at ffffffff8127018d
#13 [ffff88202ae1bef8] vfs_write at ffffffff81200d2d
#14 [ffff88202ae1bf38] sys_write at ffffffff81201b3f
#15 [ffff88202ae1bf80] system_call_fastpath at ffffffff816b4fc9
RIP: 00007f2bc0899840 RSP: 00007ffcf7194ba8 RFLAGS: 00000246
RAX: 0000000000000001 RBX: ffffffff816b4fc9 RCX: ffffffffffffffff
RDX: 0000000000000002 RSI: 00007f2bc11b9000 RDI: 0000000000000001
RBP: 00007f2bc11b9000 R8: 000000000000000a R9: 00007f2bc11ab740
R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001
R13: 0000000000000002 R14: 00007f2bc0b6d400 R15: 0000000000000002
ORIG_RAX: 0000000000000001 CS: 0033 SS: 002b
crash>

显示指定 PID 进程的堆栈
crash> bt 13175
PID: 13175 TASK: ffff88203a311fa0 CPU: 2 COMMAND: "bash"
#0 [ffff88202ae1bb28] machine_kexec at ffffffff8105c4cb
#1 [ffff88202ae1bb88] __crash_kexec at ffffffff81104a32
#2 [ffff88202ae1bc58] crash_kexec at ffffffff81104b20
#3 [ffff88202ae1bc70] oops_end at ffffffff816ad278
#4 [ffff88202ae1bc98] no_context at ffffffff8169d29a
#5 [ffff88202ae1bce8] __bad_area_nosemaphore at ffffffff8169d330
#6 [ffff88202ae1bd30] bad_area_nosemaphore at ffffffff8169d49a
#7 [ffff88202ae1bd40] __do_page_fault at ffffffff816b013e
#8 [ffff88202ae1bda0] do_page_fault at ffffffff816b02e5
#9 [ffff88202ae1bdd0] page_fault at ffffffff816ac508
[exception RIP: sysrq_handle_crash+22]
RIP: ffffffff813fe3f6 RSP: ffff88202ae1be88 RFLAGS: 00010246
RAX: 000000000000000f RBX: ffffffff81ab8100 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff88103f88f8b8 RDI: 0000000000000063
RBP: ffff88202ae1be88 R8: ffffffff81d86dfc R9: ffffffff81dab7c3
R10: 0000000000000691 R11: 0000000000000690 R12: 0000000000000063
R13: 0000000000000000 R14: 0000000000000004 R15: 0000000000000000
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#10 [ffff88202ae1be90] __handle_sysrq at ffffffff813fec17
#11 [ffff88202ae1bec0] write_sysrq_trigger at ffffffff813ff08f
#12 [ffff88202ae1bed8] proc_reg_write at ffffffff8127018d
#13 [ffff88202ae1bef8] vfs_write at ffffffff81200d2d
#14 [ffff88202ae1bf38] sys_write at ffffffff81201b3f
#15 [ffff88202ae1bf80] system_call_fastpath at ffffffff816b4fc9
RIP: 00007f2bc0899840 RSP: 00007ffcf7194ba8 RFLAGS: 00000246
RAX: 0000000000000001 RBX: ffffffff816b4fc9 RCX: ffffffffffffffff
RDX: 0000000000000002 RSI: 00007f2bc11b9000 RDI: 0000000000000001
RBP: 00007f2bc11b9000 R8: 000000000000000a R9: 00007f2bc11ab740
R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001
R13: 0000000000000002 R14: 00007f2bc0b6d400 R15: 0000000000000002
ORIG_RAX: 0000000000000001 CS: 0033 SS: 002b
crash>


显示所有CPU的堆栈信息
crash> bt -a

查看某个函数的反汇编
crash> dis machine_kexec



ps 显示进程列表
-u 只显示用户态进程(过滤掉内核线程)
-c 显示进程的子进程
crash> ps
crash> ps -u
crash> ps -c



quit 退出

参考:

https://www.jianshu.com/p/8e031b28d98b
https://www.cnblogs.com/maojun1998/p/14392731.html
https://www.cnblogs.com/xzkzzz/p/13740726.html
https://www.golinuxhub.com/2018/08/how-to-configure-and-install-kdump-rhel7-crashkernel/