环境:
5.10.0-216.0.0.115.oe2203sp4.x86_64 #1 SMP Thu Jun 27 15:13:44 CST 2024 x86_64 x86_64 x86_64 GNU/Linux
opencas:
open-cas-linux-modules_k5.10.0_216.0.0.115.oe2203sp4-22.06.2.0723.release-1.x86_64
open-cas-linux-22.06.2.0723.release-1.x86_64
ovs:
openvswitch-2.12.4-8.oe2203sp4.x86_64
现象:
执行 `casadm --stop-cache --cache-id 1` 卸载 cache 后,等待数分钟,物理机直接宕机(NMI watchdog 检测到 Hard LOCKUP 并触发 kernel panic,见下方日志)。
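很奇怪,Open CAS 和 Open vSwitch 这两个看似不相关的模块怎么会相互影响呢?(发生 Hard LOCKUP 时的调用栈位于 openvswitch 的 ovs_flow_alloc 内存分配路径,而不在 cas_cache 模块中。)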
vmcore.txt
[419106.482903] cache2.core4: Seqential cutoff init
[419106.483199] cache2: Disk lines = 488376661
[419124.922506] cache2: Done loading cache state
[419132.067483] NMI watchdog: Watchdog detected hard LOCKUP on cpu 19
[419132.067484] Modules linked in: ext4 mbcache jbd2 cls_u32 sch_netem sch_prio vhost_net tap tun tcp_diag inet_diag ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter ip_tables overlay 8021q garp mrp stp llc sch_ingress bonding vxlan ip6_udp_tunnel udp_tunnel openvswitch nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 vhost_vsock vmw_vsock_virtio_transport_common vhost vhost_iotlb vsock vfat fat dm_multipath dm_mod intel_rapl_msr intel_rapl_common intel_uncore_frequency intel_uncore_frequency_common isst_if_common skx_edac nfit libnvdimm x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass rapl ipmi_ssif rpcrdma sunrpc rdma_ucm ib_srpt intel_cstate ib_isert iscsi_target_mod target_core_mod ib_iser ib_umad rdma_cm ib_ipoib iw_cm libiscsi scsi_transport_iscsi ib_cm iTCO_wdt iTCO_vendor_support mlx5_ib i40iw ast drm_vram_helper ib_uverbs drm_ttm_helper i2c_algo_bit ttm intel_uncore drm_kms_helper ib_core syscopyarea pcspkr
[419132.067510] sysfillrect mei_me acpi_ipmi ioatdma sysimgblt sg i2c_i801 fb_sys_fops joydev cec mei lpc_ich i2c_smbus ipmi_si dca ipmi_devintf ipmi_msghandler acpi_pad acpi_power_meter nbd jool_siit(OE) jool_common(OE) cas_cache(OE) drm cas_disk(OE) fuse xfs libcrc32c sd_mod mlx5_core nvme ahci libahci crct10dif_pclmul crc32_pclmul nvme_core crc32c_intel megaraid_sas i40e ghash_clmulni_intel libata t10_pi mlxfw pci_hyperv_intf wmi
[419132.067523] CPU: 19 PID: 2438 Comm: handler101 Kdump: loaded Tainted: G S OE 5.10.0-216.0.0.115.oe2203sp4.x86_64 #1
[419132.067524] Hardware name: Inspur SA5212M5/SA5212M5, BIOS 4.1.28 05/05/2023
[419132.067524] RIP: 0010:deactivate_slab+0x103/0x590
[419132.067526] Code: 75 00 48 29 f8 48 99 49 f7 fb 48 85 d2 0f 85 c3 ea 75 00 4c 89 eb 83 c6 01 49 89 cd 4b 8b 4c 0d 00 44 8d 7e ff 48 85 c9 75 9b <41> 89 f7 48 85 c9 75 e1 44 89 7c 24 54 4c 8b 4c 24 48 31 c0 45 31
[419132.067526] RSP: 0018:ffffb8f80dc675e0 EFLAGS: 00000002
[419132.067527] RAX: 0000000000000010 RBX: ffff99858b0a3f60 RCX: ffff99858b0a3f60
[419132.067527] RDX: ffff99858b0a3400 RSI: 00000000114daddc RDI: ffff99858b0a3000
[419132.067528] RBP: ffffb8f80dc676b0 R08: 0000000000000001 R09: 0000000000000010
[419132.067528] R10: 0000000000000000 R11: 0000000000000002 R12: fffff021212c28c0
[419132.067528] R13: ffff99858b0a3f60 R14: ffff998520bc4900 R15: 00000000114daddb
[419132.067529] FS: 00007f8f81bed640(0000) GS:ffff999d1f580000(0000) knlGS:0000000000000000
[419132.067530] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[419132.067530] CR2: 00007fe70630d082 CR3: 0000000188280005 CR4: 00000000007726e0
[419132.067531] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[419132.067531] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[419132.067531] PKRU: 55555554
[419132.067532] Call Trace:
[419132.067532] <NMI>
[419132.067532] ? watchdog_hardlockup_check.part.0.cold+0x21/0x73
[419132.067533] ? __perf_event_overflow+0x4f/0x100
[419132.067533] ? handle_pmi_common+0x21a/0x2f0
[419132.067534] ? set_pte_vaddr_p4d+0x3f/0x50
[419132.067534] ? flush_tlb_one_kernel+0xa/0x20
[419132.067535] ? native_set_fixmap+0x4f/0x70
[419132.067535] ? ghes_copy_tofrom_phys+0x74/0x120
[419132.067536] ? __ghes_peek_estatus.isra.0+0x49/0xb0
[419132.067536] ? intel_pmu_handle_irq+0xf8/0x260
[419132.067537] ? perf_event_nmi_handler+0x28/0x50
[419132.067537] ? nmi_handle+0x55/0x100
[419132.067537] ? default_do_nmi+0x42/0x140
[419132.067538] ? exc_nmi+0x122/0x160
[419132.067538] ? end_repeat_nmi+0x16/0x67
[419132.067538] ? deactivate_slab+0x103/0x590
[419132.067539] ? deactivate_slab+0x103/0x590
[419132.067539] ? deactivate_slab+0x103/0x590
[419132.067539] </NMI>
[419132.067540] ? ttwu_queue+0x41/0xc0
[419132.067540] ? try_to_wake_up+0x1af/0x400
[419132.067540] ___slab_alloc+0x4dd/0x500
[419132.067541] ? ovs_flow_alloc+0x45/0x90 [openvswitch]
[419132.067541] ? sock_def_readable+0x37/0x70
[419132.067541] ? tun_net_xmit+0x238/0x440 [tun]
[419132.067542] ? ovs_flow_alloc+0x45/0x90 [openvswitch]
[419132.067542] kmem_cache_alloc_node+0x100/0x420
[419132.067543] ovs_flow_alloc+0x45/0x90 [openvswitch]
[419132.067543] ovs_flow_cmd_new+0x7b/0x460 [openvswitch]
[419132.067544] ? __dev_queue_xmit+0x3b0/0x8a0
[419132.067544] ? cpumask_next+0x17/0x20
[419132.067544] ? _qos_smt_check_need_resched+0x77/0x190
[419132.067545] ? qos_smt_check_need_resched+0x30/0x50
[419132.067545] ? update_curr+0x76/0x210
[419132.067546] ? newidle_balance+0x23f/0x2f0
[419132.067546] ? cpumask_next+0x17/0x20
[419132.067546] ? qos_smt_send_ipi+0x55/0x100
[419132.067547] ? __nlmsg_put+0x63/0x80
[419132.067547] ? __nla_validate_parse+0x11f/0x190
[419132.067547] ? __nla_parse+0x22/0x30
[419132.067548] ? genl_family_rcv_msg_attrs_parse.constprop.0+0x8f/0xe0
[419132.067548] genl_family_rcv_msg_doit+0xe7/0x150
[419132.067548] genl_family_rcv_msg+0xb0/0x160
[419132.067549] ? ovs_flow_cmd_del+0x280/0x280 [openvswitch]
[419132.067549] genl_rcv_msg+0x47/0xa0
[419132.067549] ? genl_family_rcv_msg+0x160/0x160
[419132.067550] netlink_rcv_skb+0x4e/0x100
[419132.067550] genl_rcv+0x24/0x40
[419132.067551] netlink_unicast+0x12a/0x1d0
[419132.067551] netlink_sendmsg+0x286/0x490
[419132.067552] __sock_sendmsg+0x5f/0x70
[419132.067552] ____sys_sendmsg+0x232/0x270
[419132.067553] ? import_iovec+0x19/0x20
[419132.067553] ? sendmsg_copy_msghdr+0x80/0xa0
[419132.067553] ___sys_sendmsg+0x75/0xc0
[419132.067554] ? ___sys_recvmsg+0x8e/0x110
[419132.067554] ? __fget_files+0x79/0xb0
[419132.067555] __sys_sendmsg+0x59/0xa0
[419132.067555] do_syscall_64+0x3d/0x80
[419132.067555] entry_SYSCALL_64_after_hwframe+0x62/0xc7
[419132.067556] RIP: 0033:0x7f8f89530b8d
[419132.067557] Code: 28 89 54 24 1c 48 89 74 24 10 89 7c 24 08 e8 da 8d f7 ff 8b 54 24 1c 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 44 24 08 e8 2e 8e f7 ff 48
[419132.067557] RSP: 002b:00007f8f81b8de90 EFLAGS: 00000293 ORIG_RAX: 000000000000002e
[419132.067558] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f8f89530b8d
[419132.067558] RDX: 0000000000000000 RSI: 00007f8f81b8df20 RDI: 0000000000000070
[419132.067559] RBP: 00007f8f81b8ed10 R08: 0000000000000000 R09: 0000000000000001
[419132.067559] R10: 0000000000000006 R11: 0000000000000293 R12: 00007f8f50000b70
[419132.067560] R13: 0000000000000001 R14: 00007f8f50000b70 R15: 00007f8f81b8df20
[419132.067560] Kernel panic - not syncing: Hard LOCKUP
[419132.067561] CPU: 19 PID: 2438 Comm: handler101 Kdump: loaded Tainted: G S OE 5.10.0-216.0.0.115.oe2203sp4.x86_64 #1
[419132.067561] Hardware name: Inspur SA5212M5/SA5212M5, BIOS 4.1.28 05/05/2023
[419132.067561] Call Trace:
[419132.067562] <NMI>
[419132.067562] dump_stack+0x57/0x6e
[419132.067562] panic+0x10e/0x2ef
[419132.067563] nmi_panic.cold+0xc/0xc