x86架构22.03 SP1版本下,物理内存剩余很多,但swap使用率仍达到100%

x86架构22.03 SP1版本下,物理内存剩余很多,但swap使用率仍达到100%

环境:
5.10.0-136.12.0.86.oe2203sp1.x86_64

~]# lscpu
Architecture:            x86_64
  CPU op-mode(s):        32-bit, 64-bit
  Address sizes:         46 bits physical, 48 bits virtual
  Byte Order:            Little Endian
CPU(s):                  56
  On-line CPU(s) list:   0-55
Vendor ID:               GenuineIntel
  BIOS Vendor ID:        Intel(R) Corporation
  Model name:            Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
    BIOS Model name:     Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
    CPU family:          6
    Model:               79
    Thread(s) per core:  2
    Core(s) per socket:  14
    Socket(s):           2
    Stepping:            1
    CPU max MHz:         3500.0000
    CPU min MHz:         1200.0000
    BogoMIPS:            5187.71
    Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology 
                         nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_faul
                         t epb cat_l3 cdp_l3 invpcid_single pti intel_ppin tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm cqm rdt_a rdseed adx smap intel_pt xsaveopt cqm_llc 
                         cqm_occup_llc cqm_mbm_total cqm_mbm_local dtherm ida arat pln pts
Virtualization features: 
  Virtualization:        VT-x
Caches (sum of all):     
  L1d:                   896 KiB (28 instances)
  L1i:                   896 KiB (28 instances)
  L2:                    7 MiB (28 instances)
  L3:                    70 MiB (2 instances)
NUMA:                    
  NUMA node(s):          2
  NUMA node0 CPU(s):     0-13,28-41
  NUMA node1 CPU(s):     14-27,42-55

场景:MySQL Server

top - 11:24:26 up 403 days, 22:59,  1 user,  load average: 4.58, 4.96, 4.96
Tasks: 624 total,   2 running, 621 sleeping,   0 stopped,   1 zombie
%Cpu(s):  3.6 us,  0.7 sy,  0.0 ni, 94.3 id,  1.1 wa,  0.1 hi,  0.3 si,  0.0 st
MiB Mem : 257020.8 total,   9419.8 free,  92875.7 used, 154725.3 buff/cache    
MiB Swap:   8192.0 total,      0.4 free,   8191.6 used. 136077.3 avail Mem 

    PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM   SWAP     TIME+ COMMAND                                                                                                                                                   
2175575 mysql     20   0   96.1g  77.4g  10856 S 224.8  30.8   8.0g    89w+0d mysqld                                                                                                                                                    
   1401 root      20   0  387256  45292  40388 S   0.7   0.0  14744     2w+4d tuned     
~]$ cat /proc/cmdline 
BOOT_IMAGE=/vmlinuz-5.10.0-136.12.0.86.oe2203sp1.x86_64 root=UUID=84ba7b6b-b2a9-4af4-befa-666ad286b162 ro rhgb quiet crashkernel=auto net.ifnames=1 biosdevname=0 ipv6.disable=1 intel_pstate=enable intel_idle.max_cstate=0 processor.max_cstate=0 sched_steal_node_limit=4 nohz=on audit=0 resume=UUID=066d989b-8036-4912-a60c-a3eab4f24c10 cgroup_disable=files apparmor=0 crashkernel=512M selinux=0

~]# free -m
               total        used        free      shared  buff/cache   available
Mem:          257020       92871        9447        4138      154702      136089
Swap:           8191        8191           0

~]# sysctl -a|grep swap
vm.swappiness = 1

~]# cat /sys/kernel/mm/transparent_hugepage/enabled 
always madvise [never]

 ~]# cat /sys/kernel/mm/transparent_hugepage/defrag 
always defer defer+madvise madvise [never]

 ~]# sysctl -a|grep water
vm.watermark_boost_factor = 15000
vm.watermark_scale_factor = 300

下图分为上、中、下三部分。上图中的 numa lowest 代表 2 个 node 中内存最低的那个节点,其值始终没有低于 1.3G,因此不存在因 NUMA 导致某个节点内存不足而使用 swap 的情况。

如下图所示,在 free 显示可用内存充足的情况下,swap in 和 swap out 仍然持续发生。

追加信息

 ~]$ grep -A 15 'Normal' /proc/zoneinfo
Node 0, zone   Normal
  pages free     2234835
        min      11125
        low      985991
        high     1960857
        spanned  33030144
        present  33030144
        managed  32495534
        cma      0
        protection: (0, 0, 0, 0, 0)
      nr_free_pages 2234835
      nr_zone_inactive_anon 5853262
      nr_zone_active_anon 5151379
      nr_zone_inactive_file 13587548
      nr_zone_active_file 2997677
      nr_zone_unevictable 768
--
Node 1, zone   Normal
  per-node stats
      nr_inactive_anon 5739732
      nr_active_anon 4659303
      nr_inactive_file 16765889
      nr_active_file 3706790
      nr_unevictable 0
      nr_slab_reclaimable 108129
      nr_slab_unreclaimable 43483
      nr_isolated_anon 0
      nr_isolated_file 0
      workingset_nodes 49455
      workingset_refault_anon 1419427
      workingset_refault_file 1849463
      workingset_activate_anon 28320
      workingset_activate_file 1607655
~]# for file in /proc/*/status ; do awk '/VmSwap|Name|^Pid/{printf $2 " " $3}END{ print ""}' $file; done | sort -k 3 -n -r | head 
mysqld 2175575 8357284 kB
glusterfs 7421 19960 kB
tuned 1401 14740 kB
oradba 2225759 3992 kB
oradba 2225772 3868 kB
oradba 2225782 3832 kB
systemd-udevd 1167 3472 kB
NetworkManager 1380 1260 kB
rngd 1332 960 kB
sudo 2173504 940 kB
~]$ cat /proc/sys/vm/min_free_kbytes 
90112

 ~]$ cat /proc/sys/vm/zone_reclaim_mode 
0

~]$ numactl -H
available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 28 29 30 31 32 33 34 35 36 37 38 39 40 41
node 0 size: 128004 MB
node 0 free: 9419 MB
node 1 cpus: 14 15 16 17 18 19 20 21 22 23 24 25 26 27 42 43 44 45 46 47 48 49 50 51 52 53 54 55
node 1 size: 129016 MB
node 1 free: 4979 MB
node distances:
node   0   1 
  0:  10  21 
  1:  21  10 

sysctl vm.swappiness=10