⭕️ k8s集群断电后发现etcd容器停止运行导致集群问题
1️⃣ 问题
##问题
[root@k8s-master-node1 ~]# kubectl get pod
The connection to the server apiserver.cluster.local:6443 was refused - did you specify the right host or port?
2️⃣ 检查和解决
###查看kubelet日志,只有反复出现的"node not found"报错,没有太多有用的信息
[root@k8s-master-node1 ~]# journalctl -xe
Apr 06 00:19:13 k8s-master-node1 kubelet[6164]: E0406 00:19:13.251659 6164 kubelet.go:2407] "Error getting node" err="node \"k8s-master-node1\" not found"
Apr 06 00:19:13 k8s-master-node1 kubelet[6164]: E0406 00:19:13.353761 6164 kubelet.go:2407] "Error getting node" err="node \"k8s-master-node1\" not found"
Apr 06 00:19:13 k8s-master-node1 kubelet[6164]: E0406 00:19:13.455855 6164 kubelet.go:2407] "Error getting node" err="node \"k8s-master-node1\" not found"
Apr 06 00:19:13 k8s-master-node1 kubelet[6164]: E0406 00:19:13.556008 6164 kubelet.go:2407] "Error getting node" err="node \"k8s-master-node1\" not found"
Apr 06 00:19:13 k8s-master-node1 kubelet[6164]: E0406 00:19:13.658080 6164 kubelet.go:2407] "Error getting node" err="node \"k8s-master-node1\" not found"
Apr 06 00:19:13 k8s-master-node1 kubelet[6164]: E0406 00:19:13.758265 6164 kubelet.go:2407] "Error getting node" err="node \"k8s-master-node1\" not found"
Apr 06 00:19:13 k8s-master-node1 kubelet[6164]: E0406 00:19:13.794341 6164 eviction_manager.go:255] "Eviction manager: failed to get summary stats" err="failed to get node info: node \"k8s-master-node1\" no
Apr 06 00:19:13 k8s-master-node1 kubelet[6164]: E0406 00:19:13.859901 6164 kubelet.go:2407] "Error getting node" err="node \"k8s-master-node1\" not found"
###检查容器运行状态,发现etcd容器启动失败(处于Exited状态),于是查看该容器的日志
[root@k8s-master-node1 ~]# docker ps -a |grep etcd
c5c8c6759d06 004811815584 "etcd --advertise-cl…" 2 minutes ago Exited (2) 2 minutes ago k8s_etcd_etcd-k8s-master-node1_kube-system_970efeef4a8f073accb807a0fe4d3bdf_210
9848fd1569b2 k8s.gcr.io/pause:3.5 "/pause" 47 minutes ago Up 47 minutes k8s_POD_etcd-k8s-master-node1_kube-system_970efeef4a8f073accb807a0fe4d3bdf_5
[root@k8s-master-node1 ~]# docker logs c5c
{"level":"info","ts":"2023-04-05T16:18:17.531Z","caller":"etcdmain/etcd.go:72","msg":"Running: ","args":["etcd","--advertise-client-urls=https://192.168.200.10:2379","--cert-file=/etc/kubernetes/pki/etcd/server.crt","--client-cert-auth=true","--data-dir=/var/lib/etcd","--initial-advertise-peer-urls=https://192.168.200.10:2380","--initial-cluster=k8s-master-node1=https://192.168.200.10:2380","--key-file=/etc/kubernetes/pki/etcd/server.key","--listen-client-urls=https://127.0.0.1:2379,https://192.168.200.10:2379","--listen-metrics-urls=http://127.0.0.1:2381","--listen-peer-urls=https://192.168.200.10:2380","--name=k8s-master-node1","--peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt","--peer-client-cert-auth=true","--peer-key-file=/etc/kubernetes/pki/etcd/peer.key","--peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt","--snapshot-count=10000","--trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt"]}
{"level":"info","ts":"2023-04-05T16:18:17.531Z","caller":"etcdmain/etcd.go:115","msg":"server has been already initialized","data-dir":"/var/lib/etcd","dir-type":"member"}
{"level":"info","ts":"2023-04-05T16:18:17.531Z","caller":"embed/etcd.go:131","msg":"configuring peer listeners","listen-peer-urls":["https://192.168.200.10:2380"]}
{"level":"info","ts":"2023-04-05T16:18:17.531Z","caller":"embed/etcd.go:478","msg":"starting with peer TLS","tls-info":"cert = /etc/kubernetes/pki/etcd/peer.crt, key = /etc/kubernetes/pki/etcd/peer.key, client-cert=, client-key=, trusted-ca = /etc/kubernetes/pki/etcd/ca.crt, client-cert-auth = true, crl-file = ","cipher-suites":[]}
{"level":"info","ts":"2023-04-05T16:18:17.532Z","caller":"embed/etcd.go:139","msg":"configuring client listeners","listen-client-urls":["https://127.0.0.1:2379","https://192.168.200.10:2379"]}
{"level":"info","ts":"2023-04-05T16:18:17.532Z","caller":"embed/etcd.go:307","msg":"starting an etcd server","etcd-version":"3.5.0","git-sha":"946a5a6f2","go-version":"go1.16.3","go-os":"linux","go-arch":"amd64","max-cpu-set":4,"max-cpu-available":4,"member-initialized":true,"name":"k8s-master-node1","data-dir":"/var/lib/etcd","wal-dir":"","wal-dir-dedicated":"","member-dir":"/var/lib/etcd/member","force-new-cluster":false,"heartbeat-interval":"100ms","election-timeout":"1s","initial-election-tick-advance":true,"snapshot-count":10000,"snapshot-catchup-entries":5000,"initial-advertise-peer-urls":["https://192.168.200.10:2380"],"listen-peer-urls":["https://192.168.200.10:2380"],"advertise-client-urls":["https://192.168.200.10:2379"],"listen-client-urls":["https://127.0.0.1:2379","https://192.168.200.10:2379"],"listen-metrics-urls":["http://127.0.0.1:2381"],"cors":["*"],"host-whitelist":["*"],"initial-cluster":"","initial-cluster-state":"new","initial-cluster-token":"","quota-size-bytes":2147483648,"pre-vote":true,"initial-corrupt-check":false,"corrupt-check-time-interval":"0s","auto-compaction-mode":"periodic","auto-compaction-retention":"0s","auto-compaction-interval":"0s","discovery-url":"","discovery-proxy":"","downgrade-check-interval":"5s"}
panic: freepages: failed to get all reachable pages (page 293369838209412419: out of bounds: 2141)
goroutine 163 [running]:
go.etcd.io/bbolt.(*DB).freepages.func2(0xc0004a4540)
/home/remote/sbatsche/.gvm/pkgsets/go1.16.3/global/pkg/mod/go.etcd.io/bbolt@v1.3.6/db.go:1056 +0xe9
created by go.etcd.io/bbolt.(*DB).freepages
/home/remote/sbatsche/.gvm/pkgsets/go1.16.3/global/pkg/mod/go.etcd.io/bbolt@v1.3.6/db.go:1054 +0x1cd
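##从报错(bbolt的panic: freepages ... out of bounds)来看,应该是etcd的数据文件在断电时损坏了。数据损坏了我也没有备份,由于我这不是生产环境,干脆一不做二不休,把数据目录改名,让etcd重新初始化
##注意:这样做会丢失集群原有的全部数据(etcd会以空库启动),生产环境应优先尝试用快照备份恢复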
##首先得查看etcd的数据文件是挂载到哪个目录的(用 docker inspect <etcd容器ID> 查看 HostConfig.Binds,下面是输出中的相关片段)
"HostConfig": {
"Binds": [
"/var/lib/etcd:/var/lib/etcd",
"/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd",
"/var/lib/kubelet/pods/970efeef4a8f073accb807a0fe4d3bdf/etc-hosts:/etc/hosts",
"/var/lib/kubelet/pods/970efeef4a8f073accb807a0fe4d3bdf/containers/etcd/34cc8ef0:/dev/termination-log"
],
##是在/var/lib/etcd/目录下
[root@k8s-master-node1 etcd]# pwd
/var/lib/etcd
[root@k8s-master-node1 etcd]# ls
member
[root@k8s-master-node1 etcd]# mv member member.bak
[root@k8s-master-node1 etcd]# ls
member.bak
###删除已退出的etcd容器并重启kubelet,kubelet会根据静态Pod清单重新创建etcd容器
###注意:docker rm 的参数是容器ID或容器名(例如上面的c5c8c6759d06),不是Pod名
docker rm -f [容器ID]
systemctl restart kubelet
##查看是否运行
[root@k8s-master-node1 etcd]# docker ps -a |grep etcd
ad7ecfb16212 004811815584 "etcd --advertise-cl…" 8 minutes ago Up 8 minutes k8s_etcd_etcd-k8s-master-node1_kube-system_970efeef4a8f073accb807a0fe4d3bdf_214
62b8e7a11024 k8s.gcr.io/pause:3.5 "/pause" 10 minutes ago Up 10 minutes k8s_POD_etcd-k8s-master-node1_kube-system_970efeef4a8f073accb807a0fe4d3bdf_0
##查看系统是否可用
[root@k8s-master-node1 etcd]# kubectl get pod -n kube-system
NAME READY STATUS RESTARTS AGE
etcd-k8s-master-node1 1/1 Running 214 5m57s
kube-apiserver-k8s-master-node1 1/1 Running 212 6m1s
kube-controller-manager-k8s-master-node1 1/1 Running 8 5m59s
kube-scheduler-k8s-master-node1 1/1 Running 8 4m47s