Ceph 15 (Octopus) troubleshooting on CentOS 8

Troubleshooting issues triggered by restarting a test cluster after it had been shut down for two weeks.

eg1: troubleshooting an MDS daemon in error state

[root@ceph135 ~]# ceph orch ps --daemon-type mds
NAME                           HOST     STATUS          REFRESHED  AGE  VERSION    IMAGE NAME               IMAGE ID      CONTAINER ID
mds.fs-cluster.ceph135.loopbh  ceph135  running (98s)   17s ago    13d  15.2.3     docker.io/ceph/ceph:v15  d72755c420bc  8922f82d6d47
mds.fs-cluster.ceph136.lhwyag  ceph136  error           9s ago     13d  <unknown>  docker.io/ceph/ceph:v15  <unknown>     <unknown>
mds.fs-cluster.ceph137.utnvcx  ceph137  running (106s)  73s ago    13d  15.2.3     docker.io/ceph/ceph:v15  d72755c420bc  5ab7a1c6c555
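Before force-removing the daemon, it is usually worth checking its logs on the affected host to see why it errored. A minimal sketch (the daemon name is the one shown above and the fsid is this cluster's b3add0aa-aee7-11ea-a3e4-5e7ce92c6bef; adjust both to your environment, and add --fsid to cephadm if it cannot infer the cluster):

# Show the failed daemon's container logs via cephadm
[root@ceph136 ~]# cephadm logs --name mds.fs-cluster.ceph136.lhwyag
# Or query journald directly; the systemd unit name embeds the cluster fsid
[root@ceph136 ~]# journalctl -u ceph-b3add0aa-aee7-11ea-a3e4-5e7ce92c6bef@mds.fs-cluster.ceph136.lhwyag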
# Force-remove the MDS daemon that is in the error state
[root@ceph135 ~]# ceph orch daemon rm mds.fs-cluster.ceph136.lhwyag --force
Removed mds.fs-cluster.ceph136.lhwyag from host 'ceph136'
# Confirm it has been removed
[root@ceph135 ~]# ceph orch ps --daemon-type mds
NAME                           HOST     STATUS        REFRESHED  AGE  VERSION  IMAGE NAME               IMAGE ID      CONTAINER ID
mds.fs-cluster.ceph135.loopbh  ceph135  running (2m)  49s ago    13d  15.2.3   docker.io/ceph/ceph:v15  d72755c420bc  8922f82d6d47
mds.fs-cluster.ceph137.utnvcx  ceph137  running (2m)  105s ago   13d  15.2.3   docker.io/ceph/ceph:v15  d72755c420bc  5ab7a1c6c555
# Re-apply the placement to bring up MDS daemons on all three nodes
[root@ceph135 ~]# ceph orch apply mds fs-cluster --placement="3 ceph135 ceph136 ceph137"
Scheduled mds.myfs update...
# Wait for the daemons to come up, then confirm
[root@ceph135 ~]# ceph orch ps --daemon-type mds
NAME                           HOST     STATUS         REFRESHED  AGE  VERSION  IMAGE NAME               IMAGE ID      CONTAINER ID
mds.fs-cluster.ceph135.loopbh  ceph135  running (13m)  19s ago    13d  15.2.3   docker.io/ceph/ceph:v15  d72755c420bc  8922f82d6d47
mds.fs-cluster.ceph136.ihcqok  ceph136  running (10m)  4m ago     10m  15.2.3   docker.io/ceph/ceph:v15  d72755c420bc  2f1c324a6424
mds.fs-cluster.ceph137.utnvcx  ceph137  running (13m)  4m ago     13d  15.2.3   docker.io/ceph/ceph:v15  d72755c420bc  5ab7a1c6c555
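With all three daemons running again, the MDS state can also be cross-checked from the filesystem side. A hedged verification sketch, assuming the filesystem carries the same name as the MDS service (fs-cluster):

# Check MDS ranks/standbys and overall cluster health
[root@ceph135 ~]# ceph fs status fs-cluster
[root@ceph135 ~]# ceph health detail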

eg2: troubleshooting a MON daemon in error state

[root@ceph135 ~]# ceph orch ps --daemon-type mon
NAME         HOST     STATUS        REFRESHED  AGE  VERSION    IMAGE NAME               IMAGE ID      CONTAINER ID
mon.ceph135  ceph135  running (7h)  3m ago     13d  15.2.3     docker.io/ceph/ceph:v15  d72755c420bc  e3a7e8e1d410
mon.ceph136  ceph136  running (7h)  114s ago   13d  15.2.3     docker.io/ceph/ceph:v15  d72755c420bc  256140fce471
mon.ceph137  ceph137  error         77s ago    13d  <unknown>  docker.io/ceph/ceph:v15  <unknown>     <unknown>
# Force-remove the MON daemon that is in the error state
[root@ceph135 ~]# ceph orch daemon rm mon.ceph137 --force
Removed mon.ceph137 from host 'ceph137'
# Re-add the removed MON by labeling the host
[root@ceph135 ~]# ceph orch host label add ceph137 mon
Added label mon to host ceph137
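Optionally, confirm the label actually landed on the host before re-applying the spec; `ceph orch host ls` lists each host together with its labels:

[root@ceph135 ~]# ceph orch host ls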
# Re-apply the MON service spec
[root@ceph135 ~]# ceph orch apply mon label:mon
Scheduled mon update...
# However, the problem is not solved; a check shows the daemon is still in the error state
[root@ceph135 ~]# ceph orch ps  --daemon-type mon
NAME         HOST     STATUS        REFRESHED  AGE  VERSION    IMAGE NAME               IMAGE ID      CONTAINER ID
mon.ceph135  ceph135  running (7h)  3m ago     13d  15.2.3     docker.io/ceph/ceph:v15  d72755c420bc  e3a7e8e1d410
mon.ceph136  ceph136  running (7h)  114s ago   13d  15.2.3     docker.io/ceph/ceph:v15  d72755c420bc  256140fce471
mon.ceph137  ceph137  error         77s ago    13d  <unknown>  docker.io/ceph/ceph:v15  <unknown>     <unknown>
# Log in to ceph137 and check the logs: the old MON container is still left over in container storage, with the following error
# less /var/log/messages
Jun 29 17:46:21 ceph137 bash[22796]: Error: error creating container storage: the container name "ceph-b3add0aa-aee7-11ea-a3e4-5e7ce92c6bef-mon.ceph137" is already in use by "df9ba2198599a88d19f323d73b093193c00eb8f8cc71863d6f2264d4401e47f9". You have to remove that container to be able to reuse that name.: that name is already in use
# Go into the container storage directory and delete the leftover container's directory
[root@ceph137 df9ba2198599a88d19f323d73b093193c00eb8f8cc71863d6f2264d4401e47f9]# pwd
/var/lib/containers/storage/overlay-containers/df9ba2198599a88d19f323d73b093193c00eb8f8cc71863d6f2264d4401e47f9
[root@ceph137 overlay-containers]# rm -rf df9ba2198599a88d19f323d73b093193c00eb8f8cc71863d6f2264d4401e47f9
# Also edit the containers.json file and delete the stale entry for that container
[root@ceph137 overlay-containers]# vim containers.json
{"id":"df9ba2198599a88d19f323d73b093193c00eb8f8cc71863d6f2264d4401e47f9","names":["ceph-b3add0aa-aee7-11ea-a3e4-5e7ce92c6bef-mon.ceph137"],"image":"d72755c420bcbdae08d063de6035d060ea0487f8a43f777c75bdbfcd9fd907fa","layer":"4b2d43a92e75486a3d333193d1d6e61e5d2c95440f11460f05c9dee4967842a7","metadata":"{\"image-name\":\"docker.io/ceph/ceph:v15\",\"image-id\":\"d72755c420bcbdae08d063de6035d060ea0487f8a43f777c75bdbfcd9fd907fa\",\"name\":\"ceph-b3add0aa-aee7-11ea-a3e4-5e7ce92c6bef-mon.ceph137\",\"created-at\":1592265875}","created":"2020-06-16T00:04:37.995764867Z","flags":{"MountLabel":"","ProcessLabel":""}}

# Then re-run the same MON rebuild commands as before
[root@ceph135 ~]# ceph orch daemon rm mon.ceph137 --force
Removed mon.ceph137 from host 'ceph137'
[root@ceph135 ~]# ceph orch host label add ceph137 mon
Added label mon to host ceph137
[root@ceph135 ~]# ceph orch apply mon label:mon
Scheduled mon update...
# Verify
[root@ceph135 ~]# ceph orch ls
NAME                       RUNNING  REFRESHED  AGE  PLACEMENT                IMAGE NAME                           IMAGE ID
alertmanager                   1/1  8m ago     2w   count:1                  docker.io/prom/alertmanager:latest   0881eb8f169f
crash                          3/3  8m ago     2w   *                        docker.io/ceph/ceph:v15              d72755c420bc
grafana                        1/1  8m ago     2w   count:1                  docker.io/ceph/ceph-grafana:latest   87a51ecf0b1c
mds.fs-cluster                 3/3  8m ago     2w   count:3                  docker.io/ceph/ceph:v15              d72755c420bc
mgr                            3/3  8m ago     2w   count:3                  docker.io/ceph/ceph:v15              d72755c420bc
mon                            3/3  8m ago     5h   label:mon                docker.io/ceph/ceph:v15              d72755c420bc
node-exporter                  3/3  8m ago     2w   *                        docker.io/prom/node-exporter:latest  14191dbfb45b
osd.all-available-devices      0/3  -          -    *                        <unknown>                            <unknown>
prometheus                     1/1  8m ago     2w   count:1                  docker.io/prom/prometheus:latest     39d1866a438a
rgw.rgw-org.zone-dc1           2/2  6m ago     2w   count:2 ceph136,ceph137  docker.io/ceph/ceph:v15              d72755c420bc

[root@ceph135 ~]# ceph orch ps --daemon_type mon
NAME         HOST     STATUS         REFRESHED  AGE  VERSION  IMAGE NAME               IMAGE ID      CONTAINER ID
mon.ceph135  ceph135  running (29m)  4m ago     2w   15.2.3   docker.io/ceph/ceph:v15  d72755c420bc  2b8ec4c38156
mon.ceph136  ceph136  running (29m)  4m ago     2w   15.2.3   docker.io/ceph/ceph:v15  d72755c420bc  2440614e92f3
mon.ceph137  ceph137  running (2m)   54s ago    2m   15.2.3   docker.io/ceph/ceph:v15  d72755c420bc  ff696183e61f

# Log in to the host to verify; everything is back to normal
[root@ceph137 overlay-containers]# podman ps -a
CONTAINER ID  IMAGE                    COMMAND               CREATED             STATUS                 PORTS  NAMES
ff696183e61f  docker.io/ceph/ceph:v15  -n mon.ceph137 -f...  About a minute ago  Up About a minute ago         ceph-b3add0aa-aee7-11ea-a3e4-5e7ce92c6bef-mon.ceph137
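Monitor quorum can also be confirmed from the cluster side rather than per host; both commands below are standard Ceph CLI:

# Show the monitor map summary and the current quorum
[root@ceph135 ~]# ceph mon stat
[root@ceph135 ~]# ceph quorum_status -f json-pretty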

eg3: removing a redundant CephFS MDS service (mds.myfs)

[root@ceph135 ~]# ceph orch ls
NAME                       RUNNING  REFRESHED  AGE  PLACEMENT                        IMAGE NAME                           IMAGE ID
alertmanager                   1/1  103s ago   2w   count:1                          docker.io/prom/alertmanager:latest   0881eb8f169f
crash                          3/3  3m ago     2w   *                                docker.io/ceph/ceph:v15              d72755c420bc
grafana                        1/1  103s ago   2w   count:1                          docker.io/ceph/ceph-grafana:latest   87a51ecf0b1c
mds.fs-cluster                 3/3  3m ago     13d  count:3                          docker.io/ceph/ceph:v15              d72755c420bc
mds.myfs                       3/3  -          41m  count:3 ceph135,ceph136,ceph137  mix                                  mix
mgr                            3/3  3m ago     13d  count:3                          docker.io/ceph/ceph:v15              d72755c420bc
mon                            2/3  3m ago     17m  label:mon                        docker.io/ceph/ceph:v15              mix
node-exporter                  3/3  3m ago     2w   *                                docker.io/prom/node-exporter:latest  14191dbfb45b
osd.all-available-devices      0/3  -          -    *                                <unknown>                            <unknown>
prometheus                     1/1  103s ago   2w   count:1                          docker.io/prom/prometheus:latest     39d1866a438a
rgw.rgw-org.zone-dc1           2/2  3m ago     13d  count:2 ceph136,ceph137          docker.io/ceph/ceph:v15              d72755c420bc

# Remove the redundant mds.myfs service
[root@ceph135 ~]# ceph orch rm mds.myfs
Removed service mds.myfs

# Right after the removal, mds.myfs is shown as <unmanaged> (0 desired) while its daemons are being cleaned up
[root@ceph135 ~]# ceph orch ls
NAME                       RUNNING  REFRESHED  AGE  PLACEMENT                IMAGE NAME                           IMAGE ID
alertmanager                   1/1  2m ago     2w   count:1                  docker.io/prom/alertmanager:latest   0881eb8f169f
crash                          3/3  2m ago     2w   *                        docker.io/ceph/ceph:v15              d72755c420bc
grafana                        1/1  2m ago     2w   count:1                  docker.io/ceph/ceph-grafana:latest   87a51ecf0b1c
mds.fs-cluster                 3/3  2m ago     13d  count:3                  docker.io/ceph/ceph:v15              d72755c420bc
mds.myfs                       3/0  2m ago     -    <unmanaged>              docker.io/ceph/ceph:v15              d72755c420bc
mgr                            3/3  2m ago     13d  count:3                  docker.io/ceph/ceph:v15              d72755c420bc
mon                            2/3  2m ago     18m  label:mon                docker.io/ceph/ceph:v15              mix
node-exporter                  3/3  2m ago     2w   *                        docker.io/prom/node-exporter:latest  14191dbfb45b
osd.all-available-devices      0/3  -          -    *                        <unknown>                            <unknown>
prometheus                     1/1  2m ago     2w   count:1                  docker.io/prom/prometheus:latest     39d1866a438a
rgw.rgw-org.zone-dc1           2/2  46s ago    13d  count:2 ceph136,ceph137  docker.io/ceph/ceph:v15              d72755c420bc

# A little later the mds.myfs entry is gone completely
[root@ceph135 ~]# ceph orch ls
NAME                       RUNNING  REFRESHED  AGE  PLACEMENT                IMAGE NAME                           IMAGE ID
alertmanager                   1/1  34s ago    2w   count:1                  docker.io/prom/alertmanager:latest   0881eb8f169f
crash                          3/3  2m ago     2w   *                        docker.io/ceph/ceph:v15              d72755c420bc
grafana                        1/1  34s ago    2w   count:1                  docker.io/ceph/ceph-grafana:latest   87a51ecf0b1c
mds.fs-cluster                 3/3  2m ago     13d  count:3                  docker.io/ceph/ceph:v15              d72755c420bc
mgr                            3/3  2m ago     13d  count:3                  docker.io/ceph/ceph:v15              d72755c420bc
mon                            2/3  2m ago     20m  label:mon                docker.io/ceph/ceph:v15              mix
node-exporter                  3/3  2m ago     2w   *                        docker.io/prom/node-exporter:latest  14191dbfb45b
osd.all-available-devices      0/3  -          -    *                        <unknown>                            <unknown>
prometheus                     1/1  34s ago    2w   count:1                  docker.io/prom/prometheus:latest     39d1866a438a
rgw.rgw-org.zone-dc1           2/2  2m ago     13d  count:2 ceph136,ceph137  docker.io/ceph/ceph:v15              d72755c420bc

[root@ceph135 ~]# ceph orch ps --daemon_type mds
NAME                           HOST     STATUS         REFRESHED  AGE  VERSION  IMAGE NAME               IMAGE ID      CONTAINER ID
mds.fs-cluster.ceph135.loopbh  ceph135  running (48m)  21s ago    13d  15.2.3   docker.io/ceph/ceph:v15  d72755c420bc  8922f82d6d47
mds.fs-cluster.ceph136.ihcqok  ceph136  running (45m)  13s ago    45m  15.2.3   docker.io/ceph/ceph:v15  d72755c420bc  2f1c324a6424
mds.fs-cluster.ceph137.utnvcx  ceph137  running (48m)  115s ago   13d  15.2.3   docker.io/ceph/ceph:v15  d72755c420bc  5ab7a1c6c555
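Note that ceph orch rm mds.myfs only removes the orchestrator-managed MDS daemons. If a redundant CephFS filesystem was also created alongside them (for example one actually named myfs; this is an assumption, verify with ceph fs ls), it has to be removed separately, and destructively:

# Destructive: only run this if the myfs filesystem really holds no needed data
[root@ceph135 ~]# ceph fs ls
[root@ceph135 ~]# ceph fs fail myfs
[root@ceph135 ~]# ceph fs rm myfs --yes-i-really-mean-it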

eg4: an OSD bug was also hit along the way; it is not resolved yet and the OSD may have to be rebuilt

Bug: https://tracker.ceph.com/issues/44356

[root@ceph135 ~]# ceph orch daemon add osd ceph136:/dev/sdb
Error EINVAL: Traceback (most recent call last):
  File "/usr/share/ceph/mgr/mgr_module.py", line 1153, in _handle_command
    return self.handle_command(inbuf, cmd)
  File "/usr/share/ceph/mgr/orchestrator/_interface.py", line 110, in handle_command
    return dispatch[cmd['prefix']].call(self, cmd, inbuf)
  File "/usr/share/ceph/mgr/mgr_module.py", line 308, in call
    return self.func(mgr, **kwargs)
  File "/usr/share/ceph/mgr/orchestrator/_interface.py", line 72, in <lambda>
    wrapper_copy = lambda *l_args, **l_kwargs: wrapper(*l_args, **l_kwargs)
  File "/usr/share/ceph/mgr/orchestrator/_interface.py", line 63, in wrapper
    return func(*args, **kwargs)
  File "/usr/share/ceph/mgr/orchestrator/module.py", line 597, in _daemon_add_osd
    completion = self.create_osds(drive_group)
  File "/usr/share/ceph/mgr/orchestrator/_interface.py", line 1542, in inner
    completion = self._oremote(method_name, args, kwargs)
  File "/usr/share/ceph/mgr/orchestrator/_interface.py", line 1614, in _oremote
    return mgr.remote(o, meth, *args, **kwargs)
  File "/usr/share/ceph/mgr/mgr_module.py", line 1515, in remote
    args, kwargs)
RuntimeError: Remote method threw exception: Traceback (most recent call last):
  File "/usr/share/ceph/mgr/cephadm/module.py", line 561, in wrapper
    return AsyncCompletion(value=f(*args, **kwargs), name=f.__name__)
  File "/usr/share/ceph/mgr/cephadm/module.py", line 2148, in create_osds
    replace_osd_ids=drive_group.osd_id_claims.get(host, []))
  File "/usr/share/ceph/mgr/cephadm/module.py", line 2254, in _create_osd
    code, '\n'.join(err)))
RuntimeError: cephadm exited with an error code: 1, stderr:INFO:cephadm:/bin/podman:stderr WARNING: The same type, major and minor should not be used for multiple devices.
INFO:cephadm:/bin/podman:stderr Running command: /usr/bin/ceph-authtool --gen-print-key
INFO:cephadm:/bin/podman:stderr Running command: /usr/bin/ceph --cluster ceph --name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/ceph.keyring -i - osd new f9b16345-adac-4662-9beb-8178f7ab0d6a
INFO:cephadm:/bin/podman:stderr  stderr: Error EEXIST: entity osd.2 exists but key does not match
INFO:cephadm:/bin/podman:stderr -->  RuntimeError: Unable to create a new OSD id
Traceback (most recent call last):
  File "<stdin>", line 4581, in <module>
  File "<stdin>", line 1060, in _infer_fsid
  File "<stdin>", line 1107, in _infer_image
  File "<stdin>", line 2845, in command_ceph_volume
  File "<stdin>", line 839, in call_throws
RuntimeError: Failed command: /bin/podman run --rm --net=host --privileged --group-add=disk -e CONTAINER_IMAGE=docker.io/ceph/ceph:v15 -e NODE_NAME=ceph136 -v /var/run/ceph/b3add0aa-aee7-11ea-a3e4-5e7ce92c6bef:/var/run/ceph:z -v /var/log/ceph/b3add0aa-aee7-11ea-a3e4-5e7ce92c6bef:/var/log/ceph:z -v /var/lib/ceph/b3add0aa-aee7-11ea-a3e4-5e7ce92c6bef/crash:/var/lib/ceph/crash:z -v /dev:/dev -v /run/udev:/run/udev -v /sys:/sys -v /run/lvm:/run/lvm -v /run/lock/lvm:/run/lock/lvm -v/tmp/ceph-tmp8waesu1r:/etc/ceph/ceph.conf:z -v /tmp/ceph-tmp2yjwbbre:/var/lib/ceph/bootstrap-osd/ceph.keyring:z --entrypoint /usr/sbin/ceph-volume docker.io/ceph/ceph:v15 lvm prepare --bluestore --data /dev/sdb --no-systemd
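The line "entity osd.2 exists but key does not match" suggests a stale osd.2 entry (cephx key and/or OSD id) left over from an earlier deployment on that disk. A possible cleanup sketch before retrying, untested against this bug and destructive, assuming osd.2 no longer holds any data and /dev/sdb can be wiped:

# Purge the stale OSD id (removes it from the CRUSH map, the OSD map and cephx auth)
[root@ceph135 ~]# ceph osd purge 2 --yes-i-really-mean-it
# Wipe the device on ceph136, then retry the add
[root@ceph135 ~]# ceph orch device zap ceph136 /dev/sdb --force
[root@ceph135 ~]# ceph orch daemon add osd ceph136:/dev/sdb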



Reposted from blog.csdn.net/caiyqn/article/details/107030613