裸机部署端到端机器学习平台Kubeflow

Version
OS Ubuntu 20.04 (8c/16g/200g)
Kubernetes v1.19.9
Kubeflow v1.4

reference: github.com/kubeflow/manifests

Prerequisites

kubernetes

# install by kubesphere
# https://kubesphere.com.cn/docs/quick-start/all-in-one-on-linux/
export KKZONE=cn
curl -sfL https://get-kk.kubesphere.io | VERSION=v2.0.0 sh -
chmod +x kk
./kk create cluster --with-kubernetes v1.19.9
# verify
root@vm-u:~# kubectl get pod -A
NAMESPACE     NAME                                            READY   STATUS              RESTARTS   AGE
kube-system   calico-kube-controllers-d75c96f46-4wmmh         1/1     Running             0          16m
kube-system   calico-node-8t85k                               1/1     Running             0          16m
kube-system   coredns-867b49865c-c2gg7                        1/1     Running             0          16m
kube-system   coredns-867b49865c-lzq8p                        1/1     Running             0          16m
kube-system   kube-apiserver-vm-u                             1/1     Running             0          17m
kube-system   kube-controller-manager-vm-u                    1/1     Running             0          17m
kube-system   kube-proxy-vzq42                                1/1     Running             0          16m
kube-system   kube-scheduler-vm-u                             1/1     Running             0          17m
kube-system   nodelocaldns-kkdss                              1/1     Running             0          16m


# install nfs-server 
root@vm-u:~# sudo apt install nfs-kernel-server -y
root@vm-u:~# sudo mkdir -p /mnt/demo
root@vm-u:~# sudo chown nobody:nogroup /mnt/demo
root@vm-u:~# sudo chmod 777 /mnt/demo
# 授予客户端机器访问 NFS 服务器的权限
root@vm-u:~# vim /etc/exports
root@vm-u:~# cat /etc/exports 
/mnt/demo *(rw,sync,no_subtree_check,no_root_squash)
root@vm-u:~# sudo exportfs -a
root@vm-u:~# sudo systemctl restart nfs-kernel-server
root@vm-u:~# showmount -e
Export list for vm-u:
/mnt/demo *


# make nfs as default storageclass
# pull images
docker pull hisunyh/k8s.gcr.io.sig-storage.nfs-subdir-external-provisioner:v4.0.2
docker tag hisunyh/k8s.gcr.io.sig-storage.nfs-subdir-external-provisioner:v4.0.2 k8s.gcr.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2

# nfs.server 指定nfs服务端地址
# nfs.path 指定nfs服务端路径
$ helm upgrade --install nfs-subdir-external-provisioner --namespace nfs ./nfs-subdir-external-provisioner-4.0.16.tgz --create-namespace \
    --set storageClass.defaultClass=true \
    --set nfs.server=192.168.153.130 \
    --set nfs.path=/mnt/demo
# verify
root@vm-u:~# kubectl --namespace nfs get pods -l "release=nfs-subdir-external-provisioner"
NAME                                               READY   STATUS    RESTARTS   AGE
nfs-subdir-external-provisioner-866d8f6b89-c4cj2   1/1     Running   0          7m12s
root@vm-u:~# kubectl get sc
NAME                   PROVISIONER                                     RECLAIMPOLICY   VOLUMEBINDINGMODE   ALLOWVOLUMEEXPANSION   AGE
nfs-client (default)   cluster.local/nfs-subdir-external-provisioner   Delete          Immediate           true                   12m

kustomize

# kustomize version 3.2
# https://github.com/kubernetes-sigs/kustomize/releases/tag/v3.2.0

cp kustomize_3.2.0_linux_amd64 /usr/local/bin/kustomize
chmod +x /usr/local/bin/kustomize
# verify
root@vm-u:~# kustomize version
Version: {KustomizeVersion:3.2.0 GitCommit:a3103f1e62ddb5b696daa3fd359bb6f2e8333b49 BuildDate:2019-09-18T16:26:36Z GoOs:linux GoArch:amd64}

Install with a single command

# clone manifests repo
git clone https://github.com/kubeflow/manifests.git
cd manifests && git checkout v1.4.1
git checkout -b v1.4.1-dev

gcr.io镜像

  • 带digest(@sha256):修改配置文件中的镜像为pull的tag
  • 不带digest:pull后tag
# knative
gcr.io/knative-releases/knative.dev/eventing/cmd/controller@sha256:6ddffbc286a84048cfd090193d00b4ecda25a3a7bf2de1a8e873f8b3755cc913
gcr.io/knative-releases/knative.dev/eventing/cmd/webhook@sha256:9f70a2a8bb78781472fba0327c5d6ff91f13a29736d4502bf8ad3d60d3f16ccd
gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_controller@sha256:904f42a768a9bc64999e7302d2bc7c1c48a08e74a82355cf57be513e6a124b82
gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_dispatcher@sha256:a6983f71c04619928199cc21e07ee6f1e1c87586621bc03b10c9ba1abd92bfa8
gcr.io/knative-releases/knative.dev/eventing/cmd/mtchannel_broker@sha256:a2678934d280ea19b0804cc7757d559a0312e2acea221b17a99bd830cd9eeaac
gcr.io/knative-releases/knative.dev/eventing/cmd/broker/filter@sha256:0e25aa1613a3a1779b3f7b7f863e651e5f37520a7f6808ccad2164cc2b6a9b12
gcr.io/knative-releases/knative.dev/eventing/cmd/broker/ingress@sha256:cf579f88aa2a37c240e25bb886c1ef5404e326e12c7caf571e49308612243eee
gcr.io/knative-releases/knative.dev/serving/cmd/activator@sha256:fed92af8b9779c97482906db8857f27b5d4826708b75d0298aa30fad8900671f
gcr.io/knative-releases/knative.dev/serving/cmd/autoscaler@sha256:bc5ae3090ab0322ed0e4f9efddb60fa85f6ff3a29156411d24d0e4764b18eba7
gcr.io/knative-releases/knative.dev/serving/cmd/controller@sha256:bd7c6350e5d5c4edaa197a86fb96cff78bdd3e61f33fcb77aa60930de0ec0827
gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook@sha256:1e371db6b1a9f9265fc7a55d15d98c935c0c28925ffde351fb3b93f331c5a08e
gcr.io/knative-releases/knative.dev/net-istio/cmd/controller@sha256:ff8680da52ef47b8573ebc3393cbfa2f0f14b05c1e02232807f22699adbef57a
gcr.io/knative-releases/knative.dev/serving/cmd/webhook@sha256:6f41d379f1aacdfbb8f6d4f539e1769040e4f01bff3ad9c249b427e54dc56ea8

docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.broker.ingress:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.webhook:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.broker.filter:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.mtchannel_broker:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.in_memory.channel_controller:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.controller:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.in_memory.channel_dispatcher:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.serving.cmd.webhook:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.serving.cmd.activator:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.serving.cmd.autoscaler:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.net-istio.cmd.controller:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.serving.cmd.controller:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.net-istio.cmd.webhook:kubeflow1.4.1
# oidc-authservice
gcr.io/arrikto/kubeflow/oidc-authservice:28c59ef

docker pull hisunyh/gcr.io.arrikto.kubeflow.oidc-authservice:28c59ef
docker tag hisunyh/gcr.io.arrikto.kubeflow.oidc-authservice:28c59ef gcr.io/arrikto/kubeflow/oidc-authservice:28c59ef
# pipeline
gcr.io/ml-pipeline/frontend:1.7.0
gcr.io/ml-pipeline/visualization-server:1.7.0
gcr.io/ml-pipeline/cache-deployer:1.7.0
gcr.io/ml-pipeline/cache-server:1.7.0
gcr.io/ml-pipeline/metadata-envoy:1.7.0
gcr.io/ml-pipeline/metadata-writer:1.7.0
gcr.io/ml-pipeline/minio:RELEASE.2019-08-14T20-37-41Z-license-compliance
gcr.io/ml-pipeline/api-server:1.7.0
gcr.io/ml-pipeline/persistenceagent:1.7.0
gcr.io/ml-pipeline/scheduledworkflow:1.7.0
gcr.io/ml-pipeline/viewer-crd-controller:1.7.0
gcr.io/ml-pipeline/mysql:5.7
gcr.io/ml-pipeline/workflow-controller:v3.1.6-patch-license-compliance



docker pull hisunyh/gcr.io.ml-pipeline.minio:RELEASE.2019-08-14T20-37-41Z-license-compliance
docker tag hisunyh/gcr.io.ml-pipeline.minio:RELEASE.2019-08-14T20-37-41Z-license-compliance gcr.io/ml-pipeline/minio:RELEASE.2019-08-14T20-37-41Z-license-compliance

docker pull hisunyh/gcr.io.ml-pipeline.metadata-envoy:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.metadata-envoy:1.7.0 gcr.io/ml-pipeline/metadata-envoy:1.7.0

docker pull hisunyh/gcr.io.ml-pipeline.frontend:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.frontend:1.7.0 gcr.io/ml-pipeline/frontend:1.7.0

docker pull hisunyh/gcr.io.ml-pipeline.metadata-writer:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.metadata-writer:1.7.0 gcr.io/ml-pipeline/metadata-writer:1.7.0

docker pull hisunyh/gcr.io.ml-pipeline.cache-deployer:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.cache-deployer:1.7.0 gcr.io/ml-pipeline/cache-deployer:1.7.0

docker pull hisunyh/gcr.io.ml-pipeline.cache-server:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.cache-server:1.7.0 gcr.io/ml-pipeline/cache-server:1.7.0

docker pull hisunyh/gcr.io.ml-pipeline.visualization-server:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.visualization-server:1.7.0 gcr.io/ml-pipeline/visualization-server:1.7.0

docker pull hisunyh/gcr.io.ml-pipeline.persistenceagent:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.persistenceagent:1.7.0 gcr.io/ml-pipeline/persistenceagent:1.7.0

docker pull hisunyh/gcr.io.ml-pipeline.viewer-crd-controller:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.viewer-crd-controller:1.7.0 gcr.io/ml-pipeline/viewer-crd-controller:1.7.0

docker pull hisunyh/gcr.io.ml-pipeline.scheduledworkflow:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.scheduledworkflow:1.7.0 gcr.io/ml-pipeline/scheduledworkflow:1.7.0

docker pull hisunyh/gcr.io.ml-pipeline.workflow-controller:v3.1.6-patch-license-compliance
docker tag hisunyh/gcr.io.ml-pipeline.workflow-controller:v3.1.6-patch-license-compliance gcr.io/ml-pipeline/workflow-controller:v3.1.6-patch-license-compliance

docker pull hisunyh/gcr.io.ml-pipeline.api-server:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.api-server:1.7.0 gcr.io/ml-pipeline/api-server:1.7.0

docker pull hisunyh/gcr.io.ml-pipeline.mysql:5.7
docker tag hisunyh/gcr.io.ml-pipeline.mysql:5.7 gcr.io/ml-pipeline/mysql:5.7
# more
gcr.io/kubebuilder/kube-rbac-proxy:v0.4.0
gcr.io/tfx-oss-public/ml_metadata_store_server:1.0.0


docker pull hisunyh/gcr.io.kubebuilder.kube-rbac-proxy:v0.4.0
docker tag hisunyh/gcr.io.kubebuilder.kube-rbac-proxy:v0.4.0 gcr.io/kubebuilder/kube-rbac-proxy:v0.4.0

docker pull hisunyh/gcr.io.tfx-oss-public.ml_metadata_store_server:1.0.0
docker tag hisunyh/gcr.io.tfx-oss-public.ml_metadata_store_server:1.0.0 gcr.io/tfx-oss-public/ml_metadata_store_server:1.0.0

Install

# Install with a single command
while ! kustomize build example | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done

troubleshooting

MountVolume.SetUp failed for volume "istiod-ca-cert" : configmap "istio-ca-root-cert" not found

# ref: https://github.com/istio/istio/issues/22463
kubectl rollout restart deployment istiod -n istio-system

MountVolume.SetUp failed for volume "istio-token" : failed to fetch token: the API server does not have TokenRequest endpoints enabled

# ref: https://github.com/kubeflow/manifests/issues/959
# apiserver添加参数来支持第三方令牌
- --service-account-signing-key-file=/etc/kubernetes/pki/sa.key
- --service-account-issuer=kubernetes.default.svc

Failed to pull image

因为镜像下载的问题,需要将imagePullPolicy: Always修改为imagePullPolicy: IfNotPresent

chown: changing ownership of ‘/var/lib/mysql/‘: Operation not permitted

使用nfs作为持久化存储,客户端的访问权限需要添加no_root_squash

# 示例
root@vm-u:~# cat /etc/exports 
/mnt/demo *(rw,sync,no_subtree_check,no_root_squash)

Connect to kubeflow cluster

# verify
root@vm-u:~# kubectl get pods -n cert-manager
NAME                                       READY   STATUS    RESTARTS   AGE
cert-manager-7dd5854bb4-vqr9j              1/1     Running   1          3h1m
cert-manager-cainjector-64c949654c-lw6jt   1/1     Running   1          3h1m
cert-manager-webhook-6bdffc7c9d-ggmz6      1/1     Running   1          3h1m
root@vm-u:~# kubectl get pods -n istio-system
NAME                                     READY   STATUS    RESTARTS   AGE
authservice-0                            1/1     Running   1          174m
cluster-local-gateway-7bf6b98855-8qt72   1/1     Running   1          150m
istio-ingressgateway-78bc678876-5r686    1/1     Running   1          150m
istiod-557487978f-2jfsw                  1/1     Running   1          150m
root@vm-u:~# kubectl get pods -n auth
NAME                   READY   STATUS    RESTARTS   AGE
dex-5ddf47d88d-gfn6h   1/1     Running   1          3h1m
root@vm-u:~# kubectl get pods -n knative-eventing
NAME                                   READY   STATUS    RESTARTS   AGE
eventing-controller-775974c67b-thcq8   1/1     Running   1          3h
eventing-webhook-5774d85d86-qqrt4      1/1     Running   1          3h
imc-controller-6fb9cc659d-ll5nv        1/1     Running   1          3h
imc-dispatcher-7696599fd4-86psj        1/1     Running   2          3h
mt-broker-controller-7d8c98958-zdzjw   1/1     Running   1          3h
mt-broker-filter-5dffcdbfb-j298z       1/1     Running   2          3h
mt-broker-ingress-79686bff7-n2c6r      1/1     Running   2          3h
root@vm-u:~# kubectl get pods -n knative-serving
NAME                                READY   STATUS    RESTARTS   AGE
activator-b7ccffdcb-f4gdp           2/2     Running   1          97m
autoscaler-85d4d9fc74-jsxtj         2/2     Running   1          97m
controller-78bf88df9d-nj8n7         2/2     Running   1          97m
istio-webhook-6fc7b4d848-q6lmz      2/2     Running   1          97m
networking-istio-6fdd64c698-xzlkg   2/2     Running   1          97m
webhook-99b95d84c-s7hjz             2/2     Running   1          97m
root@vm-u:~# kubectl get pods -n kubeflow
NAME                                                        READY   STATUS    RESTARTS   AGE
admission-webhook-deployment-667bd68d94-hjsv2               1/1     Running   0          9m2s
cache-deployer-deployment-585d4647b9-mpmh8                  2/2     Running   1          8m59s
cache-server-85f59fb6f5-g5bhb                               2/2     Running   0          9m2s
centraldashboard-7d496c59bb-n75cr                           1/1     Running   0          9m1s
jupyter-web-app-deployment-6f744fbc54-7tl24                 1/1     Running   0          9m2s
katib-controller-68c47fbf8b-5vxtv                           1/1     Running   0          9m
katib-db-manager-6c76bdc855-zq5x4                           1/1     Running   0          9m2s
katib-mysql-6dcb447c6f-kktf5                                1/1     Running   0          8m59s
katib-ui-64bb96d5bf-2c79s                                   1/1     Running   0          9m2s
kfserving-controller-manager-0                              2/2     Running   2          142m
kfserving-models-web-app-7884f597cf-knbpt                   2/2     Running   0          8m59s
kubeflow-pipelines-profile-controller-7b947f4748-7gbrs      1/1     Running   0          9m
metacontroller-0                                            1/1     Running   1          142m
metadata-envoy-deployment-5b4856dd5-dl9kj                   1/1     Running   0          9m2s
metadata-grpc-deployment-748f868f64-qrvmk                   2/2     Running   4          9m2s
metadata-writer-548bd879bb-njt9s                            2/2     Running   1          9m1s
minio-5b65df66c9-ldb4j                                      2/2     Running   0          8m58s
ml-pipeline-5784f9d9cc-wn4nn                                2/2     Running   2          8m58s
ml-pipeline-persistenceagent-d6bdc77bd-6tdmf                2/2     Running   0          9m2s
ml-pipeline-scheduledworkflow-5db54d75c5-fhcd9              2/2     Running   0          9m
ml-pipeline-ui-5447cb9556-58b8m                             2/2     Running   0          8m59s
ml-pipeline-viewer-crd-7695ffb54d-b5prz                     2/2     Running   3          9m
ml-pipeline-visualizationserver-cf88b98f7-t4mdt             2/2     Running   0          9m1s
mpi-operator-5c55d6cb8f-q8j7m                               1/1     Running   0          9m
mysql-f7b9b7dd4-ghp4h                                       2/2     Running   0          9m1s
notebook-controller-deployment-578fd4dc97-nkmgf             1/1     Running   0          9m1s
profiles-deployment-7cc7956dfd-sdl7m                        2/2     Running   0          8m59s
tensorboard-controller-controller-manager-954b7c544-79q77   3/3     Running   4          9m1s
tensorboards-web-app-deployment-6ff79b7f44-j762j            1/1     Running   0          9m
training-operator-795c5cb864-9xqlm                          1/1     Running   0          9m
volumes-web-app-deployment-8589d664cc-992tv                 1/1     Running   0          9m
workflow-controller-76dd87cd85-q6d2b                        2/2     Running   3          9m1s
root@vm-u:~# kubectl get pods -n kubeflow-user-example-com
NAME                                               READY   STATUS    RESTARTS   AGE
ml-pipeline-ui-artifact-5dd95d555b-9z4qd           2/2     Running   1          132m
ml-pipeline-visualizationserver-6b44c6759f-vfpgl   2/2     Running   0          132m

To access the central dashboard, you need to connect to the Istio gateway that provides access to the Kubeflow service mesh.

# 默认通过Nodeport暴露istio-ingressgateway服务
# 可以看到80对应的主机端口30963,通过http://<host-ip>:30963即可访问kubeflow UI
# default Email:user@example.com
# default Password:12341234
# 修改默认用户密码参考: https://github.com/kubeflow/manifests/tree/v1.4.1#change-default-user-password
root@vm-u:~# kubectl get svc -n istio-system istio-ingressgateway
NAME                   TYPE       CLUSTER-IP     EXTERNAL-IP   PORT(S)                                                                      AGE
istio-ingressgateway   NodePort   10.233.37.60   <none>        15021:31329/TCP,80:30963/TCP,443:32640/TCP,31400:31433/TCP,15443:31029/TCP   3h36m

image.png

troubleshooting

Could not find CSRF cookie XSRF-TOKEN in the request

ref: github.com/kubeflow/ma… github.com/kubeflow/ku… NOTE In order to connect to Kubeflow using NodePort / LoadBalancer / Ingress, you need to setup HTTPS. The reason is that many of our web apps (e.g., Tensorboard Web App, Jupyter Web App, Katib UI) use Secure Cookies, so accessing Kubeflow with HTTP over a non-localhost domain does not work. If you absolutely need to expose Kubeflow over HTTP, you can disable the Secure Cookies feature by setting the APP_SECURE_COOKIES environment variable to false in every relevant web app. This is not recommended, as it poses security risks.

If you don't have a domain or a proper cluster issuer setup in cert-manager, you can use the kubeflow-self-signing-issuer which is also used by various components and a placeholder domain.

root@vm-u:~# kubectl get clusterIssuer
NAME                           READY   AGE
kubeflow-self-signing-issuer   True    11h

# create certificate 
cat <<EOF | kubectl apply -f -
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: kubeflow-ingressgateway-certs
  namespace: istio-system
spec:
  secretName: kubeflow-ingressgateway-certs
  issuerRef:
    name: kubeflow-self-signing-issuer
    kind: ClusterIssuer
  commonName: kubeflow.example.com
  dnsNames:
    - kubeflow.example.com
EOF


# Then change the kubeflow-gateway to the following
# kubectl edit gateway -n kubeflow kubeflow-gateway
apiVersion: networking.istio.io/v1alpha3
kind: Gateway
metadata:
  annotations:
    kubectl.kubernetes.io/last-applied-configuration: >
      {"apiVersion":"networking.istio.io/v1alpha3","kind":"Gateway","metadata":{"annotations":{},"name":"kubeflow-gateway","namespace":"kubeflow"},"spec":{"selector":{"istio":"ingressgateway"},"servers":[{"hosts":["*"],"port":{"name":"http","number":80,"protocol":"HTTP"}}]}}
  name: kubeflow-gateway
  namespace: kubeflow
spec:
  selector:
    istio: ingressgateway
  servers:
    - hosts:
        - '*'
      port:
        name: http
        number: 80
        protocol: HTTP
      tls:
        httpsRedirect: true
    - hosts:
        - '*'
      port:
        name: https
        number: 443
        protocol: HTTPS
      tls:
        credentialName: kubeflow-ingressgateway-certs
        mode: SIMPLE

443对应的主机端口32640,通过https://<host-ip>:32640即可访问kubeflow UI image.png

猜你喜欢

转载自juejin.im/post/7124944761726697502