| Component | Version |
|---|---|
OS | Ubuntu20.04 (8c/16g/200g) |
Kubernetes | v1.19.9 |
Kubeflow | v1.4 |
reference: https://github.com/kubeflow/manifests
Prerequisites
Kubernetes
# install by kubesphere
# https://kubesphere.com.cn/docs/quick-start/all-in-one-on-linux/
export KKZONE=cn
curl -sfL https://get-kk.kubesphere.io | VERSION=v2.0.0 sh -
chmod +x kk
./kk create cluster --with-kubernetes v1.19.9
# verify
root@vm-u:~# kubectl get pod -A
NAMESPACE NAME READY STATUS RESTARTS AGE
kube-system calico-kube-controllers-d75c96f46-4wmmh 1/1 Running 0 16m
kube-system calico-node-8t85k 1/1 Running 0 16m
kube-system coredns-867b49865c-c2gg7 1/1 Running 0 16m
kube-system coredns-867b49865c-lzq8p 1/1 Running 0 16m
kube-system kube-apiserver-vm-u 1/1 Running 0 17m
kube-system kube-controller-manager-vm-u 1/1 Running 0 17m
kube-system kube-proxy-vzq42 1/1 Running 0 16m
kube-system kube-scheduler-vm-u 1/1 Running 0 17m
kube-system nodelocaldns-kkdss 1/1 Running 0 16m
# install nfs-server
root@vm-u:~# sudo apt install nfs-kernel-server -y
root@vm-u:~# sudo mkdir -p /mnt/demo
root@vm-u:~# sudo chown nobody:nogroup /mnt/demo
root@vm-u:~# sudo chmod 777 /mnt/demo
# 授予客户端机器访问 NFS 服务器的权限
root@vm-u:~# vim /etc/exports
root@vm-u:~# cat /etc/exports
/mnt/demo *(rw,sync,no_subtree_check,no_root_squash)
root@vm-u:~# sudo exportfs -a
root@vm-u:~# sudo systemctl restart nfs-kernel-server
root@vm-u:~# showmount -e
Export list for vm-u:
/mnt/demo *
# make nfs as default storageclass
# pull images
docker pull hisunyh/k8s.gcr.io.sig-storage.nfs-subdir-external-provisioner:v4.0.2
docker tag hisunyh/k8s.gcr.io.sig-storage.nfs-subdir-external-provisioner:v4.0.2 k8s.gcr.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2
# nfs.server 指定nfs服务端地址
# nfs.path 指定nfs服务端路径
$ helm upgrade --install nfs-subdir-external-provisioner --namespace nfs ./nfs-subdir-external-provisioner-4.0.16.tgz --create-namespace \
--set storageClass.defaultClass=true \
--set nfs.server=192.168.153.130 \
--set nfs.path=/mnt/demo
# verify
root@vm-u:~# kubectl --namespace nfs get pods -l "release=nfs-subdir-external-provisioner"
NAME READY STATUS RESTARTS AGE
nfs-subdir-external-provisioner-866d8f6b89-c4cj2 1/1 Running 0 7m12s
root@vm-u:~# kubectl get sc
NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
nfs-client (default) cluster.local/nfs-subdir-external-provisioner Delete Immediate true 12m
kustomize
# kustomize version 3.2
# https://github.com/kubernetes-sigs/kustomize/releases/tag/v3.2.0
cp kustomize_3.2.0_linux_amd64 /usr/local/bin/kustomize
chmod +x /usr/local/bin/kustomize
# verify
root@vm-u:~# kustomize version
Version: {KustomizeVersion:3.2.0 GitCommit:a3103f1e62ddb5b696daa3fd359bb6f2e8333b49 BuildDate:2019-09-18T16:26:36Z GoOs:linux GoArch:amd64}
Install with a single command
# clone manifests repo
git clone https://github.com/kubeflow/manifests.git
cd manifests && git checkout v1.4.1
git checkout -b v1.4.1-dev
gcr.io镜像
- 带digest(@sha256):修改配置文件中的镜像为pull的tag
- 不带digest:pull后tag
# knative
gcr.io/knative-releases/knative.dev/eventing/cmd/controller@sha256:6ddffbc286a84048cfd090193d00b4ecda25a3a7bf2de1a8e873f8b3755cc913
gcr.io/knative-releases/knative.dev/eventing/cmd/webhook@sha256:9f70a2a8bb78781472fba0327c5d6ff91f13a29736d4502bf8ad3d60d3f16ccd
gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_controller@sha256:904f42a768a9bc64999e7302d2bc7c1c48a08e74a82355cf57be513e6a124b82
gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_dispatcher@sha256:a6983f71c04619928199cc21e07ee6f1e1c87586621bc03b10c9ba1abd92bfa8
gcr.io/knative-releases/knative.dev/eventing/cmd/mtchannel_broker@sha256:a2678934d280ea19b0804cc7757d559a0312e2acea221b17a99bd830cd9eeaac
gcr.io/knative-releases/knative.dev/eventing/cmd/broker/filter@sha256:0e25aa1613a3a1779b3f7b7f863e651e5f37520a7f6808ccad2164cc2b6a9b12
gcr.io/knative-releases/knative.dev/eventing/cmd/broker/ingress@sha256:cf579f88aa2a37c240e25bb886c1ef5404e326e12c7caf571e49308612243eee
gcr.io/knative-releases/knative.dev/serving/cmd/activator@sha256:fed92af8b9779c97482906db8857f27b5d4826708b75d0298aa30fad8900671f
gcr.io/knative-releases/knative.dev/serving/cmd/autoscaler@sha256:bc5ae3090ab0322ed0e4f9efddb60fa85f6ff3a29156411d24d0e4764b18eba7
gcr.io/knative-releases/knative.dev/serving/cmd/controller@sha256:bd7c6350e5d5c4edaa197a86fb96cff78bdd3e61f33fcb77aa60930de0ec0827
gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook@sha256:1e371db6b1a9f9265fc7a55d15d98c935c0c28925ffde351fb3b93f331c5a08e
gcr.io/knative-releases/knative.dev/net-istio/cmd/controller@sha256:ff8680da52ef47b8573ebc3393cbfa2f0f14b05c1e02232807f22699adbef57a
gcr.io/knative-releases/knative.dev/serving/cmd/webhook@sha256:6f41d379f1aacdfbb8f6d4f539e1769040e4f01bff3ad9c249b427e54dc56ea8
docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.broker.ingress:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.webhook:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.broker.filter:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.mtchannel_broker:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.in_memory.channel_controller:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.controller:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.eventing.cmd.in_memory.channel_dispatcher:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.serving.cmd.webhook:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.serving.cmd.activator:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.serving.cmd.autoscaler:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.net-istio.cmd.controller:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.serving.cmd.controller:kubeflow1.4.1
docker pull hisunyh/gcr.io.knative-releases.knative.dev.net-istio.cmd.webhook:kubeflow1.4.1
# oidc-authservice
gcr.io/arrikto/kubeflow/oidc-authservice:28c59ef
docker pull hisunyh/gcr.io.arrikto.kubeflow.oidc-authservice:28c59ef
docker tag hisunyh/gcr.io.arrikto.kubeflow.oidc-authservice:28c59ef gcr.io/arrikto/kubeflow/oidc-authservice:28c59ef
# pipeline
gcr.io/ml-pipeline/frontend:1.7.0
gcr.io/ml-pipeline/visualization-server:1.7.0
gcr.io/ml-pipeline/cache-deployer:1.7.0
gcr.io/ml-pipeline/cache-server:1.7.0
gcr.io/ml-pipeline/metadata-envoy:1.7.0
gcr.io/ml-pipeline/metadata-writer:1.7.0
gcr.io/ml-pipeline/minio:RELEASE.2019-08-14T20-37-41Z-license-compliance
gcr.io/ml-pipeline/api-server:1.7.0
gcr.io/ml-pipeline/persistenceagent:1.7.0
gcr.io/ml-pipeline/scheduledworkflow:1.7.0
gcr.io/ml-pipeline/viewer-crd-controller:1.7.0
gcr.io/ml-pipeline/mysql:5.7
gcr.io/ml-pipeline/workflow-controller:v3.1.6-patch-license-compliance
docker pull hisunyh/gcr.io.ml-pipeline.minio:RELEASE.2019-08-14T20-37-41Z-license-compliance
docker tag hisunyh/gcr.io.ml-pipeline.minio:RELEASE.2019-08-14T20-37-41Z-license-compliance gcr.io/ml-pipeline/minio:RELEASE.2019-08-14T20-37-41Z-license-compliance
docker pull hisunyh/gcr.io.ml-pipeline.metadata-envoy:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.metadata-envoy:1.7.0 gcr.io/ml-pipeline/metadata-envoy:1.7.0
docker pull hisunyh/gcr.io.ml-pipeline.frontend:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.frontend:1.7.0 gcr.io/ml-pipeline/frontend:1.7.0
docker pull hisunyh/gcr.io.ml-pipeline.metadata-writer:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.metadata-writer:1.7.0 gcr.io/ml-pipeline/metadata-writer:1.7.0
docker pull hisunyh/gcr.io.ml-pipeline.cache-deployer:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.cache-deployer:1.7.0 gcr.io/ml-pipeline/cache-deployer:1.7.0
docker pull hisunyh/gcr.io.ml-pipeline.cache-server:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.cache-server:1.7.0 gcr.io/ml-pipeline/cache-server:1.7.0
docker pull hisunyh/gcr.io.ml-pipeline.visualization-server:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.visualization-server:1.7.0 gcr.io/ml-pipeline/visualization-server:1.7.0
docker pull hisunyh/gcr.io.ml-pipeline.persistenceagent:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.persistenceagent:1.7.0 gcr.io/ml-pipeline/persistenceagent:1.7.0
docker pull hisunyh/gcr.io.ml-pipeline.viewer-crd-controller:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.viewer-crd-controller:1.7.0 gcr.io/ml-pipeline/viewer-crd-controller:1.7.0
docker pull hisunyh/gcr.io.ml-pipeline.scheduledworkflow:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.scheduledworkflow:1.7.0 gcr.io/ml-pipeline/scheduledworkflow:1.7.0
docker pull hisunyh/gcr.io.ml-pipeline.workflow-controller:v3.1.6-patch-license-compliance
docker tag hisunyh/gcr.io.ml-pipeline.workflow-controller:v3.1.6-patch-license-compliance gcr.io/ml-pipeline/workflow-controller:v3.1.6-patch-license-compliance
docker pull hisunyh/gcr.io.ml-pipeline.api-server:1.7.0
docker tag hisunyh/gcr.io.ml-pipeline.api-server:1.7.0 gcr.io/ml-pipeline/api-server:1.7.0
docker pull hisunyh/gcr.io.ml-pipeline.mysql:5.7
docker tag hisunyh/gcr.io.ml-pipeline.mysql:5.7 gcr.io/ml-pipeline/mysql:5.7
# more
gcr.io/kubebuilder/kube-rbac-proxy:v0.4.0
gcr.io/tfx-oss-public/ml_metadata_store_server:1.0.0
docker pull hisunyh/gcr.io.kubebuilder.kube-rbac-proxy:v0.4.0
docker tag hisunyh/gcr.io.kubebuilder.kube-rbac-proxy:v0.4.0 gcr.io/kubebuilder/kube-rbac-proxy:v0.4.0
docker pull hisunyh/gcr.io.tfx-oss-public.ml_metadata_store_server:1.0.0
docker tag hisunyh/gcr.io.tfx-oss-public.ml_metadata_store_server:1.0.0 gcr.io/tfx-oss-public/ml_metadata_store_server:1.0.0
Install
# Install with a single command
while ! kustomize build example | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done
troubleshooting
MountVolume.SetUp failed for volume "istiod-ca-cert" : configmap "istio-ca-root-cert" not found
# ref: https://github.com/istio/istio/issues/22463
kubectl rollout restart deployment istiod -n istio-system
MountVolume.SetUp failed for volume "istio-token" : failed to fetch token: the API server does not have TokenRequest endpoints enabled
# ref: https://github.com/kubeflow/manifests/issues/959
# apiserver添加参数来支持第三方令牌
- --service-account-signing-key-file=/etc/kubernetes/pki/sa.key
- --service-account-issuer=kubernetes.default.svc
Failed to pull image
因为镜像下载的问题,需要将imagePullPolicy: Always修改为imagePullPolicy: IfNotPresent
chown: changing ownership of ‘/var/lib/mysql/‘: Operation not permitted
使用nfs作为持久化存储,客户端的访问权限需要添加no_root_squash
# 示例
root@vm-u:~# cat /etc/exports
/mnt/demo *(rw,sync,no_subtree_check,no_root_squash)
Connect to kubeflow cluster
# verify
root@vm-u:~# kubectl get pods -n cert-manager
NAME READY STATUS RESTARTS AGE
cert-manager-7dd5854bb4-vqr9j 1/1 Running 1 3h1m
cert-manager-cainjector-64c949654c-lw6jt 1/1 Running 1 3h1m
cert-manager-webhook-6bdffc7c9d-ggmz6 1/1 Running 1 3h1m
root@vm-u:~# kubectl get pods -n istio-system
NAME READY STATUS RESTARTS AGE
authservice-0 1/1 Running 1 174m
cluster-local-gateway-7bf6b98855-8qt72 1/1 Running 1 150m
istio-ingressgateway-78bc678876-5r686 1/1 Running 1 150m
istiod-557487978f-2jfsw 1/1 Running 1 150m
root@vm-u:~# kubectl get pods -n auth
NAME READY STATUS RESTARTS AGE
dex-5ddf47d88d-gfn6h 1/1 Running 1 3h1m
root@vm-u:~# kubectl get pods -n knative-eventing
NAME READY STATUS RESTARTS AGE
eventing-controller-775974c67b-thcq8 1/1 Running 1 3h
eventing-webhook-5774d85d86-qqrt4 1/1 Running 1 3h
imc-controller-6fb9cc659d-ll5nv 1/1 Running 1 3h
imc-dispatcher-7696599fd4-86psj 1/1 Running 2 3h
mt-broker-controller-7d8c98958-zdzjw 1/1 Running 1 3h
mt-broker-filter-5dffcdbfb-j298z 1/1 Running 2 3h
mt-broker-ingress-79686bff7-n2c6r 1/1 Running 2 3h
root@vm-u:~# kubectl get pods -n knative-serving
NAME READY STATUS RESTARTS AGE
activator-b7ccffdcb-f4gdp 2/2 Running 1 97m
autoscaler-85d4d9fc74-jsxtj 2/2 Running 1 97m
controller-78bf88df9d-nj8n7 2/2 Running 1 97m
istio-webhook-6fc7b4d848-q6lmz 2/2 Running 1 97m
networking-istio-6fdd64c698-xzlkg 2/2 Running 1 97m
webhook-99b95d84c-s7hjz 2/2 Running 1 97m
root@vm-u:~# kubectl get pods -n kubeflow
NAME READY STATUS RESTARTS AGE
admission-webhook-deployment-667bd68d94-hjsv2 1/1 Running 0 9m2s
cache-deployer-deployment-585d4647b9-mpmh8 2/2 Running 1 8m59s
cache-server-85f59fb6f5-g5bhb 2/2 Running 0 9m2s
centraldashboard-7d496c59bb-n75cr 1/1 Running 0 9m1s
jupyter-web-app-deployment-6f744fbc54-7tl24 1/1 Running 0 9m2s
katib-controller-68c47fbf8b-5vxtv 1/1 Running 0 9m
katib-db-manager-6c76bdc855-zq5x4 1/1 Running 0 9m2s
katib-mysql-6dcb447c6f-kktf5 1/1 Running 0 8m59s
katib-ui-64bb96d5bf-2c79s 1/1 Running 0 9m2s
kfserving-controller-manager-0 2/2 Running 2 142m
kfserving-models-web-app-7884f597cf-knbpt 2/2 Running 0 8m59s
kubeflow-pipelines-profile-controller-7b947f4748-7gbrs 1/1 Running 0 9m
metacontroller-0 1/1 Running 1 142m
metadata-envoy-deployment-5b4856dd5-dl9kj 1/1 Running 0 9m2s
metadata-grpc-deployment-748f868f64-qrvmk 2/2 Running 4 9m2s
metadata-writer-548bd879bb-njt9s 2/2 Running 1 9m1s
minio-5b65df66c9-ldb4j 2/2 Running 0 8m58s
ml-pipeline-5784f9d9cc-wn4nn 2/2 Running 2 8m58s
ml-pipeline-persistenceagent-d6bdc77bd-6tdmf 2/2 Running 0 9m2s
ml-pipeline-scheduledworkflow-5db54d75c5-fhcd9 2/2 Running 0 9m
ml-pipeline-ui-5447cb9556-58b8m 2/2 Running 0 8m59s
ml-pipeline-viewer-crd-7695ffb54d-b5prz 2/2 Running 3 9m
ml-pipeline-visualizationserver-cf88b98f7-t4mdt 2/2 Running 0 9m1s
mpi-operator-5c55d6cb8f-q8j7m 1/1 Running 0 9m
mysql-f7b9b7dd4-ghp4h 2/2 Running 0 9m1s
notebook-controller-deployment-578fd4dc97-nkmgf 1/1 Running 0 9m1s
profiles-deployment-7cc7956dfd-sdl7m 2/2 Running 0 8m59s
tensorboard-controller-controller-manager-954b7c544-79q77 3/3 Running 4 9m1s
tensorboards-web-app-deployment-6ff79b7f44-j762j 1/1 Running 0 9m
training-operator-795c5cb864-9xqlm 1/1 Running 0 9m
volumes-web-app-deployment-8589d664cc-992tv 1/1 Running 0 9m
workflow-controller-76dd87cd85-q6d2b 2/2 Running 3 9m1s
root@vm-u:~# kubectl get pods -n kubeflow-user-example-com
NAME READY STATUS RESTARTS AGE
ml-pipeline-ui-artifact-5dd95d555b-9z4qd 2/2 Running 1 132m
ml-pipeline-visualizationserver-6b44c6759f-vfpgl 2/2 Running 0 132m
To access the central dashboard, you need to connect to the Istio gateway that provides access to the Kubeflow service mesh.
# 默认通过Nodeport暴露istio-ingressgateway服务
# 可以看到80对应的主机端口30963,通过http://<host-ip>:30963即可访问kubeflow UI
# default Email:[email protected]
# default Password:12341234
# 修改默认用户密码参考: https://github.com/kubeflow/manifests/tree/v1.4.1#change-default-user-password
root@vm-u:~# kubectl get svc -n istio-system istio-ingressgateway
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
istio-ingressgateway NodePort 10.233.37.60 <none> 15021:31329/TCP,80:30963/TCP,443:32640/TCP,31400:31433/TCP,15443:31029/TCP 3h36m
troubleshooting
Could not find CSRF cookie XSRF-TOKEN in the request
ref: https://github.com/kubeflow/manifests https://github.com/kubeflow/kubeflow NOTE In order to connect to Kubeflow using NodePort / LoadBalancer / Ingress, you need to setup HTTPS. The reason is that many of our web apps (e.g., Tensorboard Web App, Jupyter Web App, Katib UI) use Secure Cookies, so accessing Kubeflow with HTTP over a non-localhost domain does not work. If you absolutely need to expose Kubeflow over HTTP, you can disable the Secure Cookies feature by setting the APP_SECURE_COOKIES environment variable to false in every relevant web app. This is not recommended, as it poses security risks.
If you don't have a domain or a proper cluster issuer setup in cert-manager, you can use the kubeflow-self-signing-issuer which is also used by various components and a placeholder domain.
root@vm-u:~# kubectl get clusterIssuer
NAME READY AGE
kubeflow-self-signing-issuer True 11h
# create certificate
cat <<EOF | kubectl apply -f -
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: kubeflow-ingressgateway-certs
namespace: istio-system
spec:
secretName: kubeflow-ingressgateway-certs
issuerRef:
name: kubeflow-self-signing-issuer
kind: ClusterIssuer
commonName: kubeflow.example.com
dnsNames:
- kubeflow.example.com
EOF
# Then change the kubeflow-gateway to the following
# kubectl edit gateway -n kubeflow kubeflow-gateway
apiVersion: networking.istio.io/v1alpha3
kind: Gateway
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: >
{"apiVersion":"networking.istio.io/v1alpha3","kind":"Gateway","metadata":{"annotations":{},"name":"kubeflow-gateway","namespace":"kubeflow"},"spec":{"selector":{"istio":"ingressgateway"},"servers":[{"hosts":["*"],"port":{"name":"http","number":80,"protocol":"HTTP"}}]}}
name: kubeflow-gateway
namespace: kubeflow
spec:
selector:
istio: ingressgateway
servers:
- hosts:
- '*'
port:
name: http
number: 80
protocol: HTTP
tls:
httpsRedirect: true
- hosts:
- '*'
port:
name: https
number: 443
protocol: HTTPS
tls:
credentialName: kubeflow-ingressgateway-certs
mode: SIMPLE
443对应的主机端口32640,通过https://<host-ip>:32640即可访问kubeflow UI