diff --git a/README.md b/README.md index adc95cb..2fd3092 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ You can read more details of Longhorn and its design [here](http://rancher.com/m Longhorn is a work in progress. It's an alpha quality software at the moment. We appreciate your comments as we continue to work on it. -The latest release of Longhorn is **v0.3.3**, shipped with Longhorn Engine **v0.3.3** as the default engine image. +The latest release of Longhorn is **v0.4.0**, shipped with Longhorn Engine **v0.4.0** as the default engine image. ## Source code Longhorn is 100% open source software. Project source code is spread across a number of repos: @@ -263,8 +263,11 @@ Longhorn will always try to maintain at least given number of healthy replicas f ### [Restoring Stateful Set volumes](./docs/restore_statefulset.md) ### [Google Kubernetes Engine](./docs/gke.md) ### [Upgrade](./docs/upgrade.md) +### [Deal with Kubernetes node failure](./docs/node-failure.md) ## Troubleshooting +You can click the `Generate Support Bundle` link at the bottom of the UI to download a zip file containing Longhorn-related configuration and logs. + See [here](./docs/troubleshooting.md) for the troubleshooting guide. 
## Uninstall Longhorn diff --git a/deploy/longhorn.yaml b/deploy/longhorn.yaml index db2bc6c..9f47f30 100644 --- a/deploy/longhorn.yaml +++ b/deploy/longhorn.yaml @@ -21,13 +21,13 @@ rules: verbs: - "*" - apiGroups: [""] - resources: ["pods", "events", "persistentvolumes", "persistentvolumeclaims", "nodes", "proxy/nodes", "pods/log", "secrets", "services", "endpoints"] + resources: ["pods", "events", "persistentvolumes", "persistentvolumeclaims", "nodes", "proxy/nodes", "pods/log", "secrets", "services", "endpoints", "configmaps"] verbs: ["*"] - apiGroups: [""] resources: ["namespaces"] verbs: ["get", "list"] - apiGroups: ["apps"] - resources: ["daemonsets", "statefulsets"] + resources: ["daemonsets", "statefulsets", "deployments"] verbs: ["*"] - apiGroups: ["batch"] resources: ["jobs", "cronjobs"] @@ -35,6 +35,9 @@ rules: - apiGroups: ["storage.k8s.io"] resources: ["storageclasses", "volumeattachments"] verbs: ["*"] +- apiGroups: ["csi.storage.k8s.io"] + resources: ["csinodeinfos"] + verbs: ["get", "list", "watch"] - apiGroups: ["longhorn.rancher.io"] resources: ["volumes", "engines", "replicas", "settings", "engineimages", "nodes"] verbs: ["*"] @@ -178,7 +181,7 @@ spec: spec: containers: - name: longhorn-manager - image: rancher/longhorn-manager:v0.3.3 + image: rancher/longhorn-manager:v0.4.0 imagePullPolicy: Always securityContext: privileged: true @@ -187,9 +190,9 @@ spec: - -d - daemon - --engine-image - - rancher/longhorn-engine:v0.3.3 + - rancher/longhorn-engine:v0.4.0 - --manager-image - - rancher/longhorn-manager:v0.3.3 + - rancher/longhorn-manager:v0.4.0 - --service-account - longhorn-service-account ports: @@ -266,7 +269,7 @@ spec: spec: containers: - name: longhorn-ui - image: rancher/longhorn-ui:v0.3.3 + image: rancher/longhorn-ui:v0.4.0 ports: - containerPort: 8000 env: @@ -305,18 +308,18 @@ spec: spec: initContainers: - name: wait-longhorn-manager - image: rancher/longhorn-manager:v0.3.3 + image: rancher/longhorn-manager:v0.4.0 command: ['sh', '-c', 
'while [ $(curl -m 1 -s -o /dev/null -w "%{http_code}" http://longhorn-backend:9500/v1) != "200" ]; do echo waiting; sleep 2; done'] containers: - name: longhorn-driver-deployer - image: rancher/longhorn-manager:v0.3.3 + image: rancher/longhorn-manager:v0.4.0 imagePullPolicy: Always command: - longhorn-manager - -d - deploy-driver - --manager-image - - rancher/longhorn-manager:v0.3.3 + - rancher/longhorn-manager:v0.4.0 - --manager-url - http://longhorn-backend:9500/v1 # manually choose "flexvolume" or "csi" diff --git a/docs/node-failure.md b/docs/node-failure.md new file mode 100644 index 0000000..2e6c977 --- /dev/null +++ b/docs/node-failure.md @@ -0,0 +1,15 @@ +# Node Failure Handling with Longhorn + +## What to expect when a Kubernetes Node fails + +When a Kubernetes node fails with CSI driver installed (all the following are based on Kubernetes v1.12 with default setup): +1. After **one minute**, `kubectl get nodes` will report `NotReady` for the failed node. +2. After about **five minutes**, the states of all the pods on the `NotReady` node will change to either `Unknown` or `NodeLost`. +3. If you're deploying using StatefulSet or Deployment, you need to decide if it's safe to forcefully delete the pod of the workload +running on the lost node. See [here](https://kubernetes.io/docs/tasks/run-application/force-delete-stateful-set-pod/). + 1. StatefulSet has stable identity, so Kubernetes won't delete the Pod for the user. + 2. Deployment doesn't have stable identity, but Longhorn is a Read-Write-Once type of storage, which means it can only be attached + to one Pod. So the new Pod created by Kubernetes won't be able to start due to the Longhorn volume still being attached to the old Pod, + on the lost Node. +4. 
If you decide to delete the Pod manually (and forcefully), Kubernetes will take about another **six minutes** to delete the VolumeAttachment +object associated with the Pod, thus finally detaching the Longhorn volume from the lost Node and allowing it to be used by the new Pod. diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index ac6a682..2389dc6 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -16,6 +16,10 @@ User can find the correct directory by running `ps aux|grep kubelet` on the host There are a few compontents in the Longhorn. Manager, Engine, Driver and UI. All of those components runnings as pods in the `longhorn-system` namespace by default inside the Kubernetes cluster. +Most of the logs are included in the Support Bundle. You can click the Generate Support Bundle link at the bottom of the UI to download a zip file containing Longhorn-related configuration and logs. + +One exception is `dmesg`, which needs to be retrieved by the user on each node. + ### UI Make use of the Longhorn UI is a good start for the troubleshooting. For example, if Kubernetes cannot mount one volume correctly, after stop the workload, try to attach and mount that volume manually on one node and access the content to check if volume is intact. diff --git a/uninstall/uninstall.yaml b/uninstall/uninstall.yaml index 6be2aaf..e13d1a2 100644 --- a/uninstall/uninstall.yaml +++ b/uninstall/uninstall.yaml @@ -12,7 +12,7 @@ spec: spec: containers: - name: longhorn-uninstall - image: rancher/longhorn-manager:v0.3.3 + image: rancher/longhorn-manager:v0.4.0 imagePullPolicy: Always command: - longhorn-manager