From 631ddeb2ac90ad8db85902b7891ba74eac396caa Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 31 Jan 2019 18:53:14 -0800 Subject: [PATCH 1/8] Create node-failure.md --- docs/node-failure.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 docs/node-failure.md diff --git a/docs/node-failure.md b/docs/node-failure.md new file mode 100644 index 0000000..2e6c977 --- /dev/null +++ b/docs/node-failure.md @@ -0,0 +1,15 @@ +# Node Failure Handling with Longhorn + +## What to expect when a Kubernetes Node fails + +When a Kubernetes node fails with the CSI driver installed (all of the following is based on Kubernetes v1.12 with the default setup): +1. After **one minute**, `kubectl get nodes` will report `NotReady` for the failed node. +2. After about **five minutes**, the states of all the pods on the `NotReady` node will change to either `Unknown` or `NodeLost`. +3. If you're deploying using StatefulSet or Deployment, you need to decide whether it's safe to force delete the pod of the workload +running on the lost node. See [here](https://kubernetes.io/docs/tasks/run-application/force-delete-stateful-set-pod/). + 1. StatefulSet has stable identity, so Kubernetes won't delete the Pod for the user. + 2. Deployment doesn't have stable identity, but Longhorn is a Read-Write-Once type of storage, which means it can only be attached + to one Pod. So the new Pod created by Kubernetes won't be able to start because the Longhorn volume is still attached to the old Pod + on the lost Node. +4. If you decide to delete the Pod manually (and forcefully), Kubernetes will take about another **six minutes** to delete the VolumeAttachment +object associated with the Pod, thus finally detaching the Longhorn volume from the lost Node and allowing it to be used by the new Pod. 
From 6e4c7f8efb397f825065f36bd72e0499bb3b628c Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 20 Feb 2019 14:09:38 -0800 Subject: [PATCH 2/8] Sync with Longhorn Manager commit ccdccee7b3ac8d254fc36fefa9acd6a091df624e Author: Sheng Yang Date: Wed Feb 20 13:59:32 2019 -0800 Longhorn v0.4.0-rc1 release --- deploy/longhorn.yaml | 23 +++++++++++++---------- uninstall/uninstall.yaml | 2 +- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/deploy/longhorn.yaml b/deploy/longhorn.yaml index db2bc6c..644a43a 100644 --- a/deploy/longhorn.yaml +++ b/deploy/longhorn.yaml @@ -21,13 +21,13 @@ rules: verbs: - "*" - apiGroups: [""] - resources: ["pods", "events", "persistentvolumes", "persistentvolumeclaims", "nodes", "proxy/nodes", "pods/log", "secrets", "services", "endpoints"] + resources: ["pods", "events", "persistentvolumes", "persistentvolumeclaims", "nodes", "proxy/nodes", "pods/log", "secrets", "services", "endpoints", "configmaps"] verbs: ["*"] - apiGroups: [""] resources: ["namespaces"] verbs: ["get", "list"] - apiGroups: ["apps"] - resources: ["daemonsets", "statefulsets"] + resources: ["daemonsets", "statefulsets", "deployments"] verbs: ["*"] - apiGroups: ["batch"] resources: ["jobs", "cronjobs"] @@ -35,6 +35,9 @@ rules: - apiGroups: ["storage.k8s.io"] resources: ["storageclasses", "volumeattachments"] verbs: ["*"] +- apiGroups: ["csi.storage.k8s.io"] + resources: ["csinodeinfos"] + verbs: ["get", "list", "watch"] - apiGroups: ["longhorn.rancher.io"] resources: ["volumes", "engines", "replicas", "settings", "engineimages", "nodes"] verbs: ["*"] @@ -178,7 +181,7 @@ spec: spec: containers: - name: longhorn-manager - image: rancher/longhorn-manager:v0.3.3 + image: rancher/longhorn-manager:v0.4.0-rc1 imagePullPolicy: Always securityContext: privileged: true @@ -187,9 +190,9 @@ spec: - -d - daemon - --engine-image - - rancher/longhorn-engine:v0.3.3 + - rancher/longhorn-engine:v0.4.0-rc1 - --manager-image - - rancher/longhorn-manager:v0.3.3 + - 
rancher/longhorn-manager:v0.4.0-rc1 - --service-account - longhorn-service-account ports: @@ -255,7 +258,7 @@ metadata: name: longhorn-ui namespace: longhorn-system spec: - replicas: 1 + replicas: 3 selector: matchLabels: app: longhorn-ui @@ -266,7 +269,7 @@ spec: spec: containers: - name: longhorn-ui - image: rancher/longhorn-ui:v0.3.3 + image: rancher/longhorn-ui:v0.4.0-rc1 ports: - containerPort: 8000 env: @@ -305,18 +308,18 @@ spec: spec: initContainers: - name: wait-longhorn-manager - image: rancher/longhorn-manager:v0.3.3 + image: rancher/longhorn-manager:v0.4.0-rc1 command: ['sh', '-c', 'while [ $(curl -m 1 -s -o /dev/null -w "%{http_code}" http://longhorn-backend:9500/v1) != "200" ]; do echo waiting; sleep 2; done'] containers: - name: longhorn-driver-deployer - image: rancher/longhorn-manager:v0.3.3 + image: rancher/longhorn-manager:v0.4.0-rc1 imagePullPolicy: Always command: - longhorn-manager - -d - deploy-driver - --manager-image - - rancher/longhorn-manager:v0.3.3 + - rancher/longhorn-manager:v0.4.0-rc1 - --manager-url - http://longhorn-backend:9500/v1 # manually choose "flexvolume" or "csi" diff --git a/uninstall/uninstall.yaml b/uninstall/uninstall.yaml index 6be2aaf..a6fc270 100644 --- a/uninstall/uninstall.yaml +++ b/uninstall/uninstall.yaml @@ -12,7 +12,7 @@ spec: spec: containers: - name: longhorn-uninstall - image: rancher/longhorn-manager:v0.3.3 + image: rancher/longhorn-manager:v0.4.0-rc1 imagePullPolicy: Always command: - longhorn-manager From d9b6eee274156ffedbe6941df433a445d60b112b Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 21 Feb 2019 15:03:18 -0800 Subject: [PATCH 3/8] Sync with Longhorn Manager commit 9cde3deb1d449733e525aeb2d710ce4789d73abf Author: Sheng Yang Date: Thu Feb 21 14:52:59 2019 -0800 Longhorn v0.4.0-rc2 release --- deploy/longhorn.yaml | 16 ++++++++-------- uninstall/uninstall.yaml | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/deploy/longhorn.yaml b/deploy/longhorn.yaml index 644a43a..92167be 
100644 --- a/deploy/longhorn.yaml +++ b/deploy/longhorn.yaml @@ -181,7 +181,7 @@ spec: spec: containers: - name: longhorn-manager - image: rancher/longhorn-manager:v0.4.0-rc1 + image: rancher/longhorn-manager:v0.4.0-rc2 imagePullPolicy: Always securityContext: privileged: true @@ -190,9 +190,9 @@ spec: - -d - daemon - --engine-image - - rancher/longhorn-engine:v0.4.0-rc1 + - rancher/longhorn-engine:v0.4.0-rc2 - --manager-image - - rancher/longhorn-manager:v0.4.0-rc1 + - rancher/longhorn-manager:v0.4.0-rc2 - --service-account - longhorn-service-account ports: @@ -258,7 +258,7 @@ metadata: name: longhorn-ui namespace: longhorn-system spec: - replicas: 3 + replicas: 1 selector: matchLabels: app: longhorn-ui @@ -269,7 +269,7 @@ spec: spec: containers: - name: longhorn-ui - image: rancher/longhorn-ui:v0.4.0-rc1 + image: rancher/longhorn-ui:v0.4.0-rc2 ports: - containerPort: 8000 env: @@ -308,18 +308,18 @@ spec: spec: initContainers: - name: wait-longhorn-manager - image: rancher/longhorn-manager:v0.4.0-rc1 + image: rancher/longhorn-manager:v0.4.0-rc2 command: ['sh', '-c', 'while [ $(curl -m 1 -s -o /dev/null -w "%{http_code}" http://longhorn-backend:9500/v1) != "200" ]; do echo waiting; sleep 2; done'] containers: - name: longhorn-driver-deployer - image: rancher/longhorn-manager:v0.4.0-rc1 + image: rancher/longhorn-manager:v0.4.0-rc2 imagePullPolicy: Always command: - longhorn-manager - -d - deploy-driver - --manager-image - - rancher/longhorn-manager:v0.4.0-rc1 + - rancher/longhorn-manager:v0.4.0-rc2 - --manager-url - http://longhorn-backend:9500/v1 # manually choose "flexvolume" or "csi" diff --git a/uninstall/uninstall.yaml b/uninstall/uninstall.yaml index a6fc270..6501f54 100644 --- a/uninstall/uninstall.yaml +++ b/uninstall/uninstall.yaml @@ -12,7 +12,7 @@ spec: spec: containers: - name: longhorn-uninstall - image: rancher/longhorn-manager:v0.4.0-rc1 + image: rancher/longhorn-manager:v0.4.0-rc2 imagePullPolicy: Always command: - longhorn-manager From 
f7c572cfb65827a5021e994751450f382af0adc2 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 21 Feb 2019 17:00:28 -0800 Subject: [PATCH 4/8] Update README.md Add node-failure.md and support bundle explanation. --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index adc95cb..ec01036 100644 --- a/README.md +++ b/README.md @@ -263,8 +263,11 @@ Longhorn will always try to maintain at least given number of healthy replicas f ### [Restoring Stateful Set volumes](./docs/restore_statefulset.md) ### [Google Kubernetes Engine](./docs/gke.md) ### [Upgrade](./docs/upgrade.md) +### [Deal with Kubernetes node failure](./docs/node-failure.md) ## Troubleshooting +You can click `Generate Support Bundle` link at the bottom of the UI to download a zip file contains Longhorn related configuration and logs. + See [here](./docs/troubleshooting.md) for the troubleshooting guide. ## Uninstall Longhorn From 1c00bbb73d850a10a48d6b4a5f64bd80fdf8e37a Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 21 Feb 2019 17:02:43 -0800 Subject: [PATCH 5/8] Update troubleshooting.md --- docs/troubleshooting.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index ac6a682..2389dc6 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -16,6 +16,10 @@ User can find the correct directory by running `ps aux|grep kubelet` on the host There are a few compontents in the Longhorn. Manager, Engine, Driver and UI. All of those components runnings as pods in the `longhorn-system` namespace by default inside the Kubernetes cluster. +Most of the logs are included in the Support Bundle. You can click the `Generate Support Bundle` link at the bottom of the UI to download a zip file that contains Longhorn-related configuration and logs. + +One exception is `dmesg`, which needs to be retrieved by the user on each node. + ### UI Make use of the Longhorn UI is a good start for the troubleshooting. 
For example, if Kubernetes cannot mount one volume correctly, after stop the workload, try to attach and mount that volume manually on one node and access the content to check if volume is intact. From d32395f44bfcb59684f1d71059b44e4946c9ca71 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 21 Feb 2019 19:45:39 -0800 Subject: [PATCH 6/8] Sync with Longhorn manager commit 206e1fa9d98260fb3f72088e8d8e826074815766 Author: Sheng Yang Date: Thu Feb 21 19:21:36 2019 -0800 Longhorn v0.4.0 release --- deploy/longhorn.yaml | 14 +++++++------- uninstall/uninstall.yaml | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/deploy/longhorn.yaml b/deploy/longhorn.yaml index 92167be..9f47f30 100644 --- a/deploy/longhorn.yaml +++ b/deploy/longhorn.yaml @@ -181,7 +181,7 @@ spec: spec: containers: - name: longhorn-manager - image: rancher/longhorn-manager:v0.4.0-rc2 + image: rancher/longhorn-manager:v0.4.0 imagePullPolicy: Always securityContext: privileged: true @@ -190,9 +190,9 @@ spec: - -d - daemon - --engine-image - - rancher/longhorn-engine:v0.4.0-rc2 + - rancher/longhorn-engine:v0.4.0 - --manager-image - - rancher/longhorn-manager:v0.4.0-rc2 + - rancher/longhorn-manager:v0.4.0 - --service-account - longhorn-service-account ports: @@ -269,7 +269,7 @@ spec: spec: containers: - name: longhorn-ui - image: rancher/longhorn-ui:v0.4.0-rc2 + image: rancher/longhorn-ui:v0.4.0 ports: - containerPort: 8000 env: @@ -308,18 +308,18 @@ spec: spec: initContainers: - name: wait-longhorn-manager - image: rancher/longhorn-manager:v0.4.0-rc2 + image: rancher/longhorn-manager:v0.4.0 command: ['sh', '-c', 'while [ $(curl -m 1 -s -o /dev/null -w "%{http_code}" http://longhorn-backend:9500/v1) != "200" ]; do echo waiting; sleep 2; done'] containers: - name: longhorn-driver-deployer - image: rancher/longhorn-manager:v0.4.0-rc2 + image: rancher/longhorn-manager:v0.4.0 imagePullPolicy: Always command: - longhorn-manager - -d - deploy-driver - --manager-image - - 
rancher/longhorn-manager:v0.4.0-rc2 + - rancher/longhorn-manager:v0.4.0 - --manager-url - http://longhorn-backend:9500/v1 # manually choose "flexvolume" or "csi" diff --git a/uninstall/uninstall.yaml b/uninstall/uninstall.yaml index 6501f54..e13d1a2 100644 --- a/uninstall/uninstall.yaml +++ b/uninstall/uninstall.yaml @@ -12,7 +12,7 @@ spec: spec: containers: - name: longhorn-uninstall - image: rancher/longhorn-manager:v0.4.0-rc2 + image: rancher/longhorn-manager:v0.4.0 imagePullPolicy: Always command: - longhorn-manager From 31798dca74a88c8d21d7c2f900fa137eb71f60bc Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 21 Feb 2019 21:04:09 -0800 Subject: [PATCH 7/8] Update README.md Update version to 0.4.0 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ec01036..6db40ee 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ You can read more details of Longhorn and its design [here](http://rancher.com/m Longhorn is a work in progress. It's an alpha quality software at the moment. We appreciate your comments as we continue to work on it. -The latest release of Longhorn is **v0.3.3**, shipped with Longhorn Engine **v0.3.3** as the default engine image. +The latest release of Longhorn is **v0.4.0**, shipped with Longhorn Engine **v0.4.0** as the default engine image. ## Source code Longhorn is 100% open source software. 
Project source code is spread across a number of repos: From 26c2a70df95282586ac4c4892ebe15580c7fd4d1 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 22 Feb 2019 11:41:56 -0800 Subject: [PATCH 8/8] Longhorn v0.4.0 release --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6db40ee..2fd3092 100644 --- a/README.md +++ b/README.md @@ -266,7 +266,7 @@ Longhorn will always try to maintain at least given number of healthy replicas f ### [Deal with Kubernetes node failure](./docs/node-failure.md) ## Troubleshooting -You can click `Generate Support Bundle` link at the bottom of the UI to download a zip file contains Longhorn related configuration and logs. +You can click the `Generate Support Bundle` link at the bottom of the UI to download a zip file that contains Longhorn-related configuration and logs. See [here](./docs/troubleshooting.md) for the troubleshooting guide.